* [RFC PATCH 01/12] perf/x86: Use x86_perf_regs in the x86 nmi handler
2025-06-13 13:49 [RFC PATCH 00/12] Support vector and more extended registers in perf kan.liang
@ 2025-06-13 13:49 ` kan.liang
2025-06-13 13:49 ` [RFC PATCH 02/12] perf/x86: Setup the regs data kan.liang
` (12 subsequent siblings)
13 siblings, 0 replies; 60+ messages in thread
From: kan.liang @ 2025-06-13 13:49 UTC (permalink / raw)
To: peterz, mingo, acme, namhyung, tglx, dave.hansen, irogers,
adrian.hunter, jolsa, alexander.shishkin, linux-kernel
Cc: dapeng1.mi, ak, zide.chen, Kan Liang
From: Kan Liang <kan.liang@linux.intel.com>
More and more registers will be supported in the overflow handler,
e.g., more vector registers, SSP, etc. The generic pt_regs struct
cannot store all of them. Use the x86-specific struct x86_perf_regs
instead, which is already used by PEBS to store the XMM registers.
The struct pt_regs *regs is still passed to x86_pmu_handle_irq(). There
is no functional change for the existing code.
The AMD IBS NMI handler doesn't use the x86_pmu_handle_irq() static
call, so the x86_perf_regs struct doesn't apply to AMD IBS. It can be
added separately later, once AMD IBS supports more registers.
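For reference, struct x86_perf_regs (arch/x86/include/asm/perf_event.h)
currently wraps pt_regs like this; later patches in the series will
grow it:

    struct x86_perf_regs {
            struct pt_regs  regs;      /* generic regs, embedded */
            u64             *xmm_regs; /* XMM0-15 from PEBS, or NULL */
    };

Because regs is embedded, &x86_regs.regs can be handed to code that
expects a plain struct pt_regs, and the wrapper can be recovered with
container_of() where the extra registers are needed.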
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
arch/x86/events/core.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 7610f26dfbd9..64a7a8aa2e38 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1752,6 +1752,7 @@ void perf_events_lapic_init(void)
static int
perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
{
+ struct x86_perf_regs x86_regs;
u64 start_clock;
u64 finish_clock;
int ret;
@@ -1764,7 +1765,8 @@ perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
return NMI_DONE;
start_clock = sched_clock();
- ret = static_call(x86_pmu_handle_irq)(regs);
+ x86_regs.regs = *regs;
+ ret = static_call(x86_pmu_handle_irq)(&x86_regs.regs);
finish_clock = sched_clock();
perf_sample_event_took(finish_clock - start_clock);
--
2.38.1
^ permalink raw reply related [flat|nested] 60+ messages in thread
* [RFC PATCH 02/12] perf/x86: Setup the regs data
2025-06-13 13:49 [RFC PATCH 00/12] Support vector and more extended registers in perf kan.liang
2025-06-13 13:49 ` [RFC PATCH 01/12] perf/x86: Use x86_perf_regs in the x86 nmi handler kan.liang
@ 2025-06-13 13:49 ` kan.liang
2025-06-13 13:49 ` [RFC PATCH 03/12] x86/fpu/xstate: Add xsaves_nmi kan.liang
` (11 subsequent siblings)
13 siblings, 0 replies; 60+ messages in thread
From: kan.liang @ 2025-06-13 13:49 UTC (permalink / raw)
To: peterz, mingo, acme, namhyung, tglx, dave.hansen, irogers,
adrian.hunter, jolsa, alexander.shishkin, linux-kernel
Cc: dapeng1.mi, ak, zide.chen, Kan Liang
From: Kan Liang <kan.liang@linux.intel.com>
The current code relies on the generic code to set up the regs data.
That will not work well once more registers are introduced.
Introduce an x86-specific x86_pmu_setup_regs_data(). For now, it
behaves the same as the generic code. More x86-specific handling will
be added later along with the new registers.
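For example, an event that samples four GPRs in sample_regs_intr grows
dyn_size by sizeof(u64) for the ABI word plus 4 * sizeof(u64) for the
register values, i.e. 40 bytes in total.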
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
arch/x86/events/core.c | 32 ++++++++++++++++++++++++++++++++
arch/x86/events/intel/ds.c | 4 +++-
arch/x86/events/perf_event.h | 4 ++++
3 files changed, 39 insertions(+), 1 deletion(-)
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 64a7a8aa2e38..c601ad761534 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1685,6 +1685,38 @@ static void x86_pmu_del(struct perf_event *event, int flags)
static_call_cond(x86_pmu_del)(event);
}
+void x86_pmu_setup_regs_data(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ u64 sample_type = event->attr.sample_type;
+
+ if (sample_type & PERF_SAMPLE_REGS_USER) {
+ if (user_mode(regs)) {
+ data->regs_user.abi = perf_reg_abi(current);
+ data->regs_user.regs = regs;
+ } else if (!(current->flags & PF_KTHREAD)) {
+ perf_get_regs_user(&data->regs_user, regs);
+ } else {
+ data->regs_user.abi = PERF_SAMPLE_REGS_ABI_NONE;
+ data->regs_user.regs = NULL;
+ }
+ data->dyn_size += sizeof(u64);
+ if (data->regs_user.regs)
+ data->dyn_size += hweight64(event->attr.sample_regs_user) * sizeof(u64);
+ data->sample_flags |= PERF_SAMPLE_REGS_USER;
+ }
+
+ if (sample_type & PERF_SAMPLE_REGS_INTR) {
+ data->regs_intr.regs = regs;
+ data->regs_intr.abi = perf_reg_abi(current);
+ data->dyn_size += sizeof(u64);
+ if (data->regs_intr.regs)
+ data->dyn_size += hweight64(event->attr.sample_regs_intr) * sizeof(u64);
+ data->sample_flags |= PERF_SAMPLE_REGS_INTR;
+ }
+}
+
int x86_pmu_handle_irq(struct pt_regs *regs)
{
struct perf_sample_data data;
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index c0b7ac1c7594..e67d8a03ddfe 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -2126,8 +2126,10 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
regs->flags &= ~PERF_EFLAGS_EXACT;
}
- if (sample_type & (PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER))
+ if (sample_type & (PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)) {
adaptive_pebs_save_regs(regs, gprs);
+ x86_pmu_setup_regs_data(event, data, regs);
+ }
}
if (format_group & PEBS_DATACFG_MEMINFO) {
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 2b969386dcdd..12682a059608 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -1278,6 +1278,10 @@ void x86_pmu_enable_event(struct perf_event *event);
int x86_pmu_handle_irq(struct pt_regs *regs);
+void x86_pmu_setup_regs_data(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs);
+
void x86_pmu_show_pmu_cap(struct pmu *pmu);
static inline int x86_pmu_num_counters(struct pmu *pmu)
--
2.38.1
^ permalink raw reply related [flat|nested] 60+ messages in thread
* [RFC PATCH 03/12] x86/fpu/xstate: Add xsaves_nmi
2025-06-13 13:49 [RFC PATCH 00/12] Support vector and more extended registers in perf kan.liang
2025-06-13 13:49 ` [RFC PATCH 01/12] perf/x86: Use x86_perf_regs in the x86 nmi handler kan.liang
2025-06-13 13:49 ` [RFC PATCH 02/12] perf/x86: Setup the regs data kan.liang
@ 2025-06-13 13:49 ` kan.liang
2025-06-13 14:39 ` Dave Hansen
2025-06-13 13:49 ` [RFC PATCH 04/12] perf: Move has_extended_regs() to header file kan.liang
` (10 subsequent siblings)
13 siblings, 1 reply; 60+ messages in thread
From: kan.liang @ 2025-06-13 13:49 UTC (permalink / raw)
To: peterz, mingo, acme, namhyung, tglx, dave.hansen, irogers,
adrian.hunter, jolsa, alexander.shishkin, linux-kernel
Cc: dapeng1.mi, ak, zide.chen, Kan Liang
From: Kan Liang <kan.liang@linux.intel.com>
The Linux perf_event subsystem needs to retrieve the current vector
registers on an overflow, and the overflow is handled in NMI context.
Add an interface to retrieve the actual register contents when the NMI
hits. It's the invoker's responsibility to make sure the contents are
properly filtered before exposing them to the end user.
The mask may change according to the end user's request. XSAVES, which
implements the modified optimization, is chosen.
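A minimal usage sketch (the real caller arrives later in the series;
this_cpu_ext_regs_buf() is a hypothetical stand-in for the per-CPU
buffer lookup):

    /* 64-byte aligned, properly initialized buffer */
    struct xregs_state *xsave = this_cpu_ext_regs_buf();

    /* NMI context only; xsaves_nmi() bails out otherwise */
    xsaves_nmi(xsave, XFEATURE_MASK_SSE);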
Suggested-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
arch/x86/include/asm/fpu/xstate.h | 1 +
arch/x86/kernel/fpu/xstate.c | 22 ++++++++++++++++++++++
2 files changed, 23 insertions(+)
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index b308a76afbb7..87c170d61138 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -106,6 +106,7 @@ extern void __init update_regset_xstate_info(unsigned int size,
int xfeature_size(int xfeature_nr);
void xsaves(struct xregs_state *xsave, u64 mask);
+void xsaves_nmi(struct xregs_state *xsave, u64 mask);
void xrstors(struct xregs_state *xsave, u64 mask);
int xfd_enable_feature(u64 xfd_err);
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 9aa9ac8399ae..5b0bae135aff 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1424,6 +1424,28 @@ void xsaves(struct xregs_state *xstate, u64 mask)
WARN_ON_ONCE(err);
}
+/**
+ * xsaves_nmi - Save selected components to a kernel xstate buffer in NMI
+ * @xstate: Pointer to the buffer
+ * @mask: Feature mask to select the components to save
+ *
+ * The @xstate buffer must be 64 byte aligned and correctly initialized as
+ * XSAVES does not write the full xstate header.
+ *
+ * This function can only be invoked in an NMI. It returns the *ACTUAL*
+ * register contents when the NMI hit.
+ */
+void xsaves_nmi(struct xregs_state *xstate, u64 mask)
+{
+ int err;
+
+ if (!in_nmi())
+ return;
+
+ XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err);
+ WARN_ON_ONCE(err);
+}
+
/**
* xrstors - Restore selected components from a kernel xstate buffer
* @xstate: Pointer to the buffer
--
2.38.1
^ permalink raw reply related [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 03/12] x86/fpu/xstate: Add xsaves_nmi
2025-06-13 13:49 ` [RFC PATCH 03/12] x86/fpu/xstate: Add xsaves_nmi kan.liang
@ 2025-06-13 14:39 ` Dave Hansen
2025-06-13 14:54 ` Liang, Kan
0 siblings, 1 reply; 60+ messages in thread
From: Dave Hansen @ 2025-06-13 14:39 UTC (permalink / raw)
To: kan.liang, peterz, mingo, acme, namhyung, tglx, dave.hansen,
irogers, adrian.hunter, jolsa, alexander.shishkin, linux-kernel
Cc: dapeng1.mi, ak, zide.chen
On 6/13/25 06:49, kan.liang@linux.intel.com wrote:
> + * This function can only be invoked in an NMI. It returns the *ACTUAL*
> + * register contents when the NMI hit.
Yes, but why is this important and what are the implications?
It's important because all of the other mechanisms that deal with xstate
are _trying_ to get something coherent. They're trying to, for instance,
poke at the PKRU register for userspace and we need to ensure that the
PKRU value that's being targeted is for the right task and is actually
in memory (if that's what we're after).
This interface is totally *in*coherent. There's no telling what was in
the registers when the NMI hit. That seems crazy compared to all the
other FPU code in the kernel. But it's actually OK for perf because
there's a separate hardware mechanism that saves XSAVE-managed state off
to memory. That mechanism also writes whatever was in the registers when
the NMI hit. It's also completely incoherent.
That's really the only reason this insanity is OK. perf can _already_
handle XSAVE "snapshots" from random code running. This just provides
another XSAVE data source at a random time.
Could we get some of that ^ into the changelog and function comment, please?
One other thing...
XSAVES uses the modified optimization. That means if you did something
like this:
NMI=>
xsaves_nmi();
<=IRET
... run a little bit in the kernel
NMI=> // another NMI
xsaves_nmi();
<=IRET
The second XSAVES might not actually write anything to the buffer
because the registers didn't change (they weren't modified). Is that OK?
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 03/12] x86/fpu/xstate: Add xsaves_nmi
2025-06-13 14:39 ` Dave Hansen
@ 2025-06-13 14:54 ` Liang, Kan
2025-06-13 15:19 ` Dave Hansen
0 siblings, 1 reply; 60+ messages in thread
From: Liang, Kan @ 2025-06-13 14:54 UTC (permalink / raw)
To: Dave Hansen, peterz, mingo, acme, namhyung, tglx, dave.hansen,
irogers, adrian.hunter, jolsa, alexander.shishkin, linux-kernel
Cc: dapeng1.mi, ak, zide.chen
On 2025-06-13 10:39 a.m., Dave Hansen wrote:
> On 6/13/25 06:49, kan.liang@linux.intel.com wrote:
>> + * This function can only be invoked in an NMI. It returns the *ACTUAL*
>> + * register contents when the NMI hit.
>
> Yes, but why is this important and what are the implications?
>
> It's important because all of the other mechanisms that deal with xstate
> are _trying_ to get something coherent. They're trying to, for instance,
> poke at the PKRU register for userspace and we need to ensure that the
> PKRU value that's being targeted is for the right task and is actually
> in memory (if that's what we're after).
>
> This interface is totally *in*coherent. There's no telling what was in
> the registers when the NMI hit. That seems crazy compared to all the
> other FPU code in the kernel. But it's actually OK for perf because
> there's a separate hardware mechanism that saves XSAVE-managed state off
> to memory. That mechanism also writes whatever was in the registers when
> the NMI hit. It's also completely incoherent.
>
> That's really the only reason this insanity is OK. perf can _already_
> handle XSAVE "snapshots" from random code running. This just provides
> another XSAVE data source at a random time.
>
> Could we get some of that ^ into the changelog and function comment, please?
Sure. Thanks for the details. I will add it in both the changelog and
function comments.
>
> One other thing...
>
> XSAVES uses the modified optimization. That means if you did something
> like this:
>
> NMI=>
> xsaves_nmi();
> <=IRET
> ... run a little bit in the kernel
> NMI=> // another NMI
> xsaves_nmi();
> <=IRET
>
> The second XSAVES might not actually write anything to the buffer
> because the registers didn't change (they weren't modified). Is that OK?
Yes. The per-cpu buffer in perf is only used by this XSAVES. No one will
clear it or modify it between the two xsaves_nmi().
Thanks,
Kan
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 03/12] x86/fpu/xstate: Add xsaves_nmi
2025-06-13 14:54 ` Liang, Kan
@ 2025-06-13 15:19 ` Dave Hansen
0 siblings, 0 replies; 60+ messages in thread
From: Dave Hansen @ 2025-06-13 15:19 UTC (permalink / raw)
To: Liang, Kan, peterz, mingo, acme, namhyung, tglx, dave.hansen,
irogers, adrian.hunter, jolsa, alexander.shishkin, linux-kernel
Cc: dapeng1.mi, ak, zide.chen
On 6/13/25 07:54, Liang, Kan wrote:
>> One other thing...
>>
>> XSAVES uses the modified optimization. That means if you did something
>> like this:
>>
>> NMI=>
>> xsaves_nmi();
>> <=IRET
>> ... run a little bit in the kernel
>> NMI=> // another NMI
>> xsaves_nmi();
>> <=IRET
>>
>> The second XSAVES might not actually write anything to the buffer
>> because the registers didn't change (they weren't modified). Is that OK?
> Yes. The per-cpu buffer in perf is only used by this XSAVES. No one will
> clear it or modify it between the two xsaves_nmi().
What prevents it from being freed, reallocated to some other user, freed
and reallocated to the per-cpu buffer? ;)
But, really, the thing that saves you here is that the modified
optimization only comes into effect if you XRSTOR from an XSAVE buffer.
This buffer is only ever XSAVES'd to.
So I think it's safe ... from the modified optimization. I'll look at
the init optimization later in the series.
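Roughly, as a sketch of the rule:

    xsaves_nmi(buf, mask);  /* NMI #1: writes all requested state */
    /* ... kernel runs, never XRSTORS from buf ... */
    xsaves_nmi(buf, mask);  /* NMI #2: may only skip unmodified state
                             * if the last XRSTORS came from this same
                             * buffer -- which never happens here */

So every xsaves_nmi() effectively behaves as a full save of the
requested components.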
^ permalink raw reply [flat|nested] 60+ messages in thread
* [RFC PATCH 04/12] perf: Move has_extended_regs() to header file
2025-06-13 13:49 [RFC PATCH 00/12] Support vector and more extended registers in perf kan.liang
` (2 preceding siblings ...)
2025-06-13 13:49 ` [RFC PATCH 03/12] x86/fpu/xstate: Add xsaves_nmi kan.liang
@ 2025-06-13 13:49 ` kan.liang
2025-06-13 13:49 ` [RFC PATCH 05/12] perf/x86: Support XMM register for non-PEBS and REGS_USER kan.liang
` (9 subsequent siblings)
13 siblings, 0 replies; 60+ messages in thread
From: kan.liang @ 2025-06-13 13:49 UTC (permalink / raw)
To: peterz, mingo, acme, namhyung, tglx, dave.hansen, irogers,
adrian.hunter, jolsa, alexander.shishkin, linux-kernel
Cc: dapeng1.mi, ak, zide.chen, Kan Liang
From: Kan Liang <kan.liang@linux.intel.com>
The function will also be used in arch-specific code.
Rename it to follow the naming convention of the existing functions.
No functional change.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
include/linux/perf_event.h | 8 ++++++++
kernel/events/core.c | 8 +-------
2 files changed, 9 insertions(+), 7 deletions(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 52dc7cfab0e0..74c188a699e4 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1488,6 +1488,14 @@ perf_event__output_id_sample(struct perf_event *event,
extern void
perf_log_lost_samples(struct perf_event *event, u64 lost);
+static inline bool event_has_extended_regs(struct perf_event *event)
+{
+ struct perf_event_attr *attr = &event->attr;
+
+ return (attr->sample_regs_user & PERF_REG_EXTENDED_MASK) ||
+ (attr->sample_regs_intr & PERF_REG_EXTENDED_MASK);
+}
+
static inline bool event_has_any_exclude_flag(struct perf_event *event)
{
struct perf_event_attr *attr = &event->attr;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index cc77f127e11a..7f0d98d73629 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -12502,12 +12502,6 @@ int perf_pmu_unregister(struct pmu *pmu)
}
EXPORT_SYMBOL_GPL(perf_pmu_unregister);
-static inline bool has_extended_regs(struct perf_event *event)
-{
- return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) ||
- (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);
-}
-
static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
{
struct perf_event_context *ctx = NULL;
@@ -12542,7 +12536,7 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
goto err_pmu;
if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
- has_extended_regs(event)) {
+ event_has_extended_regs(event)) {
ret = -EOPNOTSUPP;
goto err_destroy;
}
--
2.38.1
^ permalink raw reply related [flat|nested] 60+ messages in thread
* [RFC PATCH 05/12] perf/x86: Support XMM register for non-PEBS and REGS_USER
2025-06-13 13:49 [RFC PATCH 00/12] Support vector and more extended registers in perf kan.liang
` (3 preceding siblings ...)
2025-06-13 13:49 ` [RFC PATCH 04/12] perf: Move has_extended_regs() to header file kan.liang
@ 2025-06-13 13:49 ` kan.liang
2025-06-13 15:15 ` Dave Hansen
2025-06-13 15:34 ` Dave Hansen
2025-06-13 13:49 ` [RFC PATCH 06/12] perf: Support extension of sample_regs kan.liang
` (8 subsequent siblings)
13 siblings, 2 replies; 60+ messages in thread
From: kan.liang @ 2025-06-13 13:49 UTC (permalink / raw)
To: peterz, mingo, acme, namhyung, tglx, dave.hansen, irogers,
adrian.hunter, jolsa, alexander.shishkin, linux-kernel
Cc: dapeng1.mi, ak, zide.chen, Kan Liang
From: Kan Liang <kan.liang@linux.intel.com>
Collecting the XMM registers in a PEBS record has been supported since
Ice Lake, but non-PEBS events don't support the feature. It's possible
to retrieve the XMM registers via XSAVE for non-PEBS events. Add that
to make the support complete.
To utilize XSAVE, a 64-byte aligned buffer is required. Add a per-CPU
ext_regs_buf to store the vector registers.
Extend the support to both REGS_USER and REGS_INTR. For REGS_USER,
perf_get_regs_user() returns the regs from task_pt_regs(current),
which is a plain struct pt_regs. The contents have to be copied into
the per-CPU struct x86_perf_regs x86_user_regs.
For PEBS, the HW support is still preferred. The XMM registers should
be retrieved from PEBS records.
More vector registers may be supported later. Add ext_regs_mask to
track the supported vector register groups.
For now, the feature is only supported on newer Intel platforms (PEBS
v4+ or archPerfmonExt (0x23)). In theory, the vector registers can be
retrieved on any CPU that supports them. Support for older generations
may be added later if there is a requirement.
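For orientation, the rough call flow for a non-PEBS overflow after
this patch (all functions from this series):

    perf_event_nmi_handler()
      static_call(x86_pmu_handle_irq)(&x86_regs.regs)
        handle_pmi_common()
          x86_pmu_setup_regs_data(event, &data, regs, 0)
            x86_pmu_get_ext_regs(perf_regs, XFEATURE_MASK_SSE)
              xsaves_nmi(xsave, mask)
          perf_event_overflow(event, &data, regs)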
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
arch/x86/events/core.c | 119 ++++++++++++++++++++++++++++++-----
arch/x86/events/intel/core.c | 27 ++++++++
arch/x86/events/intel/ds.c | 10 ++-
arch/x86/events/perf_event.h | 12 +++-
4 files changed, 148 insertions(+), 20 deletions(-)
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index c601ad761534..6b1c347cc17a 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -406,6 +406,63 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
return x86_pmu_extra_regs(val, event);
}
+static DEFINE_PER_CPU(void *, ext_regs_buf);
+
+static void x86_pmu_get_ext_regs(struct x86_perf_regs *perf_regs, u64 mask)
+{
+ void *xsave = (void *)ALIGN((unsigned long)per_cpu(ext_regs_buf, smp_processor_id()), 64);
+ struct xregs_state *xregs_xsave = xsave;
+ u64 xcomp_bv;
+
+ if (WARN_ON_ONCE(!xsave))
+ return;
+
+ xsaves_nmi(xsave, mask);
+
+ xcomp_bv = xregs_xsave->header.xcomp_bv;
+ if (mask & XFEATURE_MASK_SSE && xcomp_bv & XFEATURE_SSE)
+ perf_regs->xmm_regs = (u64 *)xregs_xsave->i387.xmm_space;
+}
+
+static void release_ext_regs_buffers(void)
+{
+ int cpu;
+
+ if (!x86_pmu.ext_regs_mask)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ kfree(per_cpu(ext_regs_buf, cpu));
+ per_cpu(ext_regs_buf, cpu) = NULL;
+ }
+}
+
+static void reserve_ext_regs_buffers(void)
+{
+ size_t size;
+ int cpu;
+
+ if (!x86_pmu.ext_regs_mask)
+ return;
+
+ size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+
+ /* XSAVE feature requires 64-byte alignment. */
+ size += 64;
+
+ for_each_possible_cpu(cpu) {
+ per_cpu(ext_regs_buf, cpu) = kzalloc_node(size, GFP_KERNEL,
+ cpu_to_node(cpu));
+ if (!per_cpu(ext_regs_buf, cpu))
+ goto err;
+ }
+
+ return;
+
+err:
+ release_ext_regs_buffers();
+}
+
int x86_reserve_hardware(void)
{
int err = 0;
@@ -418,6 +475,7 @@ int x86_reserve_hardware(void)
} else {
reserve_ds_buffers();
reserve_lbr_buffers();
+ reserve_ext_regs_buffers();
}
}
if (!err)
@@ -434,6 +492,7 @@ void x86_release_hardware(void)
release_pmc_hardware();
release_ds_buffers();
release_lbr_buffers();
+ release_ext_regs_buffers();
mutex_unlock(&pmc_reserve_mutex);
}
}
@@ -642,21 +701,18 @@ int x86_pmu_hw_config(struct perf_event *event)
return -EINVAL;
}
- /* sample_regs_user never support XMM registers */
- if (unlikely(event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK))
- return -EINVAL;
- /*
- * Besides the general purpose registers, XMM registers may
- * be collected in PEBS on some platforms, e.g. Icelake
- */
- if (unlikely(event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK)) {
- if (!(event->pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS))
- return -EINVAL;
-
- if (!event->attr.precise_ip)
- return -EINVAL;
+ if (event->attr.sample_type & (PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)) {
+ /*
+ * Besides the general purpose registers, XMM registers may
+ * be collected as well.
+ */
+ if (event_has_extended_regs(event)) {
+ if (!(event->pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS))
+ return -EINVAL;
+ if (!(x86_pmu.ext_regs_mask & BIT_ULL(X86_EXT_REGS_XMM)))
+ return -EINVAL;
+ }
}
-
return x86_setup_perfctr(event);
}
@@ -1685,18 +1741,40 @@ static void x86_pmu_del(struct perf_event *event, int flags)
static_call_cond(x86_pmu_del)(event);
}
+static DEFINE_PER_CPU(struct x86_perf_regs, x86_user_regs);
+
+static struct x86_perf_regs *
+x86_pmu_perf_get_regs_user(struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct x86_perf_regs *x86_regs_user = this_cpu_ptr(&x86_user_regs);
+ struct perf_regs regs_user;
+
+ perf_get_regs_user(®s_user, regs);
+ data->regs_user.abi = regs_user.abi;
+ x86_regs_user->regs = *regs_user.regs;
+ data->regs_user.regs = &x86_regs_user->regs;
+ return x86_regs_user;
+}
+
void x86_pmu_setup_regs_data(struct perf_event *event,
struct perf_sample_data *data,
- struct pt_regs *regs)
+ struct pt_regs *regs,
+ u64 ignore_mask)
{
+ struct x86_perf_regs *perf_regs = container_of(regs, struct x86_perf_regs, regs);
u64 sample_type = event->attr.sample_type;
+ u64 mask = 0;
+
+ if (!(event->attr.sample_type & (PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)))
+ return;
if (sample_type & PERF_SAMPLE_REGS_USER) {
if (user_mode(regs)) {
data->regs_user.abi = perf_reg_abi(current);
data->regs_user.regs = regs;
} else if (!(current->flags & PF_KTHREAD)) {
- perf_get_regs_user(&data->regs_user, regs);
+ perf_regs = x86_pmu_perf_get_regs_user(data, regs);
} else {
data->regs_user.abi = PERF_SAMPLE_REGS_ABI_NONE;
data->regs_user.regs = NULL;
@@ -1715,6 +1793,15 @@ void x86_pmu_setup_regs_data(struct perf_event *event,
data->dyn_size += hweight64(event->attr.sample_regs_intr) * sizeof(u64);
data->sample_flags |= PERF_SAMPLE_REGS_INTR;
}
+
+ if (event_has_extended_regs(event)) {
+ perf_regs->xmm_regs = NULL;
+ mask |= XFEATURE_MASK_SSE;
+ }
+
+ mask &= ~ignore_mask;
+ if (mask)
+ x86_pmu_get_ext_regs(perf_regs, mask);
}
int x86_pmu_handle_irq(struct pt_regs *regs)
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index c2fb729c270e..5706ee562684 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3284,6 +3284,8 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
if (has_branch_stack(event))
intel_pmu_lbr_save_brstack(&data, cpuc, event);
+ x86_pmu_setup_regs_data(event, &data, regs, 0);
+
perf_event_overflow(event, &data, regs);
}
@@ -5272,6 +5274,29 @@ static inline bool intel_pmu_broken_perf_cap(void)
return false;
}
+static void intel_extended_regs_init(struct pmu *pmu)
+{
+ /*
+ * Extend vector register support to non-PEBS events.
+ * The feature is limited to newer Intel machines with
+ * PEBS V4+ or archPerfmonExt (0x23) enabled for now.
+ * In theory, the vector registers can be retrieved as
+ * long as the CPU supports them. Support for the old
+ * generations may be added later if there is a
+ * requirement.
+ * Only support the extension when XSAVES is available.
+ */
+ if (!boot_cpu_has(X86_FEATURE_XSAVES))
+ return;
+
+ if (!boot_cpu_has(X86_FEATURE_XMM) ||
+ !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
+ return;
+
+ x86_pmu.ext_regs_mask |= BIT_ULL(X86_EXT_REGS_XMM);
+ x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_EXTENDED_REGS;
+}
+
static void update_pmu_cap(struct pmu *pmu)
{
unsigned int cntr, fixed_cntr, ecx, edx;
@@ -5306,6 +5331,8 @@ static void update_pmu_cap(struct pmu *pmu)
/* Perf Metric (Bit 15) and PEBS via PT (Bit 16) are hybrid enumeration */
rdmsrq(MSR_IA32_PERF_CAPABILITIES, hybrid(pmu, intel_cap).capabilities);
}
+
+ intel_extended_regs_init(pmu);
}
static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu)
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index e67d8a03ddfe..ccb5c3ddab3b 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1415,8 +1415,7 @@ static u64 pebs_update_adaptive_cfg(struct perf_event *event)
if (gprs || (attr->precise_ip < 2) || tsx_weight)
pebs_data_cfg |= PEBS_DATACFG_GP;
- if ((sample_type & PERF_SAMPLE_REGS_INTR) &&
- (attr->sample_regs_intr & PERF_REG_EXTENDED_MASK))
+ if (event_has_extended_regs(event))
pebs_data_cfg |= PEBS_DATACFG_XMMS;
if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
@@ -2127,8 +2126,12 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
}
if (sample_type & (PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)) {
+ u64 mask = 0;
+
adaptive_pebs_save_regs(regs, gprs);
- x86_pmu_setup_regs_data(event, data, regs);
+ if (format_group & PEBS_DATACFG_XMMS)
+ mask |= XFEATURE_MASK_SSE;
+ x86_pmu_setup_regs_data(event, data, regs, mask);
}
}
@@ -2755,6 +2758,7 @@ void __init intel_pebs_init(void)
x86_pmu.flags |= PMU_FL_PEBS_ALL;
x86_pmu.pebs_capable = ~0ULL;
pebs_qual = "-baseline";
+ x86_pmu.ext_regs_mask |= BIT_ULL(X86_EXT_REGS_XMM);
x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_EXTENDED_REGS;
} else {
/* Only basic record supported */
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 12682a059608..b48f4215f37c 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -687,6 +687,10 @@ enum {
x86_lbr_exclusive_max,
};
+enum {
+ X86_EXT_REGS_XMM = 0,
+};
+
#define PERF_PEBS_DATA_SOURCE_MAX 0x100
#define PERF_PEBS_DATA_SOURCE_MASK (PERF_PEBS_DATA_SOURCE_MAX - 1)
#define PERF_PEBS_DATA_SOURCE_GRT_MAX 0x10
@@ -992,6 +996,11 @@ struct x86_pmu {
struct extra_reg *extra_regs;
unsigned int flags;
+ /*
+ * Extended regs, e.g., vector registers
+ */
+ u64 ext_regs_mask;
+
/*
* Intel host/guest support (KVM)
*/
@@ -1280,7 +1289,8 @@ int x86_pmu_handle_irq(struct pt_regs *regs);
void x86_pmu_setup_regs_data(struct perf_event *event,
struct perf_sample_data *data,
- struct pt_regs *regs);
+ struct pt_regs *regs,
+ u64 ignore_mask);
void x86_pmu_show_pmu_cap(struct pmu *pmu);
--
2.38.1
^ permalink raw reply related [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 05/12] perf/x86: Support XMM register for non-PEBS and REGS_USER
2025-06-13 13:49 ` [RFC PATCH 05/12] perf/x86: Support XMM register for non-PEBS and REGS_USER kan.liang
@ 2025-06-13 15:15 ` Dave Hansen
2025-06-13 17:51 ` Liang, Kan
2025-06-13 15:34 ` Dave Hansen
1 sibling, 1 reply; 60+ messages in thread
From: Dave Hansen @ 2025-06-13 15:15 UTC (permalink / raw)
To: kan.liang, peterz, mingo, acme, namhyung, tglx, dave.hansen,
irogers, adrian.hunter, jolsa, alexander.shishkin, linux-kernel
Cc: dapeng1.mi, ak, zide.chen
> +static DEFINE_PER_CPU(void *, ext_regs_buf);
This should probably use one of the types in asm/fpu/types.h, not void*.
> +static void x86_pmu_get_ext_regs(struct x86_perf_regs *perf_regs, u64 mask)
> +{
> + void *xsave = (void *)ALIGN((unsigned long)per_cpu(ext_regs_buf, smp_processor_id()), 64);
I'd just align the allocation to avoid having to align it at runtime
like this.
> + struct xregs_state *xregs_xsave = xsave;
> + u64 xcomp_bv;
> +
> + if (WARN_ON_ONCE(!xsave))
> + return;
> +
> + xsaves_nmi(xsave, mask);
> +
> + xcomp_bv = xregs_xsave->header.xcomp_bv;
> + if (mask & XFEATURE_MASK_SSE && xcomp_bv & XFEATURE_SSE)
> + perf_regs->xmm_regs = (u64 *)xregs_xsave->i387.xmm_space;
> +}
Could we please align the types on:
perf_regs->xmm_regs
and
xregs_xsave->i387.xmm_space
so that no casting is required?
> +static void reserve_ext_regs_buffers(void)
> +{
> + size_t size;
> + int cpu;
> +
> + if (!x86_pmu.ext_regs_mask)
> + return;
> +
> + size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
> +
> + /* XSAVE feature requires 64-byte alignment. */
> + size += 64;
Does this actually work? ;)
Take a look at your system when it boots. You should see some helpful
pr_info()'s:
> [ 0.137276] x86/fpu: Supporting XSAVE feature 0x001: 'x87 floating point registers'
> [ 0.138799] x86/fpu: Supporting XSAVE feature 0x002: 'SSE registers'
> [ 0.139681] x86/fpu: Supporting XSAVE feature 0x004: 'AVX registers'
> [ 0.140576] x86/fpu: Supporting XSAVE feature 0x020: 'AVX-512 opmask'
> [ 0.141569] x86/fpu: Supporting XSAVE feature 0x040: 'AVX-512 Hi256'
> [ 0.142804] x86/fpu: Supporting XSAVE feature 0x080: 'AVX-512 ZMM_Hi256'
> [ 0.143665] x86/fpu: Supporting XSAVE feature 0x200: 'Protection Keys User registers'
> [ 0.144436] x86/fpu: xstate_offset[2]: 576, xstate_sizes[2]: 256
> [ 0.145290] x86/fpu: xstate_offset[5]: 832, xstate_sizes[5]: 64
> [ 0.146238] x86/fpu: xstate_offset[6]: 896, xstate_sizes[6]: 512
> [ 0.146803] x86/fpu: xstate_offset[7]: 1408, xstate_sizes[7]: 1024
> [ 0.147397] x86/fpu: xstate_offset[9]: 2432, xstate_sizes[9]: 8
> [ 0.147986] x86/fpu: Enabled xstate features 0x2e7, context size is 2440 bytes, using 'compacted' format.
Notice that we're talking about a buffer which is ~2k in size when
AVX-512 is in play. Is 'size' above that big?
> + for_each_possible_cpu(cpu) {
> + per_cpu(ext_regs_buf, cpu) = kzalloc_node(size, GFP_KERNEL,
> + cpu_to_node(cpu));
> + if (!per_cpu(ext_regs_buf, cpu))
> + goto err;
> + }
Right now, any kmalloc() >=256b is going to be rounded up and aligned to
a power of 2 and thus also be 64b aligned although this is just an
implementation detail today. There's a _guarantee_ that all kmalloc()'s
with powers of 2 are naturally aligned and also 64b aligned.
In other words, in practice, these kzalloc_node() are 64b aligned and
rounded up to a power of 2 size.
You can *guarantee* they'll be 64b aligned by just rounding size up to
the next power of 2. This won't increase the size because they're
already being rounded up internally.
I can also grumble a little bit because this reinvents the wheel, and I
suspect it'll continue reinventing the wheel when it actually sizes the
buffer correctly.
We already have code in the kernel to dynamically allocate an fpstate:
fpstate_realloc(). It uses vmalloc() which wouldn't be my first choice
for this, but I also don't think it will hurt much. Looking at it, I'm
not sure how much of it you want to refactor and reuse, but you should
at least take a look.
There's also xstate_calculate_size(). That, you _definitely_ want to use
if you end up doing your own allocations.
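Something in this direction, as a sketch (ext_regs_xfeatures is a
placeholder for whatever XFEATURE mask perf ends up wanting):

    size = xstate_calculate_size(ext_regs_xfeatures, true); /* compacted */
    size = roundup_pow_of_two(size); /* power-of-2 kmalloc()s are
                                      * naturally aligned => 64b aligned */
    buf = kzalloc_node(size, GFP_KERNEL, cpu_to_node(cpu));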
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 05/12] perf/x86: Support XMM register for non-PEBS and REGS_USER
2025-06-13 15:15 ` Dave Hansen
@ 2025-06-13 17:51 ` Liang, Kan
0 siblings, 0 replies; 60+ messages in thread
From: Liang, Kan @ 2025-06-13 17:51 UTC (permalink / raw)
To: Dave Hansen, peterz, mingo, acme, namhyung, tglx, dave.hansen,
irogers, adrian.hunter, jolsa, alexander.shishkin, linux-kernel
Cc: dapeng1.mi, ak, zide.chen
On 2025-06-13 11:15 a.m., Dave Hansen wrote:
>> +static DEFINE_PER_CPU(void *, ext_regs_buf);
>
> This should probably use one of the types in asm/fpu/types.h, not void*.
>
>> +static void x86_pmu_get_ext_regs(struct x86_perf_regs *perf_regs, u64 mask)
>> +{
>> + void *xsave = (void *)ALIGN((unsigned long)per_cpu(ext_regs_buf, smp_processor_id()), 64);
>
> I'd just align the allocation to avoid having to align it at runtime
> like this.
>
>> + struct xregs_state *xregs_xsave = xsave;
>> + u64 xcomp_bv;
>> +
>> + if (WARN_ON_ONCE(!xsave))
>> + return;
>> +
>> + xsaves_nmi(xsave, mask);
>> +
>> + xcomp_bv = xregs_xsave->header.xcomp_bv;
>> + if (mask & XFEATURE_MASK_SSE && xcomp_bv & XFEATURE_SSE)
>> + perf_regs->xmm_regs = (u64 *)xregs_xsave->i387.xmm_space;
>> +}
>
> Could we please align the types on:
>
> perf_regs->xmm_regs
> and
> xregs_xsave->i387.xmm_space
>
> so that no casting is required?
>
>> +static void reserve_ext_regs_buffers(void)
>> +{
>> + size_t size;
>> + int cpu;
>> +
>> + if (!x86_pmu.ext_regs_mask)
>> + return;
>> +
>> + size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
>> +
>> + /* XSAVE feature requires 64-byte alignment. */
>> + size += 64;
>
> Does this actually work? ;)
>
> Take a look at your system when it boots. You should see some helpful
> pr_info()'s:
>
>> [ 0.137276] x86/fpu: Supporting XSAVE feature 0x001: 'x87 floating point registers'
>> [ 0.138799] x86/fpu: Supporting XSAVE feature 0x002: 'SSE registers'
>> [ 0.139681] x86/fpu: Supporting XSAVE feature 0x004: 'AVX registers'
>> [ 0.140576] x86/fpu: Supporting XSAVE feature 0x020: 'AVX-512 opmask'
>> [ 0.141569] x86/fpu: Supporting XSAVE feature 0x040: 'AVX-512 Hi256'
>> [ 0.142804] x86/fpu: Supporting XSAVE feature 0x080: 'AVX-512 ZMM_Hi256'
>> [ 0.143665] x86/fpu: Supporting XSAVE feature 0x200: 'Protection Keys User registers'
>> [ 0.144436] x86/fpu: xstate_offset[2]: 576, xstate_sizes[2]: 256
>> [ 0.145290] x86/fpu: xstate_offset[5]: 832, xstate_sizes[5]: 64
>> [ 0.146238] x86/fpu: xstate_offset[6]: 896, xstate_sizes[6]: 512
>> [ 0.146803] x86/fpu: xstate_offset[7]: 1408, xstate_sizes[7]: 1024
>> [ 0.147397] x86/fpu: xstate_offset[9]: 2432, xstate_sizes[9]: 8
>> [ 0.147986] x86/fpu: Enabled xstate features 0x2e7, context size is 2440 bytes, using 'compacted' format.
>
> Notice that we're talking about a buffer which is ~2k in size when
> AVX-512 is in play. Is 'size' above that big?
>
>> + for_each_possible_cpu(cpu) {
>> + per_cpu(ext_regs_buf, cpu) = kzalloc_node(size, GFP_KERNEL,
>> + cpu_to_node(cpu));
>> + if (!per_cpu(ext_regs_buf, cpu))
>> + goto err;
>> + }
>
> Right now, any kmalloc() >=256b is going to be rounded up and aligned to
> a power of 2 and thus also be 64b aligned although this is just an
> implementation detail today. There's a _guarantee_ that all kmalloc()'s
> with powers of 2 are naturally aligned and also 64b aligned.
>
> In other words, in practice, these kzalloc_node() are 64b aligned and
> rounded up to a power of 2 size.
>
> You can *guarantee* they'll be 64b aligned by just rounding size up to
> the next power of 2. This won't increase the size because they're
> already being rounded up internally.
>
> I can also grumble a little bit because this reinvents the wheel, and I
> suspect it'll continue reinventing the wheel when it actually sizes the
> buffer correctly.
>
> We already have code in the kernel to dynamically allocate an fpstate:
> fpstate_realloc(). It uses vmalloc() which wouldn't be my first choice
> for this, but I also don't think it will hurt much. Looking at it, I'm
> not sure how much of it you want to refactor and reuse, but you should
> at least take a look.
>
> There's also xstate_calculate_size(). That, you _definitely_ want to use
> if you end up doing your own allocations.
>
The fpstate_realloc() seems too complicated for this simple usage.
I will use xstate_calculate_size() to get the size of each component;
it returns the real size with the 64-byte alignment applied per
component. Then vmalloc() is used to allocate the buffer. I think that
should be good enough for this usage.
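Roughly (sketch; the mask name is a placeholder):

    size = xstate_calculate_size(ext_regs_xfeatures, true);
    per_cpu(ext_regs_buf, cpu) = vzalloc_node(size, cpu_to_node(cpu));
    /* vmalloc memory is page aligned, so the 64-byte XSAVE
     * alignment requirement is met for free */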
Thanks,
Kan
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 05/12] perf/x86: Support XMM register for non-PEBS and REGS_USER
2025-06-13 13:49 ` [RFC PATCH 05/12] perf/x86: Support XMM register for non-PEBS and REGS_USER kan.liang
2025-06-13 15:15 ` Dave Hansen
@ 2025-06-13 15:34 ` Dave Hansen
2025-06-13 18:14 ` Liang, Kan
1 sibling, 1 reply; 60+ messages in thread
From: Dave Hansen @ 2025-06-13 15:34 UTC (permalink / raw)
To: kan.liang, peterz, mingo, acme, namhyung, tglx, dave.hansen,
irogers, adrian.hunter, jolsa, alexander.shishkin, linux-kernel
Cc: dapeng1.mi, ak, zide.chen
On 6/13/25 06:49, kan.liang@linux.intel.com wrote:
> +static void x86_pmu_get_ext_regs(struct x86_perf_regs *perf_regs, u64 mask)
> +{
> + void *xsave = (void *)ALIGN((unsigned long)per_cpu(ext_regs_buf, smp_processor_id()), 64);
> + struct xregs_state *xregs_xsave = xsave;
> + u64 xcomp_bv;
> +
> + if (WARN_ON_ONCE(!xsave))
> + return;
> +
> + xsaves_nmi(xsave, mask);
> +
> + xcomp_bv = xregs_xsave->header.xcomp_bv;
> + if (mask & XFEATURE_MASK_SSE && xcomp_bv & XFEATURE_SSE)
> + perf_regs->xmm_regs = (u64 *)xregs_xsave->i387.xmm_space;
> +}
Now that I'm thinking about the init optimization... This is buggy.
Isn't XSAVE fun?
Here's a little primer:
xcomp_bv - tells you what the format of the buffer is.
Which states are where.
xstate_bv - (aka. xfeatures) tells you which things XSAVES
wrote to the buffer.
It's totally valid to have a feature set in xcomp_bv but not xstate_bv.
xcomp_bv is actually pretty boring:
The XSAVES instruction sets bit 63 of the XCOMP_BV field of the
XSAVE header while writing RFBM[62:0] to XCOMP_BV[62:0]
Since you know the RFBM, you also know xcomp_bv. You don't need to read
it out of the buffer even.
Oh, and what's with the:
xcomp_bv & XFEATURE_SSE
? xcomp_bv is a bitmap, just like 'mask'
So, what do you do when
if (!(xregs_xsave->header.xfeatures & XFEATURE_MASK_SSE))
... here?
The "XSAVE-Enabled Registers Group" docs say:
The first eight bytes include the XSAVES instruction’s XSTATE_BV
bit vector (reflecting INIT optimization). This field
is in XCR0 format.
So the PEBS parsing code has to know how to deal with this situation too
and not copy the xmm_regs out to users.
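i.e., something like this instead of the xcomp_bv check (sketch):

    xsaves_nmi(xsave, mask);

    /* xstate_bv (header.xfeatures) reflects the init optimization: a
     * clear bit means the component was in its init state and XSAVES
     * did not write it to the buffer */
    if ((mask & XFEATURE_MASK_SSE) &&
        (xregs_xsave->header.xfeatures & XFEATURE_MASK_SSE))
            perf_regs->xmm_regs = (u64 *)xregs_xsave->i387.xmm_space;
    else
            perf_regs->xmm_regs = NULL; /* init state: don't copy out */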
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 05/12] perf/x86: Support XMM register for non-PEBS and REGS_USER
2025-06-13 15:34 ` Dave Hansen
@ 2025-06-13 18:14 ` Liang, Kan
0 siblings, 0 replies; 60+ messages in thread
From: Liang, Kan @ 2025-06-13 18:14 UTC (permalink / raw)
To: Dave Hansen, peterz, mingo, acme, namhyung, tglx, dave.hansen,
irogers, adrian.hunter, jolsa, alexander.shishkin, linux-kernel
Cc: dapeng1.mi, ak, zide.chen
On 2025-06-13 11:34 a.m., Dave Hansen wrote:
> On 6/13/25 06:49, kan.liang@linux.intel.com wrote:
>> +static void x86_pmu_get_ext_regs(struct x86_perf_regs *perf_regs, u64 mask)
>> +{
>> + void *xsave = (void *)ALIGN((unsigned long)per_cpu(ext_regs_buf, smp_processor_id()), 64);
>> + struct xregs_state *xregs_xsave = xsave;
>> + u64 xcomp_bv;
>> +
>> + if (WARN_ON_ONCE(!xsave))
>> + return;
>> +
>> + xsaves_nmi(xsave, mask);
>> +
>> + xcomp_bv = xregs_xsave->header.xcomp_bv;
>> + if (mask & XFEATURE_MASK_SSE && xcomp_bv & XFEATURE_SSE)
>> + perf_regs->xmm_regs = (u64 *)xregs_xsave->i387.xmm_space;
>> +}
>
> Now that I'm thinking about the init optimization... This is buggy.
>
> Isn't XSAVE fun?
>
> Here's a little primer:
>
> xcomp_bv - tells you what the format of the buffer is.
> Which states are where.
> xstate_bv - (aka. xfeatures) tells you which things XSAVES
> wrote to the buffer.
I got the definitions of the two reversed. :(
>
> It's totally valid to have a feature set in xcomp_bv but not xstate_bv.
>
> xcomp_bv is actually pretty boring:
>
> The XSAVES instructions sets bit 63 of the XCOMP_BV field of the
> XSAVE header while writing RFBM[62:0] to XCOMP_BV[62:0]
>
> Since you know the RFBM, you also know xstate_bv. You don't need to read
> it out of the buffer even.
>
> Oh, and what's with the:
>
> xcomp_bv & XFEATURE_SSE
>
> ? xcomp_bv is a bitmap, just like 'mask'
>
> So, what do you do when
>
> if (!(xregs_xsave->header.xfeatures & XFEATURE_MASK_SSE))
> ... here?
>
> The "XSAVE-Enabled Registers Group" docs say:
>
> The first eight bytes include the XSAVES instruction’s XSTATE_BV
> bit vector (reflecting INIT optimization). This field
> is in XCR0 format.
>
> So the PEBS parsing code has to know how to deal with this situation too
> and not copy the xmm_regs out to users.
Today, perf copies all zeros to users if !perf_regs->xmm_regs. But 0
is also a valid value for the XMM registers. Perf probably needs to
dump a bitmap which indicates what regs were really collected. It may
be slightly different from the requested bitmap because of the above
case.
Thanks,
Kan
^ permalink raw reply [flat|nested] 60+ messages in thread
* [RFC PATCH 06/12] perf: Support extension of sample_regs
2025-06-13 13:49 [RFC PATCH 00/12] Support vector and more extended registers in perf kan.liang
` (4 preceding siblings ...)
2025-06-13 13:49 ` [RFC PATCH 05/12] perf/x86: Support XMM register for non-PEBS and REGS_USER kan.liang
@ 2025-06-13 13:49 ` kan.liang
2025-06-17 8:00 ` Mi, Dapeng
2025-06-17 8:14 ` Peter Zijlstra
2025-06-13 13:49 ` [RFC PATCH 07/12] perf/x86: Add YMMH in extended regs kan.liang
` (7 subsequent siblings)
13 siblings, 2 replies; 60+ messages in thread
From: kan.liang @ 2025-06-13 13:49 UTC (permalink / raw)
To: peterz, mingo, acme, namhyung, tglx, dave.hansen, irogers,
adrian.hunter, jolsa, alexander.shishkin, linux-kernel
Cc: dapeng1.mi, ak, zide.chen, Kan Liang
From: Kan Liang <kan.liang@linux.intel.com>
More registers may be required in a sample, e.g., the vector
registers, but the current sample_regs_XXX fields have run out of
space.
Add sample_ext_regs_intr/user[2] to struct perf_event_attr, used as a
bitmap for the extension regs, since more than 64 registers will be
added.
Add a new flag, PERF_PMU_CAP_EXTENDED_REGS2, to indicate a PMU that
supports sample_ext_regs_intr/user.
Extend perf_reg_validate() to support validation of the extension
regs.
Extend perf_reg_value() to retrieve the extension regs. A reg may be
larger than u64, so add two parameters to return the pointer and size.
Add a dedicated perf_output_sample_ext_regs() to dump the extension
regs.
This is just the generic support for the extension regs. Any attempt
to manipulate the extension regs will error out until the
driver-specific support is implemented, which is done in the following
patches.
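From the user's point of view, the new fields are set like this
(sketch; the actual extension-register bit assignments only arrive
with the later driver patches):

    struct perf_event_attr attr = {
            .type        = PERF_TYPE_HARDWARE,
            .config      = PERF_COUNT_HW_CPU_CYCLES,
            .size        = PERF_ATTR_SIZE_VER9, /* 168 */
            .sample_type = PERF_SAMPLE_REGS_INTR,
    };
    attr.sample_ext_regs_intr[0] = 0; /* arch-defined bits, added later */

Opening the event on a PMU without PERF_PMU_CAP_EXTENDED_REGS2 fails
with -EOPNOTSUPP if any sample_ext_regs_intr/user bit is set.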
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
arch/arm/kernel/perf_regs.c | 9 +++--
arch/arm64/kernel/perf_regs.c | 9 +++--
arch/csky/kernel/perf_regs.c | 9 +++--
arch/loongarch/kernel/perf_regs.c | 8 +++--
arch/mips/kernel/perf_regs.c | 9 +++--
arch/powerpc/perf/perf_regs.c | 9 +++--
arch/riscv/kernel/perf_regs.c | 8 +++--
arch/s390/kernel/perf_regs.c | 9 +++--
arch/x86/kernel/perf_regs.c | 13 ++++++--
include/linux/perf_event.h | 15 +++++++++
include/linux/perf_regs.h | 29 +++++++++++++---
include/uapi/linux/perf_event.h | 8 +++++
kernel/events/core.c | 55 ++++++++++++++++++++++++++++---
13 files changed, 162 insertions(+), 28 deletions(-)
diff --git a/arch/arm/kernel/perf_regs.c b/arch/arm/kernel/perf_regs.c
index 0529f90395c9..b6161c30bd40 100644
--- a/arch/arm/kernel/perf_regs.c
+++ b/arch/arm/kernel/perf_regs.c
@@ -8,8 +8,10 @@
#include <asm/perf_regs.h>
#include <asm/ptrace.h>
-u64 perf_reg_value(struct pt_regs *regs, int idx)
+u64 perf_reg_value(struct pt_regs *regs, int idx, u64 *ext, int *ext_size)
{
+ if (WARN_ON_ONCE(ext || ext_size))
+ return 0;
if (WARN_ON_ONCE((u32)idx >= PERF_REG_ARM_MAX))
return 0;
@@ -18,8 +20,11 @@ u64 perf_reg_value(struct pt_regs *regs, int idx)
#define REG_RESERVED (~((1ULL << PERF_REG_ARM_MAX) - 1))
-int perf_reg_validate(u64 mask)
+int perf_reg_validate(u64 mask, u64 *mask_ext)
{
+ if (mask_ext)
+ return -EINVAL;
+
if (!mask || mask & REG_RESERVED)
return -EINVAL;
diff --git a/arch/arm64/kernel/perf_regs.c b/arch/arm64/kernel/perf_regs.c
index b4eece3eb17d..668b54a7faf9 100644
--- a/arch/arm64/kernel/perf_regs.c
+++ b/arch/arm64/kernel/perf_regs.c
@@ -27,8 +27,10 @@ static u64 perf_ext_regs_value(int idx)
}
}
-u64 perf_reg_value(struct pt_regs *regs, int idx)
+u64 perf_reg_value(struct pt_regs *regs, int idx, u64 *ext, int *ext_size)
{
+ if (WARN_ON_ONCE(ext || ext_size))
+ return 0;
if (WARN_ON_ONCE((u32)idx >= PERF_REG_ARM64_EXTENDED_MAX))
return 0;
@@ -77,10 +79,13 @@ u64 perf_reg_value(struct pt_regs *regs, int idx)
#define REG_RESERVED (~((1ULL << PERF_REG_ARM64_MAX) - 1))
-int perf_reg_validate(u64 mask)
+int perf_reg_validate(u64 mask, u64 *mask_ext)
{
u64 reserved_mask = REG_RESERVED;
+ if (mask_ext)
+ return -EINVAL;
+
if (system_supports_sve())
reserved_mask &= ~(1ULL << PERF_REG_ARM64_VG);
diff --git a/arch/csky/kernel/perf_regs.c b/arch/csky/kernel/perf_regs.c
index 09b7f88a2d6a..5988ef55bf0a 100644
--- a/arch/csky/kernel/perf_regs.c
+++ b/arch/csky/kernel/perf_regs.c
@@ -8,8 +8,10 @@
#include <asm/perf_regs.h>
#include <asm/ptrace.h>
-u64 perf_reg_value(struct pt_regs *regs, int idx)
+u64 perf_reg_value(struct pt_regs *regs, int idx, u64 *ext, int *ext_size)
{
+ if (WARN_ON_ONCE(ext || ext_size))
+ return 0;
if (WARN_ON_ONCE((u32)idx >= PERF_REG_CSKY_MAX))
return 0;
@@ -18,8 +20,11 @@ u64 perf_reg_value(struct pt_regs *regs, int idx)
#define REG_RESERVED (~((1ULL << PERF_REG_CSKY_MAX) - 1))
-int perf_reg_validate(u64 mask)
+int perf_reg_validate(u64 mask, u64 *mask_ext)
{
+ if (mask_ext)
+ return -EINVAL;
+
if (!mask || mask & REG_RESERVED)
return -EINVAL;
diff --git a/arch/loongarch/kernel/perf_regs.c b/arch/loongarch/kernel/perf_regs.c
index 263ac4ab5af6..798dadee75ff 100644
--- a/arch/loongarch/kernel/perf_regs.c
+++ b/arch/loongarch/kernel/perf_regs.c
@@ -25,8 +25,10 @@ u64 perf_reg_abi(struct task_struct *tsk)
}
#endif /* CONFIG_32BIT */
-int perf_reg_validate(u64 mask)
+int perf_reg_validate(u64 mask, u64 *mask_ext)
{
+ if (mask_ext)
+ return -EINVAL;
if (!mask)
return -EINVAL;
if (mask & ~((1ull << PERF_REG_LOONGARCH_MAX) - 1))
@@ -34,8 +36,10 @@ int perf_reg_validate(u64 mask)
return 0;
}
-u64 perf_reg_value(struct pt_regs *regs, int idx)
+u64 perf_reg_value(struct pt_regs *regs, int idx, u64 *ext, int *ext_size)
{
+ if (WARN_ON_ONCE(ext || ext_size))
+ return 0;
if (WARN_ON_ONCE((u32)idx >= PERF_REG_LOONGARCH_MAX))
return 0;
diff --git a/arch/mips/kernel/perf_regs.c b/arch/mips/kernel/perf_regs.c
index e686780d1647..f3fcbf7e5aa6 100644
--- a/arch/mips/kernel/perf_regs.c
+++ b/arch/mips/kernel/perf_regs.c
@@ -28,8 +28,10 @@ u64 perf_reg_abi(struct task_struct *tsk)
}
#endif /* CONFIG_32BIT */
-int perf_reg_validate(u64 mask)
+int perf_reg_validate(u64 mask, u64 *mask_ext)
{
+ if (mask_ext)
+ return -EINVAL;
if (!mask)
return -EINVAL;
if (mask & ~((1ull << PERF_REG_MIPS_MAX) - 1))
@@ -37,10 +39,13 @@ int perf_reg_validate(u64 mask)
return 0;
}
-u64 perf_reg_value(struct pt_regs *regs, int idx)
+u64 perf_reg_value(struct pt_regs *regs, int idx, u64 *ext, int *ext_size)
{
long v;
+ if (WARN_ON_ONCE(ext || ext_size))
+ return 0;
+
switch (idx) {
case PERF_REG_MIPS_PC:
v = regs->cp0_epc;
diff --git a/arch/powerpc/perf/perf_regs.c b/arch/powerpc/perf/perf_regs.c
index 350dccb0143c..556466409c76 100644
--- a/arch/powerpc/perf/perf_regs.c
+++ b/arch/powerpc/perf/perf_regs.c
@@ -99,8 +99,11 @@ static u64 get_ext_regs_value(int idx)
}
}
-u64 perf_reg_value(struct pt_regs *regs, int idx)
+u64 perf_reg_value(struct pt_regs *regs, int idx, u64 *ext, int *ext_size)
{
+ if (WARN_ON_ONCE(ext || ext_size))
+ return 0;
+
if (idx == PERF_REG_POWERPC_SIER &&
(IS_ENABLED(CONFIG_FSL_EMB_PERF_EVENT) ||
IS_ENABLED(CONFIG_PPC32) ||
@@ -125,8 +128,10 @@ u64 perf_reg_value(struct pt_regs *regs, int idx)
return regs_get_register(regs, pt_regs_offset[idx]);
}
-int perf_reg_validate(u64 mask)
+int perf_reg_validate(u64 mask, u64 *mask_ext)
{
+ if (mask_ext)
+ return -EINVAL;
if (!mask || mask & REG_RESERVED)
return -EINVAL;
return 0;
diff --git a/arch/riscv/kernel/perf_regs.c b/arch/riscv/kernel/perf_regs.c
index fd304a248de6..05a4f1e7b243 100644
--- a/arch/riscv/kernel/perf_regs.c
+++ b/arch/riscv/kernel/perf_regs.c
@@ -8,8 +8,10 @@
#include <asm/perf_regs.h>
#include <asm/ptrace.h>
-u64 perf_reg_value(struct pt_regs *regs, int idx)
+u64 perf_reg_value(struct pt_regs *regs, int idx, u64 *ext, int *ext_size)
{
+ if (WARN_ON_ONCE(ext || ext_size))
+ return 0;
if (WARN_ON_ONCE((u32)idx >= PERF_REG_RISCV_MAX))
return 0;
@@ -18,8 +20,10 @@ u64 perf_reg_value(struct pt_regs *regs, int idx)
#define REG_RESERVED (~((1ULL << PERF_REG_RISCV_MAX) - 1))
-int perf_reg_validate(u64 mask)
+int perf_reg_validate(u64 mask, u64 *mask_ext)
{
+ if (mask_ext)
+ return -EINVAL;
if (!mask || mask & REG_RESERVED)
return -EINVAL;
diff --git a/arch/s390/kernel/perf_regs.c b/arch/s390/kernel/perf_regs.c
index a6b058ee4a36..2e17ae51279e 100644
--- a/arch/s390/kernel/perf_regs.c
+++ b/arch/s390/kernel/perf_regs.c
@@ -7,10 +7,13 @@
#include <asm/ptrace.h>
#include <asm/fpu.h>
-u64 perf_reg_value(struct pt_regs *regs, int idx)
+u64 perf_reg_value(struct pt_regs *regs, int idx, u64 *ext, int *ext_size)
{
freg_t fp;
+ if (WARN_ON_ONCE(ext || ext_size))
+ return 0;
+
if (idx >= PERF_REG_S390_R0 && idx <= PERF_REG_S390_R15)
return regs->gprs[idx];
@@ -34,8 +37,10 @@ u64 perf_reg_value(struct pt_regs *regs, int idx)
#define REG_RESERVED (~((1UL << PERF_REG_S390_MAX) - 1))
-int perf_reg_validate(u64 mask)
+int perf_reg_validate(u64 mask, u64 *mask_ext)
{
+ if (mask_ext)
+ return -EINVAL;
if (!mask || mask & REG_RESERVED)
return -EINVAL;
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
index 624703af80a1..b9d5106afc26 100644
--- a/arch/x86/kernel/perf_regs.c
+++ b/arch/x86/kernel/perf_regs.c
@@ -57,10 +57,13 @@ static unsigned int pt_regs_offset[PERF_REG_X86_MAX] = {
#endif
};
-u64 perf_reg_value(struct pt_regs *regs, int idx)
+u64 perf_reg_value(struct pt_regs *regs, int idx, u64 *ext, int *ext_size)
{
struct x86_perf_regs *perf_regs;
+ if (WARN_ON_ONCE(ext || ext_size))
+ return 0;
+
if (idx >= PERF_REG_X86_XMM0 && idx < PERF_REG_X86_XMM_MAX) {
perf_regs = container_of(regs, struct x86_perf_regs, regs);
if (!perf_regs->xmm_regs)
@@ -87,8 +90,10 @@ u64 perf_reg_value(struct pt_regs *regs, int idx)
(1ULL << PERF_REG_X86_R14) | \
(1ULL << PERF_REG_X86_R15))
-int perf_reg_validate(u64 mask)
+int perf_reg_validate(u64 mask, u64 *mask_ext)
{
+ if (mask_ext)
+ return -EINVAL;
if (!mask || (mask & (REG_NOSUPPORT | PERF_REG_X86_RESERVED)))
return -EINVAL;
@@ -112,8 +117,10 @@ void perf_get_regs_user(struct perf_regs *regs_user,
(1ULL << PERF_REG_X86_FS) | \
(1ULL << PERF_REG_X86_GS))
-int perf_reg_validate(u64 mask)
+int perf_reg_validate(u64 mask, u64 *mask_ext)
{
+ if (mask_ext)
+ return -EINVAL;
if (!mask || (mask & (REG_NOSUPPORT | PERF_REG_X86_RESERVED)))
return -EINVAL;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 74c188a699e4..42b288ab4d2c 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -305,6 +305,7 @@ struct perf_event_pmu_context;
#define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100
#define PERF_PMU_CAP_AUX_PAUSE 0x0200
#define PERF_PMU_CAP_AUX_PREFER_LARGE 0x0400
+#define PERF_PMU_CAP_EXTENDED_REGS2 0x0800 /* sample_ext_regs_intr/user */
/**
* pmu::scope
@@ -1496,6 +1497,20 @@ static inline bool event_has_extended_regs(struct perf_event *event)
(attr->sample_regs_intr & PERF_REG_EXTENDED_MASK);
}
+static inline bool event_has_extended_regs2(struct perf_event *event)
+{
+ struct perf_event_attr *attr = &event->attr;
+ int i;
+
+ for (i = 0; i < PERF_ATTR_EXT_REGS_SIZE; i++) {
+ if (attr->sample_ext_regs_intr[i] ||
+ attr->sample_ext_regs_user[i])
+ return true;
+ }
+
+ return false;
+}
+
static inline bool event_has_any_exclude_flag(struct perf_event *event)
{
struct perf_event_attr *attr = &event->attr;
diff --git a/include/linux/perf_regs.h b/include/linux/perf_regs.h
index f632c5725f16..6119bcb010fb 100644
--- a/include/linux/perf_regs.h
+++ b/include/linux/perf_regs.h
@@ -16,23 +16,42 @@ struct perf_regs {
#define PERF_REG_EXTENDED_MASK 0
#endif
-u64 perf_reg_value(struct pt_regs *regs, int idx);
-int perf_reg_validate(u64 mask);
+#define PERF_EXT_REGS_SIZE_MAX 8
+
+/**
+ * perf_reg_value - Get a reg value
+ * @regs: The area where all the registers are stored
+ * @idx: The index of the requested register.
+ * Whether the index refers to a regular or to an
+ * extension register is indicated by @ext below.
+ * @ext: Pointer to the buffer which receives the
+ * value of the requested extension register.
+ * NULL means a regular register is requested.
+ * @ext_size: Size of the extension register.
+ *
+ * On failure, 0 is returned.
+ * On success, the value of the register is returned
+ * for a regular register (!ext); for an extension
+ * register (ext), ext[0] is returned.
+ */
+u64 perf_reg_value(struct pt_regs *regs, int idx, u64 *ext, int *ext_size);
+int perf_reg_validate(u64 mask, u64 *mask_ext);
u64 perf_reg_abi(struct task_struct *task);
void perf_get_regs_user(struct perf_regs *regs_user,
struct pt_regs *regs);
#else
#define PERF_REG_EXTENDED_MASK 0
+#define PERF_EXT_REGS_SIZE_MAX 8
-static inline u64 perf_reg_value(struct pt_regs *regs, int idx)
+static inline u64 perf_reg_value(struct pt_regs *regs, int idx, u64 *ext, int *ext_size)
{
return 0;
}
-static inline int perf_reg_validate(u64 mask)
+static inline int perf_reg_validate(u64 mask, u64 *mask_ext)
{
- return mask ? -ENOSYS : 0;
+ return mask || mask_ext ? -ENOSYS : 0;
}
static inline u64 perf_reg_abi(struct task_struct *task)
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 78a362b80027..e22ba72efcdb 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -382,6 +382,10 @@ enum perf_event_read_format {
#define PERF_ATTR_SIZE_VER6 120 /* Add: aux_sample_size */
#define PERF_ATTR_SIZE_VER7 128 /* Add: sig_data */
#define PERF_ATTR_SIZE_VER8 136 /* Add: config3 */
+#define PERF_ATTR_SIZE_VER9 168 /* Add: sample_ext_regs_intr */
+ /* Add: sample_ext_regs_user */
+
+#define PERF_ATTR_EXT_REGS_SIZE 2
/*
* 'struct perf_event_attr' contains various attributes that define
@@ -543,6 +547,10 @@ struct perf_event_attr {
__u64 sig_data;
__u64 config3; /* extension of config2 */
+
+ /* extension of sample_regs_XXX */
+ __u64 sample_ext_regs_intr[PERF_ATTR_EXT_REGS_SIZE];
+ __u64 sample_ext_regs_user[PERF_ATTR_EXT_REGS_SIZE];
};
/*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7f0d98d73629..c4279e1bf91a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7385,11 +7385,40 @@ perf_output_sample_regs(struct perf_output_handle *handle,
for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
u64 val;
- val = perf_reg_value(regs, bit);
+ val = perf_reg_value(regs, bit, NULL, NULL);
perf_output_put(handle, val);
}
}
+static void
+__perf_output_sample_ext_regs(struct perf_output_handle *handle,
+ struct pt_regs *regs, u64 mask, int base)
+{
+ u64 val[PERF_EXT_REGS_SIZE_MAX];
+ int i, bit, size = 0;
+ DECLARE_BITMAP(_mask, 64);
+
+ if (!mask)
+ return;
+ bitmap_from_u64(_mask, mask);
+ for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
+ perf_reg_value(regs, bit + base, val, &size);
+
+ for (i = 0; i < size; i++)
+ perf_output_put(handle, val[i]);
+ }
+}
+
+static void
+perf_output_sample_ext_regs(struct perf_output_handle *handle,
+ struct pt_regs *regs, u64 *mask)
+{
+ int i;
+
+ for (i = 0; i < PERF_ATTR_EXT_REGS_SIZE; i++)
+ __perf_output_sample_ext_regs(handle, regs, mask[i], i * 64);
+}
+
static void perf_sample_regs_user(struct perf_regs *regs_user,
struct pt_regs *regs)
{
@@ -7940,9 +7969,14 @@ void perf_output_sample(struct perf_output_handle *handle,
if (abi) {
u64 mask = event->attr.sample_regs_user;
+ u64 *ext_mask = event->attr.sample_ext_regs_user;
+
perf_output_sample_regs(handle,
data->regs_user.regs,
mask);
+ perf_output_sample_ext_regs(handle,
+ data->regs_user.regs,
+ ext_mask);
}
}
@@ -7971,10 +8005,14 @@ void perf_output_sample(struct perf_output_handle *handle,
if (abi) {
u64 mask = event->attr.sample_regs_intr;
+ u64 *ext_mask = event->attr.sample_ext_regs_intr;
perf_output_sample_regs(handle,
data->regs_intr.regs,
mask);
+ perf_output_sample_ext_regs(handle,
+ data->regs_intr.regs,
+ ext_mask);
}
}
@@ -12535,6 +12573,12 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
if (ret)
goto err_pmu;
+ if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS2) &&
+ event_has_extended_regs2(event)) {
+ ret = -EOPNOTSUPP;
+ goto err_destroy;
+ }
+
if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
event_has_extended_regs(event)) {
ret = -EOPNOTSUPP;
@@ -13073,7 +13117,8 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
}
if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
- ret = perf_reg_validate(attr->sample_regs_user);
+ ret = perf_reg_validate(attr->sample_regs_user,
+ attr->sample_ext_regs_user);
if (ret)
return ret;
}
@@ -13096,8 +13141,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
if (!attr->sample_max_stack)
attr->sample_max_stack = sysctl_perf_event_max_stack;
- if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
- ret = perf_reg_validate(attr->sample_regs_intr);
+ if (attr->sample_type & PERF_SAMPLE_REGS_INTR) {
+ ret = perf_reg_validate(attr->sample_regs_intr,
+ attr->sample_ext_regs_intr);
+ }
#ifndef CONFIG_CGROUP_PERF
if (attr->sample_type & PERF_SAMPLE_CGROUP)
--
2.38.1
^ permalink raw reply related [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 06/12] perf: Support extension of sample_regs
2025-06-13 13:49 ` [RFC PATCH 06/12] perf: Support extension of sample_regs kan.liang
@ 2025-06-17 8:00 ` Mi, Dapeng
2025-06-17 8:14 ` Peter Zijlstra
1 sibling, 0 replies; 60+ messages in thread
From: Mi, Dapeng @ 2025-06-17 8:00 UTC (permalink / raw)
To: kan.liang, peterz, mingo, acme, namhyung, tglx, dave.hansen,
irogers, adrian.hunter, jolsa, alexander.shishkin, linux-kernel
Cc: ak, zide.chen
On 6/13/2025 9:49 PM, kan.liang@linux.intel.com wrote:
> From: Kan Liang <kan.liang@linux.intel.com>
>
> More regs may be required in a sample, e.g., the vector registers. The
> current sample_regs_XXX has run out of space.
>
> Add sample_ext_regs_intr/user[2] in the struct perf_event_attr. It's used
> as a bitmap for the extension regs. There will be more than 64 registers
> added.
> Add a new flag PERF_PMU_CAP_EXTENDED_REGS2 to indicate the PMU which
> supports sample_ext_regs_intr/user.
>
> Extend the perf_reg_validate() to support the validation of the
> extension regs.
>
> Extend the perf_reg_value() to retrieve the extension regs. The regs may
> be larger than u64. Add two parameters to store the pointer and size.
> Add a dedicated perf_output_sample_ext_regs() to dump the extension
> regs.
>
> This is just a generic support for the extension regs. Any attempts to
> manipulate the extension regs will error out, until the driver-specific
> supports are implemented, which will be done in the following patch.
>
> Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
> ---
> arch/arm/kernel/perf_regs.c | 9 +++--
> arch/arm64/kernel/perf_regs.c | 9 +++--
> arch/csky/kernel/perf_regs.c | 9 +++--
> arch/loongarch/kernel/perf_regs.c | 8 +++--
> arch/mips/kernel/perf_regs.c | 9 +++--
> arch/powerpc/perf/perf_regs.c | 9 +++--
> arch/riscv/kernel/perf_regs.c | 8 +++--
> arch/s390/kernel/perf_regs.c | 9 +++--
> arch/x86/kernel/perf_regs.c | 13 ++++++--
> include/linux/perf_event.h | 15 +++++++++
> include/linux/perf_regs.h | 29 +++++++++++++---
> include/uapi/linux/perf_event.h | 8 +++++
> kernel/events/core.c | 55 ++++++++++++++++++++++++++++---
> 13 files changed, 162 insertions(+), 28 deletions(-)
>
> diff --git a/arch/arm/kernel/perf_regs.c b/arch/arm/kernel/perf_regs.c
> index 0529f90395c9..b6161c30bd40 100644
> --- a/arch/arm/kernel/perf_regs.c
> +++ b/arch/arm/kernel/perf_regs.c
> @@ -8,8 +8,10 @@
> #include <asm/perf_regs.h>
> #include <asm/ptrace.h>
>
> -u64 perf_reg_value(struct pt_regs *regs, int idx)
> +u64 perf_reg_value(struct pt_regs *regs, int idx, u64 *ext, int *ext_size)
> {
> + if (WARN_ON_ONCE(ext || ext_size))
> + return 0;
> if (WARN_ON_ONCE((u32)idx >= PERF_REG_ARM_MAX))
> return 0;
>
> @@ -18,8 +20,11 @@ u64 perf_reg_value(struct pt_regs *regs, int idx)
>
> #define REG_RESERVED (~((1ULL << PERF_REG_ARM_MAX) - 1))
>
> -int perf_reg_validate(u64 mask)
> +int perf_reg_validate(u64 mask, u64 *mask_ext)
> {
> + if (mask_ext)
> + return -EINVAL;
> +
> if (!mask || mask & REG_RESERVED)
> return -EINVAL;
>
> diff --git a/arch/arm64/kernel/perf_regs.c b/arch/arm64/kernel/perf_regs.c
> index b4eece3eb17d..668b54a7faf9 100644
> --- a/arch/arm64/kernel/perf_regs.c
> +++ b/arch/arm64/kernel/perf_regs.c
> @@ -27,8 +27,10 @@ static u64 perf_ext_regs_value(int idx)
> }
> }
>
> -u64 perf_reg_value(struct pt_regs *regs, int idx)
> +u64 perf_reg_value(struct pt_regs *regs, int idx, u64 *ext, int *ext_size)
> {
> + if (WARN_ON_ONCE(ext || ext_size))
> + return 0;
> if (WARN_ON_ONCE((u32)idx >= PERF_REG_ARM64_EXTENDED_MAX))
> return 0;
>
> @@ -77,10 +79,13 @@ u64 perf_reg_value(struct pt_regs *regs, int idx)
>
> #define REG_RESERVED (~((1ULL << PERF_REG_ARM64_MAX) - 1))
>
> -int perf_reg_validate(u64 mask)
> +int perf_reg_validate(u64 mask, u64 *mask_ext)
> {
> u64 reserved_mask = REG_RESERVED;
>
> + if (mask_ext)
> + return -EINVAL;
> +
> if (system_supports_sve())
> reserved_mask &= ~(1ULL << PERF_REG_ARM64_VG);
>
> diff --git a/arch/csky/kernel/perf_regs.c b/arch/csky/kernel/perf_regs.c
> index 09b7f88a2d6a..5988ef55bf0a 100644
> --- a/arch/csky/kernel/perf_regs.c
> +++ b/arch/csky/kernel/perf_regs.c
> @@ -8,8 +8,10 @@
> #include <asm/perf_regs.h>
> #include <asm/ptrace.h>
>
> -u64 perf_reg_value(struct pt_regs *regs, int idx)
> +u64 perf_reg_value(struct pt_regs *regs, int idx, u64 *ext, int *ext_size)
> {
> + if (WARN_ON_ONCE(ext || ext_size))
> + return 0;
> if (WARN_ON_ONCE((u32)idx >= PERF_REG_CSKY_MAX))
> return 0;
>
> @@ -18,8 +20,11 @@ u64 perf_reg_value(struct pt_regs *regs, int idx)
>
> #define REG_RESERVED (~((1ULL << PERF_REG_CSKY_MAX) - 1))
>
> -int perf_reg_validate(u64 mask)
> +int perf_reg_validate(u64 mask, u64 *mask_ext)
> {
> + if (mask_ext)
> + return -EINVAL;
> +
> if (!mask || mask & REG_RESERVED)
> return -EINVAL;
>
> diff --git a/arch/loongarch/kernel/perf_regs.c b/arch/loongarch/kernel/perf_regs.c
> index 263ac4ab5af6..798dadee75ff 100644
> --- a/arch/loongarch/kernel/perf_regs.c
> +++ b/arch/loongarch/kernel/perf_regs.c
> @@ -25,8 +25,10 @@ u64 perf_reg_abi(struct task_struct *tsk)
> }
> #endif /* CONFIG_32BIT */
>
> -int perf_reg_validate(u64 mask)
> +int perf_reg_validate(u64 mask, u64 *mask_ext)
> {
> + if (mask_ext)
> + return -EINVAL;
> if (!mask)
> return -EINVAL;
> if (mask & ~((1ull << PERF_REG_LOONGARCH_MAX) - 1))
> @@ -34,8 +36,10 @@ int perf_reg_validate(u64 mask)
> return 0;
> }
>
> -u64 perf_reg_value(struct pt_regs *regs, int idx)
> +u64 perf_reg_value(struct pt_regs *regs, int idx, u64 *ext, int *ext_size)
> {
> + if (WARN_ON_ONCE(ext || ext_size))
> + return 0;
> if (WARN_ON_ONCE((u32)idx >= PERF_REG_LOONGARCH_MAX))
> return 0;
>
> diff --git a/arch/mips/kernel/perf_regs.c b/arch/mips/kernel/perf_regs.c
> index e686780d1647..f3fcbf7e5aa6 100644
> --- a/arch/mips/kernel/perf_regs.c
> +++ b/arch/mips/kernel/perf_regs.c
> @@ -28,8 +28,10 @@ u64 perf_reg_abi(struct task_struct *tsk)
> }
> #endif /* CONFIG_32BIT */
>
> -int perf_reg_validate(u64 mask)
> +int perf_reg_validate(u64 mask, u64 *mask_ext)
> {
> + if (mask_ext)
> + return -EINVAL;
> if (!mask)
> return -EINVAL;
> if (mask & ~((1ull << PERF_REG_MIPS_MAX) - 1))
> @@ -37,10 +39,13 @@ int perf_reg_validate(u64 mask)
> return 0;
> }
>
> -u64 perf_reg_value(struct pt_regs *regs, int idx)
> +u64 perf_reg_value(struct pt_regs *regs, int idx, u64 *ext, int *ext_size)
> {
> long v;
>
> + if (WARN_ON_ONCE(ext || ext_size))
> + return 0;
> +
> switch (idx) {
> case PERF_REG_MIPS_PC:
> v = regs->cp0_epc;
> diff --git a/arch/powerpc/perf/perf_regs.c b/arch/powerpc/perf/perf_regs.c
> index 350dccb0143c..556466409c76 100644
> --- a/arch/powerpc/perf/perf_regs.c
> +++ b/arch/powerpc/perf/perf_regs.c
> @@ -99,8 +99,11 @@ static u64 get_ext_regs_value(int idx)
> }
> }
>
> -u64 perf_reg_value(struct pt_regs *regs, int idx)
> +u64 perf_reg_value(struct pt_regs *regs, int idx, u64 *ext, int *ext_size)
> {
> + if (WARN_ON_ONCE(ext || ext_size))
> + return 0;
> +
> if (idx == PERF_REG_POWERPC_SIER &&
> (IS_ENABLED(CONFIG_FSL_EMB_PERF_EVENT) ||
> IS_ENABLED(CONFIG_PPC32) ||
> @@ -125,8 +128,10 @@ u64 perf_reg_value(struct pt_regs *regs, int idx)
> return regs_get_register(regs, pt_regs_offset[idx]);
> }
>
> -int perf_reg_validate(u64 mask)
> +int perf_reg_validate(u64 mask, u64 *mask_ext)
> {
> + if (mask_ext)
> + return -EINVAL;
> if (!mask || mask & REG_RESERVED)
> return -EINVAL;
> return 0;
> diff --git a/arch/riscv/kernel/perf_regs.c b/arch/riscv/kernel/perf_regs.c
> index fd304a248de6..05a4f1e7b243 100644
> --- a/arch/riscv/kernel/perf_regs.c
> +++ b/arch/riscv/kernel/perf_regs.c
> @@ -8,8 +8,10 @@
> #include <asm/perf_regs.h>
> #include <asm/ptrace.h>
>
> -u64 perf_reg_value(struct pt_regs *regs, int idx)
> +u64 perf_reg_value(struct pt_regs *regs, int idx, u64 *ext, int *ext_size)
> {
> + if (WARN_ON_ONCE(ext || ext_size))
> + return 0;
> if (WARN_ON_ONCE((u32)idx >= PERF_REG_RISCV_MAX))
> return 0;
>
> @@ -18,8 +20,10 @@ u64 perf_reg_value(struct pt_regs *regs, int idx)
>
> #define REG_RESERVED (~((1ULL << PERF_REG_RISCV_MAX) - 1))
>
> -int perf_reg_validate(u64 mask)
> +int perf_reg_validate(u64 mask, u64 *mask_ext)
> {
> + if (mask_ext)
> + return -EINVAL;
> if (!mask || mask & REG_RESERVED)
> return -EINVAL;
>
> diff --git a/arch/s390/kernel/perf_regs.c b/arch/s390/kernel/perf_regs.c
> index a6b058ee4a36..2e17ae51279e 100644
> --- a/arch/s390/kernel/perf_regs.c
> +++ b/arch/s390/kernel/perf_regs.c
> @@ -7,10 +7,13 @@
> #include <asm/ptrace.h>
> #include <asm/fpu.h>
>
> -u64 perf_reg_value(struct pt_regs *regs, int idx)
> +u64 perf_reg_value(struct pt_regs *regs, int idx, u64 *ext, int *ext_size)
> {
> freg_t fp;
>
> + if (WARN_ON_ONCE(ext || ext_size))
> + return 0;
> +
> if (idx >= PERF_REG_S390_R0 && idx <= PERF_REG_S390_R15)
> return regs->gprs[idx];
>
> @@ -34,8 +37,10 @@ u64 perf_reg_value(struct pt_regs *regs, int idx)
>
> #define REG_RESERVED (~((1UL << PERF_REG_S390_MAX) - 1))
>
> -int perf_reg_validate(u64 mask)
> +int perf_reg_validate(u64 mask, u64 *mask_ext)
> {
> + if (mask_ext)
> + return -EINVAL;
> if (!mask || mask & REG_RESERVED)
> return -EINVAL;
>
> diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
> index 624703af80a1..b9d5106afc26 100644
> --- a/arch/x86/kernel/perf_regs.c
> +++ b/arch/x86/kernel/perf_regs.c
> @@ -57,10 +57,13 @@ static unsigned int pt_regs_offset[PERF_REG_X86_MAX] = {
> #endif
> };
>
> -u64 perf_reg_value(struct pt_regs *regs, int idx)
> +u64 perf_reg_value(struct pt_regs *regs, int idx, u64 *ext, int *ext_size)
> {
> struct x86_perf_regs *perf_regs;
>
> + if (WARN_ON_ONCE(ext || ext_size))
> + return 0;
> +
> if (idx >= PERF_REG_X86_XMM0 && idx < PERF_REG_X86_XMM_MAX) {
> perf_regs = container_of(regs, struct x86_perf_regs, regs);
> if (!perf_regs->xmm_regs)
> @@ -87,8 +90,10 @@ u64 perf_reg_value(struct pt_regs *regs, int idx)
> (1ULL << PERF_REG_X86_R14) | \
> (1ULL << PERF_REG_X86_R15))
>
> -int perf_reg_validate(u64 mask)
> +int perf_reg_validate(u64 mask, u64 *mask_ext)
> {
> + if (mask_ext)
> + return -EINVAL;
> if (!mask || (mask & (REG_NOSUPPORT | PERF_REG_X86_RESERVED)))
> return -EINVAL;
>
> @@ -112,8 +117,10 @@ void perf_get_regs_user(struct perf_regs *regs_user,
> (1ULL << PERF_REG_X86_FS) | \
> (1ULL << PERF_REG_X86_GS))
>
> -int perf_reg_validate(u64 mask)
> +int perf_reg_validate(u64 mask, u64 *mask_ext)
> {
> + if (mask_ext)
> + return -EINVAL;
> if (!mask || (mask & (REG_NOSUPPORT | PERF_REG_X86_RESERVED)))
> return -EINVAL;
>
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 74c188a699e4..42b288ab4d2c 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -305,6 +305,7 @@ struct perf_event_pmu_context;
> #define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100
> #define PERF_PMU_CAP_AUX_PAUSE 0x0200
> #define PERF_PMU_CAP_AUX_PREFER_LARGE 0x0400
> +#define PERF_PMU_CAP_EXTENDED_REGS2 0x0800 /* sample_ext_regs_intr/user */
>
> /**
> * pmu::scope
> @@ -1496,6 +1497,20 @@ static inline bool event_has_extended_regs(struct perf_event *event)
> (attr->sample_regs_intr & PERF_REG_EXTENDED_MASK);
> }
>
> +static inline bool event_has_extended_regs2(struct perf_event *event)
> +{
> + struct perf_event_attr *attr = &event->attr;
> + int i;
> +
> + for (i = 0; i < PERF_ATTR_EXT_REGS_SIZE; i++) {
> + if (attr->sample_ext_regs_intr[i] ||
> + attr->sample_ext_regs_user[i])
> + return true;
> + }
> +
> + return false;
> +}
> +
> static inline bool event_has_any_exclude_flag(struct perf_event *event)
> {
> struct perf_event_attr *attr = &event->attr;
> diff --git a/include/linux/perf_regs.h b/include/linux/perf_regs.h
> index f632c5725f16..6119bcb010fb 100644
> --- a/include/linux/perf_regs.h
> +++ b/include/linux/perf_regs.h
> @@ -16,23 +16,42 @@ struct perf_regs {
> #define PERF_REG_EXTENDED_MASK 0
> #endif
>
> -u64 perf_reg_value(struct pt_regs *regs, int idx);
> -int perf_reg_validate(u64 mask);
> +#define PERF_EXT_REGS_SIZE_MAX 8
> +
> +/**
> + * perf_reg_value - Get a reg value
> + * @regs: The area where all registers are stored
> + * @idx: The index of the requested register.
> + * The below @ext indicates whether the index is for
> + * a regular register or an extension register.
> + * @ext: Pointer to the buffer which stores the
> + * value of the requested extension register.
> + * NULL means the request is for a regular register.
> + * @ext_size: Size of the extension register.
> + *
> + * Returns 0 on failure.
> + * On success, for a regular register (!ext),
> + * returns the value of the register.
> + * For an extension register (ext), returns ext[0].
> + */
> +u64 perf_reg_value(struct pt_regs *regs, int idx, u64 *ext, int *ext_size);
Better to add a separate perf_ext_reg_value() helper for extended
registers. Combining the extended regs into perf_reg_value() makes this
helper over-complicated and hard to understand.
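Something like the below (just a rough sketch; the name and signature
are illustrative, not an existing API):
	/* sketch: keep the GP-register path and the extended path apart */
	u64 perf_reg_value(struct pt_regs *regs, int idx);
	int perf_ext_reg_value(struct pt_regs *regs, int idx,
			       u64 *ext, int *ext_size);
That would keep the common GP-register fast path untouched.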
> +int perf_reg_validate(u64 mask, u64 *mask_ext);
> u64 perf_reg_abi(struct task_struct *task);
> void perf_get_regs_user(struct perf_regs *regs_user,
> struct pt_regs *regs);
> #else
>
> #define PERF_REG_EXTENDED_MASK 0
> +#define PERF_EXT_REGS_SIZE_MAX 8
>
> -static inline u64 perf_reg_value(struct pt_regs *regs, int idx)
> +static inline u64 perf_reg_value(struct pt_regs *regs, int idx, u64 *ext, int *ext_size)
> {
> return 0;
> }
>
> -static inline int perf_reg_validate(u64 mask)
> +static inline int perf_reg_validate(u64 mask, u64 *mask_ext)
> {
> - return mask ? -ENOSYS : 0;
> + return mask || mask_ext ? -ENOSYS : 0;
> }
>
> static inline u64 perf_reg_abi(struct task_struct *task)
> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> index 78a362b80027..e22ba72efcdb 100644
> --- a/include/uapi/linux/perf_event.h
> +++ b/include/uapi/linux/perf_event.h
> @@ -382,6 +382,10 @@ enum perf_event_read_format {
> #define PERF_ATTR_SIZE_VER6 120 /* Add: aux_sample_size */
> #define PERF_ATTR_SIZE_VER7 128 /* Add: sig_data */
> #define PERF_ATTR_SIZE_VER8 136 /* Add: config3 */
> +#define PERF_ATTR_SIZE_VER9 168 /* Add: sample_ext_regs_intr */
> + /* Add: sample_ext_regs_user */
> +
> +#define PERF_ATTR_EXT_REGS_SIZE 2
>
> /*
> * 'struct perf_event_attr' contains various attributes that define
> @@ -543,6 +547,10 @@ struct perf_event_attr {
> __u64 sig_data;
>
> __u64 config3; /* extension of config2 */
> +
> + /* extension of sample_regs_XXX */
> + __u64 sample_ext_regs_intr[PERF_ATTR_EXT_REGS_SIZE];
> + __u64 sample_ext_regs_user[PERF_ATTR_EXT_REGS_SIZE];
> };
>
> /*
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 7f0d98d73629..c4279e1bf91a 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -7385,11 +7385,40 @@ perf_output_sample_regs(struct perf_output_handle *handle,
> for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
> u64 val;
>
> - val = perf_reg_value(regs, bit);
> + val = perf_reg_value(regs, bit, NULL, NULL);
> perf_output_put(handle, val);
> }
> }
>
> +static void
> +__perf_output_sample_ext_regs(struct perf_output_handle *handle,
> + struct pt_regs *regs, u64 mask, int base)
> +{
> + u64 val[PERF_EXT_REGS_SIZE_MAX];
> + int i, bit, size = 0;
> + DECLARE_BITMAP(_mask, 64);
> +
> + if (!mask)
> + return;
> + bitmap_from_u64(_mask, mask);
> + for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
> + perf_reg_value(regs, bit + base, val, &size);
> +
> + for (i = 0; i < size; i++)
> + perf_output_put(handle, val[i]);
> + }
> +}
> +
> +static void
> +perf_output_sample_ext_regs(struct perf_output_handle *handle,
> + struct pt_regs *regs, u64 *mask)
> +{
> + int i;
> +
> + for (i = 0; i < PERF_ATTR_EXT_REGS_SIZE; i++)
> + __perf_output_sample_ext_regs(handle, regs, mask[i], i * 64);
> +}
> +
> static void perf_sample_regs_user(struct perf_regs *regs_user,
> struct pt_regs *regs)
> {
> @@ -7940,9 +7969,14 @@ void perf_output_sample(struct perf_output_handle *handle,
>
> if (abi) {
> u64 mask = event->attr.sample_regs_user;
> + u64 *ext_mask = event->attr.sample_ext_regs_user;
> +
> perf_output_sample_regs(handle,
> data->regs_user.regs,
> mask);
> + perf_output_sample_ext_regs(handle,
> + data->regs_user.regs,
> + ext_mask);
> }
> }
>
> @@ -7971,10 +8005,14 @@ void perf_output_sample(struct perf_output_handle *handle,
>
> if (abi) {
> u64 mask = event->attr.sample_regs_intr;
> + u64 *ext_mask = event->attr.sample_ext_regs_intr;
>
> perf_output_sample_regs(handle,
> data->regs_intr.regs,
> mask);
> + perf_output_sample_ext_regs(handle,
> + data->regs_intr.regs,
> + ext_mask);
> }
> }
>
> @@ -12535,6 +12573,12 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
> if (ret)
> goto err_pmu;
>
> + if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS2) &&
> + event_has_extended_regs2(event)) {
> + ret = -EOPNOTSUPP;
> + goto err_destroy;
> + }
> +
> if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
> event_has_extended_regs(event)) {
> ret = -EOPNOTSUPP;
> @@ -13073,7 +13117,8 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
> }
>
> if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
> - ret = perf_reg_validate(attr->sample_regs_user);
> + ret = perf_reg_validate(attr->sample_regs_user,
> + attr->sample_ext_regs_user);
> if (ret)
> return ret;
> }
> @@ -13096,8 +13141,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
> if (!attr->sample_max_stack)
> attr->sample_max_stack = sysctl_perf_event_max_stack;
>
> - if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
> - ret = perf_reg_validate(attr->sample_regs_intr);
> + if (attr->sample_type & PERF_SAMPLE_REGS_INTR) {
> + ret = perf_reg_validate(attr->sample_regs_intr,
> + attr->sample_ext_regs_intr);
> + }
>
> #ifndef CONFIG_CGROUP_PERF
> if (attr->sample_type & PERF_SAMPLE_CGROUP)
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 06/12] perf: Support extension of sample_regs
2025-06-13 13:49 ` [RFC PATCH 06/12] perf: Support extension of sample_regs kan.liang
2025-06-17 8:00 ` Mi, Dapeng
@ 2025-06-17 8:14 ` Peter Zijlstra
2025-06-17 9:49 ` Mi, Dapeng
1 sibling, 1 reply; 60+ messages in thread
From: Peter Zijlstra @ 2025-06-17 8:14 UTC (permalink / raw)
To: kan.liang
Cc: mingo, acme, namhyung, tglx, dave.hansen, irogers, adrian.hunter,
jolsa, alexander.shishkin, linux-kernel, dapeng1.mi, ak,
zide.chen
On Fri, Jun 13, 2025 at 06:49:37AM -0700, kan.liang@linux.intel.com wrote:
> From: Kan Liang <kan.liang@linux.intel.com>
>
> More regs may be required in a sample, e.g., the vector registers. The
> current sample_regs_XXX has run out of space.
>
> Add sample_ext_regs_intr/user[2] in the struct perf_event_attr. It's used
> as a bitmap for the extension regs. There will be more than 64 registers
> added.
> Add a new flag PERF_PMU_CAP_EXTENDED_REGS2 to indicate the PMU which
> supports sample_ext_regs_intr/user.
>
> Extend the perf_reg_validate() to support the validation of the
> extension regs.
>
> Extend the perf_reg_value() to retrieve the extension regs. The regs may
> be larger than u64. Add two parameters to store the pointer and size.
> Add a dedicated perf_output_sample_ext_regs() to dump the extension
> regs.
>
> This is just a generic support for the extension regs. Any attempts to
> manipulate the extension regs will error out, until the driver-specific
> supports are implemented, which will be done in the following patch.
>
> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> index 78a362b80027..e22ba72efcdb 100644
> --- a/include/uapi/linux/perf_event.h
> +++ b/include/uapi/linux/perf_event.h
> @@ -382,6 +382,10 @@ enum perf_event_read_format {
> #define PERF_ATTR_SIZE_VER6 120 /* Add: aux_sample_size */
> #define PERF_ATTR_SIZE_VER7 128 /* Add: sig_data */
> #define PERF_ATTR_SIZE_VER8 136 /* Add: config3 */
> +#define PERF_ATTR_SIZE_VER9 168 /* Add: sample_ext_regs_intr */
> + /* Add: sample_ext_regs_user */
> +
> +#define PERF_ATTR_EXT_REGS_SIZE 2
>
> /*
> * 'struct perf_event_attr' contains various attributes that define
> @@ -543,6 +547,10 @@ struct perf_event_attr {
> __u64 sig_data;
>
> __u64 config3; /* extension of config2 */
> +
> + /* extension of sample_regs_XXX */
> + __u64 sample_ext_regs_intr[PERF_ATTR_EXT_REGS_SIZE];
> + __u64 sample_ext_regs_user[PERF_ATTR_EXT_REGS_SIZE];
> };
Did anybody read this email?
https://lkml.kernel.org/r/20250416155327.GD17910@noisy.programming.kicks-ass.net
The current regs interface really was designed for regular registers,
and trying to squish SIMD registers into it is a trainwreck.
Not to mention that AAAARGHH64 and Risc-V have vector widths up to 2048
bit.
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 06/12] perf: Support extension of sample_regs
2025-06-17 8:14 ` Peter Zijlstra
@ 2025-06-17 9:49 ` Mi, Dapeng
2025-06-17 10:28 ` Peter Zijlstra
0 siblings, 1 reply; 60+ messages in thread
From: Mi, Dapeng @ 2025-06-17 9:49 UTC (permalink / raw)
To: Peter Zijlstra, kan.liang
Cc: mingo, acme, namhyung, tglx, dave.hansen, irogers, adrian.hunter,
jolsa, alexander.shishkin, linux-kernel, ak, zide.chen
On 6/17/2025 4:14 PM, Peter Zijlstra wrote:
> On Fri, Jun 13, 2025 at 06:49:37AM -0700, kan.liang@linux.intel.com wrote:
>> From: Kan Liang <kan.liang@linux.intel.com>
>>
>> More regs may be required in a sample, e.g., the vector registers. The
>> current sample_regs_XXX has run out of space.
>>
>> Add sample_ext_regs_intr/user[2] in the struct perf_event_attr. It's used
>> as a bitmap for the extension regs. There will be more than 64 registers
>> added.
>> Add a new flag PERF_PMU_CAP_EXTENDED_REGS2 to indicate the PMU which
>> supports sample_ext_regs_intr/user.
>>
>> Extend the perf_reg_validate() to support the validation of the
>> extension regs.
>>
>> Extend the perf_reg_value() to retrieve the extension regs. The regs may
>> be larger than u64. Add two parameters to store the pointer and size.
>> Add a dedicated perf_output_sample_ext_regs() to dump the extension
>> regs.
>>
>> This is just a generic support for the extension regs. Any attempts to
>> manipulate the extension regs will error out, until the driver-specific
>> supports are implemented, which will be done in the following patch.
>>
>> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
>> index 78a362b80027..e22ba72efcdb 100644
>> --- a/include/uapi/linux/perf_event.h
>> +++ b/include/uapi/linux/perf_event.h
>> @@ -382,6 +382,10 @@ enum perf_event_read_format {
>> #define PERF_ATTR_SIZE_VER6 120 /* Add: aux_sample_size */
>> #define PERF_ATTR_SIZE_VER7 128 /* Add: sig_data */
>> #define PERF_ATTR_SIZE_VER8 136 /* Add: config3 */
>> +#define PERF_ATTR_SIZE_VER9 168 /* Add: sample_ext_regs_intr */
>> + /* Add: sample_ext_regs_user */
>> +
>> +#define PERF_ATTR_EXT_REGS_SIZE 2
>>
>> /*
>> * 'struct perf_event_attr' contains various attributes that define
>> @@ -543,6 +547,10 @@ struct perf_event_attr {
>> __u64 sig_data;
>>
>> __u64 config3; /* extension of config2 */
>> +
>> + /* extension of sample_regs_XXX */
>> + __u64 sample_ext_regs_intr[PERF_ATTR_EXT_REGS_SIZE];
>> + __u64 sample_ext_regs_user[PERF_ATTR_EXT_REGS_SIZE];
>> };
> Did anybody read this email?
>
> https://lkml.kernel.org/r/20250416155327.GD17910@noisy.programming.kicks-ass.net
>
> The current regs interface really was designed for regular registers,
> and trying to squish SIMD registers into it is a trainwreck.
>
> Not to mention that AAAARGHH64 and Risc-V have vector widths up to 2048
> bit.
Yes, we followed this discussion. In sample_ext_regs_intr/user[], each bit
represents an extended register regardless of how many bits the
register has. At the beginning we added an item "sample_simd_reg_words" to
represent the bit-width of the corresponding extended register, but we
found it's unnecessary since the regs bitmap is fixed for a specific arch
and the arch-specific code would know how many bits a certain reg has,
e.g., on x86, bit 0 would represent YMMH0 in this patchset,
then the x86-specific perf code would know its bit-width is 128 bits.
The reason that we define an array with 2 u64 words is that we plan to
support YMM (16 bits) + APX (16 bits) + OPMASK (8 bits) + ZMM (32 bits) +
SSP (1 bit) regs, which needs 73 bits, and one u64 word is not enough.
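(That is, 16 + 16 + 8 + 32 + 1 = 73 mask bits, which overflows a single
u64.)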
>
>
>
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 06/12] perf: Support extension of sample_regs
2025-06-17 9:49 ` Mi, Dapeng
@ 2025-06-17 10:28 ` Peter Zijlstra
2025-06-17 12:14 ` Mi, Dapeng
0 siblings, 1 reply; 60+ messages in thread
From: Peter Zijlstra @ 2025-06-17 10:28 UTC (permalink / raw)
To: Mi, Dapeng
Cc: kan.liang, mingo, acme, namhyung, tglx, dave.hansen, irogers,
adrian.hunter, jolsa, alexander.shishkin, linux-kernel, ak,
zide.chen
On Tue, Jun 17, 2025 at 05:49:13PM +0800, Mi, Dapeng wrote:
>
> On 6/17/2025 4:14 PM, Peter Zijlstra wrote:
> > On Fri, Jun 13, 2025 at 06:49:37AM -0700, kan.liang@linux.intel.com wrote:
> >> From: Kan Liang <kan.liang@linux.intel.com>
> >>
> >> More regs may be required in a sample, e.g., the vector registers. The
> >> current sample_regs_XXX has run out of space.
> >>
> >> Add sample_ext_regs_intr/user[2] in the struct perf_event_attr. It's used
> >> as a bitmap for the extension regs. There will be more than 64 registers
> >> added.
> >> Add a new flag PERF_PMU_CAP_EXTENDED_REGS2 to indicate the PMU which
> >> supports sample_ext_regs_intr/user.
> >>
> >> Extend the perf_reg_validate() to support the validation of the
> >> extension regs.
> >>
> >> Extend the perf_reg_value() to retrieve the extension regs. The regs may
> >> be larger than u64. Add two parameters to store the pointer and size.
> >> Add a dedicated perf_output_sample_ext_regs() to dump the extension
> >> regs.
> >>
> >> This is just a generic support for the extension regs. Any attempts to
> >> manipulate the extension regs will error out, until the driver-specific
> >> supports are implemented, which will be done in the following patch.
> >>
> >> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> >> index 78a362b80027..e22ba72efcdb 100644
> >> --- a/include/uapi/linux/perf_event.h
> >> +++ b/include/uapi/linux/perf_event.h
> >> @@ -382,6 +382,10 @@ enum perf_event_read_format {
> >> #define PERF_ATTR_SIZE_VER6 120 /* Add: aux_sample_size */
> >> #define PERF_ATTR_SIZE_VER7 128 /* Add: sig_data */
> >> #define PERF_ATTR_SIZE_VER8 136 /* Add: config3 */
> >> +#define PERF_ATTR_SIZE_VER9 168 /* Add: sample_ext_regs_intr */
> >> + /* Add: sample_ext_regs_user */
> >> +
> >> +#define PERF_ATTR_EXT_REGS_SIZE 2
> >>
> >> /*
> >> * 'struct perf_event_attr' contains various attributes that define
> >> @@ -543,6 +547,10 @@ struct perf_event_attr {
> >> __u64 sig_data;
> >>
> >> __u64 config3; /* extension of config2 */
> >> +
> >> + /* extension of sample_regs_XXX */
> >> + __u64 sample_ext_regs_intr[PERF_ATTR_EXT_REGS_SIZE];
> >> + __u64 sample_ext_regs_user[PERF_ATTR_EXT_REGS_SIZE];
> >> };
> > Did anybody read this email?
> >
> > https://lkml.kernel.org/r/20250416155327.GD17910@noisy.programming.kicks-ass.net
> >
> > The current regs interface really was designed for regular registers,
> > and trying to squish SIMD registers into it is a trainwreck.
> >
> > Not to mention that AAAARGHH64 and Risc-V have vector widths up to 2048
> > bit.
>
> Yes, we followed this discussion. In sample_ext_regs_intr/user[], each bit
> represents an extended register regardless of how many bits the
> register has. At the beginning we added an item "sample_simd_reg_words" to
> represent the bit-width of the corresponding extended register, but we
> found it's unnecessary since the regs bitmap is fixed for a specific arch
So I disagree. Fundamentally we have only 32 SIMD registers on x86. We
should not have more bits than that.
> and the arch-specific code would know how many bits a certain reg has,
> e.g., on x86, bit 0 would represent YMMH0 in this patchset,
> then the x86-specific perf code would know its bit-width is 128 bits.
This is nonsense; YMMH0 is not a register. It should never be in this
array.
> The reason that we define an array with 2 u64 words is that we plan to
> support YMM (16 bits) + APX (16 bits) + OPMASK (8 bits) + ZMM (32 bits) +
> SSP (1 bit) regs, which needs 73 bits, and one u64 word is not enough.
This is insane. So now you're having 16 XMM 'regs', 16 YMMH 'regs' and
32 ZMMH 'regs' for a total of 64 bits that should have been just 32.
Suppose we're going to be doing AVX-1024, because awesome. That means we
need another 32 bits to denote whatever letter comes after 'Z'.
So no, this idiocy stops now.
We're going to do a sane SIMD register set with variable width, and
reclaim the XMM regs from the normal set.
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 06/12] perf: Support extension of sample_regs
2025-06-17 10:28 ` Peter Zijlstra
@ 2025-06-17 12:14 ` Mi, Dapeng
2025-06-17 13:33 ` Peter Zijlstra
0 siblings, 1 reply; 60+ messages in thread
From: Mi, Dapeng @ 2025-06-17 12:14 UTC (permalink / raw)
To: Peter Zijlstra
Cc: kan.liang, mingo, acme, namhyung, tglx, dave.hansen, irogers,
adrian.hunter, jolsa, alexander.shishkin, linux-kernel, ak,
zide.chen
On 6/17/2025 6:28 PM, Peter Zijlstra wrote:
> On Tue, Jun 17, 2025 at 05:49:13PM +0800, Mi, Dapeng wrote:
>> On 6/17/2025 4:14 PM, Peter Zijlstra wrote:
>>> On Fri, Jun 13, 2025 at 06:49:37AM -0700, kan.liang@linux.intel.com wrote:
>>>> From: Kan Liang <kan.liang@linux.intel.com>
>>>>
>>>> More regs may be required in a sample, e.g., the vector registers. The
>>>> current sample_regs_XXX has run out of space.
>>>>
>>>> Add sample_ext_regs_intr/user[2] in the struct perf_event_attr. It's used
>>>> as a bitmap for the extension regs. There will be more than 64 registers
>>>> added.
>>>> Add a new flag PERF_PMU_CAP_EXTENDED_REGS2 to indicate the PMU which
>>>> supports sample_ext_regs_intr/user.
>>>>
>>>> Extend the perf_reg_validate() to support the validation of the
>>>> extension regs.
>>>>
>>>> Extend the perf_reg_value() to retrieve the extension regs. The regs may
>>>> be larger than u64. Add two parameters to store the pointer and size.
>>>> Add a dedicated perf_output_sample_ext_regs() to dump the extension
>>>> regs.
>>>>
>>>> This is just a generic support for the extension regs. Any attempts to
>>>> manipulate the extension regs will error out, until the driver-specific
>>>> supports are implemented, which will be done in the following patch.
>>>>
>>>> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
>>>> index 78a362b80027..e22ba72efcdb 100644
>>>> --- a/include/uapi/linux/perf_event.h
>>>> +++ b/include/uapi/linux/perf_event.h
>>>> @@ -382,6 +382,10 @@ enum perf_event_read_format {
>>>> #define PERF_ATTR_SIZE_VER6 120 /* Add: aux_sample_size */
>>>> #define PERF_ATTR_SIZE_VER7 128 /* Add: sig_data */
>>>> #define PERF_ATTR_SIZE_VER8 136 /* Add: config3 */
>>>> +#define PERF_ATTR_SIZE_VER9 168 /* Add: sample_ext_regs_intr */
>>>> + /* Add: sample_ext_regs_user */
>>>> +
>>>> +#define PERF_ATTR_EXT_REGS_SIZE 2
>>>>
>>>> /*
>>>> * 'struct perf_event_attr' contains various attributes that define
>>>> @@ -543,6 +547,10 @@ struct perf_event_attr {
>>>> __u64 sig_data;
>>>>
>>>> __u64 config3; /* extension of config2 */
>>>> +
>>>> + /* extension of sample_regs_XXX */
>>>> + __u64 sample_ext_regs_intr[PERF_ATTR_EXT_REGS_SIZE];
>>>> + __u64 sample_ext_regs_user[PERF_ATTR_EXT_REGS_SIZE];
>>>> };
>>> Did anybody read this email?
>>>
>>> https://lkml.kernel.org/r/20250416155327.GD17910@noisy.programming.kicks-ass.net
>>>
>>> The current regs interface really was designed for regular registers,
>>> and trying to squish SIMD registers into it is a trainwreck.
>>>
>>> Not to mention that AAAARGHH64 and Risc-V have vector widths up to 2048
>>> bit.
>> Yes, we followed this discussion. In sample_ext_regs_intr/user[], each bit
>> represents an extended register regardless of how many bits the
>> register has. At the beginning we added an item "sample_simd_reg_words" to
>> represent the bit-width of the corresponding extended register, but we
>> found it's unnecessary since the regs bitmap is fixed for a specific arch
> So I disagree. Fundamentally we have only 32 SIMD registers on x86. We
> should not have more bits than that.
>
>> and the arch-specific code would know how many bits a certain reg has,
>> e.g., on x86, bit 0 would represent YMMH0 in this patchset,
>> then the x86-specific perf code would know its bit-width is 128 bits.
> This is nonsense; YMMH0 is not a register. It should never be in this
> array.
>
>> The reason that we define an array with 2 u64 words is that we plan to
>> support YMM (16 bits) + APX (16 bits) + OPMASK (8 bits) + ZMM (32 bits) +
>> SSP (1 bit) regs, which needs 73 bits, and one u64 word is not enough.
> This is insane. So now you're having 16 XMM 'regs', 16 YMMH 'regs' and
> 32 ZMMH 'regs' for a total of 64 bits that should have been just 32.
>
> Suppose we're going to be doing AVX-1024, because awesome. That means we
> need another 32 bits to denote whatever letter comes after 'Z'.
>
> So no, this idiocy stops now.
>
> We're going to do a sane SIMD register set with variable width, and
> reclaim the XMM regs from the normal set.
Ok, so we need to add two width variables like
sample_ext_regs_words_intr/user, then reuse the XMM regs bitmap to
represent the extended regs bitmap. Considering the OPMASK regs and APX
extended GPRs have the same bit-width (64 bits), we may have to combine them
into a single bitmap, e.g. bits[15:0] represents R31~R16 and bits[23:16]
represents OPMASK7 ~ OPMASK0.
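For instance (illustrative bit assignments only, not a final layout):
	#define PERF_EXT_REG_GPR_BASE     0  /* bits  0..15: R16-R31 */
	#define PERF_EXT_REG_OPMASK_BASE 16  /* bits 16..23: K0-K7 */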
Peter, is this what you expected?
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 06/12] perf: Support extension of sample_regs
2025-06-17 12:14 ` Mi, Dapeng
@ 2025-06-17 13:33 ` Peter Zijlstra
2025-06-17 14:06 ` Peter Zijlstra
0 siblings, 1 reply; 60+ messages in thread
From: Peter Zijlstra @ 2025-06-17 13:33 UTC (permalink / raw)
To: Mi, Dapeng
Cc: kan.liang, mingo, acme, namhyung, tglx, dave.hansen, irogers,
adrian.hunter, jolsa, alexander.shishkin, linux-kernel, ak,
zide.chen
On Tue, Jun 17, 2025 at 08:14:36PM +0800, Mi, Dapeng wrote:
> > We're going to do a sane SIMD register set with variable width, and
> > reclaim the XMM regs from the normal set.
>
> Ok, so we need to add two width variables like
> sample_ext_regs_words_intr/user,
s/ext/simd/
Not sure it makes sense to have separate vector widths for kernel and
user regs, but sure.
> then reuse the XMM regs bitmap to represent the extended regs bitmap.
But it's not extended; it's the normal bitmap.
> Considering the OPMASK regs and APX
> extended GPRs have the same bit-width (64 bits), we may have to combine them
> into a single bitmap, e.g. bits[15:0] represents R31~R16 and bits[23:16]
> represents OPMASK7 ~ OPMASK0.
Again confused, bits 0:23 are the normal registers (in a lunatic
order). The XMM regs are in 32:63 and will be free if the SIMD thing is
present.
SSP+APX should definitely go there.
Not sure about OPMASK; those really do belong with the SIMD state. Let
me go figure out what ARM and Risc-V look like in more detail.
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 06/12] perf: Support extension of sample_regs
2025-06-17 13:33 ` Peter Zijlstra
@ 2025-06-17 14:06 ` Peter Zijlstra
2025-06-17 14:24 ` Mark Rutland
0 siblings, 1 reply; 60+ messages in thread
From: Peter Zijlstra @ 2025-06-17 14:06 UTC (permalink / raw)
To: Mi, Dapeng
Cc: kan.liang, mingo, acme, namhyung, tglx, dave.hansen, irogers,
adrian.hunter, jolsa, alexander.shishkin, linux-kernel, ak,
zide.chen
On Tue, Jun 17, 2025 at 03:33:33PM +0200, Peter Zijlstra wrote:
> On Tue, Jun 17, 2025 at 08:14:36PM +0800, Mi, Dapeng wrote:
>
> > > We're going to do a sane SIMD register set with variable width, and
> > > reclaim the XMM regs from the normal set.
> >
> > Ok, so we need to add two width variables like
> > sample_ext_regs_words_intr/user,
>
> s/ext/simd/
>
> Not sure it makes sense to have separate vector widths for kernel and
> user regs, but sure.
>
> > then reuse the XMM regs bitmap to represent the extended regs bitmap.
>
> But it's not extended; it's the normal bitmap.
>
> > Considering the OPMASK regs and APX
> > extended GPRs have the same bit-width (64 bits), we may have to combine them
> > into a single bitmap, e.g. bits[15:0] represents R31~R16 and bits[23:16]
> > represents OPMASK7 ~ OPMASK0.
>
> Again confused, bits 0:23 are the normal registers (in a lunatic
> order). The XMM regs are in 32:63 and will be free if the SIMD thing is
> present.
>
> SSP+APX should definitely go there.
>
> Not sure about OPMASK; those really do belong with the SIMD state. Let
> me go figure out what ARM and Risc-V look like in more detail.
So ARM-SVE has 32 vector registers with 16 predicate registers.
Risc-V Zv seems to only have 32 vector registers; no special purpose
predicate registers, instead a regular vector register can be used as a
predicate register.
PowerPC VSX has 64 vector registers and no predicate registers afaict.
While reading this, I came across the useful note that predicate
registers are 1/8-th the length of the vector registers (because the
minimal element is a byte). So while the current AVX-512 predicate
registers are indeed 64bits, this would no longer be true for the
hypothetical AVX-1024 (or even AVX-512 if we allow 4-bit elements).
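(E.g. a 512-bit vector needs a 64-bit predicate, a hypothetical 1024-bit
vector a 128-bit one, and allowing 4-bit elements would double both
figures.)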
As such, I don't think we should stick the predicate registers in the
normal group -- they really are not normal registers and won't fit for
future extensions.
This then leaves us two options:
- stick the predicate registers in the high bits of the vector register
word, or
- add an explicit predicate register word.
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 06/12] perf: Support extension of sample_regs
2025-06-17 14:06 ` Peter Zijlstra
@ 2025-06-17 14:24 ` Mark Rutland
2025-06-17 14:44 ` Peter Zijlstra
0 siblings, 1 reply; 60+ messages in thread
From: Mark Rutland @ 2025-06-17 14:24 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Mi, Dapeng, kan.liang, mingo, acme, namhyung, tglx, dave.hansen,
irogers, adrian.hunter, jolsa, alexander.shishkin, linux-kernel,
ak, zide.chen
On Tue, Jun 17, 2025 at 04:06:17PM +0200, Peter Zijlstra wrote:
> On Tue, Jun 17, 2025 at 03:33:33PM +0200, Peter Zijlstra wrote:
> > On Tue, Jun 17, 2025 at 08:14:36PM +0800, Mi, Dapeng wrote:
> >
> > > > We're going to do a sane SIMD register set with variable width, and
> > > > reclaim the XMM regs from the normal set.
> > >
> > > Ok, so we need to add two width variables like
> > > sample_ext_regs_words_intr/user,
> >
> > s/ext/simd/
> >
> > Not sure it makes sense to have separate vector widths for kernel and
> > user regs, but sure.
> >
> > > then reuse the XMM regs bitmap to represent the extend regs bitmap.
> >
> > But its not extended; its the normal bitmap.
> >
> > > Considering the OPMASK regs and APX
> > > extended GPRs have same bit-width (64 bits), we may have to combine them
> > > into a single bitmap, e.g. bits[15:0] represents R31~R16 and bits[23:16]
> > > represents OPMASK7 ~ OPMASK0.
> >
> > Again confused, bits 0:23 are the normal registers (in a lunatic
> > order). The XMM regs are in 32:63 and will be free if the SIMD thing is
> > present.
> >
> > SPP+APX should definitely go there.
> >
> > Not sure about OPMASK; those really do belong with the SIMD state. Let
> > me go figure out what ARM and Risc-V look like in more detail.
>
> So ARM-SVE has 32 vector registers with 16 predicate registers.
>
> Risc-V Zv seems to only have 32 vector registers; no special purpose
> predicate registers, instead a regular vector register can be used as a
> predicate register.
>
> PowerPC VSX has 64 vector registers and no predicate registers afaict.
>
> While reading this, I came across the useful note that predicate
> registers are 1/8-th the length of the vector registers (because the
> minimal element is a byte). So while the current AVX-512 predicate
> registers are indeed 64bits, this would no longer be true for the
> hypothetical AVX-1024 (or even AVX-512 if we allow 4-bit elements).
>
> As such, I don't think we should stick the predicate registers in the
> normal group -- they really are not normal registers and won't fit for
> future extensions.
>
> This then leaves us two options:
>
> - stick the predicate registers in the high bits of the vector register
> word, or
>
> - add an explicit predicate register word.
TBH, I don't think we can handle extended state in a generic way unless
we treat this like a ptrace regset, and delegate the format of each
specific register set to the architecture code.
On arm64, the behaviour is modal (with two different vector lengths for
streaming/non-streaming SVE when SME is implemented), per-task
configurable (with different vector lengths), can differ between
host/guest for KVM, and some of the registers only exist in some
configurations (e.g. the FFR only exists for SME if FA64 is
implemented).
Mark.
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 06/12] perf: Support extension of sample_regs
2025-06-17 14:24 ` Mark Rutland
@ 2025-06-17 14:44 ` Peter Zijlstra
2025-06-17 14:55 ` Mark Rutland
0 siblings, 1 reply; 60+ messages in thread
From: Peter Zijlstra @ 2025-06-17 14:44 UTC (permalink / raw)
To: Mark Rutland
Cc: Mi, Dapeng, kan.liang, mingo, acme, namhyung, tglx, dave.hansen,
irogers, adrian.hunter, jolsa, alexander.shishkin, linux-kernel,
ak, zide.chen
On Tue, Jun 17, 2025 at 03:24:01PM +0100, Mark Rutland wrote:
> TBH, I don't think we can handle extended state in a generic way unless
> we treat this like a ptrace regset, and delegate the format of each
> specific register set to the architecture code.
>
> On arm64, the behaviour is modal (with two different vector lengths for
> streaming/non-streaming SVE when SME is implemented), per-task
> configurable (with different vector lengths), can differ between
> host/guest for KVM, and some of the registers only exist in some
> configurations (e.g. the FFR only exists for SME if FA64 is
> implemented).
Well, much of this is per necessity architecture specific. But the
general form of vector registers is similar enough.
The main point is to not try and cram the vector registers into multiple
GP regs (sadly that is exactly what x86 started doing).
Anyway, your conditional length thing is 'fun' and has two solutions:
- the arch can refuse to create per-cpu counters with SIMD samples, or
- 0 pad all 'unobtainable state'.
Same when asking for wider vectors than the hardware supports; eg.
asking for 512 wide registers on Intel clients will likely end up in a
lot of 0s for the high bits -- seeing how AVX512 is mostly a server
thing on Intel.
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 06/12] perf: Support extension of sample_regs
2025-06-17 14:44 ` Peter Zijlstra
@ 2025-06-17 14:55 ` Mark Rutland
2025-06-17 19:00 ` Mark Brown
2025-06-17 20:32 ` Liang, Kan
0 siblings, 2 replies; 60+ messages in thread
From: Mark Rutland @ 2025-06-17 14:55 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Mi, Dapeng, kan.liang, mingo, acme, namhyung, tglx, dave.hansen,
irogers, adrian.hunter, jolsa, alexander.shishkin, linux-kernel,
ak, zide.chen, broonie
On Tue, Jun 17, 2025 at 04:44:16PM +0200, Peter Zijlstra wrote:
> On Tue, Jun 17, 2025 at 03:24:01PM +0100, Mark Rutland wrote:
>
> > TBH, I don't think we can handle extended state in a generic way unless
> > we treat this like a ptrace regset, and delegate the format of each
> > specific register set to the architecture code.
> >
> > On arm64, the behaviour is modal (with two different vector lengths for
> > streaming/non-streaming SVE when SME is implemented), per-task
> > configurable (with different vector lengths), can differ between
> > host/guest for KVM, and some of the registers only exist in some
> > configurations (e.g. the FFR only exists for SME if FA64 is
> > implemented).
>
> Well, much of this is per necessity architecture specific. But the
> general form of vector registers is similar enough.
>
> The main point is to not try and cram the vector registers into multiple
> GP regs (sadly that is exactly what x86 started doing).
I see, sorry for the noise. I completely agree that we shouldn't cram
this stuff into GP regs.
> Anyway, your conditional length thing is 'fun' and has two solutions:
>
> - the arch can refuse to create per-cpu counters with SIMD samples, or
>
> - 0 pad all 'unobtainable state'.
>
> Same when asking for wider vectors than the hardware supports; eg.
> asking for 512 wide registers on Intel clients will likely end up in a
> lot of 0s for the high bits -- seeing how AVX512 is mostly a server
> thing on Intel.
Yep, those options may work for us, but we'd need to think harder about
it. Our approach for ptrace and signals has been to have a header and
pack at the active vector length, so padding to a max width would be
different, but maybe it's fine.
Having another representation feels like a recipe waiting to happen.
Mark.
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 06/12] perf: Support extension of sample_regs
2025-06-17 14:55 ` Mark Rutland
@ 2025-06-17 19:00 ` Mark Brown
2025-06-17 20:32 ` Liang, Kan
1 sibling, 0 replies; 60+ messages in thread
From: Mark Brown @ 2025-06-17 19:00 UTC (permalink / raw)
To: Mark Rutland
Cc: Peter Zijlstra, Mi, Dapeng, kan.liang, mingo, acme, namhyung,
tglx, dave.hansen, irogers, adrian.hunter, jolsa,
alexander.shishkin, linux-kernel, ak, zide.chen
On Tue, Jun 17, 2025 at 03:55:00PM +0100, Mark Rutland wrote:
> On Tue, Jun 17, 2025 at 04:44:16PM +0200, Peter Zijlstra wrote:
> > On Tue, Jun 17, 2025 at 03:24:01PM +0100, Mark Rutland wrote:
> > Anyway, your conditional length thing is 'fun' and has two solutions:
> > - the arch can refuse to create per-cpu counters with SIMD samples, or
> > - 0 pad all 'unobtainable state'.
We currently do a *bit* of the 0-for-unobtainable-state thing for FFR
when in !FA64 streaming mode; that's for a whole register, though.
Probably also worth pointing out that we've got 16 predicate registers
plus FFR, which is sized like a predicate register. I don't think it
makes much difference for this discussion, but just in case.
> > Same when asking for wider vectors than the hardware supports; eg.
> > asking for 512 wide registers on Intel clients will likely end up in a
> > lot of 0s for the high bits -- seeing how AVX512 is mostly a server
> > thing on Intel.
> Yep, those options may work for us, but we'd need to think harder about
> it. Our approach for ptrace and signals has been to have a header and
> pack at the active vector length, so padding to a max width would be
> different, but maybe it's fine.
> Having another representation feels like a recipe waiting to happen.
Given that we have a different header format for everywhere we expose
the register state it's *probably* fine if the "header" is that
userspace selected the VL to record with, but like you say it is
different and therefore concerning. We have something similar with KVM
where we expose these registers with the maximum VL we configured for
the guest regardless of what vector length the guest has configured for
itself. It's certainly going to be more fiddly to read and write a
non-native format if you're not running in a higher EL like KVM though.
Another thought is that KVM exposes the vector lengths as virtual
registers, we could perhaps use a similar approach and write the active
VL out as part of the sample which does start to look like a header and
is perhaps not too horrifying for the perf abstractions (this being very
much a pick your poison situation)? Even if the VL used to format the
data that's written out is fixed I'd expect we'll want to be able to
include enough state to figure out the actual VL along with it.
If we do padding I worry a bit about the overhead whenever we have to do
it. AIUI with x86 the register sizes are constant on a given system so
userspace can simply not select a register size larger than the hardware
if they're concerned about the cost. On arm64 when a system has both
SVE and SME we are expecting they will frequently implement different
vector lengths for each so needing to pad would be a much more common
case, it is expected that programs will only be in streaming mode for
the minimum amount of time required to do the SME operations they need.
Given that SME will tend to have the larger VL but be less frequently
used we'd probably pad more often than not by default which doesn't seem
ideal. But having said that I have a feeling the overhead of just
recording things may be sufficiently high that the additional cost of
doing padding will be basically noise.
Like you say it needs thought.
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 06/12] perf: Support extension of sample_regs
2025-06-17 14:55 ` Mark Rutland
2025-06-17 19:00 ` Mark Brown
@ 2025-06-17 20:32 ` Liang, Kan
2025-06-18 9:35 ` Peter Zijlstra
1 sibling, 1 reply; 60+ messages in thread
From: Liang, Kan @ 2025-06-17 20:32 UTC (permalink / raw)
To: Mark Rutland, Peter Zijlstra
Cc: Mi, Dapeng, mingo, acme, namhyung, tglx, dave.hansen, irogers,
adrian.hunter, jolsa, alexander.shishkin, linux-kernel, ak,
zide.chen, broonie
On 2025-06-17 10:55 a.m., Mark Rutland wrote:
> On Tue, Jun 17, 2025 at 04:44:16PM +0200, Peter Zijlstra wrote:
>> On Tue, Jun 17, 2025 at 03:24:01PM +0100, Mark Rutland wrote:
>>
>>> TBH, I don't think we can handle extended state in a generic way unless
>>> we treat this like a ptrace regset, and delegate the format of each
>>> specific register set to the architecture code.
>>>
>>> On arm64, the behaviour is modal (with two different vector lengths for
>>> streaming/non-streaming SVE when SME is implemented), per-task
>>> configurable (with different vector lengths), can differ between
>>> host/guest for KVM, and some of the registers only exist in some
>>> configurations (e.g. the FFR only exists for SME if FA64 is
>>> implemented).
>>
>> Well, much of this is per necessity architecture specific. But the
>> general form of vector registers is similar enough.
>>
>> The main point is to not try and cram the vector registers into multiple
>> GP regs (sadly that is exactly what x86 started doing).
>
> I see, sorry for the noise. I completely agree that we shouldn't cram
> this stuff into GP regs.
>
>> Anyway, your conditional length thing is 'fun' and has two solutions:
>>
>> - the arch can refuse to create per-cpu counters with SIMD samples, or
>>
>> - 0 pad all 'unobtainable state'.
>>
>> Same when asking for wider vectors than the hardware supports; eg.
>> asking for 512 wide registers on Intel clients will likely end up in a
>> lot of 0s for the high bits -- seeing how AVX512 is mostly a server
>> thing on Intel.
>
> Yep, those options may work for us, but we'd need to think harder about
> it. Our approach for ptrace and signals has been to have a header and
> pack at the active vector length, so padding to a max width would be
> different, but maybe it's fine.
>
> Having another representation feels like a recipe waiting to happen.
>
I'd like to make sure I understand correctly.
If we'd like an explicit predicate register word, the below change in
struct perf_event_attr is OK for ARM as well, right?
__u16 sample_simd_pred_reg_words;
__u16 sample_simd_pred_reg_intr;
__u16 sample_simd_pred_reg_user;
__u16 sample_simd_reg_words;
__u64 sample_simd_reg_intr;
__u64 sample_simd_reg_user;
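For example, a tool might fill these as below (a sketch only; I'm
assuming the _intr/_user fields are bitmaps of the vector/predicate
registers and the _words fields give the register width in u64 words):
	/* sketch: sample all 32 vector regs at 256 bits (4 words) each */
	attr.sample_simd_reg_words = 4;
	attr.sample_simd_reg_intr = 0xffffffffULL;
	/* plus all 16 predicate regs at one word each */
	attr.sample_simd_pred_reg_words = 1;
	attr.sample_simd_pred_reg_intr = 0xffff;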
BTW: would it be easier for ARM if we changed the _words to a _type?
You could define some types like stream_sve, n_stream_sve, etc.
The output would then depend on the type, rather than on the max length
of the registers.
Thanks,
Kan
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 06/12] perf: Support extension of sample_regs
2025-06-17 20:32 ` Liang, Kan
@ 2025-06-18 9:35 ` Peter Zijlstra
2025-06-18 10:10 ` Liang, Kan
0 siblings, 1 reply; 60+ messages in thread
From: Peter Zijlstra @ 2025-06-18 9:35 UTC (permalink / raw)
To: Liang, Kan
Cc: Mark Rutland, Mi, Dapeng, mingo, acme, namhyung, tglx,
dave.hansen, irogers, adrian.hunter, jolsa, alexander.shishkin,
linux-kernel, ak, zide.chen, broonie
On Tue, Jun 17, 2025 at 04:32:24PM -0400, Liang, Kan wrote:
> > Yep, those options may work for us, but we'd need to think harder about
> > it. Our approach for ptrace and signals has been to have a header and
> > pack at the active vector length, so padding to a max width would be
> > different, but maybe it's fine.
> >
> > Having another representation feels like a recipe waiting to happen.
> >
>
> I'd like to make sure I understand correctly.
> If we'd like an explicit predicate register word, the below change in
> struct perf_event_attr is OK for ARM as well, right?
>
> __u16 sample_simd_pred_reg_words;
> __u16 sample_simd_pred_reg_intr;
> __u16 sample_simd_pred_reg_user;
> __u16 sample_simd_reg_words;
> __u64 sample_simd_reg_intr;
> __u64 sample_simd_reg_user;
>
> BTW: would that be easier for ARM if changing the _words to _type?
> You may define some types like, stream_sve, n_stream_sve, etc.
> The output will depend on the types, rather than the max length of
> registers.
I'm thinking what they're after is something like:
PERF_SAMPLE_SIMD_REGS := {
u16 nr_vectors;
u16 vector_length;
u16 nr_pred;
u16 pred_length;
u64 data[];
}
Where the output data also has a length. Such that even if we ask for
512 bit vectors, the thing is allowed to respond with say 128 bit
vectors if that is all the machine has at that time.
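(A consumer could then size the payload from the header itself, e.g.,
assuming the lengths are counted in u64 words:
	size_t words = (size_t)nr_vectors * vector_length +
		       (size_t)nr_pred * pred_length;
and read that many u64s from data[].)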
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 06/12] perf: Support extension of sample_regs
2025-06-18 9:35 ` Peter Zijlstra
@ 2025-06-18 10:10 ` Liang, Kan
2025-06-18 13:30 ` Peter Zijlstra
0 siblings, 1 reply; 60+ messages in thread
From: Liang, Kan @ 2025-06-18 10:10 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Mark Rutland, Mi, Dapeng, mingo, acme, namhyung, tglx,
dave.hansen, irogers, adrian.hunter, jolsa, alexander.shishkin,
linux-kernel, ak, zide.chen, broonie
On 2025-06-18 5:35 a.m., Peter Zijlstra wrote:
> On Tue, Jun 17, 2025 at 04:32:24PM -0400, Liang, Kan wrote:
>
>>> Yep, those options may work for us, but we'd need to think harder about
>>> it. Our approach for ptrace and signals has been to have a header and
>>> pack at the active vector length, so padding to a max width would be
>>> different, but maybe it's fine.
>>>
>>> Having another representation feels like a recipe waiting to happen.
>>>
>>
>> I'd like to make sure I understand correctly.
>> If we'd like an explicit predicate register word, the below change in
>> struct perf_event_attr is OK for ARM as well, right?
>>
>> __u16 sample_simd_pred_reg_words;
>> __u16 sample_simd_pred_reg_intr;
>> __u16 sample_simd_pred_reg_user;
>> __u16 sample_simd_reg_words;
>> __u64 sample_simd_reg_intr;
>> __u64 sample_simd_reg_user;
>>
>> BTW: would that be easier for ARM if changing the _words to _type?
>> You may define some types like, stream_sve, n_stream_sve, etc.
>> The output will depend on the types, rather than the max length of
>> registers.
>
> I'm thinking what they're after is something like:
>
> PERF_SAMPLE_SIMD_REGS := {
> u16 nr_vectors;
> u16 vector_length;
> u16 nr_pred;
> u16 pred_length;
> u64 data[];
> }
Maybe we should use a mask to replace the nr_vectors, because Dave
mentioned that XSAVES may fail.
Currently, perf gives all 0s for the failing case, but 0 can also be a
valid value.
The mask can tell the tool that some regs failed to be collected, so
the tool can give proper feedback to the end user.
PERF_SAMPLE_SIMD_REGS := {
	u64	vectors_mask;
	u16	vector_length;
	u64	pred_mask;
	u16	pred_length;
	u64	data[];
}
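[Editor's sketch of the consumer-side distinction this buys, assuming
bit i of vectors_mask means vector i was actually collected and is
present in data[].]

#include <stdbool.h>
#include <stdint.h>

/*
 * False means the kernel failed to collect this vector; an all-zero
 * value in data[] would otherwise be indistinguishable from that.
 */
static bool vector_collected(uint64_t vectors_mask, int idx)
{
	return vectors_mask & (1ULL << idx);
}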
Thanks,
Kan
>
> Where the output data also has a length. Such that even if we ask for
> 512 bit vectors, the thing is allowed to respond with say 128 bit
> vectors if that is all the machine has at that time.
>
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 06/12] perf: Support extension of sample_regs
2025-06-18 10:10 ` Liang, Kan
@ 2025-06-18 13:30 ` Peter Zijlstra
2025-06-18 13:52 ` Liang, Kan
0 siblings, 1 reply; 60+ messages in thread
From: Peter Zijlstra @ 2025-06-18 13:30 UTC (permalink / raw)
To: Liang, Kan
Cc: Mark Rutland, Mi, Dapeng, mingo, acme, namhyung, tglx,
dave.hansen, irogers, adrian.hunter, jolsa, alexander.shishkin,
linux-kernel, ak, zide.chen, broonie
On Wed, Jun 18, 2025 at 06:10:20AM -0400, Liang, Kan wrote:
> Maybe we should use a mask to replace the nr_vectors.
> Because Dave mentioned that the XSAVES may fail.
XSAVE is a pain in the arse :/
> PERF_SAMPLE_SIMD_REGS := {
> u64 vectors_mask;
> u16 vector_length;
> u64 pred_mask;
> u16 pred_length;
That is not u64 aligned...
> u64 data[];
> }
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 06/12] perf: Support extension of sample_regs
2025-06-18 13:30 ` Peter Zijlstra
@ 2025-06-18 13:52 ` Liang, Kan
2025-06-18 14:30 ` Dave Hansen
2025-06-18 14:45 ` Peter Zijlstra
0 siblings, 2 replies; 60+ messages in thread
From: Liang, Kan @ 2025-06-18 13:52 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Mark Rutland, Mi, Dapeng, mingo, acme, namhyung, tglx,
dave.hansen, irogers, adrian.hunter, jolsa, alexander.shishkin,
linux-kernel, ak, zide.chen, broonie
On 2025-06-18 9:30 a.m., Peter Zijlstra wrote:
> On Wed, Jun 18, 2025 at 06:10:20AM -0400, Liang, Kan wrote:
>
>> Maybe we should use a mask to replace the nr_vectors.
>> Because Dave mentioned that the XSAVES may fail.
>
> XSAVE is a pain in the arse :/
>
>> PERF_SAMPLE_SIMD_REGS := {
>> u64 vectors_mask;
>> u16 vector_length;
>> u64 pred_mask;
>> u16 pred_length;
>
> That is not u64 aligned...
I didn't know we had an alignment requirement for the output.
If so,
PERF_SAMPLE_SIMD_REGS := {
	u64	vectors_mask;
	u64	pred_mask;
	u64	vector_length:16,
		pred_length:16,
		reserved:32;
	u64	data[];
}
Thanks,
Kan
>
>> u64 data[];
>> }
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 06/12] perf: Support extension of sample_regs
2025-06-18 13:52 ` Liang, Kan
@ 2025-06-18 14:30 ` Dave Hansen
2025-06-18 14:47 ` Dave Hansen
2025-06-18 14:45 ` Peter Zijlstra
1 sibling, 1 reply; 60+ messages in thread
From: Dave Hansen @ 2025-06-18 14:30 UTC (permalink / raw)
To: Liang, Kan, Peter Zijlstra
Cc: Mark Rutland, Mi, Dapeng, mingo, acme, namhyung, tglx,
dave.hansen, irogers, adrian.hunter, jolsa, alexander.shishkin,
linux-kernel, ak, zide.chen, broonie
On 6/18/25 06:52, Liang, Kan wrote:
> On 2025-06-18 9:30 a.m., Peter Zijlstra wrote:
>> On Wed, Jun 18, 2025 at 06:10:20AM -0400, Liang, Kan wrote:
>>
>>> Maybe we should use a mask to replace the nr_vectors.
>>> Because Dave mentioned that the XSAVES may fail.
>> XSAVE is a pain in the arse :/
>>
>>> PERF_SAMPLE_SIMD_REGS := {
>>> u64 vectors_mask;
>>> u16 vector_length;
>>> u64 pred_mask;
>>> u16 pred_length;
>> That is not u64 aligned...
> I didn't know we have the alignment requirement for the output.
> If so,
>
> PERF_SAMPLE_SIMD_REGS := {
> u64 vectors_mask;
> u64 pred_mask;
> u64 vector_length:16,
> pred_length:16,
> reserved:32;
> u64 data[];
> }
There are three different in-memory register layouts that are in play:
* The "sane" format that, for instance, packs all of the bytes of ZMM0
in memory next to each other, like you've been talking about in the
thread.
* The PEBS XER Record Format. There's a 16-byte header before the real
registers start. The registers have an XSAVE-style split where (for
instance) ZMM0 is in three pieces.
* The XSAVE{,C,S,OPT} format. There are 160 bytes of "x87 state" gunk at
the beginning that's not read or written, then XMM[0-15], then 112
bytes of space, then X{STATE,COMP}_BV, a 48-byte gap, then the AVX
state. There's a bunch of space in the first 576 bytes.
XSAVE can't write the first two formats at *all*, although the PEBS and
XSAVE formats are the same for AVX and later.
So one of the immediate questions is whether we want to expose the XSAVE
format as part of the perf ABI. I'm _assuming_ that the PEBS format is
going to be exposed to userspace, so should we expose XSAVE or munge it
into one of the other two formats?
If software is going to munge the XSAVE format, then you don't have to
worry about alignment because you'd save it to some probably per-cpu
64-byte-aligned buffer and then munge it into the unaligned
PERF_SAMPLE_SIMD_REGS above.
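[Editor's sketch: to make the compacted-format bookkeeping concrete,
here is a simplified derivation of a component's offset in that format.
It assumes xcomp_bv has the target feature's bit set; xstate_sizes[]
and xstate_aligned[] stand in for the CPUID-derived tables
(CPUID.(EAX=0DH,ECX=i).EBX and ECX[1]) that the kernel keeps.]

#include <stdbool.h>
#include <stdint.h>

#define FXSAVE_SIZE	512
#define XSAVE_HDR_SIZE	64
#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((uint64_t)(a) - 1))

extern uint32_t xstate_sizes[64];	/* per-component sizes */
extern bool xstate_aligned[64];		/* per-component align-64 flags */

static uint64_t compacted_offset(uint64_t xcomp_bv, int xfeature_nr)
{
	/* the extended region starts right after legacy area + header */
	uint64_t offset = FXSAVE_SIZE + XSAVE_HDR_SIZE;

	/* features 0 and 1 (x87, SSE) live in the legacy area */
	for (int i = 2; i < xfeature_nr; i++) {
		if (!(xcomp_bv & (1ULL << i)))
			continue;
		if (xstate_aligned[i])
			offset = ALIGN_UP(offset, 64);
		offset += xstate_sizes[i];
	}
	if (xstate_aligned[xfeature_nr])
		offset = ALIGN_UP(offset, 64);

	return offset;
}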
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 06/12] perf: Support extension of sample_regs
2025-06-18 14:30 ` Dave Hansen
@ 2025-06-18 14:47 ` Dave Hansen
2025-06-18 15:24 ` Liang, Kan
0 siblings, 1 reply; 60+ messages in thread
From: Dave Hansen @ 2025-06-18 14:47 UTC (permalink / raw)
To: Liang, Kan, Peter Zijlstra
Cc: Mark Rutland, Mi, Dapeng, mingo, acme, namhyung, tglx,
dave.hansen, irogers, adrian.hunter, jolsa, alexander.shishkin,
linux-kernel, ak, zide.chen, broonie
On 6/18/25 07:30, Dave Hansen wrote:
> If software is going to munge the XSAVE format, then you don't have to
> worry about alignment because you'd save it to some probably per-cpu
> 64-byte-aligned buffer and then munge it into the unaligned
> PERF_SAMPLE_SIMD_REGS above.
BTW, Peter just mentioned that you're just going to munge both the XSAVE
and PEBS formats before letting userspace see them. Big ack from me on
that, fwiw.
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 06/12] perf: Support extension of sample_regs
2025-06-18 14:47 ` Dave Hansen
@ 2025-06-18 15:24 ` Liang, Kan
0 siblings, 0 replies; 60+ messages in thread
From: Liang, Kan @ 2025-06-18 15:24 UTC (permalink / raw)
To: Dave Hansen, Peter Zijlstra
Cc: Mark Rutland, Mi, Dapeng, mingo, acme, namhyung, tglx,
dave.hansen, irogers, adrian.hunter, jolsa, alexander.shishkin,
linux-kernel, ak, zide.chen, broonie
On 2025-06-18 10:47 a.m., Dave Hansen wrote:
> On 6/18/25 07:30, Dave Hansen wrote:
>> If software is going to munge the XSAVE format, then you don't have to
>> worry about alignment because you'd save it to some probably per-cpu
>> 64-byte-aligned buffer and then munge it into the unaligned
>> PERF_SAMPLE_SIMD_REGS above.
>
> BTW, Peter just mentioned that you're just going to munge both the XSAVE
> and PEBS formats before letting userspace see them.
Right, there is no plan to export the XSAVE formats directly to
userspace.
> Big ack from me on
> that, fwiw.
Thanks. :)
Kan
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 06/12] perf: Support extension of sample_regs
2025-06-18 13:52 ` Liang, Kan
2025-06-18 14:30 ` Dave Hansen
@ 2025-06-18 14:45 ` Peter Zijlstra
2025-06-18 15:22 ` Liang, Kan
1 sibling, 1 reply; 60+ messages in thread
From: Peter Zijlstra @ 2025-06-18 14:45 UTC (permalink / raw)
To: Liang, Kan
Cc: Mark Rutland, Mi, Dapeng, mingo, acme, namhyung, tglx,
dave.hansen, irogers, adrian.hunter, jolsa, alexander.shishkin,
linux-kernel, ak, zide.chen, broonie
On Wed, Jun 18, 2025 at 09:52:12AM -0400, Liang, Kan wrote:
> I didn't know we have the alignment requirement for the output.
Perf buffer is in u64 units.
> If so,
>
> PERF_SAMPLE_SIMD_REGS := {
> u64 vectors_mask;
> u64 pred_mask;
> u64 vector_length:16,
> pred_length:16,
> reserved:32;
> u64 data[];
> }
I really don't think we need this complication; XSAVE is a real pain in
the arse, but I think we have enough bits in XSTATE_BV to tell what is
what.
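[Editor's note: as a concrete illustration, XSAVES clears XSTATE_BV
bit i when component i was in its init state, so a consumer can already
tell "not written" apart from "written as zero". A minimal sketch,
assuming the standard struct xstate_header layout.]

#include <linux/bits.h>
#include <linux/types.h>
#include <asm/fpu/types.h>

/* Component 'nr' carries real data only if its XSTATE_BV bit is set. */
static bool xcomp_written(const struct xstate_header *hdr, int nr)
{
	return hdr->xfeatures & BIT_ULL(nr);
}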
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 06/12] perf: Support extension of sample_regs
2025-06-18 14:45 ` Peter Zijlstra
@ 2025-06-18 15:22 ` Liang, Kan
0 siblings, 0 replies; 60+ messages in thread
From: Liang, Kan @ 2025-06-18 15:22 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Mark Rutland, Mi, Dapeng, mingo, acme, namhyung, tglx,
dave.hansen, irogers, adrian.hunter, jolsa, alexander.shishkin,
linux-kernel, ak, zide.chen, broonie
On 2025-06-18 10:45 a.m., Peter Zijlstra wrote:
> On Wed, Jun 18, 2025 at 09:52:12AM -0400, Liang, Kan wrote:
>
>> I didn't know we have the alignment requirement for the output.
>
> Perf buffer is in u64 units.
Sure.
>
>> If so,
>>
>> PERF_SAMPLE_SIMD_REGS := {
>> u64 vectors_mask;
>> u64 pred_mask;
>> u64 vector_length:16,
>> pred_length:16,
>> reserved:32;
>> u64 data[];
>> }
>
> I really don't think we need this complication; XSAVE is a real pain in
> the arse, but I think we have enough bits in XSTATE_BV to tell what is
> what.
I just realized that XSAVE handles the vector state as a whole.
It's impossible for XMM0 to be retrieved while XMM1 fails via XSAVE.
I will use the suggested output format.
PERF_SAMPLE_SIMD_REGS := {
	u16	nr_vectors;
	u16	vector_length;
	u16	nr_pred;
	u16	pred_length;
	u64	data[];
}
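[Editor's sketch: one consumer-side consequence of Peter's point above,
that the kernel may answer with a smaller vector_length than was
requested, is that tools should size reads from the record header, not
from the request.]

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * Copy vector 'idx' into a caller buffer of req_words u64s,
 * zero-filling when the record carries fewer words than requested.
 */
static void read_vector(const uint64_t *data, uint16_t rec_words,
			int idx, uint64_t *out, int req_words)
{
	int n = rec_words < req_words ? rec_words : req_words;

	memset(out, 0, req_words * sizeof(*out));
	memcpy(out, &data[(size_t)idx * rec_words], n * sizeof(*out));
}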
Thanks,
Kan
^ permalink raw reply [flat|nested] 60+ messages in thread
* [RFC PATCH 07/12] perf/x86: Add YMMH in extended regs
2025-06-13 13:49 [RFC PATCH 00/12] Support vector and more extended registers in perf kan.liang
` (5 preceding siblings ...)
2025-06-13 13:49 ` [RFC PATCH 06/12] perf: Support extension of sample_regs kan.liang
@ 2025-06-13 13:49 ` kan.liang
2025-06-13 15:48 ` Dave Hansen
2025-06-13 13:49 ` [RFC PATCH 08/12] perf/x86: Add APX " kan.liang
` (6 subsequent siblings)
13 siblings, 1 reply; 60+ messages in thread
From: kan.liang @ 2025-06-13 13:49 UTC (permalink / raw)
To: peterz, mingo, acme, namhyung, tglx, dave.hansen, irogers,
adrian.hunter, jolsa, alexander.shishkin, linux-kernel
Cc: dapeng1.mi, ak, zide.chen, Kan Liang
From: Kan Liang <kan.liang@linux.intel.com>
Support YMMH as extended registers. They can be configured in
sample_ext_regs_intr/user.
Only a PMU with PERF_PMU_CAP_EXTENDED_REGS2 supports the feature.
The values can be retrieved via XSAVES.
Add a sanity check in perf_reg_validate().
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
arch/x86/events/core.c | 26 ++++++++++++++
arch/x86/events/perf_event.h | 22 ++++++++++++
arch/x86/include/asm/perf_event.h | 1 +
arch/x86/include/uapi/asm/perf_regs.h | 28 +++++++++++++++
arch/x86/kernel/perf_regs.c | 49 +++++++++++++++++++++++++--
5 files changed, 124 insertions(+), 2 deletions(-)
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 6b1c347cc17a..91039c0256b3 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -422,6 +422,14 @@ static void x86_pmu_get_ext_regs(struct x86_perf_regs *perf_regs, u64 mask)
xcomp_bv = xregs_xsave->header.xcomp_bv;
if (mask & XFEATURE_MASK_SSE && xcomp_bv & XFEATURE_SSE)
perf_regs->xmm_regs = (u64 *)xregs_xsave->i387.xmm_space;
+
+ xsave += FXSAVE_SIZE + XSAVE_HDR_SIZE;
+
+ /* The XSAVES instruction always uses the compacted format */
+ if (mask & XFEATURE_MASK_YMM && xcomp_bv & XFEATURE_MASK_YMM) {
+ perf_regs->ymmh_regs = xsave;
+ xsave += XSAVE_YMM_SIZE;
+ }
}
static void release_ext_regs_buffers(void)
@@ -447,6 +455,9 @@ static void reserve_ext_regs_buffers(void)
size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+ if (x86_pmu.ext_regs_mask & BIT_ULL(X86_EXT_REGS_YMM))
+ size += XSAVE_YMM_SIZE;
+
/* XSAVE feature requires 64-byte alignment. */
size += 64;
@@ -712,6 +723,13 @@ int x86_pmu_hw_config(struct perf_event *event)
if (!(x86_pmu.ext_regs_mask & BIT_ULL(X86_EXT_REGS_XMM)))
return -EINVAL;
}
+ if (event_has_extended_regs2(event)) {
+ if (!(event->pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS2))
+ return -EINVAL;
+ if (x86_pmu_get_event_num_ext_regs(event, X86_EXT_REGS_YMM) &&
+ !(x86_pmu.ext_regs_mask & BIT_ULL(X86_EXT_REGS_YMM)))
+ return -EINVAL;
+ }
}
return x86_setup_perfctr(event);
}
@@ -1765,6 +1783,7 @@ void x86_pmu_setup_regs_data(struct perf_event *event,
struct x86_perf_regs *perf_regs = container_of(regs, struct x86_perf_regs, regs);
u64 sample_type = event->attr.sample_type;
u64 mask = 0;
+ int num;
if (!(event->attr.sample_type & (PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)))
return;
@@ -1799,6 +1818,13 @@ void x86_pmu_setup_regs_data(struct perf_event *event,
mask |= XFEATURE_MASK_SSE;
}
+ num = x86_pmu_get_event_num_ext_regs(event, X86_EXT_REGS_YMM);
+ if (num) {
+ perf_regs->ymmh_regs = NULL;
+ mask |= XFEATURE_MASK_YMM;
+ data->dyn_size += num * PERF_X86_EXT_REG_YMMH_SIZE * sizeof(u64);
+ }
+
mask &= ~ignore_mask;
if (mask)
x86_pmu_get_ext_regs(perf_regs, mask);
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index b48f4215f37c..911916bc8e36 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -689,6 +689,7 @@ enum {
enum {
X86_EXT_REGS_XMM = 0,
+ X86_EXT_REGS_YMM,
};
#define PERF_PEBS_DATA_SOURCE_MAX 0x100
@@ -1319,6 +1320,27 @@ static inline u64 x86_pmu_get_event_config(struct perf_event *event)
return event->attr.config & hybrid(event->pmu, config_mask);
}
+static inline int get_num_ext_regs(u64 *ext_regs, unsigned int type)
+{
+ u64 mask;
+
+ switch (type) {
+ case X86_EXT_REGS_YMM:
+ mask = GENMASK_ULL(PERF_REG_X86_YMMH15, PERF_REG_X86_YMMH0);
+ return hweight64(ext_regs[0] & mask);
+ default:
+ return 0;
+ }
+ return 0;
+}
+
+static inline int x86_pmu_get_event_num_ext_regs(struct perf_event *event,
+ unsigned int type)
+{
+ return get_num_ext_regs(event->attr.sample_ext_regs_intr, type) +
+ get_num_ext_regs(event->attr.sample_ext_regs_user, type);
+}
+
extern struct event_constraint emptyconstraint;
extern struct event_constraint unconstrained;
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 70d1d94aca7e..c30571f4de26 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -593,6 +593,7 @@ struct pt_regs;
struct x86_perf_regs {
struct pt_regs regs;
u64 *xmm_regs;
+ u64 *ymmh_regs;
};
extern unsigned long perf_arch_instruction_pointer(struct pt_regs *regs);
diff --git a/arch/x86/include/uapi/asm/perf_regs.h b/arch/x86/include/uapi/asm/perf_regs.h
index 7c9d2bb3833b..f37644513e33 100644
--- a/arch/x86/include/uapi/asm/perf_regs.h
+++ b/arch/x86/include/uapi/asm/perf_regs.h
@@ -55,4 +55,32 @@ enum perf_event_x86_regs {
#define PERF_REG_EXTENDED_MASK (~((1ULL << PERF_REG_X86_XMM0) - 1))
+enum perf_event_x86_ext_regs {
+ /* YMMH Registers */
+ PERF_REG_X86_YMMH0 = 0,
+ PERF_REG_X86_YMMH1,
+ PERF_REG_X86_YMMH2,
+ PERF_REG_X86_YMMH3,
+ PERF_REG_X86_YMMH4,
+ PERF_REG_X86_YMMH5,
+ PERF_REG_X86_YMMH6,
+ PERF_REG_X86_YMMH7,
+ PERF_REG_X86_YMMH8,
+ PERF_REG_X86_YMMH9,
+ PERF_REG_X86_YMMH10,
+ PERF_REG_X86_YMMH11,
+ PERF_REG_X86_YMMH12,
+ PERF_REG_X86_YMMH13,
+ PERF_REG_X86_YMMH14,
+ PERF_REG_X86_YMMH15,
+
+ PERF_REG_X86_EXT_REGS_MAX = PERF_REG_X86_YMMH15,
+};
+
+enum perf_event_x86_ext_reg_size {
+ PERF_X86_EXT_REG_YMMH_SIZE = 2,
+
+ /* max of PERF_REG_X86_XXX_SIZE */
+ PERF_X86_EXT_REG_SIZE_MAX = PERF_X86_EXT_REG_YMMH_SIZE,
+};
#endif /* _ASM_X86_PERF_REGS_H */
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
index b9d5106afc26..f12ef60a1a8a 100644
--- a/arch/x86/kernel/perf_regs.c
+++ b/arch/x86/kernel/perf_regs.c
@@ -57,10 +57,46 @@ static unsigned int pt_regs_offset[PERF_REG_X86_MAX] = {
#endif
};
+static_assert(PERF_REG_X86_EXT_REGS_MAX < PERF_ATTR_EXT_REGS_SIZE * 64);
+static_assert(PERF_X86_EXT_REG_SIZE_MAX <= PERF_EXT_REGS_SIZE_MAX);
+
+static inline u64 __perf_ext_reg_value(u64 *ext, int *ext_size,
+ int idx, u64 *regs, int size)
+{
+ if (!regs)
+ return 0;
+ memcpy(ext, ®s[idx * size], sizeof(u64) * size);
+ *ext_size = size;
+ return ext[0];
+}
+
+static u64 perf_ext_reg_value(struct pt_regs *regs, int idx,
+ u64 *ext, int *ext_size)
+{
+ struct x86_perf_regs *perf_regs;
+
+ perf_regs = container_of(regs, struct x86_perf_regs, regs);
+ switch (idx) {
+ case PERF_REG_X86_YMMH0 ... PERF_REG_X86_YMMH15:
+ return __perf_ext_reg_value(ext, ext_size,
+ idx - PERF_REG_X86_YMMH0,
+ perf_regs->ymmh_regs,
+ PERF_X86_EXT_REG_YMMH_SIZE);
+ default:
+ WARN_ON_ONCE(1);
+ *ext_size = 0;
+ break;
+ }
+ return 0;
+}
+
u64 perf_reg_value(struct pt_regs *regs, int idx, u64 *ext, int *ext_size)
{
struct x86_perf_regs *perf_regs;
+ if (ext && ext_size)
+ return perf_ext_reg_value(regs, idx, ext, ext_size);
+
if (WARN_ON_ONCE(ext || ext_size))
return 0;
@@ -117,13 +153,22 @@ void perf_get_regs_user(struct perf_regs *regs_user,
(1ULL << PERF_REG_X86_FS) | \
(1ULL << PERF_REG_X86_GS))
+static_assert (PERF_ATTR_EXT_REGS_SIZE == 2);
+
int perf_reg_validate(u64 mask, u64 *mask_ext)
{
- if (mask_ext)
+ if (!mask && !mask_ext)
return -EINVAL;
- if (!mask || (mask & (REG_NOSUPPORT | PERF_REG_X86_RESERVED)))
+ if (mask && (mask & (REG_NOSUPPORT | PERF_REG_X86_RESERVED)))
return -EINVAL;
+ if (mask_ext) {
+ int h = mask_ext[1] ? fls64(mask_ext[1]) + 64 : fls64(mask_ext[0]);
+
+ if (h > PERF_REG_X86_EXT_REGS_MAX + 1)
+ return -EINVAL;
+ }
+
return 0;
}
--
2.38.1
^ permalink raw reply related [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 07/12] perf/x86: Add YMMH in extended regs
2025-06-13 13:49 ` [RFC PATCH 07/12] perf/x86: Add YMMH in extended regs kan.liang
@ 2025-06-13 15:48 ` Dave Hansen
0 siblings, 0 replies; 60+ messages in thread
From: Dave Hansen @ 2025-06-13 15:48 UTC (permalink / raw)
To: kan.liang, peterz, mingo, acme, namhyung, tglx, dave.hansen,
irogers, adrian.hunter, jolsa, alexander.shishkin, linux-kernel
Cc: dapeng1.mi, ak, zide.chen
On 6/13/25 06:49, kan.liang@linux.intel.com wrote:
> size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
>
> + if (x86_pmu.ext_regs_mask & BIT_ULL(X86_EXT_REGS_YMM))
> + size += XSAVE_YMM_SIZE;
Sorry, but this is probably buggy too:
> The value returned by ECX[1] indicates the alignment of state
> component i when the compacted format of the extended region of an
> XSAVE area is used (see Section 13.4.3).
You can't simply stack all the sizes on top of each other. You have to
align some of the components individually to a 64-byte boundary. It
works for the large components because they're multiples of 64 bytes
already.
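[Editor's sketch: in other words, the buffer sizing has to mirror the
compacted-format rule rather than summing raw component sizes. ALIGN is
the kernel macro; the 'aligned' flag would come from
CPUID.(EAX=0DH,ECX=i):ECX[1]. This illustrates the rule, it is not the
actual fix.]

#include <linux/align.h>
#include <linux/types.h>

/* Account for one enabled component in the per-CPU buffer size. */
static unsigned int add_component_size(unsigned int size,
				       unsigned int comp_size, bool aligned)
{
	if (aligned)
		size = ALIGN(size, 64);	/* compacted-format alignment rule */

	return size + comp_size;
}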
^ permalink raw reply [flat|nested] 60+ messages in thread
* [RFC PATCH 08/12] perf/x86: Add APX in extended regs
2025-06-13 13:49 [RFC PATCH 00/12] Support vector and more extended registers in perf kan.liang
` (6 preceding siblings ...)
2025-06-13 13:49 ` [RFC PATCH 07/12] perf/x86: Add YMMH in extended regs kan.liang
@ 2025-06-13 13:49 ` kan.liang
2025-06-13 16:02 ` Dave Hansen
2025-06-17 8:19 ` Peter Zijlstra
2025-06-13 13:49 ` [RFC PATCH 09/12] perf/x86: Add OPMASK " kan.liang
` (5 subsequent siblings)
13 siblings, 2 replies; 60+ messages in thread
From: kan.liang @ 2025-06-13 13:49 UTC (permalink / raw)
To: peterz, mingo, acme, namhyung, tglx, dave.hansen, irogers,
adrian.hunter, jolsa, alexander.shishkin, linux-kernel
Cc: dapeng1.mi, ak, zide.chen, Kan Liang
From: Kan Liang <kan.liang@linux.intel.com>
Support APX as extended registers. They can be configured in
sample_ext_regs_intr/user.
Only a PMU with PERF_PMU_CAP_EXTENDED_REGS2 supports the feature.
The values can be retrieved via XSAVES.
Define several macros to simplify the code.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
arch/x86/events/core.c | 48 +++++++++++++++++++--------
arch/x86/events/perf_event.h | 4 +++
arch/x86/include/asm/perf_event.h | 1 +
arch/x86/include/uapi/asm/perf_regs.h | 21 +++++++++++-
arch/x86/kernel/perf_regs.c | 5 +++
5 files changed, 65 insertions(+), 14 deletions(-)
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 91039c0256b3..67f62268f063 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -408,6 +408,14 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
static DEFINE_PER_CPU(void *, ext_regs_buf);
+#define __x86_pmu_get_regs(_mask, _regs, _size) \
+do { \
+ if (mask & _mask && xcomp_bv & _mask) { \
+ _regs = xsave; \
+ xsave += _size; \
+ } \
+} while (0)
+
static void x86_pmu_get_ext_regs(struct x86_perf_regs *perf_regs, u64 mask)
{
void *xsave = (void *)ALIGN((unsigned long)per_cpu(ext_regs_buf, smp_processor_id()), 64);
@@ -426,10 +434,8 @@ static void x86_pmu_get_ext_regs(struct x86_perf_regs *perf_regs, u64 mask)
xsave += FXSAVE_SIZE + XSAVE_HDR_SIZE;
/* The XSAVES instruction always uses the compacted format */
- if (mask & XFEATURE_MASK_YMM && xcomp_bv & XFEATURE_MASK_YMM) {
- perf_regs->ymmh_regs = xsave;
- xsave += XSAVE_YMM_SIZE;
- }
+ __x86_pmu_get_regs(XFEATURE_MASK_YMM, perf_regs->ymmh_regs, XSAVE_YMM_SIZE);
+ __x86_pmu_get_regs(XFEATURE_MASK_APX, perf_regs->apx_regs, sizeof(struct apx_state));
}
static void release_ext_regs_buffers(void)
@@ -457,6 +463,8 @@ static void reserve_ext_regs_buffers(void)
if (x86_pmu.ext_regs_mask & BIT_ULL(X86_EXT_REGS_YMM))
size += XSAVE_YMM_SIZE;
+ if (x86_pmu.ext_regs_mask & BIT_ULL(X86_EXT_REGS_APX))
+ size += sizeof(struct apx_state);
/* XSAVE feature requires 64-byte alignment. */
size += 64;
@@ -642,6 +650,13 @@ int x86_pmu_max_precise(void)
return precise;
}
+#define check_ext_regs(_type) \
+do { \
+ if (x86_pmu_get_event_num_ext_regs(event, _type) && \
+ !(x86_pmu.ext_regs_mask & BIT_ULL(_type))) \
+ return -EINVAL; \
+} while (0)
+
int x86_pmu_hw_config(struct perf_event *event)
{
if (event->attr.precise_ip) {
@@ -726,9 +741,8 @@ int x86_pmu_hw_config(struct perf_event *event)
if (event_has_extended_regs2(event)) {
if (!(event->pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS2))
return -EINVAL;
- if (x86_pmu_get_event_num_ext_regs(event, X86_EXT_REGS_YMM) &&
- !(x86_pmu.ext_regs_mask & BIT_ULL(X86_EXT_REGS_YMM)))
- return -EINVAL;
+ check_ext_regs(X86_EXT_REGS_YMM);
+ check_ext_regs(X86_EXT_REGS_APX);
}
}
return x86_setup_perfctr(event);
@@ -1775,6 +1789,16 @@ x86_pmu_perf_get_regs_user(struct perf_sample_data *data,
return x86_regs_user;
}
+#define init_ext_regs_data(_type, _regs, _mask, _size) \
+do { \
+ num = x86_pmu_get_event_num_ext_regs(event, _type); \
+ if (num) { \
+ _regs = NULL; \
+ mask |= _mask; \
+ data->dyn_size += num * _size * sizeof(u64); \
+ } \
+} while (0)
+
void x86_pmu_setup_regs_data(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs,
@@ -1818,12 +1842,10 @@ void x86_pmu_setup_regs_data(struct perf_event *event,
mask |= XFEATURE_MASK_SSE;
}
- num = x86_pmu_get_event_num_ext_regs(event, X86_EXT_REGS_YMM);
- if (num) {
- perf_regs->ymmh_regs = NULL;
- mask |= XFEATURE_MASK_YMM;
- data->dyn_size += num * PERF_X86_EXT_REG_YMMH_SIZE * sizeof(u64);
- }
+ init_ext_regs_data(X86_EXT_REGS_YMM, perf_regs->ymmh_regs,
+ XFEATURE_MASK_YMM, PERF_X86_EXT_REG_YMMH_SIZE);
+ init_ext_regs_data(X86_EXT_REGS_APX, perf_regs->apx_regs,
+ XFEATURE_MASK_APX, PERF_X86_EXT_REG_APX_SIZE);
mask &= ~ignore_mask;
if (mask)
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 911916bc8e36..1c40b5d9c025 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -690,6 +690,7 @@ enum {
enum {
X86_EXT_REGS_XMM = 0,
X86_EXT_REGS_YMM,
+ X86_EXT_REGS_APX,
};
#define PERF_PEBS_DATA_SOURCE_MAX 0x100
@@ -1328,6 +1329,9 @@ static inline int get_num_ext_regs(u64 *ext_regs, unsigned int type)
case X86_EXT_REGS_YMM:
mask = GENMASK_ULL(PERF_REG_X86_YMMH15, PERF_REG_X86_YMMH0);
return hweight64(ext_regs[0] & mask);
+ case X86_EXT_REGS_APX:
+ mask = GENMASK_ULL(PERF_REG_X86_R31, PERF_REG_X86_R16);
+ return hweight64(ext_regs[0] & mask);
default:
return 0;
}
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index c30571f4de26..9e4d60f3a9a2 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -594,6 +594,7 @@ struct x86_perf_regs {
struct pt_regs regs;
u64 *xmm_regs;
u64 *ymmh_regs;
+ u64 *apx_regs;
};
extern unsigned long perf_arch_instruction_pointer(struct pt_regs *regs);
diff --git a/arch/x86/include/uapi/asm/perf_regs.h b/arch/x86/include/uapi/asm/perf_regs.h
index f37644513e33..e23fb112faac 100644
--- a/arch/x86/include/uapi/asm/perf_regs.h
+++ b/arch/x86/include/uapi/asm/perf_regs.h
@@ -74,11 +74,30 @@ enum perf_event_x86_ext_regs {
PERF_REG_X86_YMMH14,
PERF_REG_X86_YMMH15,
- PERF_REG_X86_EXT_REGS_MAX = PERF_REG_X86_YMMH15,
+ /* APX Registers */
+ PERF_REG_X86_R16,
+ PERF_REG_X86_R17,
+ PERF_REG_X86_R18,
+ PERF_REG_X86_R19,
+ PERF_REG_X86_R20,
+ PERF_REG_X86_R21,
+ PERF_REG_X86_R22,
+ PERF_REG_X86_R23,
+ PERF_REG_X86_R24,
+ PERF_REG_X86_R25,
+ PERF_REG_X86_R26,
+ PERF_REG_X86_R27,
+ PERF_REG_X86_R28,
+ PERF_REG_X86_R29,
+ PERF_REG_X86_R30,
+ PERF_REG_X86_R31,
+
+ PERF_REG_X86_EXT_REGS_MAX = PERF_REG_X86_R31,
};
enum perf_event_x86_ext_reg_size {
PERF_X86_EXT_REG_YMMH_SIZE = 2,
+ PERF_X86_EXT_REG_APX_SIZE = 1,
/* max of PERF_REG_X86_XXX_SIZE */
PERF_X86_EXT_REG_SIZE_MAX = PERF_X86_EXT_REG_YMMH_SIZE,
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
index f12ef60a1a8a..518497bafdf0 100644
--- a/arch/x86/kernel/perf_regs.c
+++ b/arch/x86/kernel/perf_regs.c
@@ -82,6 +82,11 @@ static u64 perf_ext_reg_value(struct pt_regs *regs, int idx,
idx - PERF_REG_X86_YMMH0,
perf_regs->ymmh_regs,
PERF_X86_EXT_REG_YMMH_SIZE);
+ case PERF_REG_X86_R16 ... PERF_REG_X86_R31:
+ return __perf_ext_reg_value(ext, ext_size,
+ idx - PERF_REG_X86_R16,
+ perf_regs->apx_regs,
+ PERF_X86_EXT_REG_APX_SIZE);
default:
WARN_ON_ONCE(1);
*ext_size = 0;
--
2.38.1
^ permalink raw reply related [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 08/12] perf/x86: Add APX in extended regs
2025-06-13 13:49 ` [RFC PATCH 08/12] perf/x86: Add APX " kan.liang
@ 2025-06-13 16:02 ` Dave Hansen
2025-06-13 17:17 ` Liang, Kan
2025-06-17 8:19 ` Peter Zijlstra
1 sibling, 1 reply; 60+ messages in thread
From: Dave Hansen @ 2025-06-13 16:02 UTC (permalink / raw)
To: kan.liang, peterz, mingo, acme, namhyung, tglx, dave.hansen,
irogers, adrian.hunter, jolsa, alexander.shishkin, linux-kernel
Cc: dapeng1.mi, ak, zide.chen
On 6/13/25 06:49, kan.liang@linux.intel.com wrote:
> +#define __x86_pmu_get_regs(_mask, _regs, _size) \
> +do { \
> + if (mask & _mask && xcomp_bv & _mask) { \
> + _regs = xsave; \
> + xsave += _size; \
> + } \
> +} while (0)
Ewww.
First of all, this doesn't work generally because of the previously
mentioned alignment. Second, it's using xcomp_bv which doesn't tell you
if XSAVES wrote the data.
Last, this attempts to reimplement get_xsave_addr().
I'd do something like this:
for (xfeature_nr in mask) {
	void *src = get_xsave_addr(xsave, xfeature_nr);
	void *dst = ... a function to map XFEATURE_MASK_APX
		    to perf_regs->apx_regs,
	int size = xstate_sizes(xfeature_nr);

	if (!src)
		continue;

	memcpy(dst, src, size);
}
That should handle *all* of the nastiness. The alignment, the init
optimization. *Please* use get_xsave_addr() or one of the other helpers.
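[Editor's sketch, fleshing out the loop above. get_xsave_addr() is the
existing kernel helper and returns NULL for init-state components, so
the "XSAVES didn't write it" case falls out naturally; perf_regs_dst()
and xstate_size_of() are hypothetical stand-ins for the dst mapping and
the kernel's per-feature size table, and the FPU internals that declare
get_xsave_addr() are assumed visible here.]

static void copy_ext_regs(struct xregs_state *xsave,
			  struct x86_perf_regs *perf_regs, u64 mask)
{
	unsigned long bits = mask;
	int xfeature_nr;

	for_each_set_bit(xfeature_nr, &bits, 64) {
		/* NULL when the component is in its init state */
		void *src = get_xsave_addr(xsave, xfeature_nr);
		/* hypothetical mapping helper, not an existing API */
		void *dst = perf_regs_dst(perf_regs, xfeature_nr);
		/* hypothetical accessor for the per-feature size table */
		int size = xstate_size_of(xfeature_nr);

		if (!src || !dst)
			continue;

		memcpy(dst, src, size);
	}
}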
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 08/12] perf/x86: Add APX in extended regs
2025-06-13 16:02 ` Dave Hansen
@ 2025-06-13 17:17 ` Liang, Kan
0 siblings, 0 replies; 60+ messages in thread
From: Liang, Kan @ 2025-06-13 17:17 UTC (permalink / raw)
To: Dave Hansen, peterz, mingo, acme, namhyung, tglx, dave.hansen,
irogers, adrian.hunter, jolsa, alexander.shishkin, linux-kernel
Cc: dapeng1.mi, ak, zide.chen
On 2025-06-13 12:02 p.m., Dave Hansen wrote:
> On 6/13/25 06:49, kan.liang@linux.intel.com wrote:
>> +#define __x86_pmu_get_regs(_mask, _regs, _size) \
>> +do { \
>> + if (mask & _mask && xcomp_bv & _mask) { \
>> + _regs = xsave; \
>> + xsave += _size; \
>> + } \
>> +} while (0)
>
> Ewww.
>
> First of all, this doesn't work generally because of the previously
> mentioned alignment. Second, it's using xcomp_bv which doesn't tell you
> if XSAVES wrote the data.
>
> Last, this attempts to reimplement get_xsave_addr().
>
> I'd do something like this:
>
> for (xfeature_nr in mask) {
> void *src = get_xsave_addr(xsave, xfeature_nr);
> void *dst = ... a function to map XFEATURE_MASK_APX
> to perf_regs->apx_regs,
> int size = xstate_sizes(xfeature_nr);
>
> if (!src)
> continue;
>
> memcpy(dst, src, size);
The data will eventually be copied to a buffer which is shared with the
user-space tool. I probably have to avoid the memcpy and do it later,
when outputting the sample.
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/events/core.c#n7389
> }
>
> That should handle *all* of the nastiness. The alignment, the init
> optimization. *Please* use get_xsave_addr() or one of the other helpers.
Thanks, I will try to reuse the existing fpu functions as much as I can.
Thanks,
Kan
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 08/12] perf/x86: Add APX in extended regs
2025-06-13 13:49 ` [RFC PATCH 08/12] perf/x86: Add APX " kan.liang
2025-06-13 16:02 ` Dave Hansen
@ 2025-06-17 8:19 ` Peter Zijlstra
1 sibling, 0 replies; 60+ messages in thread
From: Peter Zijlstra @ 2025-06-17 8:19 UTC (permalink / raw)
To: kan.liang
Cc: mingo, acme, namhyung, tglx, dave.hansen, irogers, adrian.hunter,
jolsa, alexander.shishkin, linux-kernel, dapeng1.mi, ak,
zide.chen
On Fri, Jun 13, 2025 at 06:49:39AM -0700, kan.liang@linux.intel.com wrote:
> From: Kan Liang <kan.liang@linux.intel.com>
>
> Support APX as the extended registers. It can be configured in the
> sample_ext_regs_intr/user.
>
> Only the PMU with PERF_PMU_CAP_EXTENDED_REGS2 supports the feature.
> The value can be retrieved via the XSAVES.
>
> Define several macros to simplify the code.
>
> Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Yuck.
How about reclaiming the current horrible XMM register space when the
new sample_simd_reg_words != 0?
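[Editor's sketch of what such a rule might look like at validation
time. sample_simd_reg_words is the field proposed elsewhere in this
thread, not a merged ABI; only PERF_REG_EXTENDED_MASK is existing code.]

/*
 * Sketch only: when the new SIMD interface is active, the bit space at
 * and above PERF_REG_X86_XMM0 in sample_regs_* no longer means
 * XMM0..XMM15 and can be reused for scalar state such as the APX GPRs
 * (R16-R31) and SSP.
 */
static u64 reclaimed_scalar_regs(const struct perf_event_attr *attr,
				 u64 sample_regs)
{
	if (!attr->sample_simd_reg_words)
		return 0;	/* legacy meaning: these bits are XMM */

	return sample_regs & PERF_REG_EXTENDED_MASK;
}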
^ permalink raw reply [flat|nested] 60+ messages in thread
* [RFC PATCH 09/12] perf/x86: Add OPMASK in extended regs
2025-06-13 13:49 [RFC PATCH 00/12] Support vector and more extended registers in perf kan.liang
` (7 preceding siblings ...)
2025-06-13 13:49 ` [RFC PATCH 08/12] perf/x86: Add APX " kan.liang
@ 2025-06-13 13:49 ` kan.liang
2025-06-13 13:49 ` [RFC PATCH 10/12] perf/x86: Add ZMM " kan.liang
` (4 subsequent siblings)
13 siblings, 0 replies; 60+ messages in thread
From: kan.liang @ 2025-06-13 13:49 UTC (permalink / raw)
To: peterz, mingo, acme, namhyung, tglx, dave.hansen, irogers,
adrian.hunter, jolsa, alexander.shishkin, linux-kernel
Cc: dapeng1.mi, ak, zide.chen, Kan Liang
From: Kan Liang <kan.liang@linux.intel.com>
Support OPMASK as extended registers. They can be configured in
sample_ext_regs_intr/user.
Only a PMU with PERF_PMU_CAP_EXTENDED_REGS2 supports the feature.
The values can be retrieved via XSAVES.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
arch/x86/events/core.c | 6 ++++++
arch/x86/events/perf_event.h | 4 ++++
arch/x86/include/asm/perf_event.h | 1 +
arch/x86/include/uapi/asm/perf_regs.h | 13 ++++++++++++-
arch/x86/kernel/perf_regs.c | 5 +++++
5 files changed, 28 insertions(+), 1 deletion(-)
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 67f62268f063..741e6dfd50a5 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -436,6 +436,7 @@ static void x86_pmu_get_ext_regs(struct x86_perf_regs *perf_regs, u64 mask)
/* The XSAVES instruction always uses the compacted format */
__x86_pmu_get_regs(XFEATURE_MASK_YMM, perf_regs->ymmh_regs, XSAVE_YMM_SIZE);
__x86_pmu_get_regs(XFEATURE_MASK_APX, perf_regs->apx_regs, sizeof(struct apx_state));
+ __x86_pmu_get_regs(XFEATURE_MASK_OPMASK, perf_regs->opmask_regs, sizeof(struct avx_512_opmask_state));
}
static void release_ext_regs_buffers(void)
@@ -465,6 +466,8 @@ static void reserve_ext_regs_buffers(void)
size += XSAVE_YMM_SIZE;
if (x86_pmu.ext_regs_mask & BIT_ULL(X86_EXT_REGS_APX))
size += sizeof(struct apx_state);
+ if (x86_pmu.ext_regs_mask & BIT_ULL(X86_EXT_REGS_OPMASK))
+ size += sizeof(struct avx_512_opmask_state);
/* XSAVE feature requires 64-byte alignment. */
size += 64;
@@ -743,6 +746,7 @@ int x86_pmu_hw_config(struct perf_event *event)
return -EINVAL;
check_ext_regs(X86_EXT_REGS_YMM);
check_ext_regs(X86_EXT_REGS_APX);
+ check_ext_regs(X86_EXT_REGS_OPMASK);
}
}
return x86_setup_perfctr(event);
@@ -1846,6 +1850,8 @@ void x86_pmu_setup_regs_data(struct perf_event *event,
XFEATURE_MASK_YMM, PERF_X86_EXT_REG_YMMH_SIZE);
init_ext_regs_data(X86_EXT_REGS_APX, perf_regs->apx_regs,
XFEATURE_MASK_APX, PERF_X86_EXT_REG_APX_SIZE);
+ init_ext_regs_data(X86_EXT_REGS_OPMASK, perf_regs->opmask_regs,
+ XFEATURE_MASK_OPMASK, PERF_X86_EXT_REG_OPMASK_SIZE);
mask &= ~ignore_mask;
if (mask)
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 1c40b5d9c025..c2626dcea1a0 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -691,6 +691,7 @@ enum {
X86_EXT_REGS_XMM = 0,
X86_EXT_REGS_YMM,
X86_EXT_REGS_APX,
+ X86_EXT_REGS_OPMASK,
};
#define PERF_PEBS_DATA_SOURCE_MAX 0x100
@@ -1332,6 +1333,9 @@ static inline int get_num_ext_regs(u64 *ext_regs, unsigned int type)
case X86_EXT_REGS_APX:
mask = GENMASK_ULL(PERF_REG_X86_R31, PERF_REG_X86_R16);
return hweight64(ext_regs[0] & mask);
+ case X86_EXT_REGS_OPMASK:
+ mask = GENMASK_ULL(PERF_REG_X86_OPMASK7, PERF_REG_X86_OPMASK0);
+ return hweight64(ext_regs[0] & mask);
default:
return 0;
}
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 9e4d60f3a9a2..4e971f38ff94 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -595,6 +595,7 @@ struct x86_perf_regs {
u64 *xmm_regs;
u64 *ymmh_regs;
u64 *apx_regs;
+ u64 *opmask_regs;
};
extern unsigned long perf_arch_instruction_pointer(struct pt_regs *regs);
diff --git a/arch/x86/include/uapi/asm/perf_regs.h b/arch/x86/include/uapi/asm/perf_regs.h
index e23fb112faac..b9ec58b98c5e 100644
--- a/arch/x86/include/uapi/asm/perf_regs.h
+++ b/arch/x86/include/uapi/asm/perf_regs.h
@@ -92,12 +92,23 @@ enum perf_event_x86_ext_regs {
PERF_REG_X86_R30,
PERF_REG_X86_R31,
- PERF_REG_X86_EXT_REGS_MAX = PERF_REG_X86_R31,
+ /* OPMASK Registers */
+ PERF_REG_X86_OPMASK0,
+ PERF_REG_X86_OPMASK1,
+ PERF_REG_X86_OPMASK2,
+ PERF_REG_X86_OPMASK3,
+ PERF_REG_X86_OPMASK4,
+ PERF_REG_X86_OPMASK5,
+ PERF_REG_X86_OPMASK6,
+ PERF_REG_X86_OPMASK7,
+
+ PERF_REG_X86_EXT_REGS_MAX = PERF_REG_X86_OPMASK7,
};
enum perf_event_x86_ext_reg_size {
PERF_X86_EXT_REG_YMMH_SIZE = 2,
PERF_X86_EXT_REG_APX_SIZE = 1,
+ PERF_X86_EXT_REG_OPMASK_SIZE = 1,
/* max of PERF_REG_X86_XXX_SIZE */
PERF_X86_EXT_REG_SIZE_MAX = PERF_X86_EXT_REG_YMMH_SIZE,
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
index 518497bafdf0..34b94b846f00 100644
--- a/arch/x86/kernel/perf_regs.c
+++ b/arch/x86/kernel/perf_regs.c
@@ -87,6 +87,11 @@ static u64 perf_ext_reg_value(struct pt_regs *regs, int idx,
idx - PERF_REG_X86_R16,
perf_regs->apx_regs,
PERF_X86_EXT_REG_APX_SIZE);
+ case PERF_REG_X86_OPMASK0 ... PERF_REG_X86_OPMASK7:
+ return __perf_ext_reg_value(ext, ext_size,
+ idx - PERF_REG_X86_OPMASK0,
+ perf_regs->opmask_regs,
+ PERF_X86_EXT_REG_OPMASK_SIZE);
default:
WARN_ON_ONCE(1);
*ext_size = 0;
--
2.38.1
^ permalink raw reply related [flat|nested] 60+ messages in thread
* [RFC PATCH 10/12] perf/x86: Add ZMM in extended regs
2025-06-13 13:49 [RFC PATCH 00/12] Support vector and more extended registers in perf kan.liang
` (8 preceding siblings ...)
2025-06-13 13:49 ` [RFC PATCH 09/12] perf/x86: Add OPMASK " kan.liang
@ 2025-06-13 13:49 ` kan.liang
2025-06-13 13:49 ` [RFC PATCH 11/12] perf/x86: Add SSP " kan.liang
` (3 subsequent siblings)
13 siblings, 0 replies; 60+ messages in thread
From: kan.liang @ 2025-06-13 13:49 UTC (permalink / raw)
To: peterz, mingo, acme, namhyung, tglx, dave.hansen, irogers,
adrian.hunter, jolsa, alexander.shishkin, linux-kernel
Cc: dapeng1.mi, ak, zide.chen, Kan Liang
From: Kan Liang <kan.liang@linux.intel.com>
Support ZMM as extended registers. They can be configured in
sample_ext_regs_intr/user.
Only a PMU with PERF_PMU_CAP_EXTENDED_REGS2 supports the feature.
The values can be retrieved via XSAVES.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
arch/x86/events/core.c | 14 +++++++++
arch/x86/events/perf_event.h | 11 ++++++-
arch/x86/include/asm/perf_event.h | 2 ++
arch/x86/include/uapi/asm/perf_regs.h | 43 +++++++++++++++++++++++++--
arch/x86/kernel/perf_regs.c | 10 +++++++
5 files changed, 77 insertions(+), 3 deletions(-)
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 741e6dfd50a5..9bcef9a32dd2 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -437,6 +437,10 @@ static void x86_pmu_get_ext_regs(struct x86_perf_regs *perf_regs, u64 mask)
__x86_pmu_get_regs(XFEATURE_MASK_YMM, perf_regs->ymmh_regs, XSAVE_YMM_SIZE);
__x86_pmu_get_regs(XFEATURE_MASK_APX, perf_regs->apx_regs, sizeof(struct apx_state));
__x86_pmu_get_regs(XFEATURE_MASK_OPMASK, perf_regs->opmask_regs, sizeof(struct avx_512_opmask_state));
+ __x86_pmu_get_regs(XFEATURE_MASK_ZMM_Hi256, perf_regs->zmmh_regs,
+ sizeof(struct avx_512_zmm_uppers_state));
+ __x86_pmu_get_regs(XFEATURE_MASK_Hi16_ZMM, perf_regs->h16zmm_regs,
+ sizeof(struct avx_512_hi16_state));
}
static void release_ext_regs_buffers(void)
@@ -468,6 +472,10 @@ static void reserve_ext_regs_buffers(void)
size += sizeof(struct apx_state);
if (x86_pmu.ext_regs_mask & BIT_ULL(X86_EXT_REGS_OPMASK))
size += sizeof(struct avx_512_opmask_state);
+ if (x86_pmu.ext_regs_mask & BIT_ULL(X86_EXT_REGS_ZMMH))
+ size += sizeof(struct avx_512_zmm_uppers_state);
+ if (x86_pmu.ext_regs_mask & BIT_ULL(X86_EXT_REGS_H16ZMM))
+ size += sizeof(struct avx_512_hi16_state);
/* XSAVE feature requires 64-byte alignment. */
size += 64;
@@ -747,6 +755,8 @@ int x86_pmu_hw_config(struct perf_event *event)
check_ext_regs(X86_EXT_REGS_YMM);
check_ext_regs(X86_EXT_REGS_APX);
check_ext_regs(X86_EXT_REGS_OPMASK);
+ check_ext_regs(X86_EXT_REGS_ZMMH);
+ check_ext_regs(X86_EXT_REGS_H16ZMM);
}
}
return x86_setup_perfctr(event);
@@ -1852,6 +1862,10 @@ void x86_pmu_setup_regs_data(struct perf_event *event,
XFEATURE_MASK_APX, PERF_X86_EXT_REG_APX_SIZE);
init_ext_regs_data(X86_EXT_REGS_OPMASK, perf_regs->opmask_regs,
XFEATURE_MASK_OPMASK, PERF_X86_EXT_REG_OPMASK_SIZE);
+ init_ext_regs_data(X86_EXT_REGS_ZMMH, perf_regs->zmmh_regs,
+ XFEATURE_MASK_ZMM_Hi256, PERF_X86_EXT_REG_ZMMH_SIZE);
+ init_ext_regs_data(X86_EXT_REGS_H16ZMM, perf_regs->h16zmm_regs,
+ XFEATURE_MASK_Hi16_ZMM, PERF_X86_EXT_REG_H16ZMM_SIZE);
mask &= ~ignore_mask;
if (mask)
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index c2626dcea1a0..93a65c529afe 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -692,6 +692,8 @@ enum {
X86_EXT_REGS_YMM,
X86_EXT_REGS_APX,
X86_EXT_REGS_OPMASK,
+ X86_EXT_REGS_ZMMH,
+ X86_EXT_REGS_H16ZMM,
};
#define PERF_PEBS_DATA_SOURCE_MAX 0x100
@@ -1324,7 +1326,7 @@ static inline u64 x86_pmu_get_event_config(struct perf_event *event)
static inline int get_num_ext_regs(u64 *ext_regs, unsigned int type)
{
- u64 mask;
+ u64 mask, mask2;
switch (type) {
case X86_EXT_REGS_YMM:
@@ -1336,6 +1338,13 @@ static inline int get_num_ext_regs(u64 *ext_regs, unsigned int type)
case X86_EXT_REGS_OPMASK:
mask = GENMASK_ULL(PERF_REG_X86_OPMASK7, PERF_REG_X86_OPMASK0);
return hweight64(ext_regs[0] & mask);
+ case X86_EXT_REGS_ZMMH:
+ mask = GENMASK_ULL(PERF_REG_X86_ZMMH15, PERF_REG_X86_ZMMH0);
+ return hweight64(ext_regs[0] & mask);
+ case X86_EXT_REGS_H16ZMM:
+ mask = GENMASK_ULL(PERF_REG_X86_EXT_REGS_64, PERF_REG_X86_ZMM16);
+ mask2 = GENMASK_ULL(PERF_REG_X86_ZMM31 - 64, 0);
+ return hweight64(ext_regs[0] & mask) + hweight64(ext_regs[1] & mask2);
default:
return 0;
}
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 4e971f38ff94..eb35ba9afbb4 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -596,6 +596,8 @@ struct x86_perf_regs {
u64 *ymmh_regs;
u64 *apx_regs;
u64 *opmask_regs;
+ u64 *zmmh_regs;
+ u64 *h16zmm_regs;
};
extern unsigned long perf_arch_instruction_pointer(struct pt_regs *regs);
diff --git a/arch/x86/include/uapi/asm/perf_regs.h b/arch/x86/include/uapi/asm/perf_regs.h
index b9ec58b98c5e..c43a025b0c01 100644
--- a/arch/x86/include/uapi/asm/perf_regs.h
+++ b/arch/x86/include/uapi/asm/perf_regs.h
@@ -102,15 +102,54 @@ enum perf_event_x86_ext_regs {
PERF_REG_X86_OPMASK6,
PERF_REG_X86_OPMASK7,
- PERF_REG_X86_EXT_REGS_MAX = PERF_REG_X86_OPMASK7,
+ /* ZMMH 0-15 Registers */
+ PERF_REG_X86_ZMMH0,
+ PERF_REG_X86_ZMMH1,
+ PERF_REG_X86_ZMMH2,
+ PERF_REG_X86_ZMMH3,
+ PERF_REG_X86_ZMMH4,
+ PERF_REG_X86_ZMMH5,
+ PERF_REG_X86_ZMMH6,
+ PERF_REG_X86_ZMMH7,
+ PERF_REG_X86_ZMMH8,
+ PERF_REG_X86_ZMMH9,
+ PERF_REG_X86_ZMMH10,
+ PERF_REG_X86_ZMMH11,
+ PERF_REG_X86_ZMMH12,
+ PERF_REG_X86_ZMMH13,
+ PERF_REG_X86_ZMMH14,
+ PERF_REG_X86_ZMMH15,
+
+ /* H16ZMM 16-31 Registers */
+ PERF_REG_X86_ZMM16,
+ PERF_REG_X86_ZMM17,
+ PERF_REG_X86_ZMM18,
+ PERF_REG_X86_ZMM19,
+ PERF_REG_X86_ZMM20,
+ PERF_REG_X86_ZMM21,
+ PERF_REG_X86_ZMM22,
+ PERF_REG_X86_ZMM23,
+ PERF_REG_X86_ZMM24,
+ PERF_REG_X86_ZMM25,
+ PERF_REG_X86_ZMM26,
+ PERF_REG_X86_ZMM27,
+ PERF_REG_X86_ZMM28,
+ PERF_REG_X86_ZMM29,
+ PERF_REG_X86_ZMM30,
+ PERF_REG_X86_ZMM31,
+
+ PERF_REG_X86_EXT_REGS_64 = PERF_REG_X86_ZMM23,
+ PERF_REG_X86_EXT_REGS_MAX = PERF_REG_X86_ZMM31,
};
enum perf_event_x86_ext_reg_size {
PERF_X86_EXT_REG_YMMH_SIZE = 2,
PERF_X86_EXT_REG_APX_SIZE = 1,
PERF_X86_EXT_REG_OPMASK_SIZE = 1,
+ PERF_X86_EXT_REG_ZMMH_SIZE = 4,
+ PERF_X86_EXT_REG_H16ZMM_SIZE = 8,
/* max of PERF_REG_X86_XXX_SIZE */
- PERF_X86_EXT_REG_SIZE_MAX = PERF_X86_EXT_REG_YMMH_SIZE,
+ PERF_X86_EXT_REG_SIZE_MAX = PERF_X86_EXT_REG_H16ZMM_SIZE,
};
#endif /* _ASM_X86_PERF_REGS_H */
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
index 34b94b846f00..d5721ea85c5d 100644
--- a/arch/x86/kernel/perf_regs.c
+++ b/arch/x86/kernel/perf_regs.c
@@ -92,6 +92,16 @@ static u64 perf_ext_reg_value(struct pt_regs *regs, int idx,
idx - PERF_REG_X86_OPMASK0,
perf_regs->opmask_regs,
PERF_X86_EXT_REG_OPMASK_SIZE);
+ case PERF_REG_X86_ZMMH0 ... PERF_REG_X86_ZMMH15:
+ return __perf_ext_reg_value(ext, ext_size,
+ idx - PERF_REG_X86_ZMMH0,
+ perf_regs->zmmh_regs,
+ PERF_X86_EXT_REG_ZMMH_SIZE);
+ case PERF_REG_X86_ZMM16 ... PERF_REG_X86_ZMM31:
+ return __perf_ext_reg_value(ext, ext_size,
+ idx - PERF_REG_X86_ZMM16,
+ perf_regs->h16zmm_regs,
+ PERF_X86_EXT_REG_H16ZMM_SIZE);
default:
WARN_ON_ONCE(1);
*ext_size = 0;
--
2.38.1
^ permalink raw reply related [flat|nested] 60+ messages in thread
* [RFC PATCH 11/12] perf/x86: Add SSP in extended regs
2025-06-13 13:49 [RFC PATCH 00/12] Support vector and more extended registers in perf kan.liang
` (9 preceding siblings ...)
2025-06-13 13:49 ` [RFC PATCH 10/12] perf/x86: Add ZMM " kan.liang
@ 2025-06-13 13:49 ` kan.liang
2025-06-13 13:49 ` [RFC PATCH 12/12] perf/x86/intel: Support extended registers kan.liang
` (2 subsequent siblings)
13 siblings, 0 replies; 60+ messages in thread
From: kan.liang @ 2025-06-13 13:49 UTC (permalink / raw)
To: peterz, mingo, acme, namhyung, tglx, dave.hansen, irogers,
adrian.hunter, jolsa, alexander.shishkin, linux-kernel
Cc: dapeng1.mi, ak, zide.chen, Kan Liang
From: Kan Liang <kan.liang@linux.intel.com>
Support SSP as an extended register. It can be configured in
sample_ext_regs_intr/user.
Only a PMU with PERF_PMU_CAP_EXTENDED_REGS2 supports the feature.
The value can be retrieved via XSAVES.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
arch/x86/events/core.c | 7 +++++++
arch/x86/events/perf_event.h | 5 +++++
arch/x86/include/asm/perf_event.h | 1 +
arch/x86/include/uapi/asm/perf_regs.h | 6 +++++-
arch/x86/kernel/perf_regs.c | 5 +++++
5 files changed, 23 insertions(+), 1 deletion(-)
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 9bcef9a32dd2..65e4460fdc28 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -441,6 +441,8 @@ static void x86_pmu_get_ext_regs(struct x86_perf_regs *perf_regs, u64 mask)
sizeof(struct avx_512_zmm_uppers_state));
__x86_pmu_get_regs(XFEATURE_MASK_Hi16_ZMM, perf_regs->h16zmm_regs,
sizeof(struct avx_512_hi16_state));
+ __x86_pmu_get_regs(XFEATURE_MASK_CET_USER, perf_regs->cet_regs,
+ sizeof(struct cet_user_state));
}
static void release_ext_regs_buffers(void)
@@ -476,6 +478,8 @@ static void reserve_ext_regs_buffers(void)
size += sizeof(struct avx_512_zmm_uppers_state);
if (x86_pmu.ext_regs_mask & BIT_ULL(X86_EXT_REGS_H16ZMM))
size += sizeof(struct avx_512_hi16_state);
+ if (x86_pmu.ext_regs_mask & BIT_ULL(X86_EXT_REGS_CET))
+ size += sizeof(struct cet_user_state);
/* XSAVE feature requires 64-byte alignment. */
size += 64;
@@ -757,6 +761,7 @@ int x86_pmu_hw_config(struct perf_event *event)
check_ext_regs(X86_EXT_REGS_OPMASK);
check_ext_regs(X86_EXT_REGS_ZMMH);
check_ext_regs(X86_EXT_REGS_H16ZMM);
+ check_ext_regs(X86_EXT_REGS_CET);
}
}
return x86_setup_perfctr(event);
@@ -1866,6 +1871,8 @@ void x86_pmu_setup_regs_data(struct perf_event *event,
XFEATURE_MASK_ZMM_Hi256, PERF_X86_EXT_REG_ZMMH_SIZE);
init_ext_regs_data(X86_EXT_REGS_H16ZMM, perf_regs->h16zmm_regs,
XFEATURE_MASK_Hi16_ZMM, PERF_X86_EXT_REG_H16ZMM_SIZE);
+ init_ext_regs_data(X86_EXT_REGS_CET, perf_regs->cet_regs,
+ XFEATURE_MASK_CET_USER, PERF_X86_EXT_REG_SSP_SIZE);
mask &= ~ignore_mask;
if (mask)
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 93a65c529afe..e4906c0b33da 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -694,6 +694,7 @@ enum {
X86_EXT_REGS_OPMASK,
X86_EXT_REGS_ZMMH,
X86_EXT_REGS_H16ZMM,
+ X86_EXT_REGS_CET,
};
#define PERF_PEBS_DATA_SOURCE_MAX 0x100
@@ -1345,6 +1346,10 @@ static inline int get_num_ext_regs(u64 *ext_regs, unsigned int type)
mask = GENMASK_ULL(PERF_REG_X86_EXT_REGS_64, PERF_REG_X86_ZMM16);
mask2 = GENMASK_ULL(PERF_REG_X86_ZMM31 - 64, 0);
return hweight64(ext_regs[0] & mask) + hweight64(ext_regs[1] & mask2);
+ case X86_EXT_REGS_CET:
+ if (ext_regs[1] & BIT_ULL(PERF_REG_X86_SSP - 64))
+ return 1;
+ return 0;
default:
return 0;
}
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index eb35ba9afbb4..e49a26886e64 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -598,6 +598,7 @@ struct x86_perf_regs {
u64 *opmask_regs;
u64 *zmmh_regs;
u64 *h16zmm_regs;
+ u64 *cet_regs;
};
extern unsigned long perf_arch_instruction_pointer(struct pt_regs *regs);
diff --git a/arch/x86/include/uapi/asm/perf_regs.h b/arch/x86/include/uapi/asm/perf_regs.h
index c43a025b0c01..82df5c65d701 100644
--- a/arch/x86/include/uapi/asm/perf_regs.h
+++ b/arch/x86/include/uapi/asm/perf_regs.h
@@ -138,8 +138,11 @@ enum perf_event_x86_ext_regs {
PERF_REG_X86_ZMM30,
PERF_REG_X86_ZMM31,
+ /* shadow stack pointer (SSP) */
+ PERF_REG_X86_SSP,
+
PERF_REG_X86_EXT_REGS_64 = PERF_REG_X86_ZMM23,
- PERF_REG_X86_EXT_REGS_MAX = PERF_REG_X86_ZMM31,
+ PERF_REG_X86_EXT_REGS_MAX = PERF_REG_X86_SSP,
};
enum perf_event_x86_ext_reg_size {
@@ -148,6 +151,7 @@ enum perf_event_x86_ext_reg_size {
PERF_X86_EXT_REG_OPMASK_SIZE = 1,
PERF_X86_EXT_REG_ZMMH_SIZE = 4,
PERF_X86_EXT_REG_H16ZMM_SIZE = 8,
+ PERF_X86_EXT_REG_SSP_SIZE = 1,
/* max of PERF_REG_X86_XXX_SIZE */
PERF_X86_EXT_REG_SIZE_MAX = PERF_X86_EXT_REG_H16ZMM_SIZE,
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
index d5721ea85c5d..6a5936ed7143 100644
--- a/arch/x86/kernel/perf_regs.c
+++ b/arch/x86/kernel/perf_regs.c
@@ -102,6 +102,11 @@ static u64 perf_ext_reg_value(struct pt_regs *regs, int idx,
idx - PERF_REG_X86_ZMM16,
perf_regs->h16zmm_regs,
PERF_X86_EXT_REG_H16ZMM_SIZE);
+ case PERF_REG_X86_SSP:
+ return __perf_ext_reg_value(ext, ext_size,
+ idx - PERF_REG_X86_SSP,
+ &perf_regs->cet_regs[1],
+ PERF_X86_EXT_REG_SSP_SIZE);
default:
WARN_ON_ONCE(1);
*ext_size = 0;
--
2.38.1
^ permalink raw reply related [flat|nested] 60+ messages in thread
* [RFC PATCH 12/12] perf/x86/intel: Support extended registers
2025-06-13 13:49 [RFC PATCH 00/12] Support vector and more extended registers in perf kan.liang
` (10 preceding siblings ...)
2025-06-13 13:49 ` [RFC PATCH 11/12] perf/x86: Add SSP " kan.liang
@ 2025-06-13 13:49 ` kan.liang
2025-06-17 7:50 ` [RFC PATCH 00/12] Support vector and more extended registers in perf Mi, Dapeng
2025-06-17 8:24 ` Peter Zijlstra
13 siblings, 0 replies; 60+ messages in thread
From: kan.liang @ 2025-06-13 13:49 UTC (permalink / raw)
To: peterz, mingo, acme, namhyung, tglx, dave.hansen, irogers,
adrian.hunter, jolsa, alexander.shishkin, linux-kernel
Cc: dapeng1.mi, ak, zide.chen, Kan Liang
From: Kan Liang <kan.liang@linux.intel.com>
Support YMM, APX, OPMASK, ZMM, and SSP when XSAVES support is available.
Disable large PEBS when the extended regs are required.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
arch/x86/events/intel/core.c | 22 ++++++++++++++++++++++
1 file changed, 22 insertions(+)
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 5706ee562684..4218067b1843 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4035,6 +4035,8 @@ static unsigned long intel_pmu_large_pebs_flags(struct perf_event *event)
flags &= ~PERF_SAMPLE_REGS_USER;
if (event->attr.sample_regs_user & ~PEBS_GP_REGS)
flags &= ~(PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR);
+ if (event_has_extended_regs2(event))
+ flags &= ~(PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR);
return flags;
}
@@ -5295,6 +5297,26 @@ static void intel_extended_regs_init(struct pmu *pmu)
x86_pmu.ext_regs_mask |= BIT_ULL(X86_EXT_REGS_XMM);
x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_EXTENDED_REGS;
+
+ if (boot_cpu_has(X86_FEATURE_AVX) &&
+ cpu_has_xfeatures(XFEATURE_MASK_YMM, NULL))
+ x86_pmu.ext_regs_mask |= BIT_ULL(X86_EXT_REGS_YMM);
+ if (boot_cpu_has(X86_FEATURE_APX) &&
+ cpu_has_xfeatures(XFEATURE_MASK_APX, NULL))
+ x86_pmu.ext_regs_mask |= BIT_ULL(X86_EXT_REGS_APX);
+ if (boot_cpu_has(X86_FEATURE_AVX512F)) {
+ if (cpu_has_xfeatures(XFEATURE_MASK_OPMASK, NULL))
+ x86_pmu.ext_regs_mask |= BIT_ULL(X86_EXT_REGS_OPMASK);
+ if (cpu_has_xfeatures(XFEATURE_MASK_ZMM_Hi256, NULL))
+ x86_pmu.ext_regs_mask |= BIT_ULL(X86_EXT_REGS_ZMMH);
+ if (cpu_has_xfeatures(XFEATURE_MASK_Hi16_ZMM, NULL))
+ x86_pmu.ext_regs_mask |= BIT_ULL(X86_EXT_REGS_H16ZMM);
+ }
+ if (cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
+ x86_pmu.ext_regs_mask |= BIT_ULL(X86_EXT_REGS_CET);
+
+ if (x86_pmu.ext_regs_mask != BIT_ULL(X86_EXT_REGS_XMM))
+ x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_EXTENDED_REGS2;
}
static void update_pmu_cap(struct pmu *pmu)
--
2.38.1
^ permalink raw reply related [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 00/12] Support vector and more extended registers in perf
2025-06-13 13:49 [RFC PATCH 00/12] Support vector and more extended registers in perf kan.liang
` (11 preceding siblings ...)
2025-06-13 13:49 ` [RFC PATCH 12/12] perf/x86/intel: Support extended registers kan.liang
@ 2025-06-17 7:50 ` Mi, Dapeng
2025-06-17 8:24 ` Peter Zijlstra
13 siblings, 0 replies; 60+ messages in thread
From: Mi, Dapeng @ 2025-06-17 7:50 UTC (permalink / raw)
To: kan.liang, peterz, mingo, acme, namhyung, tglx, dave.hansen,
irogers, adrian.hunter, jolsa, alexander.shishkin, linux-kernel
Cc: ak, zide.chen
On 6/13/2025 9:49 PM, kan.liang@linux.intel.com wrote:
> From: Kan Liang <kan.liang@linux.intel.com>
>
> Starting from the Intel Ice Lake, the XMM registers can be collected in
> a PEBS record. More registers, e.g., YMM, ZMM, OPMASK, SSP and APX, will
> be added in the upcoming Architecture PEBS as well. But it requires the
> hardware support.
>
> The patch set provides a software solution to mitigate the hardware
> requirement. It utilizes the XSAVES command to retrieve the requested
> registers in the overflow handler. The feature isn't limited to the PEBS
> event or specific platforms anymore.
> The hardware solution (if available) is still preferred, since it has
> low overhead (especially with the large PEBS) and is more accurate.
>
> In theory, the solution should work for all X86 platforms. But I only
> have newer Intel platforms to test. The patch set only enables the
> feature for Intel Ice Lake and later platforms.
>
> Open:
> The new registers include YMM, ZMM, OPMASK, SSP, and APX.
> The sample_regs_user/intr has run out. A new field in the
> struct perf_event_attr is required for the registers.
> There could be several options as below for the new field.
>
> - Follow a similar format to XSAVES. Introduce the below fields to store
> the bitmap of the registers.
> struct perf_event_attr {
> ...
> __u64 sample_ext_regs_intr[2];
> __u64 sample_ext_regs_user[2];
> ...
> }
> Includes YMMH (16 bits), APX (16 bits), OPMASK (8 bits),
> ZMMH0-15 (16 bits), H16ZMM (16 bits), SSP
> For example, if a user wants YMM8, the perf tool needs to set the
> corresponding bits of XMM8 and YMMH8, and reconstruct the result.
> The method is similar to the existing method for
> sample_regs_user/intr, and match the XSAVES format.
> The kernel doesn't need to do extra configuration and reconstruction.
> It's implemented in the patch set.
>
> - Similar to the above method. But the fields are the bitmap of the
> complete registers, E.g., YMM (16 bits), APX (16 bits),
> OPMASK (8 bits), ZMM (32 bits), SSP.
> The kernel needs to do extra configuration and reconstruction,
> which may bring extra overhead.
>
> - Combine the XMM, YMM, and ZMM. So all the registers can be put into
> one u64 field.
> ...
> union {
> __u64 sample_ext_regs_intr; //sample_ext_regs_user is similar
> struct {
> __u32 vector_bitmap;
> __u32 vector_type : 3, //0b001 XMM 0b010 YMM 0b100 ZMM
> apx_bitmap : 16,
> opmask_bitmap : 8,
> ssp_bitmap : 1,
> reserved : 4,
>
> };
> ...
> For example, if the YMM8-15 is required,
> vector_bitmap: 0x0000ff00
> vector_type: 0x2
> This method can save two __u64 in the struct perf_event_attr.
> But it's not straightforward since it mixes the type and bitmap.
> The kernel also needs to do extra configuration and reconstruction.
>
> Please let me know if there are more ideas.
+1 for method 1 or 2, and method 2 is preferred.
Method 1 doesn't need to reconstruct YMM/ZMM regs in kernel space, but it
offloads the reconstruction onto user space; all user-space perf-related
tools have to reconstruct them by themselves. Not 100% sure, but I suppose
this needs a big change in perf tools to reconstruct and show the YMM/ZMM
regs.
The con of method 2 is that it could need extra memory space and memory
copies if users intend to sample overlapping regs simultaneously, like
XMM0/YMM0/ZMM0, but I suppose we can add an extra check in perf tools to
tell users that these regs overlap and just force sampling the reg with
the largest bit-width.
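[Editor's sketch of the reconstruction burden mentioned above: with
method 1, a full YMM value is just the sampled XMM low half concatenated
with the YMMH high half, so the tool-side rebuild is mechanical. The
2-u64-words-per-half layout follows PERF_X86_EXT_REG_YMMH_SIZE in this
series; the helper name is illustrative.]

#include <stdint.h>

/* Rebuild one 256-bit YMM value from separately sampled halves. */
static void rebuild_ymm(const uint64_t xmm[2], const uint64_t ymmh[2],
			uint64_t ymm[4])
{
	ymm[0] = xmm[0];	/* bits   0..63  */
	ymm[1] = xmm[1];	/* bits  64..127 */
	ymm[2] = ymmh[0];	/* bits 128..191 */
	ymm[3] = ymmh[1];	/* bits 192..255 */
}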
>
> Thanks,
> Kan
>
>
>
> Kan Liang (12):
> perf/x86: Use x86_perf_regs in the x86 nmi handler
> perf/x86: Setup the regs data
> x86/fpu/xstate: Add xsaves_nmi
> perf: Move has_extended_regs() to header file
> perf/x86: Support XMM register for non-PEBS and REGS_USER
> perf: Support extension of sample_regs
> perf/x86: Add YMMH in extended regs
> perf/x86: Add APX in extended regs
> perf/x86: Add OPMASK in extended regs
> perf/x86: Add ZMM in extended regs
> perf/x86: Add SSP in extended regs
> perf/x86/intel: Support extended registers
>
> arch/arm/kernel/perf_regs.c | 9 +-
> arch/arm64/kernel/perf_regs.c | 9 +-
> arch/csky/kernel/perf_regs.c | 9 +-
> arch/loongarch/kernel/perf_regs.c | 8 +-
> arch/mips/kernel/perf_regs.c | 9 +-
> arch/powerpc/perf/perf_regs.c | 9 +-
> arch/riscv/kernel/perf_regs.c | 8 +-
> arch/s390/kernel/perf_regs.c | 9 +-
> arch/x86/events/core.c | 226 ++++++++++++++++++++++++--
> arch/x86/events/intel/core.c | 49 ++++++
> arch/x86/events/intel/ds.c | 12 +-
> arch/x86/events/perf_event.h | 58 +++++++
> arch/x86/include/asm/fpu/xstate.h | 1 +
> arch/x86/include/asm/perf_event.h | 6 +
> arch/x86/include/uapi/asm/perf_regs.h | 101 ++++++++++++
> arch/x86/kernel/fpu/xstate.c | 22 +++
> arch/x86/kernel/perf_regs.c | 85 +++++++++-
> include/linux/perf_event.h | 23 +++
> include/linux/perf_regs.h | 29 +++-
> include/uapi/linux/perf_event.h | 8 +
> kernel/events/core.c | 63 +++++--
> 21 files changed, 699 insertions(+), 54 deletions(-)
>
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 00/12] Support vector and more extended registers in perf
2025-06-13 13:49 [RFC PATCH 00/12] Support vector and more extended registers in perf kan.liang
` (12 preceding siblings ...)
2025-06-17 7:50 ` [RFC PATCH 00/12] Support vector and more extended registers in perf Mi, Dapeng
@ 2025-06-17 8:24 ` Peter Zijlstra
2025-06-17 13:52 ` Liang, Kan
13 siblings, 1 reply; 60+ messages in thread
From: Peter Zijlstra @ 2025-06-17 8:24 UTC (permalink / raw)
To: kan.liang
Cc: mingo, acme, namhyung, tglx, dave.hansen, irogers, adrian.hunter,
jolsa, alexander.shishkin, linux-kernel, dapeng1.mi, ak,
zide.chen
On Fri, Jun 13, 2025 at 06:49:31AM -0700, kan.liang@linux.intel.com wrote:
> From: Kan Liang <kan.liang@linux.intel.com>
>
> Starting from the Intel Ice Lake, the XMM registers can be collected in
> a PEBS record. More registers, e.g., YMM, ZMM, OPMASK, SSP and APX, will
> be added in the upcoming Architecture PEBS as well. But it requires
> hardware support.
>
> The patch set provides a software solution to mitigate the hardware
> requirement. It utilizes the XSAVES instruction to retrieve the requested
> registers in the overflow handler. The feature isn't limited to the PEBS
> event or specific platforms anymore.
> The hardware solution (if available) is still preferred, since it has
> low overhead (especially with the large PEBS) and is more accurate.
>
> In theory, the solution should work for all X86 platforms. But I only
> have newer Intel platforms to test. The patch set only enables the
> feature for Intel Ice Lake and later platforms.
>
> Open:
> The new registers include YMM, ZMM, OPMASK, SSP, and APX.
> The sample_regs_user/intr has run out. A new field in the
> struct perf_event_attr is required for the registers.
> There could be several options as below for the new field.
>
> - Follow a similar format to XSAVES. Introduce the below fields to store
> the bitmap of the registers.
> struct perf_event_attr {
> ...
> __u64 sample_ext_regs_intr[2];
> __u64 sample_ext_regs_user[2];
> ...
> }
> Includes YMMH (16 bits), APX (16 bits), OPMASK (8 bits),
> ZMMH0-15 (16 bits), H16ZMM (16 bits), SSP
> For example, if a user wants YMM8, the perf tool needs to set the
> corresponding bits of XMM8 and YMMH8, and reconstruct the result.
> The method is similar to the existing method for
> sample_regs_user/intr, and matches the XSAVES format.
> The kernel doesn't need to do extra configuration and reconstruction.
> It's implemented in the patch set.
>
> - Similar to the above method. But the fields are the bitmap of the
> complete registers, e.g., YMM (16 bits), APX (16 bits),
> OPMASK (8 bits), ZMM (32 bits), SSP.
> The kernel needs to do extra configuration and reconstruction,
> which may bring extra overhead.
>
> - Combine the XMM, YMM, and ZMM. So all the registers can be put into
> one u64 field.
> ...
> union {
> __u64 sample_ext_regs_intr; //sample_ext_regs_user is similar
> struct {
> __u32 vector_bitmap;
> __u32 vector_type : 3, //0b001 XMM 0b010 YMM 0b100 ZMM
> apx_bitmap : 16,
> opmask_bitmap : 8,
> ssp_bitmap : 1,
> reserved : 4,
>
> };
> ...
> For example, if YMM8-15 are required,
> vector_bitmap: 0x0000ff00
> vector_type: 0x2
> This method can save two __u64 in the struct perf_event_attr.
> But it's not straightforward since it mixes the type and bitmap.
> The kernel also needs to do extra configuration and reconstruction.
>
> Please let me know if there are more ideas.
https://lkml.kernel.org/r/20250416155327.GD17910@noisy.programming.kicks-ass.net
comes to mind. Combine that with a rule that reclaims the XMM register
space from perf_event_x86_regs when sample_simd_reg_words != 0, and then
we can put APX and SSP there.
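To spell that rule out (a sketch only; the field name follows the linked
proposal and the helper is hypothetical):

        /*
         * When userspace sets the SIMD words field, the bits that used
         * to select XMM halves in sample_regs_intr/user are reclaimed
         * and can select APX eGPRs and SSP instead.
         */
        static const char *xmm_bits_meaning(unsigned short simd_reg_words)
        {
                /* non-zero means userspace knows about the new format */
                return simd_reg_words ? "APX eGPRs + SSP" : "XMM halves";
        }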
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 00/12] Support vector and more extended registers in perf
2025-06-17 8:24 ` Peter Zijlstra
@ 2025-06-17 13:52 ` Liang, Kan
2025-06-17 14:29 ` Peter Zijlstra
0 siblings, 1 reply; 60+ messages in thread
From: Liang, Kan @ 2025-06-17 13:52 UTC (permalink / raw)
To: Peter Zijlstra
Cc: mingo, acme, namhyung, tglx, dave.hansen, irogers, adrian.hunter,
jolsa, alexander.shishkin, linux-kernel, dapeng1.mi, ak,
zide.chen
On 2025-06-17 4:24 a.m., Peter Zijlstra wrote:
> On Fri, Jun 13, 2025 at 06:49:31AM -0700, kan.liang@linux.intel.com wrote:
>> From: Kan Liang <kan.liang@linux.intel.com>
>>
>> Starting from the Intel Ice Lake, the XMM registers can be collected in
>> a PEBS record. More registers, e.g., YMM, ZMM, OPMASK, SSP and APX, will
>> be added in the upcoming Architecture PEBS as well. But it requires
>> hardware support.
>>
>> The patch set provides a software solution to mitigate the hardware
>> requirement. It utilizes the XSAVES instruction to retrieve the requested
>> registers in the overflow handler. The feature isn't limited to the PEBS
>> event or specific platforms anymore.
>> The hardware solution (if available) is still preferred, since it has
>> low overhead (especially with the large PEBS) and is more accurate.
>>
>> In theory, the solution should work for all X86 platforms. But I only
>> have newer Intel platforms to test. The patch set only enables the
>> feature for Intel Ice Lake and later platforms.
>>
>> Open:
>> The new registers include YMM, ZMM, OPMASK, SSP, and APX.
>> The sample_regs_user/intr has run out. A new field in the
>> struct perf_event_attr is required for the registers.
>> There could be several options as below for the new field.
>>
>> - Follow a similar format to XSAVES. Introduce the below fields to store
>> the bitmap of the registers.
>> struct perf_event_attr {
>> ...
>> __u64 sample_ext_regs_intr[2];
>> __u64 sample_ext_regs_user[2];
>> ...
>> }
>> Includes YMMH (16 bits), APX (16 bits), OPMASK (8 bits),
>> ZMMH0-15 (16 bits), H16ZMM (16 bits), SSP
>> For example, if a user wants YMM8, the perf tool needs to set the
>> corresponding bits of XMM8 and YMMH8, and reconstruct the result.
>> The method is similar to the existing method for
>> sample_regs_user/intr, and matches the XSAVES format.
>> The kernel doesn't need to do extra configuration and reconstruction.
>> It's implemented in the patch set.
>>
>> - Similar to the above method. But the fields are the bitmap of the
>> complete registers, e.g., YMM (16 bits), APX (16 bits),
>> OPMASK (8 bits), ZMM (32 bits), SSP.
>> The kernel needs to do extra configuration and reconstruction,
>> which may bring extra overhead.
>>
>> - Combine the XMM, YMM, and ZMM. So all the registers can be put into
>> one u64 field.
>> ...
>> union {
>> __u64 sample_ext_regs_intr; //sample_ext_regs_user is similar
>> struct {
>> __u32 vector_bitmap;
>> __u32 vector_type : 3, //0b001 XMM 0b010 YMM 0b100 ZMM
>> apx_bitmap : 16,
>> opmask_bitmap : 8,
>> ssp_bitmap : 1,
>> reserved : 4,
>>
>> };
>> ...
>> For example, if YMM8-15 are required,
>> vector_bitmap: 0x0000ff00
>> vector_type: 0x2
>> This method can save two __u64 in the struct perf_event_attr.
>> But it's not straightforward since it mixes the type and bitmap.
>> The kernel also needs to do extra configuration and reconstruction.
>>
>> Please let me know if there are more ideas.
>
> https://lkml.kernel.org/r/20250416155327.GD17910@noisy.programming.kicks-ass.net
>
It's similar to the third method, but it uses the word count to replace
the type. Also, there is more space left in case we add more SIMD
extensions in the future. I will implement it in V2.
> comes to mind. Combine that with a rule that reclaims the XMM register
> space from perf_event_x86_regs when sample_simd_reg_words != 0, and then
> we can put APX and SPP there.
OK. So the sample_simd_reg_words actually has another meaning now. It's
used as a flag to tell whether the old format is being utilized.
If so, I think it may be better to have a dedicated sample_simd_reg_flags
field.
For example,
#define SAMPLE_SIMD_FLAGS_FORMAT_LEGACY 0x0
#define SAMPLE_SIMD_FLAGS_FORMAT_WORDS 0x1
__u8 sample_simd_reg_flags;
__u8 sample_simd_reg_words;
__u64 sample_simd_reg_intr;
__u64 sample_simd_reg_user;
If (sample_simd_reg_flags != 0), reclaim the XMM space for APX and SSP.
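For illustration, with the hypothetical fields above (and assuming one
"word" is 64 bits), a tool requesting YMM0 at interrupt time would do:

        attr.sample_simd_reg_flags = SAMPLE_SIMD_FLAGS_FORMAT_WORDS;
        attr.sample_simd_reg_words = 4;   /* 4 * 64 bits = YMM width */
        attr.sample_simd_reg_intr  = 0x1; /* bit 0 selects vector reg 0 */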
Does it make sense?
Thanks,
Kan
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 00/12] Support vector and more extended registers in perf
2025-06-17 13:52 ` Liang, Kan
@ 2025-06-17 14:29 ` Peter Zijlstra
2025-06-17 15:23 ` Liang, Kan
0 siblings, 1 reply; 60+ messages in thread
From: Peter Zijlstra @ 2025-06-17 14:29 UTC (permalink / raw)
To: Liang, Kan
Cc: mingo, acme, namhyung, tglx, dave.hansen, irogers, adrian.hunter,
jolsa, alexander.shishkin, linux-kernel, dapeng1.mi, ak,
zide.chen
On Tue, Jun 17, 2025 at 09:52:12AM -0400, Liang, Kan wrote:
> OK. So the sample_simd_reg_words actually has another meaning now.
Well, any simd field being non-zero means userspace knows about it. Sort
of an implicit flag.
> It's used as a flag to tell whether the old format is being utilized.
>
> If so, I think it may be better to have a dedicated sample_simd_reg_flags
> field.
>
> For example,
>
> #define SAMPLE_SIMD_FLAGS_FORMAT_LEGACY 0x0
> #define SAMPLE_SIMD_FLAGS_FORMAT_WORDS 0x1
>
> __u8 sample_simd_reg_flags;
> __u8 sample_simd_reg_words;
> __u64 sample_simd_reg_intr;
> __u64 sample_simd_reg_user;
>
> If (sample_simd_reg_flags != 0), reclaim the XMM space for APX and SSP.
>
> Does it make sense?
Not sure, it eats up a whole byte. Dapeng seemed to favour separate
intr/user vector width (although I'm not quite sure what the use would
be).
If you want an explicit bit, we might as well use one from __reserved_1,
we still have some left.
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 00/12] Support vector and more extended registers in perf
2025-06-17 14:29 ` Peter Zijlstra
@ 2025-06-17 15:23 ` Liang, Kan
2025-06-17 17:34 ` Peter Zijlstra
2025-06-18 0:57 ` Mi, Dapeng
0 siblings, 2 replies; 60+ messages in thread
From: Liang, Kan @ 2025-06-17 15:23 UTC (permalink / raw)
To: Peter Zijlstra
Cc: mingo, acme, namhyung, tglx, dave.hansen, irogers, adrian.hunter,
jolsa, alexander.shishkin, linux-kernel, dapeng1.mi, ak,
zide.chen
On 2025-06-17 10:29 a.m., Peter Zijlstra wrote:
> On Tue, Jun 17, 2025 at 09:52:12AM -0400, Liang, Kan wrote:
>
>> OK. So the sample_simd_reg_words actually has another meaning now.
>
> Well, any simd field being non-zero means userspace knows about it. Sort
> of an implicit flag.
Yes, but the tool probably wouldn't touch any simd fields if the user
doesn't ask for simd registers.
>
>> It's used as a flag to tell whether the old format is being utilized.
>>
>> If so, I think it may be better to have a dedicated sample_simd_reg_flags
>> field.
>>
>> For example,
>>
>> #define SAMPLE_SIMD_FLAGS_FORMAT_LEGACY 0x0
>> #define SAMPLE_SIMD_FLAGS_FORMAT_WORDS 0x1
>>
>> __u8 sample_simd_reg_flags;
>> __u8 sample_simd_reg_words;
>> __u64 sample_simd_reg_intr;
>> __u64 sample_simd_reg_user;
>>
>> If (sample_simd_reg_flags != 0), reclaim the XMM space for APX and SSP.
>>
>> Does it make sense?
>
> Not sure, it eats up a whole byte. Dapeng seemed to favour separate
> intr/user vector width (although I'm not quite sure what the use would
> be).
>
> If you want an explicit bit, we might as well use one from __reserved_1,
> we still have some left.
OK. I may add a sample_simd_reg : 1 to explicitly tell the kernel to
utilize the sample_simd_reg_XXX fields.
Thanks,
Kan
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 00/12] Support vector and more extended registers in perf
2025-06-17 15:23 ` Liang, Kan
@ 2025-06-17 17:34 ` Peter Zijlstra
2025-06-18 0:57 ` Mi, Dapeng
1 sibling, 0 replies; 60+ messages in thread
From: Peter Zijlstra @ 2025-06-17 17:34 UTC (permalink / raw)
To: Liang, Kan
Cc: mingo, acme, namhyung, tglx, dave.hansen, irogers, adrian.hunter,
jolsa, alexander.shishkin, linux-kernel, dapeng1.mi, ak,
zide.chen
On Tue, Jun 17, 2025 at 11:23:10AM -0400, Liang, Kan wrote:
>
>
> On 2025-06-17 10:29 a.m., Peter Zijlstra wrote:
> > On Tue, Jun 17, 2025 at 09:52:12AM -0400, Liang, Kan wrote:
> >
> >> OK. So the sample_simd_reg_words actually has another meaning now.
> >
> > Well, any simd field being non-zero means userspace knows about it. Sort
> > of an implicit flag.
>
> Yes, but the tool probably wouldn't touch any simd fields if the user
> doesn't ask for simd registers.
Trivial enough to have the tool unconditionally write a simd_words size
if the attr thing is big enough. But sure, whatever :-)
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 00/12] Support vector and more extended registers in perf
2025-06-17 15:23 ` Liang, Kan
2025-06-17 17:34 ` Peter Zijlstra
@ 2025-06-18 0:57 ` Mi, Dapeng
2025-06-18 10:47 ` Liang, Kan
1 sibling, 1 reply; 60+ messages in thread
From: Mi, Dapeng @ 2025-06-18 0:57 UTC (permalink / raw)
To: Liang, Kan, Peter Zijlstra
Cc: mingo, acme, namhyung, tglx, dave.hansen, irogers, adrian.hunter,
jolsa, alexander.shishkin, linux-kernel, ak, zide.chen
On 6/17/2025 11:23 PM, Liang, Kan wrote:
>
> On 2025-06-17 10:29 a.m., Peter Zijlstra wrote:
>> On Tue, Jun 17, 2025 at 09:52:12AM -0400, Liang, Kan wrote:
>>
>>> OK. So the sample_simd_reg_words actually has another meaning now.
>> Well, any simd field being non-zero means userspace knows about it. Sort
>> of an implicit flag.
>> Yes, but the tool probably wouldn't touch any simd fields if the user
>> doesn't ask for simd registers.
>
>>> It's used as a flag to tell whether the old format is being utilized.
>>>
>>> If so, I think it may be better to have a dedicated sample_simd_reg_flags
>>> field.
>>>
>>> For example,
>>>
>>> #define SAMPLE_SIMD_FLAGS_FORMAT_LEGACY 0x0
>>> #define SAMPLE_SIMD_FLAGS_FORMAT_WORDS 0x1
>>>
>>> __u8 sample_simd_reg_flags;
>>> __u8 sample_simd_reg_words;
>>> __u64 sample_simd_reg_intr;
>>> __u64 sample_simd_reg_user;
>>>
>>> If (sample_simd_reg_flags != 0), reclaim the XMM space for APX and SSP.
>>>
>>> Does it make sense?
Not sure if I missed some discussion, but are these fields only intended
for SIMD regs? What about the APX extended GPRs? Suppose APX eGPRs can
reuse the legacy XMM bitmaps in sample_regs_user/intr[47:32], but we need
an extra flag to distinguish whether they are XMM regs or APX eGPRs; maybe
add an extra bit sample_egpr_reg : 1 in sample_simd_reg_words, but then the
*simd* word in the name would become ambiguous.
>> Not sure, it eats up a whole byte. Dapeng seemed to favour separate
>> intr/user vector width (although I'm not quite sure what the use would
>> be).
The reason that I prefer to add 2 separate "words" items is that the user
could sample interrupt and user space SIMD regs (but with different
bit-widths) simultaneously in theory, like "--intr-regs=YMM0, --user-regs=XMM0".
>>
>> If you want an explicit bit, we might as well use one from __reserved_1,
>> we still have some left.
> OK. I may add a sample_simd_reg : 1 to explicitly tell the kernel to
> utilize the sample_simd_reg_XXX fields.
>
> Thanks,
> Kan
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 00/12] Support vector and more extended registers in perf
2025-06-18 0:57 ` Mi, Dapeng
@ 2025-06-18 10:47 ` Liang, Kan
2025-06-18 12:28 ` Mi, Dapeng
0 siblings, 1 reply; 60+ messages in thread
From: Liang, Kan @ 2025-06-18 10:47 UTC (permalink / raw)
To: Mi, Dapeng, Peter Zijlstra
Cc: mingo, acme, namhyung, tglx, dave.hansen, irogers, adrian.hunter,
jolsa, alexander.shishkin, linux-kernel, ak, zide.chen
On 2025-06-17 8:57 p.m., Mi, Dapeng wrote:
>
> On 6/17/2025 11:23 PM, Liang, Kan wrote:
>>
>> On 2025-06-17 10:29 a.m., Peter Zijlstra wrote:
>>> On Tue, Jun 17, 2025 at 09:52:12AM -0400, Liang, Kan wrote:
>>>
>>>> OK. So the sample_simd_reg_words actually has another meaning now.
>>> Well, any simd field being non-zero means userspace knows about it. Sort
>>> of an implicit flag.
>> Yes, but the tool probably wouldn't touch any simd fields if the user
>> doesn't ask for simd registers.
>>
>>>> It's used as a flag to tell whether the old format is being utilized.
>>>>
>>>> If so, I think it may be better to have a dedicated sample_simd_reg_flags
>>>> field.
>>>>
>>>> For example,
>>>>
>>>> #define SAMPLE_SIMD_FLAGS_FORMAT_LEGACY 0x0
>>>> #define SAMPLE_SIMD_FLAGS_FORMAT_WORDS 0x1
>>>>
>>>> __u8 sample_simd_reg_flags;
>>>> __u8 sample_simd_reg_words;
>>>> __u64 sample_simd_reg_intr;
>>>> __u64 sample_simd_reg_user;
>>>>
>>>> If (sample_simd_reg_flags != 0), reclaim the XMM space for APX and SSP.
>>>>
>>>> Does it make sense?
>
> Not sure if I missed some discussion, but are these fields only intended
> for SIMD regs? What about the APX extended GPRs? Suppose APX eGPRs can
> reuse the legacy XMM bitmaps in sample_regs_user/intr[47:32], but we need
> an extra flag to distinguish whether they are XMM regs or APX eGPRs; maybe
> add an extra bit sample_egpr_reg : 1 in sample_simd_reg_words, but then the
> *simd* word in the name would become ambiguous.
It can be used to explicitly tell the kernel that a new format is
expected. The new format means
- Put APX and SSP into sample_regs_user/intr[47:32]
- Use the sample_simd_reg_*
Alternatively, as Peter suggested, we can use the sample_simd_reg_words
to imply the new format.
If so, I will make it a union, for example:
union {
__u16 sample_reg_flags;
__u16 sample_simd_reg_words;
};
The first thing the tool does should be to set sample_reg_flags = 1,
regardless of whether simd is requested.
>
>
>>> Not sure, it eats up a whole byte. Dapeng seemed to favour separate
>>> intr/user vector width (although I'm not quite sure what the use would
>>> be).
>
> The reason that I prefer to add 2 separate "words" items is that the user
> could sample interrupt and user space SIMD regs (but with different
> bit-widths) simultaneously in theory, like "--intr-regs=YMM0, --user-regs=XMM0".
I'm not sure why the user wants a different bit-width. The
"--user-regs=XMM0" doesn't seem to provide more useful information.
Anyway, I believe the tool can handle this case. The tool can always ask
YMM0 for both --intr-regs and --user-regs, but only output the XMM0 for
--user-regs. The only drawback is that the kernel may dump extra
information for the --user-regs. I don't think it's a big problem.
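The tool-side trimming is also tiny. Assuming the vector registers are
dumped low-to-high as 64-bit words (an assumption here, not a documented
layout), showing XMM0 from a dumped YMM0 is just taking the low two words:

        static void ymm_low_to_xmm(const unsigned long long ymm[4],
                                   unsigned long long xmm[2])
        {
                xmm[0] = ymm[0];        /* XMM0 = YMM0 bits [127:0] */
                xmm[1] = ymm[1];
        }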
Thanks,
Kan
>
>
>>>
>>> If you want an explicit bit, we might as well use one from __reserved_1,
>>> we still have some left.
>> OK. I may add a sample_simd_reg : 1 to explicitly tell the kernel to
>> utilize the sample_simd_reg_XXX fields.
>>
>> Thanks,
>> Kan
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 00/12] Support vector and more extended registers in perf
2025-06-18 10:47 ` Liang, Kan
@ 2025-06-18 12:28 ` Mi, Dapeng
2025-06-18 13:15 ` Liang, Kan
0 siblings, 1 reply; 60+ messages in thread
From: Mi, Dapeng @ 2025-06-18 12:28 UTC (permalink / raw)
To: Liang, Kan, Peter Zijlstra
Cc: mingo, acme, namhyung, tglx, dave.hansen, irogers, adrian.hunter,
jolsa, alexander.shishkin, linux-kernel, ak, zide.chen
On 6/18/2025 6:47 PM, Liang, Kan wrote:
>
> On 2025-06-17 8:57 p.m., Mi, Dapeng wrote:
>> On 6/17/2025 11:23 PM, Liang, Kan wrote:
>>> On 2025-06-17 10:29 a.m., Peter Zijlstra wrote:
>>>> On Tue, Jun 17, 2025 at 09:52:12AM -0400, Liang, Kan wrote:
>>>>
>>>>> OK. So the sample_simd_reg_words actually has another meaning now.
>>>> Well, any simd field being non-zero means userspace knows about it. Sort
>>>> of an implicit flag.
>>> Yes, but the tool probably wouldn't touch any simd fields if the user
>>> doesn't ask for simd registers.
>>>
>>>>> It's used as a flag to tell whether the old format is being utilized.
>>>>>
>>>>> If so, I think it may be better to have a dedicated sample_simd_reg_flags
>>>>> field.
>>>>>
>>>>> For example,
>>>>>
>>>>> #define SAMPLE_SIMD_FLAGS_FORMAT_LEGACY 0x0
>>>>> #define SAMPLE_SIMD_FLAGS_FORMAT_WORDS 0x1
>>>>>
>>>>> __u8 sample_simd_reg_flags;
>>>>> __u8 sample_simd_reg_words;
>>>>> __u64 sample_simd_reg_intr;
>>>>> __u64 sample_simd_reg_user;
>>>>>
>>>>> If (sample_simd_reg_flags != 0), reclaim the XMM space for APX and SSP.
>>>>>
>>>>> Does it make sense?
>> Not sure if I missed some discussion, but are these fields only intended
>> for SIMD regs? What about the APX extended GPRs? Suppose APX eGPRs can
>> reuse the legacy XMM bitmaps in sample_regs_user/intr[47:32], but we need
>> an extra flag to distinguish whether they are XMM regs or APX eGPRs; maybe
>> add an extra bit sample_egpr_reg : 1 in sample_simd_reg_words, but then the
>> *simd* word in the name would become ambiguous.
> It can be used to explicitly tell the kernel that a new format is
> expected. The new format means
> - Put APX and SSP into sample_regs_user/intr[47:32]
> - Use the sample_simd_reg_*
>
> Alternatively, as Peter suggested, we can use the sample_simd_reg_words
> to imply the new format.
> If so, I will make it a union, for example:
> union {
> __u16 sample_reg_flags;
> __u16 sample_simd_reg_words;
> };
>
> The first thing the tool does should be to set sample_reg_flags = 1,
> regardless of whether simd is requested.
So just to double-check: as long as sample_reg_flags
(sample_simd_reg_words) > 0, the new format below would be used.
sample_regs_user/intr[31:0] bits unchanged, still representing the
original GPRs.
sample_regs_user/intr[47:32] bits represent APX eGPRs R16 - R31.
As for the SIMD regs, including XMM regs, they are represented by the
dedicated SIMD regs structure (or a regs bitmap and regs word length).
If sample_reg_flags (sample_simd_reg_words) == 0, then it falls back to
the current format.
sample_regs_user/intr[31:0] bits represent the original GPRs.
sample_regs_user/intr[63:32] bits represent XMM regs.
If so, I think it's fine. The new format looks more reasonable than the
current one.
>
>>
>>>> Not sure, it eats up a whole byte. Dapeng seemed to favour separate
>>>> intr/user vector width (although I'm not quite sure what the use would
>>>> be).
>> The reason that I prefer to add 2 separate "words" items is that the user
>> could sample interrupt and user space SIMD regs (but with different
>> bit-widths) simultaneously in theory, like "--intr-regs=YMM0, --user-regs=XMM0".
> I'm not sure why the user wants a different bit-width. The
> "--user-regs=XMM0" doesn't seem to provide more useful information.
>
> Anyway, I believe the tool can handle this case. The tool can always ask
> YMM0 for both --intr-regs and --user-regs, but only output the XMM0 for
> --user-regs. The only drawback is that the kernel may dump extra
> information for the --user-regs. I don't think it's a big problem.
If we intend to handle it in user space tools, I'm not sure whether the
user space tool can easily know which records are from user space and
filter out the SIMD regs from kernel space, or how complicated the change
would be. IMO, adding an extra u16 "words" field would be much easier and
won't consume too much memory.
>
> Thanks,
> Kan
>>
>>>> If you want an explicit bit, we might as well use one from __reserved_1,
>>>> we still have some left.
>>> OK. I may add a sample_simd_reg : 1 to explicitly tell kernel to utilize
>>> the sample_simd_reg_XXX.
>>>
>>> Thanks,
>>> Kan
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 00/12] Support vector and more extended registers in perf
2025-06-18 12:28 ` Mi, Dapeng
@ 2025-06-18 13:15 ` Liang, Kan
2025-06-19 0:41 ` Mi, Dapeng
0 siblings, 1 reply; 60+ messages in thread
From: Liang, Kan @ 2025-06-18 13:15 UTC (permalink / raw)
To: Mi, Dapeng, Peter Zijlstra
Cc: mingo, acme, namhyung, tglx, dave.hansen, irogers, adrian.hunter,
jolsa, alexander.shishkin, linux-kernel, ak, zide.chen
On 2025-06-18 8:28 a.m., Mi, Dapeng wrote:
>>>>> Not sure, it eats up a whole byte. Dapeng seemed to favour separate
>>>>> intr/user vector width (although I'm not quite sure what the use would
>>>>> be).
>>> The reason that I prefer to add 2 separate "words" items is that the user
>>> could sample interrupt and user space SIMD regs (but with different
>>> bit-widths) simultaneously in theory, like "--intr-regs=YMM0, --user-regs=XMM0".
>> I'm not sure why the user wants a different bit-width. The
>> "--user-regs=XMM0" doesn't seem to provide more useful information.
>>
>> Anyway, I believe the tool can handle this case. The tool can always ask
>> YMM0 for both --intr-regs and --user-regs, but only output the XMM0 for
>> --user-regs. The only drawback is that the kernel may dump extra
>> information for the --user-regs. I don't think it's a big problem.
> If we intend to handle it in user space tools, I'm not sure whether the
> user space tool can easily know which records are from user space and
> filter out the SIMD regs from kernel space, or how complicated the change
> would be. IMO, adding an extra u16 "words" field would be much easier and
> won't consume too much memory.
The filtering is always done in the kernel for --user-regs. The only
difference is that the YMM (after filtering) will be dumped to perf.data.
The tool just shows the XMM registers to the end user.
Thanks,
Kan
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 00/12] Support vector and more extended registers in perf
2025-06-18 13:15 ` Liang, Kan
@ 2025-06-19 0:41 ` Mi, Dapeng
2025-06-19 11:11 ` Liang, Kan
0 siblings, 1 reply; 60+ messages in thread
From: Mi, Dapeng @ 2025-06-19 0:41 UTC (permalink / raw)
To: Liang, Kan, Peter Zijlstra
Cc: mingo, acme, namhyung, tglx, dave.hansen, irogers, adrian.hunter,
jolsa, alexander.shishkin, linux-kernel, ak, zide.chen
On 6/18/2025 9:15 PM, Liang, Kan wrote:
>
> On 2025-06-18 8:28 a.m., Mi, Dapeng wrote:
>>>>>> Not sure, it eats up a whole byte. Dapeng seemed to favour separate
>>>>>> intr/user vector width (although I'm not quite sure what the use would
>>>>>> be).
>>>> The reason that I prefer to add 2 separate "words" items is that the user
>>>> could sample interrupt and user space SIMD regs (but with different
>>>> bit-widths) simultaneously in theory, like "--intr-regs=YMM0, --user-regs=XMM0".
>>> I'm not sure why the user wants a different bit-width. The
>>> "--user-regs=XMM0" doesn't seem to provide more useful information.
>>>
>>> Anyway, I believe the tool can handle this case. The tool can always ask
>>> YMM0 for both --intr-regs and --user-regs, but only output the XMM0 for
>>> --user-regs. The only drawback is that the kernel may dump extra
>>> information for the --user-regs. I don't think it's a big problem.
>> If we intend to handle it in user space tools, I'm not sure whether the
>> user space tool can easily know which records are from user space and
>> filter out the SIMD regs from kernel space, or how complicated the change
>> would be. IMO, adding an extra u16 "words" field would be much easier and
>> won't consume too much memory.
> The filtering is always done in the kernel for --user-regs. The only
> difference is that the YMM (after filtering) will be dumped to perf.data.
> The tool just shows the XMM registers to the end user.
Ok. But there could be another case: a user may want to sample some APX
eGPRs in user space and sample SIMD regs at interrupt time, like
"--intr-regs=YMM0, --user-regs=R16"; then we have to define 2 separate
"words" fields.
Anyway, it looks like we would define a SIMD_REGS structure like the one
below, and I suppose we would create 2 instances, one for interrupt and
the other for user space. That's enough.
PERF_SAMPLE_SIMD_REGS := {
u16 nr_vectors;
u16 vector_length;
u16 nr_pred;
u16 pred_length;
u64 data[];
}
>
> Thanks,
> Kan
>
>
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 00/12] Support vector and more extended registers in perf
2025-06-19 0:41 ` Mi, Dapeng
@ 2025-06-19 11:11 ` Liang, Kan
2025-06-19 12:26 ` Mi, Dapeng
2025-06-19 13:38 ` Peter Zijlstra
0 siblings, 2 replies; 60+ messages in thread
From: Liang, Kan @ 2025-06-19 11:11 UTC (permalink / raw)
To: Mi, Dapeng, Peter Zijlstra, Mark Rutland
Cc: mingo, acme, namhyung, tglx, dave.hansen, irogers, adrian.hunter,
jolsa, alexander.shishkin, linux-kernel, ak, zide.chen
On 2025-06-18 8:41 p.m., Mi, Dapeng wrote:
>
> On 6/18/2025 9:15 PM, Liang, Kan wrote:
>>
>> On 2025-06-18 8:28 a.m., Mi, Dapeng wrote:
>>>>>>> Not sure, it eats up a whole byte. Dapeng seemed to favour separate
>>>>>>> intr/user vector width (although I'm not quite sure what the use would
>>>>>>> be).
>>>>> The reason that I prefer to add 2 separate "words" items is that the user
>>>>> could sample interrupt and user space SIMD regs (but with different
>>>>> bit-widths) simultaneously in theory, like "--intr-regs=YMM0, --user-regs=XMM0".
>>>> I'm not sure why the user wants a different bit-width. The
>>>> "--user-regs=XMM0" doesn't seem to provide more useful information.
>>>>
>>>> Anyway, I believe the tool can handle this case. The tool can always ask
>>>> YMM0 for both --intr-regs and --user-regs, but only output the XMM0 for
>>>> --user-regs. The only drawback is that the kernel may dump extra
>>>> information for the --user-regs. I don't think it's a big problem.
>>> If we intend to handle it in user space tools, I'm not sure whether the
>>> user space tool can easily know which records are from user space and
>>> filter out the SIMD regs from kernel space, or how complicated the change
>>> would be. IMO, adding an extra u16 "words" field would be much easier and
>>> won't consume too much memory.
>> The filtering is always done in the kernel for --user-regs. The only
>> difference is that the YMM (after filtering) will be dumped to perf.data.
>> The tool just shows the XMM registers to the end user.
>
> Ok. But there could be another case: a user may want to sample some APX
> eGPRs in user space and sample SIMD regs at interrupt time, like
> "--intr-regs=YMM0, --user-regs=R16"; then we have to define 2 separate "words" fields.
>
Not for eGPRs. It uses the regular GP regs space, which implies u64 for
a 64b kernel. The "words" fields are only for vector and predicate registers.
I've started working on the V2. The new interface would be as below.
diff --git a/include/uapi/linux/perf_event.h
b/include/uapi/linux/perf_event.h
index 78a362b80027..f7b8971fa99d 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -382,6 +382,7 @@ enum perf_event_read_format {
#define PERF_ATTR_SIZE_VER6 120 /* Add: aux_sample_size */
#define PERF_ATTR_SIZE_VER7 128 /* Add: sig_data */
#define PERF_ATTR_SIZE_VER8 136 /* Add: config3 */
+#define PERF_ATTR_SIZE_VER9 184 /* Add: sample_simd_regs */
/*
* 'struct perf_event_attr' contains various attributes that define
@@ -543,6 +544,24 @@ struct perf_event_attr {
__u64 sig_data;
__u64 config3; /* extension of config2 */
+
+
+ /*
+ * Defines the set of SIMD registers to dump on samples.
+ * sample_simd_reg_enabled != 0 implies that the sample_simd_*
+ * fields are used to configure all SIMD registers.
+ * If !sample_simd_reg_enabled, sample_regs_XXX may be used to
+ * configure some SIMD registers on X86.
+ */
+ union {
+ __u16 sample_simd_reg_enabled;
+ __u16 sample_simd_pred_reg_qwords;
+ };
+ __u16 sample_simd_pred_reg_intr;
+ __u16 sample_simd_pred_reg_user;
+ __u16 sample_simd_reg_qwords;
+ __u64 sample_simd_reg_intr;
+ __u64 sample_simd_reg_user;
};
/*
@@ -1016,7 +1035,15 @@ enum perf_event_type {
* } && PERF_SAMPLE_BRANCH_STACK
*
* { u64 abi; # enum perf_sample_regs_abi
- * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER
+ * u64 regs[weight(mask)];
+ * struct {
+ * u16 nr_vectors;
+ * u16 vector_qwords;
+ * u16 nr_pred;
+ * u16 pred_qwords;
+ * u64 data[nr_vectors * vector_qwords + nr_pred * pred_qwords];
+ * } && sample_simd_reg_enabled
+ * } && PERF_SAMPLE_REGS_USER
*
* { u64 size;
* char data[size];
@@ -1043,7 +1070,15 @@ enum perf_event_type {
* { u64 data_src; } && PERF_SAMPLE_DATA_SRC
* { u64 transaction; } && PERF_SAMPLE_TRANSACTION
* { u64 abi; # enum perf_sample_regs_abi
- * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
+ * u64 regs[weight(mask)];
+ * struct {
+ * u16 nr_vectors;
+ * u16 vector_qwords;
+ * u16 nr_pred;
+ * u16 pred_qwords;
+ * u64 data[nr_vectors * vector_qwords + nr_pred * pred_qwords];
+ * } && sample_simd_reg_enabled
+ * } && PERF_SAMPLE_REGS_INTR
* { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR
* { u64 cgroup;} && PERF_SAMPLE_CGROUP
* { u64 data_page_size;} && PERF_SAMPLE_DATA_PAGE_SIZE
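For what it's worth, a consumer would size the new block roughly like this
(a sketch against the layout above; the struct name is made up):

        struct simd_sample_hdr {
                __u16 nr_vectors;
                __u16 vector_qwords;
                __u16 nr_pred;
                __u16 pred_qwords;
        };

        /* Number of u64 data words that follow the header in the sample. */
        static __u64 simd_sample_qwords(const struct simd_sample_hdr *h)
        {
                return (__u64)h->nr_vectors * h->vector_qwords +
                       (__u64)h->nr_pred * h->pred_qwords;
        }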
Thanks,
Kan
^ permalink raw reply related [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 00/12] Support vector and more extended registers in perf
2025-06-19 11:11 ` Liang, Kan
@ 2025-06-19 12:26 ` Mi, Dapeng
2025-06-19 13:38 ` Peter Zijlstra
1 sibling, 0 replies; 60+ messages in thread
From: Mi, Dapeng @ 2025-06-19 12:26 UTC (permalink / raw)
To: Liang, Kan, Peter Zijlstra, Mark Rutland
Cc: mingo, acme, namhyung, tglx, dave.hansen, irogers, adrian.hunter,
jolsa, alexander.shishkin, linux-kernel, ak, zide.chen
On 6/19/2025 7:11 PM, Liang, Kan wrote:
>
> On 2025-06-18 8:41 p.m., Mi, Dapeng wrote:
>> On 6/18/2025 9:15 PM, Liang, Kan wrote:
>>> On 2025-06-18 8:28 a.m., Mi, Dapeng wrote:
>>>>>>>> Not sure, it eats up a whole byte. Dapeng seemed to favour separate
>>>>>>>> intr/user vector width (although I'm not quite sure what the use would
>>>>>>>> be).
>>>>>> The reason that I prefer to add 2 separate "words" items is that the user
>>>>>> could sample interrupt and user space SIMD regs (but with different
>>>>>> bit-widths) simultaneously in theory, like "--intr-regs=YMM0, --user-regs=XMM0".
>>>>> I'm not sure why the user wants a different bit-width. The
>>>>> "--user-regs=XMM0" doesn't seem to provide more useful information.
>>>>>
>>>>> Anyway, I believe the tool can handle this case. The tool can always ask
>>>>> YMM0 for both --intr-regs and --user-regs, but only output the XMM0 for
>>>>> --user-regs. The only drawback is that the kernel may dump extra
>>>>> information for the --user-regs. I don't think it's a big problem.
>>>> If we intend to handle it in user space tools, I'm not sure whether the
>>>> user space tool can easily know which records are from user space and
>>>> filter out the SIMD regs from kernel space, or how complicated the change
>>>> would be. IMO, adding an extra u16 "words" field would be much easier and
>>>> won't consume too much memory.
>>> The filtering is always done in the kernel for --user-regs. The only
>>> difference is that the YMM (after filtering) will be dumped to perf.data.
>>> The tool just shows the XMM registers to the end user.
>> Ok. But there could be another case: a user may want to sample some APX
>> eGPRs in user space and sample SIMD regs at interrupt time, like
>> "--intr-regs=YMM0, --user-regs=R16"; then we have to define 2 separate "words" fields.
>>
> Not for eGPRs. It uses the regular GP regs space, which implies u64 for
> a 64b kernel. The "words" fields are only for vector and predicate registers.
>
> I've started working on the V2. The new interface would be as below.
>
> diff --git a/include/uapi/linux/perf_event.h
> b/include/uapi/linux/perf_event.h
> index 78a362b80027..f7b8971fa99d 100644
> --- a/include/uapi/linux/perf_event.h
> +++ b/include/uapi/linux/perf_event.h
> @@ -382,6 +382,7 @@ enum perf_event_read_format {
> #define PERF_ATTR_SIZE_VER6 120 /* Add: aux_sample_size */
> #define PERF_ATTR_SIZE_VER7 128 /* Add: sig_data */
> #define PERF_ATTR_SIZE_VER8 136 /* Add: config3 */
> +#define PERF_ATTR_SIZE_VER9 184 /* Add: sample_simd_regs */
>
> /*
> * 'struct perf_event_attr' contains various attributes that define
> @@ -543,6 +544,24 @@ struct perf_event_attr {
> __u64 sig_data;
>
> __u64 config3; /* extension of config2 */
> +
> +
> + /*
> + * Defines the set of SIMD registers to dump on samples.
> + * sample_simd_reg_enabled != 0 implies that the sample_simd_*
> + * fields are used to configure all SIMD registers.
> + * If !sample_simd_reg_enabled, sample_regs_XXX may be used to
> + * configure some SIMD registers on X86.
> + */
> + union {
> + __u16 sample_simd_reg_enabled;
> + __u16 sample_simd_pred_reg_qwords;
> + };
> + __u16 sample_simd_pred_reg_intr;
> + __u16 sample_simd_pred_reg_user;
This is still a bitmap, right? Is it enough for ARM?
> + __u16 sample_simd_reg_qwords;
> + __u64 sample_simd_reg_intr;
> + __u64 sample_simd_reg_user;
> };
>
> /*
> @@ -1016,7 +1035,15 @@ enum perf_event_type {
> * } && PERF_SAMPLE_BRANCH_STACK
> *
> * { u64 abi; # enum perf_sample_regs_abi
> - * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER
> + * u64 regs[weight(mask)];
> + * struct {
> + * u16 nr_vectors;
> + * u16 vector_qwords;
> + * u16 nr_pred;
> + * u16 pred_qwords;
> + * u64 data[nr_vectors * vector_qwords + nr_pred * pred_qwords];
> + * } && sample_simd_reg_enabled
> + * } && PERF_SAMPLE_REGS_USER
> *
> * { u64 size;
> * char data[size];
> @@ -1043,7 +1070,15 @@ enum perf_event_type {
> * { u64 data_src; } && PERF_SAMPLE_DATA_SRC
> * { u64 transaction; } && PERF_SAMPLE_TRANSACTION
> * { u64 abi; # enum perf_sample_regs_abi
> - * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
> + * u64 regs[weight(mask)];
> + * struct {
> + * u16 nr_vectors;
> + * u16 vector_qwords;
> + * u16 nr_pred;
> + * u16 pred_qwords;
> + * u64 data[nr_vectors * vector_qwords + nr_pred * pred_qwords];
> + * } && sample_simd_reg_enabled
> + * } && PERF_SAMPLE_REGS_INTR
> * { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR
> * { u64 cgroup;} && PERF_SAMPLE_CGROUP
> * { u64 data_page_size;} && PERF_SAMPLE_DATA_PAGE_SIZE
>
>
> Thanks,
> Kan
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 00/12] Support vector and more extended registers in perf
2025-06-19 11:11 ` Liang, Kan
2025-06-19 12:26 ` Mi, Dapeng
@ 2025-06-19 13:38 ` Peter Zijlstra
2025-06-19 14:27 ` Liang, Kan
1 sibling, 1 reply; 60+ messages in thread
From: Peter Zijlstra @ 2025-06-19 13:38 UTC (permalink / raw)
To: Liang, Kan
Cc: Mi, Dapeng, Mark Rutland, mingo, acme, namhyung, tglx,
dave.hansen, irogers, adrian.hunter, jolsa, alexander.shishkin,
linux-kernel, ak, zide.chen
On Thu, Jun 19, 2025 at 07:11:23AM -0400, Liang, Kan wrote:
> @@ -543,6 +544,24 @@ struct perf_event_attr {
> __u64 sig_data;
>
> __u64 config3; /* extension of config2 */
> +
> +
> + /*
> + * Defines the set of SIMD registers to dump on samples.
> + * sample_simd_reg_enabled != 0 implies that the sample_simd_*
> + * fields are used to configure all SIMD registers.
> + * If !sample_simd_reg_enabled, sample_regs_XXX may be used to
> + * configure some SIMD registers on X86.
> + */
> + union {
> + __u16 sample_simd_reg_enabled;
> + __u16 sample_simd_pred_reg_qwords;
> + };
> + __u16 sample_simd_pred_reg_intr;
> + __u16 sample_simd_pred_reg_user;
This limits things to max 16 predicate registers. ARM will fully fill
that with present hardware.
> + __u16 sample_simd_reg_qwords;
> + __u64 sample_simd_reg_intr;
> + __u64 sample_simd_reg_user;
I would perhaps make this vec_reg.
> };
>
> /*
> @@ -1016,7 +1035,15 @@ enum perf_event_type {
> * } && PERF_SAMPLE_BRANCH_STACK
> *
> * { u64 abi; # enum perf_sample_regs_abi
> - * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER
> + * u64 regs[weight(mask)];
> + * struct {
> + * u16 nr_vectors;
> + * u16 vector_qwords;
> + * u16 nr_pred;
> + * u16 pred_qwords;
> + * u64 data[nr_vectors * vector_qwords + nr_pred * pred_qwords];
> + * } && sample_simd_reg_enabled
Instead of using sample_simd_reg_enabled here I would perhaps extend
perf_sample_regs_abi. The current abi word is woefully underused.
Also, realistically, what you want to look at here is:
sample_simd_{pred,vec}_reg_user;
If those are empty, there will be no registers.
> + * } && PERF_SAMPLE_REGS_USER
> *
> * { u64 size;
> * char data[size];
^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [RFC PATCH 00/12] Support vector and more extended registers in perf
2025-06-19 13:38 ` Peter Zijlstra
@ 2025-06-19 14:27 ` Liang, Kan
0 siblings, 0 replies; 60+ messages in thread
From: Liang, Kan @ 2025-06-19 14:27 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Mi, Dapeng, Mark Rutland, mingo, acme, namhyung, tglx,
dave.hansen, irogers, adrian.hunter, jolsa, alexander.shishkin,
linux-kernel, ak, zide.chen
On 2025-06-19 9:38 a.m., Peter Zijlstra wrote:
> On Thu, Jun 19, 2025 at 07:11:23AM -0400, Liang, Kan wrote:
>
>> @@ -543,6 +544,24 @@ struct perf_event_attr {
>> __u64 sig_data;
>>
>> __u64 config3; /* extension of config2 */
>> +
>> +
>> + /*
>> + * Defines the set of SIMD registers to dump on samples.
>> + * sample_simd_reg_enabled != 0 implies that the sample_simd_*
>> + * fields are used to configure all SIMD registers.
>> + * If !sample_simd_reg_enabled, sample_regs_XXX may be used to
>> + * configure some SIMD registers on X86.
>> + */
>> + union {
>> + __u16 sample_simd_reg_enabled;
>> + __u16 sample_simd_pred_reg_qwords;
>> + };
>> + __u16 sample_simd_pred_reg_intr;
>> + __u16 sample_simd_pred_reg_user;
>
> This limits things to max 16 predicate registers. ARM will fully fill
> that with present hardware.
I think I can use __u32 for predicate registers.
It means we need one more u64 for the qwords. It should not be a problem.
>
>> + __u16 sample_simd_reg_qwords;
>> + __u64 sample_simd_reg_intr;
>> + __u64 sample_simd_reg_user;
>
> I would perhaps make this vec_reg.
Sure.
>
>> };
>>
>> /*
>> @@ -1016,7 +1035,15 @@ enum perf_event_type {
>> * } && PERF_SAMPLE_BRANCH_STACK
>> *
>> * { u64 abi; # enum perf_sample_regs_abi
>> - * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER
>> + * u64 regs[weight(mask)];
>> + * struct {
>> + * u16 nr_vectors;
>> + * u16 vector_qwords;
>> + * u16 nr_pred;
>> + * u16 pred_qwords;
>> + * u64 data[nr_vectors * vector_qwords + nr_pred * pred_qwords];
>> + * } && sample_simd_reg_enabled
>
> Instead of using sample_simd_reg_enabled here I would perhaps extend
> perf_sample_regs_abi. The current abi word is woefully underused.
>
Yes. Now I think the abi is used like a version number. I guess I can
add PERF_SAMPLE_REGS_ABI_SIMD and change it to a bitmap.
There should be no impact on existing tools, since the version and bitmap
encodings are the same for 1 and 2.
enum perf_sample_regs_abi {
- PERF_SAMPLE_REGS_ABI_NONE = 0,
- PERF_SAMPLE_REGS_ABI_32 = 1,
- PERF_SAMPLE_REGS_ABI_64 = 2,
+ PERF_SAMPLE_REGS_ABI_NONE = 0x0,
+ PERF_SAMPLE_REGS_ABI_32 = 0x1,
+ PERF_SAMPLE_REGS_ABI_64 = 0x2,
+ PERF_SAMPLE_REGS_ABI_SIMD = 0x4,
};
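That keeps the consumer-side check backward compatible; a sketch against
the enum above (the helper name is made up):

        /*
         * Old tools compare the abi word against 1/2 and keep working;
         * new tools test the SIMD bit before parsing the extra block.
         */
        static int sample_has_simd_regs(__u64 abi)
        {
                return (abi & PERF_SAMPLE_REGS_ABI_SIMD) != 0;
        }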
> Also, realistically, what you want to look at here is:
>
> sample_simd_{pred,vec}_reg_user;
>
> If those are empty, there will be no registers.
Sure. But I will still keep the sample_simd_reg_enabled, since it can
explicitly tell if the new format is used.
Thanks,
Kan
>
>> + * } && PERF_SAMPLE_REGS_USER
>> *
>> * { u64 size;
>> * char data[size];
>
^ permalink raw reply [flat|nested] 60+ messages in thread