From: Peter Zijlstra <peterz@infradead.org>
To: Dapeng Mi <dapeng1.mi@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>,
Arnaldo Carvalho de Melo <acme@kernel.org>,
Namhyung Kim <namhyung@kernel.org>,
Ian Rogers <irogers@google.com>,
Adrian Hunter <adrian.hunter@intel.com>,
Alexander Shishkin <alexander.shishkin@linux.intel.com>,
Kan Liang <kan.liang@linux.intel.com>,
Andi Kleen <ak@linux.intel.com>,
Eranian Stephane <eranian@google.com>,
linux-kernel@vger.kernel.org, linux-perf-users@vger.kernel.org,
Dapeng Mi <dapeng1.mi@intel.com>
Subject: Re: [Patch v3 16/22] perf/core: Support to capture higher width vector registers
Date: Tue, 15 Apr 2025 16:36:26 +0200 [thread overview]
Message-ID: <20250415143626.GF4031@noisy.programming.kicks-ass.net> (raw)
In-Reply-To: <20250415114428.341182-17-dapeng1.mi@linux.intel.com>
On Tue, Apr 15, 2025 at 11:44:22AM +0000, Dapeng Mi wrote:
> extern unsigned long perf_arch_instruction_pointer(struct pt_regs *regs);
> diff --git a/arch/x86/include/uapi/asm/perf_regs.h b/arch/x86/include/uapi/asm/perf_regs.h
> index f9c5b16b1882..5e2d9796b2cc 100644
> --- a/arch/x86/include/uapi/asm/perf_regs.h
> +++ b/arch/x86/include/uapi/asm/perf_regs.h
> @@ -33,7 +33,7 @@ enum perf_event_x86_regs {
> PERF_REG_X86_32_MAX = PERF_REG_X86_GS + 1,
> PERF_REG_X86_64_MAX = PERF_REG_X86_SSP + 1,
>
> - /* These all need two bits set because they are 128bit */
> + /* These all need two bits set because they are 128 bits */
> PERF_REG_X86_XMM0 = 32,
> PERF_REG_X86_XMM1 = 34,
> PERF_REG_X86_XMM2 = 36,
> @@ -53,6 +53,83 @@ enum perf_event_x86_regs {
>
> /* These include both GPRs and XMMX registers */
> PERF_REG_X86_XMM_MAX = PERF_REG_X86_XMM15 + 2,
> +
> + /* Leave bits[127:64] for other GP registers, like R16 ~ R31.*/
> +
> + /*
> + * Each YMM register needs 4 bits to represent because they are 256 bits.
> + * PERF_REG_X86_YMMH0 = 128
> + */
> + PERF_REG_X86_YMM0 = 128,
> + PERF_REG_X86_YMM1 = PERF_REG_X86_YMM0 + 4,
> + PERF_REG_X86_YMM2 = PERF_REG_X86_YMM1 + 4,
> + PERF_REG_X86_YMM3 = PERF_REG_X86_YMM2 + 4,
> + PERF_REG_X86_YMM4 = PERF_REG_X86_YMM3 + 4,
> + PERF_REG_X86_YMM5 = PERF_REG_X86_YMM4 + 4,
> + PERF_REG_X86_YMM6 = PERF_REG_X86_YMM5 + 4,
> + PERF_REG_X86_YMM7 = PERF_REG_X86_YMM6 + 4,
> + PERF_REG_X86_YMM8 = PERF_REG_X86_YMM7 + 4,
> + PERF_REG_X86_YMM9 = PERF_REG_X86_YMM8 + 4,
> + PERF_REG_X86_YMM10 = PERF_REG_X86_YMM9 + 4,
> + PERF_REG_X86_YMM11 = PERF_REG_X86_YMM10 + 4,
> + PERF_REG_X86_YMM12 = PERF_REG_X86_YMM11 + 4,
> + PERF_REG_X86_YMM13 = PERF_REG_X86_YMM12 + 4,
> + PERF_REG_X86_YMM14 = PERF_REG_X86_YMM13 + 4,
> + PERF_REG_X86_YMM15 = PERF_REG_X86_YMM14 + 4,
> + PERF_REG_X86_YMM_MAX = PERF_REG_X86_YMM15 + 4,
> +
> + /*
> + * Each ZMM register needs 8 bits to represent because they are 512 bits
> + * PERF_REG_X86_ZMMH0 = 192
> + */
> + PERF_REG_X86_ZMM0 = PERF_REG_X86_YMM_MAX,
> + PERF_REG_X86_ZMM1 = PERF_REG_X86_ZMM0 + 8,
> + PERF_REG_X86_ZMM2 = PERF_REG_X86_ZMM1 + 8,
> + PERF_REG_X86_ZMM3 = PERF_REG_X86_ZMM2 + 8,
> + PERF_REG_X86_ZMM4 = PERF_REG_X86_ZMM3 + 8,
> + PERF_REG_X86_ZMM5 = PERF_REG_X86_ZMM4 + 8,
> + PERF_REG_X86_ZMM6 = PERF_REG_X86_ZMM5 + 8,
> + PERF_REG_X86_ZMM7 = PERF_REG_X86_ZMM6 + 8,
> + PERF_REG_X86_ZMM8 = PERF_REG_X86_ZMM7 + 8,
> + PERF_REG_X86_ZMM9 = PERF_REG_X86_ZMM8 + 8,
> + PERF_REG_X86_ZMM10 = PERF_REG_X86_ZMM9 + 8,
> + PERF_REG_X86_ZMM11 = PERF_REG_X86_ZMM10 + 8,
> + PERF_REG_X86_ZMM12 = PERF_REG_X86_ZMM11 + 8,
> + PERF_REG_X86_ZMM13 = PERF_REG_X86_ZMM12 + 8,
> + PERF_REG_X86_ZMM14 = PERF_REG_X86_ZMM13 + 8,
> + PERF_REG_X86_ZMM15 = PERF_REG_X86_ZMM14 + 8,
> + PERF_REG_X86_ZMM16 = PERF_REG_X86_ZMM15 + 8,
> + PERF_REG_X86_ZMM17 = PERF_REG_X86_ZMM16 + 8,
> + PERF_REG_X86_ZMM18 = PERF_REG_X86_ZMM17 + 8,
> + PERF_REG_X86_ZMM19 = PERF_REG_X86_ZMM18 + 8,
> + PERF_REG_X86_ZMM20 = PERF_REG_X86_ZMM19 + 8,
> + PERF_REG_X86_ZMM21 = PERF_REG_X86_ZMM20 + 8,
> + PERF_REG_X86_ZMM22 = PERF_REG_X86_ZMM21 + 8,
> + PERF_REG_X86_ZMM23 = PERF_REG_X86_ZMM22 + 8,
> + PERF_REG_X86_ZMM24 = PERF_REG_X86_ZMM23 + 8,
> + PERF_REG_X86_ZMM25 = PERF_REG_X86_ZMM24 + 8,
> + PERF_REG_X86_ZMM26 = PERF_REG_X86_ZMM25 + 8,
> + PERF_REG_X86_ZMM27 = PERF_REG_X86_ZMM26 + 8,
> + PERF_REG_X86_ZMM28 = PERF_REG_X86_ZMM27 + 8,
> + PERF_REG_X86_ZMM29 = PERF_REG_X86_ZMM28 + 8,
> + PERF_REG_X86_ZMM30 = PERF_REG_X86_ZMM29 + 8,
> + PERF_REG_X86_ZMM31 = PERF_REG_X86_ZMM30 + 8,
> + PERF_REG_X86_ZMM_MAX = PERF_REG_X86_ZMM31 + 8,
> +
> + /*
> + * OPMASK Registers
> + * PERF_REG_X86_OPMASK0 = 448
> + */
> + PERF_REG_X86_OPMASK0 = PERF_REG_X86_ZMM_MAX,
> + PERF_REG_X86_OPMASK1 = PERF_REG_X86_OPMASK0 + 1,
> + PERF_REG_X86_OPMASK2 = PERF_REG_X86_OPMASK1 + 1,
> + PERF_REG_X86_OPMASK3 = PERF_REG_X86_OPMASK2 + 1,
> + PERF_REG_X86_OPMASK4 = PERF_REG_X86_OPMASK3 + 1,
> + PERF_REG_X86_OPMASK5 = PERF_REG_X86_OPMASK4 + 1,
> + PERF_REG_X86_OPMASK6 = PERF_REG_X86_OPMASK5 + 1,
> + PERF_REG_X86_OPMASK7 = PERF_REG_X86_OPMASK6 + 1,
> +
> + PERF_REG_X86_VEC_MAX = PERF_REG_X86_OPMASK7 + 1,
> };
> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> index 5fc753c23734..78aae0464a54 100644
> --- a/include/uapi/linux/perf_event.h
> +++ b/include/uapi/linux/perf_event.h
> @@ -379,6 +379,10 @@ enum perf_event_read_format {
> #define PERF_ATTR_SIZE_VER6 120 /* add: aux_sample_size */
> #define PERF_ATTR_SIZE_VER7 128 /* add: sig_data */
> #define PERF_ATTR_SIZE_VER8 136 /* add: config3 */
> +#define PERF_ATTR_SIZE_VER9 168 /* add: sample_regs_intr_ext[PERF_EXT_REGS_ARRAY_SIZE] */
> +
> +#define PERF_EXT_REGS_ARRAY_SIZE 7
> +#define PERF_NUM_EXT_REGS (PERF_EXT_REGS_ARRAY_SIZE * 64)
>
> /*
> * Hardware event_id to monitor via a performance monitoring event:
> @@ -533,6 +537,13 @@ struct perf_event_attr {
> __u64 sig_data;
>
> __u64 config3; /* extension of config2 */
> +
> + /*
> + * Extension sets of regs to dump for each sample.
> + * See asm/perf_regs.h for details.
> + */
> + __u64 sample_regs_intr_ext[PERF_EXT_REGS_ARRAY_SIZE];
> + __u64 sample_regs_user_ext[PERF_EXT_REGS_ARRAY_SIZE];
> };
>
> /*
I still utterly hate this interface. This is a giant waste of bits.
What makes it even worse is that XMMn is the lower half of YMMn which in
turn is the lower half of ZMMn.
So by exposing only ZMMn you already expose all of them. The interface
explicitly allows asking for sub-words.
But most importantly of all, last time I asked if there are users that
actually care about the whole per-register thing and I don't see an
answer here.
Can we please find a better interface? Ideally one that scales up to
1024 and 2048 bit vector width, because I'd hate to have to rev this
again.
Perhaps add sample_vec_regs_*[] with a saner format, and if that is !0
then the XMM regs disappear from sample_regs_*[] and we get to use that
space to extend the GPs.
next prev parent reply other threads:[~2025-04-15 14:36 UTC|newest]
Thread overview: 53+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-04-15 11:44 [Patch v3 00/22] Arch-PEBS and PMU supports for Clearwater Forest and Panther Lake Dapeng Mi
2025-04-15 11:44 ` [Patch v3 01/22] perf/x86/intel: Add Panther Lake support Dapeng Mi
2025-04-17 13:01 ` [tip: perf/core] " tip-bot2 for Kan Liang
2025-04-15 11:44 ` [Patch v3 02/22] perf/x86/intel: Add PMU support for Clearwater Forest Dapeng Mi
2025-04-17 13:01 ` [tip: perf/core] " tip-bot2 for Dapeng Mi
2025-04-15 11:44 ` [Patch v3 03/22] perf/x86/intel: Parse CPUID archPerfmonExt leaves for non-hybrid CPUs Dapeng Mi
2025-04-17 13:01 ` [tip: perf/core] " tip-bot2 for Dapeng Mi
2025-04-15 11:44 ` [Patch v3 04/22] perf/x86/intel: Decouple BTS initialization from PEBS initialization Dapeng Mi
2025-04-17 13:01 ` [tip: perf/core] " tip-bot2 for Dapeng Mi
2025-04-15 11:44 ` [Patch v3 05/22] perf/x86/intel: Rename x86_pmu.pebs to x86_pmu.ds_pebs Dapeng Mi
2025-04-17 13:01 ` [tip: perf/core] " tip-bot2 for Dapeng Mi
2025-04-15 11:44 ` [Patch v3 06/22] perf/x86/intel: Introduce pairs of PEBS static calls Dapeng Mi
2025-04-17 13:00 ` [tip: perf/core] " tip-bot2 for Dapeng Mi
2025-04-15 11:44 ` [Patch v3 07/22] perf/x86/intel: Initialize architectural PEBS Dapeng Mi
2025-04-15 11:44 ` [Patch v3 08/22] perf/x86/intel/ds: Factor out PEBS record processing code to functions Dapeng Mi
2025-04-15 11:44 ` [Patch v3 09/22] perf/x86/intel/ds: Factor out PEBS group " Dapeng Mi
2025-04-15 11:44 ` [Patch v3 10/22] perf/x86/intel: Process arch-PEBS records or record fragments Dapeng Mi
2025-04-15 13:57 ` Peter Zijlstra
2025-04-15 16:09 ` Liang, Kan
2025-04-15 11:44 ` [Patch v3 11/22] perf/x86/intel: Allocate arch-PEBS buffer and initialize PEBS_BASE MSR Dapeng Mi
2025-04-15 13:45 ` Peter Zijlstra
2025-04-16 0:59 ` Mi, Dapeng
2025-04-15 13:48 ` Peter Zijlstra
2025-04-16 1:03 ` Mi, Dapeng
2025-04-15 11:44 ` [Patch v3 12/22] perf/x86/intel: Update dyn_constranit base on PEBS event precise level Dapeng Mi
2025-04-15 13:53 ` Peter Zijlstra
2025-04-15 16:31 ` Liang, Kan
2025-04-16 1:46 ` Mi, Dapeng
2025-04-16 13:59 ` Liang, Kan
2025-04-17 1:15 ` Mi, Dapeng
2025-04-16 15:32 ` Peter Zijlstra
2025-04-16 19:45 ` Liang, Kan
2025-04-16 19:56 ` Peter Zijlstra
2025-04-22 22:50 ` Liang, Kan
2025-04-15 11:44 ` [Patch v3 13/22] perf/x86/intel: Setup PEBS data configuration and enable legacy groups Dapeng Mi
2025-04-15 11:44 ` [Patch v3 14/22] perf/x86/intel: Add counter group support for arch-PEBS Dapeng Mi
2025-04-15 11:44 ` [Patch v3 15/22] perf/x86/intel: Support SSP register capturing " Dapeng Mi
2025-04-15 14:07 ` Peter Zijlstra
2025-04-16 5:49 ` Mi, Dapeng
2025-04-15 11:44 ` [Patch v3 16/22] perf/core: Support to capture higher width vector registers Dapeng Mi
2025-04-15 14:36 ` Peter Zijlstra [this message]
2025-04-16 6:42 ` Mi, Dapeng
2025-04-16 15:53 ` Peter Zijlstra
2025-04-17 2:00 ` Mi, Dapeng
2025-04-22 3:05 ` Mi, Dapeng
2025-04-15 11:44 ` [Patch v3 17/22] perf/x86/intel: Support arch-PEBS vector registers group capturing Dapeng Mi
2025-04-15 11:44 ` [Patch v3 18/22] perf tools: Support to show SSP register Dapeng Mi
2025-04-15 11:44 ` [Patch v3 19/22] perf tools: Enhance arch__intr/user_reg_mask() helpers Dapeng Mi
2025-04-15 11:44 ` [Patch v3 20/22] perf tools: Enhance sample_regs_user/intr to capture more registers Dapeng Mi
2025-04-15 11:44 ` [Patch v3 21/22] perf tools: Support to capture more vector registers (x86/Intel) Dapeng Mi
2025-04-15 11:44 ` [Patch v3 22/22] perf tools/tests: Add vector registers PEBS sampling test Dapeng Mi
2025-04-15 15:21 ` [Patch v3 00/22] Arch-PEBS and PMU supports for Clearwater Forest and Panther Lake Liang, Kan
2025-04-16 7:42 ` Peter Zijlstra
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250415143626.GF4031@noisy.programming.kicks-ass.net \
--to=peterz@infradead.org \
--cc=acme@kernel.org \
--cc=adrian.hunter@intel.com \
--cc=ak@linux.intel.com \
--cc=alexander.shishkin@linux.intel.com \
--cc=dapeng1.mi@intel.com \
--cc=dapeng1.mi@linux.intel.com \
--cc=eranian@google.com \
--cc=irogers@google.com \
--cc=kan.liang@linux.intel.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-perf-users@vger.kernel.org \
--cc=mingo@redhat.com \
--cc=namhyung@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox