All of lore.kernel.org
 help / color / mirror / Atom feed
From: Dapeng Mi <dapeng1.mi@linux.intel.com>
To: Peter Zijlstra <peterz@infradead.org>,
	Ingo Molnar <mingo@redhat.com>,
	Arnaldo Carvalho de Melo <acme@kernel.org>,
	Namhyung Kim <namhyung@kernel.org>,
	Ian Rogers <irogers@google.com>,
	Adrian Hunter <adrian.hunter@intel.com>,
	Alexander Shishkin <alexander.shishkin@linux.intel.com>,
	Kan Liang <kan.liang@linux.intel.com>,
	Andi Kleen <ak@linux.intel.com>,
	Eranian Stephane <eranian@google.com>
Cc: linux-kernel@vger.kernel.org, linux-perf-users@vger.kernel.org,
	Dapeng Mi <dapeng1.mi@intel.com>,
	Dapeng Mi <dapeng1.mi@linux.intel.com>
Subject: [Patch v3 17/22] perf/x86/intel: Support arch-PEBS vector registers group capturing
Date: Tue, 15 Apr 2025 11:44:23 +0000	[thread overview]
Message-ID: <20250415114428.341182-18-dapeng1.mi@linux.intel.com> (raw)
In-Reply-To: <20250415114428.341182-1-dapeng1.mi@linux.intel.com>

Add x86/intel specific vector register (VECR) group capturing for
arch-PEBS. Enable the corresponding VECR group bits in the
GPx_CFG_C/FX0_CFG_C MSRs if the user sets these vector registers in
the perf_event_attr bitmaps, and parse the VECR groups in the
arch-PEBS record.

Currently, vector register capturing is only supported by PEBS-based
sampling; the PMU driver returns an error if PMI-based sampling tries
to capture these vector registers.

Co-developed-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
---
 arch/x86/events/core.c            | 90 +++++++++++++++++++++++++++++-
 arch/x86/events/intel/core.c      | 15 +++++
 arch/x86/events/intel/ds.c        | 93 ++++++++++++++++++++++++++++---
 arch/x86/include/asm/msr-index.h  |  6 ++
 arch/x86/include/asm/perf_event.h | 20 +++++++
 5 files changed, 214 insertions(+), 10 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 0ccbe8385c7f..16f019ff44f1 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -580,6 +580,73 @@ int x86_pmu_max_precise(struct pmu *pmu)
 	return precise;
 }
 
+static bool has_vec_regs(struct perf_event *event, bool user,
+			 int start, int end)
+{
+	int idx = (start - PERF_REG_EXTENDED_OFFSET) / 64;
+	int s = start % 64;
+	int e = end % 64;
+	u64 regs_mask;
+
+	if (user)
+		regs_mask = event->attr.sample_regs_user_ext[idx];
+	else
+		regs_mask = event->attr.sample_regs_intr_ext[idx];
+
+	return regs_mask & GENMASK_ULL(e, s);
+}
+
+static inline bool has_ymm_regs(struct perf_event *event, bool user)
+{
+	return has_vec_regs(event, user, PERF_REG_X86_YMM0, PERF_REG_X86_YMM_MAX - 1);
+}
+
+static inline bool has_zmm_regs(struct perf_event *event, bool user)
+{
+	return has_vec_regs(event, user, PERF_REG_X86_ZMM0, PERF_REG_X86_ZMM8 - 1) ||
+	       has_vec_regs(event, user, PERF_REG_X86_ZMM8, PERF_REG_X86_ZMM16 - 1);
+}
+
+static inline bool has_h16zmm_regs(struct perf_event *event, bool user)
+{
+	return has_vec_regs(event, user, PERF_REG_X86_ZMM16, PERF_REG_X86_ZMM24 - 1) ||
+	       has_vec_regs(event, user, PERF_REG_X86_ZMM24, PERF_REG_X86_ZMM_MAX - 1);
+}
+
+static inline bool has_opmask_regs(struct perf_event *event, bool user)
+{
+	return has_vec_regs(event, user, PERF_REG_X86_OPMASK0, PERF_REG_X86_OPMASK7);
+}
+
+static bool ext_vec_regs_supported(struct perf_event *event, bool user)
+{
+	u64 caps = hybrid(event->pmu, arch_pebs_cap).caps;
+
+	if (!(event->pmu->capabilities & PERF_PMU_CAP_MORE_EXT_REGS))
+		return false;
+
+	if (has_opmask_regs(event, user) && !(caps & ARCH_PEBS_VECR_OPMASK))
+		return false;
+
+	if (has_ymm_regs(event, user) && !(caps & ARCH_PEBS_VECR_YMMH))
+		return false;
+
+	if (has_zmm_regs(event, user) && !(caps & ARCH_PEBS_VECR_ZMMH))
+		return false;
+
+	if (has_h16zmm_regs(event, user) && !(caps & ARCH_PEBS_VECR_H16ZMM))
+		return false;
+
+	if (!event->attr.precise_ip)
+		return false;
+
+	/* Only user space sampling is allowed for extended vector registers. */
+	if (user && !event->attr.exclude_kernel)
+		return false;
+
+	return true;
+}
+
 int x86_pmu_hw_config(struct perf_event *event)
 {
 	if (event->attr.precise_ip) {
@@ -665,9 +732,12 @@ int x86_pmu_hw_config(struct perf_event *event)
 			return -EINVAL;
 	}
 
-	/* sample_regs_user never support XMM registers */
-	if (unlikely(event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK))
-		return -EINVAL;
+	if (unlikely(event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK)) {
+		/* Only user space sampling is allowed for XMM registers. */
+		if (!event->attr.exclude_kernel)
+			return -EINVAL;
+	}
+
 	/*
 	 * Besides the general purpose registers, XMM registers may
 	 * be collected in PEBS on some platforms, e.g. Icelake
@@ -680,6 +750,20 @@ int x86_pmu_hw_config(struct perf_event *event)
 			return -EINVAL;
 	}
 
+	/*
+	 * Architectural PEBS supports to capture more vector registers besides
+	 * XMM registers, like YMM, OPMASK and ZMM registers.
+	 */
+	if (unlikely(has_more_extended_user_regs(event))) {
+		if (!ext_vec_regs_supported(event, true))
+			return -EINVAL;
+	}
+
+	if (unlikely(has_more_extended_intr_regs(event))) {
+		if (!ext_vec_regs_supported(event, false))
+			return -EINVAL;
+	}
+
 	return x86_setup_perfctr(event);
 }
 
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index b6416535f84d..9bd77974d83b 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3007,6 +3007,18 @@ static void intel_pmu_enable_event_ext(struct perf_event *event)
 			if (pebs_data_cfg & PEBS_DATACFG_XMMS)
 				ext |= ARCH_PEBS_VECR_XMM & cap.caps;
 
+			if (pebs_data_cfg & PEBS_DATACFG_YMMHS)
+				ext |= ARCH_PEBS_VECR_YMMH & cap.caps;
+
+			if (pebs_data_cfg & PEBS_DATACFG_OPMASKS)
+				ext |= ARCH_PEBS_VECR_OPMASK & cap.caps;
+
+			if (pebs_data_cfg & PEBS_DATACFG_ZMMHS)
+				ext |= ARCH_PEBS_VECR_ZMMH & cap.caps;
+
+			if (pebs_data_cfg & PEBS_DATACFG_H16ZMMS)
+				ext |= ARCH_PEBS_VECR_H16ZMM & cap.caps;
+
 			if (pebs_data_cfg & PEBS_DATACFG_LBRS)
 				ext |= ARCH_PEBS_LBR & cap.caps;
 
@@ -5426,6 +5438,9 @@ static inline void __intel_update_pmu_caps(struct pmu *pmu)
 
 	if (hybrid(pmu, arch_pebs_cap).caps & ARCH_PEBS_VECR_XMM)
 		dest_pmu->capabilities |= PERF_PMU_CAP_EXTENDED_REGS;
+
+	if (hybrid(pmu, arch_pebs_cap).caps & ARCH_PEBS_VECR_EXT)
+		dest_pmu->capabilities |= PERF_PMU_CAP_MORE_EXT_REGS;
 }
 
 static inline void __intel_update_large_pebs_flags(struct pmu *pmu)
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 91a093cba11f..26220bfbe885 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1425,6 +1425,34 @@ void intel_pmu_pebs_late_setup(struct cpu_hw_events *cpuc)
 				PERF_SAMPLE_TRANSACTION |		     \
 				PERF_SAMPLE_DATA_PAGE_SIZE)
 
+static u64 pebs_get_ext_reg_data_cfg(unsigned long *ext_reg)
+{
+	u64 pebs_data_cfg = 0;
+	int bit;
+
+	for_each_set_bit(bit, ext_reg, PERF_NUM_EXT_REGS) {
+		switch (bit + PERF_REG_EXTENDED_OFFSET) {
+		case PERF_REG_X86_OPMASK0 ... PERF_REG_X86_OPMASK7:
+			pebs_data_cfg |= PEBS_DATACFG_OPMASKS;
+			break;
+		case PERF_REG_X86_YMM0 ... PERF_REG_X86_YMM_MAX - 1:
+			pebs_data_cfg |= PEBS_DATACFG_YMMHS | PEBS_DATACFG_XMMS;
+			break;
+		case PERF_REG_X86_ZMM0 ... PERF_REG_X86_ZMM16 - 1:
+			pebs_data_cfg |= PEBS_DATACFG_ZMMHS | PEBS_DATACFG_YMMHS |
+					 PEBS_DATACFG_XMMS;
+			break;
+		case PERF_REG_X86_ZMM16 ... PERF_REG_X86_ZMM_MAX - 1:
+			pebs_data_cfg |= PEBS_DATACFG_H16ZMMS;
+			break;
+		default:
+			break;
+		}
+	}
+
+	return pebs_data_cfg;
+}
+
 static u64 pebs_update_adaptive_cfg(struct perf_event *event)
 {
 	struct perf_event_attr *attr = &event->attr;
@@ -1459,9 +1487,21 @@ static u64 pebs_update_adaptive_cfg(struct perf_event *event)
 	if (gprs || (attr->precise_ip < 2) || tsx_weight)
 		pebs_data_cfg |= PEBS_DATACFG_GP;
 
-	if ((sample_type & PERF_SAMPLE_REGS_INTR) &&
-	    (attr->sample_regs_intr & PERF_REG_EXTENDED_MASK))
-		pebs_data_cfg |= PEBS_DATACFG_XMMS;
+	if (sample_type & PERF_SAMPLE_REGS_INTR) {
+		if (attr->sample_regs_intr & PERF_REG_EXTENDED_MASK)
+			pebs_data_cfg |= PEBS_DATACFG_XMMS;
+
+		pebs_data_cfg |= pebs_get_ext_reg_data_cfg(
+			(unsigned long *)event->attr.sample_regs_intr_ext);
+	}
+
+	if (sample_type & PERF_SAMPLE_REGS_USER) {
+		if (attr->sample_regs_user & PERF_REG_EXTENDED_MASK)
+			pebs_data_cfg |= PEBS_DATACFG_XMMS;
+
+		pebs_data_cfg |= pebs_get_ext_reg_data_cfg(
+			(unsigned long *)event->attr.sample_regs_user_ext);
+	}
 
 	if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
 		/*
@@ -2245,6 +2285,10 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
 
 	perf_regs = container_of(regs, struct x86_perf_regs, regs);
 	perf_regs->xmm_regs = NULL;
+	perf_regs->ymmh_regs = NULL;
+	perf_regs->opmask_regs = NULL;
+	perf_regs->zmmh_regs = NULL;
+	perf_regs->h16zmm_regs = NULL;
 	perf_regs->ssp = 0;
 
 	format_group = basic->format_group;
@@ -2362,6 +2406,10 @@ static void setup_arch_pebs_sample_data(struct perf_event *event,
 
 	perf_regs = container_of(regs, struct x86_perf_regs, regs);
 	perf_regs->xmm_regs = NULL;
+	perf_regs->ymmh_regs = NULL;
+	perf_regs->opmask_regs = NULL;
+	perf_regs->zmmh_regs = NULL;
+	perf_regs->h16zmm_regs = NULL;
 	perf_regs->ssp = 0;
 
 	__setup_perf_sample_data(event, iregs, data);
@@ -2412,14 +2460,45 @@ static void setup_arch_pebs_sample_data(struct perf_event *event,
 					   meminfo->tsx_tuning, ax);
 	}
 
-	if (header->xmm) {
+	if (header->xmm || header->ymmh || header->opmask ||
+	    header->zmmh || header->h16zmm) {
 		struct arch_pebs_xmm *xmm;
+		struct arch_pebs_ymmh *ymmh;
+		struct arch_pebs_zmmh *zmmh;
+		struct arch_pebs_h16zmm *h16zmm;
+		struct arch_pebs_opmask *opmask;
 
 		next_record += sizeof(struct arch_pebs_xer_header);
 
-		xmm = next_record;
-		perf_regs->xmm_regs = xmm->xmm;
-		next_record = xmm + 1;
+		if (header->xmm) {
+			xmm = next_record;
+			perf_regs->xmm_regs = xmm->xmm;
+			next_record = xmm + 1;
+		}
+
+		if (header->ymmh) {
+			ymmh = next_record;
+			perf_regs->ymmh_regs = ymmh->ymmh;
+			next_record = ymmh + 1;
+		}
+
+		if (header->opmask) {
+			opmask = next_record;
+			perf_regs->opmask_regs = opmask->opmask;
+			next_record = opmask + 1;
+		}
+
+		if (header->zmmh) {
+			zmmh = next_record;
+			perf_regs->zmmh_regs = zmmh->zmmh;
+			next_record = zmmh + 1;
+		}
+
+		if (header->h16zmm) {
+			h16zmm = next_record;
+			perf_regs->h16zmm_regs = h16zmm->h16zmm;
+			next_record = h16zmm + 1;
+		}
 	}
 
 	if (header->lbr) {
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index c971ac09d881..93193eb6ff94 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -328,6 +328,12 @@
 #define ARCH_PEBS_LBR_SHIFT		40
 #define ARCH_PEBS_LBR			(0x3ull << ARCH_PEBS_LBR_SHIFT)
 #define ARCH_PEBS_VECR_XMM		BIT_ULL(49)
+#define ARCH_PEBS_VECR_YMMH		BIT_ULL(50)
+#define ARCH_PEBS_VECR_OPMASK		BIT_ULL(53)
+#define ARCH_PEBS_VECR_ZMMH		BIT_ULL(54)
+#define ARCH_PEBS_VECR_H16ZMM		BIT_ULL(55)
+#define ARCH_PEBS_VECR_EXT_SHIFT	50
+#define ARCH_PEBS_VECR_EXT		(0x3full << ARCH_PEBS_VECR_EXT_SHIFT)
 #define ARCH_PEBS_GPR			BIT_ULL(61)
 #define ARCH_PEBS_AUX			BIT_ULL(62)
 #define ARCH_PEBS_EN			BIT_ULL(63)
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 560eb218868c..a7b2548bf7b4 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -142,6 +142,10 @@
 #define PEBS_DATACFG_LBRS	BIT_ULL(3)
 #define PEBS_DATACFG_CNTR	BIT_ULL(4)
 #define PEBS_DATACFG_METRICS	BIT_ULL(5)
+#define PEBS_DATACFG_YMMHS	BIT_ULL(6)
+#define PEBS_DATACFG_OPMASKS	BIT_ULL(7)
+#define PEBS_DATACFG_ZMMHS	BIT_ULL(8)
+#define PEBS_DATACFG_H16ZMMS	BIT_ULL(9)
 #define PEBS_DATACFG_LBR_SHIFT	24
 #define PEBS_DATACFG_CNTR_SHIFT	32
 #define PEBS_DATACFG_CNTR_MASK	GENMASK_ULL(15, 0)
@@ -589,6 +593,22 @@ struct arch_pebs_xmm {
 	u64 xmm[16*2];		/* two entries for each register */
 };
 
+struct arch_pebs_ymmh {
+	u64 ymmh[16*2];		/* two entries for each register */
+};
+
+struct arch_pebs_opmask {
+	u64 opmask[8];
+};
+
+struct arch_pebs_zmmh {
+	u64 zmmh[16*4];		/* four entries for each register */
+};
+
+struct arch_pebs_h16zmm {
+	u64 h16zmm[16*8];	/* eight entries for each register */
+};
+
 #define ARCH_PEBS_LBR_NAN		0x0
 #define ARCH_PEBS_LBR_NUM_8		0x1
 #define ARCH_PEBS_LBR_NUM_16		0x2
-- 
2.40.1


  parent reply	other threads:[~2025-04-15  8:24 UTC|newest]

Thread overview: 53+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-04-15 11:44 [Patch v3 00/22] Arch-PEBS and PMU supports for Clearwater Forest and Panther Lake Dapeng Mi
2025-04-15 11:44 ` [Patch v3 01/22] perf/x86/intel: Add Panther Lake support Dapeng Mi
2025-04-17 13:01   ` [tip: perf/core] " tip-bot2 for Kan Liang
2025-04-15 11:44 ` [Patch v3 02/22] perf/x86/intel: Add PMU support for Clearwater Forest Dapeng Mi
2025-04-17 13:01   ` [tip: perf/core] " tip-bot2 for Dapeng Mi
2025-04-15 11:44 ` [Patch v3 03/22] perf/x86/intel: Parse CPUID archPerfmonExt leaves for non-hybrid CPUs Dapeng Mi
2025-04-17 13:01   ` [tip: perf/core] " tip-bot2 for Dapeng Mi
2025-04-15 11:44 ` [Patch v3 04/22] perf/x86/intel: Decouple BTS initialization from PEBS initialization Dapeng Mi
2025-04-17 13:01   ` [tip: perf/core] " tip-bot2 for Dapeng Mi
2025-04-15 11:44 ` [Patch v3 05/22] perf/x86/intel: Rename x86_pmu.pebs to x86_pmu.ds_pebs Dapeng Mi
2025-04-17 13:01   ` [tip: perf/core] " tip-bot2 for Dapeng Mi
2025-04-15 11:44 ` [Patch v3 06/22] perf/x86/intel: Introduce pairs of PEBS static calls Dapeng Mi
2025-04-17 13:00   ` [tip: perf/core] " tip-bot2 for Dapeng Mi
2025-04-15 11:44 ` [Patch v3 07/22] perf/x86/intel: Initialize architectural PEBS Dapeng Mi
2025-04-15 11:44 ` [Patch v3 08/22] perf/x86/intel/ds: Factor out PEBS record processing code to functions Dapeng Mi
2025-04-15 11:44 ` [Patch v3 09/22] perf/x86/intel/ds: Factor out PEBS group " Dapeng Mi
2025-04-15 11:44 ` [Patch v3 10/22] perf/x86/intel: Process arch-PEBS records or record fragments Dapeng Mi
2025-04-15 13:57   ` Peter Zijlstra
2025-04-15 16:09     ` Liang, Kan
2025-04-15 11:44 ` [Patch v3 11/22] perf/x86/intel: Allocate arch-PEBS buffer and initialize PEBS_BASE MSR Dapeng Mi
2025-04-15 13:45   ` Peter Zijlstra
2025-04-16  0:59     ` Mi, Dapeng
2025-04-15 13:48   ` Peter Zijlstra
2025-04-16  1:03     ` Mi, Dapeng
2025-04-15 11:44 ` [Patch v3 12/22] perf/x86/intel: Update dyn_constranit base on PEBS event precise level Dapeng Mi
2025-04-15 13:53   ` Peter Zijlstra
2025-04-15 16:31     ` Liang, Kan
2025-04-16  1:46       ` Mi, Dapeng
2025-04-16 13:59         ` Liang, Kan
2025-04-17  1:15           ` Mi, Dapeng
2025-04-16 15:32       ` Peter Zijlstra
2025-04-16 19:45         ` Liang, Kan
2025-04-16 19:56           ` Peter Zijlstra
2025-04-22 22:50             ` Liang, Kan
2025-04-15 11:44 ` [Patch v3 13/22] perf/x86/intel: Setup PEBS data configuration and enable legacy groups Dapeng Mi
2025-04-15 11:44 ` [Patch v3 14/22] perf/x86/intel: Add counter group support for arch-PEBS Dapeng Mi
2025-04-15 11:44 ` [Patch v3 15/22] perf/x86/intel: Support SSP register capturing " Dapeng Mi
2025-04-15 14:07   ` Peter Zijlstra
2025-04-16  5:49     ` Mi, Dapeng
2025-04-15 11:44 ` [Patch v3 16/22] perf/core: Support to capture higher width vector registers Dapeng Mi
2025-04-15 14:36   ` Peter Zijlstra
2025-04-16  6:42     ` Mi, Dapeng
2025-04-16 15:53       ` Peter Zijlstra
2025-04-17  2:00         ` Mi, Dapeng
2025-04-22  3:05         ` Mi, Dapeng
2025-04-15 11:44 ` Dapeng Mi [this message]
2025-04-15 11:44 ` [Patch v3 18/22] perf tools: Support to show SSP register Dapeng Mi
2025-04-15 11:44 ` [Patch v3 19/22] perf tools: Enhance arch__intr/user_reg_mask() helpers Dapeng Mi
2025-04-15 11:44 ` [Patch v3 20/22] perf tools: Enhance sample_regs_user/intr to capture more registers Dapeng Mi
2025-04-15 11:44 ` [Patch v3 21/22] perf tools: Support to capture more vector registers (x86/Intel) Dapeng Mi
2025-04-15 11:44 ` [Patch v3 22/22] perf tools/tests: Add vector registers PEBS sampling test Dapeng Mi
2025-04-15 15:21 ` [Patch v3 00/22] Arch-PEBS and PMU supports for Clearwater Forest and Panther Lake Liang, Kan
2025-04-16  7:42   ` Peter Zijlstra

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250415114428.341182-18-dapeng1.mi@linux.intel.com \
    --to=dapeng1.mi@linux.intel.com \
    --cc=acme@kernel.org \
    --cc=adrian.hunter@intel.com \
    --cc=ak@linux.intel.com \
    --cc=alexander.shishkin@linux.intel.com \
    --cc=dapeng1.mi@intel.com \
    --cc=eranian@google.com \
    --cc=irogers@google.com \
    --cc=kan.liang@linux.intel.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-perf-users@vger.kernel.org \
    --cc=mingo@redhat.com \
    --cc=namhyung@kernel.org \
    --cc=peterz@infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.