* [PATCH V6 1/3] perf/x86/intel/ds: Add PEBS format 6
From: kan.liang @ 2024-12-18 15:16 UTC (permalink / raw)
To: peterz, mingo, acme, namhyung, irogers, linux-kernel,
linux-perf-users
Cc: ak, eranian, dapeng1.mi, Kan Liang, stable
From: Kan Liang <kan.liang@linux.intel.com>
The only difference between PEBS format 5 and format 6 is the new
counters snapshotting group. Without the following counters
snapshotting enabling patches, it's impossible to utilize the feature
in a PEBS record, so it's safe to share the same code path with
format 5.
Add format 6, so the end user can at least utilize the legacy PEBS
features.
Fixes: a932aa0e868f ("perf/x86: Add Lunar Lake and Arrow Lake support")
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Cc: stable@vger.kernel.org
---
No changes since V5
arch/x86/events/intel/ds.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 8dcf90f6fb59..ba74e1198328 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -2551,6 +2551,7 @@ void __init intel_ds_init(void)
x86_pmu.large_pebs_flags |= PERF_SAMPLE_TIME;
break;
+ case 6:
case 5:
x86_pmu.pebs_ept = 1;
fallthrough;
--
2.38.1
* [PATCH V6 2/3] perf: Extend perf_output_read
From: kan.liang @ 2024-12-18 15:16 UTC (permalink / raw)
To: peterz, mingo, acme, namhyung, irogers, linux-kernel,
linux-perf-users
Cc: ak, eranian, dapeng1.mi, Kan Liang
From: Kan Liang <kan.liang@linux.intel.com>
The event may have been updated in the PMU-specific implementation,
e.g., Intel PEBS counters snapshotting. The common code should not
read and overwrite the value.
The PERF_SAMPLE_READ in the data->sample_type can be used to detect
whether the PMU-specific value is available. If yes, avoid the
pmu->read() in the common code.
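For illustration only (not part of this patch), a rough sketch of the
driver-side contract this enables. hypothetical_setup_sample() and
hypothetical_hw_count() are made-up names standing in for a PMU
driver's sample path and its record parsing; the kernel helpers used
(local64_set(), for_each_sibling_event()) are the existing ones.

	static void hypothetical_setup_sample(struct perf_event *event,
					      struct perf_sample_data *data)
	{
		struct perf_event *leader = event->group_leader, *sub;

		/* counts come from the PMU-specific record, not pmu->read() */
		local64_set(&leader->count, hypothetical_hw_count(leader));
		for_each_sibling_event(sub, leader)
			local64_set(&sub->count, hypothetical_hw_count(sub));

		/* tell perf_output_read() the counts are already up to date */
		data->sample_flags |= PERF_SAMPLE_READ;
	}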
Reviewed-by: Andi Kleen <ak@linux.intel.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
No changes since V5
kernel/events/core.c | 15 ++++++++-------
1 file changed, 8 insertions(+), 7 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 684d631e78da..e2045403521a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7422,7 +7422,7 @@ static void perf_output_read_one(struct perf_output_handle *handle,
static void perf_output_read_group(struct perf_output_handle *handle,
struct perf_event *event,
- u64 enabled, u64 running)
+ u64 enabled, u64 running, bool read)
{
struct perf_event *leader = event->group_leader, *sub;
u64 read_format = event->attr.read_format;
@@ -7445,7 +7445,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
values[n++] = running;
- if ((leader != event) &&
+ if ((leader != event) && read &&
(leader->state == PERF_EVENT_STATE_ACTIVE))
leader->pmu->read(leader);
@@ -7460,7 +7460,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
for_each_sibling_event(sub, leader) {
n = 0;
- if ((sub != event) &&
+ if ((sub != event) && read &&
(sub->state == PERF_EVENT_STATE_ACTIVE))
sub->pmu->read(sub);
@@ -7491,7 +7491,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
* all cores.
*/
static void perf_output_read(struct perf_output_handle *handle,
- struct perf_event *event)
+ struct perf_event *event,
+ bool read)
{
u64 enabled = 0, running = 0, now;
u64 read_format = event->attr.read_format;
@@ -7509,7 +7510,7 @@ static void perf_output_read(struct perf_output_handle *handle,
calc_timer_values(event, &now, &enabled, &running);
if (event->attr.read_format & PERF_FORMAT_GROUP)
- perf_output_read_group(handle, event, enabled, running);
+ perf_output_read_group(handle, event, enabled, running, read);
else
perf_output_read_one(handle, event, enabled, running);
}
@@ -7551,7 +7552,7 @@ void perf_output_sample(struct perf_output_handle *handle,
perf_output_put(handle, data->period);
if (sample_type & PERF_SAMPLE_READ)
- perf_output_read(handle, event);
+ perf_output_read(handle, event, !(data->sample_flags & PERF_SAMPLE_READ));
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
int size = 1;
@@ -8195,7 +8196,7 @@ perf_event_read_event(struct perf_event *event,
return;
perf_output_put(&handle, read_event);
- perf_output_read(&handle, event);
+ perf_output_read(&handle, event, true);
perf_event__output_id_sample(event, &handle, &sample);
perf_output_end(&handle);
--
2.38.1
* [PATCH V6 3/3] perf/x86/intel: Support PEBS counters snapshotting
From: kan.liang @ 2024-12-18 15:16 UTC (permalink / raw)
To: peterz, mingo, acme, namhyung, irogers, linux-kernel,
linux-perf-users
Cc: ak, eranian, dapeng1.mi, Kan Liang
From: Kan Liang <kan.liang@linux.intel.com>
Counters snapshotting is a new adaptive PEBS extension, which can
capture programmable counters, fixed-function counters, and performance
metrics in a PEBS record. The feature is available in PEBS format V6.
The target counters can be configured in the new fields of MSR_PEBS_CFG.
The PEBS HW then generates a bit mask of the captured counters (the
Counters Group Header), followed by the contents of all the requested
counters, in the PEBS record.
The current Linux perf sample read feature intends to read the counters
of the other member events when the leader event overflows. But that
read happens in the NMI handler, which may have a small gap from the
overflow. Use the counters snapshotting feature for the sample read
instead.
Extend intel_update_topdown_event() to accept the value from PEBS
records.
Add a new PEBS_CNTR flag to indicate a sample read group that utilizes
the counters snapshotting feature. When the group is scheduled, the
PEBS configuration can be updated accordingly.
To prevent the case that a PEBS record value might be in the past
relative to what is already in the event, perf always stops the PMU and
drains the PEBS buffer before updating the corresponding event->count.
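For illustration only (not part of this patch), a minimal user-space
sketch of the kind of group that exercises the feature, assuming a CPU
with PEBS format 6: the leader is a precise (PEBS) sampling event with
PERF_SAMPLE_READ and PERF_FORMAT_GROUP, so each PEBS record carries a
snapshot of the sibling's count instead of an NMI-time read. The event
selection, period, and function name are arbitrary.

	#include <linux/perf_event.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
				   int cpu, int group_fd, unsigned long flags)
	{
		return syscall(__NR_perf_event_open, attr, pid, cpu,
			       group_fd, flags);
	}

	static int open_snapshot_group(void)
	{
		struct perf_event_attr leader, sibling;
		int leader_fd, sibling_fd;

		memset(&leader, 0, sizeof(leader));
		leader.type = PERF_TYPE_HARDWARE;
		leader.size = sizeof(leader);
		leader.config = PERF_COUNT_HW_CPU_CYCLES;
		leader.sample_period = 100000;
		leader.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_READ;
		leader.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
		leader.precise_ip = 2;		/* request a PEBS event */
		leader.disabled = 1;

		memset(&sibling, 0, sizeof(sibling));
		sibling.type = PERF_TYPE_HARDWARE;
		sibling.size = sizeof(sibling);
		sibling.config = PERF_COUNT_HW_INSTRUCTIONS;
		sibling.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;

		leader_fd = perf_event_open(&leader, 0, -1, -1, 0);
		if (leader_fd < 0)
			return -1;
		sibling_fd = perf_event_open(&sibling, 0, -1, leader_fd, 0);
		if (sibling_fd < 0)
			return -1;
		return leader_fd;
	}

A roughly equivalent perf tool invocation is a sample-read group such
as perf record -e '{cycles:pp,instructions}:S', with the precise
modifier on the sampling leader.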
Reviewed-by: Andi Kleen <ak@linux.intel.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
Changes since V5:
- Drop the extension for the x86_pmu_update(). Only extend
intel_pmu_update_topdown_event()
- Add dedicated intel_perf_event_pmc_to_count() to update the
event->count from a PEBS record
- Always drain the PEBS buffer before updating the corresponding
event->count and update an example in the comments.
arch/x86/events/intel/core.c | 95 ++++++++++++++++----
arch/x86/events/intel/ds.c | 137 ++++++++++++++++++++++++++---
arch/x86/events/perf_event.h | 15 +++-
arch/x86/events/perf_event_flags.h | 2 +-
arch/x86/include/asm/perf_event.h | 15 ++++
5 files changed, 235 insertions(+), 29 deletions(-)
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 2e1e26846050..403eea444b71 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2710,7 +2710,7 @@ static void update_saved_topdown_regs(struct perf_event *event, u64 slots,
* modify by a NMI. PMU has to be disabled before calling this function.
*/
-static u64 intel_update_topdown_event(struct perf_event *event, int metric_end)
+static u64 intel_update_topdown_event(struct perf_event *event, int metric_end, u64 *val)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct perf_event *other;
@@ -2718,13 +2718,18 @@ static u64 intel_update_topdown_event(struct perf_event *event, int metric_end)
bool reset = true;
int idx;
- /* read Fixed counter 3 */
- rdpmcl((3 | INTEL_PMC_FIXED_RDPMC_BASE), slots);
- if (!slots)
- return 0;
+ if (!val) {
+ /* read Fixed counter 3 */
+ rdpmcl((3 | INTEL_PMC_FIXED_RDPMC_BASE), slots);
+ if (!slots)
+ return 0;
- /* read PERF_METRICS */
- rdpmcl(INTEL_PMC_FIXED_RDPMC_METRICS, metrics);
+ /* read PERF_METRICS */
+ rdpmcl(INTEL_PMC_FIXED_RDPMC_METRICS, metrics);
+ } else {
+ slots = val[0];
+ metrics = val[1];
+ }
for_each_set_bit(idx, cpuc->active_mask, metric_end + 1) {
if (!is_topdown_idx(idx))
@@ -2767,13 +2772,14 @@ static u64 intel_update_topdown_event(struct perf_event *event, int metric_end)
return slots;
}
-static u64 icl_update_topdown_event(struct perf_event *event)
+static u64 icl_update_topdown_event(struct perf_event *event, u64 *val)
{
return intel_update_topdown_event(event, INTEL_PMC_IDX_METRIC_BASE +
- x86_pmu.num_topdown_events - 1);
+ x86_pmu.num_topdown_events - 1,
+ val);
}
-DEFINE_STATIC_CALL(intel_pmu_update_topdown_event, x86_perf_event_update);
+DEFINE_STATIC_CALL(intel_pmu_update_topdown_event, intel_pmu_topdown_event_update);
static void intel_pmu_read_topdown_event(struct perf_event *event)
{
@@ -2785,14 +2791,15 @@ static void intel_pmu_read_topdown_event(struct perf_event *event)
return;
perf_pmu_disable(event->pmu);
- static_call(intel_pmu_update_topdown_event)(event);
+ static_call(intel_pmu_update_topdown_event)(event, NULL);
perf_pmu_enable(event->pmu);
}
static void intel_pmu_read_event(struct perf_event *event)
{
- if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
- intel_pmu_auto_reload_read(event);
+ if ((event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) ||
+ is_pebs_counter_event(event))
+ intel_pmu_pebs_read(event);
else if (is_topdown_count(event))
intel_pmu_read_topdown_event(event);
else
@@ -2925,7 +2932,7 @@ static int intel_pmu_set_period(struct perf_event *event)
static u64 intel_pmu_update(struct perf_event *event)
{
if (unlikely(is_topdown_count(event)))
- return static_call(intel_pmu_update_topdown_event)(event);
+ return static_call(intel_pmu_update_topdown_event)(event, NULL);
return x86_perf_event_update(event);
}
@@ -3091,7 +3098,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
*/
if (__test_and_clear_bit(GLOBAL_STATUS_PERF_METRICS_OVF_BIT, (unsigned long *)&status)) {
handled++;
- static_call(intel_pmu_update_topdown_event)(NULL);
+ static_call(intel_pmu_update_topdown_event)(NULL, NULL);
}
/*
@@ -3109,6 +3116,27 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
if (!test_bit(bit, cpuc->active_mask))
continue;
+ /*
+ * There may be unprocessed PEBS records in the PEBS buffer,
+ * which still stores the previous values.
+ * Process those records first before handling the latest value.
+ * For example,
+ * A is a regular counter
+ * B is a PEBS event which reads A
+ * C is a PEBS event
+ *
+ * The following can happen:
+ * B-assist A=1
+ * C A=2
+ * B-assist A=3
+ * A-overflow-PMI A=4
+ * C-assist-PMI (PEBS buffer) A=5
+ *
+ * The PEBS buffer has to be drained before handling the A-PMI
+ */
+ if (is_pebs_counter_event(event))
+ x86_pmu.drain_pebs(regs, &data);
+
if (!intel_pmu_save_and_restart(event))
continue;
@@ -4056,6 +4084,23 @@ static int intel_pmu_hw_config(struct perf_event *event)
event->hw.flags |= PERF_X86_EVENT_PEBS_VIA_PT;
}
+ if ((event->attr.sample_type & PERF_SAMPLE_READ) &&
+ (x86_pmu.intel_cap.pebs_format >= 6)) {
+ struct perf_event *leader = event->group_leader;
+ bool slots_leader = is_slots_event(leader);
+
+ if (slots_leader)
+ leader = list_next_entry(leader, sibling_list);
+
+ if (leader->attr.precise_ip) {
+ event->hw.flags |= PERF_X86_EVENT_PEBS_CNTR;
+ if (slots_leader) {
+ leader->hw.flags |= PERF_X86_EVENT_PEBS_CNTR;
+ event->group_leader->hw.flags |= PERF_X86_EVENT_PEBS_CNTR;
+ }
+ }
+ }
+
if ((event->attr.type == PERF_TYPE_HARDWARE) ||
(event->attr.type == PERF_TYPE_HW_CACHE))
return 0;
@@ -4159,6 +4204,24 @@ static int intel_pmu_hw_config(struct perf_event *event)
return 0;
}
+static int intel_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
+{
+ struct perf_event *event;
+ int ret = x86_schedule_events(cpuc, n, assign);
+
+ if (ret)
+ return ret;
+
+ if (cpuc->is_fake)
+ return ret;
+
+ event = cpuc->event_list[n - 1];
+ if (event && (event->group_leader->hw.flags & PERF_X86_EVENT_PEBS_CNTR))
+ intel_pmu_pebs_update_cfg(cpuc, n, assign);
+
+ return 0;
+}
+
/*
* Currently, the only caller of this function is the atomic_switch_perf_msrs().
* The host perf context helps to prepare the values of the real hardware for
@@ -5292,7 +5355,7 @@ static __initconst const struct x86_pmu intel_pmu = {
.set_period = intel_pmu_set_period,
.update = intel_pmu_update,
.hw_config = intel_pmu_hw_config,
- .schedule_events = x86_schedule_events,
+ .schedule_events = intel_pmu_schedule_events,
.eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
.perfctr = MSR_ARCH_PERFMON_PERFCTR0,
.fixedctr = MSR_ARCH_PERFMON_FIXED_CTR0,
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index ba74e1198328..11bd97c6e582 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1308,10 +1308,63 @@ static void adaptive_pebs_record_size_update(void)
sz += sizeof(struct pebs_xmm);
if (pebs_data_cfg & PEBS_DATACFG_LBRS)
sz += x86_pmu.lbr_nr * sizeof(struct lbr_entry);
+ if (pebs_data_cfg & (PEBS_DATACFG_METRICS | PEBS_DATACFG_CNTR)) {
+ sz += sizeof(struct pebs_cntr_header);
+
+ /* Metrics base and Metrics Data */
+ if (pebs_data_cfg & PEBS_DATACFG_METRICS)
+ sz += 2 * sizeof(u64);
+
+ if (pebs_data_cfg & PEBS_DATACFG_CNTR) {
+ sz += hweight64((pebs_data_cfg >> PEBS_DATACFG_CNTR_SHIFT) & PEBS_DATACFG_CNTR_MASK)
+ * sizeof(u64);
+ sz += hweight64((pebs_data_cfg >> PEBS_DATACFG_FIX_SHIFT) & PEBS_DATACFG_FIX_MASK)
+ * sizeof(u64);
+ }
+ }
cpuc->pebs_record_size = sz;
}
+static void __intel_pmu_pebs_update_cfg(struct perf_event *event,
+ int idx, u64 *pebs_data_cfg)
+{
+ if (is_metric_event(event)) {
+ *pebs_data_cfg |= PEBS_DATACFG_METRICS;
+ return;
+ }
+
+ *pebs_data_cfg |= PEBS_DATACFG_CNTR;
+
+ if (idx >= INTEL_PMC_IDX_FIXED) {
+ *pebs_data_cfg |= ((1ULL << (idx - INTEL_PMC_IDX_FIXED)) & PEBS_DATACFG_FIX_MASK)
+ << PEBS_DATACFG_FIX_SHIFT;
+ } else {
+ *pebs_data_cfg |= ((1ULL << idx) & PEBS_DATACFG_CNTR_MASK)
+ << PEBS_DATACFG_CNTR_SHIFT;
+ }
+}
+
+void intel_pmu_pebs_update_cfg(struct cpu_hw_events *cpuc, int n, int *assign)
+{
+ struct perf_event *leader, *event;
+ u64 pebs_data_cfg = 0;
+ int i = n - 1;
+
+ leader = cpuc->event_list[i]->group_leader;
+ for (; i >= 0; i--) {
+ event = cpuc->event_list[i];
+ if (!is_pebs_counter_event(event))
+ continue;
+ if (leader != event->group_leader)
+ break;
+ __intel_pmu_pebs_update_cfg(event, assign[i], &pebs_data_cfg);
+ }
+
+ if (pebs_data_cfg & ~cpuc->pebs_data_cfg)
+ cpuc->pebs_data_cfg |= pebs_data_cfg | PEBS_UPDATE_DS_SW;
+}
+
#define PERF_PEBS_MEMINFO_TYPE (PERF_SAMPLE_ADDR | PERF_SAMPLE_DATA_SRC | \
PERF_SAMPLE_PHYS_ADDR | \
PERF_SAMPLE_WEIGHT_TYPE | \
@@ -1914,6 +1967,24 @@ static void adaptive_pebs_save_regs(struct pt_regs *regs,
#endif
}
+static void intel_perf_event_pmc_to_count(struct perf_event *event, u64 pmc)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 prev_pmc = local64_read(&hwc->prev_count);
+ int shift = 64 - x86_pmu.cntval_bits;
+ u64 delta;
+
+ /* Only update the count when the PMU is disabled */
+ WARN_ON(this_cpu_read(cpu_hw_events.enabled));
+ local64_set(&hwc->prev_count, pmc);
+
+ delta = (pmc << shift) - (prev_pmc << shift);
+ delta >>= shift;
+
+ local64_add(delta, &event->count);
+ local64_sub(delta, &hwc->period_left);
+}
+
#define PEBS_LATENCY_MASK 0xffff
/*
@@ -2049,6 +2120,41 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
}
}
+ if (format_group & (PEBS_DATACFG_CNTR | PEBS_DATACFG_METRICS)) {
+ struct pebs_cntr_header *cntr = next_record;
+ int bit;
+
+ next_record += sizeof(struct pebs_cntr_header);
+
+ for_each_set_bit(bit, (unsigned long *)&cntr->cntr, INTEL_PMC_MAX_GENERIC) {
+ intel_perf_event_pmc_to_count(cpuc->events[bit], *(u64 *)next_record);
+ next_record += sizeof(u64);
+ }
+
+ for_each_set_bit(bit, (unsigned long *)&cntr->fixed, INTEL_PMC_MAX_FIXED) {
+ /* The slots event will be handled with perf_metric later */
+ if ((cntr->metrics == INTEL_CNTR_METRICS) &&
+ (INTEL_PMC_IDX_FIXED_SLOTS == bit + INTEL_PMC_IDX_FIXED)) {
+ next_record += sizeof(u64);
+ continue;
+ }
+ intel_perf_event_pmc_to_count(cpuc->events[bit + INTEL_PMC_IDX_FIXED],
+ *(u64 *)next_record);
+ next_record += sizeof(u64);
+ }
+
+ /* HW will reload the value right after the overflow. */
+ if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
+ local64_set(&event->hw.prev_count, (u64)-event->hw.sample_period);
+
+ if (cntr->metrics == INTEL_CNTR_METRICS) {
+ static_call(intel_pmu_update_topdown_event)
+ (event->group_leader, (u64 *)next_record);
+ next_record += 2 * sizeof(u64);
+ }
+ data->sample_flags |= PERF_SAMPLE_READ;
+ }
+
WARN_ONCE(next_record != __pebs + basic->format_size,
"PEBS record size %u, expected %llu, config %llx\n",
basic->format_size,
@@ -2094,10 +2200,8 @@ get_next_pebs_record_by_bit(void *base, void *top, int bit)
return NULL;
}
-void intel_pmu_auto_reload_read(struct perf_event *event)
+void intel_pmu_pebs_read(struct perf_event *event)
{
- WARN_ON(!(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD));
-
perf_pmu_disable(event->pmu);
intel_pmu_drain_pebs_buffer();
perf_pmu_enable(event->pmu);
@@ -2211,13 +2315,21 @@ __intel_pmu_pebs_last_event(struct perf_event *event,
}
if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
- /*
- * Now, auto-reload is only enabled in fixed period mode.
- * The reload value is always hwc->sample_period.
- * May need to change it, if auto-reload is enabled in
- * freq mode later.
- */
- intel_pmu_save_and_restart_reload(event, count);
+ if ((is_pebs_counter_event(event))) {
+ /*
+ * The value of each sample has been updated when setup
+ * the corresponding sample data.
+ */
+ perf_event_update_userpage(event);
+ } else {
+ /*
+ * Now, auto-reload is only enabled in fixed period mode.
+ * The reload value is always hwc->sample_period.
+ * May need to change it, if auto-reload is enabled in
+ * freq mode later.
+ */
+ intel_pmu_save_and_restart_reload(event, count);
+ }
} else
intel_pmu_save_and_restart(event);
}
@@ -2552,6 +2664,9 @@ void __init intel_ds_init(void)
break;
case 6:
+ if (x86_pmu.intel_cap.pebs_baseline)
+ x86_pmu.large_pebs_flags |= PERF_SAMPLE_READ;
+ fallthrough;
case 5:
x86_pmu.pebs_ept = 1;
fallthrough;
@@ -2576,7 +2691,7 @@ void __init intel_ds_init(void)
PERF_SAMPLE_REGS_USER |
PERF_SAMPLE_REGS_INTR);
}
- pr_cont("PEBS fmt4%c%s, ", pebs_type, pebs_qual);
+ pr_cont("PEBS fmt%d%c%s, ", format, pebs_type, pebs_qual);
if (!is_hybrid() && x86_pmu.intel_cap.pebs_output_pt_available) {
pr_cont("PEBS-via-PT, ");
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 82c6f45ce975..8545c779b61e 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -115,6 +115,11 @@ static inline bool is_branch_counters_group(struct perf_event *event)
return event->group_leader->hw.flags & PERF_X86_EVENT_BRANCH_COUNTERS;
}
+static inline bool is_pebs_counter_event(struct perf_event *event)
+{
+ return event->hw.flags & PERF_X86_EVENT_PEBS_CNTR;
+}
+
struct amd_nb {
int nb_id; /* NorthBridge id */
int refcnt; /* reference count */
@@ -1147,6 +1152,12 @@ extern u64 __read_mostly hw_cache_extra_regs
u64 x86_perf_event_update(struct perf_event *event);
+static inline u64 intel_pmu_topdown_event_update(struct perf_event *event, u64 *val)
+{
+ return x86_perf_event_update(event);
+}
+DECLARE_STATIC_CALL(intel_pmu_update_topdown_event, intel_pmu_topdown_event_update);
+
static inline unsigned int x86_pmu_config_addr(int index)
{
return x86_pmu.eventsel + (x86_pmu.addr_offset ?
@@ -1642,7 +1653,9 @@ void intel_pmu_pebs_disable_all(void);
void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
-void intel_pmu_auto_reload_read(struct perf_event *event);
+void intel_pmu_pebs_update_cfg(struct cpu_hw_events *cpuc, int n, int *assign);
+
+void intel_pmu_pebs_read(struct perf_event *event);
void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr);
diff --git a/arch/x86/events/perf_event_flags.h b/arch/x86/events/perf_event_flags.h
index 6c977c19f2cd..1d9e385649b5 100644
--- a/arch/x86/events/perf_event_flags.h
+++ b/arch/x86/events/perf_event_flags.h
@@ -9,7 +9,7 @@ PERF_ARCH(PEBS_LD_HSW, 0x00008) /* haswell style datala, load */
PERF_ARCH(PEBS_NA_HSW, 0x00010) /* haswell style datala, unknown */
PERF_ARCH(EXCL, 0x00020) /* HT exclusivity on counter */
PERF_ARCH(DYNAMIC, 0x00040) /* dynamic alloc'd constraint */
- /* 0x00080 */
+PERF_ARCH(PEBS_CNTR, 0x00080) /* PEBS counters snapshot */
PERF_ARCH(EXCL_ACCT, 0x00100) /* accounted EXCL event */
PERF_ARCH(AUTO_RELOAD, 0x00200) /* use PEBS auto-reload */
PERF_ARCH(LARGE_PEBS, 0x00400) /* use large PEBS */
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index cd8023d5ea46..4e28ee0f2f3e 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -140,6 +140,12 @@
#define PEBS_DATACFG_XMMS BIT_ULL(2)
#define PEBS_DATACFG_LBRS BIT_ULL(3)
#define PEBS_DATACFG_LBR_SHIFT 24
+#define PEBS_DATACFG_CNTR BIT_ULL(4)
+#define PEBS_DATACFG_CNTR_SHIFT 32
+#define PEBS_DATACFG_CNTR_MASK GENMASK_ULL(15, 0)
+#define PEBS_DATACFG_FIX_SHIFT 48
+#define PEBS_DATACFG_FIX_MASK GENMASK_ULL(7, 0)
+#define PEBS_DATACFG_METRICS BIT_ULL(5)
/* Steal the highest bit of pebs_data_cfg for SW usage */
#define PEBS_UPDATE_DS_SW BIT_ULL(63)
@@ -467,6 +473,15 @@ struct pebs_xmm {
#define IBS_CPUID_FEATURES 0x8000001b
+struct pebs_cntr_header {
+ u32 cntr;
+ u32 fixed;
+ u32 metrics;
+ u32 reserved;
+};
+
+#define INTEL_CNTR_METRICS 0x3
+
/*
* Same bit mask as for IBS cpuid feature flags (Fn8000_001B_EAX), but
* bit 0 is used to indicate the existence of IBS.
--
2.38.1
* Re: [PATCH V6 3/3] perf/x86/intel: Support PEBS counters snapshotting
From: Peter Zijlstra @ 2024-12-18 16:32 UTC (permalink / raw)
To: kan.liang
Cc: mingo, acme, namhyung, irogers, linux-kernel, linux-perf-users,
ak, eranian, dapeng1.mi
On Wed, Dec 18, 2024 at 07:16:43AM -0800, kan.liang@linux.intel.com wrote:
> To prevent the case that a PEBS record value might be in the past
> relative to what is already in the event, perf always stops the PMU and
> drains the PEBS buffer before updating the corresponding event->count.
Like I wrote here:
https://lkml.kernel.org/r/20241218082404.GI11133@noisy.programming.kicks-ass.net
I don't think this is sufficient.
* Re: [PATCH V6 3/3] perf/x86/intel: Support PEBS counters snapshotting
From: Liang, Kan @ 2024-12-18 16:55 UTC (permalink / raw)
To: Peter Zijlstra
Cc: mingo, acme, namhyung, irogers, linux-kernel, linux-perf-users,
ak, eranian, dapeng1.mi
On 2024-12-18 11:32 a.m., Peter Zijlstra wrote:
> On Wed, Dec 18, 2024 at 07:16:43AM -0800, kan.liang@linux.intel.com wrote:
>
>> To prevent the case that a PEBS record value might be in the past
>> relative to what is already in the event, perf always stops the PMU and
>> drains the PEBS buffer before updating the corresponding event->count.
>
> Like I wrote here:
>
> https://lkml.kernel.org/r/20241218082404.GI11133@noisy.programming.kicks-ass.net
>
> I don't think this is sufficient.
I replied with an explanation this morning in the old V5 thread. I'm not
sure if you got a chance to look at it.
https://lore.kernel.org/all/5a4ab06e-8628-4e1d-addb-2af920deffad@linux.intel.com/
There will be a drain_pebs() right before handling A-overflow-PMI.
B-assist A=1
C A=2
B-assist A=3
<- drain_pebs()
A-overflow-PMI A=4
C-assist-PMI (DS buffer) A=5
So the A-overflow-PMI will
- process the DS buffer, adjusting A->count to 3
- then adjust A->count to 4
Is it sufficient?
If not, could you please share more details?
Thanks,
Kan
* Re: [PATCH V6 3/3] perf/x86/intel: Support PEBS counters snapshotting
From: Peter Zijlstra @ 2024-12-18 17:01 UTC (permalink / raw)
To: Liang, Kan
Cc: mingo, acme, namhyung, irogers, linux-kernel, linux-perf-users,
ak, eranian, dapeng1.mi
On Wed, Dec 18, 2024 at 11:55:53AM -0500, Liang, Kan wrote:
>
>
> On 2024-12-18 11:32 a.m., Peter Zijlstra wrote:
> > On Wed, Dec 18, 2024 at 07:16:43AM -0800, kan.liang@linux.intel.com wrote:
> >
> >> To prevent the case that a PEBS record value might be in the past
> >> relative to what is already in the event, perf always stops the PMU and
> >> drains the PEBS buffer before updating the corresponding event->count.
> >
> > Like I wrote here:
> >
> > https://lkml.kernel.org/r/20241218082404.GI11133@noisy.programming.kicks-ass.net
> >
> > I don't think this is sufficient.
>
>
> I replied with an explanation this morning in the old V5 thread. I'm not
> sure if you got a chance to look at it.
> https://lore.kernel.org/all/5a4ab06e-8628-4e1d-addb-2af920deffad@linux.intel.com/
Bah, I actually checked there before replying and didn't see the email
-- it is there now.
> There will be a drain_pebs() right before handling A-overflow-PMI.
>
> B-assist A=1
> C A=2
> B-assist A=3
> <- drain_pebs()
> A-overflow-PMI A=4
> C-assist-PMI (DS buffer) A=5
>
> So the A-overflow-PMI will
> - Process the DS. adjust A->count to 3
> - adjust A->count to 4
>
> Is it sufficient?
Yes, that will work. A bit yuck but perhaps good enough.
OK, I'll go stare at this new series tomorrow -- brain is about to give
out for the day.
* Re: [PATCH V6 2/3] perf: Extend perf_output_read
From: Peter Zijlstra @ 2024-12-19 22:21 UTC (permalink / raw)
To: kan.liang
Cc: mingo, acme, namhyung, irogers, linux-kernel, linux-perf-users,
ak, eranian, dapeng1.mi
On Wed, Dec 18, 2024 at 07:16:42AM -0800, kan.liang@linux.intel.com wrote:
> From: Kan Liang <kan.liang@linux.intel.com>
>
> The event may have been updated in the PMU-specific implementation,
> e.g., Intel PEBS counters snapshotting. The common code should not
> read and overwrite the value.
>
> The PERF_SAMPLE_READ in the data->sample_type can be used to detect
> whether the PMU-specific value is available. If yes, avoid the
> pmu->read() in the common code.
I had a poke at this, and ended up with the below. Not sure though,
wdyt?
---
include/linux/perf_event.h | 8 +++++++-
kernel/events/core.c | 33 ++++++++++++++++-----------------
kernel/events/ring_buffer.c | 1 +
3 files changed, 24 insertions(+), 18 deletions(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 8333f132f4a9..582f517a5dc8 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1062,7 +1062,13 @@ struct perf_output_handle {
struct perf_buffer *rb;
unsigned long wakeup;
unsigned long size;
- u64 aux_flags;
+ union {
+ u64 flags; /* perf_output*() */
+ u64 aux_flags; /* perf_aux_output*() */
+ struct {
+ u64 skip_read : 1;
+ };
+ };
union {
void *addr;
unsigned long head;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b2bc67791f84..f91ba29048ce 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1191,6 +1191,12 @@ static void perf_assert_pmu_disabled(struct pmu *pmu)
WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0);
}
+static inline void perf_pmu_read(struct perf_event *event)
+{
+ if (event->state == PERF_EVENT_STATE_ACTIVE)
+ event->pmu->read(event);
+}
+
static void get_ctx(struct perf_event_context *ctx)
{
refcount_inc(&ctx->refcount);
@@ -3473,8 +3479,7 @@ static void __perf_event_sync_stat(struct perf_event *event,
* we know the event must be on the current CPU, therefore we
* don't need to use it.
*/
- if (event->state == PERF_EVENT_STATE_ACTIVE)
- event->pmu->read(event);
+ perf_pmu_read(event);
perf_event_update_time(event);
@@ -4618,15 +4623,8 @@ static void __perf_event_read(void *info)
pmu->read(event);
- for_each_sibling_event(sub, event) {
- if (sub->state == PERF_EVENT_STATE_ACTIVE) {
- /*
- * Use sibling's PMU rather than @event's since
- * sibling could be on different (eg: software) PMU.
- */
- sub->pmu->read(sub);
- }
- }
+ for_each_sibling_event(sub, event)
+ perf_pmu_read(sub);
data->ret = pmu->commit_txn(pmu);
@@ -7400,9 +7398,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
values[n++] = running;
- if ((leader != event) &&
- (leader->state == PERF_EVENT_STATE_ACTIVE))
- leader->pmu->read(leader);
+ if ((leader != event) && !handle->skip_read)
+ perf_pmu_read(leader);
values[n++] = perf_event_count(leader, self);
if (read_format & PERF_FORMAT_ID)
@@ -7415,9 +7412,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
for_each_sibling_event(sub, leader) {
n = 0;
- if ((sub != event) &&
- (sub->state == PERF_EVENT_STATE_ACTIVE))
- sub->pmu->read(sub);
+ if ((sub != event) && !handle->skip_read)
+ perf_pmu_read(sub);
values[n++] = perf_event_count(sub, self);
if (read_format & PERF_FORMAT_ID)
@@ -7476,6 +7472,9 @@ void perf_output_sample(struct perf_output_handle *handle,
{
u64 sample_type = data->type;
+ if (data->sample_flags & PERF_SAMPLE_READ)
+ handle->skip_read = 1;
+
perf_output_put(handle, *header);
if (sample_type & PERF_SAMPLE_IDENTIFIER)
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 4f46f688d0d4..9b49ecca693e 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -185,6 +185,7 @@ __perf_output_begin(struct perf_output_handle *handle,
handle->rb = rb;
handle->event = event;
+ handle->flags = 0;
have_lost = local_read(&rb->lost);
if (unlikely(have_lost)) {
* Re: [PATCH V6 2/3] perf: Extend perf_output_read
From: Liang, Kan @ 2024-12-20 0:42 UTC (permalink / raw)
To: Peter Zijlstra
Cc: mingo, acme, namhyung, irogers, linux-kernel, linux-perf-users,
ak, eranian, dapeng1.mi, Adrian Hunter
On 2024-12-19 5:21 p.m., Peter Zijlstra wrote:
> On Wed, Dec 18, 2024 at 07:16:42AM -0800, kan.liang@linux.intel.com wrote:
>> From: Kan Liang <kan.liang@linux.intel.com>
>>
>> The event may have been updated in the PMU-specific implementation,
>> e.g., Intel PEBS counters snapshotting. The common code should not
>> read and overwrite the value.
>>
>> The PERF_SAMPLE_READ in the data->sample_type can be used to detect
>> whether the PMU-specific value is available. If yes, avoid the
>> pmu->read() in the common code.
>
> I had a poke at this, and ended up with the below. Not sure though,
> wdyt?
It looks good to me. I will do more tests tomorrow and send a V7.
Thanks,
Kan
>
> ---
> include/linux/perf_event.h | 8 +++++++-
> kernel/events/core.c | 33 ++++++++++++++++-----------------
> kernel/events/ring_buffer.c | 1 +
> 3 files changed, 24 insertions(+), 18 deletions(-)
>
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 8333f132f4a9..582f517a5dc8 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -1062,7 +1062,13 @@ struct perf_output_handle {
> struct perf_buffer *rb;
> unsigned long wakeup;
> unsigned long size;
> - u64 aux_flags;
> + union {
> + u64 flags; /* perf_output*() */
> + u64 aux_flags; /* perf_aux_output*() */
> + struct {
> + u64 skip_read : 1;
> + };
> + };
> union {
> void *addr;
> unsigned long head;
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index b2bc67791f84..f91ba29048ce 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -1191,6 +1191,12 @@ static void perf_assert_pmu_disabled(struct pmu *pmu)
> WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0);
> }
>
> +static inline void perf_pmu_read(struct perf_event *event)
> +{
> + if (event->state == PERF_EVENT_STATE_ACTIVE)
> + event->pmu->read(event);
> +}
> +
> static void get_ctx(struct perf_event_context *ctx)
> {
> refcount_inc(&ctx->refcount);
> @@ -3473,8 +3479,7 @@ static void __perf_event_sync_stat(struct perf_event *event,
> * we know the event must be on the current CPU, therefore we
> * don't need to use it.
> */
> - if (event->state == PERF_EVENT_STATE_ACTIVE)
> - event->pmu->read(event);
> + perf_pmu_read(event);
>
> perf_event_update_time(event);
>
> @@ -4618,15 +4623,8 @@ static void __perf_event_read(void *info)
>
> pmu->read(event);
>
> - for_each_sibling_event(sub, event) {
> - if (sub->state == PERF_EVENT_STATE_ACTIVE) {
> - /*
> - * Use sibling's PMU rather than @event's since
> - * sibling could be on different (eg: software) PMU.
> - */
> - sub->pmu->read(sub);
> - }
> - }
> + for_each_sibling_event(sub, event)
> + perf_pmu_read(sub);
>
> data->ret = pmu->commit_txn(pmu);
>
> @@ -7400,9 +7398,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
> if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
> values[n++] = running;
>
> - if ((leader != event) &&
> - (leader->state == PERF_EVENT_STATE_ACTIVE))
> - leader->pmu->read(leader);
> + if ((leader != event) && !handle->skip_read)
> + perf_pmu_read(leader);
>
> values[n++] = perf_event_count(leader, self);
> if (read_format & PERF_FORMAT_ID)
> @@ -7415,9 +7412,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
> for_each_sibling_event(sub, leader) {
> n = 0;
>
> - if ((sub != event) &&
> - (sub->state == PERF_EVENT_STATE_ACTIVE))
> - sub->pmu->read(sub);
> + if ((sub != event) && !handle->skip_read)
> + perf_pmu_read(sub);
>
> values[n++] = perf_event_count(sub, self);
> if (read_format & PERF_FORMAT_ID)
> @@ -7476,6 +7472,9 @@ void perf_output_sample(struct perf_output_handle *handle,
> {
> u64 sample_type = data->type;
>
> + if (data->sample_flags & PERF_SAMPLE_READ)
> + handle->skip_read = 1;
> +
> perf_output_put(handle, *header);
>
> if (sample_type & PERF_SAMPLE_IDENTIFIER)
> diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
> index 4f46f688d0d4..9b49ecca693e 100644
> --- a/kernel/events/ring_buffer.c
> +++ b/kernel/events/ring_buffer.c
> @@ -185,6 +185,7 @@ __perf_output_begin(struct perf_output_handle *handle,
>
> handle->rb = rb;
> handle->event = event;
> + handle->flags = 0;
>
> have_lost = local_read(&rb->lost);
> if (unlikely(have_lost)) {
>
* Re: [PATCH V6 3/3] perf/x86/intel: Support PEBS counters snapshotting
From: Peter Zijlstra @ 2024-12-20 14:22 UTC (permalink / raw)
To: kan.liang
Cc: mingo, acme, namhyung, irogers, linux-kernel, linux-perf-users,
ak, eranian, dapeng1.mi
On Wed, Dec 18, 2024 at 07:16:43AM -0800, kan.liang@linux.intel.com wrote:
> @@ -3109,6 +3116,27 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
> if (!test_bit(bit, cpuc->active_mask))
> continue;
>
> + if (is_pebs_counter_event(event))
> + x86_pmu.drain_pebs(regs, &data);
> +
> if (!intel_pmu_save_and_restart(event))
> continue;
>
> @@ -4056,6 +4084,23 @@ static int intel_pmu_hw_config(struct perf_event *event)
> event->hw.flags |= PERF_X86_EVENT_PEBS_VIA_PT;
> }
>
> + if ((event->attr.sample_type & PERF_SAMPLE_READ) &&
> + (x86_pmu.intel_cap.pebs_format >= 6)) {
Right, so the event that has SAMPLE_READ on is 'event'
> + struct perf_event *leader = event->group_leader;
> + bool slots_leader = is_slots_event(leader);
> +
> + if (slots_leader)
> + leader = list_next_entry(leader, sibling_list);
Uh, what, why?
> +
> + if (leader->attr.precise_ip) {
> + event->hw.flags |= PERF_X86_EVENT_PEBS_CNTR;
> + if (slots_leader) {
> + leader->hw.flags |= PERF_X86_EVENT_PEBS_CNTR;
> + event->group_leader->hw.flags |= PERF_X86_EVENT_PEBS_CNTR;
> + }
> + }
And this is more confusion. You want event to be a PEBS event, not the
leader, you don't care about the leader.
> + }
> +
> if ((event->attr.type == PERF_TYPE_HARDWARE) ||
> (event->attr.type == PERF_TYPE_HW_CACHE))
> return 0;
> +static inline bool is_pebs_counter_event(struct perf_event *event)
> +{
> + return event->hw.flags & PERF_X86_EVENT_PEBS_CNTR;
> +}
For that drain_pebs() thing, you want all group members to have
PEBS_CNTR set.
That is, if PEBS>=6 and event is PEBS and event has SAMPLE_READ, then
mark the whole group with PEBS_CNTR
SAMPLE_READ doesn't particularly care who's the leader, the event that
has SAMPLE_READ will read the whole group. Heck they could all have
SAMPLE_READ and then all their samples will read each-other.
* Re: [PATCH V6 3/3] perf/x86/intel: Support PEBS counters snapshotting
From: Liang, Kan @ 2025-01-03 16:15 UTC (permalink / raw)
To: Peter Zijlstra
Cc: mingo, acme, namhyung, irogers, linux-kernel, linux-perf-users,
ak, eranian, dapeng1.mi
Hi Peter,
Sorry for the late response. I was on vacation.
On 2024-12-20 9:22 a.m., Peter Zijlstra wrote:
> On Wed, Dec 18, 2024 at 07:16:43AM -0800, kan.liang@linux.intel.com wrote:
>
>> @@ -3109,6 +3116,27 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
>> if (!test_bit(bit, cpuc->active_mask))
>> continue;
>>
>
>> + if (is_pebs_counter_event(event))
>> + x86_pmu.drain_pebs(regs, &data);
>> +
>> if (!intel_pmu_save_and_restart(event))
>> continue;
>>
>> @@ -4056,6 +4084,23 @@ static int intel_pmu_hw_config(struct perf_event *event)
>> event->hw.flags |= PERF_X86_EVENT_PEBS_VIA_PT;
>> }
>>
>> + if ((event->attr.sample_type & PERF_SAMPLE_READ) &&
>> + (x86_pmu.intel_cap.pebs_format >= 6)) {
>
> Right, so the event that has SAMPLE_READ on is 'event'
>
>> + struct perf_event *leader = event->group_leader;
>> + bool slots_leader = is_slots_event(leader);
>> +
>> + if (slots_leader)
>> + leader = list_next_entry(leader, sibling_list);
>
> Uh, what, why?
This was to specially handle the perf metric topdown group.
>
>> +
>> + if (leader->attr.precise_ip) {
>> + event->hw.flags |= PERF_X86_EVENT_PEBS_CNTR;
>> + if (slots_leader) {
>> + leader->hw.flags |= PERF_X86_EVENT_PEBS_CNTR;
>> + event->group_leader->hw.flags |= PERF_X86_EVENT_PEBS_CNTR;
>> + }
>> + }
>
> And this is more confusion. You want event to be a PEBS event, not the
> leader, you don't care about the leader.
Right
> >
>> + }
>> +
>> if ((event->attr.type == PERF_TYPE_HARDWARE) ||
>> (event->attr.type == PERF_TYPE_HW_CACHE))
>> return 0;
>
>> +static inline bool is_pebs_counter_event(struct perf_event *event)
>> +{
>> + return event->hw.flags & PERF_X86_EVENT_PEBS_CNTR;
>> +}
>
> For that drain_pebs() thing, you want all group members to have
> PEBS_CNTR set.
>
> That is, if PEBS>=6 and event is PEBS and event has SAMPLE_READ, then
> mark the whole group with PEBS_CNTR
Yes, that was the design.
>
> SAMPLE_READ doesn't particularly care who's the leader, the event that
> has SAMPLE_READ will read the whole group. Heck they could all have
> SAMPLE_READ and then all their samples will read each-other.
Right. It should be good enough to only set the flag for the
event->group_leader, since there is only one sampling event for a
SAMPLE_READ group.
The hw_config check can be simplified as below.
if ((event->attr.sample_type & PERF_SAMPLE_READ) &&
(x86_pmu.intel_cap.pebs_format >= 6) &&
is_sampling_event(event) &&
event->attr.precise_ip)
event->group_leader->hw.flags |= PERF_X86_EVENT_PEBS_CNTR;
Also, it's only necessary to check the leader's flag to indicate that
the event is in a SAMPLE_READ group.
-static inline bool is_pebs_counter_event(struct perf_event *event)
+static inline bool is_pebs_counter_event_group(struct perf_event *event)
{
- return event->hw.flags & PERF_X86_EVENT_PEBS_CNTR;
+ return event->group_leader->hw.flags & PERF_X86_EVENT_PEBS_CNTR;
}
Thanks,
Kan