* [PATCH V9 1/3] perf/x86/intel: Avoid pmu_disable/enable if !cpuc->enabled in sample read
@ 2025-01-15 18:43 kan.liang
  2025-01-15 18:43 ` [PATCH V9 2/3] perf: Avoid the read if the count is already updated kan.liang
                   ` (2 more replies)
  0 siblings, 3 replies; 14+ messages in thread
From: kan.liang @ 2025-01-15 18:43 UTC (permalink / raw)
To: peterz, mingo, acme, namhyung, irogers, adrian.hunter,
	linux-kernel, linux-perf-users
Cc: ak, eranian, dapeng1.mi, Kan Liang, stable
From: Kan Liang <kan.liang@linux.intel.com>
The WARN_ON(this_cpu_read(cpu_hw_events.enabled)) in
intel_pmu_save_and_restart_reload() is triggered when doing a sampling
read of topdown events.

In an NMI handler, cpu_hw_events.enabled is updated to indicate the
status of the core PMU. The generic pmu->pmu_disable_count, which is
updated by the perf_pmu_disable()/enable() pair, is not touched.
However, the perf_pmu_disable()/enable() pair is invoked when doing a
sampling read in an NMI handler. As a result, cpuc->enabled is
mistakenly set by perf_pmu_enable().
Avoid perf_pmu_disable/enable() if the core PMU is already disabled.
Fixes: 7b2c05a15d29 ("perf/x86/intel: Generic support for hardware TopDown metrics")
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Cc: stable@vger.kernel.org
---
A new patch to fix an issue found on a legacy platform.
(It is not related to the counters snapshotting feature.)
But since it also touches the sampling-read code, the patches that
enable the counters snapshotting feature must be applied on top of this
patch. The patch itself can be applied separately.
 arch/x86/events/intel/core.c | 7 +++++--
 arch/x86/events/intel/ds.c   | 9 ++++++---
 2 files changed, 11 insertions(+), 5 deletions(-)
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 2a2824e9c50d..bce423ad3fad 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2778,15 +2778,18 @@ DEFINE_STATIC_CALL(intel_pmu_update_topdown_event, x86_perf_event_update);
 static void intel_pmu_read_topdown_event(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	int pmu_enabled = cpuc->enabled;
 
 	/* Only need to call update_topdown_event() once for group read. */
 	if ((cpuc->txn_flags & PERF_PMU_TXN_READ) &&
 	    !is_slots_event(event))
 		return;
 
-	perf_pmu_disable(event->pmu);
+	if (pmu_enabled)
+		perf_pmu_disable(event->pmu);
 	static_call(intel_pmu_update_topdown_event)(event);
-	perf_pmu_enable(event->pmu);
+	if (pmu_enabled)
+		perf_pmu_enable(event->pmu);
 }
 
 static void intel_pmu_read_event(struct perf_event *event)
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index ba74e1198328..81b6ec8e824e 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -2096,11 +2096,14 @@ get_next_pebs_record_by_bit(void *base, void *top, int bit)
 
 void intel_pmu_auto_reload_read(struct perf_event *event)
 {
-	WARN_ON(!(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD));
+	int pmu_enabled = this_cpu_read(cpu_hw_events.enabled);
 
-	perf_pmu_disable(event->pmu);
+	WARN_ON(!(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD));
+	if (pmu_enabled)
+		perf_pmu_disable(event->pmu);
 	intel_pmu_drain_pebs_buffer();
-	perf_pmu_enable(event->pmu);
+	if (pmu_enabled)
+		perf_pmu_enable(event->pmu);
 }
 
 /*
--
2.38.1
^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH V9 2/3] perf: Avoid the read if the count is already updated
  2025-01-15 18:43 [PATCH V9 1/3] perf/x86/intel: Avoid pmu_disable/enable if !cpuc->enabled in sample read kan.liang
@ 2025-01-15 18:43 ` kan.liang
  2025-01-15 18:43 ` [PATCH V9 3/3] perf/x86/intel: Support PEBS counters snapshotting kan.liang
  2025-01-16 10:32 ` [PATCH V9 1/3] perf/x86/intel: Avoid pmu_disable/enable if !cpuc->enabled in sample read Peter Zijlstra
  2 siblings, 0 replies; 14+ messages in thread
From: kan.liang @ 2025-01-15 18:43 UTC (permalink / raw)
To: peterz, mingo, acme, namhyung, irogers, adrian.hunter,
	linux-kernel, linux-perf-users
Cc: ak, eranian, dapeng1.mi, Kan Liang

From: "Peter Zijlstra (Intel)" <peterz@infradead.org>

The event may have been updated in the PMU-specific implementation,
e.g., Intel PEBS counters snapshotting. The common code should not
read and overwrite the value.

The PERF_SAMPLE_READ in the data->sample_type can be used to detect
whether the PMU-specific value is available. If yes, avoid the
pmu->read() in the common code. Add a new flag, skip_read, to track the
case. Factor out a perf_pmu_read() to clean up the code.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---

No changes since V8

 include/linux/perf_event.h  |  8 +++++++-
 kernel/events/core.c        | 33 ++++++++++++++++-----------------
 kernel/events/ring_buffer.c |  1 +
 3 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 8333f132f4a9..2d07bc1193f3 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1062,7 +1062,13 @@ struct perf_output_handle {
 	struct perf_buffer		*rb;
 	unsigned long			wakeup;
 	unsigned long			size;
-	u64				aux_flags;
+	union {
+		u64			flags;		/* perf_output*() */
+		u64			aux_flags;	/* perf_aux_output*() */
+		struct {
+			u64		skip_read : 1;
+		};
+	};
 	union {
 		void			*addr;
 		unsigned long		head;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b2bc67791f84..f91ba29048ce 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1191,6 +1191,12 @@ static void perf_assert_pmu_disabled(struct pmu *pmu)
 	WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0);
 }
 
+static inline void perf_pmu_read(struct perf_event *event)
+{
+	if (event->state == PERF_EVENT_STATE_ACTIVE)
+		event->pmu->read(event);
+}
+
 static void get_ctx(struct perf_event_context *ctx)
 {
 	refcount_inc(&ctx->refcount);
@@ -3473,8 +3479,7 @@ static void __perf_event_sync_stat(struct perf_event *event,
 	 * we know the event must be on the current CPU, therefore we
 	 * don't need to use it.
	 */
-	if (event->state == PERF_EVENT_STATE_ACTIVE)
-		event->pmu->read(event);
+	perf_pmu_read(event);
 
 	perf_event_update_time(event);
 
@@ -4618,15 +4623,8 @@ static void __perf_event_read(void *info)
 
 	pmu->read(event);
 
-	for_each_sibling_event(sub, event) {
-		if (sub->state == PERF_EVENT_STATE_ACTIVE) {
-			/*
-			 * Use sibling's PMU rather than @event's since
-			 * sibling could be on different (eg: software) PMU.
-			 */
-			sub->pmu->read(sub);
-		}
-	}
+	for_each_sibling_event(sub, event)
+		perf_pmu_read(sub);
 
 	data->ret = pmu->commit_txn(pmu);
 
@@ -7400,9 +7398,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
 	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
 		values[n++] = running;
 
-	if ((leader != event) &&
-	    (leader->state == PERF_EVENT_STATE_ACTIVE))
-		leader->pmu->read(leader);
+	if ((leader != event) && !handle->skip_read)
+		perf_pmu_read(leader);
 
 	values[n++] = perf_event_count(leader, self);
 	if (read_format & PERF_FORMAT_ID)
@@ -7415,9 +7412,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
 	for_each_sibling_event(sub, leader) {
 		n = 0;
 
-		if ((sub != event) &&
-		    (sub->state == PERF_EVENT_STATE_ACTIVE))
-			sub->pmu->read(sub);
+		if ((sub != event) && !handle->skip_read)
+			perf_pmu_read(sub);
 
 		values[n++] = perf_event_count(sub, self);
 		if (read_format & PERF_FORMAT_ID)
@@ -7476,6 +7472,9 @@ void perf_output_sample(struct perf_output_handle *handle,
 {
 	u64 sample_type = data->type;
 
+	if (data->sample_flags & PERF_SAMPLE_READ)
+		handle->skip_read = 1;
+
 	perf_output_put(handle, *header);
 
 	if (sample_type & PERF_SAMPLE_IDENTIFIER)
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 4f46f688d0d4..9b49ecca693e 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -185,6 +185,7 @@ __perf_output_begin(struct perf_output_handle *handle,
 
 	handle->rb    = rb;
 	handle->event = event;
+	handle->flags = 0;
 
 	have_lost = local_read(&rb->lost);
 	if (unlikely(have_lost)) {
-- 
2.38.1

^ permalink raw reply related	[flat|nested] 14+ messages in thread
* [PATCH V9 3/3] perf/x86/intel: Support PEBS counters snapshotting
  2025-01-15 18:43 [PATCH V9 1/3] perf/x86/intel: Avoid pmu_disable/enable if !cpuc->enabled in sample read kan.liang
  2025-01-15 18:43 ` [PATCH V9 2/3] perf: Avoid the read if the count is already updated kan.liang
@ 2025-01-15 18:43 ` kan.liang
  2025-01-16 11:47   ` Peter Zijlstra
  2025-01-16 12:02   ` Peter Zijlstra
  2025-01-16 10:32 ` [PATCH V9 1/3] perf/x86/intel: Avoid pmu_disable/enable if !cpuc->enabled in sample read Peter Zijlstra
  2 siblings, 2 replies; 14+ messages in thread
From: kan.liang @ 2025-01-15 18:43 UTC (permalink / raw)
To: peterz, mingo, acme, namhyung, irogers, adrian.hunter,
	linux-kernel, linux-perf-users
Cc: ak, eranian, dapeng1.mi, Kan Liang

From: Kan Liang <kan.liang@linux.intel.com>

The counters snapshotting is a new adaptive PEBS extension, which can
capture programmable counters, fixed-function counters, and performance
metrics in a PEBS record. The feature is available in the PEBS format
V6.

The target counters can be configured in the new fields of MSR_PEBS_CFG.
Then the PEBS HW will generate the bit mask of counters (Counters Group
Header) followed by the content of all the requested counters into a
PEBS record.

The current Linux perf sample read feature can read all events in the
group when any event in the group is overflowed. But the rdpmc in the
NMI/overflow handler has a small gap from overflow. Also, there is some
overhead for each rdpmc read. The counters snapshotting feature can be
used as an accurate and low-overhead replacement.

Extend intel_update_topdown_event() to accept the value from PEBS
records.

Add a new PEBS_CNTR flag to indicate a sample read group that utilizes
the counters snapshotting feature. When the group is scheduled, the
PEBS configure can be updated accordingly.

To prevent the case that a PEBS record value might be in the past
relative to what is already in the event, perf always stops the PMU and
drains the PEBS buffer before updating the corresponding event->count.

Reviewed-by: Andi Kleen <ak@linux.intel.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---

Changes since V8:
- Move the configuration update from schedule_events() to
  x86_pmu_enable()
- Fix an issue caused by running one sample read group and multiple
  PEBS groups at the same time. The counter snapshot fields should be
  ignored for the non-sample-read group.
- Shorten several long lines as suggested

 arch/x86/events/core.c             |  13 ++-
 arch/x86/events/intel/core.c       |  71 +++++++++---
 arch/x86/events/intel/ds.c         | 177 +++++++++++++++++++++++++++--
 arch/x86/events/perf_event.h       |  18 ++-
 arch/x86/events/perf_event_flags.h |   2 +-
 arch/x86/include/asm/perf_event.h  |  15 +++
 6 files changed, 268 insertions(+), 28 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 8f218ac0d445..79a4aad5a0a3 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -94,6 +94,8 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases);
 
 DEFINE_STATIC_CALL_NULL(x86_pmu_filter, *x86_pmu.filter);
 
+DEFINE_STATIC_CALL_NULL(x86_pmu_late_config, x86_pmu_late_config);
+
 /*
  * This one is magic, it will get called even when PMU init fails (because
  * there is no PMU), in which case it should simply return NULL.
@@ -1329,7 +1331,16 @@ static void x86_pmu_enable(struct pmu *pmu)
 		}
 
 		/*
-		 * step2: reprogram moved events into new counters
+		 * step2:
+		 * The late config (after counters are scheduled)
+		 * is required for some cases, e.g., PEBS counters
+		 * snapshotting. Because an accurate counter index
+		 * is needed.
+		 */
+		static_call_cond(x86_pmu_late_config)();
+
+		/*
+		 * step3: reprogram moved events into new counters
 		 */
 		for (i = 0; i < cpuc->n_events; i++) {
 			event = cpuc->event_list[i];
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index bce423ad3fad..ac532519344a 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2710,7 +2710,7 @@ static void update_saved_topdown_regs(struct perf_event *event, u64 slots,
  * modify by a NMI. PMU has to be disabled before calling this function.
 */
 
-static u64 intel_update_topdown_event(struct perf_event *event, int metric_end)
+static u64 intel_update_topdown_event(struct perf_event *event, int metric_end, u64 *val)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	struct perf_event *other;
@@ -2718,13 +2718,24 @@ static u64 intel_update_topdown_event(struct perf_event *event, int metric_end)
 	bool reset = true;
 	int idx;
 
-	/* read Fixed counter 3 */
-	rdpmcl((3 | INTEL_PMC_FIXED_RDPMC_BASE), slots);
-	if (!slots)
-		return 0;
+	if (!val) {
+		/* read Fixed counter 3 */
+		rdpmcl((3 | INTEL_PMC_FIXED_RDPMC_BASE), slots);
+		if (!slots)
+			return 0;
 
-	/* read PERF_METRICS */
-	rdpmcl(INTEL_PMC_FIXED_RDPMC_METRICS, metrics);
+		/* read PERF_METRICS */
+		rdpmcl(INTEL_PMC_FIXED_RDPMC_METRICS, metrics);
+	} else {
+		slots = val[0];
+		metrics = val[1];
+		/*
+		 * Don't reset the PERF_METRICS and Fixed counter 3
+		 * for each PEBS record read. Utilize the RDPMC metrics
+		 * clear mode.
+		 */
+		reset = false;
+	}
 
 	for_each_set_bit(idx, cpuc->active_mask, metric_end + 1) {
 		if (!is_topdown_idx(idx))
@@ -2767,13 +2778,14 @@ static u64 intel_update_topdown_event(struct perf_event *event, int metric_end)
 	return slots;
 }
 
-static u64 icl_update_topdown_event(struct perf_event *event)
+static u64 icl_update_topdown_event(struct perf_event *event, u64 *val)
 {
 	return intel_update_topdown_event(event, INTEL_PMC_IDX_METRIC_BASE +
-						x86_pmu.num_topdown_events - 1);
+						x86_pmu.num_topdown_events - 1,
+					  val);
 }
 
-DEFINE_STATIC_CALL(intel_pmu_update_topdown_event, x86_perf_event_update);
+DEFINE_STATIC_CALL(intel_pmu_update_topdown_event, intel_pmu_topdown_event_update);
 
 static void intel_pmu_read_topdown_event(struct perf_event *event)
 {
@@ -2787,15 +2799,16 @@ static void intel_pmu_read_topdown_event(struct perf_event *event)
 
 	if (pmu_enabled)
 		perf_pmu_disable(event->pmu);
-	static_call(intel_pmu_update_topdown_event)(event);
+	static_call(intel_pmu_update_topdown_event)(event, NULL);
 	if (pmu_enabled)
 		perf_pmu_enable(event->pmu);
 }
 
 static void intel_pmu_read_event(struct perf_event *event)
 {
-	if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
-		intel_pmu_auto_reload_read(event);
+	if ((event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) ||
+	    is_pebs_counter_event_group(event))
+		intel_pmu_pebs_read(event);
 	else if (is_topdown_count(event))
 		intel_pmu_read_topdown_event(event);
 	else
@@ -2931,7 +2944,7 @@ static int intel_pmu_set_period(struct perf_event *event)
 static u64 intel_pmu_update(struct perf_event *event)
 {
 	if (unlikely(is_topdown_count(event)))
-		return static_call(intel_pmu_update_topdown_event)(event);
+		return static_call(intel_pmu_update_topdown_event)(event, NULL);
 
 	return x86_perf_event_update(event);
 }
@@ -3097,7 +3110,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
	 */
 	if (__test_and_clear_bit(GLOBAL_STATUS_PERF_METRICS_OVF_BIT, (unsigned long *)&status)) {
 		handled++;
-		static_call(intel_pmu_update_topdown_event)(NULL);
+		static_call(intel_pmu_update_topdown_event)(NULL, NULL);
 	}
 
 	/*
@@ -3115,6 +3128,27 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
 		if (!test_bit(bit, cpuc->active_mask))
 			continue;
 
+		/*
+		 * There may be unprocessed PEBS records in the PEBS buffer,
+		 * which still stores the previous values.
+		 * Process those records first before handling the latest value.
+		 * For example,
+		 * A is a regular counter
+		 * B is a PEBS event which reads A
+		 * C is a PEBS event
+		 *
+		 * The following can happen:
+		 * B-assist			A=1
+		 * C				A=2
+		 * B-assist			A=3
+		 * A-overflow-PMI		A=4
+		 * C-assist-PMI (PEBS buffer)	A=5
+		 *
+		 * The PEBS buffer has to be drained before handling the A-PMI
+		 */
+		if (is_pebs_counter_event_group(event))
+			x86_pmu.drain_pebs(regs, &data);
+
 		if (!intel_pmu_save_and_restart(event))
 			continue;
 
@@ -4062,6 +4096,13 @@ static int intel_pmu_hw_config(struct perf_event *event)
 		event->hw.flags |= PERF_X86_EVENT_PEBS_VIA_PT;
 	}
 
+	if ((event->attr.sample_type & PERF_SAMPLE_READ) &&
+	    (x86_pmu.intel_cap.pebs_format >= 6) &&
+	    x86_pmu.intel_cap.pebs_baseline &&
+	    is_sampling_event(event) &&
+	    event->attr.precise_ip)
+		event->group_leader->hw.flags |= PERF_X86_EVENT_PEBS_CNTR;
+
 	if ((event->attr.type == PERF_TYPE_HARDWARE) ||
 	    (event->attr.type == PERF_TYPE_HW_CACHE))
 		return 0;
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 81b6ec8e824e..10ce80230ad3 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1294,6 +1294,19 @@ static inline void pebs_update_threshold(struct cpu_hw_events *cpuc)
 	ds->pebs_interrupt_threshold = threshold;
 }
 
+#define PEBS_DATACFG_CNTRS(x)						\
+	((x >> PEBS_DATACFG_CNTR_SHIFT) & PEBS_DATACFG_CNTR_MASK)
+
+#define PEBS_DATACFG_CNTR_BIT(x)					\
+	(((1ULL << x) & PEBS_DATACFG_CNTR_MASK) << PEBS_DATACFG_CNTR_SHIFT)
+
+#define PEBS_DATACFG_FIX(x)						\
+	((x >> PEBS_DATACFG_FIX_SHIFT) & PEBS_DATACFG_FIX_MASK)
+
+#define PEBS_DATACFG_FIX_BIT(x)						\
+	(((1ULL << (x - INTEL_PMC_IDX_FIXED)) & PEBS_DATACFG_FIX_MASK)	\
+	 << PEBS_DATACFG_FIX_SHIFT)
+
 static void adaptive_pebs_record_size_update(void)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -1308,10 +1321,58 @@ static void adaptive_pebs_record_size_update(void)
 		sz += sizeof(struct pebs_xmm);
 	if (pebs_data_cfg & PEBS_DATACFG_LBRS)
 		sz += x86_pmu.lbr_nr * sizeof(struct lbr_entry);
+	if (pebs_data_cfg & (PEBS_DATACFG_METRICS | PEBS_DATACFG_CNTR)) {
+		sz += sizeof(struct pebs_cntr_header);
+
+		/* Metrics base and Metrics Data */
+		if (pebs_data_cfg & PEBS_DATACFG_METRICS)
+			sz += 2 * sizeof(u64);
+
+		if (pebs_data_cfg & PEBS_DATACFG_CNTR) {
+			sz += (hweight64(PEBS_DATACFG_CNTRS(pebs_data_cfg)) +
+			       hweight64(PEBS_DATACFG_FIX(pebs_data_cfg))) *
+			      sizeof(u64);
+		}
+	}
 
 	cpuc->pebs_record_size = sz;
 }
 
+static void __intel_pmu_pebs_update_cfg(struct perf_event *event,
+					int idx, u64 *pebs_data_cfg)
+{
+	if (is_metric_event(event)) {
+		*pebs_data_cfg |= PEBS_DATACFG_METRICS;
+		return;
+	}
+
+	*pebs_data_cfg |= PEBS_DATACFG_CNTR;
+
+	if (idx >= INTEL_PMC_IDX_FIXED)
+		*pebs_data_cfg |= PEBS_DATACFG_FIX_BIT(idx);
+	else
+		*pebs_data_cfg |= PEBS_DATACFG_CNTR_BIT(idx);
+}
+
+
+static void intel_pmu_late_config(void)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	struct perf_event *event;
+	u64 pebs_data_cfg = 0;
+	int i;
+
+	for (i = 0; i < cpuc->n_events; i++) {
+		event = cpuc->event_list[i];
+		if (!is_pebs_counter_event_group(event))
+			continue;
+		__intel_pmu_pebs_update_cfg(event, cpuc->assign[i], &pebs_data_cfg);
+	}
+
+	if (pebs_data_cfg & ~cpuc->pebs_data_cfg)
+		cpuc->pebs_data_cfg |= pebs_data_cfg | PEBS_UPDATE_DS_SW;
+}
+
 #define PERF_PEBS_MEMINFO_TYPE	(PERF_SAMPLE_ADDR | PERF_SAMPLE_DATA_SRC |   \
 				 PERF_SAMPLE_PHYS_ADDR |		     \
 				 PERF_SAMPLE_WEIGHT_TYPE |		     \
@@ -1914,6 +1975,34 @@ static void adaptive_pebs_save_regs(struct pt_regs *regs,
 #endif
 }
 
+static void intel_perf_event_pmc_to_count(struct perf_event *event, u64 pmc)
+{
+	int shift = 64 - x86_pmu.cntval_bits;
+	struct hw_perf_event *hwc;
+	u64 delta, prev_pmc;
+
+	/*
+	 * The PEBS record doesn't shrink on pmu::del().
+	 * See pebs_update_state().
+	 * Ignore the non-exist event.
+	 */
+	if (!event)
+		return;
+
+	hwc = &event->hw;
+	prev_pmc = local64_read(&hwc->prev_count);
+
+	/* Only update the count when the PMU is disabled */
+	WARN_ON(this_cpu_read(cpu_hw_events.enabled));
+	local64_set(&hwc->prev_count, pmc);
+
+	delta = (pmc << shift) - (prev_pmc << shift);
+	delta >>= shift;
+
+	local64_add(delta, &event->count);
+	local64_sub(delta, &hwc->period_left);
+}
+
 #define PEBS_LATENCY_MASK			0xffff
 
 /*
@@ -2049,6 +2138,61 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
 		}
 	}
 
+	if (format_group & (PEBS_DATACFG_CNTR | PEBS_DATACFG_METRICS)) {
+		struct pebs_cntr_header *cntr = next_record;
+		int bit;
+
+		next_record += sizeof(struct pebs_cntr_header);
+
+		/*
+		 * The PEBS_DATA_CFG is a global register, which is the
+		 * superset configuration for all PEBS events.
+		 * For the PEBS record of non-sample-read group, ignore
+		 * the counter snapshot fields.
+		 */
+		if (!is_pebs_counter_event_group(event)) {
+			unsigned int nr;
+
+			nr = bitmap_weight((unsigned long *)&cntr->cntr, INTEL_PMC_MAX_GENERIC) +
+			     bitmap_weight((unsigned long *)&cntr->fixed, INTEL_PMC_MAX_FIXED);
+			if (cntr->metrics == INTEL_CNTR_METRICS)
+				nr += 2;
+			next_record += nr * sizeof(u64);
+			goto end_cntr;
+		}
+
+		for_each_set_bit(bit, (unsigned long *)&cntr->cntr, INTEL_PMC_MAX_GENERIC) {
+			intel_perf_event_pmc_to_count(cpuc->events[bit], *(u64 *)next_record);
+			next_record += sizeof(u64);
+		}
+
+		for_each_set_bit(bit, (unsigned long *)&cntr->fixed, INTEL_PMC_MAX_FIXED) {
+			/* The slots event will be handled with perf_metric later */
+			if ((cntr->metrics == INTEL_CNTR_METRICS) &&
+			    (bit + INTEL_PMC_IDX_FIXED == INTEL_PMC_IDX_FIXED_SLOTS)) {
+				next_record += sizeof(u64);
+				continue;
+			}
+			intel_perf_event_pmc_to_count(cpuc->events[bit + INTEL_PMC_IDX_FIXED],
+						      *(u64 *)next_record);
+			next_record += sizeof(u64);
+		}
+
+		/* HW will reload the value right after the overflow. */
+		if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
+			local64_set(&event->hw.prev_count, (u64)-event->hw.sample_period);
+
+		if (cntr->metrics == INTEL_CNTR_METRICS) {
+			static_call(intel_pmu_update_topdown_event)
+				   (cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS],
+				    (u64 *)next_record);
+			next_record += 2 * sizeof(u64);
+		}
+		data->sample_flags |= PERF_SAMPLE_READ;
+	}
+
+end_cntr:
 	WARN_ONCE(next_record != __pebs + basic->format_size,
 		  "PEBS record size %u, expected %llu, config %llx\n",
 		  basic->format_size,
@@ -2094,11 +2238,10 @@ get_next_pebs_record_by_bit(void *base, void *top, int bit)
 	return NULL;
 }
 
-void intel_pmu_auto_reload_read(struct perf_event *event)
+void intel_pmu_pebs_read(struct perf_event *event)
 {
 	int pmu_enabled = this_cpu_read(cpu_hw_events.enabled);
 
-	WARN_ON(!(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD));
 	if (pmu_enabled)
 		perf_pmu_disable(event->pmu);
 	intel_pmu_drain_pebs_buffer();
@@ -2214,13 +2357,21 @@ __intel_pmu_pebs_last_event(struct perf_event *event,
 	}
 
 	if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
-		/*
-		 * Now, auto-reload is only enabled in fixed period mode.
-		 * The reload value is always hwc->sample_period.
-		 * May need to change it, if auto-reload is enabled in
-		 * freq mode later.
-		 */
-		intel_pmu_save_and_restart_reload(event, count);
+		if ((is_pebs_counter_event_group(event))) {
+			/*
+			 * The value of each sample has been updated when setup
+			 * the corresponding sample data.
+			 */
+			perf_event_update_userpage(event);
+		} else {
+			/*
+			 * Now, auto-reload is only enabled in fixed period mode.
+			 * The reload value is always hwc->sample_period.
+			 * May need to change it, if auto-reload is enabled in
+			 * freq mode later.
+			 */
+			intel_pmu_save_and_restart_reload(event, count);
+		}
 	} else
 		intel_pmu_save_and_restart(event);
 }
@@ -2555,6 +2706,12 @@ void __init intel_ds_init(void)
 			break;
 
 		case 6:
+			if (x86_pmu.intel_cap.pebs_baseline) {
+				x86_pmu.large_pebs_flags |= PERF_SAMPLE_READ;
+				static_call_update(x86_pmu_late_config,
+						   &intel_pmu_late_config);
+			}
+			fallthrough;
 		case 5:
 			x86_pmu.pebs_ept = 1;
 			fallthrough;
@@ -2579,7 +2736,7 @@ void __init intel_ds_init(void)
 					  PERF_SAMPLE_REGS_USER |
 					  PERF_SAMPLE_REGS_INTR);
 		}
-		pr_cont("PEBS fmt4%c%s, ", pebs_type, pebs_qual);
+		pr_cont("PEBS fmt%d%c%s, ", format, pebs_type, pebs_qual);
 
 		if (!is_hybrid() && x86_pmu.intel_cap.pebs_output_pt_available) {
 			pr_cont("PEBS-via-PT, ");
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 31c2771545a6..c9c6ba49a926 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -115,6 +115,11 @@ static inline bool is_branch_counters_group(struct perf_event *event)
 	return event->group_leader->hw.flags & PERF_X86_EVENT_BRANCH_COUNTERS;
 }
 
+static inline bool is_pebs_counter_event_group(struct perf_event *event)
+{
+	return event->group_leader->hw.flags & PERF_X86_EVENT_PEBS_CNTR;
+}
+
 struct amd_nb {
 	int nb_id;  /* NorthBridge id */
 	int refcnt; /* reference count */
@@ -1148,6 +1153,17 @@ extern u64 __read_mostly hw_cache_extra_regs
 
 u64 x86_perf_event_update(struct perf_event *event);
 
+static inline u64 intel_pmu_topdown_event_update(struct perf_event *event, u64 *val)
+{
+	return x86_perf_event_update(event);
+}
+DECLARE_STATIC_CALL(intel_pmu_update_topdown_event, intel_pmu_topdown_event_update);
+
+static inline void x86_pmu_late_config(void)
+{
+}
+DECLARE_STATIC_CALL(x86_pmu_late_config, x86_pmu_late_config);
+
 static inline unsigned int x86_pmu_config_addr(int index)
 {
 	return x86_pmu.eventsel + (x86_pmu.addr_offset ?
@@ -1643,7 +1659,7 @@ void intel_pmu_pebs_disable_all(void);
 
 void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
 
-void intel_pmu_auto_reload_read(struct perf_event *event);
+void intel_pmu_pebs_read(struct perf_event *event);
 
 void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr);
 
diff --git a/arch/x86/events/perf_event_flags.h b/arch/x86/events/perf_event_flags.h
index 6c977c19f2cd..1d9e385649b5 100644
--- a/arch/x86/events/perf_event_flags.h
+++ b/arch/x86/events/perf_event_flags.h
@@ -9,7 +9,7 @@ PERF_ARCH(PEBS_LD_HSW,		0x00008) /* haswell style datala, load */
 PERF_ARCH(PEBS_NA_HSW,		0x00010) /* haswell style datala, unknown */
 PERF_ARCH(EXCL,			0x00020) /* HT exclusivity on counter */
 PERF_ARCH(DYNAMIC,		0x00040) /* dynamic alloc'd constraint */
-				/*	0x00080	*/
+PERF_ARCH(PEBS_CNTR,		0x00080) /* PEBS counters snapshot */
 PERF_ARCH(EXCL_ACCT,		0x00100) /* accounted EXCL event */
 PERF_ARCH(AUTO_RELOAD,		0x00200) /* use PEBS auto-reload */
 PERF_ARCH(LARGE_PEBS,		0x00400) /* use large PEBS */
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 1ac79f361645..adaeb8ca3a8a 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -141,6 +141,12 @@
 #define PEBS_DATACFG_XMMS	BIT_ULL(2)
 #define PEBS_DATACFG_LBRS	BIT_ULL(3)
 #define PEBS_DATACFG_LBR_SHIFT	24
+#define PEBS_DATACFG_CNTR	BIT_ULL(4)
+#define PEBS_DATACFG_CNTR_SHIFT	32
+#define PEBS_DATACFG_CNTR_MASK	GENMASK_ULL(15, 0)
+#define PEBS_DATACFG_FIX_SHIFT	48
+#define PEBS_DATACFG_FIX_MASK	GENMASK_ULL(7, 0)
+#define PEBS_DATACFG_METRICS	BIT_ULL(5)
 
 /* Steal the highest bit of pebs_data_cfg for SW usage */
 #define PEBS_UPDATE_DS_SW	BIT_ULL(63)
@@ -471,6 +477,15 @@ struct pebs_xmm {
 
 #define IBS_CPUID_FEATURES		0x8000001b
 
+struct pebs_cntr_header {
+	u32 cntr;
+	u32 fixed;
+	u32 metrics;
+	u32 reserved;
+};
+
+#define INTEL_CNTR_METRICS		0x3
+
 /*
 * Same bit mask as for IBS cpuid feature flags (Fn8000_001B_EAX), but
 * bit 0 is used to indicate the existence of IBS.
-- 
2.38.1

^ permalink raw reply related	[flat|nested] 14+ messages in thread
* Re: [PATCH V9 3/3] perf/x86/intel: Support PEBS counters snapshotting
  2025-01-15 18:43 ` [PATCH V9 3/3] perf/x86/intel: Support PEBS counters snapshotting kan.liang
@ 2025-01-16 11:47   ` Peter Zijlstra
  2025-01-16 15:55     ` Liang, Kan
  0 siblings, 1 reply; 14+ messages in thread
From: Peter Zijlstra @ 2025-01-16 11:47 UTC (permalink / raw)
To: kan.liang
Cc: mingo, acme, namhyung, irogers, adrian.hunter, linux-kernel,
	linux-perf-users, ak, eranian, dapeng1.mi

On Wed, Jan 15, 2025 at 10:43:18AM -0800, kan.liang@linux.intel.com wrote:

> Reviewed-by: Andi Kleen <ak@linux.intel.com>
> Reviewed-by: Ian Rogers <irogers@google.com>

Did they really read all the various versions without spotting any of
the problems, or are you just rubber stamping things at this point :/

Best to drop review tags after serious changes to patches.

> Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
> ---

> diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
> index 8f218ac0d445..79a4aad5a0a3 100644
> --- a/arch/x86/events/core.c
> +++ b/arch/x86/events/core.c
> @@ -94,6 +94,8 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases);
>  
>  DEFINE_STATIC_CALL_NULL(x86_pmu_filter, *x86_pmu.filter);
>  
> +DEFINE_STATIC_CALL_NULL(x86_pmu_late_config, x86_pmu_late_config);

So all these static call are through struct x86_pmu (hence the naming),
but not this one ?!?

>  /*
>   * This one is magic, it will get called even when PMU init fails (because
>   * there is no PMU), in which case it should simply return NULL.
> @@ -1329,7 +1331,16 @@ static void x86_pmu_enable(struct pmu *pmu)
>  	}
>  
>  	/*
> -	 * step2: reprogram moved events into new counters
> +	 * step2:
> +	 * The late config (after counters are scheduled)
> +	 * is required for some cases, e.g., PEBS counters
> +	 * snapshotting. Because an accurate counter index
> +	 * is needed.
> +	 */
> +	static_call_cond(x86_pmu_late_config)();
> +
> +	/*
> +	 * step3: reprogram moved events into new counters
>  	 */
>  	for (i = 0; i < cpuc->n_events; i++) {
>  		event = cpuc->event_list[i];

Placement and naming are weird.

These 2 loops migrate all counters that got a new place, the first loop
taking out pre-existing counters that got a new place, the second loop
setting up these same and new counters.

Why do the pebs config in the middle, where the least number of counters
are set-up?

AFAICT all it really needs is cpuc->assign[], which is unchanged
throughout. You can call this at any time before clearing n_added,
sticking it in the middle just seems weird.

Then naming, config is typically what we do at event creation,
x86_pmu_hw_config() like. This is not at all related, so perhaps pick a
different name?

Bah, we're so very close between x86_pmu::{enable,disable,assign}() :/

Also, I think I found you another bug... Consider what happens to the
counter value when we reschedule a HES_STOPPED counter, then we skip
x86_pmu_start(RELOAD) on step2, which leaves the counter value with
'random' crap from whatever was there last.

But meanwhile you do program PEBS to sample it. That will happily sample
this garbage.

Hmm?

^ permalink raw reply	[flat|nested] 14+ messages in thread
* Re: [PATCH V9 3/3] perf/x86/intel: Support PEBS counters snapshotting
  2025-01-16 11:47         ` Peter Zijlstra
@ 2025-01-16 15:55           ` Liang, Kan
  2025-01-16 20:42             ` Peter Zijlstra
  0 siblings, 1 reply; 14+ messages in thread
From: Liang, Kan @ 2025-01-16 15:55 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, acme, namhyung, irogers, adrian.hunter, linux-kernel,
      linux-perf-users, ak, eranian, dapeng1.mi

On 2025-01-16 6:47 a.m., Peter Zijlstra wrote:
> On Wed, Jan 15, 2025 at 10:43:18AM -0800, kan.liang@linux.intel.com wrote:
>
>> Reviewed-by: Andi Kleen <ak@linux.intel.com>
>> Reviewed-by: Ian Rogers <irogers@google.com>
>
> Did they really read all the various versions without spotting any of
> the problems, or are you just rubber stamping things at this point :/
>
> Best to drop review tags after serious changes to patches.

Sure.

>> Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
>> Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
>> ---
>
>> diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
>> index 8f218ac0d445..79a4aad5a0a3 100644
>> --- a/arch/x86/events/core.c
>> +++ b/arch/x86/events/core.c
>> @@ -94,6 +94,8 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases);
>>
>>  DEFINE_STATIC_CALL_NULL(x86_pmu_filter, *x86_pmu.filter);
>>
>> +DEFINE_STATIC_CALL_NULL(x86_pmu_late_config, x86_pmu_late_config);
>
> So all these static calls are through struct x86_pmu (hence the naming),
> but not this one ?!?

I will add it in struct x86_pmu.

>
>>  /*
>>   * This one is magic, it will get called even when PMU init fails (because
>>   * there is no PMU), in which case it should simply return NULL.
>> @@ -1329,7 +1331,16 @@ static void x86_pmu_enable(struct pmu *pmu)
>>  	}
>>
>>  	/*
>> -	 * step2: reprogram moved events into new counters
>> +	 * step2:
>> +	 * The late config (after counters are scheduled)
>> +	 * is required for some cases, e.g., PEBS counters
>> +	 * snapshotting. Because an accurate counter index
>> +	 * is needed.
>> +	 */
>> +	static_call_cond(x86_pmu_late_config)();
>> +
>> +	/*
>> +	 * step3: reprogram moved events into new counters
>> 	 */
>> 	for (i = 0; i < cpuc->n_events; i++) {
>> 		event = cpuc->event_list[i];
>
> Placement and naming are weird.
>
> These 2 loops migrate all counters that got a new place, the first loop
> taking out pre-existing counters that got a new place, the second loop
> setting up these same and new counters.
>
> Why do the pebs config in the middle, where the least number of counters
> are set up?
>
> AFAICT all it really needs is cpuc->assign[], which is unchanged
> throughout. You can call this at any time before clearing n_added,
> sticking it in the middle just seems weird.

I just tried to put it right before the x86_pmu_start(RELOAD), since the
MSR will be updated in the x86_pmu_start(). But yes, any place before it
is good.

>
> Then naming, config is typically what we do at event creation,
> x86_pmu_hw_config() like. This is not at all related, so perhaps pick a
> different name?

How about late_setup/extra_setup?

@@ -1300,6 +1300,9 @@ static void x86_pmu_enable(struct pmu *pmu)

 	if (cpuc->n_added) {
 		int n_running = cpuc->n_events - cpuc->n_added;
+
+		static_call_cond(x86_pmu_late_setup)();
+
 		/*
 		 * apply assignment obtained either from
 		 * hw_perf_group_sched_in() or x86_pmu_enable()

It can be put at the beginning of the cpuc->n_added block. So perf does
the late setup, and then moves and starts the pre-existing and new
events. Is it OK?

>
> Bah, we're so very close between x86_pmu::{enable, disable, assign}() :/
>
> Also, I think I found you another bug... Consider what happens to the
> counter value when we reschedule a HES_STOPPED counter, then we skip
> x86_pmu_start(RELOAD) on step2, which leaves the counter value with
> 'random' crap from whatever was there last.
>
> But meanwhile you do program PEBS to sample it. That will happily sample
> this garbage.
>
> Hmm?

I'm not quite sure I understand the issue.

The HES_STOPPED counter should be a pre-existing counter. Just for some
reason, it's stopped, right? So perf doesn't need to re-configure the
PEBS_DATA_CFG, since the idx is not changed.

The worry is that the HES_STOPPED counter is restarted with
x86_pmu_start(0), right? There should be 3 places which invoke
x86_pmu_start(0).

- In __perf_event_stop(). But according to the comments, it's only for
  the events with an AUX ring buffer. It should not be a problem for
  PEBS.
- Handling throttle. The event does stop(0) and then start(0). There
  should be no 'random' garbage, since the counter has been stopped.
  Everything should be just the same as when it's stopped.
- In perf_adjust_freq_unthr_events(), when the period is not adjusted,
  there should be no 'random' garbage as well.

Or do you mean that some 3rd party tools change the counter values while
it's stopped? If so, I think we may force an update for the
pebs-counter-event-group as below.

@@ -1517,7 +1522,8 @@ static void x86_pmu_start(struct perf_event *event, int flags)
 	if (WARN_ON_ONCE(idx == -1))
 		return;

-	if (flags & PERF_EF_RELOAD) {
+	if (flags & PERF_EF_RELOAD ||
+	    is_pebs_counter_event_group(event)) {
 		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
 		static_call(x86_pmu_set_period)(event);
 	}
@@ -1608,7 +1614,8 @@ void x86_pmu_stop(struct perf_event *event, int flags)
 		hwc->state |= PERF_HES_STOPPED;
 	}

-	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+	if (((flags & PERF_EF_UPDATE) || is_pebs_counter_event_group(event)) &&
+	    !(hwc->state & PERF_HES_UPTODATE)) {
 		/*
 		 * Drain the remaining delta count out of an event
 		 * that we are disabling:

Thanks,
Kan
* Re: [PATCH V9 3/3] perf/x86/intel: Support PEBS counters snapshotting
  2025-01-16 15:55           ` Liang, Kan
@ 2025-01-16 20:42             ` Peter Zijlstra
  2025-01-16 20:56               ` Peter Zijlstra
  0 siblings, 1 reply; 14+ messages in thread
From: Peter Zijlstra @ 2025-01-16 20:42 UTC (permalink / raw)
  To: Liang, Kan
  Cc: mingo, acme, namhyung, irogers, adrian.hunter, linux-kernel,
      linux-perf-users, ak, eranian, dapeng1.mi

On Thu, Jan 16, 2025 at 10:55:46AM -0500, Liang, Kan wrote:

> > Also, I think I found you another bug... Consider what happens to the
> > counter value when we reschedule a HES_STOPPED counter, then we skip
> > x86_pmu_start(RELOAD) on step2, which leaves the counter value with
> > 'random' crap from whatever was there last.
> >
> > But meanwhile you do program PEBS to sample it. That will happily sample
> > this garbage.
> >
> > Hmm?
>
> I'm not quite sure I understand the issue.
>
> The HES_STOPPED counter should be a pre-existing counter. Just for some
> reason, it's stopped, right? So perf doesn't need to re-configure the
> PEBS_DATA_CFG, since the idx is not changed.

Suppose you have your group {A, B, C} and let's suppose A is the PEBS
event; further suppose that B is also a sampling event. Let's say they
get hardware counters 1, 2 and 3 respectively.

Then let's say B gets throttled.

While it is throttled, we get a new event D scheduled, and D gets placed
on counter 2 -- where B lives, which gets moved over to counter 4.

Then our loops will update and remove B from 2, but because
throttled/HES_STOPPED it will not start it on counter 4.

Meanwhile, we do have the PEBS_DATA_CFG thing updated to sample counters
1, 3 and 4.

PEBS assist happens, and samples the uninitialized counter 4.

We unthrottle B and program counter 4.
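[Editorial note] The bookkeeping in the scenario above can be modelled outside the kernel. The sketch below uses purely hypothetical names (`toy_event`, `toy_start`, `toy_move` — none of these exist in perf) and only mirrors the slot accounting described: a stopped event that is reassigned never claims its new `cpuc->events[]` slot, so the new counter has no owner while PEBS is already programmed to sample it.

```c
#include <stddef.h>

#define NCOUNTERS 8

/* Toy model only -- not real perf internals. */
struct toy_event {
    int idx;        /* assigned hardware counter */
    int stopped;    /* HES_STOPPED analog */
};

/* cpuc->events[] analog: hardware counter index -> owning event */
static struct toy_event *events[NCOUNTERS];

/* x86_pmu_start() analog: a stopped event never claims its slot */
static void toy_start(struct toy_event *e)
{
    if (!e->stopped)
        events[e->idx] = e;
}

/* Reschedule analog: vacate the old slot, record the new index,
 * then (maybe) start on the new counter. */
static void toy_move(struct toy_event *e, int new_idx)
{
    events[e->idx] = NULL;
    e->idx = new_idx;
    toy_start(e);
}
```

After moving a throttled event from counter 2 to counter 4, both slots are empty: counter 4 holds stale hardware state with no `events[]` entry pointing at it — exactly the window the review is worried about.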
* Re: [PATCH V9 3/3] perf/x86/intel: Support PEBS counters snapshotting
  2025-01-16 20:42             ` Peter Zijlstra
@ 2025-01-16 20:56               ` Peter Zijlstra
  2025-01-16 21:50                 ` Liang, Kan
  0 siblings, 1 reply; 14+ messages in thread
From: Peter Zijlstra @ 2025-01-16 20:56 UTC (permalink / raw)
  To: Liang, Kan
  Cc: mingo, acme, namhyung, irogers, adrian.hunter, linux-kernel,
      linux-perf-users, ak, eranian, dapeng1.mi

On Thu, Jan 16, 2025 at 09:42:25PM +0100, Peter Zijlstra wrote:
> On Thu, Jan 16, 2025 at 10:55:46AM -0500, Liang, Kan wrote:
>
> > > Also, I think I found you another bug... Consider what happens to the
> > > counter value when we reschedule a HES_STOPPED counter, then we skip
> > > x86_pmu_start(RELOAD) on step2, which leaves the counter value with
> > > 'random' crap from whatever was there last.
> > >
> > > But meanwhile you do program PEBS to sample it. That will happily sample
> > > this garbage.
> > >
> > > Hmm?
> >
> > I'm not quite sure I understand the issue.
> >
> > The HES_STOPPED counter should be a pre-existing counter. Just for some
> > reason, it's stopped, right? So perf doesn't need to re-configure the
> > PEBS_DATA_CFG, since the idx is not changed.
>
> Suppose you have your group {A, B, C} and lets suppose A is the PEBS
> event, further suppose that B is also a sampling event. Lets say they
> get hardware counters 1, 2 and 3 respectively.
>
> Then lets say B gets throttled.
>
> While it is throttled, we get a new event D scheduled, and D gets placed
> on counter 2 -- where B lives, which gets moved over to counter 4.
>
> Then our loops will update and remove B from 2, but because
> throttled/HES_STOPPED it will not start it on counter 4.
>
> Meanwhile, we do have the PEBS_DATA_CFG thing updated to sample counters
> 1, 3 and 4.
>
> PEBS assist happens, and samples the uninitialized counter 4.

Also, by skipping x86_pmu_start() we miss the assignment of
cpuc->events[] so PEBS buffer decode can't even find the dodgy event.
* Re: [PATCH V9 3/3] perf/x86/intel: Support PEBS counters snapshotting
  2025-01-16 20:56               ` Peter Zijlstra
@ 2025-01-16 21:50                 ` Liang, Kan
  2025-01-21 15:25                   ` Liang, Kan
  2025-01-23  9:14                   ` Peter Zijlstra
  0 siblings, 2 replies; 14+ messages in thread
From: Liang, Kan @ 2025-01-16 21:50 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, acme, namhyung, irogers, adrian.hunter, linux-kernel,
      linux-perf-users, ak, eranian, dapeng1.mi

On 2025-01-16 3:56 p.m., Peter Zijlstra wrote:
> On Thu, Jan 16, 2025 at 09:42:25PM +0100, Peter Zijlstra wrote:
>> On Thu, Jan 16, 2025 at 10:55:46AM -0500, Liang, Kan wrote:
>>
>>>> Also, I think I found you another bug... Consider what happens to the
>>>> counter value when we reschedule a HES_STOPPED counter, then we skip
>>>> x86_pmu_start(RELOAD) on step2, which leaves the counter value with
>>>> 'random' crap from whatever was there last.
>>>>
>>>> But meanwhile you do program PEBS to sample it. That will happily sample
>>>> this garbage.
>>>>
>>>> Hmm?
>>>
>>> I'm not quite sure I understand the issue.
>>>
>>> The HES_STOPPED counter should be a pre-existing counter. Just for some
>>> reason, it's stopped, right? So perf doesn't need to re-configure the
>>> PEBS_DATA_CFG, since the idx is not changed.
>>
>> Suppose you have your group {A, B, C} and lets suppose A is the PEBS
>> event, further suppose that B is also a sampling event. Lets say they
>> get hardware counters 1, 2 and 3 respectively.
>>
>> Then lets say B gets throttled.
>>
>> While it is throttled, we get a new event D scheduled, and D gets placed
>> on counter 2 -- where B lives, which gets moved over to counter 4.
>>
>> Then our loops will update and remove B from 2, but because
>> throttled/HES_STOPPED it will not start it on counter 4.
>>
>> Meanwhile, we do have the PEBS_DATA_CFG thing updated to sample counters
>> 1, 3 and 4.
>>
>> PEBS assist happens, and samples the uninitialized counter 4.
>
> Also, by skipping x86_pmu_start() we miss the assignment of
> cpuc->events[] so PEBS buffer decode can't even find the dodgy event.

Yes, counter 4 includes garbage before B is started again.
But the cpuc->events[counter 4] is NULL as well.

The current implementation ignores a NULL cpuc->events[]. The stopped
B should not be mistakenly updated.

+static void intel_perf_event_pmc_to_count(struct perf_event *event, u64 pmc)
+{
+	int shift = 64 - x86_pmu.cntval_bits;
+	struct hw_perf_event *hwc;
+	u64 delta, prev_pmc;
+
+	/*
+	 * The PEBS record doesn't shrink on pmu::del().
+	 * See pebs_update_state().
+	 * Ignore the non-existent event.
+	 */
+	if (!event)
+		return;

Thanks,
Kan
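[Editorial note] The `!event` guard quoted above, together with the width-aware delta arithmetic that the full `intel_perf_event_pmc_to_count()` in the patch performs, can be illustrated as a standalone userspace sketch. Everything here is a hypothetical model (a 48-bit counter width is assumed; `toy_event` and `toy_pmc_to_count` are made-up names), not the kernel code itself:

```c
#include <stdint.h>
#include <stddef.h>

struct toy_event {
    uint64_t prev_count;
    uint64_t count;
};

/* Model of the update: skip a NULL slot (counter present in the PEBS
 * record but with no started owner), otherwise accumulate the delta.
 * Shifting both values up by (64 - cntval_bits) and the difference back
 * down discards the unimplemented high bits, so a counter that wrapped
 * at 2^48 still yields the correct small delta. */
static void toy_pmc_to_count(struct toy_event *event, uint64_t pmc)
{
    const int cntval_bits = 48;     /* assumed counter width */
    int shift = 64 - cntval_bits;
    uint64_t delta, prev_pmc;

    if (!event)                     /* the quoted !event guard */
        return;

    prev_pmc = event->prev_count;
    event->prev_count = pmc;

    delta = (pmc << shift) - (prev_pmc << shift);
    delta >>= shift;                /* unsigned: handles wraparound */

    event->count += delta;
}
```

With `prev_count` just below 2^48 and a new raw value of 0x10, the computed delta is the distance across the wrap, not a huge bogus number — which is what the double shift buys.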
* Re: [PATCH V9 3/3] perf/x86/intel: Support PEBS counters snapshotting
  2025-01-16 21:50                 ` Liang, Kan
@ 2025-01-21 15:25                   ` Liang, Kan
  2025-01-23  9:14                   ` Peter Zijlstra
  1 sibling, 0 replies; 14+ messages in thread
From: Liang, Kan @ 2025-01-21 15:25 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, acme, namhyung, irogers, adrian.hunter, linux-kernel,
      linux-perf-users, ak, eranian, dapeng1.mi

On 2025-01-16 4:50 p.m., Liang, Kan wrote:
> On 2025-01-16 3:56 p.m., Peter Zijlstra wrote:
>> On Thu, Jan 16, 2025 at 09:42:25PM +0100, Peter Zijlstra wrote:
>>> On Thu, Jan 16, 2025 at 10:55:46AM -0500, Liang, Kan wrote:
>>>
>>>>> Also, I think I found you another bug... Consider what happens to the
>>>>> counter value when we reschedule a HES_STOPPED counter, then we skip
>>>>> x86_pmu_start(RELOAD) on step2, which leaves the counter value with
>>>>> 'random' crap from whatever was there last.
>>>>>
>>>>> But meanwhile you do program PEBS to sample it. That will happily
>>>>> sample this garbage.
>>>>>
>>>>> Hmm?
>>>>
>>>> I'm not quite sure I understand the issue.
>>>>
>>>> The HES_STOPPED counter should be a pre-existing counter. Just for some
>>>> reason, it's stopped, right? So perf doesn't need to re-configure the
>>>> PEBS_DATA_CFG, since the idx is not changed.
>>>
>>> Suppose you have your group {A, B, C} and lets suppose A is the PEBS
>>> event, further suppose that B is also a sampling event. Lets say they
>>> get hardware counters 1, 2 and 3 respectively.
>>>
>>> Then lets say B gets throttled.
>>>
>>> While it is throttled, we get a new event D scheduled, and D gets placed
>>> on counter 2 -- where B lives, which gets moved over to counter 4.
>>>
>>> Then our loops will update and remove B from 2, but because
>>> throttled/HES_STOPPED it will not start it on counter 4.
>>>
>>> Meanwhile, we do have the PEBS_DATA_CFG thing updated to sample
>>> counters 1, 3 and 4.
>>>
>>> PEBS assist happens, and samples the uninitialized counter 4.
>>
>> Also, by skipping x86_pmu_start() we miss the assignment of
>> cpuc->events[] so PEBS buffer decode can't even find the dodgy event.
>
> Yes, counter 4 includes garbage before B is started again.
> But the cpuc->events[counter 4] is NULL either.
>
> The current implementation ignores the NULL cpuc->events[]. The stopped
> B should not be mistakenly updated.
>
> +static void intel_perf_event_pmc_to_count(struct perf_event *event, u64 pmc)
> +{
> +	int shift = 64 - x86_pmu.cntval_bits;
> +	struct hw_perf_event *hwc;
> +	u64 delta, prev_pmc;
> +
> +	/*
> +	 * The PEBS record doesn't shrink on pmu::del().
> +	 * See pebs_update_state().
> +	 * Ignore the non-exist event.
> +	 */
> +	if (!event)
> +		return;

I've sent a V10 to address all the comments on V9. The above case is
explained in the comments of intel_perf_event_update_pmc() in V10.

https://lore.kernel.org/lkml/20250121152303.3128733-4-kan.liang@linux.intel.com/

Please take a look and let me know if it's not sufficient to handle the
case.

Thanks,
Kan
* Re: [PATCH V9 3/3] perf/x86/intel: Support PEBS counters snapshotting
  2025-01-16 21:50                 ` Liang, Kan
  2025-01-21 15:25                   ` Liang, Kan
@ 2025-01-23  9:14                   ` Peter Zijlstra
  2025-01-23 15:36                     ` Liang, Kan
  1 sibling, 1 reply; 14+ messages in thread
From: Peter Zijlstra @ 2025-01-23 9:14 UTC (permalink / raw)
  To: Liang, Kan
  Cc: mingo, acme, namhyung, irogers, adrian.hunter, linux-kernel,
      linux-perf-users, ak, eranian, dapeng1.mi

On Thu, Jan 16, 2025 at 04:50:01PM -0500, Liang, Kan wrote:
> On 2025-01-16 3:56 p.m., Peter Zijlstra wrote:
>> On Thu, Jan 16, 2025 at 09:42:25PM +0100, Peter Zijlstra wrote:
>>> On Thu, Jan 16, 2025 at 10:55:46AM -0500, Liang, Kan wrote:
>>>
>>>>> Also, I think I found you another bug... Consider what happens to the
>>>>> counter value when we reschedule a HES_STOPPED counter, then we skip
>>>>> x86_pmu_start(RELOAD) on step2, which leaves the counter value with
>>>>> 'random' crap from whatever was there last.
>>>>>
>>>>> But meanwhile you do program PEBS to sample it. That will happily
>>>>> sample this garbage.
>>>>>
>>>>> Hmm?
>>>>
>>>> I'm not quite sure I understand the issue.
>>>>
>>>> The HES_STOPPED counter should be a pre-existing counter. Just for some
>>>> reason, it's stopped, right? So perf doesn't need to re-configure the
>>>> PEBS_DATA_CFG, since the idx is not changed.
>>>
>>> Suppose you have your group {A, B, C} and lets suppose A is the PEBS
>>> event, further suppose that B is also a sampling event. Lets say they
>>> get hardware counters 1, 2 and 3 respectively.
>>>
>>> Then lets say B gets throttled.
>>>
>>> While it is throttled, we get a new event D scheduled, and D gets placed
>>> on counter 2 -- where B lives, which gets moved over to counter 4.
>>>
>>> Then our loops will update and remove B from 2, but because
>>> throttled/HES_STOPPED it will not start it on counter 4.
>>>
>>> Meanwhile, we do have the PEBS_DATA_CFG thing updated to sample
>>> counters 1, 3 and 4.
>>>
>>> PEBS assist happens, and samples the uninitialized counter 4.
>>
>> Also, by skipping x86_pmu_start() we miss the assignment of
>> cpuc->events[] so PEBS buffer decode can't even find the dodgy event.
>
> Yes, counter 4 includes garbage before B is started again.
> But the cpuc->events[counter 4] is NULL either.
>
> The current implementation ignores the NULL cpuc->events[]. The stopped
> B should not be mistakenly updated.

Ah, indeed. I was so close.

One question though -- is this value ever exposed otherwise? I had a
quick look and I don't think we support PERF_SAMPLE_RAW for PEBS, but
what about PEBS-to-PT?

Anyway, let me go find this v10 thing :-)
* Re: [PATCH V9 3/3] perf/x86/intel: Support PEBS counters snapshotting
  2025-01-23  9:14                   ` Peter Zijlstra
@ 2025-01-23 15:36                     ` Liang, Kan
  0 siblings, 0 replies; 14+ messages in thread
From: Liang, Kan @ 2025-01-23 15:36 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, acme, namhyung, irogers, adrian.hunter, linux-kernel,
      linux-perf-users, ak, eranian, dapeng1.mi

On 2025-01-23 4:14 a.m., Peter Zijlstra wrote:
> On Thu, Jan 16, 2025 at 04:50:01PM -0500, Liang, Kan wrote:
>> On 2025-01-16 3:56 p.m., Peter Zijlstra wrote:
>>> On Thu, Jan 16, 2025 at 09:42:25PM +0100, Peter Zijlstra wrote:
>>>> On Thu, Jan 16, 2025 at 10:55:46AM -0500, Liang, Kan wrote:
>>>>
>>>>>> Also, I think I found you another bug... Consider what happens to the
>>>>>> counter value when we reschedule a HES_STOPPED counter, then we skip
>>>>>> x86_pmu_start(RELOAD) on step2, which leaves the counter value with
>>>>>> 'random' crap from whatever was there last.
>>>>>>
>>>>>> But meanwhile you do program PEBS to sample it. That will happily
>>>>>> sample this garbage.
>>>>>>
>>>>>> Hmm?
>>>>>
>>>>> I'm not quite sure I understand the issue.
>>>>>
>>>>> The HES_STOPPED counter should be a pre-existing counter. Just for
>>>>> some reason, it's stopped, right? So perf doesn't need to re-configure
>>>>> the PEBS_DATA_CFG, since the idx is not changed.
>>>>
>>>> Suppose you have your group {A, B, C} and lets suppose A is the PEBS
>>>> event, further suppose that B is also a sampling event. Lets say they
>>>> get hardware counters 1, 2 and 3 respectively.
>>>>
>>>> Then lets say B gets throttled.
>>>>
>>>> While it is throttled, we get a new event D scheduled, and D gets
>>>> placed on counter 2 -- where B lives, which gets moved over to
>>>> counter 4.
>>>>
>>>> Then our loops will update and remove B from 2, but because
>>>> throttled/HES_STOPPED it will not start it on counter 4.
>>>>
>>>> Meanwhile, we do have the PEBS_DATA_CFG thing updated to sample
>>>> counters 1, 3 and 4.
>>>>
>>>> PEBS assist happens, and samples the uninitialized counter 4.
>>>>
>>>> Also, by skipping x86_pmu_start() we miss the assignment of
>>>> cpuc->events[] so PEBS buffer decode can't even find the dodgy event.
>>>
>> Yes, counter 4 includes garbage before B is started again.
>> But the cpuc->events[counter 4] is NULL either.
>>
>> The current implementation ignores the NULL cpuc->events[]. The stopped
>> B should not be mistakenly updated.
>
> Ah, indeed. I was so close.
>
> One question though -- is this value ever exposed otherwise? I had a
> quick look and I don't think we support PERF_SAMPLE_RAW for PEBS, but
> what about PEBS-to-PT?

The counters snapshotting feature is only available on the latest
platforms, Lunar Lake/Arrow Lake, and future platforms with Arch PEBS.

PEBS-to-PT is not enumerated on Lunar Lake/Arrow Lake. (It actually
never works on hybrid.) It will be deprecated with Arch PEBS as well. So
we don't need to worry about PEBS-to-PT.

> Anyway, let me go find this v10 thing :-)

Thanks.
Kan
* Re: [PATCH V9 3/3] perf/x86/intel: Support PEBS counters snapshotting
  2025-01-15 18:43 ` [PATCH V9 3/3] perf/x86/intel: Support PEBS counters snapshotting kan.liang
  2025-01-16 11:47   ` Peter Zijlstra
@ 2025-01-16 12:02   ` Peter Zijlstra
  1 sibling, 0 replies; 14+ messages in thread
From: Peter Zijlstra @ 2025-01-16 12:02 UTC (permalink / raw)
  To: kan.liang
  Cc: mingo, acme, namhyung, irogers, adrian.hunter, linux-kernel,
      linux-perf-users, ak, eranian, dapeng1.mi

On Wed, Jan 15, 2025 at 10:43:18AM -0800, kan.liang@linux.intel.com wrote:

> diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
> index 81b6ec8e824e..10ce80230ad3 100644
> --- a/arch/x86/events/intel/ds.c
> +++ b/arch/x86/events/intel/ds.c
> @@ -1294,6 +1294,19 @@ static inline void pebs_update_threshold(struct cpu_hw_events *cpuc)
>  	ds->pebs_interrupt_threshold = threshold;
>  }
>
> +#define PEBS_DATACFG_CNTRS(x)						\
> +	((x >> PEBS_DATACFG_CNTR_SHIFT) & PEBS_DATACFG_CNTR_MASK)
> +
> +#define PEBS_DATACFG_CNTR_BIT(x)					\
> +	(((1ULL << x) & PEBS_DATACFG_CNTR_MASK) << PEBS_DATACFG_CNTR_SHIFT)
> +
> +#define PEBS_DATACFG_FIX(x)						\
> +	((x >> PEBS_DATACFG_FIX_SHIFT) & PEBS_DATACFG_FIX_MASK)
> +
> +#define PEBS_DATACFG_FIX_BIT(x)						\
> +	(((1ULL << (x - INTEL_PMC_IDX_FIXED)) & PEBS_DATACFG_FIX_MASK)	\
> +	 << PEBS_DATACFG_FIX_SHIFT)

That INTEL_PMC_IDX_FIXED does not belong here, needs to be at the
callsite.

> @@ -1914,6 +1975,34 @@ static void adaptive_pebs_save_regs(struct pt_regs *regs,
>  #endif
>  }
>
> +static void intel_perf_event_pmc_to_count(struct perf_event *event, u64 pmc)
> +{
> +	int shift = 64 - x86_pmu.cntval_bits;
> +	struct hw_perf_event *hwc;
> +	u64 delta, prev_pmc;
> +
> +	/*
> +	 * The PEBS record doesn't shrink on pmu::del().
> +	 * See pebs_update_state().
> +	 * Ignore the non-exist event.
> +	 */
> +	if (!event)
> +		return;
> +
> +	hwc = &event->hw;
> +	prev_pmc = local64_read(&hwc->prev_count);
> +
> +	/* Only update the count when the PMU is disabled */
> +	WARN_ON(this_cpu_read(cpu_hw_events.enabled));
> +	local64_set(&hwc->prev_count, pmc);
> +
> +	delta = (pmc << shift) - (prev_pmc << shift);
> +	delta >>= shift;
> +
> +	local64_add(delta, &event->count);
> +	local64_sub(delta, &hwc->period_left);
> +}

Name and function don't really match at this point. When I suggested
this function it returned a count value, now it's closer to
intel_perf_event_update_pmc() or somesuch.

> @@ -2049,6 +2138,61 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
>  		}
>  	}
>
> +	if (format_group & (PEBS_DATACFG_CNTR | PEBS_DATACFG_METRICS)) {
> +		struct pebs_cntr_header *cntr = next_record;
> +		int bit;
> +
> +		next_record += sizeof(struct pebs_cntr_header);
> +
> +		/*
> +		 * The PEBS_DATA_CFG is a global register, which is the
> +		 * superset configuration for all PEBS events.
> +		 * For the PEBS record of non-sample-read group, ignore
> +		 * the counter snapshot fields.
> +		 */
> +		if (!is_pebs_counter_event_group(event)) {
> +			unsigned int nr;
> +
> +			nr = bitmap_weight((unsigned long *)&cntr->cntr, INTEL_PMC_MAX_GENERIC) +
> +			     bitmap_weight((unsigned long *)&cntr->fixed, INTEL_PMC_MAX_FIXED);

hweight32 ?

> +			if (cntr->metrics == INTEL_CNTR_METRICS)
> +				nr += 2;
> +			next_record += nr * sizeof(u64);
> +			goto end_cntr;

Yuck, can't you break out stuff into a helper function to get rid of
that goto?

> +		}
> +
> +		for_each_set_bit(bit, (unsigned long *)&cntr->cntr, INTEL_PMC_MAX_GENERIC) {
> +			intel_perf_event_pmc_to_count(cpuc->events[bit], *(u64 *)next_record);
> +			next_record += sizeof(u64);
> +		}
> +
> +		for_each_set_bit(bit, (unsigned long *)&cntr->fixed, INTEL_PMC_MAX_FIXED) {
> +			/* The slots event will be handled with perf_metric later */
> +			if ((cntr->metrics == INTEL_CNTR_METRICS) &&
> +			    (bit + INTEL_PMC_IDX_FIXED == INTEL_PMC_IDX_FIXED_SLOTS)) {
> +				next_record += sizeof(u64);
> +				continue;
> +			}
> +			intel_perf_event_pmc_to_count(cpuc->events[bit + INTEL_PMC_IDX_FIXED],
> +						      *(u64 *)next_record);
> +			next_record += sizeof(u64);
> +		}
> +
> +		/* HW will reload the value right after the overflow. */
> +		if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
> +			local64_set(&event->hw.prev_count, (u64)-event->hw.sample_period);
> +
> +		if (cntr->metrics == INTEL_CNTR_METRICS) {
> +			static_call(intel_pmu_update_topdown_event)
> +				   (cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS],
> +				    (u64 *)next_record);
> +			next_record += 2 * sizeof(u64);
> +		}
> +		data->sample_flags |= PERF_SAMPLE_READ;
> +	}
> +
> +end_cntr:
> +
>  	WARN_ONCE(next_record != __pebs + basic->format_size,
>  		  "PEBS record size %u, expected %llu, config %llx\n",
>  		  basic->format_size,
* Re: [PATCH V9 1/3] perf/x86/intel: Avoid pmu_disable/enable if !cpuc->enabled in sample read
  2025-01-15 18:43 [PATCH V9 1/3] perf/x86/intel: Avoid pmu_disable/enable if !cpuc->enabled in sample read kan.liang
  2025-01-15 18:43 ` [PATCH V9 2/3] perf: Avoid the read if the count is already updated kan.liang
  2025-01-15 18:43 ` [PATCH V9 3/3] perf/x86/intel: Support PEBS counters snapshotting kan.liang
@ 2025-01-16 10:32 ` Peter Zijlstra
  2025-01-16 10:51   ` Peter Zijlstra
  2 siblings, 1 reply; 14+ messages in thread
From: Peter Zijlstra @ 2025-01-16 10:32 UTC (permalink / raw)
  To: kan.liang
  Cc: mingo, acme, namhyung, irogers, adrian.hunter, linux-kernel,
      linux-perf-users, ak, eranian, dapeng1.mi, stable

On Wed, Jan 15, 2025 at 10:43:16AM -0800, kan.liang@linux.intel.com wrote:
> From: Kan Liang <kan.liang@linux.intel.com>
>
> The WARN_ON(this_cpu_read(cpu_hw_events.enabled)) in the
> intel_pmu_save_and_restart_reload() is triggered, when sampling read
> topdown events.
>
> In a NMI handler, the cpu_hw_events.enabled is set and used to indicate
> the status of core PMU. The generic pmu->pmu_disable_count, updated in
> the perf_pmu_disable/enable pair, is not touched.
> However, the perf_pmu_disable/enable pair is invoked when sampling read
> in a NMI handler. The cpuc->enabled is mistakenly set by the
> perf_pmu_enable().
>
> Avoid perf_pmu_disable/enable() if the core PMU is already disabled.
>
> Fixes: 7b2c05a15d29 ("perf/x86/intel: Generic support for hardware TopDown metrics")
> Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
> Cc: stable@vger.kernel.org
> ---
>  arch/x86/events/intel/core.c | 7 +++++--
>  arch/x86/events/intel/ds.c   | 9 ++++++---
>  2 files changed, 11 insertions(+), 5 deletions(-)
>
> diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
> index 2a2824e9c50d..bce423ad3fad 100644
> --- a/arch/x86/events/intel/core.c
> +++ b/arch/x86/events/intel/core.c
> @@ -2778,15 +2778,18 @@ DEFINE_STATIC_CALL(intel_pmu_update_topdown_event, x86_perf_event_update);
>  static void intel_pmu_read_topdown_event(struct perf_event *event)
>  {
>  	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
> +	int pmu_enabled = cpuc->enabled;
>
>  	/* Only need to call update_topdown_event() once for group read. */
>  	if ((cpuc->txn_flags & PERF_PMU_TXN_READ) &&
>  	    !is_slots_event(event))
>  		return;
>
> -	perf_pmu_disable(event->pmu);
> +	if (pmu_enabled)
> +		perf_pmu_disable(event->pmu);
>  	static_call(intel_pmu_update_topdown_event)(event);
> -	perf_pmu_enable(event->pmu);
> +	if (pmu_enabled)
> +		perf_pmu_enable(event->pmu);
>  }
>
>  static void intel_pmu_read_event(struct perf_event *event)
> diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
> index ba74e1198328..81b6ec8e824e 100644
> --- a/arch/x86/events/intel/ds.c
> +++ b/arch/x86/events/intel/ds.c
> @@ -2096,11 +2096,14 @@ get_next_pebs_record_by_bit(void *base, void *top, int bit)
>
>  void intel_pmu_auto_reload_read(struct perf_event *event)
>  {
> -	WARN_ON(!(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD));
> +	int pmu_enabled = this_cpu_read(cpu_hw_events.enabled);
>
> -	perf_pmu_disable(event->pmu);
> +	WARN_ON(!(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD));
> +	if (pmu_enabled)
> +		perf_pmu_disable(event->pmu);
>  	intel_pmu_drain_pebs_buffer();
> -	perf_pmu_enable(event->pmu);
> +	if (pmu_enabled)
> +		perf_pmu_enable(event->pmu);
>  }

Hurmp.. would it not be nicer to merge that logic. Perhaps something
like so?

---
 arch/x86/events/intel/core.c | 41 +++++++++++++++++++++++------------------
 arch/x86/events/intel/ds.c   | 11 +----------
 arch/x86/events/perf_event.h |  2 +-
 3 files changed, 25 insertions(+), 29 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 7601196d1d18..5b491a6815e6 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2785,28 +2785,33 @@ static u64 icl_update_topdown_event(struct perf_event *event)

 DEFINE_STATIC_CALL(intel_pmu_update_topdown_event, x86_perf_event_update);

-static void intel_pmu_read_topdown_event(struct perf_event *event)
+static void intel_pmu_read_event(struct perf_event *event)
 {
-	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	if (event->hw.flags & (PERF_X86_EVENT_AUTO_RELOAD | PERF_X86_EVENT_TOPDOWN)) {
+		struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+		bool pmu_enabled = cpuc->enabled;

-	/* Only need to call update_topdown_event() once for group read. */
-	if ((cpuc->txn_flags & PERF_PMU_TXN_READ) &&
-	    !is_slots_event(event))
-		return;
+		/* Only need to call update_topdown_event() once for group read. */
+		if (is_topdown_event(event) && !is_slots_event(event) &&
+		    (cpuc->txn_flags & PERF_PMU_TXN_READ))
+			return;

-	perf_pmu_disable(event->pmu);
-	static_call(intel_pmu_update_topdown_event)(event);
-	perf_pmu_enable(event->pmu);
-}
+		cpuc->enabled = 0;
+		if (pmu_enabled)
+			intel_pmu_disable_all();

-static void intel_pmu_read_event(struct perf_event *event)
-{
-	if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
-		intel_pmu_auto_reload_read(event);
-	else if (is_topdown_count(event))
-		intel_pmu_read_topdown_event(event);
-	else
-		x86_perf_event_update(event);
+		if (is_topdown_event(event))
+			static_call(intel_pmu_update_topdown_event)(event);
+		else
+			intel_pmu_drain_pebs_buffer();
+
+		cpuc->enabled = pmu_enabled;
+		if (pmu_enabled)
+			intel_pmu_enable_all(0);
+
+		return;
+	}
+
+	x86_perf_event_update(event);
 }

 static void intel_pmu_enable_fixed(struct perf_event *event)
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index ba74e1198328..050098c54ae7 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -953,7 +953,7 @@ int intel_pmu_drain_bts_buffer(void)
 	return 1;
 }

-static inline void intel_pmu_drain_pebs_buffer(void)
+void intel_pmu_drain_pebs_buffer(void)
 {
 	struct perf_sample_data data;

@@ -2094,15 +2094,6 @@ get_next_pebs_record_by_bit(void *base, void *top, int bit)
 	return NULL;
 }

-void intel_pmu_auto_reload_read(struct perf_event *event)
-{
-	WARN_ON(!(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD));
-
-	perf_pmu_disable(event->pmu);
-	intel_pmu_drain_pebs_buffer();
-	perf_pmu_enable(event->pmu);
-}
-
 /*
  * Special variant of intel_pmu_save_and_restart() for auto-reload.
  */
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 31c2771545a6..38b3e30f8988 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -1643,7 +1643,7 @@ void intel_pmu_pebs_disable_all(void);

 void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in);

-void intel_pmu_auto_reload_read(struct perf_event *event);
+void intel_pmu_drain_pebs_buffer(void);

 void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr);

^ permalink raw reply related	[flat|nested] 14+ messages in thread
* Re: [PATCH V9 1/3] perf/x86/intel: Avoid pmu_disable/enable if !cpuc->enabled in sample read
  2025-01-16 10:32 ` [PATCH V9 1/3] perf/x86/intel: Avoid pmu_disable/enable if !cpuc->enabled in sample read Peter Zijlstra
@ 2025-01-16 10:51   ` Peter Zijlstra
  0 siblings, 0 replies; 14+ messages in thread
From: Peter Zijlstra @ 2025-01-16 10:51 UTC (permalink / raw)
  To: kan.liang
  Cc: mingo, acme, namhyung, irogers, adrian.hunter, linux-kernel,
      linux-perf-users, ak, eranian, dapeng1.mi, stable

On Thu, Jan 16, 2025 at 11:32:56AM +0100, Peter Zijlstra wrote:

> diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
> index ba74e1198328..050098c54ae7 100644
> --- a/arch/x86/events/intel/ds.c
> +++ b/arch/x86/events/intel/ds.c
> @@ -953,7 +953,7 @@ int intel_pmu_drain_bts_buffer(void)
>  	return 1;
>  }
>
> -static inline void intel_pmu_drain_pebs_buffer(void)
> +void intel_pmu_drain_pebs_buffer(void)
>  {
>  	struct perf_sample_data data;
>

Also, while poking there, I noticed we never did the below :/

---
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 5b491a6815e6..20a2556de645 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3081,7 +3081,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
 		handled++;
 		x86_pmu_handle_guest_pebs(regs, &data);
-		x86_pmu.drain_pebs(regs, &data);
+		static_call(x86_pmu_drain_pebs)(regs, &data);
 		status &= intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI;

 	/*
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 050098c54ae7..75e5e6678282 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -956,8 +956,7 @@ int intel_pmu_drain_bts_buffer(void)
 void intel_pmu_drain_pebs_buffer(void)
 {
 	struct perf_sample_data data;
-
-	x86_pmu.drain_pebs(NULL, &data);
+	static_call(x86_pmu_drain_pebs)(NULL, &data);
 }

diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 38b3e30f8988..fb635e2b9909 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -1107,6 +1107,7 @@ extern struct x86_pmu x86_pmu __read_mostly;

 DECLARE_STATIC_CALL(x86_pmu_set_period, *x86_pmu.set_period);
 DECLARE_STATIC_CALL(x86_pmu_update,     *x86_pmu.update);
+DECLARE_STATIC_CALL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs);

 static __always_inline struct x86_perf_task_context_opt *task_context_opt(void *ctx)
 {

^ permalink raw reply related	[flat|nested] 14+ messages in thread
end of thread, other threads: [~2025-01-23 15:36 UTC | newest]

Thread overview: 14+ messages (latest first):
2025-01-15 18:43 [PATCH V9 1/3] perf/x86/intel: Avoid pmu_disable/enable if !cpuc->enabled in sample read kan.liang
2025-01-15 18:43 ` [PATCH V9 2/3] perf: Avoid the read if the count is already updated kan.liang
2025-01-15 18:43 ` [PATCH V9 3/3] perf/x86/intel: Support PEBS counters snapshotting kan.liang
2025-01-16 11:47   ` Peter Zijlstra
2025-01-16 15:55     ` Liang, Kan
2025-01-16 20:42       ` Peter Zijlstra
2025-01-16 20:56         ` Peter Zijlstra
2025-01-16 21:50           ` Liang, Kan
2025-01-21 15:25             ` Liang, Kan
2025-01-23  9:14             ` Peter Zijlstra
2025-01-23 15:36               ` Liang, Kan
2025-01-16 12:02   ` Peter Zijlstra
2025-01-16 10:32 ` [PATCH V9 1/3] perf/x86/intel: Avoid pmu_disable/enable if !cpuc->enabled in sample read Peter Zijlstra
2025-01-16 10:51   ` Peter Zijlstra