From: Peter Zijlstra <peterz@infradead.org>
To: kan.liang@linux.intel.com
Cc: mingo@redhat.com, linux-kernel@vger.kernel.org,
eranian@google.com, ak@linux.intel.com
Subject: Re: [PATCH V4 1/2] perf/x86/intel/ds: Flush the PEBS buffer in PEBS enable
Date: Wed, 26 Apr 2023 15:18:12 +0200 [thread overview]
Message-ID: <20230426131812.GA1377058@hirez.programming.kicks-ass.net> (raw)
In-Reply-To: <20230421184529.3320912-1-kan.liang@linux.intel.com>
On Fri, Apr 21, 2023 at 11:45:28AM -0700, kan.liang@linux.intel.com wrote:
> From: Kan Liang <kan.liang@linux.intel.com>
>
> Several similar kernel warnings can be triggered,
>
> [56605.607840] CPU0 PEBS record size 0, expected 32, config 0
> cpuc->record_size=208
>
> when the below commands are running in parallel for a while on SPR.
>
> while true; do perf record --no-buildid -a --intr-regs=AX -e
> cpu/event=0xd0,umask=0x81/pp -c 10003 -o /dev/null ./triad; done &
>
> while true; do perf record -o /tmp/out -W -d -e
> '{ld_blocks.store_forward:period=1000000,
> MEM_TRANS_RETIRED.LOAD_LATENCY:u:precise=2:ldlat=4}'
> -c 1037 ./triad; done
> *The triad program is just the generation of loads/stores.
>
> The warnings are triggered when an unexpected PEBS record (with a
> different config and size) is found.
>
> A system-wide PEBS event with the large PEBS config may be enabled
> during a context switch. Some PEBS records for the system-wide PEBS may
> be generated while the old task is sched out but the new one hasn't been
> sched in yet. When the new task is sched in, the cpuc->pebs_record_size
> may be updated for the per-task PEBS events. So the existing system-wide
> PEBS records have a different size from the later PEBS records.
>
> The PEBS buffer should be flushed right before the hardware is
> reprogrammed. The new size and threshold should be updated after the old
> buffer has been flushed.
>
> Reported-by: Stephane Eranian <eranian@google.com>
> Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
> ---
So I find it much easier to read the whole thing when collapsed.
Something like the below; that ok with you?
---
arch/x86/events/intel/ds.c | 56 ++++++++++++++++++++++-----------------
arch/x86/include/asm/perf_event.h | 3 +++
2 files changed, 35 insertions(+), 24 deletions(-)
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index a2e566e53076..df88576d6b2a 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1229,12 +1229,14 @@ pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc,
struct perf_event *event, bool add)
{
struct pmu *pmu = event->pmu;
+
/*
* Make sure we get updated with the first PEBS
* event. It will trigger also during removal, but
* that does not hurt:
*/
- bool update = cpuc->n_pebs == 1;
+ if (cpuc->n_pebs == 1)
+ cpuc->pebs_data_cfg = PEBS_UPDATE_DS_SW;
if (needed_cb != pebs_needs_sched_cb(cpuc)) {
if (!needed_cb)
@@ -1242,7 +1244,7 @@ pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc,
else
perf_sched_cb_dec(pmu);
- update = true;
+ cpuc->pebs_data_cfg |= PEBS_UPDATE_DS_SW;
}
/*
@@ -1252,24 +1254,13 @@ pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc,
if (x86_pmu.intel_cap.pebs_baseline && add) {
u64 pebs_data_cfg;
- /* Clear pebs_data_cfg and pebs_record_size for first PEBS. */
- if (cpuc->n_pebs == 1) {
- cpuc->pebs_data_cfg = 0;
- cpuc->pebs_record_size = sizeof(struct pebs_basic);
- }
-
pebs_data_cfg = pebs_update_adaptive_cfg(event);
-
- /* Update pebs_record_size if new event requires more data. */
- if (pebs_data_cfg & ~cpuc->pebs_data_cfg) {
- cpuc->pebs_data_cfg |= pebs_data_cfg;
- adaptive_pebs_record_size_update();
- update = true;
- }
+ /*
+ * Be sure to update the thresholds when we change the record.
+ */
+ if (pebs_data_cfg & ~cpuc->pebs_data_cfg)
+ cpuc->pebs_data_cfg |= pebs_data_cfg | PEBS_UPDATE_DS_SW;
}
-
- if (update)
- pebs_update_threshold(cpuc);
}
void intel_pmu_pebs_add(struct perf_event *event)
@@ -1326,9 +1317,17 @@ static void intel_pmu_pebs_via_pt_enable(struct perf_event *event)
wrmsrl(base + idx, value);
}
+static inline void intel_pmu_drain_large_pebs(struct cpu_hw_events *cpuc)
+{
+ if (cpuc->n_pebs == cpuc->n_large_pebs &&
+ cpuc->n_pebs != cpuc->n_pebs_via_pt)
+ intel_pmu_drain_pebs_buffer();
+}
+
void intel_pmu_pebs_enable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ u64 pebs_data_cfg = cpuc->pebs_data_cfg & ~PEBS_UPDATE_DS_SW;
struct hw_perf_event *hwc = &event->hw;
struct debug_store *ds = cpuc->ds;
unsigned int idx = hwc->idx;
@@ -1344,11 +1343,22 @@ void intel_pmu_pebs_enable(struct perf_event *event)
if (x86_pmu.intel_cap.pebs_baseline) {
hwc->config |= ICL_EVENTSEL_ADAPTIVE;
- if (cpuc->pebs_data_cfg != cpuc->active_pebs_data_cfg) {
- wrmsrl(MSR_PEBS_DATA_CFG, cpuc->pebs_data_cfg);
- cpuc->active_pebs_data_cfg = cpuc->pebs_data_cfg;
+ if (pebs_data_cfg != cpuc->active_pebs_data_cfg) {
+ /*
+ * drain_pebs() assumes uniform record size;
+ * hence we need to drain when changing said
+ * size.
+ */
+ intel_pmu_drain_large_pebs(cpuc);
+ adaptive_pebs_record_size_update();
+ wrmsrl(MSR_PEBS_DATA_CFG, pebs_data_cfg);
+ cpuc->active_pebs_data_cfg = pebs_data_cfg;
}
}
+ if (cpuc->pebs_data_cfg & PEBS_UPDATE_DS_SW) {
+ cpuc->pebs_data_cfg = pebs_data_cfg;
+ pebs_update_threshold(cpuc);
+ }
if (idx >= INTEL_PMC_IDX_FIXED) {
if (x86_pmu.intel_cap.pebs_format < 5)
@@ -1391,9 +1401,7 @@ void intel_pmu_pebs_disable(struct perf_event *event)
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
- if (cpuc->n_pebs == cpuc->n_large_pebs &&
- cpuc->n_pebs != cpuc->n_pebs_via_pt)
- intel_pmu_drain_pebs_buffer();
+ intel_pmu_drain_large_pebs(cpuc);
cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 8fc15ed5e60b..abf09882f58b 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -121,6 +121,9 @@
#define PEBS_DATACFG_LBRS BIT_ULL(3)
#define PEBS_DATACFG_LBR_SHIFT 24
+/* Steal the highest bit of pebs_data_cfg for SW usage */
+#define PEBS_UPDATE_DS_SW BIT_ULL(63)
+
/*
* Intel "Architectural Performance Monitoring" CPUID
* detection/enumeration details:
next prev parent reply other threads:[~2023-04-26 13:18 UTC|newest]
Thread overview: 10+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-04-21 18:45 [PATCH V4 1/2] perf/x86/intel/ds: Flush the PEBS buffer in PEBS enable kan.liang
2023-04-21 18:45 ` [PATCH V4 2/2] perf/x86/intel/ds: Delay the threshold update kan.liang
2023-04-25 7:16 ` kernel test robot
2023-04-25 11:19 ` Peter Zijlstra
2023-04-27 6:26 ` Yujie Liu
2023-05-04 9:11 ` Peter Zijlstra
2023-05-04 10:16 ` Peter Zijlstra
2023-04-26 13:18 ` Peter Zijlstra [this message]
2023-04-26 14:23 ` [PATCH V4 1/2] perf/x86/intel/ds: Flush the PEBS buffer in PEBS enable Liang, Kan
2023-05-10 13:25 ` [tip: perf/urgent] perf/x86/intel/ds: Flush PEBS DS when changing PEBS_DATA_CFG tip-bot2 for Kan Liang
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20230426131812.GA1377058@hirez.programming.kicks-ass.net \
--to=peterz@infradead.org \
--cc=ak@linux.intel.com \
--cc=eranian@google.com \
--cc=kan.liang@linux.intel.com \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox