[PATCH 2/3] KVM: x86: Fix Intel PT Host/Guest mode when host tracing also
From: Adrian Hunter
Date: 2024-09-06 13:00 UTC
To: Sean Christopherson, Paolo Bonzini, Peter Zijlstra
Cc: Zhenyu Wang, kvm, Shuah Khan, linux-kselftest, Ingo Molnar,
Thomas Gleixner, Borislav Petkov, Dave Hansen, x86, H Peter Anvin,
Mark Rutland, Alexander Shishkin, Arnaldo Carvalho de Melo,
Jiri Olsa, Namhyung Kim, Ian Rogers, Kan Liang, linux-kernel,
linux-perf-users
Ensure Intel PT tracing is disabled before VM-Entry in Intel PT Host/Guest
mode.
Intel PT has two modes for tracing virtual machines. The default is System
mode, whereby host and guest both output to the host trace buffer. The other
is Host/Guest mode, whereby host and guest each output to their own buffer.
Host/Guest mode is selected by the kvm_intel module parameter pt_mode=1.
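For reference, kvm_intel interprets the parameter roughly as follows. This is
a simplified sketch, not the verbatim source; PT_MODE_SYSTEM, PT_MODE_HOST_GUEST
and vmx_pt_mode_is_system() are the names KVM uses (the helper appears in the
vmx.c hunks below):

        /* Sketch: how kvm_intel interprets the pt_mode module parameter */
        #define PT_MODE_SYSTEM          0       /* default */
        #define PT_MODE_HOST_GUEST      1

        static bool vmx_pt_mode_is_system(void)
        {
                return pt_mode == PT_MODE_SYSTEM;
        }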
In Host/Guest mode, the following rule must be followed:
If the logical processor is operating with Intel PT enabled
(if IA32_RTIT_CTL.TraceEn = 1) at the time of VM entry, the
"load IA32_RTIT_CTL" VM-entry control must be 0.
However, "load IA32_RTIT_CTL" VM-entry control is always 1 in Host/Guest
mode, so IA32_RTIT_CTL.TraceEn must always be 0 at VM entry, irrespective
of whether guest IA32_RTIT_CTL.TraceEn is 1.
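As an illustration only (not part of this patch): since the "load
IA32_RTIT_CTL" VM-entry control is always set in Host/Guest mode, the SDM rule
reduces to the host-side invariant below. The placement of the check is
hypothetical; the macros are the usual arch/x86 ones:

        u64 ctl;

        /* Host TraceEn must be clear before VMLAUNCH/VMRESUME */
        rdmsrl(MSR_IA32_RTIT_CTL, ctl);
        WARN_ON_ONCE(ctl & RTIT_CTL_TRACEEN);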
Fix by always stopping host Intel PT tracing at VM entry in Host/Guest mode.
That also fixes the issue whereby the Intel PT NMI handler would set
IA32_RTIT_CTL.TraceEn back to 1 after KVM had just set it to 0.
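The resulting VM-entry/VM-exit flow, simplified from the pt_guest_enter() and
pt_guest_exit() hunks below (a sketch of the ordering, not the verbatim code):

        bool guest_trace_en = vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN;

        intel_pt_vm_entry(guest_trace_en);      /* stop host trace, stash event */
        if (guest_trace_en)
                pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);

        /* ... VM-entry, guest runs, VM-exit ... */

        if (guest_trace_en)
                pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
        intel_pt_vm_exit(guest_trace_en);       /* restore host state, restart trace */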
Fixes: 2ef444f1600b ("KVM: x86: Add Intel PT context switch for each vcpu")
Cc: stable@vger.kernel.org
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
---
arch/x86/events/intel/pt.c | 131 +++++++++++++++++++++++++++++++-
arch/x86/events/intel/pt.h | 10 +++
arch/x86/include/asm/intel_pt.h | 4 +
arch/x86/kvm/vmx/vmx.c | 23 ++----
arch/x86/kvm/vmx/vmx.h | 1 -
5 files changed, 147 insertions(+), 22 deletions(-)
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index fd4670a6694e..a4c8feb94040 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -480,16 +480,20 @@ static u64 pt_config_filters(struct perf_event *event)
*/
/* avoid redundant msr writes */
- if (pt->filters.filter[range].msr_a != filter->msr_a) {
+ if (pt->filters.filter[range].msr_a != filter->msr_a ||
+ pt->write_filter_msrs[range]) {
wrmsrl(pt_address_ranges[range].msr_a, filter->msr_a);
pt->filters.filter[range].msr_a = filter->msr_a;
}
- if (pt->filters.filter[range].msr_b != filter->msr_b) {
+ if (pt->filters.filter[range].msr_b != filter->msr_b ||
+ pt->write_filter_msrs[range]) {
wrmsrl(pt_address_ranges[range].msr_b, filter->msr_b);
pt->filters.filter[range].msr_b = filter->msr_b;
}
+ pt->write_filter_msrs[range] = false;
+
rtit_ctl |= (u64)filter->config << pt_address_ranges[range].reg_off;
}
@@ -534,6 +538,11 @@ static void pt_config(struct perf_event *event)
reg |= (event->attr.config & PT_CONFIG_MASK);
event->hw.aux_config = reg;
+
+ /* Configuration is complete, it is now OK to handle an NMI */
+ barrier();
+ WRITE_ONCE(pt->handle_nmi, 1);
+
pt_config_start(event);
}
@@ -945,6 +954,7 @@ static void pt_handle_status(struct pt *pt)
pt_buffer_advance(buf);
wrmsrl(MSR_IA32_RTIT_STATUS, status);
+ pt->status = status;
}
/**
@@ -1583,7 +1593,6 @@ static void pt_event_start(struct perf_event *event, int mode)
goto fail_end_stop;
}
- WRITE_ONCE(pt->handle_nmi, 1);
hwc->state = 0;
pt_config_buffer(buf);
@@ -1638,6 +1647,104 @@ static void pt_event_stop(struct perf_event *event, int mode)
}
}
+#define PT_VM_NO_TRANSITION 0
+#define PT_VM_ENTRY 1
+#define PT_VM_EXIT 2
+
+void intel_pt_vm_entry(bool guest_trace_enable)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ struct perf_event *event;
+
+ pt->restart_event = NULL;
+ pt->stashed_buf_sz = 0;
+
+ WRITE_ONCE(pt->vm_transition, PT_VM_ENTRY);
+ barrier();
+
+ if (READ_ONCE(pt->handle_nmi)) {
+ /* Must stop handler before reading pt->handle.event */
+ WRITE_ONCE(pt->handle_nmi, 0);
+ barrier();
+ event = pt->handle.event;
+ if (event && !event->hw.state) {
+ struct pt_buffer *buf = perf_get_aux(&pt->handle);
+
+ if (buf && buf->snapshot)
+ pt->stashed_buf_sz = buf->nr_pages << PAGE_SHIFT;
+ pt->restart_event = event;
+ pt_event_stop(event, PERF_EF_UPDATE);
+ }
+ }
+
+ /*
+ * If guest_trace_enable, MSRs need to be saved, but the values are
+ * either already cached or not needed:
+ * MSR_IA32_RTIT_CTL event->hw.aux_config
+ * MSR_IA32_RTIT_STATUS pt->status
+ * MSR_IA32_RTIT_CR3_MATCH not used
+ * MSR_IA32_RTIT_OUTPUT_BASE pt->output_base
+ * MSR_IA32_RTIT_OUTPUT_MASK pt->output_mask
+ * MSR_IA32_RTIT_ADDR... pt->filters
+ */
+}
+EXPORT_SYMBOL_GPL(intel_pt_vm_entry);
+
+void intel_pt_vm_exit(bool guest_trace_enable)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ u64 base = pt->output_base;
+ u64 mask = pt->output_mask;
+
+ WRITE_ONCE(pt->vm_transition, PT_VM_EXIT);
+ barrier();
+
+ /*
+ * If guest_trace_enable, MSRs need to be restored, but that is handled
+ * in different ways:
+ * MSR_IA32_RTIT_CTL written next start
+ * MSR_IA32_RTIT_STATUS restored below
+ * MSR_IA32_RTIT_CR3_MATCH not used
+ * MSR_IA32_RTIT_OUTPUT_BASE written next start or restored
+ * further below
+ * MSR_IA32_RTIT_OUTPUT_MASK written next start or restored
+ * further below
+ * MSR_IA32_RTIT_ADDR... flagged to be written when
+ * needed
+ */
+ if (guest_trace_enable) {
+ wrmsrl(MSR_IA32_RTIT_STATUS, pt->status);
+ /*
+ * Force address filter MSR writes during reconfiguration,
+ * refer pt_config_filters().
+ */
+ for (int range = 0; range < PT_FILTERS_NUM; range++)
+ pt->write_filter_msrs[range] = true;
+ }
+
+ if (pt->restart_event) {
+ if (guest_trace_enable) {
+ /* Invalidate to force buffer reconfiguration */
+ pt->output_base = ~0ULL;
+ pt->output_mask = 0;
+ }
+ pt_event_start(pt->restart_event, 0);
+ pt->restart_event = NULL;
+ }
+
+ /* If tracing wasn't started, restore buffer configuration */
+ if (guest_trace_enable && !READ_ONCE(pt->handle_nmi)) {
+ wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base);
+ wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, mask);
+ pt->output_base = base;
+ pt->output_mask = mask;
+ }
+
+ barrier();
+ WRITE_ONCE(pt->vm_transition, PT_VM_NO_TRANSITION);
+}
+EXPORT_SYMBOL_GPL(intel_pt_vm_exit);
+
static long pt_event_snapshot_aux(struct perf_event *event,
struct perf_output_handle *handle,
unsigned long size)
@@ -1646,6 +1753,24 @@ static long pt_event_snapshot_aux(struct perf_event *event,
struct pt_buffer *buf = perf_get_aux(&pt->handle);
unsigned long from = 0, to;
long ret;
+ int tr;
+
+ /*
+ * Special handling during VM transition. At VM-Entry stage, once
+ * tracing is stopped, as indicated by buf == NULL, snapshot using the
+ * saved head position. At VM-Exit, do the same until tracing has been
+ * reconfigured, as indicated by handle_nmi.
+ */
+ tr = READ_ONCE(pt->vm_transition);
+ if ((tr == PT_VM_ENTRY && !buf) || (tr == PT_VM_EXIT && !READ_ONCE(pt->handle_nmi))) {
+ if (WARN_ON_ONCE(!pt->stashed_buf_sz))
+ return 0;
+ to = pt->handle.head;
+ if (to < size)
+ from = pt->stashed_buf_sz;
+ from += to - size;
+ return perf_output_copy_aux(&pt->handle, handle, from, to);
+ }
if (WARN_ON_ONCE(!buf))
return 0;
diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h
index f5e46c04c145..ecaaf112b923 100644
--- a/arch/x86/events/intel/pt.h
+++ b/arch/x86/events/intel/pt.h
@@ -119,6 +119,11 @@ struct pt_filters {
* @vmx_on: 1 if VMX is ON on this cpu
* @output_base: cached RTIT_OUTPUT_BASE MSR value
* @output_mask: cached RTIT_OUTPUT_MASK MSR value
+ * @status: cached RTIT_STATUS MSR value
+ * @vm_transition: VM transition (snapshot_aux needs special handling)
+ * @write_filter_msrs: write address filter MSRs during configuration
+ * @stashed_buf_sz: buffer size during VM transition
+ * @restart_event: event to restart after VM-Exit
*/
struct pt {
struct perf_output_handle handle;
@@ -127,6 +132,11 @@ struct pt {
int vmx_on;
u64 output_base;
u64 output_mask;
+ u64 status;
+ int vm_transition;
+ bool write_filter_msrs[PT_FILTERS_NUM];
+ unsigned long stashed_buf_sz;
+ struct perf_event *restart_event;
};
#endif /* __INTEL_PT_H__ */
diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h
index c796e9bc98b6..a673ac3a825e 100644
--- a/arch/x86/include/asm/intel_pt.h
+++ b/arch/x86/include/asm/intel_pt.h
@@ -30,11 +30,15 @@ enum pt_capabilities {
void cpu_emergency_stop_pt(void);
extern u32 intel_pt_validate_hw_cap(enum pt_capabilities cap);
extern u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities cap);
+extern void intel_pt_vm_entry(bool guest_trace_enable);
+extern void intel_pt_vm_exit(bool guest_trace_enable);
extern int is_intel_pt_event(struct perf_event *event);
#else
static inline void cpu_emergency_stop_pt(void) {}
static inline u32 intel_pt_validate_hw_cap(enum pt_capabilities cap) { return 0; }
static inline u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities capability) { return 0; }
+static inline void intel_pt_vm_entry(bool guest_trace_enable) {}
+static inline void intel_pt_vm_exit(bool guest_trace_enable) {}
static inline int is_intel_pt_event(struct perf_event *event) { return 0; }
#endif
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 3f1e3be552c0..d20458d83829 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1224,16 +1224,10 @@ static void pt_guest_enter(struct vcpu_vmx *vmx)
if (vmx_pt_mode_is_system())
return;
- /*
- * GUEST_IA32_RTIT_CTL is already set in the VMCS.
- * Save host state before VM entry.
- */
- rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
- if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
- wrmsrl(MSR_IA32_RTIT_CTL, 0);
- pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
+ intel_pt_vm_entry(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
+
+ if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN)
pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
- }
}
static void pt_guest_exit(struct vcpu_vmx *vmx)
@@ -1241,17 +1235,10 @@ static void pt_guest_exit(struct vcpu_vmx *vmx)
if (vmx_pt_mode_is_system())
return;
- if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
+ if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN)
pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
- pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
- }
- /*
- * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest,
- * i.e. RTIT_CTL is always cleared on VM-Exit. Restore it if necessary.
- */
- if (vmx->pt_desc.host.ctl)
- wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
+ intel_pt_vm_exit(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
}
void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 42498fa63abb..e1616282d97f 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -67,7 +67,6 @@ struct pt_desc {
u64 ctl_bitmask;
u32 num_address_ranges;
u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
- struct pt_ctx host;
struct pt_ctx guest;
};
--
2.43.0
[PATCH 3/3] KVM: selftests: Add guest Intel PT test
From: Adrian Hunter
Date: 2024-09-06 13:00 UTC
To: Sean Christopherson, Paolo Bonzini, Peter Zijlstra
Cc: Zhenyu Wang, kvm, Shuah Khan, linux-kselftest, Ingo Molnar,
Thomas Gleixner, Borislav Petkov, Dave Hansen, x86, H Peter Anvin,
Mark Rutland, Alexander Shishkin, Arnaldo Carvalho de Melo,
Jiri Olsa, Namhyung Kim, Ian Rogers, Kan Liang, linux-kernel,
linux-perf-users
Add a test that starts Intel PT tracing on both the host and the guest. The
test requires Intel PT support and Host/Guest mode enabled, i.e. the
kvm_intel module parameter pt_mode=1.
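Not part of this patch: before running the test it can help to confirm the
mode from userspace. A minimal sketch, assuming kvm_intel is loaded and
exposes the parameter at the usual sysfs path:

        #include <stdio.h>

        /* Return 1 if kvm_intel is in Host/Guest mode (pt_mode=1) */
        static int pt_host_guest_mode(void)
        {
                FILE *f = fopen("/sys/module/kvm_intel/parameters/pt_mode", "r");
                int mode = 0;

                if (f) {
                        if (fscanf(f, "%d", &mode) != 1)
                                mode = 0;
                        fclose(f);
                }
                return mode == 1;
        }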
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
---
tools/testing/selftests/kvm/Makefile | 1 +
.../selftests/kvm/include/x86_64/processor.h | 1 +
tools/testing/selftests/kvm/x86_64/intel_pt.c | 381 ++++++++++++++++++
3 files changed, 383 insertions(+)
create mode 100644 tools/testing/selftests/kvm/x86_64/intel_pt.c
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 48d32c5aa3eb..0722c5c384cc 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -79,6 +79,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/hyperv_features
TEST_GEN_PROGS_x86_64 += x86_64/hyperv_ipi
TEST_GEN_PROGS_x86_64 += x86_64/hyperv_svm_test
TEST_GEN_PROGS_x86_64 += x86_64/hyperv_tlb_flush
+TEST_GEN_PROGS_x86_64 += x86_64/intel_pt
TEST_GEN_PROGS_x86_64 += x86_64/kvm_clock_test
TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test
TEST_GEN_PROGS_x86_64 += x86_64/monitor_mwait_test
diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
index a0c1440017bb..87f98d342e79 100644
--- a/tools/testing/selftests/kvm/include/x86_64/processor.h
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
@@ -157,6 +157,7 @@ struct kvm_x86_cpu_feature {
#define X86_FEATURE_PCOMMIT KVM_X86_CPU_FEATURE(0x7, 0, EBX, 22)
#define X86_FEATURE_CLFLUSHOPT KVM_X86_CPU_FEATURE(0x7, 0, EBX, 23)
#define X86_FEATURE_CLWB KVM_X86_CPU_FEATURE(0x7, 0, EBX, 24)
+#define X86_FEATURE_INTEL_PT KVM_X86_CPU_FEATURE(0x7, 0, EBX, 25)
#define X86_FEATURE_UMIP KVM_X86_CPU_FEATURE(0x7, 0, ECX, 2)
#define X86_FEATURE_PKU KVM_X86_CPU_FEATURE(0x7, 0, ECX, 3)
#define X86_FEATURE_OSPKE KVM_X86_CPU_FEATURE(0x7, 0, ECX, 4)
diff --git a/tools/testing/selftests/kvm/x86_64/intel_pt.c b/tools/testing/selftests/kvm/x86_64/intel_pt.c
new file mode 100644
index 000000000000..94753b12936e
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/intel_pt.c
@@ -0,0 +1,381 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KVM guest Intel PT test
+ *
+ * Copyright (C) 2024, Intel Corporation.
+ */
+#include <linux/sizes.h>
+#include <linux/types.h>
+#include <linux/bitops.h>
+#include <linux/perf_event.h>
+
+#include <sched.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <sys/syscall.h>
+
+#include "kvm_util.h"
+#include "test_util.h"
+#include "processor.h"
+#include "ucall_common.h"
+
+#define MEM_GPA SZ_256M
+/* Set PT_NR_PAGES to 1 to avoid single range errata on some processors */
+#define PT_NR_PAGES 1
+
+#define PT_CPUID_LEAVES 2
+#define PT_CPUID_REGS_NUM 4 /* number of registers (eax, ebx, ecx, edx) */
+
+/* Capability-related code is from the Kernel Intel PT driver */
+enum pt_capabilities {
+ PT_CAP_max_subleaf = 0,
+ PT_CAP_cr3_filtering,
+ PT_CAP_psb_cyc,
+ PT_CAP_ip_filtering,
+ PT_CAP_mtc,
+ PT_CAP_ptwrite,
+ PT_CAP_power_event_trace,
+ PT_CAP_event_trace,
+ PT_CAP_tnt_disable,
+ PT_CAP_topa_output,
+ PT_CAP_topa_multiple_entries,
+ PT_CAP_single_range_output,
+ PT_CAP_output_subsys,
+ PT_CAP_payloads_lip,
+ PT_CAP_num_address_ranges,
+ PT_CAP_mtc_periods,
+ PT_CAP_cycle_thresholds,
+ PT_CAP_psb_periods,
+};
+
+#define PT_CAP(_n, _l, _r, _m) \
+ [PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l, \
+ .reg = KVM_ ## _r, .mask = _m }
+
+static struct pt_cap_desc {
+ const char *name;
+ u32 leaf;
+ u8 reg;
+ u32 mask;
+} pt_caps[] = {
+ PT_CAP(max_subleaf, 0, CPUID_EAX, 0xffffffff),
+ PT_CAP(cr3_filtering, 0, CPUID_EBX, BIT(0)),
+ PT_CAP(psb_cyc, 0, CPUID_EBX, BIT(1)),
+ PT_CAP(ip_filtering, 0, CPUID_EBX, BIT(2)),
+ PT_CAP(mtc, 0, CPUID_EBX, BIT(3)),
+ PT_CAP(ptwrite, 0, CPUID_EBX, BIT(4)),
+ PT_CAP(power_event_trace, 0, CPUID_EBX, BIT(5)),
+ PT_CAP(event_trace, 0, CPUID_EBX, BIT(7)),
+ PT_CAP(tnt_disable, 0, CPUID_EBX, BIT(8)),
+ PT_CAP(topa_output, 0, CPUID_ECX, BIT(0)),
+ PT_CAP(topa_multiple_entries, 0, CPUID_ECX, BIT(1)),
+ PT_CAP(single_range_output, 0, CPUID_ECX, BIT(2)),
+ PT_CAP(output_subsys, 0, CPUID_ECX, BIT(3)),
+ PT_CAP(payloads_lip, 0, CPUID_ECX, BIT(31)),
+ PT_CAP(num_address_ranges, 1, CPUID_EAX, 0x7),
+ PT_CAP(mtc_periods, 1, CPUID_EAX, 0xffff0000),
+ PT_CAP(cycle_thresholds, 1, CPUID_EBX, 0xffff),
+ PT_CAP(psb_periods, 1, CPUID_EBX, 0xffff0000),
+};
+
+static u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities capability)
+{
+ struct pt_cap_desc *cd = &pt_caps[capability];
+ u32 c = caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg];
+ unsigned int shift = __ffs(cd->mask);
+
+ return (c & cd->mask) >> shift;
+}
+
+static int calc_psb_freq(u32 *caps, u64 *psb_freq)
+{
+ u64 allowed;
+
+ if (!(intel_pt_validate_cap(caps, PT_CAP_psb_cyc)))
+ return 0; /* PSBFreq not supported */
+
+ allowed = intel_pt_validate_cap(caps, PT_CAP_psb_periods);
+ if (!allowed)
+ return -1;
+
+ /* Select biggest period */
+ *psb_freq = __fls(allowed) << RTIT_CTL_PSB_FREQ_OFFSET;
+
+ return 0;
+}
+
+static u64 guest_psb_freq(u32 *caps)
+{
+ u64 psb_freq = 0;
+
+ GUEST_ASSERT(!calc_psb_freq(caps, &psb_freq));
+
+ return psb_freq;
+}
+
+static u64 host_psb_freq(u32 *caps)
+{
+ u64 psb_freq = 0;
+
+ TEST_ASSERT(!calc_psb_freq(caps, &psb_freq), "No valid PSBFreq");
+
+ return psb_freq;
+}
+
+static void read_caps(u32 *caps)
+{
+ for (int i = 0; i < PT_CPUID_LEAVES; i++) {
+ __cpuid(0x14, i,
+ &caps[KVM_CPUID_EAX + i * PT_CPUID_REGS_NUM],
+ &caps[KVM_CPUID_EBX + i * PT_CPUID_REGS_NUM],
+ &caps[KVM_CPUID_ECX + i * PT_CPUID_REGS_NUM],
+ &caps[KVM_CPUID_EDX + i * PT_CPUID_REGS_NUM]);
+ }
+}
+
+static void guest_code(void)
+{
+ u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
+ u64 status;
+
+ GUEST_ASSERT(this_cpu_has(X86_FEATURE_INTEL_PT));
+
+ read_caps(caps);
+
+ /* Config PT buffer */
+ wrmsr(MSR_IA32_RTIT_OUTPUT_MASK, PT_NR_PAGES * PAGE_SIZE - 1);
+ wrmsr(MSR_IA32_RTIT_OUTPUT_BASE, MEM_GPA);
+
+ /* Start tracing */
+ wrmsr(MSR_IA32_RTIT_CTL, RTIT_CTL_TRACEEN | RTIT_CTL_OS | RTIT_CTL_USR | RTIT_CTL_TSC_EN |
+ RTIT_CTL_BRANCH_EN | guest_psb_freq(caps));
+
+ GUEST_ASSERT(rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN);
+
+ /*
+ * Test repeated VM-Exit / VM-Entry. PAGE_SIZE to match aux_watermark,
+ * refer to the handling of UCALL_SYNC.
+ */
+ for (int i = 0; i < PAGE_SIZE; i++)
+ GUEST_SYNC(i);
+
+ /* Stop tracing */
+ wrmsr(MSR_IA32_RTIT_CTL, 0);
+
+ status = rdmsr(MSR_IA32_RTIT_STATUS);
+
+ GUEST_ASSERT(!(status & (RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED)));
+
+ GUEST_DONE();
+}
+
+static long perf_event_open(struct perf_event_attr *attr, pid_t pid, int cpu,
+ int group_fd, unsigned long flags)
+{
+ return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
+}
+
+static int read_sysfs(const char *file_path, unsigned int *val)
+{
+ FILE *f = fopen(file_path, "r");
+ int ret;
+
+ if (!f)
+ return -1;
+
+ ret = fscanf(f, "%u", val);
+
+ fclose(f);
+
+ return ret == 1 ? 0 : -1;
+}
+
+#define PT_CONFIG_PASS_THRU 1
+
+static int do_open_pt(u32 *caps, unsigned int type)
+{
+ struct perf_event_attr attr = {
+ .size = sizeof(attr),
+ .type = type,
+ .config = PT_CONFIG_PASS_THRU | RTIT_CTL_BRANCH_EN | host_psb_freq(caps),
+ .sample_period = 1,
+ .sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_CPU |
+ PERF_SAMPLE_TIME | PERF_SAMPLE_IDENTIFIER,
+ .exclude_kernel = 1,
+ .exclude_user = 0,
+ .exclude_hv = 1,
+ .sample_id_all = 1,
+ .exclude_guest = 1,
+ .aux_watermark = PAGE_SIZE,
+ };
+
+ return perf_event_open(&attr, 0, -1, -1, 0);
+}
+
+static int open_pt(u32 *caps)
+{
+ unsigned int type;
+ int err;
+
+ err = read_sysfs("/sys/bus/event_source/devices/intel_pt/type", &type);
+ if (err)
+ return -1;
+
+ return do_open_pt(caps, type);
+}
+
+#define PERF_HOST_BUF_SZ (4 * PAGE_SIZE)
+#define PERF_HOST_MMAP_SZ (PERF_HOST_BUF_SZ + PAGE_SIZE)
+#define PT_HOST_BUF_SZ (2 * PAGE_SIZE)
+
+struct perf_info {
+ int fd;
+ void *perf_buf;
+ void *pt_buf;
+};
+
+static int perf_open(struct perf_info *pi)
+{
+ u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
+ struct perf_event_mmap_page *pc;
+
+ read_caps(caps);
+
+ pi->fd = open_pt(caps);
+ if (pi->fd < 0)
+ goto out_err;
+
+ /* mmap host buffer and user page */
+ pi->perf_buf = mmap(NULL, PERF_HOST_MMAP_SZ, PROT_READ | PROT_WRITE,
+ MAP_SHARED, pi->fd, 0);
+ if (pi->perf_buf == MAP_FAILED)
+ goto out_close;
+
+ pc = pi->perf_buf;
+ pc->aux_offset = PERF_HOST_MMAP_SZ;
+ pc->aux_size = PT_HOST_BUF_SZ;
+
+ /* mmap pt buffer */
+ pi->pt_buf = mmap(NULL, PT_HOST_BUF_SZ, PROT_READ | PROT_WRITE,
+ MAP_SHARED, pi->fd, PERF_HOST_MMAP_SZ);
+ if (pi->pt_buf == MAP_FAILED)
+ goto out_munmap;
+
+ return 0;
+
+out_munmap:
+ munmap(pi->perf_buf, PERF_HOST_MMAP_SZ);
+out_close:
+ close(pi->fd);
+ pi->fd = -1;
+out_err:
+ TEST_FAIL("Failed to start Intel PT tracing on host");
+ return -1;
+}
+
+static void perf_close(struct perf_info *pi)
+{
+ if (pi->fd < 0)
+ return;
+
+ munmap(pi->pt_buf, PT_HOST_BUF_SZ);
+ munmap(pi->perf_buf, PERF_HOST_MMAP_SZ);
+ close(pi->fd);
+}
+
+static void perf_forward(struct perf_info *pi)
+{
+ volatile struct perf_event_mmap_page *pc = pi->perf_buf;
+
+ if (pi->fd < 0)
+ return;
+
+ /* Must stop to ensure aux_head is up to date */
+ ioctl(pi->fd, PERF_EVENT_IOC_DISABLE, 0);
+
+ /* Discard all trace data */
+ pc->data_tail = pc->data_head;
+ pc->aux_tail = pc->aux_head;
+
+ /* Start after setting aux_tail */
+ ioctl(pi->fd, PERF_EVENT_IOC_ENABLE, 0);
+}
+
+/* Use volatile to discourage the compiler from unrolling the loop */
+volatile int loop_spin;
+
+static void run_vcpu(struct kvm_vcpu *vcpu, struct perf_info *pi)
+{
+ bool done = false;
+ struct ucall uc;
+
+ while (!done) {
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_PRINTF:
+ pr_info("%s", uc.buffer);
+ break;
+ case UCALL_SYNC:
+ /*
+ * Empty the buffer and spin to add trace data in
+ * ever-increasing amounts, which makes the host PMI
+ * more likely to happen somewhere sensitive prior to
+ * VM-Entry.
+ */
+ perf_forward(pi);
+ for (int cnt = 0; cnt < uc.args[1]; cnt++)
+ for (loop_spin = 0; loop_spin < 5; loop_spin++)
+ cpu_relax();
+ break;
+ case UCALL_DONE:
+ done = true;
+ break;
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ break;
+ default:
+ TEST_FAIL("Unknown ucall %lu exit reason: %s",
+ uc.cmd, exit_reason_str(vcpu->run->exit_reason));
+ break;
+ }
+ }
+}
+
+#define PT_CAP_SINGLE_RANGE_OUTPUT \
+ KVM_X86_CPU_FEATURE(0x14, 0, ECX, 2)
+
+int main(int argc, char *argv[])
+{
+ struct perf_info pi = {.fd = -1};
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+
+ /*
+ * Guest X86_FEATURE_INTEL_PT depends on Intel PT support and kvm_intel
+ * module parameter pt_mode=1.
+ */
+ TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_INTEL_PT));
+
+ /*
+ * Only using single-range for now. Currently only BDW does not support it, but
+ * BDW also doesn't support PT in VMX operation anyway.
+ */
+ TEST_REQUIRE(vcpu_cpuid_has(vcpu, PT_CAP_SINGLE_RANGE_OUTPUT));
+
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, MEM_GPA, 1, PT_NR_PAGES, 0);
+
+ perf_open(&pi);
+
+ run_vcpu(vcpu, &pi);
+
+ perf_close(&pi);
+
+ kvm_vm_free(vm);
+
+ return 0;
+}
--
2.43.0