public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/5] KVM: selftests: Fix PMC checks in PMU counters test
@ 2025-01-17 23:41 Sean Christopherson
  2025-01-17 23:41 ` [PATCH 1/5] KVM: selftests: Make Intel arch events globally available " Sean Christopherson
                   ` (6 more replies)
  0 siblings, 7 replies; 14+ messages in thread
From: Sean Christopherson @ 2025-01-17 23:41 UTC (permalink / raw)
  To: Sean Christopherson, Paolo Bonzini; +Cc: kvm, linux-kernel, kernel test robot

Fix a flaw in the Intel PMU counters test where it asserts that an event is
counting correctly without actually knowing what the event counts given the
underlying hardware.

The bug manifests as failures with the Top-Down Slots architectural event
when running CPUs that doesn't actually support that arch event (pre-ICX).
The arch event encoding still counts _something_, just not Top-Down Slots
(I haven't bothered to look up what it was counting).  The passed by sheer
dumb luck until an unrelated change caused the count of the unknown event
to drop.

All credit to Dapeng for hunting down the problem.

Sean Christopherson (5):
  KVM: selftests: Make Intel arch events globally available in PMU
    counters test
  KVM: selftests: Only validate counts for hardware-supported arch
    events
  KVM: selftests: Remove dead code in Intel PMU counters test
  KVM: selftests: Drop the "feature event" param from guest test helpers
  KVM: selftests: Print out the actual Top-Down Slots count on failure

 .../selftests/kvm/x86/pmu_counters_test.c     | 143 ++++++++++--------
 1 file changed, 83 insertions(+), 60 deletions(-)


base-commit: eb723766b1030a23c38adf2348b7c3d1409d11f0
-- 
2.48.0.rc2.279.g1de40edade-goog


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 1/5] KVM: selftests: Make Intel arch events globally available in PMU counters test
  2025-01-17 23:41 [PATCH 0/5] KVM: selftests: Fix PMC checks in PMU counters test Sean Christopherson
@ 2025-01-17 23:41 ` Sean Christopherson
  2025-01-17 23:42 ` [PATCH 2/5] KVM: selftests: Only validate counts for hardware-supported arch events Sean Christopherson
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 14+ messages in thread
From: Sean Christopherson @ 2025-01-17 23:41 UTC (permalink / raw)
  To: Sean Christopherson, Paolo Bonzini; +Cc: kvm, linux-kernel, kernel test robot

Wrap PMU counter test's array of Intel architectrual in a helper function
so that the events can be queried in multiple locations.  Add a comment to
explain the need for a wrapper.

No functional change intended.

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 .../selftests/kvm/x86/pmu_counters_test.c     | 84 +++++++++++--------
 1 file changed, 49 insertions(+), 35 deletions(-)

diff --git a/tools/testing/selftests/kvm/x86/pmu_counters_test.c b/tools/testing/selftests/kvm/x86/pmu_counters_test.c
index accd7ecd3e5f..fe7d72fc8a75 100644
--- a/tools/testing/selftests/kvm/x86/pmu_counters_test.c
+++ b/tools/testing/selftests/kvm/x86/pmu_counters_test.c
@@ -33,6 +33,53 @@
 static uint8_t kvm_pmu_version;
 static bool kvm_has_perf_caps;
 
+#define X86_PMU_FEATURE_NULL						\
+({									\
+	struct kvm_x86_pmu_feature feature = {};			\
+									\
+	feature;							\
+})
+
+static bool pmu_is_null_feature(struct kvm_x86_pmu_feature event)
+{
+	return !(*(u64 *)&event);
+}
+
+struct kvm_intel_pmu_event {
+	struct kvm_x86_pmu_feature gp_event;
+	struct kvm_x86_pmu_feature fixed_event;
+};
+
+/*
+ * Wrap the array to appease the compiler, as the macros used to construct each
+ * kvm_x86_pmu_feature use syntax that's only valid in function scope, and the
+ * compiler often thinks the feature definitions aren't compile-time constants.
+ */
+static struct kvm_intel_pmu_event intel_event_to_feature(uint8_t idx)
+{
+	const struct kvm_intel_pmu_event __intel_event_to_feature[] = {
+		[INTEL_ARCH_CPU_CYCLES_INDEX]		 = { X86_PMU_FEATURE_CPU_CYCLES, X86_PMU_FEATURE_CPU_CYCLES_FIXED },
+		[INTEL_ARCH_INSTRUCTIONS_RETIRED_INDEX]	 = { X86_PMU_FEATURE_INSNS_RETIRED, X86_PMU_FEATURE_INSNS_RETIRED_FIXED },
+		/*
+		 * Note, the fixed counter for reference cycles is NOT the same as the
+		 * general purpose architectural event.  The fixed counter explicitly
+		 * counts at the same frequency as the TSC, whereas the GP event counts
+		 * at a fixed, but uarch specific, frequency.  Bundle them here for
+		 * simplicity.
+		 */
+		[INTEL_ARCH_REFERENCE_CYCLES_INDEX]	 = { X86_PMU_FEATURE_REFERENCE_CYCLES, X86_PMU_FEATURE_REFERENCE_TSC_CYCLES_FIXED },
+		[INTEL_ARCH_LLC_REFERENCES_INDEX]	 = { X86_PMU_FEATURE_LLC_REFERENCES, X86_PMU_FEATURE_NULL },
+		[INTEL_ARCH_LLC_MISSES_INDEX]		 = { X86_PMU_FEATURE_LLC_MISSES, X86_PMU_FEATURE_NULL },
+		[INTEL_ARCH_BRANCHES_RETIRED_INDEX]	 = { X86_PMU_FEATURE_BRANCH_INSNS_RETIRED, X86_PMU_FEATURE_NULL },
+		[INTEL_ARCH_BRANCHES_MISPREDICTED_INDEX] = { X86_PMU_FEATURE_BRANCHES_MISPREDICTED, X86_PMU_FEATURE_NULL },
+		[INTEL_ARCH_TOPDOWN_SLOTS_INDEX]	 = { X86_PMU_FEATURE_TOPDOWN_SLOTS, X86_PMU_FEATURE_TOPDOWN_SLOTS_FIXED },
+	};
+
+	kvm_static_assert(ARRAY_SIZE(__intel_event_to_feature) == NR_INTEL_ARCH_EVENTS);
+
+	return __intel_event_to_feature[idx];
+}
+
 static struct kvm_vm *pmu_vm_create_with_one_vcpu(struct kvm_vcpu **vcpu,
 						  void *guest_code,
 						  uint8_t pmu_version,
@@ -197,41 +244,8 @@ static void __guest_test_arch_event(uint8_t idx, struct kvm_x86_pmu_feature even
 		GUEST_TEST_EVENT(idx, event, pmc, pmc_msr, ctrl_msr, ctrl_msr_value, KVM_FEP);
 }
 
-#define X86_PMU_FEATURE_NULL						\
-({									\
-	struct kvm_x86_pmu_feature feature = {};			\
-									\
-	feature;							\
-})
-
-static bool pmu_is_null_feature(struct kvm_x86_pmu_feature event)
-{
-	return !(*(u64 *)&event);
-}
-
 static void guest_test_arch_event(uint8_t idx)
 {
-	const struct {
-		struct kvm_x86_pmu_feature gp_event;
-		struct kvm_x86_pmu_feature fixed_event;
-	} intel_event_to_feature[] = {
-		[INTEL_ARCH_CPU_CYCLES_INDEX]		 = { X86_PMU_FEATURE_CPU_CYCLES, X86_PMU_FEATURE_CPU_CYCLES_FIXED },
-		[INTEL_ARCH_INSTRUCTIONS_RETIRED_INDEX]	 = { X86_PMU_FEATURE_INSNS_RETIRED, X86_PMU_FEATURE_INSNS_RETIRED_FIXED },
-		/*
-		 * Note, the fixed counter for reference cycles is NOT the same
-		 * as the general purpose architectural event.  The fixed counter
-		 * explicitly counts at the same frequency as the TSC, whereas
-		 * the GP event counts at a fixed, but uarch specific, frequency.
-		 * Bundle them here for simplicity.
-		 */
-		[INTEL_ARCH_REFERENCE_CYCLES_INDEX]	 = { X86_PMU_FEATURE_REFERENCE_CYCLES, X86_PMU_FEATURE_REFERENCE_TSC_CYCLES_FIXED },
-		[INTEL_ARCH_LLC_REFERENCES_INDEX]	 = { X86_PMU_FEATURE_LLC_REFERENCES, X86_PMU_FEATURE_NULL },
-		[INTEL_ARCH_LLC_MISSES_INDEX]		 = { X86_PMU_FEATURE_LLC_MISSES, X86_PMU_FEATURE_NULL },
-		[INTEL_ARCH_BRANCHES_RETIRED_INDEX]	 = { X86_PMU_FEATURE_BRANCH_INSNS_RETIRED, X86_PMU_FEATURE_NULL },
-		[INTEL_ARCH_BRANCHES_MISPREDICTED_INDEX] = { X86_PMU_FEATURE_BRANCHES_MISPREDICTED, X86_PMU_FEATURE_NULL },
-		[INTEL_ARCH_TOPDOWN_SLOTS_INDEX]	 = { X86_PMU_FEATURE_TOPDOWN_SLOTS, X86_PMU_FEATURE_TOPDOWN_SLOTS_FIXED },
-	};
-
 	uint32_t nr_gp_counters = this_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS);
 	uint32_t pmu_version = guest_get_pmu_version();
 	/* PERF_GLOBAL_CTRL exists only for Architectural PMU Version 2+. */
@@ -249,7 +263,7 @@ static void guest_test_arch_event(uint8_t idx)
 	else
 		base_pmc_msr = MSR_IA32_PERFCTR0;
 
-	gp_event = intel_event_to_feature[idx].gp_event;
+	gp_event = intel_event_to_feature(idx).gp_event;
 	GUEST_ASSERT_EQ(idx, gp_event.f.bit);
 
 	GUEST_ASSERT(nr_gp_counters);
@@ -270,7 +284,7 @@ static void guest_test_arch_event(uint8_t idx)
 	if (!guest_has_perf_global_ctrl)
 		return;
 
-	fixed_event = intel_event_to_feature[idx].fixed_event;
+	fixed_event = intel_event_to_feature(idx).fixed_event;
 	if (pmu_is_null_feature(fixed_event) || !this_pmu_has(fixed_event))
 		return;
 
-- 
2.48.0.rc2.279.g1de40edade-goog


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 2/5] KVM: selftests: Only validate counts for hardware-supported arch events
  2025-01-17 23:41 [PATCH 0/5] KVM: selftests: Fix PMC checks in PMU counters test Sean Christopherson
  2025-01-17 23:41 ` [PATCH 1/5] KVM: selftests: Make Intel arch events globally available " Sean Christopherson
@ 2025-01-17 23:42 ` Sean Christopherson
  2025-01-18  0:06   ` Mingwei Zhang
  2025-01-17 23:42 ` [PATCH 3/5] KVM: selftests: Remove dead code in Intel PMU counters test Sean Christopherson
                   ` (4 subsequent siblings)
  6 siblings, 1 reply; 14+ messages in thread
From: Sean Christopherson @ 2025-01-17 23:42 UTC (permalink / raw)
  To: Sean Christopherson, Paolo Bonzini; +Cc: kvm, linux-kernel, kernel test robot

In the Intel PMU counters test, only validate the counts for architectural
events that are supported in hardware.  If an arch event isn't supported,
the event selector may enable a completely different event, and thus the
logic for the expected count is bogus.

This fixes test failures on pre-Icelake systems due to the encoding for
the architectural Top-Down Slots event corresponding to something else
(at least on the Skylake family of CPUs).

Note, validation relies on *hardware* support, not KVM support and not
guest support.  Architectural events are all about enumerating the event
selector encoding; lack of enumeration for an architectural event doesn't
mean the event itself is unsupported, i.e. the event should still count as
expected even if KVM and/or guest CPUID doesn't enumerate the event as
being "architectural".

Note #2, it's desirable to _program_ the architectural event encoding even
if hardware doesn't support the event.  The count can't be validated when
the event is fully enabled, but KVM should still let the guest program the
event selector, and the PMC shouldn't count if the event is disabled.

Fixes: 4f1bd6b16074 ("KVM: selftests: Test Intel PMU architectural events on gp counters")
Reported-by: kernel test robot <oliver.sang@intel.com>
Closes: https://lore.kernel.org/oe-lkp/202501141009.30c629b4-lkp@intel.com
Debugged-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 .../selftests/kvm/x86/pmu_counters_test.c     | 25 +++++++++++++------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/tools/testing/selftests/kvm/x86/pmu_counters_test.c b/tools/testing/selftests/kvm/x86/pmu_counters_test.c
index fe7d72fc8a75..8159615ad492 100644
--- a/tools/testing/selftests/kvm/x86/pmu_counters_test.c
+++ b/tools/testing/selftests/kvm/x86/pmu_counters_test.c
@@ -29,6 +29,8 @@
 /* Total number of instructions retired within the measured section. */
 #define NUM_INSNS_RETIRED		(NUM_LOOPS * NUM_INSNS_PER_LOOP + NUM_EXTRA_INSNS)
 
+/* Track which architectural events are supported by hardware. */
+static uint32_t hardware_pmu_arch_events;
 
 static uint8_t kvm_pmu_version;
 static bool kvm_has_perf_caps;
@@ -89,6 +91,7 @@ static struct kvm_vm *pmu_vm_create_with_one_vcpu(struct kvm_vcpu **vcpu,
 
 	vm = vm_create_with_one_vcpu(vcpu, guest_code);
 	sync_global_to_guest(vm, kvm_pmu_version);
+	sync_global_to_guest(vm, hardware_pmu_arch_events);
 
 	/*
 	 * Set PERF_CAPABILITIES before PMU version as KVM disallows enabling
@@ -152,7 +155,7 @@ static void guest_assert_event_count(uint8_t idx,
 	uint64_t count;
 
 	count = _rdpmc(pmc);
-	if (!this_pmu_has(event))
+	if (!(hardware_pmu_arch_events & BIT(idx)))
 		goto sanity_checks;
 
 	switch (idx) {
@@ -560,7 +563,7 @@ static void test_fixed_counters(uint8_t pmu_version, uint64_t perf_capabilities,
 
 static void test_intel_counters(void)
 {
-	uint8_t nr_arch_events = kvm_cpu_property(X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH);
+	uint8_t nr_arch_events = this_cpu_property(X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH);
 	uint8_t nr_fixed_counters = kvm_cpu_property(X86_PROPERTY_PMU_NR_FIXED_COUNTERS);
 	uint8_t nr_gp_counters = kvm_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS);
 	uint8_t pmu_version = kvm_cpu_property(X86_PROPERTY_PMU_VERSION);
@@ -582,18 +585,26 @@ static void test_intel_counters(void)
 
 	/*
 	 * Detect the existence of events that aren't supported by selftests.
-	 * This will (obviously) fail any time the kernel adds support for a
-	 * new event, but it's worth paying that price to keep the test fresh.
+	 * This will (obviously) fail any time hardware adds support for a new
+	 * event, but it's worth paying that price to keep the test fresh.
 	 */
 	TEST_ASSERT(nr_arch_events <= NR_INTEL_ARCH_EVENTS,
 		    "New architectural event(s) detected; please update this test (length = %u, mask = %x)",
-		    nr_arch_events, kvm_cpu_property(X86_PROPERTY_PMU_EVENTS_MASK));
+		    nr_arch_events, this_cpu_property(X86_PROPERTY_PMU_EVENTS_MASK));
 
 	/*
-	 * Force iterating over known arch events regardless of whether or not
-	 * KVM/hardware supports a given event.
+	 * Iterate over known arch events irrespective of KVM/hardware support
+	 * to verify that KVM doesn't reject programming of events just because
+	 * the *architectural* encoding is unsupported.  Track which events are
+	 * supported in hardware; the guest side will validate supported events
+	 * count correctly, even if *enumeration* of the event is unsupported
+	 * by KVM and/or isn't exposed to the guest.
 	 */
 	nr_arch_events = max_t(typeof(nr_arch_events), nr_arch_events, NR_INTEL_ARCH_EVENTS);
+	for (i = 0; i < nr_arch_events; i++) {
+		if (this_pmu_has(intel_event_to_feature(i).gp_event))
+			hardware_pmu_arch_events |= BIT(i);
+	}
 
 	for (v = 0; v <= max_pmu_version; v++) {
 		for (i = 0; i < ARRAY_SIZE(perf_caps); i++) {
-- 
2.48.0.rc2.279.g1de40edade-goog


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 3/5] KVM: selftests: Remove dead code in Intel PMU counters test
  2025-01-17 23:41 [PATCH 0/5] KVM: selftests: Fix PMC checks in PMU counters test Sean Christopherson
  2025-01-17 23:41 ` [PATCH 1/5] KVM: selftests: Make Intel arch events globally available " Sean Christopherson
  2025-01-17 23:42 ` [PATCH 2/5] KVM: selftests: Only validate counts for hardware-supported arch events Sean Christopherson
@ 2025-01-17 23:42 ` Sean Christopherson
  2025-01-17 23:42 ` [PATCH 4/5] KVM: selftests: Drop the "feature event" param from guest test helpers Sean Christopherson
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 14+ messages in thread
From: Sean Christopherson @ 2025-01-17 23:42 UTC (permalink / raw)
  To: Sean Christopherson, Paolo Bonzini; +Cc: kvm, linux-kernel, kernel test robot

Drop the local "nr_arch_events" in the Intel PMU counters test as the test
asserts that "nr_arch_events <= NR_INTEL_ARCH_EVENTS", and then sets
nr_arch_events to the max of the two.  I.e. nr_arch_events is guaranteed
to be NR_INTEL_ARCH_EVENTS for the meat of the test, just use
NR_INTEL_ARCH_EVENTS directly.

No functional change intended.

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 .../selftests/kvm/x86/pmu_counters_test.c       | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/tools/testing/selftests/kvm/x86/pmu_counters_test.c b/tools/testing/selftests/kvm/x86/pmu_counters_test.c
index 8159615ad492..5d6a5b9c17b3 100644
--- a/tools/testing/selftests/kvm/x86/pmu_counters_test.c
+++ b/tools/testing/selftests/kvm/x86/pmu_counters_test.c
@@ -563,7 +563,6 @@ static void test_fixed_counters(uint8_t pmu_version, uint64_t perf_capabilities,
 
 static void test_intel_counters(void)
 {
-	uint8_t nr_arch_events = this_cpu_property(X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH);
 	uint8_t nr_fixed_counters = kvm_cpu_property(X86_PROPERTY_PMU_NR_FIXED_COUNTERS);
 	uint8_t nr_gp_counters = kvm_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS);
 	uint8_t pmu_version = kvm_cpu_property(X86_PROPERTY_PMU_VERSION);
@@ -588,9 +587,10 @@ static void test_intel_counters(void)
 	 * This will (obviously) fail any time hardware adds support for a new
 	 * event, but it's worth paying that price to keep the test fresh.
 	 */
-	TEST_ASSERT(nr_arch_events <= NR_INTEL_ARCH_EVENTS,
+	TEST_ASSERT(this_cpu_property(X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH) <= NR_INTEL_ARCH_EVENTS,
 		    "New architectural event(s) detected; please update this test (length = %u, mask = %x)",
-		    nr_arch_events, this_cpu_property(X86_PROPERTY_PMU_EVENTS_MASK));
+		    this_cpu_property(X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH),
+		    this_cpu_property(X86_PROPERTY_PMU_EVENTS_MASK));
 
 	/*
 	 * Iterate over known arch events irrespective of KVM/hardware support
@@ -600,8 +600,7 @@ static void test_intel_counters(void)
 	 * count correctly, even if *enumeration* of the event is unsupported
 	 * by KVM and/or isn't exposed to the guest.
 	 */
-	nr_arch_events = max_t(typeof(nr_arch_events), nr_arch_events, NR_INTEL_ARCH_EVENTS);
-	for (i = 0; i < nr_arch_events; i++) {
+	for (i = 0; i < NR_INTEL_ARCH_EVENTS; i++) {
 		if (this_pmu_has(intel_event_to_feature(i).gp_event))
 			hardware_pmu_arch_events |= BIT(i);
 	}
@@ -620,8 +619,8 @@ static void test_intel_counters(void)
 			 * vector length.
 			 */
 			if (v == pmu_version) {
-				for (k = 1; k < (BIT(nr_arch_events) - 1); k++)
-					test_arch_events(v, perf_caps[i], nr_arch_events, k);
+				for (k = 1; k < (BIT(NR_INTEL_ARCH_EVENTS) - 1); k++)
+					test_arch_events(v, perf_caps[i], NR_INTEL_ARCH_EVENTS, k);
 			}
 			/*
 			 * Test single bits for all PMU version and lengths up
@@ -630,11 +629,11 @@ static void test_intel_counters(void)
 			 * host length).  Explicitly test a mask of '0' and all
 			 * ones i.e. all events being available and unavailable.
 			 */
-			for (j = 0; j <= nr_arch_events + 1; j++) {
+			for (j = 0; j <= NR_INTEL_ARCH_EVENTS + 1; j++) {
 				test_arch_events(v, perf_caps[i], j, 0);
 				test_arch_events(v, perf_caps[i], j, 0xff);
 
-				for (k = 0; k < nr_arch_events; k++)
+				for (k = 0; k < NR_INTEL_ARCH_EVENTS; k++)
 					test_arch_events(v, perf_caps[i], j, BIT(k));
 			}
 
-- 
2.48.0.rc2.279.g1de40edade-goog


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 4/5] KVM: selftests: Drop the "feature event" param from guest test helpers
  2025-01-17 23:41 [PATCH 0/5] KVM: selftests: Fix PMC checks in PMU counters test Sean Christopherson
                   ` (2 preceding siblings ...)
  2025-01-17 23:42 ` [PATCH 3/5] KVM: selftests: Remove dead code in Intel PMU counters test Sean Christopherson
@ 2025-01-17 23:42 ` Sean Christopherson
  2025-01-17 23:42 ` [PATCH 5/5] KVM: selftests: Print out the actual Top-Down Slots count on failure Sean Christopherson
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 14+ messages in thread
From: Sean Christopherson @ 2025-01-17 23:42 UTC (permalink / raw)
  To: Sean Christopherson, Paolo Bonzini; +Cc: kvm, linux-kernel, kernel test robot

Now that validation of event count is tied to hardware support for event,
and not to guest support for an event, drop the unused "event" parameter
from the various helpers.

No functional change intended.

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 .../selftests/kvm/x86/pmu_counters_test.c     | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/tools/testing/selftests/kvm/x86/pmu_counters_test.c b/tools/testing/selftests/kvm/x86/pmu_counters_test.c
index 5d6a5b9c17b3..ea1485a08c78 100644
--- a/tools/testing/selftests/kvm/x86/pmu_counters_test.c
+++ b/tools/testing/selftests/kvm/x86/pmu_counters_test.c
@@ -148,9 +148,7 @@ static uint8_t guest_get_pmu_version(void)
  * Sanity check that in all cases, the event doesn't count when it's disabled,
  * and that KVM correctly emulates the write of an arbitrary value.
  */
-static void guest_assert_event_count(uint8_t idx,
-				     struct kvm_x86_pmu_feature event,
-				     uint32_t pmc, uint32_t pmc_msr)
+static void guest_assert_event_count(uint8_t idx, uint32_t pmc, uint32_t pmc_msr)
 {
 	uint64_t count;
 
@@ -223,7 +221,7 @@ do {										\
 	);									\
 } while (0)
 
-#define GUEST_TEST_EVENT(_idx, _event, _pmc, _pmc_msr, _ctrl_msr, _value, FEP)	\
+#define GUEST_TEST_EVENT(_idx, _pmc, _pmc_msr, _ctrl_msr, _value, FEP)		\
 do {										\
 	wrmsr(_pmc_msr, 0);							\
 										\
@@ -234,17 +232,16 @@ do {										\
 	else									\
 		GUEST_MEASURE_EVENT(_ctrl_msr, _value, "nop", FEP);		\
 										\
-	guest_assert_event_count(_idx, _event, _pmc, _pmc_msr);			\
+	guest_assert_event_count(_idx, _pmc, _pmc_msr);				\
 } while (0)
 
-static void __guest_test_arch_event(uint8_t idx, struct kvm_x86_pmu_feature event,
-				    uint32_t pmc, uint32_t pmc_msr,
+static void __guest_test_arch_event(uint8_t idx, uint32_t pmc, uint32_t pmc_msr,
 				    uint32_t ctrl_msr, uint64_t ctrl_msr_value)
 {
-	GUEST_TEST_EVENT(idx, event, pmc, pmc_msr, ctrl_msr, ctrl_msr_value, "");
+	GUEST_TEST_EVENT(idx, pmc, pmc_msr, ctrl_msr, ctrl_msr_value, "");
 
 	if (is_forced_emulation_enabled)
-		GUEST_TEST_EVENT(idx, event, pmc, pmc_msr, ctrl_msr, ctrl_msr_value, KVM_FEP);
+		GUEST_TEST_EVENT(idx, pmc, pmc_msr, ctrl_msr, ctrl_msr_value, KVM_FEP);
 }
 
 static void guest_test_arch_event(uint8_t idx)
@@ -280,7 +277,7 @@ static void guest_test_arch_event(uint8_t idx)
 		if (guest_has_perf_global_ctrl)
 			wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, BIT_ULL(i));
 
-		__guest_test_arch_event(idx, gp_event, i, base_pmc_msr + i,
+		__guest_test_arch_event(idx, i, base_pmc_msr + i,
 					MSR_P6_EVNTSEL0 + i, eventsel);
 	}
 
@@ -295,7 +292,7 @@ static void guest_test_arch_event(uint8_t idx)
 
 	wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, FIXED_PMC_CTRL(i, FIXED_PMC_KERNEL));
 
-	__guest_test_arch_event(idx, fixed_event, i | INTEL_RDPMC_FIXED,
+	__guest_test_arch_event(idx, i | INTEL_RDPMC_FIXED,
 				MSR_CORE_PERF_FIXED_CTR0 + i,
 				MSR_CORE_PERF_GLOBAL_CTRL,
 				FIXED_PMC_GLOBAL_CTRL_ENABLE(i));
-- 
2.48.0.rc2.279.g1de40edade-goog


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 5/5] KVM: selftests: Print out the actual Top-Down Slots count on failure
  2025-01-17 23:41 [PATCH 0/5] KVM: selftests: Fix PMC checks in PMU counters test Sean Christopherson
                   ` (3 preceding siblings ...)
  2025-01-17 23:42 ` [PATCH 4/5] KVM: selftests: Drop the "feature event" param from guest test helpers Sean Christopherson
@ 2025-01-17 23:42 ` Sean Christopherson
  2025-01-20 16:17 ` [PATCH 0/5] KVM: selftests: Fix PMC checks in PMU counters test Paolo Bonzini
  2025-02-15  0:50 ` Sean Christopherson
  6 siblings, 0 replies; 14+ messages in thread
From: Sean Christopherson @ 2025-01-17 23:42 UTC (permalink / raw)
  To: Sean Christopherson, Paolo Bonzini; +Cc: kvm, linux-kernel, kernel test robot

Print out the expected vs. actual count of the Top-Down Slots event on
failure in the Intel PMU counters test.  GUEST_ASSERT() only expands
constants/macros, i.e. only prints the value of the expected count, which
makes it difficult to debug and triage failures.

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/x86/pmu_counters_test.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/kvm/x86/pmu_counters_test.c b/tools/testing/selftests/kvm/x86/pmu_counters_test.c
index ea1485a08c78..8aaaf25b6111 100644
--- a/tools/testing/selftests/kvm/x86/pmu_counters_test.c
+++ b/tools/testing/selftests/kvm/x86/pmu_counters_test.c
@@ -174,7 +174,9 @@ static void guest_assert_event_count(uint8_t idx, uint32_t pmc, uint32_t pmc_msr
 		GUEST_ASSERT_NE(count, 0);
 		break;
 	case INTEL_ARCH_TOPDOWN_SLOTS_INDEX:
-		GUEST_ASSERT(count >= NUM_INSNS_RETIRED);
+		__GUEST_ASSERT(count >= NUM_INSNS_RETIRED,
+			       "Expected top-down slots >= %u, got count = %lu",
+			       NUM_INSNS_RETIRED, count);
 		break;
 	default:
 		break;
-- 
2.48.0.rc2.279.g1de40edade-goog


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/5] KVM: selftests: Only validate counts for hardware-supported arch events
  2025-01-17 23:42 ` [PATCH 2/5] KVM: selftests: Only validate counts for hardware-supported arch events Sean Christopherson
@ 2025-01-18  0:06   ` Mingwei Zhang
  2025-01-18  0:39     ` Sean Christopherson
  0 siblings, 1 reply; 14+ messages in thread
From: Mingwei Zhang @ 2025-01-18  0:06 UTC (permalink / raw)
  To: Sean Christopherson; +Cc: Paolo Bonzini, kvm, linux-kernel, kernel test robot

On Fri, Jan 17, 2025, Sean Christopherson wrote:
> In the Intel PMU counters test, only validate the counts for architectural
> events that are supported in hardware.  If an arch event isn't supported,
> the event selector may enable a completely different event, and thus the
> logic for the expected count is bogus.
> 
> This fixes test failures on pre-Icelake systems due to the encoding for
> the architectural Top-Down Slots event corresponding to something else
> (at least on the Skylake family of CPUs).
> 
> Note, validation relies on *hardware* support, not KVM support and not
> guest support.  Architectural events are all about enumerating the event
> selector encoding; lack of enumeration for an architectural event doesn't
> mean the event itself is unsupported, i.e. the event should still count as
> expected even if KVM and/or guest CPUID doesn't enumerate the event as
> being "architectural".
> 
> Note #2, it's desirable to _program_ the architectural event encoding even
> if hardware doesn't support the event.  The count can't be validated when
> the event is fully enabled, but KVM should still let the guest program the
> event selector, and the PMC shouldn't count if the event is disabled.
> 
> Fixes: 4f1bd6b16074 ("KVM: selftests: Test Intel PMU architectural events on gp counters")
> Reported-by: kernel test robot <oliver.sang@intel.com>
> Closes: https://lore.kernel.org/oe-lkp/202501141009.30c629b4-lkp@intel.com
> Debugged-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>  .../selftests/kvm/x86/pmu_counters_test.c     | 25 +++++++++++++------
>  1 file changed, 18 insertions(+), 7 deletions(-)
> 
> diff --git a/tools/testing/selftests/kvm/x86/pmu_counters_test.c b/tools/testing/selftests/kvm/x86/pmu_counters_test.c
> index fe7d72fc8a75..8159615ad492 100644
> --- a/tools/testing/selftests/kvm/x86/pmu_counters_test.c
> +++ b/tools/testing/selftests/kvm/x86/pmu_counters_test.c
> @@ -29,6 +29,8 @@
>  /* Total number of instructions retired within the measured section. */
>  #define NUM_INSNS_RETIRED		(NUM_LOOPS * NUM_INSNS_PER_LOOP + NUM_EXTRA_INSNS)
>  
> +/* Track which architectural events are supported by hardware. */
> +static uint32_t hardware_pmu_arch_events;
>  
>  static uint8_t kvm_pmu_version;
>  static bool kvm_has_perf_caps;
> @@ -89,6 +91,7 @@ static struct kvm_vm *pmu_vm_create_with_one_vcpu(struct kvm_vcpu **vcpu,
>  
>  	vm = vm_create_with_one_vcpu(vcpu, guest_code);
>  	sync_global_to_guest(vm, kvm_pmu_version);
> +	sync_global_to_guest(vm, hardware_pmu_arch_events);
>  
>  	/*
>  	 * Set PERF_CAPABILITIES before PMU version as KVM disallows enabling
> @@ -152,7 +155,7 @@ static void guest_assert_event_count(uint8_t idx,
>  	uint64_t count;
>  
>  	count = _rdpmc(pmc);
> -	if (!this_pmu_has(event))
> +	if (!(hardware_pmu_arch_events & BIT(idx)))
>  		goto sanity_checks;
>  
>  	switch (idx) {
> @@ -560,7 +563,7 @@ static void test_fixed_counters(uint8_t pmu_version, uint64_t perf_capabilities,
>  
>  static void test_intel_counters(void)
>  {
> -	uint8_t nr_arch_events = kvm_cpu_property(X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH);
> +	uint8_t nr_arch_events = this_cpu_property(X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH);
>  	uint8_t nr_fixed_counters = kvm_cpu_property(X86_PROPERTY_PMU_NR_FIXED_COUNTERS);
>  	uint8_t nr_gp_counters = kvm_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS);
>  	uint8_t pmu_version = kvm_cpu_property(X86_PROPERTY_PMU_VERSION);
> @@ -582,18 +585,26 @@ static void test_intel_counters(void)
>  
>  	/*
>  	 * Detect the existence of events that aren't supported by selftests.
> -	 * This will (obviously) fail any time the kernel adds support for a
> -	 * new event, but it's worth paying that price to keep the test fresh.
> +	 * This will (obviously) fail any time hardware adds support for a new
> +	 * event, but it's worth paying that price to keep the test fresh.
>  	 */
>  	TEST_ASSERT(nr_arch_events <= NR_INTEL_ARCH_EVENTS,
>  		    "New architectural event(s) detected; please update this test (length = %u, mask = %x)",
> -		    nr_arch_events, kvm_cpu_property(X86_PROPERTY_PMU_EVENTS_MASK));
> +		    nr_arch_events, this_cpu_property(X86_PROPERTY_PMU_EVENTS_MASK));

This is where it would make troubles for us (all companies that might be
using the selftest in upstream kernel and having a new hardware). In
this case when we get new hardware, the test will fail in the downstream
kernel. We will have to wait until the fix is ready, and backport it
downstream, re-test it.... It takes lots of extra work.

Perhaps we can just putting nr_arch_events = NR_INTEL_ARCH_EVENTS
if the former is larger than or equal to the latter? So that the "test"
only test what it knows. It does not test what it does not know, i.e.,
it does not "assume" it knows everything. We can always a warning or
info log at the moment. Then expanding the capability of the test should
be added smoothly later by either maintainers of SWEs from CPU vendors
without causing failures.

Thanks.
-Mingwei
>  
>  	/*
> -	 * Force iterating over known arch events regardless of whether or not
> -	 * KVM/hardware supports a given event.
> +	 * Iterate over known arch events irrespective of KVM/hardware support
> +	 * to verify that KVM doesn't reject programming of events just because
> +	 * the *architectural* encoding is unsupported.  Track which events are
> +	 * supported in hardware; the guest side will validate supported events
> +	 * count correctly, even if *enumeration* of the event is unsupported
> +	 * by KVM and/or isn't exposed to the guest.
>  	 */
>  	nr_arch_events = max_t(typeof(nr_arch_events), nr_arch_events, NR_INTEL_ARCH_EVENTS);
> +	for (i = 0; i < nr_arch_events; i++) {
> +		if (this_pmu_has(intel_event_to_feature(i).gp_event))
> +			hardware_pmu_arch_events |= BIT(i);
> +	}
>  
>  	for (v = 0; v <= max_pmu_version; v++) {
>  		for (i = 0; i < ARRAY_SIZE(perf_caps); i++) {
> -- 
> 2.48.0.rc2.279.g1de40edade-goog
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/5] KVM: selftests: Only validate counts for hardware-supported arch events
  2025-01-18  0:06   ` Mingwei Zhang
@ 2025-01-18  0:39     ` Sean Christopherson
  2025-01-20 16:12       ` Paolo Bonzini
  2025-01-22  4:51       ` Mingwei Zhang
  0 siblings, 2 replies; 14+ messages in thread
From: Sean Christopherson @ 2025-01-18  0:39 UTC (permalink / raw)
  To: Mingwei Zhang; +Cc: Paolo Bonzini, kvm, linux-kernel, kernel test robot

On Sat, Jan 18, 2025, Mingwei Zhang wrote:
> On Fri, Jan 17, 2025, Sean Christopherson wrote:
> > @@ -582,18 +585,26 @@ static void test_intel_counters(void)
> >  
> >  	/*
> >  	 * Detect the existence of events that aren't supported by selftests.
> > -	 * This will (obviously) fail any time the kernel adds support for a
> > -	 * new event, but it's worth paying that price to keep the test fresh.
> > +	 * This will (obviously) fail any time hardware adds support for a new
> > +	 * event, but it's worth paying that price to keep the test fresh.
> >  	 */
> >  	TEST_ASSERT(nr_arch_events <= NR_INTEL_ARCH_EVENTS,
> >  		    "New architectural event(s) detected; please update this test (length = %u, mask = %x)",
> > -		    nr_arch_events, kvm_cpu_property(X86_PROPERTY_PMU_EVENTS_MASK));
> > +		    nr_arch_events, this_cpu_property(X86_PROPERTY_PMU_EVENTS_MASK));
> 
> This is where it would make troubles for us (all companies that might be
> using the selftest in upstream kernel and having a new hardware). In
> this case when we get new hardware, the test will fail in the downstream
> kernel. We will have to wait until the fix is ready, and backport it
> downstream, re-test it.... It takes lots of extra work.

If Intel can't upstream what should be a *very* simple patch to enumerate the
new encoding and its expected count in advance of hardware being shipped to
partners, then we have bigger problems.  I don't know what level of pre-silicon
and FPGA-based emulation testing Intel does these days, but I wouldn't be at all
surprised if KVM tests are being run well before silicon arrives.

I am not at all convinced that this will ever affect anyone besides the Intel
engineers doing early enablement, and I am definitely not convinced it will ever
take significant effort above beyond what would be required irrespective of what
approach we take.  E.g. figuring out what the expected count is might be time
consuming, but I don't expect updating the test to be difficult.

> Perhaps we can just putting nr_arch_events = NR_INTEL_ARCH_EVENTS
> if the former is larger than or equal to the latter? So that the "test"
> only test what it knows. It does not test what it does not know, i.e.,
> it does not "assume" it knows everything. We can always a warning or
> info log at the moment. Then expanding the capability of the test should
> be added smoothly later by either maintainers of SWEs from CPU vendors
> without causing failures.

If we just "warn", we're effectively relying on a future Intel engineer to run
the test *and* check the logs *and* actually act on the warning.  Given that tests
are rarely run manually with a human pouring over the output, I highly doubt that
will pan out.

The more likely scenario is that the warn go unnoticed until some random person
sees the warn and files a bug somewhere.  At that point, odds are good that someone
who knows nothing about Intel's new hardware/event will get saddled with hunting
down the appropriate specs, digging into the details of the event, submitting a
patch upstream, etc.

And if multiple someones detect the warn, e.g. at different companines, then we've
collectively wasted even more time.  Which is a pretty likely scenario, because I
know with 100% certainly that people carry out-of-tree fixes :-)

By failing the test, pretty much the only assumption we're making is that Intel
cares about upstream KVM.  All evidence suggests that's a very safe assumption.
And as shown by my fumbling with Top-Down Slots, Intel engineers are absolutely
the right people to tackle this sort of thing, as they have accesses to resources
about uarch/CPU behavior that others don't.

If this approach turns out to be a huge pain, then we can certainly revisit things.
But I truly expect this will be less work for everyone, Intel included.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/5] KVM: selftests: Only validate counts for hardware-supported arch events
  2025-01-18  0:39     ` Sean Christopherson
@ 2025-01-20 16:12       ` Paolo Bonzini
  2025-01-22  4:51       ` Mingwei Zhang
  1 sibling, 0 replies; 14+ messages in thread
From: Paolo Bonzini @ 2025-01-20 16:12 UTC (permalink / raw)
  To: Sean Christopherson, Mingwei Zhang; +Cc: kvm, linux-kernel, kernel test robot

On 1/18/25 01:39, Sean Christopherson wrote:
> On Sat, Jan 18, 2025, Mingwei Zhang wrote:
>> On Fri, Jan 17, 2025, Sean Christopherson wrote:
>>> @@ -582,18 +585,26 @@ static void test_intel_counters(void)
>>>   
>>>   	/*
>>>   	 * Detect the existence of events that aren't supported by selftests.
>>> -	 * This will (obviously) fail any time the kernel adds support for a
>>> -	 * new event, but it's worth paying that price to keep the test fresh.
>>> +	 * This will (obviously) fail any time hardware adds support for a new
>>> +	 * event, but it's worth paying that price to keep the test fresh.
>>>   	 */
>>>   	TEST_ASSERT(nr_arch_events <= NR_INTEL_ARCH_EVENTS,
>>>   		    "New architectural event(s) detected; please update this test (length = %u, mask = %x)",
>>> -		    nr_arch_events, kvm_cpu_property(X86_PROPERTY_PMU_EVENTS_MASK));
>>> +		    nr_arch_events, this_cpu_property(X86_PROPERTY_PMU_EVENTS_MASK));
>>
>> This is where it would make troubles for us (all companies that might be
>> using the selftest in upstream kernel and having a new hardware). In
>> this case when we get new hardware, the test will fail in the downstream
>> kernel. We will have to wait until the fix is ready, and backport it
>> downstream, re-test it.... It takes lots of extra work.
> 
> If Intel can't upstream what should be a *very* simple patch to enumerate the
> new encoding and its expected count in advance of hardware being shipped to
> partners, then we have bigger problems.

Conceptually I have bigger problems with people running stable kernels 
than people running on really really new hardware.

However the intersection of running a pretty old kernel on a very new 
bare metal x86 system is relatively small but nonzero (those pesky 
Debian users); it may happen with cloud instances but then the 
intersection of running old selftests in a nested virt environment is 
probably even smaller.

I am not too happy about the assertion, but it does seem like the lesser 
evil.

Paolo


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 0/5] KVM: selftests: Fix PMC checks in PMU counters test
  2025-01-17 23:41 [PATCH 0/5] KVM: selftests: Fix PMC checks in PMU counters test Sean Christopherson
                   ` (4 preceding siblings ...)
  2025-01-17 23:42 ` [PATCH 5/5] KVM: selftests: Print out the actual Top-Down Slots count on failure Sean Christopherson
@ 2025-01-20 16:17 ` Paolo Bonzini
  2025-02-10 23:38   ` Sean Christopherson
  2025-02-15  0:50 ` Sean Christopherson
  6 siblings, 1 reply; 14+ messages in thread
From: Paolo Bonzini @ 2025-01-20 16:17 UTC (permalink / raw)
  To: Sean Christopherson; +Cc: kvm, linux-kernel, kernel test robot

On Sat, Jan 18, 2025 at 12:42 AM Sean Christopherson <seanjc@google.com> wrote:
>
> Fix a flaw in the Intel PMU counters test where it asserts that an event is
> counting correctly without actually knowing what the event counts given the
> underlying hardware.
>
> The bug manifests as failures with the Top-Down Slots architectural event
> when running CPUs that doesn't actually support that arch event (pre-ICX).
> The arch event encoding still counts _something_, just not Top-Down Slots
> (I haven't bothered to look up what it was counting).  The passed by sheer
> dumb luck until an unrelated change caused the count of the unknown event
> to drop.

Queued for 6.14-rc1, thanks.

Paolo


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/5] KVM: selftests: Only validate counts for hardware-supported arch events
  2025-01-18  0:39     ` Sean Christopherson
  2025-01-20 16:12       ` Paolo Bonzini
@ 2025-01-22  4:51       ` Mingwei Zhang
  2025-01-24 15:57         ` Sean Christopherson
  1 sibling, 1 reply; 14+ messages in thread
From: Mingwei Zhang @ 2025-01-22  4:51 UTC (permalink / raw)
  To: Sean Christopherson; +Cc: Paolo Bonzini, kvm, linux-kernel, kernel test robot

On Fri, Jan 17, 2025, Sean Christopherson wrote:
> On Sat, Jan 18, 2025, Mingwei Zhang wrote:
> > On Fri, Jan 17, 2025, Sean Christopherson wrote:
> > > @@ -582,18 +585,26 @@ static void test_intel_counters(void)
> > >  
> > >  	/*
> > >  	 * Detect the existence of events that aren't supported by selftests.
> > > -	 * This will (obviously) fail any time the kernel adds support for a
> > > -	 * new event, but it's worth paying that price to keep the test fresh.
> > > +	 * This will (obviously) fail any time hardware adds support for a new
> > > +	 * event, but it's worth paying that price to keep the test fresh.
> > >  	 */
> > >  	TEST_ASSERT(nr_arch_events <= NR_INTEL_ARCH_EVENTS,
> > >  		    "New architectural event(s) detected; please update this test (length = %u, mask = %x)",
> > > -		    nr_arch_events, kvm_cpu_property(X86_PROPERTY_PMU_EVENTS_MASK));
> > > +		    nr_arch_events, this_cpu_property(X86_PROPERTY_PMU_EVENTS_MASK));
> > 
> > This is where it would make troubles for us (all companies that might be
> > using the selftest in upstream kernel and having a new hardware). In
> > this case when we get new hardware, the test will fail in the downstream
> > kernel. We will have to wait until the fix is ready, and backport it
> > downstream, re-test it.... It takes lots of extra work.
> 
> If Intel can't upstream what should be a *very* simple patch to enumerate the
> new encoding and its expected count in advance of hardware being shipped to
> partners, then we have bigger problems.  I don't know what level of pre-silicon
> and FPGA-based emulation testing Intel does these days, but I wouldn't be at all
> surprised if KVM tests are being run well before silicon arrives.

Right, Intel folks will be the 1st one that is impacted. But it should
be easy to fix on their end. But upstreaming the change may come very
late.

> 
> I am not at all convinced that this will ever affect anyone besides the Intel
> engineers doing early enablement, and I am definitely not convinced it will ever
> take significant effort above beyond what would be required irrespective of what
> approach we take.  E.g. figuring out what the expected count is might be time
> consuming, but I don't expect updating the test to be difficult.

It will affect the downstream kernels, I think? Like Paolo mentioned,
old distro kernel may run on new hardware. In usual cases, Intel HW has
already come out for a while, and the upstream software update is still
there under review. Fixing the problem is never difficult. But we need a
minor fix for each new generation of HW that adds a new architecture
event. I can imagine fixing that needs a simple patch, but each of them
has to cc stable tree and with "Fixes" tag.

> 
> > Perhaps we can just putting nr_arch_events = NR_INTEL_ARCH_EVENTS
> > if the former is larger than or equal to the latter? So that the "test"
> > only test what it knows. It does not test what it does not know, i.e.,
> > it does not "assume" it knows everything. We can always a warning or
> > info log at the moment. Then expanding the capability of the test should
> > be added smoothly later by either maintainers of SWEs from CPU vendors
> > without causing failures.
> 
> If we just "warn", we're effectively relying on a future Intel engineer to run
> the test *and* check the logs *and* actually act on the warning.  Given that tests
> are rarely run manually with a human pouring over the output, I highly doubt that
> will pan out.

In reality, we may not even need to warn. If the test only covers what
it knows, then there is no alarm to report. On the other hand, CPU
vendor who is doing the development will have selftest series to
update it. It will not just increase the array size, but also add more
meaningful testcases to the new events. The assumption is CPU vendors
care about upstream, which I think is true.
> 
> The more likely scenario is that the warn go unnoticed until some random person
> sees the warn and files a bug somewhere.  At that point, odds are good that someone
> who knows nothing about Intel's new hardware/event will get saddled with hunting
> down the appropriate specs, digging into the details of the event, submitting a
> patch upstream, etc.
> 
> And if multiple someones detect the warn, e.g. at different companines, then we've
> collectively wasted even more time.  Which is a pretty likely scenario, because I
> know with 100% certainly that people carry out-of-tree fixes :-)

Right, so a warning is not necessary here.
> 
> By failing the test, pretty much the only assumption we're making is that Intel
> cares about upstream KVM.  All evidence suggests that's a very safe assumption.
> And as shown by my fumbling with Top-Down Slots, Intel engineers are absolutely
> the right people to tackle this sort of thing, as they have accesses to resources
> about uarch/CPU behavior that others don't.

Well, yes. I believe Intel cares about it. I see Dapeng's effort, which
is really great. But often the effort is not just to fix that simple
bug, there are other parts that need to be done to test Top-Down slots
and other fixes. For instance, Dapeng's series for kvm-unit-tests/pmu is
still under review after 12+ months?

https://lore.kernel.org/all/20240914101728.33148-8-dapeng1.mi@linux.intel.com/

I think this is a minor pain for us, since we are strictly following the
upstream version on kut. Maybe that is something we need to change?

Thanks.
-Mingwei
>
> If this approach turns out to be a huge pain, then we can certainly revisit things.
> But I truly expect this will be less work for everyone, Intel included.



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/5] KVM: selftests: Only validate counts for hardware-supported arch events
  2025-01-22  4:51       ` Mingwei Zhang
@ 2025-01-24 15:57         ` Sean Christopherson
  0 siblings, 0 replies; 14+ messages in thread
From: Sean Christopherson @ 2025-01-24 15:57 UTC (permalink / raw)
  To: Mingwei Zhang; +Cc: Paolo Bonzini, kvm, linux-kernel, kernel test robot

On Wed, Jan 22, 2025, Mingwei Zhang wrote:
> On Fri, Jan 17, 2025, Sean Christopherson wrote:
> > On Sat, Jan 18, 2025, Mingwei Zhang wrote:
> > > On Fri, Jan 17, 2025, Sean Christopherson wrote:
> > > > @@ -582,18 +585,26 @@ static void test_intel_counters(void)
> > > >  
> > > >  	/*
> > > >  	 * Detect the existence of events that aren't supported by selftests.
> > > > -	 * This will (obviously) fail any time the kernel adds support for a
> > > > -	 * new event, but it's worth paying that price to keep the test fresh.
> > > > +	 * This will (obviously) fail any time hardware adds support for a new
> > > > +	 * event, but it's worth paying that price to keep the test fresh.
> > > >  	 */
> > > >  	TEST_ASSERT(nr_arch_events <= NR_INTEL_ARCH_EVENTS,
> > > >  		    "New architectural event(s) detected; please update this test (length = %u, mask = %x)",
> > > > -		    nr_arch_events, kvm_cpu_property(X86_PROPERTY_PMU_EVENTS_MASK));
> > > > +		    nr_arch_events, this_cpu_property(X86_PROPERTY_PMU_EVENTS_MASK));
> > > 
> > > This is where it would make troubles for us (all companies that might be
> > > using the selftest in upstream kernel and having a new hardware). In
> > > this case when we get new hardware, the test will fail in the downstream
> > > kernel. We will have to wait until the fix is ready, and backport it
> > > downstream, re-test it.... It takes lots of extra work.
> > 
> > If Intel can't upstream what should be a *very* simple patch to enumerate the
> > new encoding and its expected count in advance of hardware being shipped to
> > partners, then we have bigger problems.  I don't know what level of pre-silicon
> > and FPGA-based emulation testing Intel does these days, but I wouldn't be at all
> > surprised if KVM tests are being run well before silicon arrives.
> 
> Right, Intel folks will be the 1st one that is impacted. But it should
> be easy to fix on their end. But upstreaming the change may come very
> late.

And I'm saying we do what we can to motivate upstreaming the "fix" sooner than
later.

> > I am not at all convinced that this will ever affect anyone besides the Intel
> > engineers doing early enablement, and I am definitely not convinced it will ever
> > take significant effort above beyond what would be required irrespective of what
> > approach we take.  E.g. figuring out what the expected count is might be time
> > consuming, but I don't expect updating the test to be difficult.
> 
> It will affect the downstream kernels, I think? Like Paolo mentioned,
> old distro kernel may run on new hardware. In usual cases, Intel HW has
> already come out for a while, and the upstream software update is still
> there under review.

This isn't a usual case.  Or at least, it shouldn't be.  The effort required
should be more on par with adding Family/Model/Stepping information, and Intel
is quite capable of landing those types of changes well in advance of general
availability.

E.g. commit 7beade0dd41d ("x86/cpu: Add several Intel server CPU model numbers")
added Sierra Forest and Granite Rapids two years before they launched.  I don't
know when third parties first got silicion, but I would be surprised if it was
much, if at all, before that commit.

> Fixing the problem is never difficult. But we need a minor fix for each new
> generation of HW that adds a new architecture event. I can imagine fixing
> that needs a simple patch, but each of them has to cc stable tree

Yes, but sending patches to LTS kernels isn't inherently bad.  If the changes end
up conflicting regularly, then we can certainly reconsider the cost vs. benefit
of the assert.

> and with "Fixes" tag.

Nit, it doesn't need a Fixes, just Cc: stable@.

> > > Perhaps we can just putting nr_arch_events = NR_INTEL_ARCH_EVENTS
> > > if the former is larger than or equal to the latter? So that the "test"
> > > only test what it knows. It does not test what it does not know, i.e.,
> > > it does not "assume" it knows everything. We can always a warning or
> > > info log at the moment. Then expanding the capability of the test should
> > > be added smoothly later by either maintainers of SWEs from CPU vendors
> > > without causing failures.
> > 
> > If we just "warn", we're effectively relying on a future Intel engineer to run
> > the test *and* check the logs *and* actually act on the warning.  Given that tests
> > are rarely run manually with a human pouring over the output, I highly doubt that
> > will pan out.
> 
> In reality, we may not even need to warn. If the test only covers what
> it knows, then there is no alarm to report.

The assertion/warn isn't about test correctness, it's about ensuring test coverage
and distributing maintenance burden.  I don't want to have to chase down someone
at Intel that can provide the gory details on whatever architectural event comes
along next.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 0/5] KVM: selftests: Fix PMC checks in PMU counters test
  2025-01-20 16:17 ` [PATCH 0/5] KVM: selftests: Fix PMC checks in PMU counters test Paolo Bonzini
@ 2025-02-10 23:38   ` Sean Christopherson
  0 siblings, 0 replies; 14+ messages in thread
From: Sean Christopherson @ 2025-02-10 23:38 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: kvm, linux-kernel, kernel test robot

On Mon, Jan 20, 2025, Paolo Bonzini wrote:
> On Sat, Jan 18, 2025 at 12:42 AM Sean Christopherson <seanjc@google.com> wrote:
> >
> > Fix a flaw in the Intel PMU counters test where it asserts that an event is
> > counting correctly without actually knowing what the event counts given the
> > underlying hardware.
> >
> > The bug manifests as failures with the Top-Down Slots architectural event
> > when running CPUs that doesn't actually support that arch event (pre-ICX).
> > The arch event encoding still counts _something_, just not Top-Down Slots
> > (I haven't bothered to look up what it was counting).  The passed by sheer
> > dumb luck until an unrelated change caused the count of the unknown event
> > to drop.
> 
> Queued for 6.14-rc1, thanks.

Lies :-)

Neither this nor the main selftests pull request[*] landed in 6.14.  None of this
is urgent, so if it's easier on your end I can carry them forward and send them
for 6.15.

[*] https://lore.kernel.org/all/20250117010718.2328467-5-seanjc@google.com

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 0/5] KVM: selftests: Fix PMC checks in PMU counters test
  2025-01-17 23:41 [PATCH 0/5] KVM: selftests: Fix PMC checks in PMU counters test Sean Christopherson
                   ` (5 preceding siblings ...)
  2025-01-20 16:17 ` [PATCH 0/5] KVM: selftests: Fix PMC checks in PMU counters test Paolo Bonzini
@ 2025-02-15  0:50 ` Sean Christopherson
  6 siblings, 0 replies; 14+ messages in thread
From: Sean Christopherson @ 2025-02-15  0:50 UTC (permalink / raw)
  To: Sean Christopherson, Paolo Bonzini; +Cc: kvm, linux-kernel, kernel test robot

On Fri, 17 Jan 2025 15:41:58 -0800, Sean Christopherson wrote:
> Fix a flaw in the Intel PMU counters test where it asserts that an event is
> counting correctly without actually knowing what the event counts given the
> underlying hardware.
> 
> The bug manifests as failures with the Top-Down Slots architectural event
> when running CPUs that doesn't actually support that arch event (pre-ICX).
> The arch event encoding still counts _something_, just not Top-Down Slots
> (I haven't bothered to look up what it was counting).  The passed by sheer
> dumb luck until an unrelated change caused the count of the unknown event
> to drop.
> 
> [...]

In case Paolo ends up grabbing the version I applied...
https://lore.kernel.org/all/Z6_dZTbQbgr2iY6Q@google.com

Applied to kvm-x86 selftests_6.14.

[1/5] KVM: selftests: Make Intel arch events globally available in PMU counters test
      https://github.com/kvm-x86/linux/commit/933178ddf73a
[2/5] KVM: selftests: Only validate counts for hardware-supported arch events
      https://github.com/kvm-x86/linux/commit/8752e2b4a2b7
[3/5] KVM: selftests: Remove dead code in Intel PMU counters test
      https://github.com/kvm-x86/linux/commit/e327630e2a0c
[4/5] KVM: selftests: Drop the "feature event" param from guest test helpers
      https://github.com/kvm-x86/linux/commit/0e6714735c01
[5/5] KVM: selftests: Print out the actual Top-Down Slots count on failure
      https://github.com/kvm-x86/linux/commit/54108e733444

--
https://github.com/kvm-x86/linux/tree/next

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2025-02-15  0:56 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-01-17 23:41 [PATCH 0/5] KVM: selftests: Fix PMC checks in PMU counters test Sean Christopherson
2025-01-17 23:41 ` [PATCH 1/5] KVM: selftests: Make Intel arch events globally available " Sean Christopherson
2025-01-17 23:42 ` [PATCH 2/5] KVM: selftests: Only validate counts for hardware-supported arch events Sean Christopherson
2025-01-18  0:06   ` Mingwei Zhang
2025-01-18  0:39     ` Sean Christopherson
2025-01-20 16:12       ` Paolo Bonzini
2025-01-22  4:51       ` Mingwei Zhang
2025-01-24 15:57         ` Sean Christopherson
2025-01-17 23:42 ` [PATCH 3/5] KVM: selftests: Remove dead code in Intel PMU counters test Sean Christopherson
2025-01-17 23:42 ` [PATCH 4/5] KVM: selftests: Drop the "feature event" param from guest test helpers Sean Christopherson
2025-01-17 23:42 ` [PATCH 5/5] KVM: selftests: Print out the actual Top-Down Slots count on failure Sean Christopherson
2025-01-20 16:17 ` [PATCH 0/5] KVM: selftests: Fix PMC checks in PMU counters test Paolo Bonzini
2025-02-10 23:38   ` Sean Christopherson
2025-02-15  0:50 ` Sean Christopherson

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox