All of lore.kernel.org
 help / color / mirror / Atom feed
From: Jiri Olsa <jolsa@redhat.com>
To: Ingo Molnar <mingo@kernel.org>,
	Peter Zijlstra <a.p.zijlstra@chello.nl>,
	Andi Kleen <ak@linux.intel.com>
Cc: lkml <linux-kernel@vger.kernel.org>,
	Alexander Shishkin <alexander.shishkin@linux.intel.com>
Subject: [RFC,PATCH] VMWARE faults on accessing disabled counters
Date: Wed, 31 Aug 2016 14:03:58 +0200	[thread overview]
Message-ID: <20160831120358.GB9001@krava> (raw)

hi,
when booting under VMWARE we get the following dmesg lines:

[    0.051567] perf_event_intel: CPUID marked event: 'cpu cycles' unavailable
[    0.051567] perf_event_intel: CPUID marked event: 'instructions' unavailable
[    0.051568] perf_event_intel: CPUID marked event: 'bus cycles' unavailable
[    0.051568] perf_event_intel: CPUID marked event: 'cache references' unavailable
[    0.051569] perf_event_intel: CPUID marked event: 'cache misses' unavailable
[    0.051570] perf_event_intel: CPUID marked event: 'branch instructions' unavailable
[    0.051570] perf_event_intel: CPUID marked event: 'branch misses' unavailable

that means all the architectural events are disabled by CPUID(0xa)

The kernel code clears the intel_perfmon_event_map entries to
prevent those events from being configured via the
PERF_TYPE_HARDWARE pmu type. However, they can still be
configured via the PERF_TYPE_RAW type.

We're getting a GP fault on VMWARE when reading the cycles PMC
configured through the PERF_TYPE_RAW interface:

 #4 [ffff88007c603e10] do_general_protection at ffffffff8163da9e
 #5 [ffff88007c603e40] general_protection at ffffffff8163d3a8
    [exception RIP: native_read_pmc+6]
    RIP: ffffffff81058d66  RSP: ffff88007c603ef0  RFLAGS: 00010083
    RAX: ffffffff81957ee0  RBX: 0000000000000000  RCX: 0000000040000002
    RDX: 000000000ff8f719  RSI: ffff88007c617fa8  RDI: 0000000040000002
    RBP: ffff88007c603ef0   R8: 00007ffde5053150   R9: 0000000000000000
    R10: 00007ffde5052530  R11: 00007fbb22aedc70  R12: ffffffff80000001
    R13: ffff880079b74400  R14: ffff880079b74578  R15: 0000000000000010
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0000
 #6 [ffff88007c603ef8] x86_perf_event_update at ffffffff81029e03
 #7 [ffff88007c603f30] x86_pmu_read at ffffffff8102a079
 #8 [ffff88007c603f40] __perf_event_read at ffffffff811590de

I couldn't find what rdpmc does on real HW in this situation,
so I'm not sure if we actually want to prevent this. The patch
below tries to catch this case.

thanks,
jirka


---
 arch/x86/events/core.c       |  8 ++++-
 arch/x86/events/intel/core.c | 72 ++++++++++++++++++++++++++++++++------------
 arch/x86/events/perf_event.h |  6 ++++
 3 files changed, 65 insertions(+), 21 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 473519100b11..d836c5922b12 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -534,8 +534,14 @@ int x86_pmu_hw_config(struct perf_event *event)
 	if (!event->attr.exclude_kernel)
 		event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
 
-	if (event->attr.type == PERF_TYPE_RAW)
+	if (event->attr.type == PERF_TYPE_RAW) {
+		u64 arch_config = event->attr.config & INTEL_ARCH_EVENT_MASK;
+
+		if (x86_pmu_event_disabled(arch_config))
+			return -ENOENT;
+
 		event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
+	}
 
 	if (event->attr.sample_period && x86_pmu.limit_period) {
 		if (x86_pmu.limit_period(event, event->attr.sample_period) >
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 9049d62f34ae..99a83529c7ff 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -23,16 +23,22 @@
 /*
  * Intel PerfMon, used on Core and later.
  */
-static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
-{
-	[PERF_COUNT_HW_CPU_CYCLES]		= 0x003c,
-	[PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0,
-	[PERF_COUNT_HW_CACHE_REFERENCES]	= 0x4f2e,
-	[PERF_COUNT_HW_CACHE_MISSES]		= 0x412e,
-	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4,
-	[PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5,
-	[PERF_COUNT_HW_BUS_CYCLES]		= 0x013c,
-	[PERF_COUNT_HW_REF_CPU_CYCLES]		= 0x0300, /* pseudo-encoding */
+struct intel_perfmon_event {
+	u64	config;
+	bool	disabled;
+	u64	replacement;
+};
+
+static struct intel_perfmon_event intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
+{
+	[PERF_COUNT_HW_CPU_CYCLES]		= { .config = 0x003c },
+	[PERF_COUNT_HW_INSTRUCTIONS]		= { .config = 0x00c0 },
+	[PERF_COUNT_HW_CACHE_REFERENCES]	= { .config = 0x4f2e },
+	[PERF_COUNT_HW_CACHE_MISSES]		= { .config = 0x412e },
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= { .config = 0x00c4 },
+	[PERF_COUNT_HW_BRANCH_MISSES]		= { .config = 0x00c5 },
+	[PERF_COUNT_HW_BUS_CYCLES]		= { .config = 0x013c },
+	[PERF_COUNT_HW_REF_CPU_CYCLES]		= { .config = 0x0300 }, /* pseudo-encoding */
 };
 
 static struct event_constraint intel_core_event_constraints[] __read_mostly =
@@ -268,7 +274,31 @@ struct event_constraint intel_bdw_event_constraints[] = {
 
 static u64 intel_pmu_event_map(int hw_event)
 {
-	return intel_perfmon_event_map[hw_event];
+	struct intel_perfmon_event *event = &intel_perfmon_event_map[hw_event];
+
+	/* Zero tells the caller that CPUID marked this event unavailable. */
+	if (event->disabled)
+		return 0;
+
+	/* A replacement encoding set at init time overrides the default. */
+	return event->replacement ? event->replacement : event->config;
+}
+
+/*
+ * Return true if hw_event, a raw event/umask encoding (not a map index),
+ * matches the config of a map entry that has been marked disabled.
+ */
+static bool intel_pmu_event_disabled(int hw_event)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(intel_perfmon_event_map); i++) {
+		if (intel_perfmon_event_map[i].config == hw_event &&
+		    intel_perfmon_event_map[i].disabled)
+			return true;
+	}
+
+	return false;
 }
 
 /*
@@ -3165,6 +3195,7 @@ static __initconst const struct x86_pmu core_pmu = {
 	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
 	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
 	.event_map		= intel_pmu_event_map,
+	.event_disabled		= intel_pmu_event_disabled,
 	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
 	.apic			= 1,
 	.free_running_flags	= PEBS_FREERUNNING_FLAGS,
@@ -3205,6 +3236,7 @@ static __initconst const struct x86_pmu intel_pmu = {
 	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
 	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
 	.event_map		= intel_pmu_event_map,
+	.event_disabled		= intel_pmu_event_disabled,
 	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
 	.apic			= 1,
 	.free_running_flags	= PEBS_FREERUNNING_FLAGS,
@@ -3357,7 +3389,7 @@ static __init void intel_arch_events_quirk(void)
 
 	/* disable event that reported as not presend by cpuid */
 	for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) {
-		intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0;
+		intel_perfmon_event_map[intel_arch_events_map[bit].id].disabled = true;
 		pr_warn("CPUID marked event: \'%s\' unavailable\n",
 			intel_arch_events_map[bit].name);
 	}
@@ -3375,7 +3407,7 @@ static __init void intel_nehalem_quirk(void)
 		 * branch-misses, but it's still much better than the
 		 * architectural event which is often completely bogus:
 		 */
-		intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
+		intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES].replacement = 0x7f89;
 		ebx.split.no_branch_misses_retired = 0;
 		x86_pmu.events_maskl = ebx.full;
 		pr_info("CPU erratum AAJ80 worked around\n");
@@ -3543,10 +3575,10 @@ __init int intel_pmu_init(void)
 		x86_pmu.cpu_events = nhm_events_attrs;
 
 		/* UOPS_ISSUED.STALLED_CYCLES */
-		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND].replacement =
 			X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
 		/* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
-		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND].replacement =
 			X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
 
 		intel_pmu_pebs_data_source_nhm();
@@ -3630,10 +3662,10 @@ __init int intel_pmu_init(void)
 		x86_pmu.cpu_events = nhm_events_attrs;
 
 		/* UOPS_ISSUED.STALLED_CYCLES */
-		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND].replacement =
 			X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
 		/* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
-		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND].replacement =
 			X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
 
 		intel_pmu_pebs_data_source_nhm();
@@ -3667,10 +3699,10 @@ __init int intel_pmu_init(void)
 		x86_pmu.cpu_events = snb_events_attrs;
 
 		/* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
-		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND].replacement =
 			X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
 		/* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/
-		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND].replacement =
 			X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1);
 
 		pr_cont("SandyBridge events, ");
@@ -3704,7 +3736,7 @@ __init int intel_pmu_init(void)
 		x86_pmu.cpu_events = snb_events_attrs;
 
 		/* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
-		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND].replacement =
 			X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
 
 		pr_cont("IvyBridge events, ");
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 01ddfeadaee6..69cca7dc8de4 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -514,6 +514,7 @@ struct x86_pmu {
 	int		(*addr_offset)(int index, bool eventsel);
 	int		(*rdpmc_index)(int index);
 	u64		(*event_map)(int);
+	bool		(*event_disabled)(int);
 	int		max_events;
 	int		num_counters;
 	int		num_counters_fixed;
@@ -715,6 +716,11 @@ static inline int x86_pmu_rdpmc_index(int index)
 	return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index;
 }
 
+static inline bool x86_pmu_event_disabled(u64 config)
+{
+	return x86_pmu.event_disabled ? x86_pmu.event_disabled(config) : false;
+}
+
 int x86_add_exclusive(unsigned int what);
 
 void x86_del_exclusive(unsigned int what);
-- 
2.7.4

             reply	other threads:[~2016-08-31 12:04 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-08-31 12:03 Jiri Olsa [this message]
2016-08-31 13:11 ` [RFC,PATCH] VMWARE faults on accessing disabled counters Peter Zijlstra
2016-08-31 13:19   ` Jiri Olsa
2016-08-31 13:41     ` Peter Zijlstra

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20160831120358.GB9001@krava \
    --to=jolsa@redhat.com \
    --cc=a.p.zijlstra@chello.nl \
    --cc=ak@linux.intel.com \
    --cc=alexander.shishkin@linux.intel.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.