* [PATCH 1/3] perf, x86: Remove incorrect model number from Haswell perf
@ 2014-08-13 1:45 Andi Kleen
2014-08-13 1:45 ` [PATCH 2/3] perf, x86: Add Broadwell core support Andi Kleen
` (2 more replies)
0 siblings, 3 replies; 9+ messages in thread
From: Andi Kleen @ 2014-08-13 1:45 UTC
To: peterz; +Cc: linux-kernel, mingo, eranian, Andi Kleen
From: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
arch/x86/kernel/cpu/perf_event_intel.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 2502d0d..ef6c8b7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2541,7 +2541,6 @@ __init int intel_pmu_init(void)
case 60: /* Haswell Client */
case 70:
- case 71:
case 63:
case 69:
x86_pmu.late_ack = true;
--
1.9.3
* [PATCH 2/3] perf, x86: Add Broadwell core support
2014-08-13 1:45 [PATCH 1/3] perf, x86: Remove incorrect model number from Haswell perf Andi Kleen
@ 2014-08-13 1:45 ` Andi Kleen
2014-08-13 8:00 ` Peter Zijlstra
2014-08-13 1:45 ` [PATCH 3/3] perf, x86: Add INST_RETIRED.PREC_DIST workarounds Andi Kleen
2014-08-13 7:58 ` [PATCH 1/3] perf, x86: Remove incorrect model number from Haswell perf Peter Zijlstra
2 siblings, 1 reply; 9+ messages in thread
From: Andi Kleen @ 2014-08-13 1:45 UTC
To: peterz; +Cc: linux-kernel, mingo, eranian, Andi Kleen
From: Andi Kleen <ak@linux.intel.com>
Add Broadwell Client core support to perf.
This is very similar to Haswell. It uses a new cache event table,
because various events changed there.
The constraint list has one new event that needs to be handled compared to Haswell.
The PEBS event list is the same, so we reuse Haswell's.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
arch/x86/kernel/cpu/perf_event_intel.c | 152 +++++++++++++++++++++++++++++++++
1 file changed, 152 insertions(+)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index ef6c8b7..dd0ea95 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -220,6 +220,15 @@ static struct event_constraint intel_hsw_event_constraints[] = {
EVENT_CONSTRAINT_END
};
+struct event_constraint intel_bdw_event_constraints[] = {
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */
+ INTEL_EVENT_CONSTRAINT(0xa3, 0x4), /* CYCLE_ACTIVITY.* */
+ EVENT_CONSTRAINT_END
+};
+
static u64 intel_pmu_event_map(int hw_event)
{
return intel_perfmon_event_map[hw_event];
@@ -415,6 +424,126 @@ static __initconst const u64 snb_hw_cache_event_ids
};
+static __initconst const u64 bdw_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
+ [ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x280, /* ICACHE.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ /* OFFCORE_RESPONSE:ALL_DATA_RD|ALL_CODE_RD */
+ [ C(RESULT_ACCESS) ] = 0x1b7,
+ /* OFFCORE_RESPONSE:ALL_DATA_RD|ALL_CODE_RD|SUPPLIER_NONE|
+ L3_MISS|ANY_SNOOP */
+ [ C(RESULT_MISS) ] = 0x1b7,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE:ALL_RFO */
+ /* OFFCORE_RESPONSE:ALL_RFO|SUPPLIER_NONE|L3_MISS|ANY_SNOOP */
+ [ C(RESULT_MISS) ] = 0x1b7,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
+ [ C(RESULT_MISS) ] = 0x108, /* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
+ [ C(RESULT_MISS) ] = 0x149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x6085, /* ITLB_MISSES.STLB_HIT */
+ [ C(RESULT_MISS) ] = 0x185, /* ITLB_MISSES.MISS_CAUSES_A_WALK */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */
+ [ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+static __initconst const u64 bdw_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ /* OFFCORE_RESPONSE:ALL_DATA_RD|ALL_CODE_RD */
+ [ C(RESULT_ACCESS) ] = 0x2d5,
+ /* OFFCORE_RESPONSE:ALL_DATA_RD|ALL_CODE_RD|SUPPLIER_NONE|
+ L3_MISS|ANY_SNOOP */
+ [ C(RESULT_MISS) ] = 0x3fbc0202d5,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x122, /* OFFCORE_RESPONSE:ALL_RFO */
+ /* OFFCORE_RESPONSE:ALL_RFO|SUPPLIER_NONE|L3_MISS|ANY_SNOOP */
+ [ C(RESULT_MISS) ] = 0x3fbc020122,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+};
+
static __initconst const u64 westmere_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -2564,6 +2693,29 @@ __init int intel_pmu_init(void)
pr_cont("Haswell events, ");
break;
+ case 61: /* Broadwell */
+ case 71:
+ case 79:
+ x86_pmu.late_ack = true;
+ memcpy(hw_cache_event_ids, bdw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, bdw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+
+ intel_pmu_lbr_init_snb();
+
+ x86_pmu.event_constraints = intel_bdw_event_constraints;
+ x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_snbep_extra_regs;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+ /* all extra regs are per-cpu when HT is on */
+ x86_pmu.er_flags |= ERF_HAS_RSP_1;
+ x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+
+ x86_pmu.hw_config = hsw_hw_config;
+ x86_pmu.get_event_constraints = hsw_get_event_constraints;
+ x86_pmu.cpu_events = hsw_events_attrs;
+ pr_cont("Broadwell events, ");
+ break;
+
default:
switch (x86_pmu.version) {
case 1:
--
1.9.3
* Re: [PATCH 2/3] perf, x86: Add Broadwell core support
2014-08-13 1:45 ` [PATCH 2/3] perf, x86: Add Broadwell core support Andi Kleen
@ 2014-08-13 8:00 ` Peter Zijlstra
0 siblings, 0 replies; 9+ messages in thread
From: Peter Zijlstra @ 2014-08-13 8:00 UTC
To: Andi Kleen; +Cc: linux-kernel, mingo, eranian, Andi Kleen
On Tue, Aug 12, 2014 at 06:45:14PM -0700, Andi Kleen wrote:
> + case 61: /* Broadwell */
> + case 71:
> + case 79:
Same deal; please put a descriptive comment per model number.
https://git.kernel.org/cgit/linux/kernel/git/peterz/queue.git/commit/?h=perf/core&id=a42c1bf50d297fcb96a3c71b0fdc2da6dbe47030
* [PATCH 3/3] perf, x86: Add INST_RETIRED.PREC_DIST workarounds
2014-08-13 1:45 [PATCH 1/3] perf, x86: Remove incorrect model number from Haswell perf Andi Kleen
2014-08-13 1:45 ` [PATCH 2/3] perf, x86: Add Broadwell core support Andi Kleen
@ 2014-08-13 1:45 ` Andi Kleen
2014-08-13 8:10 ` Peter Zijlstra
2014-08-13 7:58 ` [PATCH 1/3] perf, x86: Remove incorrect model number from Haswell perf Peter Zijlstra
2 siblings, 1 reply; 9+ messages in thread
From: Andi Kleen @ 2014-08-13 1:45 UTC
To: peterz; +Cc: linux-kernel, mingo, eranian, Andi Kleen
From: Andi Kleen <ak@linux.intel.com>
On Broadwell INST_RETIRED.PREC_DIST cannot be used with any period
that doesn't have the lowest 6 bits cleared, and the period must
not be smaller than 128.
Add a new callback to enforce this, and set it for Broadwell.
These are errata BDM57 and BDM11.
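For illustration, a standalone sketch of the clamping this enforces (a
hypothetical user-space demo, not part of the patch): any period below 128
is raised to 128, and the result is rounded down to a multiple of 64.

	#include <stdio.h>

	/* mirrors the bdw_limit_period() logic from the patch below */
	static unsigned clamp_prec_dist_period(unsigned left)
	{
		if (left < 128)		/* enforce the 128 minimum (BDM11) */
			left = 128;
		left &= ~0x3fu;		/* clear the lowest 6 bits (BDM57) */
		return left;
	}

	int main(void)
	{
		/* 100 -> 128, 300 -> 256, 4096 -> 4096 */
		printf("%u %u %u\n", clamp_prec_dist_period(100),
		       clamp_prec_dist_period(300),
		       clamp_prec_dist_period(4096));
		return 0;
	}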
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
arch/x86/kernel/cpu/perf_event.c | 3 +++
arch/x86/kernel/cpu/perf_event.h | 1 +
arch/x86/kernel/cpu/perf_event_intel.c | 18 ++++++++++++++++++
3 files changed, 22 insertions(+)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 0adc5e3..a0adf58 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -980,6 +980,9 @@ int x86_perf_event_set_period(struct perf_event *event)
if (left > x86_pmu.max_period)
left = x86_pmu.max_period;
+ if (x86_pmu.limit_period)
+ left = x86_pmu.limit_period(event, left);
+
per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
/*
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index de81627..a46e391 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -456,6 +456,7 @@ struct x86_pmu {
struct x86_pmu_quirk *quirks;
int perfctr_second_write;
bool late_ack;
+ unsigned (*limit_period)(struct perf_event *event, unsigned l);
/*
* sysfs attrs
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index dd0ea95..db131b0 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2034,6 +2034,23 @@ hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
return c;
}
+/*
+ * Broadwell:
+ * The INST_RETIRED.ALL period always needs to have the lowest
+ * 6 bits cleared (BDM57). It must not use a period smaller
+ * than 100 (BDM11). We combine the two to enforce
+ * a min-period of 128.
+ */
+static unsigned bdw_limit_period(struct perf_event *event, unsigned left)
+{
+ if ((event->hw.config & 0xffff) == 0x1c0) {
+ if (left < 128)
+ left = 128;
+ left &= ~0x3fu;
+ }
+ return left;
+}
+
PMU_FORMAT_ATTR(event, "config:0-7" );
PMU_FORMAT_ATTR(umask, "config:8-15" );
PMU_FORMAT_ATTR(edge, "config:18" );
@@ -2713,6 +2730,7 @@ __init int intel_pmu_init(void)
x86_pmu.hw_config = hsw_hw_config;
x86_pmu.get_event_constraints = hsw_get_event_constraints;
x86_pmu.cpu_events = hsw_events_attrs;
+ x86_pmu.limit_period = bdw_limit_period;
pr_cont("Broadwell events, ");
break;
--
1.9.3
* Re: [PATCH 3/3] perf, x86: Add INST_RETIRED.PREC_DIST workarounds
2014-08-13 1:45 ` [PATCH 3/3] perf, x86: Add INST_RETIRED.PREC_DIST workarounds Andi Kleen
@ 2014-08-13 8:10 ` Peter Zijlstra
0 siblings, 0 replies; 9+ messages in thread
From: Peter Zijlstra @ 2014-08-13 8:10 UTC
To: Andi Kleen; +Cc: linux-kernel, mingo, eranian, Andi Kleen
On Tue, Aug 12, 2014 at 06:45:15PM -0700, Andi Kleen wrote:
> From: Andi Kleen <ak@linux.intel.com>
>
> On Broadwell INST_RETIRED.PREC_DIST cannot be used with any period
> that doesn't have the lowest 6 bits cleared. And the period
> should not be smaller than 128.
>
> Add a new callback to enforce this, and set it for Broadwell.
>
> This is erratum BDM57 and BDM11.
Urgh, this blows.
> +/*
> + * Broadwell:
> + * The INST_RETIRED.ALL period always needs to have the lowest
> + * 6 bits cleared (BDM57). It must not use a period smaller
> + * than 100 (BDM11). We combine the two to enforce
> + * a min-period of 128.
> + */
> +static unsigned bdw_limit_period(struct perf_event *event, unsigned left)
> +{
> + if ((event->hw.config & 0xffff) == 0x1c0) {
ISTR we have actual proper defines for that.
> + if (left < 128)
> + left = 128;
> + left &= ~0x3fu;
> + }
> + return left;
> +}
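For reference, one possible spelling using the existing helpers (a sketch
assuming the INTEL_ARCH_EVENT_MASK and X86_CONFIG definitions already used
elsewhere in the x86 perf code):

	if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
	    X86_CONFIG(.event=0xc0, .umask=0x01)) {
		/* apply the BDM57/BDM11 clamping as above */
	}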
* Re: [PATCH 1/3] perf, x86: Remove incorrect model number from Haswell perf
2014-08-13 1:45 [PATCH 1/3] perf, x86: Remove incorrect model number from Haswell perf Andi Kleen
2014-08-13 1:45 ` [PATCH 2/3] perf, x86: Add Broadwell core support Andi Kleen
2014-08-13 1:45 ` [PATCH 3/3] perf, x86: Add INST_RETIRED.PREC_DIST workarounds Andi Kleen
@ 2014-08-13 7:58 ` Peter Zijlstra
2 siblings, 0 replies; 9+ messages in thread
From: Peter Zijlstra @ 2014-08-13 7:58 UTC
To: Andi Kleen; +Cc: linux-kernel, mingo, eranian, Andi Kleen
On Tue, Aug 12, 2014 at 06:45:13PM -0700, Andi Kleen wrote:
> From: Andi Kleen <ak@linux.intel.com>
>
> Signed-off-by: Andi Kleen <ak@linux.intel.com>
> ---
> arch/x86/kernel/cpu/perf_event_intel.c | 1 -
> 1 file changed, 1 deletion(-)
>
> diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
> index 2502d0d..ef6c8b7 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel.c
> @@ -2541,7 +2541,6 @@ __init int intel_pmu_init(void)
>
> case 60: /* Haswell Client */
> case 70:
> - case 71:
> case 63:
> case 69:
> x86_pmu.late_ack = true;
Can you properly describe the remaining model numbers? Surely there's a
significant difference between these chips, otherwise why give them
different model numbers.
Looking at the Wikipedia page for Haswell I suspect it's things like:
Haswell-{DT,MB,H,ULT}. If so, can you put the right name with the right
number?
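The resend later in this thread adopts such per-model comments; spelled
out fully (the names for 60/63 below are assumptions, matching what the
tree eventually carried), that would look like:

	case 60: /* 22nm Haswell Core */
	case 63: /* 22nm Haswell Server */
	case 69: /* 22nm Haswell ULT */
	case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */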
* [PATCH 1/3] perf, x86: Add new cache events table for Haswell
@ 2015-02-09 19:17 Andi Kleen
2015-02-09 19:17 ` [PATCH 2/3] perf, x86: Add Broadwell core support Andi Kleen
0 siblings, 1 reply; 9+ messages in thread
From: Andi Kleen @ 2015-02-09 19:17 UTC
To: peterz; +Cc: linux-kernel, kan.liang, Andi Kleen
From: Andi Kleen <ak@linux.intel.com>
Haswell offcore events are quite different from Sandy Bridge.
Add a new table to handle Haswell properly.
Note that the offcore bits listed in the SDM are not quite correct
(this is currently being fixed). An up-to-date list of bits is
in the patch.
The basic setup is similar to Sandy Bridge. The prefetch columns
have been removed, as prefetch counting is not very reliable
on Haswell. One L1 event that is no longer in the event list
has also been removed.
- data reads do not include code reads (comparable to earlier Sandy
Bridge tables)
- data counts include speculative execution (except L1 write, dtlb, bpu)
- remote node access includes remote memory, remote cache, and remote mmio.
- prefetches are not included in the counts for consistency
(different from Sandy Bridge, which includes prefetches in the remote node)
The events with additional caveats have references to the specification update.
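As a concrete illustration of how the table is consumed (a simplified
sketch of the lookup the x86 perf core performs, not code in this patch):

	/* e.g. perf's generic L1D read access event on Haswell */
	u64 config = hsw_hw_cache_event_ids[C(L1D)][C(OP_READ)][C(RESULT_ACCESS)];
	/* -> 0x81d0, i.e. MEM_UOPS_RETIRED.ALL_LOADS */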
v2: Change name of variable.
v3: Based on older Broadwell patch, but now just for Haswell.
Various fixes with events being more similar to the older
Sandy Bridge tables.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
arch/x86/kernel/cpu/perf_event_intel.c | 197 ++++++++++++++++++++++++++++++++-
1 file changed, 195 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 498b6d9..387f580 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -415,6 +415,199 @@ static __initconst const u64 snb_hw_cache_event_ids
};
+/*
+ * Notes on the events:
+ * - data reads do not include code reads (comparable to earlier tables)
+ * - data counts include speculative execution (except L1 write, dtlb, bpu)
+ * - remote node access includes remote memory, remote cache, remote mmio.
+ * - prefetches are not included in the counts.
+ * The events with additional caveats have references to the specification update.
+ */
+
+#define HSW_DEMAND_DATA_RD BIT(0)
+#define HSW_DEMAND_RFO BIT(1)
+#define HSW_PF_L2_RFO BIT(5)
+#define HSW_PF_L3_RFO BIT(8)
+#define HSW_ALL_RFO (HSW_DEMAND_RFO| \
+ HSW_PF_L2_RFO|HSW_PF_L3_RFO)
+#define HSW_ANY_RESPONSE BIT(16)
+#define HSW_SUPPLIER_NONE BIT(17)
+#define HSW_L3_MISS_LOCAL BIT(22)
+#define HSW_L3_MISS_REMOTE_HOP0 BIT(27)
+#define HSW_L3_MISS_REMOTE_HOP1 BIT(28)
+#define HSW_L3_MISS_REMOTE_HOP2P BIT(29)
+#define HSW_L3_MISS (HSW_L3_MISS_LOCAL| \
+ HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+ HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_SNOOP_NONE BIT(31)
+#define HSW_SNOOP_NOT_NEEDED BIT(32)
+#define HSW_SNOOP_MISS BIT(33)
+#define HSW_SNOOP_HIT_NO_FWD BIT(34)
+#define HSW_SNOOP_HIT_WITH_FWD BIT(35)
+#define HSW_SNOOP_HITM BIT(36)
+#define HSW_SNOOP_NON_DRAM BIT(37)
+#define HSW_ANY_SNOOP (HSW_SNOOP_NONE| \
+ HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \
+ HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \
+ HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM)
+
+static __initconst const u64 hsw_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS, HSM30 */
+ [ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES, HSM30 */
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x280, /* ICACHE.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS, HSM30 */
+ [ C(RESULT_MISS) ] = 0x108, /* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES, HSM30 */
+ [ C(RESULT_MISS) ] = 0x149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x6085, /* ITLB_MISSES.STLB_HIT */
+ [ C(RESULT_MISS) ] = 0x185, /* ITLB_MISSES.MISS_CAUSES_A_WALK */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */
+ [ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+};
+
+static __initconst const u64 hsw_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_DATA_RD|
+ HSW_ANY_RESPONSE|HSW_ANY_SNOOP|
+ HSW_SUPPLIER_NONE,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_DATA_RD|
+ HSW_L3_MISS|HSW_ANY_SNOOP|
+ HSW_SUPPLIER_NONE,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_RFO|
+ HSW_ANY_RESPONSE|HSW_ANY_SNOOP|
+ HSW_SUPPLIER_NONE,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_RFO|HSW_L3_MISS|
+ HSW_ANY_SNOOP|HSW_SUPPLIER_NONE,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_DATA_RD|
+ HSW_L3_MISS_LOCAL|HSW_SUPPLIER_NONE|
+ HSW_ANY_SNOOP,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_DATA_RD|
+ HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1|
+ HSW_L3_MISS_REMOTE_HOP2P|HSW_SUPPLIER_NONE|
+ HSW_ANY_SNOOP,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_RFO|
+ HSW_L3_MISS_LOCAL|HSW_SUPPLIER_NONE|
+ HSW_ANY_SNOOP,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_RFO|
+ HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1|
+ HSW_L3_MISS_REMOTE_HOP2P|HSW_SUPPLIER_NONE|
+ HSW_ANY_SNOOP,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+};
+
static __initconst const u64 westmere_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -2546,8 +2739,8 @@ __init int intel_pmu_init(void)
case 69: /* 22nm Haswell ULT */
case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
x86_pmu.late_ack = true;
- memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids));
- memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+ memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
intel_pmu_lbr_init_snb();
--
1.9.3
* [PATCH 2/3] perf, x86: Add Broadwell core support
2015-02-09 19:17 [PATCH 1/3] perf, x86: Add new cache events table for Haswell Andi Kleen
@ 2015-02-09 19:17 ` Andi Kleen
0 siblings, 0 replies; 9+ messages in thread
From: Andi Kleen @ 2015-02-09 19:17 UTC
To: peterz; +Cc: linux-kernel, kan.liang, Andi Kleen
From: Andi Kleen <ak@linux.intel.com>
Add Broadwell core support to perf.
The basic support is very similar to Haswell. We use the new cache
event list added for Haswell earlier. The only differences
are a few bits related to remote nodes. To avoid an extra,
mostly identical, table, these are patched up in the initialization code.
The constraint list has one new event that needs to be handled compared to Haswell.
Includes code and testing from Kan Liang.
v2: Remove unnamed model numbers.
v3: Rename cache event list to hsw_*. Change names.
v4: Use symbolic names for cache events. Improve comments and description.
Fix sparse warnings (Fengguang Wu)
Add Xeon D model number.
Remove cache event table (in separate patch)
Patch up remote node differences (Kan Liang)
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
arch/x86/kernel/cpu/perf_event_intel.c | 48 ++++++++++++++++++++++++++++++++++
1 file changed, 48 insertions(+)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 387f580..7c480e8 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -220,6 +220,15 @@ static struct event_constraint intel_hsw_event_constraints[] = {
EVENT_CONSTRAINT_END
};
+struct event_constraint intel_bdw_event_constraints[] = {
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */
+ INTEL_EVENT_CONSTRAINT(0xa3, 0x4), /* CYCLE_ACTIVITY.* */
+ EVENT_CONSTRAINT_END
+};
+
static u64 intel_pmu_event_map(int hw_event)
{
return intel_perfmon_event_map[hw_event];
@@ -433,12 +442,16 @@ static __initconst const u64 snb_hw_cache_event_ids
#define HSW_ANY_RESPONSE BIT(16)
#define HSW_SUPPLIER_NONE BIT(17)
#define HSW_L3_MISS_LOCAL BIT(22)
+#define BDW_L3_MISS_LOCAL BIT(26)
#define HSW_L3_MISS_REMOTE_HOP0 BIT(27)
#define HSW_L3_MISS_REMOTE_HOP1 BIT(28)
#define HSW_L3_MISS_REMOTE_HOP2P BIT(29)
#define HSW_L3_MISS (HSW_L3_MISS_LOCAL| \
HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
HSW_L3_MISS_REMOTE_HOP2P)
+#define BDW_L3_MISS (BDW_L3_MISS_LOCAL| \
+ HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+ HSW_L3_MISS_REMOTE_HOP2P)
#define HSW_SNOOP_NONE BIT(31)
#define HSW_SNOOP_NOT_NEEDED BIT(32)
#define HSW_SNOOP_MISS BIT(33)
@@ -2759,6 +2772,41 @@ __init int intel_pmu_init(void)
pr_cont("Haswell events, ");
break;
+ case 61: /* 14nm Broadwell Core-M */
+ case 86: /* 14nm Broadwell Xeon D */
+ x86_pmu.late_ack = true;
+ memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+
+ /* L3_MISS_LOCAL_DRAM is BIT(26) in Broadwell */
+ hw_cache_extra_regs[C(LL)][C(OP_READ)][C(RESULT_MISS)] = HSW_DEMAND_DATA_RD |
+ BDW_L3_MISS|HSW_ANY_SNOOP|
+ HSW_SUPPLIER_NONE;
+ hw_cache_extra_regs[C(LL)][C(OP_WRITE)][C(RESULT_MISS)] = HSW_ALL_RFO|BDW_L3_MISS|
+ HSW_ANY_SNOOP|HSW_SUPPLIER_NONE;
+ hw_cache_extra_regs[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = HSW_DEMAND_DATA_RD|
+ BDW_L3_MISS_LOCAL|HSW_SUPPLIER_NONE|
+ HSW_ANY_SNOOP;
+ hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_RFO|
+ BDW_L3_MISS_LOCAL|HSW_SUPPLIER_NONE|
+ HSW_ANY_SNOOP;
+
+ intel_pmu_lbr_init_snb();
+
+ x86_pmu.event_constraints = intel_bdw_event_constraints;
+ x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_snbep_extra_regs;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+ /* all extra regs are per-cpu when HT is on */
+ x86_pmu.er_flags |= ERF_HAS_RSP_1;
+ x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+
+ x86_pmu.hw_config = hsw_hw_config;
+ x86_pmu.get_event_constraints = hsw_get_event_constraints;
+ x86_pmu.cpu_events = hsw_events_attrs;
+ pr_cont("Broadwell events, ");
+ break;
+
default:
switch (x86_pmu.version) {
case 1:
--
1.9.3
* [PATCH 1/3] perf, x86: Add new cache events table for Haswell
@ 2015-02-11 0:40 Andi Kleen
2015-02-11 0:40 ` [PATCH 2/3] perf, x86: Add Broadwell core support Andi Kleen
0 siblings, 1 reply; 9+ messages in thread
From: Andi Kleen @ 2015-02-11 0:40 UTC
To: peterz; +Cc: linux-kernel, kan.liang, Andi Kleen
From: Andi Kleen <ak@linux.intel.com>
Haswell offcore events are quite different from Sandy Bridge.
Add a new table to handle Haswell properly.
Note that the offcore bits listed in the SDM are not quite correct
(this is currently being fixed). An up-to-date list of bits is
in the patch.
The basic setup is similar to Sandy Bridge. The prefetch columns
have been removed, as prefetch counting is not very reliable
on Haswell. One L1 event that is no longer in the event list
has also been removed.
- data reads do not include code reads (comparable to earlier Sandy
Bridge tables)
- data counts include speculative execution (except L1 write, dtlb, bpu)
- remote node access includes remote memory, remote cache, and remote mmio.
- prefetches are not included in the counts for consistency
(different from Sandy Bridge, which includes prefetches in the remote node)
The events with additional caveats have references to the specification update.
v2: Change name of variable.
v3: Based on older Broadwell patch, but now just for Haswell.
Various fixes with events being more similar to the older
Sandy Bridge tables.
v4: Add aliases. Drop two PF bits.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
arch/x86/kernel/cpu/perf_event_intel.c | 197 ++++++++++++++++++++++++++++++++-
1 file changed, 195 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 498b6d9..02ab31d 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -415,6 +415,199 @@ static __initconst const u64 snb_hw_cache_event_ids
};
+/*
+ * Notes on the events:
+ * - data reads do not include code reads (comparable to earlier tables)
+ * - data counts include speculative execution (except L1 write, dtlb, bpu)
+ * - remote node access includes remote memory, remote cache, remote mmio.
+ * - prefetches are not included in the counts because they are not
+ * reliably counted.
+ * The events with additional caveats have references to the specification update.
+ */
+
+#define HSW_DEMAND_DATA_RD BIT_ULL(0)
+#define HSW_DEMAND_RFO BIT_ULL(1)
+#define HSW_ANY_RESPONSE BIT_ULL(16)
+#define HSW_SUPPLIER_NONE BIT_ULL(17)
+#define HSW_L3_MISS_LOCAL_DRAM BIT_ULL(22)
+#define HSW_L3_MISS_REMOTE_HOP0 BIT_ULL(27)
+#define HSW_L3_MISS_REMOTE_HOP1 BIT_ULL(28)
+#define HSW_L3_MISS_REMOTE_HOP2P BIT_ULL(29)
+#define HSW_L3_MISS (HSW_L3_MISS_LOCAL_DRAM| \
+ HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+ HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_SNOOP_NONE BIT_ULL(31)
+#define HSW_SNOOP_NOT_NEEDED BIT_ULL(32)
+#define HSW_SNOOP_MISS BIT_ULL(33)
+#define HSW_SNOOP_HIT_NO_FWD BIT_ULL(34)
+#define HSW_SNOOP_HIT_WITH_FWD BIT_ULL(35)
+#define HSW_SNOOP_HITM BIT_ULL(36)
+#define HSW_SNOOP_NON_DRAM BIT_ULL(37)
+#define HSW_ANY_SNOOP (HSW_SNOOP_NONE| \
+ HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \
+ HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \
+ HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM)
+#define HSW_DEMAND_READ HSW_DEMAND_DATA_RD
+#define HSW_DEMAND_WRITE HSW_DEMAND_RFO
+#define HSW_L3_MISS_REMOTE (HSW_L3_MISS_REMOTE_HOP0|\
+ HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P)
+
+static __initconst const u64 hsw_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS, HSM30 */
+ [ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES, HSM30 */
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x280, /* ICACHE.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS, HSM30 */
+ [ C(RESULT_MISS) ] = 0x108, /* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES, HSM30 */
+ [ C(RESULT_MISS) ] = 0x149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x6085, /* ITLB_MISSES.STLB_HIT */
+ [ C(RESULT_MISS) ] = 0x185, /* ITLB_MISSES.MISS_CAUSES_A_WALK */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */
+ [ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+};
+
+static __initconst const u64 hsw_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+ HSW_ANY_RESPONSE|HSW_ANY_SNOOP|
+ HSW_SUPPLIER_NONE,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_READ|
+ HSW_L3_MISS|HSW_ANY_SNOOP|
+ HSW_SUPPLIER_NONE,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+ HSW_ANY_RESPONSE|HSW_ANY_SNOOP|
+ HSW_SUPPLIER_NONE,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_WRITE|
+ HSW_L3_MISS|HSW_ANY_SNOOP|
+ HSW_SUPPLIER_NONE,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+ HSW_L3_MISS_LOCAL_DRAM|HSW_SUPPLIER_NONE|
+ HSW_ANY_SNOOP,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_READ|
+ HSW_L3_MISS_REMOTE|HSW_SUPPLIER_NONE|
+ HSW_ANY_SNOOP,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+ HSW_L3_MISS_LOCAL_DRAM|HSW_SUPPLIER_NONE|
+ HSW_ANY_SNOOP,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_WRITE|
+ HSW_L3_MISS_REMOTE|HSW_SUPPLIER_NONE|
+ HSW_ANY_SNOOP,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+};
+
static __initconst const u64 westmere_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -2546,8 +2739,8 @@ __init int intel_pmu_init(void)
case 69: /* 22nm Haswell ULT */
case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
x86_pmu.late_ack = true;
- memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids));
- memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+ memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
intel_pmu_lbr_init_snb();
--
1.9.3
* [PATCH 2/3] perf, x86: Add Broadwell core support
2015-02-11 0:40 [PATCH 1/3] perf, x86: Add new cache events table for Haswell Andi Kleen
@ 2015-02-11 0:40 ` Andi Kleen
0 siblings, 0 replies; 9+ messages in thread
From: Andi Kleen @ 2015-02-11 0:40 UTC
To: peterz; +Cc: linux-kernel, kan.liang, Andi Kleen
From: Andi Kleen <ak@linux.intel.com>
Add Broadwell core support to perf.
The basic support is very similar to Haswell. We use the new cache
event list added for Haswell earlier. The only differences
are a few bits related to remote nodes. To avoid an extra,
mostly identical, table, these are patched up in the initialization code.
The constraint list has one new event that needs to be handled compared to Haswell.
Includes code and testing from Kan Liang.
v2: Remove unnamed model numbers.
v3: Rename cache event list to hsw_*. Change names.
v4: Use symbolic names for cache events. Improve comments and description.
Fix sparse warnings (Fengguang Wu)
Add Xeon D model number.
Remove cache event table (in separate patch)
Patch up remote node differences (Kan Liang)
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
arch/x86/kernel/cpu/perf_event_intel.c | 50 ++++++++++++++++++++++++++++++++++
1 file changed, 50 insertions(+)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 02ab31d..035f8d1 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -220,6 +220,15 @@ static struct event_constraint intel_hsw_event_constraints[] = {
EVENT_CONSTRAINT_END
};
+struct event_constraint intel_bdw_event_constraints[] = {
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */
+ INTEL_EVENT_CONSTRAINT(0xa3, 0x4), /* CYCLE_ACTIVITY.* */
+ EVENT_CONSTRAINT_END
+};
+
static u64 intel_pmu_event_map(int hw_event)
{
return intel_perfmon_event_map[hw_event];
@@ -452,6 +461,12 @@ static __initconst const u64 snb_hw_cache_event_ids
#define HSW_L3_MISS_REMOTE (HSW_L3_MISS_REMOTE_HOP0|\
HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P)
+#define BDW_L3_MISS_LOCAL BIT(26)
+#define BDW_L3_MISS (BDW_L3_MISS_LOCAL| \
+ HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+ HSW_L3_MISS_REMOTE_HOP2P)
+
+
static __initconst const u64 hsw_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -2759,6 +2774,41 @@ __init int intel_pmu_init(void)
pr_cont("Haswell events, ");
break;
+ case 61: /* 14nm Broadwell Core-M */
+ case 86: /* 14nm Broadwell Xeon D */
+ x86_pmu.late_ack = true;
+ memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+
+ /* L3_MISS_LOCAL_DRAM is BIT(26) in Broadwell */
+ hw_cache_extra_regs[C(LL)][C(OP_READ)][C(RESULT_MISS)] = HSW_DEMAND_READ |
+ BDW_L3_MISS|HSW_ANY_SNOOP|
+ HSW_SUPPLIER_NONE;
+ hw_cache_extra_regs[C(LL)][C(OP_WRITE)][C(RESULT_MISS)] = HSW_DEMAND_WRITE|BDW_L3_MISS|
+ HSW_ANY_SNOOP|HSW_SUPPLIER_NONE;
+ hw_cache_extra_regs[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = HSW_DEMAND_READ|
+ BDW_L3_MISS_LOCAL|HSW_SUPPLIER_NONE|
+ HSW_ANY_SNOOP;
+ hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_WRITE|
+ BDW_L3_MISS_LOCAL|HSW_SUPPLIER_NONE|
+ HSW_ANY_SNOOP;
+
+ intel_pmu_lbr_init_snb();
+
+ x86_pmu.event_constraints = intel_bdw_event_constraints;
+ x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_snbep_extra_regs;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+ /* all extra regs are per-cpu when HT is on */
+ x86_pmu.er_flags |= ERF_HAS_RSP_1;
+ x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+
+ x86_pmu.hw_config = hsw_hw_config;
+ x86_pmu.get_event_constraints = hsw_get_event_constraints;
+ x86_pmu.cpu_events = hsw_events_attrs;
+ pr_cont("Broadwell events, ");
+ break;
+
default:
switch (x86_pmu.version) {
case 1:
--
1.9.3
* [PATCH 1/3] perf, x86: Add new cache events table for Haswell
@ 2015-02-18 2:18 Andi Kleen
2015-02-18 2:18 ` [PATCH 2/3] perf, x86: Add Broadwell core support Andi Kleen
0 siblings, 1 reply; 9+ messages in thread
From: Andi Kleen @ 2015-02-18 2:18 UTC
To: peterz; +Cc: linux-kernel, Andi Kleen
From: Andi Kleen <ak@linux.intel.com>
Haswell offcore events are quite different from Sandy Bridge.
Add a new table to handle Haswell properly.
Note that the offcore bits listed in the SDM are not quite correct
(this is currently being fixed). An up-to-date list of bits is
in the patch.
The basic setup is similar to Sandy Bridge. The prefetch columns
have been removed, as prefetch counting is not very reliable
on Haswell. One L1 event that is no longer in the event list
has also been removed.
- data reads do not include code reads (comparable to earlier Sandy
Bridge tables)
- data counts include speculative execution (except L1 write, dtlb, bpu)
- remote node access includes remote memory, remote cache, and remote mmio.
- prefetches are not included in the counts for consistency
(different from Sandy Bridge, which includes prefetches in the remote node)
The events with additional caveats have references to the specification update.
v2: Change name of variable.
v3: Based on older Broadwell patch, but now just for Haswell.
Various fixes with events being more similar to the older
Sandy Bridge tables.
v4: Add aliases. Drop two PF bits.
v5: Don't use SUPPLIER_NONE. Remote node is DRAM only. More aliases.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
arch/x86/kernel/cpu/perf_event_intel.c | 195 ++++++++++++++++++++++++++++++++-
1 file changed, 193 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 498b6d9..5cd7e2b 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -415,6 +415,197 @@ static __initconst const u64 snb_hw_cache_event_ids
};
+/*
+ * Notes on the events:
+ * - data reads do not include code reads (comparable to earlier tables)
+ * - data counts include speculative execution (except L1 write, dtlb, bpu)
+ * - remote node access includes remote memory, remote cache, remote mmio.
+ * - prefetches are not included in the counts because they are not
+ * reliably counted.
+ * The events with additional caveats have references to the specification update.
+ */
+
+#define HSW_DEMAND_DATA_RD BIT_ULL(0)
+#define HSW_DEMAND_RFO BIT_ULL(1)
+#define HSW_ANY_RESPONSE BIT_ULL(16)
+#define HSW_SUPPLIER_NONE BIT_ULL(17)
+#define HSW_L3_MISS_LOCAL_DRAM BIT_ULL(22)
+#define HSW_L3_MISS_REMOTE_HOP0 BIT_ULL(27)
+#define HSW_L3_MISS_REMOTE_HOP1 BIT_ULL(28)
+#define HSW_L3_MISS_REMOTE_HOP2P BIT_ULL(29)
+#define HSW_L3_MISS (HSW_L3_MISS_LOCAL_DRAM| \
+ HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+ HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_SNOOP_NONE BIT_ULL(31)
+#define HSW_SNOOP_NOT_NEEDED BIT_ULL(32)
+#define HSW_SNOOP_MISS BIT_ULL(33)
+#define HSW_SNOOP_HIT_NO_FWD BIT_ULL(34)
+#define HSW_SNOOP_HIT_WITH_FWD BIT_ULL(35)
+#define HSW_SNOOP_HITM BIT_ULL(36)
+#define HSW_SNOOP_NON_DRAM BIT_ULL(37)
+#define HSW_ANY_SNOOP (HSW_SNOOP_NONE| \
+ HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \
+ HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \
+ HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM)
+#define HSW_SNOOP_DRAM (HSW_ANY_SNOOP & ~HSW_SNOOP_NON_DRAM)
+#define HSW_DEMAND_READ HSW_DEMAND_DATA_RD
+#define HSW_DEMAND_WRITE HSW_DEMAND_RFO
+#define HSW_L3_MISS_REMOTE (HSW_L3_MISS_REMOTE_HOP0|\
+ HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_LLC_ACCESS HSW_ANY_RESPONSE
+
+static __initconst const u64 hsw_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS, HSM30 */
+ [ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES, HSM30 */
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x280, /* ICACHE.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS, HSM30 */
+ [ C(RESULT_MISS) ] = 0x108, /* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES, HSM30 */
+ [ C(RESULT_MISS) ] = 0x149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x6085, /* ITLB_MISSES.STLB_HIT */
+ [ C(RESULT_MISS) ] = 0x185, /* ITLB_MISSES.MISS_CAUSES_A_WALK */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */
+ [ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+};
+
+static __initconst const u64 hsw_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+ HSW_LLC_ACCESS,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_READ|
+ HSW_L3_MISS|HSW_ANY_SNOOP,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+ HSW_LLC_ACCESS,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_WRITE|
+ HSW_L3_MISS|HSW_ANY_SNOOP,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+ HSW_L3_MISS_LOCAL_DRAM|
+ HSW_SNOOP_DRAM,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_READ|
+ HSW_L3_MISS_REMOTE|
+ HSW_SNOOP_DRAM,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+ HSW_L3_MISS_LOCAL_DRAM|
+ HSW_SNOOP_DRAM,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_WRITE|
+ HSW_L3_MISS_REMOTE|
+ HSW_SNOOP_DRAM,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+};
+
static __initconst const u64 westmere_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -2546,8 +2737,8 @@ __init int intel_pmu_init(void)
case 69: /* 22nm Haswell ULT */
case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
x86_pmu.late_ack = true;
- memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids));
- memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+ memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
intel_pmu_lbr_init_snb();
--
1.9.3
* [PATCH 2/3] perf, x86: Add Broadwell core support
2015-02-18 2:18 [PATCH 1/3] perf, x86: Add new cache events table for Haswell Andi Kleen
@ 2015-02-18 2:18 ` Andi Kleen
0 siblings, 0 replies; 9+ messages in thread
From: Andi Kleen @ 2015-02-18 2:18 UTC
To: peterz; +Cc: linux-kernel, Andi Kleen
From: Andi Kleen <ak@linux.intel.com>
Add Broadwell core support to perf.
The basic support is very similar to Haswell. We use the new cache
event list added for Haswell earlier. The only differences
are a few bits related to remote nodes. To avoid an extra,
mostly identical, table, these are patched up in the initialization code.
The constraint list has one new event that needs to be handled compared to Haswell.
Includes code and testing from Kan Liang.
v2: Remove unnamed model numbers.
v3: Rename cache event list to hsw_*. Change names.
v4: Use symbolic names for cache events. Improve comments and description.
Fix sparse warnings (Fengguang Wu)
Add Xeon D model number.
Remove cache event table (in separate patch)
Patch up remote node differences (Kan Liang)
v5: Update offcore bits
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
arch/x86/kernel/cpu/perf_event_intel.c | 47 ++++++++++++++++++++++++++++++++++
1 file changed, 47 insertions(+)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 5cd7e2b..259e06b 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -220,6 +220,15 @@ static struct event_constraint intel_hsw_event_constraints[] = {
EVENT_CONSTRAINT_END
};
+struct event_constraint intel_bdw_event_constraints[] = {
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */
+ INTEL_EVENT_CONSTRAINT(0xa3, 0x4), /* CYCLE_ACTIVITY.* */
+ EVENT_CONSTRAINT_END
+};
+
static u64 intel_pmu_event_map(int hw_event)
{
return intel_perfmon_event_map[hw_event];
@@ -454,6 +463,12 @@ static __initconst const u64 snb_hw_cache_event_ids
HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P)
#define HSW_LLC_ACCESS HSW_ANY_RESPONSE
+#define BDW_L3_MISS_LOCAL BIT(26)
+#define BDW_L3_MISS (BDW_L3_MISS_LOCAL| \
+ HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+ HSW_L3_MISS_REMOTE_HOP2P)
+
+
static __initconst const u64 hsw_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -2757,6 +2772,38 @@ __init int intel_pmu_init(void)
pr_cont("Haswell events, ");
break;
+ case 61: /* 14nm Broadwell Core-M */
+ case 86: /* 14nm Broadwell Xeon D */
+ x86_pmu.late_ack = true;
+ memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+
+ /* L3_MISS_LOCAL_DRAM is BIT(26) in Broadwell */
+ hw_cache_extra_regs[C(LL)][C(OP_READ)][C(RESULT_MISS)] = HSW_DEMAND_READ |
+ BDW_L3_MISS|HSW_SNOOP_DRAM;
+ hw_cache_extra_regs[C(LL)][C(OP_WRITE)][C(RESULT_MISS)] = HSW_DEMAND_WRITE|BDW_L3_MISS|
+ HSW_SNOOP_DRAM;
+ hw_cache_extra_regs[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = HSW_DEMAND_READ|
+ BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
+ hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_WRITE|
+ BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
+
+ intel_pmu_lbr_init_snb();
+
+ x86_pmu.event_constraints = intel_bdw_event_constraints;
+ x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_snbep_extra_regs;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+ /* all extra regs are per-cpu when HT is on */
+ x86_pmu.er_flags |= ERF_HAS_RSP_1;
+ x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+
+ x86_pmu.hw_config = hsw_hw_config;
+ x86_pmu.get_event_constraints = hsw_get_event_constraints;
+ x86_pmu.cpu_events = hsw_events_attrs;
+ pr_cont("Broadwell events, ");
+ break;
+
default:
switch (x86_pmu.version) {
case 1:
--
1.9.3