* [RFC][PATCH 2/2] x86,perf: add cache events in p4 PMU
@ 2010-03-18 10:33 Lin Ming
2010-03-18 15:56 ` Ingo Molnar
2010-03-18 17:38 ` [tip:perf/core] perf, x86: Add cache events for the Pentium-4 PMU tip-bot for Lin Ming
0 siblings, 2 replies; 5+ messages in thread
From: Lin Ming @ 2010-03-18 10:33 UTC (permalink / raw)
To: Cyrill Gorcunov, Ingo Molnar, Peter Zijlstra; +Cc: lkml
Add cache events to the P4 PMU.
Move the HT bit setting code from p4_pmu_event_map to p4_hw_config,
so that the cache events get the HT bit set correctly.
Tested on my P4 desktop; the following 6 cache events work:
L1-dcache-load-misses
LLC-load-misses
dTLB-load-misses
dTLB-store-misses
iTLB-loads
iTLB-load-misses
Signed-off-by: Lin Ming <ming.m.lin@intel.com>
---
arch/x86/include/asm/msr-index.h | 2 +
arch/x86/include/asm/perf_event_p4.h | 10 ++
arch/x86/kernel/cpu/perf_event_p4.c | 153 ++++++++++++++++++++++++++++++++--
3 files changed, 159 insertions(+), 6 deletions(-)
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 1cd58cd..aef562c 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -357,6 +357,8 @@
#define MSR_P4_U2L_ESCR0 0x000003b0
#define MSR_P4_U2L_ESCR1 0x000003b1
+#define MSR_P4_PEBS_MATRIX_VERT 0x000003f2
+
/* Intel Core-based CPU performance counters */
#define MSR_CORE_PERF_FIXED_CTR0 0x00000309
#define MSR_CORE_PERF_FIXED_CTR1 0x0000030a
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index 7d3406a..871249c 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -708,4 +708,14 @@ enum P4_EVENTS_ATTR {
P4_MAKE_EVENT_ATTR(P4_INSTR_COMPLETED, BOGUS, 1),
};
+enum {
+ KEY_P4_L1D_OP_READ_RESULT_MISS,
+ KEY_P4_LL_OP_READ_RESULT_MISS,
+ KEY_P4_DTLB_OP_READ_RESULT_MISS,
+ KEY_P4_DTLB_OP_WRITE_RESULT_MISS,
+ KEY_P4_ITLB_OP_READ_RESULT_ACCESS,
+ KEY_P4_ITLB_OP_READ_RESULT_MISS,
+ KEY_P4_UOP_TYPE,
+};
+
#endif /* PERF_EVENT_P4_H */
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index e088010..4513d2b 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -19,6 +19,11 @@ struct p4_event_template {
u64 config; /* packed predefined bits */
int dep; /* upstream dependency event index */
int key; /* index into p4_templates */
+ u64 msr; /*
+ * the high 32 bits set into MSR_IA32_PEBS_ENABLE and
+ * the low 32 bits set into MSR_P4_PEBS_MATRIX_VERT
+ * for cache events
+ */
unsigned int emask; /* ESCR EventMask */
unsigned int escr_msr[2]; /* ESCR MSR for this event */
unsigned int cntr[2]; /* counter index (offset) */
@@ -31,6 +36,67 @@ struct p4_pmu_res {
static DEFINE_PER_CPU(struct p4_pmu_res, p4_pmu_config);
+#define P4_CACHE_EVENT_CONFIG(event, bit) \
+ p4_config_pack_escr(P4_EVENT_UNPACK_EVENT(event) << P4_EVNTSEL_EVENT_SHIFT) | \
+ p4_config_pack_escr((event##_##bit) << P4_EVNTSEL_EVENTMASK_SHIFT) | \
+ p4_config_pack_cccr(P4_EVENT_UNPACK_SELECTOR(event) << P4_CCCR_ESCR_SELECT_SHIFT)
+
+static __initconst u64 p4_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ /* 1stL_cache_load_miss_retired */
+ [ C(RESULT_MISS) ] = P4_CACHE_EVENT_CONFIG(P4_REPLAY_EVENT, NBOGUS)
+ | KEY_P4_L1D_OP_READ_RESULT_MISS,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ /* 2ndL_cache_load_miss_retired */
+ [ C(RESULT_MISS) ] = P4_CACHE_EVENT_CONFIG(P4_REPLAY_EVENT, NBOGUS)
+ | KEY_P4_LL_OP_READ_RESULT_MISS,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ /* DTLB_load_miss_retired */
+ [ C(RESULT_MISS) ] = P4_CACHE_EVENT_CONFIG(P4_REPLAY_EVENT, NBOGUS)
+ | KEY_P4_DTLB_OP_READ_RESULT_MISS,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ /* DTLB_store_miss_retired */
+ [ C(RESULT_MISS) ] = P4_CACHE_EVENT_CONFIG(P4_REPLAY_EVENT, NBOGUS)
+ | KEY_P4_DTLB_OP_WRITE_RESULT_MISS,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ /* ITLB_reference.HIT */
+ [ C(RESULT_ACCESS) ] = P4_CACHE_EVENT_CONFIG(P4_ITLB_REFERENCE, HIT)
+ | KEY_P4_ITLB_OP_READ_RESULT_ACCESS,
+
+ /* ITLB_reference.MISS */
+ [ C(RESULT_MISS) ] = P4_CACHE_EVENT_CONFIG(P4_ITLB_REFERENCE, MISS)
+ | KEY_P4_ITLB_OP_READ_RESULT_MISS,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
/*
* WARN: CCCR1 doesn't have a working enable bit so try to not
* use it if possible
@@ -121,11 +187,77 @@ struct p4_event_template p4_templates[] = {
.escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
.cntr = { 0, 2 },
},
- [7] = {
+ [KEY_P4_L1D_OP_READ_RESULT_MISS] = {
+ .opcode = P4_REPLAY_EVENT,
+ .config = 0,
+ .dep = -1,
+ .msr = (u64)(1 << 0 | 1 << 24) << 32 | (1 << 0),
+ .key = KEY_P4_L1D_OP_READ_RESULT_MISS,
+ .emask =
+ P4_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS),
+ .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR2 },
+ .cntr = { 16, 17 },
+ },
+ [KEY_P4_LL_OP_READ_RESULT_MISS] = {
+ .opcode = P4_REPLAY_EVENT,
+ .config = 0,
+ .dep = -1,
+ .msr = (u64)(1 << 1 | 1 << 24) << 32 | (1 << 0),
+ .key = KEY_P4_LL_OP_READ_RESULT_MISS,
+ .emask =
+ P4_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS),
+ .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR2 },
+ .cntr = { 16, 17 },
+ },
+ [KEY_P4_DTLB_OP_READ_RESULT_MISS] = {
+ .opcode = P4_REPLAY_EVENT,
+ .config = 0,
+ .dep = -1,
+ .msr = (u64)(1 << 2 | 1 << 24) << 32 | (1 << 0),
+ .key = KEY_P4_DTLB_OP_READ_RESULT_MISS,
+ .emask =
+ P4_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS),
+ .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR2 },
+ .cntr = { 16, 17 },
+ },
+ [KEY_P4_DTLB_OP_WRITE_RESULT_MISS] = {
+ .opcode = P4_REPLAY_EVENT,
+ .config = 0,
+ .dep = -1,
+ .msr = (u64)(1 << 2 | 1 << 24) << 32 | (1 << 1),
+ .key = KEY_P4_DTLB_OP_WRITE_RESULT_MISS,
+ .emask =
+ P4_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS),
+ .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR2 },
+ .cntr = { 16, 17 },
+ },
+ [KEY_P4_ITLB_OP_READ_RESULT_ACCESS] = {
+ .opcode = P4_ITLB_REFERENCE,
+ .config = 0,
+ .dep = -1,
+ .msr = 0,
+ .key = KEY_P4_ITLB_OP_READ_RESULT_ACCESS,
+ .emask =
+ P4_EVENT_ATTR(P4_ITLB_REFERENCE, HIT),
+ .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
+ .cntr = { 0, 2 },
+ },
+ [KEY_P4_ITLB_OP_READ_RESULT_MISS] = {
+ .opcode = P4_ITLB_REFERENCE,
+ .config = 0,
+ .dep = -1,
+ .msr = 0,
+ .key = KEY_P4_ITLB_OP_READ_RESULT_MISS,
+ .emask =
+ P4_EVENT_ATTR(P4_ITLB_REFERENCE, MISS),
+ .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
+ .cntr = { 0, 2 },
+ },
+ [KEY_P4_UOP_TYPE] = {
.opcode = P4_UOP_TYPE,
.config = 0,
.dep = -1,
- .key = 7,
+ .key = KEY_P4_UOP_TYPE,
.emask =
P4_EVENT_ATTR(P4_UOP_TYPE, TAGLOADS) |
P4_EVENT_ATTR(P4_UOP_TYPE, TAGSTORES),
@@ -155,10 +287,6 @@ static u64 p4_pmu_event_map(int hw_event)
config |= p4_config_pack_cccr(P4_EVENT_UNPACK_SELECTOR(tpl->opcode) << P4_CCCR_ESCR_SELECT_SHIFT);
config |= p4_config_pack_cccr(hw_event & P4_CCCR_RESERVED);
- /* on HT machine we need a special bit */
- if (p4_ht_active() && p4_ht_thread(raw_smp_processor_id()))
- config = p4_set_ht_bit(config);
-
return config;
}
@@ -211,6 +339,10 @@ static int p4_hw_config(struct perf_event_attr *attr, struct hw_perf_event *hwc)
/* Count user and OS events unless not requested to */
hwc->config |= p4_config_pack_escr(p4_default_escr_conf(cpu, attr->exclude_kernel,
attr->exclude_user));
+ /* on HT machine we need a special bit */
+ if (p4_ht_active() && p4_ht_thread(cpu))
+ hwc->config = p4_set_ht_bit(hwc->config);
+
return 0;
}
@@ -271,6 +403,12 @@ static void p4_pmu_enable_event(struct perf_event *event)
pr_crit("%s: Wrong index: %d\n", __func__, hwc->idx);
return;
}
+
+ if (tpl->msr) {
+ (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, tpl->msr >> 32);
+ (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, tpl->msr & 0xffffffff);
+ }
+
escr_base = (u64)tpl->escr_msr[thread];
/*
@@ -579,6 +717,9 @@ static __init int p4_pmu_init(void)
return -ENODEV;
}
+ memcpy(hw_cache_event_ids, p4_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
pr_cont("Netburst events, ");
x86_pmu = p4_pmu;
^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [RFC][PATCH 2/2] x86,perf: add cache events in p4 PMU
2010-03-18 10:33 [RFC][PATCH 2/2] x86,perf: add cache events in p4 PMU Lin Ming
@ 2010-03-18 15:56 ` Ingo Molnar
2010-03-18 16:01 ` Cyrill Gorcunov
2010-03-18 17:38 ` [tip:perf/core] perf, x86: Add cache events for the Pentium-4 PMU tip-bot for Lin Ming
1 sibling, 1 reply; 5+ messages in thread
From: Ingo Molnar @ 2010-03-18 15:56 UTC (permalink / raw)
To: Lin Ming; +Cc: Cyrill Gorcunov, Peter Zijlstra, lkml
* Lin Ming <ming.m.lin@intel.com> wrote:
> Add cache events in p4 PMU.
>
> Move the HT bit setting code from p4_pmu_event_map to p4_hw_config.
> So the cache events can get HT bit set correctly.
>
> Tested on my P4 desktop, below 6 cache events work.
> L1-dcache-load-misses
> LLC-load-misses
> dTLB-load-misses
> dTLB-store-misses
> iTLB-loads
> iTLB-load-misses
>
> Signed-off-by: Lin Ming <ming.m.lin@intel.com>
> ---
> arch/x86/include/asm/msr-index.h | 2 +
> arch/x86/include/asm/perf_event_p4.h | 10 ++
> arch/x86/kernel/cpu/perf_event_p4.c | 153 ++++++++++++++++++++++++++++++++--
> 3 files changed, 159 insertions(+), 6 deletions(-)
i tried it on a Pentium-D box, and it works pretty well:
rhea:/home/mingo/tip> perf stat -a sleep 1
Performance counter stats for 'sleep 1':
2003.237268 task-clock-msecs # 2.000 CPUs
11 context-switches # 0.000 M/sec
2 CPU-migrations # 0.000 M/sec
174 page-faults # 0.000 M/sec
47361 cycles # 0.024 M/sec (scaled from 52.83%)
430 instructions # 0.009 IPC (scaled from 74.58%)
23873 branches # 0.012 M/sec (scaled from 96.70%)
193 branch-misses # 0.808 % (scaled from 49.64%)
867 cache-references # 0.000 M/sec (scaled from 49.69%)
504 cache-misses # 0.000 M/sec (scaled from 49.58%)
1.001411586 seconds time elapsed
So i've applied your patches. Cyrill, what do you think?
Ingo
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [RFC][PATCH 2/2] x86,perf: add cache events in p4 PMU
2010-03-18 15:56 ` Ingo Molnar
@ 2010-03-18 16:01 ` Cyrill Gorcunov
2010-03-18 20:59 ` Cyrill Gorcunov
0 siblings, 1 reply; 5+ messages in thread
From: Cyrill Gorcunov @ 2010-03-18 16:01 UTC (permalink / raw)
To: Ingo Molnar; +Cc: Lin Ming, Peter Zijlstra, lkml
On Thu, Mar 18, 2010 at 04:56:10PM +0100, Ingo Molnar wrote:
>
> * Lin Ming <ming.m.lin@intel.com> wrote:
>
> > Add cache events in p4 PMU.
> >
> > Move the HT bit setting code from p4_pmu_event_map to p4_hw_config.
> > So the cache events can get HT bit set correctly.
> >
> > Tested on my P4 desktop, below 6 cache events work.
> > L1-dcache-load-misses
> > LLC-load-misses
> > dTLB-load-misses
> > dTLB-store-misses
> > iTLB-loads
> > iTLB-load-misses
> >
> > Signed-off-by: Lin Ming <ming.m.lin@intel.com>
> > ---
> > arch/x86/include/asm/msr-index.h | 2 +
> > arch/x86/include/asm/perf_event_p4.h | 10 ++
> > arch/x86/kernel/cpu/perf_event_p4.c | 153 ++++++++++++++++++++++++++++++++--
> > 3 files changed, 159 insertions(+), 6 deletions(-)
>
> i tried it on a Pentium-D box, and it works pretty well:
>
> rhea:/home/mingo/tip> perf stat -a sleep 1
>
> Performance counter stats for 'sleep 1':
>
> 2003.237268 task-clock-msecs # 2.000 CPUs
> 11 context-switches # 0.000 M/sec
> 2 CPU-migrations # 0.000 M/sec
> 174 page-faults # 0.000 M/sec
> 47361 cycles # 0.024 M/sec (scaled from 52.83%)
> 430 instructions # 0.009 IPC (scaled from 74.58%)
> 23873 branches # 0.012 M/sec (scaled from 96.70%)
> 193 branch-misses # 0.808 % (scaled from 49.64%)
> 867 cache-references # 0.000 M/sec (scaled from 49.69%)
> 504 cache-misses # 0.000 M/sec (scaled from 49.58%)
>
> 1.001411586 seconds time elapsed
>
> So i've applied your patches. Cyrill, what do you think?
Sorry for the slight delay; yes, please pick it up. I found that we have an
issue in the ESCR binding (which is pretty much my error) and hope to fix it
today. I also hope to eventually implement raw events this weekend,
but it is better to base the new code on all this stuff once it is merged. This will
allow us to "take a look" at the code structure from a high level and find
potential caveats.
All-in-one
Reviewed-by: Cyrill Gorcunov <gorcunov@openvz.org>
Thanks a lot, Ming!
>
> Ingo
>
-- Cyrill
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [RFC][PATCH 2/2] x86,perf: add cache events in p4 PMU
2010-03-18 16:01 ` Cyrill Gorcunov
@ 2010-03-18 20:59 ` Cyrill Gorcunov
0 siblings, 0 replies; 5+ messages in thread
From: Cyrill Gorcunov @ 2010-03-18 20:59 UTC (permalink / raw)
To: Ingo Molnar; +Cc: Lin Ming, Peter Zijlstra, lkml
On Thu, Mar 18, 2010 at 07:01:35PM +0300, Cyrill Gorcunov wrote:
[...]
> > So i've applied your patches. Cyrill, what do you think?
>
> Sorry for a bit delay, yes, pick it up please. I found that we have an
> issue in escr binding (which is pretty mine error), hope to fix it up
> today. Also I hope to eventually implement raw events this weekend
> but better to base the new code on all this stuff merged. This will
> allow to "take a look" on code structure from a high point and find
> potential caveats.
>
> All-in-one
>
> Reviewed-by: Cyrill Gorcunov <gorcunov@openvz.org>
>
> Thanks a huge, Ming!
>
> >
> > Ingo
> >
> -- Cyrill
Ok, the issue I was thinking about does not really exist. Misfire :)
-- Cyrill
^ permalink raw reply [flat|nested] 5+ messages in thread
* [tip:perf/core] perf, x86: Add cache events for the Pentium-4 PMU
2010-03-18 10:33 [RFC][PATCH 2/2] x86,perf: add cache events in p4 PMU Lin Ming
2010-03-18 15:56 ` Ingo Molnar
@ 2010-03-18 17:38 ` tip-bot for Lin Ming
1 sibling, 0 replies; 5+ messages in thread
From: tip-bot for Lin Ming @ 2010-03-18 17:38 UTC (permalink / raw)
To: linux-tip-commits
Cc: linux-kernel, hpa, mingo, gorcunov, peterz, ming.m.lin, tglx,
mingo
Commit-ID: cb7d6b5053e86598735d9af19930f5929f007b7f
Gitweb: http://git.kernel.org/tip/cb7d6b5053e86598735d9af19930f5929f007b7f
Author: Lin Ming <ming.m.lin@intel.com>
AuthorDate: Thu, 18 Mar 2010 18:33:12 +0800
Committer: Ingo Molnar <mingo@elte.hu>
CommitDate: Thu, 18 Mar 2010 17:04:02 +0100
perf, x86: Add cache events for the Pentium-4 PMU
Move the HT bit setting code from p4_pmu_event_map to
p4_hw_config, so that the cache events get the HT bit set correctly.
Tested on my P4 desktop; the following 6 cache events work:
L1-dcache-load-misses
LLC-load-misses
dTLB-load-misses
dTLB-store-misses
iTLB-loads
iTLB-load-misses
Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Reviewed-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <1268908392.13901.128.camel@minggr.sh.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
arch/x86/include/asm/msr-index.h | 2 +
arch/x86/include/asm/perf_event_p4.h | 10 ++
arch/x86/kernel/cpu/perf_event_p4.c | 153 ++++++++++++++++++++++++++++++++--
3 files changed, 159 insertions(+), 6 deletions(-)
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 1cd58cd..aef562c 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -357,6 +357,8 @@
#define MSR_P4_U2L_ESCR0 0x000003b0
#define MSR_P4_U2L_ESCR1 0x000003b1
+#define MSR_P4_PEBS_MATRIX_VERT 0x000003f2
+
/* Intel Core-based CPU performance counters */
#define MSR_CORE_PERF_FIXED_CTR0 0x00000309
#define MSR_CORE_PERF_FIXED_CTR1 0x0000030a
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index 7d3406a..871249c 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -708,4 +708,14 @@ enum P4_EVENTS_ATTR {
P4_MAKE_EVENT_ATTR(P4_INSTR_COMPLETED, BOGUS, 1),
};
+enum {
+ KEY_P4_L1D_OP_READ_RESULT_MISS,
+ KEY_P4_LL_OP_READ_RESULT_MISS,
+ KEY_P4_DTLB_OP_READ_RESULT_MISS,
+ KEY_P4_DTLB_OP_WRITE_RESULT_MISS,
+ KEY_P4_ITLB_OP_READ_RESULT_ACCESS,
+ KEY_P4_ITLB_OP_READ_RESULT_MISS,
+ KEY_P4_UOP_TYPE,
+};
+
#endif /* PERF_EVENT_P4_H */
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 3e97ed3..b7bf991 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -19,6 +19,11 @@ struct p4_event_template {
u64 config; /* packed predefined bits */
int dep; /* upstream dependency event index */
int key; /* index into p4_templates */
+ u64 msr; /*
+ * the high 32 bits set into MSR_IA32_PEBS_ENABLE and
+ * the low 32 bits set into MSR_P4_PEBS_MATRIX_VERT
+ * for cache events
+ */
unsigned int emask; /* ESCR EventMask */
unsigned int escr_msr[2]; /* ESCR MSR for this event */
unsigned int cntr[2]; /* counter index (offset) */
@@ -31,6 +36,67 @@ struct p4_pmu_res {
static DEFINE_PER_CPU(struct p4_pmu_res, p4_pmu_config);
+#define P4_CACHE_EVENT_CONFIG(event, bit) \
+ p4_config_pack_escr(P4_EVENT_UNPACK_EVENT(event) << P4_EVNTSEL_EVENT_SHIFT) | \
+ p4_config_pack_escr((event##_##bit) << P4_EVNTSEL_EVENTMASK_SHIFT) | \
+ p4_config_pack_cccr(P4_EVENT_UNPACK_SELECTOR(event) << P4_CCCR_ESCR_SELECT_SHIFT)
+
+static __initconst u64 p4_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ /* 1stL_cache_load_miss_retired */
+ [ C(RESULT_MISS) ] = P4_CACHE_EVENT_CONFIG(P4_REPLAY_EVENT, NBOGUS)
+ | KEY_P4_L1D_OP_READ_RESULT_MISS,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ /* 2ndL_cache_load_miss_retired */
+ [ C(RESULT_MISS) ] = P4_CACHE_EVENT_CONFIG(P4_REPLAY_EVENT, NBOGUS)
+ | KEY_P4_LL_OP_READ_RESULT_MISS,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ /* DTLB_load_miss_retired */
+ [ C(RESULT_MISS) ] = P4_CACHE_EVENT_CONFIG(P4_REPLAY_EVENT, NBOGUS)
+ | KEY_P4_DTLB_OP_READ_RESULT_MISS,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ /* DTLB_store_miss_retired */
+ [ C(RESULT_MISS) ] = P4_CACHE_EVENT_CONFIG(P4_REPLAY_EVENT, NBOGUS)
+ | KEY_P4_DTLB_OP_WRITE_RESULT_MISS,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ /* ITLB_reference.HIT */
+ [ C(RESULT_ACCESS) ] = P4_CACHE_EVENT_CONFIG(P4_ITLB_REFERENCE, HIT)
+ | KEY_P4_ITLB_OP_READ_RESULT_ACCESS,
+
+ /* ITLB_reference.MISS */
+ [ C(RESULT_MISS) ] = P4_CACHE_EVENT_CONFIG(P4_ITLB_REFERENCE, MISS)
+ | KEY_P4_ITLB_OP_READ_RESULT_MISS,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
/*
* WARN: CCCR1 doesn't have a working enable bit so try to not
* use it if possible
@@ -121,11 +187,77 @@ struct p4_event_template p4_templates[] = {
.escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
.cntr = { 0, 2 },
},
- [7] = {
+ [KEY_P4_L1D_OP_READ_RESULT_MISS] = {
+ .opcode = P4_REPLAY_EVENT,
+ .config = 0,
+ .dep = -1,
+ .msr = (u64)(1 << 0 | 1 << 24) << 32 | (1 << 0),
+ .key = KEY_P4_L1D_OP_READ_RESULT_MISS,
+ .emask =
+ P4_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS),
+ .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR2 },
+ .cntr = { 16, 17 },
+ },
+ [KEY_P4_LL_OP_READ_RESULT_MISS] = {
+ .opcode = P4_REPLAY_EVENT,
+ .config = 0,
+ .dep = -1,
+ .msr = (u64)(1 << 1 | 1 << 24) << 32 | (1 << 0),
+ .key = KEY_P4_LL_OP_READ_RESULT_MISS,
+ .emask =
+ P4_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS),
+ .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR2 },
+ .cntr = { 16, 17 },
+ },
+ [KEY_P4_DTLB_OP_READ_RESULT_MISS] = {
+ .opcode = P4_REPLAY_EVENT,
+ .config = 0,
+ .dep = -1,
+ .msr = (u64)(1 << 2 | 1 << 24) << 32 | (1 << 0),
+ .key = KEY_P4_DTLB_OP_READ_RESULT_MISS,
+ .emask =
+ P4_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS),
+ .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR2 },
+ .cntr = { 16, 17 },
+ },
+ [KEY_P4_DTLB_OP_WRITE_RESULT_MISS] = {
+ .opcode = P4_REPLAY_EVENT,
+ .config = 0,
+ .dep = -1,
+ .msr = (u64)(1 << 2 | 1 << 24) << 32 | (1 << 1),
+ .key = KEY_P4_DTLB_OP_WRITE_RESULT_MISS,
+ .emask =
+ P4_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS),
+ .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR2 },
+ .cntr = { 16, 17 },
+ },
+ [KEY_P4_ITLB_OP_READ_RESULT_ACCESS] = {
+ .opcode = P4_ITLB_REFERENCE,
+ .config = 0,
+ .dep = -1,
+ .msr = 0,
+ .key = KEY_P4_ITLB_OP_READ_RESULT_ACCESS,
+ .emask =
+ P4_EVENT_ATTR(P4_ITLB_REFERENCE, HIT),
+ .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
+ .cntr = { 0, 2 },
+ },
+ [KEY_P4_ITLB_OP_READ_RESULT_MISS] = {
+ .opcode = P4_ITLB_REFERENCE,
+ .config = 0,
+ .dep = -1,
+ .msr = 0,
+ .key = KEY_P4_ITLB_OP_READ_RESULT_MISS,
+ .emask =
+ P4_EVENT_ATTR(P4_ITLB_REFERENCE, MISS),
+ .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
+ .cntr = { 0, 2 },
+ },
+ [KEY_P4_UOP_TYPE] = {
.opcode = P4_UOP_TYPE,
.config = 0,
.dep = -1,
- .key = 7,
+ .key = KEY_P4_UOP_TYPE,
.emask =
P4_EVENT_ATTR(P4_UOP_TYPE, TAGLOADS) |
P4_EVENT_ATTR(P4_UOP_TYPE, TAGSTORES),
@@ -155,10 +287,6 @@ static u64 p4_pmu_event_map(int hw_event)
config |= p4_config_pack_cccr(P4_EVENT_UNPACK_SELECTOR(tpl->opcode) << P4_CCCR_ESCR_SELECT_SHIFT);
config |= p4_config_pack_cccr(hw_event & P4_CCCR_RESERVED);
- /* on HT machine we need a special bit */
- if (p4_ht_active() && p4_ht_thread(raw_smp_processor_id()))
- config = p4_set_ht_bit(config);
-
return config;
}
@@ -211,6 +339,10 @@ static int p4_hw_config(struct perf_event_attr *attr, struct hw_perf_event *hwc)
/* Count user and OS events unless not requested to */
hwc->config |= p4_config_pack_escr(p4_default_escr_conf(cpu, attr->exclude_kernel,
attr->exclude_user));
+ /* on HT machine we need a special bit */
+ if (p4_ht_active() && p4_ht_thread(cpu))
+ hwc->config = p4_set_ht_bit(hwc->config);
+
return 0;
}
@@ -271,6 +403,12 @@ static void p4_pmu_enable_event(struct perf_event *event)
pr_crit("%s: Wrong index: %d\n", __func__, hwc->idx);
return;
}
+
+ if (tpl->msr) {
+ (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, tpl->msr >> 32);
+ (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, tpl->msr & 0xffffffff);
+ }
+
escr_base = (u64)tpl->escr_msr[thread];
/*
@@ -577,6 +715,9 @@ static __init int p4_pmu_init(void)
return -ENODEV;
}
+ memcpy(hw_cache_event_ids, p4_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
pr_cont("Netburst events, ");
x86_pmu = p4_pmu;
^ permalink raw reply related [flat|nested] 5+ messages in thread
end of thread, other threads:[~2010-03-18 20:59 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2010-03-18 10:33 [RFC][PATCH 2/2] x86,perf: add cache events in p4 PMU Lin Ming
2010-03-18 15:56 ` Ingo Molnar
2010-03-18 16:01 ` Cyrill Gorcunov
2010-03-18 20:59 ` Cyrill Gorcunov
2010-03-18 17:38 ` [tip:perf/core] perf, x86: Add cache events for the Pentium-4 PMU tip-bot for Lin Ming
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.