[PATCH 7/7] perf: Add load latency monitoring on Intel Nehalem/Westmere v2

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH 7/7] perf: Add load latency monitoring on Intel Nehalem/Westmere v2
@ 2010-12-27 15:39 Lin Ming
  2011-01-04 11:59 ` Peter Zijlstra
  2011-01-05  9:50 ` Peter Zijlstra
  0 siblings, 2 replies; 6+ messages in thread
From: Lin Ming @ 2010-12-27 15:39 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Andi Kleen, Stephane Eranian; +Cc: lkml

v2 change logs:
- generic load latency data source encoding
- sample more load latency data(data address/latency value/data source)
- remove the hw_perf_event::extra_flags 
- increment MAX_EXTRA_REGS to 3


This patch adds load latency monitoring on Intel Nehalem/Westmere.
It's applied on top of tip/master(1f7107c8) and Andi's offcore
patchsets are needed(see PATCH 0-3).

The load latency on Intel Nehalem/Westmere is monitored by event
MEM_INST_RETIRED.LATENCY_ABOVE_THRESHOLD(0x100b). It measures latency
from micro-operation (uop) dispatch to when data is globally observable
(GO).

To monitor load latency, both the PEBS_EN_CTRx and LL_EN_CTRx bits must
be set in IA32_PEBS_ENABLE register. And an extra MSR
MSR_PEBS_LD_LAT_THRESHOLD must be programmed with the desired latency
threshold in core clock cycles. Loads with latencies greater than this
value are counted.

The latency threshold is encoded in the upper 32 bits of
perf_event_attr::config and 'p' modifier must be used to enabel PEBS.

The default latency threshold is 3, as Intel manual says, "The minimum
value that may be programmed in this register is 3 (the minimum
detectable load latency is 4 core clock cycles)."

Usage:

# perf top -e r100b:p

#Monitor load latency > 51 cycles
# perf top -e r510000100b:p

Any comment is appropriated.

Signed-off-by: Lin Ming <ming.m.lin@intel.com>
---
 arch/x86/include/asm/msr-index.h          |    2 +
 arch/x86/include/asm/perf_event.h         |   26 ++++++++++++++++
 arch/x86/kernel/cpu/perf_event.c          |    7 ++++
 arch/x86/kernel/cpu/perf_event_intel.c    |    6 +++-
 arch/x86/kernel/cpu/perf_event_intel_ds.c |   47 +++++++++++++++++++++++++----
 include/linux/perf_event.h                |    8 ++++-
 6 files changed, 87 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index c3c42b1..174127d 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -54,6 +54,8 @@
 #define MSR_IA32_DS_AREA		0x00000600
 #define MSR_IA32_PERF_CAPABILITIES	0x00000345
 
+#define MSR_PEBS_LD_LAT_THRESHOLD	0x000003f6
+
 #define MSR_MTRRfix64K_00000		0x00000250
 #define MSR_MTRRfix16K_80000		0x00000258
 #define MSR_MTRRfix16K_A0000		0x00000259
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index d9d4dae..e01259f 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -75,6 +75,32 @@ union cpuid10_edx {
 	unsigned int full;
 };
 
+/*
+ * Load latency data source encoding
+ */
+
+/* Bits(0-1) {L1, L2, L3, RAM} or {unknown, IO, uncached} */
+#define LD_LAT_L1			0x00
+#define LD_LAT_L2			0x01
+#define LD_LAT_L3			0x02
+#define LD_LAT_RAM			0x03
+#define LD_LAT_UNKNOWN			0x00
+#define LD_LAT_IO			0x01
+#define LD_LAT_UNCACHED			0x02
+
+/* Bits(2-3) {not-used, snoop, local, remote} */
+#define LD_LAT_NOT_USED			(0x00 << 2)
+#define LD_LAT_SNOOP			(0x01 << 2)
+#define LD_LAT_LOCAL			(0x02 << 2)
+#define LD_LAT_REMOTE			(0x03 << 2)
+
+/* Bits(4-5) {modified, exclusive, shared, invalid} */
+#define LD_LAT_MODIFIED			(0x00 << 4)
+#define LD_LAT_EXCLUSIVE		(0x01 << 4)
+#define LD_LAT_SHARED			(0x02 << 4)
+#define LD_LAT_INVALID			(0x03 << 4)
+
+#define LD_LAT_RESERVED			0x3F
 
 /*
  * Fixed-purpose performance events:
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index ed6ff11..713695e 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -208,6 +208,9 @@ struct extra_reg {
 	}
 #define INTEL_EVENT_EXTRA_REG(event, msr, vm, es)	\
 	EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, es)
+#define INTEL_EVENT_EXTRA_REG2(event, msr, vm, es)	\
+	EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT | \
+			ARCH_PERFMON_EVENTSEL_UMASK, vm, es)
 #define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, 0)
 
 union perf_capabilities {
@@ -384,6 +387,10 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
 		if (extra & ~er->valid_mask)
 			return -EINVAL;
 		event->hw.extra_config = extra;
+
+		/* The minimum value that may be programmed into MSR_PEBS_LD_LAT is 3 */
+		if (er->msr == MSR_PEBS_LD_LAT_THRESHOLD && extra < 3)
+			event->hw.extra_config = 3;
 		break;
 	}
 	return 0;
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 0a67425..6df225a 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1,6 +1,6 @@
 #ifdef CONFIG_CPU_SUP_INTEL
 
-#define MAX_EXTRA_REGS 2
+#define MAX_EXTRA_REGS 3
 
 /* 
  * Per register state.
@@ -90,6 +90,8 @@ static struct extra_reg intel_nehalem_extra_regs[] =
 {
 	/* OFFCORE_RESPONSE */
 	INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, 32),
+	/* MEM_INST_RETIRED.LATENCY_ABOVE_THRESHOLD */
+	INTEL_EVENT_EXTRA_REG2(0x100b, MSR_PEBS_LD_LAT_THRESHOLD, 0xffff, 32),
 	EVENT_EXTRA_END
 };
 
@@ -117,6 +119,8 @@ static struct extra_reg intel_westmere_extra_regs[] =
 	INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, 32),
 	/* OFFCORE_RESPONSE_1 */
 	INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff, 32),
+	/* MEM_INST_RETIRED.LATENCY_ABOVE_THRESHOLD */
+	INTEL_EVENT_EXTRA_REG2(0x100b, MSR_PEBS_LD_LAT_THRESHOLD, 0xffff, 32),
 	EVENT_EXTRA_END
 };
 
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index b7dcd9f..06c6107 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -1,5 +1,26 @@
 #ifdef CONFIG_CPU_SUP_INTEL
 
+/* Indexed by Intel load latency data source encoding value */
+
+static u64 intel_load_latency_data[] = {
+	LD_LAT_UNKNOWN,					/* 0x00: Unknown L3 */
+	LD_LAT_L1 | LD_LAT_LOCAL,			/* 0x01: L1-local */
+	LD_LAT_L2 | LD_LAT_SNOOP,			/* 0x02: L2-snoop */
+	LD_LAT_L2 | LD_LAT_LOCAL,			/* 0x03: L2-local */
+	LD_LAT_L3 | LD_LAT_SNOOP | LD_LAT_INVALID,	/* 0x04: L3-snoop, no coherency actions */
+	LD_LAT_L3 | LD_LAT_SNOOP | LD_LAT_SHARED,	/* 0x05: L3-snoop, found no M */
+	LD_LAT_L3 | LD_LAT_SNOOP | LD_LAT_MODIFIED,	/* 0x06: L3-snoop, found M */
+	LD_LAT_RESERVED,				/* 0x07: reserved */
+	LD_LAT_RAM | LD_LAT_SNOOP | LD_LAT_SHARED,	/* 0x08: L3-miss, snoop, shared */
+	LD_LAT_RESERVED,				/* 0x09: reserved */
+	LD_LAT_RAM | LD_LAT_LOCAL | LD_LAT_SHARED,	/* 0x0A: L3-miss, local, shared */
+	LD_LAT_RAM | LD_LAT_REMOTE | LD_LAT_SHARED,	/* 0x0B: L3-miss, remote, shared */
+	LD_LAT_RAM | LD_LAT_LOCAL | LD_LAT_EXCLUSIVE,	/* 0x0C: L3-miss, local, exclusive */
+	LD_LAT_RAM | LD_LAT_REMOTE | LD_LAT_EXCLUSIVE,	/* 0x0D: L3-miss, remote, exclusive */
+	LD_LAT_IO,					/* 0x0E: IO */
+	LD_LAT_UNCACHED,				/* 0x0F: Uncached */
+};
+
 /* The maximal number of PEBS events: */
 #define MAX_PEBS_EVENTS		4
 
@@ -376,6 +397,7 @@ static struct event_constraint intel_core_pebs_events[] = {
 };
 
 static struct event_constraint intel_nehalem_pebs_events[] = {
+	PEBS_EVENT_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.LATENCY_ABOVE_THRESHOLD */
 	PEBS_EVENT_CONSTRAINT(0x00c0, 0xf), /* INSTR_RETIRED.ANY */
 	PEBS_EVENT_CONSTRAINT(0xfec1, 0xf), /* X87_OPS_RETIRED.ANY */
 	PEBS_EVENT_CONSTRAINT(0x00c5, 0xf), /* BR_INST_RETIRED.MISPRED */
@@ -414,6 +436,8 @@ static void intel_pmu_pebs_enable(struct perf_event *event)
 	hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
 
 	cpuc->pebs_enabled |= 1ULL << hwc->idx;
+	if (hwc->extra_reg == MSR_PEBS_LD_LAT_THRESHOLD)
+		cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32);
 	WARN_ON_ONCE(cpuc->enabled);
 
 	if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
@@ -426,6 +450,8 @@ static void intel_pmu_pebs_disable(struct perf_event *event)
 	struct hw_perf_event *hwc = &event->hw;
 
 	cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
+	if (hwc->extra_reg == MSR_PEBS_LD_LAT_THRESHOLD)
+		cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32));
 	if (cpuc->enabled)
 		wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
 
@@ -541,14 +567,10 @@ static int intel_pmu_save_and_restart(struct perf_event *event);
 static void __intel_pmu_pebs_event(struct perf_event *event,
 				   struct pt_regs *iregs, void *__pebs)
 {
-	/*
-	 * We cast to pebs_record_core since that is a subset of
-	 * both formats and we don't use the other fields in this
-	 * routine.
-	 */
-	struct pebs_record_core *pebs = __pebs;
+	struct pebs_record_nhm *pebs = __pebs;
 	struct perf_sample_data data;
 	struct pt_regs regs;
+	u64 sample_type;
 
 	if (!intel_pmu_save_and_restart(event))
 		return;
@@ -556,6 +578,19 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
 	perf_sample_data_init(&data, 0);
 	data.period = event->hw.last_period;
 
+	if (event->hw.extra_reg == MSR_PEBS_LD_LAT_THRESHOLD) {
+		sample_type = event->attr.sample_type;
+
+		if (sample_type & PERF_SAMPLE_ADDR)
+			data.addr = pebs->dla;
+
+		if (sample_type & PERF_SAMPLE_LATENCY)
+			data.latency = pebs->lat;
+
+		if (sample_type & PERF_SAMPLE_LATENCY_DATA)
+			data.latency_data = intel_load_latency_data[pebs->dse];
+	}
+
 	/*
 	 * We use the interrupt regs as a base because the PEBS record
 	 * does not contain a full regs set, specifically it seems to
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index d24d9ab..9428a1b 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -124,9 +124,11 @@ enum perf_event_sample_format {
 	PERF_SAMPLE_CPU				= 1U << 7,
 	PERF_SAMPLE_PERIOD			= 1U << 8,
 	PERF_SAMPLE_STREAM_ID			= 1U << 9,
-	PERF_SAMPLE_RAW				= 1U << 10,
+	PERF_SAMPLE_LATENCY			= 1U << 10,
+	PERF_SAMPLE_LATENCY_DATA		= 1U << 11,
+	PERF_SAMPLE_RAW				= 1U << 12,
 
-	PERF_SAMPLE_MAX = 1U << 11,		/* non-ABI */
+	PERF_SAMPLE_MAX = 1U << 13,		/* non-ABI */
 };
 
 /*
@@ -958,6 +960,8 @@ struct perf_sample_data {
 	}				tid_entry;
 	u64				time;
 	u64				addr;
+	u64				latency;
+	u64				latency_data;
 	u64				id;
 	u64				stream_id;
 	struct {
-- 
1.7.3






^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH 7/7] perf: Add load latency monitoring on Intel Nehalem/Westmere v2
  2010-12-27 15:39 [PATCH 7/7] perf: Add load latency monitoring on Intel Nehalem/Westmere v2 Lin Ming
@ 2011-01-04 11:59 ` Peter Zijlstra
  2011-01-04 14:03   ` Peter Zijlstra
  2011-01-05  7:12   ` Lin Ming
  2011-01-05  9:50 ` Peter Zijlstra
  1 sibling, 2 replies; 6+ messages in thread
From: Peter Zijlstra @ 2011-01-04 11:59 UTC (permalink / raw)
  To: Lin Ming; +Cc: Ingo Molnar, Andi Kleen, Stephane Eranian, lkml

On Mon, 2010-12-27 at 23:39 +0800, Lin Ming wrote:

> +/*
> + * Load latency data source encoding
> + */
> +
> +/* Bits(0-1) {L1, L2, L3, RAM} or {unknown, IO, uncached} */
> +#define LD_LAT_L1			0x00
> +#define LD_LAT_L2			0x01
> +#define LD_LAT_L3			0x02
> +#define LD_LAT_RAM			0x03
> +#define LD_LAT_UNKNOWN			0x00
> +#define LD_LAT_IO			0x01
> +#define LD_LAT_UNCACHED			0x02
> +
> +/* Bits(2-3) {not-used, snoop, local, remote} */
> +#define LD_LAT_NOT_USED			(0x00 << 2)
> +#define LD_LAT_SNOOP			(0x01 << 2)
> +#define LD_LAT_LOCAL			(0x02 << 2)
> +#define LD_LAT_REMOTE			(0x03 << 2)
> +
> +/* Bits(4-5) {modified, exclusive, shared, invalid} */
> +#define LD_LAT_MODIFIED			(0x00 << 4)
> +#define LD_LAT_EXCLUSIVE		(0x01 << 4)
> +#define LD_LAT_SHARED			(0x02 << 4)
> +#define LD_LAT_INVALID			(0x03 << 4)
> +
> +#define LD_LAT_RESERVED			0x3F

Hmm, why is that RESREVED? From what I can see that ends up being:
  RAM-remote-invalid

Which brings us to the NOT_USED thing, from what I can see its the value
that toggles between {L1, L2, L3, RAM} and {unknown, IO, uncached},
that's not NOT_USED.

There's still room for a {reserved} value in that alternative set,
making it: {unknown, IO, uncached, reserved}

Which would make

#define EXTRA_SRC_RESERVED 0x03


> @@ -384,6 +387,10 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
>  		if (extra & ~er->valid_mask)
>  			return -EINVAL;
>  		event->hw.extra_config = extra;
> +
> +		/* The minimum value that may be programmed into MSR_PEBS_LD_LAT is 3 */
> +		if (er->msr == MSR_PEBS_LD_LAT_THRESHOLD && extra < 3)
> +			event->hw.extra_config = 3;
>  		break;
>  	}
>  	return 0;

I'm not really sure about this, should we silently fix up or bail on
invalid values?

> +/* Indexed by Intel load latency data source encoding value */
> +
> +static u64 intel_load_latency_data[] = {
> +	LD_LAT_UNKNOWN,					/* 0x00: Unknown L3 */
> +	LD_LAT_L1 | LD_LAT_LOCAL,			/* 0x01: L1-local */
> +	LD_LAT_L2 | LD_LAT_SNOOP,			/* 0x02: L2-snoop */

I bet Stephane wants to argue more on this ;-)

> +	LD_LAT_L2 | LD_LAT_LOCAL,			/* 0x03: L2-local */
> +	LD_LAT_L3 | LD_LAT_SNOOP | LD_LAT_INVALID,	/* 0x04: L3-snoop, no coherency actions */
> +	LD_LAT_L3 | LD_LAT_SNOOP | LD_LAT_SHARED,	/* 0x05: L3-snoop, found no M */
> +	LD_LAT_L3 | LD_LAT_SNOOP | LD_LAT_MODIFIED,	/* 0x06: L3-snoop, found M */
> +	LD_LAT_RESERVED,				/* 0x07: reserved */
> +	LD_LAT_RAM | LD_LAT_SNOOP | LD_LAT_SHARED,	/* 0x08: L3-miss, snoop, shared */
> +	LD_LAT_RESERVED,				/* 0x09: reserved */
> +	LD_LAT_RAM | LD_LAT_LOCAL | LD_LAT_SHARED,	/* 0x0A: L3-miss, local, shared */
> +	LD_LAT_RAM | LD_LAT_REMOTE | LD_LAT_SHARED,	/* 0x0B: L3-miss, remote, shared */
> +	LD_LAT_RAM | LD_LAT_LOCAL | LD_LAT_EXCLUSIVE,	/* 0x0C: L3-miss, local, exclusive */
> +	LD_LAT_RAM | LD_LAT_REMOTE | LD_LAT_EXCLUSIVE,	/* 0x0D: L3-miss, remote, exclusive */
> +	LD_LAT_IO,					/* 0x0E: IO */
> +	LD_LAT_UNCACHED,				/* 0x0F: Uncached */
> +};


> @@ -541,14 +567,10 @@ static int intel_pmu_save_and_restart(struct perf_event *event);
>  static void __intel_pmu_pebs_event(struct perf_event *event,
>  				   struct pt_regs *iregs, void *__pebs)
>  {
> -	/*
> -	 * We cast to pebs_record_core since that is a subset of
> -	 * both formats and we don't use the other fields in this
> -	 * routine.
> -	 */
> -	struct pebs_record_core *pebs = __pebs;
> +	struct pebs_record_nhm *pebs = __pebs;


So you cast to the biggest and make sure you don't peek past the end of
the actual data.. that might want to have a comment.


> @@ -556,6 +578,19 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
>  	perf_sample_data_init(&data, 0);
>  	data.period = event->hw.last_period;
>  
> +	if (event->hw.extra_reg == MSR_PEBS_LD_LAT_THRESHOLD) {

This is what stops us from peeking past the end of the pebs record,
since core would never end up having the LD_LAT extra_reg.

> +		sample_type = event->attr.sample_type;
> +
> +		if (sample_type & PERF_SAMPLE_ADDR)
> +			data.addr = pebs->dla;
> +
> +		if (sample_type & PERF_SAMPLE_LATENCY)
> +			data.latency = pebs->lat;
> +
> +		if (sample_type & PERF_SAMPLE_LATENCY_DATA)
> +			data.latency_data = intel_load_latency_data[pebs->dse];


> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index d24d9ab..9428a1b 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -124,9 +124,11 @@ enum perf_event_sample_format {
>  	PERF_SAMPLE_CPU				= 1U << 7,
>  	PERF_SAMPLE_PERIOD			= 1U << 8,
>  	PERF_SAMPLE_STREAM_ID			= 1U << 9,
> -	PERF_SAMPLE_RAW				= 1U << 10,
> +	PERF_SAMPLE_LATENCY			= 1U << 10,
> +	PERF_SAMPLE_LATENCY_DATA		= 1U << 11,
> +	PERF_SAMPLE_RAW				= 1U << 12,
>  
> -	PERF_SAMPLE_MAX = 1U << 11,		/* non-ABI */
> +	PERF_SAMPLE_MAX = 1U << 13,		/* non-ABI */
>  };
>  
>  /*
> @@ -958,6 +960,8 @@ struct perf_sample_data {
>  	}				tid_entry;
>  	u64				time;
>  	u64				addr;
> +	u64				latency;
> +	u64				latency_data;
>  	u64				id;
>  	u64				stream_id;
>  	struct {

Right, so I think I want this in 3 patches, one adding the load-latency
extra reg thing and PERF_SAMPLE_ADDR usage, one adding
PERF_SAMPLE_LATENCY and one adding PERF_SAMPLE_EXTRA, I really dislike
the LATENCY_DATA name since that ties the extra data to the latency
thing, which isn't at all true for other platforms.



^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 7/7] perf: Add load latency monitoring on Intel Nehalem/Westmere v2
  2011-01-04 11:59 ` Peter Zijlstra
@ 2011-01-04 14:03   ` Peter Zijlstra
  2011-01-06  7:49     ` Lin Ming
  2011-01-05  7:12   ` Lin Ming
  1 sibling, 1 reply; 6+ messages in thread
From: Peter Zijlstra @ 2011-01-04 14:03 UTC (permalink / raw)
  To: Lin Ming; +Cc: Ingo Molnar, Andi Kleen, Stephane Eranian, lkml

On Tue, 2011-01-04 at 12:59 +0100, Peter Zijlstra wrote:
> 
> Right, so I think I want this in 3 patches, one adding the load-latency
> extra reg thing and PERF_SAMPLE_ADDR usage, one adding
> PERF_SAMPLE_LATENCY and one adding PERF_SAMPLE_EXTRA, I really dislike
> the LATENCY_DATA name since that ties the extra data to the latency
> thing, which isn't at all true for other platforms.

Also, this wants a 4th patch to be fully mergable, we want to have
tools/perf/ support for these things..

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 7/7] perf: Add load latency monitoring on Intel Nehalem/Westmere v2
  2011-01-04 14:03   ` Peter Zijlstra
@ 2011-01-06  7:49     ` Lin Ming
  0 siblings, 0 replies; 6+ messages in thread
From: Lin Ming @ 2011-01-06  7:49 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Ingo Molnar, Andi Kleen, Stephane Eranian, lkml

On Tue, 2011-01-04 at 22:03 +0800, Peter Zijlstra wrote:
> On Tue, 2011-01-04 at 12:59 +0100, Peter Zijlstra wrote:
> > 
> > Right, so I think I want this in 3 patches, one adding the load-latency
> > extra reg thing and PERF_SAMPLE_ADDR usage, one adding
> > PERF_SAMPLE_LATENCY and one adding PERF_SAMPLE_EXTRA, I really dislike
> > the LATENCY_DATA name since that ties the extra data to the latency
> > thing, which isn't at all true for other platforms.
> 
> Also, this wants a 4th patch to be fully mergable, we want to have
> tools/perf/ support for these things..

I am thinking what can be added to perf tools....

Seems the average load latency values for each data source and data
linear address is useful.

We may add these support to perf stat or perf record/report.

Any other idea?

data source			average load latency(cycles)
=========			=============================
Unknown L3			xxx
L1-local			xxx
L2-snoop			xxx
....
....
L3-miss,remote,exclusive        xxx
IO                              xxx
Uncached memory                 xxx

data linear address		avergage load latency(cycles)
===================		=============================
xxxxxxx				xxx
xxxxxxx				xxx
xxxxxxx				xxx
....
....

Thanks,
Lin Ming

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 7/7] perf: Add load latency monitoring on Intel Nehalem/Westmere v2
  2011-01-04 11:59 ` Peter Zijlstra
  2011-01-04 14:03   ` Peter Zijlstra
@ 2011-01-05  7:12   ` Lin Ming
  1 sibling, 0 replies; 6+ messages in thread
From: Lin Ming @ 2011-01-05  7:12 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Ingo Molnar, Andi Kleen, Stephane Eranian, lkml

On Tue, 2011-01-04 at 19:59 +0800, Peter Zijlstra wrote:
> On Mon, 2010-12-27 at 23:39 +0800, Lin Ming wrote:
> 
> > +/*
> > + * Load latency data source encoding
> > + */
> > +
> > +/* Bits(0-1) {L1, L2, L3, RAM} or {unknown, IO, uncached} */
> > +#define LD_LAT_L1			0x00
> > +#define LD_LAT_L2			0x01
> > +#define LD_LAT_L3			0x02
> > +#define LD_LAT_RAM			0x03
> > +#define LD_LAT_UNKNOWN			0x00
> > +#define LD_LAT_IO			0x01
> > +#define LD_LAT_UNCACHED			0x02
> > +
> > +/* Bits(2-3) {not-used, snoop, local, remote} */
> > +#define LD_LAT_NOT_USED			(0x00 << 2)
> > +#define LD_LAT_SNOOP			(0x01 << 2)
> > +#define LD_LAT_LOCAL			(0x02 << 2)
> > +#define LD_LAT_REMOTE			(0x03 << 2)
> > +
> > +/* Bits(4-5) {modified, exclusive, shared, invalid} */
> > +#define LD_LAT_MODIFIED			(0x00 << 4)
> > +#define LD_LAT_EXCLUSIVE		(0x01 << 4)
> > +#define LD_LAT_SHARED			(0x02 << 4)
> > +#define LD_LAT_INVALID			(0x03 << 4)
> > +
> > +#define LD_LAT_RESERVED			0x3F
> 
> Hmm, why is that RESREVED? From what I can see that ends up being:
>   RAM-remote-invalid

My bad, will fix this.

> 
> Which brings us to the NOT_USED thing, from what I can see its the value
> that toggles between {L1, L2, L3, RAM} and {unknown, IO, uncached},
> that's not NOT_USED.

Right,

-#define LD_LAT_NOT_USED                 (0x00 << 2)
+#define LD_LAT_TOGGLE                   (0x00 << 2)

But this name sounds strange, do you have a better name?

> 
> There's still room for a {reserved} value in that alternative set,
> making it: {unknown, IO, uncached, reserved}
> 
> Which would make
> 
> #define EXTRA_SRC_RESERVED 0x03

OK

> 
> 
> > @@ -384,6 +387,10 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
> >  		if (extra & ~er->valid_mask)
> >  			return -EINVAL;
> >  		event->hw.extra_config = extra;
> > +
> > +		/* The minimum value that may be programmed into MSR_PEBS_LD_LAT is 3 */
> > +		if (er->msr == MSR_PEBS_LD_LAT_THRESHOLD && extra < 3)
> > +			event->hw.extra_config = 3;
> >  		break;
> >  	}
> >  	return 0;
> 
> I'm not really sure about this, should we silently fix up or bail on
> invalid values?

Bailing on invalid values makes more sense.

> 
> > +/* Indexed by Intel load latency data source encoding value */
> > +
> > +static u64 intel_load_latency_data[] = {
> > +	LD_LAT_UNKNOWN,					/* 0x00: Unknown L3 */
> > +	LD_LAT_L1 | LD_LAT_LOCAL,			/* 0x01: L1-local */
> > +	LD_LAT_L2 | LD_LAT_SNOOP,			/* 0x02: L2-snoop */
> 
> I bet Stephane wants to argue more on this ;-)

Stephane, what do you think? :)

> 
> > +	LD_LAT_L2 | LD_LAT_LOCAL,			/* 0x03: L2-local */
> > +	LD_LAT_L3 | LD_LAT_SNOOP | LD_LAT_INVALID,	/* 0x04: L3-snoop, no coherency actions */
> > +	LD_LAT_L3 | LD_LAT_SNOOP | LD_LAT_SHARED,	/* 0x05: L3-snoop, found no M */
> > +	LD_LAT_L3 | LD_LAT_SNOOP | LD_LAT_MODIFIED,	/* 0x06: L3-snoop, found M */
> > +	LD_LAT_RESERVED,				/* 0x07: reserved */
> > +	LD_LAT_RAM | LD_LAT_SNOOP | LD_LAT_SHARED,	/* 0x08: L3-miss, snoop, shared */
> > +	LD_LAT_RESERVED,				/* 0x09: reserved */
> > +	LD_LAT_RAM | LD_LAT_LOCAL | LD_LAT_SHARED,	/* 0x0A: L3-miss, local, shared */
> > +	LD_LAT_RAM | LD_LAT_REMOTE | LD_LAT_SHARED,	/* 0x0B: L3-miss, remote, shared */
> > +	LD_LAT_RAM | LD_LAT_LOCAL | LD_LAT_EXCLUSIVE,	/* 0x0C: L3-miss, local, exclusive */
> > +	LD_LAT_RAM | LD_LAT_REMOTE | LD_LAT_EXCLUSIVE,	/* 0x0D: L3-miss, remote, exclusive */
> > +	LD_LAT_IO,					/* 0x0E: IO */
> > +	LD_LAT_UNCACHED,				/* 0x0F: Uncached */
> > +};
> 
> 
> > @@ -541,14 +567,10 @@ static int intel_pmu_save_and_restart(struct perf_event *event);
> >  static void __intel_pmu_pebs_event(struct perf_event *event,
> >  				   struct pt_regs *iregs, void *__pebs)
> >  {
> > -	/*
> > -	 * We cast to pebs_record_core since that is a subset of
> > -	 * both formats and we don't use the other fields in this
> > -	 * routine.
> > -	 */
> > -	struct pebs_record_core *pebs = __pebs;
> > +	struct pebs_record_nhm *pebs = __pebs;
> 
> 
> So you cast to the biggest and make sure you don't peek past the end of
> the actual data.. that might want to have a comment.

Will add a comment.

> 
> 
> > @@ -556,6 +578,19 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
> >  	perf_sample_data_init(&data, 0);
> >  	data.period = event->hw.last_period;
> >  
> > +	if (event->hw.extra_reg == MSR_PEBS_LD_LAT_THRESHOLD) {
> 
> This is what stops us from peeking past the end of the pebs record,
> since core would never end up having the LD_LAT extra_reg.
> 
> > +		sample_type = event->attr.sample_type;
> > +
> > +		if (sample_type & PERF_SAMPLE_ADDR)
> > +			data.addr = pebs->dla;
> > +
> > +		if (sample_type & PERF_SAMPLE_LATENCY)
> > +			data.latency = pebs->lat;
> > +
> > +		if (sample_type & PERF_SAMPLE_LATENCY_DATA)
> > +			data.latency_data = intel_load_latency_data[pebs->dse];
> 
> 
> > diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> > index d24d9ab..9428a1b 100644
> > --- a/include/linux/perf_event.h
> > +++ b/include/linux/perf_event.h
> > @@ -124,9 +124,11 @@ enum perf_event_sample_format {
> >  	PERF_SAMPLE_CPU				= 1U << 7,
> >  	PERF_SAMPLE_PERIOD			= 1U << 8,
> >  	PERF_SAMPLE_STREAM_ID			= 1U << 9,
> > -	PERF_SAMPLE_RAW				= 1U << 10,
> > +	PERF_SAMPLE_LATENCY			= 1U << 10,
> > +	PERF_SAMPLE_LATENCY_DATA		= 1U << 11,
> > +	PERF_SAMPLE_RAW				= 1U << 12,
> >  
> > -	PERF_SAMPLE_MAX = 1U << 11,		/* non-ABI */
> > +	PERF_SAMPLE_MAX = 1U << 13,		/* non-ABI */
> >  };
> >  
> >  /*
> > @@ -958,6 +960,8 @@ struct perf_sample_data {
> >  	}				tid_entry;
> >  	u64				time;
> >  	u64				addr;
> > +	u64				latency;
> > +	u64				latency_data;
> >  	u64				id;
> >  	u64				stream_id;
> >  	struct {
> 
> Right, so I think I want this in 3 patches, one adding the load-latency
> extra reg thing and PERF_SAMPLE_ADDR usage, one adding
> PERF_SAMPLE_LATENCY and one adding PERF_SAMPLE_EXTRA, I really dislike
> the LATENCY_DATA name since that ties the extra data to the latency
> thing, which isn't at all true for other platforms.

OK, will use the EXTRA name and split the patches.

Thanks for review.
Below is the incremental patch. I'll send new version later.

Thanks,

diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index e01259f..54a8bf9 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -79,7 +79,7 @@ union cpuid10_edx {
  * Load latency data source encoding
  */
 
-/* Bits(0-1) {L1, L2, L3, RAM} or {unknown, IO, uncached} */
+/* Bits(0-1) {L1, L2, L3, RAM} or {unknown, IO, uncached, reserved} */
 #define LD_LAT_L1			0x00
 #define LD_LAT_L2			0x01
 #define LD_LAT_L3			0x02
@@ -87,9 +87,10 @@ union cpuid10_edx {
 #define LD_LAT_UNKNOWN			0x00
 #define LD_LAT_IO			0x01
 #define LD_LAT_UNCACHED			0x02
+#define LD_LAT_RESERVED			0x03
 
-/* Bits(2-3) {not-used, snoop, local, remote} */
-#define LD_LAT_NOT_USED			(0x00 << 2)
+/* Bits(2-3) {toggle, snoop, local, remote} */
+#define LD_LAT_TOGGLE			(0x00 << 2)
 #define LD_LAT_SNOOP			(0x01 << 2)
 #define LD_LAT_LOCAL			(0x02 << 2)
 #define LD_LAT_REMOTE			(0x03 << 2)
@@ -100,7 +101,7 @@ union cpuid10_edx {
 #define LD_LAT_SHARED			(0x02 << 4)
 #define LD_LAT_INVALID			(0x03 << 4)
 
-#define LD_LAT_RESERVED			0x3F
+#define EXTRA_SRC_RESERVED		0x03
 
 /*
  * Fixed-purpose performance events:
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 713695e..cf2e12e 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -386,11 +386,12 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
 		extra = config >> er->extra_shift;
 		if (extra & ~er->valid_mask)
 			return -EINVAL;
-		event->hw.extra_config = extra;
 
 		/* The minimum value that may be programmed into MSR_PEBS_LD_LAT is 3 */
 		if (er->msr == MSR_PEBS_LD_LAT_THRESHOLD && extra < 3)
-			event->hw.extra_config = 3;
+			return -EINVAL;
+
+		event->hw.extra_config = extra;
 		break;
 	}
 	return 0;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 06c6107..f555f9f 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -3,7 +3,7 @@
 /* Indexed by Intel load latency data source encoding value */
 
 static u64 intel_load_latency_data[] = {
-	LD_LAT_UNKNOWN,					/* 0x00: Unknown L3 */
+	LD_LAT_UNKNOWN | LD_LAT_TOGGLE,			/* 0x00: Unknown L3 */
 	LD_LAT_L1 | LD_LAT_LOCAL,			/* 0x01: L1-local */
 	LD_LAT_L2 | LD_LAT_SNOOP,			/* 0x02: L2-snoop */
 	LD_LAT_L2 | LD_LAT_LOCAL,			/* 0x03: L2-local */
@@ -17,8 +17,8 @@ static u64 intel_load_latency_data[] = {
 	LD_LAT_RAM | LD_LAT_REMOTE | LD_LAT_SHARED,	/* 0x0B: L3-miss, remote, shared */
 	LD_LAT_RAM | LD_LAT_LOCAL | LD_LAT_EXCLUSIVE,	/* 0x0C: L3-miss, local, exclusive */
 	LD_LAT_RAM | LD_LAT_REMOTE | LD_LAT_EXCLUSIVE,	/* 0x0D: L3-miss, remote, exclusive */
-	LD_LAT_IO,					/* 0x0E: IO */
-	LD_LAT_UNCACHED,				/* 0x0F: Uncached */
+	LD_LAT_IO | LD_LAT_TOGGLE,			/* 0x0E: IO */
+	LD_LAT_UNCACHED | LD_LAT_TOGGLE,		/* 0x0F: Uncached */
 };
 
 /* The maximal number of PEBS events: */
@@ -567,6 +567,10 @@ static int intel_pmu_save_and_restart(struct perf_event *event);
 static void __intel_pmu_pebs_event(struct perf_event *event,
 				   struct pt_regs *iregs, void *__pebs)
 {
+	/*
+	 * We cast to pebs_record_nhm to get the load latency data
+	 * if extra_reg MSR_PEBS_LD_LAT_THRESHOLD used
+	 */
 	struct pebs_record_nhm *pebs = __pebs;
 	struct perf_sample_data data;
 	struct pt_regs regs;
@@ -587,8 +591,8 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
 		if (sample_type & PERF_SAMPLE_LATENCY)
 			data.latency = pebs->lat;
 
-		if (sample_type & PERF_SAMPLE_LATENCY_DATA)
-			data.latency_data = intel_load_latency_data[pebs->dse];
+		if (sample_type & PERF_SAMPLE_EXTRA)
+			data.extra = intel_load_latency_data[pebs->dse];
 	}
 
 	/*
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 9428a1b..0d80e0d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -125,7 +125,7 @@ enum perf_event_sample_format {
 	PERF_SAMPLE_PERIOD			= 1U << 8,
 	PERF_SAMPLE_STREAM_ID			= 1U << 9,
 	PERF_SAMPLE_LATENCY			= 1U << 10,
-	PERF_SAMPLE_LATENCY_DATA		= 1U << 11,
+	PERF_SAMPLE_EXTRA			= 1U << 11,
 	PERF_SAMPLE_RAW				= 1U << 12,
 
 	PERF_SAMPLE_MAX = 1U << 13,		/* non-ABI */
@@ -960,8 +960,6 @@ struct perf_sample_data {
 	}				tid_entry;
 	u64				time;
 	u64				addr;
-	u64				latency;
-	u64				latency_data;
 	u64				id;
 	u64				stream_id;
 	struct {
@@ -969,6 +967,8 @@ struct perf_sample_data {
 		u32	reserved;
 	}				cpu_entry;
 	u64				period;
+	u64				latency;
+	u64				extra;
 	struct perf_callchain_entry	*callchain;
 	struct perf_raw_record		*raw;
 };



^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH 7/7] perf: Add load latency monitoring on Intel Nehalem/Westmere v2
  2010-12-27 15:39 [PATCH 7/7] perf: Add load latency monitoring on Intel Nehalem/Westmere v2 Lin Ming
  2011-01-04 11:59 ` Peter Zijlstra
@ 2011-01-05  9:50 ` Peter Zijlstra
  1 sibling, 0 replies; 6+ messages in thread
From: Peter Zijlstra @ 2011-01-05  9:50 UTC (permalink / raw)
  To: Lin Ming; +Cc: Ingo Molnar, Andi Kleen, Stephane Eranian, lkml

On Mon, 2010-12-27 at 23:39 +0800, Lin Ming wrote:
> +/* Bits(0-1) {L1, L2, L3, RAM} or {unknown, IO, uncached} */
> +#define LD_LAT_L1                      0x00
> +#define LD_LAT_L2                      0x01
> +#define LD_LAT_L3                      0x02
> +#define LD_LAT_RAM                     0x03
> +#define LD_LAT_UNKNOWN                 0x00
> +#define LD_LAT_IO                      0x01
> +#define LD_LAT_UNCACHED                        0x02
> +
> +/* Bits(2-3) {not-used, snoop, local, remote} */
> +#define LD_LAT_NOT_USED                        (0x00 << 2)
> +#define LD_LAT_SNOOP                   (0x01 << 2)
> +#define LD_LAT_LOCAL                   (0x02 << 2)
> +#define LD_LAT_REMOTE                  (0x03 << 2)
> +
> +/* Bits(4-5) {modified, exclusive, shared, invalid} */
> +#define LD_LAT_MODIFIED                        (0x00 << 4)
> +#define LD_LAT_EXCLUSIVE               (0x01 << 4)
> +#define LD_LAT_SHARED                  (0x02 << 4)
> +#define LD_LAT_INVALID                 (0x03 << 4)
> +
> +#define LD_LAT_RESERVED                        0x3F


Also, I guess we need to go actually look at the POWER, IA64 and other
PMU docs to see if this sufficiently covers their data-source
capabilities.

Stephane, do you know of more PMUs with data-source capabilities we need
to include in the audit, and do you happen to have all their docs
readily available or do I need to ask google?

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2011-01-06  7:49 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-12-27 15:39 [PATCH 7/7] perf: Add load latency monitoring on Intel Nehalem/Westmere v2 Lin Ming
2011-01-04 11:59 ` Peter Zijlstra
2011-01-04 14:03   ` Peter Zijlstra
2011-01-06  7:49     ` Lin Ming
2011-01-05  7:12   ` Lin Ming
2011-01-05  9:50 ` Peter Zijlstra

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox