linux-arm-kernel.lists.infradead.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/5] arm: provide a mechanism to reserve performance counters
  2009-12-14 14:04 ARMv6 performance counters v2 Jamie Iles
@ 2009-12-14 14:04 ` Jamie Iles
  2009-12-14 14:39   ` Will Deacon
  2009-12-14 16:01   ` Jean Pihet
  0 siblings, 2 replies; 55+ messages in thread
From: Jamie Iles @ 2009-12-14 14:04 UTC (permalink / raw)
  To: linux-arm-kernel

To add support for perf events and to allow the hardware
counters to be shared with oprofile, we need a way to reserve
access to the pmu (performance monitor unit).

Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Jamie Iles <jamie.iles@picochip.com>
---
 arch/arm/include/asm/pmu.h |   76 +++++++++++++++++++++++++++++++
 arch/arm/kernel/Makefile   |    1 +
 arch/arm/kernel/pmu.c      |  108 ++++++++++++++++++++++++++++++++++++++++++++
 arch/arm/mm/Kconfig        |    6 +++
 4 files changed, 191 insertions(+), 0 deletions(-)
 create mode 100644 arch/arm/include/asm/pmu.h
 create mode 100644 arch/arm/kernel/pmu.c

diff --git a/arch/arm/include/asm/pmu.h b/arch/arm/include/asm/pmu.h
new file mode 100644
index 0000000..d66a7cd
--- /dev/null
+++ b/arch/arm/include/asm/pmu.h
@@ -0,0 +1,76 @@
+/*
+ *  linux/arch/arm/include/asm/pmu.h
+ *
+ *  Copyright (C) 2009 picoChip Designs Ltd, Jamie Iles
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#ifndef __ARM_PMU_H__
+#define __ARM_PMU_H__
+
+#ifdef CONFIG_CPU_HAS_PMU
+
+#define MAX_PMU_IRQS	    8
+
+struct pmu_irqs {
+	int	    irqs[MAX_PMU_IRQS];
+	unsigned    num_irqs;
+};
+
+/**
+ * reserve_pmu() - reserve the hardware performance counters
+ *
+ * Reserve the hardware performance counters in the system for exclusive use.
+ * The 'struct pmu_irqs' for the system is returned on success, ERR_PTR()
+ * encoded error on failure.
+ */
+extern const struct pmu_irqs *
+reserve_pmu(void);
+
+/**
+ * release_pmu() - Relinquish control of the performance counters
+ *
+ * Release the performance counters and allow someone else to use them.
+ * Callers must have disabled the counters and released IRQs before calling
+ * this. The 'struct pmu_irqs' returned from reserve_pmu() must be passed as
+ * a cookie.
+ */
+extern void
+release_pmu(const struct pmu_irqs *irqs);
+
+/**
+ * init_pmu() - Initialise the PMU.
+ *
+ * Initialise the system ready for PMU enabling. This should typically set the
+ * IRQ affinity and nothing else. The users (oprofile/perf events etc) will do
+ * the actual hardware initialisation.
+ */
+extern int
+init_pmu(void);
+
+#else /* CONFIG_CPU_HAS_PMU */
+
+static inline const struct pmu_irqs *
+reserve_pmu(void)
+{
+	ERR_PTR(-ENODEV);
+}
+
+static inline void
+release_pmu(const struct pmu_irqs *irqs)
+{
+}
+
+static inline int
+init_pmu(void)
+{
+	return -ENODEV;
+}
+
+#endif /* CONFIG_CPU_HAS_PMU */
+
+#endif /* __ARM_PMU_H__ */
diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
index e7ccf7e..286a276 100644
--- a/arch/arm/kernel/Makefile
+++ b/arch/arm/kernel/Makefile
@@ -46,6 +46,7 @@ obj-$(CONFIG_CPU_XSCALE)	+= xscale-cp0.o
 obj-$(CONFIG_CPU_XSC3)		+= xscale-cp0.o
 obj-$(CONFIG_CPU_MOHAWK)	+= xscale-cp0.o
 obj-$(CONFIG_IWMMXT)		+= iwmmxt.o
+obj-$(CONFIG_CPU_HAS_PMU)	+= pmu.o
 AFLAGS_iwmmxt.o			:= -Wa,-mcpu=iwmmxt
 
 ifneq ($(CONFIG_ARCH_EBSA110),y)
diff --git a/arch/arm/kernel/pmu.c b/arch/arm/kernel/pmu.c
new file mode 100644
index 0000000..881e526
--- /dev/null
+++ b/arch/arm/kernel/pmu.c
@@ -0,0 +1,108 @@
+/*
+ *  linux/arch/arm/kernel/pmu.c
+ *
+ *  Copyright (C) 2009 picoChip Designs Ltd, Jamie Iles
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/semaphore.h>
+#include <linux/err.h>
+#include <linux/irq.h>
+
+#include <asm/pmu.h>
+#include <asm/irq.h>
+
+/*
+ * Define the IRQs for the system. We could use something like a platform
+ * device but that seems fairly heavyweight for this. Also, the performance
+ * counters can't be removed or hotplugged.
+ *
+ * Ordering is important: init_pmu() will use the ordering to set the affinity
+ * to the corresponding core. e.g. the first interrupt will go to cpu 0, the
+ * second goes to cpu 1 etc.
+ */
+static const struct pmu_irqs pmu_irqs = {
+#ifdef CONFIG_ARCH_PC3XX
+	.irqs	    = { IRQ_NPMUIRQ },
+	.num_irqs   = 1,
+#elif defined(CONFIG_ARCH_OMAP2)
+	.irqs	    = { 3 },
+	.num_irqs   = 1,
+#elif defined(CONFIG_ARCH_BCMRING)
+	.irqs	    = { IRQ_PMUIRQ },
+	.num_irqs   = 1,
+#elif defined(CONFIG_MACH_REALVIEW_EB)
+	.irqs	    = {
+		[0]	= IRQ_EB11MP_PMU_CPU0,
+		[1]	= IRQ_EB11MP_PMU_CPU1,
+		[2]	= IRQ_EB11MP_PMU_CPU2,
+		[3]	= IRQ_EB11MP_PMU_CPU3
+	},
+	.num_irqs   = 4,
+#elif defined(CONFIG_ARCH_OMAP3)
+	.irqs	    = { INT_34XX_BENCH_MPU_EMUL },
+	.num_irqs   = 1,
+#elif defined(CONFIG_ARCH_IOP32X)
+	.irqs	    = { IRQ_IOP32X_CORE_PMU },
+	.num_irqs   = 1,
+#elif defined(CONFIG_ARCH_IOP33X)
+	.irqs	    = { IRQ_IOP33X_CORE_PMU },
+	.num_irqs   = 1,
+#elif defined(CONFIG_ARCH_PXA)
+	.irqs	    = { IRQ_PMU },
+	.num_irqs   = 1,
+#endif
+};
+
+static DECLARE_MUTEX(pmu_mutex);
+
+const struct pmu_irqs *
+reserve_pmu(void)
+{
+	int ret = down_trylock(&pmu_mutex) ? -EBUSY : 0;
+
+	return ret ? ERR_PTR(ret) : &pmu_irqs;
+}
+EXPORT_SYMBOL_GPL(reserve_pmu);
+
+void
+release_pmu(const struct pmu_irqs *irqs)
+{
+	WARN_ON(irqs != &pmu_irqs);
+	up(&pmu_mutex);
+}
+EXPORT_SYMBOL_GPL(release_pmu);
+
+static void
+set_irq_affinity(int irq,
+		 unsigned int cpu)
+{
+#ifdef CONFIG_SMP
+	struct irq_desc *desc = irq_desc + irq;
+	const struct cpumask *mask = cpumask_of(cpu);
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&desc->lock, flags);
+	cpumask_copy(desc->affinity, mask);
+	desc->chip->set_affinity(irq, mask);
+	raw_spin_unlock_irqrestore(&desc->lock, flags);
+#endif
+}
+
+int
+init_pmu(void)
+{
+	int i;
+
+	for (i = 0; i < pmu_irqs.num_irqs; ++i)
+		set_irq_affinity(pmu_irqs.irqs[i], i);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(init_pmu);
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
index dd4698c..fc5c05b 100644
--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -342,6 +342,7 @@ config CPU_XSCALE
 	select CPU_PABRT_LEGACY
 	select CPU_CACHE_VIVT
 	select CPU_CP15_MMU
+	select CPU_HAS_PMU
 	select CPU_TLB_V4WBI if MMU
 
 # XScale Core Version 3
@@ -398,6 +399,7 @@ config CPU_V6
 	select CPU_HAS_ASID if MMU
 	select CPU_COPY_V6 if MMU
 	select CPU_TLB_V6 if MMU
+	select CPU_HAS_PMU
 
 # ARMv6k
 config CPU_32v6K
@@ -421,6 +423,7 @@ config CPU_V7
 	select CPU_CACHE_V7
 	select CPU_CACHE_VIPT
 	select CPU_CP15_MMU
+	select CPU_HAS_PMU
 	select CPU_HAS_ASID if MMU
 	select CPU_COPY_V6 if MMU
 	select CPU_TLB_V7 if MMU
@@ -536,6 +539,9 @@ config CPU_COPY_FA
 config CPU_COPY_V6
 	bool
 
+config CPU_HAS_PMU
+	bool
+
 # This selects the TLB model
 config CPU_TLB_V3
 	bool
-- 
1.6.5.4

^ permalink raw reply related	[flat|nested] 55+ messages in thread

* [PATCH 1/5] arm: provide a mechanism to reserve performance counters
  2009-12-14 14:04 ` [PATCH 1/5] arm: provide a mechanism to reserve performance counters Jamie Iles
@ 2009-12-14 14:39   ` Will Deacon
  2009-12-14 15:03     ` Jamie Iles
  2009-12-14 16:01   ` Jean Pihet
  1 sibling, 1 reply; 55+ messages in thread
From: Will Deacon @ 2009-12-14 14:39 UTC (permalink / raw)
  To: linux-arm-kernel

* Jamie Iles wrote:

> To add support for perf events and to allow the hardware
> counters to be shared with oprofile, we need a way to reserve
> access to the pmu (performance monitor unit).

Hi Jamie, this is looking good. It's nice to see the IRQ stuff moving
out of oprofile. Comments are inline.

> diff --git a/arch/arm/include/asm/pmu.h b/arch/arm/include/asm/pmu.h
> new file mode 100644
> index 0000000..d66a7cd
> --- /dev/null
> +++ b/arch/arm/include/asm/pmu.h
> @@ -0,0 +1,76 @@
> +/*
> + *  linux/arch/arm/include/asm/pmu.h
> + *
> + *  Copyright (C) 2009 picoChip Designs Ltd, Jamie Iles
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + */
> +
> +#ifndef __ARM_PMU_H__
> +#define __ARM_PMU_H__
> +
> +#ifdef CONFIG_CPU_HAS_PMU
> +
> +#define MAX_PMU_IRQS	    8
> +
> +struct pmu_irqs {
> +	int	    irqs[MAX_PMU_IRQS];
> +	unsigned    num_irqs;
> +};

Since we're populating this struct at compile time anyway, could we make it
an array and use the ARRAY_SIZE macro to get the number of irqs? This would
also mean that MAX_PMU_IRQS could be removed.

> diff --git a/arch/arm/kernel/pmu.c b/arch/arm/kernel/pmu.c
> new file mode 100644
> index 0000000..881e526
> --- /dev/null
> +++ b/arch/arm/kernel/pmu.c
<snip>
> +void
> +release_pmu(const struct pmu_irqs *irqs)
> +{
> +	WARN_ON(irqs != &pmu_irqs);
> +	up(&pmu_mutex);
> +}
> +EXPORT_SYMBOL_GPL(release_pmu);

I think it would be better to allow release to fail and do so if the irqs
don't match, otherwise a malicious oprofile module could release on behalf of
perf :).

> +static void
> +set_irq_affinity(int irq,
> +		 unsigned int cpu)
> +{
> +#ifdef CONFIG_SMP
> +	struct irq_desc *desc = irq_desc + irq;
> +	const struct cpumask *mask = cpumask_of(cpu);
> +	unsigned long flags;
> +
> +	raw_spin_lock_irqsave(&desc->lock, flags);
> +	cpumask_copy(desc->affinity, mask);
> +	desc->chip->set_affinity(irq, mask);
> +	raw_spin_unlock_irqrestore(&desc->lock, flags);
> +#endif
> +}

Why not use irq_set_affinity(irq, cpumask_of(cpu))?
This function isn't exported, but I don't envisage building the pmu
as a module.

> diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
> index dd4698c..fc5c05b 100644
> --- a/arch/arm/mm/Kconfig
> +++ b/arch/arm/mm/Kconfig
> @@ -342,6 +342,7 @@ config CPU_XSCALE
>  	select CPU_PABRT_LEGACY
>  	select CPU_CACHE_VIVT
>  	select CPU_CP15_MMU
> +	select CPU_HAS_PMU
>  	select CPU_TLB_V4WBI if MMU
> 
>  # XScale Core Version 3
> @@ -398,6 +399,7 @@ config CPU_V6
>  	select CPU_HAS_ASID if MMU
>  	select CPU_COPY_V6 if MMU
>  	select CPU_TLB_V6 if MMU
> +	select CPU_HAS_PMU
> 
>  # ARMv6k
>  config CPU_32v6K
> @@ -421,6 +423,7 @@ config CPU_V7
>  	select CPU_CACHE_V7
>  	select CPU_CACHE_VIPT
>  	select CPU_CP15_MMU
> +	select CPU_HAS_PMU
>  	select CPU_HAS_ASID if MMU
>  	select CPU_COPY_V6 if MMU
>  	select CPU_TLB_V7 if MMU
> @@ -536,6 +539,9 @@ config CPU_COPY_FA
>  config CPU_COPY_V6
>  	bool
> 
> +config CPU_HAS_PMU
> +	bool

I think all v6 cores and above have a PMU, so you could set the bool based on that
(and add the exceptional cases like xscale).

I've got a quad-core pb11mp box so once this is settled I'll give it a test in an SMP
environment.

Cheers,

Will

^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH 1/5] arm: provide a mechanism to reserve performance counters
  2009-12-14 14:39   ` Will Deacon
@ 2009-12-14 15:03     ` Jamie Iles
  0 siblings, 0 replies; 55+ messages in thread
From: Jamie Iles @ 2009-12-14 15:03 UTC (permalink / raw)
  To: linux-arm-kernel

Hi Will,

Thanks for your feedback, comments inline.

Jamie

On Mon, Dec 14, 2009 at 02:39:59PM -0000, Will Deacon wrote:
> > +#define MAX_PMU_IRQS	    8
> > +
> > +struct pmu_irqs {
> > +	int	    irqs[MAX_PMU_IRQS];
> > +	unsigned    num_irqs;
> > +};
> 
> Since we're populating this struct at compile time anyway, could we make it
> an array and use the ARRAY_SIZE macro to get the number of irqs? This would
> also mean that MAX_PMU_IRQS could be removed.
Ok, good plan.

> > diff --git a/arch/arm/kernel/pmu.c b/arch/arm/kernel/pmu.c
> > new file mode 100644
> > index 0000000..881e526
> > --- /dev/null
> > +++ b/arch/arm/kernel/pmu.c
> <snip>
> > +void
> > +release_pmu(const struct pmu_irqs *irqs)
> > +{
> > +	WARN_ON(irqs != &pmu_irqs);
> > +	up(&pmu_mutex);
> > +}
> > +EXPORT_SYMBOL_GPL(release_pmu);
> 
> I think it would be better to allow release to fail and do so if the irqs
> don't match, otherwise a malicious oprofile module could release on behalf of
> perf :).
Ok, that sounds reasonable. I'll make release_pmu() return an int, but I doubt
that it's recoverable by any of the users!
> 
> > +static void
> > +set_irq_affinity(int irq,
> > +		 unsigned int cpu)
> > +{
> > +#ifdef CONFIG_SMP
> > +	struct irq_desc *desc = irq_desc + irq;
> > +	const struct cpumask *mask = cpumask_of(cpu);
> > +	unsigned long flags;
> > +
> > +	raw_spin_lock_irqsave(&desc->lock, flags);
> > +	cpumask_copy(desc->affinity, mask);
> > +	desc->chip->set_affinity(irq, mask);
> > +	raw_spin_unlock_irqrestore(&desc->lock, flags);
> > +#endif
> > +}
> 
> Why not use irq_set_affinity(irq, cpumask_of(cpu))?
> This function isn't exported, but I don't envisage building the pmu
> as a module.
Because I moved the code from oprofile ;-) irq_set_affinity() looks like a
better option so I'll use that for the next revision.

> I think all v6 cores and above have a PMU, so you could set the bool based
> on that (and add the exceptional cases like xscale).
Ok, I wasn't sure if that was the case but if so then that's a sensible
change.

^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH 1/5] arm: provide a mechanism to reserve performance counters
  2009-12-14 14:04 ` [PATCH 1/5] arm: provide a mechanism to reserve performance counters Jamie Iles
  2009-12-14 14:39   ` Will Deacon
@ 2009-12-14 16:01   ` Jean Pihet
  1 sibling, 0 replies; 55+ messages in thread
From: Jean Pihet @ 2009-12-14 16:01 UTC (permalink / raw)
  To: linux-arm-kernel

Hi,

I am OK with this code. It is good to have such a reservation mechanism.

Regards,
Jean

On Mon, 2009-12-14 at 14:04 +0000, Jamie Iles wrote:
> To add support for perf events and to allow the hardware
> counters to be shared with oprofile, we need a way to reserve
> access to the pmu (performance monitor unit).
> 
> Cc: Will Deacon <will.deacon@arm.com>
> Signed-off-by: Jamie Iles <jamie.iles@picochip.com>
> ---
>  arch/arm/include/asm/pmu.h |   76 +++++++++++++++++++++++++++++++
>  arch/arm/kernel/Makefile   |    1 +
>  arch/arm/kernel/pmu.c      |  108 ++++++++++++++++++++++++++++++++++++++++++++
>  arch/arm/mm/Kconfig        |    6 +++
>  4 files changed, 191 insertions(+), 0 deletions(-)
>  create mode 100644 arch/arm/include/asm/pmu.h
>  create mode 100644 arch/arm/kernel/pmu.c
> 
> diff --git a/arch/arm/include/asm/pmu.h b/arch/arm/include/asm/pmu.h
> new file mode 100644
> index 0000000..d66a7cd
> --- /dev/null
> +++ b/arch/arm/include/asm/pmu.h
> @@ -0,0 +1,76 @@
> +/*
> + *  linux/arch/arm/include/asm/pmu.h
> + *
> + *  Copyright (C) 2009 picoChip Designs Ltd, Jamie Iles
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + */
> +
> +#ifndef __ARM_PMU_H__
> +#define __ARM_PMU_H__
> +
> +#ifdef CONFIG_CPU_HAS_PMU
> +
> +#define MAX_PMU_IRQS	    8
> +
> +struct pmu_irqs {
> +	int	    irqs[MAX_PMU_IRQS];
> +	unsigned    num_irqs;
> +};
> +
> +/**
> + * reserve_pmu() - reserve the hardware performance counters
> + *
> + * Reserve the hardware performance counters in the system for exclusive use.
> + * The 'struct pmu_irqs' for the system is returned on success, ERR_PTR()
> + * encoded error on failure.
> + */
> +extern const struct pmu_irqs *
> +reserve_pmu(void);
> +
> +/**
> + * release_pmu() - Relinquish control of the performance counters
> + *
> + * Release the performance counters and allow someone else to use them.
> + * Callers must have disabled the counters and released IRQs before calling
> + * this. The 'struct pmu_irqs' returned from reserve_pmu() must be passed as
> + * a cookie.
> + */
> +extern void
> +release_pmu(const struct pmu_irqs *irqs);
> +
> +/**
> + * init_pmu() - Initialise the PMU.
> + *
> + * Initialise the system ready for PMU enabling. This should typically set the
> + * IRQ affinity and nothing else. The users (oprofile/perf events etc) will do
> + * the actual hardware initialisation.
> + */
> +extern int
> +init_pmu(void);
> +
> +#else /* CONFIG_CPU_HAS_PMU */
> +
> +static inline const struct pmu_irqs *
> +reserve_pmu(void)
> +{
> +	ERR_PTR(-ENODEV);
> +}
> +
> +static inline void
> +release_pmu(const struct pmu_irqs *irqs)
> +{
> +}
> +
> +static inline int
> +init_pmu(void)
> +{
> +	return -ENODEV;
> +}
> +
> +#endif /* CONFIG_CPU_HAS_PMU */
> +
> +#endif /* __ARM_PMU_H__ */
> diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
> index e7ccf7e..286a276 100644
> --- a/arch/arm/kernel/Makefile
> +++ b/arch/arm/kernel/Makefile
> @@ -46,6 +46,7 @@ obj-$(CONFIG_CPU_XSCALE)	+= xscale-cp0.o
>  obj-$(CONFIG_CPU_XSC3)		+= xscale-cp0.o
>  obj-$(CONFIG_CPU_MOHAWK)	+= xscale-cp0.o
>  obj-$(CONFIG_IWMMXT)		+= iwmmxt.o
> +obj-$(CONFIG_CPU_HAS_PMU)	+= pmu.o
>  AFLAGS_iwmmxt.o			:= -Wa,-mcpu=iwmmxt
>  
>  ifneq ($(CONFIG_ARCH_EBSA110),y)
> diff --git a/arch/arm/kernel/pmu.c b/arch/arm/kernel/pmu.c
> new file mode 100644
> index 0000000..881e526
> --- /dev/null
> +++ b/arch/arm/kernel/pmu.c
> @@ -0,0 +1,108 @@
> +/*
> + *  linux/arch/arm/kernel/pmu.c
> + *
> + *  Copyright (C) 2009 picoChip Designs Ltd, Jamie Iles
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/module.h>
> +#include <linux/semaphore.h>
> +#include <linux/err.h>
> +#include <linux/irq.h>
> +
> +#include <asm/pmu.h>
> +#include <asm/irq.h>
> +
> +/*
> + * Define the IRQs for the system. We could use something like a platform
> + * device but that seems fairly heavyweight for this. Also, the performance
> + * counters can't be removed or hotplugged.
> + *
> + * Ordering is important: init_pmu() will use the ordering to set the affinity
> + * to the corresponding core. e.g. the first interrupt will go to cpu 0, the
> + * second goes to cpu 1 etc.
> + */
> +static const struct pmu_irqs pmu_irqs = {
> +#ifdef CONFIG_ARCH_PC3XX
> +	.irqs	    = { IRQ_NPMUIRQ },
> +	.num_irqs   = 1,
> +#elif defined(CONFIG_ARCH_OMAP2)
> +	.irqs	    = { 3 },
> +	.num_irqs   = 1,
> +#elif defined(CONFIG_ARCH_BCMRING)
> +	.irqs	    = { IRQ_PMUIRQ },
> +	.num_irqs   = 1,
> +#elif defined(CONFIG_MACH_REALVIEW_EB)
> +	.irqs	    = {
> +		[0]	= IRQ_EB11MP_PMU_CPU0,
> +		[1]	= IRQ_EB11MP_PMU_CPU1,
> +		[2]	= IRQ_EB11MP_PMU_CPU2,
> +		[3]	= IRQ_EB11MP_PMU_CPU3
> +	},
> +	.num_irqs   = 4,
> +#elif defined(CONFIG_ARCH_OMAP3)
> +	.irqs	    = { INT_34XX_BENCH_MPU_EMUL },
> +	.num_irqs   = 1,
> +#elif defined(CONFIG_ARCH_IOP32X)
> +	.irqs	    = { IRQ_IOP32X_CORE_PMU },
> +	.num_irqs   = 1,
> +#elif defined(CONFIG_ARCH_IOP33X)
> +	.irqs	    = { IRQ_IOP33X_CORE_PMU },
> +	.num_irqs   = 1,
> +#elif defined(CONFIG_ARCH_PXA)
> +	.irqs	    = { IRQ_PMU },
> +	.num_irqs   = 1,
> +#endif
> +};
> +
> +static DECLARE_MUTEX(pmu_mutex);
> +
> +const struct pmu_irqs *
> +reserve_pmu(void)
> +{
> +	int ret = down_trylock(&pmu_mutex) ? -EBUSY : 0;
> +
> +	return ret ? ERR_PTR(ret) : &pmu_irqs;
> +}
> +EXPORT_SYMBOL_GPL(reserve_pmu);
> +
> +void
> +release_pmu(const struct pmu_irqs *irqs)
> +{
> +	WARN_ON(irqs != &pmu_irqs);
> +	up(&pmu_mutex);
> +}
> +EXPORT_SYMBOL_GPL(release_pmu);
> +
> +static void
> +set_irq_affinity(int irq,
> +		 unsigned int cpu)
> +{
> +#ifdef CONFIG_SMP
> +	struct irq_desc *desc = irq_desc + irq;
> +	const struct cpumask *mask = cpumask_of(cpu);
> +	unsigned long flags;
> +
> +	raw_spin_lock_irqsave(&desc->lock, flags);
> +	cpumask_copy(desc->affinity, mask);
> +	desc->chip->set_affinity(irq, mask);
> +	raw_spin_unlock_irqrestore(&desc->lock, flags);
> +#endif
> +}
> +
> +int
> +init_pmu(void)
> +{
> +	int i;
> +
> +	for (i = 0; i < pmu_irqs.num_irqs; ++i)
> +		set_irq_affinity(pmu_irqs.irqs[i], i);
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(init_pmu);
> diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
> index dd4698c..fc5c05b 100644
> --- a/arch/arm/mm/Kconfig
> +++ b/arch/arm/mm/Kconfig
> @@ -342,6 +342,7 @@ config CPU_XSCALE
>  	select CPU_PABRT_LEGACY
>  	select CPU_CACHE_VIVT
>  	select CPU_CP15_MMU
> +	select CPU_HAS_PMU
>  	select CPU_TLB_V4WBI if MMU
>  
>  # XScale Core Version 3
> @@ -398,6 +399,7 @@ config CPU_V6
>  	select CPU_HAS_ASID if MMU
>  	select CPU_COPY_V6 if MMU
>  	select CPU_TLB_V6 if MMU
> +	select CPU_HAS_PMU
>  
>  # ARMv6k
>  config CPU_32v6K
> @@ -421,6 +423,7 @@ config CPU_V7
>  	select CPU_CACHE_V7
>  	select CPU_CACHE_VIPT
>  	select CPU_CP15_MMU
> +	select CPU_HAS_PMU
>  	select CPU_HAS_ASID if MMU
>  	select CPU_COPY_V6 if MMU
>  	select CPU_TLB_V7 if MMU
> @@ -536,6 +539,9 @@ config CPU_COPY_FA
>  config CPU_COPY_V6
>  	bool
>  
> +config CPU_HAS_PMU
> +	bool
> +
>  # This selects the TLB model
>  config CPU_TLB_V3
>  	bool

^ permalink raw reply	[flat|nested] 55+ messages in thread

* ARMv6 performance counters v3
@ 2009-12-15 11:15 Jamie Iles
  2009-12-15 11:15 ` [PATCH 1/5] arm: provide a mechanism to reserve performance counters Jamie Iles
  0 siblings, 1 reply; 55+ messages in thread
From: Jamie Iles @ 2009-12-15 11:15 UTC (permalink / raw)
  To: linux-arm-kernel

Okay, here's the 3rd attempt at support for hardware perf events support
on ARMv6. After feedback from Jean and Will, I've added an arm_pmu
structure that allows other CPU versions to be supported in the future.
At the moment only ARMv6 is supported in hardware.

Thanks to Will and Jean for their comments.

^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH 1/5] arm: provide a mechanism to reserve performance counters
  2009-12-15 11:15 ARMv6 performance counters v3 Jamie Iles
@ 2009-12-15 11:15 ` Jamie Iles
  2009-12-15 11:15   ` [PATCH 2/5] arm/oprofile: reserve the PMU when starting Jamie Iles
                     ` (2 more replies)
  0 siblings, 3 replies; 55+ messages in thread
From: Jamie Iles @ 2009-12-15 11:15 UTC (permalink / raw)
  To: linux-arm-kernel

To add support for perf events and to allow the hardware
counters to be shared with oprofile, we need a way to reserve
access to the pmu (performance monitor unit).

Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Jamie Iles <jamie.iles@picochip.com>
---
 arch/arm/include/asm/pmu.h |   74 ++++++++++++++++++++++++++++++
 arch/arm/kernel/Makefile   |    1 +
 arch/arm/kernel/pmu.c      |  108 ++++++++++++++++++++++++++++++++++++++++++++
 arch/arm/mm/Kconfig        |    5 ++
 4 files changed, 188 insertions(+), 0 deletions(-)
 create mode 100644 arch/arm/include/asm/pmu.h
 create mode 100644 arch/arm/kernel/pmu.c

diff --git a/arch/arm/include/asm/pmu.h b/arch/arm/include/asm/pmu.h
new file mode 100644
index 0000000..e7cc264
--- /dev/null
+++ b/arch/arm/include/asm/pmu.h
@@ -0,0 +1,74 @@
+/*
+ *  linux/arch/arm/include/asm/pmu.h
+ *
+ *  Copyright (C) 2009 picoChip Designs Ltd, Jamie Iles
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#ifndef __ARM_PMU_H__
+#define __ARM_PMU_H__
+
+#ifdef CONFIG_CPU_HAS_PMU
+
+struct pmu_irqs {
+	const int   *irqs;
+	unsigned    num_irqs;
+};
+
+/**
+ * reserve_pmu() - reserve the hardware performance counters
+ *
+ * Reserve the hardware performance counters in the system for exclusive use.
+ * The 'struct pmu_irqs' for the system is returned on success, ERR_PTR()
+ * encoded error on failure.
+ */
+extern const struct pmu_irqs *
+reserve_pmu(void);
+
+/**
+ * release_pmu() - Relinquish control of the performance counters
+ *
+ * Release the performance counters and allow someone else to use them.
+ * Callers must have disabled the counters and released IRQs before calling
+ * this. The 'struct pmu_irqs' returned from reserve_pmu() must be passed as
+ * a cookie.
+ */
+extern int
+release_pmu(const struct pmu_irqs *irqs);
+
+/**
+ * init_pmu() - Initialise the PMU.
+ *
+ * Initialise the system ready for PMU enabling. This should typically set the
+ * IRQ affinity and nothing else. The users (oprofile/perf events etc) will do
+ * the actual hardware initialisation.
+ */
+extern int
+init_pmu(void);
+
+#else /* CONFIG_CPU_HAS_PMU */
+
+static inline const struct pmu_irqs *
+reserve_pmu(void)
+{
+	ERR_PTR(-ENODEV);
+}
+
+static inline int
+release_pmu(const struct pmu_irqs *irqs)
+{
+}
+
+static inline int
+init_pmu(void)
+{
+	return -ENODEV;
+}
+
+#endif /* CONFIG_CPU_HAS_PMU */
+
+#endif /* __ARM_PMU_H__ */
diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
index dd00f74..216890d 100644
--- a/arch/arm/kernel/Makefile
+++ b/arch/arm/kernel/Makefile
@@ -46,6 +46,7 @@ obj-$(CONFIG_CPU_XSCALE)	+= xscale-cp0.o
 obj-$(CONFIG_CPU_XSC3)		+= xscale-cp0.o
 obj-$(CONFIG_CPU_MOHAWK)	+= xscale-cp0.o
 obj-$(CONFIG_IWMMXT)		+= iwmmxt.o
+obj-$(CONFIG_CPU_HAS_PMU)	+= pmu.o
 AFLAGS_iwmmxt.o			:= -Wa,-mcpu=iwmmxt
 
 ifneq ($(CONFIG_ARCH_EBSA110),y)
diff --git a/arch/arm/kernel/pmu.c b/arch/arm/kernel/pmu.c
new file mode 100644
index 0000000..3a178bb
--- /dev/null
+++ b/arch/arm/kernel/pmu.c
@@ -0,0 +1,108 @@
+/*
+ *  linux/arch/arm/kernel/pmu.c
+ *
+ *  Copyright (C) 2009 picoChip Designs Ltd, Jamie Iles
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/cpumask.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/semaphore.h>
+
+#include <asm/pmu.h>
+
+/*
+ * Define the IRQs for the system. We could use something like a platform
+ * device but that seems fairly heavyweight for this. Also, the performance
+ * counters can't be removed or hotplugged.
+ *
+ * Ordering is important: init_pmu() will use the ordering to set the affinity
+ * to the corresponding core. e.g. the first interrupt will go to cpu 0, the
+ * second goes to cpu 1 etc.
+ */
+static const int irqs[] = {
+#ifdef CONFIG_ARCH_PC3XX
+	IRQ_NPMUIRQ,
+#elif defined(CONFIG_ARCH_OMAP2)
+	3,
+#elif defined(CONFIG_ARCH_BCMRING)
+	IRQ_PMUIRQ,
+#elif defined(CONFIG_MACH_REALVIEW_EB)
+	IRQ_EB11MP_PMU_CPU0,
+	IRQ_EB11MP_PMU_CPU1,
+	IRQ_EB11MP_PMU_CPU2,
+	IRQ_EB11MP_PMU_CPU3,
+#elif defined(CONFIG_ARCH_OMAP3)
+	INT_34XX_BENCH_MPU_EMUL,
+#elif defined(CONFIG_ARCH_IOP32X)
+	IRQ_IOP32X_CORE_PMU,
+#elif defined(CONFIG_ARCH_IOP33X)
+	IRQ_IOP33X_CORE_PMU,
+#elif defined(CONFIG_ARCH_PXA)
+	IRQ_PMU,
+#endif
+};
+
+static const struct pmu_irqs pmu_irqs = {
+	.irqs	    = irqs,
+	.num_irqs   = ARRAY_SIZE(irqs),
+};
+
+static DECLARE_MUTEX(pmu_mutex);
+
+const struct pmu_irqs *
+reserve_pmu(void)
+{
+	int ret = down_trylock(&pmu_mutex) ? -EBUSY : 0;
+
+	return ret ? ERR_PTR(ret) : &pmu_irqs;
+}
+EXPORT_SYMBOL_GPL(reserve_pmu);
+
+int
+release_pmu(const struct pmu_irqs *irqs)
+{
+	if (WARN_ON(irqs != &pmu_irqs))
+		return -EINVAL;
+	up(&pmu_mutex);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(release_pmu);
+
+static int
+set_irq_affinity(int irq,
+		 unsigned int cpu)
+{
+#ifdef CONFIG_SMP
+	int err = irq_set_affinity(irq, cpumask_of(cpu));
+	if (err)
+		pr_warning("unable to set irq affinity (irq=%d, cpu=%u)\n",
+			   irq, cpu);
+	return err;
+#else
+	return 0;
+#endif
+}
+
+int
+init_pmu(void)
+{
+	int i;
+	int err = 0;
+
+	for (i = 0; i < pmu_irqs.num_irqs; ++i) {
+		err = set_irq_affinity(pmu_irqs.irqs[i], i);
+		if (err)
+			break;
+	}
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(init_pmu);
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
index dd4698c..5cd0ec4 100644
--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -536,6 +536,11 @@ config CPU_COPY_FA
 config CPU_COPY_V6
 	bool
 
+config CPU_HAS_PMU
+	depends on CPU_V6 || CPU_V7 || CPU_XSCALE
+	default y
+	bool
+
 # This selects the TLB model
 config CPU_TLB_V3
 	bool
-- 
1.6.5.4

^ permalink raw reply related	[flat|nested] 55+ messages in thread

* [PATCH 2/5] arm/oprofile: reserve the PMU when starting
  2009-12-15 11:15 ` [PATCH 1/5] arm: provide a mechanism to reserve performance counters Jamie Iles
@ 2009-12-15 11:15   ` Jamie Iles
  2009-12-15 11:15     ` [PATCH 3/5] arm: use the spinlocked, generic atomic64 support Jamie Iles
  2009-12-15 14:13   ` [PATCH 1/5] arm: provide a mechanism to reserve performance counters Will Deacon
  2009-12-17 16:14   ` Will Deacon
  2 siblings, 1 reply; 55+ messages in thread
From: Jamie Iles @ 2009-12-15 11:15 UTC (permalink / raw)
  To: linux-arm-kernel

Make sure that we have access to the performance counters and
that they aren't being used by perf events or anything else.

Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Jamie Iles <jamie.iles@picochip.com>
---
 arch/arm/oprofile/op_model_arm11_core.c |    4 +-
 arch/arm/oprofile/op_model_arm11_core.h |    4 +-
 arch/arm/oprofile/op_model_mpcore.c     |   42 ++++++++++++++++--------------
 arch/arm/oprofile/op_model_v6.c         |   30 ++++++++++++++--------
 arch/arm/oprofile/op_model_v7.c         |   30 ++++++++++++++--------
 arch/arm/oprofile/op_model_v7.h         |    4 +-
 arch/arm/oprofile/op_model_xscale.c     |   35 ++++++++++++++-----------
 7 files changed, 85 insertions(+), 64 deletions(-)

diff --git a/arch/arm/oprofile/op_model_arm11_core.c b/arch/arm/oprofile/op_model_arm11_core.c
index ad80752..ef3e265 100644
--- a/arch/arm/oprofile/op_model_arm11_core.c
+++ b/arch/arm/oprofile/op_model_arm11_core.c
@@ -132,7 +132,7 @@ static irqreturn_t arm11_pmu_interrupt(int irq, void *arg)
 	return IRQ_HANDLED;
 }
 
-int arm11_request_interrupts(int *irqs, int nr)
+int arm11_request_interrupts(const int *irqs, int nr)
 {
 	unsigned int i;
 	int ret = 0;
@@ -153,7 +153,7 @@ int arm11_request_interrupts(int *irqs, int nr)
 	return ret;
 }
 
-void arm11_release_interrupts(int *irqs, int nr)
+void arm11_release_interrupts(const int *irqs, int nr)
 {
 	unsigned int i;
 
diff --git a/arch/arm/oprofile/op_model_arm11_core.h b/arch/arm/oprofile/op_model_arm11_core.h
index 6f8538e..1902b99 100644
--- a/arch/arm/oprofile/op_model_arm11_core.h
+++ b/arch/arm/oprofile/op_model_arm11_core.h
@@ -39,7 +39,7 @@
 int arm11_setup_pmu(void);
 int arm11_start_pmu(void);
 int arm11_stop_pmu(void);
-int arm11_request_interrupts(int *, int);
-void arm11_release_interrupts(int *, int);
+int arm11_request_interrupts(const int *, int);
+void arm11_release_interrupts(const int *, int);
 
 #endif
diff --git a/arch/arm/oprofile/op_model_mpcore.c b/arch/arm/oprofile/op_model_mpcore.c
index 4ce0f98..f73ce87 100644
--- a/arch/arm/oprofile/op_model_mpcore.c
+++ b/arch/arm/oprofile/op_model_mpcore.c
@@ -32,6 +32,7 @@
 /* #define DEBUG */
 #include <linux/types.h>
 #include <linux/errno.h>
+#include <linux/err.h>
 #include <linux/sched.h>
 #include <linux/oprofile.h>
 #include <linux/interrupt.h>
@@ -43,6 +44,7 @@
 #include <mach/hardware.h>
 #include <mach/board-eb.h>
 #include <asm/system.h>
+#include <asm/pmu.h>
 
 #include "op_counter.h"
 #include "op_arm_model.h"
@@ -58,6 +60,7 @@
  * Bitmask of used SCU counters
  */
 static unsigned int scu_em_used;
+static const struct pmu_irqs *pmu_irqs;
 
 /*
  * 2 helper fns take a counter number from 0-7 (not the userspace-visible counter number)
@@ -225,33 +228,40 @@ static int em_setup_ctrs(void)
 	return 0;
 }
 
-static int arm11_irqs[] = {
-	[0]	= IRQ_EB11MP_PMU_CPU0,
-	[1]	= IRQ_EB11MP_PMU_CPU1,
-	[2]	= IRQ_EB11MP_PMU_CPU2,
-	[3]	= IRQ_EB11MP_PMU_CPU3
-};
-
 static int em_start(void)
 {
 	int ret;
 
-	ret = arm11_request_interrupts(arm11_irqs, ARRAY_SIZE(arm11_irqs));
+	pmu_irqs = reserve_pmu();
+	if (IS_ERR(pmu_irqs)) {
+		ret = PTR_ERR(pmu_irqs);
+		goto out;
+	}
+
+	ret = arm11_request_interrupts(pmu_irqs->irqs, pmu_irqs->num_irqs);
 	if (ret == 0) {
 		em_call_function(arm11_start_pmu);
 
 		ret = scu_start();
-		if (ret)
-			arm11_release_interrupts(arm11_irqs, ARRAY_SIZE(arm11_irqs));
+		if (ret) {
+			arm11_release_interrupts(pmu_irqs->irqs,
+						 pmu_irqs->num_irqs);
+		} else {
+			release_pmu(pmu_irqs);
+			pmu_irqs = NULL;
+		}
 	}
+
+out:
 	return ret;
 }
 
 static void em_stop(void)
 {
 	em_call_function(arm11_stop_pmu);
-	arm11_release_interrupts(arm11_irqs, ARRAY_SIZE(arm11_irqs));
+	arm11_release_interrupts(pmu_irqs->irqs, pmu_irqs->num_irqs);
 	scu_stop();
+	release_pmu(pmu_irqs);
 }
 
 /*
@@ -283,15 +293,7 @@ static int em_setup(void)
 	em_route_irq(IRQ_EB11MP_PMU_SCU6, 3);
 	em_route_irq(IRQ_EB11MP_PMU_SCU7, 3);
 
-	/*
-	 * Send CP15 PMU interrupts to the owner CPU.
-	 */
-	em_route_irq(IRQ_EB11MP_PMU_CPU0, 0);
-	em_route_irq(IRQ_EB11MP_PMU_CPU1, 1);
-	em_route_irq(IRQ_EB11MP_PMU_CPU2, 2);
-	em_route_irq(IRQ_EB11MP_PMU_CPU3, 3);
-
-	return 0;
+	return init_pmu();
 }
 
 struct op_arm_model_spec op_mpcore_spec = {
diff --git a/arch/arm/oprofile/op_model_v6.c b/arch/arm/oprofile/op_model_v6.c
index f7d2ec5..a22357a 100644
--- a/arch/arm/oprofile/op_model_v6.c
+++ b/arch/arm/oprofile/op_model_v6.c
@@ -19,39 +19,47 @@
 /* #define DEBUG */
 #include <linux/types.h>
 #include <linux/errno.h>
+#include <linux/err.h>
 #include <linux/sched.h>
 #include <linux/oprofile.h>
 #include <linux/interrupt.h>
 #include <asm/irq.h>
 #include <asm/system.h>
+#include <asm/pmu.h>
 
 #include "op_counter.h"
 #include "op_arm_model.h"
 #include "op_model_arm11_core.h"
 
-static int irqs[] = {
-#ifdef CONFIG_ARCH_OMAP2
-	3,
-#endif
-#ifdef CONFIG_ARCH_BCMRING
-	IRQ_PMUIRQ, /* for BCMRING, ARM PMU interrupt is 43 */
-#endif
-};
+static const struct pmu_irqs *pmu_irqs;
 
 static void armv6_pmu_stop(void)
 {
 	arm11_stop_pmu();
-	arm11_release_interrupts(irqs, ARRAY_SIZE(irqs));
+	arm11_release_interrupts(pmu_irqs->irqs, pmu_irqs->num_irqs);
+	release_pmu(pmu_irqs);
+	pmu_irqs = NULL;
 }
 
 static int armv6_pmu_start(void)
 {
 	int ret;
 
-	ret = arm11_request_interrupts(irqs, ARRAY_SIZE(irqs));
-	if (ret >= 0)
+	pmu_irqs = reserve_pmu();
+	if (IS_ERR(pmu_irqs)) {
+		ret = PTR_ERR(pmu_irqs);
+		goto out;
+	}
+
+	ret = arm11_request_interrupts(pmu_irqs->irqs, pmu_irqs->num_irqs);
+	if (ret >= 0) {
 		ret = arm11_start_pmu();
+	} else {
+		release_pmu(pmu_irqs);
+		pmu_irqs = NULL;
+	}
 
+out:
 	return ret;
 }
 
diff --git a/arch/arm/oprofile/op_model_v7.c b/arch/arm/oprofile/op_model_v7.c
index f20295f..9258fca 100644
--- a/arch/arm/oprofile/op_model_v7.c
+++ b/arch/arm/oprofile/op_model_v7.c
@@ -11,11 +11,14 @@
  */
 #include <linux/types.h>
 #include <linux/errno.h>
+#include <linux/err.h>
 #include <linux/oprofile.h>
 #include <linux/interrupt.h>
 #include <linux/irq.h>
 #include <linux/smp.h>
 
+#include <asm/pmu.h>
+
 #include "op_counter.h"
 #include "op_arm_model.h"
 #include "op_model_v7.h"
@@ -299,7 +302,7 @@ static irqreturn_t armv7_pmnc_interrupt(int irq, void *arg)
 	return IRQ_HANDLED;
 }
 
-int armv7_request_interrupts(int *irqs, int nr)
+int armv7_request_interrupts(const int *irqs, int nr)
 {
 	unsigned int i;
 	int ret = 0;
@@ -322,7 +325,7 @@ int armv7_request_interrupts(int *irqs, int nr)
 	return ret;
 }
 
-void armv7_release_interrupts(int *irqs, int nr)
+void armv7_release_interrupts(const int *irqs, int nr)
 {
 	unsigned int i;
 
@@ -366,12 +369,7 @@ static void armv7_pmnc_dump_regs(void)
 }
 #endif
 
-
-static int irqs[] = {
-#ifdef CONFIG_ARCH_OMAP3
-	INT_34XX_BENCH_MPU_EMUL,
-#endif
-};
+static const struct pmu_irqs *pmu_irqs;
 
 static void armv7_pmnc_stop(void)
 {
@@ -379,19 +377,29 @@ static void armv7_pmnc_stop(void)
 	armv7_pmnc_dump_regs();
 #endif
 	armv7_stop_pmnc();
-	armv7_release_interrupts(irqs, ARRAY_SIZE(irqs));
+	armv7_release_interrupts(pmu_irqs->irqs, pmu_irqs->num_irqs);
+	release_pmu(pmu_irqs);
+	pmu_irqs = NULL;
 }
 
 static int armv7_pmnc_start(void)
 {
 	int ret;
 
+	pmu_irqs = reserve_pmu();
+	if (IS_ERR(pmu_irqs))
+		return PTR_ERR(pmu_irqs);
+
 #ifdef DEBUG
 	armv7_pmnc_dump_regs();
 #endif
-	ret = armv7_request_interrupts(irqs, ARRAY_SIZE(irqs));
-	if (ret >= 0)
+	ret = armv7_request_interrupts(pmu_irqs->irqs, pmu_irqs->num_irqs);
+	if (ret >= 0) {
 		armv7_start_pmnc();
+	} else {
+		release_pmu(pmu_irqs);
+		pmu_irqs = NULL;
+	}
 
 	return ret;
 }
diff --git a/arch/arm/oprofile/op_model_v7.h b/arch/arm/oprofile/op_model_v7.h
index 0e19bcc..9ca334b 100644
--- a/arch/arm/oprofile/op_model_v7.h
+++ b/arch/arm/oprofile/op_model_v7.h
@@ -97,7 +97,7 @@
 int armv7_setup_pmu(void);
 int armv7_start_pmu(void);
 int armv7_stop_pmu(void);
-int armv7_request_interrupts(int *, int);
-void armv7_release_interrupts(int *, int);
+int armv7_request_interrupts(const int *, int);
+void armv7_release_interrupts(const int *, int);
 
 #endif
diff --git a/arch/arm/oprofile/op_model_xscale.c b/arch/arm/oprofile/op_model_xscale.c
index 724ab9c..1d34a02 100644
--- a/arch/arm/oprofile/op_model_xscale.c
+++ b/arch/arm/oprofile/op_model_xscale.c
@@ -17,12 +17,14 @@
 /* #define DEBUG */
 #include <linux/types.h>
 #include <linux/errno.h>
+#include <linux/err.h>
 #include <linux/sched.h>
 #include <linux/oprofile.h>
 #include <linux/interrupt.h>
 #include <linux/irq.h>
 
 #include <asm/cputype.h>
+#include <asm/pmu.h>
 
 #include "op_counter.h"
 #include "op_arm_model.h"
@@ -33,17 +35,6 @@
 #define	PMU_RESET	(CCNT_RESET | PMN_RESET)
 #define PMU_CNT64	0x008	/* Make CCNT count every 64th cycle */
 
-/* TODO do runtime detection */
-#ifdef CONFIG_ARCH_IOP32X
-#define XSCALE_PMU_IRQ  IRQ_IOP32X_CORE_PMU
-#endif
-#ifdef CONFIG_ARCH_IOP33X
-#define XSCALE_PMU_IRQ  IRQ_IOP33X_CORE_PMU
-#endif
-#ifdef CONFIG_ARCH_PXA
-#define XSCALE_PMU_IRQ  IRQ_PMU
-#endif
-
 /*
  * Different types of events that can be counted by the XScale PMU
  * as used by Oprofile userspace. Here primarily for documentation
@@ -367,6 +358,8 @@ static irqreturn_t xscale_pmu_interrupt(int irq, void *arg)
 	return IRQ_HANDLED;
 }
 
+static const struct pmu_irqs *pmu_irqs;
+
 static void xscale_pmu_stop(void)
 {
 	u32 pmnc = read_pmnc();
@@ -374,20 +367,30 @@ static void xscale_pmu_stop(void)
 	pmnc &= ~PMU_ENABLE;
 	write_pmnc(pmnc);
 
-	free_irq(XSCALE_PMU_IRQ, results);
+	free_irq(pmu_irqs->irqs[0], results);
+	release_pmu(pmu_irqs);
+	pmu_irqs = NULL;
 }
 
 static int xscale_pmu_start(void)
 {
 	int ret;
-	u32 pmnc = read_pmnc();
+	u32 pmnc;
+
+	pmu_irqs = reserve_pmu();
+	if (IS_ERR(pmu_irqs))
+		return PTR_ERR(pmu_irqs);
+
+	pmnc = read_pmnc();
 
-	ret = request_irq(XSCALE_PMU_IRQ, xscale_pmu_interrupt, IRQF_DISABLED,
-			"XScale PMU", (void *)results);
+	ret = request_irq(pmu_irqs->irqs[0], xscale_pmu_interrupt,
+			  IRQF_DISABLED, "XScale PMU", (void *)results);
 
 	if (ret < 0) {
 		printk(KERN_ERR "oprofile: unable to request IRQ%d for XScale PMU\n",
-			XSCALE_PMU_IRQ);
+		       pmu_irqs->irqs[0]);
+		release_pmu(pmu_irqs);
+		pmu_irqs = NULL;
 		return ret;
 	}
 
-- 
1.6.5.4

^ permalink raw reply related	[flat|nested] 55+ messages in thread

* [PATCH 3/5] arm: use the spinlocked, generic atomic64 support
  2009-12-15 11:15   ` [PATCH 2/5] arm/oprofile: reserve the PMU when starting Jamie Iles
@ 2009-12-15 11:15     ` Jamie Iles
  2009-12-15 11:15       ` [PATCH 4/5] arm: enable support for software perf events Jamie Iles
  0 siblings, 1 reply; 55+ messages in thread
From: Jamie Iles @ 2009-12-15 11:15 UTC (permalink / raw)
  To: linux-arm-kernel

perf events require that we can support atomic64's. There is a generic,
spinlocked version that we can use until we have proper hardware
support.

Signed-off-by: Jamie Iles <jamie.iles@picochip.com>
---
 arch/arm/Kconfig              |    1 +
 arch/arm/include/asm/atomic.h |    4 ++++
 2 files changed, 5 insertions(+), 0 deletions(-)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 233a222..9580418 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -18,6 +18,7 @@ config ARM
 	select HAVE_KRETPROBES if (HAVE_KPROBES)
 	select HAVE_FUNCTION_TRACER if (!XIP_KERNEL)
 	select HAVE_GENERIC_DMA_COHERENT
+	select GENERIC_ATOMIC64
 	help
 	  The ARM series is a line of low-power-consumption RISC chip designs
 	  licensed by ARM Ltd and targeted at embedded applications and
diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h
index d0daeab..ff286a8 100644
--- a/arch/arm/include/asm/atomic.h
+++ b/arch/arm/include/asm/atomic.h
@@ -15,6 +15,10 @@
 #include <linux/types.h>
 #include <asm/system.h>
 
+#ifdef CONFIG_GENERIC_ATOMIC64
+#include <asm-generic/atomic64.h>
+#endif
+
 #define ATOMIC_INIT(i)	{ (i) }
 
 #ifdef __KERNEL__
-- 
1.6.5.4

^ permalink raw reply related	[flat|nested] 55+ messages in thread

* [PATCH 4/5] arm: enable support for software perf events
  2009-12-15 11:15     ` [PATCH 3/5] arm: use the spinlocked, generic atomic64 support Jamie Iles
@ 2009-12-15 11:15       ` Jamie Iles
  2009-12-15 11:15         ` [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6 Jamie Iles
  0 siblings, 1 reply; 55+ messages in thread
From: Jamie Iles @ 2009-12-15 11:15 UTC (permalink / raw)
  To: linux-arm-kernel

The perf events subsystem allows counting of both hardware and
software events. This patch implements the bare minimum for software
performance events.

Signed-off-by: Jamie Iles <jamie.iles@picochip.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@elte.hu>
---
 arch/arm/Kconfig                  |    2 +
 arch/arm/include/asm/perf_event.h |   38 +++++++++++++++++++++++++++++++++++++
 arch/arm/mm/fault.c               |    7 ++++++
 3 files changed, 47 insertions(+), 0 deletions(-)
 create mode 100644 arch/arm/include/asm/perf_event.h

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 9580418..fe4ce95 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -19,6 +19,8 @@ config ARM
 	select HAVE_FUNCTION_TRACER if (!XIP_KERNEL)
 	select HAVE_GENERIC_DMA_COHERENT
 	select GENERIC_ATOMIC64
+	select HAVE_PERF_EVENTS
+	select PERF_USE_VMALLOC
 	help
 	  The ARM series is a line of low-power-consumption RISC chip designs
 	  licensed by ARM Ltd and targeted at embedded applications and
diff --git a/arch/arm/include/asm/perf_event.h b/arch/arm/include/asm/perf_event.h
new file mode 100644
index 0000000..32a66ac
--- /dev/null
+++ b/arch/arm/include/asm/perf_event.h
@@ -0,0 +1,38 @@
+/*
+ *  linux/arch/arm/include/asm/perf_event.h
+ *
+ *  Copyright (C) 2009 picoChip Designs Ltd, Jamie Iles
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#ifndef __ARM_PERF_EVENT_H__
+#define __ARM_PERF_EVENT_H__
+
+/*
+ * NOP: on *most* (read: all supported) ARM platforms, the performance
+ * counter interrupts are regular interrupts and not an NMI. This
+ * means that when we receive the interrupt we can call
+ * perf_event_do_pending() that handles all of the work with
+ * interrupts enabled.
+ */
+static inline void
+set_perf_event_pending(void)
+{
+}
+
+/* Get the PC. Make sure that we have a 64bit value with the upper 32 cleared.
+ */
+#define perf_instruction_pointer(_regs) \
+	((u64)instruction_pointer(regs) & 0xFFFFFFFFLU)
+#define perf_misc_flags(regs)   (user_mode(regs) ? PERF_RECORD_MISC_USER : \
+                                 PERF_RECORD_MISC_KERNEL)
+
+/* ARM performance counters start from 1 (in the cp15 accesses) so use the
+ * same indexes here for consistency. */
+#define PERF_EVENT_INDEX_OFFSET 1
+
+#endif /* __ARM_PERF_EVENT_H__ */
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index 10e0680..9d40c34 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -18,6 +18,7 @@
 #include <linux/page-flags.h>
 #include <linux/sched.h>
 #include <linux/highmem.h>
+#include <linux/perf_event.h>
 
 #include <asm/system.h>
 #include <asm/pgtable.h>
@@ -302,6 +303,12 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 	fault = __do_page_fault(mm, addr, fsr, tsk);
 	up_read(&mm->mmap_sem);
 
+	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, addr);
+	if (fault & VM_FAULT_MAJOR)
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, regs, addr);
+	else if (fault & VM_FAULT_MINOR)
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, regs, addr);
+
 	/*
 	 * Handle the "normal" case first - VM_FAULT_MAJOR / VM_FAULT_MINOR
 	 */
-- 
1.6.5.4

^ permalink raw reply related	[flat|nested] 55+ messages in thread

* [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6
  2009-12-15 11:15       ` [PATCH 4/5] arm: enable support for software perf events Jamie Iles
@ 2009-12-15 11:15         ` Jamie Iles
  2009-12-15 14:29           ` Will Deacon
  2009-12-18 17:05           ` Perf Event support for ARMv7 (was: Re: [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6) Jean Pihet
  0 siblings, 2 replies; 55+ messages in thread
From: Jamie Iles @ 2009-12-15 11:15 UTC (permalink / raw)
  To: linux-arm-kernel

This patch implements support for ARMv6 performance counters in the
Linux performance events subsystem. ARMv6 architectures that have the
performance counters should enable HW_PERF_EVENTS and define the
interrupts for the counters in arch/arm/kernel/perf_event.c

This implementation also provides an ARM PMU abstraction layer to allow
ARMv7 and others to be supported in the future by adding new a
'struct arm_pmu'.

Signed-off-by: Jamie Iles <jamie.iles@picochip.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@elte.hu>
---
 arch/arm/Kconfig             |    8 +
 arch/arm/kernel/Makefile     |    1 +
 arch/arm/kernel/perf_event.c | 1125 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 1134 insertions(+), 0 deletions(-)
 create mode 100644 arch/arm/kernel/perf_event.c

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index fe4ce95..ec26a1f 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1170,6 +1170,14 @@ config HIGHPTE
 	depends on HIGHMEM
 	depends on !OUTER_CACHE
 
+config HW_PERF_EVENTS
+	bool "Enable hardware performance counter support for perf events"
+	depends on PERF_EVENTS && CPU_HAS_PMU && CPU_V6
+	default y
+	help
+	  Enable hardware performance counter support for perf events. If
+	  disabled, perf events will use software events only.
+
 source "mm/Kconfig"
 
 config LEDS
diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
index 216890d..c76e6d2 100644
--- a/arch/arm/kernel/Makefile
+++ b/arch/arm/kernel/Makefile
@@ -47,6 +47,7 @@ obj-$(CONFIG_CPU_XSC3)		+= xscale-cp0.o
 obj-$(CONFIG_CPU_MOHAWK)	+= xscale-cp0.o
 obj-$(CONFIG_IWMMXT)		+= iwmmxt.o
 obj-$(CONFIG_CPU_HAS_PMU)	+= pmu.o
+obj-$(CONFIG_HW_PERF_EVENTS)	+= perf_event.o
 AFLAGS_iwmmxt.o			:= -Wa,-mcpu=iwmmxt
 
 ifneq ($(CONFIG_ARCH_EBSA110),y)
diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
new file mode 100644
index 0000000..abb5267
--- /dev/null
+++ b/arch/arm/kernel/perf_event.c
@@ -0,0 +1,1125 @@
+#undef DEBUG
+
+/*
+ * ARM performance counter support.
+ *
+ * Copyright (C) 2009 picoChip Designs, Ltd., Jamie Iles
+ *
+ * This code is based on the sparc64 perf event code, which is in turn based
+ * on the x86 code. Callchain code is based on the ARM OProfile backtrace
+ * code.
+ */
+#define pr_fmt(fmt) "hw perfevents: " fmt
+
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/perf_event.h>
+#include <linux/spinlock.h>
+#include <linux/uaccess.h>
+
+#include <asm/cputype.h>
+#include <asm/irq.h>
+#include <asm/irq_regs.h>
+#include <asm/pmu.h>
+#include <asm/stacktrace.h>
+
+static const struct pmu_irqs *pmu_irqs;
+
+/*
+ * Hardware lock to serialize accesses to PMU registers. Needed for the
+ * read/modify/write sequences.
+ */
+DEFINE_SPINLOCK(pmu_lock);
+
+/*
+ * ARMv6 supports a maximum of 3 events, starting from index 1. If we add
+ * another platform that supports more, we need to increase this to be the
+ * largest of all platforms.
+ */
+#define ARMPMU_MAX_HWEVENTS		4
+
+/* The events for a given CPU. */
+struct cpu_hw_events {
+	/*
+	 * The events that are active on the CPU for the given index. Index 0
+	 * is reserved.
+	 */
+	struct perf_event	*events[ARMPMU_MAX_HWEVENTS];
+
+	/*
+	 * A 1 bit for an index indicates that the counter is being used for
+	 * an event. A 0 means that the counter can be used.
+	 */
+	unsigned long		used_mask[BITS_TO_LONGS(ARMPMU_MAX_HWEVENTS)];
+
+	/*
+	 * A 1 bit for an index indicates that the counter is actively being
+	 * used.
+	 */
+	unsigned long		active_mask[BITS_TO_LONGS(ARMPMU_MAX_HWEVENTS)];
+};
+DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
+
+struct arm_pmu {
+	const char	*name;
+	irqreturn_t	(*handle_irq)(int irq_num, void *dev);
+	void		(*enable)(struct hw_perf_event *evt, int idx);
+	void		(*disable)(struct hw_perf_event *evt, int idx);
+	int		(*event_map)(int evt);
+	u64		(*raw_event)(u64);
+	int		(*get_event_idx)(struct cpu_hw_events *cpuc,
+					 struct hw_perf_event *hwc);
+        u32             (*read_counter)(int idx);
+        void            (*write_counter)(int idx, u32 val);
+        void            (*start)(void);
+	void		(*stop)(void);
+	int		num_events;
+        u64             max_period;
+};
+
+/* Set at runtime when we know what CPU type we are. */
+static struct arm_pmu *armpmu;
+
+#define HW_OP_UNSUPPORTED		    0xFFFF
+
+#define C(_x) \
+	PERF_COUNT_HW_CACHE_##_x
+
+#define CACHE_OP_UNSUPPORTED		0xFFFF
+
+static unsigned armpmu_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
+				     [PERF_COUNT_HW_CACHE_OP_MAX]
+				     [PERF_COUNT_HW_CACHE_RESULT_MAX];
+
+static const int
+armpmu_map_cache_event(u64 config)
+{
+	unsigned int cache_type, cache_op, cache_result, ret;
+
+	cache_type = (config >>  0) & 0xff;
+	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
+		return -EINVAL;
+
+	cache_op = (config >>  8) & 0xff;
+	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
+		return -EINVAL;
+
+	cache_result = (config >> 16) & 0xff;
+	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
+		return -EINVAL;
+
+	ret = (int)armpmu_perf_cache_map[cache_type][cache_op][cache_result];
+
+	if (ret == CACHE_OP_UNSUPPORTED)
+		return -ENOENT;
+
+	return ret;
+}
+
+static int
+armpmu_event_set_period(struct perf_event *event,
+			struct hw_perf_event *hwc,
+			int idx)
+{
+	s64 left = atomic64_read(&hwc->period_left);
+	s64 period = hwc->sample_period;
+	int ret = 0;
+
+	if (unlikely(left <= -period)) {
+		left = period;
+		atomic64_set(&hwc->period_left, left);
+		hwc->last_period = period;
+		ret = 1;
+	}
+
+	if (unlikely(left <= 0)) {
+		left += period;
+		atomic64_set(&hwc->period_left, left);
+		hwc->last_period = period;
+		ret = 1;
+	}
+
+	if (left > armpmu->max_period)
+		left = armpmu->max_period;
+
+	atomic64_set(&hwc->prev_count, (u64)-left);
+
+	armpmu->write_counter(idx, (u64)(-left) & 0xffffffff);
+
+	perf_event_update_userpage(event);
+
+	return ret;
+}
+
+static u64
+armpmu_event_update(struct perf_event *event,
+		    struct hw_perf_event *hwc,
+		    int idx)
+{
+	int shift = 64 - 32;
+	u64 prev_raw_count, new_raw_count;
+	s64 delta;
+
+again:
+	prev_raw_count = atomic64_read(&hwc->prev_count);
+	new_raw_count = armpmu->read_counter(idx);
+
+	if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
+			     new_raw_count) != prev_raw_count)
+		goto again;
+
+	delta = (new_raw_count << shift) - (prev_raw_count << shift);
+	delta >>= shift;
+
+	atomic64_add(delta, &event->count);
+	atomic64_sub(delta, &hwc->period_left);
+
+	return new_raw_count;
+}
+
+static void
+armpmu_disable(struct perf_event *event)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct hw_perf_event *hwc = &event->hw;
+	int idx = hwc->idx;
+
+	WARN_ON(idx < 0);
+
+	clear_bit(idx, cpuc->active_mask);
+	armpmu->disable(hwc, idx);
+
+	barrier();
+
+	armpmu_event_update(event, hwc, idx);
+	cpuc->events[idx] = NULL;
+	clear_bit(idx, cpuc->used_mask);
+
+	perf_event_update_userpage(event);
+}
+
+static void
+armpmu_read(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	/* Don't read disabled counters! */
+	if (hwc->idx < 0)
+		return;
+
+	armpmu_event_update(event, hwc, hwc->idx);
+}
+
+static void
+armpmu_unthrottle(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	armpmu->enable(hwc, hwc->idx);
+}
+
+static int
+armpmu_enable(struct perf_event *event)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct hw_perf_event *hwc = &event->hw;
+	int idx;
+	int err = 0;
+
+	/* If we don't have a space for the counter then finish early. */
+	idx = armpmu->get_event_idx(cpuc, hwc);
+	if (idx < 0) {
+		err = idx;
+		goto out;
+	}
+
+	/*
+	 * If there is an event in the counter we are going to use then make
+	 * sure it is disabled.
+	 */
+	event->hw.idx = idx;
+	armpmu->disable(hwc, idx);
+	cpuc->events[idx] = event;
+	set_bit(idx, cpuc->active_mask);
+
+	/* Set the period for the event. */
+	armpmu_event_set_period(event, hwc, idx);
+
+	/* Enable the event. */
+	armpmu->enable(hwc, idx);
+
+	/* Propagate our changes to the userspace mapping. */
+	perf_event_update_userpage(event);
+
+out:
+	return err;
+}
+
+static struct pmu pmu = {
+        .enable     = armpmu_enable,
+        .disable    = armpmu_disable,
+        .unthrottle = armpmu_unthrottle,
+        .read       = armpmu_read,
+};
+
+static int
+validate_event(struct cpu_hw_events *cpuc,
+	       struct perf_event *event)
+{
+        struct hw_perf_event fake_event = event->hw;
+
+        if (event->pmu && event->pmu != &pmu)
+                return 0;
+
+        return armpmu->get_event_idx(cpuc, &fake_event) >= 0;
+}
+
+static int
+validate_group(struct perf_event *event)
+{
+        struct perf_event *sibling, *leader = event->group_leader;
+        struct cpu_hw_events fake_pmu;
+
+        memset(&fake_pmu, 0, sizeof(fake_pmu));
+
+        if (!validate_event(&fake_pmu, leader))
+                return -ENOSPC;
+
+        list_for_each_entry(sibling, &leader->sibling_list, group_entry) {
+                if (!validate_event(&fake_pmu, sibling))
+                        return -ENOSPC;
+        }
+
+        if (!validate_event(&fake_pmu, event))
+                return -ENOSPC;
+
+        return 0;
+}
+
+static int
+armpmu_reserve_hardware(void)
+{
+	int i;
+	int err;
+
+	pmu_irqs = reserve_pmu();
+	if (IS_ERR(pmu_irqs)) {
+		pr_warning("unable to reserve pmu\n");
+		return PTR_ERR(pmu_irqs);
+	}
+
+	init_pmu();
+
+	if (pmu_irqs->num_irqs < 1) {
+		pr_err("no irqs for PMUs defined\n");
+	}
+
+	for (i = 0; i < pmu_irqs->num_irqs; ++i) {
+		err = request_irq(pmu_irqs->irqs[i], armpmu->handle_irq,
+				  IRQF_DISABLED, "armpmu", NULL);
+		if (err) {
+			pr_warning("unable to request IRQ%d for ARM "
+				   "perf counters\n", pmu_irqs->irqs[i]);
+			break;
+		}
+	}
+
+	if (err) {
+		for (i = i - 1; i >= 0; --i)
+			free_irq(pmu_irqs->irqs[i], NULL);
+		release_pmu(pmu_irqs);
+		pmu_irqs = NULL;
+	}
+
+	return err;
+}
+
+static void
+armpmu_release_hardware(void)
+{
+	int i;
+
+	for (i = pmu_irqs->num_irqs - 1; i >= 0; --i)
+		free_irq(pmu_irqs->irqs[i], NULL);
+	armpmu->stop();
+
+	release_pmu(pmu_irqs);
+	pmu_irqs = NULL;
+}
+
+static atomic_t active_events = ATOMIC_INIT(0);
+static DEFINE_MUTEX(pmu_reserve_mutex);
+
+static void
+hw_perf_event_destroy(struct perf_event *event)
+{
+	if (atomic_dec_and_mutex_lock(&active_events, &pmu_reserve_mutex)) {
+		armpmu_release_hardware();
+		mutex_unlock(&pmu_reserve_mutex);
+	}
+}
+
+static int
+__hw_perf_event_init(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	int mapping, err;
+
+	/* Decode the generic type into an ARM event identifier. */
+	if (PERF_TYPE_HARDWARE == event->attr.type) {
+		mapping = armpmu->event_map(event->attr.config);
+	} else if (PERF_TYPE_HW_CACHE == event->attr.type) {
+		mapping = armpmu_map_cache_event(event->attr.config);
+	} else if (PERF_TYPE_RAW == event->attr.type) {
+		mapping = armpmu->raw_event(event->attr.config);
+	} else {
+		pr_debug("event type %x not supported\n", event->attr.type);
+		return -EOPNOTSUPP;
+	}
+
+	if (mapping < 0) {
+		pr_debug("event %x:%llx not supported\n", event->attr.type,
+			 event->attr.config);
+		return mapping;
+	}
+
+	/*
+	 * Check whether we need to exclude the counter from certain modes.
+	 * The ARM performance counters are on all of the time so if someone
+	 * has asked us for some excludes then we have to fail.
+	 */
+	if (event->attr.exclude_kernel || event->attr.exclude_user ||
+	    event->attr.exclude_hv || event->attr.exclude_idle) {
+		pr_debug("ARM performance counters do not support "
+			 "mode exclusion\n");
+		return -EPERM;
+	}
+
+	/*
+	 * We don't assign an index until we actually place the event onto
+	 * hardware. Use -1 to signify that we haven't decided where to put it
+	 * yet. For SMP systems, each core has it's own PMU so we can't do any
+	 * clever allocation or constraints checking at this point.
+	 */
+	hwc->idx = -1;
+
+	/*
+	 * Store the event encoding into the config_base field. config and
+	 * event_base are unused as the only 2 things we need to know are
+	 * the event mapping and the counter to use. The counter to use is
+	 * also the indx and the config_base is the event type.
+	 */
+	hwc->config_base	    = (unsigned long)mapping;
+	hwc->config		    = 0;
+	hwc->event_base		    = 0;
+
+	if (!hwc->sample_period) {
+		hwc->sample_period  = armpmu->max_period;
+		hwc->last_period    = hwc->sample_period;
+		atomic64_set(&hwc->period_left, hwc->sample_period);
+	}
+
+	err = 0;
+	if (event->group_leader != event) {
+		err = validate_group(event);
+		if (err)
+			return -EINVAL;
+	}
+
+	return err;
+}
+
+const struct pmu *
+hw_perf_event_init(struct perf_event *event)
+{
+	int err = 0;
+
+        if (!armpmu)
+                return ERR_PTR(-ENODEV);
+
+	event->destroy = hw_perf_event_destroy;
+
+	if (!atomic_inc_not_zero(&active_events)) {
+		if (atomic_read(&active_events) > perf_max_events) {
+			atomic_dec(&active_events);
+			return ERR_PTR(-ENOSPC);
+		}
+
+		mutex_lock(&pmu_reserve_mutex);
+		if (atomic_read(&active_events) == 0) {
+			err = armpmu_reserve_hardware();
+		}
+
+		if (!err)
+			atomic_inc(&active_events);
+		mutex_unlock(&pmu_reserve_mutex);
+	}
+
+	if (err)
+		return ERR_PTR(err);
+
+	err = __hw_perf_event_init(event);
+	if (err)
+		hw_perf_event_destroy(event);
+
+	return err ? ERR_PTR(err) : &pmu;
+}
+
+void
+hw_perf_enable(void)
+{
+	/* Enable all of the perf events on hardware. */
+	int idx;
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+        if (!armpmu)
+                return;
+
+	for (idx = 0; idx <= armpmu->num_events; ++idx) {
+		struct perf_event *event = cpuc->events[idx];
+
+		if (!event)
+			continue;
+
+		armpmu->enable(&event->hw, idx);
+	}
+
+        armpmu->start();
+}
+
+void
+hw_perf_disable(void)
+{
+        if (armpmu)
+                armpmu->stop();
+}
+
+/*
+ * ARMv6 Performance counter handling code.
+ *
+ * ARMv6 has 2 configurable performance counters and a single cycle counter.
+ * They all share a single reset bit but can be written to zero so we can use
+ * that for a reset.
+ *
+ * The counters can't be individually enabled or disabled so when we remove
+ * one event and replace it with another we could get spurious counts from the
+ * wrong event. However, we can take advantage of the fact that the
+ * performance counters can export events to the event bus, and the event bus
+ * itself can be monitored. This requires that we *don't* export the events to
+ * the event bus. The procedure for disabling a configurable counter is:
+ *	- change the counter to count the ETMEXTOUT[0] signal (0x20). This
+ *	  effectively stops the counter from counting.
+ *	- disable the counter's interrupt generation (each counter has it's
+ *	  own interrupt enable bit).
+ * Once stopped, the counter value can be written as 0 to reset.
+ *
+ * To enable a counter:
+ *	- enable the counter's interrupt generation.
+ *	- set the new event type.
+ *
+ * Note: the dedicated cycle counter only counts cycles and can't be
+ * enabled/disabled independently of the others. When we want to disable the
+ * cycle counter, we have to just disable the interrupt reporting and start
+ * ignoring that counter. When re-enabling, we have to reset the value and
+ * enable the interrupt.
+ */
+
+enum armv6_perf_types {
+	ARMV6_PERFCTR_ICACHE_MISS	= 0x0,
+	ARMV6_PERFCTR_IBUF_STALL	= 0x1,
+	ARMV6_PERFCTR_DDEP_STALL	= 0x2,
+	ARMV6_PERFCTR_ITLB_MISS		= 0x3,
+	ARMV6_PERFCTR_DTLB_MISS		= 0x4,
+	ARMV6_PERFCTR_BR_EXEC		= 0x5,
+	ARMV6_PERFCTR_BR_MISPREDICT	= 0x6,
+	ARMV6_PERFCTR_INSTR_EXEC	= 0x7,
+	ARMV6_PERFCTR_DCACHE_HIT	= 0x9,
+	ARMV6_PERFCTR_DCACHE_ACCESS	= 0xA,
+	ARMV6_PERFCTR_DCACHE_MISS	= 0xB,
+	ARMV6_PERFCTR_DCACHE_WBACK	= 0xC,
+	ARMV6_PERFCTR_SW_PC_CHANGE	= 0xD,
+	ARMV6_PERFCTR_MAIN_TLB_MISS	= 0xF,
+	ARMV6_PERFCTR_EXPL_D_ACCESS	= 0x10,
+	ARMV6_PERFCTR_LSU_FULL_STALL	= 0x11,
+	ARMV6_PERFCTR_WBUF_DRAINED	= 0x12,
+	ARMV6_PERFCTR_CPU_CYCLES	= 0xFF,
+	ARMV6_PERFCTR_NOP		= 0x20,
+};
+
+enum armv6_counters {
+	ARMV6_CYCLE_COUNTER = 1,
+	ARMV6_COUNTER0,
+	ARMV6_COUNTER1,
+};
+
+/*
+ * The hardware events that we support. We do support cache operations but
+ * we have harvard caches and no way to combine instruction and data
+ * accesses/misses in hardware.
+ */
+static const unsigned armv6_perf_map[PERF_COUNT_HW_MAX] = {
+	[PERF_COUNT_HW_CPU_CYCLES]	    = ARMV6_PERFCTR_CPU_CYCLES,
+	[PERF_COUNT_HW_INSTRUCTIONS]	    = ARMV6_PERFCTR_INSTR_EXEC,
+	[PERF_COUNT_HW_CACHE_REFERENCES]    = HW_OP_UNSUPPORTED,
+	[PERF_COUNT_HW_CACHE_MISSES]	    = HW_OP_UNSUPPORTED,
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = ARMV6_PERFCTR_BR_EXEC,
+	[PERF_COUNT_HW_BRANCH_MISSES]	    = ARMV6_PERFCTR_BR_MISPREDICT,
+	[PERF_COUNT_HW_BUS_CYCLES]	    = HW_OP_UNSUPPORTED,
+};
+
+static const unsigned armv6_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
+				          [PERF_COUNT_HW_CACHE_OP_MAX]
+				          [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+	[C(L1D)] = {
+		/*
+		 * The performance counters don't differentiate between read
+		 * and write accesses/misses so this isn't strictly correct,
+		 * but it's the best we can do. Writes and reads get
+		 * combined.
+		 */
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= ARMV6_PERFCTR_DCACHE_ACCESS,
+			[C(RESULT_MISS)]	= ARMV6_PERFCTR_DCACHE_MISS,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= ARMV6_PERFCTR_DCACHE_ACCESS,
+			[C(RESULT_MISS)]	= ARMV6_PERFCTR_DCACHE_MISS,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+	[C(L1I)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= ARMV6_PERFCTR_ICACHE_MISS,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= ARMV6_PERFCTR_ICACHE_MISS,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+	[C(LL)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+	[C(DTLB)] = {
+		/*
+		 * The ARM performance counters can count micro DTLB misses,
+		 * micro ITLB misses and main TLB misses. There isn't an event
+		 * for TLB misses, so use the micro misses here and if users
+		 * want the main TLB misses they can use a raw counter.
+		 */
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= ARMV6_PERFCTR_DTLB_MISS,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= ARMV6_PERFCTR_DTLB_MISS,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+	[C(ITLB)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= ARMV6_PERFCTR_ITLB_MISS,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= ARMV6_PERFCTR_ITLB_MISS,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+	[C(BPU)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+};
+
+static inline unsigned long
+armv6_pmcr_read(void)
+{
+	u32 val;
+	asm volatile("mrc   p15, 0, %0, c15, c12, 0" : "=r"(val));
+	return val;
+}
+
+static inline void
+armv6_pmcr_write(unsigned long val)
+{
+	asm volatile("mcr   p15, 0, %0, c15, c12, 0" : : "r"(val));
+}
+
+#define ARMV6_PMCR_ENABLE		(1 << 0)
+#define ARMV6_PMCR_CTR01_RESET          (1 << 1)
+#define ARMV6_PMCR_CCOUNT_RESET         (1 << 2)
+#define ARMV6_PMCR_CCOUNT_DIV           (1 << 3)
+#define ARMV6_PMCR_COUNT0_IEN	        (1 << 4)
+#define ARMV6_PMCR_COUNT1_IEN	        (1 << 5)
+#define ARMV6_PMCR_CCOUNT_IEN	        (1 << 6)
+#define ARMV6_PMCR_COUNT0_OVERFLOW	(1 << 8)
+#define ARMV6_PMCR_COUNT1_OVERFLOW	(1 << 9)
+#define ARMV6_PMCR_CCOUNT_OVERFLOW	(1 << 10)
+#define ARMV6_PMCR_EVT_COUNT0_SHIFT	20
+#define ARMV6_PMCR_EVT_COUNT0_MASK	(0xFF << ARMV6_PMCR_EVT_COUNT0_SHIFT)
+#define ARMV6_PMCR_EVT_COUNT1_SHIFT	12
+#define ARMV6_PMCR_EVT_COUNT1_MASK	(0xFF << ARMV6_PMCR_EVT_COUNT1_SHIFT)
+
+#define ARMV6_PMCR_OVERFLOWED_MASK \
+	(ARMV6_PMCR_COUNT0_OVERFLOW | ARMV6_PMCR_COUNT1_OVERFLOW | \
+	 ARMV6_PMCR_CCOUNT_OVERFLOW)
+
+static inline int
+armv6_pmcr_has_overflowed(unsigned long pmcr)
+{
+	return (pmcr & ARMV6_PMCR_OVERFLOWED_MASK);
+}
+
+static inline int
+armv6_pmcr_counter_has_overflowed(unsigned long pmcr,
+			          enum armv6_counters counter)
+{
+	int ret;
+
+	if (ARMV6_CYCLE_COUNTER == counter)
+		ret = pmcr & ARMV6_PMCR_CCOUNT_OVERFLOW;
+	else if (ARMV6_COUNTER0 == counter)
+		ret = pmcr & ARMV6_PMCR_COUNT0_OVERFLOW;
+	else if (ARMV6_COUNTER1 == counter)
+		ret = pmcr & ARMV6_PMCR_COUNT1_OVERFLOW;
+	else
+		BUG();
+
+	return ret;
+}
+
+static inline u32
+armv6pmu_read_counter(int counter)
+{
+	unsigned long value;
+
+	if (ARMV6_CYCLE_COUNTER == counter)
+		asm volatile("mrc   p15, 0, %0, c15, c12, 1" : "=r"(value));
+	else if (ARMV6_COUNTER0 == counter)
+		asm volatile("mrc   p15, 0, %0, c15, c12, 2" : "=r"(value));
+	else if (ARMV6_COUNTER1 == counter)
+		asm volatile("mrc   p15, 0, %0, c15, c12, 3" : "=r"(value));
+	else
+		BUG();
+
+	return value;
+}
+
+static inline void
+armv6pmu_write_counter(int counter,
+		       u32 value)
+{
+	if (ARMV6_CYCLE_COUNTER == counter)
+		asm volatile("mcr   p15, 0, %0, c15, c12, 1" : : "r"(value));
+	else if (ARMV6_COUNTER0 == counter)
+		asm volatile("mcr   p15, 0, %0, c15, c12, 2" : : "r"(value));
+	else if (ARMV6_COUNTER1 == counter)
+		asm volatile("mcr   p15, 0, %0, c15, c12, 3" : : "r"(value));
+	else
+		BUG();
+}
+
+void
+armv6pmu_enable_event(struct hw_perf_event *hwc,
+		      int idx)
+{
+	unsigned long val, mask, evt, flags;
+
+	if (ARMV6_CYCLE_COUNTER == idx) {
+		mask	= 0;
+		evt	= ARMV6_PMCR_CCOUNT_IEN;
+	} else if (ARMV6_COUNTER0 == idx) {
+		mask	= ARMV6_PMCR_EVT_COUNT0_MASK;
+		evt	= (hwc->config_base << ARMV6_PMCR_EVT_COUNT0_SHIFT) |
+			  ARMV6_PMCR_COUNT0_IEN;
+	} else if (ARMV6_COUNTER1 == idx) {
+		mask	= ARMV6_PMCR_EVT_COUNT1_MASK;
+		evt	= (hwc->config_base << ARMV6_PMCR_EVT_COUNT1_SHIFT) |
+			  ARMV6_PMCR_COUNT1_IEN;
+	} else {
+		BUG();
+	}
+
+	/*
+	 * Mask out the current event and set the counter to count the event
+	 * that we're interested in.
+	 */
+	spin_lock_irqsave(&pmu_lock, flags);
+	val = armv6_pmcr_read();
+	val &= ~mask;
+	val |= evt;
+	armv6_pmcr_write(val);
+	spin_unlock_irqrestore(&pmu_lock, flags);
+}
+
+static irqreturn_t
+armv6pmu_handle_irq(int irq_num,
+		    void *dev)
+{
+	unsigned long pmcr = armv6_pmcr_read();
+	struct perf_sample_data data;
+	struct cpu_hw_events *cpuc;
+	struct pt_regs *regs;
+	int idx;
+
+	if (!armv6_pmcr_has_overflowed(pmcr))
+		return IRQ_NONE;
+
+	regs = get_irq_regs();
+
+	/*
+	 * The interrupts are cleared by writing the overflow flags back to
+	 * the control register. All of the other bits don't have any effect
+	 * if they are rewritten, so write the whole value back.
+	 */
+	armv6_pmcr_write(pmcr);
+
+	data.addr = 0;
+
+	cpuc = &__get_cpu_var(cpu_hw_events);
+	for (idx = 0; idx <= armpmu->num_events; ++idx) {
+		struct perf_event *event = cpuc->events[idx];
+		struct hw_perf_event *hwc;
+
+		if (!test_bit(idx, cpuc->active_mask))
+			continue;
+
+		/*
+		 * We have a single interrupt for all counters. Check that
+		 * each counter has overflowed before we process it.
+		 */
+		if (!armv6_pmcr_counter_has_overflowed(pmcr, idx))
+			continue;
+
+		hwc = &event->hw;
+		armpmu_event_update(event, hwc, idx);
+		data.period = event->hw.last_period;
+		if (!armpmu_event_set_period(event, hwc, idx))
+			continue;
+
+		if (perf_event_overflow(event, 0, &data, regs))
+			armpmu->disable(hwc, idx);
+	}
+
+	/*
+	 * Handle the pending perf events.
+	 *
+	 * Note: this call *must* be run with interrupts enabled. For
+	 * platforms that can have the PMU interrupts raised as a PMI, this
+	 * will not work.
+	 */
+	perf_event_do_pending();
+
+	return IRQ_HANDLED;
+}
+
+static void
+armv6pmu_start(void)
+{
+	unsigned long flags, val;
+
+	spin_lock_irqsave(&pmu_lock, flags);
+	val = armv6_pmcr_read();
+	val |= ARMV6_PMCR_ENABLE;
+	armv6_pmcr_write(val);
+	spin_unlock_irqrestore(&pmu_lock, flags);
+}
+
+void
+armv6pmu_stop(void)
+{
+	unsigned long flags, val;
+
+	spin_lock_irqsave(&pmu_lock, flags);
+	val = armv6_pmcr_read();
+	val &= ~ARMV6_PMCR_ENABLE;
+	armv6_pmcr_write(val);
+	spin_unlock_irqrestore(&pmu_lock, flags);
+}
+
+static inline int
+armv6pmu_event_map(int config)
+{
+	int mapping = armv6_perf_map[config];
+	if (HW_OP_UNSUPPORTED == mapping)
+		mapping = -EOPNOTSUPP;
+	return mapping;
+}
+
+static u64
+armv6pmu_raw_event(u64 config)
+{
+        return config & 0xff;
+}
+
+static int
+armv6pmu_get_event_idx(struct cpu_hw_events *cpuc,
+		       struct hw_perf_event *event)
+{
+	/* Always place a cycle counter into the cycle counter. */
+	if (ARMV6_PERFCTR_CPU_CYCLES == event->config_base) {
+		if (test_and_set_bit(ARMV6_CYCLE_COUNTER, cpuc->used_mask))
+			return -EAGAIN;
+
+		return ARMV6_CYCLE_COUNTER;
+	} else {
+		/*
+		 * For anything other than a cycle counter, try and use
+		 * counter0 and counter1.
+		 */
+		if (!test_and_set_bit(ARMV6_COUNTER1, cpuc->used_mask)) {
+			return ARMV6_COUNTER1;
+		}
+
+		if (!test_and_set_bit(ARMV6_COUNTER0, cpuc->used_mask)) {
+			return ARMV6_COUNTER0;
+		}
+
+		/* The counters are all in use. */
+		return -EAGAIN;
+	}
+}
+
+static void
+armv6pmu_disable_event(struct hw_perf_event *hwc,
+		       int idx)
+{
+	unsigned long val, mask, evt, flags;
+
+	if (ARMV6_CYCLE_COUNTER == idx) {
+		mask	= ARMV6_PMCR_CCOUNT_IEN;
+		evt	= 0;
+	} else if (ARMV6_COUNTER0 == idx) {
+		mask	= ARMV6_PMCR_COUNT0_IEN | ARMV6_PMCR_EVT_COUNT0_MASK;
+		evt	= ARMV6_PERFCTR_NOP << ARMV6_PMCR_EVT_COUNT0_SHIFT;
+	} else if (ARMV6_COUNTER1 == idx) {
+		mask	= ARMV6_PMCR_COUNT1_IEN | ARMV6_PMCR_EVT_COUNT1_MASK;
+		evt	= ARMV6_PERFCTR_NOP << ARMV6_PMCR_EVT_COUNT1_SHIFT;
+	} else {
+		BUG();
+	}
+
+	/*
+	 * Mask out the current event and set the counter to count the number
+	 * of ETM bus signal assertion cycles. The external reporting should
+	 * be disabled and so this should never increment.
+	 */
+	spin_lock_irqsave(&pmu_lock, flags);
+	val = armv6_pmcr_read();
+	val &= ~mask;
+	val |= evt;
+	armv6_pmcr_write(val);
+	spin_unlock_irqrestore(&pmu_lock, flags);
+}
+
+static struct arm_pmu armv6pmu = {
+	.name			= "v6",
+	.handle_irq		= armv6pmu_handle_irq,
+	.enable			= armv6pmu_enable_event,
+	.disable		= armv6pmu_disable_event,
+	.event_map		= armv6pmu_event_map,
+	.raw_event		= armv6pmu_raw_event,
+        .read_counter           = armv6pmu_read_counter,
+        .write_counter          = armv6pmu_write_counter,
+	.get_event_idx		= armv6pmu_get_event_idx,
+        .start                  = armv6pmu_start,
+	.stop		        = armv6pmu_stop,
+	.num_events		= 3,
+	.max_period		= (1LLU << 32) - 1,
+};
+
+static int __init
+init_hw_perf_events(void)
+{
+#define CPUID_V6_MASK   0x7F000
+#define CPUID_V6_BITS   0x7B000
+        unsigned long cpuid = read_cpuid_id();
+
+        if (CPUID_V6_BITS == (cpuid & CPUID_V6_MASK)) {
+                armpmu = &armv6pmu;
+                memcpy(armpmu_perf_cache_map, armv6_perf_cache_map,
+                       sizeof(armv6_perf_cache_map));
+                perf_max_events	= armv6pmu.num_events;
+        } else {
+                pr_info("no hardware support available\n");
+                perf_max_events = -1;
+        }
+
+        if (armpmu)
+                pr_info("enabled with %s PMU driver\n",
+                        armpmu->name);
+
+        return 0;
+}
+arch_initcall(init_hw_perf_events);
+
+/*
+ * Callchain handling code.
+ */
+static inline void
+callchain_store(struct perf_callchain_entry *entry,
+		u64 ip)
+{
+	if (entry->nr < PERF_MAX_STACK_DEPTH)
+		entry->ip[entry->nr++] = ip;
+}
+
+/*
+ * The registers we're interested in are at the end of the variable
+ * length saved register structure. The fp points at the end of this
+ * structure so the address of this struct is:
+ * (struct frame_tail *)(xxx->fp)-1
+ *
+ * This code has been adapted from the ARM OProfile support.
+ */
+struct frame_tail {
+	struct frame_tail   *fp;
+	unsigned long	    sp;
+	unsigned long	    lr;
+} __attribute__((packed));
+
+/*
+ * Get the return address for a single stackframe and return a pointer to the
+ * next frame tail.
+ */
+static struct frame_tail *
+user_backtrace(struct frame_tail *tail,
+	       struct perf_callchain_entry *entry)
+{
+	struct frame_tail buftail;
+
+	/* Also check accessibility of one struct frame_tail beyond */
+	if (!access_ok(VERIFY_READ, tail, sizeof(buftail)))
+		return NULL;
+	if (__copy_from_user_inatomic(&buftail, tail, sizeof(buftail)))
+		return NULL;
+
+	callchain_store(entry, buftail.lr);
+
+	/*
+	 * Frame pointers should strictly progress back up the stack
+	 * (towards higher addresses).
+	 */
+	if (tail >= buftail.fp)
+		return NULL;
+
+	return buftail.fp - 1;
+}
+
+static void
+perf_callchain_user(struct pt_regs *regs,
+		    struct perf_callchain_entry *entry)
+{
+	struct frame_tail *tail;
+
+	callchain_store(entry, PERF_CONTEXT_USER);
+
+	if (!user_mode(regs))
+		regs = task_pt_regs(current);
+
+	tail = (struct frame_tail *)regs->ARM_fp - 1;
+
+	while (tail && !((unsigned long)tail & 0x3))
+		tail = user_backtrace(tail, entry);
+}
+
+/*
+ * Gets called by walk_stackframe() for every stackframe. This will be called
+ * whist unwinding the stackframe and is like a subroutine return so we use
+ * the PC.
+ */
+static int
+callchain_trace(struct stackframe *fr,
+		void *data)
+{
+	struct perf_callchain_entry *entry = data;
+	callchain_store(entry, fr->pc);
+	return 0;
+}
+
+static void
+perf_callchain_kernel(struct pt_regs *regs,
+		      struct perf_callchain_entry *entry)
+{
+	struct stackframe fr;
+
+	callchain_store(entry, PERF_CONTEXT_KERNEL);
+	fr.fp = regs->ARM_fp;
+	fr.sp = regs->ARM_sp;
+	fr.lr = regs->ARM_lr;
+	fr.pc = regs->ARM_pc;
+	walk_stackframe(&fr, callchain_trace, entry);
+}
+
+static void
+perf_do_callchain(struct pt_regs *regs,
+		  struct perf_callchain_entry *entry)
+{
+	int is_user;
+
+	if (!regs)
+		return;
+
+	is_user = user_mode(regs);
+
+	if (!current || !current->pid)
+		return;
+
+	if (is_user && current->state != TASK_RUNNING)
+		return;
+
+	if (!is_user)
+		perf_callchain_kernel(regs, entry);
+
+	if (current->mm)
+		perf_callchain_user(regs, entry);
+}
+
+static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
+
+struct perf_callchain_entry *
+perf_callchain(struct pt_regs *regs)
+{
+	struct perf_callchain_entry *entry = &__get_cpu_var(pmc_irq_entry);
+
+	entry->nr = 0;
+	perf_do_callchain(regs, entry);
+	return entry;
+}
-- 
1.6.5.4

^ permalink raw reply related	[flat|nested] 55+ messages in thread

* [PATCH 1/5] arm: provide a mechanism to reserve performance counters
  2009-12-15 11:15 ` [PATCH 1/5] arm: provide a mechanism to reserve performance counters Jamie Iles
  2009-12-15 11:15   ` [PATCH 2/5] arm/oprofile: reserve the PMU when starting Jamie Iles
@ 2009-12-15 14:13   ` Will Deacon
  2009-12-15 14:36     ` Jamie Iles
  2009-12-17 16:14   ` Will Deacon
  2 siblings, 1 reply; 55+ messages in thread
From: Will Deacon @ 2009-12-15 14:13 UTC (permalink / raw)
  To: linux-arm-kernel

Hi Jamie,

It's getting there! Minor stylistic suggestions inline.

* Jamie Iles wrote:

> To add support for perf events and to allow the hardware
> counters to be shared with oprofile, we need a way to reserve
> access to the pmu (performance monitor unit).
> 
> diff --git a/arch/arm/include/asm/pmu.h b/arch/arm/include/asm/pmu.h
> new file mode 100644
> index 0000000..e7cc264
> --- /dev/null
> +++ b/arch/arm/include/asm/pmu.h
> @@ -0,0 +1,74 @@
> +/*
> + *  linux/arch/arm/include/asm/pmu.h
> + *
> + *  Copyright (C) 2009 picoChip Designs Ltd, Jamie Iles
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + */
> +
> +#ifndef __ARM_PMU_H__
> +#define __ARM_PMU_H__
> +
> +#ifdef CONFIG_CPU_HAS_PMU
> +
> +struct pmu_irqs {
> +	const int   *irqs;
> +	unsigned    num_irqs;
> +};
> +
> +/**
> + * reserve_pmu() - reserve the hardware performance counters
> + *
> + * Reserve the hardware performance counters in the system for exclusive use.
> + * The 'struct pmu_irqs' for the system is returned on success, ERR_PTR()
> + * encoded error on failure.
> + */
> +extern const struct pmu_irqs *
> +reserve_pmu(void);

I think it's standard Kernel coding style to put the declaration of a function
all on one line if it fits. The same goes elsewhere in the patch.

> diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
> index dd4698c..5cd0ec4 100644
> --- a/arch/arm/mm/Kconfig
> +++ b/arch/arm/mm/Kconfig
> @@ -536,6 +536,11 @@ config CPU_COPY_FA
>  config CPU_COPY_V6
>  	bool
> 
> +config CPU_HAS_PMU
> +	depends on CPU_V6 || CPU_V7 || CPU_XSCALE
> +	default y
> +	bool

I think you should use XSCALE_PMU instead of CPU_XSCALE. Also, this should
probably be in the top-level ARM Kconfig instead of the mm/ one.

Cheers,

Will

^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6
  2009-12-15 11:15         ` [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6 Jamie Iles
@ 2009-12-15 14:29           ` Will Deacon
  2009-12-15 15:02             ` Jamie Iles
  2009-12-18 17:05           ` Perf Event support for ARMv7 (was: Re: [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6) Jean Pihet
  1 sibling, 1 reply; 55+ messages in thread
From: Will Deacon @ 2009-12-15 14:29 UTC (permalink / raw)
  To: linux-arm-kernel

Hi Jamie,

I've not looked at the entire patch in depth, but I did spot a couple
of things:

> This patch implements support for ARMv6 performance counters in the
> Linux performance events subsystem. ARMv6 architectures that have the
> performance counters should enable HW_PERF_EVENTS and define the
> interrupts for the counters in arch/arm/kernel/perf_event.c
> 
> This implementation also provides an ARM PMU abstraction layer to allow
> ARMv7 and others to be supported in the future by adding new a
> 'struct arm_pmu'.

<snip>

> +static struct arm_pmu armv6pmu = {
> +	.name			= "v6",
> +	.handle_irq		= armv6pmu_handle_irq,
> +	.enable			= armv6pmu_enable_event,
> +	.disable		= armv6pmu_disable_event,
> +	.event_map		= armv6pmu_event_map,
> +	.raw_event		= armv6pmu_raw_event,
> +        .read_counter           = armv6pmu_read_counter,
> +        .write_counter          = armv6pmu_write_counter,
> +	.get_event_idx		= armv6pmu_get_event_idx,
> +        .start                  = armv6pmu_start,
> +	.stop		        = armv6pmu_stop,
> +	.num_events		= 3,
> +	.max_period		= (1LLU << 32) - 1,
> +};

Your indentation seems to have gone awry here, I think I saw it somewhere
else in the file too.

> +static int __init
> +init_hw_perf_events(void)
> +{
> +#define CPUID_V6_MASK   0x7F000
> +#define CPUID_V6_BITS   0x7B000
> +        unsigned long cpuid = read_cpuid_id();
> +
> +        if (CPUID_V6_BITS == (cpuid & CPUID_V6_MASK)) {
> +                armpmu = &armv6pmu;
> +                memcpy(armpmu_perf_cache_map, armv6_perf_cache_map,
> +                       sizeof(armv6_perf_cache_map));
> +                perf_max_events	= armv6pmu.num_events;
> +        } else {
> +                pr_info("no hardware support available\n");
> +                perf_max_events = -1;
> +        }
> +
> +        if (armpmu)
> +                pr_info("enabled with %s PMU driver\n",
> +                        armpmu->name);
> +
> +        return 0;
> +}
> +arch_initcall(init_hw_perf_events);

Watch out for the 11MPCore CPU. The event numbers are defined *slightly*
differently (1136,1156 and 1176 don't have any conflicts with each other,
but the 11MPCore does). If you look at the TRM the first 6 events are the
same as for other v6 cores, but then event 0x06 becomes a new event: `branch
not predicted' which offsets the others by one. Other than that, the PMUs
are accessed the same way on each core, so you just need to make sure you
select the correct event mappings.

Cheers,

Will

^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH 1/5] arm: provide a mechanism to reserve performance counters
  2009-12-15 14:13   ` [PATCH 1/5] arm: provide a mechanism to reserve performance counters Will Deacon
@ 2009-12-15 14:36     ` Jamie Iles
  2009-12-15 17:06       ` Will Deacon
  0 siblings, 1 reply; 55+ messages in thread
From: Jamie Iles @ 2009-12-15 14:36 UTC (permalink / raw)
  To: linux-arm-kernel

Hi Will,

Many thanks for the review again!

On Tue, Dec 15, 2009 at 02:13:25PM -0000, Will Deacon wrote:
[snip]
> > +/**
> > + * reserve_pmu() - reserve the hardware performance counters
> > + *
> > + * Reserve the hardware performance counters in the system for exclusive use.
> > + * The 'struct pmu_irqs' for the system is returned on success, ERR_PTR()
> > + * encoded error on failure.
> > + */
> > +extern const struct pmu_irqs *
> > +reserve_pmu(void);
> 
> I think it's standard Kernel coding style to put the declaration of a function
> all on one line if it fits. The same goes elsewhere in the patch.
I couldn't find anything that has this written down and there are plenty of
other places in the perf code that do this. I personally like it because you
can grep for "^foo" to find the definition. Unless people have strong
objections to this I'll leave it as is.

> > +config CPU_HAS_PMU
> > +	depends on CPU_V6 || CPU_V7 || CPU_XSCALE
> > +	default y
> > +	bool
> 
> I think you should use XSCALE_PMU instead of CPU_XSCALE. Also, this should
> probably be in the top-level ARM Kconfig instead of the mm/ one.
Ok, I agree with you about using XSCALE_PMU, but why isn't mm/Kconfig the
correct one? It's describing what features the CPU has and the PMU *is* a
feature.

Cheers,

Jamie

^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6
  2009-12-15 14:29           ` Will Deacon
@ 2009-12-15 15:02             ` Jamie Iles
  2009-12-15 15:05               ` Will Deacon
  0 siblings, 1 reply; 55+ messages in thread
From: Jamie Iles @ 2009-12-15 15:02 UTC (permalink / raw)
  To: linux-arm-kernel

Hi Will,

On Tue, Dec 15, 2009 at 02:29:21PM -0000, Will Deacon wrote:
> > +static struct arm_pmu armv6pmu = {
> > +	.name			= "v6",
> > +	.handle_irq		= armv6pmu_handle_irq,
> > +	.enable			= armv6pmu_enable_event,
> > +	.disable		= armv6pmu_disable_event,
> > +	.event_map		= armv6pmu_event_map,
> > +	.raw_event		= armv6pmu_raw_event,
> > +        .read_counter           = armv6pmu_read_counter,
> > +        .write_counter          = armv6pmu_write_counter,
> > +	.get_event_idx		= armv6pmu_get_event_idx,
> > +        .start                  = armv6pmu_start,
> > +	.stop		        = armv6pmu_stop,
> > +	.num_events		= 3,
> > +	.max_period		= (1LLU << 32) - 1,
> > +};
> 
> Your indentation seems to have gone awry here, I think I saw it somewhere
> else in the file too.
Indeed. That needs fixing.

> Watch out for the 11MPCore CPU. The event numbers are defined *slightly*
> differently (1136,1156 and 1176 don't have any conflicts with each other,
> but the 11MPCore does). If you look at the TRM the first 6 events are the
> same as for other v6 cores, but then event 0x06 becomes a new event: `branch
> not predicted' which offsets the others by one. Other than that, the PMUs
> are accessed the same way on each core, so you just need to make sure you
> select the correct event mappings.
Ok, is it a safe assumption that for ARMv6 if num_possible_cpus() returns >1
we are mpcore and need to use different ID's or is there a better way to test
for mpcore?

Thanks,

Jamie

^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6
  2009-12-15 15:02             ` Jamie Iles
@ 2009-12-15 15:05               ` Will Deacon
  2009-12-15 15:19                 ` Jamie Iles
  0 siblings, 1 reply; 55+ messages in thread
From: Will Deacon @ 2009-12-15 15:05 UTC (permalink / raw)
  To: linux-arm-kernel


* Jamie Iles wrote:

> Ok, is it a safe assumption that for ARMv6 if num_possible_cpus() returns >1
> we are mpcore and need to use different ID's or is there a better way to test
> for mpcore?

The bottom 16 bits of the cpuid field will read 0xB020 on an 11MPCore as it's a
different chip to the other v6 cores.

You could cast the cpuid to a u16 and switch on that: 0xB360, 0xB560 and 0xB760
are the single core ARM11s.

Will

^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6
  2009-12-15 15:05               ` Will Deacon
@ 2009-12-15 15:19                 ` Jamie Iles
  2009-12-15 15:30                   ` Peter Zijlstra
  0 siblings, 1 reply; 55+ messages in thread
From: Jamie Iles @ 2009-12-15 15:19 UTC (permalink / raw)
  To: linux-arm-kernel

Hi Will,

On Tue, Dec 15, 2009 at 03:05:13PM -0000, Will Deacon wrote:
> > Ok, is it a safe assumption that for ARMv6 if num_possible_cpus() returns >1
> > we are mpcore and need to use different ID's or is there a better way to test
> > for mpcore?
> 
> The bottom 16 bits of the cpuid field will read 0xB020 on an 11MPCore as it's a
> different chip to the other v6 cores.
> 
> You could cast the cpuid to a u16 and switch on that: 0xB360, 0xB560 and 0xB760
> are the single core ARM11s.
Ok, that sounds like a good idea. Another problem with mpcore support is that
with the v6 performance counters, you can't disable a single event counter.
If we receive lots of interrupts, the higher perf events layers will disable
the counter for a bit before reenabling/unthrottling it.

On the single core, I just disable the interrupt for the counter and tell it
to count the ETM cycle events with the event exporting disabled so that the
counter doesn't increment. mpcore doesn't have this event so the counter will
keep on incrementing. Is there an undocumented event number we can use to stop
the counter? If not we'll need to read the counter, disable it and stash the
value away for when we unthrottle it and write it back. I guess we'll lose
some accuracy so it would be nice not to do that.

Jamie

^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6
  2009-12-15 15:19                 ` Jamie Iles
@ 2009-12-15 15:30                   ` Peter Zijlstra
  2009-12-15 15:36                     ` Jamie Iles
  0 siblings, 1 reply; 55+ messages in thread
From: Peter Zijlstra @ 2009-12-15 15:30 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, 2009-12-15 at 15:19 +0000, Jamie Iles wrote:
> Another problem with mpcore support is that
> with the v6 performance counters, you can't disable a single event
> counter. 

Can you program them with a non-counting event?

On x86 there's various ways of doing that, either by selecting an event
that simply doesn't count (cache-misses with 0 MESI mask), or by telling
it to mask both user and kernel event.

^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6
  2009-12-15 15:30                   ` Peter Zijlstra
@ 2009-12-15 15:36                     ` Jamie Iles
  2009-12-16 10:54                       ` Jamie Iles
  0 siblings, 1 reply; 55+ messages in thread
From: Jamie Iles @ 2009-12-15 15:36 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Dec 15, 2009 at 04:30:13PM +0100, Peter Zijlstra wrote:
> On Tue, 2009-12-15 at 15:19 +0000, Jamie Iles wrote:
> > Another problem with mpcore support is that
> > with the v6 performance counters, you can't disable a single event
> > counter. 
> 
> Can you program them with a non-counting event?
> 
> On x86 there's various ways of doing that, either by selecting an event
> that simply doesn't count (cache-misses with 0 MESI mask), or by telling
> it to mask both user and kernel event.
That's exactly what I do for single core ARMv6. However, in the list of events
for mpcore I can't see any that wouldn't count. There's plenty of reserved
identifiers though so hopefully one of those will do the job. Also, ARM
counters can't be set to exclude any modes.

Jamie

^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH 1/5] arm: provide a mechanism to reserve performance counters
  2009-12-15 14:36     ` Jamie Iles
@ 2009-12-15 17:06       ` Will Deacon
  0 siblings, 0 replies; 55+ messages in thread
From: Will Deacon @ 2009-12-15 17:06 UTC (permalink / raw)
  To: linux-arm-kernel

* Jamie Iles wrote:

> > I think it's standard Kernel coding style to put the declaration of a function
> > all on one line if it fits. The same goes elsewhere in the patch.
> I couldn't find anything that has this written down and there are plenty of
> other places in the perf code that do this. I personally like it because you
> can grep for "^foo" to find the definition. Unless people have strong
> objections to this I'll leave it as is.

I'm just going by the examples in Documentation/CodingStyle.

> > > +config CPU_HAS_PMU
> > > +	depends on CPU_V6 || CPU_V7 || CPU_XSCALE
> > > +	default y
> > > +	bool
> >
> > I think you should use XSCALE_PMU instead of CPU_XSCALE. Also, this should
> > probably be in the top-level ARM Kconfig instead of the mm/ one.
> Ok, I agree with you about using XSCALE_PMU, but why isn't mm/Kconfig the
> correct one? It's describing what features the CPU has and the PMU *is* a
> feature.

I'd say move it out of mm/Kconfig because the PMU is not related to memory
management and almost everything under mm/ is. oprofile may also want to use
these bools and that is based in the top-level ARM Kconfig.

Will

^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6
  2009-12-15 15:36                     ` Jamie Iles
@ 2009-12-16 10:54                       ` Jamie Iles
  2009-12-16 11:04                         ` Will Deacon
  0 siblings, 1 reply; 55+ messages in thread
From: Jamie Iles @ 2009-12-16 10:54 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Dec 15, 2009 at 03:36:27PM +0000, Jamie Iles wrote:
> On Tue, Dec 15, 2009 at 04:30:13PM +0100, Peter Zijlstra wrote:
> > On Tue, 2009-12-15 at 15:19 +0000, Jamie Iles wrote:
> > > Another problem with mpcore support is that
> > > with the v6 performance counters, you can't disable a single event
> > > counter. 
> > 
> > Can you program them with a non-counting event?
> > 
> > On x86 there's various ways of doing that, either by selecting an event
> > that simply doesn't count (cache-misses with 0 MESI mask), or by telling
> > it to mask both user and kernel event.
> That's exactly what I do for single core ARMv6. However, in the list of events
> for mpcore I can't see any that wouldn't count. There's plenty of reserved
> identifiers though so hopefully one of those will do the job. Also, ARM
> counters can't be set to exclude any modes.
Thinking about this a bit more, although we can't disable the counters, we can
disable their interrupt reporting. So, when the generic perf events layer
calls pmu->disable(event), we do the update of the event then turn off the
IRQ. When we come to unthrottling, the counter will have carried on counting,
but if we set the period again, the counter gets set with the correct restart
value and then reenabled.

I think this should work for mpcore and is also required for the cycle counter
on all v6 cores. I've given this a go using an artificially low period on a
cycle counter and it does appear to do the job.

Thanks,

Jamie

^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6
  2009-12-16 10:54                       ` Jamie Iles
@ 2009-12-16 11:04                         ` Will Deacon
  2009-12-16 11:19                           ` Jamie Iles
  0 siblings, 1 reply; 55+ messages in thread
From: Will Deacon @ 2009-12-16 11:04 UTC (permalink / raw)
  To: linux-arm-kernel

*Jamie Iles wrote:

> > That's exactly what I do for single core ARMv6. However, in the list of events
> > for mpcore I can't see any that wouldn't count. There's plenty of reserved
> > identifiers though so hopefully one of those will do the job. Also, ARM
> > counters can't be set to exclude any modes.
>
> Thinking about this a bit more, although we can't disable the counters, we can
> disable their interrupt reporting. So, when the generic perf events layer
> calls pmu->disable(event), we do the update of the event then turn off the
> IRQ. When we come to unthrottling, the counter will have carried on counting,
> but if we set the period again, the counter gets set with the correct restart
> value and then reenabled.

This was my first thought, but I was concerned about how it would play out
with the armpmu_read function. Now that I see we don't read disabled counters,
I can't see any reason not to simply disable the interrupt and stash the count
value.
 
> I think this should work for mpcore and is also required for the cycle counter
> on all v6 cores. I've given this a go using an artificially low period on a
> cycle counter and it does appear to do the job.

If we do this for mpcore, is it worth doing the same thing for the other v6 cores
too [and removing the ETM `hack']?

Cheers,

Will

^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6
  2009-12-16 11:04                         ` Will Deacon
@ 2009-12-16 11:19                           ` Jamie Iles
  0 siblings, 0 replies; 55+ messages in thread
From: Jamie Iles @ 2009-12-16 11:19 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, Dec 16, 2009 at 11:04:09AM -0000, Will Deacon wrote:
> *Jamie Iles wrote:
> 
> > > That's exactly what I do for single core ARMv6. However, in the list of events
> > > for mpcore I can't see any that wouldn't count. There's plenty of reserved
> > > identifiers though so hopefully one of those will do the job. Also, ARM
> > > counters can't be set to exclude any modes.
> >
> > Thinking about this a bit more, although we can't disable the counters, we can
> > disable their interrupt reporting. So, when the generic perf events layer
> > calls pmu->disable(event), we do the update of the event then turn off the
> > IRQ. When we come to unthrottling, the counter will have carried on counting,
> > but if we set the period again, the counter gets set with the correct restart
> > value and then reenabled.
> 
> This was my first thought, but I was concerned about how it would play out
> with the armpmu_read function. Now that I see we don't read disabled counters,
> I can't see any reason not to simply disable the interrupt and stash the count
> value.
We shouldn't even need to stash the value. When the higher layer disables the
event we sample it to record the value so the stashing happens already.

> > I think this should work for mpcore and is also required for the cycle counter
> > on all v6 cores. I've given this a go using an artificially low period on a
> > cycle counter and it does appear to do the job.
> 
> If we do this for mpcore, is it worth doing the same thing for the other v6 cores
> too [and removing the ETM `hack']?
We do need to do this on UP v6 for the cycle counter as that keeps on running,
but imho the ETM hack is worth keeping as it really does stop the counter. It
doesn't hurt and we're guaranteed not to lose any events.

If I make the change to the unthrottling and the Kconfig and tab changes, and
add the mpcore events does anyone have an objection to making a final version
of this patch series and merging it?

Jamie

^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH 1/5] arm: provide a mechanism to reserve performance counters
  2009-12-15 11:15 ` [PATCH 1/5] arm: provide a mechanism to reserve performance counters Jamie Iles
  2009-12-15 11:15   ` [PATCH 2/5] arm/oprofile: reserve the PMU when starting Jamie Iles
  2009-12-15 14:13   ` [PATCH 1/5] arm: provide a mechanism to reserve performance counters Will Deacon
@ 2009-12-17 16:14   ` Will Deacon
  2009-12-17 16:27     ` Jamie Iles
  2 siblings, 1 reply; 55+ messages in thread
From: Will Deacon @ 2009-12-17 16:14 UTC (permalink / raw)
  To: linux-arm-kernel

Hi Jamie,

Just a small thing I noticed with the PMU reservation:

*Jamie Iles wrote:

> diff --git a/arch/arm/kernel/pmu.c b/arch/arm/kernel/pmu.c
> new file mode 100644
> index 0000000..3a178bb
> --- /dev/null
> +++ b/arch/arm/kernel/pmu.c
<snip>
> +static const int irqs[] = {
> +#ifdef CONFIG_ARCH_PC3XX
> +	IRQ_NPMUIRQ,
> +#elif defined(CONFIG_ARCH_OMAP2)
> +	3,
> +#elif defined(CONFIG_ARCH_BCMRING)
> +	IRQ_PMUIRQ,
> +#elif defined(CONFIG_MACH_REALVIEW_EB)
> +	IRQ_EB11MP_PMU_CPU0,
> +	IRQ_EB11MP_PMU_CPU1,
> +	IRQ_EB11MP_PMU_CPU2,
> +	IRQ_EB11MP_PMU_CPU3,
> +#elif defined(CONFIG_ARCH_OMAP3)
> +	INT_34XX_BENCH_MPU_EMUL,
> +#elif defined(CONFIG_ARCH_IOP32X)
> +	IRQ_IOP32X_CORE_PMU,
> +#elif defined(CONFIG_ARCH_IOP33X)
> +	IRQ_IOP33X_CORE_PMU,
> +#elif defined(CONFIG_ARCH_PXA)
> +	IRQ_PMU,
> +#endif
> +};
> +
> +static const struct pmu_irqs pmu_irqs = {
> +	.irqs	    = irqs,
> +	.num_irqs   = ARRAY_SIZE(irqs),
> +};
> +
> +static DECLARE_MUTEX(pmu_mutex);
> +
> +const struct pmu_irqs *
> +reserve_pmu(void)
> +{
> +	int ret = down_trylock(&pmu_mutex) ? -EBUSY : 0;
> +
> +	return ret ? ERR_PTR(ret) : &pmu_irqs;
> +}

I think it would be sensible to return an error (-ENODEV) if
pmu_irqs.num_irqs == 0. Not doing so can cause applications
to fail silently when they are running on unsupported boards.

Will

^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH 1/5] arm: provide a mechanism to reserve performance counters
  2009-12-17 16:14   ` Will Deacon
@ 2009-12-17 16:27     ` Jamie Iles
  0 siblings, 0 replies; 55+ messages in thread
From: Jamie Iles @ 2009-12-17 16:27 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Dec 17, 2009 at 04:14:22PM -0000, Will Deacon wrote:
> > +const struct pmu_irqs *
> > +reserve_pmu(void)
> > +{
> > +	int ret = down_trylock(&pmu_mutex) ? -EBUSY : 0;
> > +
> > +	return ret ? ERR_PTR(ret) : &pmu_irqs;
> > +}
> 
> I think it would be sensible to return an error (-ENODEV) if
> pmu_irqs.num_irqs == 0. Not doing so can cause applications
> to fail silently when they are running on unsupported boards.
I did think about that, but when the interrupts were in oprofile, it
didn't regard this as an error so I kept this the same.

In the perf events code, we check that num_irqs is >= 1. You _could_ use
the pmu as free running counters that you just periodically sample and
wouldn't need an interrupt so I thought it best to leave this error
checking to the user.

Cheers,

Jamie

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Perf Event support for ARMv7 (was: Re: [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6)
  2009-12-15 11:15         ` [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6 Jamie Iles
  2009-12-15 14:29           ` Will Deacon
@ 2009-12-18 17:05           ` Jean Pihet
  2009-12-19 10:29             ` Jamie Iles
  2009-12-21 11:04             ` Will Deacon
  1 sibling, 2 replies; 55+ messages in thread
From: Jean Pihet @ 2009-12-18 17:05 UTC (permalink / raw)
  To: linux-arm-kernel

Hello,

Here is a patch that adds the support for ARMv7 processors, using the
PMNC HW unit.

The code is for review, it has been compiled and boot tested only, the
complete testing is in progress. Please let me know if the patch is
wrapped or garbled I will send it attached (20KB in size).

Feedback is welcome.

I had a question about the events mapping to user space. Although most
of the events are mapped in the kernel code, some of the exotic events
are not mapped (e.g. NEON or PMU related events). How to use those
events from user space? Is it done using the raw mappings?

Regards,
Jean

---
>From d48f736b380b0a05ab74743dcce4e662d71371d9 Mon Sep 17 00:00:00 2001
From: Jean Pihet <jpihet@mvista.com>
Date: Fri, 18 Dec 2009 17:46:21 +0100
Subject: [PATCH] arm/perfevents: add support for ARMv7

Adds the Performance Events support for ARMv7 processor, using
the PMNC unit in HW.

Signed-off-by: Jean Pihet <jpihet@mvista.com>
---
 arch/arm/Kconfig             |    2 +-
 arch/arm/kernel/perf_event.c |  708
+++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 708 insertions(+), 2 deletions(-)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 2ac6e8d..9dfc0ee 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1171,7 +1171,7 @@ config HIGHPTE
 
 config HW_PERF_EVENTS
 	bool "Enable hardware performance counter support for perf events"
-	depends on PERF_EVENTS && CPU_HAS_PMU && CPU_V6
+	depends on PERF_EVENTS && CPU_HAS_PMU && (CPU_V6 || CPU_V7)
 	default y
 	help
 	  Enable hardware performance counter support for perf events. If
diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
index abb5267..79e92ce 100644
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -4,6 +4,7 @@
  * ARM performance counter support.
  *
  * Copyright (C) 2009 picoChip Designs, Ltd., Jamie Iles
+ * ARMv7 support: Jean Pihet <jpihet@mvista.com>
  *
  * This code is based on the sparc64 perf event code, which is in turn
based
  * on the x86 code. Callchain code is based on the ARM OProfile
backtrace
@@ -35,8 +36,11 @@ DEFINE_SPINLOCK(pmu_lock);
  * ARMv6 supports a maximum of 3 events, starting from index 1. If we
add
  * another platform that supports more, we need to increase this to be
the
  * largest of all platforms.
+ *
+ * ARMv7 supports up to 5 events:
+ *  cycle counter CCNT + 4 events counters CNT0..3
  */
-#define ARMPMU_MAX_HWEVENTS		4
+#define ARMPMU_MAX_HWEVENTS		5
 
 /* The events for a given CPU. */
 struct cpu_hw_events {
@@ -965,6 +969,701 @@ static struct arm_pmu armv6pmu = {
 	.max_period		= (1LLU << 32) - 1,
 };
 
+/*
+ * ARMv7 Performance counter handling code.
+ *
+ * Copied from ARMv6 code, with the low level code inspired
+ *  by the ARMv7 Oprofile code.
+ *
+ * ARMv7 has 4 configurable performance counters and a single cycle
counter.
+ * All counters can be enabled/disabled and IRQ masked separately. The
cycle
+ *  counter and all 4 performance counters together can be reset
separately.
+ */
+
+enum armv7_perf_types {
+	ARMV7_PERFCTR_PMNC_SW_INCR		= 0x00,
+	ARMV7_PERFCTR_IFETCH_MISS		= 0x01,
+	ARMV7_PERFCTR_ITLB_MISS			= 0x02,
+	ARMV7_PERFCTR_DCACHE_REFILL		= 0x03,
+	ARMV7_PERFCTR_DCACHE_ACCESS		= 0x04,
+	ARMV7_PERFCTR_DTLB_REFILL		= 0x05,
+	ARMV7_PERFCTR_DREAD			= 0x06,
+	ARMV7_PERFCTR_DWRITE			= 0x07,
+	ARMV7_PERFCTR_INSTR_EXECUTED		= 0x08,
+	ARMV7_PERFCTR_EXC_TAKEN			= 0x09,
+	ARMV7_PERFCTR_EXC_EXECUTED		= 0x0A,
+	ARMV7_PERFCTR_CID_WRITE			= 0x0B,
+	ARMV7_PERFCTR_PC_WRITE			= 0x0C,
+	ARMV7_PERFCTR_PC_IMM_BRANCH		= 0x0D,
+	ARMV7_PERFCTR_PC_PROC_RETURN		= 0x0E,
+	ARMV7_PERFCTR_UNALIGNED_ACCESS		= 0x0F,
+	ARMV7_PERFCTR_PC_BRANCH_MIS_PRED	= 0x10,
+
+	ARMV7_PERFCTR_PC_BRANCH_MIS_USED	= 0x12,
+
+	ARMV7_PERFCTR_WRITE_BUFFER_FULL		= 0x40,
+	ARMV7_PERFCTR_L2_STORE_MERGED		= 0x41,
+	ARMV7_PERFCTR_L2_STORE_BUFF		= 0x42,
+	ARMV7_PERFCTR_L2_ACCESS			= 0x43,
+	ARMV7_PERFCTR_L2_CACH_MISS		= 0x44,
+	ARMV7_PERFCTR_AXI_READ_CYCLES		= 0x45,
+	ARMV7_PERFCTR_AXI_WRITE_CYCLES		= 0x46,
+	ARMV7_PERFCTR_MEMORY_REPLAY		= 0x47,
+	ARMV7_PERFCTR_UNALIGNED_ACCESS_REPLAY	= 0x48,
+	ARMV7_PERFCTR_L1_DATA_MISS		= 0x49,
+	ARMV7_PERFCTR_L1_INST_MISS		= 0x4A,
+	ARMV7_PERFCTR_L1_DATA_COLORING		= 0x4B,
+	ARMV7_PERFCTR_L1_NEON_DATA		= 0x4C,
+	ARMV7_PERFCTR_L1_NEON_CACH_DATA		= 0x4D,
+	ARMV7_PERFCTR_L2_NEON			= 0x4E,
+	ARMV7_PERFCTR_L2_NEON_HIT		= 0x4F,
+	ARMV7_PERFCTR_L1_INST			= 0x50,
+	ARMV7_PERFCTR_PC_RETURN_MIS_PRED	= 0x51,
+	ARMV7_PERFCTR_PC_BRANCH_FAILED		= 0x52,
+	ARMV7_PERFCTR_PC_BRANCH_TAKEN		= 0x53,
+	ARMV7_PERFCTR_PC_BRANCH_EXECUTED	= 0x54,
+	ARMV7_PERFCTR_OP_EXECUTED		= 0x55,
+	ARMV7_PERFCTR_CYCLES_INST_STALL		= 0x56,
+	ARMV7_PERFCTR_CYCLES_INST		= 0x57,
+	ARMV7_PERFCTR_CYCLES_NEON_DATA_STALL	= 0x58,
+	ARMV7_PERFCTR_CYCLES_NEON_INST_STALL	= 0x59,
+	ARMV7_PERFCTR_NEON_CYCLES		= 0x5A,
+
+	ARMV7_PERFCTR_PMU0_EVENTS		= 0x70,
+	ARMV7_PERFCTR_PMU1_EVENTS		= 0x71,
+	ARMV7_PERFCTR_PMU_EVENTS		= 0x72,
+
+	ARMV7_PERFCTR_CPU_CYCLES		= 0xFF
+};
+
+enum armv7_counters {
+	ARMV7_CYCLE_COUNTER = 1,
+	ARMV7_COUNTER0,
+	ARMV7_COUNTER1,
+	ARMV7_COUNTER2,
+	ARMV7_COUNTER3,
+};
+
+/*
+ * The hardware events that we support. We do support cache operations
but
+ * we have harvard caches and no way to combine instruction and data
+ * accesses/misses in hardware.
+ */
+static const unsigned armv7_perf_map[PERF_COUNT_HW_MAX] = {
+	[PERF_COUNT_HW_CPU_CYCLES]	    = ARMV7_PERFCTR_CPU_CYCLES,
+	[PERF_COUNT_HW_INSTRUCTIONS]	    = ARMV7_PERFCTR_INSTR_EXECUTED,
+	[PERF_COUNT_HW_CACHE_REFERENCES]    = HW_OP_UNSUPPORTED,
+	[PERF_COUNT_HW_CACHE_MISSES]	    = HW_OP_UNSUPPORTED,
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = ARMV7_PERFCTR_PC_BRANCH_TAKEN,
+	[PERF_COUNT_HW_BRANCH_MISSES]	    = ARMV7_PERFCTR_PC_BRANCH_FAILED,
+	[PERF_COUNT_HW_BUS_CYCLES]	    = HW_OP_UNSUPPORTED,
+};
+
+static const unsigned armv7_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
+					  [PERF_COUNT_HW_CACHE_OP_MAX]
+					  [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+	[C(L1D)] = {
+		/*
+		 * The performance counters don't differentiate between read
+		 * and write accesses/misses so this isn't strictly correct,
+		 * but it's the best we can do. Writes and reads get
+		 * combined.
+		 */
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_DCACHE_ACCESS,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DATA_MISS,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_DCACHE_ACCESS,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DATA_MISS,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+	[C(L1I)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_INST,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_INST_MISS,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_INST,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_INST_MISS,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+	[C(LL)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L2_ACCESS,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L2_CACH_MISS,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L2_ACCESS,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L2_CACH_MISS,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+	[C(DTLB)] = {
+		/*
+		 * The ARM performance counters can count micro DTLB misses,
+		 * micro ITLB misses and main TLB misses. There isn't an event
+		 * for TLB misses, so use the micro misses here and if users
+		 * want the main TLB misses they can use a raw counter.
+		 */
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+	[C(ITLB)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_ITLB_MISS,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_ITLB_MISS,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+	[C(BPU)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_PC_BRANCH_TAKEN,
+			[C(RESULT_MISS)]
+					= ARMV7_PERFCTR_PC_BRANCH_FAILED,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_PC_BRANCH_TAKEN,
+			[C(RESULT_MISS)]
+					= ARMV7_PERFCTR_PC_BRANCH_FAILED,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+};
+
+/*
+ * ARMv7 low level PMNC access
+ */
+
+/*
+ * Per-CPU PMNC: config reg
+ */
+#define ARMV7_PMNC_E		(1 << 0) /* Enable all counters */
+#define ARMV7_PMNC_P		(1 << 1) /* Reset all counters */
+#define ARMV7_PMNC_C		(1 << 2) /* Cycle counter reset */
+#define ARMV7_PMNC_D		(1 << 3) /* CCNT counts every 64th cpu cycle */
+#define ARMV7_PMNC_X		(1 << 4) /* Export to ETM */
+#define ARMV7_PMNC_DP		(1 << 5) /* Disable CCNT if non-invasive debug*/
+#define	ARMV7_PMNC_MASK		0x3f	 /* Mask for writable bits */
+
+/*
+ * Available counters
+ */
+#define ARMV7_CCNT 		0
+#define ARMV7_CNT0 		1
+#define ARMV7_CNT1 		2
+#define ARMV7_CNT2 		3
+#define ARMV7_CNT3 		4
+#define ARMV7_CNTMAX 		5
+#define ARMV7_COUNTER_TO_CCNT	(ARMV7_CYCLE_COUNTER - ARMV7_CCNT)
+
+#define ARMV7_CPU_COUNTER(cpu, counter)	((cpu) * CNTMAX + (counter))
+
+/*
+ * CNTENS: counters enable reg
+ */
+#define ARMV7_CNTENS_P0		(1 << 0)
+#define ARMV7_CNTENS_P1		(1 << 1)
+#define ARMV7_CNTENS_P2		(1 << 2)
+#define ARMV7_CNTENS_P3		(1 << 3)
+#define ARMV7_CNTENS_C		(1 << 31)
+#define	ARMV7_CNTENS_MASK	0x8000000f	/* Mask for writable bits */
+
+/*
+ * CNTENC: counters disable reg
+ */
+#define ARMV7_CNTENC_P0		(1 << 0)
+#define ARMV7_CNTENC_P1		(1 << 1)
+#define ARMV7_CNTENC_P2		(1 << 2)
+#define ARMV7_CNTENC_P3		(1 << 3)
+#define ARMV7_CNTENC_C		(1 << 31)
+#define	ARMV7_CNTENC_MASK	0x8000000f	/* Mask for writable bits */
+
+/*
+ * INTENS: counters overflow interrupt enable reg
+ */
+#define ARMV7_INTENS_P0		(1 << 0)
+#define ARMV7_INTENS_P1		(1 << 1)
+#define ARMV7_INTENS_P2		(1 << 2)
+#define ARMV7_INTENS_P3		(1 << 3)
+#define ARMV7_INTENS_C		(1 << 31)
+#define	ARMV7_INTENS_MASK	0x8000000f	/* Mask for writable bits */
+
+/*
+ * INTENC: counters overflow interrupt disable reg
+ */
+#define ARMV7_INTENC_P0		(1 << 0)
+#define ARMV7_INTENC_P1		(1 << 1)
+#define ARMV7_INTENC_P2		(1 << 2)
+#define ARMV7_INTENC_P3		(1 << 3)
+#define ARMV7_INTENC_C		(1 << 31)
+#define	ARMV7_INTENC_MASK	0x8000000f	/* Mask for writable bits */
+
+/*
+ * EVTSEL: Event selection reg
+ */
+#define	ARMV7_EVTSEL_MASK	0x7f		/* Mask for writable bits */
+
+/*
+ * SELECT: Counter selection reg
+ */
+#define	ARMV7_SELECT_MASK	0x1f		/* Mask for writable bits */
+
+/*
+ * FLAG: counters overflow flag status reg
+ */
+#define ARMV7_FLAG_P0		(1 << 0)
+#define ARMV7_FLAG_P1		(1 << 1)
+#define ARMV7_FLAG_P2		(1 << 2)
+#define ARMV7_FLAG_P3		(1 << 3)
+#define ARMV7_FLAG_C		(1 << 31)
+#define	ARMV7_FLAG_MASK		0x8000000f	/* Mask for writable bits */
+#define	ARMV7_OVERFLOWED_MASK	ARMV7_FLAG_MASK
+
+static inline unsigned long armv7_pmnc_read(void)
+{
+	u32 val;
+	asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r"(val));
+	return val;
+}
+
+static inline void armv7_pmnc_write(unsigned long val)
+{
+	val &= ARMV7_PMNC_MASK;
+	asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r"(val));
+}
+
+static inline int armv7_pmnc_has_overflowed(unsigned long pmnc)
+{
+	return pmnc & ARMV7_OVERFLOWED_MASK;
+}
+
+static inline int armv7_pmnc_counter_has_overflowed(unsigned long pmnc,
+					enum armv7_counters counter)
+{
+	int ret;
+
+	if (ARMV7_CYCLE_COUNTER == counter)
+		ret = pmnc & ARMV7_FLAG_C;
+	else if (ARMV7_COUNTER0 == counter)
+		ret = pmnc & ARMV7_FLAG_P0;
+	else if (ARMV7_COUNTER1 == counter)
+		ret = pmnc & ARMV7_FLAG_P1;
+	else if (ARMV7_COUNTER2 == counter)
+		ret = pmnc & ARMV7_FLAG_P2;
+	else if (ARMV7_COUNTER3 == counter)
+		ret = pmnc & ARMV7_FLAG_P3;
+	else
+		BUG();
+
+	return ret;
+}
+
+static inline int armv7_pmnc_select_counter(unsigned int cnt)
+{
+	u32 val;
+
+	cnt -= ARMV7_COUNTER_TO_CCNT;
+
+	if ((cnt == ARMV7_CCNT) || (cnt >= ARMV7_CNTMAX)) {
+		printk(KERN_ERR "oprofile: CPU%u selecting wrong PMNC counter"
+			" %d\n", smp_processor_id(), cnt);
+		return -1;
+	}
+
+	val = (cnt - ARMV7_CNT0) & ARMV7_SELECT_MASK;
+	asm volatile("mcr p15, 0, %0, c9, c12, 5" : : "r" (val));
+
+	return cnt;
+}
+
+static inline u32 armv7pmu_read_counter(int counter)
+{
+	unsigned long value = 0;
+
+	switch (counter) {
+	case ARMV7_CYCLE_COUNTER:
+		asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r" (value));
+		break;
+	case ARMV7_COUNTER0:
+	case ARMV7_COUNTER1:
+	case ARMV7_COUNTER2:
+	case ARMV7_COUNTER3:
+		if (armv7_pmnc_select_counter(counter) == counter)
+			asm volatile("mrc p15, 0, %0, c9, c13, 2"
+				     : "=r" (value));
+		break;
+	default:
+		BUG();
+	}
+
+	return value;
+}
+
+static inline void armv7pmu_write_counter(int counter, u32 value)
+{
+	switch (counter) {
+	case ARMV7_CYCLE_COUNTER:
+		asm volatile("mcr p15, 0, %0, c9, c13, 0" : : "r" (value));
+		break;
+	case ARMV7_COUNTER0:
+	case ARMV7_COUNTER1:
+	case ARMV7_COUNTER2:
+	case ARMV7_COUNTER3:
+		if (armv7_pmnc_select_counter(counter) == counter)
+			asm volatile("mcr p15, 0, %0, c9, c13, 2"
+				     : : "r" (value));
+		break;
+	default:
+		BUG();
+	}
+}
+
+static inline void armv7_pmnc_write_evtsel(unsigned int cnt, u32 val)
+{
+	if (armv7_pmnc_select_counter(cnt) == cnt) {
+		val &= ARMV7_EVTSEL_MASK;
+		asm volatile("mcr p15, 0, %0, c9, c13, 1" : : "r" (val));
+	}
+}
+
+static inline u32 armv7_pmnc_enable_counter(unsigned int cnt)
+{
+	u32 val;
+
+	cnt -= ARMV7_COUNTER_TO_CCNT;
+
+	if (cnt >= ARMV7_CNTMAX) {
+		printk(KERN_ERR "oprofile: CPU%u enabling wrong PMNC counter"
+			" %d\n", smp_processor_id(), cnt);
+		return -1;
+	}
+
+	if (cnt == ARMV7_CCNT)
+		val = ARMV7_CNTENS_C;
+	else
+		val = (1 << (cnt - ARMV7_CNT0));
+
+	val &= ARMV7_CNTENS_MASK;
+	asm volatile("mcr p15, 0, %0, c9, c12, 1" : : "r" (val));
+
+	return cnt;
+}
+
+static inline u32 armv7_pmnc_disable_counter(unsigned int cnt)
+{
+	u32 val;
+
+	cnt -= ARMV7_COUNTER_TO_CCNT;
+
+	if (cnt >= ARMV7_CNTMAX) {
+		printk(KERN_ERR "oprofile: CPU%u disabling wrong PMNC counter"
+			" %d\n", smp_processor_id(), cnt);
+		return -1;
+	}
+
+	if (cnt == ARMV7_CCNT)
+		val = ARMV7_CNTENC_C;
+	else
+		val = (1 << (cnt - ARMV7_CNT0));
+
+	val &= ARMV7_CNTENC_MASK;
+	asm volatile("mcr p15, 0, %0, c9, c12, 2" : : "r" (val));
+
+	return cnt;
+}
+
+static inline u32 armv7_pmnc_enable_intens(unsigned int cnt)
+{
+	u32 val;
+
+	cnt -= ARMV7_COUNTER_TO_CCNT;
+
+	if (cnt >= ARMV7_CNTMAX) {
+		printk(KERN_ERR "oprofile: CPU%u enabling wrong PMNC counter"
+			" interrupt enable %d\n", smp_processor_id(), cnt);
+		return -1;
+	}
+
+	if (cnt == ARMV7_CCNT)
+		val = ARMV7_INTENS_C;
+	else
+		val = (1 << (cnt - ARMV7_CNT0));
+
+	val &= ARMV7_INTENS_MASK;
+	asm volatile("mcr p15, 0, %0, c9, c14, 1" : : "r" (val));
+
+	return cnt;
+}
+
+static inline u32 armv7_pmnc_disable_intens(unsigned int cnt)
+{
+	u32 val;
+
+	cnt -= ARMV7_COUNTER_TO_CCNT;
+
+	if (cnt >= ARMV7_CNTMAX) {
+		printk(KERN_ERR "oprofile: CPU%u disabling wrong PMNC counter"
+			" interrupt enable %d\n", smp_processor_id(), cnt);
+		return -1;
+	}
+
+	if (cnt == ARMV7_CCNT)
+		val = ARMV7_INTENC_C;
+	else
+		val = (1 << (cnt - ARMV7_CNT0));
+
+	val &= ARMV7_INTENC_MASK;
+	asm volatile("mcr p15, 0, %0, c9, c14, 2" : : "r" (val));
+
+	return cnt;
+}
+
+static inline u32 armv7_pmnc_getreset_flags(void)
+{
+	u32 val;
+
+	/* Read */
+	asm volatile("mrc p15, 0, %0, c9, c12, 3" : "=r" (val));
+
+	/* Write to clear flags */
+	val &= ARMV7_FLAG_MASK;
+	asm volatile("mcr p15, 0, %0, c9, c12, 3" : : "r" (val));
+
+	return val;
+}
+
+void armv7pmu_enable_event(struct hw_perf_event *hwc, int idx)
+{
+	unsigned long flags;
+
+	/*
+	 * Enable counter and interrupt, and set the counter to count
+	 * the event that we're interested in.
+	 */
+	spin_lock_irqsave(&pmu_lock, flags);
+
+	/*
+	 * Disable counter
+	 */
+	armv7_pmnc_disable_counter(idx);
+
+	/*
+	 * Set event (if destined for PMNx counters)
+	 * We don't need to set the event if it's a cycle count
+	 */
+	if (idx != ARMV7_CYCLE_COUNTER)
+		armv7_pmnc_write_evtsel(idx, hwc->config_base);
+
+	/*
+	 * Enable interrupt for this counter
+	 */
+	armv7_pmnc_enable_intens(idx);
+
+	/*
+	 * Enable counter
+	 */
+	armv7_pmnc_enable_counter(idx);
+
+	spin_unlock_irqrestore(&pmu_lock, flags);
+}
+
+static void armv7pmu_disable_event(struct hw_perf_event *hwc, int idx)
+{
+	unsigned long flags;
+
+	/*
+	 * Disable counter and interrupt
+	 */
+	spin_lock_irqsave(&pmu_lock, flags);
+
+	/*
+	 * Disable counter
+	 */
+	armv7_pmnc_disable_counter(idx);
+
+	/*
+	 * Disable interrupt for this counter
+	 */
+	armv7_pmnc_disable_intens(idx);
+
+	spin_unlock_irqrestore(&pmu_lock, flags);
+}
+
+static irqreturn_t armv7pmu_handle_irq(int irq_num, void *dev)
+{
+	unsigned long pmnc;
+	struct perf_sample_data data;
+	struct cpu_hw_events *cpuc;
+	struct pt_regs *regs;
+	int idx;
+
+	/*
+	 * Get and reset the IRQ flags
+	 */
+	pmnc = armv7_pmnc_getreset_flags();
+
+	/*
+	 * Did an overflow occur?
+	 */
+	if (!armv7_pmnc_has_overflowed(pmnc))
+		return IRQ_NONE;
+
+	/*
+	 * Handle the counter(s) overflow(s)
+	 */
+	regs = get_irq_regs();
+
+	data.addr = 0;
+
+	cpuc = &__get_cpu_var(cpu_hw_events);
+	for (idx = 0; idx <= armpmu->num_events; ++idx) {
+		struct perf_event *event = cpuc->events[idx];
+		struct hw_perf_event *hwc;
+
+		if (!test_bit(idx, cpuc->active_mask))
+			continue;
+
+		/*
+		 * We have a single interrupt for all counters. Check that
+		 * each counter has overflowed before we process it.
+		 */
+		if (!armv7_pmnc_counter_has_overflowed(pmnc, idx))
+			continue;
+
+		hwc = &event->hw;
+		armpmu_event_update(event, hwc, idx);
+		data.period = event->hw.last_period;
+		if (!armpmu_event_set_period(event, hwc, idx))
+			continue;
+
+		if (perf_event_overflow(event, 0, &data, regs))
+			armpmu->disable(hwc, idx);
+	}
+
+	/*
+	 * Handle the pending perf events.
+	 *
+	 * Note: this call *must* be run with interrupts enabled. For
+	 * platforms that can have the PMU interrupts raised as a PMI, this
+	 * will not work.
+	 */
+	perf_event_do_pending();
+
+	return IRQ_HANDLED;
+}
+
+static void armv7pmu_start(void)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&pmu_lock, flags);
+	/* Enable all counters */
+	armv7_pmnc_write(armv7_pmnc_read() | ARMV7_PMNC_E);
+	spin_unlock_irqrestore(&pmu_lock, flags);
+}
+
+static void armv7pmu_stop(void)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&pmu_lock, flags);
+	/* Disable all counters */
+	armv7_pmnc_write(armv7_pmnc_read() & ~ARMV7_PMNC_E);
+	spin_unlock_irqrestore(&pmu_lock, flags);
+}
+
+static inline int armv7pmu_event_map(int config)
+{
+	int mapping = armv7_perf_map[config];
+	if (HW_OP_UNSUPPORTED == mapping)
+		mapping = -EOPNOTSUPP;
+	return mapping;
+}
+
+static u64 armv7pmu_raw_event(u64 config)
+{
+	return config & 0xff;
+}
+
+static int armv7pmu_get_event_idx(struct cpu_hw_events *cpuc,
+				  struct hw_perf_event *event)
+{
+	/* Always place a cycle counter into the cycle counter. */
+	if (ARMV7_PERFCTR_CPU_CYCLES == event->config_base) {
+		if (test_and_set_bit(ARMV7_CYCLE_COUNTER, cpuc->used_mask))
+			return -EAGAIN;
+
+		return ARMV7_CYCLE_COUNTER;
+	} else {
+		/*
+		 * For anything other than a cycle counter, try and use
+		 * counters 0..3
+		 */
+		if (!test_and_set_bit(ARMV7_COUNTER0, cpuc->used_mask))
+			return ARMV7_COUNTER0;
+
+		if (!test_and_set_bit(ARMV7_COUNTER1, cpuc->used_mask))
+			return ARMV7_COUNTER1;
+
+		if (!test_and_set_bit(ARMV7_COUNTER2, cpuc->used_mask))
+			return ARMV7_COUNTER2;
+
+		if (!test_and_set_bit(ARMV7_COUNTER3, cpuc->used_mask))
+			return ARMV7_COUNTER3;
+
+		/* The counters are all in use. */
+		return -EAGAIN;
+	}
+}
+
+static struct arm_pmu armv7pmu = {
+	.name			= "v7",
+	.handle_irq		= armv7pmu_handle_irq,
+	.enable			= armv7pmu_enable_event,
+	.disable		= armv7pmu_disable_event,
+	.event_map		= armv7pmu_event_map,
+	.raw_event		= armv7pmu_raw_event,
+	.read_counter		= armv7pmu_read_counter,
+	.write_counter		= armv7pmu_write_counter,
+	.get_event_idx		= armv7pmu_get_event_idx,
+	.start			= armv7pmu_start,
+	.stop			= armv7pmu_stop,
+	.num_events		= 5,
+	.max_period		= (1LLU << 32) - 1,
+};
+
 static int __init
 init_hw_perf_events(void)
 {
@@ -977,6 +1676,13 @@ init_hw_perf_events(void)
                 memcpy(armpmu_perf_cache_map, armv6_perf_cache_map,
                        sizeof(armv6_perf_cache_map));
                 perf_max_events	= armv6pmu.num_events;
+	} else if (cpu_architecture() == CPU_ARCH_ARMv7) {
+		armpmu = &armv7pmu;
+		memcpy(armpmu_perf_cache_map, armv7_perf_cache_map,
+			sizeof(armv7_perf_cache_map));
+		perf_max_events	= armv7pmu.num_events;
+		/* Initialize & Reset PMNC: C bit and P bit */
+		armv7_pmnc_write(ARMV7_PMNC_P | ARMV7_PMNC_C);
         } else {
                 pr_info("no hardware support available\n");
                 perf_max_events = -1;
-- 
1.6.0


---
On Tue, 2009-12-15@11:15 +0000, Jamie Iles wrote:
> This patch implements support for ARMv6 performance counters in the
> Linux performance events subsystem. ARMv6 architectures that have the
> performance counters should enable HW_PERF_EVENTS and define the
> interrupts for the counters in arch/arm/kernel/perf_event.c
> 
> This implementation also provides an ARM PMU abstraction layer to allow
> ARMv7 and others to be supported in the future by adding new a
> 'struct arm_pmu'.
> 
> Signed-off-by: Jamie Iles <jamie.iles@picochip.com>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Ingo Molnar <mingo@elte.hu>
> ---
>  arch/arm/Kconfig             |    8 +
>  arch/arm/kernel/Makefile     |    1 +
>  arch/arm/kernel/perf_event.c | 1125 ++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 1134 insertions(+), 0 deletions(-)
>  create mode 100644 arch/arm/kernel/perf_event.c
> 
...

^ permalink raw reply related	[flat|nested] 55+ messages in thread

* Perf Event support for ARMv7 (was: Re: [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6)
  2009-12-18 17:05           ` Perf Event support for ARMv7 (was: Re: [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6) Jean Pihet
@ 2009-12-19 10:29             ` Jamie Iles
  2009-12-19 10:53               ` Ingo Molnar
  2009-12-21 11:29               ` Jean Pihet
  2009-12-21 11:04             ` Will Deacon
  1 sibling, 2 replies; 55+ messages in thread
From: Jamie Iles @ 2009-12-19 10:29 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Dec 18, 2009 at 06:05:29PM +0100, Jean Pihet wrote:
> Here is a patch that adds the support for ARMv7 processors, using the
> PMNC HW unit.
> 
> The code is for review, it has been compiled and boot tested only, the
> complete testing is in progress. Please let me know if the patch is
> wrapped or garbled I will send it attached (20KB in size).
Excellent! It looks good to me, a few minor comments though. I don't know if
it's my mail client but some of the longer lines appeared to wrap onto 2 patch
lines but it's not difficult to apply.

[snip]
> I had a question about the events mapping to user space. Although most
> of the events are mapped in the kernel code, some of the exotic events
> are not mapped (e.g. NEON or PMU related events). How to use those
> events from user space? Is it done using the raw mappings?
Yes, the raw events should do the trick. 'perf stat -a -e rff -- sleep 1' will
do cycle counting on v6 using the raw event number.
> +enum armv7_perf_types {
> +	ARMV7_PERFCTR_PMNC_SW_INCR		= 0x00,
> +	ARMV7_PERFCTR_IFETCH_MISS		= 0x01,
> +	ARMV7_PERFCTR_ITLB_MISS			= 0x02,
> +	ARMV7_PERFCTR_DCACHE_REFILL		= 0x03,
> +	ARMV7_PERFCTR_DCACHE_ACCESS		= 0x04,
> +	ARMV7_PERFCTR_DTLB_REFILL		= 0x05,
> +	ARMV7_PERFCTR_DREAD			= 0x06,
> +	ARMV7_PERFCTR_DWRITE			= 0x07,
> +	ARMV7_PERFCTR_INSTR_EXECUTED		= 0x08,
> +	ARMV7_PERFCTR_EXC_TAKEN			= 0x09,
> +	ARMV7_PERFCTR_EXC_EXECUTED		= 0x0A,
> +	ARMV7_PERFCTR_CID_WRITE			= 0x0B,
> +	ARMV7_PERFCTR_PC_WRITE			= 0x0C,
> +	ARMV7_PERFCTR_PC_IMM_BRANCH		= 0x0D,
> +	ARMV7_PERFCTR_PC_PROC_RETURN		= 0x0E,
> +	ARMV7_PERFCTR_UNALIGNED_ACCESS		= 0x0F,
> +	ARMV7_PERFCTR_PC_BRANCH_MIS_PRED	= 0x10,
> +
> +	ARMV7_PERFCTR_PC_BRANCH_MIS_USED	= 0x12,
> +
> +	ARMV7_PERFCTR_WRITE_BUFFER_FULL		= 0x40,
> +	ARMV7_PERFCTR_L2_STORE_MERGED		= 0x41,
> +	ARMV7_PERFCTR_L2_STORE_BUFF		= 0x42,
> +	ARMV7_PERFCTR_L2_ACCESS			= 0x43,
> +	ARMV7_PERFCTR_L2_CACH_MISS		= 0x44,
> +	ARMV7_PERFCTR_AXI_READ_CYCLES		= 0x45,
> +	ARMV7_PERFCTR_AXI_WRITE_CYCLES		= 0x46,
> +	ARMV7_PERFCTR_MEMORY_REPLAY		= 0x47,
> +	ARMV7_PERFCTR_UNALIGNED_ACCESS_REPLAY	= 0x48,
> +	ARMV7_PERFCTR_L1_DATA_MISS		= 0x49,
> +	ARMV7_PERFCTR_L1_INST_MISS		= 0x4A,
> +	ARMV7_PERFCTR_L1_DATA_COLORING		= 0x4B,
> +	ARMV7_PERFCTR_L1_NEON_DATA		= 0x4C,
> +	ARMV7_PERFCTR_L1_NEON_CACH_DATA		= 0x4D,
> +	ARMV7_PERFCTR_L2_NEON			= 0x4E,
> +	ARMV7_PERFCTR_L2_NEON_HIT		= 0x4F,
> +	ARMV7_PERFCTR_L1_INST			= 0x50,
> +	ARMV7_PERFCTR_PC_RETURN_MIS_PRED	= 0x51,
> +	ARMV7_PERFCTR_PC_BRANCH_FAILED		= 0x52,
> +	ARMV7_PERFCTR_PC_BRANCH_TAKEN		= 0x53,
> +	ARMV7_PERFCTR_PC_BRANCH_EXECUTED	= 0x54,
> +	ARMV7_PERFCTR_OP_EXECUTED		= 0x55,
> +	ARMV7_PERFCTR_CYCLES_INST_STALL		= 0x56,
> +	ARMV7_PERFCTR_CYCLES_INST		= 0x57,
> +	ARMV7_PERFCTR_CYCLES_NEON_DATA_STALL	= 0x58,
> +	ARMV7_PERFCTR_CYCLES_NEON_INST_STALL	= 0x59,
> +	ARMV7_PERFCTR_NEON_CYCLES		= 0x5A,
> +
> +	ARMV7_PERFCTR_PMU0_EVENTS		= 0x70,
> +	ARMV7_PERFCTR_PMU1_EVENTS		= 0x71,
> +	ARMV7_PERFCTR_PMU_EVENTS		= 0x72,
> +
> +	ARMV7_PERFCTR_CPU_CYCLES		= 0xFF
> +};
> +
> +enum armv7_counters {
> +	ARMV7_CYCLE_COUNTER = 1,
> +	ARMV7_COUNTER0,
> +	ARMV7_COUNTER1,
> +	ARMV7_COUNTER2,
> +	ARMV7_COUNTER3,
> +};
> +
> +/*
> + * The hardware events that we support. We do support cache operations
> but
> + * we have harvard caches and no way to combine instruction and data
> + * accesses/misses in hardware.
> + */
> +static const unsigned armv7_perf_map[PERF_COUNT_HW_MAX] = {
> +	[PERF_COUNT_HW_CPU_CYCLES]	    = ARMV7_PERFCTR_CPU_CYCLES,
> +	[PERF_COUNT_HW_INSTRUCTIONS]	    = ARMV7_PERFCTR_INSTR_EXECUTED,
> +	[PERF_COUNT_HW_CACHE_REFERENCES]    = HW_OP_UNSUPPORTED,
> +	[PERF_COUNT_HW_CACHE_MISSES]	    = HW_OP_UNSUPPORTED,
> +	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = ARMV7_PERFCTR_PC_BRANCH_TAKEN,
> +	[PERF_COUNT_HW_BRANCH_MISSES]	    = ARMV7_PERFCTR_PC_BRANCH_FAILED,
> +	[PERF_COUNT_HW_BUS_CYCLES]	    = HW_OP_UNSUPPORTED,
> +};
> +
> +static const unsigned armv7_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
> +					  [PERF_COUNT_HW_CACHE_OP_MAX]
> +					  [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
> +	[C(L1D)] = {
> +		/*
> +		 * The performance counters don't differentiate between read
> +		 * and write accesses/misses so this isn't strictly correct,
> +		 * but it's the best we can do. Writes and reads get
> +		 * combined.
> +		 */
> +		[C(OP_READ)] = {
> +			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_DCACHE_ACCESS,
> +			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DATA_MISS,
> +		},
> +		[C(OP_WRITE)] = {
> +			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_DCACHE_ACCESS,
> +			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DATA_MISS,
> +		},
> +		[C(OP_PREFETCH)] = {
> +			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
> +			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
> +		},
> +	},
> +	[C(L1I)] = {
> +		[C(OP_READ)] = {
> +			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_INST,
> +			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_INST_MISS,
> +		},
> +		[C(OP_WRITE)] = {
> +			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_INST,
> +			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_INST_MISS,
> +		},
> +		[C(OP_PREFETCH)] = {
> +			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
> +			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
> +		},
> +	},
> +	[C(LL)] = {
> +		[C(OP_READ)] = {
> +			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L2_ACCESS,
> +			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L2_CACH_MISS,
> +		},
> +		[C(OP_WRITE)] = {
> +			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L2_ACCESS,
> +			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L2_CACH_MISS,
> +		},
> +		[C(OP_PREFETCH)] = {
> +			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
> +			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
> +		},
> +	},
> +	[C(DTLB)] = {
> +		/*
> +		 * The ARM performance counters can count micro DTLB misses,
> +		 * micro ITLB misses and main TLB misses. There isn't an event
> +		 * for TLB misses, so use the micro misses here and if users
> +		 * want the main TLB misses they can use a raw counter.
> +		 */
I think this comment needs to be changed for v7. From the events enum it
doesn't look like v7 has micro tlb events.

> +static inline int armv7_pmnc_select_counter(unsigned int cnt)
> +{
> +	u32 val;
> +
> +	cnt -= ARMV7_COUNTER_TO_CCNT;
> +
> +	if ((cnt == ARMV7_CCNT) || (cnt >= ARMV7_CNTMAX)) {
> +		printk(KERN_ERR "oprofile: CPU%u selecting wrong PMNC counter"
> +			" %d\n", smp_processor_id(), cnt);
Most of the printk's refer to oprofile. Could we use pr_err() etc so we get
the same prefix for all messages?

[snip]
> +		if (armv7_pmnc_select_counter(counter) == counter)
> +			asm volatile("mrc p15, 0, %0, c9, c13, 2"
> +				     : "=r" (value));
Does this sequence need some locking to make sure that we really do read from
the counter that we've selected? The same applies to the other places.

Jamie

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Perf Event support for ARMv7 (was: Re: [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6)
  2009-12-19 10:29             ` Jamie Iles
@ 2009-12-19 10:53               ` Ingo Molnar
  2009-12-21 11:32                 ` Jean Pihet
  2009-12-21 11:29               ` Jean Pihet
  1 sibling, 1 reply; 55+ messages in thread
From: Ingo Molnar @ 2009-12-19 10:53 UTC (permalink / raw)
  To: linux-arm-kernel


* Jamie Iles <jamie@jamieiles.com> wrote:

> [snip]
>
> > I had a question about the events mapping to user space. Although most of 
> > the events are mapped in the kernel code, some of the exotic events are 
> > not mapped (e.g. NEON or PMU related events). How to use those events from 
> > user space? Is it done using the raw mappings?
>
> Yes, the raw events should do the trick. 'perf stat -a -e rff -- sleep 1' 
> will do cycle counting on v6 using the raw event number.

Sidenote: if some of the more exotic events turn out to be useful and are 
worth generalizing, then we can add them to the generic enumeration and add 
tooling support (symbols, aliases, listing, etc.) for it.

The current set of generic events are intended to be a 'seed' set, to be 
extended on an as-needed basis - not cast into stone in any way.

The current generic (hardware) events are (from 'perf list' output):

  cpu-cycles OR cycles                       [Hardware event]
  instructions                               [Hardware event]
  cache-references                           [Hardware event]
  cache-misses                               [Hardware event]
  branch-instructions OR branches            [Hardware event]
  branch-misses                              [Hardware event]
  bus-cycles                                 [Hardware event]
  L1-dcache-loads                            [Hardware cache event]
  L1-dcache-load-misses                      [Hardware cache event]
  L1-dcache-stores                           [Hardware cache event]
  L1-dcache-store-misses                     [Hardware cache event]
  L1-dcache-prefetches                       [Hardware cache event]
  L1-dcache-prefetch-misses                  [Hardware cache event]
  L1-icache-loads                            [Hardware cache event]
  L1-icache-load-misses                      [Hardware cache event]
  L1-icache-prefetches                       [Hardware cache event]
  L1-icache-prefetch-misses                  [Hardware cache event]
  LLC-loads                                  [Hardware cache event]
  LLC-load-misses                            [Hardware cache event]
  LLC-stores                                 [Hardware cache event]
  LLC-store-misses                           [Hardware cache event]
  LLC-prefetches                             [Hardware cache event]
  LLC-prefetch-misses                        [Hardware cache event]
  dTLB-loads                                 [Hardware cache event]
  dTLB-load-misses                           [Hardware cache event]
  dTLB-stores                                [Hardware cache event]
  dTLB-store-misses                          [Hardware cache event]
  dTLB-prefetches                            [Hardware cache event]
  dTLB-prefetch-misses                       [Hardware cache event]
  iTLB-loads                                 [Hardware cache event]
  iTLB-load-misses                           [Hardware cache event]
  branch-loads                               [Hardware cache event]
  branch-load-misses                         [Hardware cache event]
  rNNN                                       [raw hardware event descriptor]
  mem:<addr>[:access]                        [hardware breakpoint]

But i think we might want to capture FPU-alike instructions as well on CPUs 
that can count/sample them - etc.

	Ingo

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Perf Event support for ARMv7 (was: Re: [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6)
  2009-12-18 17:05           ` Perf Event support for ARMv7 (was: Re: [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6) Jean Pihet
  2009-12-19 10:29             ` Jamie Iles
@ 2009-12-21 11:04             ` Will Deacon
  2009-12-21 11:43               ` Jean Pihet
  1 sibling, 1 reply; 55+ messages in thread
From: Will Deacon @ 2009-12-21 11:04 UTC (permalink / raw)
  To: linux-arm-kernel

Hi Jean,

I've provided some comments inline. Hopefully they're useful.

* Jean Pihet wrote:

> Hello,
> 
> Here is a patch that adds the support for ARMv7 processors, using the
> PMNC HW unit.
> 
> The code is for review, it has been compiled and boot tested only, the
> complete testing is in progress. Please let me know if the patch is
> wrapped or garbled I will send it attached (20KB in size).
> 
> Feedback is welcome.
> 

<snip>

> diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
> index abb5267..79e92ce 100644
> --- a/arch/arm/kernel/perf_event.c
> +++ b/arch/arm/kernel/perf_event.c
> @@ -4,6 +4,7 @@
>   * ARM performance counter support.
>   *
>   * Copyright (C) 2009 picoChip Designs, Ltd., Jamie Iles
> + * ARMv7 support: Jean Pihet <jpihet@mvista.com>
>   *
>   * This code is based on the sparc64 perf event code, which is in turn
> based
>   * on the x86 code. Callchain code is based on the ARM OProfile
> backtrace
> @@ -35,8 +36,11 @@ DEFINE_SPINLOCK(pmu_lock);
>   * ARMv6 supports a maximum of 3 events, starting from index 1. If we
> add
>   * another platform that supports more, we need to increase this to be
> the
>   * largest of all platforms.
> + *
> + * ARMv7 supports up to 5 events:
> + *  cycle counter CCNT + 4 events counters CNT0..3
>   */
> -#define ARMPMU_MAX_HWEVENTS		4
> +#define ARMPMU_MAX_HWEVENTS		5

The maximum number of event counters on ARMv7 is currently 6 [Cortex-A9],
plus a cycle counter. Additionally, the number of event counters actually
available is implementation defined (the cycle counter is mandatory). You can
find out the number of event counters using the PMCR ((PMCR >> 11) & 0x1f).

> 
>  /* The events for a given CPU. */
>  struct cpu_hw_events {
> @@ -965,6 +969,701 @@ static struct arm_pmu armv6pmu = {
>  	.max_period		= (1LLU << 32) - 1,
>  };
> 
> +/*
> + * ARMv7 Performance counter handling code.
> + *
> + * Copied from ARMv6 code, with the low level code inspired
> + *  by the ARMv7 Oprofile code.
> + *
> + * ARMv7 has 4 configurable performance counters and a single cycle
> counter.
> + * All counters can be enabled/disabled and IRQ masked separately. The
> cycle
> + *  counter and all 4 performance counters together can be reset
> separately.
> + */
> +
> +enum armv7_perf_types {
> +	ARMV7_PERFCTR_PMNC_SW_INCR		= 0x00,
> +	ARMV7_PERFCTR_IFETCH_MISS		= 0x01,
> +	ARMV7_PERFCTR_ITLB_MISS			= 0x02,
> +	ARMV7_PERFCTR_DCACHE_REFILL		= 0x03,
> +	ARMV7_PERFCTR_DCACHE_ACCESS		= 0x04,
> +	ARMV7_PERFCTR_DTLB_REFILL		= 0x05,
> +	ARMV7_PERFCTR_DREAD			= 0x06,
> +	ARMV7_PERFCTR_DWRITE			= 0x07,
> +	ARMV7_PERFCTR_INSTR_EXECUTED		= 0x08,
> +	ARMV7_PERFCTR_EXC_TAKEN			= 0x09,
> +	ARMV7_PERFCTR_EXC_EXECUTED		= 0x0A,
> +	ARMV7_PERFCTR_CID_WRITE			= 0x0B,
> +	ARMV7_PERFCTR_PC_WRITE			= 0x0C,
> +	ARMV7_PERFCTR_PC_IMM_BRANCH		= 0x0D,
> +	ARMV7_PERFCTR_PC_PROC_RETURN		= 0x0E,
> +	ARMV7_PERFCTR_UNALIGNED_ACCESS		= 0x0F,
> +	ARMV7_PERFCTR_PC_BRANCH_MIS_PRED	= 0x10,
> +
> +	ARMV7_PERFCTR_PC_BRANCH_MIS_USED	= 0x12,

Ok - the events so far are defined by the v7 architecture.
Note that this doesn't necessarily mean they are all supported by
the core.

> +	ARMV7_PERFCTR_WRITE_BUFFER_FULL		= 0x40,
> +	ARMV7_PERFCTR_L2_STORE_MERGED		= 0x41,
> +	ARMV7_PERFCTR_L2_STORE_BUFF		= 0x42,
> +	ARMV7_PERFCTR_L2_ACCESS			= 0x43,
> +	ARMV7_PERFCTR_L2_CACH_MISS		= 0x44,
> +	ARMV7_PERFCTR_AXI_READ_CYCLES		= 0x45,
> +	ARMV7_PERFCTR_AXI_WRITE_CYCLES		= 0x46,
> +	ARMV7_PERFCTR_MEMORY_REPLAY		= 0x47,
> +	ARMV7_PERFCTR_UNALIGNED_ACCESS_REPLAY	= 0x48,
> +	ARMV7_PERFCTR_L1_DATA_MISS		= 0x49,
> +	ARMV7_PERFCTR_L1_INST_MISS		= 0x4A,
> +	ARMV7_PERFCTR_L1_DATA_COLORING		= 0x4B,
> +	ARMV7_PERFCTR_L1_NEON_DATA		= 0x4C,
> +	ARMV7_PERFCTR_L1_NEON_CACH_DATA		= 0x4D,
> +	ARMV7_PERFCTR_L2_NEON			= 0x4E,
> +	ARMV7_PERFCTR_L2_NEON_HIT		= 0x4F,
> +	ARMV7_PERFCTR_L1_INST			= 0x50,
> +	ARMV7_PERFCTR_PC_RETURN_MIS_PRED	= 0x51,
> +	ARMV7_PERFCTR_PC_BRANCH_FAILED		= 0x52,
> +	ARMV7_PERFCTR_PC_BRANCH_TAKEN		= 0x53,
> +	ARMV7_PERFCTR_PC_BRANCH_EXECUTED	= 0x54,
> +	ARMV7_PERFCTR_OP_EXECUTED		= 0x55,
> +	ARMV7_PERFCTR_CYCLES_INST_STALL		= 0x56,
> +	ARMV7_PERFCTR_CYCLES_INST		= 0x57,
> +	ARMV7_PERFCTR_CYCLES_NEON_DATA_STALL	= 0x58,
> +	ARMV7_PERFCTR_CYCLES_NEON_INST_STALL	= 0x59,
> +	ARMV7_PERFCTR_NEON_CYCLES		= 0x5A,
> +
> +	ARMV7_PERFCTR_PMU0_EVENTS		= 0x70,
> +	ARMV7_PERFCTR_PMU1_EVENTS		= 0x71,
> +	ARMV7_PERFCTR_PMU_EVENTS		= 0x72,
> +
> +	ARMV7_PERFCTR_CPU_CYCLES		= 0xFF
> +};

These events are specific to the Cortex-A8.
Unfortunately, these numbers clash with events specific
to the Cortex-A9 [and potentially future v7 cores].
For example, 0x40 on the A8 is WRITE_BUFFER_FULL but on the
A9 it is JAVA_BYTECODE_EXEC. This means that you'll need to
take a similar approach as was taken for ARM11MP vs ARM11*.

<snip>

> +/*
> + * Available counters
> + */
> +#define ARMV7_CCNT 		0
> +#define ARMV7_CNT0 		1
> +#define ARMV7_CNT1 		2
> +#define ARMV7_CNT2 		3
> +#define ARMV7_CNT3 		4
> +#define ARMV7_CNTMAX 		5
> +#define ARMV7_COUNTER_TO_CCNT	(ARMV7_CYCLE_COUNTER - ARMV7_CCNT)
> +
> +#define ARMV7_CPU_COUNTER(cpu, counter)	((cpu) * CNTMAX + (counter))

You don't use this macro. I imagine there are others which are no longer used too.

<snip>

> +static inline int armv7_pmnc_select_counter(unsigned int cnt)
> +{
> +	u32 val;
> +
> +	cnt -= ARMV7_COUNTER_TO_CCNT;
> +
> +	if ((cnt == ARMV7_CCNT) || (cnt >= ARMV7_CNTMAX)) {
> +		printk(KERN_ERR "oprofile: CPU%u selecting wrong PMNC counter"
> +			" %d\n", smp_processor_id(), cnt);
> +		return -1;
> +	}

Nice error message :)

<snip>

>  static int __init
>  init_hw_perf_events(void)
>  {
> @@ -977,6 +1676,13 @@ init_hw_perf_events(void)
>                  memcpy(armpmu_perf_cache_map, armv6_perf_cache_map,
>                         sizeof(armv6_perf_cache_map));
>                  perf_max_events	= armv6pmu.num_events;
> +	} else if (cpu_architecture() == CPU_ARCH_ARMv7) {
> +		armpmu = &armv7pmu;
> +		memcpy(armpmu_perf_cache_map, armv7_perf_cache_map,
> +			sizeof(armv7_perf_cache_map));
> +		perf_max_events	= armv7pmu.num_events;
> +		/* Initialize & Reset PMNC: C bit and P bit */
> +		armv7_pmnc_write(ARMV7_PMNC_P | ARMV7_PMNC_C);
>          } else {
>                  pr_info("no hardware support available\n");
>                  perf_max_events = -1;

You'll need to switch on the cpuid to select the correct event mappings.

I've implemented this for oprofile, I'll post it as an RFC after Christmas
as I won't be able to respond in the meantime.

Cheers,

Will

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Perf Event support for ARMv7 (was: Re: [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6)
  2009-12-19 10:29             ` Jamie Iles
  2009-12-19 10:53               ` Ingo Molnar
@ 2009-12-21 11:29               ` Jean Pihet
  1 sibling, 0 replies; 55+ messages in thread
From: Jean Pihet @ 2009-12-21 11:29 UTC (permalink / raw)
  To: linux-arm-kernel

On Saturday 19 December 2009 11:29:05 Jamie Iles wrote:
> On Fri, Dec 18, 2009 at 06:05:29PM +0100, Jean Pihet wrote:
> > Here is a patch that adds the support for ARMv7 processors, using the
> > PMNC HW unit.
> >
> > The code is for review, it has been compiled and boot tested only, the
> > complete testing is in progress. Please let me know if the patch is
> > wrapped or garbled I will send it attached (20KB in size).
>
> Excellent! It looks good to me, a few minor comments though. I don't know
> if it's my mail client but some of the longer lines appeared to wrap onto 2
> patch lines but it's not difficult to apply.
>
> [snip]
>
> > I had a question about the events mapping to user space. Although most
> > of the events are mapped in the kernel code, some of the exotic events
> > are not mapped (e.g. NEON or PMU related events). How to use those
> > events from user space? Is it done using the raw mappings?
>
> Yes, the raw events should do the trick. 'perf stat -a -e rff -- sleep 1'
> will do cycle counting on v6 using the raw event number.
Ok.

>
> > +enum armv7_perf_types {
> > +	ARMV7_PERFCTR_PMNC_SW_INCR		= 0x00,
> > +	ARMV7_PERFCTR_IFETCH_MISS		= 0x01,
> > +	ARMV7_PERFCTR_ITLB_MISS			= 0x02,
> > +	ARMV7_PERFCTR_DCACHE_REFILL		= 0x03,
> > +	ARMV7_PERFCTR_DCACHE_ACCESS		= 0x04,
> > +	ARMV7_PERFCTR_DTLB_REFILL		= 0x05,
> > +	ARMV7_PERFCTR_DREAD			= 0x06,
> > +	ARMV7_PERFCTR_DWRITE			= 0x07,
> > +	ARMV7_PERFCTR_INSTR_EXECUTED		= 0x08,
> > +	ARMV7_PERFCTR_EXC_TAKEN			= 0x09,
> > +	ARMV7_PERFCTR_EXC_EXECUTED		= 0x0A,
> > +	ARMV7_PERFCTR_CID_WRITE			= 0x0B,
> > +	ARMV7_PERFCTR_PC_WRITE			= 0x0C,
> > +	ARMV7_PERFCTR_PC_IMM_BRANCH		= 0x0D,
> > +	ARMV7_PERFCTR_PC_PROC_RETURN		= 0x0E,
> > +	ARMV7_PERFCTR_UNALIGNED_ACCESS		= 0x0F,
> > +	ARMV7_PERFCTR_PC_BRANCH_MIS_PRED	= 0x10,
> > +
> > +	ARMV7_PERFCTR_PC_BRANCH_MIS_USED	= 0x12,
> > +
> > +	ARMV7_PERFCTR_WRITE_BUFFER_FULL		= 0x40,
> > +	ARMV7_PERFCTR_L2_STORE_MERGED		= 0x41,
> > +	ARMV7_PERFCTR_L2_STORE_BUFF		= 0x42,
> > +	ARMV7_PERFCTR_L2_ACCESS			= 0x43,
> > +	ARMV7_PERFCTR_L2_CACH_MISS		= 0x44,
> > +	ARMV7_PERFCTR_AXI_READ_CYCLES		= 0x45,
> > +	ARMV7_PERFCTR_AXI_WRITE_CYCLES		= 0x46,
> > +	ARMV7_PERFCTR_MEMORY_REPLAY		= 0x47,
> > +	ARMV7_PERFCTR_UNALIGNED_ACCESS_REPLAY	= 0x48,
> > +	ARMV7_PERFCTR_L1_DATA_MISS		= 0x49,
> > +	ARMV7_PERFCTR_L1_INST_MISS		= 0x4A,
> > +	ARMV7_PERFCTR_L1_DATA_COLORING		= 0x4B,
> > +	ARMV7_PERFCTR_L1_NEON_DATA		= 0x4C,
> > +	ARMV7_PERFCTR_L1_NEON_CACH_DATA		= 0x4D,
> > +	ARMV7_PERFCTR_L2_NEON			= 0x4E,
> > +	ARMV7_PERFCTR_L2_NEON_HIT		= 0x4F,
> > +	ARMV7_PERFCTR_L1_INST			= 0x50,
> > +	ARMV7_PERFCTR_PC_RETURN_MIS_PRED	= 0x51,
> > +	ARMV7_PERFCTR_PC_BRANCH_FAILED		= 0x52,
> > +	ARMV7_PERFCTR_PC_BRANCH_TAKEN		= 0x53,
> > +	ARMV7_PERFCTR_PC_BRANCH_EXECUTED	= 0x54,
> > +	ARMV7_PERFCTR_OP_EXECUTED		= 0x55,
> > +	ARMV7_PERFCTR_CYCLES_INST_STALL		= 0x56,
> > +	ARMV7_PERFCTR_CYCLES_INST		= 0x57,
> > +	ARMV7_PERFCTR_CYCLES_NEON_DATA_STALL	= 0x58,
> > +	ARMV7_PERFCTR_CYCLES_NEON_INST_STALL	= 0x59,
> > +	ARMV7_PERFCTR_NEON_CYCLES		= 0x5A,
> > +
> > +	ARMV7_PERFCTR_PMU0_EVENTS		= 0x70,
> > +	ARMV7_PERFCTR_PMU1_EVENTS		= 0x71,
> > +	ARMV7_PERFCTR_PMU_EVENTS		= 0x72,
> > +
> > +	ARMV7_PERFCTR_CPU_CYCLES		= 0xFF
> > +};
> > +
> > +enum armv7_counters {
> > +	ARMV7_CYCLE_COUNTER = 1,
> > +	ARMV7_COUNTER0,
> > +	ARMV7_COUNTER1,
> > +	ARMV7_COUNTER2,
> > +	ARMV7_COUNTER3,
> > +};
> > +
> > +/*
> > + * The hardware events that we support. We do support cache operations
> > but
> > + * we have harvard caches and no way to combine instruction and data
> > + * accesses/misses in hardware.
> > + */
> > +static const unsigned armv7_perf_map[PERF_COUNT_HW_MAX] = {
> > +	[PERF_COUNT_HW_CPU_CYCLES]	    = ARMV7_PERFCTR_CPU_CYCLES,
> > +	[PERF_COUNT_HW_INSTRUCTIONS]	    = ARMV7_PERFCTR_INSTR_EXECUTED,
> > +	[PERF_COUNT_HW_CACHE_REFERENCES]    = HW_OP_UNSUPPORTED,
> > +	[PERF_COUNT_HW_CACHE_MISSES]	    = HW_OP_UNSUPPORTED,
> > +	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = ARMV7_PERFCTR_PC_BRANCH_TAKEN,
> > +	[PERF_COUNT_HW_BRANCH_MISSES]	    = ARMV7_PERFCTR_PC_BRANCH_FAILED,
> > +	[PERF_COUNT_HW_BUS_CYCLES]	    = HW_OP_UNSUPPORTED,
> > +};
> > +
> > +static const unsigned armv7_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
> > +					  [PERF_COUNT_HW_CACHE_OP_MAX]
> > +					  [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
> > +	[C(L1D)] = {
> > +		/*
> > +		 * The performance counters don't differentiate between read
> > +		 * and write accesses/misses so this isn't strictly correct,
> > +		 * but it's the best we can do. Writes and reads get
> > +		 * combined.
> > +		 */
> > +		[C(OP_READ)] = {
> > +			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_DCACHE_ACCESS,
> > +			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DATA_MISS,
> > +		},
> > +		[C(OP_WRITE)] = {
> > +			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_DCACHE_ACCESS,
> > +			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DATA_MISS,
> > +		},
> > +		[C(OP_PREFETCH)] = {
> > +			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
> > +			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
> > +		},
> > +	},
> > +	[C(L1I)] = {
> > +		[C(OP_READ)] = {
> > +			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_INST,
> > +			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_INST_MISS,
> > +		},
> > +		[C(OP_WRITE)] = {
> > +			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_INST,
> > +			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_INST_MISS,
> > +		},
> > +		[C(OP_PREFETCH)] = {
> > +			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
> > +			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
> > +		},
> > +	},
> > +	[C(LL)] = {
> > +		[C(OP_READ)] = {
> > +			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L2_ACCESS,
> > +			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L2_CACH_MISS,
> > +		},
> > +		[C(OP_WRITE)] = {
> > +			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L2_ACCESS,
> > +			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L2_CACH_MISS,
> > +		},
> > +		[C(OP_PREFETCH)] = {
> > +			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
> > +			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
> > +		},
> > +	},
> > +	[C(DTLB)] = {
> > +		/*
> > +		 * The ARM performance counters can count micro DTLB misses,
> > +		 * micro ITLB misses and main TLB misses. There isn't an event
> > +		 * for TLB misses, so use the micro misses here and if users
> > +		 * want the main TLB misses they can use a raw counter.
> > +		 */
>
> I think this comment needs to be changed for v7. From the events enum it
> doesn't look like v7 has micro tlb events.
Yes I need to correct this.

> > +static inline int armv7_pmnc_select_counter(unsigned int cnt)
> > +{
> > +	u32 val;
> > +
> > +	cnt -= ARMV7_COUNTER_TO_CCNT;
> > +
> > +	if ((cnt == ARMV7_CCNT) || (cnt >= ARMV7_CNTMAX)) {
> > +		printk(KERN_ERR "oprofile: CPU%u selecting wrong PMNC counter"
> > +			" %d\n", smp_processor_id(), cnt);
>
> Most of the printk's refer to oprofile. Could we use pr_err() etc so we get
> the same prefix for all messages?
Oops this is a left over from Oprofile. I will correct the message and use the 
pr_ macros instead of printk.

> [snip]
>
> > +		if (armv7_pmnc_select_counter(counter) == counter)
> > +			asm volatile("mrc p15, 0, %0, c9, c13, 2"
> > +				     : "=r" (value));
>
> Does this sequence need some locking to make sure that we really do read
> from the counter that we've selected? The same applies to the other places.
In fact armv7_pmnc_select_counter is used by armv7pmu_read_counter, 
armv7pmu_write_counter and armv7pmu_enable_event that are called by the perf 
events generic code. Is that enough of a guarantee for atomic accesses, or we 
need some extra locking?

I will post a new version as soon as the changes are made and after some 
testing on a board.

> Jamie

Thanks,
Jean

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Perf Event support for ARMv7 (was: Re: [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6)
  2009-12-19 10:53               ` Ingo Molnar
@ 2009-12-21 11:32                 ` Jean Pihet
  0 siblings, 0 replies; 55+ messages in thread
From: Jean Pihet @ 2009-12-21 11:32 UTC (permalink / raw)
  To: linux-arm-kernel

On Saturday 19 December 2009 11:53:43 Ingo Molnar wrote:
> * Jamie Iles <jamie@jamieiles.com> wrote:
> > [snip]
> >
> > > I had a question about the events mapping to user space. Although most
> > > of the events are mapped in the kernel code, some of the exotic events
> > > are not mapped (e.g. NEON or PMU related events). How to use those
> > > events from user space? Is it done using the raw mappings?
> >
> > Yes, the raw events should do the trick. 'perf stat -a -e rff -- sleep 1'
> > will do cycle counting on v6 using the raw event number.
>
> Sidenote: if some of the more exotic events turn out to be useful and are
> worth generalizing, then we can add them to the generic enumeration and add
> tooling support (symbols, aliases, listing, etc.) for it.
>
> The current set of generic events are intended to be a 'seed' set, to be
> extended on an as-needed basis - not cast into stone in any way.
>
> The current generic (hardware) events are (from 'perf list' output):
>
>   cpu-cycles OR cycles                       [Hardware event]
>   instructions                               [Hardware event]
>   cache-references                           [Hardware event]
>   cache-misses                               [Hardware event]
>   branch-instructions OR branches            [Hardware event]
>   branch-misses                              [Hardware event]
>   bus-cycles                                 [Hardware event]
>   L1-dcache-loads                            [Hardware cache event]
>   L1-dcache-load-misses                      [Hardware cache event]
>   L1-dcache-stores                           [Hardware cache event]
>   L1-dcache-store-misses                     [Hardware cache event]
>   L1-dcache-prefetches                       [Hardware cache event]
>   L1-dcache-prefetch-misses                  [Hardware cache event]
>   L1-icache-loads                            [Hardware cache event]
>   L1-icache-load-misses                      [Hardware cache event]
>   L1-icache-prefetches                       [Hardware cache event]
>   L1-icache-prefetch-misses                  [Hardware cache event]
>   LLC-loads                                  [Hardware cache event]
>   LLC-load-misses                            [Hardware cache event]
>   LLC-stores                                 [Hardware cache event]
>   LLC-store-misses                           [Hardware cache event]
>   LLC-prefetches                             [Hardware cache event]
>   LLC-prefetch-misses                        [Hardware cache event]
>   dTLB-loads                                 [Hardware cache event]
>   dTLB-load-misses                           [Hardware cache event]
>   dTLB-stores                                [Hardware cache event]
>   dTLB-store-misses                          [Hardware cache event]
>   dTLB-prefetches                            [Hardware cache event]
>   dTLB-prefetch-misses                       [Hardware cache event]
>   iTLB-loads                                 [Hardware cache event]
>   iTLB-load-misses                           [Hardware cache event]
>   branch-loads                               [Hardware cache event]
>   branch-load-misses                         [Hardware cache event]
>   rNNN                                       [raw hardware event
> descriptor] mem:<addr>[:access]                        [hardware
> breakpoint]
>
> But i think we might want to capture FPU-alike instructions as well on CPUs
> that can count/sample them - etc.
>
Ok, thanks for the information!
We might need to generalize the ARMv7 CPUs for the moment the processors are 
Cortex-A8 (supported by the proposed patch) and Cortex-A9 (not yet supported 
but should come soon).


> 	Ingo

Jean

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Perf Event support for ARMv7 (was: Re: [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6)
  2009-12-21 11:04             ` Will Deacon
@ 2009-12-21 11:43               ` Jean Pihet
  2009-12-21 12:10                 ` Will Deacon
  0 siblings, 1 reply; 55+ messages in thread
From: Jean Pihet @ 2009-12-21 11:43 UTC (permalink / raw)
  To: linux-arm-kernel

Hi,

On Monday 21 December 2009 12:04:55 Will Deacon wrote:
> Hi Jean,
>
> I've provided some comments inline. Hopefully they're useful.
Thanks for reviewing the code.

> * Jean Pihet wrote:
> > Hello,
> >
> > Here is a patch that adds the support for ARMv7 processors, using the
> > PMNC HW unit.
> >
> > The code is for review, it has been compiled and boot tested only, the
> > complete testing is in progress. Please let me know if the patch is
> > wrapped or garbled I will send it attached (20KB in size).
> >
> > Feedback is welcome.
>
> <snip>
>
> > diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
> > index abb5267..79e92ce 100644
> > --- a/arch/arm/kernel/perf_event.c
> > +++ b/arch/arm/kernel/perf_event.c
> > @@ -4,6 +4,7 @@
> >   * ARM performance counter support.
> >   *
> >   * Copyright (C) 2009 picoChip Designs, Ltd., Jamie Iles
> > + * ARMv7 support: Jean Pihet <jpihet@mvista.com>
> >   *
> >   * This code is based on the sparc64 perf event code, which is in turn
> > based
> >   * on the x86 code. Callchain code is based on the ARM OProfile
> > backtrace
> > @@ -35,8 +36,11 @@ DEFINE_SPINLOCK(pmu_lock);
> >   * ARMv6 supports a maximum of 3 events, starting from index 1. If we
> > add
> >   * another platform that supports more, we need to increase this to be
> > the
> >   * largest of all platforms.
> > + *
> > + * ARMv7 supports up to 5 events:
> > + *  cycle counter CCNT + 4 events counters CNT0..3
> >   */
> > -#define ARMPMU_MAX_HWEVENTS		4
> > +#define ARMPMU_MAX_HWEVENTS		5
>
> The maximum number of event counters on ARMv7 is currently 6 [Cortex-A9],
> plus a cycle counter. Additionally, the number of event counters actually
> available is implementation defined (the cycle counter is mandatory). You
> can find out the number of event counters using the PMCR ((PMCR >> 11) &
> 0x1f).
I think we should support Cortex-A8 for now and add support for Cortex-A9 on 
top of it. IIUC a generic ARMV7 support is not possible so I will need 
separate handling for Cortex-A8 and -A9. Is that correct?

Unfortunately I do not have any -A9 HW for now. I will look at the spec in 
order to spot the differences between both PMNC units.

> >  /* The events for a given CPU. */
> >  struct cpu_hw_events {
> > @@ -965,6 +969,701 @@ static struct arm_pmu armv6pmu = {
> >  	.max_period		= (1LLU << 32) - 1,
> >  };
> >
> > +/*
> > + * ARMv7 Performance counter handling code.
> > + *
> > + * Copied from ARMv6 code, with the low level code inspired
> > + *  by the ARMv7 Oprofile code.
> > + *
> > + * ARMv7 has 4 configurable performance counters and a single cycle
> > counter.
> > + * All counters can be enabled/disabled and IRQ masked separately. The
> > cycle
> > + *  counter and all 4 performance counters together can be reset
> > separately.
> > + */
> > +
> > +enum armv7_perf_types {
> > +	ARMV7_PERFCTR_PMNC_SW_INCR		= 0x00,
> > +	ARMV7_PERFCTR_IFETCH_MISS		= 0x01,
> > +	ARMV7_PERFCTR_ITLB_MISS			= 0x02,
> > +	ARMV7_PERFCTR_DCACHE_REFILL		= 0x03,
> > +	ARMV7_PERFCTR_DCACHE_ACCESS		= 0x04,
> > +	ARMV7_PERFCTR_DTLB_REFILL		= 0x05,
> > +	ARMV7_PERFCTR_DREAD			= 0x06,
> > +	ARMV7_PERFCTR_DWRITE			= 0x07,
> > +	ARMV7_PERFCTR_INSTR_EXECUTED		= 0x08,
> > +	ARMV7_PERFCTR_EXC_TAKEN			= 0x09,
> > +	ARMV7_PERFCTR_EXC_EXECUTED		= 0x0A,
> > +	ARMV7_PERFCTR_CID_WRITE			= 0x0B,
> > +	ARMV7_PERFCTR_PC_WRITE			= 0x0C,
> > +	ARMV7_PERFCTR_PC_IMM_BRANCH		= 0x0D,
> > +	ARMV7_PERFCTR_PC_PROC_RETURN		= 0x0E,
> > +	ARMV7_PERFCTR_UNALIGNED_ACCESS		= 0x0F,
> > +	ARMV7_PERFCTR_PC_BRANCH_MIS_PRED	= 0x10,
> > +
> > +	ARMV7_PERFCTR_PC_BRANCH_MIS_USED	= 0x12,
>
> Ok - the events so far are defined by the v7 architecture.
> Note that this doesn't necessarily mean they are all supported by
> the core.
Is there a way to detect the supported PMU events at run-time? Is it harmful 
to use unsupported events?

> > +	ARMV7_PERFCTR_WRITE_BUFFER_FULL		= 0x40,
> > +	ARMV7_PERFCTR_L2_STORE_MERGED		= 0x41,
> > +	ARMV7_PERFCTR_L2_STORE_BUFF		= 0x42,
> > +	ARMV7_PERFCTR_L2_ACCESS			= 0x43,
> > +	ARMV7_PERFCTR_L2_CACH_MISS		= 0x44,
> > +	ARMV7_PERFCTR_AXI_READ_CYCLES		= 0x45,
> > +	ARMV7_PERFCTR_AXI_WRITE_CYCLES		= 0x46,
> > +	ARMV7_PERFCTR_MEMORY_REPLAY		= 0x47,
> > +	ARMV7_PERFCTR_UNALIGNED_ACCESS_REPLAY	= 0x48,
> > +	ARMV7_PERFCTR_L1_DATA_MISS		= 0x49,
> > +	ARMV7_PERFCTR_L1_INST_MISS		= 0x4A,
> > +	ARMV7_PERFCTR_L1_DATA_COLORING		= 0x4B,
> > +	ARMV7_PERFCTR_L1_NEON_DATA		= 0x4C,
> > +	ARMV7_PERFCTR_L1_NEON_CACH_DATA		= 0x4D,
> > +	ARMV7_PERFCTR_L2_NEON			= 0x4E,
> > +	ARMV7_PERFCTR_L2_NEON_HIT		= 0x4F,
> > +	ARMV7_PERFCTR_L1_INST			= 0x50,
> > +	ARMV7_PERFCTR_PC_RETURN_MIS_PRED	= 0x51,
> > +	ARMV7_PERFCTR_PC_BRANCH_FAILED		= 0x52,
> > +	ARMV7_PERFCTR_PC_BRANCH_TAKEN		= 0x53,
> > +	ARMV7_PERFCTR_PC_BRANCH_EXECUTED	= 0x54,
> > +	ARMV7_PERFCTR_OP_EXECUTED		= 0x55,
> > +	ARMV7_PERFCTR_CYCLES_INST_STALL		= 0x56,
> > +	ARMV7_PERFCTR_CYCLES_INST		= 0x57,
> > +	ARMV7_PERFCTR_CYCLES_NEON_DATA_STALL	= 0x58,
> > +	ARMV7_PERFCTR_CYCLES_NEON_INST_STALL	= 0x59,
> > +	ARMV7_PERFCTR_NEON_CYCLES		= 0x5A,
> > +
> > +	ARMV7_PERFCTR_PMU0_EVENTS		= 0x70,
> > +	ARMV7_PERFCTR_PMU1_EVENTS		= 0x71,
> > +	ARMV7_PERFCTR_PMU_EVENTS		= 0x72,
> > +
> > +	ARMV7_PERFCTR_CPU_CYCLES		= 0xFF
> > +};
>
> These events are specific to the Cortex-A8.
> Unfortunately, these numbers clash with events specific
> to the Cortex-A9 [and potentially future v7 cores].
> For example, 0x40 on the A8 is WRITE_BUFFER_FULL but on the
> A9 it is JAVA_BYTECODE_EXEC. This means that you'll need to
> take a similar approach as was taken for ARM11MP vs ARM11*.
Ok so I will need to separate Cortex-A8 from -A9.

> <snip>
>
> > +/*
> > + * Available counters
> > + */
> > +#define ARMV7_CCNT 		0
> > +#define ARMV7_CNT0 		1
> > +#define ARMV7_CNT1 		2
> > +#define ARMV7_CNT2 		3
> > +#define ARMV7_CNT3 		4
> > +#define ARMV7_CNTMAX 		5
> > +#define ARMV7_COUNTER_TO_CCNT	(ARMV7_CYCLE_COUNTER - ARMV7_CCNT)
> > +
> > +#define ARMV7_CPU_COUNTER(cpu, counter)	((cpu) * CNTMAX + (counter))
>
> You don't use this macro. I imagine there are others which are no longer
> used too.
Ok I am checking and cleaning the code.

> <snip>
>
> > +static inline int armv7_pmnc_select_counter(unsigned int cnt)
> > +{
> > +	u32 val;
> > +
> > +	cnt -= ARMV7_COUNTER_TO_CCNT;
> > +
> > +	if ((cnt == ARMV7_CCNT) || (cnt >= ARMV7_CNTMAX)) {
> > +		printk(KERN_ERR "oprofile: CPU%u selecting wrong PMNC counter"
> > +			" %d\n", smp_processor_id(), cnt);
> > +		return -1;
> > +	}
>
> Nice error message :)
Indeed! Theis is corrected already.

> <snip>
>
> >  static int __init
> >  init_hw_perf_events(void)
> >  {
> > @@ -977,6 +1676,13 @@ init_hw_perf_events(void)
> >                  memcpy(armpmu_perf_cache_map, armv6_perf_cache_map,
> >                         sizeof(armv6_perf_cache_map));
> >                  perf_max_events	= armv6pmu.num_events;
> > +	} else if (cpu_architecture() == CPU_ARCH_ARMv7) {
> > +		armpmu = &armv7pmu;
> > +		memcpy(armpmu_perf_cache_map, armv7_perf_cache_map,
> > +			sizeof(armv7_perf_cache_map));
> > +		perf_max_events	= armv7pmu.num_events;
> > +		/* Initialize & Reset PMNC: C bit and P bit */
> > +		armv7_pmnc_write(ARMV7_PMNC_P | ARMV7_PMNC_C);
> >          } else {
> >                  pr_info("no hardware support available\n");
> >                  perf_max_events = -1;
>
> You'll need to switch on the cpuid to select the correct event mappings.
>
> I've implemented this for oprofile, I'll post it as an RFC after Christmas
> as I won't be able to respond in the meantime.
Ok. Do you know how I can differentiate Cortex-A8 from -A9?

I will post a new version with the corrections.
>
> Cheers,
>
> Will

Cheers and a good celebration time,
Jean

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Perf Event support for ARMv7 (was: Re: [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6)
  2009-12-21 11:43               ` Jean Pihet
@ 2009-12-21 12:10                 ` Will Deacon
  2009-12-21 12:43                   ` Jamie Iles
  0 siblings, 1 reply; 55+ messages in thread
From: Will Deacon @ 2009-12-21 12:10 UTC (permalink / raw)
  To: linux-arm-kernel

Hello,

* Jean Pihet wrote:

> > The maximum number of event counters on ARMv7 is currently 6 [Cortex-A9],
> > plus a cycle counter. Additionally, the number of event counters actually
> > available is implementation defined (the cycle counter is mandatory). You
> > can find out the number of event counters using the PMCR ((PMCR >> 11) &
> > 0x1f).
> I think we should support Cortex-A8 for now and add support for Cortex-A9 on
> top of it. IIUC a generic ARMV7 support is not possible so I will need
> separate handling for Cortex-A8 and -A9. Is that correct?
> 
> Unfortunately I do not have any -A9 HW for now. I will look at the spec in
> order to spot the differences between both PMNC units.

Sorry, I should've mentioned that the PMU hardware interface is the same
across all v7 cores. The only difference is the core-specific event numberings.
A9 is also available in MP configurations, but that shouldn't cause many problems
for perf. I can test on an A9MP once you have something you're happy with.

> > Ok - the events so far are defined by the v7 architecture.
> > Note that this doesn't necessarily mean they are all supported by
> > the core.
> Is there a way to detect the supported PMU events at run-time? Is it harmful
> to use unsupported events?

The unsupported events for a given core are documented in the TRM.
For example, A9 doesn't support 0x08 and 0x0E but does support all
the other events defined by the architecture. It then has its own
set of extensions listed in the TRM [Section 9.2.1].

> Ok so I will need to separate Cortex-A8 from -A9.

Yep - but it's an easy thing to do.

> > I've implemented this for oprofile, I'll post it as an RFC after Christmas
> > as I won't be able to respond in the meantime.
> Ok. Do you know how I can differentiate Cortex-A8 from -A9?

The bottom two bytes of the main cpuid (read_cpuid_id()) are 0xC080 and 0xC090
for the A8 and A9 respectively.

> I will post a new version with the corrections.

Excellent. I can comment on things tomorrow as well, but then I'm off.

> Cheers and a good celebration time,

Thanks. I'll post my oprofile patches when I return. They cover these issues
already so hopefully you can spot anything I've missed.

Will

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Perf Event support for ARMv7 (was: Re: [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6)
  2009-12-21 12:10                 ` Will Deacon
@ 2009-12-21 12:43                   ` Jamie Iles
  2009-12-21 13:35                     ` Jean Pihet
  0 siblings, 1 reply; 55+ messages in thread
From: Jamie Iles @ 2009-12-21 12:43 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, Dec 21, 2009 at 12:10:46PM -0000, Will Deacon wrote:
> > I will post a new version with the corrections.
> 
> Excellent. I can comment on things tomorrow as well, but then I'm off.
> 
> > Cheers and a good celebration time,
> 
> Thanks. I'll post my oprofile patches when I return. They cover these issues
> already so hopefully you can spot anything I've missed.
It's my last working day today. When I return I'll post an updated patch
series for the generic arm perfcounters and v6 support. Hopefully we can get
these signed off then and base the v7 support off of these.

Enjoy the break,

Jamie

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Perf Event support for ARMv7 (was: Re: [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6)
  2009-12-21 12:43                   ` Jamie Iles
@ 2009-12-21 13:35                     ` Jean Pihet
  2009-12-22 16:51                       ` Jean Pihet
  0 siblings, 1 reply; 55+ messages in thread
From: Jean Pihet @ 2009-12-21 13:35 UTC (permalink / raw)
  To: linux-arm-kernel

On Monday 21 December 2009 13:43:18 Jamie Iles wrote:
> On Mon, Dec 21, 2009 at 12:10:46PM -0000, Will Deacon wrote:
> > > I will post a new version with the corrections.
> >
> > Excellent. I can comment on things tomorrow as well, but then I'm off.
> >
> > > Cheers and a good celebration time,
> >
> > Thanks. I'll post my oprofile patches when I return. They cover these
> > issues already so hopefully you can spot anything I've missed.
>
> It's my last working day today. When I return I'll post an updated patch
> series for the generic arm perfcounters and v6 support. Hopefully we can
> get these signed off then and base the v7 support off of these.
OK, good! I am continuing on v7 and I will rebase my patches as soon as those 
are out.

> Enjoy the break,
Thx, the same for you!

>
> Jamie

Jean

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Perf Event support for ARMv7 (was: Re: [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6)
  2009-12-21 13:35                     ` Jean Pihet
@ 2009-12-22 16:51                       ` Jean Pihet
  2009-12-28  7:57                         ` Ingo Molnar
                                           ` (2 more replies)
  0 siblings, 3 replies; 55+ messages in thread
From: Jean Pihet @ 2009-12-22 16:51 UTC (permalink / raw)
  To: linux-arm-kernel

Hi,

Here is the updated patch. It now supports:
- Cortex-A8 and Cortex-A9 processors,
- The low level code has been completely redesigned to allow the dynamic 
detection of the number of available counters, based on the PMCR value,
- runtime detection of the CPU arch (v6 or v7) and model (Cortex-A8 or 
Cortex-A9)

The code is for review, it has been checked, compiled and boot tested on OMAP3 
(Cortex-A8). Unfortunately I am still facing some cross compilation problems 
of the tools/perf utility.

Some remarks and questions:

1) The number of available counters can reach 32 on ARMv7, so the macro  
ARMPMU_MAX_HWEVENTS is now defined as 32. Is that correct?

2) Please note that the Cortex-A9 events do not easily map to the predefined 
events. Cf. armv7_a9_perf_map and armv7_a9_perf_cache_map in the code.
- the PERF_COUNT_HW_INSTRUCTIONS event is not found. It looks like the number 
of instructions is calculated by adding events numbers (events from 0x70 till 
0x74: MAIN_UNIT_EXECUTED_INST, SECOND_UNIT_EXECUTED_INST, 
LD_ST_UNIT_EXECUTED_INST, FP_EXECUTED_INST and NEON_EXECUTED_INST),
- the HW_BRANCH events are not found
- the global cache events 0x50 and 0x51 define the COHERENT_LINE_HIT and 
COHERENT_LINE_MISS events, is that correct?
- L1 and L2 cache events are not found. Those could be available in separate 
PL310 registers, TBC
- no TLB events excepted the ITLB_MISS event are found.

Any thoughts?

Regards,
Jean

---
>From 26fbfd99815abbe1147a191ff759b1197344587f Mon Sep 17 00:00:00 2001
From: Jean Pihet <jpihet@mvista.com>
Date: Fri, 18 Dec 2009 17:46:21 +0100
Subject: [PATCH] arm/perfevents: add support for ARMv7

Adds the Performance Events support for ARMv7 processor, using
the PMNC unit in HW.

Supports the following:
- Cortex-A8 and Cortex-A9 processors,
- dynamic detection of the number of available counters,
   based on the PMCR value,
- runtime detection of the CPU arch (v6 or v7)
   and model (Cortex-A8 or Cortex-A9)

Signed-off-by: Jean Pihet <jpihet@mvista.com>
---
 arch/arm/Kconfig             |    2 +-
 arch/arm/kernel/perf_event.c |  919 
+++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 903 insertions(+), 18 deletions(-)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index e5bd97e..5c6afdb 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1172,7 +1172,7 @@ config HIGHPTE
 
 config HW_PERF_EVENTS
 	bool "Enable hardware performance counter support for perf events"
-	depends on PERF_EVENTS && CPU_HAS_PMU && CPU_V6
+	depends on PERF_EVENTS && CPU_HAS_PMU && (CPU_V6 || CPU_V7)
 	default y
 	help
 	  Enable hardware performance counter support for perf events. If
diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
index abb5267..035dd33 100644
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -4,6 +4,7 @@
  * ARM performance counter support.
  *
  * Copyright (C) 2009 picoChip Designs, Ltd., Jamie Iles
+ * ARMv7 support: Jean Pihet <jpihet@mvista.com>
  *
  * This code is based on the sparc64 perf event code, which is in turn based
  * on the x86 code. Callchain code is based on the ARM OProfile backtrace
@@ -35,8 +36,12 @@ DEFINE_SPINLOCK(pmu_lock);
  * ARMv6 supports a maximum of 3 events, starting from index 1. If we add
  * another platform that supports more, we need to increase this to be the
  * largest of all platforms.
+ *
+ * ARMv7 supports up to 32 events:
+ *  cycle counter CCNT + 31 events counters CNT0..30.
+ *  Cortex-A8 has 1+4 counters, Cortex-A9 has 1+6 counters
  */
-#define ARMPMU_MAX_HWEVENTS		4
+#define ARMPMU_MAX_HWEVENTS		32
 
 /* The events for a given CPU. */
 struct cpu_hw_events {
@@ -61,7 +66,7 @@ struct cpu_hw_events {
 DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
 
 struct arm_pmu {
-	const char	*name;
+	char		*name;
 	irqreturn_t	(*handle_irq)(int irq_num, void *dev);
 	void		(*enable)(struct hw_perf_event *evt, int idx);
 	void		(*disable)(struct hw_perf_event *evt, int idx);
@@ -965,26 +970,906 @@ static struct arm_pmu armv6pmu = {
 	.max_period		= (1LLU << 32) - 1,
 };
 
+/*
+ * ARMv7 Cortex-A8 and Cortex-A9 Performance Events handling code.
+ *
+ * Copied from ARMv6 code, with the low level code inspired
+ *  by the ARMv7 Oprofile code.
+ *
+ * Cortex-A8 has up to 4 configurable performance counters and
+ *  a single cycle counter.
+ * Cortex-A9 has up to 31 configurable performance counters and
+ *  a single cycle counter.
+ *
+ * All counters can be enabled/disabled and IRQ masked separately. The cycle
+ *  counter and all 4 performance counters together can be reset separately.
+ */
+
+#define ARMV7_PMU_CORTEX_A8_NAME		"ARMv7 Cortex-A8"
+
+#define ARMV7_PMU_CORTEX_A9_NAME		"ARMv7 Cortex-A9"
+
+/* Common ARMv7 event types */
+enum armv7_perf_types {
+	ARMV7_PERFCTR_PMNC_SW_INCR		= 0x00,
+	ARMV7_PERFCTR_IFETCH_MISS		= 0x01,
+	ARMV7_PERFCTR_ITLB_MISS			= 0x02,
+	ARMV7_PERFCTR_DCACHE_REFILL		= 0x03,
+	ARMV7_PERFCTR_DCACHE_ACCESS		= 0x04,
+	ARMV7_PERFCTR_DTLB_REFILL		= 0x05,
+	ARMV7_PERFCTR_DREAD			= 0x06,
+	ARMV7_PERFCTR_DWRITE			= 0x07,
+
+	ARMV7_PERFCTR_EXC_TAKEN			= 0x09,
+	ARMV7_PERFCTR_EXC_EXECUTED		= 0x0A,
+	ARMV7_PERFCTR_CID_WRITE			= 0x0B,
+	ARMV7_PERFCTR_PC_WRITE			= 0x0C,
+	ARMV7_PERFCTR_PC_IMM_BRANCH		= 0x0D,
+	ARMV7_PERFCTR_UNALIGNED_ACCESS		= 0x0F,
+	ARMV7_PERFCTR_PC_BRANCH_MIS_PRED	= 0x10,
+	ARMV7_PERFCTR_CLOCK_CYCLES		= 0x11,
+
+	ARMV7_PERFCTR_PC_BRANCH_MIS_USED	= 0x12,
+
+	ARMV7_PERFCTR_CPU_CYCLES		= 0xFF
+};
+
+/* ARMv7 Cortex-A8 specific event types */
+enum armv7_a8_perf_types {
+	ARMV7_PERFCTR_INSTR_EXECUTED		= 0x08,
+
+	ARMV7_PERFCTR_PC_PROC_RETURN		= 0x0E,
+
+	ARMV7_PERFCTR_WRITE_BUFFER_FULL		= 0x40,
+	ARMV7_PERFCTR_L2_STORE_MERGED		= 0x41,
+	ARMV7_PERFCTR_L2_STORE_BUFF		= 0x42,
+	ARMV7_PERFCTR_L2_ACCESS			= 0x43,
+	ARMV7_PERFCTR_L2_CACH_MISS		= 0x44,
+	ARMV7_PERFCTR_AXI_READ_CYCLES		= 0x45,
+	ARMV7_PERFCTR_AXI_WRITE_CYCLES		= 0x46,
+	ARMV7_PERFCTR_MEMORY_REPLAY		= 0x47,
+	ARMV7_PERFCTR_UNALIGNED_ACCESS_REPLAY	= 0x48,
+	ARMV7_PERFCTR_L1_DATA_MISS		= 0x49,
+	ARMV7_PERFCTR_L1_INST_MISS		= 0x4A,
+	ARMV7_PERFCTR_L1_DATA_COLORING		= 0x4B,
+	ARMV7_PERFCTR_L1_NEON_DATA		= 0x4C,
+	ARMV7_PERFCTR_L1_NEON_CACH_DATA		= 0x4D,
+	ARMV7_PERFCTR_L2_NEON			= 0x4E,
+	ARMV7_PERFCTR_L2_NEON_HIT		= 0x4F,
+	ARMV7_PERFCTR_L1_INST			= 0x50,
+	ARMV7_PERFCTR_PC_RETURN_MIS_PRED	= 0x51,
+	ARMV7_PERFCTR_PC_BRANCH_FAILED		= 0x52,
+	ARMV7_PERFCTR_PC_BRANCH_TAKEN		= 0x53,
+	ARMV7_PERFCTR_PC_BRANCH_EXECUTED	= 0x54,
+	ARMV7_PERFCTR_OP_EXECUTED		= 0x55,
+	ARMV7_PERFCTR_CYCLES_INST_STALL		= 0x56,
+	ARMV7_PERFCTR_CYCLES_INST		= 0x57,
+	ARMV7_PERFCTR_CYCLES_NEON_DATA_STALL	= 0x58,
+	ARMV7_PERFCTR_CYCLES_NEON_INST_STALL	= 0x59,
+	ARMV7_PERFCTR_NEON_CYCLES		= 0x5A,
+
+	ARMV7_PERFCTR_PMU0_EVENTS		= 0x70,
+	ARMV7_PERFCTR_PMU1_EVENTS		= 0x71,
+	ARMV7_PERFCTR_PMU_EVENTS		= 0x72,
+};
+
+/* ARMv7 Cortex-A9 specific event types */
+enum armv7_a9_perf_types {
+	ARMV7_PERFCTR_JAVA_HW_BYTECODE_EXEC	= 0x40,
+	ARMV7_PERFCTR_JAVA_SW_BYTECODE_EXEC	= 0x41,
+	ARMV7_PERFCTR_JAZELLE_BRANCH_EXEC	= 0x42,
+
+	ARMV7_PERFCTR_COHERENT_LINE_MISS	= 0x50,
+	ARMV7_PERFCTR_COHERENT_LINE_HIT		= 0x51,
+
+	ARMV7_PERFCTR_ICACHE_DEP_STALL_CYCLES	= 0x60,
+	ARMV7_PERFCTR_DCACHE_DEP_STALL_CYCLES	= 0x61,
+	ARMV7_PERFCTR_TLB_MISS_DEP_STALL_CYCLES	= 0x62,
+	ARMV7_PERFCTR_STREX_EXECUTED_PASSED	= 0x63,
+	ARMV7_PERFCTR_STREX_EXECUTED_FAILED	= 0x64,
+	ARMV7_PERFCTR_DATA_EVICTION		= 0x65,
+	ARMV7_PERFCTR_ISSUE_STAGE_NO_INST	= 0x66,
+	ARMV7_PERFCTR_ISSUE_STAGE_EMPTY		= 0x67,
+	ARMV7_PERFCTR_INST_OUT_OF_RENAME_STAGE	= 0x68,
+
+	ARMV7_PERFCTR_PREDICTABLE_FUNCT_RETURNS	= 0x6E,
+
+	ARMV7_PERFCTR_MAIN_UNIT_EXECUTED_INST	= 0x70,
+	ARMV7_PERFCTR_SECOND_UNIT_EXECUTED_INST	= 0x71,
+	ARMV7_PERFCTR_LD_ST_UNIT_EXECUTED_INST	= 0x72,
+	ARMV7_PERFCTR_FP_EXECUTED_INST		= 0x73,
+	ARMV7_PERFCTR_NEON_EXECUTED_INST	= 0x74,
+
+	ARMV7_PERFCTR_PLD_FULL_DEP_STALL_CYCLES	= 0x80,
+	ARMV7_PERFCTR_DATA_WR_DEP_STALL_CYCLES	= 0x81,
+	ARMV7_PERFCTR_ITLB_MISS_DEP_STALL_CYCLES	= 0x82,
+	ARMV7_PERFCTR_DTLB_MISS_DEP_STALL_CYCLES	= 0x83,
+	ARMV7_PERFCTR_MICRO_ITLB_MISS_DEP_STALL_CYCLES	= 0x84,
+	ARMV7_PERFCTR_MICRO_DTLB_MISS_DEP_STALL_CYCLES 	= 0x85,
+	ARMV7_PERFCTR_DMB_DEP_STALL_CYCLES	= 0x86,
+
+	ARMV7_PERFCTR_INTGR_CLK_ENABLED_CYCLES	= 0x8A,
+	ARMV7_PERFCTR_DATA_ENGINE_CLK_EN_CYCLES	= 0x8B,
+
+	ARMV7_PERFCTR_ISB_INST			= 0x90,
+	ARMV7_PERFCTR_DSB_INST			= 0x91,
+	ARMV7_PERFCTR_DMB_INST			= 0x92,
+	ARMV7_PERFCTR_EXT_INTERRUPTS		= 0x93,
+
+	ARMV7_PERFCTR_PLE_CACHE_LINE_RQST_COMPLETED	= 0xA0,
+	ARMV7_PERFCTR_PLE_CACHE_LINE_RQST_SKIPPED	= 0xA1,
+	ARMV7_PERFCTR_PLE_FIFO_FLUSH		= 0xA2,
+	ARMV7_PERFCTR_PLE_RQST_COMPLETED	= 0xA3,
+	ARMV7_PERFCTR_PLE_FIFO_OVERFLOW		= 0xA4,
+	ARMV7_PERFCTR_PLE_RQST_PROG		= 0xA5
+};
+
+/*
+ * Cortex-A8 HW events mapping
+ *
+ * The hardware events that we support. We do support cache operations but
+ * we have harvard caches and no way to combine instruction and data
+ * accesses/misses in hardware.
+ */
+static const unsigned armv7_a8_perf_map[PERF_COUNT_HW_MAX] = {
+	[PERF_COUNT_HW_CPU_CYCLES]	    = ARMV7_PERFCTR_CPU_CYCLES,
+	[PERF_COUNT_HW_INSTRUCTIONS]	    = ARMV7_PERFCTR_INSTR_EXECUTED,
+	[PERF_COUNT_HW_CACHE_REFERENCES]    = HW_OP_UNSUPPORTED,
+	[PERF_COUNT_HW_CACHE_MISSES]	    = HW_OP_UNSUPPORTED,
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = ARMV7_PERFCTR_PC_BRANCH_TAKEN,
+	[PERF_COUNT_HW_BRANCH_MISSES]	    = ARMV7_PERFCTR_PC_BRANCH_FAILED,
+	[PERF_COUNT_HW_BUS_CYCLES]	    = ARMV7_PERFCTR_CLOCK_CYCLES,
+};
+
+static const unsigned armv7_a8_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
+					  [PERF_COUNT_HW_CACHE_OP_MAX]
+					  [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+	[C(L1D)] = {
+		/*
+		 * The performance counters don't differentiate between read
+		 * and write accesses/misses so this isn't strictly correct,
+		 * but it's the best we can do. Writes and reads get
+		 * combined.
+		 */
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_DCACHE_ACCESS,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DATA_MISS,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_DCACHE_ACCESS,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DATA_MISS,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+	[C(L1I)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_INST,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_INST_MISS,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_INST,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_INST_MISS,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+	[C(LL)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L2_ACCESS,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L2_CACH_MISS,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L2_ACCESS,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L2_CACH_MISS,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+	[C(DTLB)] = {
+		/*
+		 * Only ITLB misses and DTLB refills are supported.
+		 * If users want the DTLB refills misses a raw counter
+		 * must be used.
+		 */
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+	[C(ITLB)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_ITLB_MISS,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_ITLB_MISS,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+	[C(BPU)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_PC_BRANCH_TAKEN,
+			[C(RESULT_MISS)]
+					= ARMV7_PERFCTR_PC_BRANCH_FAILED,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_PC_BRANCH_TAKEN,
+			[C(RESULT_MISS)]
+					= ARMV7_PERFCTR_PC_BRANCH_FAILED,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+};
+
+/*
+ * Cortex-A9 HW events mapping
+ */
+static const unsigned armv7_a9_perf_map[PERF_COUNT_HW_MAX] = {
+	[PERF_COUNT_HW_CPU_CYCLES]	    = ARMV7_PERFCTR_CPU_CYCLES,
+	[PERF_COUNT_HW_INSTRUCTIONS]	    = HW_OP_UNSUPPORTED,
+	[PERF_COUNT_HW_CACHE_REFERENCES]    = ARMV7_PERFCTR_COHERENT_LINE_HIT,
+	[PERF_COUNT_HW_CACHE_MISSES]	    = ARMV7_PERFCTR_COHERENT_LINE_MISS,
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = HW_OP_UNSUPPORTED,
+	[PERF_COUNT_HW_BRANCH_MISSES]	    = HW_OP_UNSUPPORTED,
+	[PERF_COUNT_HW_BUS_CYCLES]	    = ARMV7_PERFCTR_CLOCK_CYCLES,
+};
+
+static const unsigned armv7_a9_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
+					  [PERF_COUNT_HW_CACHE_OP_MAX]
+					  [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+	[C(L1D)] = {
+		/*
+		 * The performance counters don't differentiate between read
+		 * and write accesses/misses so this isn't strictly correct,
+		 * but it's the best we can do. Writes and reads get
+		 * combined.
+		 */
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_DCACHE_ACCESS,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_DCACHE_ACCESS,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+	[C(L1I)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+	[C(LL)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+	[C(DTLB)] = {
+		/*
+		 * Only ITLB misses and DTLB refills are supported.
+		 * If users want the DTLB refills misses a raw counter
+		 * must be used.
+		 */
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+	[C(ITLB)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_ITLB_MISS,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_ITLB_MISS,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+	[C(BPU)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+};
+
+/*
+ * Perf Events counters
+ */
+enum armv7_counters {
+	ARMV7_CYCLE_COUNTER 		= 1,	/* Cycle counter */
+	ARMV7_COUNTER0			= 2,	/* First event counter */
+};
+
+/* The last event counter is (ARMV7_COUNTER0 + armpmu->num_events) */
+#define	ARMV7_COUNTER_LAST		(ARMV7_COUNTER0 + armpmu->num_events)
+
+/*
+ * ARMv7 low level PMNC access
+ */
+
+/*
+ * Per-CPU PMNC: config reg
+ */
+#define ARMV7_PMNC_E		(1 << 0) /* Enable all counters */
+#define ARMV7_PMNC_P		(1 << 1) /* Reset all counters */
+#define ARMV7_PMNC_C		(1 << 2) /* Cycle counter reset */
+#define ARMV7_PMNC_D		(1 << 3) /* CCNT counts every 64th cpu cycle */
+#define ARMV7_PMNC_X		(1 << 4) /* Export to ETM */
+#define ARMV7_PMNC_DP		(1 << 5) /* Disable CCNT if non-invasive debug*/
+#define	ARMV7_PMNC_N_SHIFT	11	 /* Number of counters supported */
+#define	ARMV7_PMNC_N_MASK	0x1f
+#define	ARMV7_PMNC_MASK		0x3f	 /* Mask for writable bits */
+
+/*
+ * Available counters
+ */
+#define ARMV7_CNT0 		0	/* First event counter */
+#define ARMV7_CCNT 		31	/* Cycle counter */
+
+#define ARMV7_A8_CNTMAX		5	/* Cortex-A8: up to 4 counters + CCNT */
+#define ARMV7_A9_CNTMAX		32	/* Cortex-A9: up to 31 counters + CCNT*/
+
+/* Perf Event to low level counters mapping */
+#define ARMV7_EVENT_CNT_TO_CNTx	(ARMV7_COUNTER0 - ARMV7_CNT0)
+
+/*
+ * CNTENS: counters enable reg
+ */
+#define ARMV7_CNTENS_P(idx)	(1 << (idx - ARMV7_EVENT_CNT_TO_CNTx))
+#define ARMV7_CNTENS_C		(1 << ARMV7_CCNT)
+
+/*
+ * CNTENC: counters disable reg
+ */
+#define ARMV7_CNTENC_P(idx)	(1 << (idx - ARMV7_EVENT_CNT_TO_CNTx))
+#define ARMV7_CNTENC_C		(1 << ARMV7_CCNT)
+
+/*
+ * INTENS: counters overflow interrupt enable reg
+ */
+#define ARMV7_INTENS_P(idx)	(1 << (idx - ARMV7_EVENT_CNT_TO_CNTx))
+#define ARMV7_INTENS_C		(1 << ARMV7_CCNT)
+
+/*
+ * INTENC: counters overflow interrupt disable reg
+ */
+#define ARMV7_INTENC_P(idx)	(1 << (idx - ARMV7_EVENT_CNT_TO_CNTx))
+#define ARMV7_INTENC_C		(1 << ARMV7_CCNT)
+
+/*
+ * EVTSEL: Event selection reg
+ */
+#define	ARMV7_EVTSEL_MASK	0x7f		/* Mask for writable bits */
+
+/*
+ * SELECT: Counter selection reg
+ */
+#define	ARMV7_SELECT_MASK	0x1f		/* Mask for writable bits */
+
+/*
+ * FLAG: counters overflow flag status reg
+ */
+#define ARMV7_FLAG_P(idx)	(1 << (idx - ARMV7_EVENT_CNT_TO_CNTx))
+#define ARMV7_FLAG_C		(1 << ARMV7_CCNT)
+#define	ARMV7_FLAG_MASK		0xffffffff	/* Mask for writable bits */
+#define	ARMV7_OVERFLOWED_MASK	ARMV7_FLAG_MASK
+
+static inline unsigned long armv7_pmnc_read(void)
+{
+	u32 val;
+	asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r"(val));
+	return val;
+}
+
+static inline void armv7_pmnc_write(unsigned long val)
+{
+	val &= ARMV7_PMNC_MASK;
+	asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r"(val));
+}
+
+static inline int armv7_pmnc_has_overflowed(unsigned long pmnc)
+{
+	return pmnc & ARMV7_OVERFLOWED_MASK;
+}
+
+static inline int armv7_pmnc_counter_has_overflowed(unsigned long pmnc,
+					enum armv7_counters counter)
+{
+	int ret;
+
+	if (counter == ARMV7_CYCLE_COUNTER)
+		ret = pmnc & ARMV7_FLAG_C;
+	else if ((counter >= ARMV7_COUNTER0) && (counter <= ARMV7_COUNTER_LAST))
+		ret = pmnc & ARMV7_FLAG_P(counter);
+	else
+		BUG();
+
+	return ret;
+}
+
+static inline int armv7_pmnc_select_counter(unsigned int idx)
+{
+	u32 val;
+
+	if ((idx < ARMV7_COUNTER0) || (idx > ARMV7_COUNTER_LAST)) {
+		pr_err("CPU%u selecting wrong PMNC counter"
+			" %d\n", smp_processor_id(), idx);
+		return -1;
+	}
+
+	val = (idx - ARMV7_EVENT_CNT_TO_CNTx) & ARMV7_SELECT_MASK;
+	asm volatile("mcr p15, 0, %0, c9, c12, 5" : : "r" (val));
+
+	return idx;
+}
+
+static inline u32 armv7pmu_read_counter(int idx)
+{
+	unsigned long value = 0;
+
+	if (idx == ARMV7_CYCLE_COUNTER)
+		asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r" (value));
+	else if ((idx >= ARMV7_COUNTER0) && (idx <= ARMV7_COUNTER_LAST)) {
+		if (armv7_pmnc_select_counter(idx) == idx)
+			asm volatile("mrc p15, 0, %0, c9, c13, 2"
+				     : "=r" (value));
+	} else
+		BUG();
+
+	return value;
+}
+
+static inline void armv7pmu_write_counter(int idx, u32 value)
+{
+	if (idx == ARMV7_CYCLE_COUNTER)
+		asm volatile("mcr p15, 0, %0, c9, c13, 0" : : "r" (value));
+	else if ((idx >= ARMV7_COUNTER0) && (idx <= ARMV7_COUNTER_LAST)) {
+		if (armv7_pmnc_select_counter(idx) == idx)
+			asm volatile("mcr p15, 0, %0, c9, c13, 2"
+				     : : "r" (value));
+	} else
+		BUG();
+}
+
+static inline void armv7_pmnc_write_evtsel(unsigned int idx, u32 val)
+{
+	if (armv7_pmnc_select_counter(idx) == idx) {
+		val &= ARMV7_EVTSEL_MASK;
+		asm volatile("mcr p15, 0, %0, c9, c13, 1" : : "r" (val));
+	}
+}
+
+static inline u32 armv7_pmnc_enable_counter(unsigned int idx)
+{
+	u32 val;
+
+	if ((idx < ARMV7_COUNTER0) || (idx > ARMV7_COUNTER_LAST)) {
+		pr_err("CPU%u enabling wrong PMNC counter"
+			" %d\n", smp_processor_id(), idx);
+		return -1;
+	}
+
+	if (idx == ARMV7_CYCLE_COUNTER)
+		val = ARMV7_CNTENS_C;
+	else
+		val = ARMV7_CNTENS_P(idx);
+
+	asm volatile("mcr p15, 0, %0, c9, c12, 1" : : "r" (val));
+
+	return idx;
+}
+
+static inline u32 armv7_pmnc_disable_counter(unsigned int idx)
+{
+	u32 val;
+
+
+	if ((idx < ARMV7_COUNTER0) || (idx > ARMV7_COUNTER_LAST)) {
+		pr_err("CPU%u disabling wrong PMNC counter"
+			" %d\n", smp_processor_id(), idx);
+		return -1;
+	}
+
+	if (idx == ARMV7_CYCLE_COUNTER)
+		val = ARMV7_CNTENC_C;
+	else
+		val = ARMV7_CNTENC_P(idx);
+
+	asm volatile("mcr p15, 0, %0, c9, c12, 2" : : "r" (val));
+
+	return idx;
+}
+
+static inline u32 armv7_pmnc_enable_intens(unsigned int idx)
+{
+	u32 val;
+
+	if ((idx < ARMV7_COUNTER0) || (idx > ARMV7_COUNTER_LAST)) {
+		pr_err("CPU%u enabling wrong PMNC counter"
+			" interrupt enable %d\n", smp_processor_id(), idx);
+		return -1;
+	}
+
+	if (idx == ARMV7_CYCLE_COUNTER)
+		val = ARMV7_INTENS_C;
+	else
+		val = ARMV7_INTENS_P(idx);
+
+	asm volatile("mcr p15, 0, %0, c9, c14, 1" : : "r" (val));
+
+	return idx;
+}
+
+static inline u32 armv7_pmnc_disable_intens(unsigned int idx)
+{
+	u32 val;
+
+	if ((idx < ARMV7_COUNTER0) || (idx > ARMV7_COUNTER_LAST)) {
+		pr_err("CPU%u disabling wrong PMNC counter"
+			" interrupt enable %d\n", smp_processor_id(), idx);
+		return -1;
+	}
+
+	if (idx == ARMV7_CYCLE_COUNTER)
+		val = ARMV7_INTENC_C;
+	else
+		val = ARMV7_INTENC_P(idx);
+
+	asm volatile("mcr p15, 0, %0, c9, c14, 2" : : "r" (val));
+
+	return idx;
+}
+
+static inline u32 armv7_pmnc_getreset_flags(void)
+{
+	u32 val;
+
+	/* Read */
+	asm volatile("mrc p15, 0, %0, c9, c12, 3" : "=r" (val));
+
+	/* Write to clear flags */
+	val &= ARMV7_FLAG_MASK;
+	asm volatile("mcr p15, 0, %0, c9, c12, 3" : : "r" (val));
+
+	return val;
+}
+
+void armv7pmu_enable_event(struct hw_perf_event *hwc, int idx)
+{
+	unsigned long flags;
+
+	/*
+	 * Enable counter and interrupt, and set the counter to count
+	 * the event that we're interested in.
+	 */
+	spin_lock_irqsave(&pmu_lock, flags);
+
+	/*
+	 * Disable counter
+	 */
+	armv7_pmnc_disable_counter(idx);
+
+	/*
+	 * Set event (if destined for PMNx counters)
+	 * We don't need to set the event if it's a cycle count
+	 */
+	if (idx != ARMV7_CYCLE_COUNTER)
+		armv7_pmnc_write_evtsel(idx, hwc->config_base);
+
+	/*
+	 * Enable interrupt for this counter
+	 */
+	armv7_pmnc_enable_intens(idx);
+
+	/*
+	 * Enable counter
+	 */
+	armv7_pmnc_enable_counter(idx);
+
+	spin_unlock_irqrestore(&pmu_lock, flags);
+}
+
+static void armv7pmu_disable_event(struct hw_perf_event *hwc, int idx)
+{
+	unsigned long flags;
+
+	/*
+	 * Disable counter and interrupt
+	 */
+	spin_lock_irqsave(&pmu_lock, flags);
+
+	/*
+	 * Disable counter
+	 */
+	armv7_pmnc_disable_counter(idx);
+
+	/*
+	 * Disable interrupt for this counter
+	 */
+	armv7_pmnc_disable_intens(idx);
+
+	spin_unlock_irqrestore(&pmu_lock, flags);
+}
+
+static irqreturn_t armv7pmu_handle_irq(int irq_num, void *dev)
+{
+	unsigned long pmnc;
+	struct perf_sample_data data;
+	struct cpu_hw_events *cpuc;
+	struct pt_regs *regs;
+	int idx;
+
+	/*
+	 * Get and reset the IRQ flags
+	 */
+	pmnc = armv7_pmnc_getreset_flags();
+
+	/*
+	 * Did an overflow occur?
+	 */
+	if (!armv7_pmnc_has_overflowed(pmnc))
+		return IRQ_NONE;
+
+	/*
+	 * Handle the counter(s) overflow(s)
+	 */
+	regs = get_irq_regs();
+
+	data.addr = 0;
+
+	cpuc = &__get_cpu_var(cpu_hw_events);
+	for (idx = 0; idx <= armpmu->num_events; ++idx) {
+		struct perf_event *event = cpuc->events[idx];
+		struct hw_perf_event *hwc;
+
+		if (!test_bit(idx, cpuc->active_mask))
+			continue;
+
+		/*
+		 * We have a single interrupt for all counters. Check that
+		 * each counter has overflowed before we process it.
+		 */
+		if (!armv7_pmnc_counter_has_overflowed(pmnc, idx))
+			continue;
+
+		hwc = &event->hw;
+		armpmu_event_update(event, hwc, idx);
+		data.period = event->hw.last_period;
+		if (!armpmu_event_set_period(event, hwc, idx))
+			continue;
+
+		if (perf_event_overflow(event, 0, &data, regs))
+			armpmu->disable(hwc, idx);
+	}
+
+	/*
+	 * Handle the pending perf events.
+	 *
+	 * Note: this call *must* be run with interrupts enabled. For
+	 * platforms that can have the PMU interrupts raised as a PMI, this
+	 * will not work.
+	 */
+	perf_event_do_pending();
+
+	return IRQ_HANDLED;
+}
+
+static void armv7pmu_start(void)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&pmu_lock, flags);
+	/* Enable all counters */
+	armv7_pmnc_write(armv7_pmnc_read() | ARMV7_PMNC_E);
+	spin_unlock_irqrestore(&pmu_lock, flags);
+}
+
+static void armv7pmu_stop(void)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&pmu_lock, flags);
+	/* Disable all counters */
+	armv7_pmnc_write(armv7_pmnc_read() & ~ARMV7_PMNC_E);
+	spin_unlock_irqrestore(&pmu_lock, flags);
+}
+
+static inline int armv7_a8_pmu_event_map(int config)
+{
+	int mapping = armv7_a8_perf_map[config];
+	if (HW_OP_UNSUPPORTED == mapping)
+		mapping = -EOPNOTSUPP;
+	return mapping;
+}
+
+static inline int armv7_a9_pmu_event_map(int config)
+{
+	int mapping = armv7_a9_perf_map[config];
+	if (HW_OP_UNSUPPORTED == mapping)
+		mapping = -EOPNOTSUPP;
+	return mapping;
+}
+
+static u64 armv7pmu_raw_event(u64 config)
+{
+	return config & 0xff;
+}
+
+static int armv7pmu_get_event_idx(struct cpu_hw_events *cpuc,
+				  struct hw_perf_event *event)
+{
+	int idx;
+
+	/* Always place a cycle counter into the cycle counter. */
+	if (event->config_base == ARMV7_PERFCTR_CPU_CYCLES) {
+		if (test_and_set_bit(ARMV7_CYCLE_COUNTER, cpuc->used_mask))
+			return -EAGAIN;
+
+		return ARMV7_CYCLE_COUNTER;
+	} else {
+		/*
+		 * For anything other than a cycle counter, try and use
+		 * the events counters
+		 */
+		for (idx = ARMV7_COUNTER0; idx <= armpmu->num_events; ++idx) {
+			if (!test_and_set_bit(idx, cpuc->used_mask))
+				return idx;
+		}
+
+		/* The counters are all in use. */
+		return -EAGAIN;
+	}
+}
+
+static struct arm_pmu armv7pmu = {
+	.handle_irq		= armv7pmu_handle_irq,
+	.enable			= armv7pmu_enable_event,
+	.disable		= armv7pmu_disable_event,
+	.raw_event		= armv7pmu_raw_event,
+	.read_counter		= armv7pmu_read_counter,
+	.write_counter		= armv7pmu_write_counter,
+	.get_event_idx		= armv7pmu_get_event_idx,
+	.start			= armv7pmu_start,
+	.stop			= armv7pmu_stop,
+	.max_period		= (1LLU << 32) - 1,
+};
+
 static int __init
 init_hw_perf_events(void)
 {
-#define CPUID_V6_MASK   0x7F000
-#define CPUID_V6_BITS   0x7B000
-        unsigned long cpuid = read_cpuid_id();
-
-        if (CPUID_V6_BITS == (cpuid & CPUID_V6_MASK)) {
-                armpmu = &armv6pmu;
-                memcpy(armpmu_perf_cache_map, armv6_perf_cache_map,
-                       sizeof(armv6_perf_cache_map));
-                perf_max_events	= armv6pmu.num_events;
-        } else {
-                pr_info("no hardware support available\n");
-                perf_max_events = -1;
-        }
+#define CPUID_V6_MASK   	0x7F000
+#define CPUID_V6_BITS   	0x7B000
+
+#define CPUID_CORTEX_A8_BITS	0xC080
+#define CPUID_CORTEX_A8_MASK	0xFFF0
+
+#define CPUID_CORTEX_A9_BITS	0xC090
+#define CPUID_CORTEX_A9_MASK	0xFFF0
+
+	unsigned long cpuid = read_cpuid_id();
+
+	/*
+	 * ARMv6 detection
+	 */
+	if (CPUID_V6_BITS == (cpuid & CPUID_V6_MASK)) {
+		armpmu = &armv6pmu;
+		memcpy(armpmu_perf_cache_map, armv6_perf_cache_map,
+			sizeof(armv6_perf_cache_map));
+		perf_max_events	= armv6pmu.num_events;
+	}
+	/*
+	 * ARMv7 detection
+	 */
+	else if (cpu_architecture() == CPU_ARCH_ARMv7) {
+		/*
+		 * Cortex-A8 detection
+		 */
+		if ((cpuid & CPUID_CORTEX_A8_MASK) == CPUID_CORTEX_A8_BITS) {
+			armv7pmu.name = ARMV7_PMU_CORTEX_A8_NAME;
+			memcpy(armpmu_perf_cache_map, armv7_a8_perf_cache_map,
+				sizeof(armv7_a8_perf_cache_map));
+			armv7pmu.event_map = armv7_a8_pmu_event_map;
+			armpmu = &armv7pmu;
+		} else
+		/*
+		 * Cortex-A9 detection
+		 */
+			if ((cpuid & CPUID_CORTEX_A9_MASK)
+			    == CPUID_CORTEX_A9_BITS) {
+				armv7pmu.name = ARMV7_PMU_CORTEX_A9_NAME;
+				memcpy(armpmu_perf_cache_map,
+					armv7_a9_perf_cache_map,
+					sizeof(armv7_a9_perf_cache_map));
+				armv7pmu.event_map = armv7_a9_pmu_event_map;
+				armpmu = &armv7pmu;
+		} else
+			perf_max_events = -1;
+
+		if (armpmu) {
+			u32 nb_cnt;
+
+			/* Read the nb of CNTx counters supported from PMNC */
+			nb_cnt = (armv7_pmnc_read() >> ARMV7_PMNC_N_SHIFT)
+				& ARMV7_PMNC_N_MASK;
+			/* Add the CPU cycles counter */
+			armv7pmu.num_events = nb_cnt + 1;
+			perf_max_events	= armv7pmu.num_events;
+
+			/* Initialize & Reset PMNC: C bit and P bit */
+			armv7_pmnc_write(ARMV7_PMNC_P | ARMV7_PMNC_C);
+		}
+	} else {
+		pr_info("no hardware support available\n");
+		perf_max_events = -1;
+	}
 
         if (armpmu)
-                pr_info("enabled with %s PMU driver\n",
-                        armpmu->name);
+		pr_info("enabled with %s PMU driver, %d counters available\n",
+			armpmu->name, armpmu->num_events);
 
         return 0;
 }
-- 
1.6.2.5.168.g3823



On Monday 21 December 2009 14:35:04 Jean Pihet wrote:
> On Monday 21 December 2009 13:43:18 Jamie Iles wrote:
> > On Mon, Dec 21, 2009 at 12:10:46PM -0000, Will Deacon wrote:
> > > > I will post a new version with the corrections.
> > >
> > > Excellent. I can comment on things tomorrow as well, but then I'm off.
> > >
> > > > Cheers and a good celebration time,
> > >
> > > Thanks. I'll post my oprofile patches when I return. They cover these
> > > issues already so hopefully you can spot anything I've missed.
> >
> > It's my last working day today. When I return I'll post an updated patch
> > series for the generic arm perfcounters and v6 support. Hopefully we can
> > get these signed off then and base the v7 support off of these.
>
> OK, good! I am continuing on v7 and I will rebase my patches as soon as
> those are out.
>
> > Enjoy the break,
>
> Thx, the same for you!
>
> > Jamie
>
> Jean
>
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel at lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply related	[flat|nested] 55+ messages in thread

* Perf Event support for ARMv7 (was: Re: [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6)
  2009-12-22 16:51                       ` Jean Pihet
@ 2009-12-28  7:57                         ` Ingo Molnar
  2009-12-29 13:52                           ` Jean Pihet
  2009-12-29 13:58                         ` Jean Pihet
  2010-01-08 22:17                         ` Woodruff, Richard
  2 siblings, 1 reply; 55+ messages in thread
From: Ingo Molnar @ 2009-12-28  7:57 UTC (permalink / raw)
  To: linux-arm-kernel


* Jean Pihet <jpihet@mvista.com> wrote:

> The code is for review, it has been checked, compiled and boot tested on 
> OMAP3 (Cortex-A8). Unfortunately I am still facing some cross compilation 
> problems of the tools/perf utility.

Have you managed to solve these cross-compilation problems? If yes, it would 
be nice to merge the fixes into upstream perf.

	Ingo

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Perf Event support for ARMv7 (was: Re: [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6)
  2009-12-28  7:57                         ` Ingo Molnar
@ 2009-12-29 13:52                           ` Jean Pihet
  2009-12-29 16:32                             ` Jamie Iles
  0 siblings, 1 reply; 55+ messages in thread
From: Jean Pihet @ 2009-12-29 13:52 UTC (permalink / raw)
  To: linux-arm-kernel

Ingo, Jamie,

On Monday 28 December 2009 08:57:48 Ingo Molnar wrote:
> * Jean Pihet <jpihet@mvista.com> wrote:
> > The code is for review, it has been checked, compiled and boot tested on
> > OMAP3 (Cortex-A8). Unfortunately I am still facing some cross compilation
> > problems of the tools/perf utility.
>
> Have you managed to solve these cross-compilation problems? If yes, it
> would be nice to merge the fixes into upstream perf.
Yes I got them resolved but I needed to make a few changes:
- -Werror removed from the CFLAGS definition in tools/perf/Makefile. Without 
the change the compilation stops after a warning about include paths.
- the rmb() macro in tools/perf/perf.h prevents the compilation. I changed it 
to the definition from arch/arm/include/asm/system.h (asm 
volatile("":::"memory")). Where is the original definition from? Why is it 
specific to perf_events?

What do you think about those changes? Should they be submitted separately?

I tested the ARMv7 code on the Cortex-A8 processor. The code looks fine but 
when I load the CPU I am running into spinlock recursion problems (in 
perf_ctx_adjust_freq).
Does those problem happen on ARMv6 as well?
Here is the backtrace:

BUG: spinlock recursion on CPU#0, cp/1049
 lock: c7ab4600, .magic: dead4ead, .owner: cp/1049, .owner_cpu: 0
[<c00415e0>] (unwind_backtrace+0x0/0xdc) from [<c0200c7c>] 
(do_raw_spin_lock+0x48/0x14c)
[<c0200c7c>] (do_raw_spin_lock+0x48/0x14c) from [<c00b7ca4>] 
(perf_ctx_adjust_freq+0xc/0x1b4)
[<c00b7ca4>] (perf_ctx_adjust_freq+0xc/0x1b4) from [<c00b7e84>] 
(perf_event_task_tick+0x38/0x9c)
[<c00b7e84>] (perf_event_task_tick+0x38/0x9c) from [<c0078d9c>] 
(update_process_times+0x3c/0x48)
[<c0078d9c>] (update_process_times+0x3c/0x48) from [<c00907fc>] 
(tick_sched_timer+0x80/0xbc)
[<c00907fc>] (tick_sched_timer+0x80/0xbc) from [<c0087f3c>] 
(__run_hrtimer+0xc8/0x158)
[<c0087f3c>] (__run_hrtimer+0xc8/0x158) from [<c008823c>] 
(hrtimer_interrupt+0x130/0x310)
[<c008823c>] (hrtimer_interrupt+0x130/0x310) from [<c0048244>] 
(omap2_gp_timer_interrupt+0x20/0x2c)
[<c0048244>] (omap2_gp_timer_interrupt+0x20/0x2c) from [<c009dc14>] 
(handle_IRQ_event+0x70/0x184)
[<c009dc14>] (handle_IRQ_event+0x70/0x184) from [<c009f7a8>] 
(handle_level_irq+0xa4/0x118)
[<c009f7a8>] (handle_level_irq+0xa4/0x118) from [<c003b070>] 
(asm_do_IRQ+0x70/0x90)
[<c003b070>] (asm_do_IRQ+0x70/0x90) from [<c042d2f0>] (__irq_svc+0x30/0x80)
Exception stack(0xc7a89890 to 0xc7a898d8)
9880:                                     00000000 c05bc1e0 0000001e 00000010
98a0: 00000004 c05bc1e0 c7a7e8a8 c05bc160 c0606550 c7bccdc0 c05db260 c7a8997c
98c0: 00000000 c7a898d8 c0041f44 c01f6034 40000013 ffffffff
[<c042d2f0>] (__irq_svc+0x30/0x80) from [<c01f6034>] 
(_test_and_set_bit_le+0x20/0x34)
BUG: spinlock lockup on CPU#0, cp/1049, c7ab4600
[<c00415e0>] (unwind_backtrace+0x0/0xdc) from [<c0200d44>] 
(do_raw_spin_lock+0x110/0x14c)
[<c0200d44>] (do_raw_spin_lock+0x110/0x14c) from [<c00b7ca4>] 
(perf_ctx_adjust_freq+0xc/0x1b4)
[<c00b7ca4>] (perf_ctx_adjust_freq+0xc/0x1b4) from [<c00b7e84>] 
(perf_event_task_tick+0x38/0x9c)
[<c00b7e84>] (perf_event_task_tick+0x38/0x9c) from [<c0078d9c>] 
(update_process_times+0x3c/0x48)
[<c0078d9c>] (update_process_times+0x3c/0x48) from [<c00907fc>] 
(tick_sched_timer+0x80/0xbc)
[<c00907fc>] (tick_sched_timer+0x80/0xbc) from [<c0087f3c>] 
(__run_hrtimer+0xc8/0x158)
[<c0087f3c>] (__run_hrtimer+0xc8/0x158) from [<c008823c>] 
(hrtimer_interrupt+0x130/0x310)
[<c008823c>] (hrtimer_interrupt+0x130/0x310) from [<c0048244>] 
(omap2_gp_timer_interrupt+0x20/0x2c)
[<c0048244>] (omap2_gp_timer_interrupt+0x20/0x2c) from [<c009dc14>] 
(handle_IRQ_event+0x70/0x184)
[<c009dc14>] (handle_IRQ_event+0x70/0x184) from [<c009f7a8>] 
(handle_level_irq+0xa4/0x118)
[<c009f7a8>] (handle_level_irq+0xa4/0x118) from [<c003b070>] 
(asm_do_IRQ+0x70/0x90)
[<c003b070>] (asm_do_IRQ+0x70/0x90) from [<c042d2f0>] (__irq_svc+0x30/0x80)
Exception stack(0xc7a89890 to 0xc7a898d8)
9880:                                     00000000 c05bc1e0 0000001e 00000010
98a0: 00000004 c05bc1e0 c7a7e8a8 c05bc160 c0606550 c7bccdc0 c05db260 c7a8997c
98c0: 00000000 c7a898d8 c0041f44 c01f6034 40000013 ffffffff

>
> 	Ingo

Regards,
Jean

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Perf Event support for ARMv7 (was: Re: [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6)
  2009-12-22 16:51                       ` Jean Pihet
  2009-12-28  7:57                         ` Ingo Molnar
@ 2009-12-29 13:58                         ` Jean Pihet
  2010-01-04 16:52                           ` Will Deacon
  2010-01-08 22:17                         ` Woodruff, Richard
  2 siblings, 1 reply; 55+ messages in thread
From: Jean Pihet @ 2009-12-29 13:58 UTC (permalink / raw)
  To: linux-arm-kernel

Hi,

Here is the updated patch after testing on HW.
I will rebase it on Jamie's latest patch set as soon as they are out.

Feedback is welcome!

The comments and remarks are the same as in the previous version:
It now supports:
- Cortex-A8 and Cortex-A9 processors,
- The low level code has been completely redesigned to allow the dynamic 
detection of the number of available counters, based on the PMCR value,
- runtime detection of the CPU arch (v6 or v7) and model (Cortex-A8 or 
Cortex-A9)

The code is for review, it has been checked, compiled and tested on OMAP3 
(Cortex-A8).

Some remarks and questions:

1) The number of available counters can reach 32 on ARMv7, so the macro ?
ARMPMU_MAX_HWEVENTS is now defined as 32. Is that correct?

2) Please note that the Cortex-A9 events do not easily map to the predefined 
events. Cf. armv7_a9_perf_map and armv7_a9_perf_cache_map in the code.
- the PERF_COUNT_HW_INSTRUCTIONS event is not found. It looks like the number 
of instructions is calculated by adding events numbers (events from 0x70 till 
0x74: MAIN_UNIT_EXECUTED_INST, SECOND_UNIT_EXECUTED_INST, 
LD_ST_UNIT_EXECUTED_INST, FP_EXECUTED_INST and NEON_EXECUTED_INST),
- the HW_BRANCH events are not found
- the global cache events 0x50 and 0x51 define the COHERENT_LINE_HIT and 
COHERENT_LINE_MISS events, is that correct?
- L1 and L2 cache events are not found. Those could be available in separate 
PL310 registers, TBC
- no TLB events excepted the ITLB_MISS event are found.

Any thoughts?

Regards,
Jean

---
>From 9d559085417187870be2823e8a10fba6a513aae9 Mon Sep 17 00:00:00 2001
From: Jean Pihet <jpihet@mvista.com>
Date: Fri, 18 Dec 2009 17:46:21 +0100
Subject: [PATCH] arm/perfevents: add support for ARMv7

Adds the Performance Events support for ARMv7 processor, using
the PMNC unit in HW.

Supports the following:
- Cortex-A8 and Cortex-A9 processors,
- dynamic detection of the number of available counters,
   based on the PMCR value,
- runtime detection of the CPU arch (v6 or v7)
   and model (Cortex-A8 or Cortex-A9)

Tested on OMAP3530 (Cortex-A8) only.

Signed-off-by: Jean Pihet <jpihet@mvista.com>
---
 arch/arm/Kconfig             |    2 +-
 arch/arm/kernel/perf_event.c |  923 
+++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 907 insertions(+), 18 deletions(-)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index e5bd97e..5c6afdb 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1172,7 +1172,7 @@ config HIGHPTE
 
 config HW_PERF_EVENTS
 	bool "Enable hardware performance counter support for perf events"
-	depends on PERF_EVENTS && CPU_HAS_PMU && CPU_V6
+	depends on PERF_EVENTS && CPU_HAS_PMU && (CPU_V6 || CPU_V7)
 	default y
 	help
 	  Enable hardware performance counter support for perf events. If
diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
index abb5267..d56c2cd 100644
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -4,6 +4,7 @@
  * ARM performance counter support.
  *
  * Copyright (C) 2009 picoChip Designs, Ltd., Jamie Iles
+ * ARMv7 support: Jean Pihet <jpihet@mvista.com>
  *
  * This code is based on the sparc64 perf event code, which is in turn based
  * on the x86 code. Callchain code is based on the ARM OProfile backtrace
@@ -35,8 +36,12 @@ DEFINE_SPINLOCK(pmu_lock);
  * ARMv6 supports a maximum of 3 events, starting from index 1. If we add
  * another platform that supports more, we need to increase this to be the
  * largest of all platforms.
+ *
+ * ARMv7 supports up to 32 events:
+ *  cycle counter CCNT + 31 events counters CNT0..30.
+ *  Cortex-A8 has 1+4 counters, Cortex-A9 has 1+6 counters
  */
-#define ARMPMU_MAX_HWEVENTS		4
+#define ARMPMU_MAX_HWEVENTS		32
 
 /* The events for a given CPU. */
 struct cpu_hw_events {
@@ -61,7 +66,7 @@ struct cpu_hw_events {
 DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
 
 struct arm_pmu {
-	const char	*name;
+	char		*name;
 	irqreturn_t	(*handle_irq)(int irq_num, void *dev);
 	void		(*enable)(struct hw_perf_event *evt, int idx);
 	void		(*disable)(struct hw_perf_event *evt, int idx);
@@ -965,26 +970,910 @@ static struct arm_pmu armv6pmu = {
 	.max_period		= (1LLU << 32) - 1,
 };
 
+/*
+ * ARMv7 Cortex-A8 and Cortex-A9 Performance Events handling code.
+ *
+ * Copied from ARMv6 code, with the low level code inspired
+ *  by the ARMv7 Oprofile code.
+ *
+ * Cortex-A8 has up to 4 configurable performance counters and
+ *  a single cycle counter.
+ * Cortex-A9 has up to 31 configurable performance counters and
+ *  a single cycle counter.
+ *
+ * All counters can be enabled/disabled and IRQ masked separately. The cycle
+ *  counter and all 4 performance counters together can be reset separately.
+ */
+
+#define ARMV7_PMU_CORTEX_A8_NAME		"ARMv7 Cortex-A8"
+
+#define ARMV7_PMU_CORTEX_A9_NAME		"ARMv7 Cortex-A9"
+
+/* Common ARMv7 event types */
+enum armv7_perf_types {
+	ARMV7_PERFCTR_PMNC_SW_INCR		= 0x00,
+	ARMV7_PERFCTR_IFETCH_MISS		= 0x01,
+	ARMV7_PERFCTR_ITLB_MISS			= 0x02,
+	ARMV7_PERFCTR_DCACHE_REFILL		= 0x03,
+	ARMV7_PERFCTR_DCACHE_ACCESS		= 0x04,
+	ARMV7_PERFCTR_DTLB_REFILL		= 0x05,
+	ARMV7_PERFCTR_DREAD			= 0x06,
+	ARMV7_PERFCTR_DWRITE			= 0x07,
+
+	ARMV7_PERFCTR_EXC_TAKEN			= 0x09,
+	ARMV7_PERFCTR_EXC_EXECUTED		= 0x0A,
+	ARMV7_PERFCTR_CID_WRITE			= 0x0B,
+	ARMV7_PERFCTR_PC_WRITE			= 0x0C,
+	ARMV7_PERFCTR_PC_IMM_BRANCH		= 0x0D,
+	ARMV7_PERFCTR_UNALIGNED_ACCESS		= 0x0F,
+	ARMV7_PERFCTR_PC_BRANCH_MIS_PRED	= 0x10,
+	ARMV7_PERFCTR_CLOCK_CYCLES		= 0x11,
+
+	ARMV7_PERFCTR_PC_BRANCH_MIS_USED	= 0x12,
+
+	ARMV7_PERFCTR_CPU_CYCLES		= 0xFF
+};
+
+/* ARMv7 Cortex-A8 specific event types */
+enum armv7_a8_perf_types {
+	ARMV7_PERFCTR_INSTR_EXECUTED		= 0x08,
+
+	ARMV7_PERFCTR_PC_PROC_RETURN		= 0x0E,
+
+	ARMV7_PERFCTR_WRITE_BUFFER_FULL		= 0x40,
+	ARMV7_PERFCTR_L2_STORE_MERGED		= 0x41,
+	ARMV7_PERFCTR_L2_STORE_BUFF		= 0x42,
+	ARMV7_PERFCTR_L2_ACCESS			= 0x43,
+	ARMV7_PERFCTR_L2_CACH_MISS		= 0x44,
+	ARMV7_PERFCTR_AXI_READ_CYCLES		= 0x45,
+	ARMV7_PERFCTR_AXI_WRITE_CYCLES		= 0x46,
+	ARMV7_PERFCTR_MEMORY_REPLAY		= 0x47,
+	ARMV7_PERFCTR_UNALIGNED_ACCESS_REPLAY	= 0x48,
+	ARMV7_PERFCTR_L1_DATA_MISS		= 0x49,
+	ARMV7_PERFCTR_L1_INST_MISS		= 0x4A,
+	ARMV7_PERFCTR_L1_DATA_COLORING		= 0x4B,
+	ARMV7_PERFCTR_L1_NEON_DATA		= 0x4C,
+	ARMV7_PERFCTR_L1_NEON_CACH_DATA		= 0x4D,
+	ARMV7_PERFCTR_L2_NEON			= 0x4E,
+	ARMV7_PERFCTR_L2_NEON_HIT		= 0x4F,
+	ARMV7_PERFCTR_L1_INST			= 0x50,
+	ARMV7_PERFCTR_PC_RETURN_MIS_PRED	= 0x51,
+	ARMV7_PERFCTR_PC_BRANCH_FAILED		= 0x52,
+	ARMV7_PERFCTR_PC_BRANCH_TAKEN		= 0x53,
+	ARMV7_PERFCTR_PC_BRANCH_EXECUTED	= 0x54,
+	ARMV7_PERFCTR_OP_EXECUTED		= 0x55,
+	ARMV7_PERFCTR_CYCLES_INST_STALL		= 0x56,
+	ARMV7_PERFCTR_CYCLES_INST		= 0x57,
+	ARMV7_PERFCTR_CYCLES_NEON_DATA_STALL	= 0x58,
+	ARMV7_PERFCTR_CYCLES_NEON_INST_STALL	= 0x59,
+	ARMV7_PERFCTR_NEON_CYCLES		= 0x5A,
+
+	ARMV7_PERFCTR_PMU0_EVENTS		= 0x70,
+	ARMV7_PERFCTR_PMU1_EVENTS		= 0x71,
+	ARMV7_PERFCTR_PMU_EVENTS		= 0x72,
+};
+
+/* ARMv7 Cortex-A9 specific event types */
+enum armv7_a9_perf_types {
+	ARMV7_PERFCTR_JAVA_HW_BYTECODE_EXEC	= 0x40,
+	ARMV7_PERFCTR_JAVA_SW_BYTECODE_EXEC	= 0x41,
+	ARMV7_PERFCTR_JAZELLE_BRANCH_EXEC	= 0x42,
+
+	ARMV7_PERFCTR_COHERENT_LINE_MISS	= 0x50,
+	ARMV7_PERFCTR_COHERENT_LINE_HIT		= 0x51,
+
+	ARMV7_PERFCTR_ICACHE_DEP_STALL_CYCLES	= 0x60,
+	ARMV7_PERFCTR_DCACHE_DEP_STALL_CYCLES	= 0x61,
+	ARMV7_PERFCTR_TLB_MISS_DEP_STALL_CYCLES	= 0x62,
+	ARMV7_PERFCTR_STREX_EXECUTED_PASSED	= 0x63,
+	ARMV7_PERFCTR_STREX_EXECUTED_FAILED	= 0x64,
+	ARMV7_PERFCTR_DATA_EVICTION		= 0x65,
+	ARMV7_PERFCTR_ISSUE_STAGE_NO_INST	= 0x66,
+	ARMV7_PERFCTR_ISSUE_STAGE_EMPTY		= 0x67,
+	ARMV7_PERFCTR_INST_OUT_OF_RENAME_STAGE	= 0x68,
+
+	ARMV7_PERFCTR_PREDICTABLE_FUNCT_RETURNS	= 0x6E,
+
+	ARMV7_PERFCTR_MAIN_UNIT_EXECUTED_INST	= 0x70,
+	ARMV7_PERFCTR_SECOND_UNIT_EXECUTED_INST	= 0x71,
+	ARMV7_PERFCTR_LD_ST_UNIT_EXECUTED_INST	= 0x72,
+	ARMV7_PERFCTR_FP_EXECUTED_INST		= 0x73,
+	ARMV7_PERFCTR_NEON_EXECUTED_INST	= 0x74,
+
+	ARMV7_PERFCTR_PLD_FULL_DEP_STALL_CYCLES	= 0x80,
+	ARMV7_PERFCTR_DATA_WR_DEP_STALL_CYCLES	= 0x81,
+	ARMV7_PERFCTR_ITLB_MISS_DEP_STALL_CYCLES	= 0x82,
+	ARMV7_PERFCTR_DTLB_MISS_DEP_STALL_CYCLES	= 0x83,
+	ARMV7_PERFCTR_MICRO_ITLB_MISS_DEP_STALL_CYCLES	= 0x84,
+	ARMV7_PERFCTR_MICRO_DTLB_MISS_DEP_STALL_CYCLES 	= 0x85,
+	ARMV7_PERFCTR_DMB_DEP_STALL_CYCLES	= 0x86,
+
+	ARMV7_PERFCTR_INTGR_CLK_ENABLED_CYCLES	= 0x8A,
+	ARMV7_PERFCTR_DATA_ENGINE_CLK_EN_CYCLES	= 0x8B,
+
+	ARMV7_PERFCTR_ISB_INST			= 0x90,
+	ARMV7_PERFCTR_DSB_INST			= 0x91,
+	ARMV7_PERFCTR_DMB_INST			= 0x92,
+	ARMV7_PERFCTR_EXT_INTERRUPTS		= 0x93,
+
+	ARMV7_PERFCTR_PLE_CACHE_LINE_RQST_COMPLETED	= 0xA0,
+	ARMV7_PERFCTR_PLE_CACHE_LINE_RQST_SKIPPED	= 0xA1,
+	ARMV7_PERFCTR_PLE_FIFO_FLUSH		= 0xA2,
+	ARMV7_PERFCTR_PLE_RQST_COMPLETED	= 0xA3,
+	ARMV7_PERFCTR_PLE_FIFO_OVERFLOW		= 0xA4,
+	ARMV7_PERFCTR_PLE_RQST_PROG		= 0xA5
+};
+
+/*
+ * Cortex-A8 HW events mapping
+ *
+ * The hardware events that we support. We do support cache operations but
+ * we have harvard caches and no way to combine instruction and data
+ * accesses/misses in hardware.
+ */
+static const unsigned armv7_a8_perf_map[PERF_COUNT_HW_MAX] = {
+	[PERF_COUNT_HW_CPU_CYCLES]	    = ARMV7_PERFCTR_CPU_CYCLES,
+	[PERF_COUNT_HW_INSTRUCTIONS]	    = ARMV7_PERFCTR_INSTR_EXECUTED,
+	[PERF_COUNT_HW_CACHE_REFERENCES]    = HW_OP_UNSUPPORTED,
+	[PERF_COUNT_HW_CACHE_MISSES]	    = HW_OP_UNSUPPORTED,
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = ARMV7_PERFCTR_PC_BRANCH_TAKEN,
+	[PERF_COUNT_HW_BRANCH_MISSES]	    = ARMV7_PERFCTR_PC_BRANCH_FAILED,
+	[PERF_COUNT_HW_BUS_CYCLES]	    = ARMV7_PERFCTR_CLOCK_CYCLES,
+};
+
+static const unsigned armv7_a8_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
+					  [PERF_COUNT_HW_CACHE_OP_MAX]
+					  [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+	[C(L1D)] = {
+		/*
+		 * The performance counters don't differentiate between read
+		 * and write accesses/misses so this isn't strictly correct,
+		 * but it's the best we can do. Writes and reads get
+		 * combined.
+		 */
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_DCACHE_ACCESS,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DATA_MISS,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_DCACHE_ACCESS,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DATA_MISS,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+	[C(L1I)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_INST,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_INST_MISS,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_INST,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_INST_MISS,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+	[C(LL)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L2_ACCESS,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L2_CACH_MISS,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L2_ACCESS,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L2_CACH_MISS,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+	[C(DTLB)] = {
+		/*
+		 * Only ITLB misses and DTLB refills are supported.
+		 * If users want the DTLB refills misses a raw counter
+		 * must be used.
+		 */
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+	[C(ITLB)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_ITLB_MISS,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_ITLB_MISS,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+	[C(BPU)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_PC_BRANCH_TAKEN,
+			[C(RESULT_MISS)]
+					= ARMV7_PERFCTR_PC_BRANCH_FAILED,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_PC_BRANCH_TAKEN,
+			[C(RESULT_MISS)]
+					= ARMV7_PERFCTR_PC_BRANCH_FAILED,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+};
+
+/*
+ * Cortex-A9 HW events mapping
+ */
+static const unsigned armv7_a9_perf_map[PERF_COUNT_HW_MAX] = {
+	[PERF_COUNT_HW_CPU_CYCLES]	    = ARMV7_PERFCTR_CPU_CYCLES,
+	[PERF_COUNT_HW_INSTRUCTIONS]	    = HW_OP_UNSUPPORTED,
+	[PERF_COUNT_HW_CACHE_REFERENCES]    = ARMV7_PERFCTR_COHERENT_LINE_HIT,
+	[PERF_COUNT_HW_CACHE_MISSES]	    = ARMV7_PERFCTR_COHERENT_LINE_MISS,
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = HW_OP_UNSUPPORTED,
+	[PERF_COUNT_HW_BRANCH_MISSES]	    = HW_OP_UNSUPPORTED,
+	[PERF_COUNT_HW_BUS_CYCLES]	    = ARMV7_PERFCTR_CLOCK_CYCLES,
+};
+
+static const unsigned armv7_a9_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
+					  [PERF_COUNT_HW_CACHE_OP_MAX]
+					  [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+	[C(L1D)] = {
+		/*
+		 * The performance counters don't differentiate between read
+		 * and write accesses/misses so this isn't strictly correct,
+		 * but it's the best we can do. Writes and reads get
+		 * combined.
+		 */
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_DCACHE_ACCESS,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_DCACHE_ACCESS,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+	[C(L1I)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+	[C(LL)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+	[C(DTLB)] = {
+		/*
+		 * Only ITLB misses and DTLB refills are supported.
+		 * If users want the DTLB refills misses a raw counter
+		 * must be used.
+		 */
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+	[C(ITLB)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_ITLB_MISS,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_ITLB_MISS,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+	[C(BPU)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+};
+
+/*
+ * Perf Events counters
+ */
+enum armv7_counters {
+	ARMV7_CYCLE_COUNTER 		= 1,	/* Cycle counter */
+	ARMV7_COUNTER0			= 2,	/* First event counter */
+};
+
+/* The last event counter is (ARMV7_COUNTER0 + armpmu->num_events) */
+#define	ARMV7_COUNTER_LAST		(ARMV7_COUNTER0 + armpmu->num_events)
+
+/*
+ * ARMv7 low level PMNC access
+ */
+
+/*
+ * Per-CPU PMNC: config reg
+ */
+#define ARMV7_PMNC_E		(1 << 0) /* Enable all counters */
+#define ARMV7_PMNC_P		(1 << 1) /* Reset all counters */
+#define ARMV7_PMNC_C		(1 << 2) /* Cycle counter reset */
+#define ARMV7_PMNC_D		(1 << 3) /* CCNT counts every 64th cpu cycle */
+#define ARMV7_PMNC_X		(1 << 4) /* Export to ETM */
+#define ARMV7_PMNC_DP		(1 << 5) /* Disable CCNT if non-invasive debug*/
+#define	ARMV7_PMNC_N_SHIFT	11	 /* Number of counters supported */
+#define	ARMV7_PMNC_N_MASK	0x1f
+#define	ARMV7_PMNC_MASK		0x3f	 /* Mask for writable bits */
+
+/*
+ * Available counters
+ */
+#define ARMV7_CNT0 		0	/* First event counter */
+#define ARMV7_CCNT 		31	/* Cycle counter */
+
+#define ARMV7_A8_CNTMAX		5	/* Cortex-A8: up to 4 counters + CCNT */
+#define ARMV7_A9_CNTMAX		32	/* Cortex-A9: up to 31 counters + CCNT*/
+
+/* Perf Event to low level counters mapping */
+#define ARMV7_EVENT_CNT_TO_CNTx	(ARMV7_COUNTER0 - ARMV7_CNT0)
+
+/*
+ * CNTENS: counters enable reg
+ */
+#define ARMV7_CNTENS_P(idx)	(1 << (idx - ARMV7_EVENT_CNT_TO_CNTx))
+#define ARMV7_CNTENS_C		(1 << ARMV7_CCNT)
+
+/*
+ * CNTENC: counters disable reg
+ */
+#define ARMV7_CNTENC_P(idx)	(1 << (idx - ARMV7_EVENT_CNT_TO_CNTx))
+#define ARMV7_CNTENC_C		(1 << ARMV7_CCNT)
+
+/*
+ * INTENS: counters overflow interrupt enable reg
+ */
+#define ARMV7_INTENS_P(idx)	(1 << (idx - ARMV7_EVENT_CNT_TO_CNTx))
+#define ARMV7_INTENS_C		(1 << ARMV7_CCNT)
+
+/*
+ * INTENC: counters overflow interrupt disable reg
+ */
+#define ARMV7_INTENC_P(idx)	(1 << (idx - ARMV7_EVENT_CNT_TO_CNTx))
+#define ARMV7_INTENC_C		(1 << ARMV7_CCNT)
+
+/*
+ * EVTSEL: Event selection reg
+ */
+#define	ARMV7_EVTSEL_MASK	0x7f		/* Mask for writable bits */
+
+/*
+ * SELECT: Counter selection reg
+ */
+#define	ARMV7_SELECT_MASK	0x1f		/* Mask for writable bits */
+
+/*
+ * FLAG: counters overflow flag status reg
+ */
+#define ARMV7_FLAG_P(idx)	(1 << (idx - ARMV7_EVENT_CNT_TO_CNTx))
+#define ARMV7_FLAG_C		(1 << ARMV7_CCNT)
+#define	ARMV7_FLAG_MASK		0xffffffff	/* Mask for writable bits */
+#define	ARMV7_OVERFLOWED_MASK	ARMV7_FLAG_MASK
+
+static inline unsigned long armv7_pmnc_read(void)
+{
+	u32 val;
+	asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r"(val));
+	return val;
+}
+
+static inline void armv7_pmnc_write(unsigned long val)
+{
+	val &= ARMV7_PMNC_MASK;
+	asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r"(val));
+}
+
+static inline int armv7_pmnc_has_overflowed(unsigned long pmnc)
+{
+	return pmnc & ARMV7_OVERFLOWED_MASK;
+}
+
+static inline int armv7_pmnc_counter_has_overflowed(unsigned long pmnc,
+					enum armv7_counters counter)
+{
+	int ret;
+
+	if (counter == ARMV7_CYCLE_COUNTER)
+		ret = pmnc & ARMV7_FLAG_C;
+	else if ((counter >= ARMV7_COUNTER0) && (counter <= ARMV7_COUNTER_LAST))
+		ret = pmnc & ARMV7_FLAG_P(counter);
+	else
+		BUG();
+
+	return ret;
+}
+
+static inline int armv7_pmnc_select_counter(unsigned int idx)
+{
+	u32 val;
+
+	if ((idx < ARMV7_COUNTER0) || (idx > ARMV7_COUNTER_LAST)) {
+		pr_err("CPU%u selecting wrong PMNC counter"
+			" %d\n", smp_processor_id(), idx);
+		return -1;
+	}
+
+	val = (idx - ARMV7_EVENT_CNT_TO_CNTx) & ARMV7_SELECT_MASK;
+	asm volatile("mcr p15, 0, %0, c9, c12, 5" : : "r" (val));
+
+	return idx;
+}
+
+static inline u32 armv7pmu_read_counter(int idx)
+{
+	unsigned long value = 0;
+
+	if (idx == ARMV7_CYCLE_COUNTER)
+		asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r" (value));
+	else if ((idx >= ARMV7_COUNTER0) && (idx <= ARMV7_COUNTER_LAST)) {
+		if (armv7_pmnc_select_counter(idx) == idx)
+			asm volatile("mrc p15, 0, %0, c9, c13, 2"
+				     : "=r" (value));
+	} else
+		BUG();
+
+	return value;
+}
+
+static inline void armv7pmu_write_counter(int idx, u32 value)
+{
+	if (idx == ARMV7_CYCLE_COUNTER)
+		asm volatile("mcr p15, 0, %0, c9, c13, 0" : : "r" (value));
+	else if ((idx >= ARMV7_COUNTER0) && (idx <= ARMV7_COUNTER_LAST)) {
+		if (armv7_pmnc_select_counter(idx) == idx)
+			asm volatile("mcr p15, 0, %0, c9, c13, 2"
+				     : : "r" (value));
+	} else
+		BUG();
+}
+
+static inline void armv7_pmnc_write_evtsel(unsigned int idx, u32 val)
+{
+	if (armv7_pmnc_select_counter(idx) == idx) {
+		val &= ARMV7_EVTSEL_MASK;
+		asm volatile("mcr p15, 0, %0, c9, c13, 1" : : "r" (val));
+	}
+}
+
+static inline u32 armv7_pmnc_enable_counter(unsigned int idx)
+{
+	u32 val;
+
+	if ((idx != ARMV7_CYCLE_COUNTER) &&
+	    ((idx < ARMV7_COUNTER0) || (idx > ARMV7_COUNTER_LAST))) {
+		pr_err("CPU%u enabling wrong PMNC counter"
+			" %d\n", smp_processor_id(), idx);
+		return -1;
+	}
+
+	if (idx == ARMV7_CYCLE_COUNTER)
+		val = ARMV7_CNTENS_C;
+	else
+		val = ARMV7_CNTENS_P(idx);
+
+	asm volatile("mcr p15, 0, %0, c9, c12, 1" : : "r" (val));
+
+	return idx;
+}
+
+static inline u32 armv7_pmnc_disable_counter(unsigned int idx)
+{
+	u32 val;
+
+
+	if ((idx != ARMV7_CYCLE_COUNTER) &&
+	    ((idx < ARMV7_COUNTER0) || (idx > ARMV7_COUNTER_LAST))) {
+		pr_err("CPU%u disabling wrong PMNC counter"
+			" %d\n", smp_processor_id(), idx);
+		return -1;
+	}
+
+	if (idx == ARMV7_CYCLE_COUNTER)
+		val = ARMV7_CNTENC_C;
+	else
+		val = ARMV7_CNTENC_P(idx);
+
+	asm volatile("mcr p15, 0, %0, c9, c12, 2" : : "r" (val));
+
+	return idx;
+}
+
+static inline u32 armv7_pmnc_enable_intens(unsigned int idx)
+{
+	u32 val;
+
+	if ((idx != ARMV7_CYCLE_COUNTER) &&
+	    ((idx < ARMV7_COUNTER0) || (idx > ARMV7_COUNTER_LAST))) {
+		pr_err("CPU%u enabling wrong PMNC counter"
+			" interrupt enable %d\n", smp_processor_id(), idx);
+		return -1;
+	}
+
+	if (idx == ARMV7_CYCLE_COUNTER)
+		val = ARMV7_INTENS_C;
+	else
+		val = ARMV7_INTENS_P(idx);
+
+	asm volatile("mcr p15, 0, %0, c9, c14, 1" : : "r" (val));
+
+	return idx;
+}
+
+static inline u32 armv7_pmnc_disable_intens(unsigned int idx)
+{
+	u32 val;
+
+	if ((idx != ARMV7_CYCLE_COUNTER) &&
+	    ((idx < ARMV7_COUNTER0) || (idx > ARMV7_COUNTER_LAST))) {
+		pr_err("CPU%u disabling wrong PMNC counter"
+			" interrupt enable %d\n", smp_processor_id(), idx);
+		return -1;
+	}
+
+	if (idx == ARMV7_CYCLE_COUNTER)
+		val = ARMV7_INTENC_C;
+	else
+		val = ARMV7_INTENC_P(idx);
+
+	asm volatile("mcr p15, 0, %0, c9, c14, 2" : : "r" (val));
+
+	return idx;
+}
+
+static inline u32 armv7_pmnc_getreset_flags(void)
+{
+	u32 val;
+
+	/* Read */
+	asm volatile("mrc p15, 0, %0, c9, c12, 3" : "=r" (val));
+
+	/* Write to clear flags */
+	val &= ARMV7_FLAG_MASK;
+	asm volatile("mcr p15, 0, %0, c9, c12, 3" : : "r" (val));
+
+	return val;
+}
+
+void armv7pmu_enable_event(struct hw_perf_event *hwc, int idx)
+{
+	unsigned long flags;
+
+	/*
+	 * Enable counter and interrupt, and set the counter to count
+	 * the event that we're interested in.
+	 */
+	spin_lock_irqsave(&pmu_lock, flags);
+
+	/*
+	 * Disable counter
+	 */
+	armv7_pmnc_disable_counter(idx);
+
+	/*
+	 * Set event (if destined for PMNx counters)
+	 * We don't need to set the event if it's a cycle count
+	 */
+	if (idx != ARMV7_CYCLE_COUNTER)
+		armv7_pmnc_write_evtsel(idx, hwc->config_base);
+
+	/*
+	 * Enable interrupt for this counter
+	 */
+	armv7_pmnc_enable_intens(idx);
+
+	/*
+	 * Enable counter
+	 */
+	armv7_pmnc_enable_counter(idx);
+
+	spin_unlock_irqrestore(&pmu_lock, flags);
+}
+
+static void armv7pmu_disable_event(struct hw_perf_event *hwc, int idx)
+{
+	unsigned long flags;
+
+	/*
+	 * Disable counter and interrupt
+	 */
+	spin_lock_irqsave(&pmu_lock, flags);
+
+	/*
+	 * Disable counter
+	 */
+	armv7_pmnc_disable_counter(idx);
+
+	/*
+	 * Disable interrupt for this counter
+	 */
+	armv7_pmnc_disable_intens(idx);
+
+	spin_unlock_irqrestore(&pmu_lock, flags);
+}
+
+static irqreturn_t armv7pmu_handle_irq(int irq_num, void *dev)
+{
+	unsigned long pmnc;
+	struct perf_sample_data data;
+	struct cpu_hw_events *cpuc;
+	struct pt_regs *regs;
+	int idx;
+
+	/*
+	 * Get and reset the IRQ flags
+	 */
+	pmnc = armv7_pmnc_getreset_flags();
+
+	/*
+	 * Did an overflow occur?
+	 */
+	if (!armv7_pmnc_has_overflowed(pmnc))
+		return IRQ_NONE;
+
+	/*
+	 * Handle the counter(s) overflow(s)
+	 */
+	regs = get_irq_regs();
+
+	data.addr = 0;
+
+	cpuc = &__get_cpu_var(cpu_hw_events);
+	for (idx = 0; idx <= armpmu->num_events; ++idx) {
+		struct perf_event *event = cpuc->events[idx];
+		struct hw_perf_event *hwc;
+
+		if (!test_bit(idx, cpuc->active_mask))
+			continue;
+
+		/*
+		 * We have a single interrupt for all counters. Check that
+		 * each counter has overflowed before we process it.
+		 */
+		if (!armv7_pmnc_counter_has_overflowed(pmnc, idx))
+			continue;
+
+		hwc = &event->hw;
+		armpmu_event_update(event, hwc, idx);
+		data.period = event->hw.last_period;
+		if (!armpmu_event_set_period(event, hwc, idx))
+			continue;
+
+		if (perf_event_overflow(event, 0, &data, regs))
+			armpmu->disable(hwc, idx);
+	}
+
+	/*
+	 * Handle the pending perf events.
+	 *
+	 * Note: this call *must* be run with interrupts enabled. For
+	 * platforms that can have the PMU interrupts raised as a PMI, this
+	 * will not work.
+	 */
+	perf_event_do_pending();
+
+	return IRQ_HANDLED;
+}
+
+static void armv7pmu_start(void)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&pmu_lock, flags);
+	/* Enable all counters */
+	armv7_pmnc_write(armv7_pmnc_read() | ARMV7_PMNC_E);
+	spin_unlock_irqrestore(&pmu_lock, flags);
+}
+
+static void armv7pmu_stop(void)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&pmu_lock, flags);
+	/* Disable all counters */
+	armv7_pmnc_write(armv7_pmnc_read() & ~ARMV7_PMNC_E);
+	spin_unlock_irqrestore(&pmu_lock, flags);
+}
+
+static inline int armv7_a8_pmu_event_map(int config)
+{
+	int mapping = armv7_a8_perf_map[config];
+	if (HW_OP_UNSUPPORTED == mapping)
+		mapping = -EOPNOTSUPP;
+	return mapping;
+}
+
+static inline int armv7_a9_pmu_event_map(int config)
+{
+	int mapping = armv7_a9_perf_map[config];
+	if (HW_OP_UNSUPPORTED == mapping)
+		mapping = -EOPNOTSUPP;
+	return mapping;
+}
+
+static u64 armv7pmu_raw_event(u64 config)
+{
+	return config & 0xff;
+}
+
+static int armv7pmu_get_event_idx(struct cpu_hw_events *cpuc,
+				  struct hw_perf_event *event)
+{
+	int idx;
+
+	/* Always place a cycle counter into the cycle counter. */
+	if (event->config_base == ARMV7_PERFCTR_CPU_CYCLES) {
+		if (test_and_set_bit(ARMV7_CYCLE_COUNTER, cpuc->used_mask))
+			return -EAGAIN;
+
+		return ARMV7_CYCLE_COUNTER;
+	} else {
+		/*
+		 * For anything other than a cycle counter, try and use
+		 * the events counters
+		 */
+		for (idx = ARMV7_COUNTER0; idx <= armpmu->num_events; ++idx) {
+			if (!test_and_set_bit(idx, cpuc->used_mask))
+				return idx;
+		}
+
+		/* The counters are all in use. */
+		return -EAGAIN;
+	}
+}
+
+static struct arm_pmu armv7pmu = {
+	.handle_irq		= armv7pmu_handle_irq,
+	.enable			= armv7pmu_enable_event,
+	.disable		= armv7pmu_disable_event,
+	.raw_event		= armv7pmu_raw_event,
+	.read_counter		= armv7pmu_read_counter,
+	.write_counter		= armv7pmu_write_counter,
+	.get_event_idx		= armv7pmu_get_event_idx,
+	.start			= armv7pmu_start,
+	.stop			= armv7pmu_stop,
+	.max_period		= (1LLU << 32) - 1,
+};
+
 static int __init
 init_hw_perf_events(void)
 {
-#define CPUID_V6_MASK   0x7F000
-#define CPUID_V6_BITS   0x7B000
-        unsigned long cpuid = read_cpuid_id();
-
-        if (CPUID_V6_BITS == (cpuid & CPUID_V6_MASK)) {
-                armpmu = &armv6pmu;
-                memcpy(armpmu_perf_cache_map, armv6_perf_cache_map,
-                       sizeof(armv6_perf_cache_map));
-                perf_max_events	= armv6pmu.num_events;
-        } else {
-                pr_info("no hardware support available\n");
-                perf_max_events = -1;
-        }
+#define CPUID_V6_MASK   	0x7F000
+#define CPUID_V6_BITS   	0x7B000
+
+#define CPUID_CORTEX_A8_BITS	0xC080
+#define CPUID_CORTEX_A8_MASK	0xFFF0
+
+#define CPUID_CORTEX_A9_BITS	0xC090
+#define CPUID_CORTEX_A9_MASK	0xFFF0
+
+	unsigned long cpuid = read_cpuid_id();
+
+	/*
+	 * ARMv6 detection
+	 */
+	if (CPUID_V6_BITS == (cpuid & CPUID_V6_MASK)) {
+		armpmu = &armv6pmu;
+		memcpy(armpmu_perf_cache_map, armv6_perf_cache_map,
+			sizeof(armv6_perf_cache_map));
+		perf_max_events	= armv6pmu.num_events;
+	}
+	/*
+	 * ARMv7 detection
+	 */
+	else if (cpu_architecture() == CPU_ARCH_ARMv7) {
+		/*
+		 * Cortex-A8 detection
+		 */
+		if ((cpuid & CPUID_CORTEX_A8_MASK) == CPUID_CORTEX_A8_BITS) {
+			armv7pmu.name = ARMV7_PMU_CORTEX_A8_NAME;
+			memcpy(armpmu_perf_cache_map, armv7_a8_perf_cache_map,
+				sizeof(armv7_a8_perf_cache_map));
+			armv7pmu.event_map = armv7_a8_pmu_event_map;
+			armpmu = &armv7pmu;
+		} else
+		/*
+		 * Cortex-A9 detection
+		 */
+			if ((cpuid & CPUID_CORTEX_A9_MASK)
+			    == CPUID_CORTEX_A9_BITS) {
+				armv7pmu.name = ARMV7_PMU_CORTEX_A9_NAME;
+				memcpy(armpmu_perf_cache_map,
+					armv7_a9_perf_cache_map,
+					sizeof(armv7_a9_perf_cache_map));
+				armv7pmu.event_map = armv7_a9_pmu_event_map;
+				armpmu = &armv7pmu;
+		} else
+			perf_max_events = -1;
+
+		if (armpmu) {
+			u32 nb_cnt;
+
+			/* Read the nb of CNTx counters supported from PMNC */
+			nb_cnt = (armv7_pmnc_read() >> ARMV7_PMNC_N_SHIFT)
+				& ARMV7_PMNC_N_MASK;
+			/* Add the CPU cycles counter */
+			armv7pmu.num_events = nb_cnt + 1;
+			perf_max_events	= armv7pmu.num_events;
+
+			/* Initialize & Reset PMNC: C bit and P bit */
+			armv7_pmnc_write(ARMV7_PMNC_P | ARMV7_PMNC_C);
+		}
+	} else {
+		pr_info("no hardware support available\n");
+		perf_max_events = -1;
+	}
 
         if (armpmu)
-                pr_info("enabled with %s PMU driver\n",
-                        armpmu->name);
+		pr_info("enabled with %s PMU driver, %d counters available\n",
+			armpmu->name, armpmu->num_events);
 
         return 0;
 }
-- 
1.6.2.5.168.g3823



On Tuesday 22 December 2009 17:51:39 Jean Pihet wrote:
> Hi,
>
> Here is the updated patch. It now supports:
> - Cortex-A8 and Cortex-A9 processors,
> - The low level code has been completely redesigned to allow the dynamic
> detection of the number of available counters, based on the PMCR value,
> - runtime detection of the CPU arch (v6 or v7) and model (Cortex-A8 or
> Cortex-A9)
>
> The code is for review, it has been checked, compiled and boot tested on
> OMAP3 (Cortex-A8). Unfortunately I am still facing some cross compilation
> problems of the tools/perf utility.
>
> Some remarks and questions:
>
> 1) The number of available counters can reach 32 on ARMv7, so the macro
> ARMPMU_MAX_HWEVENTS is now defined as 32. Is that correct?
>
> 2) Please note that the Cortex-A9 events do not easily map to the
> predefined events. Cf. armv7_a9_perf_map and armv7_a9_perf_cache_map in the
> code. - the PERF_COUNT_HW_INSTRUCTIONS event is not found. It looks like
> the number of instructions is calculated by adding events numbers (events
> from 0x70 till 0x74: MAIN_UNIT_EXECUTED_INST, SECOND_UNIT_EXECUTED_INST,
> LD_ST_UNIT_EXECUTED_INST, FP_EXECUTED_INST and NEON_EXECUTED_INST),
> - the HW_BRANCH events are not found
> - the global cache events 0x50 and 0x51 define the COHERENT_LINE_HIT and
> COHERENT_LINE_MISS events, is that correct?
> - L1 and L2 cache events are not found. Those could be available in
> separate PL310 registers, TBC
> - no TLB events excepted the ITLB_MISS event are found.
>
> Any thoughts?
>
> Regards,
> Jean
>

^ permalink raw reply related	[flat|nested] 55+ messages in thread

* Perf Event support for ARMv7 (was: Re: [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6)
  2009-12-29 13:52                           ` Jean Pihet
@ 2009-12-29 16:32                             ` Jamie Iles
  2010-01-06 15:16                               ` Michał Nazarewicz
  0 siblings, 1 reply; 55+ messages in thread
From: Jamie Iles @ 2009-12-29 16:32 UTC (permalink / raw)
  To: linux-arm-kernel

Hi Jean,

On Tue, Dec 29, 2009 at 02:52:16PM +0100, Jean Pihet wrote:
> Ingo, Jamie,
> 
> On Monday 28 December 2009 08:57:48 Ingo Molnar wrote:
> > * Jean Pihet <jpihet@mvista.com> wrote:
> > > The code is for review, it has been checked, compiled and boot tested on
> > > OMAP3 (Cortex-A8). Unfortunately I am still facing some cross compilation
> > > problems of the tools/perf utility.
> >
> > Have you managed to solve these cross-compilation problems? If yes, it
> > would be nice to merge the fixes into upstream perf.
> Yes I got them resolved but I needed to make a few changes:
> - -Werror removed from the CFLAGS definition in tools/perf/Makefile. Without 
> the change the compilation stops after a warning about include paths.
I've found that for the embedded platform I'm working on I need to build with
NO_LIBPERL=1 otherwise the local include paths are used for perl. Obviously
perl scripting will not be available.
> - the rmb() macro in tools/perf/perf.h prevents the compilation. I changed it 
> to the definition from arch/arm/include/asm/system.h (asm 
> volatile("":::"memory")). Where is the original definition from? Why is it 
> specific to perf_events?
I took this definition of rmb() from arch/arm/kernel/entry-armv.S. This gives
a CPU independent way of a real memory barrier. This compiles with the latest
codesourcery toolchain and 2009q1.

[snip]
> I tested the ARMv7 code on the Cortex-A8 processor. The code looks fine but 
> when I load the CPU I am running into spinlock recursion problems (in 
> perf_ctx_adjust_freq).
> Does those problem happen on ARMv6 as well?
I haven't seen this on ARMv6 but perhaps I've been getting lucky...

Jamie

^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH 1/5] arm: provide a mechanism to reserve performance counters
  2010-01-04 10:48 ARM perf events support v4 Jamie Iles
@ 2010-01-04 10:48 ` Jamie Iles
  2010-01-06 12:00   ` Michał Nazarewicz
  0 siblings, 1 reply; 55+ messages in thread
From: Jamie Iles @ 2010-01-04 10:48 UTC (permalink / raw)
  To: linux-arm-kernel

To add support for perf events and to allow the hardware
counters to be shared with oprofile, we need a way to reserve
access to the pmu (performance monitor unit).

Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Jamie Iles <jamie.iles@picochip.com>
---
 arch/arm/Kconfig           |    5 ++
 arch/arm/include/asm/pmu.h |   74 ++++++++++++++++++++++++++++++
 arch/arm/kernel/Makefile   |    1 +
 arch/arm/kernel/pmu.c      |  107 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 187 insertions(+), 0 deletions(-)
 create mode 100644 arch/arm/include/asm/pmu.h
 create mode 100644 arch/arm/kernel/pmu.c

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 233a222..9e08891 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -863,6 +863,11 @@ config XSCALE_PMU
 	depends on CPU_XSCALE && !XSCALE_PMU_TIMER
 	default y
 
+config CPU_HAS_PMU
+	depends on CPU_V6 || CPU_V7 || XSCALE_PMU
+	default y
+	bool
+
 if !MMU
 source "arch/arm/Kconfig-nommu"
 endif
diff --git a/arch/arm/include/asm/pmu.h b/arch/arm/include/asm/pmu.h
new file mode 100644
index 0000000..5840d2d
--- /dev/null
+++ b/arch/arm/include/asm/pmu.h
@@ -0,0 +1,74 @@
+/*
+ *  linux/arch/arm/include/asm/pmu.h
+ *
+ *  Copyright (C) 2009 picoChip Designs Ltd, Jamie Iles
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#ifndef __ARM_PMU_H__
+#define __ARM_PMU_H__
+
+#ifdef CONFIG_CPU_HAS_PMU
+
+struct pmu_irqs {
+	const int   *irqs;
+	int	    num_irqs;
+};
+
+/**
+ * reserve_pmu() - reserve the hardware performance counters
+ *
+ * Reserve the hardware performance counters in the system for exclusive use.
+ * The 'struct pmu_irqs' for the system is returned on success, ERR_PTR()
+ * encoded error on failure.
+ */
+extern const struct pmu_irqs *
+reserve_pmu(void);
+
+/**
+ * release_pmu() - Relinquish control of the performance counters
+ *
+ * Release the performance counters and allow someone else to use them.
+ * Callers must have disabled the counters and released IRQs before calling
+ * this. The 'struct pmu_irqs' returned from reserve_pmu() must be passed as
+ * a cookie.
+ */
+extern int
+release_pmu(const struct pmu_irqs *irqs);
+
+/**
+ * init_pmu() - Initialise the PMU.
+ *
+ * Initialise the system ready for PMU enabling. This should typically set the
+ * IRQ affinity and nothing else. The users (oprofile/perf events etc) will do
+ * the actual hardware initialisation.
+ */
+extern int
+init_pmu(void);
+
+#else /* CONFIG_CPU_HAS_PMU */
+
+static inline const struct pmu_irqs *
+reserve_pmu(void)
+{
+	ERR_PTR(-ENODEV);
+}
+
+static inline int
+release_pmu(const struct pmu_irqs *irqs)
+{
+}
+
+static inline int
+init_pmu(void)
+{
+	return -ENODEV;
+}
+
+#endif /* CONFIG_CPU_HAS_PMU */
+
+#endif /* __ARM_PMU_H__ */
diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
index dd00f74..216890d 100644
--- a/arch/arm/kernel/Makefile
+++ b/arch/arm/kernel/Makefile
@@ -46,6 +46,7 @@ obj-$(CONFIG_CPU_XSCALE)	+= xscale-cp0.o
 obj-$(CONFIG_CPU_XSC3)		+= xscale-cp0.o
 obj-$(CONFIG_CPU_MOHAWK)	+= xscale-cp0.o
 obj-$(CONFIG_IWMMXT)		+= iwmmxt.o
+obj-$(CONFIG_CPU_HAS_PMU)	+= pmu.o
 AFLAGS_iwmmxt.o			:= -Wa,-mcpu=iwmmxt
 
 ifneq ($(CONFIG_ARCH_EBSA110),y)
diff --git a/arch/arm/kernel/pmu.c b/arch/arm/kernel/pmu.c
new file mode 100644
index 0000000..a8c015d
--- /dev/null
+++ b/arch/arm/kernel/pmu.c
@@ -0,0 +1,107 @@
+/*
+ *  linux/arch/arm/kernel/pmu.c
+ *
+ *  Copyright (C) 2009 picoChip Designs Ltd, Jamie Iles
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/cpumask.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/semaphore.h>
+
+#include <asm/pmu.h>
+
+/*
+ * Define the IRQs for the system. We could use something like a platform
+ * device but that seems fairly heavyweight for this. Also, the performance
+ * counters can't be removed or hotplugged.
+ *
+ * Ordering is important: init_pmu() will use the ordering to set the affinity
+ * to the corresponding core. e.g. the first interrupt will go to cpu 0, the
+ * second goes to cpu 1 etc.
+ */
+static const int irqs[] = {
+#ifdef CONFIG_ARCH_PC3XX
+	IRQ_NPMUIRQ,
+#elif defined(CONFIG_ARCH_OMAP2)
+	3,
+#elif defined(CONFIG_ARCH_BCMRING)
+	IRQ_PMUIRQ,
+#elif defined(CONFIG_MACH_REALVIEW_EB)
+	IRQ_EB11MP_PMU_CPU0,
+	IRQ_EB11MP_PMU_CPU1,
+	IRQ_EB11MP_PMU_CPU2,
+	IRQ_EB11MP_PMU_CPU3,
+#elif defined(CONFIG_ARCH_OMAP3)
+	INT_34XX_BENCH_MPU_EMUL,
+#elif defined(CONFIG_ARCH_IOP32X)
+	IRQ_IOP32X_CORE_PMU,
+#elif defined(CONFIG_ARCH_IOP33X)
+	IRQ_IOP33X_CORE_PMU,
+#elif defined(CONFIG_ARCH_PXA)
+	IRQ_PMU,
+#endif
+};
+
+static const struct pmu_irqs pmu_irqs = {
+	.irqs	    = irqs,
+	.num_irqs   = ARRAY_SIZE(irqs),
+};
+
+static DECLARE_MUTEX(pmu_mutex);
+
+const struct pmu_irqs *
+reserve_pmu(void)
+{
+	int ret = down_trylock(&pmu_mutex) ? -EBUSY : 0;
+
+	return ret ? ERR_PTR(ret) : &pmu_irqs;
+}
+EXPORT_SYMBOL_GPL(reserve_pmu);
+
+int
+release_pmu(const struct pmu_irqs *irqs)
+{
+	if (WARN_ON(irqs != &pmu_irqs))
+		return -EINVAL;
+	up(&pmu_mutex);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(release_pmu);
+
+static int
+set_irq_affinity(int irq,
+		 unsigned int cpu)
+{
+#ifdef CONFIG_SMP
+	int err = irq_set_affinity(irq, cpumask_of(cpu));
+	if (err)
+		pr_warning("unable to set irq affinity (irq=%d, cpu=%u)\n",
+			   irq, cpu);
+	return err;
+#else
+	return 0;
+#endif
+}
+
+int
+init_pmu(void)
+{
+	int i, err = 0;
+
+	for (i = 0; i < pmu_irqs.num_irqs; ++i) {
+		err = set_irq_affinity(pmu_irqs.irqs[i], i);
+		if (err)
+			break;
+	}
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(init_pmu);
-- 
1.6.5.4

^ permalink raw reply related	[flat|nested] 55+ messages in thread

* Perf Event support for ARMv7 (was: Re: [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6)
  2009-12-29 13:58                         ` Jean Pihet
@ 2010-01-04 16:52                           ` Will Deacon
  2010-01-15 15:30                             ` Jean Pihet
  0 siblings, 1 reply; 55+ messages in thread
From: Will Deacon @ 2010-01-04 16:52 UTC (permalink / raw)
  To: linux-arm-kernel

Hi Jean,

Thanks for this. Feedback inline.

* Jean Pihet wrote:

> Here is the updated patch after testing on HW.
> I will rebase it on Jamie's latest patch set as soon as they are out.
> 
> Feedback is welcome!

> Some remarks and questions:
> 
> 1) The number of available counters can reach 32 on ARMv7, so the macro
> ARMPMU_MAX_HWEVENTS is now defined as 32. Is that correct?

I think index 0 is reserved for the events array, so this should probably be 33.
Hopefully that doesn't break any bitmasks.

> 2) Please note that the Cortex-A9 events do not easily map to the predefined
> events. Cf. armv7_a9_perf_map and armv7_a9_perf_cache_map in the code.
> - the PERF_COUNT_HW_INSTRUCTIONS event is not found. It looks like the number
> of instructions is calculated by adding events numbers (events from 0x70 till
> 0x74: MAIN_UNIT_EXECUTED_INST, SECOND_UNIT_EXECUTED_INST,
> LD_ST_UNIT_EXECUTED_INST, FP_EXECUTED_INST and NEON_EXECUTED_INST),

Event 0x68 - `Instructions coming out of the core renaming stage'
can be used as an approximation for PERF_COUNT_HW_INSTRUCTIONS.

> - the HW_BRANCH events are not found

0x0c is HW_BRANCH_INSTRUCTIONS and 0x10 is HW_BRANCH_MISSES.
0x12 is the number of predictable branch instructions executed, so the mispredict
rate is 0x10/0x12. These events are defined for v7, so A8 should take these
definitions too.

> - the global cache events 0x50 and 0x51 define the COHERENT_LINE_HIT and
> COHERENT_LINE_MISS events, is that correct?

0x50 is COHERENT_LINE_MISS and 0x51 is COHERENT_LINE_HIT. These are only available
on A9 with SMP.

> - L1 and L2 cache events are not found. Those could be available in separate
> PL310 registers, TBC

We could use 0x01 for icache miss, 0x03 for dcache miss and 0x04 for dcache access.

> - no TLB events excepted the ITLB_MISS event are found.

I think 0x05 is DTLB_MISS.

<snip>

> +/*
> + * ARMv7 Cortex-A8 and Cortex-A9 Performance Events handling code.
> + *
> + * Copied from ARMv6 code, with the low level code inspired
> + *  by the ARMv7 Oprofile code.
> + *
> + * Cortex-A8 has up to 4 configurable performance counters and
> + *  a single cycle counter.
> + * Cortex-A9 has up to 31 configurable performance counters and
> + *  a single cycle counter.
> + *
> + * All counters can be enabled/disabled and IRQ masked separately. The cycle
> + *  counter and all 4 performance counters together can be reset separately.
> + */
> +
> +#define ARMV7_PMU_CORTEX_A8_NAME		"ARMv7 Cortex-A8"
> +
> +#define ARMV7_PMU_CORTEX_A9_NAME		"ARMv7 Cortex-A9"
> +
> +/* Common ARMv7 event types */
> +enum armv7_perf_types {
> +	ARMV7_PERFCTR_PMNC_SW_INCR		= 0x00,
> +	ARMV7_PERFCTR_IFETCH_MISS		= 0x01,
> +	ARMV7_PERFCTR_ITLB_MISS			= 0x02,
> +	ARMV7_PERFCTR_DCACHE_REFILL		= 0x03,
> +	ARMV7_PERFCTR_DCACHE_ACCESS		= 0x04,
> +	ARMV7_PERFCTR_DTLB_REFILL		= 0x05,
> +	ARMV7_PERFCTR_DREAD			= 0x06,
> +	ARMV7_PERFCTR_DWRITE			= 0x07,
> +
> +	ARMV7_PERFCTR_EXC_TAKEN			= 0x09,
> +	ARMV7_PERFCTR_EXC_EXECUTED		= 0x0A,
> +	ARMV7_PERFCTR_CID_WRITE			= 0x0B,
> +	ARMV7_PERFCTR_PC_WRITE			= 0x0C,
> +	ARMV7_PERFCTR_PC_IMM_BRANCH		= 0x0D,
> +	ARMV7_PERFCTR_UNALIGNED_ACCESS		= 0x0F,
> +	ARMV7_PERFCTR_PC_BRANCH_MIS_PRED	= 0x10,
> +	ARMV7_PERFCTR_CLOCK_CYCLES		= 0x11,
> +
> +	ARMV7_PERFCTR_PC_BRANCH_MIS_USED	= 0x12,
> +
> +	ARMV7_PERFCTR_CPU_CYCLES		= 0xFF
> +};

For consistency, I think it's better to stick to either MISS or REFILL.
Events 0x01 and 0x03 are the same, but for instructions and data respectively.

<snip>

> +static const unsigned armv7_a8_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
> +					  [PERF_COUNT_HW_CACHE_OP_MAX]
> +					  [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
> +	[C(L1D)] = {
> +		/*
> +		 * The performance counters don't differentiate between read
> +		 * and write accesses/misses so this isn't strictly correct,
> +		 * but it's the best we can do. Writes and reads get
> +		 * combined.
> +		 */
> +		[C(OP_READ)] = {
> +			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_DCACHE_ACCESS,
> +			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DATA_MISS,
> +		},
> +		[C(OP_WRITE)] = {
> +			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_DCACHE_ACCESS,
> +			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DATA_MISS,
> +		},
> +		[C(OP_PREFETCH)] = {
> +			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
> +			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
> +		},
> +	},

You're using the A8-only event 0x49 [ARMV7_PERFCTR_L1_DATA_MISS]. This also
counts hash misses in the address translation during the cache lookup procedure,
even if the resulting access is a [slightly slower] hit. I think you're better off
using event 0x03. In fact, I'd try to use architecturally defined events whenever
you can because that allows for comparisons between v7 cores.

> +	[C(L1I)] = {
> +		[C(OP_READ)] = {
> +			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_INST,
> +			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_INST_MISS,
> +		},
> +		[C(OP_WRITE)] = {
> +			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_INST,
> +			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_INST_MISS,
> +		},
> +		[C(OP_PREFETCH)] = {
> +			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
> +			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
> +		},
> +	},

Same thing here. I'd suggest using 0x01 instead of 0x4a.

<snip>

> +/*
> + * Available counters
> + */
> +#define ARMV7_CNT0 		0	/* First event counter */
> +#define ARMV7_CCNT 		31	/* Cycle counter */
> +
> +#define ARMV7_A8_CNTMAX		5	/* Cortex-A8: up to 4 counters + CCNT */
> +#define ARMV7_A9_CNTMAX		32	/* Cortex-A9: up to 31 counters + CCNT*/

Actually, A9 has a maximum number of 6 event counters + CCNT.

<snip>

> +#define CPUID_V6_MASK   	0x7F000
> +#define CPUID_V6_BITS   	0x7B000
> +
> +#define CPUID_CORTEX_A8_BITS	0xC080
> +#define CPUID_CORTEX_A8_MASK	0xFFF0
> +
> +#define CPUID_CORTEX_A9_BITS	0xC090
> +#define CPUID_CORTEX_A9_MASK	0xFFF0

Just define CPUID_V7_MASK.

> +	unsigned long cpuid = read_cpuid_id();
> +
> +	/*
> +	 * ARMv6 detection
> +	 */
> +	if (CPUID_V6_BITS == (cpuid & CPUID_V6_MASK)) {
> +		armpmu = &armv6pmu;
> +		memcpy(armpmu_perf_cache_map, armv6_perf_cache_map,
> +			sizeof(armv6_perf_cache_map));
> +		perf_max_events	= armv6pmu.num_events;
> +	}
> +	/*
> +	 * ARMv7 detection
> +	 */
> +	else if (cpu_architecture() == CPU_ARCH_ARMv7) {
> +		/*
> +		 * Cortex-A8 detection
> +		 */
> +		if ((cpuid & CPUID_CORTEX_A8_MASK) == CPUID_CORTEX_A8_BITS) {
> +			armv7pmu.name = ARMV7_PMU_CORTEX_A8_NAME;
> +			memcpy(armpmu_perf_cache_map, armv7_a8_perf_cache_map,
> +				sizeof(armv7_a8_perf_cache_map));
> +			armv7pmu.event_map = armv7_a8_pmu_event_map;
> +			armpmu = &armv7pmu;
> +		} else
> +		/*
> +		 * Cortex-A9 detection
> +		 */
> +			if ((cpuid & CPUID_CORTEX_A9_MASK)
> +			    == CPUID_CORTEX_A9_BITS) {
> +				armv7pmu.name = ARMV7_PMU_CORTEX_A9_NAME;
> +				memcpy(armpmu_perf_cache_map,
> +					armv7_a9_perf_cache_map,
> +					sizeof(armv7_a9_perf_cache_map));
> +				armv7pmu.event_map = armv7_a9_pmu_event_map;
> +				armpmu = &armv7pmu;
> +		} else
> +			perf_max_events = -1;

The A9 code indentation is a level too deep I think. It might also be
worth adding a cpu_architecture() check to the v6 test just in case a
v7 core conflicts with the mask.

I hope that helps. Please let me know what you think about the event
numberings.

Cheers,

Will

^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH 1/5] arm: provide a mechanism to reserve performance counters
  2010-01-04 10:48 ` [PATCH 1/5] arm: provide a mechanism to reserve performance counters Jamie Iles
@ 2010-01-06 12:00   ` Michał Nazarewicz
  2010-01-06 12:15     ` Jamie Iles
  0 siblings, 1 reply; 55+ messages in thread
From: Michał Nazarewicz @ 2010-01-06 12:00 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, 04 Jan 2010 11:48:38 +0100, Jamie Iles <jamie.iles@picochip.com> wrote:
> To add support for perf events and to allow the hardware
> counters to be shared with oprofile, we need a way to reserve
> access to the pmu (performance monitor unit).
>
> Cc: Will Deacon <will.deacon@arm.com>
> Signed-off-by: Jamie Iles <jamie.iles@picochip.com>

> diff --git a/arch/arm/include/asm/pmu.h b/arch/arm/include/asm/pmu.h
> new file mode 100644
> index 0000000..5840d2d
> --- /dev/null
> +++ b/arch/arm/include/asm/pmu.h
> @@ -0,0 +1,74 @@

[...]

> +#ifndef __ARM_PMU_H__
> +#define __ARM_PMU_H__
> +
> +#ifdef CONFIG_CPU_HAS_PMU

[...]

> +#else /* CONFIG_CPU_HAS_PMU */
> +
> +static inline const struct pmu_irqs *
> +reserve_pmu(void)
> +{
> +	ERR_PTR(-ENODEV);

-	ERR_PTR(-ENODEV);
+	return ERR_PTR(-ENODEV);

> +}
> +
> +static inline int
> +release_pmu(const struct pmu_irqs *irqs)
> +{

+	return -ENODEV;

> +}
> +
> +static inline int
> +init_pmu(void)
> +{
> +	return -ENODEV;
> +}
> +
> +#endif /* CONFIG_CPU_HAS_PMU */
> +
> +#endif /* __ARM_PMU_H__ */

> diff --git a/arch/arm/kernel/pmu.c b/arch/arm/kernel/pmu.c
> new file mode 100644
> index 0000000..a8c015d
> --- /dev/null
> +++ b/arch/arm/kernel/pmu.c
> @@ -0,0 +1,107 @@

[...]

> +static const int irqs[] = {

[...]

> +};
> +
> +static const struct pmu_irqs pmu_irqs = {
> +	.irqs	    = irqs,
> +	.num_irqs   = ARRAY_SIZE(irqs),
> +};
> +
> +static DECLARE_MUTEX(pmu_mutex);

Isn't mutex an overkill? A bit field would be enough:

-static DECLARE_MUTEX(pmu_mutex);
+static volatile long pmu_mutex;

> +
> +const struct pmu_irqs *
> +reserve_pmu(void)
> +{
> +	int ret = down_trylock(&pmu_mutex) ? -EBUSY : 0;
> +
> +	return ret ? ERR_PTR(ret) : &pmu_irqs;

-	int ret = down_trylock(&pmu_mutex) ? -EBUSY : 0;
-
-	return ret ? ERR_PTR(ret) : &pmu_irqs;
+	return test_and_set_bit_lock(0, &pmu_mutex) ? ERR_PTR(-EBUSY) : &pmm_irqs;

> +}
> +EXPORT_SYMBOL_GPL(reserve_pmu);
> +
> +int
> +release_pmu(const struct pmu_irqs *irqs)
> +{
> +	if (WARN_ON(irqs != &pmu_irqs))
> +		return -EINVAL;
> +	up(&pmu_mutex);

-	up(&pmu_mutex);
+	clear_bit_unlock(&pmm_mutex);

> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(release_pmu);

[...]


-- 
Best regards,                                           _     _
  .o. | Liege of Serenely Enlightened Majesty of       o' \,=./ `o
  ..o | Computer Science,  Micha? "mina86" Nazarewicz     (o o)
  ooo +---<mina86@mina86.com>---<mina86@jabber.org>---ooO--(_)--Ooo--

^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH 1/5] arm: provide a mechanism to reserve performance counters
  2010-01-06 12:00   ` Michał Nazarewicz
@ 2010-01-06 12:15     ` Jamie Iles
  0 siblings, 0 replies; 55+ messages in thread
From: Jamie Iles @ 2010-01-06 12:15 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, Jan 06, 2010 at 01:00:56PM +0100, Micha? Nazarewicz wrote:
>> +#else /* CONFIG_CPU_HAS_PMU */
>> +
>> +static inline const struct pmu_irqs *
>> +reserve_pmu(void)
>> +{
>> +	ERR_PTR(-ENODEV);
>
> -	ERR_PTR(-ENODEV);
> +	return ERR_PTR(-ENODEV);
>
>> +}
>> +
>> +static inline int
>> +release_pmu(const struct pmu_irqs *irqs)
>> +{
>
> +	return -ENODEV;
>
>> +}
>> +
>> +static inline int
>> +init_pmu(void)
>> +{
>> +	return -ENODEV;
>> +}
>> +
>> +#endif /* CONFIG_CPU_HAS_PMU */
>> +
>> +#endif /* __ARM_PMU_H__ */
Thanks, well spotted!
>> +static const struct pmu_irqs pmu_irqs = {
>> +	.irqs	    = irqs,
>> +	.num_irqs   = ARRAY_SIZE(irqs),
>> +};
>> +
>> +static DECLARE_MUTEX(pmu_mutex);
>
> Isn't mutex an overkill? A bit field would be enough:
>
> -static DECLARE_MUTEX(pmu_mutex);
> +static volatile long pmu_mutex;
Yes, it probably is. I don't think performance is important here but that's a
simpler solution so I'll make that change.

Thanks,

Jamie

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Perf Event support for ARMv7 (was: Re: [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6)
  2009-12-29 16:32                             ` Jamie Iles
@ 2010-01-06 15:16                               ` Michał Nazarewicz
  2010-01-06 15:30                                 ` Jamie Iles
  0 siblings, 1 reply; 55+ messages in thread
From: Michał Nazarewicz @ 2010-01-06 15:16 UTC (permalink / raw)
  To: linux-arm-kernel

>>> * Jean Pihet <jpihet@mvista.com> wrote:
>>>> I am still facing some cross compilation problems of the
>>>> tools/perf utility.

>> On Monday 28 December 2009 08:57:48 Ingo Molnar wrote:
>>> Have you managed to solve these cross-compilation problems?
>>> If yes, it would be nice to merge the fixes into upstream perf.

> On Tue, Dec 29, 2009 at 02:52:16PM +0100, Jean Pihet wrote:
>> Yes I got them resolved but I needed to make a few changes:
>> - -Werror removed from the CFLAGS definition in tools/perf/Makefile.
>> Without the change the compilation stops after a warning about
>> include paths.

On Tue, 29 Dec 2009 17:32:52 +0100, Jamie Iles wrote:
> I've found that for the embedded platform I'm working on I need
> to build with NO_LIBPERL=1 otherwise the local include paths are
> used for perl. Obviously perl scripting will not be available.

>> - the rmb() macro in tools/perf/perf.h prevents the compilation.
>> I changed it to the definition from arch/arm/include/asm/system.h
>> (asm volatile("":::"memory")). Where is the original definition
>> from? Why is it specific to perf_events?

> I took this definition of rmb() from arch/arm/kernel/entry-armv.S.
> This give a CPU independent way of a real memory barrier. This
> compiles with the latest codesourcery toolchain and 2009q1.

Since I've just started to investigate the performance counter
architecture and will probably need to port tools/perf on ARM as well,
would you care to post any patches of your work in this area? It will
be greatly appreciated.  I couldn't find any of such patches posted
anywhere or in the tree.

Also, I would like to ask what's the idea behind tools/perf utility
anyway. As far as I understand it provides only a few generalized
events and all other events are accessed only via critic r### where
### is hardware dependent number.  If that's the case the tools might
be hard to use since users don't tend to remember such numbers.  This
means that perf would require some kind of front-end which would
provide a nice textual description for those mysterious r###.
													
-- 
Best regards,                                           _     _
  .o. | Liege of Serenely Enlightened Majesty of       o' \,=./ `o
  ..o | Computer Science,  Micha? "mina86" Nazarewicz     (o o)
  ooo +---<mina86@mina86.com>---<mina86@jabber.org>---ooO--(_)--Ooo--

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Perf Event support for ARMv7 (was: Re: [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6)
  2010-01-06 15:16                               ` Michał Nazarewicz
@ 2010-01-06 15:30                                 ` Jamie Iles
  2010-01-07 17:02                                   ` Michał Nazarewicz
  0 siblings, 1 reply; 55+ messages in thread
From: Jamie Iles @ 2010-01-06 15:30 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, Jan 06, 2010 at 04:16:06PM +0100, Micha? Nazarewicz wrote:
[snip]
> Since I've just started to investigate the performance counter
> architecture and will probably need to port tools/perf on ARM as well,
> would you care to post any patches of your work in this area? It will
> be greatly appreciated.  I couldn't find any of such patches posted
> anywhere or in the tree.
The patches are in tip/master. 58e9f94138c1d9c47f6a63632ca7a78fc6dcc15f and
cc835752ae3634acd2d487fdf5152f6075f45aef should do the trick. They should also
be in Linus' tree.

> Also, I would like to ask what's the idea behind tools/perf utility
> anyway. As far as I understand it provides only a few generalized
> events and all other events are accessed only via critic r### where
> ### is hardware dependent number.  If that's the case the tools might
> be hard to use since users don't tend to remember such numbers.  This
> means that perf would require some kind of front-end which would
> provide a nice textual description for those mysterious r###.
True. However, for simple profiling, cycle counts are often enough plus all of
the software events are always available. I do wonder if it would be worth
having a mechanism for implementations to register a list of supported events
with the perf subsystem that could be exported through debugfs. The perf tools
could then read this to produce the list of events and would show all events
that were available.

Jamie

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Perf Event support for ARMv7 (was: Re: [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6)
  2010-01-06 15:30                                 ` Jamie Iles
@ 2010-01-07 17:02                                   ` Michał Nazarewicz
  0 siblings, 0 replies; 55+ messages in thread
From: Michał Nazarewicz @ 2010-01-07 17:02 UTC (permalink / raw)
  To: linux-arm-kernel

> On Wed, Jan 06, 2010 at 04:16:06PM +0100, Micha? Nazarewicz wrote:
>> Since I [...] will probably need to port tools/perf on ARM as well,
>> would you care to post any patches of your work in this area?

On Wed, 06 Jan 2010 16:30:34 +0100, Jamie Iles <jamie@jamieiles.com> wrote:
> The patches are in tip/master. [...] They should also be in Linus' tree.

Ah, thank you.  Now I don't know why I couldn't find it previously...

BTW. Did anyone got problems with stack protector? My ld complained it
could not find ssp_nonshared when checking for libelf.  I think the "-c"
should be dropped from the -fstack-protector-all support checking in
Makefile (diff included at the end of mail).


>> what's the idea behind tools/perf utility. As far as I understand it
>> provides a few generalized events and all other events are accessed via
>> critic r### [...] tool might be hard to use since users don't tend to
>> remember such numbers.  [...] perf would require front-end which would
>> provide textual description for those mysterious r###.

> True. However, for simple profiling, cycle counts are often enough plus all of
> the software events are always available. I do wonder if it would be worth
> having a mechanism for implementations to register a list of supported events
> with the perf subsystem that could be exported through debugfs. The perf tools
> could then read this to produce the list of events and would show all events
> that were available.

I would also consider using sysfs (/sys/devices/system/cpu/perf-events?) since
one may lack debugfs in their kernel.  It can be argued that if someone wants
to use performance counters we might just require debugfs however personally
I'd hate such requirement and prefer this information to be placed in sysfs.


Lets not argue about details though.  More important question is whether
such information is needed at all and I believe it is.

I agree that in many (if not most) cases the generalized events are enough
but imagine this poor fellow who needs to look through 1k-page CPU reference
manual to find "raw" event number because he wants to do more in-depth
profiling.

In my opinion, if the utility is to be widely used such database will be
created at one point or another and it can be either in kernel or user
space (some front-ends for pref cold emerge or maybe there'll be some other
utility that uses the same API and has its own database).  Furthermore,
I believe it's better done in kernel because it knows better what CPU
system is running on and what it supports.

So if you ask me I say that such list is needed, is needed to be done
in kernel and besides raw numbers should provide some textual names for
events so for example the same name can be used on two different
ARM CPUs which use different event number for the very same event.


And promised diff:
--
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 652a470..434c5ec 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -250,7 +250,7 @@ PTHREAD_LIBS = -lpthread
  # explicitly what architecture to check for. Fix this up for yours..
  SPARSE_FLAGS = -D__BIG_ENDIAN__ -D__powerpc__

-ifeq ($(shell sh -c "echo 'int foo(void) {char X[2]; return 3;}' | $(CC) -x c -c -Werror -fstack-protector-all - -o /dev/null "$(QUIET_STDERR)" && echo y"), y)
+ifeq ($(shell sh -c "echo 'int foo(void) {char X[2]; return 3;}' | $(CC) -x c -Werror -fstack-protector-all - -o /dev/null "$(QUIET_STDERR)" && echo y"), y)
    CFLAGS := $(CFLAGS) -fstack-protector-all
  endif

-- 
Best regards,                                           _     _
  .o. | Liege of Serenely Enlightened Majesty of       o' \,=./ `o
  ..o | Computer Science,  Micha? "mina86" Nazarewicz     (o o)
  ooo +---<mina86@mina86.com>---<mina86@jabber.org>---ooO--(_)--Ooo--

^ permalink raw reply related	[flat|nested] 55+ messages in thread

* Perf Event support for ARMv7 (was: Re: [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6)
  2009-12-22 16:51                       ` Jean Pihet
  2009-12-28  7:57                         ` Ingo Molnar
  2009-12-29 13:58                         ` Jean Pihet
@ 2010-01-08 22:17                         ` Woodruff, Richard
  2010-01-15 15:34                           ` Jean Pihet
  2 siblings, 1 reply; 55+ messages in thread
From: Woodruff, Richard @ 2010-01-08 22:17 UTC (permalink / raw)
  To: linux-arm-kernel


> From: linux-arm-kernel-bounces at lists.infradead.org [mailto:linux-arm-kernel-
> bounces at lists.infradead.org] On Behalf Of Jean Pihet
> Sent: Tuesday, December 22, 2009 10:52 AM

> 2) Please note that the Cortex-A9 events do not easily map to the predefined
> events. Cf. armv7_a9_perf_map and armv7_a9_perf_cache_map in the code.
> - the PERF_COUNT_HW_INSTRUCTIONS event is not found. It looks like the number
> of instructions is calculated by adding events numbers (events from 0x70 till
> 0x74: MAIN_UNIT_EXECUTED_INST, SECOND_UNIT_EXECUTED_INST,
> LD_ST_UNIT_EXECUTED_INST, FP_EXECUTED_INST and NEON_EXECUTED_INST),
> - the HW_BRANCH events are not found
> - the global cache events 0x50 and 0x51 define the COHERENT_LINE_HIT and
> COHERENT_LINE_MISS events, is that correct?
> - L1 and L2 cache events are not found. Those could be available in separate
> PL310 registers, TBC

Recently I had done a side by side diff of A8 and A9 events for OMAP4.

It is notable that L2 cache events for CortexA8 come up through same PMNC register interface as ARMv7 core events.  For CortexA9 + PL310 the L2 events all come up through a _different_ register interface.  The interface is still simple but different.

- Several of the registers needed to enable PL310 event bus are Trustzone protected.  This will lead to some messiness in getting at them through monitor mode proxies.

- A9 is missing a few events at core level and all l2 events which come up through pl310 regs.

- A9 has a few more event counter instances over A8. This grows some registers in expected way.

One bit I didn't get clear on was if any entity was trying to account for per-core stats at the shared PL310 level.  Each core can give stats in familiar manner but association of both cores with common pl310 is not clear.

Regards,
Richard W.

^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH 1/5] arm: provide a mechanism to reserve performance counters
  2010-01-14 12:14 ARM perf events support v5 Jamie Iles
@ 2010-01-14 12:14 ` Jamie Iles
  2010-01-21  9:30   ` Jamie Iles
  0 siblings, 1 reply; 55+ messages in thread
From: Jamie Iles @ 2010-01-14 12:14 UTC (permalink / raw)
  To: linux-arm-kernel

To add support for perf events and to allow the hardware
counters to be shared with oprofile, we need a way to reserve
access to the pmu (performance monitor unit).

Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Jamie Iles <jamie.iles@picochip.com>
---
 arch/arm/Kconfig           |    5 ++
 arch/arm/include/asm/pmu.h |   75 +++++++++++++++++++++++++++++++
 arch/arm/kernel/Makefile   |    1 +
 arch/arm/kernel/pmu.c      |  105 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 186 insertions(+), 0 deletions(-)
 create mode 100644 arch/arm/include/asm/pmu.h
 create mode 100644 arch/arm/kernel/pmu.c

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index c2238cd..31d52ed 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -866,6 +866,11 @@ config XSCALE_PMU
 	depends on CPU_XSCALE && !XSCALE_PMU_TIMER
 	default y
 
+config CPU_HAS_PMU
+	depends on CPU_V6 || CPU_V7 || XSCALE_PMU
+	default y
+	bool
+
 if !MMU
 source "arch/arm/Kconfig-nommu"
 endif
diff --git a/arch/arm/include/asm/pmu.h b/arch/arm/include/asm/pmu.h
new file mode 100644
index 0000000..2829b9f
--- /dev/null
+++ b/arch/arm/include/asm/pmu.h
@@ -0,0 +1,75 @@
+/*
+ *  linux/arch/arm/include/asm/pmu.h
+ *
+ *  Copyright (C) 2009 picoChip Designs Ltd, Jamie Iles
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#ifndef __ARM_PMU_H__
+#define __ARM_PMU_H__
+
+#ifdef CONFIG_CPU_HAS_PMU
+
+struct pmu_irqs {
+	const int   *irqs;
+	int	    num_irqs;
+};
+
+/**
+ * reserve_pmu() - reserve the hardware performance counters
+ *
+ * Reserve the hardware performance counters in the system for exclusive use.
+ * The 'struct pmu_irqs' for the system is returned on success, ERR_PTR()
+ * encoded error on failure.
+ */
+extern const struct pmu_irqs *
+reserve_pmu(void);
+
+/**
+ * release_pmu() - Relinquish control of the performance counters
+ *
+ * Release the performance counters and allow someone else to use them.
+ * Callers must have disabled the counters and released IRQs before calling
+ * this. The 'struct pmu_irqs' returned from reserve_pmu() must be passed as
+ * a cookie.
+ */
+extern int
+release_pmu(const struct pmu_irqs *irqs);
+
+/**
+ * init_pmu() - Initialise the PMU.
+ *
+ * Initialise the system ready for PMU enabling. This should typically set the
+ * IRQ affinity and nothing else. The users (oprofile/perf events etc) will do
+ * the actual hardware initialisation.
+ */
+extern int
+init_pmu(void);
+
+#else /* CONFIG_CPU_HAS_PMU */
+
+static inline const struct pmu_irqs *
+reserve_pmu(void)
+{
+	return ERR_PTR(-ENODEV);
+}
+
+static inline int
+release_pmu(const struct pmu_irqs *irqs)
+{
+	return -ENODEV;
+}
+
+static inline int
+init_pmu(void)
+{
+	return -ENODEV;
+}
+
+#endif /* CONFIG_CPU_HAS_PMU */
+
+#endif /* __ARM_PMU_H__ */
diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
index dd00f74..216890d 100644
--- a/arch/arm/kernel/Makefile
+++ b/arch/arm/kernel/Makefile
@@ -46,6 +46,7 @@ obj-$(CONFIG_CPU_XSCALE)	+= xscale-cp0.o
 obj-$(CONFIG_CPU_XSC3)		+= xscale-cp0.o
 obj-$(CONFIG_CPU_MOHAWK)	+= xscale-cp0.o
 obj-$(CONFIG_IWMMXT)		+= iwmmxt.o
+obj-$(CONFIG_CPU_HAS_PMU)	+= pmu.o
 AFLAGS_iwmmxt.o			:= -Wa,-mcpu=iwmmxt
 
 ifneq ($(CONFIG_ARCH_EBSA110),y)
diff --git a/arch/arm/kernel/pmu.c b/arch/arm/kernel/pmu.c
new file mode 100644
index 0000000..688d450
--- /dev/null
+++ b/arch/arm/kernel/pmu.c
@@ -0,0 +1,105 @@
+/*
+ *  linux/arch/arm/kernel/pmu.c
+ *
+ *  Copyright (C) 2009 picoChip Designs Ltd, Jamie Iles
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/cpumask.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <asm/pmu.h>
+
+/*
+ * Define the IRQs for the system. We could use something like a platform
+ * device but that seems fairly heavyweight for this. Also, the performance
+ * counters can't be removed or hotplugged.
+ *
+ * Ordering is important: init_pmu() will use the ordering to set the affinity
+ * to the corresponding core. e.g. the first interrupt will go to cpu 0, the
+ * second goes to cpu 1 etc.
+ */
+static const int irqs[] = {
+#ifdef CONFIG_ARCH_PC3XX
+	IRQ_NPMUIRQ,
+#elif defined(CONFIG_ARCH_OMAP2)
+	3,
+#elif defined(CONFIG_ARCH_BCMRING)
+	IRQ_PMUIRQ,
+#elif defined(CONFIG_MACH_REALVIEW_EB)
+	IRQ_EB11MP_PMU_CPU0,
+	IRQ_EB11MP_PMU_CPU1,
+	IRQ_EB11MP_PMU_CPU2,
+	IRQ_EB11MP_PMU_CPU3,
+#elif defined(CONFIG_ARCH_OMAP3)
+	INT_34XX_BENCH_MPU_EMUL,
+#elif defined(CONFIG_ARCH_IOP32X)
+	IRQ_IOP32X_CORE_PMU,
+#elif defined(CONFIG_ARCH_IOP33X)
+	IRQ_IOP33X_CORE_PMU,
+#elif defined(CONFIG_ARCH_PXA)
+	IRQ_PMU,
+#endif
+};
+
+static const struct pmu_irqs pmu_irqs = {
+	.irqs	    = irqs,
+	.num_irqs   = ARRAY_SIZE(irqs),
+};
+
+static volatile long pmu_lock;
+
+const struct pmu_irqs *
+reserve_pmu(void)
+{
+	return test_and_set_bit_lock(0, &pmu_lock) ? ERR_PTR(-EBUSY) :
+		&pmu_irqs;
+}
+EXPORT_SYMBOL_GPL(reserve_pmu);
+
+int
+release_pmu(const struct pmu_irqs *irqs)
+{
+	if (WARN_ON(irqs != &pmu_irqs))
+		return -EINVAL;
+	clear_bit_unlock(0, &pmu_lock);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(release_pmu);
+
+static int
+set_irq_affinity(int irq,
+		 unsigned int cpu)
+{
+#ifdef CONFIG_SMP
+	int err = irq_set_affinity(irq, cpumask_of(cpu));
+	if (err)
+		pr_warning("unable to set irq affinity (irq=%d, cpu=%u)\n",
+			   irq, cpu);
+	return err;
+#else
+	return 0;
+#endif
+}
+
+int
+init_pmu(void)
+{
+	int i, err = 0;
+
+	for (i = 0; i < pmu_irqs.num_irqs; ++i) {
+		err = set_irq_affinity(pmu_irqs.irqs[i], i);
+		if (err)
+			break;
+	}
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(init_pmu);
-- 
1.6.5.4

^ permalink raw reply related	[flat|nested] 55+ messages in thread

* Perf Event support for ARMv7 (was: Re: [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6)
  2010-01-04 16:52                           ` Will Deacon
@ 2010-01-15 15:30                             ` Jean Pihet
  2010-01-15 15:39                               ` Jamie Iles
  2010-01-20 13:40                               ` Will Deacon
  0 siblings, 2 replies; 55+ messages in thread
From: Jean Pihet @ 2010-01-15 15:30 UTC (permalink / raw)
  To: linux-arm-kernel

Hi Will,

Thanks for the answers. Here are the remarks.

Jamie, there is a remark at the very end about the cpuid detection. Can you 
check?

Regards,
Jean

On Monday 04 January 2010 17:52:01 Will Deacon wrote:
> Hi Jean,
>
> Thanks for this. Feedback inline.
>
> * Jean Pihet wrote:
> > Here is the updated patch after testing on HW.
> > I will rebase it on Jamie's latest patch set as soon as they are out.
> >
> > Feedback is welcome!
> >
> > Some remarks and questions:
> >
> > 1) The number of available counters can reach 32 on ARMv7, so the macro
> > ARMPMU_MAX_HWEVENTS is now defined as 32. Is that correct?
>
> I think index 0 is reserved for the events array, so this should probably
> be 33. Hopefully that doesn't break any bitmasks.
Ok, changed to 33.
Note that the code is generic enough to support up to 32 counters (CCNT + 31 
event counters), even though the Cortex-A8 and Cortex-A9 respectively support 
up to 1+4 and 1+6 counters.
The actual number of counters (from the PMNC registers) is printed out at Perf 
Events init.

> > 2) Please note that the Cortex-A9 events do not easily map to the
> > predefined events. Cf. armv7_a9_perf_map and armv7_a9_perf_cache_map in
> > the code. - the PERF_COUNT_HW_INSTRUCTIONS event is not found. It looks
> > like the number of instructions is calculated by adding events numbers
> > (events from 0x70 till 0x74: MAIN_UNIT_EXECUTED_INST,
> > SECOND_UNIT_EXECUTED_INST,
> > LD_ST_UNIT_EXECUTED_INST, FP_EXECUTED_INST and NEON_EXECUTED_INST),
>
> Event 0x68 - `Instructions coming out of the core renaming stage'
> can be used as an approximation for PERF_COUNT_HW_INSTRUCTIONS.
Ok changed.

> > - the HW_BRANCH events are not found
>
> 0x0c is HW_BRANCH_INSTRUCTIONS and 0x10 is HW_BRANCH_MISSES.
> 0x12 is the number of predictable branch instructions executed, so the
> mispredict rate is 0x10/0x12. These events are defined for v7, so A8 should
> take these definitions too.
>From the spec I read 0x0c is 'SW write of the PC', is that equivalent to 
HW_BRANCH_INSTRUCTIONS?

For A8 I am using:
- ARMV7_PERFCTR_PC_BRANCH_TAKEN (0x53),
- ARMV7_PERFCTR_PC_BRANCH_FAILED (0x52)

For A9 it is unsupported for now.

Do you think I should use 0x0c and 0x10 for both A8 and A9? How to get the 
accesses and misses count directly?

> > - the global cache events 0x50 and 0x51 define the COHERENT_LINE_HIT and
> > COHERENT_LINE_MISS events, is that correct?
>
> 0x50 is COHERENT_LINE_MISS and 0x51 is COHERENT_LINE_HIT. These are only
> available on A9 with SMP.
Ok the code uses 0x50 and 0x51 for A9.

> > - L1 and L2 cache events are not found. Those could be available in
> > separate PL310 registers, TBC
>
> We could use 0x01 for icache miss, 0x03 for dcache miss and 0x04 for dcache
> access.
Ok changed to the following. Is that correct?
Note that A8 uses specific events for I cache in order to make them comparable 
to each other. I cache miss could use 0x01 also. Cf. remark below for more.

Cortex-A8:
- D cache access: ARMV7_PERFCTR_DCACHE_ACCESS (0x04),
- D cache miss: ARMV7_PERFCTR_DCACHE_REFILL (0x03) instead of 
ARMV7_PERFCTR_L1_DATA_MISS (0x49),
- I cache access: ARMV7_PERFCTR_L1_DATA_MISS (0x50),
- I cache miss: ARMV7_PERFCTR_L1_INST_MISS (0x4a).

Cortex-A9:
- D cache access: ARMV7_PERFCTR_DCACHE_ACCESS (0x04),
- D cache miss: ARMV7_PERFCTR_DCACHE_REFILL (0x03),
- I cache access: Not supported,
- I cache miss: ARMV7_PERFCTR_IFETCH_MISS (0x01).

> > - no TLB events excepted the ITLB_MISS event are found.
>
> I think 0x05 is DTLB_MISS.
Ok using 0x05 for DTLB misses

> <snip>
>
> > +/*
> > + * ARMv7 Cortex-A8 and Cortex-A9 Performance Events handling code.
> > + *
> > + * Copied from ARMv6 code, with the low level code inspired
> > + *  by the ARMv7 Oprofile code.
> > + *
> > + * Cortex-A8 has up to 4 configurable performance counters and
> > + *  a single cycle counter.
> > + * Cortex-A9 has up to 31 configurable performance counters and
> > + *  a single cycle counter.
> > + *
> > + * All counters can be enabled/disabled and IRQ masked separately. The
> > cycle + *  counter and all 4 performance counters together can be reset
> > separately. + */
> > +
> > +#define ARMV7_PMU_CORTEX_A8_NAME		"ARMv7 Cortex-A8"
> > +
> > +#define ARMV7_PMU_CORTEX_A9_NAME		"ARMv7 Cortex-A9"
> > +
> > +/* Common ARMv7 event types */
> > +enum armv7_perf_types {
> > +	ARMV7_PERFCTR_PMNC_SW_INCR		= 0x00,
> > +	ARMV7_PERFCTR_IFETCH_MISS		= 0x01,
> > +	ARMV7_PERFCTR_ITLB_MISS			= 0x02,
> > +	ARMV7_PERFCTR_DCACHE_REFILL		= 0x03,
> > +	ARMV7_PERFCTR_DCACHE_ACCESS		= 0x04,
> > +	ARMV7_PERFCTR_DTLB_REFILL		= 0x05,
> > +	ARMV7_PERFCTR_DREAD			= 0x06,
> > +	ARMV7_PERFCTR_DWRITE			= 0x07,
> > +
> > +	ARMV7_PERFCTR_EXC_TAKEN			= 0x09,
> > +	ARMV7_PERFCTR_EXC_EXECUTED		= 0x0A,
> > +	ARMV7_PERFCTR_CID_WRITE			= 0x0B,
> > +	ARMV7_PERFCTR_PC_WRITE			= 0x0C,
> > +	ARMV7_PERFCTR_PC_IMM_BRANCH		= 0x0D,
> > +	ARMV7_PERFCTR_UNALIGNED_ACCESS		= 0x0F,
> > +	ARMV7_PERFCTR_PC_BRANCH_MIS_PRED	= 0x10,
> > +	ARMV7_PERFCTR_CLOCK_CYCLES		= 0x11,
> > +
> > +	ARMV7_PERFCTR_PC_BRANCH_MIS_USED	= 0x12,
> > +
> > +	ARMV7_PERFCTR_CPU_CYCLES		= 0xFF
> > +};
>
> For consistency, I think it's better to stick to either MISS or REFILL.
> Events 0x01 and 0x03 are the same, but for instructions and data
> respectively.
Good point! I am using the terminology from the TRM but I have to agree that 
some are cryptic.

> <snip>
>
> > +static const unsigned armv7_a8_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
> > +					  [PERF_COUNT_HW_CACHE_OP_MAX]
> > +					  [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
> > +	[C(L1D)] = {
> > +		/*
> > +		 * The performance counters don't differentiate between read
> > +		 * and write accesses/misses so this isn't strictly correct,
> > +		 * but it's the best we can do. Writes and reads get
> > +		 * combined.
> > +		 */
> > +		[C(OP_READ)] = {
> > +			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_DCACHE_ACCESS,
> > +			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DATA_MISS,
> > +		},
> > +		[C(OP_WRITE)] = {
> > +			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_DCACHE_ACCESS,
> > +			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DATA_MISS,
> > +		},
> > +		[C(OP_PREFETCH)] = {
> > +			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
> > +			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
> > +		},
> > +	},
>
> You're using the A8-only event 0x49 [ARMV7_PERFCTR_L1_DATA_MISS]. This also
> counts hash misses in the address translation during the cache lookup
> procedure, even if the resulting access is a [slightly slower] hit. I think
> you're better off using event 0x03. In fact, I'd try to use architecturally
> defined events whenever you can because that allows for comparisons between
> v7 cores.
Ok changed that after remark above about L1 and L2.

> > +	[C(L1I)] = {
> > +		[C(OP_READ)] = {
> > +			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_INST,
> > +			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_INST_MISS,
> > +		},
> > +		[C(OP_WRITE)] = {
> > +			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_INST,
> > +			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_INST_MISS,
> > +		},
> > +		[C(OP_PREFETCH)] = {
> > +			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
> > +			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
> > +		},
> > +	},
>
> Same thing here. I'd suggest using 0x01 instead of 0x4a.
Ok is it preferred to keep the ARMV7_PERFCTR_L1_ events for both accesses and 
misses in order to make the events counts comparable to each other? On the 
other end using 0x01 allows the comparison between A8 and A9.
I am OK to change it, just let me know.

> <snip>
>
> > +/*
> > + * Available counters
> > + */
> > +#define ARMV7_CNT0 		0	/* First event counter */
> > +#define ARMV7_CCNT 		31	/* Cycle counter */
> > +
> > +#define ARMV7_A8_CNTMAX		5	/* Cortex-A8: up to 4 counters + CCNT */
> > +#define ARMV7_A9_CNTMAX		32	/* Cortex-A9: up to 31 counters + CCNT*/
>
> Actually, A9 has a maximum number of 6 event counters + CCNT.
Cf. remark above. The code is generic enough and supports up to the 1+31 
events as defined in the A8 and A9 TRMs. The number of counters is 
dynamically read from the PMNC registers. Should that be compared against the 
given maximum (1+4 for A8, 1+6 for A9)? That looks like overkill.

> <snip>
>
> > +#define CPUID_V6_MASK   	0x7F000
> > +#define CPUID_V6_BITS   	0x7B000
> > +
> > +#define CPUID_CORTEX_A8_BITS	0xC080
> > +#define CPUID_CORTEX_A8_MASK	0xFFF0
> > +
> > +#define CPUID_CORTEX_A9_BITS	0xC090
> > +#define CPUID_CORTEX_A9_MASK	0xFFF0
>
> Just define CPUID_V7_MASK.
Ok changed.

>
> > +	unsigned long cpuid = read_cpuid_id();
> > +
> > +	/*
> > +	 * ARMv6 detection
> > +	 */
> > +	if (CPUID_V6_BITS == (cpuid & CPUID_V6_MASK)) {
> > +		armpmu = &armv6pmu;
> > +		memcpy(armpmu_perf_cache_map, armv6_perf_cache_map,
> > +			sizeof(armv6_perf_cache_map));
> > +		perf_max_events	= armv6pmu.num_events;
> > +	}
> > +	/*
> > +	 * ARMv7 detection
> > +	 */
> > +	else if (cpu_architecture() == CPU_ARCH_ARMv7) {
> > +		/*
> > +		 * Cortex-A8 detection
> > +		 */
> > +		if ((cpuid & CPUID_CORTEX_A8_MASK) == CPUID_CORTEX_A8_BITS) {
> > +			armv7pmu.name = ARMV7_PMU_CORTEX_A8_NAME;
> > +			memcpy(armpmu_perf_cache_map, armv7_a8_perf_cache_map,
> > +				sizeof(armv7_a8_perf_cache_map));
> > +			armv7pmu.event_map = armv7_a8_pmu_event_map;
> > +			armpmu = &armv7pmu;
> > +		} else
> > +		/*
> > +		 * Cortex-A9 detection
> > +		 */
> > +			if ((cpuid & CPUID_CORTEX_A9_MASK)
> > +			    == CPUID_CORTEX_A9_BITS) {
> > +				armv7pmu.name = ARMV7_PMU_CORTEX_A9_NAME;
> > +				memcpy(armpmu_perf_cache_map,
> > +					armv7_a9_perf_cache_map,
> > +					sizeof(armv7_a9_perf_cache_map));
> > +				armv7pmu.event_map = armv7_a9_pmu_event_map;
> > +				armpmu = &armv7pmu;
> > +		} else
> > +			perf_max_events = -1;
>
> The A9 code indentation is a level too deep I think.
Ok changed. It is beautiful now ;p

> It might also be 
> worth adding a cpu_architecture() check to the v6 test just in case a
> v7 core conflicts with the mask.
Jamie, what do you think?

>
> I hope that helps. Please let me know what you think about the event
> numberings.
Ok that definitely helps. Please let me know about my remarks.

>
> Cheers,
>
> Will

Cheers,
Jean

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Perf Event support for ARMv7 (was: Re: [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6)
  2010-01-08 22:17                         ` Woodruff, Richard
@ 2010-01-15 15:34                           ` Jean Pihet
  0 siblings, 0 replies; 55+ messages in thread
From: Jean Pihet @ 2010-01-15 15:34 UTC (permalink / raw)
  To: linux-arm-kernel

Richard,

On Friday 08 January 2010 23:17:10 Woodruff, Richard wrote:
> > From: linux-arm-kernel-bounces at lists.infradead.org
> > [mailto:linux-arm-kernel- bounces at lists.infradead.org] On Behalf Of Jean
> > Pihet
> > Sent: Tuesday, December 22, 2009 10:52 AM
> >
> > 2) Please note that the Cortex-A9 events do not easily map to the
> > predefined events. Cf. armv7_a9_perf_map and armv7_a9_perf_cache_map in
> > the code. - the PERF_COUNT_HW_INSTRUCTIONS event is not found. It looks
> > like the number of instructions is calculated by adding events numbers
> > (events from 0x70 till 0x74: MAIN_UNIT_EXECUTED_INST,
> > SECOND_UNIT_EXECUTED_INST,
> > LD_ST_UNIT_EXECUTED_INST, FP_EXECUTED_INST and NEON_EXECUTED_INST),
> > - the HW_BRANCH events are not found
> > - the global cache events 0x50 and 0x51 define the COHERENT_LINE_HIT and
> > COHERENT_LINE_MISS events, is that correct?
> > - L1 and L2 cache events are not found. Those could be available in
> > separate PL310 registers, TBC
>
> Recently I had done a side by side diff of A8 and A9 events for OMAP4.
>
> It is notable that L2 cache events for CortexA8 come up through same PMNC
> register interface as ARMv7 core events.  For CortexA9 + PL310 the L2
> events all come up through a _different_ register interface.  The interface
> is still simple but different.
Ok that could be done after the first ARMv7 support is merged in. Where can I 
find more info about the PL310 interface?

> - Several of the registers needed to enable PL310 event bus are Trustzone
> protected.  This will lead to some messiness in getting at them through
> monitor mode proxies.
Mmh security related code might be missing here. More investigation is needed.

> - A9 is missing a few events at core level and all l2 events which come up
> through pl310 regs.
Ok.

> - A9 has a few more event counter instances over A8. This grows some
> registers in expected way.
The latest code supports up to 1+31 events as defined in the A8 and A9 specs. 
The number of counters is dynamically read at Perf Events init.

> One bit I didn't get clear on was if any entity was trying to account for
> per-core stats at the shared PL310 level.  Each core can give stats in
> familiar manner but association of both cores with common pl310 is not
> clear.
Ok it wouls be nice to have all the details.

> Regards,
> Richard W.

Thanks & regards,
Jean

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Perf Event support for ARMv7 (was: Re: [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6)
  2010-01-15 15:30                             ` Jean Pihet
@ 2010-01-15 15:39                               ` Jamie Iles
  2010-01-15 15:43                                 ` Jean Pihet
  2010-01-20 13:40                               ` Will Deacon
  1 sibling, 1 reply; 55+ messages in thread
From: Jamie Iles @ 2010-01-15 15:39 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Jan 15, 2010 at 04:30:06PM +0100, Jean Pihet wrote:
> Hi Will,
> 
> Thanks for the answers. Here are the remarks.
> 
> Jamie, there is a remark at the very end about the cpuid detection. Can you 
> check?
[snip]
> > It might also be 
> > worth adding a cpu_architecture() check to the v6 test just in case a
> > v7 core conflicts with the mask.
> Jamie, what do you think?
Hmm, I'm not sure. I thought that cpu_architecture() returned the MMU
architecture of the core. See the thread 'question about ARM11MP cpu
architecture identification' for more details. Certainly on my ARM1176 Linux
reports it as a v7 on bootup so this check would be bad...

Jamie

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Perf Event support for ARMv7 (was: Re: [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6)
  2010-01-15 15:39                               ` Jamie Iles
@ 2010-01-15 15:43                                 ` Jean Pihet
  2010-01-15 15:49                                   ` Jamie Iles
  0 siblings, 1 reply; 55+ messages in thread
From: Jean Pihet @ 2010-01-15 15:43 UTC (permalink / raw)
  To: linux-arm-kernel

On Friday 15 January 2010 16:39:41 Jamie Iles wrote:
> On Fri, Jan 15, 2010 at 04:30:06PM +0100, Jean Pihet wrote:
> > Hi Will,
> >
> > Thanks for the answers. Here are the remarks.
> >
> > Jamie, there is a remark at the very end about the cpuid detection. Can
> > you check?
>
> [snip]
>
> > > It might also be
> > > worth adding a cpu_architecture() check to the v6 test just in case a
> > > v7 core conflicts with the mask.
> >
> > Jamie, what do you think?
>
> Hmm, I'm not sure. I thought that cpu_architecture() returned the MMU
> architecture of the core. See the thread 'question about ARM11MP cpu
> architecture identification' for more details. Certainly on my ARM1176
> Linux reports it as a v7 on bootup so this check would be bad...
Correct! That is why I was asking. So the tests are working for v6 and v7 
because of the code ordering (need to check v6 first, then v7 variants). Is 
it ok to keep the code as is?

>
> Jamie

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Perf Event support for ARMv7 (was: Re: [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6)
  2010-01-15 15:43                                 ` Jean Pihet
@ 2010-01-15 15:49                                   ` Jamie Iles
  0 siblings, 0 replies; 55+ messages in thread
From: Jamie Iles @ 2010-01-15 15:49 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Jan 15, 2010 at 04:43:39PM +0100, Jean Pihet wrote:
> On Friday 15 January 2010 16:39:41 Jamie Iles wrote:
> > On Fri, Jan 15, 2010 at 04:30:06PM +0100, Jean Pihet wrote:
> > > Hi Will,
> > >
> > > Thanks for the answers. Here are the remarks.
> > >
> > > Jamie, there is a remark at the very end about the cpuid detection. Can
> > > you check?
> >
> > [snip]
> >
> > > > It might also be
> > > > worth adding a cpu_architecture() check to the v6 test just in case a
> > > > v7 core conflicts with the mask.
> > >
> > > Jamie, what do you think?
> >
> > Hmm, I'm not sure. I thought that cpu_architecture() returned the MMU
> > architecture of the core. See the thread 'question about ARM11MP cpu
> > architecture identification' for more details. Certainly on my ARM1176
> > Linux reports it as a v7 on bootup so this check would be bad...
> Correct! That is why I was asking. So the tests are working for v6 and v7 
> because of the code ordering (need to check v6 first, then v7 variants). Is 
> it ok to keep the code as is?
That's fine with me.

Jamie

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Perf Event support for ARMv7 (was: Re: [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6)
  2010-01-15 15:30                             ` Jean Pihet
  2010-01-15 15:39                               ` Jamie Iles
@ 2010-01-20 13:40                               ` Will Deacon
  1 sibling, 0 replies; 55+ messages in thread
From: Will Deacon @ 2010-01-20 13:40 UTC (permalink / raw)
  To: linux-arm-kernel

Hi Jean,

Sorry for the delay in getting back to you, I've had a few technical
problems with my machine. Anyway, here we go:

* Jean Pihet wrote:
<snip>
> > 0x0c is HW_BRANCH_INSTRUCTIONS and 0x10 is HW_BRANCH_MISSES.
> > 0x12 is the number of predictable branch instructions executed, so the
> > mispredict rate is 0x10/0x12. These events are defined for v7, so A8 should
> > take these definitions too.
> From the spec I read 0x0c is 'SW write of the PC', is that equivalent to
> HW_BRANCH_INSTRUCTIONS?

This event counts:
	- All branch instructions
	- Instructions that explicitly write the PC
	- Exception generating instructions

I think this is suitable for HW_BRANCH_INSTRUCTIONS, but if anybody feels
differently then maybe we should reconsider.

> For A8 I am using:
> - ARMV7_PERFCTR_PC_BRANCH_TAKEN (0x53),
> - ARMV7_PERFCTR_PC_BRANCH_FAILED (0x52)
> 
> For A9 it is unsupported for now.
> 
> Do you think I should use 0x0c and 0x10 for both A8 and A9? How to get the
> accesses and misses count directly?

I think we should define the `standard' set (i.e. those that perf supports by
name) using the v7 events, so in this case then use 0x0c and 0x10 for both A8
and A9. The core-specific definitions can then always be accessed as raw events.
As I mentioned, I think this is important if people decide to compare the counts
between two cores.

> > We could use 0x01 for icache miss, 0x03 for dcache miss and 0x04 for dcache
> > access.
> Ok changed to the following. Is that correct?
> Note that A8 uses specific events for I cache in order to make them comparable
> to each other. I cache miss could use 0x01 also. Cf. remark below for more.
> 
> Cortex-A8:
> - D cache access: ARMV7_PERFCTR_DCACHE_ACCESS (0x04),
> - D cache miss: ARMV7_PERFCTR_DCACHE_REFILL (0x03) instead of
> ARMV7_PERFCTR_L1_DATA_MISS (0x49),
> - I cache access: ARMV7_PERFCTR_L1_DATA_MISS (0x50),
> - I cache miss: ARMV7_PERFCTR_L1_INST_MISS (0x4a).
> 
> Cortex-A9:
> - D cache access: ARMV7_PERFCTR_DCACHE_ACCESS (0x04),
> - D cache miss: ARMV7_PERFCTR_DCACHE_REFILL (0x03),
> - I cache access: Not supported,
> - I cache miss: ARMV7_PERFCTR_IFETCH_MISS (0x01).

Hmm, this is an interesting one. I suppose comparison between events on a given
core (i.e. A8) is preferable, so I agree with you here. Due to the lack of I-cache
access events on A9, there's nothing we can do to get a fair cross-core comparison.
[minor note: You've called the I-cache access event ARMV7_PERFCTR_L1_DATA_MISS!]
 
> > > +	[C(L1I)] = {
> > > +		[C(OP_READ)] = {
> > > +			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_INST,
> > > +			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_INST_MISS,
> > > +		},
> > > +		[C(OP_WRITE)] = {
> > > +			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_INST,
> > > +			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_INST_MISS,
> > > +		},
> > > +		[C(OP_PREFETCH)] = {
> > > +			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
> > > +			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
> > > +		},
> > > +	},
> >
> > Same thing here. I'd suggest using 0x01 instead of 0x4a.
> Ok is it preferred to keep the ARMV7_PERFCTR_L1_ events for both accesses and
> misses in order to make the events counts comparable to each other? On the
> other end using 0x01 allows the comparison between A8 and A9.
> I am OK to change it, just let me know.

After thinking about this above, I agree with you; let's use the
ARMV7_PERFCTR_L1_ events to allow for event comparisons on the A8. Comparing with
an A9 is a non-starter because the I-cache accesses can't be counted there.

> > > +/*
> > > + * Available counters
> > > + */
> > > +#define ARMV7_CNT0 		0	/* First event counter */
> > > +#define ARMV7_CCNT 		31	/* Cycle counter */
> > > +
> > > +#define ARMV7_A8_CNTMAX		5	/* Cortex-A8: up to 4 counters + CCNT */
> > > +#define ARMV7_A9_CNTMAX		32	/* Cortex-A9: up to 31 counters + CCNT*/
> >
> > Actually, A9 has a maximum number of 6 event counters + CCNT.
> Cf. remark above. The code is generic enough and supports up to the 1+31
> events as defined in the A8 and A9 TRMs. The number of counters is
> dynamically read from the PMNC registers. Should that be compared against the
> given maximum (1+4 for A8, 1+6 for A9)? That looks like overkill.

Sure, I was just referring to ARMV7_A9_CNTMAX being artificially high.
You'll never see more than 6 event counters on an A9.

> > It might also be
> > worth adding a cpu_architecture() check to the v6 test just in case a
> > v7 core conflicts with the mask.
> Jamie, what do you think?

I forgot that looked at the MMU. Oh well, the ordering will have to matter.

Cheers,

Will

^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH 1/5] arm: provide a mechanism to reserve performance counters
  2010-01-14 12:14 ` [PATCH 1/5] arm: provide a mechanism to reserve performance counters Jamie Iles
@ 2010-01-21  9:30   ` Jamie Iles
  0 siblings, 0 replies; 55+ messages in thread
From: Jamie Iles @ 2010-01-21  9:30 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Jan 14, 2010 at 12:14:12PM +0000, Jamie Iles wrote:
> To add support for perf events and to allow the hardware
> counters to be shared with oprofile, we need a way to reserve
> access to the pmu (performance monitor unit).
[snip]
> diff --git a/arch/arm/kernel/pmu.c b/arch/arm/kernel/pmu.c
> new file mode 100644
> index 0000000..688d450
> --- /dev/null
> +++ b/arch/arm/kernel/pmu.c
> @@ -0,0 +1,105 @@
> +/*
> + *  linux/arch/arm/kernel/pmu.c
> + *
> + *  Copyright (C) 2009 picoChip Designs Ltd, Jamie Iles
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + */
> +
> +#include <linux/cpumask.h>
> +#include <linux/err.h>
> +#include <linux/interrupt.h>
> +#include <linux/kernel.h>
> +#include <linux/module.h>
> +
> +#include <asm/pmu.h>
> +
> +/*
> + * Define the IRQs for the system. We could use something like a platform
> + * device but that seems fairly heavyweight for this. Also, the performance
> + * counters can't be removed or hotplugged.
> + *
> + * Ordering is important: init_pmu() will use the ordering to set the affinity
> + * to the corresponding core. e.g. the first interrupt will go to cpu 0, the
> + * second goes to cpu 1 etc.
> + */
> +static const int irqs[] = {
> +#ifdef CONFIG_ARCH_PC3XX
> +	IRQ_NPMUIRQ,
> +#elif defined(CONFIG_ARCH_OMAP2)
This one (ARCH_PC3XX) shouldn't have made it in here as the platform isn't in
mainline. I'll remove this before submitting the patch.

Jamie

^ permalink raw reply	[flat|nested] 55+ messages in thread

end of thread, other threads:[~2010-01-21  9:30 UTC | newest]

Thread overview: 55+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-12-15 11:15 ARMv6 performance counters v3 Jamie Iles
2009-12-15 11:15 ` [PATCH 1/5] arm: provide a mechanism to reserve performance counters Jamie Iles
2009-12-15 11:15   ` [PATCH 2/5] arm/oprofile: reserve the PMU when starting Jamie Iles
2009-12-15 11:15     ` [PATCH 3/5] arm: use the spinlocked, generic atomic64 support Jamie Iles
2009-12-15 11:15       ` [PATCH 4/5] arm: enable support for software perf events Jamie Iles
2009-12-15 11:15         ` [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6 Jamie Iles
2009-12-15 14:29           ` Will Deacon
2009-12-15 15:02             ` Jamie Iles
2009-12-15 15:05               ` Will Deacon
2009-12-15 15:19                 ` Jamie Iles
2009-12-15 15:30                   ` Peter Zijlstra
2009-12-15 15:36                     ` Jamie Iles
2009-12-16 10:54                       ` Jamie Iles
2009-12-16 11:04                         ` Will Deacon
2009-12-16 11:19                           ` Jamie Iles
2009-12-18 17:05           ` Perf Event support for ARMv7 (was: Re: [PATCH 5/5] arm/perfevents: implement perf event support for ARMv6) Jean Pihet
2009-12-19 10:29             ` Jamie Iles
2009-12-19 10:53               ` Ingo Molnar
2009-12-21 11:32                 ` Jean Pihet
2009-12-21 11:29               ` Jean Pihet
2009-12-21 11:04             ` Will Deacon
2009-12-21 11:43               ` Jean Pihet
2009-12-21 12:10                 ` Will Deacon
2009-12-21 12:43                   ` Jamie Iles
2009-12-21 13:35                     ` Jean Pihet
2009-12-22 16:51                       ` Jean Pihet
2009-12-28  7:57                         ` Ingo Molnar
2009-12-29 13:52                           ` Jean Pihet
2009-12-29 16:32                             ` Jamie Iles
2010-01-06 15:16                               ` Michał Nazarewicz
2010-01-06 15:30                                 ` Jamie Iles
2010-01-07 17:02                                   ` Michał Nazarewicz
2009-12-29 13:58                         ` Jean Pihet
2010-01-04 16:52                           ` Will Deacon
2010-01-15 15:30                             ` Jean Pihet
2010-01-15 15:39                               ` Jamie Iles
2010-01-15 15:43                                 ` Jean Pihet
2010-01-15 15:49                                   ` Jamie Iles
2010-01-20 13:40                               ` Will Deacon
2010-01-08 22:17                         ` Woodruff, Richard
2010-01-15 15:34                           ` Jean Pihet
2009-12-15 14:13   ` [PATCH 1/5] arm: provide a mechanism to reserve performance counters Will Deacon
2009-12-15 14:36     ` Jamie Iles
2009-12-15 17:06       ` Will Deacon
2009-12-17 16:14   ` Will Deacon
2009-12-17 16:27     ` Jamie Iles
  -- strict thread matches above, loose matches on Subject: below --
2010-01-14 12:14 ARM perf events support v5 Jamie Iles
2010-01-14 12:14 ` [PATCH 1/5] arm: provide a mechanism to reserve performance counters Jamie Iles
2010-01-21  9:30   ` Jamie Iles
2010-01-04 10:48 ARM perf events support v4 Jamie Iles
2010-01-04 10:48 ` [PATCH 1/5] arm: provide a mechanism to reserve performance counters Jamie Iles
2010-01-06 12:00   ` Michał Nazarewicz
2010-01-06 12:15     ` Jamie Iles
2009-12-14 14:04 ARMv6 performance counters v2 Jamie Iles
2009-12-14 14:04 ` [PATCH 1/5] arm: provide a mechanism to reserve performance counters Jamie Iles
2009-12-14 14:39   ` Will Deacon
2009-12-14 15:03     ` Jamie Iles
2009-12-14 16:01   ` Jean Pihet

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).