* [PATCH] ARM: Improve the L2 cache performance when PL310 is used
@ 2010-03-09 10:14 Catalin Marinas
2010-03-09 10:30 ` Colin Tuckley
2010-03-13 13:34 ` Shilimkar, Santosh
0 siblings, 2 replies; 8+ messages in thread
From: Catalin Marinas @ 2010-03-09 10:14 UTC (permalink / raw)
To: linux-arm-kernel
With this L2 cache controller, the cache maintenance by PA and sync
operations are atomic and do not require a "wait" loop or spinlocks.
This patch conditionally defines the cache_wait() function and locking
primitives (rather than duplicating the functions or file).
Since L2x0 cache controllers do not work with ARMv7 CPUs, the patch
automatically enables CACHE_PL310 when CPU_V7 is defined.
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Santosh Shilimkar <santosh.shilimkar@ti.com>
Cc: Russell King <rmk@arm.linux.org.uk>
---
We did some benchmarks and the performance benefit of this patch with a
PL310 cache controller is considerable.
I also considered separate functions in the same file or a separate file
but this would mean having to move the TI's workaround to a new file as
well. Suggestions welcome.
arch/arm/mm/Kconfig | 7 +++++
arch/arm/mm/cache-l2x0.c | 70 ++++++++++++++++++++++++++++++++--------------
2 files changed, 56 insertions(+), 21 deletions(-)
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
index ef61c93..0a59071 100644
--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -781,6 +781,13 @@ config CACHE_L2X0
help
This option enables the L2x0 PrimeCell.
+config CACHE_PL310
+ bool
+ depends on CACHE_L2X0
+ default y if CPU_V7
+ help
+ This option enables support for the PL310 cache controller.
+
config CACHE_TAUROS2
bool "Enable the Tauros2 L2 cache controller"
depends on ARCH_DOVE
diff --git a/arch/arm/mm/cache-l2x0.c b/arch/arm/mm/cache-l2x0.c
index 9fc01b4..2a20d95 100644
--- a/arch/arm/mm/cache-l2x0.c
+++ b/arch/arm/mm/cache-l2x0.c
@@ -26,9 +26,21 @@
#define CACHE_LINE_SIZE 32
static void __iomem *l2x0_base;
-static DEFINE_SPINLOCK(l2x0_lock);
bool l2x0_disabled;
+#ifdef CONFIG_CACHE_PL310
+static inline void cache_wait(void __iomem *reg, unsigned long mask)
+{
+ /* cache operations are atomic */
+}
+
+#define l2x0_lock(lock, flags) ((void)(flags))
+#define l2x0_unlock(lock, flags) ((void)(flags))
+
+#define block_end(start, end) (end)
+
+#define L2CC_TYPE "PL310/L2C-310"
+#else
static inline void cache_wait(void __iomem *reg, unsigned long mask)
{
/* wait for the operation to complete */
@@ -36,6 +48,22 @@ static inline void cache_wait(void __iomem *reg, unsigned long mask)
;
}
+static DEFINE_SPINLOCK(l2x0_lock);
+#define l2x0_lock(lock, flags) spin_lock_irqsave(lock, flags)
+#define l2x0_unlock(lock, flags) spin_unlock_irqrestore(lock, flags)
+
+#define block_end(start, end) ((start) + min((end) - (start), 4096UL))
+
+#define L2CC_TYPE "L2x0"
+#endif
+
+static inline void cache_wait_always(void __iomem *reg, unsigned long mask)
+{
+ /* wait for the operation to complete */
+ while (readl(reg) & mask)
+ ;
+}
+
static inline void cache_sync(void)
{
void __iomem *base = l2x0_base;
@@ -99,11 +127,11 @@ static inline void l2x0_inv_all(void)
unsigned long flags;
/* invalidate all ways */
- spin_lock_irqsave(&l2x0_lock, flags);
+ l2x0_lock(&l2x0_lock, flags);
writel(0xff, l2x0_base + L2X0_INV_WAY);
- cache_wait(l2x0_base + L2X0_INV_WAY, 0xff);
+ cache_wait_always(l2x0_base + L2X0_INV_WAY, 0xff);
cache_sync();
- spin_unlock_irqrestore(&l2x0_lock, flags);
+ l2x0_unlock(&l2x0_lock, flags);
}
static void l2x0_inv_range(unsigned long start, unsigned long end)
@@ -111,7 +139,7 @@ static void l2x0_inv_range(unsigned long start, unsigned long end)
void __iomem *base = l2x0_base;
unsigned long flags;
- spin_lock_irqsave(&l2x0_lock, flags);
+ l2x0_lock(&l2x0_lock, flags);
if (start & (CACHE_LINE_SIZE - 1)) {
start &= ~(CACHE_LINE_SIZE - 1);
debug_writel(0x03);
@@ -128,7 +156,7 @@ static void l2x0_inv_range(unsigned long start, unsigned long end)
}
while (start < end) {
- unsigned long blk_end = start + min(end - start, 4096UL);
+ unsigned long blk_end = block_end(start, end);
while (start < blk_end) {
l2x0_inv_line(start);
@@ -136,13 +164,13 @@ static void l2x0_inv_range(unsigned long start, unsigned long end)
}
if (blk_end < end) {
- spin_unlock_irqrestore(&l2x0_lock, flags);
- spin_lock_irqsave(&l2x0_lock, flags);
+ l2x0_unlock(&l2x0_lock, flags);
+ l2x0_lock(&l2x0_lock, flags);
}
}
cache_wait(base + L2X0_INV_LINE_PA, 1);
cache_sync();
- spin_unlock_irqrestore(&l2x0_lock, flags);
+ l2x0_unlock(&l2x0_lock, flags);
}
static void l2x0_clean_range(unsigned long start, unsigned long end)
@@ -150,10 +178,10 @@ static void l2x0_clean_range(unsigned long start, unsigned long end)
void __iomem *base = l2x0_base;
unsigned long flags;
- spin_lock_irqsave(&l2x0_lock, flags);
+ l2x0_lock(&l2x0_lock, flags);
start &= ~(CACHE_LINE_SIZE - 1);
while (start < end) {
- unsigned long blk_end = start + min(end - start, 4096UL);
+ unsigned long blk_end = block_end(start, end);
while (start < blk_end) {
l2x0_clean_line(start);
@@ -161,13 +189,13 @@ static void l2x0_clean_range(unsigned long start, unsigned long end)
}
if (blk_end < end) {
- spin_unlock_irqrestore(&l2x0_lock, flags);
- spin_lock_irqsave(&l2x0_lock, flags);
+ l2x0_unlock(&l2x0_lock, flags);
+ l2x0_lock(&l2x0_lock, flags);
}
}
cache_wait(base + L2X0_CLEAN_LINE_PA, 1);
cache_sync();
- spin_unlock_irqrestore(&l2x0_lock, flags);
+ l2x0_unlock(&l2x0_lock, flags);
}
static void l2x0_flush_range(unsigned long start, unsigned long end)
@@ -175,10 +203,10 @@ static void l2x0_flush_range(unsigned long start, unsigned long end)
void __iomem *base = l2x0_base;
unsigned long flags;
- spin_lock_irqsave(&l2x0_lock, flags);
+ l2x0_lock(&l2x0_lock, flags);
start &= ~(CACHE_LINE_SIZE - 1);
while (start < end) {
- unsigned long blk_end = start + min(end - start, 4096UL);
+ unsigned long blk_end = block_end(start, end);
debug_writel(0x03);
while (start < blk_end) {
@@ -188,13 +216,13 @@ static void l2x0_flush_range(unsigned long start, unsigned long end)
debug_writel(0x00);
if (blk_end < end) {
- spin_unlock_irqrestore(&l2x0_lock, flags);
- spin_lock_irqsave(&l2x0_lock, flags);
+ l2x0_unlock(&l2x0_lock, flags);
+ l2x0_lock(&l2x0_lock, flags);
}
}
cache_wait(base + L2X0_CLEAN_INV_LINE_PA, 1);
cache_sync();
- spin_unlock_irqrestore(&l2x0_lock, flags);
+ l2x0_unlock(&l2x0_lock, flags);
}
void __init l2x0_init(void __iomem *base, __u32 aux_val, __u32 aux_mask)
@@ -202,7 +230,7 @@ void __init l2x0_init(void __iomem *base, __u32 aux_val, __u32 aux_mask)
__u32 aux;
if (l2x0_disabled) {
- printk(KERN_INFO "L2X0 cache controller disabled\n");
+ pr_info(L2CC_TYPE " cache controller disabled\n");
return;
}
@@ -232,7 +260,7 @@ void __init l2x0_init(void __iomem *base, __u32 aux_val, __u32 aux_mask)
outer_cache.clean_range = l2x0_clean_range;
outer_cache.flush_range = l2x0_flush_range;
- printk(KERN_INFO "L2X0 cache controller enabled\n");
+ pr_info(L2CC_TYPE " cache controller enabled\n");
}
static int __init l2x0_disable(char *unused)
^ permalink raw reply related [flat|nested] 8+ messages in thread* [PATCH] ARM: Improve the L2 cache performance when PL310 is used
2010-03-09 10:14 [PATCH] ARM: Improve the L2 cache performance when PL310 is used Catalin Marinas
@ 2010-03-09 10:30 ` Colin Tuckley
2010-03-09 10:52 ` Catalin Marinas
2010-03-13 13:34 ` Shilimkar, Santosh
1 sibling, 1 reply; 8+ messages in thread
From: Colin Tuckley @ 2010-03-09 10:30 UTC (permalink / raw)
To: linux-arm-kernel
> -----Original Message-----
> From: linux-arm-kernel-bounces at lists.infradead.org [mailto:linux-arm-
> kernel-bounces at lists.infradead.org] On Behalf Of Catalin Marinas
> With this L2 cache controller, the cache maintenance by PA and sync
> operations are atomic and do not require a "wait" loop or spinlocks.
> This patch conditionally defines the cache_wait() function and locking
> primitives (rather than duplicating the functions or file).
>
> Since L2x0 cache controllers do not work with ARMv7 CPUs, the patch
> automatically enables CACHE_PL310 when CPU_V7 is defined.
That will cause a problem with A8 CPUs which are V7 but which do *not* use a
PL310 for the L2 cache.
Colin
^ permalink raw reply [flat|nested] 8+ messages in thread
* [PATCH] ARM: Improve the L2 cache performance when PL310 is used
2010-03-09 10:30 ` Colin Tuckley
@ 2010-03-09 10:52 ` Catalin Marinas
2010-03-09 11:07 ` Colin Tuckley
0 siblings, 1 reply; 8+ messages in thread
From: Catalin Marinas @ 2010-03-09 10:52 UTC (permalink / raw)
To: linux-arm-kernel
On Tue, 2010-03-09 at 10:30 +0000, Colin Tuckley wrote:
> > -----Original Message-----
> > From: linux-arm-kernel-bounces at lists.infradead.org [mailto:linux-arm-
> > kernel-bounces at lists.infradead.org] On Behalf Of Catalin Marinas
>
>
> > With this L2 cache controller, the cache maintenance by PA and sync
> > operations are atomic and do not require a "wait" loop or spinlocks.
> > This patch conditionally defines the cache_wait() function and locking
> > primitives (rather than duplicating the functions or file).
> >
> > Since L2x0 cache controllers do not work with ARMv7 CPUs, the patch
> > automatically enables CACHE_PL310 when CPU_V7 is defined.
>
> That will cause a problem with A8 CPUs which are V7 but which do *not* use a
> PL310 for the L2 cache.
But do they use an L220? Or they don't use anything?
Note that CACHE_PL310 depends on CACHE_L2X0, so if you don't have an
outer cache controller, you don't get CACHE_PL310 enabled.
--
Catalin
^ permalink raw reply [flat|nested] 8+ messages in thread
* [PATCH] ARM: Improve the L2 cache performance when PL310 is used
2010-03-09 10:52 ` Catalin Marinas
@ 2010-03-09 11:07 ` Colin Tuckley
2010-03-09 11:28 ` Shilimkar, Santosh
0 siblings, 1 reply; 8+ messages in thread
From: Colin Tuckley @ 2010-03-09 11:07 UTC (permalink / raw)
To: linux-arm-kernel
> -----Original Message-----
> From: Catalin Marinas
> But do they use an L220? Or they don't use anything?
The A8 has its own L2 cache design; the chip used on the PBA8 has a bit tacked on that makes it look a bit like an L220. I'm also told that it's possible to implement an A8 without a level-two cache.
I think this is a case of reading the docs or maybe raising an ARM support ticket.
Colin
--
Colin Tuckley - ARM Ltd.
110 Fulbourn Rd
Cambridge, CB1 9NJ
Tel: +44 1223 400536
^ permalink raw reply [flat|nested] 8+ messages in thread
* [PATCH] ARM: Improve the L2 cache performance when PL310 is used
2010-03-09 11:07 ` Colin Tuckley
@ 2010-03-09 11:28 ` Shilimkar, Santosh
0 siblings, 0 replies; 8+ messages in thread
From: Shilimkar, Santosh @ 2010-03-09 11:28 UTC (permalink / raw)
To: linux-arm-kernel
> -----Original Message-----
> From: linux-arm-kernel-bounces at lists.infradead.org [mailto:linux-arm-kernel-
> bounces at lists.infradead.org] On Behalf Of Colin Tuckley
> Sent: Tuesday, March 09, 2010 4:38 PM
> To: linux-arm-kernel at lists.infradead.org
> Subject: RE: [PATCH] ARM: Improve the L2 cache performance when PL310 is used
>
> > -----Original Message-----
> > From: Catalin Marinas
>
> > But do they use an L220? Or they don't use anything?
>
> The A8 has it's own L2 cache design, the chip used on the PBA8 has a bit tacked on that makes it look
> a bit like a 220. I'm also told that it's possible to implement an A8 without a level two cache.
>
> I think this is a case of reading the docs or maybe raising an Arm-support ticket.
>
Just as an example,
both OMAP3 and OMAP4 are based on ARMv7. On OMAP3 there is a Cortex-A8 with an internal
L2 cache, whereas OMAP4 has a Cortex-A9 with a PL310 as the external L2. So the below
approach from Catalin should work, at least for the v7-based OMAPs, for sure.
+config CACHE_PL310
+ bool
+ depends on CACHE_L2X0
+ default y if CPU_V7
+ help
+ This option enables support for the PL310 cache controller.
+
OMAP3 -- "CACHE_PL310" won't be enabled because "CACHE_L2X0" isn't enabled.
OMAP4 -- PL310 is enabled because "CACHE_L2X0" is enabled
Regards,
Santosh
^ permalink raw reply [flat|nested] 8+ messages in thread
* [PATCH] ARM: Improve the L2 cache performance when PL310 is used
2010-03-09 10:14 [PATCH] ARM: Improve the L2 cache performance when PL310 is used Catalin Marinas
2010-03-09 10:30 ` Colin Tuckley
@ 2010-03-13 13:34 ` Shilimkar, Santosh
2011-02-15 11:29 ` Srinidhi Kasagar
1 sibling, 1 reply; 8+ messages in thread
From: Shilimkar, Santosh @ 2010-03-13 13:34 UTC (permalink / raw)
To: linux-arm-kernel
> -----Original Message-----
> From: Catalin Marinas [mailto:catalin.marinas at arm.com]
> Sent: Tuesday, March 09, 2010 3:45 PM
> To: linux-arm-kernel at lists.infradead.org
> Cc: Shilimkar, Santosh; Russell King
> Subject: [PATCH] ARM: Improve the L2 cache performance when PL310 is used
>
> With this L2 cache controller, the cache maintenance by PA and sync
> operations are atomic and do not require a "wait" loop or spinlocks.
> This patch conditionally defines the cache_wait() function and locking
> primitives (rather than duplicating the functions or file).
>
> Since L2x0 cache controllers do not work with ARMv7 CPUs, the patch
> automatically enables CACHE_PL310 when CPU_V7 is defined.
>
> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Santosh Shilimkar <santosh.shilimkar@ti.com>
> Cc: Russell King <rmk@arm.linux.org.uk>
> ---
>
> We did some benchmarks and the performance benefit of this patch with a
> PL310 cache controller is considerable.
>
I tested this patch and indeed it improves the performance by
12 % to 28 % for buffers ranging from 1 MB to 10 MB.
> I also considered separate functions in the same file or a separate file
> but this would mean having to move the TI's workaround to a new file as
> well. Suggestions welcome.
>
>
> arch/arm/mm/Kconfig | 7 +++++
> arch/arm/mm/cache-l2x0.c | 70 ++++++++++++++++++++++++++++++++--------------
> 2 files changed, 56 insertions(+), 21 deletions(-)
>
> diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
> index ef61c93..0a59071 100644
> --- a/arch/arm/mm/Kconfig
> +++ b/arch/arm/mm/Kconfig
> @@ -781,6 +781,13 @@ config CACHE_L2X0
> help
> This option enables the L2x0 PrimeCell.
>
> +config CACHE_PL310
> + bool
> + depends on CACHE_L2X0
> + default y if CPU_V7
> + help
> + This option enables support for the PL310 cache controller.
> +
> config CACHE_TAUROS2
> bool "Enable the Tauros2 L2 cache controller"
> depends on ARCH_DOVE
> diff --git a/arch/arm/mm/cache-l2x0.c b/arch/arm/mm/cache-l2x0.c
> index 9fc01b4..2a20d95 100644
> --- a/arch/arm/mm/cache-l2x0.c
> +++ b/arch/arm/mm/cache-l2x0.c
> @@ -26,9 +26,21 @@
> #define CACHE_LINE_SIZE 32
>
> static void __iomem *l2x0_base;
> -static DEFINE_SPINLOCK(l2x0_lock);
> bool l2x0_disabled;
>
> +#ifdef CONFIG_CACHE_PL310
> +static inline void cache_wait(void __iomem *reg, unsigned long mask)
> +{
> + /* cache operations are atomic */
> +}
> +
> +#define l2x0_lock(lock, flags) ((void)(flags))
> +#define l2x0_unlock(lock, flags) ((void)(flags))
> +
> +#define block_end(start, end) (end)
> +
> +#define L2CC_TYPE "PL310/L2C-310"
> +#else
> static inline void cache_wait(void __iomem *reg, unsigned long mask)
> {
> /* wait for the operation to complete */
> @@ -36,6 +48,22 @@ static inline void cache_wait(void __iomem *reg, unsigned long mask)
> ;
> }
>
> +static DEFINE_SPINLOCK(l2x0_lock);
> +#define l2x0_lock(lock, flags) spin_lock_irqsave(lock, flags)
> +#define l2x0_unlock(lock, flags) spin_unlock_irqrestore(lock, flags)
> +
> +#define block_end(start, end) ((start) + min((end) - (start), 4096UL))
> +
> +#define L2CC_TYPE "L2x0"
> +#endif
> +
> +static inline void cache_wait_always(void __iomem *reg, unsigned long mask)
> +{
> + /* wait for the operation to complete */
> + while (readl(reg) & mask)
> + ;
> +}
> +
> static inline void cache_sync(void)
> {
> void __iomem *base = l2x0_base;
> @@ -99,11 +127,11 @@ static inline void l2x0_inv_all(void)
> unsigned long flags;
>
> /* invalidate all ways */
> - spin_lock_irqsave(&l2x0_lock, flags);
> + l2x0_lock(&l2x0_lock, flags);
> writel(0xff, l2x0_base + L2X0_INV_WAY);
> - cache_wait(l2x0_base + L2X0_INV_WAY, 0xff);
> + cache_wait_always(l2x0_base + L2X0_INV_WAY, 0xff);
> cache_sync();
> - spin_unlock_irqrestore(&l2x0_lock, flags);
> + l2x0_unlock(&l2x0_lock, flags);
> }
>
> static void l2x0_inv_range(unsigned long start, unsigned long end)
> @@ -111,7 +139,7 @@ static void l2x0_inv_range(unsigned long start, unsigned long end)
> void __iomem *base = l2x0_base;
> unsigned long flags;
>
> - spin_lock_irqsave(&l2x0_lock, flags);
> + l2x0_lock(&l2x0_lock, flags);
> if (start & (CACHE_LINE_SIZE - 1)) {
> start &= ~(CACHE_LINE_SIZE - 1);
> debug_writel(0x03);
> @@ -128,7 +156,7 @@ static void l2x0_inv_range(unsigned long start, unsigned long end)
> }
>
> while (start < end) {
> - unsigned long blk_end = start + min(end - start, 4096UL);
> + unsigned long blk_end = block_end(start, end);
>
> while (start < blk_end) {
> l2x0_inv_line(start);
> @@ -136,13 +164,13 @@ static void l2x0_inv_range(unsigned long start, unsigned long end)
> }
>
> if (blk_end < end) {
> - spin_unlock_irqrestore(&l2x0_lock, flags);
> - spin_lock_irqsave(&l2x0_lock, flags);
> + l2x0_unlock(&l2x0_lock, flags);
> + l2x0_lock(&l2x0_lock, flags);
> }
> }
> cache_wait(base + L2X0_INV_LINE_PA, 1);
> cache_sync();
> - spin_unlock_irqrestore(&l2x0_lock, flags);
> + l2x0_unlock(&l2x0_lock, flags);
> }
>
> static void l2x0_clean_range(unsigned long start, unsigned long end)
> @@ -150,10 +178,10 @@ static void l2x0_clean_range(unsigned long start, unsigned long end)
> void __iomem *base = l2x0_base;
> unsigned long flags;
>
> - spin_lock_irqsave(&l2x0_lock, flags);
> + l2x0_lock(&l2x0_lock, flags);
> start &= ~(CACHE_LINE_SIZE - 1);
> while (start < end) {
> - unsigned long blk_end = start + min(end - start, 4096UL);
> + unsigned long blk_end = block_end(start, end);
>
> while (start < blk_end) {
> l2x0_clean_line(start);
> @@ -161,13 +189,13 @@ static void l2x0_clean_range(unsigned long start, unsigned long end)
> }
>
> if (blk_end < end) {
> - spin_unlock_irqrestore(&l2x0_lock, flags);
> - spin_lock_irqsave(&l2x0_lock, flags);
> + l2x0_unlock(&l2x0_lock, flags);
> + l2x0_lock(&l2x0_lock, flags);
> }
> }
> cache_wait(base + L2X0_CLEAN_LINE_PA, 1);
> cache_sync();
> - spin_unlock_irqrestore(&l2x0_lock, flags);
> + l2x0_unlock(&l2x0_lock, flags);
> }
>
> static void l2x0_flush_range(unsigned long start, unsigned long end)
> @@ -175,10 +203,10 @@ static void l2x0_flush_range(unsigned long start, unsigned long end)
> void __iomem *base = l2x0_base;
> unsigned long flags;
>
> - spin_lock_irqsave(&l2x0_lock, flags);
> + l2x0_lock(&l2x0_lock, flags);
> start &= ~(CACHE_LINE_SIZE - 1);
> while (start < end) {
> - unsigned long blk_end = start + min(end - start, 4096UL);
> + unsigned long blk_end = block_end(start, end);
>
> debug_writel(0x03);
> while (start < blk_end) {
> @@ -188,13 +216,13 @@ static void l2x0_flush_range(unsigned long start, unsigned long end)
> debug_writel(0x00);
>
> if (blk_end < end) {
> - spin_unlock_irqrestore(&l2x0_lock, flags);
> - spin_lock_irqsave(&l2x0_lock, flags);
> + l2x0_unlock(&l2x0_lock, flags);
> + l2x0_lock(&l2x0_lock, flags);
> }
> }
> cache_wait(base + L2X0_CLEAN_INV_LINE_PA, 1);
> cache_sync();
> - spin_unlock_irqrestore(&l2x0_lock, flags);
> + l2x0_unlock(&l2x0_lock, flags);
> }
>
> void __init l2x0_init(void __iomem *base, __u32 aux_val, __u32 aux_mask)
> @@ -202,7 +230,7 @@ void __init l2x0_init(void __iomem *base, __u32 aux_val, __u32 aux_mask)
> __u32 aux;
>
> if (l2x0_disabled) {
> - printk(KERN_INFO "L2X0 cache controller disabled\n");
> + pr_info(L2CC_TYPE " cache controller disabled\n");
> return;
> }
>
> @@ -232,7 +260,7 @@ void __init l2x0_init(void __iomem *base, __u32 aux_val, __u32 aux_mask)
> outer_cache.clean_range = l2x0_clean_range;
> outer_cache.flush_range = l2x0_flush_range;
>
> - printk(KERN_INFO "L2X0 cache controller enabled\n");
> + pr_info(L2CC_TYPE " cache controller enabled\n");
> }
>
> static int __init l2x0_disable(char *unused)
^ permalink raw reply [flat|nested] 8+ messages in thread* [PATCH] ARM: Improve the L2 cache performance when PL310 is used
2010-03-13 13:34 ` Shilimkar, Santosh
@ 2011-02-15 11:29 ` Srinidhi Kasagar
2011-02-15 16:43 ` Catalin Marinas
0 siblings, 1 reply; 8+ messages in thread
From: Srinidhi Kasagar @ 2011-02-15 11:29 UTC (permalink / raw)
To: linux-arm-kernel
Just curious to know: why does the spinlock surrounding
l2x0_cache_sync still exist? I see that
Catalin's first version adds a no-op lock for PL310,
as its operations are atomic.
Am I missing some discussion thread which I have
not noticed?
srinidhi
On Sat, Mar 13, 2010 at 7:04 PM, Shilimkar, Santosh
<santosh.shilimkar@ti.com> wrote:
>> -----Original Message-----
>> From: Catalin Marinas [mailto:catalin.marinas at arm.com]
>> Sent: Tuesday, March 09, 2010 3:45 PM
>> To: linux-arm-kernel at lists.infradead.org
>> Cc: Shilimkar, Santosh; Russell King
>> Subject: [PATCH] ARM: Improve the L2 cache performance when PL310 is used
>>
>> With this L2 cache controller, the cache maintenance by PA and sync
>> operations are atomic and do not require a "wait" loop or spinlocks.
>> This patch conditionally defines the cache_wait() function and locking
>> primitives (rather than duplicating the functions or file).
>>
>> Since L2x0 cache controllers do not work with ARMv7 CPUs, the patch
>> automatically enables CACHE_PL310 when CPU_V7 is defined.
>>
>> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
>> Cc: Santosh Shilimkar <santosh.shilimkar@ti.com>
>> Cc: Russell King <rmk@arm.linux.org.uk>
>> ---
>>
>> We did some benchmarks and the performance benefit of this patch with a
>> PL310 cache controller is considerable.
>>
> I tested this patch and indeed it improves the performance by
> 12 % to 28 % for buffers ranging from 1 MB to 10 MB.
>
>> I also considered separate functions in the same file or a separate file
>> but this would mean having to move the TI's workaround to a new file as
>> well. Suggestions welcome.
>>
>>
>> ?arch/arm/mm/Kconfig ? ? ?| ? ?7 +++++
>> ?arch/arm/mm/cache-l2x0.c | ? 70 ++++++++++++++++++++++++++++++++--------------
>> ?2 files changed, 56 insertions(+), 21 deletions(-)
>>
>> diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
>> index ef61c93..0a59071 100644
>> --- a/arch/arm/mm/Kconfig
>> +++ b/arch/arm/mm/Kconfig
>> @@ -781,6 +781,13 @@ config CACHE_L2X0
>> ? ? ? help
>> ? ? ? ? This option enables the L2x0 PrimeCell.
>>
>> +config CACHE_PL310
>> + ? ? bool
>> + ? ? depends on CACHE_L2X0
>> + ? ? default y if CPU_V7
>> + ? ? help
>> + ? ? ? This option enables support for the PL310 cache controller.
>> +
>> ?config CACHE_TAUROS2
>> ? ? ? bool "Enable the Tauros2 L2 cache controller"
>> ? ? ? depends on ARCH_DOVE
>> diff --git a/arch/arm/mm/cache-l2x0.c b/arch/arm/mm/cache-l2x0.c
>> index 9fc01b4..2a20d95 100644
>> --- a/arch/arm/mm/cache-l2x0.c
>> +++ b/arch/arm/mm/cache-l2x0.c
>> @@ -26,9 +26,21 @@
>> ?#define CACHE_LINE_SIZE ? ? ? ? ? ? ?32
>>
>> ?static void __iomem *l2x0_base;
>> -static DEFINE_SPINLOCK(l2x0_lock);
>> ?bool l2x0_disabled;
>>
>> +#ifdef CONFIG_CACHE_PL310
>> +static inline void cache_wait(void __iomem *reg, unsigned long mask)
>> +{
>> + ? ? /* cache operations are atomic */
>> +}
>> +
>> +#define l2x0_lock(lock, flags) ? ? ? ? ? ? ? ((void)(flags))
>> +#define l2x0_unlock(lock, flags) ? ? ((void)(flags))
>> +
>> +#define block_end(start, end) ? ? ? ? ? ? ? ?(end)
>> +
>> +#define L2CC_TYPE ? ? ? ? ? ? ? ? ? ?"PL310/L2C-310"
>> +#else
>> ?static inline void cache_wait(void __iomem *reg, unsigned long mask)
>> ?{
>> ? ? ? /* wait for the operation to complete */
>> @@ -36,6 +48,22 @@ static inline void cache_wait(void __iomem *reg, unsigned long mask)
>> ? ? ? ? ? ? ? ;
>> ?}
>>
>> +static DEFINE_SPINLOCK(l2x0_lock);
>> +#define l2x0_lock(lock, flags) ? ? ? ? ? ? ? spin_lock_irqsave(lock, flags)
>> +#define l2x0_unlock(lock, flags) ? ? spin_unlock_irqrestore(lock, flags)
>> +
>> +#define block_end(start, end) ? ? ? ? ? ? ? ?((start) + min((end) - (start), 4096UL))
>> +
>> +#define L2CC_TYPE ? ? ? ? ? ? ? ? ? ?"L2x0"
>> +#endif
>> +
>> +static inline void cache_wait_always(void __iomem *reg, unsigned long mask)
>> +{
>> + ? ? /* wait for the operation to complete */
>> + ? ? while (readl(reg) & mask)
>> + ? ? ? ? ? ? ;
>> +}
>> +
>> ?static inline void cache_sync(void)
>> ?{
>> ? ? ? void __iomem *base = l2x0_base;
>> @@ -99,11 +127,11 @@ static inline void l2x0_inv_all(void)
>> ? ? ? unsigned long flags;
>>
>> ? ? ? /* invalidate all ways */
>> - ? ? spin_lock_irqsave(&l2x0_lock, flags);
>> + ? ? l2x0_lock(&l2x0_lock, flags);
>> ? ? ? writel(0xff, l2x0_base + L2X0_INV_WAY);
>> - ? ? cache_wait(l2x0_base + L2X0_INV_WAY, 0xff);
>> + ? ? cache_wait_always(l2x0_base + L2X0_INV_WAY, 0xff);
>> ? ? ? cache_sync();
>> - ? ? spin_unlock_irqrestore(&l2x0_lock, flags);
>> + ? ? l2x0_unlock(&l2x0_lock, flags);
>> ?}
>>
>> ?static void l2x0_inv_range(unsigned long start, unsigned long end)
>> @@ -111,7 +139,7 @@ static void l2x0_inv_range(unsigned long start, unsigned long end)
>> ? ? ? void __iomem *base = l2x0_base;
>> ? ? ? unsigned long flags;
>>
>> - ? ? spin_lock_irqsave(&l2x0_lock, flags);
>> + ? ? l2x0_lock(&l2x0_lock, flags);
>> ? ? ? if (start & (CACHE_LINE_SIZE - 1)) {
>> ? ? ? ? ? ? ? start &= ~(CACHE_LINE_SIZE - 1);
>> ? ? ? ? ? ? ? debug_writel(0x03);
>> @@ -128,7 +156,7 @@ static void l2x0_inv_range(unsigned long start, unsigned long end)
>> ? ? ? }
>>
>> ? ? ? while (start < end) {
>> - ? ? ? ? ? ? unsigned long blk_end = start + min(end - start, 4096UL);
>> + ? ? ? ? ? ? unsigned long blk_end = block_end(start, end);
>>
>> ? ? ? ? ? ? ? while (start < blk_end) {
>> ? ? ? ? ? ? ? ? ? ? ? l2x0_inv_line(start);
>> @@ -136,13 +164,13 @@ static void l2x0_inv_range(unsigned long start, unsigned long end)
>> ? ? ? ? ? ? ? }
>>
>> ? ? ? ? ? ? ? if (blk_end < end) {
>> - ? ? ? ? ? ? ? ? ? ? spin_unlock_irqrestore(&l2x0_lock, flags);
>> - ? ? ? ? ? ? ? ? ? ? spin_lock_irqsave(&l2x0_lock, flags);
>> + ? ? ? ? ? ? ? ? ? ? l2x0_unlock(&l2x0_lock, flags);
>> + ? ? ? ? ? ? ? ? ? ? l2x0_lock(&l2x0_lock, flags);
>> ? ? ? ? ? ? ? }
>> ? ? ? }
>> ? ? ? cache_wait(base + L2X0_INV_LINE_PA, 1);
>> ? ? ? cache_sync();
>> - ? ? spin_unlock_irqrestore(&l2x0_lock, flags);
>> + ? ? l2x0_unlock(&l2x0_lock, flags);
>> ?}
>>
>> ?static void l2x0_clean_range(unsigned long start, unsigned long end)
>> @@ -150,10 +178,10 @@ static void l2x0_clean_range(unsigned long start, unsigned long end)
>> ? ? ? void __iomem *base = l2x0_base;
>> ? ? ? unsigned long flags;
>>
>> - ? ? spin_lock_irqsave(&l2x0_lock, flags);
>> + ? ? l2x0_lock(&l2x0_lock, flags);
>> ? ? ? start &= ~(CACHE_LINE_SIZE - 1);
>> ? ? ? while (start < end) {
>> - ? ? ? ? ? ? unsigned long blk_end = start + min(end - start, 4096UL);
>> + ? ? ? ? ? ? unsigned long blk_end = block_end(start, end);
>>
>> ? ? ? ? ? ? ? while (start < blk_end) {
>> ? ? ? ? ? ? ? ? ? ? ? l2x0_clean_line(start);
>> @@ -161,13 +189,13 @@ static void l2x0_clean_range(unsigned long start, unsigned long end)
>> ? ? ? ? ? ? ? }
>>
>> ? ? ? ? ? ? ? if (blk_end < end) {
>> - ? ? ? ? ? ? ? ? ? ? spin_unlock_irqrestore(&l2x0_lock, flags);
>> - ? ? ? ? ? ? ? ? ? ? spin_lock_irqsave(&l2x0_lock, flags);
>> + ? ? ? ? ? ? ? ? ? ? l2x0_unlock(&l2x0_lock, flags);
>> + ? ? ? ? ? ? ? ? ? ? l2x0_lock(&l2x0_lock, flags);
>> ? ? ? ? ? ? ? }
>> ? ? ? }
>> ? ? ? cache_wait(base + L2X0_CLEAN_LINE_PA, 1);
>> ? ? ? cache_sync();
>> - ? ? spin_unlock_irqrestore(&l2x0_lock, flags);
>> + ? ? l2x0_unlock(&l2x0_lock, flags);
>> ?}
>>
>> ?static void l2x0_flush_range(unsigned long start, unsigned long end)
>> @@ -175,10 +203,10 @@ static void l2x0_flush_range(unsigned long start, unsigned long end)
>> ? ? ? void __iomem *base = l2x0_base;
>> ? ? ? unsigned long flags;
>>
>> - ? ? spin_lock_irqsave(&l2x0_lock, flags);
>> + ? ? l2x0_lock(&l2x0_lock, flags);
>> ? ? ? start &= ~(CACHE_LINE_SIZE - 1);
>> ? ? ? while (start < end) {
>> - ? ? ? ? ? ? unsigned long blk_end = start + min(end - start, 4096UL);
>> + ? ? ? ? ? ? unsigned long blk_end = block_end(start, end);
>>
>> ? ? ? ? ? ? ? debug_writel(0x03);
>> ? ? ? ? ? ? ? while (start < blk_end) {
>> @@ -188,13 +216,13 @@ static void l2x0_flush_range(unsigned long start, unsigned long end)
>> ? ? ? ? ? ? ? debug_writel(0x00);
>>
>> ? ? ? ? ? ? ? if (blk_end < end) {
>> - ? ? ? ? ? ? ? ? ? ? spin_unlock_irqrestore(&l2x0_lock, flags);
>> - ? ? ? ? ? ? ? ? ? ? spin_lock_irqsave(&l2x0_lock, flags);
>> + ? ? ? ? ? ? ? ? ? ? l2x0_unlock(&l2x0_lock, flags);
>> + ? ? ? ? ? ? ? ? ? ? l2x0_lock(&l2x0_lock, flags);
>> ? ? ? ? ? ? ? }
>> ? ? ? }
>> ? ? ? cache_wait(base + L2X0_CLEAN_INV_LINE_PA, 1);
>> ? ? ? cache_sync();
>> - ? ? spin_unlock_irqrestore(&l2x0_lock, flags);
>> + ? ? l2x0_unlock(&l2x0_lock, flags);
>> ?}
>>
>> ?void __init l2x0_init(void __iomem *base, __u32 aux_val, __u32 aux_mask)
>> @@ -202,7 +230,7 @@ void __init l2x0_init(void __iomem *base, __u32 aux_val, __u32 aux_mask)
>> ? ? ? __u32 aux;
>>
>> ? ? ? if (l2x0_disabled) {
>> - ? ? ? ? ? ? printk(KERN_INFO "L2X0 cache controller disabled\n");
>> + ? ? ? ? ? ? pr_info(L2CC_TYPE " cache controller disabled\n");
>> ? ? ? ? ? ? ? return;
>> ? ? ? }
>>
>> @@ -232,7 +260,7 @@ void __init l2x0_init(void __iomem *base, __u32 aux_val, __u32 aux_mask)
>> ? ? ? outer_cache.clean_range = l2x0_clean_range;
>> ? ? ? outer_cache.flush_range = l2x0_flush_range;
>>
>> - ? ? printk(KERN_INFO "L2X0 cache controller enabled\n");
>> + ? ? pr_info(L2CC_TYPE " cache controller enabled\n");
>> ?}
>>
>> ?static int __init l2x0_disable(char *unused)
>
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel at lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
>
^ permalink raw reply [flat|nested] 8+ messages in thread* [PATCH] ARM: Improve the L2 cache performance when PL310 is used
2011-02-15 11:29 ` Srinidhi Kasagar
@ 2011-02-15 16:43 ` Catalin Marinas
0 siblings, 0 replies; 8+ messages in thread
From: Catalin Marinas @ 2011-02-15 16:43 UTC (permalink / raw)
To: linux-arm-kernel
On Tue, 2011-02-15 at 11:29 +0000, Srinidhi Kasagar wrote:
> Just curious to know, why the spinlock surrounding
> l2x0_cache_sync still exists? I see that
> Catalin's first version adds void lock for PL310
> as they are atomic.
On PL310, range operations and the cache sync don't need the spinlock.
However, TI pushed optimisations that use the "all" operations for large
ranges. These operations run in the background, and you need to issue a cache
sync and wait until they complete. You also need locks around them, since
starting other operations while a background one is active is unpredictable.
--
Catalin
^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2011-02-15 16:43 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-03-09 10:14 [PATCH] ARM: Improve the L2 cache performance when PL310 is used Catalin Marinas
2010-03-09 10:30 ` Colin Tuckley
2010-03-09 10:52 ` Catalin Marinas
2010-03-09 11:07 ` Colin Tuckley
2010-03-09 11:28 ` Shilimkar, Santosh
2010-03-13 13:34 ` Shilimkar, Santosh
2011-02-15 11:29 ` Srinidhi Kasagar
2011-02-15 16:43 ` Catalin Marinas
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).