linux-arm-kernel.lists.infradead.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2] arm64: cache: Skip an unnecessary data cache clean PoU operation
@ 2017-02-08 21:19 Shanker Donthineni
  2017-02-21 15:47 ` Catalin Marinas
  0 siblings, 1 reply; 3+ messages in thread
From: Shanker Donthineni @ 2017-02-08 21:19 UTC (permalink / raw)
  To: linux-arm-kernel

The cache management functions always perform the data cache clean
to PoU (point of unification), even though it is not required on
some systems. There is no need to clean the data cache to PoU if all
the cache levels below PoUIS are WT (Write-Through) caches. The
unneeded clean causes a huge performance degradation when operating on
a large memory area, especially THP with a 64K page size kernel.

For each online CPU, check whether the 'dc cvau' instruction is needed
and update a global variable, __dcache_flags, accordingly. The two
functions __flush_cache_user_range() and __clean_dcache_area_pou() are
modified to skip the unnecessary clean based on this flag. The
existing behavior is unchanged if any one of the online CPUs has a
WB cache below the PoUIS level.

Signed-off-by: Shanker Donthineni <shankerd@codeaurora.org>
---
Changes since v1:
    handle skipping a dcache clean POU operation by checking the
    global variable __dcache_flags in cache.S instead of patching
    the code segment.

 arch/arm64/include/asm/cachetype.h |  8 ++++++++
 arch/arm64/kernel/cpuinfo.c        | 30 ++++++++++++++++++++++++++++++
 arch/arm64/mm/cache.S              |  8 +++++++-
 3 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/cachetype.h b/arch/arm64/include/asm/cachetype.h
index f558869..352fb23 100644
--- a/arch/arm64/include/asm/cachetype.h
+++ b/arch/arm64/include/asm/cachetype.h
@@ -28,6 +28,8 @@
 #define ICACHE_POLICY_VIPT	2
 #define ICACHE_POLICY_PIPT	3
 
+#define DCACHE_SKIP_POU		0
+
 #ifndef __ASSEMBLY__
 
 #include <linux/bitops.h>
@@ -39,6 +41,12 @@
 
 extern unsigned long __icache_flags;
 
+extern unsigned long __dcache_flags;
+
+#define CLIDR_LOUIS_SHIFT	(21)
+#define CLIDR_LOUIS_MASK	(0x7)
+#define CLIDR_LOUIS(x)		(((x) >> CLIDR_LOUIS_SHIFT) & CLIDR_LOUIS_MASK)
+
 /*
  * NumSets, bits[27:13] - (Number of sets in cache) - 1
  * Associativity, bits[12:3] - (Associativity of cache) - 1
diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
index 7b7be71..0e1a30a 100644
--- a/arch/arm64/kernel/cpuinfo.c
+++ b/arch/arm64/kernel/cpuinfo.c
@@ -50,6 +50,7 @@
 };
 
 unsigned long __icache_flags;
+unsigned long __dcache_flags;
 
 static const char *const hwcap_str[] = {
 	"fp",
@@ -305,6 +306,33 @@ static void cpuinfo_detect_icache_policy(struct cpuinfo_arm64 *info)
 	pr_info("Detected %s I-cache on CPU%d\n", icache_policy_str[l1ip], cpu);
 }
 
+/*
+ * Check whether none of the data cache levels below LoUIS support WB.
+ * The flag DCACHE_SKIP_POU is cleared if any one of the online CPUs
+ * reports a WB cache below LoUIS.
+ */
+static void cpuinfo_ckeck_dcache_pou(struct cpuinfo_arm64 *info)
+{
+	u32 louis = CLIDR_LOUIS(read_sysreg(clidr_el1));
+	static bool update_pou_once;
+	u32 lvl, csidr;
+
+	/* Set the DCACHE_SKIP_POU flag only the first time */
+	if (!update_pou_once) {
+		set_bit(DCACHE_SKIP_POU, &__dcache_flags);
+		update_pou_once = true;
+	}
+
+	/* Go through all the cache level below LoUIS */
+	for (lvl = 0; lvl < louis; lvl++) {
+		csidr = cache_get_ccsidr(lvl << 1);
+		if (csidr & CCSIDR_EL1_WRITE_BACK) {
+			clear_bit(DCACHE_SKIP_POU, &__dcache_flags);
+			break;
+		}
+	}
+}
+
 static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info)
 {
 	info->reg_cntfrq = arch_timer_get_cntfrq();
@@ -345,6 +373,8 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info)
 	}
 
 	cpuinfo_detect_icache_policy(info);
+	cpuinfo_ckeck_dcache_pou(info);
+
 }
 
 void cpuinfo_store_cpu(void)
diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S
index 83c27b6e..1884da2 100644
--- a/arch/arm64/mm/cache.S
+++ b/arch/arm64/mm/cache.S
@@ -24,6 +24,7 @@
 #include <asm/cpufeature.h>
 #include <asm/alternative.h>
 #include <asm/asm-uaccess.h>
+#include <asm/cachetype.h>
 
 /*
  *	flush_icache_range(start,end)
@@ -50,6 +51,8 @@ ENTRY(flush_icache_range)
  */
 ENTRY(__flush_cache_user_range)
 	uaccess_ttbr0_enable x2, x3
+	ldr_l	x4, __dcache_flags
+	tbnz	x4, #DCACHE_SKIP_POU, 2f
 	dcache_line_size x2, x3
 	sub	x3, x2, #1
 	bic	x4, x0, x3
@@ -60,6 +63,7 @@ user_alt 9f, "dc cvau, x4",  "dc civac, x4",  ARM64_WORKAROUND_CLEAN_CACHE
 	b.lo	1b
 	dsb	ish
 
+2:
 	icache_line_size x2, x3
 	sub	x3, x2, #1
 	bic	x4, x0, x3
@@ -104,8 +108,10 @@ ENDPIPROC(__flush_dcache_area)
  *	- size    - size in question
  */
 ENTRY(__clean_dcache_area_pou)
+	ldr_l	x2, __dcache_flags
+	tbnz	x2, #DCACHE_SKIP_POU, 1f
 	dcache_by_line_op cvau, ish, x0, x1, x2, x3
-	ret
+1:	ret
 ENDPROC(__clean_dcache_area_pou)
 
 /*
-- 
Qualcomm Datacenter Technologies, Inc. on behalf of the Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project.

^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [PATCH v2] arm64: cache: Skip an unnecessary data cache clean PoU operation
  2017-02-08 21:19 [PATCH v2] arm64: cache: Skip an unnecessary data cache clean PoU operation Shanker Donthineni
@ 2017-02-21 15:47 ` Catalin Marinas
  2017-02-21 15:49   ` Will Deacon
  0 siblings, 1 reply; 3+ messages in thread
From: Catalin Marinas @ 2017-02-21 15:47 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, Feb 08, 2017 at 03:19:37PM -0600, Shanker Donthineni wrote:
> The cache management functions always do the data cache PoU
> (point of unification) operations even though it is not required
> on some systems. No need to clean data cache till PoU if all the
> cache levels below PoUIS are WT (Write-Through) caches. It causes
> a huge performance degradation when operating on a larger memory
> area, especially THP with 64K page size kernel.
> 
> For each online CPU, check the need of 'dc cvau' instruction and
> update a global variable __dcache_flags. The two functions
> __flush_cache_user_range() and __clean_dcache_area_pou() are
> modified to skip an unnecessary code execution based on flags.
> It won't change the existing behavior if any one of the online
> CPU is capable of WB cache below PoUIS level.
> 
> Signed-off-by: Shanker Donthineni <shankerd@codeaurora.org>
[...]
> +#define CLIDR_LOUIS_SHIFT	(21)
> +#define CLIDR_LOUIS_MASK	(0x7)
> +#define CLIDR_LOUIS(x)		(((x) >> CLIDR_LOUIS_SHIFT) & CLIDR_LOUIS_MASK)

According to the ARMv8 ARM, CLIDR_EL1 "identifies the type of cache, or
caches, that are implemented at each level and can be managed using the
architected cache maintenance instructions that operate by set/way". The
key part is "set/way" here and hence you cannot use CLIDR_EL1 and
CCSIDR_EL1 to infer whether you can skip cache maintenance by VA.

> +	/* Go through all the cache level below LoUIS */
> +	for (lvl = 0; lvl < louis; lvl++) {
> +		csidr = cache_get_ccsidr(lvl << 1);
> +		if (csidr & CCSIDR_EL1_WRITE_BACK) {

The type bits have also been deprecated in ARMv8 (we need to update the
kernel or just remove the cache topology detection entirely, leaving it
just to DT).

-- 
Catalin

^ permalink raw reply	[flat|nested] 3+ messages in thread

* [PATCH v2] arm64: cache: Skip an unnecessary data cache clean PoU operation
  2017-02-21 15:47 ` Catalin Marinas
@ 2017-02-21 15:49   ` Will Deacon
  0 siblings, 0 replies; 3+ messages in thread
From: Will Deacon @ 2017-02-21 15:49 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Feb 21, 2017 at 03:47:27PM +0000, Catalin Marinas wrote:
> On Wed, Feb 08, 2017 at 03:19:37PM -0600, Shanker Donthineni wrote:
> > The cache management functions always do the data cache PoU
> > (point of unification) operations even though it is not required
> > on some systems. No need to clean data cache till PoU if all the
> > cache levels below PoUIS are WT (Write-Through) caches. It causes
> > a huge performance degradation when operating on a larger memory
> > area, especially THP with 64K page size kernel.
> > 
> > For each online CPU, check the need of 'dc cvau' instruction and
> > update a global variable __dcache_flags. The two functions
> > __flush_cache_user_range() and __clean_dcache_area_pou() are
> > modified to skip an unnecessary code execution based on flags.
> > It won't change the existing behavior if any one of the online
> > CPU is capable of WB cache below PoUIS level.
> > 
> > Signed-off-by: Shanker Donthineni <shankerd@codeaurora.org>
> [...]
> > +#define CLIDR_LOUIS_SHIFT	(21)
> > +#define CLIDR_LOUIS_MASK	(0x7)
> > +#define CLIDR_LOUIS(x)		(((x) >> CLIDR_LOUIS_SHIFT) & CLIDR_LOUIS_MASK)
> 
> According to the ARMv8 ARM, CLIDR_EL1 "identifies the type of cache, or
> caches, that are implemented at each level and can be managed using the
> architected cache maintenance instructions that operate by set/way". The
> key part is "set/way" here and hence you cannot use CLIDR_EL1 and
> CCSIDR_EL1 to infer whether you can skip cache maintenance by VA.
> 
> > +	/* Go through all the cache level below LoUIS */
> > +	for (lvl = 0; lvl < louis; lvl++) {
> > +		csidr = cache_get_ccsidr(lvl << 1);
> > +		if (csidr & CCSIDR_EL1_WRITE_BACK) {
> 
> The type bits have also been deprecated in ARMv8 (we need to update the
> kernel or just remove the cache topology detection entirely, leaving it
> just to DT).

I'll dust off the patches I have for this...

Will

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2017-02-21 15:49 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-02-08 21:19 [PATCH v2] arm64: cache: Skip an unnecessary data cache clean PoU operation Shanker Donthineni
2017-02-21 15:47 ` Catalin Marinas
2017-02-21 15:49   ` Will Deacon

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).