linux-arm-kernel.lists.infradead.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] Optimize multi-CPU tlb flushing a little more
@ 2011-08-23 11:06 Russell King - ARM Linux
  2011-09-06 16:53 ` Catalin Marinas
                   ` (2 more replies)
  0 siblings, 3 replies; 12+ messages in thread
From: Russell King - ARM Linux @ 2011-08-23 11:06 UTC (permalink / raw)
  To: linux-arm-kernel

The compiler does not conditionalize the assembly instructions for
the tlb operations, which leads to sub-optimal code being generated
when building a kernel for multiple CPUs.

We can tweak things fairly simply as the code fragment below shows:

    17f8:       e3120001        tst     r2, #1  ; 0x1
...
    1800:       0a000000        beq     1808 <handle_pte_fault+0x194>
    1804:       ee061f10        mcr     15, 0, r1, cr6, cr0, {0}
    1808:       e3120004        tst     r2, #4  ; 0x4
    180c:       0a000000        beq     1814 <handle_pte_fault+0x1a0>
    1810:       ee081f36        mcr     15, 0, r1, cr8, cr6, {1}
becomes:
    17f0:       e3120001        tst     r2, #1  ; 0x1
    17f4:       1e063f10        mcrne   15, 0, r3, cr6, cr0, {0}
    17f8:       e3120004        tst     r2, #4  ; 0x4
    17fc:       1e083f36        mcrne   15, 0, r3, cr8, cr6, {1}

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/include/asm/tlbflush.h |  128 ++++++++++++++++------------------------
 1 file changed, 52 insertions(+), 76 deletions(-)

diff --git a/arch/arm/include/asm/tlbflush.h b/arch/arm/include/asm/tlbflush.h
index d2005de..252874c 100644
--- a/arch/arm/include/asm/tlbflush.h
+++ b/arch/arm/include/asm/tlbflush.h
@@ -318,6 +318,18 @@ extern struct cpu_tlb_fns cpu_tlb;
 
 #define tlb_flag(f)	((always_tlb_flags & (f)) || (__tlb_flag & possible_tlb_flags & (f)))
 
+#define tlb_op(f, regs, arg)						\
+	do {								\
+		if (always_tlb_flags & (f))				\
+			asm("mcr p15, 0, %0, " regs			\
+			    : : "r" (arg) : "cc");			\
+		else if (possible_tlb_flags & (f))			\
+			asm("tst %1, %2\n\t"				\
+			    "mcrne p15, 0, %0, " regs			\
+			    : : "r" (arg), "r" (__tlb_flag), "Ir" (f)	\
+			    : "cc");					\
+	} while (0)
+
 static inline void local_flush_tlb_all(void)
 {
 	const int zero = 0;
@@ -326,16 +338,11 @@ static inline void local_flush_tlb_all(void)
 	if (tlb_flag(TLB_WB))
 		dsb();
 
-	if (tlb_flag(TLB_V3_FULL))
-		asm("mcr p15, 0, %0, c6, c0, 0" : : "r" (zero) : "cc");
-	if (tlb_flag(TLB_V4_U_FULL | TLB_V6_U_FULL))
-		asm("mcr p15, 0, %0, c8, c7, 0" : : "r" (zero) : "cc");
-	if (tlb_flag(TLB_V4_D_FULL | TLB_V6_D_FULL))
-		asm("mcr p15, 0, %0, c8, c6, 0" : : "r" (zero) : "cc");
-	if (tlb_flag(TLB_V4_I_FULL | TLB_V6_I_FULL))
-		asm("mcr p15, 0, %0, c8, c5, 0" : : "r" (zero) : "cc");
-	if (tlb_flag(TLB_V7_UIS_FULL))
-		asm("mcr p15, 0, %0, c8, c3, 0" : : "r" (zero) : "cc");
+	tlb_op(TLB_V3_FULL, "c6, c0, 0", zero);
+	tlb_op(TLB_V4_U_FULL | TLB_V6_U_FULL, "c8, c7, 0", zero);
+	tlb_op(TLB_V4_D_FULL | TLB_V6_D_FULL, "c8, c6, 0", zero);
+	tlb_op(TLB_V4_I_FULL | TLB_V6_I_FULL, "c8, c5, 0", zero);
+	tlb_op(TLB_V7_UIS_FULL, "c8, c3, 0", zero);
 
 	if (tlb_flag(TLB_BARRIER)) {
 		dsb();
@@ -352,29 +359,22 @@ static inline void local_flush_tlb_mm(struct mm_struct *mm)
 	if (tlb_flag(TLB_WB))
 		dsb();
 
-	if (cpumask_test_cpu(get_cpu(), mm_cpumask(mm))) {
-		if (tlb_flag(TLB_V3_FULL))
-			asm("mcr p15, 0, %0, c6, c0, 0" : : "r" (zero) : "cc");
-		if (tlb_flag(TLB_V4_U_FULL))
-			asm("mcr p15, 0, %0, c8, c7, 0" : : "r" (zero) : "cc");
-		if (tlb_flag(TLB_V4_D_FULL))
-			asm("mcr p15, 0, %0, c8, c6, 0" : : "r" (zero) : "cc");
-		if (tlb_flag(TLB_V4_I_FULL))
-			asm("mcr p15, 0, %0, c8, c5, 0" : : "r" (zero) : "cc");
+	if (possible_tlb_flags & (TLB_V3_FULL|TLB_V4_U_FULL|TLB_V4_D_FULL|TLB_V4_I_FULL) &&
+	    cpumask_test_cpu(get_cpu(), mm_cpumask(mm))) {
+		tlb_op(TLB_V3_FULL, "c6, c0, 0", zero);
+		tlb_op(TLB_V4_U_FULL, "c8, c7, 0", zero);
+		tlb_op(TLB_V4_D_FULL, "c8, c6, 0", zero);
+		tlb_op(TLB_V4_I_FULL, "c8, c5, 0", zero);
 	}
 	put_cpu();
 
-	if (tlb_flag(TLB_V6_U_ASID))
-		asm("mcr p15, 0, %0, c8, c7, 2" : : "r" (asid) : "cc");
-	if (tlb_flag(TLB_V6_D_ASID))
-		asm("mcr p15, 0, %0, c8, c6, 2" : : "r" (asid) : "cc");
-	if (tlb_flag(TLB_V6_I_ASID))
-		asm("mcr p15, 0, %0, c8, c5, 2" : : "r" (asid) : "cc");
-	if (tlb_flag(TLB_V7_UIS_ASID))
+	tlb_op(TLB_V6_U_ASID, "c8, c7, 2", asid);
+	tlb_op(TLB_V6_D_ASID, "c8, c6, 2", asid);
+	tlb_op(TLB_V6_I_ASID, "c8, c5, 2", asid);
 #ifdef CONFIG_ARM_ERRATA_720789
-		asm("mcr p15, 0, %0, c8, c3, 0" : : "r" (zero) : "cc");
+	tlb_op(TLB_V7_UIS_ASID, "c8, c3, 0", zero);
 #else
-		asm("mcr p15, 0, %0, c8, c3, 2" : : "r" (asid) : "cc");
+	tlb_op(TLB_V7_UIS_ASID, "c8, c3, 2", asid);
 #endif
 
 	if (tlb_flag(TLB_BARRIER))
@@ -392,30 +392,23 @@ local_flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr)
 	if (tlb_flag(TLB_WB))
 		dsb();
 
-	if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm))) {
-		if (tlb_flag(TLB_V3_PAGE))
-			asm("mcr p15, 0, %0, c6, c0, 0" : : "r" (uaddr) : "cc");
-		if (tlb_flag(TLB_V4_U_PAGE))
-			asm("mcr p15, 0, %0, c8, c7, 1" : : "r" (uaddr) : "cc");
-		if (tlb_flag(TLB_V4_D_PAGE))
-			asm("mcr p15, 0, %0, c8, c6, 1" : : "r" (uaddr) : "cc");
-		if (tlb_flag(TLB_V4_I_PAGE))
-			asm("mcr p15, 0, %0, c8, c5, 1" : : "r" (uaddr) : "cc");
+	if (possible_tlb_flags & (TLB_V3_PAGE|TLB_V4_U_PAGE|TLB_V4_D_PAGE|TLB_V4_I_PAGE|TLB_V4_I_FULL) &&
+	    cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm))) {
+		tlb_op(TLB_V3_PAGE, "c6, c0, 0", uaddr);
+		tlb_op(TLB_V4_U_PAGE, "c8, c7, 1", uaddr);
+		tlb_op(TLB_V4_D_PAGE, "c8, c6, 1", uaddr);
+		tlb_op(TLB_V4_I_PAGE, "c8, c5, 1", uaddr);
 		if (!tlb_flag(TLB_V4_I_PAGE) && tlb_flag(TLB_V4_I_FULL))
 			asm("mcr p15, 0, %0, c8, c5, 0" : : "r" (zero) : "cc");
 	}
 
-	if (tlb_flag(TLB_V6_U_PAGE))
-		asm("mcr p15, 0, %0, c8, c7, 1" : : "r" (uaddr) : "cc");
-	if (tlb_flag(TLB_V6_D_PAGE))
-		asm("mcr p15, 0, %0, c8, c6, 1" : : "r" (uaddr) : "cc");
-	if (tlb_flag(TLB_V6_I_PAGE))
-		asm("mcr p15, 0, %0, c8, c5, 1" : : "r" (uaddr) : "cc");
-	if (tlb_flag(TLB_V7_UIS_PAGE))
+	tlb_op(TLB_V6_U_PAGE, "c8, c7, 1", uaddr);
+	tlb_op(TLB_V6_D_PAGE, "c8, c6, 1", uaddr);
+	tlb_op(TLB_V6_I_PAGE, "c8, c5, 1", uaddr);
 #ifdef CONFIG_ARM_ERRATA_720789
-		asm("mcr p15, 0, %0, c8, c3, 3" : : "r" (uaddr & PAGE_MASK) : "cc");
+	tlb_op(TLB_V7_UIS_PAGE, "c8, c3, 3", uaddr & PAGE_MASK);
 #else
-		asm("mcr p15, 0, %0, c8, c3, 1" : : "r" (uaddr) : "cc");
+	tlb_op(TLB_V7_UIS_PAGE, "c8, c3, 1", uaddr);
 #endif
 
 	if (tlb_flag(TLB_BARRIER))
@@ -432,25 +425,17 @@ static inline void local_flush_tlb_kernel_page(unsigned long kaddr)
 	if (tlb_flag(TLB_WB))
 		dsb();
 
-	if (tlb_flag(TLB_V3_PAGE))
-		asm("mcr p15, 0, %0, c6, c0, 0" : : "r" (kaddr) : "cc");
-	if (tlb_flag(TLB_V4_U_PAGE))
-		asm("mcr p15, 0, %0, c8, c7, 1" : : "r" (kaddr) : "cc");
-	if (tlb_flag(TLB_V4_D_PAGE))
-		asm("mcr p15, 0, %0, c8, c6, 1" : : "r" (kaddr) : "cc");
-	if (tlb_flag(TLB_V4_I_PAGE))
-		asm("mcr p15, 0, %0, c8, c5, 1" : : "r" (kaddr) : "cc");
+	tlb_op(TLB_V3_PAGE, "c6, c0, 0", kaddr);
+	tlb_op(TLB_V4_U_PAGE, "c8, c7, 1", kaddr);
+	tlb_op(TLB_V4_D_PAGE, "c8, c6, 1", kaddr);
+	tlb_op(TLB_V4_I_PAGE, "c8, c5, 1", kaddr);
 	if (!tlb_flag(TLB_V4_I_PAGE) && tlb_flag(TLB_V4_I_FULL))
 		asm("mcr p15, 0, %0, c8, c5, 0" : : "r" (zero) : "cc");
 
-	if (tlb_flag(TLB_V6_U_PAGE))
-		asm("mcr p15, 0, %0, c8, c7, 1" : : "r" (kaddr) : "cc");
-	if (tlb_flag(TLB_V6_D_PAGE))
-		asm("mcr p15, 0, %0, c8, c6, 1" : : "r" (kaddr) : "cc");
-	if (tlb_flag(TLB_V6_I_PAGE))
-		asm("mcr p15, 0, %0, c8, c5, 1" : : "r" (kaddr) : "cc");
-	if (tlb_flag(TLB_V7_UIS_PAGE))
-		asm("mcr p15, 0, %0, c8, c3, 1" : : "r" (kaddr) : "cc");
+	tlb_op(TLB_V6_U_PAGE, "c8, c7, 1", kaddr);
+	tlb_op(TLB_V6_D_PAGE, "c8, c6, 1", kaddr);
+	tlb_op(TLB_V6_I_PAGE, "c8, c5, 1", kaddr);
+	tlb_op(TLB_V7_UIS_PAGE, "c8, c3, 1", kaddr);
 
 	if (tlb_flag(TLB_BARRIER)) {
 		dsb();
@@ -475,13 +460,8 @@ static inline void flush_pmd_entry(pmd_t *pmd)
 {
 	const unsigned int __tlb_flag = __cpu_tlb_flags;
 
-	if (tlb_flag(TLB_DCLEAN))
-		asm("mcr	p15, 0, %0, c7, c10, 1	@ flush_pmd"
-			: : "r" (pmd) : "cc");
-
-	if (tlb_flag(TLB_L2CLEAN_FR))
-		asm("mcr	p15, 1, %0, c15, c9, 1  @ L2 flush_pmd"
-			: : "r" (pmd) : "cc");
+	tlb_op(TLB_DCLEAN, "c7, c10, 1	@ flush_pmd", pmd);
+	tlb_op(TLB_L2CLEAN_FR, "c15, c9, 1  @ L2 flush_pmd", pmd);
 
 	if (tlb_flag(TLB_WB))
 		dsb();
@@ -491,15 +471,11 @@ static inline void clean_pmd_entry(pmd_t *pmd)
 {
 	const unsigned int __tlb_flag = __cpu_tlb_flags;
 
-	if (tlb_flag(TLB_DCLEAN))
-		asm("mcr	p15, 0, %0, c7, c10, 1	@ flush_pmd"
-			: : "r" (pmd) : "cc");
-
-	if (tlb_flag(TLB_L2CLEAN_FR))
-		asm("mcr	p15, 1, %0, c15, c9, 1  @ L2 flush_pmd"
-			: : "r" (pmd) : "cc");
+	tlb_op(TLB_DCLEAN, "c7, c10, 1	@ flush_pmd", pmd);
+	tlb_op(TLB_L2CLEAN_FR, "c15, c9, 1  @ L2 flush_pmd", pmd);
 }
 
+#undef tlb_op
 #undef tlb_flag
 #undef always_tlb_flags
 #undef possible_tlb_flags

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH] Optimize multi-CPU tlb flushing a little more
  2011-08-23 11:06 [PATCH] Optimize multi-CPU tlb flushing a little more Russell King - ARM Linux
@ 2011-09-06 16:53 ` Catalin Marinas
  2012-02-13 16:06 ` Rabin Vincent
  2012-02-14 18:52 ` Stephen Warren
  2 siblings, 0 replies; 12+ messages in thread
From: Catalin Marinas @ 2011-09-06 16:53 UTC (permalink / raw)
  To: linux-arm-kernel

On 23 August 2011 12:06, Russell King - ARM Linux
<linux@arm.linux.org.uk> wrote:
> The compiler does not conditionalize the assembly instructions for
> the tlb operations, which leads to sub-optimal code being generated
> when building a kernel for multiple CPUs.
>
> We can tweak things fairly simply as the code fragment below shows:
>
> ? ?17f8: ? ? ? e3120001 ? ? ? ?tst ? ? r2, #1 ?; 0x1
> ...
> ? ?1800: ? ? ? 0a000000 ? ? ? ?beq ? ? 1808 <handle_pte_fault+0x194>
> ? ?1804: ? ? ? ee061f10 ? ? ? ?mcr ? ? 15, 0, r1, cr6, cr0, {0}
> ? ?1808: ? ? ? e3120004 ? ? ? ?tst ? ? r2, #4 ?; 0x4
> ? ?180c: ? ? ? 0a000000 ? ? ? ?beq ? ? 1814 <handle_pte_fault+0x1a0>
> ? ?1810: ? ? ? ee081f36 ? ? ? ?mcr ? ? 15, 0, r1, cr8, cr6, {1}
> becomes:
> ? ?17f0: ? ? ? e3120001 ? ? ? ?tst ? ? r2, #1 ?; 0x1
> ? ?17f4: ? ? ? 1e063f10 ? ? ? ?mcrne ? 15, 0, r3, cr6, cr0, {0}
> ? ?17f8: ? ? ? e3120004 ? ? ? ?tst ? ? r2, #4 ?; 0x4
> ? ?17fc: ? ? ? 1e083f36 ? ? ? ?mcrne ? 15, 0, r3, cr8, cr6, {1}

We need to be careful in this area if a conditional TLB operation is
not supported by the hardware. Conditional undefined instructions may
trigger an undef abort even if the condition fails (though I think
that's the case only on a Qualcomm implementation, but you never know
in the future).

IIUC, with your patch we could get some conditional inner shareable
TLB maintenance on a UP implementation which doesn't have such
operation.

-- 
Catalin

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] Optimize multi-CPU tlb flushing a little more
  2011-08-23 11:06 [PATCH] Optimize multi-CPU tlb flushing a little more Russell King - ARM Linux
  2011-09-06 16:53 ` Catalin Marinas
@ 2012-02-13 16:06 ` Rabin Vincent
  2012-02-13 16:23   ` Russell King - ARM Linux
  2012-02-14 18:52 ` Stephen Warren
  2 siblings, 1 reply; 12+ messages in thread
From: Rabin Vincent @ 2012-02-13 16:06 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Aug 23, 2011 at 16:36, Russell King - ARM Linux
<linux@arm.linux.org.uk> wrote:
> @@ -352,29 +359,22 @@ static inline void local_flush_tlb_mm(struct mm_struct *mm)
> ? ? ? ?if (tlb_flag(TLB_WB))
> ? ? ? ? ? ? ? ?dsb();
>
> - ? ? ? if (cpumask_test_cpu(get_cpu(), mm_cpumask(mm))) {
> - ? ? ? ? ? ? ? if (tlb_flag(TLB_V3_FULL))
> - ? ? ? ? ? ? ? ? ? ? ? asm("mcr p15, 0, %0, c6, c0, 0" : : "r" (zero) : "cc");
> - ? ? ? ? ? ? ? if (tlb_flag(TLB_V4_U_FULL))
> - ? ? ? ? ? ? ? ? ? ? ? asm("mcr p15, 0, %0, c8, c7, 0" : : "r" (zero) : "cc");
> - ? ? ? ? ? ? ? if (tlb_flag(TLB_V4_D_FULL))
> - ? ? ? ? ? ? ? ? ? ? ? asm("mcr p15, 0, %0, c8, c6, 0" : : "r" (zero) : "cc");
> - ? ? ? ? ? ? ? if (tlb_flag(TLB_V4_I_FULL))
> - ? ? ? ? ? ? ? ? ? ? ? asm("mcr p15, 0, %0, c8, c5, 0" : : "r" (zero) : "cc");
> + ? ? ? if (possible_tlb_flags & (TLB_V3_FULL|TLB_V4_U_FULL|TLB_V4_D_FULL|TLB_V4_I_FULL) &&
> + ? ? ? ? ? cpumask_test_cpu(get_cpu(), mm_cpumask(mm))) {
> + ? ? ? ? ? ? ? tlb_op(TLB_V3_FULL, "c6, c0, 0", zero);
> + ? ? ? ? ? ? ? tlb_op(TLB_V4_U_FULL, "c8, c7, 0", zero);
> + ? ? ? ? ? ? ? tlb_op(TLB_V4_D_FULL, "c8, c6, 0", zero);
> + ? ? ? ? ? ? ? tlb_op(TLB_V4_I_FULL, "c8, c5, 0", zero);
> ? ? ? ?}
> ? ? ? ?put_cpu();

This part conditionally calls get_cpu() but unconditionally calls
put_cpu():

 ------------[ cut here ]------------
 WARNING: at /home/rabin/kernel/arm/kernel/sched/core.c:3051
sub_preempt_count+0xa0/0xe0()
 Modules linked in:
 [<c0013c4c>] (unwind_backtrace+0x0/0xec) from [<c028b2e4>]
(dump_stack+0x20/0x24)
 [<c028b2e4>] (dump_stack+0x20/0x24) from [<c001f808>]
(warn_slowpath_common+0x5c/0x74)
 [<c001f808>] (warn_slowpath_common+0x5c/0x74) from [<c001f84c>]
(warn_slowpath_null+0x2c/0x34)
 [<c001f84c>] (warn_slowpath_null+0x2c/0x34) from [<c004d588>]
(sub_preempt_count+0xa0/0xe0)
 [<c004d588>] (sub_preempt_count+0xa0/0xe0) from [<c0012e54>]
(flush_tlb_mm+0x64/0xa4)
 [<c0012e54>] (flush_tlb_mm+0x64/0xa4) from [<c00e25b8>]
(setup_arg_pages+0x260/0x39c)
 [<c00e25b8>] (setup_arg_pages+0x260/0x39c) from [<c012000c>]
(load_elf_binary+0x3ec/0x11d8)
 [<c012000c>] (load_elf_binary+0x3ec/0x11d8) from [<c00e1e78>]
(search_binary_handler+0xd8/0x2c8)
 [<c00e1e78>] (search_binary_handler+0xd8/0x2c8) from [<c00e3b5c>]
(do_execve+0x29c/0x3b4)
 [<c00e3b5c>] (do_execve+0x29c/0x3b4) from [<c0011080>]
(kernel_execve+0x48/0x90)
 [<c0011080>] (kernel_execve+0x48/0x90) from [<c0037274>]
(____call_usermodehelper+0x118/0x130)
 [<c0037274>] (____call_usermodehelper+0x118/0x130) from [<c000e784>]
(kernel_thread_exit+0x0/0x8)
 ---[ end trace 309cd09e6562b6c8 ]---

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] Optimize multi-CPU tlb flushing a little more
  2012-02-13 16:06 ` Rabin Vincent
@ 2012-02-13 16:23   ` Russell King - ARM Linux
  2012-02-13 16:59     ` Rabin Vincent
  0 siblings, 1 reply; 12+ messages in thread
From: Russell King - ARM Linux @ 2012-02-13 16:23 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, Feb 13, 2012 at 09:36:35PM +0530, Rabin Vincent wrote:
> This part conditionally calls get_cpu() but unconditionally calls
> put_cpu():

Can you try getting rid of put_cpu(), and replacing get_cpu()
with smp_processor_id().

In theory, this function should be called with preempt disabled so
smp_processor_id() should be safe here.

Thanks.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] Optimize multi-CPU tlb flushing a little more
  2012-02-13 16:23   ` Russell King - ARM Linux
@ 2012-02-13 16:59     ` Rabin Vincent
  2012-02-14 21:59       ` Stephen Warren
  0 siblings, 1 reply; 12+ messages in thread
From: Rabin Vincent @ 2012-02-13 16:59 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, Feb 13, 2012 at 04:23:59PM +0000, Russell King - ARM Linux wrote:
> On Mon, Feb 13, 2012 at 09:36:35PM +0530, Rabin Vincent wrote:
> > This part conditionally calls get_cpu() but unconditionally calls
> > put_cpu():
> 
> Can you try getting rid of put_cpu(), and replacing get_cpu()
> with smp_processor_id().
> 
> In theory, this function should be called with preempt disabled so
> smp_processor_id() should be safe here.

Just deleting put_cpu() and replacing get_cpu() with smp_processor_id()
doesn't cause any problems on my setup, because in my setup (v7 SMP) the
test for possible_tlb_flags fails so
cpu_mask_test_cpu(smp_processor_id(),...) never gets called.

However, local_flush_tlb_mm() itself does not appear to be called with
preemption disabled, because if I reverse the check so that
smp_processor_id() is called, I get loads of the following:

 BUG: using smp_processor_id() in preemptible [00000000] code: init/1
 [<c0013c0c>] (unwind_backtrace+0x0/0xec) from [<c028b2a4>] (dump_stack+0x20/0x24)
 [<c028b2a4>] (dump_stack+0x20/0x24) from [<c019bbb4>] (debug_smp_processor_id+0xc4/0xf8)
 [<c019bbb4>] (debug_smp_processor_id+0xc4/0xf8) from [<c0012e30>] (flush_tlb_mm+0x60/0x84)
 [<c0012e30>] (flush_tlb_mm+0x60/0x84) from [<c00e2578>] (setup_arg_pages+0x260/0x39c)
 [<c00e2578>] (setup_arg_pages+0x260/0x39c) from [<c011ffcc>] (load_elf_binary+0x3ec/0x11d8)
 [<c011ffcc>] (load_elf_binary+0x3ec/0x11d8) from [<c00e1e38>] (search_binary_handler+0xd8/0x2c8)
 [<c00e1e38>] (search_binary_handler+0xd8/0x2c8) from [<c00e3b1c>] (do_execve+0x29c/0x3b4)
 [<c00e3b1c>] (do_execve+0x29c/0x3b4) from [<c0011080>] (kernel_execve+0x48/0x90)
 [<c0011080>] (kernel_execve+0x48/0x90) from [<c028b080>] (run_init_process+0x24/0x2c)
 [<c028b080>] (run_init_process+0x24/0x2c) from [<c028b0e4>] (init_post+0x5c/0xd4)
 [<c028b0e4>] (init_post+0x5c/0xd4) from [<c039bb04>] (kernel_init+0x174/0x1ac)

 BUG: using smp_processor_id() in preemptible [00000000] code: modprobe/27
 [<c0013c0c>] (unwind_backtrace+0x0/0xec) from [<c028b2a4>] (dump_stack+0x20/0x24)
 [<c028b2a4>] (dump_stack+0x20/0x24) from [<c019bbb4>] (debug_smp_processor_id+0xc4/0xf8)
 [<c019bbb4>] (debug_smp_processor_id+0xc4/0xf8) from [<c0012e30>] (flush_tlb_mm+0x60/0x84)
 [<c0012e30>] (flush_tlb_mm+0x60/0x84) from [<c00ca7e4>] (exit_mmap+0x134/0x1e8)
 [<c00ca7e4>] (exit_mmap+0x134/0x1e8) from [<c001d4b0>] (mmput+0x60/0xf0)
 [<c001d4b0>] (mmput+0x60/0xf0) from [<c00217bc>] (exit_mm+0x124/0x12c)
 [<c00217bc>] (exit_mm+0x124/0x12c) from [<c0023084>] (do_exit+0x1e8/0x7b8)
 [<c0023084>] (do_exit+0x1e8/0x7b8) from [<c0023944>] (do_group_exit+0x98/0xc4)
 [<c0023944>] (do_group_exit+0x98/0xc4) from [<c0023990>] (__wake_up_parent+0x0/0x30)
 [<c0023990>] (__wake_up_parent+0x0/0x30) from [<c000dca0>] (ret_fast_syscall+0x0/0x3c)

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] Optimize multi-CPU tlb flushing a little more
  2011-08-23 11:06 [PATCH] Optimize multi-CPU tlb flushing a little more Russell King - ARM Linux
  2011-09-06 16:53 ` Catalin Marinas
  2012-02-13 16:06 ` Rabin Vincent
@ 2012-02-14 18:52 ` Stephen Warren
  2 siblings, 0 replies; 12+ messages in thread
From: Stephen Warren @ 2012-02-14 18:52 UTC (permalink / raw)
  To: linux-arm-kernel

Russell King wrote at Tuesday, August 23, 2011 5:06 AM:
> The compiler does not conditionalize the assembly instructions for
> the tlb operations, which leads to sub-optimal code being generated
> when building a kernel for multiple CPUs.
> 
> We can tweak things fairly simply as the code fragment below shows:
> 
>     17f8:       e3120001        tst     r2, #1  ; 0x1
> ...
>     1800:       0a000000        beq     1808 <handle_pte_fault+0x194>
>     1804:       ee061f10        mcr     15, 0, r1, cr6, cr0, {0}
>     1808:       e3120004        tst     r2, #4  ; 0x4
>     180c:       0a000000        beq     1814 <handle_pte_fault+0x1a0>
>     1810:       ee081f36        mcr     15, 0, r1, cr8, cr6, {1}
> becomes:
>     17f0:       e3120001        tst     r2, #1  ; 0x1
>     17f4:       1e063f10        mcrne   15, 0, r3, cr6, cr0, {0}
>     17f8:       e3120004        tst     r2, #4  ; 0x4
>     17fc:       1e083f36        mcrne   15, 0, r3, cr8, cr6, {1}

Russell,

This change causes Tegra to fail to boot, starting with next-20120209.
The spew is shown below. I tested 10 times and got the same failure,
then reverted this patch and tested another 10 times without a failure.
I've eliminated other factors such as SD card, board, and compiler.

Is there a bug in the patch, or is it exposing some pre-existing bug in
the Tegra code?

Thanks.

[    3.179569] VFS: Mounted root (ext3 filesystem) readonly on device 179:1.
[    3.187189] Freeing init memory: 140K
[    3.191124] kjournald starting.  Commit interval 5 seconds
[    3.227325] ------------[ cut here ]------------
[    3.232026] WARNING: at kernel/mutex.c:314 __mutex_unlock_slowpath+0x60/0x14c()
[    3.239355] Modules linked in:
[    3.242627] [<c0013d80>] (unwind_backtrace+0x0/0x120) from [<c002519c>] (warn_slowpath_common+0x4c/0x64)
[    3.252164] [<c002519c>] (warn_slowpath_common+0x4c/0x64) from [<c00251cc>] (warn_slowpath_null+0x18/0x1c)
[    3.261882] [<c00251cc>] (warn_slowpath_null+0x18/0x1c) from [<c02e3c40>] (__mutex_unlock_slowpath+0x60/0x14c)
[    3.271942] [<c02e3c40>] (__mutex_unlock_slowpath+0x60/0x14c) from [<c0095ff0>] (expand_downwards+0xc0/0xc8)
[    3.282111] [<c0095ff0>] (expand_downwards+0xc0/0xc8) from [<c00ac910>] (setup_arg_pages+0x1a0/0x1d4)
[    3.291412] [<c00ac910>] (setup_arg_pages+0x1a0/0x1d4) from [<c00e3d84>] (load_elf_binary+0x3d8/0xaa0)
[    3.300781] [<c00e3d84>] (load_elf_binary+0x3d8/0xaa0) from [<c00abbd0>] (search_binary_handler+0x84/0x274)
[    3.310580] [<c00abbd0>] (search_binary_handler+0x84/0x274) from [<c00ad5a0>] (do_execve_common+0x190/0x2a8)
[    3.320511] [<c00ad5a0>] (do_execve_common+0x190/0x2a8) from [<c00ad6c0>] (do_execve+0x8/0xc)
[    3.329113] [<c00ad6c0>] (do_execve+0x8/0xc) from [<c0011084>] (kernel_execve+0x38/0x80)
[    3.337258] [<c0011084>] (kernel_execve+0x38/0x80) from [<c0008630>] (init_post+0x80/0xc4)
[    3.345588] [<c0008630>] (init_post+0x80/0xc4) from [<c040497c>] (kernel_init+0x16c/0x1b0)
[    3.353878] ---[ end trace 6d8d467ddbc23912 ]---
[    3.358918] ------------[ cut here ]------------
[    3.363570] kernel BUG at include/linux/pagemap.h:134!
[    3.368734] Internal error: Oops - undefined instruction: 0 [#1] PREEMPT SMP ARM
[    3.376144] Modules linked in:
[    3.379242] CPU: 0    Tainted: G        W     (3.3.0-rc2-next-20120209+ #1)
[    3.386259] PC is at find_get_page+0x68/0xf4
[    3.390556] LR is at find_get_page+0x2c/0xf4
[    3.394857] pc : [<c0077fe4>]    lr : [<c0077fa8>]    psr: 20000013
[    3.394877] sp : df82dc50  ip : 00000000  fp : 00000000
[    3.406375] r10: 00000000  r9 : df46def0  r8 : df46dd70
[    3.411621] r7 : c06c75bc  r6 : 000280ff  r5 : df46def4  r4 : df82c000
[    3.418171] r3 : 07ffff00  r2 : 00000001  r1 : 000280ff  r0 : df40e564
[    3.424725] Flags: nzCv  IRQs on  FIQs on  Mode SVC_32  ISA ARM  Segment user
[    3.431886] Control: 10c5387d  Table: 1fa6404a  DAC: 00000015
[    3.437653] Process init (pid: 1, stack limit = 0xdf82c2f0)
[    3.443247] Stack: (0xdf82dc50 to 0xdf82e000)
[    3.447633] dc40:                                     000280ff 00000000 df46de10 000280ff
[    3.455850] dc60: df46dd70 c00cb60c df40f4f8 00000000 00000000 00001000 00000000 c00cb774
[    3.464066] dc80: ffffffff df40f4f8 00000000 c06c75bc 000280ff 00000000 df46dd70 00000000
[    3.472282] dca0: 000280ff 00000000 00000fff c00cbcd0 00001000 00000003 000280ff 00001000
[    3.480499] dcc0: df46dd70 c06c75bc 00000000 c00ceb5c 00001000 00001000 00000005 c00cbcd0
[    3.488715] dce0: 00001000 00091c00 000280ff 000280ff 00000000 00001000 df46dd70 dfa11400
[    3.496931] dd00: df410560 df82dd5c 000280ff 00000000 dfa10200 00000005 c0ba50a0 c00f8dc8
[    3.505148] dd20: 00001000 c00bb8ec df410560 df4105a8 c0b94210 df410560 df4104b0 dfa10000
[    3.513362] dd40: 00000000 00000001 df82de0c df46fc78 00000001 c00fa404 00000000 00000000
[    3.521577] dd60: 00000c00 00000005 df46cb68 0000a14d df46cb68 df46fc08 00000000 00000001
[    3.529795] dd80: df46fc78 c00ff43c df46fc08 c0426060 df46cb68 df46fc08 df82de90 c00afe0c
[    3.538013] dda0: df82de90 df82de04 df469a88 c00b1c8c df813f00 df82ddf0 00001000 c0089ce0
[    3.546232] ddc0: df804f00 df82de90 df82c000 c0385677 c0385678 0024a603 df82ddf0 0000002f
[    3.554449] dde0: df82c000 c00b1f68 df804f00 df8e0a18 0024a603 00000003 c0385673 80000013
[    3.562667] de00: c00a82c8 c00a489c c0447d6c df46fc08 df8267b0 df82de90 c02eda00 c0385672
[    3.570884] de20: ffffff9c df82c000 00000000 df82df60 df82c000 c00b3768 df82de5c df800200
[    3.579102] de40: df807ec0 c00a3094 df800200 dfa41740 c00e43d8 df800200 df807ec0 00000000
[    3.587318] de60: c00e43d8 c02eda00 00000001 ffffff9c df82de90 c0385672 00000000 df82df60
[    3.595536] de80: df82c000 c00b3b04 00000041 c00e43d8 df813f00 df469a88 000176fc 000176fc
[    3.603751] dea0: df804a80 df813f00 df469a88 df46fc08 00000011 00000002 00000001 00000000
[    3.611968] dec0: 000000d0 c00a3e2c df804a80 df8f8898 c0042c60 df804a80 000000d0 60000013
[    3.620185] dee0: c0042c60 00020020 00000000 df8e0a20 c0385672 c044afe8 c044b070 dfa06980
[    3.628401] df00: 00000000 c00ac4cc c0385672 00000000 00000012 c0385672 c044afe8 c044b070
[    3.636618] df20: dfa06980 c00ad4d8 c0385672 00000000 c044b070 df82df60 c044afe8 c0385672
[    3.644834] df40: c044b070 0000005f c0425148 c0404180 00000000 c00ad6c0 df82df60 c0011084
[    3.653044] df60: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
[    3.661253] df80: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
[    3.669469] dfa0: 00000000 00000000 c047d4c0 c047d4c0 c04202a0 c0420280 c047d4c0 c0008638
[    3.677684] dfc0: c047d4c0 c040497c 00000007 00000007 c0404180 00000000 00000000 c0404810
[    3.685897] dfe0: c000ee6c 00000013 00000000 00000000 00000000 c000ee6c 00000000 00000000
[    3.694164] [<c0077fe4>] (find_get_page+0x68/0xf4) from [<c00cb60c>] (__find_get_block_slow+0x3c/0x158)
[    3.703625] [<c00cb60c>] (__find_get_block_slow+0x3c/0x158) from [<c00cbcd0>] (__find_get_block+0x34/0x50)
[    3.713335] [<c00cbcd0>] (__find_get_block+0x34/0x50) from [<c00ceb5c>] (__getblk_slow+0xb0/0x160)
[    3.722361] [<c00ceb5c>] (__getblk_slow+0xb0/0x160) from [<c00f8dc8>] (__ext3_get_inode_loc+0xd8/0x310)
[    3.731814] [<c00f8dc8>] (__ext3_get_inode_loc+0xd8/0x310) from [<c00fa404>] (ext3_iget+0x40/0x3c8)
[    3.740918] [<c00fa404>] (ext3_iget+0x40/0x3c8) from [<c00ff43c>] (ext3_lookup+0x7c/0xcc)
[    3.749156] [<c00ff43c>] (ext3_lookup+0x7c/0xcc) from [<c00afe0c>] (d_alloc_and_lookup+0x44/0x60)
[    3.758084] [<c00afe0c>] (d_alloc_and_lookup+0x44/0x60) from [<c00b1c8c>] (do_lookup+0x1f4/0x314)
[    3.767010] [<c00b1c8c>] (do_lookup+0x1f4/0x314) from [<c00b1f68>] (link_path_walk+0x1bc/0x7a8)
[    3.775760] [<c00b1f68>] (link_path_walk+0x1bc/0x7a8) from [<c00b3768>] (path_openat+0x9c/0x354)
[    3.784598] [<c00b3768>] (path_openat+0x9c/0x354) from [<c00b3b04>] (do_filp_open+0x30/0x7c)
[    3.793099] [<c00b3b04>] (do_filp_open+0x30/0x7c) from [<c00ac4cc>] (open_exec+0x18/0xe8)
[    3.801328] [<c00ac4cc>] (open_exec+0x18/0xe8) from [<c00ad4d8>] (do_execve_common+0xc8/0x2a8)
[    3.809982] [<c00ad4d8>] (do_execve_common+0xc8/0x2a8) from [<c00ad6c0>] (do_execve+0x8/0xc)
[    3.818484] [<c00ad6c0>] (do_execve+0x8/0xc) from [<c0011084>] (kernel_execve+0x38/0x80)
[    3.826626] [<c0011084>] (kernel_execve+0x38/0x80) from [<c0008638>] (init_post+0x88/0xc4)
[    3.834948] [<c0008638>] (init_post+0x88/0xc4) from [<c040497c>] (kernel_init+0x16c/0x1b0)
[    3.843252] Code: e3c3333e e3c330ff e3530000 0a000000 (e7f001f2) 
[    3.852770] ---[ end trace 6d8d467ddbc23913 ]---
[    3.857378] Kernel panic - not syncing: Fatal exception in interrupt

-- 
nvpublic

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] Optimize multi-CPU tlb flushing a little more
  2012-02-13 16:59     ` Rabin Vincent
@ 2012-02-14 21:59       ` Stephen Warren
  2012-02-14 22:23         ` Russell King - ARM Linux
  0 siblings, 1 reply; 12+ messages in thread
From: Stephen Warren @ 2012-02-14 21:59 UTC (permalink / raw)
  To: linux-arm-kernel

Rabin Vincent wrote at Monday, February 13, 2012 9:59 AM:
> On Mon, Feb 13, 2012 at 04:23:59PM +0000, Russell King - ARM Linux wrote:
> > On Mon, Feb 13, 2012 at 09:36:35PM +0530, Rabin Vincent wrote:
> > > This part conditionally calls get_cpu() but unconditionally calls
> > > put_cpu():
> >
> > Can you try getting rid of put_cpu(), and replacing get_cpu()
> > with smp_processor_id().
> >
> > In theory, this function should be called with preempt disabled so
> > smp_processor_id() should be safe here.
> 
> Just deleting put_cpu() and replacing get_cpu() with smp_processor_id()
> doesn't cause any problems on my setup, because in my setup (v7 SMP) the
> test for possible_tlb_flags fails so
> cpu_mask_test_cpu(smp_processor_id(),...) never gets called.

Aha, I should have searched harder for my stack trace... This thread
describes the exact problem I just posted about a couple of hours ago,
and Russell's suggested fix solves my problem.

> However, local_flush_tlb_mm() itself does not appear to be called with
> preemption disabled, because if I reverse the check so that
> smp_processor_id() is called, I get loads of the following:

I also tried that, and /don't/ see the following stack traces on Tegra,
FWIW.

>  BUG: using smp_processor_id() in preemptible [00000000] code: init/1
>  [<c0013c0c>] (unwind_backtrace+0x0/0xec) from [<c028b2a4>] (dump_stack+0x20/0x24)
>  [<c028b2a4>] (dump_stack+0x20/0x24) from [<c019bbb4>] (debug_smp_processor_id+0xc4/0xf8)
>  [<c019bbb4>] (debug_smp_processor_id+0xc4/0xf8) from [<c0012e30>] (flush_tlb_mm+0x60/0x84)
>  [<c0012e30>] (flush_tlb_mm+0x60/0x84) from [<c00e2578>] (setup_arg_pages+0x260/0x39c)
>  [<c00e2578>] (setup_arg_pages+0x260/0x39c) from [<c011ffcc>] (load_elf_binary+0x3ec/0x11d8)
>  [<c011ffcc>] (load_elf_binary+0x3ec/0x11d8) from [<c00e1e38>] (search_binary_handler+0xd8/0x2c8)
>  [<c00e1e38>] (search_binary_handler+0xd8/0x2c8) from [<c00e3b1c>] (do_execve+0x29c/0x3b4)
>  [<c00e3b1c>] (do_execve+0x29c/0x3b4) from [<c0011080>] (kernel_execve+0x48/0x90)
>  [<c0011080>] (kernel_execve+0x48/0x90) from [<c028b080>] (run_init_process+0x24/0x2c)
>  [<c028b080>] (run_init_process+0x24/0x2c) from [<c028b0e4>] (init_post+0x5c/0xd4)
>  [<c028b0e4>] (init_post+0x5c/0xd4) from [<c039bb04>] (kernel_init+0x174/0x1ac)
> 
>  BUG: using smp_processor_id() in preemptible [00000000] code: modprobe/27
>  [<c0013c0c>] (unwind_backtrace+0x0/0xec) from [<c028b2a4>] (dump_stack+0x20/0x24)
>  [<c028b2a4>] (dump_stack+0x20/0x24) from [<c019bbb4>] (debug_smp_processor_id+0xc4/0xf8)
>  [<c019bbb4>] (debug_smp_processor_id+0xc4/0xf8) from [<c0012e30>] (flush_tlb_mm+0x60/0x84)
>  [<c0012e30>] (flush_tlb_mm+0x60/0x84) from [<c00ca7e4>] (exit_mmap+0x134/0x1e8)
>  [<c00ca7e4>] (exit_mmap+0x134/0x1e8) from [<c001d4b0>] (mmput+0x60/0xf0)
>  [<c001d4b0>] (mmput+0x60/0xf0) from [<c00217bc>] (exit_mm+0x124/0x12c)
>  [<c00217bc>] (exit_mm+0x124/0x12c) from [<c0023084>] (do_exit+0x1e8/0x7b8)
>  [<c0023084>] (do_exit+0x1e8/0x7b8) from [<c0023944>] (do_group_exit+0x98/0xc4)
>  [<c0023944>] (do_group_exit+0x98/0xc4) from [<c0023990>] (__wake_up_parent+0x0/0x30)
>  [<c0023990>] (__wake_up_parent+0x0/0x30) from [<c000dca0>] (ret_fast_syscall+0x0/0x3c)

-- 
nvpublic

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] Optimize multi-CPU tlb flushing a little more
  2012-02-14 21:59       ` Stephen Warren
@ 2012-02-14 22:23         ` Russell King - ARM Linux
  2012-02-14 22:38           ` Stephen Warren
  0 siblings, 1 reply; 12+ messages in thread
From: Russell King - ARM Linux @ 2012-02-14 22:23 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Feb 14, 2012 at 01:59:34PM -0800, Stephen Warren wrote:
> Aha, I should have searched harder for my stack trace... This thread
> describes the exact problem I just posted about a couple of hours ago,
> and Russell's suggested fix solves my problem.

Here's a better fix:

 arch/arm/include/asm/tlbflush.h |   15 ++++++++-------
 1 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/arch/arm/include/asm/tlbflush.h b/arch/arm/include/asm/tlbflush.h
index bb6408a..01f7484 100644
--- a/arch/arm/include/asm/tlbflush.h
+++ b/arch/arm/include/asm/tlbflush.h
@@ -359,14 +359,15 @@ static inline void local_flush_tlb_mm(struct mm_struct *mm)
 	if (tlb_flag(TLB_WB))
 		dsb();
 
-	if (possible_tlb_flags & (TLB_V3_FULL|TLB_V4_U_FULL|TLB_V4_D_FULL|TLB_V4_I_FULL) &&
-	    cpumask_test_cpu(get_cpu(), mm_cpumask(mm))) {
-		tlb_op(TLB_V3_FULL, "c6, c0, 0", zero);
-		tlb_op(TLB_V4_U_FULL, "c8, c7, 0", zero);
-		tlb_op(TLB_V4_D_FULL, "c8, c6, 0", zero);
-		tlb_op(TLB_V4_I_FULL, "c8, c5, 0", zero);
+	if (possible_tlb_flags & (TLB_V3_FULL|TLB_V4_U_FULL|TLB_V4_D_FULL|TLB_V4_I_FULL)) {
+		if (cpumask_test_cpu(get_cpu(), mm_cpumask(mm))) {
+			tlb_op(TLB_V3_FULL, "c6, c0, 0", zero);
+			tlb_op(TLB_V4_U_FULL, "c8, c7, 0", zero);
+			tlb_op(TLB_V4_D_FULL, "c8, c6, 0", zero);
+			tlb_op(TLB_V4_I_FULL, "c8, c5, 0", zero);
+		}
+		put_cpu();
 	}
-	put_cpu();
 
 	tlb_op(TLB_V6_U_ASID, "c8, c7, 2", asid);
 	tlb_op(TLB_V6_D_ASID, "c8, c6, 2", asid);

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH] Optimize multi-CPU tlb flushing a little more
  2012-02-14 22:23         ` Russell King - ARM Linux
@ 2012-02-14 22:38           ` Stephen Warren
  2012-02-14 23:21             ` Stephen Warren
  0 siblings, 1 reply; 12+ messages in thread
From: Stephen Warren @ 2012-02-14 22:38 UTC (permalink / raw)
  To: linux-arm-kernel

Russell King wrote at Tuesday, February 14, 2012 3:24 PM:
> On Tue, Feb 14, 2012 at 01:59:34PM -0800, Stephen Warren wrote:
> > Aha, I should have searched harder for my stack trace... This thread
> > describes the exact problem I just posted about a couple of hours ago,
> > and Russell's suggested fix solves my problem.
> 
> Here's a better fix:
> ...

Tested-by: Stephen Warren <swarren@nvidia.com>

-- 
nvpublic

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] Optimize multi-CPU tlb flushing a little more
  2012-02-14 22:38           ` Stephen Warren
@ 2012-02-14 23:21             ` Stephen Warren
  2012-02-14 23:34               ` Russell King - ARM Linux
  0 siblings, 1 reply; 12+ messages in thread
From: Stephen Warren @ 2012-02-14 23:21 UTC (permalink / raw)
  To: linux-arm-kernel

Russell,

One more query about the original patch; in the following chunk:

@@ -491,15 +471,11 @@ static inline void clean_pmd_entry(void *pmd)
 {
        const unsigned int __tlb_flag = __cpu_tlb_flags;
 
-       if (tlb_flag(TLB_DCLEAN))
-               asm("mcr        p15, 0, %0, c7, c10, 1  @ flush_pmd"
-                       : : "r" (pmd) : "cc");
-
-       if (tlb_flag(TLB_L2CLEAN_FR))
-               asm("mcr        p15, 1, %0, c15, c9, 1  @ L2 flush_pmd"
-                       : : "r" (pmd) : "cc");
+       tlb_op(TLB_DCLEAN, "c7, c10, 1  @ flush_pmd", pmd);
+       tlb_op(TLB_L2CLEAN_FR, "c15, c9, 1  @ L2 flush_pmd", pmd);
 }

You'll notice that the second mcr instruction is passed "p15, 1, ...".
However, the replacement code in tlb_op() always passes "p15, 0, ..."
to mcr/mcrne. I assume this is a problem?

The same thing applies to flush_pmd_entry() too.

-- 
nvpublic

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] Optimize multi-CPU tlb flushing a little more
  2012-02-14 23:21             ` Stephen Warren
@ 2012-02-14 23:34               ` Russell King - ARM Linux
  2012-02-15  0:01                 ` Stephen Warren
  0 siblings, 1 reply; 12+ messages in thread
From: Russell King - ARM Linux @ 2012-02-14 23:34 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Feb 14, 2012 at 03:21:52PM -0800, Stephen Warren wrote:
> Russell,
> 
> One more query about the original patch; in the following chunk:
> 
> @@ -491,15 +471,11 @@ static inline void clean_pmd_entry(void *pmd)
>  {
>         const unsigned int __tlb_flag = __cpu_tlb_flags;
>  
> -       if (tlb_flag(TLB_DCLEAN))
> -               asm("mcr        p15, 0, %0, c7, c10, 1  @ flush_pmd"
> -                       : : "r" (pmd) : "cc");
> -
> -       if (tlb_flag(TLB_L2CLEAN_FR))
> -               asm("mcr        p15, 1, %0, c15, c9, 1  @ L2 flush_pmd"
> -                       : : "r" (pmd) : "cc");
> +       tlb_op(TLB_DCLEAN, "c7, c10, 1  @ flush_pmd", pmd);
> +       tlb_op(TLB_L2CLEAN_FR, "c15, c9, 1  @ L2 flush_pmd", pmd);
>  }
> 
> You'll notice that the second mcr instruction is passed "p15, 1, ...".
> However, the replacement code in tlb_op() always passes "p15, 0, ..."
> to mcr/mcrne. I assume this is a problem?
> 
> The same thing applies to flush_pmd_entry() too.

Damn it.  Well spotted, yes this needs fixing.  Here's an updated patch.

 arch/arm/include/asm/tlbflush.h |   28 ++++++++++++++++------------
 1 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/arch/arm/include/asm/tlbflush.h b/arch/arm/include/asm/tlbflush.h
index bb6408a..1f1d2ed 100644
--- a/arch/arm/include/asm/tlbflush.h
+++ b/arch/arm/include/asm/tlbflush.h
@@ -318,18 +318,21 @@ extern struct cpu_tlb_fns cpu_tlb;
 
 #define tlb_flag(f)	((always_tlb_flags & (f)) || (__tlb_flag & possible_tlb_flags & (f)))
 
-#define tlb_op(f, regs, arg)						\
+#define __tlb_op(f, insnarg, arg)					\
 	do {								\
 		if (always_tlb_flags & (f))				\
-			asm("mcr p15, 0, %0, " regs			\
+			asm("mcr " insnarg				\
 			    : : "r" (arg) : "cc");			\
 		else if (possible_tlb_flags & (f))			\
 			asm("tst %1, %2\n\t"				\
-			    "mcrne p15, 0, %0, " regs			\
+			    "mcrne " insnarg				\
 			    : : "r" (arg), "r" (__tlb_flag), "Ir" (f)	\
 			    : "cc");					\
 	} while (0)
 
+#define tlb_op(f, regs, arg)	__tlb_op(f, "p15, 0, %0, " regs, arg)
+#define tlb_l2_op(f, regs_arg)	__tlb_op(f, "p15, 1, %0, " regs, arg)
+
 static inline void local_flush_tlb_all(void)
 {
 	const int zero = 0;
@@ -359,14 +362,15 @@ static inline void local_flush_tlb_mm(struct mm_struct *mm)
 	if (tlb_flag(TLB_WB))
 		dsb();
 
-	if (possible_tlb_flags & (TLB_V3_FULL|TLB_V4_U_FULL|TLB_V4_D_FULL|TLB_V4_I_FULL) &&
-	    cpumask_test_cpu(get_cpu(), mm_cpumask(mm))) {
-		tlb_op(TLB_V3_FULL, "c6, c0, 0", zero);
-		tlb_op(TLB_V4_U_FULL, "c8, c7, 0", zero);
-		tlb_op(TLB_V4_D_FULL, "c8, c6, 0", zero);
-		tlb_op(TLB_V4_I_FULL, "c8, c5, 0", zero);
+	if (possible_tlb_flags & (TLB_V3_FULL|TLB_V4_U_FULL|TLB_V4_D_FULL|TLB_V4_I_FULL)) {
+		if (cpumask_test_cpu(get_cpu(), mm_cpumask(mm))) {
+			tlb_op(TLB_V3_FULL, "c6, c0, 0", zero);
+			tlb_op(TLB_V4_U_FULL, "c8, c7, 0", zero);
+			tlb_op(TLB_V4_D_FULL, "c8, c6, 0", zero);
+			tlb_op(TLB_V4_I_FULL, "c8, c5, 0", zero);
+		}
+		put_cpu();
 	}
-	put_cpu();
 
 	tlb_op(TLB_V6_U_ASID, "c8, c7, 2", asid);
 	tlb_op(TLB_V6_D_ASID, "c8, c6, 2", asid);
@@ -461,7 +465,7 @@ static inline void flush_pmd_entry(void *pmd)
 	const unsigned int __tlb_flag = __cpu_tlb_flags;
 
 	tlb_op(TLB_DCLEAN, "c7, c10, 1	@ flush_pmd", pmd);
-	tlb_op(TLB_L2CLEAN_FR, "c15, c9, 1  @ L2 flush_pmd", pmd);
+	tlb_l2_op(TLB_L2CLEAN_FR, "c15, c9, 1  @ L2 flush_pmd", pmd);
 
 	if (tlb_flag(TLB_WB))
 		dsb();
@@ -472,7 +476,7 @@ static inline void clean_pmd_entry(void *pmd)
 	const unsigned int __tlb_flag = __cpu_tlb_flags;
 
 	tlb_op(TLB_DCLEAN, "c7, c10, 1	@ flush_pmd", pmd);
-	tlb_op(TLB_L2CLEAN_FR, "c15, c9, 1  @ L2 flush_pmd", pmd);
+	tlb_l2_op(TLB_L2CLEAN_FR, "c15, c9, 1  @ L2 flush_pmd", pmd);
 }
 
 #undef tlb_op

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH] Optimize multi-CPU tlb flushing a little more
  2012-02-14 23:34               ` Russell King - ARM Linux
@ 2012-02-15  0:01                 ` Stephen Warren
  0 siblings, 0 replies; 12+ messages in thread
From: Stephen Warren @ 2012-02-15  0:01 UTC (permalink / raw)
  To: linux-arm-kernel

Russell King wrote at Tuesday, February 14, 2012 4:34 PM:
> On Tue, Feb 14, 2012 at 03:21:52PM -0800, Stephen Warren wrote:
> > Russell,
> >
> > One more query about the original patch; in the following chunk:
> >
> > @@ -491,15 +471,11 @@ static inline void clean_pmd_entry(void *pmd)
> >  {
> >         const unsigned int __tlb_flag = __cpu_tlb_flags;
> >
> > -       if (tlb_flag(TLB_DCLEAN))
> > -               asm("mcr        p15, 0, %0, c7, c10, 1  @ flush_pmd"
> > -                       : : "r" (pmd) : "cc");
> > -
> > -       if (tlb_flag(TLB_L2CLEAN_FR))
> > -               asm("mcr        p15, 1, %0, c15, c9, 1  @ L2 flush_pmd"
> > -                       : : "r" (pmd) : "cc");
> > +       tlb_op(TLB_DCLEAN, "c7, c10, 1  @ flush_pmd", pmd);
> > +       tlb_op(TLB_L2CLEAN_FR, "c15, c9, 1  @ L2 flush_pmd", pmd);
> >  }
> >
> > You'll notice that the second mcr instruction is passed "p15, 1, ...".
> > However, the replacement code in tlb_op() always passes "p15, 0, ..."
> > to mcr/mcrne. I assume this is a problem?
> >
> > The same thing applies to flush_pmd_entry() too.
> 
> Damn it.  Well spotted, yes this needs fixing.  Here's an updated patch.
> ...

That looks reasonable to me.

I didn't retest, since I didn't observe any problems due to this issue.

-- 
nvpublic

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2012-02-15  0:01 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2011-08-23 11:06 [PATCH] Optimize multi-CPU tlb flushing a little more Russell King - ARM Linux
2011-09-06 16:53 ` Catalin Marinas
2012-02-13 16:06 ` Rabin Vincent
2012-02-13 16:23   ` Russell King - ARM Linux
2012-02-13 16:59     ` Rabin Vincent
2012-02-14 21:59       ` Stephen Warren
2012-02-14 22:23         ` Russell King - ARM Linux
2012-02-14 22:38           ` Stephen Warren
2012-02-14 23:21             ` Stephen Warren
2012-02-14 23:34               ` Russell King - ARM Linux
2012-02-15  0:01                 ` Stephen Warren
2012-02-14 18:52 ` Stephen Warren

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).