* [PATCH] Optimize multi-CPU tlb flushing a little more
@ 2011-08-23 11:06 Russell King - ARM Linux
2011-09-06 16:53 ` Catalin Marinas
` (2 more replies)
0 siblings, 3 replies; 12+ messages in thread
From: Russell King - ARM Linux @ 2011-08-23 11:06 UTC (permalink / raw)
To: linux-arm-kernel
The compiler does not conditionalize the assembly instructions for
the tlb operations, which leads to sub-optimal code being generated
when building a kernel for multiple CPUs.
We can tweak things fairly simply as the code fragment below shows:
17f8: e3120001 tst r2, #1 ; 0x1
...
1800: 0a000000 beq 1808 <handle_pte_fault+0x194>
1804: ee061f10 mcr 15, 0, r1, cr6, cr0, {0}
1808: e3120004 tst r2, #4 ; 0x4
180c: 0a000000 beq 1814 <handle_pte_fault+0x1a0>
1810: ee081f36 mcr 15, 0, r1, cr8, cr6, {1}
becomes:
17f0: e3120001 tst r2, #1 ; 0x1
17f4: 1e063f10 mcrne 15, 0, r3, cr6, cr0, {0}
17f8: e3120004 tst r2, #4 ; 0x4
17fc: 1e083f36 mcrne 15, 0, r3, cr8, cr6, {1}
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
arch/arm/include/asm/tlbflush.h | 128 ++++++++++++++++------------------------
1 file changed, 52 insertions(+), 76 deletions(-)
diff --git a/arch/arm/include/asm/tlbflush.h b/arch/arm/include/asm/tlbflush.h
index d2005de..252874c 100644
--- a/arch/arm/include/asm/tlbflush.h
+++ b/arch/arm/include/asm/tlbflush.h
@@ -318,6 +318,18 @@ extern struct cpu_tlb_fns cpu_tlb;
#define tlb_flag(f) ((always_tlb_flags & (f)) || (__tlb_flag & possible_tlb_flags & (f)))
+#define tlb_op(f, regs, arg) \
+ do { \
+ if (always_tlb_flags & (f)) \
+ asm("mcr p15, 0, %0, " regs \
+ : : "r" (arg) : "cc"); \
+ else if (possible_tlb_flags & (f)) \
+ asm("tst %1, %2\n\t" \
+ "mcrne p15, 0, %0, " regs \
+ : : "r" (arg), "r" (__tlb_flag), "Ir" (f) \
+ : "cc"); \
+ } while (0)
+
static inline void local_flush_tlb_all(void)
{
const int zero = 0;
@@ -326,16 +338,11 @@ static inline void local_flush_tlb_all(void)
if (tlb_flag(TLB_WB))
dsb();
- if (tlb_flag(TLB_V3_FULL))
- asm("mcr p15, 0, %0, c6, c0, 0" : : "r" (zero) : "cc");
- if (tlb_flag(TLB_V4_U_FULL | TLB_V6_U_FULL))
- asm("mcr p15, 0, %0, c8, c7, 0" : : "r" (zero) : "cc");
- if (tlb_flag(TLB_V4_D_FULL | TLB_V6_D_FULL))
- asm("mcr p15, 0, %0, c8, c6, 0" : : "r" (zero) : "cc");
- if (tlb_flag(TLB_V4_I_FULL | TLB_V6_I_FULL))
- asm("mcr p15, 0, %0, c8, c5, 0" : : "r" (zero) : "cc");
- if (tlb_flag(TLB_V7_UIS_FULL))
- asm("mcr p15, 0, %0, c8, c3, 0" : : "r" (zero) : "cc");
+ tlb_op(TLB_V3_FULL, "c6, c0, 0", zero);
+ tlb_op(TLB_V4_U_FULL | TLB_V6_U_FULL, "c8, c7, 0", zero);
+ tlb_op(TLB_V4_D_FULL | TLB_V6_D_FULL, "c8, c6, 0", zero);
+ tlb_op(TLB_V4_I_FULL | TLB_V6_I_FULL, "c8, c5, 0", zero);
+ tlb_op(TLB_V7_UIS_FULL, "c8, c3, 0", zero);
if (tlb_flag(TLB_BARRIER)) {
dsb();
@@ -352,29 +359,22 @@ static inline void local_flush_tlb_mm(struct mm_struct *mm)
if (tlb_flag(TLB_WB))
dsb();
- if (cpumask_test_cpu(get_cpu(), mm_cpumask(mm))) {
- if (tlb_flag(TLB_V3_FULL))
- asm("mcr p15, 0, %0, c6, c0, 0" : : "r" (zero) : "cc");
- if (tlb_flag(TLB_V4_U_FULL))
- asm("mcr p15, 0, %0, c8, c7, 0" : : "r" (zero) : "cc");
- if (tlb_flag(TLB_V4_D_FULL))
- asm("mcr p15, 0, %0, c8, c6, 0" : : "r" (zero) : "cc");
- if (tlb_flag(TLB_V4_I_FULL))
- asm("mcr p15, 0, %0, c8, c5, 0" : : "r" (zero) : "cc");
+ if (possible_tlb_flags & (TLB_V3_FULL|TLB_V4_U_FULL|TLB_V4_D_FULL|TLB_V4_I_FULL) &&
+ cpumask_test_cpu(get_cpu(), mm_cpumask(mm))) {
+ tlb_op(TLB_V3_FULL, "c6, c0, 0", zero);
+ tlb_op(TLB_V4_U_FULL, "c8, c7, 0", zero);
+ tlb_op(TLB_V4_D_FULL, "c8, c6, 0", zero);
+ tlb_op(TLB_V4_I_FULL, "c8, c5, 0", zero);
}
put_cpu();
- if (tlb_flag(TLB_V6_U_ASID))
- asm("mcr p15, 0, %0, c8, c7, 2" : : "r" (asid) : "cc");
- if (tlb_flag(TLB_V6_D_ASID))
- asm("mcr p15, 0, %0, c8, c6, 2" : : "r" (asid) : "cc");
- if (tlb_flag(TLB_V6_I_ASID))
- asm("mcr p15, 0, %0, c8, c5, 2" : : "r" (asid) : "cc");
- if (tlb_flag(TLB_V7_UIS_ASID))
+ tlb_op(TLB_V6_U_ASID, "c8, c7, 2", asid);
+ tlb_op(TLB_V6_D_ASID, "c8, c6, 2", asid);
+ tlb_op(TLB_V6_I_ASID, "c8, c5, 2", asid);
#ifdef CONFIG_ARM_ERRATA_720789
- asm("mcr p15, 0, %0, c8, c3, 0" : : "r" (zero) : "cc");
+ tlb_op(TLB_V7_UIS_ASID, "c8, c3, 0", zero);
#else
- asm("mcr p15, 0, %0, c8, c3, 2" : : "r" (asid) : "cc");
+ tlb_op(TLB_V7_UIS_ASID, "c8, c3, 2", asid);
#endif
if (tlb_flag(TLB_BARRIER))
@@ -392,30 +392,23 @@ local_flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr)
if (tlb_flag(TLB_WB))
dsb();
- if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm))) {
- if (tlb_flag(TLB_V3_PAGE))
- asm("mcr p15, 0, %0, c6, c0, 0" : : "r" (uaddr) : "cc");
- if (tlb_flag(TLB_V4_U_PAGE))
- asm("mcr p15, 0, %0, c8, c7, 1" : : "r" (uaddr) : "cc");
- if (tlb_flag(TLB_V4_D_PAGE))
- asm("mcr p15, 0, %0, c8, c6, 1" : : "r" (uaddr) : "cc");
- if (tlb_flag(TLB_V4_I_PAGE))
- asm("mcr p15, 0, %0, c8, c5, 1" : : "r" (uaddr) : "cc");
+ if (possible_tlb_flags & (TLB_V3_PAGE|TLB_V4_U_PAGE|TLB_V4_D_PAGE|TLB_V4_I_PAGE|TLB_V4_I_FULL) &&
+ cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm))) {
+ tlb_op(TLB_V3_PAGE, "c6, c0, 0", uaddr);
+ tlb_op(TLB_V4_U_PAGE, "c8, c7, 1", uaddr);
+ tlb_op(TLB_V4_D_PAGE, "c8, c6, 1", uaddr);
+ tlb_op(TLB_V4_I_PAGE, "c8, c5, 1", uaddr);
if (!tlb_flag(TLB_V4_I_PAGE) && tlb_flag(TLB_V4_I_FULL))
asm("mcr p15, 0, %0, c8, c5, 0" : : "r" (zero) : "cc");
}
- if (tlb_flag(TLB_V6_U_PAGE))
- asm("mcr p15, 0, %0, c8, c7, 1" : : "r" (uaddr) : "cc");
- if (tlb_flag(TLB_V6_D_PAGE))
- asm("mcr p15, 0, %0, c8, c6, 1" : : "r" (uaddr) : "cc");
- if (tlb_flag(TLB_V6_I_PAGE))
- asm("mcr p15, 0, %0, c8, c5, 1" : : "r" (uaddr) : "cc");
- if (tlb_flag(TLB_V7_UIS_PAGE))
+ tlb_op(TLB_V6_U_PAGE, "c8, c7, 1", uaddr);
+ tlb_op(TLB_V6_D_PAGE, "c8, c6, 1", uaddr);
+ tlb_op(TLB_V6_I_PAGE, "c8, c5, 1", uaddr);
#ifdef CONFIG_ARM_ERRATA_720789
- asm("mcr p15, 0, %0, c8, c3, 3" : : "r" (uaddr & PAGE_MASK) : "cc");
+ tlb_op(TLB_V7_UIS_PAGE, "c8, c3, 3", uaddr & PAGE_MASK);
#else
- asm("mcr p15, 0, %0, c8, c3, 1" : : "r" (uaddr) : "cc");
+ tlb_op(TLB_V7_UIS_PAGE, "c8, c3, 1", uaddr);
#endif
if (tlb_flag(TLB_BARRIER))
@@ -432,25 +425,17 @@ static inline void local_flush_tlb_kernel_page(unsigned long kaddr)
if (tlb_flag(TLB_WB))
dsb();
- if (tlb_flag(TLB_V3_PAGE))
- asm("mcr p15, 0, %0, c6, c0, 0" : : "r" (kaddr) : "cc");
- if (tlb_flag(TLB_V4_U_PAGE))
- asm("mcr p15, 0, %0, c8, c7, 1" : : "r" (kaddr) : "cc");
- if (tlb_flag(TLB_V4_D_PAGE))
- asm("mcr p15, 0, %0, c8, c6, 1" : : "r" (kaddr) : "cc");
- if (tlb_flag(TLB_V4_I_PAGE))
- asm("mcr p15, 0, %0, c8, c5, 1" : : "r" (kaddr) : "cc");
+ tlb_op(TLB_V3_PAGE, "c6, c0, 0", kaddr);
+ tlb_op(TLB_V4_U_PAGE, "c8, c7, 1", kaddr);
+ tlb_op(TLB_V4_D_PAGE, "c8, c6, 1", kaddr);
+ tlb_op(TLB_V4_I_PAGE, "c8, c5, 1", kaddr);
if (!tlb_flag(TLB_V4_I_PAGE) && tlb_flag(TLB_V4_I_FULL))
asm("mcr p15, 0, %0, c8, c5, 0" : : "r" (zero) : "cc");
- if (tlb_flag(TLB_V6_U_PAGE))
- asm("mcr p15, 0, %0, c8, c7, 1" : : "r" (kaddr) : "cc");
- if (tlb_flag(TLB_V6_D_PAGE))
- asm("mcr p15, 0, %0, c8, c6, 1" : : "r" (kaddr) : "cc");
- if (tlb_flag(TLB_V6_I_PAGE))
- asm("mcr p15, 0, %0, c8, c5, 1" : : "r" (kaddr) : "cc");
- if (tlb_flag(TLB_V7_UIS_PAGE))
- asm("mcr p15, 0, %0, c8, c3, 1" : : "r" (kaddr) : "cc");
+ tlb_op(TLB_V6_U_PAGE, "c8, c7, 1", kaddr);
+ tlb_op(TLB_V6_D_PAGE, "c8, c6, 1", kaddr);
+ tlb_op(TLB_V6_I_PAGE, "c8, c5, 1", kaddr);
+ tlb_op(TLB_V7_UIS_PAGE, "c8, c3, 1", kaddr);
if (tlb_flag(TLB_BARRIER)) {
dsb();
@@ -475,13 +460,8 @@ static inline void flush_pmd_entry(pmd_t *pmd)
{
const unsigned int __tlb_flag = __cpu_tlb_flags;
- if (tlb_flag(TLB_DCLEAN))
- asm("mcr p15, 0, %0, c7, c10, 1 @ flush_pmd"
- : : "r" (pmd) : "cc");
-
- if (tlb_flag(TLB_L2CLEAN_FR))
- asm("mcr p15, 1, %0, c15, c9, 1 @ L2 flush_pmd"
- : : "r" (pmd) : "cc");
+ tlb_op(TLB_DCLEAN, "c7, c10, 1 @ flush_pmd", pmd);
+ tlb_op(TLB_L2CLEAN_FR, "c15, c9, 1 @ L2 flush_pmd", pmd);
if (tlb_flag(TLB_WB))
dsb();
@@ -491,15 +471,11 @@ static inline void clean_pmd_entry(pmd_t *pmd)
{
const unsigned int __tlb_flag = __cpu_tlb_flags;
- if (tlb_flag(TLB_DCLEAN))
- asm("mcr p15, 0, %0, c7, c10, 1 @ flush_pmd"
- : : "r" (pmd) : "cc");
-
- if (tlb_flag(TLB_L2CLEAN_FR))
- asm("mcr p15, 1, %0, c15, c9, 1 @ L2 flush_pmd"
- : : "r" (pmd) : "cc");
+ tlb_op(TLB_DCLEAN, "c7, c10, 1 @ flush_pmd", pmd);
+ tlb_op(TLB_L2CLEAN_FR, "c15, c9, 1 @ L2 flush_pmd", pmd);
}
+#undef tlb_op
#undef tlb_flag
#undef always_tlb_flags
#undef possible_tlb_flags
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH] Optimize multi-CPU tlb flushing a little more
2011-08-23 11:06 [PATCH] Optimize multi-CPU tlb flushing a little more Russell King - ARM Linux
@ 2011-09-06 16:53 ` Catalin Marinas
2012-02-13 16:06 ` Rabin Vincent
2012-02-14 18:52 ` Stephen Warren
2 siblings, 0 replies; 12+ messages in thread
From: Catalin Marinas @ 2011-09-06 16:53 UTC (permalink / raw)
To: linux-arm-kernel
On 23 August 2011 12:06, Russell King - ARM Linux
<linux@arm.linux.org.uk> wrote:
> The compiler does not conditionalize the assembly instructions for
> the tlb operations, which leads to sub-optimal code being generated
> when building a kernel for multiple CPUs.
>
> We can tweak things fairly simply as the code fragment below shows:
>
> ? ?17f8: ? ? ? e3120001 ? ? ? ?tst ? ? r2, #1 ?; 0x1
> ...
> ? ?1800: ? ? ? 0a000000 ? ? ? ?beq ? ? 1808 <handle_pte_fault+0x194>
> ? ?1804: ? ? ? ee061f10 ? ? ? ?mcr ? ? 15, 0, r1, cr6, cr0, {0}
> ? ?1808: ? ? ? e3120004 ? ? ? ?tst ? ? r2, #4 ?; 0x4
> ? ?180c: ? ? ? 0a000000 ? ? ? ?beq ? ? 1814 <handle_pte_fault+0x1a0>
> ? ?1810: ? ? ? ee081f36 ? ? ? ?mcr ? ? 15, 0, r1, cr8, cr6, {1}
> becomes:
> ? ?17f0: ? ? ? e3120001 ? ? ? ?tst ? ? r2, #1 ?; 0x1
> ? ?17f4: ? ? ? 1e063f10 ? ? ? ?mcrne ? 15, 0, r3, cr6, cr0, {0}
> ? ?17f8: ? ? ? e3120004 ? ? ? ?tst ? ? r2, #4 ?; 0x4
> ? ?17fc: ? ? ? 1e083f36 ? ? ? ?mcrne ? 15, 0, r3, cr8, cr6, {1}
We need to be careful in this area if a conditional TLB operation is
not supported by the hardware. Conditional undefined instructions may
trigger an undef abort even if the condition fails (though I think
that's the case only on a Qualcomm implementation, but you never know
in the future).
IIUC, with your patch we could get some conditional inner shareable
TLB maintenance on a UP implementation which doesn't have such
operation.
--
Catalin
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH] Optimize multi-CPU tlb flushing a little more
2011-08-23 11:06 [PATCH] Optimize multi-CPU tlb flushing a little more Russell King - ARM Linux
2011-09-06 16:53 ` Catalin Marinas
@ 2012-02-13 16:06 ` Rabin Vincent
2012-02-13 16:23 ` Russell King - ARM Linux
2012-02-14 18:52 ` Stephen Warren
2 siblings, 1 reply; 12+ messages in thread
From: Rabin Vincent @ 2012-02-13 16:06 UTC (permalink / raw)
To: linux-arm-kernel
On Tue, Aug 23, 2011 at 16:36, Russell King - ARM Linux
<linux@arm.linux.org.uk> wrote:
> @@ -352,29 +359,22 @@ static inline void local_flush_tlb_mm(struct mm_struct *mm)
> ? ? ? ?if (tlb_flag(TLB_WB))
> ? ? ? ? ? ? ? ?dsb();
>
> - ? ? ? if (cpumask_test_cpu(get_cpu(), mm_cpumask(mm))) {
> - ? ? ? ? ? ? ? if (tlb_flag(TLB_V3_FULL))
> - ? ? ? ? ? ? ? ? ? ? ? asm("mcr p15, 0, %0, c6, c0, 0" : : "r" (zero) : "cc");
> - ? ? ? ? ? ? ? if (tlb_flag(TLB_V4_U_FULL))
> - ? ? ? ? ? ? ? ? ? ? ? asm("mcr p15, 0, %0, c8, c7, 0" : : "r" (zero) : "cc");
> - ? ? ? ? ? ? ? if (tlb_flag(TLB_V4_D_FULL))
> - ? ? ? ? ? ? ? ? ? ? ? asm("mcr p15, 0, %0, c8, c6, 0" : : "r" (zero) : "cc");
> - ? ? ? ? ? ? ? if (tlb_flag(TLB_V4_I_FULL))
> - ? ? ? ? ? ? ? ? ? ? ? asm("mcr p15, 0, %0, c8, c5, 0" : : "r" (zero) : "cc");
> + ? ? ? if (possible_tlb_flags & (TLB_V3_FULL|TLB_V4_U_FULL|TLB_V4_D_FULL|TLB_V4_I_FULL) &&
> + ? ? ? ? ? cpumask_test_cpu(get_cpu(), mm_cpumask(mm))) {
> + ? ? ? ? ? ? ? tlb_op(TLB_V3_FULL, "c6, c0, 0", zero);
> + ? ? ? ? ? ? ? tlb_op(TLB_V4_U_FULL, "c8, c7, 0", zero);
> + ? ? ? ? ? ? ? tlb_op(TLB_V4_D_FULL, "c8, c6, 0", zero);
> + ? ? ? ? ? ? ? tlb_op(TLB_V4_I_FULL, "c8, c5, 0", zero);
> ? ? ? ?}
> ? ? ? ?put_cpu();
This part conditionally calls get_cpu() but unconditionally calls
put_cpu():
------------[ cut here ]------------
WARNING: at /home/rabin/kernel/arm/kernel/sched/core.c:3051
sub_preempt_count+0xa0/0xe0()
Modules linked in:
[<c0013c4c>] (unwind_backtrace+0x0/0xec) from [<c028b2e4>]
(dump_stack+0x20/0x24)
[<c028b2e4>] (dump_stack+0x20/0x24) from [<c001f808>]
(warn_slowpath_common+0x5c/0x74)
[<c001f808>] (warn_slowpath_common+0x5c/0x74) from [<c001f84c>]
(warn_slowpath_null+0x2c/0x34)
[<c001f84c>] (warn_slowpath_null+0x2c/0x34) from [<c004d588>]
(sub_preempt_count+0xa0/0xe0)
[<c004d588>] (sub_preempt_count+0xa0/0xe0) from [<c0012e54>]
(flush_tlb_mm+0x64/0xa4)
[<c0012e54>] (flush_tlb_mm+0x64/0xa4) from [<c00e25b8>]
(setup_arg_pages+0x260/0x39c)
[<c00e25b8>] (setup_arg_pages+0x260/0x39c) from [<c012000c>]
(load_elf_binary+0x3ec/0x11d8)
[<c012000c>] (load_elf_binary+0x3ec/0x11d8) from [<c00e1e78>]
(search_binary_handler+0xd8/0x2c8)
[<c00e1e78>] (search_binary_handler+0xd8/0x2c8) from [<c00e3b5c>]
(do_execve+0x29c/0x3b4)
[<c00e3b5c>] (do_execve+0x29c/0x3b4) from [<c0011080>]
(kernel_execve+0x48/0x90)
[<c0011080>] (kernel_execve+0x48/0x90) from [<c0037274>]
(____call_usermodehelper+0x118/0x130)
[<c0037274>] (____call_usermodehelper+0x118/0x130) from [<c000e784>]
(kernel_thread_exit+0x0/0x8)
---[ end trace 309cd09e6562b6c8 ]---
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH] Optimize multi-CPU tlb flushing a little more
2012-02-13 16:06 ` Rabin Vincent
@ 2012-02-13 16:23 ` Russell King - ARM Linux
2012-02-13 16:59 ` Rabin Vincent
0 siblings, 1 reply; 12+ messages in thread
From: Russell King - ARM Linux @ 2012-02-13 16:23 UTC (permalink / raw)
To: linux-arm-kernel
On Mon, Feb 13, 2012 at 09:36:35PM +0530, Rabin Vincent wrote:
> This part conditionally calls get_cpu() but unconditionally calls
> put_cpu():
Can you try getting rid of put_cpu(), and replacing get_cpu()
with smp_processor_id().
In theory, this function should be called with preempt disabled so
smp_processor_id() should be safe here.
Thanks.
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH] Optimize multi-CPU tlb flushing a little more
2012-02-13 16:23 ` Russell King - ARM Linux
@ 2012-02-13 16:59 ` Rabin Vincent
2012-02-14 21:59 ` Stephen Warren
0 siblings, 1 reply; 12+ messages in thread
From: Rabin Vincent @ 2012-02-13 16:59 UTC (permalink / raw)
To: linux-arm-kernel
On Mon, Feb 13, 2012 at 04:23:59PM +0000, Russell King - ARM Linux wrote:
> On Mon, Feb 13, 2012 at 09:36:35PM +0530, Rabin Vincent wrote:
> > This part conditionally calls get_cpu() but unconditionally calls
> > put_cpu():
>
> Can you try getting rid of put_cpu(), and replacing get_cpu()
> with smp_processor_id().
>
> In theory, this function should be called with preempt disabled so
> smp_processor_id() should be safe here.
Just deleting put_cpu() and replacing get_cpu() with smp_processor_id()
doesn't cause any problems on my setup, because in my setup (v7 SMP) the
test for possible_tlb_flags fails so
cpu_mask_test_cpu(smp_processor_id(),...) never gets called.
However, local_flush_tlb_mm() itself does not appear to be called with
preemption disabled, because if I reverse the check so that
smp_processor_id() is called, I get loads of the following:
BUG: using smp_processor_id() in preemptible [00000000] code: init/1
[<c0013c0c>] (unwind_backtrace+0x0/0xec) from [<c028b2a4>] (dump_stack+0x20/0x24)
[<c028b2a4>] (dump_stack+0x20/0x24) from [<c019bbb4>] (debug_smp_processor_id+0xc4/0xf8)
[<c019bbb4>] (debug_smp_processor_id+0xc4/0xf8) from [<c0012e30>] (flush_tlb_mm+0x60/0x84)
[<c0012e30>] (flush_tlb_mm+0x60/0x84) from [<c00e2578>] (setup_arg_pages+0x260/0x39c)
[<c00e2578>] (setup_arg_pages+0x260/0x39c) from [<c011ffcc>] (load_elf_binary+0x3ec/0x11d8)
[<c011ffcc>] (load_elf_binary+0x3ec/0x11d8) from [<c00e1e38>] (search_binary_handler+0xd8/0x2c8)
[<c00e1e38>] (search_binary_handler+0xd8/0x2c8) from [<c00e3b1c>] (do_execve+0x29c/0x3b4)
[<c00e3b1c>] (do_execve+0x29c/0x3b4) from [<c0011080>] (kernel_execve+0x48/0x90)
[<c0011080>] (kernel_execve+0x48/0x90) from [<c028b080>] (run_init_process+0x24/0x2c)
[<c028b080>] (run_init_process+0x24/0x2c) from [<c028b0e4>] (init_post+0x5c/0xd4)
[<c028b0e4>] (init_post+0x5c/0xd4) from [<c039bb04>] (kernel_init+0x174/0x1ac)
BUG: using smp_processor_id() in preemptible [00000000] code: modprobe/27
[<c0013c0c>] (unwind_backtrace+0x0/0xec) from [<c028b2a4>] (dump_stack+0x20/0x24)
[<c028b2a4>] (dump_stack+0x20/0x24) from [<c019bbb4>] (debug_smp_processor_id+0xc4/0xf8)
[<c019bbb4>] (debug_smp_processor_id+0xc4/0xf8) from [<c0012e30>] (flush_tlb_mm+0x60/0x84)
[<c0012e30>] (flush_tlb_mm+0x60/0x84) from [<c00ca7e4>] (exit_mmap+0x134/0x1e8)
[<c00ca7e4>] (exit_mmap+0x134/0x1e8) from [<c001d4b0>] (mmput+0x60/0xf0)
[<c001d4b0>] (mmput+0x60/0xf0) from [<c00217bc>] (exit_mm+0x124/0x12c)
[<c00217bc>] (exit_mm+0x124/0x12c) from [<c0023084>] (do_exit+0x1e8/0x7b8)
[<c0023084>] (do_exit+0x1e8/0x7b8) from [<c0023944>] (do_group_exit+0x98/0xc4)
[<c0023944>] (do_group_exit+0x98/0xc4) from [<c0023990>] (__wake_up_parent+0x0/0x30)
[<c0023990>] (__wake_up_parent+0x0/0x30) from [<c000dca0>] (ret_fast_syscall+0x0/0x3c)
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH] Optimize multi-CPU tlb flushing a little more
2011-08-23 11:06 [PATCH] Optimize multi-CPU tlb flushing a little more Russell King - ARM Linux
2011-09-06 16:53 ` Catalin Marinas
2012-02-13 16:06 ` Rabin Vincent
@ 2012-02-14 18:52 ` Stephen Warren
2 siblings, 0 replies; 12+ messages in thread
From: Stephen Warren @ 2012-02-14 18:52 UTC (permalink / raw)
To: linux-arm-kernel
Russell King wrote at Tuesday, August 23, 2011 5:06 AM:
> The compiler does not conditionalize the assembly instructions for
> the tlb operations, which leads to sub-optimal code being generated
> when building a kernel for multiple CPUs.
>
> We can tweak things fairly simply as the code fragment below shows:
>
> 17f8: e3120001 tst r2, #1 ; 0x1
> ...
> 1800: 0a000000 beq 1808 <handle_pte_fault+0x194>
> 1804: ee061f10 mcr 15, 0, r1, cr6, cr0, {0}
> 1808: e3120004 tst r2, #4 ; 0x4
> 180c: 0a000000 beq 1814 <handle_pte_fault+0x1a0>
> 1810: ee081f36 mcr 15, 0, r1, cr8, cr6, {1}
> becomes:
> 17f0: e3120001 tst r2, #1 ; 0x1
> 17f4: 1e063f10 mcrne 15, 0, r3, cr6, cr0, {0}
> 17f8: e3120004 tst r2, #4 ; 0x4
> 17fc: 1e083f36 mcrne 15, 0, r3, cr8, cr6, {1}
Russell,
This change causes Tegra to fail to boot, starting with next-20120209.
The spew is shown below. I tested 10 times and got the same failure,
then reverted this patch and tested another 10 times without a failure.
I've eliminated other factors such as SD card, board, and compiler.
Is there a bug in the patch, or is it exposing some pre-existing bug in
the Tegra code?
Thanks.
[ 3.179569] VFS: Mounted root (ext3 filesystem) readonly on device 179:1.
[ 3.187189] Freeing init memory: 140K
[ 3.191124] kjournald starting. Commit interval 5 seconds
[ 3.227325] ------------[ cut here ]------------
[ 3.232026] WARNING: at kernel/mutex.c:314 __mutex_unlock_slowpath+0x60/0x14c()
[ 3.239355] Modules linked in:
[ 3.242627] [<c0013d80>] (unwind_backtrace+0x0/0x120) from [<c002519c>] (warn_slowpath_common+0x4c/0x64)
[ 3.252164] [<c002519c>] (warn_slowpath_common+0x4c/0x64) from [<c00251cc>] (warn_slowpath_null+0x18/0x1c)
[ 3.261882] [<c00251cc>] (warn_slowpath_null+0x18/0x1c) from [<c02e3c40>] (__mutex_unlock_slowpath+0x60/0x14c)
[ 3.271942] [<c02e3c40>] (__mutex_unlock_slowpath+0x60/0x14c) from [<c0095ff0>] (expand_downwards+0xc0/0xc8)
[ 3.282111] [<c0095ff0>] (expand_downwards+0xc0/0xc8) from [<c00ac910>] (setup_arg_pages+0x1a0/0x1d4)
[ 3.291412] [<c00ac910>] (setup_arg_pages+0x1a0/0x1d4) from [<c00e3d84>] (load_elf_binary+0x3d8/0xaa0)
[ 3.300781] [<c00e3d84>] (load_elf_binary+0x3d8/0xaa0) from [<c00abbd0>] (search_binary_handler+0x84/0x274)
[ 3.310580] [<c00abbd0>] (search_binary_handler+0x84/0x274) from [<c00ad5a0>] (do_execve_common+0x190/0x2a8)
[ 3.320511] [<c00ad5a0>] (do_execve_common+0x190/0x2a8) from [<c00ad6c0>] (do_execve+0x8/0xc)
[ 3.329113] [<c00ad6c0>] (do_execve+0x8/0xc) from [<c0011084>] (kernel_execve+0x38/0x80)
[ 3.337258] [<c0011084>] (kernel_execve+0x38/0x80) from [<c0008630>] (init_post+0x80/0xc4)
[ 3.345588] [<c0008630>] (init_post+0x80/0xc4) from [<c040497c>] (kernel_init+0x16c/0x1b0)
[ 3.353878] ---[ end trace 6d8d467ddbc23912 ]---
[ 3.358918] ------------[ cut here ]------------
[ 3.363570] kernel BUG at include/linux/pagemap.h:134!
[ 3.368734] Internal error: Oops - undefined instruction: 0 [#1] PREEMPT SMP ARM
[ 3.376144] Modules linked in:
[ 3.379242] CPU: 0 Tainted: G W (3.3.0-rc2-next-20120209+ #1)
[ 3.386259] PC is at find_get_page+0x68/0xf4
[ 3.390556] LR is at find_get_page+0x2c/0xf4
[ 3.394857] pc : [<c0077fe4>] lr : [<c0077fa8>] psr: 20000013
[ 3.394877] sp : df82dc50 ip : 00000000 fp : 00000000
[ 3.406375] r10: 00000000 r9 : df46def0 r8 : df46dd70
[ 3.411621] r7 : c06c75bc r6 : 000280ff r5 : df46def4 r4 : df82c000
[ 3.418171] r3 : 07ffff00 r2 : 00000001 r1 : 000280ff r0 : df40e564
[ 3.424725] Flags: nzCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment user
[ 3.431886] Control: 10c5387d Table: 1fa6404a DAC: 00000015
[ 3.437653] Process init (pid: 1, stack limit = 0xdf82c2f0)
[ 3.443247] Stack: (0xdf82dc50 to 0xdf82e000)
[ 3.447633] dc40: 000280ff 00000000 df46de10 000280ff
[ 3.455850] dc60: df46dd70 c00cb60c df40f4f8 00000000 00000000 00001000 00000000 c00cb774
[ 3.464066] dc80: ffffffff df40f4f8 00000000 c06c75bc 000280ff 00000000 df46dd70 00000000
[ 3.472282] dca0: 000280ff 00000000 00000fff c00cbcd0 00001000 00000003 000280ff 00001000
[ 3.480499] dcc0: df46dd70 c06c75bc 00000000 c00ceb5c 00001000 00001000 00000005 c00cbcd0
[ 3.488715] dce0: 00001000 00091c00 000280ff 000280ff 00000000 00001000 df46dd70 dfa11400
[ 3.496931] dd00: df410560 df82dd5c 000280ff 00000000 dfa10200 00000005 c0ba50a0 c00f8dc8
[ 3.505148] dd20: 00001000 c00bb8ec df410560 df4105a8 c0b94210 df410560 df4104b0 dfa10000
[ 3.513362] dd40: 00000000 00000001 df82de0c df46fc78 00000001 c00fa404 00000000 00000000
[ 3.521577] dd60: 00000c00 00000005 df46cb68 0000a14d df46cb68 df46fc08 00000000 00000001
[ 3.529795] dd80: df46fc78 c00ff43c df46fc08 c0426060 df46cb68 df46fc08 df82de90 c00afe0c
[ 3.538013] dda0: df82de90 df82de04 df469a88 c00b1c8c df813f00 df82ddf0 00001000 c0089ce0
[ 3.546232] ddc0: df804f00 df82de90 df82c000 c0385677 c0385678 0024a603 df82ddf0 0000002f
[ 3.554449] dde0: df82c000 c00b1f68 df804f00 df8e0a18 0024a603 00000003 c0385673 80000013
[ 3.562667] de00: c00a82c8 c00a489c c0447d6c df46fc08 df8267b0 df82de90 c02eda00 c0385672
[ 3.570884] de20: ffffff9c df82c000 00000000 df82df60 df82c000 c00b3768 df82de5c df800200
[ 3.579102] de40: df807ec0 c00a3094 df800200 dfa41740 c00e43d8 df800200 df807ec0 00000000
[ 3.587318] de60: c00e43d8 c02eda00 00000001 ffffff9c df82de90 c0385672 00000000 df82df60
[ 3.595536] de80: df82c000 c00b3b04 00000041 c00e43d8 df813f00 df469a88 000176fc 000176fc
[ 3.603751] dea0: df804a80 df813f00 df469a88 df46fc08 00000011 00000002 00000001 00000000
[ 3.611968] dec0: 000000d0 c00a3e2c df804a80 df8f8898 c0042c60 df804a80 000000d0 60000013
[ 3.620185] dee0: c0042c60 00020020 00000000 df8e0a20 c0385672 c044afe8 c044b070 dfa06980
[ 3.628401] df00: 00000000 c00ac4cc c0385672 00000000 00000012 c0385672 c044afe8 c044b070
[ 3.636618] df20: dfa06980 c00ad4d8 c0385672 00000000 c044b070 df82df60 c044afe8 c0385672
[ 3.644834] df40: c044b070 0000005f c0425148 c0404180 00000000 c00ad6c0 df82df60 c0011084
[ 3.653044] df60: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
[ 3.661253] df80: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
[ 3.669469] dfa0: 00000000 00000000 c047d4c0 c047d4c0 c04202a0 c0420280 c047d4c0 c0008638
[ 3.677684] dfc0: c047d4c0 c040497c 00000007 00000007 c0404180 00000000 00000000 c0404810
[ 3.685897] dfe0: c000ee6c 00000013 00000000 00000000 00000000 c000ee6c 00000000 00000000
[ 3.694164] [<c0077fe4>] (find_get_page+0x68/0xf4) from [<c00cb60c>] (__find_get_block_slow+0x3c/0x158)
[ 3.703625] [<c00cb60c>] (__find_get_block_slow+0x3c/0x158) from [<c00cbcd0>] (__find_get_block+0x34/0x50)
[ 3.713335] [<c00cbcd0>] (__find_get_block+0x34/0x50) from [<c00ceb5c>] (__getblk_slow+0xb0/0x160)
[ 3.722361] [<c00ceb5c>] (__getblk_slow+0xb0/0x160) from [<c00f8dc8>] (__ext3_get_inode_loc+0xd8/0x310)
[ 3.731814] [<c00f8dc8>] (__ext3_get_inode_loc+0xd8/0x310) from [<c00fa404>] (ext3_iget+0x40/0x3c8)
[ 3.740918] [<c00fa404>] (ext3_iget+0x40/0x3c8) from [<c00ff43c>] (ext3_lookup+0x7c/0xcc)
[ 3.749156] [<c00ff43c>] (ext3_lookup+0x7c/0xcc) from [<c00afe0c>] (d_alloc_and_lookup+0x44/0x60)
[ 3.758084] [<c00afe0c>] (d_alloc_and_lookup+0x44/0x60) from [<c00b1c8c>] (do_lookup+0x1f4/0x314)
[ 3.767010] [<c00b1c8c>] (do_lookup+0x1f4/0x314) from [<c00b1f68>] (link_path_walk+0x1bc/0x7a8)
[ 3.775760] [<c00b1f68>] (link_path_walk+0x1bc/0x7a8) from [<c00b3768>] (path_openat+0x9c/0x354)
[ 3.784598] [<c00b3768>] (path_openat+0x9c/0x354) from [<c00b3b04>] (do_filp_open+0x30/0x7c)
[ 3.793099] [<c00b3b04>] (do_filp_open+0x30/0x7c) from [<c00ac4cc>] (open_exec+0x18/0xe8)
[ 3.801328] [<c00ac4cc>] (open_exec+0x18/0xe8) from [<c00ad4d8>] (do_execve_common+0xc8/0x2a8)
[ 3.809982] [<c00ad4d8>] (do_execve_common+0xc8/0x2a8) from [<c00ad6c0>] (do_execve+0x8/0xc)
[ 3.818484] [<c00ad6c0>] (do_execve+0x8/0xc) from [<c0011084>] (kernel_execve+0x38/0x80)
[ 3.826626] [<c0011084>] (kernel_execve+0x38/0x80) from [<c0008638>] (init_post+0x88/0xc4)
[ 3.834948] [<c0008638>] (init_post+0x88/0xc4) from [<c040497c>] (kernel_init+0x16c/0x1b0)
[ 3.843252] Code: e3c3333e e3c330ff e3530000 0a000000 (e7f001f2)
[ 3.852770] ---[ end trace 6d8d467ddbc23913 ]---
[ 3.857378] Kernel panic - not syncing: Fatal exception in interrupt
--
nvpublic
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH] Optimize multi-CPU tlb flushing a little more
2012-02-13 16:59 ` Rabin Vincent
@ 2012-02-14 21:59 ` Stephen Warren
2012-02-14 22:23 ` Russell King - ARM Linux
0 siblings, 1 reply; 12+ messages in thread
From: Stephen Warren @ 2012-02-14 21:59 UTC (permalink / raw)
To: linux-arm-kernel
Rabin Vincent wrote at Monday, February 13, 2012 9:59 AM:
> On Mon, Feb 13, 2012 at 04:23:59PM +0000, Russell King - ARM Linux wrote:
> > On Mon, Feb 13, 2012 at 09:36:35PM +0530, Rabin Vincent wrote:
> > > This part conditionally calls get_cpu() but unconditionally calls
> > > put_cpu():
> >
> > Can you try getting rid of put_cpu(), and replacing get_cpu()
> > with smp_processor_id().
> >
> > In theory, this function should be called with preempt disabled so
> > smp_processor_id() should be safe here.
>
> Just deleting put_cpu() and replacing get_cpu() with smp_processor_id()
> doesn't cause any problems on my setup, because in my setup (v7 SMP) the
> test for possible_tlb_flags fails so
> cpu_mask_test_cpu(smp_processor_id(),...) never gets called.
Aha, I should have searched harder for my stack trace... This thread
describes the exact problem I just posted about a couple of hours ago,
and Russell's suggested fix solves my problem.
> However, local_flush_tlb_mm() itself does not appear to be called with
> preemption disabled, because if I reverse the check so that
> smp_processor_id() is called, I get loads of the following:
I also tried that, and /don't/ see the following stack traces on Tegra,
FWIW.
> BUG: using smp_processor_id() in preemptible [00000000] code: init/1
> [<c0013c0c>] (unwind_backtrace+0x0/0xec) from [<c028b2a4>] (dump_stack+0x20/0x24)
> [<c028b2a4>] (dump_stack+0x20/0x24) from [<c019bbb4>] (debug_smp_processor_id+0xc4/0xf8)
> [<c019bbb4>] (debug_smp_processor_id+0xc4/0xf8) from [<c0012e30>] (flush_tlb_mm+0x60/0x84)
> [<c0012e30>] (flush_tlb_mm+0x60/0x84) from [<c00e2578>] (setup_arg_pages+0x260/0x39c)
> [<c00e2578>] (setup_arg_pages+0x260/0x39c) from [<c011ffcc>] (load_elf_binary+0x3ec/0x11d8)
> [<c011ffcc>] (load_elf_binary+0x3ec/0x11d8) from [<c00e1e38>] (search_binary_handler+0xd8/0x2c8)
> [<c00e1e38>] (search_binary_handler+0xd8/0x2c8) from [<c00e3b1c>] (do_execve+0x29c/0x3b4)
> [<c00e3b1c>] (do_execve+0x29c/0x3b4) from [<c0011080>] (kernel_execve+0x48/0x90)
> [<c0011080>] (kernel_execve+0x48/0x90) from [<c028b080>] (run_init_process+0x24/0x2c)
> [<c028b080>] (run_init_process+0x24/0x2c) from [<c028b0e4>] (init_post+0x5c/0xd4)
> [<c028b0e4>] (init_post+0x5c/0xd4) from [<c039bb04>] (kernel_init+0x174/0x1ac)
>
> BUG: using smp_processor_id() in preemptible [00000000] code: modprobe/27
> [<c0013c0c>] (unwind_backtrace+0x0/0xec) from [<c028b2a4>] (dump_stack+0x20/0x24)
> [<c028b2a4>] (dump_stack+0x20/0x24) from [<c019bbb4>] (debug_smp_processor_id+0xc4/0xf8)
> [<c019bbb4>] (debug_smp_processor_id+0xc4/0xf8) from [<c0012e30>] (flush_tlb_mm+0x60/0x84)
> [<c0012e30>] (flush_tlb_mm+0x60/0x84) from [<c00ca7e4>] (exit_mmap+0x134/0x1e8)
> [<c00ca7e4>] (exit_mmap+0x134/0x1e8) from [<c001d4b0>] (mmput+0x60/0xf0)
> [<c001d4b0>] (mmput+0x60/0xf0) from [<c00217bc>] (exit_mm+0x124/0x12c)
> [<c00217bc>] (exit_mm+0x124/0x12c) from [<c0023084>] (do_exit+0x1e8/0x7b8)
> [<c0023084>] (do_exit+0x1e8/0x7b8) from [<c0023944>] (do_group_exit+0x98/0xc4)
> [<c0023944>] (do_group_exit+0x98/0xc4) from [<c0023990>] (__wake_up_parent+0x0/0x30)
> [<c0023990>] (__wake_up_parent+0x0/0x30) from [<c000dca0>] (ret_fast_syscall+0x0/0x3c)
--
nvpublic
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH] Optimize multi-CPU tlb flushing a little more
2012-02-14 21:59 ` Stephen Warren
@ 2012-02-14 22:23 ` Russell King - ARM Linux
2012-02-14 22:38 ` Stephen Warren
0 siblings, 1 reply; 12+ messages in thread
From: Russell King - ARM Linux @ 2012-02-14 22:23 UTC (permalink / raw)
To: linux-arm-kernel
On Tue, Feb 14, 2012 at 01:59:34PM -0800, Stephen Warren wrote:
> Aha, I should have searched harder for my stack trace... This thread
> describes the exact problem I just posted about a couple of hours ago,
> and Russell's suggested fix solves my problem.
Here's a better fix:
arch/arm/include/asm/tlbflush.h | 15 ++++++++-------
1 files changed, 8 insertions(+), 7 deletions(-)
diff --git a/arch/arm/include/asm/tlbflush.h b/arch/arm/include/asm/tlbflush.h
index bb6408a..01f7484 100644
--- a/arch/arm/include/asm/tlbflush.h
+++ b/arch/arm/include/asm/tlbflush.h
@@ -359,14 +359,15 @@ static inline void local_flush_tlb_mm(struct mm_struct *mm)
if (tlb_flag(TLB_WB))
dsb();
- if (possible_tlb_flags & (TLB_V3_FULL|TLB_V4_U_FULL|TLB_V4_D_FULL|TLB_V4_I_FULL) &&
- cpumask_test_cpu(get_cpu(), mm_cpumask(mm))) {
- tlb_op(TLB_V3_FULL, "c6, c0, 0", zero);
- tlb_op(TLB_V4_U_FULL, "c8, c7, 0", zero);
- tlb_op(TLB_V4_D_FULL, "c8, c6, 0", zero);
- tlb_op(TLB_V4_I_FULL, "c8, c5, 0", zero);
+ if (possible_tlb_flags & (TLB_V3_FULL|TLB_V4_U_FULL|TLB_V4_D_FULL|TLB_V4_I_FULL)) {
+ if (cpumask_test_cpu(get_cpu(), mm_cpumask(mm))) {
+ tlb_op(TLB_V3_FULL, "c6, c0, 0", zero);
+ tlb_op(TLB_V4_U_FULL, "c8, c7, 0", zero);
+ tlb_op(TLB_V4_D_FULL, "c8, c6, 0", zero);
+ tlb_op(TLB_V4_I_FULL, "c8, c5, 0", zero);
+ }
+ put_cpu();
}
- put_cpu();
tlb_op(TLB_V6_U_ASID, "c8, c7, 2", asid);
tlb_op(TLB_V6_D_ASID, "c8, c6, 2", asid);
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH] Optimize multi-CPU tlb flushing a little more
2012-02-14 22:23 ` Russell King - ARM Linux
@ 2012-02-14 22:38 ` Stephen Warren
2012-02-14 23:21 ` Stephen Warren
0 siblings, 1 reply; 12+ messages in thread
From: Stephen Warren @ 2012-02-14 22:38 UTC (permalink / raw)
To: linux-arm-kernel
Russell King wrote at Tuesday, February 14, 2012 3:24 PM:
> On Tue, Feb 14, 2012 at 01:59:34PM -0800, Stephen Warren wrote:
> > Aha, I should have searched harder for my stack trace... This thread
> > describes the exact problem I just posted about a couple of hours ago,
> > and Russell's suggested fix solves my problem.
>
> Here's a better fix:
> ...
Tested-by: Stephen Warren <swarren@nvidia.com>
--
nvpublic
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH] Optimize multi-CPU tlb flushing a little more
2012-02-14 22:38 ` Stephen Warren
@ 2012-02-14 23:21 ` Stephen Warren
2012-02-14 23:34 ` Russell King - ARM Linux
0 siblings, 1 reply; 12+ messages in thread
From: Stephen Warren @ 2012-02-14 23:21 UTC (permalink / raw)
To: linux-arm-kernel
Russell,
One more query about the original patch; in the following chunk:
@@ -491,15 +471,11 @@ static inline void clean_pmd_entry(void *pmd)
{
const unsigned int __tlb_flag = __cpu_tlb_flags;
- if (tlb_flag(TLB_DCLEAN))
- asm("mcr p15, 0, %0, c7, c10, 1 @ flush_pmd"
- : : "r" (pmd) : "cc");
-
- if (tlb_flag(TLB_L2CLEAN_FR))
- asm("mcr p15, 1, %0, c15, c9, 1 @ L2 flush_pmd"
- : : "r" (pmd) : "cc");
+ tlb_op(TLB_DCLEAN, "c7, c10, 1 @ flush_pmd", pmd);
+ tlb_op(TLB_L2CLEAN_FR, "c15, c9, 1 @ L2 flush_pmd", pmd);
}
You'll notice that the second mcr instruction is passed "p15, 1, ...".
However, the replacement code in tlb_op() always passes "p15, 0, ..."
to mcr/mcrne. I assume this is a problem?
The same thing applies to flush_pmd_entry() too.
--
nvpublic
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH] Optimize multi-CPU tlb flushing a little more
2012-02-14 23:21 ` Stephen Warren
@ 2012-02-14 23:34 ` Russell King - ARM Linux
2012-02-15 0:01 ` Stephen Warren
0 siblings, 1 reply; 12+ messages in thread
From: Russell King - ARM Linux @ 2012-02-14 23:34 UTC (permalink / raw)
To: linux-arm-kernel
On Tue, Feb 14, 2012 at 03:21:52PM -0800, Stephen Warren wrote:
> Russell,
>
> One more query about the original patch; in the following chunk:
>
> @@ -491,15 +471,11 @@ static inline void clean_pmd_entry(void *pmd)
> {
> const unsigned int __tlb_flag = __cpu_tlb_flags;
>
> - if (tlb_flag(TLB_DCLEAN))
> - asm("mcr p15, 0, %0, c7, c10, 1 @ flush_pmd"
> - : : "r" (pmd) : "cc");
> -
> - if (tlb_flag(TLB_L2CLEAN_FR))
> - asm("mcr p15, 1, %0, c15, c9, 1 @ L2 flush_pmd"
> - : : "r" (pmd) : "cc");
> + tlb_op(TLB_DCLEAN, "c7, c10, 1 @ flush_pmd", pmd);
> + tlb_op(TLB_L2CLEAN_FR, "c15, c9, 1 @ L2 flush_pmd", pmd);
> }
>
> You'll notice that the second mcr instruction is passed "p15, 1, ...".
> However, the replacement code in tlb_op() always passes "p15, 0, ..."
> to mcr/mcrne. I assume this is a problem?
>
> The same thing applies to flush_pmd_entry() too.
Damn it. Well spotted, yes this needs fixing. Here's an updated patch.
arch/arm/include/asm/tlbflush.h | 28 ++++++++++++++++------------
1 files changed, 16 insertions(+), 12 deletions(-)
diff --git a/arch/arm/include/asm/tlbflush.h b/arch/arm/include/asm/tlbflush.h
index bb6408a..1f1d2ed 100644
--- a/arch/arm/include/asm/tlbflush.h
+++ b/arch/arm/include/asm/tlbflush.h
@@ -318,18 +318,21 @@ extern struct cpu_tlb_fns cpu_tlb;
#define tlb_flag(f) ((always_tlb_flags & (f)) || (__tlb_flag & possible_tlb_flags & (f)))
-#define tlb_op(f, regs, arg) \
+#define __tlb_op(f, insnarg, arg) \
do { \
if (always_tlb_flags & (f)) \
- asm("mcr p15, 0, %0, " regs \
+ asm("mcr " insnarg \
: : "r" (arg) : "cc"); \
else if (possible_tlb_flags & (f)) \
asm("tst %1, %2\n\t" \
- "mcrne p15, 0, %0, " regs \
+ "mcrne " insnarg \
: : "r" (arg), "r" (__tlb_flag), "Ir" (f) \
: "cc"); \
} while (0)
+#define tlb_op(f, regs, arg) __tlb_op(f, "p15, 0, %0, " regs, arg)
+#define tlb_l2_op(f, regs_arg) __tlb_op(f, "p15, 1, %0, " regs, arg)
+
static inline void local_flush_tlb_all(void)
{
const int zero = 0;
@@ -359,14 +362,15 @@ static inline void local_flush_tlb_mm(struct mm_struct *mm)
if (tlb_flag(TLB_WB))
dsb();
- if (possible_tlb_flags & (TLB_V3_FULL|TLB_V4_U_FULL|TLB_V4_D_FULL|TLB_V4_I_FULL) &&
- cpumask_test_cpu(get_cpu(), mm_cpumask(mm))) {
- tlb_op(TLB_V3_FULL, "c6, c0, 0", zero);
- tlb_op(TLB_V4_U_FULL, "c8, c7, 0", zero);
- tlb_op(TLB_V4_D_FULL, "c8, c6, 0", zero);
- tlb_op(TLB_V4_I_FULL, "c8, c5, 0", zero);
+ if (possible_tlb_flags & (TLB_V3_FULL|TLB_V4_U_FULL|TLB_V4_D_FULL|TLB_V4_I_FULL)) {
+ if (cpumask_test_cpu(get_cpu(), mm_cpumask(mm))) {
+ tlb_op(TLB_V3_FULL, "c6, c0, 0", zero);
+ tlb_op(TLB_V4_U_FULL, "c8, c7, 0", zero);
+ tlb_op(TLB_V4_D_FULL, "c8, c6, 0", zero);
+ tlb_op(TLB_V4_I_FULL, "c8, c5, 0", zero);
+ }
+ put_cpu();
}
- put_cpu();
tlb_op(TLB_V6_U_ASID, "c8, c7, 2", asid);
tlb_op(TLB_V6_D_ASID, "c8, c6, 2", asid);
@@ -461,7 +465,7 @@ static inline void flush_pmd_entry(void *pmd)
const unsigned int __tlb_flag = __cpu_tlb_flags;
tlb_op(TLB_DCLEAN, "c7, c10, 1 @ flush_pmd", pmd);
- tlb_op(TLB_L2CLEAN_FR, "c15, c9, 1 @ L2 flush_pmd", pmd);
+ tlb_l2_op(TLB_L2CLEAN_FR, "c15, c9, 1 @ L2 flush_pmd", pmd);
if (tlb_flag(TLB_WB))
dsb();
@@ -472,7 +476,7 @@ static inline void clean_pmd_entry(void *pmd)
const unsigned int __tlb_flag = __cpu_tlb_flags;
tlb_op(TLB_DCLEAN, "c7, c10, 1 @ flush_pmd", pmd);
- tlb_op(TLB_L2CLEAN_FR, "c15, c9, 1 @ L2 flush_pmd", pmd);
+ tlb_l2_op(TLB_L2CLEAN_FR, "c15, c9, 1 @ L2 flush_pmd", pmd);
}
#undef tlb_op
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH] Optimize multi-CPU tlb flushing a little more
2012-02-14 23:34 ` Russell King - ARM Linux
@ 2012-02-15 0:01 ` Stephen Warren
0 siblings, 0 replies; 12+ messages in thread
From: Stephen Warren @ 2012-02-15 0:01 UTC (permalink / raw)
To: linux-arm-kernel
Russell King wrote at Tuesday, February 14, 2012 4:34 PM:
> On Tue, Feb 14, 2012 at 03:21:52PM -0800, Stephen Warren wrote:
> > Russell,
> >
> > One more query about the original patch; in the following chunk:
> >
> > @@ -491,15 +471,11 @@ static inline void clean_pmd_entry(void *pmd)
> > {
> > const unsigned int __tlb_flag = __cpu_tlb_flags;
> >
> > - if (tlb_flag(TLB_DCLEAN))
> > - asm("mcr p15, 0, %0, c7, c10, 1 @ flush_pmd"
> > - : : "r" (pmd) : "cc");
> > -
> > - if (tlb_flag(TLB_L2CLEAN_FR))
> > - asm("mcr p15, 1, %0, c15, c9, 1 @ L2 flush_pmd"
> > - : : "r" (pmd) : "cc");
> > + tlb_op(TLB_DCLEAN, "c7, c10, 1 @ flush_pmd", pmd);
> > + tlb_op(TLB_L2CLEAN_FR, "c15, c9, 1 @ L2 flush_pmd", pmd);
> > }
> >
> > You'll notice that the second mcr instruction is passed "p15, 1, ...".
> > However, the replacement code in tlb_op() always passes "p15, 0, ..."
> > to mcr/mcrne. I assume this is a problem?
> >
> > The same thing applies to flush_pmd_entry() too.
>
> Damn it. Well spotted, yes this needs fixing. Here's an updated patch.
> ...
That looks reasonable to me.
I didn't retest, since I didn't observe any problems due to this issue.
--
nvpublic
^ permalink raw reply [flat|nested] 12+ messages in thread
end of thread, other threads:[~2012-02-15 0:01 UTC | newest]
Thread overview: 12+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2011-08-23 11:06 [PATCH] Optimize multi-CPU tlb flushing a little more Russell King - ARM Linux
2011-09-06 16:53 ` Catalin Marinas
2012-02-13 16:06 ` Rabin Vincent
2012-02-13 16:23 ` Russell King - ARM Linux
2012-02-13 16:59 ` Rabin Vincent
2012-02-14 21:59 ` Stephen Warren
2012-02-14 22:23 ` Russell King - ARM Linux
2012-02-14 22:38 ` Stephen Warren
2012-02-14 23:21 ` Stephen Warren
2012-02-14 23:34 ` Russell King - ARM Linux
2012-02-15 0:01 ` Stephen Warren
2012-02-14 18:52 ` Stephen Warren
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).