linux-arm-kernel.lists.infradead.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] ARM: Runtime patch udiv/sdiv instructions into __aeabi_{u}idiv()
@ 2015-12-11  6:26 Nicolas Pitre
  2015-12-11  9:24 ` Arnd Bergmann
  0 siblings, 1 reply; 19+ messages in thread
From: Nicolas Pitre @ 2015-12-11  6:26 UTC (permalink / raw)
  To: linux-arm-kernel


The ARM compiler inserts calls to __aeabi_uidiv() and
__aeabi_idiv() when it needs to perform division on signed and
unsigned integers. If a processor has support for the udiv and
sdiv instructions, the kernel may overwrite the beginning of those
functions with those instructions and a "bx lr" to get better
performance.

To ensure those functions are aligned to a 32-bit word for easier
patching (which might not always be the case in Thumb mode) and the
two patched instructions for each case are contained in the same cache
line, a 8-byte alignment is enforced when ARM_PATCH_IDIV is configured.

This was heavily inspired by a previous patch by Stephen Boyd.

Signed-off-by: Nicolas Pitre <nico@linaro.org>

---

This is my counter proposal to Stephen's version. Those patches and the 
discussion that followed are available here:

  http://news.gmane.org/group/gmane.linux.kbuild.devel/thread=14007

I think what I propose here is much simpler and less intrusive.  Going
to the full call site patching provides between moderate to no 
additional performance benefits depending on the CPU core.  That should 
probably be compared with this solution with benchmark results to 
determine if it is worth it.

Of course the ultimate performance will come from having gcc emit those 
div instructions directly, but that would make the kernel binary 
incompatible with some systems in a multi-arch config. Hence it has to 
be a separate config option.


diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 34e1569a11..efea5fa975 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1604,6 +1604,21 @@ config THUMB2_AVOID_R_ARM_THM_JUMP11
 config ARM_ASM_UNIFIED
 	bool
 
+config ARM_PATCH_IDIV
+	bool "Runtime patch udiv/sdiv instructions into __aeabi_{u}idiv()"
+	depends on CPU_32v7 && !XIP_KERNEL
+	default y
+	help
+	  Some v7 CPUs have support for the udiv and sdiv instructions
+	  that can be used to implement the __aeabi_uidiv and __aeabi_idiv
+	  functions provided by the ARM runtime ABI.
+
+	  Enabling this option allows the kernel to modify itself to replace
+	  the first two instructions of these library functions with the
+	  udiv or sdiv plus "bx lr" instructions. Typically this will be
+	  faster and less power intensive than running the original library
+	  support code to do integer division.
+
 config AEABI
 	bool "Use the ARM EABI to compile the kernel"
 	help
diff --git a/arch/arm/include/asm/cputype.h b/arch/arm/include/asm/cputype.h
index 85e374f873..48c77d422a 100644
--- a/arch/arm/include/asm/cputype.h
+++ b/arch/arm/include/asm/cputype.h
@@ -250,8 +250,14 @@ static inline int cpu_is_pj4(void)
 
 	return 0;
 }
+
+static inline bool __attribute_const__ cpu_is_pj4_nomp(void)
+{
+	return read_cpuid_part() == 0x56005810;
+}
 #else
-#define cpu_is_pj4()	0
+#define cpu_is_pj4()		0
+#define cpu_is_pj4_nomp()	0
 #endif
 
 static inline int __attribute_const__ cpuid_feature_extract_field(u32 features,
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 20edd349d3..d8614775a8 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -375,6 +375,77 @@ void __init early_print(const char *str, ...)
 	printk("%s", buf);
 }
 
+#ifdef CONFIG_ARM_PATCH_IDIV
+
+/* "sdiv r0, r0, r1" or "mrc p6, 1, r0, CR0, CR1, 4" if we're on pj4 w/o MP */
+static inline u32 __attribute_const__ sdiv_instruction(void)
+{
+	if (IS_ENABLED(CONFIG_THUMB2_KERNEL)) {
+		u32 insn = __opcode_thumb32_compose(0xfb90, 0xf0f1);
+		if (cpu_is_pj4_nomp())
+			insn = __opcode_thumb32_compose(0xee30, 0x0691);
+		return __opcode_to_mem_thumb32(insn);
+	}
+
+	if (cpu_is_pj4_nomp())
+		return __opcode_to_mem_arm(0xee300691);
+	return __opcode_to_mem_arm(0xe710f110);
+}
+
+/* "udiv r0, r0, r1" or "mrc p6, 1, r0, CR0, CR1, 0" if we're on pj4 w/o MP */
+static inline u32 __attribute_const__ udiv_instruction(void)
+{
+	if (IS_ENABLED(CONFIG_THUMB2_KERNEL)) {
+		u32 insn = __opcode_thumb32_compose(0xfbb0, 0xf0f1);
+		if (cpu_is_pj4_nomp())
+			insn = __opcode_thumb32_compose(0xee30, 0x0611);
+		return __opcode_to_mem_thumb32(insn);
+	}
+
+	if (cpu_is_pj4_nomp())
+		return __opcode_to_mem_arm(0xee300611);
+	return __opcode_to_mem_arm(0xe730f110);
+}
+
+/* "bx lr" */
+static inline u32 __attribute_const__ bx_lr_instruction(void)
+{
+	if (IS_ENABLED(CONFIG_THUMB2_KERNEL)) {
+		u32 insn = __opcode_thumb32_compose(0x4770, 0x46c0);
+		return __opcode_to_mem_thumb32(insn);
+	}
+	
+	return __opcode_to_mem_arm(0xe12fff1e);
+}
+
+static void __init patch_aeabi_uidiv(void)
+{
+	extern void __aeabi_uidiv(void);
+	extern void __aeabi_idiv(void);
+	uintptr_t fn_addr;
+	unsigned int mask;
+
+	mask = IS_ENABLED(CONFIG_THUMB2_KERNEL) ? HWCAP_IDIVT : HWCAP_IDIVA;
+	if (!(elf_hwcap & mask))
+		return;
+
+	pr_info("CPU: div instructions available: patching division code\n");
+
+	fn_addr = ((uintptr_t)&__aeabi_uidiv) & ~1;
+	((u32 *)fn_addr)[0] = udiv_instruction();
+	((u32 *)fn_addr)[1] = bx_lr_instruction();
+	flush_icache_range(fn_addr, fn_addr + 8);
+
+	fn_addr = ((uintptr_t)&__aeabi_idiv) & ~1;
+	((u32 *)fn_addr)[0] = sdiv_instruction();
+	((u32 *)fn_addr)[1] = bx_lr_instruction();
+	flush_icache_range(fn_addr, fn_addr + 8);
+}
+
+#else
+static inline void patch_aeabi_uidiv(void) { }
+#endif
+
 static void __init cpuid_init_hwcaps(void)
 {
 	int block;
@@ -642,6 +713,7 @@ static void __init setup_processor(void)
 	elf_hwcap = list->elf_hwcap;
 
 	cpuid_init_hwcaps();
+	patch_aeabi_uidiv();
 
 #ifndef CONFIG_ARM_THUMB
 	elf_hwcap &= ~(HWCAP_THUMB | HWCAP_IDIVT);
diff --git a/arch/arm/lib/lib1funcs.S b/arch/arm/lib/lib1funcs.S
index af2267f6a5..9397b2e532 100644
--- a/arch/arm/lib/lib1funcs.S
+++ b/arch/arm/lib/lib1funcs.S
@@ -205,6 +205,10 @@ Boston, MA 02111-1307, USA.  */
 .endm
 
 
+#ifdef CONFIG_ARM_PATCH_IDIV
+	.align	3
+#endif
+
 ENTRY(__udivsi3)
 ENTRY(__aeabi_uidiv)
 UNWIND(.fnstart)
@@ -253,6 +257,10 @@ UNWIND(.fnstart)
 UNWIND(.fnend)
 ENDPROC(__umodsi3)
 
+#ifdef CONFIG_ARM_PATCH_IDIV
+	.align 3
+#endif
+
 ENTRY(__divsi3)
 ENTRY(__aeabi_idiv)
 UNWIND(.fnstart)

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH] ARM: Runtime patch udiv/sdiv instructions into __aeabi_{u}idiv()
  2015-12-11  6:26 [PATCH] ARM: Runtime patch udiv/sdiv instructions into __aeabi_{u}idiv() Nicolas Pitre
@ 2015-12-11  9:24 ` Arnd Bergmann
  2015-12-11 17:10   ` Nicolas Pitre
  0 siblings, 1 reply; 19+ messages in thread
From: Arnd Bergmann @ 2015-12-11  9:24 UTC (permalink / raw)
  To: linux-arm-kernel

On Friday 11 December 2015 01:26:25 Nicolas Pitre wrote:

> This is my counter proposal to Stephen's version. Those patches and the 
> discussion that followed are available here:
> 
>   http://news.gmane.org/group/gmane.linux.kbuild.devel/thread=14007
> 
> I think what I propose here is much simpler and less intrusive.  Going
> to the full call site patching provides between moderate to no 
> additional performance benefits depending on the CPU core.  That should 
> probably be compared with this solution with benchmark results to 
> determine if it is worth it.

Looks really nice, as it's much simpler than Stephen's approach

> Of course the ultimate performance will come from having gcc emit those 
> div instructions directly, but that would make the kernel binary 
> incompatible with some systems in a multi-arch config. Hence it has to 
> be a separate config option.

Well, what we found is that we already have the same problem today
when enabling LPAE makes the kernel incompatible with platforms that
can be enabled. The affected platforms are basically the same, except
for the earlier PJ4 and Krait variants that have some idiv support
but no LPAE.

> diff --git a/arch/arm/include/asm/cputype.h b/arch/arm/include/asm/cputype.h
> index 85e374f873..48c77d422a 100644
> --- a/arch/arm/include/asm/cputype.h
> +++ b/arch/arm/include/asm/cputype.h
> @@ -250,8 +250,14 @@ static inline int cpu_is_pj4(void)
>  
>  	return 0;
>  }
> +
> +static inline bool __attribute_const__ cpu_is_pj4_nomp(void)
> +{
> +	return read_cpuid_part() == 0x56005810;
> +}
>  #else
> -#define cpu_is_pj4()	0
> +#define cpu_is_pj4()		0
> +#define cpu_is_pj4_nomp()	0
>  #endif

It would be nice to use a symbolic constant for the above, and maybe define
all three known cpuid numbers for PJ4 variants.

I've looked at it some more and I now have doubts about this code:

#ifdef CONFIG_CPU_PJ4B
        .type   __v7_pj4b_proc_info, #object
__v7_pj4b_proc_info:
        .long   0x560f5800
        .long   0xff0fff00
        __v7_proc __v7_pj4b_proc_info, __v7_pj4b_setup, proc_fns = pj4b_processor_functions
        .size   __v7_pj4b_proc_info, . - __v7_pj4b_proc_info
#endif


Can someone have a look and tell me that I'm wrong when I read this
as matching both PJ4 and PJ4B (and PJ4B-MP)?

Either I'm misreading this, or we do the wrong thing in configurations
that include both PJ4B (berlin, mvebu) and PJ4 (MMP2/dove).

> +#ifdef CONFIG_ARM_PATCH_IDIV
> +
> +/* "sdiv r0, r0, r1" or "mrc p6, 1, r0, CR0, CR1, 4" if we're on pj4 w/o MP */
> +static inline u32 __attribute_const__ sdiv_instruction(void)
> +{
> +	if (IS_ENABLED(CONFIG_THUMB2_KERNEL)) {
> +		u32 insn = __opcode_thumb32_compose(0xfb90, 0xf0f1);
> +		if (cpu_is_pj4_nomp())
> +			insn = __opcode_thumb32_compose(0xee30, 0x0691);
> +		return __opcode_to_mem_thumb32(insn);
> +	}

Strictly speaking, we can ignore the thumb instruction for pj4, as all
variants also support the normal T2 opcode for udiv/sdiv.

> diff --git a/arch/arm/lib/lib1funcs.S b/arch/arm/lib/lib1funcs.S
> index af2267f6a5..9397b2e532 100644
> --- a/arch/arm/lib/lib1funcs.S
> +++ b/arch/arm/lib/lib1funcs.S
> @@ -205,6 +205,10 @@ Boston, MA 02111-1307, USA.  */
>  .endm
>  
>  
> +#ifdef CONFIG_ARM_PATCH_IDIV
> +	.align	3
> +#endif
> +
>  ENTRY(__udivsi3)
>  ENTRY(__aeabi_uidiv)
>  UNWIND(.fnstart)

I'd probably just leave out the #ifdef and always align this, but it's not
important.

	Arnd

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] ARM: Runtime patch udiv/sdiv instructions into __aeabi_{u}idiv()
  2015-12-11  9:24 ` Arnd Bergmann
@ 2015-12-11 17:10   ` Nicolas Pitre
  2015-12-11 17:22     ` Nicolas Pitre
  2015-12-11 21:50     ` Arnd Bergmann
  0 siblings, 2 replies; 19+ messages in thread
From: Nicolas Pitre @ 2015-12-11 17:10 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, 11 Dec 2015, Arnd Bergmann wrote:

> On Friday 11 December 2015 01:26:25 Nicolas Pitre wrote:
> 
> > This is my counter proposal to Stephen's version. Those patches and the 
> > discussion that followed are available here:
> > 
> >   http://news.gmane.org/group/gmane.linux.kbuild.devel/thread=14007
> > 
> > I think what I propose here is much simpler and less intrusive.  Going
> > to the full call site patching provides between moderate to no 
> > additional performance benefits depending on the CPU core.  That should 
> > probably be compared with this solution with benchmark results to 
> > determine if it is worth it.
> 
> Looks really nice, as it's much simpler than Stephen's approach
> 
> > Of course the ultimate performance will come from having gcc emit those 
> > div instructions directly, but that would make the kernel binary 
> > incompatible with some systems in a multi-arch config. Hence it has to 
> > be a separate config option.
> 
> Well, what we found is that we already have the same problem today
> when enabling LPAE makes the kernel incompatible with platforms that
> can be enabled. The affected platforms are basically the same, except
> for the earlier PJ4 and Krait variants that have some idiv support
> but no LPAE.

Still, making native div support depend on CONFIG_LPAE would be strange. 
A separate config symbol would mmake it self explanatory.

> > diff --git a/arch/arm/include/asm/cputype.h b/arch/arm/include/asm/cputype.h
> > index 85e374f873..48c77d422a 100644
> > --- a/arch/arm/include/asm/cputype.h
> > +++ b/arch/arm/include/asm/cputype.h
> > @@ -250,8 +250,14 @@ static inline int cpu_is_pj4(void)
> >  
> >  	return 0;
> >  }
> > +
> > +static inline bool __attribute_const__ cpu_is_pj4_nomp(void)
> > +{
> > +	return read_cpuid_part() == 0x56005810;
> > +}
> >  #else
> > -#define cpu_is_pj4()	0
> > +#define cpu_is_pj4()		0
> > +#define cpu_is_pj4_nomp()	0
> >  #endif
> 
> It would be nice to use a symbolic constant for the above, and maybe define
> all three known cpuid numbers for PJ4 variants.
> 
> I've looked at it some more and I now have doubts about this code:
> 
> #ifdef CONFIG_CPU_PJ4B
>         .type   __v7_pj4b_proc_info, #object
> __v7_pj4b_proc_info:
>         .long   0x560f5800
>         .long   0xff0fff00
>         __v7_proc __v7_pj4b_proc_info, __v7_pj4b_setup, proc_fns = pj4b_processor_functions
>         .size   __v7_pj4b_proc_info, . - __v7_pj4b_proc_info
> #endif
> 
> 
> Can someone have a look and tell me that I'm wrong when I read this
> as matching both PJ4 and PJ4B (and PJ4B-MP)?
> 
> Either I'm misreading this, or we do the wrong thing in configurations
> that include both PJ4B (berlin, mvebu) and PJ4 (MMP2/dove).

I don't have the relevant documentation to validate it.  And I'd prefer 
if this was sorted out in a separate patch.  Maybe I should just drop 
the PJ4 variants from this patch for now.

> > +#ifdef CONFIG_ARM_PATCH_IDIV
> > +
> > +/* "sdiv r0, r0, r1" or "mrc p6, 1, r0, CR0, CR1, 4" if we're on pj4 w/o MP */
> > +static inline u32 __attribute_const__ sdiv_instruction(void)
> > +{
> > +	if (IS_ENABLED(CONFIG_THUMB2_KERNEL)) {
> > +		u32 insn = __opcode_thumb32_compose(0xfb90, 0xf0f1);
> > +		if (cpu_is_pj4_nomp())
> > +			insn = __opcode_thumb32_compose(0xee30, 0x0691);
> > +		return __opcode_to_mem_thumb32(insn);
> > +	}
> 
> Strictly speaking, we can ignore the thumb instruction for pj4, as all
> variants also support the normal T2 opcode for udiv/sdiv.

OK it is gone.

Now... The code bails out if ARM mode and HWCAP_IDIVA is not set so 
effectively the ARM mode on PJ4 is currently not used either. So I'm 
leaning towards having aving another patch to sort PJ4 out.

> > diff --git a/arch/arm/lib/lib1funcs.S b/arch/arm/lib/lib1funcs.S
> > index af2267f6a5..9397b2e532 100644
> > --- a/arch/arm/lib/lib1funcs.S
> > +++ b/arch/arm/lib/lib1funcs.S
> > @@ -205,6 +205,10 @@ Boston, MA 02111-1307, USA.  */
> >  .endm
> >  
> >  
> > +#ifdef CONFIG_ARM_PATCH_IDIV
> > +	.align	3
> > +#endif
> > +
> >  ENTRY(__udivsi3)
> >  ENTRY(__aeabi_uidiv)
> >  UNWIND(.fnstart)
> 
> I'd probably just leave out the #ifdef and always align this, but it's not
> important.

This serves as "documentation". The alignment is important when 
CONFIG_ARM_PATCH_IDIV is selected and not otherwise.


Nicolas

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] ARM: Runtime patch udiv/sdiv instructions into __aeabi_{u}idiv()
  2015-12-11 17:10   ` Nicolas Pitre
@ 2015-12-11 17:22     ` Nicolas Pitre
  2015-12-11 22:26       ` Arnd Bergmann
  2015-12-11 21:50     ` Arnd Bergmann
  1 sibling, 1 reply; 19+ messages in thread
From: Nicolas Pitre @ 2015-12-11 17:22 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, 11 Dec 2015, Nicolas Pitre wrote:

> On Fri, 11 Dec 2015, Arnd Bergmann wrote:
> 
> > Strictly speaking, we can ignore the thumb instruction for pj4, as all
> > variants also support the normal T2 opcode for udiv/sdiv.
> 
> OK it is gone.
> 
> Now... The code bails out if ARM mode and HWCAP_IDIVA is not set so 
> effectively the ARM mode on PJ4 is currently not used either. So I'm 
> leaning towards having aving another patch to sort PJ4 out.

Here's v2 of this patch with PJ4 removed so it can be sorted out 
separately.

----- >8
Subject: [PATCH] ARM: Runtime patch udiv/sdiv instructions into __aeabi_{u}idiv()

The ARM compiler inserts calls to __aeabi_uidiv() and
__aeabi_idiv() when it needs to perform division on signed and
unsigned integers. If a processor has support for the udiv and
sdiv instructions, the kernel may overwrite the beginning of those
functions with those instructions and a "bx lr" to get better
performance.

To ensure those functions are aligned to a 32-bit word for easier
patching (which might not always be the case in Thumb mode) and the
two patched instructions for each case are contained in the same cache
line, a 8-byte alignment is enforced when ARM_PATCH_IDIV is configured.

This was heavily inspired by a previous patch by Stephen Boyd.

Signed-off-by: Nicolas Pitre <nico@linaro.org>

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 34e1569a11..efea5fa975 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1604,6 +1604,21 @@ config THUMB2_AVOID_R_ARM_THM_JUMP11
 config ARM_ASM_UNIFIED
 	bool
 
+config ARM_PATCH_IDIV
+	bool "Runtime patch udiv/sdiv instructions into __aeabi_{u}idiv()"
+	depends on CPU_32v7 && !XIP_KERNEL
+	default y
+	help
+	  Some v7 CPUs have support for the udiv and sdiv instructions
+	  that can be used to implement the __aeabi_uidiv and __aeabi_idiv
+	  functions provided by the ARM runtime ABI.
+
+	  Enabling this option allows the kernel to modify itself to replace
+	  the first two instructions of these library functions with the
+	  udiv or sdiv plus "bx lr" instructions. Typically this will be
+	  faster and less power intensive than running the original library
+	  support code to do integer division.
+
 config AEABI
 	bool "Use the ARM EABI to compile the kernel"
 	help
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 20edd349d3..332a0f6baf 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -375,6 +375,72 @@ void __init early_print(const char *str, ...)
 	printk("%s", buf);
 }
 
+#ifdef CONFIG_ARM_PATCH_IDIV
+
+static inline u32 __attribute_const__ sdiv_instruction(void)
+{
+	if (IS_ENABLED(CONFIG_THUMB2_KERNEL)) {
+		/* "sdiv r0, r0, r1" */
+		u32 insn = __opcode_thumb32_compose(0xfb90, 0xf0f1);
+		return __opcode_to_mem_thumb32(insn);
+	}
+
+	/* "sdiv r0, r0, r1" */
+	return __opcode_to_mem_arm(0xe710f110);
+}
+
+static inline u32 __attribute_const__ udiv_instruction(void)
+{
+	if (IS_ENABLED(CONFIG_THUMB2_KERNEL)) {
+		/* "udiv r0, r0, r1" */
+		u32 insn = __opcode_thumb32_compose(0xfbb0, 0xf0f1);
+		return __opcode_to_mem_thumb32(insn);
+	}
+
+	/* "udiv r0, r0, r1" */
+	return __opcode_to_mem_arm(0xe730f110);
+}
+
+static inline u32 __attribute_const__ bx_lr_instruction(void)
+{
+	if (IS_ENABLED(CONFIG_THUMB2_KERNEL)) {
+		/* "bx lr; nop" */
+		u32 insn = __opcode_thumb32_compose(0x4770, 0x46c0);
+		return __opcode_to_mem_thumb32(insn);
+	}
+	
+	/* "bx lr" */
+	return __opcode_to_mem_arm(0xe12fff1e);
+}
+
+static void __init patch_aeabi_uidiv(void)
+{
+	extern void __aeabi_uidiv(void);
+	extern void __aeabi_idiv(void);
+	uintptr_t fn_addr;
+	unsigned int mask;
+
+	mask = IS_ENABLED(CONFIG_THUMB2_KERNEL) ? HWCAP_IDIVT : HWCAP_IDIVA;
+	if (!(elf_hwcap & mask))
+		return;
+
+	pr_info("CPU: div instructions available: patching division code\n");
+
+	fn_addr = ((uintptr_t)&__aeabi_uidiv) & ~1;
+	((u32 *)fn_addr)[0] = udiv_instruction();
+	((u32 *)fn_addr)[1] = bx_lr_instruction();
+	flush_icache_range(fn_addr, fn_addr + 8);
+
+	fn_addr = ((uintptr_t)&__aeabi_idiv) & ~1;
+	((u32 *)fn_addr)[0] = sdiv_instruction();
+	((u32 *)fn_addr)[1] = bx_lr_instruction();
+	flush_icache_range(fn_addr, fn_addr + 8);
+}
+
+#else
+static inline void patch_aeabi_uidiv(void) { }
+#endif
+
 static void __init cpuid_init_hwcaps(void)
 {
 	int block;
@@ -642,6 +708,7 @@ static void __init setup_processor(void)
 	elf_hwcap = list->elf_hwcap;
 
 	cpuid_init_hwcaps();
+	patch_aeabi_uidiv();
 
 #ifndef CONFIG_ARM_THUMB
 	elf_hwcap &= ~(HWCAP_THUMB | HWCAP_IDIVT);
diff --git a/arch/arm/lib/lib1funcs.S b/arch/arm/lib/lib1funcs.S
index af2267f6a5..9397b2e532 100644
--- a/arch/arm/lib/lib1funcs.S
+++ b/arch/arm/lib/lib1funcs.S
@@ -205,6 +205,10 @@ Boston, MA 02111-1307, USA.  */
 .endm
 
 
+#ifdef CONFIG_ARM_PATCH_IDIV
+	.align	3
+#endif
+
 ENTRY(__udivsi3)
 ENTRY(__aeabi_uidiv)
 UNWIND(.fnstart)
@@ -253,6 +257,10 @@ UNWIND(.fnstart)
 UNWIND(.fnend)
 ENDPROC(__umodsi3)
 
+#ifdef CONFIG_ARM_PATCH_IDIV
+	.align 3
+#endif
+
 ENTRY(__divsi3)
 ENTRY(__aeabi_idiv)
 UNWIND(.fnstart)

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH] ARM: Runtime patch udiv/sdiv instructions into __aeabi_{u}idiv()
  2015-12-11 17:10   ` Nicolas Pitre
  2015-12-11 17:22     ` Nicolas Pitre
@ 2015-12-11 21:50     ` Arnd Bergmann
  2015-12-11 22:00       ` Nicolas Pitre
  2015-12-11 22:34       ` Russell King - ARM Linux
  1 sibling, 2 replies; 19+ messages in thread
From: Arnd Bergmann @ 2015-12-11 21:50 UTC (permalink / raw)
  To: linux-arm-kernel

On Friday 11 December 2015 12:10:29 Nicolas Pitre wrote:
> On Fri, 11 Dec 2015, Arnd Bergmann wrote:
> > On Friday 11 December 2015 01:26:25 Nicolas Pitre wrote:
> > > Of course the ultimate performance will come from having gcc emit those 
> > > div instructions directly, but that would make the kernel binary 
> > > incompatible with some systems in a multi-arch config. Hence it has to 
> > > be a separate config option.
> > 
> > Well, what we found is that we already have the same problem today
> > when enabling LPAE makes the kernel incompatible with platforms that
> > can be enabled. The affected platforms are basically the same, except
> > for the earlier PJ4 and Krait variants that have some idiv support
> > but no LPAE.
> 
> Still, making native div support depend on CONFIG_LPAE would be strange. 
> A separate config symbol would mmake it self explanatory.

Based on the discussion we had, I'd use CONFIG_CPU_V7VE or some variation
of that:

* Almost all early ARMv7 cpu cores (Cortex-A8/A9/A5, Scorpion, PJ4, PJ4B) are not
  V7VE, and they support neither LPAE nor IDIV

* All V7VE cores (Cortex-A7/A12/A15/A17, B15) by definition support both LPAE
  and IDIV. Krait400 and PJ4B-MP/PJ4C are not V7VE because they don't support
  virtualization, but for our purposes they are close enough that we want to
  build them with -march=armv7ve on toolchains that allow this.

* Most Krait (pre-Krait400) don't support LPAE but do support IDIV. However,
  with the latest change to mach-qcom/Kconfig we no longer care because we
  don't have separate Kconfig symbols per SoC any more and we can build a
  kernel for QCOM/MSM with either V7 or V7VE and with either LPAE or not, and
  we get what the user asked for.

* PJ4/PJ4B can be treated special when building a thumb2 kernel, we already
  have a Kconfig option for it that we could extend, but I'd just use your
  solution for this, it's good enough and it handles the special opcodes
  for ARM mode transparently (at least when you do a follow-up patch).

We haven't solved the question how the new Kconfig option fits in, but
I still think we should try to get this right, especially because the LPAE
handling is currently suboptimal in the way that you can enable LPAE on
any ARMv7 platform, whether that supports LPAE or not.

> > #ifdef CONFIG_CPU_PJ4B
> >         .type   __v7_pj4b_proc_info, #object
> > __v7_pj4b_proc_info:
> >         .long   0x560f5800
> >         .long   0xff0fff00
> >         __v7_proc __v7_pj4b_proc_info, __v7_pj4b_setup, proc_fns = pj4b_processor_functions
> >         .size   __v7_pj4b_proc_info, . - __v7_pj4b_proc_info
> > #endif
> > 
> > 
> > Can someone have a look and tell me that I'm wrong when I read this
> > as matching both PJ4 and PJ4B (and PJ4B-MP)?
> > 
> > Either I'm misreading this, or we do the wrong thing in configurations
> > that include both PJ4B (berlin, mvebu) and PJ4 (MMP2/dove).
> 
> I don't have the relevant documentation to validate it.  And I'd prefer 
> if this was sorted out in a separate patch.  Maybe I should just drop 
> the PJ4 variants from this patch for now.

To clarify: that point had nothing to do with your patch, I just think
I found an existing kernel bug that will cause pj4b_processor_functions
to be used on PJ4 (Dove, MMP2) in a kernel that includes both PJ4 and
PJ4B (Armada 370/XP, Berlin).

> > > +#ifdef CONFIG_ARM_PATCH_IDIV
> > > +
> > > +/* "sdiv r0, r0, r1" or "mrc p6, 1, r0, CR0, CR1, 4" if we're on pj4 w/o MP */
> > > +static inline u32 __attribute_const__ sdiv_instruction(void)
> > > +{
> > > +	if (IS_ENABLED(CONFIG_THUMB2_KERNEL)) {
> > > +		u32 insn = __opcode_thumb32_compose(0xfb90, 0xf0f1);
> > > +		if (cpu_is_pj4_nomp())
> > > +			insn = __opcode_thumb32_compose(0xee30, 0x0691);
> > > +		return __opcode_to_mem_thumb32(insn);
> > > +	}
> > 
> > Strictly speaking, we can ignore the thumb instruction for pj4, as all
> > variants also support the normal T2 opcode for udiv/sdiv.
> 
> OK it is gone.
> 
> Now... The code bails out if ARM mode and HWCAP_IDIVA is not set so 
> effectively the ARM mode on PJ4 is currently not used either. So I'm 
> leaning towards having aving another patch to sort PJ4 out.

Sounds good.

> > > diff --git a/arch/arm/lib/lib1funcs.S b/arch/arm/lib/lib1funcs.S
> > > index af2267f6a5..9397b2e532 100644
> > > --- a/arch/arm/lib/lib1funcs.S
> > > +++ b/arch/arm/lib/lib1funcs.S
> > > @@ -205,6 +205,10 @@ Boston, MA 02111-1307, USA.  */
> > >  .endm
> > >  
> > >  
> > > +#ifdef CONFIG_ARM_PATCH_IDIV
> > > +	.align	3
> > > +#endif
> > > +
> > >  ENTRY(__udivsi3)
> > >  ENTRY(__aeabi_uidiv)
> > >  UNWIND(.fnstart)
> > 
> > I'd probably just leave out the #ifdef and always align this, but it's not
> > important.
> 
> This serves as "documentation". The alignment is important when 
> CONFIG_ARM_PATCH_IDIV is selected and not otherwise.

Fair enough.

	Arnd

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] ARM: Runtime patch udiv/sdiv instructions into __aeabi_{u}idiv()
  2015-12-11 21:50     ` Arnd Bergmann
@ 2015-12-11 22:00       ` Nicolas Pitre
  2015-12-11 22:48         ` Arnd Bergmann
  2015-12-11 22:34       ` Russell King - ARM Linux
  1 sibling, 1 reply; 19+ messages in thread
From: Nicolas Pitre @ 2015-12-11 22:00 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, 11 Dec 2015, Arnd Bergmann wrote:

> On Friday 11 December 2015 12:10:29 Nicolas Pitre wrote:
> > On Fri, 11 Dec 2015, Arnd Bergmann wrote:
> > > On Friday 11 December 2015 01:26:25 Nicolas Pitre wrote:
> > > > Of course the ultimate performance will come from having gcc emit those 
> > > > div instructions directly, but that would make the kernel binary 
> > > > incompatible with some systems in a multi-arch config. Hence it has to 
> > > > be a separate config option.
> > > 
> > > Well, what we found is that we already have the same problem today
> > > when enabling LPAE makes the kernel incompatible with platforms that
> > > can be enabled. The affected platforms are basically the same, except
> > > for the earlier PJ4 and Krait variants that have some idiv support
> > > but no LPAE.
> > 
> > Still, making native div support depend on CONFIG_LPAE would be strange. 
> > A separate config symbol would mmake it self explanatory.
> 
> Based on the discussion we had, I'd use CONFIG_CPU_V7VE or some variation
> of that:
> 
> * Almost all early ARMv7 cpu cores (Cortex-A8/A9/A5, Scorpion, PJ4, PJ4B) are not
>   V7VE, and they support neither LPAE nor IDIV
> 
> * All V7VE cores (Cortex-A7/A12/A15/A17, B15) by definition support both LPAE
>   and IDIV. Krait400 and PJ4B-MP/PJ4C are not V7VE because they don't support
>   virtualization, but for our purposes they are close enough that we want to
>   build them with -march=armv7ve on toolchains that allow this.
> 
> * Most Krait (pre-Krait400) don't support LPAE but do support IDIV. However,
>   with the latest change to mach-qcom/Kconfig we no longer care because we
>   don't have separate Kconfig symbols per SoC any more and we can build a
>   kernel for QCOM/MSM with either V7 or V7VE and with either LPAE or not, and
>   we get what the user asked for.
> 
> * PJ4/PJ4B can be treated special when building a thumb2 kernel, we already
>   have a Kconfig option for it that we could extend, but I'd just use your
>   solution for this, it's good enough and it handles the special opcodes
>   for ARM mode transparently (at least when you do a follow-up patch).
> 
> We haven't solved the question how the new Kconfig option fits in, but
> I still think we should try to get this right, especially because the LPAE
> handling is currently suboptimal in the way that you can enable LPAE on
> any ARMv7 platform, whether that supports LPAE or not.

You seem to have a good grasp of the problem space. I'd suggest you make 
a patch!  ;-)

I already have a pending patch optimizing __do_div64 with udiv. I'm 
waiting for the config option to be in place.

> > > #ifdef CONFIG_CPU_PJ4B
> > >         .type   __v7_pj4b_proc_info, #object
> > > __v7_pj4b_proc_info:
> > >         .long   0x560f5800
> > >         .long   0xff0fff00
> > >         __v7_proc __v7_pj4b_proc_info, __v7_pj4b_setup, proc_fns = pj4b_processor_functions
> > >         .size   __v7_pj4b_proc_info, . - __v7_pj4b_proc_info
> > > #endif
> > > 
> > > 
> > > Can someone have a look and tell me that I'm wrong when I read this
> > > as matching both PJ4 and PJ4B (and PJ4B-MP)?
> > > 
> > > Either I'm misreading this, or we do the wrong thing in configurations
> > > that include both PJ4B (berlin, mvebu) and PJ4 (MMP2/dove).
> > 
> > I don't have the relevant documentation to validate it.  And I'd prefer 
> > if this was sorted out in a separate patch.  Maybe I should just drop 
> > the PJ4 variants from this patch for now.
> 
> To clarify: that point had nothing to do with your patch, I just think
> I found an existing kernel bug that will cause pj4b_processor_functions
> to be used on PJ4 (Dove, MMP2) in a kernel that includes both PJ4 and
> PJ4B (Armada 370/XP, Berlin).

OK. I'd suggest starting another thread about this to get the right 
people's attention.

> > This serves as "documentation". The alignment is important when 
> > CONFIG_ARM_PATCH_IDIV is selected and not otherwise.
> 
> Fair enough.

I tested my patch with both ARM and Thumb kernels, also dumping kernel 
memory to be sure the right instructions were in place.

If someone is willing to ACK v2 of my patch I'll put it in the patch 
system.


Nicolas

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] ARM: Runtime patch udiv/sdiv instructions into __aeabi_{u}idiv()
  2015-12-11 17:22     ` Nicolas Pitre
@ 2015-12-11 22:26       ` Arnd Bergmann
  2015-12-11 23:57         ` Nicolas Pitre
  0 siblings, 1 reply; 19+ messages in thread
From: Arnd Bergmann @ 2015-12-11 22:26 UTC (permalink / raw)
  To: linux-arm-kernel

On Friday 11 December 2015 12:22:20 Nicolas Pitre wrote:
> Subject: [PATCH] ARM: Runtime patch udiv/sdiv instructions into __aeabi_{u}idiv()
> 
> The ARM compiler inserts calls to __aeabi_uidiv() and
> __aeabi_idiv() when it needs to perform division on signed and
> unsigned integers. If a processor has support for the udiv and
> sdiv instructions, the kernel may overwrite the beginning of those
> functions with those instructions and a "bx lr" to get better
> performance.
> 
> To ensure those functions are aligned to a 32-bit word for easier
> patching (which might not always be the case in Thumb mode) and the
> two patched instructions for each case are contained in the same cache
> line, a 8-byte alignment is enforced when ARM_PATCH_IDIV is configured.
> 
> This was heavily inspired by a previous patch by Stephen Boyd.
> 
> Signed-off-by: Nicolas Pitre <nico@linaro.org>
> 

Acked-by: Arnd Bergmann <arnd@arndb.de>

Before you put it in the patch tracker, I think it would be good to
give Stephen a chance to comment as well, since he did a lot of
work upfront and this obsoletes his original patch series.

	Arnd

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] ARM: Runtime patch udiv/sdiv instructions into __aeabi_{u}idiv()
  2015-12-11 21:50     ` Arnd Bergmann
  2015-12-11 22:00       ` Nicolas Pitre
@ 2015-12-11 22:34       ` Russell King - ARM Linux
  2015-12-11 22:51         ` Arnd Bergmann
  1 sibling, 1 reply; 19+ messages in thread
From: Russell King - ARM Linux @ 2015-12-11 22:34 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Dec 11, 2015 at 10:50:02PM +0100, Arnd Bergmann wrote:
> On Friday 11 December 2015 12:10:29 Nicolas Pitre wrote:
> > On Fri, 11 Dec 2015, Arnd Bergmann wrote:
> > > #ifdef CONFIG_CPU_PJ4B
> > >         .type   __v7_pj4b_proc_info, #object
> > > __v7_pj4b_proc_info:
> > >         .long   0x560f5800
> > >         .long   0xff0fff00
> > >         __v7_proc __v7_pj4b_proc_info, __v7_pj4b_setup, proc_fns = pj4b_processor_functions
> > >         .size   __v7_pj4b_proc_info, . - __v7_pj4b_proc_info
> > > #endif
> > > 
> > > 
> > > Can someone have a look and tell me that I'm wrong when I read this
> > > as matching both PJ4 and PJ4B (and PJ4B-MP)?
> > > 
> > > Either I'm misreading this, or we do the wrong thing in configurations
> > > that include both PJ4B (berlin, mvebu) and PJ4 (MMP2/dove).
> > 
> > I don't have the relevant documentation to validate it.  And I'd prefer 
> > if this was sorted out in a separate patch.  Maybe I should just drop 
> > the PJ4 variants from this patch for now.
> 
> To clarify: that point had nothing to do with your patch, I just think
> I found an existing kernel bug that will cause pj4b_processor_functions
> to be used on PJ4 (Dove, MMP2) in a kernel that includes both PJ4 and
> PJ4B (Armada 370/XP, Berlin).

It does look like it will end up matching a PJ4 CPU against the PJ4B entry
if it's enabled.  I think the question that needs to be asked is why the
mask is soo loose, and the past history gives us some information.  The
loosening of the mask was done by Gregory Clement a couple of years ago:

    ARM: 7754/1: Fix the CPU ID and the mask associated to the PJ4B

    This commit fixes the ID and mask for the PJ4B which was too
    restrictive and didn't match the CPU of the Armada 370 SoC.

 __v7_pj4b_proc_info:
-       .long   0x562f5840
-       .long   0xfffffff0
+       .long   0x560f5800
+       .long   0xff0fff00

So it was to include Armada 370.  So this now brings up the question...
what is the MIDR value used in Armada 370?

-- 
RMK's Patch system: http://www.arm.linux.org.uk/developer/patches/
FTTC broadband for 0.8mile line: currently at 9.6Mbps down 400kbps up
according to speedtest.net.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] ARM: Runtime patch udiv/sdiv instructions into __aeabi_{u}idiv()
  2015-12-11 22:00       ` Nicolas Pitre
@ 2015-12-11 22:48         ` Arnd Bergmann
  0 siblings, 0 replies; 19+ messages in thread
From: Arnd Bergmann @ 2015-12-11 22:48 UTC (permalink / raw)
  To: linux-arm-kernel

On Friday 11 December 2015 17:00:50 Nicolas Pitre wrote:
> You seem to have a good grasp of the problem space. I'd suggest you make 
> a patch!  ;-)

Yes, I can do that once all the ARMv6/v7 multiplatform work is done, that
should at least reduce the number of special cases.

> > > > #ifdef CONFIG_CPU_PJ4B
> > > >         .type   __v7_pj4b_proc_info, #object
> > > > __v7_pj4b_proc_info:
> > > >         .long   0x560f5800
> > > >         .long   0xff0fff00
> > > >         __v7_proc __v7_pj4b_proc_info, __v7_pj4b_setup, proc_fns = pj4b_processor_functions
> > > >         .size   __v7_pj4b_proc_info, . - __v7_pj4b_proc_info
> > > > #endif
> > > > 
> > > > 
> > > > Can someone have a look and tell me that I'm wrong when I read this
> > > > as matching both PJ4 and PJ4B (and PJ4B-MP)?
> > > > 
> > > > Either I'm misreading this, or we do the wrong thing in configurations
> > > > that include both PJ4B (berlin, mvebu) and PJ4 (MMP2/dove).
> > > 
> > > I don't have the relevant documentation to validate it.  And I'd prefer 
> > > if this was sorted out in a separate patch.  Maybe I should just drop 
> > > the PJ4 variants from this patch for now.
> > 
> > To clarify: that point had nothing to do with your patch, I just think
> > I found an existing kernel bug that will cause pj4b_processor_functions
> > to be used on PJ4 (Dove, MMP2) in a kernel that includes both PJ4 and
> > PJ4B (Armada 370/XP, Berlin).
> 
> OK. I'd suggest starting another thread about this to get the right 
> people's attention.

I've done a preliminary patch now, will send that out.

	Arnd

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] ARM: Runtime patch udiv/sdiv instructions into __aeabi_{u}idiv()
  2015-12-11 22:34       ` Russell King - ARM Linux
@ 2015-12-11 22:51         ` Arnd Bergmann
  2015-12-12  0:01           ` Nicolas Pitre
  0 siblings, 1 reply; 19+ messages in thread
From: Arnd Bergmann @ 2015-12-11 22:51 UTC (permalink / raw)
  To: linux-arm-kernel

On Friday 11 December 2015 22:34:16 Russell King - ARM Linux wrote:
> 
>  __v7_pj4b_proc_info:
> -       .long   0x562f5840
> -       .long   0xfffffff0
> +       .long   0x560f5800
> +       .long   0xff0fff00
> 
> So it was to include Armada 370.  So this now brings up the question...
> what is the MIDR value used in Armada 370?

I've listed them in an earlier thread, here is the list again:

                variant part    revision        name            features
mmp2:           0       0x581   5               PJ4             idivt
dove:           0       0x581   5               PJ4             idivt
Armada 370      1       0x581   1               PJ4B            idivt
mmp3:           2       0x584   2               PJ4-MP          idiva idivt lpae
Armada XP       2       0x584   2               PJ4-MP          idiva idivt lpae
Berlin          2       0x584   2               PJ4-MP          idiva idivt lpae

So the original table was wrong because it failed to include PJ4B (Armada 370),
but the current version is wrong, because it also includes PJ4 (Dove and MMP2).

	Arnd

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] ARM: Runtime patch udiv/sdiv instructions into __aeabi_{u}idiv()
  2015-12-11 22:26       ` Arnd Bergmann
@ 2015-12-11 23:57         ` Nicolas Pitre
  2016-01-05  1:23           ` Stephen Boyd
  0 siblings, 1 reply; 19+ messages in thread
From: Nicolas Pitre @ 2015-12-11 23:57 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, 11 Dec 2015, Arnd Bergmann wrote:

> On Friday 11 December 2015 12:22:20 Nicolas Pitre wrote:
> > Subject: [PATCH] ARM: Runtime patch udiv/sdiv instructions into __aeabi_{u}idiv()
> > 
> > The ARM compiler inserts calls to __aeabi_uidiv() and
> > __aeabi_idiv() when it needs to perform division on signed and
> > unsigned integers. If a processor has support for the udiv and
> > sdiv instructions, the kernel may overwrite the beginning of those
> > functions with those instructions and a "bx lr" to get better
> > performance.
> > 
> > To ensure those functions are aligned to a 32-bit word for easier
> > patching (which might not always be the case in Thumb mode) and the
> > two patched instructions for each case are contained in the same cache
> > line, a 8-byte alignment is enforced when ARM_PATCH_IDIV is configured.
> > 
> > This was heavily inspired by a previous patch by Stephen Boyd.
> > 
> > Signed-off-by: Nicolas Pitre <nico@linaro.org>
> > 
> 
> Acked-by: Arnd Bergmann <arnd@arndb.de>

Thanks.

> Before you put it in the patch tracker, I think it would be good to
> give Stephen a chance to comment as well, since he did a lot of
> work upfront and this obsoletes his original patch series.

Given he'll get back from vacation only after the new year, I'll put the 
patch in the tracker now so it can go in before the next merge window.  

Stephen's series could still be relevant by extending what is done here, 
and it requires what this patch is doing anyway for those call sites 
that can't be substituted by a div instruction (like conditional 
branches, tail call optimizations, etc.)


Nicolas

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] ARM: Runtime patch udiv/sdiv instructions into __aeabi_{u}idiv()
  2015-12-11 22:51         ` Arnd Bergmann
@ 2015-12-12  0:01           ` Nicolas Pitre
  2015-12-12  0:04             ` Arnd Bergmann
  0 siblings, 1 reply; 19+ messages in thread
From: Nicolas Pitre @ 2015-12-12  0:01 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, 11 Dec 2015, Arnd Bergmann wrote:

> On Friday 11 December 2015 22:34:16 Russell King - ARM Linux wrote:
> > 
> >  __v7_pj4b_proc_info:
> > -       .long   0x562f5840
> > -       .long   0xfffffff0
> > +       .long   0x560f5800
> > +       .long   0xff0fff00
> > 
> > So it was to include Armada 370.  So this now brings up the question...
> > what is the MIDR value used in Armada 370?
> 
> I've listed them in an earlier thread, here is the list again:
> 
>                 variant part    revision        name            features
> mmp2:           0       0x581   5               PJ4             idivt
> dove:           0       0x581   5               PJ4             idivt
> Armada 370      1       0x581   1               PJ4B            idivt
> mmp3:           2       0x584   2               PJ4-MP          idiva idivt lpae
> Armada XP       2       0x584   2               PJ4-MP          idiva idivt lpae
> Berlin          2       0x584   2               PJ4-MP          idiva idivt lpae
> 
> So the original table was wrong because it failed to include PJ4B (Armada 370),
> but the current version is wrong, because it also includes PJ4 (Dove and MMP2).

I'd suggest you add the above table and conclusion to the commit log for 
your proposed fix.  Next time the question comes up the info will be 
right there.


Nicolas

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] ARM: Runtime patch udiv/sdiv instructions into __aeabi_{u}idiv()
  2015-12-12  0:01           ` Nicolas Pitre
@ 2015-12-12  0:04             ` Arnd Bergmann
  2015-12-12  1:17               ` Nicolas Pitre
  0 siblings, 1 reply; 19+ messages in thread
From: Arnd Bergmann @ 2015-12-12  0:04 UTC (permalink / raw)
  To: linux-arm-kernel

On Friday 11 December 2015 19:01:00 Nicolas Pitre wrote:
> On Fri, 11 Dec 2015, Arnd Bergmann wrote:
> 
> > On Friday 11 December 2015 22:34:16 Russell King - ARM Linux wrote:
> > > 
> > >  __v7_pj4b_proc_info:
> > > -       .long   0x562f5840
> > > -       .long   0xfffffff0
> > > +       .long   0x560f5800
> > > +       .long   0xff0fff00
> > > 
> > > So it was to include Armada 370.  So this now brings up the question...
> > > what is the MIDR value used in Armada 370?
> > 
> > I've listed them in an earlier thread, here is the list again:
> > 
> >                 variant part    revision        name            features
> > mmp2:           0       0x581   5               PJ4             idivt
> > dove:           0       0x581   5               PJ4             idivt
> > Armada 370      1       0x581   1               PJ4B            idivt
> > mmp3:           2       0x584   2               PJ4-MP          idiva idivt lpae
> > Armada XP       2       0x584   2               PJ4-MP          idiva idivt lpae
> > Berlin          2       0x584   2               PJ4-MP          idiva idivt lpae
> > 
> > So the original table was wrong because it failed to include PJ4B (Armada 370),
> > but the current version is wrong, because it also includes PJ4 (Dove and MMP2).
> 
> I'd suggest you add the above table and conclusion to the commit log for 
> your proposed fix.  Next time the question comes up the info will be 
> right there.

I actually procrastinated the better part of my work day today documenting
the Marvell core types in the existing kernel documentation directory
and ended up with the patch below. ;-)

If you still remember some of the details of the ancient cores that were
not entirely clear to me, please have a look.

	Arnd

commit 6f8cedd5ba8ca52b3bc258244ce4a8927a7f12a8
Author: Arnd Bergmann <arnd@arndb.de>
Date:   Fri Dec 11 15:49:13 2015 +0100

    ARM: documentation: update Marvell product listing
    
    I'm still getting confused regarding which core specifically
    is used in which SoC, so I've added some more detail to the
    Marvell README file. I got most of this from random sources
    on the internet, so it's possible that some of the information
    is wrong, but most of it should be pretty obvious.
    
    There are a few remaining points I could not find out:
    
    * The CPU core in Orion 88F6183
    * The difference (if any) between PJ4B-MP and PJ4C
    * The naming of Feroceon/Jolteon/Flareon/Sheeva/Mohawk/PJ1/PJ4
      is still confusing, as they tend to overlap.
    
    Signed-off-by: Arnd Bergmann <arnd@arndb.de>

diff --git a/Documentation/arm/Marvell/README b/Documentation/arm/Marvell/README
index ae89b67d8e23..1e76a729162b 100644
--- a/Documentation/arm/Marvell/README
+++ b/Documentation/arm/Marvell/README
@@ -22,7 +22,7 @@ Orion family
         88F5281
                Datasheet               : http://www.ocmodshop.com/images/reviews/networking/qnap_ts409u/marvel_88f5281_data_sheet.pdf
         88F6183
-  Core: Feroceon ARMv5 compatible
+  Core: Feroceon 88fr331 (88f51xx) or 88fr531-vd (88f52xx) ARMv5 compatible
   Linux kernel mach directory: arch/arm/mach-orion5x
   Linux kernel plat directory: arch/arm/plat-orion
 
@@ -52,7 +52,7 @@ Kirkwood family
                 Hardware Spec  : http://www.marvell.com/embedded-processors/kirkwood/assets/HW_88F6281_OpenSource.pdf
                 Functional Spec: http://www.marvell.com/embedded-processors/kirkwood/assets/FS_88F6180_9x_6281_OpenSource.pdf
   Homepage: http://www.marvell.com/embedded-processors/kirkwood/
-  Core: Feroceon ARMv5 compatible
+  Core: Feroceon 88fr131 ARMv5 compatible
   Linux kernel mach directory: arch/arm/mach-mvebu
   Linux kernel plat directory: none
 
@@ -71,7 +71,7 @@ Discovery family
         MV76100
                 Not supported by the Linux kernel.
 
-  Core: Feroceon ARMv5 compatible
+  Core: Feroceon 88fr571-vd ARMv5 compatible
 
   Linux kernel mach directory: arch/arm/mach-mv78xx0
   Linux kernel plat directory: arch/arm/plat-orion
@@ -86,20 +86,30 @@ EBU Armada family
     Product Brief:   http://www.marvell.com/embedded-processors/armada-300/assets/Marvell_ARMADA_370_SoC.pdf
     Hardware Spec:   http://www.marvell.com/embedded-processors/armada-300/assets/ARMADA370-datasheet.pdf
     Functional Spec: http://www.marvell.com/embedded-processors/armada-300/assets/ARMADA370-FunctionalSpec-datasheet.pdf
+    Core: Sheeva ARMv7 compatible PJ4B
 
   Armada 375 Flavors:
 	88F6720
     Product Brief: http://www.marvell.com/embedded-processors/armada-300/assets/ARMADA_375_SoC-01_product_brief.pdf
-
-  Armada 380/385 Flavors:
-	88F6810
-	88F6820
-	88F6828
-
-  Armada 390/398 Flavors:
-	88F6920
-	88F6928
+    Core: ARM Cortex-A9
+
+  Armada 38x Flavors:
+	88F6810	Armada 380
+	88F6820 Armada 385
+	88F6828 Armada 388
+    Produce infos: http://www.marvell.com/embedded-processors/armada-38x/
+    Core: ARM Cortex-A9
+
+  Armada 39x Flavors:
+	88F6920 Armada 390
+	88F6928 Armada 398
     Product infos: http://www.marvell.com/embedded-processors/armada-39x/
+    Core: ARM Cortex-A9
+
+  Armada SP:
+	88RC1580
+    Product infos: http://www.marvell.com/embedded-processors/armada-sp/
+    Core: Sheeva ARMv7 comatible Quad-core PJ4C
 
   Armada XP Flavors:
         MV78230
@@ -113,7 +123,7 @@ EBU Armada family
       http://www.marvell.com/embedded-processors/armada-xp/assets/HW_MV78260_OS.PDF
       http://www.marvell.com/embedded-processors/armada-xp/assets/HW_MV78460_OS.PDF
 
-  Core: Sheeva ARMv7 compatible
+  Core: Sheeva ARMv7 compatible Dual-core or Quad-core PJ4B-MP
 
   Linux kernel mach directory: arch/arm/mach-mvebu
   Linux kernel plat directory: none
@@ -155,7 +165,7 @@ PXA 2xx/3xx/93x/95x family
   Flavors:
         PXA21x, PXA25x, PXA26x
              Application processor only
-             Core: ARMv5 XScale core
+             Core: ARMv5 XScale1 core
         PXA270, PXA271, PXA272
              Product Brief         : http://www.marvell.com/application-processors/pxa-family/assets/pxa_27x_pb.pdf
              Design guide          : http://www.marvell.com/application-processors/pxa-family/assets/pxa_27x_design_guide.pdf
@@ -163,7 +173,7 @@ PXA 2xx/3xx/93x/95x family
              Specification         : http://www.marvell.com/application-processors/pxa-family/assets/pxa_27x_emts.pdf
              Specification update  : http://www.marvell.com/application-processors/pxa-family/assets/pxa_27x_spec_update.pdf
              Application processor only
-             Core: ARMv5 XScale core
+             Core: ARMv5 XScale2 core
         PXA300, PXA310, PXA320
              PXA 300 Product Brief : http://www.marvell.com/application-processors/pxa-family/assets/PXA300_PB_R4.pdf
              PXA 310 Product Brief : http://www.marvell.com/application-processors/pxa-family/assets/PXA310_PB_R4.pdf
@@ -174,10 +184,10 @@ PXA 2xx/3xx/93x/95x family
              Specification Update  : http://www.marvell.com/application-processors/pxa-family/assets/PXA3xx_Spec_Update.zip
              Reference Manual      : http://www.marvell.com/application-processors/pxa-family/assets/PXA3xx_TavorP_BootROM_Ref_Manual.pdf
              Application processor only
-             Core: ARMv5 XScale core
+             Core: ARMv5 XScale3 core
         PXA930, PXA935
              Application processor with Communication processor
-             Core: ARMv5 XScale core
+             Core: ARMv5 XScale3 core
         PXA955
              Application processor with Communication processor
              Core: ARMv7 compatible Sheeva PJ4 core
@@ -196,7 +206,7 @@ PXA 2xx/3xx/93x/95x family
    Linux kernel mach directory: arch/arm/mach-pxa
    Linux kernel plat directory: arch/arm/plat-pxa
 
-MMP/MMP2 family (communication processor)
+MMP/MMP2/MMP3 family (communication processor)
 -----------------------------------------
 
    Flavors:
@@ -209,16 +219,32 @@ MMP/MMP2 family (communication processor)
              Boot ROM manual      : http://www.marvell.com/application-processors/armada-100/assets/armada_16x_ref_manual.pdf
              App node package     : http://www.marvell.com/application-processors/armada-100/assets/armada_16x_app_note_package.pdf
              Application processor only
-             Core: ARMv5 compatible Marvell PJ1 (Mohawk)
-        PXA910
+             Core: ARMv5 compatible Marvell PJ1 88sv331 (Mohawk)
+        PXA910/PXA920
              Homepage             : http://www.marvell.com/communication-processors/pxa910/
              Product Brief        : http://www.marvell.com/communication-processors/pxa910/assets/Marvell_PXA910_Platform-001_PB_final.pdf
              Application processor with Communication processor
-             Core: ARMv5 compatible Marvell PJ1 (Mohawk)
-        MMP2, a.k.a Armada 610
+             Core: ARMv5 compatible Marvell PJ1 88sv331 (Mohawk)
+        PXA688, a.k.a. MMP2, a.k.a Armada 610
              Product Brief        : http://www.marvell.com/application-processors/armada-600/assets/armada610_pb.pdf
              Application processor only
-             Core: ARMv7 compatible Sheeva PJ4 core
+             Core: ARMv7 compatible Sheeva PJ4 88sv581x core
+	PXA2128, a.k.a. MMP3 (OLPC XO4, Linux support not upstream)
+	     Product Brief	  : http://www.marvell.com/application-processors/armada/pxa2128/assets/Marvell-ARMADA-PXA2128-SoC-PB.pdf
+	     Application processor only
+	     Core: Dual-core ARMv7 compatible Sheeva PJ4C core
+	PXA960/PXA968/PXA978 (Linux support not upstream)
+	     Application processor with Communication Processor
+	     Core: ARMv7 compatible Sheeva PJ4 core
+	PXA986/PXA988 (Linux support not upstream)
+	     Application processor with Communication Processor
+	     Core: Dual-core ARMv7 compatible Sheeva PJ4B-MP core
+	PXA1088/PXA1920 (Linux support not upstream)
+	     Application processor with Communication Processor
+	     Core: quad-core ARMv7 Cortex-A7
+	PXA1908/PXA1928/PXA1936
+	     Application processor with Communication Processor
+	     Core: multi-core ARMv8 Cortex-A53
 
    Comments:
 
@@ -237,6 +263,10 @@ Berlin family (Multimedia Solutions)
 -------------------------------------
 
   Flavors:
+	88DE3010, Armada 1000 (no Linux support)
+		Core:		Marvell PJ1 (ARMv5TE), Dual-core
+		Product Brief:	http://www.marvell.com.cn/digital-entertainment/assets/armada_1000_pb.pdf
+	88DE3005, Armada 1500-mini
 	88DE3005, Armada 1500 Mini
 		Design name:	BG2CD
 		Core:		ARM Cortex-A9, PL310 L2CC
@@ -247,14 +277,16 @@ Berlin family (Multimedia Solutions)
                 Homepage:       http://www.marvell.com/multimedia-solutions/armada-1500-mini-plus/
 	88DE3100, Armada 1500
 		Design name:	BG2
-		Core:		Marvell PJ4B (ARMv7), Tauros3 L2CC
-		Product Brief:	http://www.marvell.com/multimedia-solutions/armada-1500/assets/Marvell-ARMADA-1500-Product-Brief.pdf
+		Core:		Marvell PJ4B-MP (ARMv7), Tauros3 L2CC
+		Product Brief:	http://www.marvell.com/digital-entertainment/armada-1500/assets/Marvell-ARMADA-1500-Product-Brief.pdf
 	88DE3114, Armada 1500 Pro
 		Design name:	BG2Q
 		Core:		Quad Core ARM Cortex-A9, PL310 L2CC
-	88DE????
+	88DE3214, Armada 1500 Pro 4K
 		Design name:	BG3
 		Core:		ARM Cortex-A15, CA15 integrated L2CC
+	88DE3218, ARMADA 1500 Ultra
+		Core:		ARM Cortex-A53
 
   Homepage: http://www.marvell.com/multimedia-solutions/
   Directory: arch/arm/mach-berlin
@@ -263,6 +295,49 @@ Berlin family (Multimedia Solutions)
    * This line of SoCs is based on Marvell Sheeva or ARM Cortex CPUs
      with Synopsys DesignWare (IRQ, GPIO, Timers, ...) and PXA IP (SDHCI, USB, ETH, ...).
 
+CPU Cores
+---------
+
+The XScale cores were designed by Intel, and shipped by Marvell in the older
+PXA processors. Feroceon is a Marvell designed core that developed in-house,
+and that evolved into Sheeva. The XScale and Feroceon cores were phased out
+over time and replaced with Sheeva cores in later products, which subsequently
+got replaced with licensed ARM Cortex-A cores.
+
+  XScale 1
+	CPUID 0x69052xxx
+	ARMv5, iWMMXt
+  XScale 2
+	CPUID 0x69054xxx
+	ARMv5, iWMMXt
+  XScale 3
+	CPUID 0x69056xxx or 0x69056xxx
+	ARMv5, iWMMXt
+  Feroceon 88fr331
+	CPUID 0x5615331x or 0x41xx926x
+	ARMv5TE, single issue
+  Feroceon 88fr531-vd "Jolteon"
+	CPUID 0x5605531x or 0x41xx926x
+	ARMv5TE, VFP, dual-issue
+  Feroceon 88fr571-vd "Jolteon"
+	CPUID 0x5615571x
+	ARMv5TE, VFP, dual-issue
+  Feroceon 88fr131
+	CPUID 0x5625131x
+	ARMv5TE, single-issue
+  Sheeva PJ1 88sv331 "Mohawk"
+	CPUID 0x561584xx
+	ARMv5, single-issue iWMMXt v2
+  Sheeva PJ4 88sv581x "Flareon"
+	CPUID 0x560f581x
+	ARMv7, idivt, optional iWMMXt v2
+  Sheeva PJ4B 88sv581x
+	CPUID 0x561f581x
+	ARMv7, idivt, optional iWMMXt v2
+  Sheeva PJ4B-MP / PJ4C
+	CPUID 0x562f584x
+	ARMv7, idivt/idiva, LPAE, optional iWMMXt v2 and/or NEON
+
 Long-term plans
 ---------------
 

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH] ARM: Runtime patch udiv/sdiv instructions into __aeabi_{u}idiv()
  2015-12-12  0:04             ` Arnd Bergmann
@ 2015-12-12  1:17               ` Nicolas Pitre
  2015-12-12 20:41                 ` Arnd Bergmann
  0 siblings, 1 reply; 19+ messages in thread
From: Nicolas Pitre @ 2015-12-12  1:17 UTC (permalink / raw)
  To: linux-arm-kernel

On Sat, 12 Dec 2015, Arnd Bergmann wrote:

> On Friday 11 December 2015 19:01:00 Nicolas Pitre wrote:
> > On Fri, 11 Dec 2015, Arnd Bergmann wrote:
> > 
> > > On Friday 11 December 2015 22:34:16 Russell King - ARM Linux wrote:
> > > > 
> > > >  __v7_pj4b_proc_info:
> > > > -       .long   0x562f5840
> > > > -       .long   0xfffffff0
> > > > +       .long   0x560f5800
> > > > +       .long   0xff0fff00
> > > > 
> > > > So it was to include Armada 370.  So this now brings up the question...
> > > > what is the MIDR value used in Armada 370?
> > > 
> > > I've listed them in an earlier thread, here is the list again:
> > > 
> > >                 variant part    revision        name            features
> > > mmp2:           0       0x581   5               PJ4             idivt
> > > dove:           0       0x581   5               PJ4             idivt
> > > Armada 370      1       0x581   1               PJ4B            idivt
> > > mmp3:           2       0x584   2               PJ4-MP          idiva idivt lpae
> > > Armada XP       2       0x584   2               PJ4-MP          idiva idivt lpae
> > > Berlin          2       0x584   2               PJ4-MP          idiva idivt lpae
> > > 
> > > So the original table was wrong because it failed to include PJ4B (Armada 370),
> > > but the current version is wrong, because it also includes PJ4 (Dove and MMP2).
> > 
> > I'd suggest you add the above table and conclusion to the commit log for 
> > your proposed fix.  Next time the question comes up the info will be 
> > right there.
> 
> I actually procrastinated the better part of my work day today documenting
> the Marvell core types in the existing kernel documentation directory
> and ended up with the patch below. ;-)
> 
> If you still remember some of the details of the ancient cores that were
> not entirely clear to me, please have a look.

Well... FWIW, digging in my ancient mail folders, I found the following 
table sent to me in October of 2007. That doesn't add much to what you 
already have gathered. I have vague memories of a more exhaustive list 
but I can't find it anymore.

----- >8

family			out of order			in order
==============================================================================
no mmu		|				|			     |
no cache	|	Falcon			|	Falcon D	     |
"966"		|				|	a.k.a. Dragonite     |
==============================================================================
no mmu		|				|			     |
cache		|	Osprey			|	Osprey D	     |
"946"		|				|			     |
==============================================================================
		|	single issue: Mohawk	|			     |
mmu		|	a.k.a. Feroceon 1850	|			     |
cache		|	a.k.a. Feroceon FR-331	|	single issue:	     |
"926"		|				|	Mohawk D	     |
		|	dual issue: Jolteon	|			     |
		|	a.k.a. Feroceon 2850	|			     |
==============================================================================

Variants:
- Falcon DMC - Multi-Core version of the Falcon D
- Falcon DMT - Multi-Thread version of the Falcon D
- Osprey DMT - Multi-Thread version of Osprey D

There is also the Flareon, which is what will be used in Dove:
- Based on the Jolteon
- v6/v7 support
- packaged with L2, VFP, AXI

All of these cores can be mixed-and-matched with L2, VFP, AXI, wMMXt,
etc.

----- >8


Nicolas

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] ARM: Runtime patch udiv/sdiv instructions into __aeabi_{u}idiv()
  2015-12-12  1:17               ` Nicolas Pitre
@ 2015-12-12 20:41                 ` Arnd Bergmann
  0 siblings, 0 replies; 19+ messages in thread
From: Arnd Bergmann @ 2015-12-12 20:41 UTC (permalink / raw)
  To: linux-arm-kernel

On Friday 11 December 2015 20:17:44 Nicolas Pitre wrote:
> ----- >8
> 
> family                  out of order                    in order
> ==============================================================================
> no mmu          |                               |                            |
> no cache        |       Falcon                  |       Falcon D             |
> "966"           |                               |       a.k.a. Dragonite     |
> ==============================================================================
> no mmu          |                               |                            |
> cache           |       Osprey                  |       Osprey D             |
> "946"           |                               |                            |
> ==============================================================================
>                 |       single issue: Mohawk    |                            |
> mmu             |       a.k.a. Feroceon 1850    |                            |
> cache           |       a.k.a. Feroceon FR-331  |       single issue:        |
> "926"           |                               |       Mohawk D             |
>                 |       dual issue: Jolteon     |                            |
>                 |       a.k.a. Feroceon 2850    |                            |
> ==============================================================================
> 
> Variants:
> - Falcon DMC - Multi-Core version of the Falcon D
> - Falcon DMT - Multi-Thread version of the Falcon D
> - Osprey DMT - Multi-Thread version of Osprey D
> 
> There is also the Flareon, which is what will be used in Dove:
> - Based on the Jolteon
> - v6/v7 support
> - packaged with L2, VFP, AXI
> 
> All of these cores can be mixed-and-matched with L2, VFP, AXI, wMMXt,
> etc.
> 
> ----- >8

Ok, so it seems that Mohawk is not just the name for 88sv331 in IXP3xx but
also for 88fr331 in Orion, I should add that. The table also confirms that
Feroceon is used as a family name for multiple cores rather than just
one of the models. It doesn't explain how the Feroceon and Sheeva names
relate though, I'll just leave that part of my table as it is, it's
still my best guess.

My table is now:

  Feroceon-1850 88fr331 "Mohawk"
        CPUID 0x5615331x or 0x41xx926x
        ARMv5TE, single issue
  Feroceon-2850 88fr531-vd "Jolteon"
        CPUID 0x5605531x or 0x41xx926x
        ARMv5TE, VFP, dual-issue
  Feroceon 88fr571-vd "Jolteon"
        CPUID 0x5615571x
        ARMv5TE, VFP, dual-issue
  Feroceon 88fr131 "Mohawk-D"
        CPUID 0x5625131x
        ARMv5TE, single-issue, in-order
  Sheeva PJ1 88sv331 "Mohawk"
        CPUID 0x561584xx
        ARMv5, single-issue, iWMMXt v2

Connecting the 88fr131 product code with the "Mohawk-D" name is a wild
guess, based on the fact that this is a later version of 88fr331 and
runs at a higher clock frequency but has a lower number.

88fr131 is the core used in Kirkwood. Is there any way to find out if
Kirkwood is indeed an in-order core unlike Orion5x?

	Arnd

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] ARM: Runtime patch udiv/sdiv instructions into __aeabi_{u}idiv()
  2015-12-11 23:57         ` Nicolas Pitre
@ 2016-01-05  1:23           ` Stephen Boyd
  2016-01-05  1:42             ` Nicolas Pitre
  0 siblings, 1 reply; 19+ messages in thread
From: Stephen Boyd @ 2016-01-05  1:23 UTC (permalink / raw)
  To: linux-arm-kernel

On 12/11, Nicolas Pitre wrote:
> On Fri, 11 Dec 2015, Arnd Bergmann wrote:
> 
> > On Friday 11 December 2015 12:22:20 Nicolas Pitre wrote:
> > > Subject: [PATCH] ARM: Runtime patch udiv/sdiv instructions into __aeabi_{u}idiv()
> > > 
> 
> Thanks.
> 
> > Before you put it in the patch tracker, I think it would be good to
> > give Stephen a chance to comment as well, since he did a lot of
> > work upfront and this obsoletes his original patch series.
> 
> Given he'll get back from vacation only after the new year, I'll put the 
> patch in the tracker now so it can go in before the next merge window.  
> 
> Stephen's series could still be relevant by extending what is done here, 
> and it requires what this patch is doing anyway for those call sites 
> that can't be substituted by a div instruction (like conditional 
> branches, tail call optimizations, etc.)
> 

I'm back from vacation now. Where have we left off on this topic?

I can update the patches to be based on this patch here and
handle the conditional branches and tail call optimization cases
by adding some safety checks like we have for the ftrace branch
patching. But I'd rather not do that work unless we all agree
that it's worthwhile pursuing it.

Is there still any concern about the benefit of patching each
call site vs. patching the functions? The micro benchmark seems
to show some theoretical improvement on cortex-a7 and I can run
it on Scorpion and Krait processors to look for any potential
benefits there, but I'm not sure of any good kernel benchmark for
this. If it will be rejected due to complexity vs. benefit
arguments I'd rather work on something else.

-- 
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
a Linux Foundation Collaborative Project

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] ARM: Runtime patch udiv/sdiv instructions into __aeabi_{u}idiv()
  2016-01-05  1:23           ` Stephen Boyd
@ 2016-01-05  1:42             ` Nicolas Pitre
  2016-01-08  2:44               ` Stephen Boyd
  0 siblings, 1 reply; 19+ messages in thread
From: Nicolas Pitre @ 2016-01-05  1:42 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, 4 Jan 2016, Stephen Boyd wrote:

> On 12/11, Nicolas Pitre wrote:
> > Stephen's series could still be relevant by extending what is done here, 
> > and it requires what this patch is doing anyway for those call sites 
> > that can't be substituted by a div instruction (like conditional 
> > branches, tail call optimizations, etc.)
> > 
> 
> I'm back from vacation now. Where have we left off on this topic?
> 
> I can update the patches to be based on this patch here and
> handle the conditional branches and tail call optimization cases
> by adding some safety checks like we have for the ftrace branch
> patching. But I'd rather not do that work unless we all agree
> that it's worthwhile pursuing it.
> 
> Is there still any concern about the benefit of patching each
> call site vs. patching the functions? The micro benchmark seems
> to show some theoretical improvement on cortex-a7 and I can run
> it on Scorpion and Krait processors to look for any potential
> benefits there, but I'm not sure of any good kernel benchmark for
> this. If it will be rejected due to complexity vs. benefit
> arguments I'd rather work on something else.

You could run the benchmark on Scorpion and Krait to start with. If 
there is no improvement what so ever like on A15's then the answer might 
be rather simple.


Nicolas

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] ARM: Runtime patch udiv/sdiv instructions into __aeabi_{u}idiv()
  2016-01-05  1:42             ` Nicolas Pitre
@ 2016-01-08  2:44               ` Stephen Boyd
  2016-01-12 19:09                 ` Nicolas Pitre
  0 siblings, 1 reply; 19+ messages in thread
From: Stephen Boyd @ 2016-01-08  2:44 UTC (permalink / raw)
  To: linux-arm-kernel

On 01/04, Nicolas Pitre wrote:
> On Mon, 4 Jan 2016, Stephen Boyd wrote:
> > 
> > I can update the patches to be based on this patch here and
> > handle the conditional branches and tail call optimization cases
> > by adding some safety checks like we have for the ftrace branch
> > patching. But I'd rather not do that work unless we all agree
> > that it's worthwhile pursuing it.
> > 
> > Is there still any concern about the benefit of patching each
> > call site vs. patching the functions? The micro benchmark seems
> > to show some theoretical improvement on cortex-a7 and I can run
> > it on Scorpion and Krait processors to look for any potential
> > benefits there, but I'm not sure of any good kernel benchmark for
> > this. If it will be rejected due to complexity vs. benefit
> > arguments I'd rather work on something else.
> 
> You could run the benchmark on Scorpion and Krait to start with. If 
> there is no improvement what so ever like on A15's then the answer might 
> be rather simple.
> 

So running the benchmark on Scorpion is not useful because we
don't have the idiv instruction there. On Krait I get the
following results. I ran this on a dragonboard apq8074 with
maxcpus=1 on the kernel command line.

Testing INLINE_DIV ...
real    0m 13.56s
user    0m 13.56s
sys     0m 0.00s

Testing PATCHED_DIV ...
real    0m 15.15s
user    0m 15.14s
sys     0m 0.00s

Testing OUTOFLINE_DIV ...
real    0m 18.09s
user    0m 18.09s
sys     0m 0.00s

Testing LIBGCC_DIV ...
real    0m 24.26s
user    0m 24.25s
sys     0m 0.00s

It looks like the branch actually costs us some time here.
Patching isn't as good as the compiler inserting the instruction
itself, but it is better than branching to the division routine.

-- 
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
a Linux Foundation Collaborative Project

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] ARM: Runtime patch udiv/sdiv instructions into __aeabi_{u}idiv()
  2016-01-08  2:44               ` Stephen Boyd
@ 2016-01-12 19:09                 ` Nicolas Pitre
  0 siblings, 0 replies; 19+ messages in thread
From: Nicolas Pitre @ 2016-01-12 19:09 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, 7 Jan 2016, Stephen Boyd wrote:

> On 01/04, Nicolas Pitre wrote:
> > On Mon, 4 Jan 2016, Stephen Boyd wrote:
> > > 
> > > I can update the patches to be based on this patch here and
> > > handle the conditional branches and tail call optimization cases
> > > by adding some safety checks like we have for the ftrace branch
> > > patching. But I'd rather not do that work unless we all agree
> > > that it's worthwhile pursuing it.
> > > 
> > > Is there still any concern about the benefit of patching each
> > > call site vs. patching the functions? The micro benchmark seems
> > > to show some theoretical improvement on cortex-a7 and I can run
> > > it on Scorpion and Krait processors to look for any potential
> > > benefits there, but I'm not sure of any good kernel benchmark for
> > > this. If it will be rejected due to complexity vs. benefit
> > > arguments I'd rather work on something else.
> > 
> > You could run the benchmark on Scorpion and Krait to start with. If 
> > there is no improvement what so ever like on A15's then the answer might 
> > be rather simple.
> > 
> 
> So running the benchmark on Scorpion is not useful because we
> don't have the idiv instruction there. On Krait I get the
> following results. I ran this on a dragonboard apq8074 with
> maxcpus=1 on the kernel command line.
> 
> Testing INLINE_DIV ...
> real    0m 13.56s
> user    0m 13.56s
> sys     0m 0.00s
> 
> Testing PATCHED_DIV ...
> real    0m 15.15s
> user    0m 15.14s
> sys     0m 0.00s
> 
> Testing OUTOFLINE_DIV ...
> real    0m 18.09s
> user    0m 18.09s
> sys     0m 0.00s
> 
> Testing LIBGCC_DIV ...
> real    0m 24.26s
> user    0m 24.25s
> sys     0m 0.00s
> 
> It looks like the branch actually costs us some time here.
> Patching isn't as good as the compiler inserting the instruction
> itself, but it is better than branching to the division routine.

I think it is up to you at this point whether or not you feel the call 
site patching complexity is worth it. Personally I'd rather go directly 
for the compiler flag solution to close the final performance gap, but I 
won't stand in the way of a call site patching patch.


Nicolas

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2016-01-12 19:09 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-12-11  6:26 [PATCH] ARM: Runtime patch udiv/sdiv instructions into __aeabi_{u}idiv() Nicolas Pitre
2015-12-11  9:24 ` Arnd Bergmann
2015-12-11 17:10   ` Nicolas Pitre
2015-12-11 17:22     ` Nicolas Pitre
2015-12-11 22:26       ` Arnd Bergmann
2015-12-11 23:57         ` Nicolas Pitre
2016-01-05  1:23           ` Stephen Boyd
2016-01-05  1:42             ` Nicolas Pitre
2016-01-08  2:44               ` Stephen Boyd
2016-01-12 19:09                 ` Nicolas Pitre
2015-12-11 21:50     ` Arnd Bergmann
2015-12-11 22:00       ` Nicolas Pitre
2015-12-11 22:48         ` Arnd Bergmann
2015-12-11 22:34       ` Russell King - ARM Linux
2015-12-11 22:51         ` Arnd Bergmann
2015-12-12  0:01           ` Nicolas Pitre
2015-12-12  0:04             ` Arnd Bergmann
2015-12-12  1:17               ` Nicolas Pitre
2015-12-12 20:41                 ` Arnd Bergmann

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).