* [PATCH] ARM: lib: use LDRD/STRD for data copy
@ 2012-03-19 7:02 Boojin Kim
2012-03-19 8:55 ` Russell King - ARM Linux
` (3 more replies)
0 siblings, 4 replies; 25+ messages in thread
From: Boojin Kim @ 2012-03-19 7:02 UTC (permalink / raw)
To: linux-arm-kernel
This patch uses LDRD/STRD, which load and store data in doubleword (64-bit)
units, for the 8-word (32-byte) copy loop.
It brings better performance than the LDM/STM instructions that were used originally.
Signed-off-by: Boojin Kim <boojin.kim@samsung.com>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
---
arch/arm/lib/copy_from_user.S | 14 +++++++++-----
arch/arm/lib/copy_template.S | 10 ++++++----
arch/arm/lib/copy_to_user.S | 13 +++++++++----
arch/arm/lib/memcpy.S | 13 +++++++++----
4 files changed, 33 insertions(+), 17 deletions(-)
diff --git a/arch/arm/lib/copy_from_user.S b/arch/arm/lib/copy_from_user.S
index 66a477a..15d1e1c 100644
--- a/arch/arm/lib/copy_from_user.S
+++ b/arch/arm/lib/copy_from_user.S
@@ -51,11 +51,6 @@
ldr1w \ptr, \reg4, \abort
.endm
- .macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
- ldr4w \ptr, \reg1, \reg2, \reg3, \reg4, \abort
- ldr4w \ptr, \reg5, \reg6, \reg7, \reg8, \abort
- .endm
-
.macro ldr1b ptr reg cond=al abort
ldrusr \reg, \ptr, 1, \cond, abort=\abort
.endm
@@ -68,6 +63,15 @@
stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
.endm
+ .macro cpy8w dst src reg1 reg2 abort
+ .irp offset, #0, #8, #16, #24
+ ldr1w \src, \reg1, \abort
+ ldr1w \src, \reg2, \abort
+ strd \reg1, \reg2, [\dst, \offset]
+ .endr
+ add \dst, \dst, #32
+ .endm
+
.macro str1b ptr reg cond=al abort
str\cond\()b \reg, [\ptr], #1
.endm
diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S
index 805e3f8..72640aa 100644
--- a/arch/arm/lib/copy_template.S
+++ b/arch/arm/lib/copy_template.S
@@ -28,9 +28,8 @@
* 'ptr' to the next word. The 'abort' argument is used for fixup tables.
*
* ldr4w ptr reg1 reg2 reg3 reg4 abort
- * ldr8w ptr, reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
*
- * This loads four or eight words starting from 'ptr', stores them
+ * This loads four words starting from 'ptr', stores them
* in provided registers and increments 'ptr' past those words.
* The'abort' argument is used for fixup tables.
*
@@ -47,6 +46,10 @@
* Same as their ldr* counterparts, but data is stored to 'ptr' location
* rather than being loaded.
*
+ * cpy8w dst src reg1 reg2 abort
+ * This copies eight words from 'src' to 'dst' and advances both pointers.
+ * The 'abort' argument is used for fixup tables.
+ *
* enter reg1 reg2
*
* Preserve the provided registers on the stack plus any additional
@@ -97,9 +100,8 @@
PLD( pld [r1, #92] )
3: PLD( pld [r1, #124] )
-4: ldr8w r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
+4: cpy8w r0, r1, r4, r5, abort=20f
subs r2, r2, #32
- str8w r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
bge 3b
PLD( cmn r2, #96 )
PLD( bge 4b )
diff --git a/arch/arm/lib/copy_to_user.S b/arch/arm/lib/copy_to_user.S
index d066df6..9402a08 100644
--- a/arch/arm/lib/copy_to_user.S
+++ b/arch/arm/lib/copy_to_user.S
@@ -48,10 +48,6 @@
ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4}
.endm
- .macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
- ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
- .endm
-
.macro ldr1b ptr reg cond=al abort
ldr\cond\()b \reg, [\ptr], #1
.endm
@@ -71,6 +67,15 @@
str1w \ptr, \reg8, \abort
.endm
+ .macro cpy8w dst src reg1 reg2 abort
+ .irp offset, #0, #8, #16, #24
+ ldrd \reg1, \reg2, [\src, \offset]
+ str1w \dst, \reg1, \abort
+ str1w \dst, \reg2, \abort
+ .endr
+ add \src, \src, #32
+ .endm
+
.macro str1b ptr reg cond=al abort
strusr \reg, \ptr, 1, \cond, abort=\abort
.endm
diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S
index a9b9e22..25320c9 100644
--- a/arch/arm/lib/memcpy.S
+++ b/arch/arm/lib/memcpy.S
@@ -24,10 +24,6 @@
ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4}
.endm
- .macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
- ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
- .endm
-
.macro ldr1b ptr reg cond=al abort
ldr\cond\()b \reg, [\ptr], #1
.endm
@@ -40,6 +36,15 @@
stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
.endm
+ .macro cpy8w dst src reg1 reg2 abort
+ .irp offset, #0, #8, #16, #24
+ ldrd \reg1, \reg2, [\src, \offset]
+ strd \reg1, \reg2, [\dst, \offset]
+ .endr
+ add \src, \src, #32
+ add \dst, \dst, #32
+ .endm
+
.macro str1b ptr reg cond=al abort
str\cond\()b \reg, [\ptr], #1
.endm
--
1.7.1
^ permalink raw reply related [flat|nested] 25+ messages in thread* [PATCH] ARM: lib: use LDRD/STRD for data copy 2012-03-19 7:02 [PATCH] ARM: lib: use LDRD/STRD for data copy Boojin Kim @ 2012-03-19 8:55 ` Russell King - ARM Linux 2012-03-19 14:36 ` Rob Herring 2012-03-19 14:10 ` Nicolas Pitre ` (2 subsequent siblings) 3 siblings, 1 reply; 25+ messages in thread From: Russell King - ARM Linux @ 2012-03-19 8:55 UTC (permalink / raw) To: linux-arm-kernel On Mon, Mar 19, 2012 at 04:02:48PM +0900, Boojin Kim wrote: > This patch uses LDRD/STRD that loads and stores data as DWORD unit > for the copy of 8-words data. > It brings better performance than LDRM/STRM that was used originally. And what about CPUs that don't have ldrd/strd ? ^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH] ARM: lib: use LDRD/STRD for data copy 2012-03-19 8:55 ` Russell King - ARM Linux @ 2012-03-19 14:36 ` Rob Herring 2012-03-19 15:41 ` Russell King - ARM Linux 2012-03-20 0:21 ` Boojin Kim 0 siblings, 2 replies; 25+ messages in thread From: Rob Herring @ 2012-03-19 14:36 UTC (permalink / raw) To: linux-arm-kernel On 03/19/2012 03:55 AM, Russell King - ARM Linux wrote: > On Mon, Mar 19, 2012 at 04:02:48PM +0900, Boojin Kim wrote: >> This patch uses LDRD/STRD that loads and stores data as DWORD unit >> for the copy of 8-words data. >> It brings better performance than LDRM/STRM that was used originally. > > And what about CPUs that don't have ldrd/strd ? > And what about CPUs that do have ldrd/strd but is slower than ldm/stm? I'm pretty sure that is almost everything currently out there. Rob ^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH] ARM: lib: use LDRD/STRD for data copy 2012-03-19 14:36 ` Rob Herring @ 2012-03-19 15:41 ` Russell King - ARM Linux 2012-03-19 16:34 ` Måns Rullgård 2012-03-19 16:36 ` Rob Herring 2012-03-20 0:21 ` Boojin Kim 1 sibling, 2 replies; 25+ messages in thread From: Russell King - ARM Linux @ 2012-03-19 15:41 UTC (permalink / raw) To: linux-arm-kernel On Mon, Mar 19, 2012 at 09:36:41AM -0500, Rob Herring wrote: > On 03/19/2012 03:55 AM, Russell King - ARM Linux wrote: > > On Mon, Mar 19, 2012 at 04:02:48PM +0900, Boojin Kim wrote: > >> This patch uses LDRD/STRD that loads and stores data as DWORD unit > >> for the copy of 8-words data. > >> It brings better performance than LDRM/STRM that was used originally. > > > > And what about CPUs that don't have ldrd/strd ? > > > > And what about CPUs that do have ldrd/strd but is slower than ldm/stm? > I'm pretty sure that is almost everything currently out there. The double-word load/stores were introduced in ARMv6. Some Intel based CPUs prior to this have the support as well. Everything else doesn't. So taht's nowhere close to 'almost everything'. ^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH] ARM: lib: use LDRD/STRD for data copy 2012-03-19 15:41 ` Russell King - ARM Linux @ 2012-03-19 16:34 ` Måns Rullgård 2012-03-19 16:36 ` Rob Herring 1 sibling, 0 replies; 25+ messages in thread From: Måns Rullgård @ 2012-03-19 16:34 UTC (permalink / raw) To: linux-arm-kernel Russell King - ARM Linux <linux@arm.linux.org.uk> writes: > On Mon, Mar 19, 2012 at 09:36:41AM -0500, Rob Herring wrote: >> On 03/19/2012 03:55 AM, Russell King - ARM Linux wrote: >> > On Mon, Mar 19, 2012 at 04:02:48PM +0900, Boojin Kim wrote: >> >> This patch uses LDRD/STRD that loads and stores data as DWORD unit >> >> for the copy of 8-words data. >> >> It brings better performance than LDRM/STRM that was used originally. >> > >> > And what about CPUs that don't have ldrd/strd ? >> > >> >> And what about CPUs that do have ldrd/strd but is slower than ldm/stm? >> I'm pretty sure that is almost everything currently out there. Care to give an example? I can't find one. > The double-word load/stores were introduced in ARMv6. Not true. LDRD/STRD were introduced in ARMv5TE. ARMv6 relaxed the alignment requirement of these instructions to 4 bytes from being implementation defined 4 or 8 bytes in ARMv5TE. -- Måns Rullgård mans at mansr.com ^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH] ARM: lib: use LDRD/STRD for data copy 2012-03-19 15:41 ` Russell King - ARM Linux 2012-03-19 16:34 ` Måns Rullgård @ 2012-03-19 16:36 ` Rob Herring 2012-03-19 16:53 ` Nicolas Pitre ` (2 more replies) 1 sibling, 3 replies; 25+ messages in thread From: Rob Herring @ 2012-03-19 16:36 UTC (permalink / raw) To: linux-arm-kernel On 03/19/2012 10:41 AM, Russell King - ARM Linux wrote: > On Mon, Mar 19, 2012 at 09:36:41AM -0500, Rob Herring wrote: >> On 03/19/2012 03:55 AM, Russell King - ARM Linux wrote: >>> On Mon, Mar 19, 2012 at 04:02:48PM +0900, Boojin Kim wrote: >>>> This patch uses LDRD/STRD that loads and stores data as DWORD unit >>>> for the copy of 8-words data. >>>> It brings better performance than LDRM/STRM that was used originally. >>> >>> And what about CPUs that don't have ldrd/strd ? >>> >> >> And what about CPUs that do have ldrd/strd but is slower than ldm/stm? >> I'm pretty sure that is almost everything currently out there. > > The double-word load/stores were introduced in ARMv6. Some Intel based > CPUs prior to this have the support as well. Everything else doesn't. > > So taht's nowhere close to 'almost everything'. I meant of all platforms that support both instructions, ldm/stm will be faster than ldrd/strd on almost all of them AFAIK. I don't think the claim about being faster is true for an CortexA9 or anything prior. Linaro folks have done some benchmarking in this area and would be better to comment. Rob ^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH] ARM: lib: use LDRD/STRD for data copy 2012-03-19 16:36 ` Rob Herring @ 2012-03-19 16:53 ` Nicolas Pitre 2012-03-19 17:11 ` Måns Rullgård 2012-03-19 20:11 ` Michael Hope 2 siblings, 0 replies; 25+ messages in thread From: Nicolas Pitre @ 2012-03-19 16:53 UTC (permalink / raw) To: linux-arm-kernel On Mon, 19 Mar 2012, Rob Herring wrote: > On 03/19/2012 10:41 AM, Russell King - ARM Linux wrote: > > On Mon, Mar 19, 2012 at 09:36:41AM -0500, Rob Herring wrote: > >> On 03/19/2012 03:55 AM, Russell King - ARM Linux wrote: > >>> On Mon, Mar 19, 2012 at 04:02:48PM +0900, Boojin Kim wrote: > >>>> This patch uses LDRD/STRD that loads and stores data as DWORD unit > >>>> for the copy of 8-words data. > >>>> It brings better performance than LDRM/STRM that was used originally. > >>> > >>> And what about CPUs that don't have ldrd/strd ? > >>> > >> > >> And what about CPUs that do have ldrd/strd but is slower than ldm/stm? > >> I'm pretty sure that is almost everything currently out there. > > > > The double-word load/stores were introduced in ARMv6. Some Intel based > > CPUs prior to this have the support as well. Everything else doesn't. > > > > So taht's nowhere close to 'almost everything'. > > I meant of all platforms that support both instructions, ldm/stm will be > faster than ldrd/strd on almost all of them AFAIK. I don't think the > claim about being faster is true for an CortexA9 or anything prior. > Linaro folks have done some benchmarking in this area and would be > better to comment. And more importantly, the generic copy functions in the kernel are typically used for small copies in most cases, while people tend to benchmark copy functions with large buffers, leading to wrong decisions. The functions worth optimizing for throughput are rather copy_page(), copy_user_page(), clear_page(), etc. And not forgetting that some of them areinvoked with a typical cache state for the involved memory. Nicolas ^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH] ARM: lib: use LDRD/STRD for data copy 2012-03-19 16:36 ` Rob Herring 2012-03-19 16:53 ` Nicolas Pitre @ 2012-03-19 17:11 ` Måns Rullgård 2012-03-19 20:11 ` Michael Hope 2 siblings, 0 replies; 25+ messages in thread From: Måns Rullgård @ 2012-03-19 17:11 UTC (permalink / raw) To: linux-arm-kernel Rob Herring <robherring2@gmail.com> writes: > On 03/19/2012 10:41 AM, Russell King - ARM Linux wrote: >> On Mon, Mar 19, 2012 at 09:36:41AM -0500, Rob Herring wrote: >>> On 03/19/2012 03:55 AM, Russell King - ARM Linux wrote: >>>> On Mon, Mar 19, 2012 at 04:02:48PM +0900, Boojin Kim wrote: >>>>> This patch uses LDRD/STRD that loads and stores data as DWORD unit >>>>> for the copy of 8-words data. >>>>> It brings better performance than LDRM/STRM that was used originally. >>>> >>>> And what about CPUs that don't have ldrd/strd ? >>>> >>> >>> And what about CPUs that do have ldrd/strd but is slower than ldm/stm? >>> I'm pretty sure that is almost everything currently out there. >> >> The double-word load/stores were introduced in ARMv6. Some Intel based >> CPUs prior to this have the support as well. Everything else doesn't. >> >> So taht's nowhere close to 'almost everything'. > > I meant of all platforms that support both instructions, ldm/stm will be > faster than ldrd/strd on almost all of them AFAIK. I don't think the > claim about being faster is true for an CortexA9 or anything prior. The Cortex-A9 TRM insists ldrd and ldm should have the same timing. However, measuring it suggests that ldm is in fact faster, at least in some cases. The Cortex-A8 TRM is a bit unclear, but measuring gives the same speed for both. The manuals for older cores suggest equivalent timing, but I don't have any nearby to test. -- Måns Rullgård mans at mansr.com ^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH] ARM: lib: use LDRD/STRD for data copy 2012-03-19 16:36 ` Rob Herring 2012-03-19 16:53 ` Nicolas Pitre 2012-03-19 17:11 ` Måns Rullgård @ 2012-03-19 20:11 ` Michael Hope 2 siblings, 0 replies; 25+ messages in thread From: Michael Hope @ 2012-03-19 20:11 UTC (permalink / raw) To: linux-arm-kernel On 20 March 2012 05:36, Rob Herring <robherring2@gmail.com> wrote: > On 03/19/2012 10:41 AM, Russell King - ARM Linux wrote: >> On Mon, Mar 19, 2012 at 09:36:41AM -0500, Rob Herring wrote: >>> On 03/19/2012 03:55 AM, Russell King - ARM Linux wrote: >>>> On Mon, Mar 19, 2012 at 04:02:48PM +0900, Boojin Kim wrote: >>>>> This patch uses LDRD/STRD that loads and stores data as DWORD unit >>>>> for the copy of 8-words data. >>>>> It brings better performance than LDRM/STRM that was used originally. >>>> >>>> And what about CPUs that don't have ldrd/strd ? >>>> >>> >>> And what about CPUs that do have ldrd/strd but is slower than ldm/stm? >>> I'm pretty sure that is almost everything currently out there. >> >> The double-word load/stores were introduced in ARMv6. Some Intel based >> CPUs prior to this have the support as well. Everything else doesn't. >> >> So taht's nowhere close to 'almost everything'. > > I meant of all platforms that support both instructions, ldm/stm will be > faster than ldrd/strd on almost all of them AFAIK. I don't think the > claim about being faster is true for an CortexA9 or anything prior. > Linaro folks have done some benchmarking in this area and would be > better to comment. My understanding is that the A15 does well with LDRD and poorly with LDM, all other cores do well with LDM, and the A9 at least does poorly with LDRD. I don't have numbers at hand to back it up. FYI, here's ARM's Cortex-A15 LDRD based memcpy implementation: http://sourceware.org/ml/newlib/2011/msg00469.html -- Michael ^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH] ARM: lib: use LDRD/STRD for data copy 2012-03-19 14:36 ` Rob Herring 2012-03-19 15:41 ` Russell King - ARM Linux @ 2012-03-20 0:21 ` Boojin Kim 1 sibling, 0 replies; 25+ messages in thread From: Boojin Kim @ 2012-03-20 0:21 UTC (permalink / raw) To: linux-arm-kernel Rob Herring wrote: > On 03/19/2012 03:55 AM, Russell King - ARM Linux wrote: > > On Mon, Mar 19, 2012 at 04:02:48PM +0900, Boojin Kim wrote: > >> This patch uses LDRD/STRD that loads and stores data as DWORD unit > >> for the copy of 8-words data. > >> It brings better performance than LDRM/STRM that was used originally. > > > > And what about CPUs that don't have ldrd/strd ? > > > > And what about CPUs that do have ldrd/strd but is slower than ldm/stm? > I'm pretty sure that is almost everything currently out there. Actually I didn't measure the memcpy performance on all ARM SoCs. I just measured it with internal memcpy() benchmark on Cortex-A9 and Cortex-A15. ldrd/strd is faster than ldm/stm on cortex-a15. And it's similar on cortex-a9. I will try again this patch gives meaningful effect to the targeted ARM SoCs Thank you for your reply. > > Rob > > _______________________________________________ > linux-arm-kernel mailing list > linux-arm-kernel at lists.infradead.org > http://lists.infradead.org/mailman/listinfo/linux-arm-kernel ^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH] ARM: lib: use LDRD/STRD for data copy 2012-03-19 7:02 [PATCH] ARM: lib: use LDRD/STRD for data copy Boojin Kim 2012-03-19 8:55 ` Russell King - ARM Linux @ 2012-03-19 14:10 ` Nicolas Pitre 2012-03-20 0:05 ` Boojin Kim 2012-03-27 0:26 ` [PATCH 1/2] ARM: lib: Add optimized memcpy with 64 byte pld size Boojin Kim 2012-03-27 0:27 ` [PATCH 2/2] ARM: lib: use LDRD/STRD for data copy Boojin Kim 3 siblings, 1 reply; 25+ messages in thread From: Nicolas Pitre @ 2012-03-19 14:10 UTC (permalink / raw) To: linux-arm-kernel On Mon, 19 Mar 2012, Boojin Kim wrote: > This patch uses LDRD/STRD that loads and stores data as DWORD unit > for the copy of 8-words data. > It brings better performance than LDRM/STRM that was used originally. > > Signed-off-by: Boojin Kim <boojin.kim@samsung.com> > Cc: Russell King <rmk+kernel@arm.linux.org.uk> Firstly, you're breaking those CPUs without ldrd/strd support. Secondly, you're breaking to_user/from_user copies when processor domains are not disabled. Then, my question is why didn't you simply provide an alternative implementation of ldr8w/str8w using ldrd/strd instead of interleaving them? Certainly that would have allowed you to benefit from SDRAM burst transfers which are typically aligned to d-cache lines, as well as locating the subs into the unavoidable result delay slot. 
> --- > arch/arm/lib/copy_from_user.S | 14 +++++++++----- > arch/arm/lib/copy_template.S | 10 ++++++---- > arch/arm/lib/copy_to_user.S | 13 +++++++++---- > arch/arm/lib/memcpy.S | 13 +++++++++---- > 4 files changed, 33 insertions(+), 17 deletions(-) > > diff --git a/arch/arm/lib/copy_from_user.S b/arch/arm/lib/copy_from_user.S > index 66a477a..15d1e1c 100644 > --- a/arch/arm/lib/copy_from_user.S > +++ b/arch/arm/lib/copy_from_user.S > @@ -51,11 +51,6 @@ > ldr1w \ptr, \reg4, \abort > .endm > > - .macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort > - ldr4w \ptr, \reg1, \reg2, \reg3, \reg4, \abort > - ldr4w \ptr, \reg5, \reg6, \reg7, \reg8, \abort > - .endm > - > .macro ldr1b ptr reg cond=al abort > ldrusr \reg, \ptr, 1, \cond, abort=\abort > .endm > @@ -68,6 +63,15 @@ > stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} > .endm > > + .macro cpy8w dst src reg1 reg2 abort > + .irp offset, #0, #8, #16, #24 > + ldr1w \src, \reg1, \abort > + ldr1w \src, \reg2, \abort > + strd \reg1, \reg2, [\dst, \offset] > + .endr > + add \dst, \dst, #32 > + .endm > + > .macro str1b ptr reg cond=al abort > str\cond\()b \reg, [\ptr], #1 > .endm > diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S > index 805e3f8..72640aa 100644 > --- a/arch/arm/lib/copy_template.S > +++ b/arch/arm/lib/copy_template.S > @@ -28,9 +28,8 @@ > * 'ptr' to the next word. The 'abort' argument is used for fixup tables. > * > * ldr4w ptr reg1 reg2 reg3 reg4 abort > - * ldr8w ptr, reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort > * > - * This loads four or eight words starting from 'ptr', stores them > + * This loads eight words starting from 'ptr', stores them > * in provided registers and increments 'ptr' past those words. > * The'abort' argument is used for fixup tables. > * > @@ -47,6 +46,10 @@ > * Same as their ldr* counterparts, but data is stored to 'ptr' location > * rather than being loaded. 
> * > + * cpy8w src dst reg1 reg2 abort > + * This loads eight words starting from 'src' and stores them to 'dst'. > + * The 'abort' argument is used for fixup tables. > + * > * enter reg1 reg2 > * > * Preserve the provided registers on the stack plus any additional > @@ -97,9 +100,8 @@ > PLD( pld [r1, #92] ) > > 3: PLD( pld [r1, #124] ) > -4: ldr8w r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f > +4: cpy8w r0, r1, r4, r5, abort=20f > subs r2, r2, #32 > - str8w r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f > bge 3b > PLD( cmn r2, #96 ) > PLD( bge 4b ) > diff --git a/arch/arm/lib/copy_to_user.S b/arch/arm/lib/copy_to_user.S > index d066df6..9402a08 100644 > --- a/arch/arm/lib/copy_to_user.S > +++ b/arch/arm/lib/copy_to_user.S > @@ -48,10 +48,6 @@ > ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4} > .endm > > - .macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort > - ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} > - .endm > - > .macro ldr1b ptr reg cond=al abort > ldr\cond\()b \reg, [\ptr], #1 > .endm > @@ -71,6 +67,15 @@ > str1w \ptr, \reg8, \abort > .endm > > + .macro cpy8w dst src reg1 reg2 abort > + .irp offset, #0, #8, #16, #24 > + ldrd \reg1, \reg2, [\src, \offset] > + str1w \dst, \reg1, \abort > + str1w \dst, \reg2, \abort > + .endr > + add \src, \src, #32 > + .endm > + > .macro str1b ptr reg cond=al abort > strusr \reg, \ptr, 1, \cond, abort=\abort > .endm > diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S > index a9b9e22..25320c9 100644 > --- a/arch/arm/lib/memcpy.S > +++ b/arch/arm/lib/memcpy.S > @@ -24,10 +24,6 @@ > ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4} > .endm > > - .macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort > - ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} > - .endm > - > .macro ldr1b ptr reg cond=al abort > ldr\cond\()b \reg, [\ptr], #1 > .endm > @@ -40,6 +36,15 @@ > stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} > .endm > > + .macro cpy8w dst src 
reg1 reg2 abort > + .irp offset, #0, #8, #16, #24 > + ldrd \reg1, \reg2, [\src, \offset] > + strd \reg1, \reg2, [\dst, \offset] > + .endr > + add \src, \src, #32 > + add \dst, \dst, #32 > + .endm > + > .macro str1b ptr reg cond=al abort > str\cond\()b \reg, [\ptr], #1 > .endm > -- > 1.7.1 > > > > _______________________________________________ > linux-arm-kernel mailing list > linux-arm-kernel at lists.infradead.org > http://lists.infradead.org/mailman/listinfo/linux-arm-kernel > ^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH] ARM: lib: use LDRD/STRD for data copy 2012-03-19 14:10 ` Nicolas Pitre @ 2012-03-20 0:05 ` Boojin Kim 0 siblings, 0 replies; 25+ messages in thread From: Boojin Kim @ 2012-03-20 0:05 UTC (permalink / raw) To: linux-arm-kernel Nicolas Pitre wrote: > > This patch uses LDRD/STRD that loads and stores data as DWORD unit > > for the copy of 8-words data. > > It brings better performance than LDRM/STRM that was used originally. > > > > Signed-off-by: Boojin Kim <boojin.kim@samsung.com> > > Cc: Russell King <rmk+kernel@arm.linux.org.uk> > > Firstly, you're breaking those CPUs without ldrd/strd support. > I loss the point. I will fix it on next patch. > Secondly, you're breaking to_user/from_user copies when processor > domains are not disabled. Can you explain it in detail? Which one breaks the to_user/from_user copies? Thank you for your reply. > > Then, my question is why didn't you simply provide an alternative > implementation of ldr8w/str8w using ldrd/strd instead of interleaving > them? Certainly that would have allowed you to benefit from SDRAM burst > transfers which are typically aligned to d-cache lines, as well as > locating the subs into the unavoidable result delay slot. 
> > > --- > > arch/arm/lib/copy_from_user.S | 14 +++++++++----- > > arch/arm/lib/copy_template.S | 10 ++++++---- > > arch/arm/lib/copy_to_user.S | 13 +++++++++---- > > arch/arm/lib/memcpy.S | 13 +++++++++---- > > 4 files changed, 33 insertions(+), 17 deletions(-) > > > > diff --git a/arch/arm/lib/copy_from_user.S b/arch/arm/lib/copy_from_user.S > > index 66a477a..15d1e1c 100644 > > --- a/arch/arm/lib/copy_from_user.S > > +++ b/arch/arm/lib/copy_from_user.S > > @@ -51,11 +51,6 @@ > > ldr1w \ptr, \reg4, \abort > > .endm > > > > - .macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort > > - ldr4w \ptr, \reg1, \reg2, \reg3, \reg4, \abort > > - ldr4w \ptr, \reg5, \reg6, \reg7, \reg8, \abort > > - .endm > > - > > .macro ldr1b ptr reg cond=al abort > > ldrusr \reg, \ptr, 1, \cond, abort=\abort > > .endm > > @@ -68,6 +63,15 @@ > > stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} > > .endm > > > > + .macro cpy8w dst src reg1 reg2 abort > > + .irp offset, #0, #8, #16, #24 > > + ldr1w \src, \reg1, \abort > > + ldr1w \src, \reg2, \abort > > + strd \reg1, \reg2, [\dst, \offset] > > + .endr > > + add \dst, \dst, #32 > > + .endm > > + > > .macro str1b ptr reg cond=al abort > > str\cond\()b \reg, [\ptr], #1 > > .endm > > diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S > > index 805e3f8..72640aa 100644 > > --- a/arch/arm/lib/copy_template.S > > +++ b/arch/arm/lib/copy_template.S > > @@ -28,9 +28,8 @@ > > * 'ptr' to the next word. The 'abort' argument is used for fixup tables. > > * > > * ldr4w ptr reg1 reg2 reg3 reg4 abort > > - * ldr8w ptr, reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort > > * > > - * This loads four or eight words starting from 'ptr', stores them > > + * This loads eight words starting from 'ptr', stores them > > * in provided registers and increments 'ptr' past those words. > > * The'abort' argument is used for fixup tables. 
> > * > > @@ -47,6 +46,10 @@ > > * Same as their ldr* counterparts, but data is stored to 'ptr' location > > * rather than being loaded. > > * > > + * cpy8w src dst reg1 reg2 abort > > + * This loads eight words starting from 'src' and stores them to 'dst'. > > + * The 'abort' argument is used for fixup tables. > > + * > > * enter reg1 reg2 > > * > > * Preserve the provided registers on the stack plus any additional > > @@ -97,9 +100,8 @@ > > PLD( pld [r1, #92] ) > > > > 3: PLD( pld [r1, #124] ) > > -4: ldr8w r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f > > +4: cpy8w r0, r1, r4, r5, abort=20f > > subs r2, r2, #32 > > - str8w r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f > > bge 3b > > PLD( cmn r2, #96 ) > > PLD( bge 4b ) > > diff --git a/arch/arm/lib/copy_to_user.S b/arch/arm/lib/copy_to_user.S > > index d066df6..9402a08 100644 > > --- a/arch/arm/lib/copy_to_user.S > > +++ b/arch/arm/lib/copy_to_user.S > > @@ -48,10 +48,6 @@ > > ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4} > > .endm > > > > - .macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort > > - ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} > > - .endm > > - > > .macro ldr1b ptr reg cond=al abort > > ldr\cond\()b \reg, [\ptr], #1 > > .endm > > @@ -71,6 +67,15 @@ > > str1w \ptr, \reg8, \abort > > .endm > > > > + .macro cpy8w dst src reg1 reg2 abort > > + .irp offset, #0, #8, #16, #24 > > + ldrd \reg1, \reg2, [\src, \offset] > > + str1w \dst, \reg1, \abort > > + str1w \dst, \reg2, \abort > > + .endr > > + add \src, \src, #32 > > + .endm > > + > > .macro str1b ptr reg cond=al abort > > strusr \reg, \ptr, 1, \cond, abort=\abort > > .endm > > diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S > > index a9b9e22..25320c9 100644 > > --- a/arch/arm/lib/memcpy.S > > +++ b/arch/arm/lib/memcpy.S > > @@ -24,10 +24,6 @@ > > ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4} > > .endm > > > > - .macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort > > - ldmia \ptr!, {\reg1, \reg2, 
\reg3, \reg4, \reg5, \reg6, \reg7, \reg8} > > - .endm > > - > > .macro ldr1b ptr reg cond=al abort > > ldr\cond\()b \reg, [\ptr], #1 > > .endm > > @@ -40,6 +36,15 @@ > > stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} > > .endm > > > > + .macro cpy8w dst src reg1 reg2 abort > > + .irp offset, #0, #8, #16, #24 > > + ldrd \reg1, \reg2, [\src, \offset] > > + strd \reg1, \reg2, [\dst, \offset] > > + .endr > > + add \src, \src, #32 > > + add \dst, \dst, #32 > > + .endm > > + > > .macro str1b ptr reg cond=al abort > > str\cond\()b \reg, [\ptr], #1 > > .endm > > -- > > 1.7.1 > > > > > > > > _______________________________________________ > > linux-arm-kernel mailing list > > linux-arm-kernel at lists.infradead.org > > http://lists.infradead.org/mailman/listinfo/linux-arm-kernel > > > > _______________________________________________ > linux-arm-kernel mailing list > linux-arm-kernel at lists.infradead.org > http://lists.infradead.org/mailman/listinfo/linux-arm-kernel ^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH 1/2] ARM: lib: Add optimized memcpy with 64 byte pld size 2012-03-19 7:02 [PATCH] ARM: lib: use LDRD/STRD for data copy Boojin Kim 2012-03-19 8:55 ` Russell King - ARM Linux 2012-03-19 14:10 ` Nicolas Pitre @ 2012-03-27 0:26 ` Boojin Kim 2012-03-27 2:35 ` Nicolas Pitre 2012-03-27 0:27 ` [PATCH 2/2] ARM: lib: use LDRD/STRD for data copy Boojin Kim 3 siblings, 1 reply; 25+ messages in thread From: Boojin Kim @ 2012-03-27 0:26 UTC (permalink / raw) To: linux-arm-kernel This patch adds the optimized memcpy() for the architecture that has 64 byte PLD size. Signed-off-by: Boojin Kim <boojin.kim@samsung.com> Cc: Russell King <rmk+kernel@arm.linux.org.uk> --- arch/arm/Kconfig | 7 ++++++ arch/arm/lib/copy_template.S | 44 +++++++++++++++++++++++++++++++++-------- 2 files changed, 42 insertions(+), 9 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 8fec56d..ba306b3 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1132,6 +1132,13 @@ config ARM_NR_BANKS default 16 if ARCH_EP93XX default 8 +config ARM_PLD_SIZE + int + default 64 if ARCH_EXYNOS5 + default 32 + help + Configure preload size used on memcpy(). Select 64 for cortex-a15. 
+ config IWMMXT bool "Enable iWMMXt support" depends on CPU_XSCALE || CPU_XSC3 || CPU_MOHAWK || CPU_PJ4 diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S index 805e3f8..7dc5b8c 100644 --- a/arch/arm/lib/copy_template.S +++ b/arch/arm/lib/copy_template.S @@ -66,6 +66,7 @@ * than one 32bit instruction in Thumb-2) */ +#define PLDSIZE (CONFIG_ARM_PLD_SIZE) enter r4, lr @@ -90,19 +91,44 @@ CALGN( add pc, r4, ip ) PLD( pld [r1, #0] ) -2: PLD( subs r2, r2, #96 ) - PLD( pld [r1, #28] ) + +#if (PLDSIZE == 64) +2: PLD( cmp r2, #32) + PLD( blt .32cpy) +.64cpy: PLD( subs r2, r2, #(PLDSIZE*3+32) ) + PLD( pld [r1, #PLDSIZE-4] ) PLD( blt 4f ) - PLD( pld [r1, #60] ) - PLD( pld [r1, #92] ) + PLD( pld [r1, #PLDSIZE*2-4] ) + PLD( pld [r1, #PLDSIZE*3-4] ) + +3: PLD( pld [r1, #PLDSIZE*4-4] ) +4: ldr8w r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f + str8w r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f + ldr8w r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f + str8w r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f + subs r2, r2, #PLDSIZE + bge 3b + PLD( cmn r2, #(PLDSIZE*3) ) + PLD( bge 4b ) + PLD( cmn r2, #(PLDSIZE*4-32) ) + PLD( blt 5f) +.32cpy: ldr8w r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f + str8w r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f +#else +2: PLD( subs r2, r2, #(PLDSIZE*3) ) + PLD( pld [r1, #(PLDSIZE-4)] ) + PLD( blt 4f ) + PLD( pld [r1, #(PLDSIZE*2-4)] ) + PLD( pld [r1, #(PLDSIZE*3-4)] ) -3: PLD( pld [r1, #124] ) -4: ldr8w r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f - subs r2, r2, #32 - str8w r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f +3: PLD( pld [r1, #(PLDSIZE*4-4)] ) +4: ldr8w r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f + subs r2, r2, #PLDSIZE + str8w r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f bge 3b - PLD( cmn r2, #96 ) + PLD( cmn r2, #(PLDSIZE*3) ) PLD( bge 4b ) +#endif 5: ands ip, r2, #28 rsb ip, ip, #32 -- 1.7.1 ^ permalink raw reply related [flat|nested] 25+ messages in thread
* [PATCH 1/2] ARM: lib: Add optimized memcpy with 64 byte pld size 2012-03-27 0:26 ` [PATCH 1/2] ARM: lib: Add optimized memcpy with 64 byte pld size Boojin Kim @ 2012-03-27 2:35 ` Nicolas Pitre 2012-03-28 0:28 ` Boojin Kim 0 siblings, 1 reply; 25+ messages in thread From: Nicolas Pitre @ 2012-03-27 2:35 UTC (permalink / raw) To: linux-arm-kernel On Tue, 27 Mar 2012, Boojin Kim wrote: > This patch adds the optimized memcpy() for the architecture that has 64 byte PLD size. > > Signed-off-by: Boojin Kim <boojin.kim@samsung.com> > Cc: Russell King <rmk+kernel@arm.linux.org.uk> This creates quite convoluted code. If this is worth doing, we'll have to find a cleaner way to do this. Could you please provide performance measurement numbers with and without this patch, and similarly for the next patch? Did you try enabling the cache alignment code? What performance difference if any did you see? > --- > arch/arm/Kconfig | 7 ++++++ > arch/arm/lib/copy_template.S | 44 +++++++++++++++++++++++++++++++++-------- > 2 files changed, 42 insertions(+), 9 deletions(-) > > diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig > index 8fec56d..ba306b3 100644 > --- a/arch/arm/Kconfig > +++ b/arch/arm/Kconfig > @@ -1132,6 +1132,13 @@ config ARM_NR_BANKS > default 16 if ARCH_EP93XX > default 8 > > +config ARM_PLD_SIZE > + int > + default 64 if ARCH_EXYNOS5 > + default 32 > + help > + Configure preload size used on memcpy(). Select 64 for cortex-a15. 
> + > config IWMMXT > bool "Enable iWMMXt support" > depends on CPU_XSCALE || CPU_XSC3 || CPU_MOHAWK || CPU_PJ4 > diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S > index 805e3f8..7dc5b8c 100644 > --- a/arch/arm/lib/copy_template.S > +++ b/arch/arm/lib/copy_template.S > @@ -66,6 +66,7 @@ > * than one 32bit instruction in Thumb-2) > */ > > +#define PLDSIZE (CONFIG_ARM_PLD_SIZE) > > enter r4, lr > > @@ -90,19 +91,44 @@ > CALGN( add pc, r4, ip ) > > PLD( pld [r1, #0] ) > -2: PLD( subs r2, r2, #96 ) > - PLD( pld [r1, #28] ) > + > +#if (PLDSIZE == 64) > +2: PLD( cmp r2, #32) > + PLD( blt .32cpy) > +.64cpy: PLD( subs r2, r2, #(PLDSIZE*3+32) ) > + PLD( pld [r1, #PLDSIZE-4] ) > PLD( blt 4f ) > - PLD( pld [r1, #60] ) > - PLD( pld [r1, #92] ) > + PLD( pld [r1, #PLDSIZE*2-4] ) > + PLD( pld [r1, #PLDSIZE*3-4] ) > + > +3: PLD( pld [r1, #PLDSIZE*4-4] ) > +4: ldr8w r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f > + str8w r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f > + ldr8w r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f > + str8w r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f > + subs r2, r2, #PLDSIZE > + bge 3b > + PLD( cmn r2, #(PLDSIZE*3) ) > + PLD( bge 4b ) > + PLD( cmn r2, #(PLDSIZE*4-32) ) > + PLD( blt 5f) > +.32cpy: ldr8w r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f > + str8w r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f > +#else > +2: PLD( subs r2, r2, #(PLDSIZE*3) ) > + PLD( pld [r1, #(PLDSIZE-4)] ) > + PLD( blt 4f ) > + PLD( pld [r1, #(PLDSIZE*2-4)] ) > + PLD( pld [r1, #(PLDSIZE*3-4)] ) > > -3: PLD( pld [r1, #124] ) > -4: ldr8w r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f > - subs r2, r2, #32 > - str8w r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f > +3: PLD( pld [r1, #(PLDSIZE*4-4)] ) > +4: ldr8w r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f > + subs r2, r2, #PLDSIZE > + str8w r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f > bge 3b > - PLD( cmn r2, #96 ) > + PLD( cmn r2, #(PLDSIZE*3) ) > PLD( bge 4b ) > +#endif > > 5: ands ip, r2, #28 > rsb ip, 
ip, #32 > -- > 1.7.1 > > > > _______________________________________________ > linux-arm-kernel mailing list > linux-arm-kernel at lists.infradead.org > http://lists.infradead.org/mailman/listinfo/linux-arm-kernel > ^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH 1/2] ARM: lib: Add optimized memcpy with 64 byte pld size 2012-03-27 2:35 ` Nicolas Pitre @ 2012-03-28 0:28 ` Boojin Kim 2012-03-28 5:23 ` Nicolas Pitre 0 siblings, 1 reply; 25+ messages in thread From: Boojin Kim @ 2012-03-28 0:28 UTC (permalink / raw) To: linux-arm-kernel Nicolas wrote: > This creates quite convoluted code. If this is worth doing, we'll have > to find a cleaner way to do this. > > Could you please provide performance measurement numbers with and > without this patch, and similarly for the next patch? > > Did you try enabling the cache alignment code? What performance > difference if any did you see? My patch brings about 10% better result on cache boundary. 64bytes PLD size makes the cache efficiency be higher on machines that has 64byte cache line. And, Which one is convoluted code? Can you explain it more detail? Thank you for your review. > > > --- > > arch/arm/Kconfig | 7 ++++++ > > arch/arm/lib/copy_template.S | 44 +++++++++++++++++++++++++++++++++-------- > > 2 files changed, 42 insertions(+), 9 deletions(-) > > > > diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig > > index 8fec56d..ba306b3 100644 > > --- a/arch/arm/Kconfig > > +++ b/arch/arm/Kconfig > > @@ -1132,6 +1132,13 @@ config ARM_NR_BANKS > > default 16 if ARCH_EP93XX > > default 8 > > > > +config ARM_PLD_SIZE > > + int > > + default 64 if ARCH_EXYNOS5 > > + default 32 > > + help > > + Configure preload size used on memcpy(). Select 64 for cortex-a15. 
> > + > > config IWMMXT > > bool "Enable iWMMXt support" > > depends on CPU_XSCALE || CPU_XSC3 || CPU_MOHAWK || CPU_PJ4 > > diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S > > index 805e3f8..7dc5b8c 100644 > > --- a/arch/arm/lib/copy_template.S > > +++ b/arch/arm/lib/copy_template.S > > @@ -66,6 +66,7 @@ > > * than one 32bit instruction in Thumb-2) > > */ > > > > +#define PLDSIZE (CONFIG_ARM_PLD_SIZE) > > > > enter r4, lr > > > > @@ -90,19 +91,44 @@ > > CALGN( add pc, r4, ip ) > > > > PLD( pld [r1, #0] ) > > -2: PLD( subs r2, r2, #96 ) > > - PLD( pld [r1, #28] ) > > + > > +#if (PLDSIZE == 64) > > +2: PLD( cmp r2, #32) > > + PLD( blt .32cpy) > > +.64cpy: PLD( subs r2, r2, #(PLDSIZE*3+32) ) > > + PLD( pld [r1, #PLDSIZE-4] ) > > PLD( blt 4f ) > > - PLD( pld [r1, #60] ) > > - PLD( pld [r1, #92] ) > > + PLD( pld [r1, #PLDSIZE*2-4] ) > > + PLD( pld [r1, #PLDSIZE*3-4] ) > > + > > +3: PLD( pld [r1, #PLDSIZE*4-4] ) > > +4: ldr8w r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f > > + str8w r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f > > + ldr8w r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f > > + str8w r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f > > + subs r2, r2, #PLDSIZE > > + bge 3b > > + PLD( cmn r2, #(PLDSIZE*3) ) > > + PLD( bge 4b ) > > + PLD( cmn r2, #(PLDSIZE*4-32) ) > > + PLD( blt 5f) > > +.32cpy: ldr8w r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f > > + str8w r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f > > +#else > > +2: PLD( subs r2, r2, #(PLDSIZE*3) ) > > + PLD( pld [r1, #(PLDSIZE-4)] ) > > + PLD( blt 4f ) > > + PLD( pld [r1, #(PLDSIZE*2-4)] ) > > + PLD( pld [r1, #(PLDSIZE*3-4)] ) > > > > -3: PLD( pld [r1, #124] ) > > -4: ldr8w r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f > > - subs r2, r2, #32 > > - str8w r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f > > +3: PLD( pld [r1, #(PLDSIZE*4-4)] ) > > +4: ldr8w r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f > > + subs r2, r2, #PLDSIZE > > + str8w r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f > 
> bge 3b > > - PLD( cmn r2, #96 ) > > + PLD( cmn r2, #(PLDSIZE*3) ) > > PLD( bge 4b ) > > +#endif > > > > 5: ands ip, r2, #28 > > rsb ip, ip, #32 > > -- > > 1.7.1 > > > > > > > > _______________________________________________ > > linux-arm-kernel mailing list > > linux-arm-kernel at lists.infradead.org > > http://lists.infradead.org/mailman/listinfo/linux-arm-kernel > > > > _______________________________________________ > linux-arm-kernel mailing list > linux-arm-kernel at lists.infradead.org > http://lists.infradead.org/mailman/listinfo/linux-arm-kernel ^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH 1/2] ARM: lib: Add optimized memcpy with 64 byte pld size 2012-03-28 0:28 ` Boojin Kim @ 2012-03-28 5:23 ` Nicolas Pitre 2012-03-29 4:00 ` [PATCH 0/4] memcpy optimized with strd/ldrd Nicolas Pitre 0 siblings, 1 reply; 25+ messages in thread From: Nicolas Pitre @ 2012-03-28 5:23 UTC (permalink / raw) To: linux-arm-kernel On Wed, 28 Mar 2012, Boojin Kim wrote: > Nicolas wrote: > > > This creates quite convoluted code. If this is worth doing, we'll have > > to find a cleaner way to do this. > > > > Could you please provide performance measurement numbers with and > > without this patch, and similarly for the next patch? > > > > Did you try enabling the cache alignment code? What performance > > difference if any did you see? > My patch brings about 10% better result on cache boundary. > 64bytes PLD size makes the cache efficiency be higher on machines that has 64byte cache line. > And, Which one is convoluted code? Can you explain it more detail? Yes, I will. I have now reworked this code to be extensible and still as clean as possible. I'm not going to post it right away though, given that it is late and I prefer to have another look at it after I've had some sleep. Nicolas ^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH 0/4] memcpy optimized with strd/ldrd 2012-03-28 5:23 ` Nicolas Pitre @ 2012-03-29 4:00 ` Nicolas Pitre 2012-03-29 4:00 ` [PATCH 1/4] ARM: copy_template.S: move some registers around Nicolas Pitre ` (3 more replies) 0 siblings, 4 replies; 25+ messages in thread From: Nicolas Pitre @ 2012-03-29 4:00 UTC (permalink / raw) To: linux-arm-kernel Here's my version. Lightly tested. I have no A15 hardware to run any performance comparison though. ^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH 1/4] ARM: copy_template.S: move some registers around 2012-03-29 4:00 ` [PATCH 0/4] memcpy optimized with strd/ldrd Nicolas Pitre @ 2012-03-29 4:00 ` Nicolas Pitre 2012-03-29 4:00 ` [PATCH 2/4] ARM: copy_template.S: rework the unaligned copy loop Nicolas Pitre ` (2 subsequent siblings) 3 siblings, 0 replies; 25+ messages in thread From: Nicolas Pitre @ 2012-03-29 4:00 UTC (permalink / raw) To: linux-arm-kernel From: Nicolas Pitre <nicolas.pitre@linaro.org> The copy length is held in r2, making it difficult to use a consecutive set of registers starting on an even register number as required by the LDRD and STRD instructions. Let's move the length to lr instead, and adjust affected code accordingly. Functionally speaking, this patch is a no-op. Signed-off-by: nicolas Pitre <nico@linaro.org> --- arch/arm/lib/copy_template.S | 92 +++++++++++++++++++++--------------------- 1 files changed, 46 insertions(+), 46 deletions(-) diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S index 805e3f8fb0..7244dcef0d 100644 --- a/arch/arm/lib/copy_template.S +++ b/arch/arm/lib/copy_template.S @@ -69,7 +69,7 @@ enter r4, lr - subs r2, r2, #4 + subs lr, r2, #4 blt 8f ands ip, r0, #3 PLD( pld [r1, #0] ) @@ -77,34 +77,34 @@ ands ip, r1, #3 bne 10f -1: subs r2, r2, #(28) +1: subs lr, lr, #(28) stmfd sp!, {r5 - r8} blt 5f CALGN( ands ip, r0, #31 ) CALGN( rsb r3, ip, #32 ) - CALGN( sbcnes r4, r3, r2 ) @ C is always set here + CALGN( sbcnes r4, r3, lr ) @ C is always set here CALGN( bcs 2f ) CALGN( adr r4, 6f ) - CALGN( subs r2, r2, r3 ) @ C gets set + CALGN( subs lr, lr, r3 ) @ C gets set CALGN( add pc, r4, ip ) PLD( pld [r1, #0] ) -2: PLD( subs r2, r2, #96 ) +2: PLD( subs lr, lr, #96 ) PLD( pld [r1, #28] ) PLD( blt 4f ) PLD( pld [r1, #60] ) PLD( pld [r1, #92] ) 3: PLD( pld [r1, #124] ) -4: ldr8w r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f - subs r2, r2, #32 - str8w r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f +4: ldr8w r1, r2, r3, r4, r5, r6, r7, r8, ip, 
abort=20f + subs lr, lr, #32 + str8w r0, r2, r3, r4, r5, r6, r7, r8, ip, abort=20f bge 3b - PLD( cmn r2, #96 ) + PLD( cmn lr, #96 ) PLD( bge 4b ) -5: ands ip, r2, #28 +5: ands ip, lr, #28 rsb ip, ip, #32 #if LDR1W_SHIFT > 0 lsl ip, ip, #LDR1W_SHIFT @@ -115,13 +115,13 @@ .rept (1 << LDR1W_SHIFT) W(nop) .endr + ldr1w r1, r2, abort=20f ldr1w r1, r3, abort=20f ldr1w r1, r4, abort=20f ldr1w r1, r5, abort=20f ldr1w r1, r6, abort=20f ldr1w r1, r7, abort=20f ldr1w r1, r8, abort=20f - ldr1w r1, lr, abort=20f #if LDR1W_SHIFT < STR1W_SHIFT lsl ip, ip, #STR1W_SHIFT - LDR1W_SHIFT @@ -133,73 +133,75 @@ .rept (1 << STR1W_SHIFT) W(nop) .endr + str1w r0, r2, abort=20f str1w r0, r3, abort=20f str1w r0, r4, abort=20f str1w r0, r5, abort=20f str1w r0, r6, abort=20f str1w r0, r7, abort=20f str1w r0, r8, abort=20f - str1w r0, lr, abort=20f CALGN( bcs 2b ) 7: ldmfd sp!, {r5 - r8} -8: movs r2, r2, lsl #31 - ldr1b r1, r3, ne, abort=21f +8: movs lr, lr, lsl #31 + ldr1b r1, r2, ne, abort=21f + ldr1b r1, r3, cs, abort=21f ldr1b r1, r4, cs, abort=21f - ldr1b r1, ip, cs, abort=21f - str1b r0, r3, ne, abort=21f + str1b r0, r2, ne, abort=21f + str1b r0, r3, cs, abort=21f str1b r0, r4, cs, abort=21f - str1b r0, ip, cs, abort=21f exit r4, pc 9: rsb ip, ip, #4 cmp ip, #2 - ldr1b r1, r3, gt, abort=21f - ldr1b r1, r4, ge, abort=21f - ldr1b r1, lr, abort=21f - str1b r0, r3, gt, abort=21f - str1b r0, r4, ge, abort=21f - subs r2, r2, ip - str1b r0, lr, abort=21f + ldr1b r1, r2, gt, abort=21f + ldr1b r1, r3, ge, abort=21f + ldr1b r1, r4, abort=21f + str1b r0, r2, gt, abort=21f + str1b r0, r3, ge, abort=21f + subs lr, lr, ip + str1b r0, r4, abort=21f blt 8b ands ip, r1, #3 beq 1b 10: bic r1, r1, #3 cmp ip, #2 - ldr1w r1, lr, abort=21f + ldr1w r1, ip, abort=21f beq 17f bgt 18f .macro forward_copy_shift pull push - subs r2, r2, #28 + subs lr, lr, #28 blt 14f - CALGN( ands ip, r0, #31 ) - CALGN( rsb ip, ip, #32 ) - CALGN( sbcnes r4, ip, r2 ) @ C is always set here - CALGN( subcc r2, r2, ip ) + CALGN( ands r3, 
r0, #31 ) + CALGN( rsb r3, r3, #32 ) + CALGN( sbcnes r4, r3, lr ) @ C is always set here + CALGN( subcc lr, lr, r3 ) CALGN( bcc 15f ) 11: stmfd sp!, {r5 - r9} PLD( pld [r1, #0] ) - PLD( subs r2, r2, #96 ) + PLD( subs lr, lr, #96 ) PLD( pld [r1, #28] ) PLD( blt 13f ) PLD( pld [r1, #60] ) PLD( pld [r1, #92] ) 12: PLD( pld [r1, #124] ) -13: ldr4w r1, r4, r5, r6, r7, abort=19f - mov r3, lr, pull #\pull - subs r2, r2, #32 - ldr4w r1, r8, r9, ip, lr, abort=19f +13: ldr4w r1, r3, r4, r5, r6, abort=19f + mov r2, ip, pull #\pull + subs lr, lr, #32 + ldr4w r1, r7, r8, r9, ip, abort=19f + orr r2, r2, r3, push #\push + mov r3, r3, pull #\pull orr r3, r3, r4, push #\push mov r4, r4, pull #\pull orr r4, r4, r5, push #\push @@ -213,25 +215,23 @@ orr r8, r8, r9, push #\push mov r9, r9, pull #\pull orr r9, r9, ip, push #\push - mov ip, ip, pull #\pull - orr ip, ip, lr, push #\push - str8w r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f + str8w r0, r2, r3, r4, r5, r6, r7, r8, r9, abort=19f bge 12b - PLD( cmn r2, #96 ) + PLD( cmn lr, #96 ) PLD( bge 13b ) ldmfd sp!, {r5 - r9} -14: ands ip, r2, #28 +14: ands r3, lr, #28 beq 16f -15: mov r3, lr, pull #\pull - ldr1w r1, lr, abort=21f - subs ip, ip, #4 - orr r3, r3, lr, push #\push - str1w r0, r3, abort=21f +15: mov r2, ip, pull #\pull + ldr1w r1, ip, abort=21f + subs r3, r3, #4 + orr r2, r2, ip, push #\push + str1w r0, r2, abort=21f bgt 15b - CALGN( cmp r2, #0 ) + CALGN( cmp lr, #0 ) CALGN( bge 11b ) 16: sub r1, r1, #(\push / 8) -- 1.7.9.rc2 ^ permalink raw reply related [flat|nested] 25+ messages in thread
* [PATCH 2/4] ARM: copy_template.S: rework the unaligned copy loop 2012-03-29 4:00 ` [PATCH 0/4] memcpy optimized with strd/ldrd Nicolas Pitre 2012-03-29 4:00 ` [PATCH 1/4] ARM: copy_template.S: move some registers around Nicolas Pitre @ 2012-03-29 4:00 ` Nicolas Pitre 2012-03-29 4:00 ` [PATCH 3/4] ARM: copy_template.S: enforce contigous register set with memory accessors Nicolas Pitre 2012-03-29 4:00 ` [PATCH 4/4] ARM: option to select LDRD/STRD optimized memory copy Nicolas Pitre 3 siblings, 0 replies; 25+ messages in thread From: Nicolas Pitre @ 2012-03-29 4:00 UTC (permalink / raw) To: linux-arm-kernel From: Nicolas Pitre <nicolas.pitre@linaro.org> Let's rework the unaligned copy loop to enforce a range of contiguous registers starting from an even register, and to use a single ldr8w construct instead of two ldr4w's. There are no users of ldr4w anymore, so its various definitions are removed. By using one additional temporary register, it is possible to have the same register set for the loads and the stores, and to make the loop friendlier to superscalar CPUs at the same time. 
Signed-off-by: Nicolas Pitre <nico@linaro.org> --- arch/arm/lib/copy_from_user.S | 11 +++---- arch/arm/lib/copy_template.S | 57 ++++++++++++++++++++--------------------- arch/arm/lib/copy_to_user.S | 4 --- arch/arm/lib/memcpy.S | 4 --- 4 files changed, 33 insertions(+), 43 deletions(-) diff --git a/arch/arm/lib/copy_from_user.S b/arch/arm/lib/copy_from_user.S index 66a477a3e3..d1df0ec62b 100644 --- a/arch/arm/lib/copy_from_user.S +++ b/arch/arm/lib/copy_from_user.S @@ -44,16 +44,15 @@ ldrusr \reg, \ptr, 4, abort=\abort .endm - .macro ldr4w ptr reg1 reg2 reg3 reg4 abort + .macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort ldr1w \ptr, \reg1, \abort ldr1w \ptr, \reg2, \abort ldr1w \ptr, \reg3, \abort ldr1w \ptr, \reg4, \abort - .endm - - .macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort - ldr4w \ptr, \reg1, \reg2, \reg3, \reg4, \abort - ldr4w \ptr, \reg5, \reg6, \reg7, \reg8, \abort + ldr1w \ptr, \reg5, \abort + ldr1w \ptr, \reg6, \abort + ldr1w \ptr, \reg7, \abort + ldr1w \ptr, \reg8, \abort .endm .macro ldr1b ptr reg cond=al abort diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S index 7244dcef0d..84e94cd48c 100644 --- a/arch/arm/lib/copy_template.S +++ b/arch/arm/lib/copy_template.S @@ -27,10 +27,9 @@ * This loads one word from 'ptr', stores it in 'reg' and increments * 'ptr' to the next word. The 'abort' argument is used for fixup tables. * - * ldr4w ptr reg1 reg2 reg3 reg4 abort * ldr8w ptr, reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort * - * This loads four or eight words starting from 'ptr', stores them + * This loads eight words starting from 'ptr', stores them * in provided registers and increments 'ptr' past those words. * The'abort' argument is used for fixup tables. 
* @@ -63,7 +62,7 @@ * * Correction to be applied to the "ip" register when branching into * the ldr1w or str1w instructions (some of these macros may expand to - * than one 32bit instruction in Thumb-2) + * more than one 32bit instruction in Thumb-2) */ @@ -170,7 +169,7 @@ 10: bic r1, r1, #3 cmp ip, #2 - ldr1w r1, ip, abort=21f + ldr1w r1, r2, abort=21f beq 17f bgt 18f @@ -178,6 +177,7 @@ .macro forward_copy_shift pull push subs lr, lr, #28 + mov ip, r2, pull #\pull blt 14f CALGN( ands r3, r0, #31 ) @@ -186,7 +186,7 @@ CALGN( subcc lr, lr, r3 ) CALGN( bcc 15f ) -11: stmfd sp!, {r5 - r9} +11: stmfd sp!, {r5 - sl} PLD( pld [r1, #0] ) PLD( subs lr, lr, #96 ) @@ -196,40 +196,39 @@ PLD( pld [r1, #92] ) 12: PLD( pld [r1, #124] ) -13: ldr4w r1, r3, r4, r5, r6, abort=19f - mov r2, ip, pull #\pull +13: ldr8w r1, r2, r3, r4, r5, r6, r7, r8, r9, abort=19f subs lr, lr, #32 - ldr4w r1, r7, r8, r9, ip, abort=19f - orr r2, r2, r3, push #\push - mov r3, r3, pull #\pull - orr r3, r3, r4, push #\push - mov r4, r4, pull #\pull - orr r4, r4, r5, push #\push - mov r5, r5, pull #\pull - orr r5, r5, r6, push #\push - mov r6, r6, pull #\pull - orr r6, r6, r7, push #\push - mov r7, r7, pull #\pull - orr r7, r7, r8, push #\push - mov r8, r8, pull #\pull - orr r8, r8, r9, push #\push - mov r9, r9, pull #\pull - orr r9, r9, ip, push #\push + mov sl, r2, pull #\pull + orr r2, ip, r2, push #\push + mov ip, r3, pull #\pull + orr r3, sl, r3, push #\push + mov sl, r4, pull #\pull + orr r4, ip, r4, push #\push + mov ip, r5, pull #\pull + orr r5, sl, r5, push #\push + mov sl, r6, pull #\pull + orr r6, ip, r6, push #\push + mov ip, r7, pull #\pull + orr r7, sl, r7, push #\push + mov sl, r8, pull #\pull + orr r8, ip, r8, push #\push + mov ip, r9, pull #\pull + orr r9, sl, r9, push #\push str8w r0, r2, r3, r4, r5, r6, r7, r8, r9, abort=19f bge 12b PLD( cmn lr, #96 ) PLD( bge 13b ) - ldmfd sp!, {r5 - r9} + ldmfd sp!, {r5 - sl} 14: ands r3, lr, #28 beq 16f -15: mov r2, ip, pull #\pull - ldr1w r1, ip, 
abort=21f +15: ldr1w r1, r2, abort=21f subs r3, r3, #4 - orr r2, r2, ip, push #\push - str1w r0, r2, abort=21f + orr r4, ip, r2, push #\push + mov ip, r2, pull #\pull + str1w r0, r4, abort=21f bgt 15b CALGN( cmp lr, #0 ) CALGN( bge 11b ) @@ -255,7 +254,7 @@ */ .macro copy_abort_preamble -19: ldmfd sp!, {r5 - r9} +19: ldmfd sp!, {r5 - sl} b 21f 20: ldmfd sp!, {r5 - r8} 21: diff --git a/arch/arm/lib/copy_to_user.S b/arch/arm/lib/copy_to_user.S index d066df686e..a83bc04365 100644 --- a/arch/arm/lib/copy_to_user.S +++ b/arch/arm/lib/copy_to_user.S @@ -44,10 +44,6 @@ W(ldr) \reg, [\ptr], #4 .endm - .macro ldr4w ptr reg1 reg2 reg3 reg4 abort - ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4} - .endm - .macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} .endm diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S index a9b9e2287a..adbccc6e2d 100644 --- a/arch/arm/lib/memcpy.S +++ b/arch/arm/lib/memcpy.S @@ -20,10 +20,6 @@ W(ldr) \reg, [\ptr], #4 .endm - .macro ldr4w ptr reg1 reg2 reg3 reg4 abort - ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4} - .endm - .macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} .endm -- 1.7.9.rc2 ^ permalink raw reply related [flat|nested] 25+ messages in thread
* [PATCH 3/4] ARM: copy_template.S: enforce contigous register set with memory accessors 2012-03-29 4:00 ` [PATCH 0/4] memcpy optimized with strd/ldrd Nicolas Pitre 2012-03-29 4:00 ` [PATCH 1/4] ARM: copy_template.S: move some registers around Nicolas Pitre 2012-03-29 4:00 ` [PATCH 2/4] ARM: copy_template.S: rework the unaligned copy loop Nicolas Pitre @ 2012-03-29 4:00 ` Nicolas Pitre 2012-03-29 4:00 ` [PATCH 4/4] ARM: option to select LDRD/STRD optimized memory copy Nicolas Pitre 3 siblings, 0 replies; 25+ messages in thread From: Nicolas Pitre @ 2012-03-29 4:00 UTC (permalink / raw) To: linux-arm-kernel From: Nicolas Pitre <nicolas.pitre@linaro.org> Let's enforce a range of contiguous registers with the remaining ldr8w and str8w accessors. An additional register needs to be preserved to achieve this although not strictly necessary otherwise, but this will allow for a greater flexibility in the accessor implementation. Signed-off-by: Nicolas Pitre <nico@linaro.org> --- arch/arm/lib/copy_template.S | 10 +++++----- 1 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S index 84e94cd48c..f6f42c3330 100644 --- a/arch/arm/lib/copy_template.S +++ b/arch/arm/lib/copy_template.S @@ -77,7 +77,7 @@ bne 10f 1: subs lr, lr, #(28) - stmfd sp!, {r5 - r8} + stmfd sp!, {r5 - r9} blt 5f CALGN( ands ip, r0, #31 ) @@ -96,9 +96,9 @@ PLD( pld [r1, #92] ) 3: PLD( pld [r1, #124] ) -4: ldr8w r1, r2, r3, r4, r5, r6, r7, r8, ip, abort=20f +4: ldr8w r1, r2, r3, r4, r5, r6, r7, r8, r9, abort=20f subs lr, lr, #32 - str8w r0, r2, r3, r4, r5, r6, r7, r8, ip, abort=20f + str8w r0, r2, r3, r4, r5, r6, r7, r8, r9, abort=20f bge 3b PLD( cmn lr, #96 ) PLD( bge 4b ) @@ -142,7 +142,7 @@ CALGN( bcs 2b ) -7: ldmfd sp!, {r5 - r8} +7: ldmfd sp!, {r5 - r9} 8: movs lr, lr, lsl #31 ldr1b r1, r2, ne, abort=21f @@ -256,7 +256,7 @@ .macro copy_abort_preamble 19: ldmfd sp!, {r5 - sl} b 21f -20: ldmfd sp!, {r5 - r8} +20: ldmfd sp!, {r5 - r9} 21: 
.endm -- 1.7.9.rc2 ^ permalink raw reply related [flat|nested] 25+ messages in thread
* [PATCH 4/4] ARM: option to select LDRD/STRD optimized memory copy 2012-03-29 4:00 ` [PATCH 0/4] memcpy optimized with strd/ldrd Nicolas Pitre ` (2 preceding siblings ...) 2012-03-29 4:00 ` [PATCH 3/4] ARM: copy_template.S: enforce contigous register set with memory accessors Nicolas Pitre @ 2012-03-29 4:00 ` Nicolas Pitre 3 siblings, 0 replies; 25+ messages in thread From: Nicolas Pitre @ 2012-03-29 4:00 UTC (permalink / raw) To: linux-arm-kernel From: Nicolas Pitre <nicolas.pitre@linaro.org> Because STRD requires a 64-bit aligned destination pointer, we unconditionally enable the cache alignment code. Same concern with LDRD, but we conditionally execute them or the LDM fallback depending on the source pointer alignment. Obviously, this could be optimized further by duplicating each loop and increasing the code. Convincing benchmarks would be in order before doing so. Signed-off-by: Nicolas Pitre <nico@linaro.org> --- arch/arm/Kconfig | 9 +++++++++ arch/arm/lib/copy_from_user.S | 15 ++++++++++++++- arch/arm/lib/copy_template.S | 3 +++ arch/arm/lib/copy_to_user.S | 11 ++++++++++- arch/arm/lib/memcpy.S | 26 ++++++++++++++++++++++++-- 5 files changed, 60 insertions(+), 4 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 5098564d58..b87069730a 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1826,6 +1826,15 @@ config UACCESS_WITH_MEMCPY However, if the CPU data cache is using a write-allocate mode, this option is unlikely to provide any performance gain. +config USE_LDRDSTRD_OVER_LDMSTM + bool "Use 64-bit access instructions to optimize memory copy" + depends on CPU_V7 + help + Some processors, notably the Cortex-A15, are known to perform + better when accessing memory using LDRD/STRD instructions instead + of LDM/STM. Select this to optimize memory copy routines + accordingly. 
+ config SECCOMP bool prompt "Enable seccomp to safely compute untrusted bytecode" diff --git a/arch/arm/lib/copy_from_user.S b/arch/arm/lib/copy_from_user.S index d1df0ec62b..375cbbf0e5 100644 --- a/arch/arm/lib/copy_from_user.S +++ b/arch/arm/lib/copy_from_user.S @@ -40,6 +40,12 @@ #endif #define STR1W_SHIFT 0 +#ifdef CONFIG_USE_LDRDSTRD_OVER_LDMSTM +/* Enforce destination cache line alignment */ +#undef CALGN +#define CALGN(x...) x +#endif + .macro ldr1w ptr reg abort ldrusr \reg, \ptr, 4, abort=\abort .endm @@ -64,7 +70,14 @@ .endm .macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort - stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} +#ifdef CONFIG_USE_LDRDSTRD_OVER_LDMSTM + strd \reg1, \reg2, [\ptr], #8 + strd \reg3, \reg4, [\ptr], #8 + strd \reg5, \reg6, [\ptr], #8 + strd \reg7, \reg8, [\ptr], #8 +#else + stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} +#endif .endm .macro str1b ptr reg cond=al abort diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S index f6f42c3330..6a9823d51f 100644 --- a/arch/arm/lib/copy_template.S +++ b/arch/arm/lib/copy_template.S @@ -63,6 +63,9 @@ * Correction to be applied to the "ip" register when branching into * the ldr1w or str1w instructions (some of these macros may expand to * more than one 32bit instruction in Thumb-2) + * + * Note: ldr8w is the only accessor that is allowed to change the + * condition code. 
*/ diff --git a/arch/arm/lib/copy_to_user.S b/arch/arm/lib/copy_to_user.S index a83bc04365..11534edea1 100644 --- a/arch/arm/lib/copy_to_user.S +++ b/arch/arm/lib/copy_to_user.S @@ -45,7 +45,16 @@ .endm .macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort - ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} +#ifdef CONFIG_USE_LDRDSTRD_OVER_LDMSTM + tst \ptr, #7 + ldreqd \reg1, \reg2, [\ptr], #8 + ldreqd \reg3, \reg4, [\ptr], #8 + ldreqd \reg5, \reg6, [\ptr], #8 + ldreqd \reg7, \reg8, [\ptr], #8 + ldmneia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} +#else + ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} +#endif .endm .macro ldr1b ptr reg cond=al abort diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S index adbccc6e2d..db49a300c8 100644 --- a/arch/arm/lib/memcpy.S +++ b/arch/arm/lib/memcpy.S @@ -16,12 +16,27 @@ #define LDR1W_SHIFT 0 #define STR1W_SHIFT 0 +#ifdef CONFIG_USE_LDRDSTRD_OVER_LDMSTM +/* Enforce destination cache line alignment */ +#undef CALGN +#define CALGN(x...) 
x +#endif + .macro ldr1w ptr reg abort W(ldr) \reg, [\ptr], #4 .endm .macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort - ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} +#ifdef CONFIG_USE_LDRDSTRD_OVER_LDMSTM + tst \ptr, #7 + ldreqd \reg1, \reg2, [\ptr], #8 + ldreqd \reg3, \reg4, [\ptr], #8 + ldreqd \reg5, \reg6, [\ptr], #8 + ldreqd \reg7, \reg8, [\ptr], #8 + ldmneia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} +#else + ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} +#endif .endm .macro ldr1b ptr reg cond=al abort @@ -33,7 +48,14 @@ .endm .macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort - stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} +#ifdef CONFIG_USE_LDRDSTRD_OVER_LDMSTM + strd \reg1, \reg2, [\ptr], #8 + strd \reg3, \reg4, [\ptr], #8 + strd \reg5, \reg6, [\ptr], #8 + strd \reg7, \reg8, [\ptr], #8 +#else + stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} +#endif .endm .macro str1b ptr reg cond=al abort -- 1.7.9.rc2 ^ permalink raw reply related [flat|nested] 25+ messages in thread
* [PATCH 2/2] ARM: lib: use LDRD/STRD for data copy 2012-03-19 7:02 [PATCH] ARM: lib: use LDRD/STRD for data copy Boojin Kim ` (2 preceding siblings ...) 2012-03-27 0:26 ` [PATCH 1/2] ARM: lib: Add optimized memcpy with 64 byte pld size Boojin Kim @ 2012-03-27 0:27 ` Boojin Kim 2012-03-27 7:40 ` Russell King - ARM Linux 3 siblings, 1 reply; 25+ messages in thread From: Boojin Kim @ 2012-03-27 0:27 UTC (permalink / raw) To: linux-arm-kernel This patch uses LDRD/STRD that loads and stores data as DWORD unit. It brings better performance than LDRM/STRM with cortex-a15. Signed-off-by: Boojin Kim <boojin.kim@samsung.com> Cc: Russell King <rmk+kernel@arm.linux.org.uk> --- arch/arm/lib/copy_from_user.S | 9 +++++++++ arch/arm/lib/copy_template.S | 14 ++++++++------ arch/arm/lib/copy_to_user.S | 9 +++++++++ arch/arm/lib/memcpy.S | 9 +++++++++ 4 files changed, 35 insertions(+), 6 deletions(-) diff --git a/arch/arm/lib/copy_from_user.S b/arch/arm/lib/copy_from_user.S index 66a477a..dd1fe01 100644 --- a/arch/arm/lib/copy_from_user.S +++ b/arch/arm/lib/copy_from_user.S @@ -68,6 +68,15 @@ stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} .endm + .macro cpy8w dst src reg1 reg2 abort + .irp offset, #0, #8, #16, #24 + ldr1w \src, \reg1, \abort + ldr1w \src, \reg2, \abort + strd \reg1, \reg2, [\dst, \offset] + .endr + add \dst, \dst, #32 + .endm + .macro str1b ptr reg cond=al abort str\cond\()b \reg, [\ptr], #1 .endm diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S index 7dc5b8c..a2dd5e2 100644 --- a/arch/arm/lib/copy_template.S +++ b/arch/arm/lib/copy_template.S @@ -47,6 +47,11 @@ * Same as their ldr* counterparts, but data is stored to 'ptr' location * rather than being loaded. * + * cpy8w src dst reg1 reg2 abort + * + * This loads eight words starting from 'src' and stores them to 'dst'. + * The 'abort' argument is used for fixup tables. 
+ * * enter reg1 reg2 * * Preserve the provided registers on the stack plus any additional @@ -102,18 +107,15 @@ PLD( pld [r1, #PLDSIZE*3-4] ) 3: PLD( pld [r1, #PLDSIZE*4-4] ) -4: ldr8w r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f - str8w r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f - ldr8w r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f - str8w r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f +4: cpy8w r0, r1, r4, r5, abort=20f + cpy8w r0, r1, r4, r5, abort=20f subs r2, r2, #PLDSIZE bge 3b PLD( cmn r2, #(PLDSIZE*3) ) PLD( bge 4b ) PLD( cmn r2, #(PLDSIZE*4-32) ) PLD( blt 5f) -.32cpy: ldr8w r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f - str8w r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f +32copy: cpy8w r0, r1, r4, r5, abort=20f #else 2: PLD( subs r2, r2, #(PLDSIZE*3) ) PLD( pld [r1, #(PLDSIZE-4)] ) diff --git a/arch/arm/lib/copy_to_user.S b/arch/arm/lib/copy_to_user.S index d066df6..fc8ea7a 100644 --- a/arch/arm/lib/copy_to_user.S +++ b/arch/arm/lib/copy_to_user.S @@ -71,6 +71,15 @@ str1w \ptr, \reg8, \abort .endm + .macro cpy8w dst src reg1 reg2 abort + .irp offset, #0, #8, #16, #24 + ldrd \reg1, \reg2, [\src, \offset] + str1w \dst, \reg1, \abort + str1w \dst, \reg2, \abort + .endr + add \src, \src, #32 + .endm + .macro str1b ptr reg cond=al abort strusr \reg, \ptr, 1, \cond, abort=\abort .endm diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S index a9b9e22..5b4ca72 100644 --- a/arch/arm/lib/memcpy.S +++ b/arch/arm/lib/memcpy.S @@ -40,6 +40,15 @@ stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} .endm + .macro cpy8w dst src reg1 reg2 abort + .irp offset, #0, #8, #16, #24 + ldrd \reg1, \reg2, [\src, \offset] + strd \reg1, \reg2, [\dst, \offset] + .endr + add \src, \src, #32 + add \dst, \dst, #32 + .endm + .macro str1b ptr reg cond=al abort str\cond\()b \reg, [\ptr], #1 .endm -- 1.7.1 ^ permalink raw reply related [flat|nested] 25+ messages in thread
* [PATCH 2/2] ARM: lib: use LDRD/STRD for data copy 2012-03-27 0:27 ` [PATCH 2/2] ARM: lib: use LDRD/STRD for data copy Boojin Kim @ 2012-03-27 7:40 ` Russell King - ARM Linux 2012-03-28 0:19 ` Boojin Kim 0 siblings, 1 reply; 25+ messages in thread From: Russell King - ARM Linux @ 2012-03-27 7:40 UTC (permalink / raw) To: linux-arm-kernel On Tue, Mar 27, 2012 at 09:27:52AM +0900, Boojin Kim wrote: > This patch uses LDRD/STRD that loads and stores data as DWORD unit. > It brings better performance than LDRM/STRM with cortex-a15. Why should I bother looking at this rubbish? You've been told before that using ldrd and strd unconditionally is not acceptable. Stop wasting people's review time. ^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH 2/2] ARM: lib: use LDRD/STRD for data copy 2012-03-27 7:40 ` Russell King - ARM Linux @ 2012-03-28 0:19 ` Boojin Kim 2012-03-28 4:10 ` Boojin Kim 0 siblings, 1 reply; 25+ messages in thread From: Boojin Kim @ 2012-03-28 0:19 UTC (permalink / raw) To: linux-arm-kernel Russell King wrote: > Sent: Tuesday, March 27, 2012 4:41 PM > To: Boojin Kim > Cc: linux-arm-kernel at lists.infradead.org; 'Catalin Marinas'; 'Nicolas Pitre'; > kgene.kim at samsung.com > Subject: Re: [PATCH 2/2] ARM: lib: use LDRD/STRD for data copy > > On Tue, Mar 27, 2012 at 09:27:52AM +0900, Boojin Kim wrote: > > This patch uses LDRD/STRD that loads and stores data as DWORD unit. > > It brings better performance than LDRM/STRM with cortex-a15. > > Why should I bother looking at this rubbish? You've been told before > that using ldrd and strd unconditionally is not acceptable. Stop > wasting peoples review time. This patch brings better memcpy results on Cortex-A15. Please see the following results, which I measured on a Cortex-A15. The 2nd column is the default memcpy. The 3rd column is memcpy using ldrd/strd with this patch. The 4th column is memcpy using ldrd/strd plus the PLD optimization from my 1st patch. 
=================================================================== Memcpy performance (unit: size: Bytes, results: MBps) =================================================================== size default ldrd/strd ldrd/strd + PLD opti =================================================================== 64 1245.615434 1565.004006 1565.004006 128 1743.861607 2393.535539 2491.230867 256 2199.46509 3212.376645 3487.723214 512 2569.901316 4137.976695 4479.644495 1024 2880.715339 4245.923913 5250.336022 2048 3623.608534 4752.128954 5365.728022 4096 4120.516878 5119.593709 5710.891813 8192 4431.366988 5126.312336 5440.45961 16384 4603.712434 5040.322581 5529.016277 32768 4559.381383 4712.002413 5238.893546 65536 3483.446661 3513.802215 3516.965843 131072 3495.623479 3498.460677 3506.31136 262144 3484.02921 3475.987876 3499.783013 524288 3427.662608 3430.037525 3454.637159 1048576 2263.903195 2225.9222 2458.911587 2097152 1732.182125 1703.940362 1833.96223 4194304 1713.663165 1708.351146 1781.780052 =================================================================== I think it brings meaningful results on cache boundary. So I tried it again. And, I saw your review. So, I make this patch be effective on cortex-a15 only if machine selects it. Thanks for your time and review :) ^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH 2/2] ARM: lib: use LDRD/STRD for data copy 2012-03-28 0:19 ` Boojin Kim @ 2012-03-28 4:10 ` Boojin Kim 0 siblings, 0 replies; 25+ messages in thread From: Boojin Kim @ 2012-03-28 4:10 UTC (permalink / raw) To: linux-arm-kernel Boojin Kim wrote: > > Cc: linux-arm-kernel at lists.infradead.org; 'Catalin Marinas'; 'Nicolas Pitre'; > > kgene.kim at samsung.com > > Subject: Re: [PATCH 2/2] ARM: lib: use LDRD/STRD for data copy > > > > On Tue, Mar 27, 2012 at 09:27:52AM +0900, Boojin Kim wrote: > > > This patch uses LDRD/STRD that loads and stores data as DWORD unit. > > > It brings better performance than LDRM/STRM with cortex-a15. > > > > Why should I bother looking at this rubbish? You've been told before > > that using ldrd and strd unconditionally is not acceptable. Stop > > wasting peoples review time. > This patch brings better memcpy results on Cortex-a15. Additionally, the following is ARM's memcpy implementation, which mentions that LDRD/STRD is better for Cortex-A15. http://sourceware.org/ml/newlib/2011/msg00469.html I'm trying to optimize memcpy for Cortex-A15 because it seems to be worthwhile. Thank you. > _______________________________________________ > linux-arm-kernel mailing list > linux-arm-kernel at lists.infradead.org > http://lists.infradead.org/mailman/listinfo/linux-arm-kernel ^ permalink raw reply [flat|nested] 25+ messages in thread
end of thread, other threads:[~2012-03-29 4:00 UTC | newest] Thread overview: 25+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2012-03-19 7:02 [PATCH] ARM: lib: use LDRD/STRD for data copy Boojin Kim 2012-03-19 8:55 ` Russell King - ARM Linux 2012-03-19 14:36 ` Rob Herring 2012-03-19 15:41 ` Russell King - ARM Linux 2012-03-19 16:34 ` Måns Rullgård 2012-03-19 16:36 ` Rob Herring 2012-03-19 16:53 ` Nicolas Pitre 2012-03-19 17:11 ` Måns Rullgård 2012-03-19 20:11 ` Michael Hope 2012-03-20 0:21 ` Boojin Kim 2012-03-19 14:10 ` Nicolas Pitre 2012-03-20 0:05 ` Boojin Kim 2012-03-27 0:26 ` [PATCH 1/2] ARM: lib: Add optimized memcpy with 64 byte pld size Boojin Kim 2012-03-27 2:35 ` Nicolas Pitre 2012-03-28 0:28 ` Boojin Kim 2012-03-28 5:23 ` Nicolas Pitre 2012-03-29 4:00 ` [PATCH 0/4] memcpy optimized with strd/ldrd Nicolas Pitre 2012-03-29 4:00 ` [PATCH 1/4] ARM: copy_template.S: move some registers around Nicolas Pitre 2012-03-29 4:00 ` [PATCH 2/4] ARM: copy_template.S: rework the unaligned copy loop Nicolas Pitre 2012-03-29 4:00 ` [PATCH 3/4] ARM: copy_template.S: enforce contigous register set with memory accessors Nicolas Pitre 2012-03-29 4:00 ` [PATCH 4/4] ARM: option to select LDRD/STRD optimized memory copy Nicolas Pitre 2012-03-27 0:27 ` [PATCH 2/2] ARM: lib: use LDRD/STRD for data copy Boojin Kim 2012-03-27 7:40 ` Russell King - ARM Linux 2012-03-28 0:19 ` Boojin Kim 2012-03-28 4:10 ` Boojin Kim
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).