* [PATCH] ARM: copy_page.S: take into account the size of the cache line
@ 2009-07-09 16:23 Kirill A. Shutemov
2009-07-10 23:51 ` Jamie Lokier
0 siblings, 1 reply; 5+ messages in thread
From: Kirill A. Shutemov @ 2009-07-09 16:23 UTC (permalink / raw)
To: ARM Linux Mailing List
Cc: linux-kernel, Kirill A. Shutemov, Siarhei Siamashka
From: Kirill A. Shutemov <kirill@shutemov.name>
Optimized version of copy_page() was written with assumption that cache
line size is 32 bytes. On Cortex-A8 cache line size is 64 bytes.
This patch tries to generalize copy_page() to work with any cache line
size, provided the cache line size is a multiple of 16 and the page size
is a multiple of twice the cache line size.
Unfortunately, the kernel doesn't provide a macro with the correct cache
line size. L1_CACHE_SHIFT is 5 on any ARM. So we have to define a macro
for this purpose ourselves.
After this optimization we've got a ~25% speedup on OMAP3 (tested in
userspace).
There is a test for kernelspace which triggers copy-on-write after fork():
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#define BUF_SIZE (10000*4096)
#define NFORK 200
int main(int argc, char **argv)
{
char *buf = malloc(BUF_SIZE);
int i;
memset(buf, 0, BUF_SIZE);
for(i = 0; i < NFORK; i++) {
if (fork()) {
wait(NULL);
} else {
int j;
for(j = 0; j < BUF_SIZE; j+= 4096)
buf[j] = (j & 0xFF) + 1;
break;
}
}
free(buf);
return 0;
}
Before optimization this test takes ~66 seconds, after optimization
takes ~56 seconds.
Signed-off-by: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Signed-off-by: Kirill A. Shutemov <kirill@shutemov.name>
---
arch/arm/lib/copy_page.S | 21 +++++++++++++--------
1 files changed, 13 insertions(+), 8 deletions(-)
diff --git a/arch/arm/lib/copy_page.S b/arch/arm/lib/copy_page.S
index 6ae04db..3bd1b9c 100644
--- a/arch/arm/lib/copy_page.S
+++ b/arch/arm/lib/copy_page.S
@@ -13,7 +13,13 @@
#include <asm/assembler.h>
#include <asm/asm-offsets.h>
-#define COPY_COUNT (PAGE_SZ/64 PLD( -1 ))
+#if CONFIG_ARCH_OMAP3
+#define CACHE_LINE_SZ 64
+#else
+#define CACHE_LINE_SZ 32
+#endif
+
+#define COPY_COUNT (PAGE_SZ/(2 * CACHE_LINE_SZ) PLD( -1 ))
.text
.align 5
@@ -26,17 +32,16 @@
ENTRY(copy_page)
stmfd sp!, {r4, lr} @ 2
PLD( pld [r1, #0] )
- PLD( pld [r1, #32] )
+ PLD( pld [r1, #CACHE_LINE_SZ] )
mov r2, #COPY_COUNT @ 1
ldmia r1!, {r3, r4, ip, lr} @ 4+1
-1: PLD( pld [r1, #64] )
- PLD( pld [r1, #96] )
-2: stmia r0!, {r3, r4, ip, lr} @ 4
- ldmia r1!, {r3, r4, ip, lr} @ 4+1
- stmia r0!, {r3, r4, ip, lr} @ 4
- ldmia r1!, {r3, r4, ip, lr} @ 4+1
+1: PLD( pld [r1, #(2*CACHE_LINE_SZ)])
+ PLD( pld [r1, #(3*CACHE_LINE_SZ)])
+2:
+ .rept (2 * (CACHE_LINE_SZ) / 16 - 1)
stmia r0!, {r3, r4, ip, lr} @ 4
ldmia r1!, {r3, r4, ip, lr} @ 4
+ .endr
subs r2, r2, #1 @ 1
stmia r0!, {r3, r4, ip, lr} @ 4
ldmgtia r1!, {r3, r4, ip, lr} @ 4
--
1.6.3.3
^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [PATCH] ARM: copy_page.S: take into account the size of the cache line
2009-07-09 16:23 [PATCH] ARM: copy_page.S: take into account the size of the cache line Kirill A. Shutemov
@ 2009-07-10 23:51 ` Jamie Lokier
2009-07-15 13:12 ` Siarhei Siamashka
0 siblings, 1 reply; 5+ messages in thread
From: Jamie Lokier @ 2009-07-10 23:51 UTC (permalink / raw)
To: Kirill A. Shutemov
Cc: ARM Linux Mailing List, linux-kernel, Siarhei Siamashka
Kirill A. Shutemov wrote:
> From: Kirill A. Shutemov <kirill@shutemov.name>
>
> Optimized version of copy_page() was written with assumption that cache
> line size is 32 bytes. On Cortex-A8 cache line size is 64 bytes.
>
> This patch tries to generalize copy_page() to work with any cache line
> size if cache line size is multiple of 16 and page size is multiple of
> two cache line size.
>
> Unfortunately, kernel doesn't provide a macros with correct cache size.
> L1_CACHE_SHIFT is 5 on any ARM. So we have to define macros for this
> propose by ourself.
Why don't you fix L1_CACHE_SHIFT for Cortex-A8?
-- Jamie
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] ARM: copy_page.S: take into account the size of the cache line
2009-07-10 23:51 ` Jamie Lokier
@ 2009-07-15 13:12 ` Siarhei Siamashka
2009-07-15 13:55 ` Russell King - ARM Linux
0 siblings, 1 reply; 5+ messages in thread
From: Siarhei Siamashka @ 2009-07-15 13:12 UTC (permalink / raw)
To: ext Jamie Lokier
Cc: Kirill A. Shutemov, ARM Linux Mailing List,
linux-kernel@vger.kernel.org
On Saturday 11 July 2009 02:51:23 ext Jamie Lokier wrote:
> Kirill A. Shutemov wrote:
> > From: Kirill A. Shutemov <kirill@shutemov.name>
> >
> > Optimized version of copy_page() was written with assumption that cache
> > line size is 32 bytes. On Cortex-A8 cache line size is 64 bytes.
> >
> > This patch tries to generalize copy_page() to work with any cache line
> > size if cache line size is multiple of 16 and page size is multiple of
> > two cache line size.
> >
> > Unfortunately, kernel doesn't provide a macros with correct cache size.
> > L1_CACHE_SHIFT is 5 on any ARM. So we have to define macros for this
> > propose by ourself.
>
> Why don't you fix L1_CACHE_SHIFT for Cortex-A8?
That's the plan.
Right now Kirill is on vacation, but I think he can continue investigating
this when he is back, and he will come up with a clean solution.
Fixing L1_CACHE_SHIFT may open a whole can of worms (fixing some old
bugs, or breaking some things that might work only when incorrectly
assuming that cache line is always 32 bytes). For example, looks like this
thing in 'arch/arm/include/asm/dma-mapping.h' may be dangerous for
ARM cores, which have cache line size different from 32:
static inline int dma_get_cache_alignment(void)
{
return 32;
}
--
Best regards,
Siarhei Siamashka
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] ARM: copy_page.S: take into account the size of the cache line
2009-07-15 13:12 ` Siarhei Siamashka
@ 2009-07-15 13:55 ` Russell King - ARM Linux
0 siblings, 0 replies; 5+ messages in thread
From: Russell King - ARM Linux @ 2009-07-15 13:55 UTC (permalink / raw)
To: Siarhei Siamashka
Cc: ext Jamie Lokier, Kirill A. Shutemov, ARM Linux Mailing List,
linux-kernel@vger.kernel.org
On Wed, Jul 15, 2009 at 04:12:19PM +0300, Siarhei Siamashka wrote:
> On Saturday 11 July 2009 02:51:23 ext Jamie Lokier wrote:
> > Kirill A. Shutemov wrote:
> > > From: Kirill A. Shutemov <kirill@shutemov.name>
> > >
> > > Optimized version of copy_page() was written with assumption that cache
> > > line size is 32 bytes. On Cortex-A8 cache line size is 64 bytes.
> > >
> > > This patch tries to generalize copy_page() to work with any cache line
> > > size if cache line size is multiple of 16 and page size is multiple of
> > > two cache line size.
> > >
> > > Unfortunately, kernel doesn't provide a macros with correct cache size.
> > > L1_CACHE_SHIFT is 5 on any ARM. So we have to define macros for this
> > > propose by ourself.
> >
> > Why don't you fix L1_CACHE_SHIFT for Cortex-A8?
>
> That's the plan.
L1_CACHE_SHIFT is supposed to be a constant and the maximum cache line
shift for the processors in use.
Other functions (eg, dma_get_cache_alignment) can return either this or
the real cache line size.
^ permalink raw reply [flat|nested] 5+ messages in thread
* [PATCH] ARM: copy_page.S: take into account the size of the cache line
2009-09-02 13:24 [PATCH 2/2] " Russell King - ARM Linux
@ 2009-09-02 17:19 ` Kirill A. Shutemov
0 siblings, 0 replies; 5+ messages in thread
From: Kirill A. Shutemov @ 2009-09-02 17:19 UTC (permalink / raw)
To: linux-arm-kernel, linux-kernel
Cc: Koskinen Aaro, Bityutskiy Artem, Moiseichuk Leonid,
Siarhei Siamashka, Kirill A. Shutemov
Optimized version of copy_page() was written with assumption that cache
line size is 32 bytes. On Cortex-A8 cache line size is 64 bytes.
This patch tries to generalize copy_page() to work with any cache line
size, provided the cache line size is a multiple of 16 and the page size
is a multiple of twice the cache line size.
After this optimization we've got a ~25% speedup on OMAP3 (tested in
userspace).
There is a test for kernelspace which triggers copy-on-write after fork():
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#define BUF_SIZE (10000*4096)
#define NFORK 200
int main(int argc, char **argv)
{
char *buf = malloc(BUF_SIZE);
int i;
memset(buf, 0, BUF_SIZE);
for(i = 0; i < NFORK; i++) {
if (fork()) {
wait(NULL);
} else {
int j;
for(j = 0; j < BUF_SIZE; j+= 4096)
buf[j] = (j & 0xFF) + 1;
break;
}
}
free(buf);
return 0;
}
Before optimization this test takes ~66 seconds, after optimization
takes ~56 seconds.
Signed-off-by: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Signed-off-by: Kirill A. Shutemov <kirill@shutemov.name>
---
arch/arm/lib/copy_page.S | 16 ++++++++--------
1 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/arch/arm/lib/copy_page.S b/arch/arm/lib/copy_page.S
index 6ae04db..6ee2f67 100644
--- a/arch/arm/lib/copy_page.S
+++ b/arch/arm/lib/copy_page.S
@@ -12,8 +12,9 @@
#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/asm-offsets.h>
+#include <asm/cache.h>
-#define COPY_COUNT (PAGE_SZ/64 PLD( -1 ))
+#define COPY_COUNT (PAGE_SZ / (2 * L1_CACHE_BYTES) PLD( -1 ))
.text
.align 5
@@ -26,17 +27,16 @@
ENTRY(copy_page)
stmfd sp!, {r4, lr} @ 2
PLD( pld [r1, #0] )
- PLD( pld [r1, #32] )
+ PLD( pld [r1, #L1_CACHE_BYTES] )
mov r2, #COPY_COUNT @ 1
ldmia r1!, {r3, r4, ip, lr} @ 4+1
-1: PLD( pld [r1, #64] )
- PLD( pld [r1, #96] )
-2: stmia r0!, {r3, r4, ip, lr} @ 4
- ldmia r1!, {r3, r4, ip, lr} @ 4+1
- stmia r0!, {r3, r4, ip, lr} @ 4
- ldmia r1!, {r3, r4, ip, lr} @ 4+1
+1: PLD( pld [r1, #2 * L1_CACHE_BYTES])
+ PLD( pld [r1, #3 * L1_CACHE_BYTES])
+2:
+ .rept (2 * L1_CACHE_BYTES / 16 - 1)
stmia r0!, {r3, r4, ip, lr} @ 4
ldmia r1!, {r3, r4, ip, lr} @ 4
+ .endr
subs r2, r2, #1 @ 1
stmia r0!, {r3, r4, ip, lr} @ 4
ldmgtia r1!, {r3, r4, ip, lr} @ 4
--
1.6.4.2
^ permalink raw reply related [flat|nested] 5+ messages in thread
end of thread, other threads:[~2009-09-02 14:20 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-07-09 16:23 [PATCH] ARM: copy_page.S: take into account the size of the cache line Kirill A. Shutemov
2009-07-10 23:51 ` Jamie Lokier
2009-07-15 13:12 ` Siarhei Siamashka
2009-07-15 13:55 ` Russell King - ARM Linux
-- strict thread matches above, loose matches on Subject: below --
2009-09-02 13:24 [PATCH 2/2] " Russell King - ARM Linux
2009-09-02 17:19 ` [PATCH] " Kirill A. Shutemov
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox