* [PATCH 1/2] ARM: Introduce ARM_L1_CACHE_SHIFT to define cache line size @ 2009-09-02 16:11 Kirill A. Shutemov 2009-09-02 16:11 ` [PATCH 2/2] ARM: copy_page.S: take into account the size of the cache line Kirill A. Shutemov 0 siblings, 1 reply; 4+ messages in thread From: Kirill A. Shutemov @ 2009-09-02 16:11 UTC (permalink / raw) To: linux-arm-kernel Currently kernel believes that all ARM CPUs have L1_CACHE_SHIFT == 5. It's not true at least for CPUs based on Cortex-A8. List of CPUs with cache line size != 32 should be expanded later. Signed-off-by: Kirill A. Shutemov <kirill@shutemov.name> --- arch/arm/include/asm/cache.h | 2 +- arch/arm/mm/Kconfig | 5 +++++ 2 files changed, 6 insertions(+), 1 deletions(-) diff --git a/arch/arm/include/asm/cache.h b/arch/arm/include/asm/cache.h index feaa75f..2ee7743 100644 --- a/arch/arm/include/asm/cache.h +++ b/arch/arm/include/asm/cache.h @@ -4,7 +4,7 @@ #ifndef __ASMARM_CACHE_H #define __ASMARM_CACHE_H -#define L1_CACHE_SHIFT 5 +#define L1_CACHE_SHIFT (CONFIG_ARM_L1_CACHE_SHIFT) #define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) /* diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig index 83c025e..3c37d4c 100644 --- a/arch/arm/mm/Kconfig +++ b/arch/arm/mm/Kconfig @@ -771,3 +771,8 @@ config CACHE_XSC3L2 select OUTER_CACHE help This option enables the L2 cache on XScale3. + +config ARM_L1_CACHE_SHIFT + int + default 6 if ARCH_OMAP3 + default 5 -- 1.6.3.4 ^ permalink raw reply related [flat|nested] 4+ messages in thread
* [PATCH 2/2] ARM: copy_page.S: take into account the size of the cache line 2009-09-02 16:11 [PATCH 1/2] ARM: Introduce ARM_L1_CACHE_SHIFT to define cache line size Kirill A. Shutemov @ 2009-09-02 16:11 ` Kirill A. Shutemov 2009-09-02 13:24 ` Russell King - ARM Linux 0 siblings, 1 reply; 4+ messages in thread From: Kirill A. Shutemov @ 2009-09-02 16:11 UTC (permalink / raw) To: linux-arm-kernel The optimized version of copy_page() was written with the assumption that the cache line size is 32 bytes. On Cortex-A8 the cache line size is 64 bytes. This patch tries to generalize copy_page() to work with any cache line size, provided that the cache line size is a multiple of 16 and the page size is a multiple of twice the cache line size. After this optimization we've got ~25% speedup on OMAP3 (tested in userspace). There is a test for kernelspace which triggers copy-on-write after fork(): #include <stdlib.h> #include <string.h> #include <unistd.h> #define BUF_SIZE (10000*4096) #define NFORK 200 int main(int argc, char **argv) { char *buf = malloc(BUF_SIZE); int i; memset(buf, 0, BUF_SIZE); for(i = 0; i < NFORK; i++) { if (fork()) { wait(NULL); } else { int j; for(j = 0; j < BUF_SIZE; j+= 4096) buf[j] = (j & 0xFF) + 1; break; } } free(buf); return 0; } Before the optimization this test takes ~66 seconds; after the optimization it takes ~56 seconds. Signed-off-by: Siarhei Siamashka <siarhei.siamashka@nokia.com> Signed-off-by: Kirill A. 
Shutemov <kirill@shutemov.name> --- arch/arm/lib/copy_page.S | 15 +++++++-------- 1 files changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/arm/lib/copy_page.S b/arch/arm/lib/copy_page.S index 6ae04db..2107b4a 100644 --- a/arch/arm/lib/copy_page.S +++ b/arch/arm/lib/copy_page.S @@ -13,7 +13,7 @@ #include <asm/assembler.h> #include <asm/asm-offsets.h> -#define COPY_COUNT (PAGE_SZ/64 PLD( -1 )) +#define COPY_COUNT (PAGE_SZ/(2 * L1_CACHE_BYTES) PLD( -1 )) .text .align 5 @@ -26,17 +26,16 @@ ENTRY(copy_page) stmfd sp!, {r4, lr} @ 2 PLD( pld [r1, #0] ) - PLD( pld [r1, #32] ) + PLD( pld [r1, #L1_CACHE_BYTES] ) mov r2, #COPY_COUNT @ 1 ldmia r1!, {r3, r4, ip, lr} @ 4+1 -1: PLD( pld [r1, #64] ) - PLD( pld [r1, #96] ) -2: stmia r0!, {r3, r4, ip, lr} @ 4 - ldmia r1!, {r3, r4, ip, lr} @ 4+1 - stmia r0!, {r3, r4, ip, lr} @ 4 - ldmia r1!, {r3, r4, ip, lr} @ 4+1 +1: PLD( pld [r1, #(2*L1_CACHE_BYTES)]) + PLD( pld [r1, #(3*L1_CACHE_BYTES)]) +2: + .rept (2 * (L1_CACHE_BYTES) / 16 - 1) stmia r0!, {r3, r4, ip, lr} @ 4 ldmia r1!, {r3, r4, ip, lr} @ 4 + .endr subs r2, r2, #1 @ 1 stmia r0!, {r3, r4, ip, lr} @ 4 ldmgtia r1!, {r3, r4, ip, lr} @ 4 -- 1.6.3.4 ^ permalink raw reply related [flat|nested] 4+ messages in thread
* [PATCH 2/2] ARM: copy_page.S: take into account the size of the cache line 2009-09-02 16:11 ` [PATCH 2/2] ARM: copy_page.S: take into account the size of the cache line Kirill A. Shutemov @ 2009-09-02 13:24 ` Russell King - ARM Linux 2009-09-02 17:19 ` [PATCH] " Kirill A. Shutemov 0 siblings, 1 reply; 4+ messages in thread From: Russell King - ARM Linux @ 2009-09-02 13:24 UTC (permalink / raw) To: linux-arm-kernel On Wed, Sep 02, 2009 at 07:11:53PM +0300, Kirill A. Shutemov wrote: > ENTRY(copy_page) > stmfd sp!, {r4, lr} @ 2 > PLD( pld [r1, #0] ) > - PLD( pld [r1, #32] ) > + PLD( pld [r1, #L1_CACHE_BYTES] ) > mov r2, #COPY_COUNT @ 1 > ldmia r1!, {r3, r4, ip, lr} @ 4+1 > -1: PLD( pld [r1, #64] ) > - PLD( pld [r1, #96] ) > -2: stmia r0!, {r3, r4, ip, lr} @ 4 > - ldmia r1!, {r3, r4, ip, lr} @ 4+1 > - stmia r0!, {r3, r4, ip, lr} @ 4 > - ldmia r1!, {r3, r4, ip, lr} @ 4+1 > +1: PLD( pld [r1, #(2*L1_CACHE_BYTES)]) > + PLD( pld [r1, #(3*L1_CACHE_BYTES)]) I really hate unnecessary parens - please remove them, and put a space each side of the '*'. > +2: > + .rept (2 * (L1_CACHE_BYTES) / 16 - 1) Parens are not required around L1_CACHE_BYTES either. Other than that, patch is fine. ^ permalink raw reply [flat|nested] 4+ messages in thread
* [PATCH] ARM: copy_page.S: take into account the size of the cache line 2009-09-02 13:24 ` Russell King - ARM Linux @ 2009-09-02 17:19 ` Kirill A. Shutemov 0 siblings, 0 replies; 4+ messages in thread From: Kirill A. Shutemov @ 2009-09-02 17:19 UTC (permalink / raw) To: linux-arm-kernel The optimized version of copy_page() was written with the assumption that the cache line size is 32 bytes. On Cortex-A8 the cache line size is 64 bytes. This patch tries to generalize copy_page() to work with any cache line size, provided that the cache line size is a multiple of 16 and the page size is a multiple of twice the cache line size. After this optimization we've got ~25% speedup on OMAP3 (tested in userspace). There is a test for kernelspace which triggers copy-on-write after fork(): #include <stdlib.h> #include <string.h> #include <unistd.h> #define BUF_SIZE (10000*4096) #define NFORK 200 int main(int argc, char **argv) { char *buf = malloc(BUF_SIZE); int i; memset(buf, 0, BUF_SIZE); for(i = 0; i < NFORK; i++) { if (fork()) { wait(NULL); } else { int j; for(j = 0; j < BUF_SIZE; j+= 4096) buf[j] = (j & 0xFF) + 1; break; } } free(buf); return 0; } Before the optimization this test takes ~66 seconds; after the optimization it takes ~56 seconds. Signed-off-by: Siarhei Siamashka <siarhei.siamashka@nokia.com> Signed-off-by: Kirill A. 
Shutemov <kirill@shutemov.name> --- arch/arm/lib/copy_page.S | 16 ++++++++-------- 1 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/arm/lib/copy_page.S b/arch/arm/lib/copy_page.S index 6ae04db..6ee2f67 100644 --- a/arch/arm/lib/copy_page.S +++ b/arch/arm/lib/copy_page.S @@ -12,8 +12,9 @@ #include <linux/linkage.h> #include <asm/assembler.h> #include <asm/asm-offsets.h> +#include <asm/cache.h> -#define COPY_COUNT (PAGE_SZ/64 PLD( -1 )) +#define COPY_COUNT (PAGE_SZ / (2 * L1_CACHE_BYTES) PLD( -1 )) .text .align 5 @@ -26,17 +27,16 @@ ENTRY(copy_page) stmfd sp!, {r4, lr} @ 2 PLD( pld [r1, #0] ) - PLD( pld [r1, #32] ) + PLD( pld [r1, #L1_CACHE_BYTES] ) mov r2, #COPY_COUNT @ 1 ldmia r1!, {r3, r4, ip, lr} @ 4+1 -1: PLD( pld [r1, #64] ) - PLD( pld [r1, #96] ) -2: stmia r0!, {r3, r4, ip, lr} @ 4 - ldmia r1!, {r3, r4, ip, lr} @ 4+1 - stmia r0!, {r3, r4, ip, lr} @ 4 - ldmia r1!, {r3, r4, ip, lr} @ 4+1 +1: PLD( pld [r1, #2 * L1_CACHE_BYTES]) + PLD( pld [r1, #3 * L1_CACHE_BYTES]) +2: + .rept (2 * L1_CACHE_BYTES / 16 - 1) stmia r0!, {r3, r4, ip, lr} @ 4 ldmia r1!, {r3, r4, ip, lr} @ 4 + .endr subs r2, r2, #1 @ 1 stmia r0!, {r3, r4, ip, lr} @ 4 ldmgtia r1!, {r3, r4, ip, lr} @ 4 -- 1.6.4.2 ^ permalink raw reply related [flat|nested] 4+ messages in thread
end of thread, other threads:[~2009-09-02 17:19 UTC | newest] Thread overview: 4+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2009-09-02 16:11 [PATCH 1/2] ARM: Introduce ARM_L1_CACHE_SHIFT to define cache line size Kirill A. Shutemov 2009-09-02 16:11 ` [PATCH 2/2] ARM: copy_page.S: take into account the size of the cache line Kirill A. Shutemov 2009-09-02 13:24 ` Russell King - ARM Linux 2009-09-02 17:19 ` [PATCH] " Kirill A. Shutemov
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).