* [PATCH] add a clear_pages function to clear pages of higher order
@ 2005-03-10 20:35 Christoph Lameter
2005-03-10 21:38 ` Dave Hansen
0 siblings, 1 reply; 28+ messages in thread
From: Christoph Lameter @ 2005-03-10 20:35 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, Mel Gorman
The zeroing of a page of an arbitrary order in page_alloc.c and in hugetlb.c may benefit from a
clear_page that is capable of zeroing multiple pages at once. The following patch adds
a function "clear_pages" that is capable of clearing multiple contiguous pages at once.
This used to be part of the prezeroing patchset but there may be benefits
to huge pages and regular kernel code as well. Also Mel Gorman's patchset
to reduce fragmentation and introduce prezeroing in a different way may
benefit from this patch. The patch only provides a clear_pages function
for ia32, ia64, x86_64 and sparc64 (all tested). Other platforms may
provide a clear_pages function by defining __HAVE_ARCH_CLEAR_PAGES.
Patch against 2.6.11-bk6
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Index: linux-2.6.11/mm/page_alloc.c
===================================================================
--- linux-2.6.11.orig/mm/page_alloc.c 2005-03-10 10:57:06.000000000 -0800
+++ linux-2.6.11/mm/page_alloc.c 2005-03-10 10:57:10.000000000 -0800
@@ -628,11 +628,19 @@ void fastcall free_cold_page(struct page
free_hot_cold_page(page, 1);
}
-static inline void prep_zero_page(struct page *page, int order, int gfp_flags)
+void prep_zero_page(struct page *page, unsigned int order, unsigned int gfp_flags)
{
int i;
BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
+
+#ifdef __HAVE_ARCH_CLEAR_PAGES
+ if (!PageHighMem(page)) {
+ clear_pages(page_address(page), order);
+ return;
+ }
+#endif
+
for(i = 0; i < (1 << order); i++)
clear_highpage(page + i);
}
Index: linux-2.6.11/mm/hugetlb.c
===================================================================
--- linux-2.6.11.orig/mm/hugetlb.c 2005-03-01 23:38:12.000000000 -0800
+++ linux-2.6.11/mm/hugetlb.c 2005-03-10 10:57:10.000000000 -0800
@@ -78,7 +78,6 @@ void free_huge_page(struct page *page)
struct page *alloc_huge_page(void)
{
struct page *page;
- int i;
spin_lock(&hugetlb_lock);
page = dequeue_huge_page();
@@ -89,8 +88,7 @@ struct page *alloc_huge_page(void)
spin_unlock(&hugetlb_lock);
set_page_count(page, 1);
page[1].mapping = (void *)free_huge_page;
- for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
- clear_highpage(&page[i]);
+ prep_zero_page(page, HUGETLB_PAGE_ORDER, GFP_HIGHUSER);
return page;
}
Index: linux-2.6.11/include/asm-ia64/page.h
===================================================================
--- linux-2.6.11.orig/include/asm-ia64/page.h 2005-03-01 23:37:48.000000000 -0800
+++ linux-2.6.11/include/asm-ia64/page.h 2005-03-10 10:57:10.000000000 -0800
@@ -56,8 +56,10 @@
# ifdef __KERNEL__
# define STRICT_MM_TYPECHECKS
-extern void clear_page (void *page);
+extern void clear_pages (void *page, int order);
extern void copy_page (void *to, void *from);
+#define clear_page(__page) clear_pages(__page, 0)
+#define __HAVE_ARCH_CLEAR_PAGES
/*
* clear_user_page() and copy_user_page() can't be inline functions because
Index: linux-2.6.11/arch/ia64/kernel/ia64_ksyms.c
===================================================================
--- linux-2.6.11.orig/arch/ia64/kernel/ia64_ksyms.c 2005-03-01 23:38:08.000000000 -0800
+++ linux-2.6.11/arch/ia64/kernel/ia64_ksyms.c 2005-03-10 10:57:10.000000000 -0800
@@ -38,7 +38,7 @@ EXPORT_SYMBOL(__down_trylock);
EXPORT_SYMBOL(__up);
#include <asm/page.h>
-EXPORT_SYMBOL(clear_page);
+EXPORT_SYMBOL(clear_pages);
#ifdef CONFIG_VIRTUAL_MEM_MAP
#include <linux/bootmem.h>
Index: linux-2.6.11/arch/ia64/lib/clear_page.S
===================================================================
--- linux-2.6.11.orig/arch/ia64/lib/clear_page.S 2005-03-01 23:37:47.000000000 -0800
+++ linux-2.6.11/arch/ia64/lib/clear_page.S 2005-03-10 10:57:10.000000000 -0800
@@ -7,6 +7,7 @@
* 1/06/01 davidm Tuned for Itanium.
* 2/12/02 kchen Tuned for both Itanium and McKinley
* 3/08/02 davidm Some more tweaking
+ * 12/10/04 clameter Make it work on pages of order size
*/
#include <linux/config.h>
@@ -29,27 +30,33 @@
#define dst4 r11
#define dst_last r31
+#define totsize r14
-GLOBAL_ENTRY(clear_page)
+GLOBAL_ENTRY(clear_pages)
.prologue
- .regstk 1,0,0,0
- mov r16 = PAGE_SIZE/L3_LINE_SIZE-1 // main loop count, -1=repeat/until
+ .regstk 2,0,0,0
+ mov r16 = PAGE_SIZE/L3_LINE_SIZE // main loop count
+ mov totsize = PAGE_SIZE
.save ar.lc, saved_lc
mov saved_lc = ar.lc
-
+ ;;
.body
+ adds dst1 = 16, in0
mov ar.lc = (PREFETCH_LINES - 1)
mov dst_fetch = in0
- adds dst1 = 16, in0
adds dst2 = 32, in0
+ shl r16 = r16, in1
+ shl totsize = totsize, in1
;;
.fetch: stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE
adds dst3 = 48, in0 // executing this multiple times is harmless
br.cloop.sptk.few .fetch
+ add r16 = -1,r16
+ add dst_last = totsize, dst_fetch
+ adds dst4 = 64, in0
;;
- addl dst_last = (PAGE_SIZE - PREFETCH_LINES*L3_LINE_SIZE), dst_fetch
mov ar.lc = r16 // one L3 line per iteration
- adds dst4 = 64, in0
+ adds dst_last = -PREFETCH_LINES*L3_LINE_SIZE, dst_last
;;
#ifdef CONFIG_ITANIUM
// Optimized for Itanium
@@ -74,4 +81,4 @@ GLOBAL_ENTRY(clear_page)
;;
mov ar.lc = saved_lc // restore lc
br.ret.sptk.many rp
-END(clear_page)
+END(clear_pages)
Index: linux-2.6.11/include/asm-i386/page.h
===================================================================
--- linux-2.6.11.orig/include/asm-i386/page.h 2005-03-01 23:37:49.000000000 -0800
+++ linux-2.6.11/include/asm-i386/page.h 2005-03-10 10:57:10.000000000 -0800
@@ -18,7 +18,7 @@
#include <asm/mmx.h>
-#define clear_page(page) mmx_clear_page((void *)(page))
+#define clear_pages(page, order) mmx_clear_page((void *)(page),order)
#define copy_page(to,from) mmx_copy_page(to,from)
#else
@@ -28,11 +28,13 @@
* Maybe the K6-III ?
*/
-#define clear_page(page) memset((void *)(page), 0, PAGE_SIZE)
+#define clear_pages(page, order) memset((void *)(page), 0, PAGE_SIZE << (order))
#define copy_page(to,from) memcpy((void *)(to), (void *)(from), PAGE_SIZE)
#endif
+#define __HAVE_ARCH_CLEAR_PAGES
+#define clear_page(page) clear_pages(page, 0)
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
Index: linux-2.6.11/include/asm-i386/mmx.h
===================================================================
--- linux-2.6.11.orig/include/asm-i386/mmx.h 2005-03-01 23:38:09.000000000 -0800
+++ linux-2.6.11/include/asm-i386/mmx.h 2005-03-10 10:57:10.000000000 -0800
@@ -8,7 +8,7 @@
#include <linux/types.h>
extern void *_mmx_memcpy(void *to, const void *from, size_t size);
-extern void mmx_clear_page(void *page);
+extern void mmx_clear_page(void *page, int order);
extern void mmx_copy_page(void *to, void *from);
#endif
Index: linux-2.6.11/arch/i386/lib/mmx.c
===================================================================
--- linux-2.6.11.orig/arch/i386/lib/mmx.c 2005-03-01 23:38:09.000000000 -0800
+++ linux-2.6.11/arch/i386/lib/mmx.c 2005-03-10 10:57:10.000000000 -0800
@@ -128,7 +128,7 @@ void *_mmx_memcpy(void *to, const void *
* other MMX using processors do not.
*/
-static void fast_clear_page(void *page)
+static void fast_clear_page(void *page, int order)
{
int i;
@@ -138,7 +138,7 @@ static void fast_clear_page(void *page)
" pxor %%mm0, %%mm0\n" : :
);
- for(i=0;i<4096/64;i++)
+ for(i=0;i<((4096/64) << order);i++)
{
__asm__ __volatile__ (
" movntq %%mm0, (%0)\n"
@@ -257,7 +257,7 @@ static void fast_copy_page(void *to, voi
* Generic MMX implementation without K7 specific streaming
*/
-static void fast_clear_page(void *page)
+static void fast_clear_page(void *page, int order)
{
int i;
@@ -267,7 +267,7 @@ static void fast_clear_page(void *page)
" pxor %%mm0, %%mm0\n" : :
);
- for(i=0;i<4096/128;i++)
+ for(i=0;i<((4096/128) << order);i++)
{
__asm__ __volatile__ (
" movq %%mm0, (%0)\n"
@@ -359,23 +359,23 @@ static void fast_copy_page(void *to, voi
* Favour MMX for page clear and copy.
*/
-static void slow_zero_page(void * page)
+static void slow_clear_page(void * page, int order)
{
int d0, d1;
__asm__ __volatile__( \
"cld\n\t" \
"rep ; stosl" \
: "=&c" (d0), "=&D" (d1)
- :"a" (0),"1" (page),"0" (1024)
+ :"a" (0),"1" (page),"0" (1024 << order)
:"memory");
}
-
-void mmx_clear_page(void * page)
+
+void mmx_clear_page(void * page, int order)
{
if(unlikely(in_interrupt()))
- slow_zero_page(page);
+ slow_clear_page(page, order);
else
- fast_clear_page(page);
+ fast_clear_page(page, order);
}
static void slow_copy_page(void *to, void *from)
Index: linux-2.6.11/include/asm-x86_64/page.h
===================================================================
--- linux-2.6.11.orig/include/asm-x86_64/page.h 2005-03-01 23:37:47.000000000 -0800
+++ linux-2.6.11/include/asm-x86_64/page.h 2005-03-10 10:57:10.000000000 -0800
@@ -32,8 +32,10 @@
#ifdef __KERNEL__
#ifndef __ASSEMBLY__
-void clear_page(void *);
+void clear_pages(void *, int);
void copy_page(void *, void *);
+#define __HAVE_ARCH_CLEAR_PAGES
+#define clear_page(__page) clear_pages(__page, 0)
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
Index: linux-2.6.11/arch/x86_64/kernel/x8664_ksyms.c
===================================================================
--- linux-2.6.11.orig/arch/x86_64/kernel/x8664_ksyms.c 2005-03-01 23:37:49.000000000 -0800
+++ linux-2.6.11/arch/x86_64/kernel/x8664_ksyms.c 2005-03-10 10:57:10.000000000 -0800
@@ -108,7 +108,7 @@ EXPORT_SYMBOL(pci_mem_start);
#endif
EXPORT_SYMBOL(copy_page);
-EXPORT_SYMBOL(clear_page);
+EXPORT_SYMBOL(clear_pages);
EXPORT_SYMBOL(cpu_pda);
#ifdef CONFIG_SMP
Index: linux-2.6.11/arch/x86_64/lib/clear_page.S
===================================================================
--- linux-2.6.11.orig/arch/x86_64/lib/clear_page.S 2005-03-01 23:38:08.000000000 -0800
+++ linux-2.6.11/arch/x86_64/lib/clear_page.S 2005-03-10 10:57:10.000000000 -0800
@@ -1,12 +1,16 @@
/*
* Zero a page.
* rdi page
+ * rsi order
*/
- .globl clear_page
+ .globl clear_pages
.p2align 4
-clear_page:
+clear_pages:
+ movl $4096/64,%eax
+ movl %esi, %ecx
+ shll %cl, %eax
+ movl %eax, %ecx
xorl %eax,%eax
- movl $4096/64,%ecx
.p2align 4
.Lloop:
decl %ecx
@@ -23,7 +27,7 @@ clear_page:
jnz .Lloop
nop
ret
-clear_page_end:
+clear_pages_end:
/* C stepping K8 run faster using the string instructions.
It is also a lot simpler. Use this when possible */
@@ -32,19 +36,22 @@ clear_page_end:
.section .altinstructions,"a"
.align 8
- .quad clear_page
- .quad clear_page_c
+ .quad clear_pages
+ .quad clear_pages_c
.byte X86_FEATURE_K8_C
- .byte clear_page_end-clear_page
- .byte clear_page_c_end-clear_page_c
+ .byte clear_pages_end-clear_pages
+ .byte clear_pages_c_end-clear_pages_c
.previous
.section .altinstr_replacement,"ax"
-clear_page_c:
- movl $4096/8,%ecx
+clear_pages_c:
+ movl $4096/8,%eax
+ movl %esi, %ecx
+ shll %cl, %eax
+ movl %eax, %ecx
xorl %eax,%eax
rep
stosq
ret
-clear_page_c_end:
+clear_pages_c_end:
.previous
Index: linux-2.6.11/arch/sparc64/lib/clear_page.S
===================================================================
--- linux-2.6.11.orig/arch/sparc64/lib/clear_page.S 2005-03-01 23:38:17.000000000 -0800
+++ linux-2.6.11/arch/sparc64/lib/clear_page.S 2005-03-10 10:57:10.000000000 -0800
@@ -28,9 +28,12 @@
.text
.globl _clear_page
-_clear_page: /* %o0=dest */
+_clear_page: /* %o0=dest, %o1=order */
+ sethi %hi(PAGE_SIZE/64), %o2
+ clr %o4
+ or %o2, %lo(PAGE_SIZE/64), %o2
ba,pt %xcc, clear_page_common
- clr %o4
+ sllx %o2, %o1, %o1
/* This thing is pretty important, it shows up
* on the profiles via do_anonymous_page().
@@ -69,16 +72,16 @@ clear_user_page: /* %o0=dest, %o1=vaddr
flush %g6
wrpr %o4, 0x0, %pstate
+ sethi %hi(PAGE_SIZE/64), %o1
mov 1, %o4
+ or %o1, %lo(PAGE_SIZE/64), %o1
clear_page_common:
VISEntryHalf
membar #StoreLoad | #StoreStore | #LoadStore
fzero %f0
- sethi %hi(PAGE_SIZE/64), %o1
mov %o0, %g1 ! remember vaddr for tlbflush
fzero %f2
- or %o1, %lo(PAGE_SIZE/64), %o1
faddd %f0, %f2, %f4
fmuld %f0, %f2, %f6
faddd %f0, %f2, %f8
Index: linux-2.6.11/include/asm-sparc64/page.h
===================================================================
--- linux-2.6.11.orig/include/asm-sparc64/page.h 2005-03-01 23:38:07.000000000 -0800
+++ linux-2.6.11/include/asm-sparc64/page.h 2005-03-10 10:57:10.000000000 -0800
@@ -14,8 +14,10 @@
#ifndef __ASSEMBLY__
-extern void _clear_page(void *page);
-#define clear_page(X) _clear_page((void *)(X))
+extern void _clear_page(void *page, int order);
+#define clear_page(X) _clear_page((void *)(X), 0)
+#define clear_pages _clear_page
+
struct page;
extern void clear_user_page(void *addr, unsigned long vaddr, struct page *page);
#define copy_page(X,Y) memcpy((void *)(X), (void *)(Y), PAGE_SIZE)
Index: linux-2.6.11/include/linux/gfp.h
===================================================================
--- linux-2.6.11.orig/include/linux/gfp.h 2005-03-01 23:37:50.000000000 -0800
+++ linux-2.6.11/include/linux/gfp.h 2005-03-10 10:57:10.000000000 -0800
@@ -131,4 +131,5 @@ extern void FASTCALL(free_cold_page(stru
void page_alloc_init(void);
+void prep_zero_page(struct page *, unsigned int order, unsigned int gfp_flags);
#endif /* __LINUX_GFP_H */
^ permalink raw reply [flat|nested] 28+ messages in thread* Re: [PATCH] add a clear_pages function to clear pages of higher order 2005-03-10 20:35 [PATCH] add a clear_pages function to clear pages of higher order Christoph Lameter @ 2005-03-10 21:38 ` Dave Hansen 2005-03-10 22:46 ` Christoph Lameter 2005-03-11 1:03 ` Christoph Lameter 0 siblings, 2 replies; 28+ messages in thread From: Dave Hansen @ 2005-03-10 21:38 UTC (permalink / raw) To: Christoph Lameter; +Cc: Andrew Morton, Linux Kernel Mailing List, Mel Gorman On Thu, 2005-03-10 at 12:35 -0800, Christoph Lameter wrote: > +#ifdef __HAVE_ARCH_CLEAR_PAGES > + if (!PageHighMem(page)) { > + clear_pages(page_address(page), order); > + return; > + } > +#endif > + > for(i = 0; i < (1 << order); i++) > clear_highpage(page + i); > } ... > --- linux-2.6.11.orig/include/asm-ia64/page.h 2005-03-01 23:37:48.000000000 -0800 > +++ linux-2.6.11/include/asm-ia64/page.h 2005-03-10 10:57:10.000000000 -0800 > @@ -56,8 +56,10 @@ > # ifdef __KERNEL__ > # define STRICT_MM_TYPECHECKS > > -extern void clear_page (void *page); > +extern void clear_pages (void *page, int order); > extern void copy_page (void *to, void *from); > +#define clear_page(__page) clear_pages(__page, 0) > +#define __HAVE_ARCH_CLEAR_PAGES Although this is a simple instance, could this please be done in a Kconfig file? If that #define happens inside of other #ifdefs, it can be quite hard to decipher the special .config incantation to get it set. On the other hand, if the dependencies are spelled out in a Kconfig entry... BTW, I tried applying this to 2.6.11-bk6, and it rejected: ... patching file include/asm-i386/page.h Hunk #2 FAILED at 28. 1 out of 2 hunks FAILED -- saving rejects to file include/asm-i386/page.h.rej ... There were some more rejects as well. Were there some other patches applied first? -- Dave ^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] add a clear_pages function to clear pages of higher order 2005-03-10 21:38 ` Dave Hansen @ 2005-03-10 22:46 ` Christoph Lameter 2005-03-11 1:03 ` Christoph Lameter 1 sibling, 0 replies; 28+ messages in thread From: Christoph Lameter @ 2005-03-10 22:46 UTC (permalink / raw) To: Dave Hansen; +Cc: Andrew Morton, Linux Kernel Mailing List, Mel Gorman On Thu, 10 Mar 2005, Dave Hansen wrote: > > +extern void clear_pages (void *page, int order); > > extern void copy_page (void *to, void *from); > > +#define clear_page(__page) clear_pages(__page, 0) > > +#define __HAVE_ARCH_CLEAR_PAGES > > Although this is a simple instance, could this please be done in a > Kconfig file? If that #define happens inside of other #ifdefs, it can > be quite hard to decipher the special .config incantation to get it set. > On the other hand, if the dependencies are spelled out in a Kconfig > entry... Ok will do. > BTW, I tried applying this to 2.6.11-bk6, and it rejected: > ... > patching file include/asm-i386/page.h > Hunk #2 FAILED at 28. > 1 out of 2 hunks FAILED -- saving rejects to file > include/asm-i386/page.h.rej > ... > > There were some more rejects as well. Were there some other patches > applied first? Patches work fine here. ^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] add a clear_pages function to clear pages of higher order 2005-03-10 21:38 ` Dave Hansen 2005-03-10 22:46 ` Christoph Lameter @ 2005-03-11 1:03 ` Christoph Lameter 2005-03-11 8:08 ` Denis Vlasenko 1 sibling, 1 reply; 28+ messages in thread From: Christoph Lameter @ 2005-03-11 1:03 UTC (permalink / raw) To: Dave Hansen; +Cc: Andrew Morton, Linux Kernel Mailing List, Mel Gorman Changelog: - use Kconfig and CONFIG_CLEAR_PAGES The zeroing of a page of a arbitrary order in page_alloc.c and in hugetlb.c may benefit from a clear_page that is capable of zeroing multiple pages at once. The following patch adds a function "clear_pages" that is capable of clearing multiple continuous pages at once. Patch against 2.6.11-bk6 Signed-off-by: Christoph Lameter <clameter@sgi.com> Index: linux-2.6.11/mm/page_alloc.c =================================================================== --- linux-2.6.11.orig/mm/page_alloc.c 2005-03-10 14:42:43.000000000 -0800 +++ linux-2.6.11/mm/page_alloc.c 2005-03-10 15:01:53.000000000 -0800 @@ -628,11 +628,19 @@ void fastcall free_cold_page(struct page free_hot_cold_page(page, 1); } -static inline void prep_zero_page(struct page *page, int order, int gfp_flags) +void prep_zero_page(struct page *page, unsigned int order, unsigned int gfp_flags) { int i; BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); + +#ifdef CONFIG_CLEAR_PAGES + if (!PageHighMem(page)) { + clear_pages(page_address(page), order); + return; + } +#endif + for(i = 0; i < (1 << order); i++) clear_highpage(page + i); } Index: linux-2.6.11/mm/hugetlb.c =================================================================== --- linux-2.6.11.orig/mm/hugetlb.c 2005-03-01 23:38:12.000000000 -0800 +++ linux-2.6.11/mm/hugetlb.c 2005-03-10 15:01:53.000000000 -0800 @@ -78,7 +78,6 @@ void free_huge_page(struct page *page) struct page *alloc_huge_page(void) { struct page *page; - int i; spin_lock(&hugetlb_lock); page = dequeue_huge_page(); @@ -89,8 +88,7 @@ struct page 
*alloc_huge_page(void) spin_unlock(&hugetlb_lock); set_page_count(page, 1); page[1].mapping = (void *)free_huge_page; - for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i) - clear_highpage(&page[i]); + prep_zero_page(page, HUGETLB_PAGE_ORDER, GFP_HIGHUSER); return page; } Index: linux-2.6.11/include/asm-ia64/page.h =================================================================== --- linux-2.6.11.orig/include/asm-ia64/page.h 2005-03-01 23:37:48.000000000 -0800 +++ linux-2.6.11/include/asm-ia64/page.h 2005-03-10 15:02:47.000000000 -0800 @@ -56,8 +56,9 @@ # ifdef __KERNEL__ # define STRICT_MM_TYPECHECKS -extern void clear_page (void *page); +extern void clear_pages (void *page, int order); extern void copy_page (void *to, void *from); +#define clear_page(__page) clear_pages(__page, 0) /* * clear_user_page() and copy_user_page() can't be inline functions because Index: linux-2.6.11/arch/ia64/kernel/ia64_ksyms.c =================================================================== --- linux-2.6.11.orig/arch/ia64/kernel/ia64_ksyms.c 2005-03-01 23:38:08.000000000 -0800 +++ linux-2.6.11/arch/ia64/kernel/ia64_ksyms.c 2005-03-10 15:01:53.000000000 -0800 @@ -38,7 +38,7 @@ EXPORT_SYMBOL(__down_trylock); EXPORT_SYMBOL(__up); #include <asm/page.h> -EXPORT_SYMBOL(clear_page); +EXPORT_SYMBOL(clear_pages); #ifdef CONFIG_VIRTUAL_MEM_MAP #include <linux/bootmem.h> Index: linux-2.6.11/arch/ia64/lib/clear_page.S =================================================================== --- linux-2.6.11.orig/arch/ia64/lib/clear_page.S 2005-03-01 23:37:47.000000000 -0800 +++ linux-2.6.11/arch/ia64/lib/clear_page.S 2005-03-10 15:01:53.000000000 -0800 @@ -7,6 +7,7 @@ * 1/06/01 davidm Tuned for Itanium. 
* 2/12/02 kchen Tuned for both Itanium and McKinley * 3/08/02 davidm Some more tweaking + * 12/10/04 clameter Make it work on pages of order size */ #include <linux/config.h> @@ -29,27 +30,33 @@ #define dst4 r11 #define dst_last r31 +#define totsize r14 -GLOBAL_ENTRY(clear_page) +GLOBAL_ENTRY(clear_pages) .prologue - .regstk 1,0,0,0 - mov r16 = PAGE_SIZE/L3_LINE_SIZE-1 // main loop count, -1=repeat/until + .regstk 2,0,0,0 + mov r16 = PAGE_SIZE/L3_LINE_SIZE // main loop count + mov totsize = PAGE_SIZE .save ar.lc, saved_lc mov saved_lc = ar.lc - + ;; .body + adds dst1 = 16, in0 mov ar.lc = (PREFETCH_LINES - 1) mov dst_fetch = in0 - adds dst1 = 16, in0 adds dst2 = 32, in0 + shl r16 = r16, in1 + shl totsize = totsize, in1 ;; .fetch: stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE adds dst3 = 48, in0 // executing this multiple times is harmless br.cloop.sptk.few .fetch + add r16 = -1,r16 + add dst_last = totsize, dst_fetch + adds dst4 = 64, in0 ;; - addl dst_last = (PAGE_SIZE - PREFETCH_LINES*L3_LINE_SIZE), dst_fetch mov ar.lc = r16 // one L3 line per iteration - adds dst4 = 64, in0 + adds dst_last = -PREFETCH_LINES*L3_LINE_SIZE, dst_last ;; #ifdef CONFIG_ITANIUM // Optimized for Itanium @@ -74,4 +81,4 @@ GLOBAL_ENTRY(clear_page) ;; mov ar.lc = saved_lc // restore lc br.ret.sptk.many rp -END(clear_page) +END(clear_pages) Index: linux-2.6.11/include/asm-i386/page.h =================================================================== --- linux-2.6.11.orig/include/asm-i386/page.h 2005-03-01 23:37:49.000000000 -0800 +++ linux-2.6.11/include/asm-i386/page.h 2005-03-10 15:02:59.000000000 -0800 @@ -18,7 +18,7 @@ #include <asm/mmx.h> -#define clear_page(page) mmx_clear_page((void *)(page)) +#define clear_pages(page, order) mmx_clear_page((void *)(page),order) #define copy_page(to,from) mmx_copy_page(to,from) #else @@ -28,11 +28,12 @@ * Maybe the K6-III ? 
*/ -#define clear_page(page) memset((void *)(page), 0, PAGE_SIZE) +#define clear_pages(page, order) memset((void *)(page), 0, PAGE_SIZE << (order)) #define copy_page(to,from) memcpy((void *)(to), (void *)(from), PAGE_SIZE) #endif +#define clear_page(page) clear_pages(page, 0) #define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) Index: linux-2.6.11/include/asm-i386/mmx.h =================================================================== --- linux-2.6.11.orig/include/asm-i386/mmx.h 2005-03-01 23:38:09.000000000 -0800 +++ linux-2.6.11/include/asm-i386/mmx.h 2005-03-10 15:01:53.000000000 -0800 @@ -8,7 +8,7 @@ #include <linux/types.h> extern void *_mmx_memcpy(void *to, const void *from, size_t size); -extern void mmx_clear_page(void *page); +extern void mmx_clear_page(void *page, int order); extern void mmx_copy_page(void *to, void *from); #endif Index: linux-2.6.11/arch/i386/lib/mmx.c =================================================================== --- linux-2.6.11.orig/arch/i386/lib/mmx.c 2005-03-01 23:38:09.000000000 -0800 +++ linux-2.6.11/arch/i386/lib/mmx.c 2005-03-10 15:01:53.000000000 -0800 @@ -128,7 +128,7 @@ void *_mmx_memcpy(void *to, const void * * other MMX using processors do not. 
*/ -static void fast_clear_page(void *page) +static void fast_clear_page(void *page, int order) { int i; @@ -138,7 +138,7 @@ static void fast_clear_page(void *page) " pxor %%mm0, %%mm0\n" : : ); - for(i=0;i<4096/64;i++) + for(i=0;i<((4096/64) << order);i++) { __asm__ __volatile__ ( " movntq %%mm0, (%0)\n" @@ -257,7 +257,7 @@ static void fast_copy_page(void *to, voi * Generic MMX implementation without K7 specific streaming */ -static void fast_clear_page(void *page) +static void fast_clear_page(void *page, int order) { int i; @@ -267,7 +267,7 @@ static void fast_clear_page(void *page) " pxor %%mm0, %%mm0\n" : : ); - for(i=0;i<4096/128;i++) + for(i=0;i<((4096/128) << order);i++) { __asm__ __volatile__ ( " movq %%mm0, (%0)\n" @@ -359,23 +359,23 @@ static void fast_copy_page(void *to, voi * Favour MMX for page clear and copy. */ -static void slow_zero_page(void * page) +static void slow_clear_page(void * page, int order) { int d0, d1; __asm__ __volatile__( \ "cld\n\t" \ "rep ; stosl" \ : "=&c" (d0), "=&D" (d1) - :"a" (0),"1" (page),"0" (1024) + :"a" (0),"1" (page),"0" (1024 << order) :"memory"); } - -void mmx_clear_page(void * page) + +void mmx_clear_page(void * page, int order) { if(unlikely(in_interrupt())) - slow_zero_page(page); + slow_clear_page(page, order); else - fast_clear_page(page); + fast_clear_page(page, order); } static void slow_copy_page(void *to, void *from) Index: linux-2.6.11/include/asm-x86_64/page.h =================================================================== --- linux-2.6.11.orig/include/asm-x86_64/page.h 2005-03-01 23:37:47.000000000 -0800 +++ linux-2.6.11/include/asm-x86_64/page.h 2005-03-10 15:03:10.000000000 -0800 @@ -32,8 +32,9 @@ #ifdef __KERNEL__ #ifndef __ASSEMBLY__ -void clear_page(void *); +void clear_pages(void *, int); void copy_page(void *, void *); +#define clear_page(__page) clear_pages(__page, 0) #define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) Index: 
linux-2.6.11/arch/x86_64/kernel/x8664_ksyms.c =================================================================== --- linux-2.6.11.orig/arch/x86_64/kernel/x8664_ksyms.c 2005-03-01 23:37:49.000000000 -0800 +++ linux-2.6.11/arch/x86_64/kernel/x8664_ksyms.c 2005-03-10 15:01:53.000000000 -0800 @@ -108,7 +108,7 @@ EXPORT_SYMBOL(pci_mem_start); #endif EXPORT_SYMBOL(copy_page); -EXPORT_SYMBOL(clear_page); +EXPORT_SYMBOL(clear_pages); EXPORT_SYMBOL(cpu_pda); #ifdef CONFIG_SMP Index: linux-2.6.11/arch/x86_64/lib/clear_page.S =================================================================== --- linux-2.6.11.orig/arch/x86_64/lib/clear_page.S 2005-03-01 23:38:08.000000000 -0800 +++ linux-2.6.11/arch/x86_64/lib/clear_page.S 2005-03-10 15:01:53.000000000 -0800 @@ -1,12 +1,16 @@ /* * Zero a page. * rdi page + * rsi order */ - .globl clear_page + .globl clear_pages .p2align 4 -clear_page: +clear_pages: + movl $4096/64,%eax + movl %esi, %ecx + shll %cl, %eax + movl %eax, %ecx xorl %eax,%eax - movl $4096/64,%ecx .p2align 4 .Lloop: decl %ecx @@ -23,7 +27,7 @@ clear_page: jnz .Lloop nop ret -clear_page_end: +clear_pages_end: /* C stepping K8 run faster using the string instructions. It is also a lot simpler. 
Use this when possible */ @@ -32,19 +36,22 @@ clear_page_end: .section .altinstructions,"a" .align 8 - .quad clear_page - .quad clear_page_c + .quad clear_pages + .quad clear_pages_c .byte X86_FEATURE_K8_C - .byte clear_page_end-clear_page - .byte clear_page_c_end-clear_page_c + .byte clear_pages_end-clear_pages + .byte clear_pages_c_end-clear_pages_c .previous .section .altinstr_replacement,"ax" -clear_page_c: - movl $4096/8,%ecx +clear_pages_c: + movl $4096/8,%eax + movl %esi, %ecx + shll %cl, %eax + movl %eax, %ecx xorl %eax,%eax rep stosq ret -clear_page_c_end: +clear_pages_c_end: .previous Index: linux-2.6.11/arch/sparc64/lib/clear_page.S =================================================================== --- linux-2.6.11.orig/arch/sparc64/lib/clear_page.S 2005-03-01 23:38:17.000000000 -0800 +++ linux-2.6.11/arch/sparc64/lib/clear_page.S 2005-03-10 15:01:53.000000000 -0800 @@ -28,9 +28,12 @@ .text .globl _clear_page -_clear_page: /* %o0=dest */ +_clear_page: /* %o0=dest, %o1=order */ + sethi %hi(PAGE_SIZE/64), %o2 + clr %o4 + or %o2, %lo(PAGE_SIZE/64), %o2 ba,pt %xcc, clear_page_common - clr %o4 + sllx %o2, %o1, %o1 /* This thing is pretty important, it shows up * on the profiles via do_anonymous_page(). @@ -69,16 +72,16 @@ clear_user_page: /* %o0=dest, %o1=vaddr flush %g6 wrpr %o4, 0x0, %pstate + sethi %hi(PAGE_SIZE/64), %o1 mov 1, %o4 + or %o1, %lo(PAGE_SIZE/64), %o1 clear_page_common: VISEntryHalf membar #StoreLoad | #StoreStore | #LoadStore fzero %f0 - sethi %hi(PAGE_SIZE/64), %o1 mov %o0, %g1 ! 
remember vaddr for tlbflush fzero %f2 - or %o1, %lo(PAGE_SIZE/64), %o1 faddd %f0, %f2, %f4 fmuld %f0, %f2, %f6 faddd %f0, %f2, %f8 Index: linux-2.6.11/include/asm-sparc64/page.h =================================================================== --- linux-2.6.11.orig/include/asm-sparc64/page.h 2005-03-01 23:38:07.000000000 -0800 +++ linux-2.6.11/include/asm-sparc64/page.h 2005-03-10 15:03:43.000000000 -0800 @@ -14,8 +14,10 @@ #ifndef __ASSEMBLY__ -extern void _clear_page(void *page); -#define clear_page(X) _clear_page((void *)(X)) +extern void _clear_page(void *page, int order); +#define clear_page(X) _clear_page((void *)(X), 0) +#define clear_pages _clear_page + struct page; extern void clear_user_page(void *addr, unsigned long vaddr, struct page *page); #define copy_page(X,Y) memcpy((void *)(X), (void *)(Y), PAGE_SIZE) Index: linux-2.6.11/include/linux/gfp.h =================================================================== --- linux-2.6.11.orig/include/linux/gfp.h 2005-03-01 23:37:50.000000000 -0800 +++ linux-2.6.11/include/linux/gfp.h 2005-03-10 15:01:53.000000000 -0800 @@ -131,4 +131,5 @@ extern void FASTCALL(free_cold_page(stru void page_alloc_init(void); +void prep_zero_page(struct page *, unsigned int order, unsigned int gfp_flags); #endif /* __LINUX_GFP_H */ Index: linux-2.6.11/arch/x86_64/Kconfig =================================================================== --- linux-2.6.11.orig/arch/x86_64/Kconfig 2005-03-10 14:42:41.000000000 -0800 +++ linux-2.6.11/arch/x86_64/Kconfig 2005-03-10 15:01:53.000000000 -0800 @@ -78,6 +78,10 @@ config GENERIC_IOMAP bool default y +config CLEAR_PAGES + bool + default y + source "init/Kconfig" Index: linux-2.6.11/arch/i386/Kconfig =================================================================== --- linux-2.6.11.orig/arch/i386/Kconfig 2005-03-10 14:42:41.000000000 -0800 +++ linux-2.6.11/arch/i386/Kconfig 2005-03-10 15:01:53.000000000 -0800 @@ -33,6 +33,10 @@ config GENERIC_IOMAP bool default y +config CLEAR_PAGES + 
bool + default y + source "init/Kconfig" menu "Processor type and features" Index: linux-2.6.11/arch/ia64/Kconfig =================================================================== --- linux-2.6.11.orig/arch/ia64/Kconfig 2005-03-01 23:38:26.000000000 -0800 +++ linux-2.6.11/arch/ia64/Kconfig 2005-03-10 15:01:53.000000000 -0800 @@ -46,6 +46,10 @@ config GENERIC_IOMAP bool default y +config CLEAR_PAGES + bool + default y + choice prompt "System type" default IA64_GENERIC Index: linux-2.6.11/arch/sparc64/Kconfig =================================================================== --- linux-2.6.11.orig/arch/sparc64/Kconfig 2005-03-01 23:38:25.000000000 -0800 +++ linux-2.6.11/arch/sparc64/Kconfig 2005-03-10 15:02:16.000000000 -0800 @@ -16,6 +16,10 @@ config TIME_INTERPOLATION bool default y +config CLEAR_PAGES + bool + default y + source "init/Kconfig" config SYSVIPC_COMPAT ^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] add a clear_pages function to clear pages of higher order 2005-03-11 1:03 ` Christoph Lameter @ 2005-03-11 8:08 ` Denis Vlasenko 2005-03-17 1:33 ` Christoph Lameter 2005-03-18 10:12 ` Andi Kleen 0 siblings, 2 replies; 28+ messages in thread From: Denis Vlasenko @ 2005-03-11 8:08 UTC (permalink / raw) To: Christoph Lameter, Dave Hansen, Andi Kleen Cc: Andrew Morton, Linux Kernel Mailing List, Mel Gorman On Friday 11 March 2005 03:03, Christoph Lameter wrote: > Changelog: > - use Kconfig and CONFIG_CLEAR_PAGES > > The zeroing of a page of a arbitrary order in page_alloc.c and in hugetlb.c may benefit from a > clear_page that is capable of zeroing multiple pages at once. The following patch adds > a function "clear_pages" that is capable of clearing multiple continuous pages at once. > > Patch against 2.6.11-bk6 > > Signed-off-by: Christoph Lameter <clameter@sgi.com> [snip] > -clear_page_end: > +clear_pages_end: > > /* C stepping K8 run faster using the string instructions. > It is also a lot simpler. Use this when possible */ Andi Kleen (iirc) says that non-temporal stores seem to be a big win in microbenchmarks (and I second that), but they are a net loss when we are going to use the zeroed page just after zeroing. He recommends avoiding non-temporal stores. With this new page prezeroing infrastructure, that argument most likely is not right anymore. Especially clearing of high-order pages definitely will benefit from NT stores because they do not kill the L1 data cache in the process. I don't have a K8 and therefore cannot be 100% sure, but I really doubt that K8 optimizes "rep stosq" into _NT_ stores. Andi? -- vda ^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] add a clear_pages function to clear pages of higher order 2005-03-11 8:08 ` Denis Vlasenko @ 2005-03-17 1:33 ` Christoph Lameter 2005-03-18 9:54 ` Denis Vlasenko 2005-03-18 10:12 ` Andi Kleen 1 sibling, 1 reply; 28+ messages in thread From: Christoph Lameter @ 2005-03-17 1:33 UTC (permalink / raw) To: Denis Vlasenko Cc: Dave Hansen, Andi Kleen, Andrew Morton, Linux Kernel Mailing List, Mel Gorman, linux-ia64 On Fri, 11 Mar 2005, Denis Vlasenko wrote: > Andi Kleen (iirc) says that non-temporal stores seem to be > big win in microbenchmarks (and I second that), but they are > a net loss when we are going to use zeroed page just after > zeroing. He recommends avoid using non-temporal stores > > With this new page prezeroing infrastructure, that argument > most likely is not right anymore. Especially clearing of > high-order pages definitely will benefit from NT stores > because they do not kill L1 data cache in the process. > > I don't have K8 and therefore cannot be 100% sure, but > I really doubt that K8 optimize "rep stosq" into _NT_ stores. Hmm. That would be interesting to know and may be necessary to justify the continued existence of this patch. I tried to get some numbers on the performance wins for zeroing larger pages with the patch as is (no NT stores) and came up with: Processor Performance Increase ---------------------------------------------------------------- Itanium 2 1.3Ghz M1/R5 1.5% AMD Athlon 64 3200+ i386 mode 3% AMD Athlon 64 3200+ x86_64 mode 3.3% (this is if the zeroing engine is the cpu of course. Prezeroing may be done through some DMA gizmo independent of the cpu) Itanium has more extensive optimization capabilities and seems to be able to better cope with the loop logic for regular clear_page. Thus the improvement is even less on Itanium. 
Numbers obtained with the following patch that allows to get performance data from /proc/meminfo on zeroing performance (just divide Cycles by Pages for clear_page and clear_pages): Index: linux-2.6.11/mm/page_alloc.c =================================================================== --- linux-2.6.11.orig/mm/page_alloc.c 2005-03-16 17:12:51.000000000 -0800 +++ linux-2.6.11/mm/page_alloc.c 2005-03-16 17:17:28.000000000 -0800 @@ -633,13 +633,33 @@ void fastcall free_cold_page(struct page free_hot_cold_page(page, 1); } -static inline void prep_zero_page(struct page *page, int order, int gfp_flags) +void prep_zero_page(struct page *page, unsigned int order, unsigned int gfp_flags) { int i; + unsigned long t1; BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); + +#ifdef CONFIG_CLEAR_PAGES + if (!PageHighMem(page) && order>4) { + unsigned long t; + + t1=get_cycles(); + clear_pages(page_address(page), order); + t = get_cycles() - t1; + add_page_state(clear_pages_cycles, t); + add_page_state(clear_pages_order, 1 << order); + inc_page_state(clear_pages_nr); + return; + } +#endif + + t1=get_cycles(); for(i = 0; i < (1 << order); i++) clear_highpage(page + i); + add_page_state(clear_page_cycles, get_cycles() - t1); + add_page_state(clear_page_order, 1 << order); + inc_page_state(clear_page_nr); } /* Index: linux-2.6.11/include/linux/page-flags.h =================================================================== --- linux-2.6.11.orig/include/linux/page-flags.h 2005-03-16 17:12:51.000000000 -0800 +++ linux-2.6.11/include/linux/page-flags.h 2005-03-16 17:13:02.000000000 -0800 @@ -131,6 +131,13 @@ struct page_state { unsigned long allocstall; /* direct reclaim calls */ unsigned long pgrotated; /* pages rotated to tail of the LRU */ + + unsigned long clear_page_nr; /* Nr of clear_page request */ + unsigned long clear_page_cycles; /* Cycles spent in clear_page */ + unsigned long clear_page_order; /* Sum of orders */ + unsigned long clear_pages_nr; /* Nr of 
clear_pages requests */ + unsigned long clear_pages_cycles; /* Nr of cycles in clear_pages */ + unsigned long clear_pages_order; /* Sum of orders */ }; extern void get_page_state(struct page_state *ret); Index: linux-2.6.11/fs/proc/proc_misc.c =================================================================== --- linux-2.6.11.orig/fs/proc/proc_misc.c 2005-03-16 17:12:50.000000000 -0800 +++ linux-2.6.11/fs/proc/proc_misc.c 2005-03-16 17:22:18.000000000 -0800 @@ -127,7 +127,7 @@ static int meminfo_read_proc(char *page, unsigned long allowed; struct vmalloc_info vmi; - get_page_state(&ps); + get_full_page_state(&ps); get_zone_counts(&active, &inactive, &free); /* @@ -168,7 +168,13 @@ static int meminfo_read_proc(char *page, "PageTables: %8lu kB\n" "VmallocTotal: %8lu kB\n" "VmallocUsed: %8lu kB\n" - "VmallocChunk: %8lu kB\n", + "VmallocChunk: %8lu kB\n" + "ClearPage # %8lu\n" + "ClearPage Pgs %8lu\n" + "ClearPage Cyc %8lu\n" + "ClearPages # %8lu\n" + "ClearPages Pg %8lu\n" + "ClearPages Cy %8lu\n", K(i.totalram), K(i.freeram), K(i.bufferram), @@ -191,7 +197,13 @@ static int meminfo_read_proc(char *page, K(ps.nr_page_table_pages), (unsigned long)VMALLOC_TOTAL >> 10, vmi.used >> 10, - vmi.largest_chunk >> 10 + vmi.largest_chunk >> 10, + ps.clear_page_nr, + ps.clear_page_order, + ps.clear_page_cycles, + ps.clear_pages_nr, + ps.clear_pages_order, + ps.clear_pages_cycles ); len += hugetlb_report_meminfo(page + len); ^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] add a clear_pages function to clear pages of higher order 2005-03-17 1:33 ` Christoph Lameter @ 2005-03-18 9:54 ` Denis Vlasenko 2005-03-18 15:00 ` Christoph Lameter 0 siblings, 1 reply; 28+ messages in thread From: Denis Vlasenko @ 2005-03-18 9:54 UTC (permalink / raw) To: Christoph Lameter Cc: Dave Hansen, Andi Kleen, Andrew Morton, Linux Kernel Mailing List, Mel Gorman, linux-ia64, Jens.Maurer [-- Attachment #1: Type: text/plain, Size: 2818 bytes --] On Thursday 17 March 2005 03:33, Christoph Lameter wrote: > On Fri, 11 Mar 2005, Denis Vlasenko wrote: > > > Andi Kleen (iirc) says that non-temporal stores seem to be > > big win in microbenchmarks (and I second that), but they are > > a net loss when we are going to use zeroed page just after > > zeroing. He recommends avoid using non-temporal stores > > > > With this new page prezeroing infrastructure, that argument > > most likely is not right anymore. Especially clearing of > > high-order pages definitely will benefit from NT stores > > because they do not kill L1 data cache in the process. > > > > I don't have K8 and therefore cannot be 100% sure, but > > I really doubt that K8 optimize "rep stosq" into _NT_ stores. > > Hmm. That would be interesting to know and may be necessary to justify > the continued existence of this patch. I tried to get some numbers on > the performance wins for zeroing larger pages with the patch as is (no > NT stores) and came up with: > > Processor Performance Increase > ---------------------------------------------------------------- > Itanium 2 1.3Ghz M1/R5 1.5% > AMD Athlon 64 3200+ i386 mode 3% > AMD Athlon 64 3200+ x86_64 mode 3.3% > > (this is if the zeroing engine is the cpu of course. Prezeroing > may be done through some DMA gizmo independent of the cpu) > > Itanium has more extensive optimization capabilities and > seems to be able to better cope with the loop logic for regular > clear_page. Thus the improvement is even less on Itanium. 
> > Numbers obtained with the following patch that allows to get performance > data from /proc/meminfo on zeroing performance (just divide Cycles by > Pages for clear_page and clear_pages): Here is a patch which allows to try different page zeroing optimizations to be tested at runtime via sysctl. Was run tested in 2.6.8 time. Rediffed to 2.6.11. Feel free to adapt to your patch and test. Also attached is a tarball for microbenchmarking routines. There are two result files. Duron: normal_clear_page - took 8644 max, 8400 min cycles per page repstosl_clear_page - took 8626 max, 8418 min cycles per page movq_clear_page - took 8647 max, 8300 min cycles per page movntq_clear_page - took 2777 max, 2720 min cycles per page And amd64: normal_clear_page - took 9427 max, 5781 min cycles per page repstosl_clear_page - took 9305 max, 5680 min cycles per page movq_clear_page - took 6167 max, 5576 min cycles per page movntq_clear_page - took 5456 max, 2354 min cycles per page NT stores are not about 5% increase. 200%-300%. Provided you are ok with the fact that zeroed page ends up evicted from cache. Luckily, this is exactly what you want with prezeroing. 
-- vda [-- Attachment #2: x86_SSE_clear_page.2611.patch --] [-- Type: text/x-diff, Size: 18884 bytes --] diff -urpN linux-2.6.11.src/arch/i386/lib/Makefile linux-2.6.11-nt.src/arch/i386/lib/Makefile --- linux-2.6.11.src/arch/i386/lib/Makefile Tue Oct 19 00:53:10 2004 +++ linux-2.6.11-nt.src/arch/i386/lib/Makefile Fri Mar 18 11:30:51 2005 @@ -4,7 +4,7 @@ lib-y = checksum.o delay.o usercopy.o getuser.o memcpy.o strstr.o \ - bitops.o + bitops.o page_ops.o mmx_page.o sse_page.o lib-$(CONFIG_X86_USE_3DNOW) += mmx.o lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o diff -urpN linux-2.6.11.src/arch/i386/lib/mmx.c linux-2.6.11-nt.src/arch/i386/lib/mmx.c --- linux-2.6.11.src/arch/i386/lib/mmx.c Tue Oct 19 00:54:23 2004 +++ linux-2.6.11-nt.src/arch/i386/lib/mmx.c Fri Mar 18 11:30:51 2005 @@ -120,280 +120,3 @@ void *_mmx_memcpy(void *to, const void * kernel_fpu_end(); return p; } - -#ifdef CONFIG_MK7 - -/* - * The K7 has streaming cache bypass load/store. The Cyrix III, K6 and - * other MMX using processors do not. - */ - -static void fast_clear_page(void *page) -{ - int i; - - kernel_fpu_begin(); - - __asm__ __volatile__ ( - " pxor %%mm0, %%mm0\n" : : - ); - - for(i=0;i<4096/64;i++) - { - __asm__ __volatile__ ( - " movntq %%mm0, (%0)\n" - " movntq %%mm0, 8(%0)\n" - " movntq %%mm0, 16(%0)\n" - " movntq %%mm0, 24(%0)\n" - " movntq %%mm0, 32(%0)\n" - " movntq %%mm0, 40(%0)\n" - " movntq %%mm0, 48(%0)\n" - " movntq %%mm0, 56(%0)\n" - : : "r" (page) : "memory"); - page+=64; - } - /* since movntq is weakly-ordered, a "sfence" is needed to become - * ordered again. - */ - __asm__ __volatile__ ( - " sfence \n" : : - ); - kernel_fpu_end(); -} - -static void fast_copy_page(void *to, void *from) -{ - int i; - - kernel_fpu_begin(); - - /* maybe the prefetch stuff can go before the expensive fnsave... - * but that is for later. 
-AV - */ - __asm__ __volatile__ ( - "1: prefetch (%0)\n" - " prefetch 64(%0)\n" - " prefetch 128(%0)\n" - " prefetch 192(%0)\n" - " prefetch 256(%0)\n" - "2: \n" - ".section .fixup, \"ax\"\n" - "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ - " jmp 2b\n" - ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 1b, 3b\n" - ".previous" - : : "r" (from) ); - - for(i=0; i<(4096-320)/64; i++) - { - __asm__ __volatile__ ( - "1: prefetch 320(%0)\n" - "2: movq (%0), %%mm0\n" - " movntq %%mm0, (%1)\n" - " movq 8(%0), %%mm1\n" - " movntq %%mm1, 8(%1)\n" - " movq 16(%0), %%mm2\n" - " movntq %%mm2, 16(%1)\n" - " movq 24(%0), %%mm3\n" - " movntq %%mm3, 24(%1)\n" - " movq 32(%0), %%mm4\n" - " movntq %%mm4, 32(%1)\n" - " movq 40(%0), %%mm5\n" - " movntq %%mm5, 40(%1)\n" - " movq 48(%0), %%mm6\n" - " movntq %%mm6, 48(%1)\n" - " movq 56(%0), %%mm7\n" - " movntq %%mm7, 56(%1)\n" - ".section .fixup, \"ax\"\n" - "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ - " jmp 2b\n" - ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 1b, 3b\n" - ".previous" - : : "r" (from), "r" (to) : "memory"); - from+=64; - to+=64; - } - for(i=(4096-320)/64; i<4096/64; i++) - { - __asm__ __volatile__ ( - "2: movq (%0), %%mm0\n" - " movntq %%mm0, (%1)\n" - " movq 8(%0), %%mm1\n" - " movntq %%mm1, 8(%1)\n" - " movq 16(%0), %%mm2\n" - " movntq %%mm2, 16(%1)\n" - " movq 24(%0), %%mm3\n" - " movntq %%mm3, 24(%1)\n" - " movq 32(%0), %%mm4\n" - " movntq %%mm4, 32(%1)\n" - " movq 40(%0), %%mm5\n" - " movntq %%mm5, 40(%1)\n" - " movq 48(%0), %%mm6\n" - " movntq %%mm6, 48(%1)\n" - " movq 56(%0), %%mm7\n" - " movntq %%mm7, 56(%1)\n" - : : "r" (from), "r" (to) : "memory"); - from+=64; - to+=64; - } - /* since movntq is weakly-ordered, a "sfence" is needed to become - * ordered again. 
- */ - __asm__ __volatile__ ( - " sfence \n" : : - ); - kernel_fpu_end(); -} - -#else - -/* - * Generic MMX implementation without K7 specific streaming - */ - -static void fast_clear_page(void *page) -{ - int i; - - kernel_fpu_begin(); - - __asm__ __volatile__ ( - " pxor %%mm0, %%mm0\n" : : - ); - - for(i=0;i<4096/128;i++) - { - __asm__ __volatile__ ( - " movq %%mm0, (%0)\n" - " movq %%mm0, 8(%0)\n" - " movq %%mm0, 16(%0)\n" - " movq %%mm0, 24(%0)\n" - " movq %%mm0, 32(%0)\n" - " movq %%mm0, 40(%0)\n" - " movq %%mm0, 48(%0)\n" - " movq %%mm0, 56(%0)\n" - " movq %%mm0, 64(%0)\n" - " movq %%mm0, 72(%0)\n" - " movq %%mm0, 80(%0)\n" - " movq %%mm0, 88(%0)\n" - " movq %%mm0, 96(%0)\n" - " movq %%mm0, 104(%0)\n" - " movq %%mm0, 112(%0)\n" - " movq %%mm0, 120(%0)\n" - : : "r" (page) : "memory"); - page+=128; - } - - kernel_fpu_end(); -} - -static void fast_copy_page(void *to, void *from) -{ - int i; - - - kernel_fpu_begin(); - - __asm__ __volatile__ ( - "1: prefetch (%0)\n" - " prefetch 64(%0)\n" - " prefetch 128(%0)\n" - " prefetch 192(%0)\n" - " prefetch 256(%0)\n" - "2: \n" - ".section .fixup, \"ax\"\n" - "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ - " jmp 2b\n" - ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 1b, 3b\n" - ".previous" - : : "r" (from) ); - - for(i=0; i<4096/64; i++) - { - __asm__ __volatile__ ( - "1: prefetch 320(%0)\n" - "2: movq (%0), %%mm0\n" - " movq 8(%0), %%mm1\n" - " movq 16(%0), %%mm2\n" - " movq 24(%0), %%mm3\n" - " movq %%mm0, (%1)\n" - " movq %%mm1, 8(%1)\n" - " movq %%mm2, 16(%1)\n" - " movq %%mm3, 24(%1)\n" - " movq 32(%0), %%mm0\n" - " movq 40(%0), %%mm1\n" - " movq 48(%0), %%mm2\n" - " movq 56(%0), %%mm3\n" - " movq %%mm0, 32(%1)\n" - " movq %%mm1, 40(%1)\n" - " movq %%mm2, 48(%1)\n" - " movq %%mm3, 56(%1)\n" - ".section .fixup, \"ax\"\n" - "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ - " jmp 2b\n" - ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 1b, 3b\n" - ".previous" - : : "r" 
(from), "r" (to) : "memory"); - from+=64; - to+=64; - } - kernel_fpu_end(); -} - - -#endif - -/* - * Favour MMX for page clear and copy. - */ - -static void slow_zero_page(void * page) -{ - int d0, d1; - __asm__ __volatile__( \ - "cld\n\t" \ - "rep ; stosl" \ - : "=&c" (d0), "=&D" (d1) - :"a" (0),"1" (page),"0" (1024) - :"memory"); -} - -void mmx_clear_page(void * page) -{ - if(unlikely(in_interrupt())) - slow_zero_page(page); - else - fast_clear_page(page); -} - -static void slow_copy_page(void *to, void *from) -{ - int d0, d1, d2; - __asm__ __volatile__( \ - "cld\n\t" \ - "rep ; movsl" \ - : "=&c" (d0), "=&D" (d1), "=&S" (d2) \ - : "0" (1024),"1" ((long) to),"2" ((long) from) \ - : "memory"); -} - - -void mmx_copy_page(void *to, void *from) -{ - if(unlikely(in_interrupt())) - slow_copy_page(to, from); - else - fast_copy_page(to, from); -} diff -urpN linux-2.6.11.src/arch/i386/lib/mmx_page.c linux-2.6.11-nt.src/arch/i386/lib/mmx_page.c --- linux-2.6.11.src/arch/i386/lib/mmx_page.c Thu Jan 1 03:00:00 1970 +++ linux-2.6.11-nt.src/arch/i386/lib/mmx_page.c Fri Mar 18 11:30:51 2005 @@ -0,0 +1,253 @@ +/* + * MMX/3DNow! library helper functions + * + * To do: + * We can use MMX just for prefetch in IRQ's. This may be a win. + * (reported so on K6-III) + * We should use a better code neutral filler for the short jump + * leal ebx. [ebx] is apparently best for K6-2, but Cyrix ?? + * We also want to clobber the filler register so we don't get any + * register forwarding stalls on the filler. + * + * Add *user handling. Checksums are not a win with MMX on any CPU + * tested so far for any MMX solution figured. + * + * 22/09/2000 - Arjan van de Ven + * Improved for non-egineering-sample Athlons + * + */ + +#include <asm/i387.h> + +/* + * The K7 has streaming cache bypass load/store. The Cyrix III, K6 and + * other MMX using processors do not. 
+ */ + +void zero_page_3dnow(void *page) +{ + int i; + + kernel_fpu_begin(); + + __asm__ __volatile__ ( + " pxor %%mm0, %%mm0\n" : : + ); + + for(i = 0; i < PAGE_SIZE/64; i++) + { + __asm__ __volatile__ ( + " movntq %%mm0, (%0)\n" + " movntq %%mm0, 8(%0)\n" + " movntq %%mm0, 16(%0)\n" + " movntq %%mm0, 24(%0)\n" + " movntq %%mm0, 32(%0)\n" + " movntq %%mm0, 40(%0)\n" + " movntq %%mm0, 48(%0)\n" + " movntq %%mm0, 56(%0)\n" + : : "r" (page) : "memory" + ); + page+=64; + } + /* since movntq is weakly-ordered, a "sfence" is needed to become + * ordered again. + */ + __asm__ __volatile__ ( + " sfence\n" : : + ); + kernel_fpu_end(); +} + +void copy_page_3dnow(void *to, void *from) +{ + int i; + + kernel_fpu_begin(); + + /* maybe the prefetch stuff can go before the expensive fnsave... + * but that is for later. -AV + */ + __asm__ __volatile__ ( + "1: prefetch (%0)\n" + " prefetch 64(%0)\n" + " prefetch 128(%0)\n" + " prefetch 192(%0)\n" + " prefetch 256(%0)\n" + "2:\n" + ".section .fixup, \"ax\"\n" + "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ + " jmp 2b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 4\n" + " .long 1b, 3b\n" + ".previous" + : : "r" (from) + ); + + for(i = 0; i < (PAGE_SIZE-320)/64; i++) + { + __asm__ __volatile__ ( + "1: prefetch 320(%0)\n" + "2: movq (%0), %%mm0\n" + " movntq %%mm0, (%1)\n" + " movq 8(%0), %%mm1\n" + " movntq %%mm1, 8(%1)\n" + " movq 16(%0), %%mm2\n" + " movntq %%mm2, 16(%1)\n" + " movq 24(%0), %%mm3\n" + " movntq %%mm3, 24(%1)\n" + " movq 32(%0), %%mm4\n" + " movntq %%mm4, 32(%1)\n" + " movq 40(%0), %%mm5\n" + " movntq %%mm5, 40(%1)\n" + " movq 48(%0), %%mm6\n" + " movntq %%mm6, 48(%1)\n" + " movq 56(%0), %%mm7\n" + " movntq %%mm7, 56(%1)\n" + ".section .fixup, \"ax\"\n" + "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ + " jmp 2b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 4\n" + " .long 1b, 3b\n" + ".previous" + : : "r" (from), "r" (to) : "memory" + ); + from+=64; + to+=64; + } + for(i = 
(PAGE_SIZE-320)/64; i < PAGE_SIZE/64; i++) + { + __asm__ __volatile__ ( + "2: movq (%0), %%mm0\n" + " movntq %%mm0, (%1)\n" + " movq 8(%0), %%mm1\n" + " movntq %%mm1, 8(%1)\n" + " movq 16(%0), %%mm2\n" + " movntq %%mm2, 16(%1)\n" + " movq 24(%0), %%mm3\n" + " movntq %%mm3, 24(%1)\n" + " movq 32(%0), %%mm4\n" + " movntq %%mm4, 32(%1)\n" + " movq 40(%0), %%mm5\n" + " movntq %%mm5, 40(%1)\n" + " movq 48(%0), %%mm6\n" + " movntq %%mm6, 48(%1)\n" + " movq 56(%0), %%mm7\n" + " movntq %%mm7, 56(%1)\n" + : : "r" (from), "r" (to) : "memory" + ); + from+=64; + to+=64; + } + /* since movntq is weakly-ordered, a "sfence" is needed to become + * ordered again. + */ + __asm__ __volatile__ ( + " sfence\n" : : + ); + kernel_fpu_end(); +} + +/* + * Generic MMX implementation without K7 specific streaming + */ +void zero_page_mmx(void *page) +{ + int i; + + kernel_fpu_begin(); + + __asm__ __volatile__ ( + " pxor %%mm0, %%mm0\n" : : + ); + + for(i = 0; i < PAGE_SIZE/128; i++) + { + __asm__ __volatile__ ( + " movq %%mm0, (%0)\n" + " movq %%mm0, 8(%0)\n" + " movq %%mm0, 16(%0)\n" + " movq %%mm0, 24(%0)\n" + " movq %%mm0, 32(%0)\n" + " movq %%mm0, 40(%0)\n" + " movq %%mm0, 48(%0)\n" + " movq %%mm0, 56(%0)\n" + " movq %%mm0, 64(%0)\n" + " movq %%mm0, 72(%0)\n" + " movq %%mm0, 80(%0)\n" + " movq %%mm0, 88(%0)\n" + " movq %%mm0, 96(%0)\n" + " movq %%mm0, 104(%0)\n" + " movq %%mm0, 112(%0)\n" + " movq %%mm0, 120(%0)\n" + : : "r" (page) : "memory" + ); + page+=128; + } + + kernel_fpu_end(); +} + +void copy_page_mmx(void *to, void *from) +{ + int i; + + + kernel_fpu_begin(); + + __asm__ __volatile__ ( + "1: prefetch (%0)\n" + " prefetch 64(%0)\n" + " prefetch 128(%0)\n" + " prefetch 192(%0)\n" + " prefetch 256(%0)\n" + "2:\n" + ".section .fixup, \"ax\"\n" + "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ + " jmp 2b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 4\n" + " .long 1b, 3b\n" + ".previous" + : : "r" (from) + ); + + for(i = 0; i < PAGE_SIZE/64; i++) + { + __asm__ 
__volatile__ ( + "1: prefetch 320(%0)\n" + "2: movq (%0), %%mm0\n" + " movq 8(%0), %%mm1\n" + " movq 16(%0), %%mm2\n" + " movq 24(%0), %%mm3\n" + " movq %%mm0, (%1)\n" + " movq %%mm1, 8(%1)\n" + " movq %%mm2, 16(%1)\n" + " movq %%mm3, 24(%1)\n" + " movq 32(%0), %%mm0\n" + " movq 40(%0), %%mm1\n" + " movq 48(%0), %%mm2\n" + " movq 56(%0), %%mm3\n" + " movq %%mm0, 32(%1)\n" + " movq %%mm1, 40(%1)\n" + " movq %%mm2, 48(%1)\n" + " movq %%mm3, 56(%1)\n" + ".section .fixup, \"ax\"\n" + "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ + " jmp 2b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 4\n" + " .long 1b, 3b\n" + ".previous" + : : "r" (from), "r" (to) : "memory" + ); + from+=64; + to+=64; + } + kernel_fpu_end(); +} diff -urpN linux-2.6.11.src/arch/i386/lib/page_ops.c linux-2.6.11-nt.src/arch/i386/lib/page_ops.c --- linux-2.6.11.src/arch/i386/lib/page_ops.c Thu Jan 1 03:00:00 1970 +++ linux-2.6.11-nt.src/arch/i386/lib/page_ops.c Fri Mar 18 11:30:51 2005 @@ -0,0 +1,108 @@ +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/sysctl.h> + +#include <asm/hardirq.h> + +void zero_page_mmx(void*); +void copy_page_mmx(void*, const void*); +void zero_page_3dnow(void*); +void copy_page_3dnow(void*, const void*); +void zero_page_sse(void*); +void copy_page_sse(void*, const void*); + +static void zero_page_slow(void * page) +{ + int d0, d1; + __asm__ __volatile__( + " cld\n" + " rep ; stosl\n" + : "=&c" (d0), "=&D" (d1) + :"a" (0),"1" (page),"0" (PAGE_SIZE/4) + :"memory" + ); +} + +static void copy_page_slow(void *to, const void *from) +{ + int d0, d1, d2; + __asm__ __volatile__( + " cld\n" + " rep ; movsl\n" + : "=&c" (d0), "=&D" (d1), "=&S" (d2) + : "0" (PAGE_SIZE/4),"1" ((long) to),"2" ((long) from) + : "memory" + ); +} + +int change_pageops = 0; + +static void (*zero_f)(void *) = zero_page_slow; +static void (*copy_f)(void *, const void*) = copy_page_slow; + +#define SW_TO(a) do { \ + zero_f = zero_page_##a; \ + copy_f = copy_page_##a; \ + 
printk("Switched to " #a " clear/copy page ops\n"); \ +} while(0) + +static void change_ops(void) +{ + switch(change_pageops) { + case 1: SW_TO(slow); break; + case 2: SW_TO(mmx); break; + case 3: SW_TO(3dnow); break; + case 4: SW_TO(sse); break; + default: + printk("unimplemented!\n"); + } + change_pageops = 0; +} + +void clear_page(void *page) +{ + if(unlikely(in_interrupt())) { + zero_page_slow(page); + return; + } + if(!change_pageops) { + zero_f(page); + return; + } + change_ops(); + zero_f(page); +} + +void copy_page(void *to, const void *from) +{ + if(unlikely(in_interrupt())) { + copy_page_slow(to, from); + return; + } + if(!change_pageops) { + copy_f(to, from); + return; + } + change_ops(); + copy_f(to, from); +} + +static struct ctl_table pageop_table[] = { + { + .ctl_name = 19847, /* I typed random number */ + .procname = "pageop", + .data = &change_pageops, + .maxlen = sizeof(change_pageops), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0 } +}; + +static int __init pageops_init(void) +{ + register_sysctl_table(pageop_table, 1); + return 0; +} + +module_init(pageops_init) diff -urpN linux-2.6.11.src/arch/i386/lib/sse_page.c linux-2.6.11-nt.src/arch/i386/lib/sse_page.c --- linux-2.6.11.src/arch/i386/lib/sse_page.c Thu Jan 1 03:00:00 1970 +++ linux-2.6.11-nt.src/arch/i386/lib/sse_page.c Fri Mar 18 11:30:51 2005 @@ -0,0 +1,112 @@ +/* +* linux/arch/i386/lib/sse.c +* +* Copyright 2004 Jens Maurer +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. 
+* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +* +* Send feedback to <Jens.Maurer@gmx.net> +*/ + +#include <linux/preempt.h> /* preempt_disable */ +#include <asm/page.h> /* PAGE_SIZE */ +#include <asm/system.h> /* cr0 ops */ + + +/* +* SSE library helper functions +*/ + +#define SSE_START(cr0) do { \ + preempt_disable(); \ + cr0 = read_cr0(); \ + clts(); \ + } while(0) + + +#define SSE_END(cr0) do { \ + write_cr0(cr0); \ + preempt_enable(); \ + } while(0) + +void zero_page_sse(void * page) +{ + unsigned char xmm_save[16]; + unsigned int cr0; + int i; + + SSE_START(cr0); + asm volatile( + " movups %%xmm0, (%0)\n" + " xorps %%xmm0, %%xmm0\n" + : : "r" (xmm_save) + ); + for(i = 0; i < PAGE_SIZE/16/4; i++) { + asm volatile( + " movntps %%xmm0, (%0)\n" + " movntps %%xmm0, 16(%0)\n" + " movntps %%xmm0, 32(%0)\n" + " movntps %%xmm0, 48(%0)\n" + : : "r"(page) : "memory" + ); + page += 16*4; + } + asm volatile( + " movups (%0), %%xmm0\n" + " sfence\n" + : : "r" (xmm_save) : "memory" + ); + SSE_END(cr0); +} + +void copy_page_sse(void *to, void *from) +{ + unsigned char xmm_save[16*4]; + unsigned int cr0; + int i; + + SSE_START(cr0); + asm volatile( + " movups %%xmm0, (%0)\n" + " movups %%xmm1, 16(%0)\n" + " movups %%xmm2, 32(%0)\n" + " movups %%xmm3, 48(%0)\n" + : : "r" (xmm_save) + ); + for(i = 0; i < PAGE_SIZE/16/4; i++) { + asm volatile( + " movaps (%0), %%xmm0\n" + " movaps 16(%0), %%xmm1\n" + " movaps 32(%0), %%xmm2\n" + " movaps 48(%0), %%xmm3\n" + " movntps %%xmm0, (%1)\n" + " movntps %%xmm1, 16(%1)\n" + " movntps %%xmm2, 32(%1)\n" + " movntps %%xmm3, 48(%1)\n" + : : "r" (from), "r" (to) : "memory" + ); + from += 16*4; + to += 16*4; + } + asm volatile( + " movups (%0), %%xmm0\n" + " movups 16(%0), %%xmm1\n" + " movups 32(%0), %%xmm2\n" + " movups 48(%0), %%xmm3\n" + " sfence\n" + : : "r" (xmm_save) : 
"memory" + ); + SSE_END(cr0); +} diff -urpN linux-2.6.11.src/include/asm-i386/page.h linux-2.6.11-nt.src/include/asm-i386/page.h --- linux-2.6.11.src/include/asm-i386/page.h Thu Mar 3 09:31:08 2005 +++ linux-2.6.11-nt.src/include/asm-i386/page.h Fri Mar 18 11:30:51 2005 @@ -12,26 +12,8 @@ #ifdef __KERNEL__ #ifndef __ASSEMBLY__ -#include <linux/config.h> - -#ifdef CONFIG_X86_USE_3DNOW - -#include <asm/mmx.h> - -#define clear_page(page) mmx_clear_page((void *)(page)) -#define copy_page(to,from) mmx_copy_page(to,from) - -#else - -/* - * On older X86 processors it's not a win to use MMX here it seems. - * Maybe the K6-III ? - */ - -#define clear_page(page) memset((void *)(page), 0, PAGE_SIZE) -#define copy_page(to,from) memcpy((void *)(to), (void *)(from), PAGE_SIZE) - -#endif +extern void clear_page(void*); +extern void copy_page(void*, const void*); #define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) [-- Attachment #3: page_asm.tar.bz2 --] [-- Type: application/x-tbz, Size: 5697 bytes --] ^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] add a clear_pages function to clear pages of higher order 2005-03-18 9:54 ` Denis Vlasenko @ 2005-03-18 15:00 ` Christoph Lameter 2005-03-18 19:28 ` Andi Kleen 0 siblings, 1 reply; 28+ messages in thread From: Christoph Lameter @ 2005-03-18 15:00 UTC (permalink / raw) To: Denis Vlasenko Cc: Dave Hansen, Andi Kleen, Andrew Morton, Linux Kernel Mailing List, Mel Gorman, linux-ia64, Jens.Maurer On Fri, 18 Mar 2005, Denis Vlasenko wrote: > NT stores are not about 5% increase. 200%-300%. Provided you are ok with > the fact that zeroed page ends up evicted from cache. Luckily, this is exactly > what you want with prezeroing. These are pretty significant results. Maybe it's best to use non-temporal stores in general for clearing pages? I checked and Itanium has always used non-temporal stores. So there will be no benefit for us from this approach (we have 16k and 64k page sizes which may make the situation a bit different). Try to update the i386 architectures to do the same? Or for prezeroing, you could register a zeroing driver that would use the non-temporal stores with V8 of the prezeroing patches. In any case the clear_pages patch is not useful the way it was intended for us and I have dropped this from the prezeroing patch. ^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] add a clear_pages function to clear pages of higher order 2005-03-18 15:00 ` Christoph Lameter @ 2005-03-18 19:28 ` Andi Kleen 2005-03-18 20:19 ` Christoph Lameter ` (2 more replies) 0 siblings, 3 replies; 28+ messages in thread From: Andi Kleen @ 2005-03-18 19:28 UTC (permalink / raw) To: Christoph Lameter Cc: Denis Vlasenko, Dave Hansen, Andrew Morton, Linux Kernel Mailing List, Mel Gorman, linux-ia64, Jens.Maurer On Fri, Mar 18, 2005 at 07:00:06AM -0800, Christoph Lameter wrote: > On Fri, 18 Mar 2005, Denis Vlasenko wrote: > > > NT stores are not about 5% increase. 200%-300%. Provided you are ok with > > the fact that zeroed page ends up evicted from cache. Luckily, this is exactly > > what you want with prezeroing. > > These are pretty significant results. Maybe its best to use non-temporal The differences are actually smaller. I do not know what Denis benchmarked, but in my tests the difference was never more than ~10%. Did he get one zero too many? It does not make any sense if you think about it - the memory bus of the CPU cannot be that much faster than the cache. And the drawback of eating the cache misses later is really very significant. > stores in general for clearing pages? I checked and Itanium has always > used non-temporal stores. So there will be no benefit for us from this That is weird. I would actually try to switch to temporal stores, maybe it will improve some benchmarks. > approach (we have 16k and 64k page sizes which may make the situation a > bit different). Try to update the i386 architectures to do the same? Definitely not. You can experiment with using it for the cleaner daemon, but even there I would use some heuristic to make sure you only use it on pages that are at the end of a pretty long queue. E.g. if you can guarantee that the page allocator will go through 500k-1MB before going to the NT page that is cache cold it may be a good idea. But that might be pretty complicated and I am not sure it will be worth it. 
But for the clear running in the page fault handler context it is definitely a bad idea. -Andi ^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] add a clear_pages function to clear pages of higher order 2005-03-18 19:28 ` Andi Kleen @ 2005-03-18 20:19 ` Christoph Lameter 2005-03-21 15:30 ` Denis Vlasenko 2005-03-24 18:34 ` David Mosberger 2 siblings, 0 replies; 28+ messages in thread From: Christoph Lameter @ 2005-03-18 20:19 UTC (permalink / raw) To: Andi Kleen Cc: Denis Vlasenko, Dave Hansen, Andrew Morton, Linux Kernel Mailing List, Mel Gorman, linux-ia64, Jens.Maurer On Fri, 18 Mar 2005, Andi Kleen wrote: > It does not make any sense if you think of it - the memory bus > of the CPU cannot be that much faster than the cache. The memory bus would be able to reach a higher rate if properly optimized for sequential writes to memory. A cache typically does random writes. ^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] add a clear_pages function to clear pages of higher order 2005-03-18 19:28 ` Andi Kleen 2005-03-18 20:19 ` Christoph Lameter @ 2005-03-21 15:30 ` Denis Vlasenko 2005-03-24 18:34 ` David Mosberger 2 siblings, 0 replies; 28+ messages in thread From: Denis Vlasenko @ 2005-03-21 15:30 UTC (permalink / raw) To: Andi Kleen, Christoph Lameter Cc: Dave Hansen, Andrew Morton, Linux Kernel Mailing List, Mel Gorman, linux-ia64, Jens.Maurer [-- Attachment #1: Type: text/plain, Size: 1835 bytes --] On Friday 18 March 2005 21:28, Andi Kleen wrote: > On Fri, Mar 18, 2005 at 07:00:06AM -0800, Christoph Lameter wrote: > > On Fri, 18 Mar 2005, Denis Vlasenko wrote: > > > > > NT stores are not about 5% increase. 200%-300%. Provided you are ok with > > > the fact that zeroed page ends up evicted from cache. Luckily, this is exactly > > > what you want with prezeroing. > > > > These are pretty significant results. Maybe its best to use non-temporal > > The differences are actually less. I do not know what Denis benchmarked, > but in my tests the difference was never more than ~10%. He got a zero > too much? No. See attached. # gcc -O2 0main.c # ./a.out Page clear/copy benchmark program. buffer size: 1 Mb Each test tried 64 times, max and min CPU cycles per page are reported. Please disregard max values. They are due to system interference only. clear_page() tests: normal_clear_page - took 44214 max,12615 min cycles per page normal_clear_page - took 18969 max,12649 min cycles per page repstosl_clear_page - took 19897 max,12655 min cycles per page movq_clear_page - took 39391 max,10782 min cycles per page movntq_clear_page - took 21612 max, 4779 min cycles per page copy_page() tests: .... I'm basically saying that 'microbenchmark-visible' performance of NT stores is 200-300% higher than 'normal' stores. BTW: cache eviction is not an intrinsic property of non-temporal stores. 
It's merely how they're implemented in current CPUs: if NT stores hit cached line, invalidate it and push stores to bus. Else just push stores to bus without reading cacheline from RAM first. It is possible that some future CPU won't evict cacheline if NT stores happened to hit it: "if NT stores hit cached line, MODIFY it and push stores to bus". -- vda [-- Attachment #2: page_asm.tar.bz2 --] [-- Type: application/x-tbz, Size: 5707 bytes --] ^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] add a clear_pages function to clear pages of higher order 2005-03-18 19:28 ` Andi Kleen 2005-03-18 20:19 ` Christoph Lameter 2005-03-21 15:30 ` Denis Vlasenko @ 2005-03-24 18:34 ` David Mosberger 2005-03-24 18:41 ` Christoph Lameter ` (2 more replies) 2 siblings, 3 replies; 28+ messages in thread From: David Mosberger @ 2005-03-24 18:34 UTC (permalink / raw) To: Andi Kleen Cc: Christoph Lameter, Denis Vlasenko, Dave Hansen, Andrew Morton, Linux Kernel Mailing List, Mel Gorman, linux-ia64, Jens.Maurer >>>>> On Fri, 18 Mar 2005 20:28:08 +0100, Andi Kleen <ak@muc.de> said: >> stores in general for clearing pages? I checked and Itanium has >> always used non-temporal stores. So there will be no benefit for >> us from this Andi> That is weird. I would actually try to switch to temporal Andi> stores, maybe it will improve some benchmarks. That's definitely the case. See my earlier post on this topic: http://www.gelato.unsw.edu.au/linux-ia64/0409/11012.html Unfortunately, nobody reported any results for larger machines and/or more interesting workloads, so the patch is in limbo at this time. Clearly, if the CPU that's clearing the page is likely to use that same page soon after, it'd be useful to use temporal stores. --david ^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] add a clear_pages function to clear pages of higher order 2005-03-24 18:34 ` David Mosberger @ 2005-03-24 18:41 ` Christoph Lameter 2005-03-24 19:03 ` David S. Miller 2005-03-27 17:12 ` Andi Kleen 2005-04-06 0:15 ` Christoph Lameter 2 siblings, 1 reply; 28+ messages in thread From: Christoph Lameter @ 2005-03-24 18:41 UTC (permalink / raw) To: davidm Cc: Andi Kleen, Christoph Lameter, Denis Vlasenko, Dave Hansen, Andrew Morton, Linux Kernel Mailing List, Mel Gorman, linux-ia64, Jens.Maurer On Thu, 24 Mar 2005, David Mosberger wrote: > That's definitely the case. See my earlier post on this topic: > > http://www.gelato.unsw.edu.au/linux-ia64/0409/11012.html > > Unfortunately, nobody reported any results for larger machines and/or > more interesting workloads, so the patch is in limbo at this time. > Clearly, if the CPU that's clearing the page is likely to use that > same page soon after, it'd be useful to use temporal stores. So it would be useful to have clear_page -> Temporal. Only zaps one page and clear_pages -> Zaps a page of arbitrary order, non-temporal Rework the clear_pages patch to do just that? Maybe rename clear_pages to clear_pages_nt? prep_zero_page would use a temporal clear for an order 0 page but a nontemporal clear for higher order pages. ^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] add a clear_pages function to clear pages of higher order 2005-03-24 18:41 ` Christoph Lameter @ 2005-03-24 19:03 ` David S. Miller 2005-03-24 22:49 ` Christoph Lameter 0 siblings, 1 reply; 28+ messages in thread From: David S. Miller @ 2005-03-24 19:03 UTC (permalink / raw) To: Christoph Lameter Cc: davidm, ak, clameter, vda, haveblue, akpm, linux-kernel, mel, linux-ia64, Jens.Maurer On Thu, 24 Mar 2005 10:41:06 -0800 (PST) Christoph Lameter <clameter@engr.sgi.com> wrote: > So it would be useful to have > > clear_page -> Temporal. Only zaps one page > > and > > clear_pages -> Zaps arbitrary order of page non-temporal > > > Rework the clear_pages patch to do just that? Maybe rename clear_pages > clear_pages_nt? > > prep_zero_page would use a temporal clear for an order 0 page but a > nontemporal clear for higher order pages. That sounds about right to me. Hmmm, I'm inspired to experiment with this on sparc64 a bit. :-) ^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] add a clear_pages function to clear pages of higher order 2005-03-24 19:03 ` David S. Miller @ 2005-03-24 22:49 ` Christoph Lameter 2005-03-24 23:13 ` David S. Miller 2005-03-25 2:29 ` David S. Miller 0 siblings, 2 replies; 28+ messages in thread From: Christoph Lameter @ 2005-03-24 22:49 UTC (permalink / raw) To: David S. Miller Cc: davidm, ak, clameter, vda, haveblue, akpm, linux-kernel, mel, linux-ia64, Jens.Maurer On Thu, 24 Mar 2005, David S. Miller wrote: > > prep_zero_page would use a temporal clear for an order 0 page but a > > nontemporal clear for higher order pages. > > That sounds about right to me. > > Hmmm, I'm inspired to experiment with this on sparc64 a bit. Could you help me fix up this patch replacing the old clear_pages patch? Introduces a new function clear_cold(void *pageaddress, int order) to clear pages of an arbitrary size with non temporal stores. Cold clearing is typically faster than hot clearing. Hot clearing is beneficial when the data is to be used soon. (The hot cold distinction also works well with the new hot and cold aware prezeroing daemon) - Use cold clearing for huge pages. - For ia64 also make clear_page use temporal stores. - Patch needs fixes to work properly on i386, x86_64 and sparc64. - There may be other allocations that can benefit from the increased performance possible for cold zeroed pages if the pages are not to be used right away. Add __GFP_COLD to the gfp_flags for those. 
Signed-off-by: Christoph Lameter <clameter@sgi.com> Index: linux-2.6.11/mm/hugetlb.c =================================================================== --- linux-2.6.11.orig/mm/hugetlb.c 2005-03-01 23:38:12.000000000 -0800 +++ linux-2.6.11/mm/hugetlb.c 2005-03-24 14:12:53.000000000 -0800 @@ -78,7 +78,6 @@ void free_huge_page(struct page *page) struct page *alloc_huge_page(void) { struct page *page; - int i; spin_lock(&hugetlb_lock); page = dequeue_huge_page(); @@ -89,8 +88,7 @@ struct page *alloc_huge_page(void) spin_unlock(&hugetlb_lock); set_page_count(page, 1); page[1].mapping = (void *)free_huge_page; - for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i) - clear_highpage(&page[i]); + prep_zero_page(page, HUGETLB_PAGE_ORDER, GFP_HIGHUSER | __GFP_COLD); return page; } Index: linux-2.6.11/mm/page_alloc.c =================================================================== --- linux-2.6.11.orig/mm/page_alloc.c 2005-03-24 13:15:40.000000000 -0800 +++ linux-2.6.11/mm/page_alloc.c 2005-03-24 14:15:15.000000000 -0800 @@ -633,11 +633,17 @@ void fastcall free_cold_page(struct page free_hot_cold_page(page, 1); } -static inline void prep_zero_page(struct page *page, int order, int gfp_flags) +void prep_zero_page(struct page *page, int order, int gfp_flags) { int i; BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); + +#ifdef CONFIG_CLEAR_COLD + if ((gfp_flags & __GFP_COLD) && !PageHighmem(page)) + clear_cold(page_address(page), order) + else +#endif for(i = 0; i < (1 << order); i++) clear_highpage(page + i); } Index: linux-2.6.11/include/linux/gfp.h =================================================================== --- linux-2.6.11.orig/include/linux/gfp.h 2005-03-01 23:37:50.000000000 -0800 +++ linux-2.6.11/include/linux/gfp.h 2005-03-24 14:12:53.000000000 -0800 @@ -131,4 +131,5 @@ extern void FASTCALL(free_cold_page(stru void page_alloc_init(void); +void prep_zero_page(struct page *, unsigned int order, unsigned int gfp_flags); #endif /* __LINUX_GFP_H */ 
Index: linux-2.6.11/arch/ia64/Kconfig =================================================================== --- linux-2.6.11.orig/arch/ia64/Kconfig 2005-03-01 23:38:26.000000000 -0800 +++ linux-2.6.11/arch/ia64/Kconfig 2005-03-24 14:12:53.000000000 -0800 @@ -46,6 +46,10 @@ config GENERIC_IOMAP bool default y +config CLEAR_COLD + bool + default y + choice prompt "System type" default IA64_GENERIC Index: linux-2.6.11/include/asm-ia64/page.h =================================================================== --- linux-2.6.11.orig/include/asm-ia64/page.h 2005-03-01 23:37:48.000000000 -0800 +++ linux-2.6.11/include/asm-ia64/page.h 2005-03-24 14:12:53.000000000 -0800 @@ -57,6 +57,8 @@ # define STRICT_MM_TYPECHECKS extern void clear_page (void *page); +/* Clear arbitrary order page using nontemporal writes */ +extern void clear_cold (void *page, unsigned int order); extern void copy_page (void *to, void *from); /* Index: linux-2.6.11/arch/ia64/kernel/ia64_ksyms.c =================================================================== --- linux-2.6.11.orig/arch/ia64/kernel/ia64_ksyms.c 2005-03-01 23:38:08.000000000 -0800 +++ linux-2.6.11/arch/ia64/kernel/ia64_ksyms.c 2005-03-24 14:12:53.000000000 -0800 @@ -39,6 +39,7 @@ EXPORT_SYMBOL(__up); #include <asm/page.h> EXPORT_SYMBOL(clear_page); +EXPORT_SYMBOL(clear_cold); #ifdef CONFIG_VIRTUAL_MEM_MAP #include <linux/bootmem.h> Index: linux-2.6.11/arch/ia64/lib/clear_page.S =================================================================== --- linux-2.6.11.orig/arch/ia64/lib/clear_page.S 2005-03-01 23:37:47.000000000 -0800 +++ linux-2.6.11/arch/ia64/lib/clear_page.S 2005-03-24 14:12:53.000000000 -0800 @@ -7,6 +7,8 @@ * 1/06/01 davidm Tuned for Itanium. 
* 2/12/02 kchen Tuned for both Itanium and McKinley * 3/08/02 davidm Some more tweaking + * 24/3/04 clameter Make clear_page use temporal stores + add clear_cold using nontemporal stores */ #include <linux/config.h> @@ -53,6 +55,58 @@ GLOBAL_ENTRY(clear_page) ;; #ifdef CONFIG_ITANIUM // Optimized for Itanium +1: stf.spill [dst1] = f0, 64 + stf.spill [dst2] = f0, 64 + cmp.lt p8,p0=dst_fetch, dst_last + ;; +#else + // Optimized for McKinley +1: stf.spill [dst1] = f0, 64 + stf.spill [dst2] = f0, 64 + stf.spill [dst3] = f0, 64 + stf.spill [dst4] = f0, 128 + cmp.lt p8,p0=dst_fetch, dst_last + ;; + stf.spill [dst1] = f0, 64 + stf.spill [dst2] = f0, 64 +#endif + stf.spill [dst3] = f0, 64 +(p8) stf.spill [dst_fetch] = f0, L3_LINE_SIZE + br.cloop.sptk.few 1b + ;; + mov ar.lc = saved_lc // restore lc + br.ret.sptk.many rp +END(clear_page) + + +GLOBAL_ENTRY(clear_cold) + .prologue + .regstk 2,0,0,0 + mov r16 = PAGE_SIZE/L3_LINE_SIZE // main loop count + mov totsize = PAGE_SIZE + .save ar.lc, saved_lc + mov saved_lc = ar.lc + ;; + .body + adds dst1 = 16, in0 + mov ar.lc = (PREFETCH_LINES - 1) + mov dst_fetch = in0 + adds dst2 = 32, in0 + shl r16 = r16, in1 + shl totsize = totsize, in1 + ;; +.fetch: stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE + adds dst3 = 48, in0 // executing this multiple times is harmless + br.cloop.sptk.few .fetch + add r16 = -1,r16 + add dst_last = totsize, dst_fetch + adds dst4 = 64, in0 + ;; + mov ar.lc = r16 // one L3 line per iteration + adds dst_last = -PREFETCH_LINES*L3_LINE_SIZE, dst_last + ;; +#ifdef CONFIG_ITANIUM + // Optimized for Itanium 1: stf.spill.nta [dst1] = f0, 64 stf.spill.nta [dst2] = f0, 64 cmp.lt p8,p0=dst_fetch, dst_last @@ -74,4 +128,4 @@ GLOBAL_ENTRY(clear_page) ;; mov ar.lc = saved_lc // restore lc br.ret.sptk.many rp -END(clear_page) +END(clear_cold) Index: linux-2.6.11/arch/i386/Kconfig =================================================================== --- linux-2.6.11.orig/arch/i386/Kconfig 2005-03-24 13:15:36.000000000 -0800 
+++ linux-2.6.11/arch/i386/Kconfig 2005-03-24 14:12:53.000000000 -0800 @@ -33,6 +33,10 @@ config GENERIC_IOMAP bool default y +config CLEAR_COLD + bool + default y + source "init/Kconfig" menu "Processor type and features" Index: linux-2.6.11/include/asm-i386/page.h =================================================================== --- linux-2.6.11.orig/include/asm-i386/page.h 2005-03-01 23:37:49.000000000 -0800 +++ linux-2.6.11/include/asm-i386/page.h 2005-03-24 14:12:53.000000000 -0800 @@ -19,6 +19,7 @@ #include <asm/mmx.h> #define clear_page(page) mmx_clear_page((void *)(page)) +#define clear_cold(page, order) mmx_clear_cold((void *)(page), order) #define copy_page(to,from) mmx_copy_page(to,from) #else @@ -29,6 +30,8 @@ */ #define clear_page(page) memset((void *)(page), 0, PAGE_SIZE) +/* Clear arbitrary order page with nontemporal stores... is memset temporal?? */ +#define clear_cold(page, order) memset((void *)(page), 0, PAGE_SIZE << order) #define copy_page(to,from) memcpy((void *)(to), (void *)(from), PAGE_SIZE) #endif Index: linux-2.6.11/include/asm-i386/mmx.h =================================================================== --- linux-2.6.11.orig/include/asm-i386/mmx.h 2005-03-01 23:38:09.000000000 -0800 +++ linux-2.6.11/include/asm-i386/mmx.h 2005-03-24 14:12:53.000000000 -0800 @@ -9,6 +9,7 @@ extern void *_mmx_memcpy(void *to, const void *from, size_t size); extern void mmx_clear_page(void *page); +extern void mmx_clear_cold(void *page, unsigned int order); extern void mmx_copy_page(void *to, void *from); #endif Index: linux-2.6.11/arch/i386/lib/mmx.c =================================================================== --- linux-2.6.11.orig/arch/i386/lib/mmx.c 2005-03-01 23:38:09.000000000 -0800 +++ linux-2.6.11/arch/i386/lib/mmx.c 2005-03-24 14:12:53.000000000 -0800 @@ -397,3 +397,14 @@ void mmx_copy_page(void *to, void *from) else fast_copy_page(to, from); } + +/* FIXME: Make this a real cold zeroing function */ +void mmx_clear_cold(void *page, int 
order) +{ + int i; + + for(i=0; i < (1 << order); i++) { + mmx_clear_page(page); + page += PAGE_SIZE; + } +} Index: linux-2.6.11/arch/x86_64/Kconfig =================================================================== --- linux-2.6.11.orig/arch/x86_64/Kconfig 2005-03-24 13:15:37.000000000 -0800 +++ linux-2.6.11/arch/x86_64/Kconfig 2005-03-24 14:12:53.000000000 -0800 @@ -78,6 +78,10 @@ config GENERIC_IOMAP bool default y +config CLEAR_COLD + bool + default y + source "init/Kconfig" Index: linux-2.6.11/include/asm-x86_64/page.h =================================================================== --- linux-2.6.11.orig/include/asm-x86_64/page.h 2005-03-01 23:37:47.000000000 -0800 +++ linux-2.6.11/include/asm-x86_64/page.h 2005-03-24 14:12:53.000000000 -0800 @@ -33,6 +33,8 @@ #ifndef __ASSEMBLY__ void clear_page(void *); +/* Clear arbitrary order page using non-temporal writes */ +void clear_cold(void *, int order); void copy_page(void *, void *); #define clear_user_page(page, vaddr, pg) clear_page(page) Index: linux-2.6.11/arch/x86_64/kernel/x8664_ksyms.c =================================================================== --- linux-2.6.11.orig/arch/x86_64/kernel/x8664_ksyms.c 2005-03-24 13:15:37.000000000 -0800 +++ linux-2.6.11/arch/x86_64/kernel/x8664_ksyms.c 2005-03-24 14:12:53.000000000 -0800 @@ -108,6 +108,7 @@ EXPORT_SYMBOL(pci_mem_start); EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); +EXPORT_SYMBOL(clear_cold); EXPORT_SYMBOL(cpu_pda); #ifdef CONFIG_SMP Index: linux-2.6.11/arch/x86_64/lib/clear_page.S =================================================================== --- linux-2.6.11.orig/arch/x86_64/lib/clear_page.S 2005-03-01 23:38:08.000000000 -0800 +++ linux-2.6.11/arch/x86_64/lib/clear_page.S 2005-03-24 14:12:53.000000000 -0800 @@ -48,3 +48,57 @@ clear_page_c: ret clear_page_c_end: .previous + + +/* + * Zero a page cold. 
+ * rdi page + * rsi order + */ + .globl clear_cold + .p2align 4 +clear_cold: + movl $4096/64,%eax + movl %esi, %ecx + shll %cl, %eax + movl %eax, %ecx + xorl %eax,%eax + .p2align 4 +.Lcloop: + decl %ecx +#define PUTC(x) movq %rax,x*8(%rdi) + movq %rax,(%rdi) + PUTC(1) + PUTC(2) + PUTC(3) + PUTC(4) + PUTC(5) + PUTC(6) + PUTC(7) + leaq 64(%rdi),%rdi + jnz .Lcloop + nop + ret +clear_cold_end: + + .section .altinstructions,"a" + .align 8 + .quad clear_cold + .quad clear_cold_c + .byte X86_FEATURE_K8_C + .byte clear_cold_end-clear_cold + .byte clear_cold_c_end-clear_cold_c + .previous + + .section .altinstr_replacement,"ax" +clear_cold_c: + movl $4096/8,%eax + movl %esi, %ecx + shll %cl, %eax + movl %eax, %ecx + xorl %eax,%eax + rep + stosq + ret +clear_cold_c_end: + .previous Index: linux-2.6.11/arch/sparc64/lib/clear_page.S =================================================================== --- linux-2.6.11.orig/arch/sparc64/lib/clear_page.S 2005-03-01 23:38:17.000000000 -0800 +++ linux-2.6.11/arch/sparc64/lib/clear_page.S 2005-03-24 14:12:53.000000000 -0800 @@ -103,3 +103,82 @@ clear_page_common: out: retl nop + .globl clear_cold +clear_cold: /* %o0=dest, %o1=order */ + sethi %hi(PAGE_SIZE/64), %o2 + clr %o4 + or %o2, %lo(PAGE_SIZE/64), %o2 + ba,pt %xcc, clear_cold_common + sllx %o2, %o1, %o1 + + /* This thing is pretty important, it shows up + * on the profiles via do_anonymous_page(). + */ + .align 32 + .globl clear_cold_page +clear_cold_user_page: /* %o0=dest, %o1=vaddr */ + lduw [%g6 + TI_PRE_COUNT], %o2 + sethi %uhi(PAGE_OFFSET), %g2 + sethi %hi(PAGE_SIZE), %o4 + + sllx %g2, 32, %g2 + sethi %uhi(TTE_BITS_TOP), %g3 + + sllx %g3, 32, %g3 + sub %o0, %g2, %g1 ! paddr + + or %g3, TTE_BITS_BOTTOM, %g3 + and %o1, %o4, %o0 ! vaddr D-cache alias bit + + or %g1, %g3, %g1 ! TTE data + sethi %hi(TLBTEMP_BASE), %o3 + + add %o2, 1, %o4 + add %o0, %o3, %o0 ! TTE vaddr + + /* Disable preemption. 
*/ + mov TLB_TAG_ACCESS, %g3 + stw %o4, [%g6 + TI_PRE_COUNT] + + /* Load TLB entry. */ + rdpr %pstate, %o4 + wrpr %o4, PSTATE_IE, %pstate + stxa %o0, [%g3] ASI_DMMU + stxa %g1, [%g0] ASI_DTLB_DATA_IN + flush %g6 + wrpr %o4, 0x0, %pstate + + sethi %hi(PAGE_SIZE/64), %o1 + mov 1, %o4 + or %o1, %lo(PAGE_SIZE/64), %o1 + +clear_cold_common: + VISEntryHalf + membar #StoreLoad | #StoreStore | #LoadStore + fzero %f0 + mov %o0, %g1 ! remember vaddr for tlbflush + fzero %f2 + faddd %f0, %f2, %f4 + fmuld %f0, %f2, %f6 + faddd %f0, %f2, %f8 + fmuld %f0, %f2, %f10 + + faddd %f0, %f2, %f12 + fmuld %f0, %f2, %f14 +2: stda %f0, [%o0 + %g0] ASI_BLK_P + subcc %o1, 1, %o1 + bne,pt %icc, 2b + add %o0, 0x40, %o0 + membar #Sync + VISExitHalf + + brz,pn %o4, outcold + nop + + stxa %g0, [%g1] ASI_DMMU_DEMAP + membar #Sync + stw %o2, [%g6 + TI_PRE_COUNT] + +outcold: retl + nop + Index: linux-2.6.11/include/asm-sparc64/page.h =================================================================== --- linux-2.6.11.orig/include/asm-sparc64/page.h 2005-03-01 23:38:07.000000000 -0800 +++ linux-2.6.11/include/asm-sparc64/page.h 2005-03-24 14:12:53.000000000 -0800 @@ -16,6 +16,8 @@ extern void _clear_page(void *page); #define clear_page(X) _clear_page((void *)(X)) +/* Non temporal clear an arbitrary order page */ +extern void clear_cold(void *page, unsigned int order); struct page; extern void clear_user_page(void *addr, unsigned long vaddr, struct page *page); #define copy_page(X,Y) memcpy((void *)(X), (void *)(Y), PAGE_SIZE) Index: linux-2.6.11/arch/sparc64/Kconfig =================================================================== --- linux-2.6.11.orig/arch/sparc64/Kconfig 2005-03-01 23:38:25.000000000 -0800 +++ linux-2.6.11/arch/sparc64/Kconfig 2005-03-24 14:12:53.000000000 -0800 @@ -16,6 +16,10 @@ config TIME_INTERPOLATION bool default y +config CLEAR_COLD + bool + default y + source "init/Kconfig" config SYSVIPC_COMPAT ^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] add a clear_pages function to clear pages of higher order 2005-03-24 22:49 ` Christoph Lameter @ 2005-03-24 23:13 ` David S. Miller 2005-03-25 2:29 ` David S. Miller 1 sibling, 0 replies; 28+ messages in thread From: David S. Miller @ 2005-03-24 23:13 UTC (permalink / raw) To: Christoph Lameter Cc: davidm, ak, clameter, vda, haveblue, akpm, linux-kernel, mel, linux-ia64, Jens.Maurer On Thu, 24 Mar 2005 14:49:55 -0800 (PST) Christoph Lameter <clameter@engr.sgi.com> wrote: > On Thu, 24 Mar 2005, David S. Miller wrote: > > > > prep_zero_page would use a temporal clear for an order 0 page but a > > > nontemporal clear for higher order pages. > > > > That sounds about right to me. > > > > Hmmm, I'm inspired to experiment with this on sparc64 a bit. > > Could you help me fix up this patch replacing the old clear_pages patch? Sure, I'll play with it. Meanwhile, here are some numbers. I changed just the clear_page() implementation on sparc64 so that it used prefetching and normal temporal stores. The machine is a uniprocessor 1.5Ghz Ultra-IIIi, 64K write-through D-cache, 64K I-cache, 1MB L2 cache. I did 4 timed 'vmlinux' builds after a fresh boot: BEFORE: real 9m8.720s user 8m28.345s sys 0m32.734s real 9m2.034s user 8m28.763s sys 0m32.512s real 9m1.848s user 8m28.970s sys 0m32.204s real 9m1.701s user 8m28.715s sys 0m32.394s AFTER: real 9m2.241s user 8m16.633s sys 0m36.451s real 8m53.739s user 8m17.165s sys 0m36.052s real 8m54.089s user 8m17.266s sys 0m36.219s real 8m54.071s user 8m17.473s sys 0m36.073s So, at the very least, my results agree with D. Mosberger's on IA64. At the cost of ~4 seconds of system time, we gain ~11 seconds of user time. I'm pretty much convinced this is a win. I wonder if it matters to do something similar for copy_page*() as well. ^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] add a clear_pages function to clear pages of higher order 2005-03-24 22:49 ` Christoph Lameter 2005-03-24 23:13 ` David S. Miller @ 2005-03-25 2:29 ` David S. Miller 2005-03-25 2:43 ` Christoph Lameter 1 sibling, 1 reply; 28+ messages in thread From: David S. Miller @ 2005-03-25 2:29 UTC (permalink / raw) To: Christoph Lameter Cc: davidm, ak, clameter, vda, haveblue, akpm, linux-kernel, mel, linux-ia64, Jens.Maurer On Thu, 24 Mar 2005 14:49:55 -0800 (PST) Christoph Lameter <clameter@engr.sgi.com> wrote: > Could you help me fix up this patch replacing the old clear_pages patch? Ok, first you need to mark the order and gfp arguments as unsigned for mm/page_alloc.c:prep_zero_page() so that it matches the prototype you added to include/linux/gfp.h else the compiler warns a lot. Next, in the same function in mm/page_alloc.c, "PageHighmem()" is typo'd, it should be "PageHighMem()". The clear_cold() call on the next line needs a semicolon. Erm... were any of your test builds done with the new CONFIG_CLEAR_COLD option enabled? :-) Next, replace your arch/sparc64/lib/clear_page.S diff with this one and things would be working and we'll be using the proper temporal vs. non-temporal stores on that platform. ===== arch/sparc64/lib/clear_page.S 1.1 vs edited ===== --- 1.1/arch/sparc64/lib/clear_page.S 2004-08-08 19:54:07 -07:00 +++ edited/arch/sparc64/lib/clear_page.S 2005-03-24 15:56:33 -08:00 @@ -72,26 +72,34 @@ mov 1, %o4 clear_page_common: - VISEntryHalf membar #StoreLoad | #StoreStore | #LoadStore - fzero %f0 sethi %hi(PAGE_SIZE/64), %o1 mov %o0, %g1 ! 
remember vaddr for tlbflush - fzero %f2 or %o1, %lo(PAGE_SIZE/64), %o1 - faddd %f0, %f2, %f4 - fmuld %f0, %f2, %f6 - faddd %f0, %f2, %f8 - fmuld %f0, %f2, %f10 - faddd %f0, %f2, %f12 - fmuld %f0, %f2, %f14 -1: stda %f0, [%o0 + %g0] ASI_BLK_P +#define PREFETCH(x, y) prefetch x, y +#define PREFETCH_CODE 2 + + PREFETCH([%o0 + 0x000], PREFETCH_CODE) + PREFETCH([%o0 + 0x040], PREFETCH_CODE) + PREFETCH([%o0 + 0x080], PREFETCH_CODE) + PREFETCH([%o0 + 0x0c0], PREFETCH_CODE) + PREFETCH([%o0 + 0x100], PREFETCH_CODE) + PREFETCH([%o0 + 0x140], PREFETCH_CODE) + PREFETCH([%o0 + 0x180], PREFETCH_CODE) +1: + stx %g0, [%o0 + 0x00] + stx %g0, [%o0 + 0x08] + stx %g0, [%o0 + 0x10] + stx %g0, [%o0 + 0x18] + stx %g0, [%o0 + 0x20] + stx %g0, [%o0 + 0x28] + stx %g0, [%o0 + 0x30] + stx %g0, [%o0 + 0x38] + PREFETCH([%o0 + 0x1c0], PREFETCH_CODE) subcc %o1, 1, %o1 bne,pt %icc, 1b add %o0, 0x40, %o0 - membar #Sync - VISExitHalf brz,pn %o4, out nop @@ -101,5 +109,32 @@ stw %o2, [%g6 + TI_PRE_COUNT] out: retl + nop + + .globl clear_cold +clear_cold: /* %o0=dest, %o1=order */ + sethi %hi(PAGE_SIZE/64), %o2 + clr %o4 + or %o2, %lo(PAGE_SIZE/64), %o2 + sllx %o2, %o1, %o1 + VISEntryHalf + membar #StoreLoad | #StoreStore | #LoadStore + fzero %f0 + fzero %f2 + faddd %f0, %f2, %f4 + fmuld %f0, %f2, %f6 + faddd %f0, %f2, %f8 + fmuld %f0, %f2, %f10 + + faddd %f0, %f2, %f12 + fmuld %f0, %f2, %f14 +2: stda %f0, [%o0 + %g0] ASI_BLK_P + subcc %o1, 1, %o1 + bne,pt %icc, 2b + add %o0, 0x40, %o0 + membar #Sync + VISExitHalf + + retl nop ^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] add a clear_pages function to clear pages of higher order 2005-03-25 2:29 ` David S. Miller @ 2005-03-25 2:43 ` Christoph Lameter 0 siblings, 0 replies; 28+ messages in thread From: Christoph Lameter @ 2005-03-25 2:43 UTC (permalink / raw) To: David S. Miller Cc: Christoph Lameter, davidm, ak, vda, haveblue, akpm, linux-kernel, mel, linux-ia64, Jens.Maurer On Thu, 24 Mar 2005, David S. Miller wrote: > Erm... were any of your test builds done with the new CONFIG_CLEAR_COLD > option enabled? :-) These were all fixed but I failed to do a "quilt refresh" .... sigh... The email issues are also fixed now .... sigh. What a day. > Next, replace your arch/sparc64/lib/clear_page.S diff with this one and > things would be working and we'll be using the proper temporal vs. > non-temporal stores on that platform. Thanks. Here is the patch with your changes and a "quilt refresh" ;-) --------------------------------------------------------------------- Introduces a new function clear_cold(void *pageaddress, int order) to clear pages of an arbitrary size with non temporal stores. Cold clearing is typically faster than hot clearing. Hot clearing is beneficial when the data is to be used soon. (Will also work well with the new hot and cold aware prezeroing daemon) Use cold clearing for huge pages. For ia64 also make clear_page use temporal stores. Patch needs fixes to work properly on i386 and x86_64. 
Signed-off-by: Christoph Lameter <clameter@sgi.com> Index: linux-2.6.11/mm/hugetlb.c =================================================================== --- linux-2.6.11.orig/mm/hugetlb.c 2005-03-01 23:38:12.000000000 -0800 +++ linux-2.6.11/mm/hugetlb.c 2005-03-24 14:12:53.000000000 -0800 @@ -78,7 +78,6 @@ void free_huge_page(struct page *page) struct page *alloc_huge_page(void) { struct page *page; - int i; spin_lock(&hugetlb_lock); page = dequeue_huge_page(); @@ -89,8 +88,7 @@ struct page *alloc_huge_page(void) spin_unlock(&hugetlb_lock); set_page_count(page, 1); page[1].mapping = (void *)free_huge_page; - for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i) - clear_highpage(&page[i]); + prep_zero_page(page, HUGETLB_PAGE_ORDER, GFP_HIGHUSER | __GFP_COLD); return page; } Index: linux-2.6.11/mm/page_alloc.c =================================================================== --- linux-2.6.11.orig/mm/page_alloc.c 2005-03-24 13:15:40.000000000 -0800 +++ linux-2.6.11/mm/page_alloc.c 2005-03-24 18:39:22.000000000 -0800 @@ -633,11 +633,17 @@ void fastcall free_cold_page(struct page free_hot_cold_page(page, 1); } -static inline void prep_zero_page(struct page *page, int order, int gfp_flags) +void prep_zero_page(struct page *page, unsigned int order, int gfp_flags) { int i; BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); + +#ifdef CONFIG_CLEAR_COLD + if ((gfp_flags & __GFP_COLD) && !PageHighMem(page)) + clear_cold(page_address(page), order); + else +#endif for(i = 0; i < (1 << order); i++) clear_highpage(page + i); } Index: linux-2.6.11/include/linux/gfp.h =================================================================== --- linux-2.6.11.orig/include/linux/gfp.h 2005-03-01 23:37:50.000000000 -0800 +++ linux-2.6.11/include/linux/gfp.h 2005-03-24 14:16:44.000000000 -0800 @@ -131,4 +131,5 @@ extern void FASTCALL(free_cold_page(stru void page_alloc_init(void); +void prep_zero_page(struct page *, unsigned int order, int gfp_flags); #endif /* __LINUX_GFP_H */ 
Index: linux-2.6.11/arch/ia64/Kconfig =================================================================== --- linux-2.6.11.orig/arch/ia64/Kconfig 2005-03-01 23:38:26.000000000 -0800 +++ linux-2.6.11/arch/ia64/Kconfig 2005-03-24 14:12:53.000000000 -0800 @@ -46,6 +46,10 @@ config GENERIC_IOMAP bool default y +config CLEAR_COLD + bool + default y + choice prompt "System type" default IA64_GENERIC Index: linux-2.6.11/include/asm-ia64/page.h =================================================================== --- linux-2.6.11.orig/include/asm-ia64/page.h 2005-03-01 23:37:48.000000000 -0800 +++ linux-2.6.11/include/asm-ia64/page.h 2005-03-24 14:12:53.000000000 -0800 @@ -57,6 +57,8 @@ # define STRICT_MM_TYPECHECKS extern void clear_page (void *page); +/* Clear arbitrary order page using nontemporal writes */ +extern void clear_cold (void *page, unsigned int order); extern void copy_page (void *to, void *from); /* Index: linux-2.6.11/arch/ia64/kernel/ia64_ksyms.c =================================================================== --- linux-2.6.11.orig/arch/ia64/kernel/ia64_ksyms.c 2005-03-01 23:38:08.000000000 -0800 +++ linux-2.6.11/arch/ia64/kernel/ia64_ksyms.c 2005-03-24 14:12:53.000000000 -0800 @@ -39,6 +39,7 @@ EXPORT_SYMBOL(__up); #include <asm/page.h> EXPORT_SYMBOL(clear_page); +EXPORT_SYMBOL(clear_cold); #ifdef CONFIG_VIRTUAL_MEM_MAP #include <linux/bootmem.h> Index: linux-2.6.11/arch/ia64/lib/clear_page.S =================================================================== --- linux-2.6.11.orig/arch/ia64/lib/clear_page.S 2005-03-01 23:37:47.000000000 -0800 +++ linux-2.6.11/arch/ia64/lib/clear_page.S 2005-03-24 14:24:29.000000000 -0800 @@ -7,6 +7,8 @@ * 1/06/01 davidm Tuned for Itanium. 
* 2/12/02 kchen Tuned for both Itanium and McKinley * 3/08/02 davidm Some more tweaking + * 24/3/04 clameter Make clear_page use temporal stores + add clear_cold using nontemporal stores */ #include <linux/config.h> @@ -53,6 +55,59 @@ GLOBAL_ENTRY(clear_page) ;; #ifdef CONFIG_ITANIUM // Optimized for Itanium +1: stf.spill [dst1] = f0, 64 + stf.spill [dst2] = f0, 64 + cmp.lt p8,p0=dst_fetch, dst_last + ;; +#else + // Optimized for McKinley +1: stf.spill [dst1] = f0, 64 + stf.spill [dst2] = f0, 64 + stf.spill [dst3] = f0, 64 + stf.spill [dst4] = f0, 128 + cmp.lt p8,p0=dst_fetch, dst_last + ;; + stf.spill [dst1] = f0, 64 + stf.spill [dst2] = f0, 64 +#endif + stf.spill [dst3] = f0, 64 +(p8) stf.spill [dst_fetch] = f0, L3_LINE_SIZE + br.cloop.sptk.few 1b + ;; + mov ar.lc = saved_lc // restore lc + br.ret.sptk.many rp +END(clear_page) + +#define totsize r14 + +GLOBAL_ENTRY(clear_cold) + .prologue + .regstk 2,0,0,0 + mov r16 = PAGE_SIZE/L3_LINE_SIZE // main loop count + mov totsize = PAGE_SIZE + .save ar.lc, saved_lc + mov saved_lc = ar.lc + ;; + .body + adds dst1 = 16, in0 + mov ar.lc = (PREFETCH_LINES - 1) + mov dst_fetch = in0 + adds dst2 = 32, in0 + shl r16 = r16, in1 + shl totsize = totsize, in1 + ;; +.fetc: stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE + adds dst3 = 48, in0 // executing this multiple times is harmless + br.cloop.sptk.few .fetc + add r16 = -1,r16 + add dst_last = totsize, dst_fetch + adds dst4 = 64, in0 + ;; + mov ar.lc = r16 // one L3 line per iteration + adds dst_last = -PREFETCH_LINES*L3_LINE_SIZE, dst_last + ;; +#ifdef CONFIG_ITANIUM + // Optimized for Itanium 1: stf.spill.nta [dst1] = f0, 64 stf.spill.nta [dst2] = f0, 64 cmp.lt p8,p0=dst_fetch, dst_last @@ -74,4 +129,4 @@ GLOBAL_ENTRY(clear_page) ;; mov ar.lc = saved_lc // restore lc br.ret.sptk.many rp -END(clear_page) +END(clear_cold) Index: linux-2.6.11/arch/i386/Kconfig =================================================================== --- linux-2.6.11.orig/arch/i386/Kconfig 2005-03-24 
13:15:36.000000000 -0800 +++ linux-2.6.11/arch/i386/Kconfig 2005-03-24 14:12:53.000000000 -0800 @@ -33,6 +33,10 @@ config GENERIC_IOMAP bool default y +config CLEAR_COLD + bool + default y + source "init/Kconfig" menu "Processor type and features" Index: linux-2.6.11/include/asm-i386/page.h =================================================================== --- linux-2.6.11.orig/include/asm-i386/page.h 2005-03-01 23:37:49.000000000 -0800 +++ linux-2.6.11/include/asm-i386/page.h 2005-03-24 14:12:53.000000000 -0800 @@ -19,6 +19,7 @@ #include <asm/mmx.h> #define clear_page(page) mmx_clear_page((void *)(page)) +#define clear_cold(page, order) mmx_clear_cold((void *)(page), order) #define copy_page(to,from) mmx_copy_page(to,from) #else @@ -29,6 +30,8 @@ */ #define clear_page(page) memset((void *)(page), 0, PAGE_SIZE) +/* Clear arbitrary order page with nontemporal stores... is memset temporal?? */ +#define clear_cold(page, order) memset((void *)(page), 0, PAGE_SIZE << order) #define copy_page(to,from) memcpy((void *)(to), (void *)(from), PAGE_SIZE) #endif Index: linux-2.6.11/include/asm-i386/mmx.h =================================================================== --- linux-2.6.11.orig/include/asm-i386/mmx.h 2005-03-01 23:38:09.000000000 -0800 +++ linux-2.6.11/include/asm-i386/mmx.h 2005-03-24 14:12:53.000000000 -0800 @@ -9,6 +9,7 @@ extern void *_mmx_memcpy(void *to, const void *from, size_t size); extern void mmx_clear_page(void *page); +extern void mmx_clear_cold(void *page, unsigned int order); extern void mmx_copy_page(void *to, void *from); #endif Index: linux-2.6.11/arch/i386/lib/mmx.c =================================================================== --- linux-2.6.11.orig/arch/i386/lib/mmx.c 2005-03-01 23:38:09.000000000 -0800 +++ linux-2.6.11/arch/i386/lib/mmx.c 2005-03-24 14:12:53.000000000 -0800 @@ -397,3 +397,14 @@ void mmx_copy_page(void *to, void *from) else fast_copy_page(to, from); } + +/* FIXME: Make this a real cold zeroing function */ +void 
mmx_clear_cold(void *page, int order) +{ + int i; + + for(i=0; i < (1 << order); i++) { + mmx_clear_page(page); + page += PAGE_SIZE; + } +} Index: linux-2.6.11/arch/x86_64/Kconfig =================================================================== --- linux-2.6.11.orig/arch/x86_64/Kconfig 2005-03-24 13:15:37.000000000 -0800 +++ linux-2.6.11/arch/x86_64/Kconfig 2005-03-24 14:12:53.000000000 -0800 @@ -78,6 +78,10 @@ config GENERIC_IOMAP bool default y +config CLEAR_COLD + bool + default y + source "init/Kconfig" Index: linux-2.6.11/include/asm-x86_64/page.h =================================================================== --- linux-2.6.11.orig/include/asm-x86_64/page.h 2005-03-01 23:37:47.000000000 -0800 +++ linux-2.6.11/include/asm-x86_64/page.h 2005-03-24 14:12:53.000000000 -0800 @@ -33,6 +33,8 @@ #ifndef __ASSEMBLY__ void clear_page(void *); +/* Clear arbitrary order page using non-temporal writes */ +void clear_cold(void *, int order); void copy_page(void *, void *); #define clear_user_page(page, vaddr, pg) clear_page(page) Index: linux-2.6.11/arch/x86_64/kernel/x8664_ksyms.c =================================================================== --- linux-2.6.11.orig/arch/x86_64/kernel/x8664_ksyms.c 2005-03-24 13:15:37.000000000 -0800 +++ linux-2.6.11/arch/x86_64/kernel/x8664_ksyms.c 2005-03-24 14:12:53.000000000 -0800 @@ -108,6 +108,7 @@ EXPORT_SYMBOL(pci_mem_start); EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); +EXPORT_SYMBOL(clear_cold); EXPORT_SYMBOL(cpu_pda); #ifdef CONFIG_SMP Index: linux-2.6.11/arch/x86_64/lib/clear_page.S =================================================================== --- linux-2.6.11.orig/arch/x86_64/lib/clear_page.S 2005-03-01 23:38:08.000000000 -0800 +++ linux-2.6.11/arch/x86_64/lib/clear_page.S 2005-03-24 14:12:53.000000000 -0800 @@ -48,3 +48,57 @@ clear_page_c: ret clear_page_c_end: .previous + + +/* + * Zero a page cold. 
+ * rdi page + * rsi order + */ + .globl clear_cold + .p2align 4 +clear_cold: + movl $4096/64,%eax + movl %esi, %ecx + shll %cl, %eax + movl %eax, %ecx + xorl %eax,%eax + .p2align 4 +.Lcloop: + decl %ecx +#define PUTC(x) movq %rax,x*8(%rdi) + movq %rax,(%rdi) + PUTC(1) + PUTC(2) + PUTC(3) + PUTC(4) + PUTC(5) + PUTC(6) + PUTC(7) + leaq 64(%rdi),%rdi + jnz .Lcloop + nop + ret +clear_cold_end: + + .section .altinstructions,"a" + .align 8 + .quad clear_cold + .quad clear_cold_c + .byte X86_FEATURE_K8_C + .byte clear_cold_end-clear_cold + .byte clear_cold_c_end-clear_cold_c + .previous + + .section .altinstr_replacement,"ax" +clear_cold_c: + movl $4096/8,%eax + movl %esi, %ecx + shll %cl, %eax + movl %eax, %ecx + xorl %eax,%eax + rep + stosq + ret +clear_cold_c_end: + .previous Index: linux-2.6.11/arch/sparc64/lib/clear_page.S =================================================================== --- linux-2.6.11.orig/arch/sparc64/lib/clear_page.S 2005-03-01 23:38:17.000000000 -0800 +++ linux-2.6.11/arch/sparc64/lib/clear_page.S 2005-03-24 18:39:44.000000000 -0800 @@ -72,13 +72,127 @@ clear_user_page: /* %o0=dest, %o1=vaddr mov 1, %o4 clear_page_common: + membar #StoreLoad | #StoreStore | #LoadStore + sethi %hi(PAGE_SIZE/64), %o1 + mov %o0, %g1 ! 
remember vaddr for tlbflush + or %o1, %lo(PAGE_SIZE/64), %o1 + +#define PREFETCH(x, y) prefetch x, y +#define PREFETCH_CODE 2 + + PREFETCH([%o0 + 0x000], PREFETCH_CODE) + PREFETCH([%o0 + 0x040], PREFETCH_CODE) + PREFETCH([%o0 + 0x080], PREFETCH_CODE) + PREFETCH([%o0 + 0x0c0], PREFETCH_CODE) + PREFETCH([%o0 + 0x100], PREFETCH_CODE) + PREFETCH([%o0 + 0x140], PREFETCH_CODE) + PREFETCH([%o0 + 0x180], PREFETCH_CODE) +1: + stx %g0, [%o0 + 0x00] + stx %g0, [%o0 + 0x08] + stx %g0, [%o0 + 0x10] + stx %g0, [%o0 + 0x18] + stx %g0, [%o0 + 0x20] + stx %g0, [%o0 + 0x28] + stx %g0, [%o0 + 0x30] + stx %g0, [%o0 + 0x38] + PREFETCH([%o0 + 0x1c0], PREFETCH_CODE) + subcc %o1, 1, %o1 + bne,pt %icc, 1b + add %o0, 0x40, %o0 + + brz,pn %o4, out + nop + + stxa %g0, [%g1] ASI_DMMU_DEMAP + membar #Sync + stw %o2, [%g6 + TI_PRE_COUNT] + +out: retl + nop + + .globl clear_cold +clear_cold: /* %o0=dest, %o1=order */ + sethi %hi(PAGE_SIZE/64), %o2 + clr %o4 + or %o2, %lo(PAGE_SIZE/64), %o2 + sllx %o2, %o1, %o1 VISEntryHalf membar #StoreLoad | #StoreStore | #LoadStore fzero %f0 + fzero %f2 + faddd %f0, %f2, %f4 + fmuld %f0, %f2, %f6 + faddd %f0, %f2, %f8 + fmuld %f0, %f2, %f10 + + faddd %f0, %f2, %f12 + fmuld %f0, %f2, %f14 +2: stda %f0, [%o0 + %g0] ASI_BLK_P + subcc %o1, 1, %o1 + bne,pt %icc, 2b + add %o0, 0x40, %o0 + membar #Sync + VISExitHalf + + retl + nop + + .globl clear_cold +clear_cold: /* %o0=dest, %o1=order */ + sethi %hi(PAGE_SIZE/64), %o2 + clr %o4 + or %o2, %lo(PAGE_SIZE/64), %o2 + ba,pt %xcc, clear_cold_common + sllx %o2, %o1, %o1 + + /* This thing is pretty important, it shows up + * on the profiles via do_anonymous_page(). + */ + .align 32 + .globl clear_cold_page +clear_cold_user_page: /* %o0=dest, %o1=vaddr */ + lduw [%g6 + TI_PRE_COUNT], %o2 + sethi %uhi(PAGE_OFFSET), %g2 + sethi %hi(PAGE_SIZE), %o4 + + sllx %g2, 32, %g2 + sethi %uhi(TTE_BITS_TOP), %g3 + + sllx %g3, 32, %g3 + sub %o0, %g2, %g1 ! paddr + + or %g3, TTE_BITS_BOTTOM, %g3 + and %o1, %o4, %o0 ! 
vaddr D-cache alias bit + + or %g1, %g3, %g1 ! TTE data + sethi %hi(TLBTEMP_BASE), %o3 + + add %o2, 1, %o4 + add %o0, %o3, %o0 ! TTE vaddr + + /* Disable preemption. */ + mov TLB_TAG_ACCESS, %g3 + stw %o4, [%g6 + TI_PRE_COUNT] + + /* Load TLB entry. */ + rdpr %pstate, %o4 + wrpr %o4, PSTATE_IE, %pstate + stxa %o0, [%g3] ASI_DMMU + stxa %g1, [%g0] ASI_DTLB_DATA_IN + flush %g6 + wrpr %o4, 0x0, %pstate + sethi %hi(PAGE_SIZE/64), %o1 + mov 1, %o4 + or %o1, %lo(PAGE_SIZE/64), %o1 + +clear_cold_common: + VISEntryHalf + membar #StoreLoad | #StoreStore | #LoadStore + fzero %f0 mov %o0, %g1 ! remember vaddr for tlbflush fzero %f2 - or %o1, %lo(PAGE_SIZE/64), %o1 faddd %f0, %f2, %f4 fmuld %f0, %f2, %f6 faddd %f0, %f2, %f8 @@ -86,20 +200,20 @@ clear_page_common: faddd %f0, %f2, %f12 fmuld %f0, %f2, %f14 -1: stda %f0, [%o0 + %g0] ASI_BLK_P +2: stda %f0, [%o0 + %g0] ASI_BLK_P subcc %o1, 1, %o1 - bne,pt %icc, 1b + bne,pt %icc, 2b add %o0, 0x40, %o0 membar #Sync VISExitHalf - brz,pn %o4, out + brz,pn %o4, outcold nop stxa %g0, [%g1] ASI_DMMU_DEMAP membar #Sync stw %o2, [%g6 + TI_PRE_COUNT] -out: retl +outcold: retl nop Index: linux-2.6.11/include/asm-sparc64/page.h =================================================================== --- linux-2.6.11.orig/include/asm-sparc64/page.h 2005-03-01 23:38:07.000000000 -0800 +++ linux-2.6.11/include/asm-sparc64/page.h 2005-03-24 14:12:53.000000000 -0800 @@ -16,6 +16,8 @@ extern void _clear_page(void *page); #define clear_page(X) _clear_page((void *)(X)) +/* Non temporal clear an arbitrary order page */ +extern void clear_cold(void *page, unsigned int order); struct page; extern void clear_user_page(void *addr, unsigned long vaddr, struct page *page); #define copy_page(X,Y) memcpy((void *)(X), (void *)(Y), PAGE_SIZE) Index: linux-2.6.11/arch/sparc64/Kconfig =================================================================== --- linux-2.6.11.orig/arch/sparc64/Kconfig 2005-03-01 23:38:25.000000000 -0800 +++ linux-2.6.11/arch/sparc64/Kconfig 
2005-03-24 14:12:53.000000000 -0800 @@ -16,6 +16,10 @@ config TIME_INTERPOLATION bool default y +config CLEAR_COLD + bool + default y + source "init/Kconfig" config SYSVIPC_COMPAT ^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] add a clear_pages function to clear pages of higher order 2005-03-24 18:34 ` David Mosberger 2005-03-24 18:41 ` Christoph Lameter @ 2005-03-27 17:12 ` Andi Kleen 2005-03-27 18:23 ` David S. Miller 2005-03-29 1:58 ` Christoph Lameter 2005-04-06 0:15 ` Christoph Lameter 2 siblings, 2 replies; 28+ messages in thread From: Andi Kleen @ 2005-03-27 17:12 UTC (permalink / raw) To: davidm Cc: Christoph Lameter, Denis Vlasenko, Dave Hansen, Andrew Morton, Linux Kernel Mailing List, Mel Gorman, linux-ia64, Jens.Maurer > Clearly, if the CPU that's clearing the page is likely to use that > same page soon after, it'd be useful to use temporal stores. That is always the case in the current code (without Christoph's pre-cleaning daemon). The page fault handler clears the page, and user space is guaranteed to need at least one cacheline from the fresh page because it just did a page fault on it. With non-temporal stores you guarantee at least one hard cache miss directly after the return to user space. I suspect even with precleaning the average time from cleaning to use will be quite short. -Andi ^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] add a clear_pages function to clear pages of higher order 2005-03-27 17:12 ` Andi Kleen @ 2005-03-27 18:23 ` David S. Miller 2005-03-29 1:58 ` Christoph Lameter 1 sibling, 0 replies; 28+ messages in thread From: David S. Miller @ 2005-03-27 18:23 UTC (permalink / raw) To: Andi Kleen Cc: davidm, clameter, vda, haveblue, akpm, linux-kernel, mel, linux-ia64, Jens.Maurer On 27 Mar 2005 19:12:20 +0200 Andi Kleen <ak@muc.de> wrote: > With non temporal stores > you guarantee at least one hard cache miss directly after > the return to user space. This is true if the cacheline were not present already at the time of the non-temporal store. I know what you're trying to say, I'm just clarifying. The real question is if a large enough ratio of those cachelines in the page get similarly accessed. I happen to think the answer to that for any real example is yes. Yet, I have no way to prove this. It would be cool to do some hacks under Xen or user-mode Linux to get some real statistics about this. Actually, this could be done also with hacks to valgrind or other similar tools. QEMU could also be used. ^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] add a clear_pages function to clear pages of higher order 2005-03-27 17:12 ` Andi Kleen 2005-03-27 18:23 ` David S. Miller @ 2005-03-29 1:58 ` Christoph Lameter 1 sibling, 0 replies; 28+ messages in thread From: Christoph Lameter @ 2005-03-29 1:58 UTC (permalink / raw) To: Andi Kleen Cc: davidm, Christoph Lameter, Denis Vlasenko, Dave Hansen, Andrew Morton, Linux Kernel Mailing List, Mel Gorman, linux-ia64, Jens.Maurer On Sun, 27 Mar 2005, Andi Kleen wrote: > > Clearly, if the CPU that's clearing the page is likely to use that > > same page soon after, it'd be useful to use temporal stores. > > That is always the case in the current code (without Christophers > pre cleaning daemon). The page fault handler clears and user space > is guaranteed to need at least one cacheline from the fresh page > because it just did a page fault on it. With non temporal stores > you guarantee at least one hard cache miss directly after > the return to user space. It is not the case that *all* the cachelines of a page are going to be used right after zeroing. For the page fault case it is only guaranteed that *one* cacheline will be used. In the PTE/PMD/PUD page allocation cases it is likely that only a single cacheline is used. There are some cases in the code (apart from the fault handler) where zeroed pages are allocated with no guarantee of use (e.g. the allocations for buffers for shared memory or pipes). > I suspect even with precleaning the average time from cleaning to use will be > quite short. If the time is short then hot cleaning is the right way to go and then prezeroing is of no benefit. Prezeroing can only be of benefit if there is sufficient time between the zeroing and the use of the data. It must be sufficiently long to cause the cachelines to no longer be in the caches. Then the loading of these cachelines may be avoided which yields the performance benefit. ^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] add a clear_pages function to clear pages of higher order 2005-03-24 18:34 ` David Mosberger 2005-03-24 18:41 ` Christoph Lameter 2005-03-27 17:12 ` Andi Kleen @ 2005-04-06 0:15 ` Christoph Lameter 2005-04-06 0:23 ` David Mosberger 2 siblings, 1 reply; 28+ messages in thread From: Christoph Lameter @ 2005-04-06 0:15 UTC (permalink / raw) To: davidm Cc: Andi Kleen, Christoph Lameter, Denis Vlasenko, Dave Hansen, Andrew Morton, Linux Kernel Mailing List, Mel Gorman, linux-ia64, Jens.Maurer On Thu, 24 Mar 2005, David Mosberger wrote: > That's definitely the case. See my earlier post on this topic: > > http://www.gelato.unsw.edu.au/linux-ia64/0409/11012.html > > Unfortunately, nobody reported any results for larger machines and/or > more interesting workloads, so the patch is in limbo at this time. > Clearly, if the CPU that's clearing the page is likely to use that > same page soon after, it'd be useful to use temporal stores. Here are some numbers using lmbench of temporal writes vs. nontemporal writes on ia64 (8p machine but lmbench run only for one load). There seems to be some benefit for fork/exec but overall this does not seem to be a clear win. I suspect that the distinction between temporal vs. nontemporal writes is more beneficial on machines with smaller pagesizes since the likelihood that most cachelines of a page are used soon is increased and therefore hot zeroing is more beneficial. L M B E N C H 3 . 
0 S U M M A R Y ------------------------------------ (Alpha software, do not distribute) Basic system parameters ------------------------------------------------------------------------------------------- Host OS Description Mhz tlb cache mem scal pages line par load bytes --------- ------------------------- ----------------------- ---- ----- ----- ------ ---- margin Linux 2.6.12-rc1-bk3 ia64-linux-gnu 1300 128 1 margin Linux 2.6.12-rc1-bk3 ia64-linux-gnu 1300 128 1 margin Linux 2.6.12-rc1-bk3 ia64-linux-gnu 1300 128 1 margin Linux 2.6.12-rc1-bk3 ia64-linux-gnu 1300 128 1 margin Linux 2.6.12-rc1-bk3 ia64-linux-gnu 1300 128 1 margin Linux 2.6.12-rc1-bk3 ia64-linux-gnu 1300 128 1 margin Linux 2.6.12-rc1-bk3 ia64-linux-gnu 1300 128 1 margin Linux 2.6.12-rc1-bk3-dm ia64-linux-gnu 1300 128 1 margin Linux 2.6.12-rc1-bk3-dm ia64-linux-gnu 1300 128 1 margin Linux 2.6.12-rc1-bk3-dm ia64-linux-gnu 1300 128 1 Processor, Processes - times in microseconds - smaller is better ------------------------------------------------------------------------------------------ Host OS Mhz null null open slct sig sig fork exec sh call I/O stat clos TCP inst hndl proc proc proc --------- ------------------------- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- margin Linux 2.6.12-rc1-bk3 1300 0.04 0.26 4.90 6.11 15.7 0.39 2.43 528. 1926 4853 margin Linux 2.6.12-rc1-bk3 1300 0.04 0.27 4.86 6.10 15.7 0.39 2.45 522. 1910 4260 margin Linux 2.6.12-rc1-bk3 1300 0.04 0.26 4.85 6.10 15.8 0.39 2.40 526. 1916 4429 margin Linux 2.6.12-rc1-bk3 1300 0.04 0.26 4.84 6.11 15.7 0.39 2.40 531. 1838 4429 margin Linux 2.6.12-rc1-bk3 1300 0.04 0.26 4.85 6.11 15.8 0.39 2.47 553. 1931 5118 margin Linux 2.6.12-rc1-bk3 1300 0.04 0.26 5.09 6.37 15.7 0.39 2.40 537. 1934 5133 margin Linux 2.6.12-rc1-bk3 1300 0.04 0.26 5.09 6.35 15.8 0.39 2.40 555. 1939 5389 margin Linux 2.6.12-rc1-bk3-dm 1300 0.04 0.26 4.88 6.10 15.8 0.39 2.42 519. 
1829 4787 margin Linux 2.6.12-rc1-bk3-dm 1300 0.04 0.26 4.87 6.09 15.8 0.39 2.40 516. 1830 5057 margin Linux 2.6.12-rc1-bk3-dm 1300 0.04 0.27 4.86 6.10 15.8 0.39 2.40 512. 1878 5166 Context switching - times in microseconds - smaller is better ------------------------------------------------------------------------------------- Host OS 2p/0K 2p/16K 2p/64K 8p/16K 8p/64K 16p/16K 16p/64K ctxsw ctxsw ctxsw ctxsw ctxsw ctxsw ctxsw --------- ------------------------- ------ ------ ------ ------ ------ ------- ------- margin Linux 2.6.12-rc1-bk3 7.3300 2.7400 7.0400 4.4600 6.6200 3.94000 8.38000 margin Linux 2.6.12-rc1-bk3 7.6100 8.1000 7.3200 4.5900 7.1700 5.50000 7.84000 margin Linux 2.6.12-rc1-bk3 7.2400 8.0000 7.2100 4.3800 6.7500 4.77000 7.37000 margin Linux 2.6.12-rc1-bk3 7.4100 8.0400 7.0500 4.5100 7.2500 4.11000 7.03000 margin Linux 2.6.12-rc1-bk3 7.2600 8.2100 7.2400 4.6500 6.6500 4.08000 7.81000 margin Linux 2.6.12-rc1-bk3 7.4600 7.9000 7.3800 4.3800 6.6200 4.83000 7.27000 margin Linux 2.6.12-rc1-bk3 7.4400 8.2000 7.2000 5.8700 6.8000 4.86000 7.95000 margin Linux 2.6.12-rc1-bk3-dm 7.4400 8.3100 7.1300 5.6900 6.6500 5.49000 7.49000 margin Linux 2.6.12-rc1-bk3-dm 2.1300 8.0100 7.3800 4.6700 6.5500 4.22000 8.16000 margin Linux 2.6.12-rc1-bk3-dm 7.4900 8.1200 2.1500 4.3600 6.6900 5.54000 7.38000 *Local* Communication latencies in microseconds - smaller is better --------------------------------------------------------------------------------- Host OS 2p/0K Pipe AF UDP RPC/ TCP RPC/ TCP ctxsw UNIX UDP TCP conn --------- ------------------------- ----- ----- ---- ----- ----- ----- ----- ---- margin Linux 2.6.12-rc1-bk3 7.330 16.9 24.8 29.6 36.0 31.4 49.5 52. margin Linux 2.6.12-rc1-bk3 7.610 17.4 22.0 31.5 52. margin Linux 2.6.12-rc1-bk3 7.240 17.5 21.6 31.3 53. margin Linux 2.6.12-rc1-bk3 7.410 17.6 11.8 31.2 51. margin Linux 2.6.12-rc1-bk3 7.260 17.1 20.6 28.2 37.6 51.0 99.7 92. margin Linux 2.6.12-rc1-bk3 7.460 17.0 21.0 30.2 69.5 35.3 77.4 52. 
margin Linux 2.6.12-rc1-bk3 7.440 39.7 19.8 29.1 65.3 34.3 44.8 53. margin Linux 2.6.12-rc1-bk3-dm 7.440 17.4 20.5 29.4 37.0 34.3 86.7 77. margin Linux 2.6.12-rc1-bk3-dm 2.130 17.8 20.6 28.7 37.2 31.8 44.9 77. margin Linux 2.6.12-rc1-bk3-dm 7.490 17.5 11.3 29.0 37.4 77.1 46.1 53. File & VM system latencies in microseconds - smaller is better ------------------------------------------------------------------------------------------- Host OS 0K File 10K File Mmap Prot Page 100fd Create Delete Create Delete Latency Fault Fault selct --------- ------------------------- ------ ------ ------ ------ ------- ----- ------- ----- margin Linux 2.6.12-rc1-bk3 340.0 0.162 1.26430 10.6 margin Linux 2.6.12-rc1-bk3 339.0 0.176 1.26310 10.5 margin Linux 2.6.12-rc1-bk3 342.0 0.180 1.25700 10.5 margin Linux 2.6.12-rc1-bk3 341.0 0.207 1.25640 10.5 margin Linux 2.6.12-rc1-bk3 339.0 0.166 1.26310 10.6 margin Linux 2.6.12-rc1-bk3 343.0 0.159 1.26350 10.6 margin Linux 2.6.12-rc1-bk3 339.0 0.174 1.25660 10.6 margin Linux 2.6.12-rc1-bk3-dm 340.0 0.185 1.26090 10.6 margin Linux 2.6.12-rc1-bk3-dm 340.0 0.128 1.26310 10.5 margin Linux 2.6.12-rc1-bk3-dm 343.0 0.159 1.25960 10.5 *Local* Communication bandwidths in MB/s - bigger is better ----------------------------------------------------------------------------------------- Host OS Pipe AF TCP File Mmap Bcopy Bcopy Mem Mem UNIX reread reread (libc) (hand) read write --------- ------------------------- ---- ---- ---- ------ ------ ------ ------ ---- ----- margin Linux 2.6.12-rc1-bk3 1172 1826 562. 1732.6 573.5 535.7 284.9 521. 514.7 margin Linux 2.6.12-rc1-bk3 1169 1883 868. 1733.5 573.8 535.2 283.9 521. 514.6 margin Linux 2.6.12-rc1-bk3 1149 1897 654. 1725.5 573.6 535.1 285.2 521. 514.7 margin Linux 2.6.12-rc1-bk3 1167 1883 921. 1726.1 573.8 534.9 283.1 521. 514.7 margin Linux 2.6.12-rc1-bk3 1167 1146 413. 1726.8 573.6 535.4 283.6 522. 515.1 margin Linux 2.6.12-rc1-bk3 1156 1875 905. 1721.7 573.9 535.4 283.8 521. 
515.0 margin Linux 2.6.12-rc1-bk3 1103 1741 493. 1727.7 573.6 534.8 283.3 521. 514.8 margin Linux 2.6.12-rc1-bk3-dm 1160 1361 886. 1718.7 573.6 535.0 284.7 521. 514.8 margin Linux 2.6.12-rc1-bk3-dm 1166 1759 665. 1733.0 565.0 535.2 284.6 521. 514.8 margin Linux 2.6.12-rc1-bk3-dm 1140 1879 606. 1706.6 573.6 535.1 283.5 521. 514.6 patch: Index: linux-2.6.11/arch/ia64/lib/clear_page.S =================================================================== --- linux-2.6.11.orig/arch/ia64/lib/clear_page.S 2005-03-01 23:37:47.000000000 -0800 +++ linux-2.6.11/arch/ia64/lib/clear_page.S 2005-03-31 14:25:17.000000000 -0800 @@ -43,7 +43,7 @@ GLOBAL_ENTRY(clear_page) adds dst1 = 16, in0 adds dst2 = 32, in0 ;; -.fetch: stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE +.fetch: stf.spill [dst_fetch] = f0, L3_LINE_SIZE adds dst3 = 48, in0 // executing this multiple times is harmless br.cloop.sptk.few .fetch ;; @@ -53,23 +53,23 @@ GLOBAL_ENTRY(clear_page) ;; #ifdef CONFIG_ITANIUM // Optimized for Itanium -1: stf.spill.nta [dst1] = f0, 64 - stf.spill.nta [dst2] = f0, 64 +1: stf.spill [dst1] = f0, 64 + stf.spill [dst2] = f0, 64 cmp.lt p8,p0=dst_fetch, dst_last ;; #else // Optimized for McKinley -1: stf.spill.nta [dst1] = f0, 64 - stf.spill.nta [dst2] = f0, 64 - stf.spill.nta [dst3] = f0, 64 - stf.spill.nta [dst4] = f0, 128 +1: stf.spill [dst1] = f0, 64 + stf.spill [dst2] = f0, 64 + stf.spill [dst3] = f0, 64 + stf.spill [dst4] = f0, 128 cmp.lt p8,p0=dst_fetch, dst_last ;; - stf.spill.nta [dst1] = f0, 64 - stf.spill.nta [dst2] = f0, 64 + stf.spill [dst1] = f0, 64 + stf.spill [dst2] = f0, 64 #endif - stf.spill.nta [dst3] = f0, 64 -(p8) stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE + stf.spill [dst3] = f0, 64 +(p8) stf.spill [dst_fetch] = f0, L3_LINE_SIZE br.cloop.sptk.few 1b ;; mov ar.lc = saved_lc // restore lc ^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] add a clear_pages function to clear pages of higher order 2005-04-06 0:15 ` Christoph Lameter @ 2005-04-06 0:23 ` David Mosberger 2005-04-06 0:33 ` Christoph Lameter 0 siblings, 1 reply; 28+ messages in thread From: David Mosberger @ 2005-04-06 0:23 UTC (permalink / raw) To: Christoph Lameter Cc: davidm, Andi Kleen, Christoph Lameter, Denis Vlasenko, Dave Hansen, Andrew Morton, Linux Kernel Mailing List, Mel Gorman, linux-ia64, Jens.Maurer >>>>> On Tue, 5 Apr 2005 17:15:53 -0700 (PDT), Christoph Lameter <clameter@engr.sgi.com> said: Christoph> On Thu, 24 Mar 2005, David Mosberger wrote: >> That's definitely the case. See my earlier post on this topic: >> http://www.gelato.unsw.edu.au/linux-ia64/0409/11012.html >> Unfortunately, nobody reported any results for larger machines >> and/or more interesting workloads, so the patch is in limbo at >> this time. Clearly, if the CPU that's clearing the page is >> likely to use that same page soon after, it'd be useful to use >> temporal stores. Christoph> Here are some numbers using lmbench of temporal writes Christoph> vs. non temporal writes on ia64 (8p machine but lmbench Christoph> run only for one load). There seems to be some benefit Christoph> for fork/exec but overall this does not seem to be a Christoph> clear win. I suspect that the distinction between Christoph> temporal vs. nontemporal writes is be more beneficial on Christoph> machines with smaller pagesizes since the likelyhood that Christoph> most cachelines of a page are used soon is increased and Christoph> therefore hot zeroing is more beneficial. What LMbench test other than fork/exec would you have expected to be affected by this? LMbench is not a good benchmark for this (remember: it's a _micro_ benchmark). --david ^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] add a clear_pages function to clear pages of higher order 2005-04-06 0:23 ` David Mosberger @ 2005-04-06 0:33 ` Christoph Lameter 2005-04-06 4:48 ` David Mosberger 0 siblings, 1 reply; 28+ messages in thread From: Christoph Lameter @ 2005-04-06 0:33 UTC (permalink / raw) To: davidm Cc: Andi Kleen, Christoph Lameter, Denis Vlasenko, Dave Hansen, Andrew Morton, Linux Kernel Mailing List, Mel Gorman, linux-ia64, Jens.Maurer On Tue, 5 Apr 2005, David Mosberger wrote: > What LMbench test other than fork/exec would you have expected to be > affected by this? LMbench is not a good benchmark for this (remember: > it's a _micro_ benchmark). LMbench does a variety of things and I expected to see at least something on the page fault test and hopefully also some variations for other tests. Which benchmark would you recommend for this? ^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] add a clear_pages function to clear pages of higher order 2005-04-06 0:33 ` Christoph Lameter @ 2005-04-06 4:48 ` David Mosberger 2005-04-06 5:15 ` Gerrit Huizenga 0 siblings, 1 reply; 28+ messages in thread From: David Mosberger @ 2005-04-06 4:48 UTC (permalink / raw) To: Christoph Lameter Cc: davidm, Andi Kleen, Christoph Lameter, Denis Vlasenko, Dave Hansen, Andrew Morton, Linux Kernel Mailing List, Mel Gorman, linux-ia64, Jens.Maurer >>>>> On Tue, 5 Apr 2005 17:33:59 -0700 (PDT), Christoph Lameter <clameter@engr.sgi.com> said: Christoph> Which benchmark would you recommend for this? I don't know about "recommend", but I think SPECweb, SPECjbb, the-UNIX-multi-user-benchmark-whose-name-I-keep-forgetting, and in general anything that involves process-activity and/or large working sets might be interesting (in other words: anything but microbenchmarks; I'm afraid). --david ^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] add a clear_pages function to clear pages of higher order 2005-04-06 4:48 ` David Mosberger @ 2005-04-06 5:15 ` Gerrit Huizenga 2005-04-06 16:03 ` Grant Grundler 0 siblings, 1 reply; 28+ messages in thread From: Gerrit Huizenga @ 2005-04-06 5:15 UTC (permalink / raw) To: davidm Cc: Christoph Lameter, Andi Kleen, Christoph Lameter, Denis Vlasenko, Dave Hansen, Andrew Morton, Linux Kernel Mailing List, Mel Gorman, linux-ia64, Jens.Maurer On Tue, 05 Apr 2005 21:48:22 PDT, David Mosberger wrote: > >>>>> On Tue, 5 Apr 2005 17:33:59 -0700 (PDT), Christoph Lameter <clameter@engr.sgi.com> said: > > Christoph> Which benchmark would you recommend for this? > > I don't know about "recommend", but I think SPECweb, SPECjbb, > the-UNIX-multi-user-benchmark-whose-name-I-keep-forgetting, and in > general anything that involves process-activity and/or large working > sets might be interesting (in other words: anything but > microbenchmarks; I'm afraid). SpecSDET, Aim7 or ReAim from OSDL are probably what you are thinking of. gerrit ^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] add a clear_pages function to clear pages of higher order 2005-04-06 5:15 ` Gerrit Huizenga @ 2005-04-06 16:03 ` Grant Grundler 0 siblings, 0 replies; 28+ messages in thread From: Grant Grundler @ 2005-04-06 16:03 UTC (permalink / raw) To: Gerrit Huizenga Cc: davidm, Christoph Lameter, Andi Kleen, Christoph Lameter, Denis Vlasenko, Dave Hansen, Andrew Morton, Linux Kernel Mailing List, Mel Gorman, linux-ia64, Jens.Maurer On Tue, Apr 05, 2005 at 10:15:18PM -0700, Gerrit Huizenga wrote: > SpecSDET, Aim7 or ReAim from OSDL are probably what you are thinking of. SDET isn't publicly available. I hope by now osdl-reaim is called "osdl-aim7": http://lkml.org/lkml/2003/8/1/172 grant ^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] add a clear_pages function to clear pages of higher order 2005-03-11 8:08 ` Denis Vlasenko 2005-03-17 1:33 ` Christoph Lameter @ 2005-03-18 10:12 ` Andi Kleen 1 sibling, 0 replies; 28+ messages in thread From: Andi Kleen @ 2005-03-18 10:12 UTC (permalink / raw) To: Denis Vlasenko Cc: Christoph Lameter, Dave Hansen, Andrew Morton, Linux Kernel Mailing List, Mel Gorman > Andi Kleen (iirc) says that non-temporal stores seem to be > big win in microbenchmarks (and I second that), but they are > a net loss when we are going to use zeroed page just after > zeroing. He recommends avoid using non-temporal stores The rule of thumb is to only use non-temporal stores when your data set is bigger than the L2/L3 caches of the CPU. This means >1MB. The kernel normally never works on data sets that big. For Christoph's new background cleaner daemon it may be worth it when the queue is a LILO. This means it is likely there is a relatively long time between the clearing operation and a workload using it. But even then it is a very close call and would need clear benchmark numbers in macrobenchmarks. -Andi ^ permalink raw reply [flat|nested] 28+ messages in thread
end of thread, other threads:[~2005-04-06 16:03 UTC | newest] Thread overview: 28+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2005-03-10 20:35 [PATCH] add a clear_pages function to clear pages of higher order Christoph Lameter 2005-03-10 21:38 ` Dave Hansen 2005-03-10 22:46 ` Christoph Lameter 2005-03-11 1:03 ` Christoph Lameter 2005-03-11 8:08 ` Denis Vlasenko 2005-03-17 1:33 ` Christoph Lameter 2005-03-18 9:54 ` Denis Vlasenko 2005-03-18 15:00 ` Christoph Lameter 2005-03-18 19:28 ` Andi Kleen 2005-03-18 20:19 ` Christoph Lameter 2005-03-21 15:30 ` Denis Vlasenko 2005-03-24 18:34 ` David Mosberger 2005-03-24 18:41 ` Christoph Lameter 2005-03-24 19:03 ` David S. Miller 2005-03-24 22:49 ` Christoph Lameter 2005-03-24 23:13 ` David S. Miller 2005-03-25 2:29 ` David S. Miller 2005-03-25 2:43 ` Christoph Lameter 2005-03-27 17:12 ` Andi Kleen 2005-03-27 18:23 ` David S. Miller 2005-03-29 1:58 ` Christoph Lameter 2005-04-06 0:15 ` Christoph Lameter 2005-04-06 0:23 ` David Mosberger 2005-04-06 0:33 ` Christoph Lameter 2005-04-06 4:48 ` David Mosberger 2005-04-06 5:15 ` Gerrit Huizenga 2005-04-06 16:03 ` Grant Grundler 2005-03-18 10:12 ` Andi Kleen
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox