* [CFT] faster athlon/duron memory copy implementation
@ 2002-10-24 17:15 Manfred Spraul
  2002-10-24 17:37 ` Robert Love
                   ` (25 more replies)
  0 siblings, 26 replies; 51+ messages in thread
From: Manfred Spraul @ 2002-10-24 17:15 UTC (permalink / raw)
  To: linux-kernel; +Cc: arjanv

[-- Attachment #1: Type: text/plain, Size: 341 bytes --]

AMD recommends performing memory copies with plain backward read
operations instead of prefetch instructions.

http://208.15.46.63/events/gdc2002.htm
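
The idea (implemented as no_prefetch_copy_page in the attached test
app): touch the source block with one ordinary load per 64-byte cache
line, walking backwards, then do the actual copy out of the now-warm
cache. A minimal sketch of just the preload loop, assuming a 4096-byte
page and "from" as the source pointer:

	/* backward reads: one plain load per 64-byte cache line */
	volatile int tmp;
	int i;
	for (i = 4096 - 64; i >= 0; i -= 64)
		tmp = *(int *)((char *)from + i);
	/* ...then copy the page with movq/movntq as before */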

Attached is a test app that compares several memory copy
implementations. Could you run it and report the results to me,
together with your CPU, chipset and memory type?

Please run it two or three times.
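
To build it, something like "gcc -O2 -o athlon athlon.c" should do
(assuming gcc; note that fast_copy_page uses the 3DNow! prefetch
instruction, so the test needs an AMD Athlon/Duron-class CPU).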

--
    Manfred

[-- Attachment #2: athlon.c --]
[-- Type: text/plain, Size: 13212 bytes --]

/*

(C) 2000 Arjan van de Ven and others, licensed under the terms of the GPL


$Revision: 1.6 $
*/

static char cvsid[] = "$Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $";
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* The 2.4 kernel one, adapted for userspace */

static void fast_clear_page(void *page)
{
	int i;
	char fpu_save[108];	/* buffer for the 108-byte FSAVE image */

	/* save the FPU/MMX state by hand: the MMX registers alias the
	 * x87 register stack, so it must be preserved around the copy */
	__asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
	
	__asm__ __volatile__ (
		"  pxor %%mm0, %%mm0\n" : :
	);

	for(i=0;i<4096/128;i++)
	{
		__asm__ __volatile__ (
		"  movq %%mm0, (%0)\n"
		"  movq %%mm0, 8(%0)\n"
		"  movq %%mm0, 16(%0)\n"
		"  movq %%mm0, 24(%0)\n"
		"  movq %%mm0, 32(%0)\n"
		"  movq %%mm0, 40(%0)\n"
		"  movq %%mm0, 48(%0)\n"
		"  movq %%mm0, 56(%0)\n"
		"  movq %%mm0, 64(%0)\n"
		"  movq %%mm0, 72(%0)\n"
		"  movq %%mm0, 80(%0)\n"
		"  movq %%mm0, 88(%0)\n"
		"  movq %%mm0, 96(%0)\n"
		"  movq %%mm0, 104(%0)\n"
		"  movq %%mm0, 112(%0)\n"
		"  movq %%mm0, 120(%0)\n"
		: : "r" (page) : "memory");
		page+=128;
	}
	__asm__ __volatile__ (
		"  femms\n" : :
	);
	__asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
	
}

/* modified version for Athlon-family processors: movntq streams the
 * zeros straight to memory without polluting the cache */
static void faster_clear_page(void *page)
{
	int i;
	char fpu_save[108];

	__asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
	__asm__ __volatile__ (
		"  pxor %%mm0, %%mm0\n" : :
	);

	for(i=0;i<4096/64;i++)
	{
		__asm__ __volatile__ (
		"  movntq %%mm0, (%0)\n"
		"  movntq %%mm0, 8(%0)\n"
		"  movntq %%mm0, 16(%0)\n"
		"  movntq %%mm0, 24(%0)\n"
		"  movntq %%mm0, 32(%0)\n"
		"  movntq %%mm0, 40(%0)\n"
		"  movntq %%mm0, 48(%0)\n"
		"  movntq %%mm0, 56(%0)\n"
		: : "r" (page) : "memory");
		page+=64;
	}
	/* sfence drains the write-combining buffers filled by movntq */
	__asm__ __volatile__ (
		" sfence \n "
		" femms\n" : :
	);
	__asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );

}

/* test version to go even faster... this might be the same as
 * faster_clear_page, but serves as my playground.
 */
static void even_faster_clear_page(void *page)
{
	int i;
	char fpu_save[108];
	__asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );

	__asm__ __volatile__ (
		"  pxor %%mm0, %%mm0\n" : :
	);

	for(i=0;i<4096/64;i++)
	{
		__asm__ __volatile__ (
		"  movntq %%mm0, (%0)\n"
		"  movntq %%mm0, 8(%0)\n"
		"  movntq %%mm0, 16(%0)\n"
		"  movntq %%mm0, 24(%0)\n"
		"  movntq %%mm0, 32(%0)\n"
		"  movntq %%mm0, 40(%0)\n"
		"  movntq %%mm0, 48(%0)\n"
		"  movntq %%mm0, 56(%0)\n"
		: : "r" (page) : "memory");
		page+=64;
	}
	__asm__ __volatile__ (
		" sfence \n "
		" femms\n" : :
	);
	__asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );

}

/* The "fallback" one as used by the kernel */
static void slow_zero_page(void * page)
{
	int d0, d1;
	__asm__ __volatile__( \
		"cld\n\t" \
		"rep ; stosl" \
		: "=&c" (d0), "=&D" (d1)
		:"a" (0),"1" (page),"0" (1024)
		:"memory");
}

/* plain rep movsl copy, 1024 dwords per page */
static void slow_copy_page(void *to, void *from)
{
	int d0, d1, d2;
	__asm__ __volatile__( \
		"cld\n\t" \
		"rep ; movsl" \
		: "=&c" (d0), "=&D" (d1), "=&S" (d2) \
		: "0" (1024),"1" ((long) to),"2" ((long) from) \
		: "memory");
}


/* 2.4 kernel mmx copy_page function */
static void fast_copy_page(void *to, void *from)
{
	int i;
	char fpu_save[108];
	__asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );

	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		: : "r" (from) );

	for(i=0; i<4096/64; i++)
	{
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
		: : "r" (from), "r" (to) : "memory");
		from+=64;
		to+=64;
	}
	__asm__ __volatile__ (
		" femms\n" : :
	);
	__asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );

}


/* Athlon improved version: prefetchnta pulls the source in ahead of
 * time, movntq streams the destination out past the cache */
static void faster_copy_page(void *to, void *from)
{
	int i;
	char fpu_save[108];

	__asm__ __volatile__ (
		"1: prefetchnta (%0)\n"
		"   prefetchnta 64(%0)\n"
		"   prefetchnta 128(%0)\n"
		"   prefetchnta 192(%0)\n"
		: : "r" (from) );

	__asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );

	for(i=0; i<4096/64; i++)
	{
		__asm__ __volatile__ (
		"1: prefetchnta 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq 32(%0), %%mm4\n"
		"   movq 40(%0), %%mm5\n"
		"   movq 48(%0), %%mm6\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm0, (%1)\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movntq %%mm7, 56(%1)\n"
		: : "r" (from), "r" (to) : "memory");
		from+=64;
		to+=64;
	}
	__asm__ __volatile__ (
		" femms \n "
		" sfence\n" : :
	);
	__asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );

}

/* test version to go even faster... this might be the same as
 * faster_copy_page, but serves as my playground.
 */
static void even_faster_copy_page(void *to, void *from)
{
	int i;
	char fpu_save[108];

	__asm__ __volatile__ (
		"1: prefetchnta (%0)\n"
		"   prefetchnta 64(%0)\n"
		"   prefetchnta 128(%0)\n"
		"   prefetchnta 192(%0)\n"
		: : "r" (from) );

	__asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );

	for(i=0; i<4096/64; i++)
	{
		__asm__ __volatile__ (
		"   prefetchnta 256(%0)\n"
		"   movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
		: : "r" (from), "r" (to) : "memory");
		from+=64;
		to+=64;
	}
	__asm__ __volatile__ (
		" femms \n "
		" sfence\n" : :
	);
	__asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
	
}


/*
 * This looks horribly ugly, but the compiler can optimize it totally,
 * as the count is constant.
 */
static inline void * __constant_memcpy(void * to, const void * from, size_t n)
{
	switch (n) {
		case 0:
			return to;
		case 1:
			*(unsigned char *)to = *(const unsigned char *)from;
			return to;
		case 2:
			*(unsigned short *)to = *(const unsigned short *)from;
			return to;
		case 3:
			*(unsigned short *)to = *(const unsigned short *)from;
			*(2+(unsigned char *)to) = *(2+(const unsigned char *)from);
			return to;
		case 4:
			*(unsigned long *)to = *(const unsigned long *)from;
			return to;
		case 6:	/* for Ethernet addresses */
			*(unsigned long *)to = *(const unsigned long *)from;
			*(2+(unsigned short *)to) = *(2+(const unsigned short *)from);
			return to;
		case 8:
			*(unsigned long *)to = *(const unsigned long *)from;
			*(1+(unsigned long *)to) = *(1+(const unsigned long *)from);
			return to;
		case 12:
			*(unsigned long *)to = *(const unsigned long *)from;
			*(1+(unsigned long *)to) = *(1+(const unsigned long *)from);
			*(2+(unsigned long *)to) = *(2+(const unsigned long *)from);
			return to;
		case 16:
			*(unsigned long *)to = *(const unsigned long *)from;
			*(1+(unsigned long *)to) = *(1+(const unsigned long *)from);
			*(2+(unsigned long *)to) = *(2+(const unsigned long *)from);
			*(3+(unsigned long *)to) = *(3+(const unsigned long *)from);
			return to;
		case 20:
			*(unsigned long *)to = *(const unsigned long *)from;
			*(1+(unsigned long *)to) = *(1+(const unsigned long *)from);
			*(2+(unsigned long *)to) = *(2+(const unsigned long *)from);
			*(3+(unsigned long *)to) = *(3+(const unsigned long *)from);
			*(4+(unsigned long *)to) = *(4+(const unsigned long *)from);
			return to;
	}
#define COMMON(x) \
__asm__ __volatile__( \
	"rep ; movsl" \
	x \
	: "=&c" (d0), "=&D" (d1), "=&S" (d2) \
	: "0" (n/4),"1" ((long) to),"2" ((long) from) \
	: "memory");
{
	int d0, d1, d2;
	switch (n % 4) {
		case 0: COMMON(""); return to;
		case 1: COMMON("\n\tmovsb"); return to;
		case 2: COMMON("\n\tmovsw"); return to;
		default: COMMON("\n\tmovsw\n\tmovsb"); return to;
	}
}
  
#undef COMMON
}


static void normal_copy_page(void *to, void *from)
{
	__constant_memcpy(to,from,4096);
}
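
/* For the constant n == 4096 used above, none of the small cases in
 * __constant_memcpy() match and n % 4 == 0, so only COMMON("") remains:
 * normal_copy_page() should boil down to roughly the following
 * (illustrative sketch only, not called by the tests):
 */
static void normal_copy_page_expanded(void *to, void *from)
{
	int d0, d1, d2;
	__asm__ __volatile__(
		"rep ; movsl"	/* 4096/4 = 1024 dword moves */
		: "=&c" (d0), "=&D" (d1), "=&S" (d2)
		: "0" (4096/4), "1" ((long) to), "2" ((long) from)
		: "memory");
}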


/*
 * This looks horribly ugly, but the compiler can optimize it totally,
 * as by now we know that both pattern and count are constant..
 */
static inline void * __constant_c_and_count_memset(void * s, unsigned long pattern, size_t count)
{
	switch (count) {
		case 0:
			return s;
		case 1:
			*(unsigned char *)s = pattern;
			return s;
		case 2:
			*(unsigned short *)s = pattern;
			return s;
		case 3:
			*(unsigned short *)s = pattern;
			*(2+(unsigned char *)s) = pattern;
			return s;
		case 4:
			*(unsigned long *)s = pattern;
			return s;
	}
#define COMMON(x) \
__asm__  __volatile__( \
	"rep ; stosl" \
	x \
	: "=&c" (d0), "=&D" (d1) \
	: "a" (pattern),"0" (count/4),"1" ((long) s) \
	: "memory")
{
	int d0, d1;
	switch (count % 4) {
		case 0: COMMON(""); return s;
		case 1: COMMON("\n\tstosb"); return s;
		case 2: COMMON("\n\tstosw"); return s;
		default: COMMON("\n\tstosw\n\tstosb"); return s;
	}
}
  
#undef COMMON
}

static void normal_clear_page(void *to)
{
	 __constant_c_and_count_memset(to,0,4096);
}

/* test version to see if we can go even faster: AMD's "block prefetch"
 * idea -- read the source page backwards with plain loads, one load per
 * 64-byte cache line, instead of using prefetch instructions */
static void no_prefetch_copy_page(void *to, void *from) {
	int i, d1;
	char fpu_save[108];

	/* touch the whole source page (backwards) to pull it into cache */
	for (i=4096-256;i>=0;i-=256)
		__asm__ __volatile__(
			"movl 192(%1,%2),%0\n"
			"movl 128(%1,%2),%0\n"
			"movl 64(%1,%2),%0\n"
			"movl 0(%1,%2),%0\n"
			: "=&r" (d1)
			: "r" (from), "r" (i));

	__asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );

	for(i=0; i<4096/64; i++) {
		__asm__ __volatile__ (
		"   movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
		: : "r" (from), "r" (to) : "memory");
		from+=64;
		to+=64;
	}
	__asm__ __volatile__ (
		" sfence \n "
		" emms\n"
		" frstor %0;\n" ::"m"(fpu_save[0]) );
}


/* read the CPU's 64-bit time stamp counter: low half in EAX, high in EDX */
#define rdtsc(low,high) \
     __asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high))
     
typedef void (clear_func)(void *);
typedef void (copy_func)(void *,void *);

void test_one_clearpage(clear_func *func, char *name, char *Buffer)
{
	char *temp;
	int i;
	unsigned int blow,bhigh,alow,ahigh;
	unsigned long long before,after;

	rdtsc(blow,bhigh);
	temp = Buffer;
	for (i=0;i<4*1024;i++) {
		func(temp);
		temp += 4096;
	}
	rdtsc(alow,ahigh);
	before =  blow + (((long long)bhigh)<<32);
	after = alow +(((long long)ahigh)<<32);
	if (before>after) {
		printf("test invalid; timer overflow \n");
		return;
	}
	printf("clear_page function '%s'\t took %4lli cycles per page\n",name,(after-before)/(4*1024) );


}
     
void test_one_copypage(copy_func *func, char *name, char *Buffer)
{
	char *temp;
	int i;
	unsigned int blow,bhigh,alow,ahigh;
	unsigned long long before,after;

	sleep(1);
	rdtsc(blow,bhigh);
	temp = Buffer;
	for (i=0;i<2*1024;i++) {
		func(temp,temp+8*1024*1024);
		temp += 4096;
	}
	rdtsc(alow,ahigh);
	before =  blow+ (((long long)bhigh)<<32);
	after = alow+(((long long)ahigh)<<32);
	if (before>after) {
		printf("test invalid; timer overflow \n");
		return;
	}
	printf("copy_page function '%s'\t took %4lli cycles per page\n",name,(after-before)/(2*1024) );


}
     
     
void test_clearpage(char *Buffer)
{
	printf("clear_page() tests \n");

	test_one_clearpage(fast_clear_page,"warm up run",Buffer);
	test_one_clearpage(normal_clear_page,"2.4 non MMX",Buffer);
	test_one_clearpage(slow_zero_page,"2.4 MMX fallback",Buffer);
	test_one_clearpage(fast_clear_page,"2.4 MMX version",Buffer);
	test_one_clearpage(faster_clear_page,"faster_clear_page",Buffer);
	test_one_clearpage(even_faster_clear_page,"even_faster_clear",Buffer);
}   

void test_copypage(char *Buffer)
{
	printf("copy_page() tests \n");
	
	test_one_copypage(fast_copy_page,  "warm up run",Buffer);
	test_one_copypage(normal_copy_page,"2.4 non MMX",Buffer);
	test_one_copypage(slow_copy_page,  "2.4 MMX fallback",Buffer);
	test_one_copypage(fast_copy_page,  "2.4 MMX version",Buffer);
	test_one_copypage(faster_copy_page,"faster_copy",Buffer);
	test_one_copypage(even_faster_copy_page,"even_faster",Buffer);
	test_one_copypage(no_prefetch_copy_page,"no_prefetch",Buffer);
}

int main()
{
	char *Buffer;

	Buffer = malloc(1024*1024*16);
	if (!Buffer) {
		perror("malloc");
		return 1;
	}
	memset(Buffer,0xfe,1024*1024*16);
	
	printf("Athlon test program %s \n",cvsid);
	
	printf("\n");
	test_copypage(Buffer);

	free(Buffer);

	return 0;
}

* Re: [CFT] faster athlon/duron memory copy implementation
@ 2002-10-24 18:27 Shawn Starr
  0 siblings, 0 replies; 51+ messages in thread
From: Shawn Starr @ 2002-10-24 18:27 UTC (permalink / raw)
  To: Manfred Spraul; +Cc: linux-kernel

I'll run this when I get home. I have an Athlon MP 2000+ (with only one CPU right now).

Shawn.

-- 
Shawn Starr
UNIX Systems Administrator, Operations
Datawire Communication Networks Inc.
10 Carlson Court, Suite 300
Toronto, ON, M9W 6L2
T: 416-213-2001 ext 179  F: 416-213-2008
shawn.starr@datawire.net
"The power to Transact" - http://www.datawire.net


* Re: [CFT] faster athlon/duron memory copy implementation
@ 2002-10-24 20:51 Dieter Nützel
  2002-10-24 21:01 ` Dieter Nützel
  2002-10-24 21:16 ` Willy TARREAU
  0 siblings, 2 replies; 51+ messages in thread
From: Dieter Nützel @ 2002-10-24 20:51 UTC (permalink / raw)
  To: Linux Kernel List; +Cc: Manfred Spraul, Robert Love

[-- Attachment #1: Type: text/plain, Size: 7077 bytes --]

Robert Love wrote:
> The majority of the program is inline assembly so I do not think the
> compiler is playing a huge role here.

I think it is...

> Regardless, the numbers are all pretty uniform in saying the new
> no-prefetch method is superior so it's a moot point.

But all "your" numbers are slow.
Look at mine with the "right" (TM) flags ;-)

processor       : 0
vendor_id       : AuthenticAMD
cpu family      : 6
model           : 6
model name      : AMD Athlon(tm) MP 1900+
stepping        : 2
cpu MHz         : 1600.377
cache size      : 256 KB
fdiv_bug        : no
hlt_bug         : no
f00f_bug        : no
coma_bug        : no
fpu             : yes
fpu_exception   : yes
cpuid level     : 1
wp              : yes
flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca 
cmov pat pse36 mmx fxsr sse syscall mp mmxext 3dnowext 3dnow
bogomips        : 3145.72

processor       : 1
vendor_id       : AuthenticAMD
cpu family      : 6
model           : 6
model name      : AMD Athlon(tm) MP
stepping        : 2
cpu MHz         : 1600.377
cache size      : 256 KB
fdiv_bug        : no
hlt_bug         : no
f00f_bug        : no
coma_bug        : no
fpu             : yes
fpu_exception   : yes
cpuid level     : 1
wp              : yes
flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca 
cmov pat pse36 mmx fxsr sse syscall mp mmxext 3dnowext 3dnow
bogomips        : 3194.88


SuSE Linux 7.3

glibc-2.2.4
Addons: db db2 linuxthreads noversion
Build CFLAGS: -O -mcpu=k6 -mpreferred-stack-boundary=2 -malign-functions=4 
-fschedule-insns2 -fexpensive-optimizations -g
Build CC: gcc
Compiler version: 2.95.3 20010315 (SuSE)

Linux 2.5.43-mm2
Kernel compiler FLAGS
HOSTCC          = gcc
HOSTCFLAGS      = -Wall -Wstrict-prototypes -O -fomit-frame-pointer -mcpu=k6 
-mpreferred-stack-boundary=2 -malign-functions=4 -fschedule-insns2 
-fexpensive-optimizations

YES, I have used only "-mcpu=k6" and "-O" for ages (since 26 August 1999 ;-)
on my Athlons.

nuetzel/Entwicklung> ./athlon ; ./athlon ; ./athlon
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $
clear_page() tests
clear_page function 'warm up run'        took 17409 cycles per page
clear_page function '2.4 non MMX'        took 12340 cycles per page
clear_page function '2.4 MMX fallback'   took 12429 cycles per page
clear_page function '2.4 MMX version'    took 9794 cycles per page
clear_page function 'faster_clear_page'  took 4639 cycles per page
clear_page function 'even_faster_clear'  took 4914 cycles per page

copy_page() tests
copy_page function 'warm up run'         took 16506 cycles per page
copy_page function '2.4 non MMX'         took 18412 cycles per page
copy_page function '2.4 MMX fallback'    took 18468 cycles per page
copy_page function '2.4 MMX version'     took 16550 cycles per page
copy_page function 'faster_copy'         took 10239 cycles per page
copy_page function 'even_faster'         took 10816 cycles per page


Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $
clear_page() tests
clear_page function 'warm up run'        took 17148 cycles per page
clear_page function '2.4 non MMX'        took 12426 cycles per page
clear_page function '2.4 MMX fallback'   took 12330 cycles per page
clear_page function '2.4 MMX version'    took 9776 cycles per page
clear_page function 'faster_clear_page'  took 4619 cycles per page
clear_page function 'even_faster_clear'  took 4938 cycles per page

copy_page() tests
copy_page function 'warm up run'         took 16640 cycles per page
copy_page function '2.4 non MMX'         took 18434 cycles per page
copy_page function '2.4 MMX fallback'    took 18454 cycles per page
copy_page function '2.4 MMX version'     took 16533 cycles per page
copy_page function 'faster_copy'         took 10418 cycles per page
copy_page function 'even_faster'         took 10707 cycles per page


Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $
clear_page() tests
clear_page function 'warm up run'        took 17475 cycles per page
clear_page function '2.4 non MMX'        took 12435 cycles per page
clear_page function '2.4 MMX fallback'   took 12379 cycles per page
clear_page function '2.4 MMX version'    took 9902 cycles per page
clear_page function 'faster_clear_page'  took 4665 cycles per page
clear_page function 'even_faster_clear'  took 4947 cycles per page

copy_page() tests
copy_page function 'warm up run'         took 16606 cycles per page
copy_page function '2.4 non MMX'         took 18439 cycles per page
copy_page function '2.4 MMX fallback'    took 18676 cycles per page
copy_page function '2.4 MMX version'     took 16560 cycles per page
copy_page function 'faster_copy'         took 10239 cycles per page
copy_page function 'even_faster'         took 10728 cycles per page

nuetzel/Entwicklung> ./athlon2 ; ./athlon2 ; ./athlon2
1600.061 MHz
clear_page by 'normal_clear_page'        took 12463 cycles (501.5 MB/s)
clear_page by 'slow_zero_page'           took 12461 cycles (501.6 MB/s)
clear_page by 'fast_clear_page'          took 9555 cycles (654.1 MB/s)
clear_page by 'faster_clear_page'        took 4436 cycles (1408.7 MB/s)

copy_page by 'normal_copy_page'  took 8992 cycles (695.0 MB/s)
copy_page by 'slow_copy_page'    took 9010 cycles (693.7 MB/s)
copy_page by 'fast_copy_page'    took 8134 cycles (768.3 MB/s)
copy_page by 'faster_copy'       took 5546 cycles (1126.8 MB/s)
copy_page by 'even_faster'       took 5616 cycles (1112.9 MB/s)


1600.057 MHz
clear_page by 'normal_clear_page'        took 12555 cycles (497.8 MB/s)
clear_page by 'slow_zero_page'           took 12740 cycles (490.6 MB/s)
clear_page by 'fast_clear_page'          took 9783 cycles (638.8 MB/s)
clear_page by 'faster_clear_page'        took 4459 cycles (1401.4 MB/s)

copy_page by 'normal_copy_page'  took 9123 cycles (685.0 MB/s)
copy_page by 'slow_copy_page'    took 9080 cycles (688.3 MB/s)
copy_page by 'fast_copy_page'    took 8232 cycles (759.3 MB/s)
copy_page by 'faster_copy'       took 5535 cycles (1129.1 MB/s)
copy_page by 'even_faster'       took 5565 cycles (1123.1 MB/s)


1600.060 MHz
clear_page by 'normal_clear_page'        took 12625 cycles (495.1 MB/s)
clear_page by 'slow_zero_page'           took 12541 cycles (498.3 MB/s)
clear_page by 'fast_clear_page'          took 9648 cycles (647.8 MB/s)
clear_page by 'faster_clear_page'        took 4463 cycles (1400.2 MB/s)

copy_page by 'normal_copy_page'  took 9178 cycles (680.9 MB/s)
copy_page by 'slow_copy_page'    took 9011 cycles (693.6 MB/s)
copy_page by 'fast_copy_page'    took 8138 cycles (768.0 MB/s)
copy_page by 'faster_copy'       took 5508 cycles (1134.7 MB/s)
copy_page by 'even_faster'       took 5552 cycles (1125.6 MB/s)
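
(For reference: the MB/s figures presumably come out as 4096 bytes *
CPU MHz / cycles, with MB = 2^20 bytes -- e.g. 4096 * 1600.061e6 /
12463 / 2^20 = ~501.5 MB/s, matching the normal_clear_page line above.)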

Regards,
	Dieter
-- 
Dieter Nützel
Graduate Student, Computer Science

University of Hamburg
Department of Computer Science
@home: Dieter.Nuetzel at hamburg.de (replace at with @)

[-- Attachment #2: athlon2.c.bz2 --]
[-- Type: application/x-bzip2, Size: 2912 bytes --]

* Re: [CFT] faster athlon/duron memory copy implementation
@ 2002-10-24 22:01 Harm Verhagen
  0 siblings, 0 replies; 51+ messages in thread
From: Harm Verhagen @ 2002-10-24 22:01 UTC (permalink / raw)
  To: linux-kernel

Athlon XP 1800+, VIA KT333, 256MB DDR2100

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run'         took 16180 cycles per page
copy_page function '2.4 non MMX'         took 17913 cycles per page
copy_page function '2.4 MMX fallback'    took 18610 cycles per page
copy_page function '2.4 MMX version'     took 16200 cycles per page
copy_page function 'faster_copy'         took 9908 cycles per page
copy_page function 'even_faster'         took 10117 cycles per page
copy_page function 'no_prefetch'         took 6993 cycles per page
[harm@pchome memcpy2]$ ./memcpy
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run'         took 16293 cycles per page
copy_page function '2.4 non MMX'         took 17929 cycles per page
copy_page function '2.4 MMX fallback'    took 18637 cycles per page
copy_page function '2.4 MMX version'     took 16209 cycles per page
copy_page function 'faster_copy'         took 9907 cycles per page
copy_page function 'even_faster'         took 10122 cycles per page
copy_page function 'no_prefetch'         took 6964 cycles per page

[harm@pchome memcpy2]$ cat /proc/cpuinfo
processor       : 0
vendor_id       : AuthenticAMD
cpu family      : 6
model           : 6
model name      : AMD Athlon(TM) XP 1800+
stepping        : 2
cpu MHz         : 1532.941
cache size      : 256 KB
fdiv_bug        : no
hlt_bug         : no
f00f_bug        : no
coma_bug        : no
fpu             : yes
fpu_exception   : yes
cpuid level     : 1
wp              : yes
flags           : fpu vme de tsc msr pae mce cx8 apic sep mtrr pge mca
cmov pat pse36 mmx fxsr sse syscall mmxext 3dnowext 3dnow
bogomips        : 3060.53





* Re: [CFT] faster athlon/duron memory copy implementation
@ 2002-10-25 16:29 Jorge Bernal "Koke"
  0 siblings, 0 replies; 51+ messages in thread
From: Jorge Bernal "Koke" @ 2002-10-25 16:29 UTC (permalink / raw)
  To: Linux-Kernel

[-- Attachment #1: Type: text/plain, Size: 21446 bytes --]

On Thu, 24-10-2002 at 19:15, Manfred Spraul wrote:
> AMD recommends performing memory copies with plain backward read
> operations instead of prefetch instructions.
> 
> http://208.15.46.63/events/gdc2002.htm
> 
> Attached is a test app that compares several memory copy
> implementations. Could you run it and report the results to me,
> together with your CPU, chipset and memory type?
> 
> Please run it two or three times.
> 

My machine: Athlon 1600XP, 512MB DDR, KT226A??

koke@tuxland:~/src/testing$ ./athlon 
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $ 

copy_page() tests 
copy_page function 'warm up run'         took 15408 cycles per page
copy_page function '2.4 non MMX'         took 16805 cycles per page
copy_page function '2.4 MMX fallback'    took 16695 cycles per page
copy_page function '2.4 MMX version'     took 15424 cycles per page
copy_page function 'faster_copy'         took 9481 cycles per page
copy_page function 'even_faster'         took 9354 cycles per page
copy_page function 'no_prefetch'         took 6635 cycles per page
koke@tuxland:~/src/testing$ ./athlon 
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $ 

copy_page() tests 
copy_page function 'warm up run'         took 15418 cycles per page
copy_page function '2.4 non MMX'         took 16792 cycles per page
copy_page function '2.4 MMX fallback'    took 16754 cycles per page
copy_page function '2.4 MMX version'     took 15495 cycles per page
copy_page function 'faster_copy'         took 9426 cycles per page
copy_page function 'even_faster'         took 9490 cycles per page
copy_page function 'no_prefetch'         took 6591 cycles per page
koke@tuxland:~/src/testing$ ./athlon 
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $ 

copy_page() tests 
copy_page function 'warm up run'         took 16485 cycles per page
copy_page function '2.4 non MMX'         took 16759 cycles per page
copy_page function '2.4 MMX fallback'    took 16769 cycles per page
copy_page function '2.4 MMX version'     took 15377 cycles per page
copy_page function 'faster_copy'         took 9732 cycles per page
copy_page function 'even_faster'         took 12125 cycles per page
copy_page function 'no_prefetch'         took 9439 cycles per page
koke@tuxland:~/src/testing$


> --
>     Manfred
> ----
> 

> [attached test program quoted in full -- trimmed; it is identical to
> the athlon.c attachment above]
-- 
Jorge Bernal (Koke)
The software required Win95 or better, so I installed Linux
ICQ#: 63593654
MSN: koke_jb

[-- Attachment #2: This part of the message is digitally signed --]
[-- Type: application/pgp-signature, Size: 232 bytes --]


end of thread

Thread overview: 51+ messages
2002-10-24 17:15 [CFT] faster athlon/duron memory copy implementation Manfred Spraul
2002-10-24 17:37 ` Robert Love
2002-10-24 18:05   ` Zach Brown
2002-10-24 17:41 ` Andreas Steinmetz
2002-10-24 17:48 ` Matthias Welk
2002-10-24 19:01   ` erich
2002-10-24 19:11     ` Arjan van de Ven
2002-10-24 19:38     ` Manfred Spraul
2002-10-25  0:59       ` Panagiotis Papadakos
2002-10-24 17:53 ` Roger Luethi
2002-10-24 18:10 ` Daniel Egger
2002-10-24 19:15   ` Florin Iucha
2002-10-24 19:28   ` Manfred Spraul
2002-10-24 19:38     ` Dave Jones
2002-10-24 19:43     ` Ken Witherow
2002-10-25 13:08     ` Daniel Egger
2002-10-24 18:17 ` Eric Lammerts
2002-10-24 18:26 ` David Rees
2002-10-24 18:35 ` Josh McKinney
2002-10-24 18:36 ` Dave Jones
2002-10-24 18:43 ` Simon Fowler
2002-10-24 18:50   ` Simon Fowler
2002-10-24 18:56   ` Dave Jones
2002-10-24 18:48 ` Ernst Herzberg
2002-10-24 20:09   ` Ed Sweetman
2002-10-24 20:13     ` Robert Love
2002-10-24 20:31       ` Ed Sweetman
2002-10-24 20:49         ` Dave Jones
2002-10-24 20:26     ` Dave Jones
2002-10-25  9:19       ` Måns Rullgård
2002-10-24 19:11 ` Marcus Libäck
2002-10-24 19:19 ` Brian Gerst
2002-10-24 19:31 ` Matthias Schniedermeyer
2002-10-24 19:33 ` Pascal Schmidt
2002-10-24 19:39 ` Olaf Dietsche
2002-10-24 20:27 ` Mike Civil
2002-10-24 20:44 ` Willy TARREAU
2002-10-24 21:46 ` Josh Fryman
2002-10-24 22:18 ` Tim Schmielau
2002-10-24 23:09 ` Hirokazu Takahashi
2002-10-24 23:37 ` Ryan Cumming
2002-10-25  0:10 ` Matthias Andree
2002-10-25  8:35 ` venom
2002-10-25 13:31 ` Denis Vlasenko
2002-10-26 12:11 ` Jurjen Oskam
  -- strict thread matches above, loose matches on Subject: below --
2002-10-24 18:27 Shawn Starr
2002-10-24 20:51 Dieter Nützel
2002-10-24 21:01 ` Dieter Nützel
2002-10-24 21:16 ` Willy TARREAU
2002-10-24 22:01 Harm Verhagen
2002-10-25 16:29 Jorge Bernal "Koke"
