[CFT] faster athlon/duron memory copy implementation

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Manfred Spraul <manfred@colorfullife.com>
To: linux-kernel@vger.kernel.org
Cc: arjanv@redhat.com
Subject: [CFT] faster athlon/duron memory copy implementation
Date: Thu, 24 Oct 2002 19:15:43 +0200	[thread overview]
Message-ID: <3DB82ABF.8030706@colorfullife.com> (raw)

[-- Attachment #1: Type: text/plain, Size: 341 bytes --]

AMD recommends performing memory copies with backward read operations
instead of prefetch instructions.

http://208.15.46.63/events/gdc2002.htm

Attached is a test app that compares several memory copy implementations.
Could you run it and report the results to me, together with your CPU,
chipset and memory type?

Please run 2 or 3 times.

--
    Manfred

[-- Attachment #2: athlon.c --]
[-- Type: text/plain, Size: 13212 bytes --]

/*

(C) 2000 Arjan van de Ven and others  licensed under the terms of the GPL


$Revision: 1.6 $
*/

/* CVS identification string; main() prints it in the program banner. */
static char cvsid[] = "$Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $";
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * The 2.4 kernel MMX clear_page(), adapted for userspace.
 *
 * Zeroes one 4096-byte page starting at 'page' with plain MMX stores,
 * 128 bytes per loop iteration.  The x87/MMX state is saved into a local
 * 108-byte buffer on entry and reloaded from it before returning.
 */

static void fast_clear_page(void *page)
{
	int i;
	char fpu_save[108];	/* fsave image of the caller's x87/MMX state */

	/* Save the FPU state; the MMX registers used below alias the x87 stack. */
	__asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
	
	/* mm0 = 0; it is the source of every store in the loop. */
	__asm__ __volatile__ (
		"  pxor %%mm0, %%mm0\n" : :
	);

	/* 32 iterations x 16 quadword stores = 4096 bytes. */
	for(i=0;i<4096/128;i++)
	{
		__asm__ __volatile__ (
		"  movq %%mm0, (%0)\n"
		"  movq %%mm0, 8(%0)\n"
		"  movq %%mm0, 16(%0)\n"
		"  movq %%mm0, 24(%0)\n"
		"  movq %%mm0, 32(%0)\n"
		"  movq %%mm0, 40(%0)\n"
		"  movq %%mm0, 48(%0)\n"
		"  movq %%mm0, 56(%0)\n"
		"  movq %%mm0, 64(%0)\n"
		"  movq %%mm0, 72(%0)\n"
		"  movq %%mm0, 80(%0)\n"
		"  movq %%mm0, 88(%0)\n"
		"  movq %%mm0, 96(%0)\n"
		"  movq %%mm0, 104(%0)\n"
		"  movq %%mm0, 112(%0)\n"
		"  movq %%mm0, 120(%0)\n"
		: : "r" (page) : "memory");
		page+=128;	/* byte arithmetic on void * (GNU C extension) */
	}
	/* femms: 3DNow! fast exit from MMX state (AMD-specific instruction). */
	__asm__ __volatile__ (
		"  femms\n" : :
	);
	/* Reload the state saved on entry. */
	__asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
	
}

/*
 * Modified clear_page() for Athlon-family processors.
 *
 * Zeroes one 4096-byte page starting at 'page', 64 bytes per loop
 * iteration, using movntq (non-temporal) stores instead of movq.  An
 * sfence is issued after the loop, before the saved x87/MMX state is
 * reloaded.
 */
static void faster_clear_page(void *page)
{
	int i;
	char fpu_save[108];	/* fsave image of the caller's x87/MMX state */

	/* Save the FPU state before touching the MMX registers. */
	__asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
	/* mm0 = 0; it is the source of every store in the loop. */
	__asm__ __volatile__ (
		"  pxor %%mm0, %%mm0\n" : :
	);

	/* 64 iterations x 8 quadword stores = 4096 bytes. */
	for(i=0;i<4096/64;i++)
	{
		__asm__ __volatile__ (
		"  movntq %%mm0, (%0)\n"
		"  movntq %%mm0, 8(%0)\n"
		"  movntq %%mm0, 16(%0)\n"
		"  movntq %%mm0, 24(%0)\n"
		"  movntq %%mm0, 32(%0)\n"
		"  movntq %%mm0, 40(%0)\n"
		"  movntq %%mm0, 48(%0)\n"
		"  movntq %%mm0, 56(%0)\n"
		: : "r" (page) : "memory");
		page+=64;	/* byte arithmetic on void * (GNU C extension) */
	}
	/* Fence the non-temporal stores, then leave MMX state (femms is 3DNow!). */
	__asm__ __volatile__ (
		" sfence \n "
		" femms\n" : :
	);
	/* Reload the state saved on entry. */
	__asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );

}

/*
 * Experimental clear_page() variant ("playground").
 *
 * In this revision the instruction sequence is the same as
 * faster_clear_page(): one 4096-byte page starting at 'page' is zeroed
 * with movntq stores, 64 bytes per iteration, followed by sfence/femms
 * and a reload of the saved x87/MMX state.
 */
static void even_faster_clear_page(void *page)
{
	int i;
	char fpu_save[108];	/* fsave image of the caller's x87/MMX state */
	/* Save the FPU state before touching the MMX registers. */
	__asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );

	/* mm0 = 0; it is the source of every store in the loop. */
	__asm__ __volatile__ (
		"  pxor %%mm0, %%mm0\n" : :
	);

	/* 64 iterations x 8 quadword stores = 4096 bytes. */
	for(i=0;i<4096/64;i++)
	{
		__asm__ __volatile__ (
		"  movntq %%mm0, (%0)\n"
		"  movntq %%mm0, 8(%0)\n"
		"  movntq %%mm0, 16(%0)\n"
		"  movntq %%mm0, 24(%0)\n"
		"  movntq %%mm0, 32(%0)\n"
		"  movntq %%mm0, 40(%0)\n"
		"  movntq %%mm0, 48(%0)\n"
		"  movntq %%mm0, 56(%0)\n"
		: : "r" (page) : "memory");
		page+=64;	/* byte arithmetic on void * (GNU C extension) */
	}
	/* Fence the non-temporal stores, then leave MMX state (femms is 3DNow!). */
	__asm__ __volatile__ (
		" sfence \n "
		" femms\n" : :
	);
	/* Reload the state saved on entry. */
	__asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );

}

/*
 * Kernel "fallback" page clear: zero one 4096-byte page with a single
 * repeated string store.
 *
 * page: start of the page to clear.
 */
static void slow_zero_page(void * page)
{
	/* Dummy outputs: rep consumes the count and destination registers. */
	int count_out, dest_out;

	__asm__ __volatile__(
		"cld\n\t"
		"rep ; stosl"
		: "=&c" (count_out), "=&D" (dest_out)
		: "a" (0), "1" (page), "0" (4096 / 4)	/* 1024 dwords of zero */
		: "memory");
}

/*
 * Kernel "fallback" page copy: move one 4096-byte page with a single
 * repeated string move.
 *
 * to:   destination page
 * from: source page
 */
static void slow_copy_page(void *to, void *from)
{
	/* Dummy outputs: rep consumes the count, destination and source registers. */
	int count_out, dest_out, src_out;

	__asm__ __volatile__(
		"cld\n\t"
		"rep ; movsl"
		: "=&c" (count_out), "=&D" (dest_out), "=&S" (src_out)
		: "0" (4096 / 4), "1" ((long) to), "2" ((long) from)	/* 1024 dwords */
		: "memory");
}


/*
 * 2.4 kernel MMX copy_page(), adapted for userspace.
 *
 * Copies one 4096-byte page from 'from' to 'to' in 64-byte chunks using
 * MMX loads/stores, with a 3DNow! prefetch running 320 bytes ahead of the
 * reads.  The caller's x87/MMX state is saved on entry and restored
 * before returning.
 *
 * to:   destination page
 * from: source page
 */
static void fast_copy_page(void *to, void *from)
{
	int i;
	char fpu_save[108];	/* fsave image of the caller's x87/MMX state */
	const char *src = from;
	char *dst = to;

	/* Save the FPU state; the MMX registers used below alias the x87 stack. */
	__asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );

	/* Prime the first five 64-byte lines of the source. */
	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		: : "r" (src) );

	/* 64 iterations x 64 bytes = 4096 bytes. */
	for(i=0; i<4096/64; i++)
	{
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
		: : "r" (src), "r" (dst) : "memory");
		src+=64;
		dst+=64;
	}
	/* femms: 3DNow! fast exit from MMX state (AMD-specific instruction). */
	__asm__ __volatile__ (
		" femms\n" : :
	);
	/*
	 * Restore the state saved on entry.  This must be frstor: a second
	 * fsave here would overwrite the saved image and leave the FPU
	 * re-initialised for the caller.
	 */
	__asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );

}


/*
 * Athlon improved copy_page().
 *
 * Copies one 4096-byte page from 'from' to 'to' in 64-byte chunks: eight
 * MMX loads into mm0..mm7 followed by eight movntq (non-temporal) stores,
 * with a prefetchnta running 320 bytes ahead of the reads.  The x87/MMX
 * state is saved before the loop and reloaded before returning.
 */
static void faster_copy_page(void *to, void *from)
{
	int i;
	char fpu_save[108];	/* fsave image of the caller's x87/MMX state */

	/* Prime the first four 64-byte lines of the source. */
	__asm__ __volatile__ (
		"1: prefetchnta (%0)\n"
		"   prefetchnta 64(%0)\n"
		"   prefetchnta 128(%0)\n"
		"   prefetchnta 192(%0)\n"
		: : "r" (from) );

	/* Save the FPU state before touching the MMX registers. */
	__asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );

	/* 64 iterations x 64 bytes = 4096 bytes. */
	for(i=0; i<4096/64; i++)
	{
		__asm__ __volatile__ (
		"1: prefetchnta 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq 32(%0), %%mm4\n"
		"   movq 40(%0), %%mm5\n"
		"   movq 48(%0), %%mm6\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm0, (%1)\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movntq %%mm7, 56(%1)\n"
		: : "r" (from), "r" (to) : "memory");
		from+=64;	/* byte arithmetic on void * (GNU C extension) */
		to+=64;
}
	/* Leave MMX state (femms is 3DNow!), then fence the non-temporal stores. */
	__asm__ __volatile__ (
		" femms \n "
		" sfence\n" : :
	);
	/* Reload the state saved before the loop. */
	__asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );

}

/*
 * Experimental copy_page() variant ("playground").
 *
 * Copies one 4096-byte page from 'from' to 'to' in 64-byte chunks.
 * Unlike faster_copy_page(), each MMX load is immediately followed by its
 * movntq store, and the prefetchnta runs 256 bytes ahead of the reads.
 * The x87/MMX state is saved before the loop and reloaded before returning.
 */
static void even_faster_copy_page(void *to, void *from)
{
	int i;
	char fpu_save[108];	/* fsave image of the caller's x87/MMX state */

	/* Prime the first four 64-byte lines of the source. */
	__asm__ __volatile__ (
		"1: prefetchnta (%0)\n"
		"   prefetchnta 64(%0)\n"
		"   prefetchnta 128(%0)\n"
		"   prefetchnta 192(%0)\n"
		: : "r" (from) );

	/* Save the FPU state before touching the MMX registers. */
	__asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );

	/* 64 iterations x 64 bytes = 4096 bytes. */
	for(i=0; i<4096/64; i++)
	{
		__asm__ __volatile__ (
		"   prefetchnta 256(%0)\n"
		"   movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
		: : "r" (from), "r" (to) : "memory");
		from+=64;	/* byte arithmetic on void * (GNU C extension) */
		to+=64;
	}
	/* Leave MMX state (femms is 3DNow!), then fence the non-temporal stores. */
	__asm__ __volatile__ (
		" femms \n "
		" sfence\n" : :
	);
	/* Reload the state saved before the loop. */
	__asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
	
}


/*
 * Copy n bytes from 'from' to 'to' and return 'to'.
 *
 * This looks horribly ugly, but the compiler can optimize it totally,
 * as the count is constant.  A few small sizes are open-coded as direct
 * loads/stores; everything else is copied with "rep movsl" (n/4 dwords)
 * followed by movsw/movsb for the remaining n%4 bytes.
 *
 * The 4-byte moves use 'unsigned int', not 'unsigned long': 'long' is
 * 8 bytes on LP64 targets, where it would copy past the requested length.
 * The asm dummy outputs are 'unsigned long' so they have the same width
 * as the pointer-sized inputs they are tied to (assumes long is
 * pointer-sized, as on Linux x86 and x86-64).
 */
static inline void * __constant_memcpy(void * to, const void * from, size_t n)
{
	unsigned long d0, d1, d2;	/* dummy outputs: count, dest, source */

	switch (n) {
		case 0:
			return to;
		case 1:
			*(unsigned char *)to = *(const unsigned char *)from;
			return to;
		case 2:
			*(unsigned short *)to = *(const unsigned short *)from;
			return to;
		case 3:
			*(unsigned short *)to = *(const unsigned short *)from;
			*(2+(unsigned char *)to) = *(2+(const unsigned char *)from);
			return to;
		case 4:
			*(unsigned int *)to = *(const unsigned int *)from;
			return to;
		case 6:	/* for Ethernet addresses */
			*(unsigned int *)to = *(const unsigned int *)from;
			*(2+(unsigned short *)to) = *(2+(const unsigned short *)from);
			return to;
		case 8:
			*(unsigned int *)to = *(const unsigned int *)from;
			*(1+(unsigned int *)to) = *(1+(const unsigned int *)from);
			return to;
		case 12:
			*(unsigned int *)to = *(const unsigned int *)from;
			*(1+(unsigned int *)to) = *(1+(const unsigned int *)from);
			*(2+(unsigned int *)to) = *(2+(const unsigned int *)from);
			return to;
		case 16:
			*(unsigned int *)to = *(const unsigned int *)from;
			*(1+(unsigned int *)to) = *(1+(const unsigned int *)from);
			*(2+(unsigned int *)to) = *(2+(const unsigned int *)from);
			*(3+(unsigned int *)to) = *(3+(const unsigned int *)from);
			return to;
		case 20:
			*(unsigned int *)to = *(const unsigned int *)from;
			*(1+(unsigned int *)to) = *(1+(const unsigned int *)from);
			*(2+(unsigned int *)to) = *(2+(const unsigned int *)from);
			*(3+(unsigned int *)to) = *(3+(const unsigned int *)from);
			*(4+(unsigned int *)to) = *(4+(const unsigned int *)from);
			return to;
		default:
			break;	/* general case below */
	}

/* x: extra instructions appended after the dword copy for the n%4 tail. */
#define COMMON(x) \
__asm__ __volatile__( \
	"rep ; movsl" \
	x \
	: "=&c" (d0), "=&D" (d1), "=&S" (d2) \
	: "0" (n/4),"1" ((unsigned long) to),"2" ((unsigned long) from) \
	: "memory")

	switch (n % 4) {
		case 0: COMMON(""); return to;
		case 1: COMMON("\n\tmovsb"); return to;
		case 2: COMMON("\n\tmovsw"); return to;
		default: COMMON("\n\tmovsw\n\tmovsb"); return to;
	}

#undef COMMON
}


/*
 * Page copy built on __constant_memcpy(): copies one 4096-byte page
 * from 'from' to 'to'.  Benchmarked under the label "2.4 non MMX".
 */
static void normal_copy_page(void *to, void *from)
{
	enum { PAGE_BYTES = 4096 };

	(void) __constant_memcpy(to, from, PAGE_BYTES);
}


/*
 * Fill 'count' bytes at 's' from 'pattern' and return 's'.
 *
 * This looks horribly ugly, but the compiler can optimize it totally,
 * as we by now know that both pattern and count is constant..
 * Counts 0..4 are open-coded stores of the low bytes of 'pattern';
 * everything else uses "rep stosl" (count/4 dwords of the low 32 bits of
 * 'pattern') followed by stosw/stosb for the remaining count%4 bytes.
 *
 * The 4-byte store uses 'unsigned int', not 'unsigned long': 'long' is
 * 8 bytes on LP64 targets, where it would write past the requested count.
 * The asm dummy outputs are 'unsigned long' so they have the same width
 * as the pointer-sized inputs they are tied to (assumes long is
 * pointer-sized, as on Linux x86 and x86-64).
 */
static inline void * __constant_c_and_count_memset(void * s, unsigned long pattern, size_t count)
{
	unsigned long d0, d1;	/* dummy outputs: count, dest */

	switch (count) {
		case 0:
			return s;
		case 1:
			*(unsigned char *)s = pattern;
			return s;
		case 2:
			*(unsigned short *)s = pattern;
			return s;
		case 3:
			*(unsigned short *)s = pattern;
			*(2+(unsigned char *)s) = pattern;
			return s;
		case 4:
			*(unsigned int *)s = pattern;
			return s;
		default:
			break;	/* general case below */
	}

/* x: extra instructions appended after the dword fill for the count%4 tail. */
#define COMMON(x) \
__asm__  __volatile__( \
	"rep ; stosl" \
	x \
	: "=&c" (d0), "=&D" (d1) \
	: "a" (pattern),"0" (count/4),"1" ((unsigned long) s) \
	: "memory")

	switch (count % 4) {
		case 0: COMMON(""); return s;
		case 1: COMMON("\n\tstosb"); return s;
		case 2: COMMON("\n\tstosw"); return s;
		default: COMMON("\n\tstosw\n\tstosb"); return s;
	}

#undef COMMON
}

/*
 * Page clear built on __constant_c_and_count_memset(): fills one
 * 4096-byte page at 'to' with zero bytes.
 */
static void normal_clear_page(void *to)
{
	enum { PAGE_BYTES = 4096 };

	(void) __constant_c_and_count_memset(to, 0, PAGE_BYTES);
}

/*
 * Test version to see if we can go even faster: copy_page() without
 * prefetch instructions.
 *
 * First touches the source page with plain loads, one per 64 bytes,
 * walking backwards from the end of the page to its start.  Then copies
 * the 4096-byte page from 'from' to 'to' in 64-byte chunks, each MMX load
 * immediately followed by its movntq (non-temporal) store.  The x87/MMX
 * state is saved before the copy loop and reloaded before returning.
 *
 * to:   destination page
 * from: source page
 */
static void no_prefetch_copy_page(void *to, void *from) {
	int i, d1;
	long off;	/* pointer-width index: base and index registers must match in size */
	char fpu_save[108];	/* fsave image of the caller's x87/MMX state */
	const char *src = from;
	char *dst = to;

	/* Backward read pass over the source; the loaded values are discarded. */
	for (off=4096-256;off>=0;off-=256)
		__asm__ __volatile(
			"movl 192(%1,%2),%0\n"
			"movl 128(%1,%2),%0\n"
			"movl 64(%1,%2),%0\n"
			"movl 0(%1,%2),%0\n"
			: "=&r" (d1)
			: "r" (src), "r" (off));

	/* Save the FPU state before touching the MMX registers. */
	__asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );

	/* 64 iterations x 64 bytes = 4096 bytes. */
	for(i=0; i<4096/64; i++) {
		__asm__ __volatile__ (
		"   movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
		: : "r" (src), "r" (dst) : "memory");
		src+=64;
		dst+=64;
	}
	/* Fence the non-temporal stores, leave MMX state, reload the saved state. */
	__asm__ __volatile__ (
		" sfence \n "
		" emms\n"
		" frstor %0;\n" ::"m"(fpu_save[0]) );
}


/*
 * Read the CPU time-stamp counter: the low 32 bits go to 'low' and the
 * high 32 bits to 'high'.  Both arguments must be assignable lvalues.
 */
#define rdtsc(low, high) \
	__asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high))

/* Signatures of the page-clear and page-copy routines under test. */
typedef void clear_func(void *page);
typedef void copy_func(void *to, void *from);

/*
 * Time one clear_page implementation and print the result.
 *
 * Calls func() on 4*1024 consecutive 4096-byte pages starting at Buffer
 * (so Buffer must hold at least 16 MiB) and reports the average
 * time-stamp-counter cycles per page.
 *
 * func:   page-clear routine to measure
 * name:   label printed with the result
 * Buffer: start of the memory to clear
 */
void test_one_clearpage(clear_func *func, char *name, char *Buffer)
{
	enum { PAGE_BYTES = 4096, PAGE_COUNT = 4*1024 };
	char *temp;
	int i;
	unsigned int blow,bhigh,alow,ahigh;
	unsigned long long before,after;

	rdtsc(blow,bhigh);
	temp = Buffer;
	for (i=0;i<PAGE_COUNT;i++) {
		func(temp);
		temp += PAGE_BYTES;
	}
	rdtsc(alow,ahigh);
	/* Assemble the 64-bit counter values in unsigned arithmetic. */
	before = blow + (((unsigned long long)bhigh)<<32);
	after = alow + (((unsigned long long)ahigh)<<32);
	if (before>after) {
		printf("test invalid; timer overflow \n");
		return;
	}
	/* %llu: the quotient is unsigned long long. */
	printf("clear_page function '%s'\t took %4llu cycles per page\n",name,(after-before)/PAGE_COUNT );
}
     
/*
 * Time one copy_page implementation and print the result.
 *
 * After a one-second sleep, calls func() for 2*1024 consecutive
 * 4096-byte pages: the destination walks up from Buffer and the source
 * is always 8 MiB above the destination (so Buffer must hold at least
 * 16 MiB).  Reports the average time-stamp-counter cycles per page.
 *
 * func:   page-copy routine to measure, called as func(to, from)
 * name:   label printed with the result
 * Buffer: start of the 16 MiB work area
 */
void test_one_copypage(copy_func *func, char *name, char *Buffer)
{
	enum { PAGE_BYTES = 4096, PAGE_COUNT = 2*1024, SRC_OFFSET = 8*1024*1024 };
	char *temp;
	int i;
	unsigned int blow,bhigh,alow,ahigh;
	unsigned long long before,after;

	sleep(1);
	rdtsc(blow,bhigh);
	temp = Buffer;
	for (i=0;i<PAGE_COUNT;i++) {
		func(temp,temp+SRC_OFFSET);
		temp += PAGE_BYTES;
	}
	rdtsc(alow,ahigh);
	/* Assemble the 64-bit counter values in unsigned arithmetic. */
	before = blow + (((unsigned long long)bhigh)<<32);
	after = alow + (((unsigned long long)ahigh)<<32);
	if (before>after) {
		printf("test invalid; timer overflow \n");
		return;
	}
	/* %llu: the quotient is unsigned long long. */
	printf("copy_page function '%s'\t took %4llu cycles per page\n",name,(after-before)/PAGE_COUNT );
}
     
     
void test_clearpage(char *Buffer)
{
	printf("clear_page() tests \n");

	test_one_clearpage(fast_clear_page,"warm up run",Buffer);
	test_one_clearpage(normal_clear_page,"2.4 non MMX",Buffer);
	test_one_clearpage(slow_zero_page,"2.4 MMX fallback",Buffer);
	test_one_clearpage(fast_clear_page,"2.4 MMX version",Buffer);
	test_one_clearpage(faster_clear_page,"faster_clear_page",Buffer);
	test_one_clearpage(even_faster_clear_page,"even_faster_clear",Buffer);
}   

void test_copypage(char *Buffer)
{
	printf("copy_page() tests \n");
	
	test_one_copypage(fast_copy_page,  "warm up run",Buffer);
	test_one_copypage(normal_copy_page,"2.4 non MMX",Buffer);
	test_one_copypage(slow_copy_page,  "2.4 MMX fallback",Buffer);
	test_one_copypage(fast_copy_page,  "2.4 MMX version",Buffer);
	test_one_copypage(faster_copy_page,"faster_copy",Buffer);
	test_one_copypage(even_faster_copy_page,"even_faster",Buffer);
	test_one_copypage(no_prefetch_copy_page,"no_prefetch",Buffer);
}

/*
 * Allocate a 16 MiB work buffer, fill it with 0xfe, print the banner and
 * run the copy_page benchmarks.  Only test_copypage() is run here;
 * test_clearpage() is not invoked.
 *
 * Returns 0 on success, EXIT_FAILURE if the buffer cannot be allocated.
 */
int main(void)
{
	const size_t buffer_bytes = 1024*1024*16;
	char *Buffer;
	
	Buffer = malloc(buffer_bytes);
	if (Buffer == NULL) {
		/* Without this check the memset below would write through NULL. */
		fprintf(stderr, "Athlon test program: cannot allocate %lu bytes\n",
			(unsigned long)buffer_bytes);
		return EXIT_FAILURE;
	}
	memset(Buffer,0xfe,buffer_bytes);
	
	printf("Athlon test program %s \n",cvsid);
	
	printf("\n");
	test_copypage(Buffer);

	free(Buffer);

	return 0;
}

next             reply	other threads:[~2002-10-24 17:09 UTC|newest]

Thread overview: 51+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2002-10-24 17:15 Manfred Spraul [this message]
2002-10-24 17:37 ` [CFT] faster athlon/duron memory copy implementation Robert Love
2002-10-24 18:05   ` Zach Brown
2002-10-24 17:41 ` Andreas Steinmetz
2002-10-24 17:48 ` Matthias Welk
2002-10-24 19:01   ` erich
2002-10-24 19:11     ` Arjan van de Ven
2002-10-24 19:38     ` Manfred Spraul
2002-10-25  0:59       ` Panagiotis Papadakos
2002-10-24 17:53 ` Roger Luethi
2002-10-24 18:10 ` Daniel Egger
2002-10-24 19:15   ` Florin Iucha
2002-10-24 19:28   ` Manfred Spraul
2002-10-24 19:38     ` Dave Jones
2002-10-24 19:43     ` Ken Witherow
2002-10-25 13:08     ` Daniel Egger
2002-10-24 18:17 ` Eric Lammerts
2002-10-24 18:26 ` David Rees
2002-10-24 18:35 ` Josh McKinney
2002-10-24 18:36 ` Dave Jones
2002-10-24 18:43 ` Simon Fowler
2002-10-24 18:50   ` Simon Fowler
2002-10-24 18:56   ` Dave Jones
2002-10-24 18:48 ` Ernst Herzberg
2002-10-24 20:09   ` Ed Sweetman
2002-10-24 20:13     ` Robert Love
2002-10-24 20:31       ` Ed Sweetman
2002-10-24 20:49         ` Dave Jones
2002-10-24 20:26     ` Dave Jones
2002-10-25  9:19       ` Måns Rullgård
2002-10-24 19:11 ` Marcus Libäck
2002-10-24 19:19 ` Brian Gerst
2002-10-24 19:31 ` Matthias Schniedermeyer
2002-10-24 19:33 ` Pascal Schmidt
2002-10-24 19:39 ` Olaf Dietsche
2002-10-24 20:27 ` Mike Civil
2002-10-24 20:44 ` Willy TARREAU
2002-10-24 21:46 ` Josh Fryman
2002-10-24 22:18 ` Tim Schmielau
2002-10-24 23:09 ` Hirokazu Takahashi
2002-10-24 23:37 ` Ryan Cumming
2002-10-25  0:10 ` Matthias Andree
2002-10-25  8:35 ` venom
2002-10-25 13:31 ` Denis Vlasenko
2002-10-26 12:11 ` Jurjen Oskam
  -- strict thread matches above, loose matches on Subject: below --
2002-10-24 18:27 Shawn Starr
2002-10-24 20:51 Dieter Nützel
2002-10-24 21:01 ` Dieter Nützel
2002-10-24 21:16 ` Willy TARREAU
2002-10-24 22:01 Harm Verhagen
2002-10-25 16:29 Jorge Bernal "Koke"

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=3DB82ABF.8030706@colorfullife.com \
    --to=manfred@colorfullife.com \
    --cc=arjanv@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.