public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Jens Maurer <Jens.Maurer@gmx.net>
To: Linux Kernel <linux-kernel@vger.kernel.org>
Subject: [PATCH] Use x86 SSE instructions for clear_page, copy_page
Date: Tue, 17 Aug 2004 08:13:37 +0200	[thread overview]
Message-ID: <4121A211.8080902@gmx.net> (raw)

[-- Attachment #1: Type: text/plain, Size: 8999 bytes --]

The attached patch (against kernel 2.6.8.1) enables using SSE
instructions for copy_page and clear_page.

A user-space test on my Pentium III 850 MHz shows a 3x speedup for
clear_page (compared to the default "rep stosl"), and a 50% speedup
for copy_page (compared to the default "rep movsl").  For a Pentium-4,
the speedup is about 50% in both the clear_page and copy_page cases.

The attached (admittedly perverse) user-space program
"malloc-fork-load.c" takes 30 sec with stock kernel 2.6.8.1, which
improves to about 15 sec when running a kernel with the attached
kernel patch applied.

Notes: I cannot replace clear_page and copy_page with their SSE
equivalents at compile time, because clear_page is used before the CPU
is fully set up (in particular, before the CR4.OSFXSR bit is set; without
it, SSE instructions kill the kernel with an invalid operand exception).  The
current function-pointer based approach could be extended to include
the current MMX-based improvements for AMD CPUs as well.  If a
function pointer is considered too wasteful for a boot-time
initialization issue, a "memcpy" approach similar to the
"apply_alternatives()" code modifications would be possible.

Please test.

Jens Maurer


diff -urN -X /home/jmaurer/Linux/excludes-for-diff.txt linux-2.6.8.1.orig/arch/i386/Kconfig linux-2.6.8.1/arch/i386/Kconfig
--- linux-2.6.8.1.orig/arch/i386/Kconfig	Mon Aug 16 22:02:03 2004
+++ linux-2.6.8.1/arch/i386/Kconfig	Mon Aug 16 21:56:09 2004
@@ -419,6 +419,11 @@
  	depends on MCYRIXIII || MK7
  	default y

+config X86_USE_SSE
+	bool
+	depends on MPENTIUMIII || MPENTIUMM || MPENTIUM4
+	default y
+
  config X86_OOSTORE
  	bool
  	depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR
diff -urN -X /home/jmaurer/Linux/excludes-for-diff.txt linux-2.6.8.1.orig/arch/i386/kernel/setup.c linux-2.6.8.1/arch/i386/kernel/setup.c
--- linux-2.6.8.1.orig/arch/i386/kernel/setup.c	Mon Aug 16 22:02:04 2004
+++ linux-2.6.8.1/arch/i386/kernel/setup.c	Mon Aug 16 21:56:14 2004
@@ -1241,13 +1241,46 @@
  }

  static int no_replacement __initdata = 0;
-
+
+#ifdef CONFIG_X86_USE_SSE
+
+static void std_clear_page(void *page)
+{
+	int d0, d1;
+	asm volatile("cld\n\t"
+		     "rep; stosl"
+		     : "=&c" (d0), "=&D" (d1)
+		     : "a" (0), "0" (PAGE_SIZE/4), "1" (page)
+		     : "memory");
+}
+
+static void std_copy_page(void *to, void *from)
+{
+	int d0, d1, d2;
+	asm volatile("cld\n\t"
+		     "rep; movsl"
+		     : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+		     : "0" (PAGE_SIZE/4), "1" (to), "2" (from)
+		     : "memory");
+}
+
+void (*__sse_clear_page)(void *) = &std_clear_page;
+void (*__sse_copy_page)(void *, void *) = &std_copy_page;
+EXPORT_SYMBOL(__sse_clear_page);
+EXPORT_SYMBOL(__sse_copy_page);
+#endif
+
  void __init alternative_instructions(void)
  {
  	extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
+	extern void activate_sse_replacements(void);
  	if (no_replacement)
  		return;
  	apply_alternatives(__alt_instructions, __alt_instructions_end);
+
+#ifdef CONFIG_X86_USE_SSE
+	activate_sse_replacements();
+#endif
  }

  static int __init noreplacement_setup(char *s)
diff -urN -X /home/jmaurer/Linux/excludes-for-diff.txt linux-2.6.8.1.orig/arch/i386/lib/Makefile linux-2.6.8.1/arch/i386/lib/Makefile
--- linux-2.6.8.1.orig/arch/i386/lib/Makefile	Mon Aug 16 22:02:05 2004
+++ linux-2.6.8.1/arch/i386/lib/Makefile	Mon Aug 16 21:56:15 2004
@@ -7,4 +7,5 @@
  	bitops.o

  lib-$(CONFIG_X86_USE_3DNOW) += mmx.o
+lib-$(CONFIG_X86_USE_SSE) += sse.o
  lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o
diff -urN -X /home/jmaurer/Linux/excludes-for-diff.txt linux-2.6.8.1.orig/arch/i386/lib/sse.c linux-2.6.8.1/arch/i386/lib/sse.c
--- linux-2.6.8.1.orig/arch/i386/lib/sse.c	Thu Jan  1 01:00:00 1970
+++ linux-2.6.8.1/arch/i386/lib/sse.c	Mon Aug  9 00:57:23 2004
@@ -0,0 +1,115 @@
+/*
+ * linux/arch/i386/lib/sse.c
+ *
+ * Copyright 2004 Jens Maurer
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * Send feedback to <Jens.Maurer@gmx.net>
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/preempt.h>
+#include <asm/page.h>
+#include <asm/system.h>
+
+
+/*
+ *	SSE library helper functions
+ */
+
+#define SSE_START(cr0) do { \
+	preempt_disable(); \
+	cr0 = read_cr0(); \
+	clts(); \
+	} while(0)
+
+
+#define SSE_END(cr0) do { \
+	write_cr0(cr0); \
+	preempt_enable(); \
+	} while(0)
+
+static void sse_clear_page(void * page)
+{
+	unsigned char xmm_save[16];
+	unsigned int cr0;
+	int i;
+
+	SSE_START(cr0);
+	asm volatile("movups %%xmm0, (%0)\n\t"
+		     "xorps %%xmm0, %%xmm0"
+		     : : "r" (xmm_save));
+	for(i = 0; i < PAGE_SIZE/16/4; i++) {
+		asm volatile("movntps %%xmm0,   (%0)\n\t"
+			     "movntps %%xmm0, 16(%0)\n\t"
+			     "movntps %%xmm0, 32(%0)\n\t"
+			     "movntps %%xmm0, 48(%0)"
+			     : : "r"(page) : "memory");
+		page += 16*4;
+	}
+	asm volatile("sfence\n\t"
+		     "movups (%0), %%xmm0"
+		     : : "r" (xmm_save) : "memory");
+	SSE_END(cr0);
+}
+
+static void sse_copy_page(void *to, void *from)
+{
+	unsigned char tmp[16*4+15] __attribute__((aligned(16)));
+	/* gcc 3.4.x does not honor alignment requests for stack variables */
+	unsigned char * xmm_save =
+		(unsigned char *)ALIGN((unsigned long)tmp, 16);
+	unsigned int cr0;
+	int i;
+
+	SSE_START(cr0);
+	asm volatile("movaps %%xmm0,   (%0)\n\t"
+		     "movaps %%xmm1, 16(%0)\n\t"
+		     "movaps %%xmm2, 32(%0)\n\t"
+		     "movaps %%xmm3, 48(%0)"
+		     : : "r" (xmm_save));
+	for(i = 0; i < 4096/16/4; i++) {
+		asm volatile("movaps   (%0), %%xmm0\n\t"
+			     "movaps 16(%0), %%xmm1\n\t"
+			     "movaps 32(%0), %%xmm2\n\t"
+			     "movaps 48(%0), %%xmm3\n\t"
+			     "movntps %%xmm0,   (%1)\n\t"
+			     "movntps %%xmm1, 16(%1)\n\t"
+			     "movntps %%xmm2, 32(%1)\n\t"
+			     "movntps %%xmm3, 48(%1)"
+			     : : "r" (from), "r" (to) : "memory");
+		from += 16*4;
+		to += 16*4;
+	}
+	asm volatile("sfence\n"
+		     "movaps   (%0), %%xmm0\n\t"
+		     "movaps 16(%0), %%xmm1\n\t"
+		     "movaps 32(%0), %%xmm2\n\t"
+		     "movaps 48(%0), %%xmm3"
+		     : : "r" (xmm_save) : "memory");
+	SSE_END(cr0);
+}
+
+void activate_sse_replacements(void)
+{
+	if(cpu_has_xmm && (mmu_cr4_features & X86_CR4_OSFXSR)) {
+		__sse_clear_page = &sse_clear_page;
+		__sse_copy_page = &sse_copy_page;
+	}
+}
diff -urN -X /home/jmaurer/Linux/excludes-for-diff.txt linux-2.6.8.1.orig/include/asm-i386/page.h linux-2.6.8.1/include/asm-i386/page.h
--- linux-2.6.8.1.orig/include/asm-i386/page.h	Mon Aug 16 22:04:14 2004
+++ linux-2.6.8.1/include/asm-i386/page.h	Mon Aug 16 21:58:35 2004
@@ -21,6 +21,15 @@
  #define clear_page(page)	mmx_clear_page((void *)(page))
  #define copy_page(to,from)	mmx_copy_page(to,from)

+#elif defined(CONFIG_X86_USE_SSE)
+
+#include <asm/sse.h>
+
+extern void (*__sse_clear_page)(void *);
+extern void (*__sse_copy_page)(void *, void*);
+#define clear_page(page)	(*__sse_clear_page)(page)
+#define copy_page(to,from)	(*__sse_copy_page)(to,from)
+
  #else

  /*
diff -urN -X /home/jmaurer/Linux/excludes-for-diff.txt linux-2.6.8.1.orig/include/asm-i386/sse.h linux-2.6.8.1/include/asm-i386/sse.h
--- linux-2.6.8.1.orig/include/asm-i386/sse.h	Thu Jan  1 01:00:00 1970
+++ linux-2.6.8.1/include/asm-i386/sse.h	Sun Aug  8 22:21:36 2004
@@ -0,0 +1,34 @@
+/*
+ * linux/include/asm-i386/sse.h
+ *
+ * Copyright 2004 Jens Maurer
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#ifndef _ASM_SSE_H
+#define _ASM_SSE_H
+
+/*
+ *	SSE helper operations
+ */
+
+#include <linux/types.h>
+
+extern void sse_clear_page(void *page);
+extern void sse_copy_page(void *to, void *from);
+
+#endif

[-- Attachment #2: malloc-fork-load.c --]
[-- Type: text/plain, Size: 801 bytes --]


#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>


#define N 20240
#define SIZE 4096

/*
 * Page-allocation / page-copy load generator.
 *
 * Runs ten rounds.  Each round maps N private anonymous pages of SIZE
 * bytes, writes one byte into every page, then forks a child that writes
 * one more byte into every page ("force copy") and exits; the parent
 * waits for the child and unmaps the region.
 *
 * Returns EXIT_SUCCESS after the last round, or EXIT_FAILURE if mmap()
 * fails.  A failed fork() or waitpid() is reported and the program moves
 * on to the next round.
 */
int main(void)
{
  const size_t len = (size_t)N * SIZE;
  int k;

  for(k = 0; k < 10; k++) {
    int i;
    pid_t pid;
    /* anonymous mappings take fd -1; some systems reject anything else */
    unsigned char *mem = mmap(NULL, len, PROT_READ|PROT_WRITE,
  			MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
    if(mem == MAP_FAILED) {
      /* nothing to touch without a mapping; writing to MAP_FAILED faults */
      perror("mmap");
      return EXIT_FAILURE;
    }
    printf("pagesize: %d\n", getpagesize());
    for(i = 0; i < N; i++)
      mem[(size_t)i*SIZE] = i*1000000007ul;
    printf("pages allocated\n");
    /*
     * Flush before forking: otherwise text still sitting in the stdio
     * buffer is inherited by the child and printed a second time when
     * the child exits (visible when stdout is a pipe or a file).
     */
    fflush(stdout);
    pid = fork();
    if(pid == 0) {
      /* child */
      for(i = 0; i < N; i++)
        mem[(size_t)i*SIZE+1] = i;          /* force copy */
      printf("copy complete\n");
      exit(0);
    } else if(pid == -1) {
      perror("fork");
    } else {
      /* parent */
      if(waitpid(pid, NULL, 0) == -1)
        perror("waitpid");
    }
    munmap(mem, len);
  }
  return EXIT_SUCCESS;
}

             reply	other threads:[~2004-08-17  6:13 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2004-08-17  6:13 Jens Maurer [this message]
2004-08-17  7:27 ` [PATCH] Use x86 SSE instructions for clear_page, copy_page Arjan van de Ven
2004-08-17  8:10   ` Andrey Panin
2004-08-17  8:11     ` Arjan van de Ven
2004-08-17 22:40   ` Jens Maurer
2004-08-18  2:33     ` David S. Miller
2004-08-22 20:49       ` Jens Maurer
2004-08-18  7:00 ` Ingo Molnar
2004-08-18  7:11   ` Ingo Molnar

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4121A211.8080902@gmx.net \
    --to=jens.maurer@gmx.net \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox