linux-raid.vger.kernel.org archive mirror
* [PATCH] arch/x86: use kernel_fpu_[begin|end] for RAID5 checksumming
@ 2012-04-30 23:51 Jim Kukunas
  2012-05-03  4:57 ` NeilBrown
  0 siblings, 1 reply; 3+ messages in thread
From: Jim Kukunas @ 2012-04-30 23:51 UTC (permalink / raw)
  To: neilb; +Cc: linux-raid, hpa, linux-kernel

Currently, the SSE and AVX xor functions manually save and restore the
[X|Y]MM registers. Instead, we should use kernel_fpu_[begin|end].

This patch sacrifices some throughput, ~5-10% for AVX and ~2% for SSE, in
exchange for safety against future FPU corruption bugs.
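
For reference, every converted routine now has the same shape; the sketch
below is illustrative only (hypothetical function name, empty body), the
real conversions are in the diff:

#include <asm/i387.h>

static void xor_sse_example(unsigned long bytes,
			    unsigned long *p1, unsigned long *p2)
{
	/* Disables preemption and saves whatever FPU/SIMD state the
	 * current task owns, so the XMM/YMM registers may be used. */
	kernel_fpu_begin();

	/* ... the existing SSE/AVX inline asm over p1/p2 ... */

	/* Ends the FPU-using section and re-enables preemption. */
	kernel_fpu_end();
}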

Patch applies to md/for-next.

Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>
---
 arch/x86/include/asm/xor_32.h  |   56 +++++-------------------------------
 arch/x86/include/asm/xor_64.h  |   61 ++++++---------------------------------
 arch/x86/include/asm/xor_avx.h |   55 ++++++++----------------------------
 3 files changed, 30 insertions(+), 142 deletions(-)

diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index 4545708..aabd585 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -534,38 +534,6 @@ static struct xor_block_template xor_block_p5_mmx = {
  * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
  */
 
-#define XMMS_SAVE				\
-do {						\
-	preempt_disable();			\
-	cr0 = read_cr0();			\
-	clts();					\
-	asm volatile(				\
-		"movups %%xmm0,(%0)	;\n\t"	\
-		"movups %%xmm1,0x10(%0)	;\n\t"	\
-		"movups %%xmm2,0x20(%0)	;\n\t"	\
-		"movups %%xmm3,0x30(%0)	;\n\t"	\
-		:				\
-		: "r" (xmm_save) 		\
-		: "memory");			\
-} while (0)
-
-#define XMMS_RESTORE				\
-do {						\
-	asm volatile(				\
-		"sfence			;\n\t"	\
-		"movups (%0),%%xmm0	;\n\t"	\
-		"movups 0x10(%0),%%xmm1	;\n\t"	\
-		"movups 0x20(%0),%%xmm2	;\n\t"	\
-		"movups 0x30(%0),%%xmm3	;\n\t"	\
-		:				\
-		: "r" (xmm_save)		\
-		: "memory");			\
-	write_cr0(cr0);				\
-	preempt_enable();			\
-} while (0)
-
-#define ALIGN16 __attribute__((aligned(16)))
-
 #define OFFS(x)		"16*("#x")"
 #define PF_OFFS(x)	"256+16*("#x")"
 #define	PF0(x)		"	prefetchnta "PF_OFFS(x)"(%1)		;\n"
@@ -587,10 +555,8 @@ static void
 xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
 {
 	unsigned long lines = bytes >> 8;
-	char xmm_save[16*4] ALIGN16;
-	int cr0;
 
-	XMMS_SAVE;
+	kernel_fpu_begin();
 
 	asm volatile(
 #undef BLOCK
@@ -633,7 +599,7 @@ xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
 	:
 	: "memory");
 
-	XMMS_RESTORE;
+	kernel_fpu_end();
 }
 
 static void
@@ -641,10 +607,8 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	  unsigned long *p3)
 {
 	unsigned long lines = bytes >> 8;
-	char xmm_save[16*4] ALIGN16;
-	int cr0;
 
-	XMMS_SAVE;
+	kernel_fpu_begin();
 
 	asm volatile(
 #undef BLOCK
@@ -694,7 +658,7 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	:
 	: "memory" );
 
-	XMMS_RESTORE;
+	kernel_fpu_end();
 }
 
 static void
@@ -702,10 +666,8 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	  unsigned long *p3, unsigned long *p4)
 {
 	unsigned long lines = bytes >> 8;
-	char xmm_save[16*4] ALIGN16;
-	int cr0;
 
-	XMMS_SAVE;
+	kernel_fpu_begin();
 
 	asm volatile(
 #undef BLOCK
@@ -762,7 +724,7 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	:
 	: "memory" );
 
-	XMMS_RESTORE;
+	kernel_fpu_end();
 }
 
 static void
@@ -770,10 +732,8 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
 {
 	unsigned long lines = bytes >> 8;
-	char xmm_save[16*4] ALIGN16;
-	int cr0;
 
-	XMMS_SAVE;
+	kernel_fpu_begin();
 
 	/* Make sure GCC forgets anything it knows about p4 or p5,
 	   such that it won't pass to the asm volatile below a
@@ -850,7 +810,7 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	   like assuming they have some legal value.  */
 	asm("" : "=r" (p4), "=r" (p5));
 
-	XMMS_RESTORE;
+	kernel_fpu_end();
 }
 
 static struct xor_block_template xor_block_pIII_sse = {
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
index b9b2323..715f904 100644
--- a/arch/x86/include/asm/xor_64.h
+++ b/arch/x86/include/asm/xor_64.h
@@ -34,41 +34,7 @@
  * no advantages to be gotten from x86-64 here anyways.
  */
 
-typedef struct {
-	unsigned long a, b;
-} __attribute__((aligned(16))) xmm_store_t;
-
-/* Doesn't use gcc to save the XMM registers, because there is no easy way to
-   tell it to do a clts before the register saving. */
-#define XMMS_SAVE				\
-do {						\
-	preempt_disable();			\
-	asm volatile(				\
-		"movq %%cr0,%0		;\n\t"	\
-		"clts			;\n\t"	\
-		"movups %%xmm0,(%1)	;\n\t"	\
-		"movups %%xmm1,0x10(%1)	;\n\t"	\
-		"movups %%xmm2,0x20(%1)	;\n\t"	\
-		"movups %%xmm3,0x30(%1)	;\n\t"	\
-		: "=&r" (cr0)			\
-		: "r" (xmm_save) 		\
-		: "memory");			\
-} while (0)
-
-#define XMMS_RESTORE				\
-do {						\
-	asm volatile(				\
-		"sfence			;\n\t"	\
-		"movups (%1),%%xmm0	;\n\t"	\
-		"movups 0x10(%1),%%xmm1	;\n\t"	\
-		"movups 0x20(%1),%%xmm2	;\n\t"	\
-		"movups 0x30(%1),%%xmm3	;\n\t"	\
-		"movq 	%0,%%cr0	;\n\t"	\
-		:				\
-		: "r" (cr0), "r" (xmm_save)	\
-		: "memory");			\
-	preempt_enable();			\
-} while (0)
+#include <asm/i387.h>
 
 #define OFFS(x)		"16*("#x")"
 #define PF_OFFS(x)	"256+16*("#x")"
@@ -91,10 +57,8 @@ static void
 xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
 {
 	unsigned int lines = bytes >> 8;
-	unsigned long cr0;
-	xmm_store_t xmm_save[4];
 
-	XMMS_SAVE;
+	kernel_fpu_begin();
 
 	asm volatile(
 #undef BLOCK
@@ -135,7 +99,7 @@ xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
 	: [inc] "r" (256UL)
 	: "memory");
 
-	XMMS_RESTORE;
+	kernel_fpu_end();
 }
 
 static void
@@ -143,10 +107,8 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	  unsigned long *p3)
 {
 	unsigned int lines = bytes >> 8;
-	xmm_store_t xmm_save[4];
-	unsigned long cr0;
 
-	XMMS_SAVE;
+	kernel_fpu_begin();
 
 	asm volatile(
 #undef BLOCK
@@ -194,7 +156,8 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
 	: [inc] "r" (256UL)
 	: "memory");
-	XMMS_RESTORE;
+
+	kernel_fpu_end();
 }
 
 static void
@@ -202,10 +165,8 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	  unsigned long *p3, unsigned long *p4)
 {
 	unsigned int lines = bytes >> 8;
-	xmm_store_t xmm_save[4];
-	unsigned long cr0;
 
-	XMMS_SAVE;
+	kernel_fpu_begin();
 
 	asm volatile(
 #undef BLOCK
@@ -261,7 +222,7 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	: [inc] "r" (256UL)
 	: "memory" );
 
-	XMMS_RESTORE;
+	kernel_fpu_end();
 }
 
 static void
@@ -269,10 +230,8 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
 {
 	unsigned int lines = bytes >> 8;
-	xmm_store_t xmm_save[4];
-	unsigned long cr0;
 
-	XMMS_SAVE;
+	kernel_fpu_begin();
 
 	asm volatile(
 #undef BLOCK
@@ -336,7 +295,7 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	: [inc] "r" (256UL)
 	: "memory");
 
-	XMMS_RESTORE;
+	kernel_fpu_end();
 }
 
 static struct xor_block_template xor_block_sse = {
diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h
index 2510d35..8d2795a 100644
--- a/arch/x86/include/asm/xor_avx.h
+++ b/arch/x86/include/asm/xor_avx.h
@@ -17,35 +17,8 @@
 
 #ifdef CONFIG_AS_AVX
 
-#include <linux/compiler.h>
 #include <asm/i387.h>
 
-#define ALIGN32 __aligned(32)
-
-#define YMM_SAVED_REGS 4
-
-#define YMMS_SAVE \
-do { \
-	preempt_disable(); \
-	cr0 = read_cr0(); \
-	clts(); \
-	asm volatile("vmovaps %%ymm0, %0" : "=m" (ymm_save[0]) : : "memory"); \
-	asm volatile("vmovaps %%ymm1, %0" : "=m" (ymm_save[32]) : : "memory"); \
-	asm volatile("vmovaps %%ymm2, %0" : "=m" (ymm_save[64]) : : "memory"); \
-	asm volatile("vmovaps %%ymm3, %0" : "=m" (ymm_save[96]) : : "memory"); \
-} while (0);
-
-#define YMMS_RESTORE \
-do { \
-	asm volatile("sfence" : : : "memory"); \
-	asm volatile("vmovaps %0, %%ymm3" : : "m" (ymm_save[96])); \
-	asm volatile("vmovaps %0, %%ymm2" : : "m" (ymm_save[64])); \
-	asm volatile("vmovaps %0, %%ymm1" : : "m" (ymm_save[32])); \
-	asm volatile("vmovaps %0, %%ymm0" : : "m" (ymm_save[0])); \
-	write_cr0(cr0); \
-	preempt_enable(); \
-} while (0);
-
 #define BLOCK4(i) \
 		BLOCK(32 * i, 0) \
 		BLOCK(32 * (i + 1), 1) \
@@ -60,10 +33,9 @@ do { \
 
 static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
 {
-	unsigned long cr0, lines = bytes >> 9;
-	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+	unsigned long lines = bytes >> 9;
 
-	YMMS_SAVE
+	kernel_fpu_begin();
 
 	while (lines--) {
 #undef BLOCK
@@ -82,16 +54,15 @@ do { \
 		p1 = (unsigned long *)((uintptr_t)p1 + 512);
 	}
 
-	YMMS_RESTORE
+	kernel_fpu_end();
 }
 
 static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
 	unsigned long *p2)
 {
-	unsigned long cr0, lines = bytes >> 9;
-	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+	unsigned long lines = bytes >> 9;
 
-	YMMS_SAVE
+	kernel_fpu_begin();
 
 	while (lines--) {
 #undef BLOCK
@@ -113,16 +84,15 @@ do { \
 		p2 = (unsigned long *)((uintptr_t)p2 + 512);
 	}
 
-	YMMS_RESTORE
+	kernel_fpu_end();
 }
 
 static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
 	unsigned long *p2, unsigned long *p3)
 {
-	unsigned long cr0, lines = bytes >> 9;
-	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+	unsigned long lines = bytes >> 9;
 
-	YMMS_SAVE
+	kernel_fpu_begin();
 
 	while (lines--) {
 #undef BLOCK
@@ -147,16 +117,15 @@ do { \
 		p3 = (unsigned long *)((uintptr_t)p3 + 512);
 	}
 
-	YMMS_RESTORE
+	kernel_fpu_end();
 }
 
 static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
 	unsigned long *p2, unsigned long *p3, unsigned long *p4)
 {
-	unsigned long cr0, lines = bytes >> 9;
-	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+	unsigned long lines = bytes >> 9;
 
-	YMMS_SAVE
+	kernel_fpu_begin();
 
 	while (lines--) {
 #undef BLOCK
@@ -184,7 +153,7 @@ do { \
 		p4 = (unsigned long *)((uintptr_t)p4 + 512);
 	}
 
-	YMMS_RESTORE
+	kernel_fpu_end();
 }
 
 static struct xor_block_template xor_block_avx = {
-- 
1.7.8.6


* Re: [PATCH] arch/x86: use kernel_fpu_[begin|end] for RAID5 checksumming
  2012-04-30 23:51 [PATCH] arch/x86: use kernel_fpu_[begin|end] for RAID5 checksumming Jim Kukunas
@ 2012-05-03  4:57 ` NeilBrown
  2012-05-03  6:43   ` H. Peter Anvin
  0 siblings, 1 reply; 3+ messages in thread
From: NeilBrown @ 2012-05-03  4:57 UTC (permalink / raw)
  To: Jim Kukunas; +Cc: linux-raid, hpa, linux-kernel

On Mon, 30 Apr 2012 16:51:15 -0700 Jim Kukunas
<james.t.kukunas@linux.intel.com> wrote:

> Currently, the SSE and AVX xor functions manually save and restore the
> [X|Y]MM registers. Instead, we should use kernel_fpu_[begin|end].
> 
> This patch sacrifices some throughput, ~5-10% for AVX and ~2% for SSE, in
> exchange for safety against future FPU corruption bugs.
> 
> Patch applies to md/for-next.
> 
> Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>

Hi Jim,
 I'm really not able to assess the validity of this patch.
I don't doubt it exactly, but I'd value a second opinion just to be confident.
Peter: can you review and comment?

Thanks,
NeilBrown


* Re: [PATCH] arch/x86: use kernel_fpu_[begin|end] for RAID5 checksumming
  2012-05-03  4:57 ` NeilBrown
@ 2012-05-03  6:43   ` H. Peter Anvin
  0 siblings, 0 replies; 3+ messages in thread
From: H. Peter Anvin @ 2012-05-03  6:43 UTC (permalink / raw)
  To: NeilBrown; +Cc: Jim Kukunas, linux-raid, linux-kernel

On 05/02/2012 09:57 PM, NeilBrown wrote:
> On Mon, 30 Apr 2012 16:51:15 -0700 Jim Kukunas 
> <james.t.kukunas@linux.intel.com> wrote:
> 
>> Currently, the SSE and AVX xor functions manually save and 
>> restore the [X|Y]MM registers. Instead, we should use 
>> kernel_fpu_[begin|end].
>> 
>> This patch sacrifices some throughput, ~5-10% for AVX and ~2% for
>> SSE, in exchange for safety against future FPU corruption bugs.
>> 
>> Patch applies to md/for-next.
>> 
>> Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>
> 
> Hi Jim, I'm really not able to assess the validity of this patch.
> I don't doubt it exactly, but I'd value a second opinion just to
> be confident. Peter: can you review and comment?
> 
> Thanks, NeilBrown
> 

Are these functions ever called with unaligned arguments?

Other than that, the patch is correct, and is the preferred way to do
this, but I'm unhappy about the cost.  I have suggested to Jim a few
ways by which we might be able to mitigate the cost or even improve
performance, but the implementation depends very much on the above
question (unlike the RAID-6 code, which assumes aligned buffers at all
times.)
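
Purely as a sketch of that assumption (this helper is hypothetical, not
something in the patch or in the tree): if the answer is "never", the
callers could make it explicit along these lines:

#include <linux/types.h>
#include <linux/kernel.h>

/* The SSE paths move data in 16-byte (XMM) chunks and the AVX paths in
 * 32-byte (YMM) chunks, so 32 bytes is the stricter alignment. */
static inline bool xor_bufs_aligned(const void *p1, const void *p2)
{
	return IS_ALIGNED((unsigned long)p1, 32) &&
	       IS_ALIGNED((unsigned long)p2, 32);
}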

	-hpa
