* AVX RAID5 xor checksumming
@ 2012-03-29 21:44 Jim Kukunas
2012-03-29 21:44 ` [PATCH] raid5: add AVX optimized RAID5 checksumming Jim Kukunas
2012-03-31 11:38 ` RAID5 XOR speed vs RAID6 Q speed (was Re: AVX RAID5 xor checksumming) John Robinson
0 siblings, 2 replies; 12+ messages in thread
From: Jim Kukunas @ 2012-03-29 21:44 UTC (permalink / raw)
To: linux-raid; +Cc: hpa, neilb
Hi Folks,
The following patch adds an AVX implementation of the RAID5 xor checksumming
functions.
Based on xor_speed, the AVX implementation appears to be ~32% faster than the
SSE implementation on my i7 2600:
generic_sse: 15088.000 MB/sec
avx: 19936.000 MB/sec
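For illustration, the heart of the AVX path is just 256-bit loads and xors, one
YMM register (32 bytes) at a time. Below is a minimal user-space sketch of the
same idea, using AVX intrinsics instead of the kernel's inline assembly and FPU
save/restore; the function name and the 32-byte-alignment assumption are mine,
not part of the patch.

#include <immintrin.h>
#include <stddef.h>

/* XOR src into dst, 32 bytes per step. Assumes both buffers are 32-byte
 * aligned and bytes is a multiple of 32. The float form (vxorps via
 * _mm256_xor_ps) is used because AVX1 only provides the floating-point
 * 256-bit xor; the bit pattern is unaffected. */
static void xor_avx_sketch(size_t bytes, void *dst, const void *src)
{
	char *d = dst;
	const char *s = src;
	size_t i;

	for (i = 0; i < bytes; i += 32) {
		__m256 a = _mm256_load_ps((const float *)(d + i));
		__m256 b = _mm256_load_ps((const float *)(s + i));
		_mm256_store_ps((float *)(d + i), _mm256_xor_ps(a, b));
	}
}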
Thanks.
* [PATCH] raid5: add AVX optimized RAID5 checksumming
2012-03-29 21:44 AVX RAID5 xor checksumming Jim Kukunas
@ 2012-03-29 21:44 ` Jim Kukunas
2012-03-31 11:38 ` RAID5 XOR speed vs RAID6 Q speed (was Re: AVX RAID5 xor checksumming) John Robinson
1 sibling, 0 replies; 12+ messages in thread
From: Jim Kukunas @ 2012-03-29 21:44 UTC (permalink / raw)
To: linux-raid; +Cc: hpa, neilb
Optimize RAID5 xor checksumming by taking advantage of
256-bit YMM registers introduced in AVX.
Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>
Reviewed-by: H. Peter Anvin <hpa@zytor.com>
---
arch/x86/include/asm/xor_32.h | 8 ++-
arch/x86/include/asm/xor_64.h | 10 ++-
arch/x86/include/asm/xor_avx.h | 184 ++++++++++++++++++++++++++++++++++++++++
3 files changed, 200 insertions(+), 2 deletions(-)
create mode 100644 arch/x86/include/asm/xor_avx.h
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index 133b40a..1799baa 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -861,6 +861,9 @@ static struct xor_block_template xor_block_pIII_sse = {
.do_5 = xor_sse_5,
};
+/* Also try the AVX routines */
+#include "xor_avx.h"
+
/* Also try the generic routines. */
#include <asm-generic/xor.h>
@@ -871,6 +874,8 @@ do { \
xor_speed(&xor_block_8regs_p); \
xor_speed(&xor_block_32regs); \
xor_speed(&xor_block_32regs_p); \
+ if (cpu_has_avx) \
+ xor_speed(&xor_block_avx); \
if (cpu_has_xmm) \
xor_speed(&xor_block_pIII_sse); \
if (cpu_has_mmx) { \
@@ -883,6 +888,7 @@ do { \
We may also be able to load into the L1 only depending on how the cpu
deals with a load to a line that is being prefetched. */
#define XOR_SELECT_TEMPLATE(FASTEST) \
- (cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
+ (cpu_has_avx ? &xor_block_avx : \
+ cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
#endif /* _ASM_X86_XOR_32_H */
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
index 1549b5e..d331b41 100644
--- a/arch/x86/include/asm/xor_64.h
+++ b/arch/x86/include/asm/xor_64.h
@@ -347,15 +347,23 @@ static struct xor_block_template xor_block_sse = {
.do_5 = xor_sse_5,
};
+
+/* Also try the AVX routines */
+#include "xor_avx.h"
+
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES \
do { \
+ if (cpu_has_avx) \
+ xor_speed(&xor_block_avx); \
xor_speed(&xor_block_sse); \
} while (0)
/* We force the use of the SSE xor block because it can write around L2.
We may also be able to load into the L1 only depending on how the cpu
deals with a load to a line that is being prefetched. */
-#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
+#define XOR_SELECT_TEMPLATE(FASTEST) \
+ (cpu_has_avx ? &xor_block_avx : \
+ &xor_block_sse)
#endif /* _ASM_X86_XOR_64_H */
diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h
new file mode 100644
index 0000000..dda165b
--- /dev/null
+++ b/arch/x86/include/asm/xor_avx.h
@@ -0,0 +1,184 @@
+#ifndef _ASM_X86_XOR_AVX_H
+#define _ASM_X86_XOR_AVX_H
+
+/*
+ * Optimized RAID-5 checksumming functions for AVX
+ *
+ * Copyright (C) 2012 Intel Corporation
+ * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
+ *
+ * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <asm/i387.h>
+
+#define ALIGN32 __attribute__((aligned(32)))
+
+#define YMM_SAVED_REGS 4
+
+#define YMMS_SAVE \
+do { \
+ preempt_disable(); \
+ cr0 = read_cr0(); \
+ clts(); \
+ asm volatile("vmovaps %%ymm0, %0" : "=m" (ymm_save[0]) : : "memory"); \
+ asm volatile("vmovaps %%ymm1, %0" : "=m" (ymm_save[32]) : : "memory"); \
+ asm volatile("vmovaps %%ymm2, %0" : "=m" (ymm_save[64]) : : "memory"); \
+ asm volatile("vmovaps %%ymm3, %0" : "=m" (ymm_save[96]) : : "memory"); \
+} while (0);
+
+#define YMMS_RESTORE \
+do { \
+ asm volatile("sfence" : : : "memory"); \
+ asm volatile("vmovaps %0, %%ymm3" : : "m" (ymm_save[96])); \
+ asm volatile("vmovaps %0, %%ymm2" : : "m" (ymm_save[64])); \
+ asm volatile("vmovaps %0, %%ymm1" : : "m" (ymm_save[32])); \
+ asm volatile("vmovaps %0, %%ymm0" : : "m" (ymm_save[0])); \
+ write_cr0(cr0); \
+ preempt_enable(); \
+} while (0);
+
+#define BLOCK4(i) \
+ BLOCK(32 * i, 0) \
+ BLOCK(32 * (i + 1), 1) \
+ BLOCK(32 * (i + 2), 2) \
+ BLOCK(32 * (i + 3), 3)
+
+#define BLOCK16() \
+ BLOCK4(0) \
+ BLOCK4(4) \
+ BLOCK4(8) \
+ BLOCK4(12)
+
+static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
+{
+ unsigned long cr0, lines = bytes >> 9;
+ char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+ YMMS_SAVE
+
+ while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+ asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p0[i / sizeof(*p0)])); \
+ asm volatile("vmovdqa %%ymm" #reg ", %0" : "=m" (p0[i / sizeof(*p0)]));
+
+ BLOCK16()
+
+ p0 = (unsigned long *)((uintptr_t)p0 + 512);
+ p1 = (unsigned long *)((uintptr_t)p1 + 512);
+ }
+
+ YMMS_RESTORE
+}
+
+static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+ unsigned long *p2)
+{
+ unsigned long cr0, lines = bytes >> 9;
+ char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+ YMMS_SAVE
+
+ while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+ asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p1[i / sizeof(*p1)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p0[i / sizeof(*p0)])); \
+ asm volatile("vmovdqa %%ymm" #reg ", %0" : "=m" (p0[i / sizeof(*p0)]));
+
+ BLOCK16()
+
+ p0 = (unsigned long *)((uintptr_t)p0 + 512);
+ p1 = (unsigned long *)((uintptr_t)p1 + 512);
+ p2 = (unsigned long *)((uintptr_t)p2 + 512);
+ }
+
+ YMMS_RESTORE
+}
+
+static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+ unsigned long *p2, unsigned long *p3)
+{
+ unsigned long cr0, lines = bytes >> 9;
+ char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+ YMMS_SAVE
+
+ while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+ asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p2[i / sizeof(*p2)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p1[i / sizeof(*p1)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p0[i / sizeof(*p0)])); \
+ asm volatile("vmovdqa %%ymm" #reg ", %0" : "=m" (p0[i / sizeof(*p0)]));
+
+ BLOCK16();
+
+ p0 = (unsigned long *)((uintptr_t)p0 + 512);
+ p1 = (unsigned long *)((uintptr_t)p1 + 512);
+ p2 = (unsigned long *)((uintptr_t)p2 + 512);
+ p3 = (unsigned long *)((uintptr_t)p3 + 512);
+ }
+
+ YMMS_RESTORE
+}
+
+static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+ unsigned long *p2, unsigned long *p3, unsigned long *p4)
+{
+ unsigned long cr0, lines = bytes >> 9;
+ char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+ YMMS_SAVE
+
+ while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+ asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p3[i / sizeof(*p3)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p2[i / sizeof(*p2)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p1[i / sizeof(*p1)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p0[i / sizeof(*p0)])); \
+ asm volatile("vmovdqa %%ymm" #reg ", %0" : "=m" (p0[i / sizeof(*p0)]));
+
+ BLOCK16()
+
+ p0 = (unsigned long *)((uintptr_t)p0 + 512);
+ p1 = (unsigned long *)((uintptr_t)p1 + 512);
+ p2 = (unsigned long *)((uintptr_t)p2 + 512);
+ p3 = (unsigned long *)((uintptr_t)p3 + 512);
+ p4 = (unsigned long *)((uintptr_t)p4 + 512);
+ }
+
+ YMMS_RESTORE
+}
+
+static struct xor_block_template xor_block_avx = {
+ .name = "avx",
+ .do_2 = xor_avx_2,
+ .do_3 = xor_avx_3,
+ .do_4 = xor_avx_4,
+ .do_5 = xor_avx_5,
+};
+
+#endif
+
--
1.7.8.5
* RAID5 XOR speed vs RAID6 Q speed (was Re: AVX RAID5 xor checksumming)
2012-03-29 21:44 AVX RAID5 xor checksumming Jim Kukunas
2012-03-29 21:44 ` [PATCH] raid5: add AVX optimized RAID5 checksumming Jim Kukunas
@ 2012-03-31 11:38 ` John Robinson
2012-04-02 0:01 ` H. Peter Anvin
2012-04-02 22:48 ` Jim Kukunas
1 sibling, 2 replies; 12+ messages in thread
From: John Robinson @ 2012-03-31 11:38 UTC (permalink / raw)
To: linux-raid
On 29/03/2012 22:44, Jim Kukunas wrote:
> Based on xor_speed, the AVX implementation appears to be ~32% faster than the
> SSE implementation on my i7 2600:
>
> generic_sse: 15088.000 MB/sec
> avx: 19936.000 MB/sec
I just noticed in my logs the other day (recent el5 kernel on a Core 2):
raid5: automatically using best checksumming function: generic_sse
generic_sse: 7805.000 MB/sec
raid5: using function: generic_sse (7805.000 MB/sec)
raid6: int64x1 2635 MB/s
raid6: int64x2 3208 MB/s
raid6: int64x4 3020 MB/s
raid6: int64x8 2519 MB/s
raid6: sse2x1 5099 MB/s
raid6: sse2x2 5742 MB/s
raid6: sse2x4 8237 MB/s
raid6: using algorithm sse2x4 (8237 MB/s)
I was just wondering how it's possible to do the RAID6 Q calculation
faster than the RAID5 XOR calculation - or am I reading this log excerpt
wrongly?
It's probably academic, since the machine this is running on only has a
maximum of about 4500 MB/s of memory throughput, and a lot of that would
be consumed sending data to disc in amongst the calculations being done.
Cheers,
John.
* Re: RAID5 XOR speed vs RAID6 Q speed (was Re: AVX RAID5 xor checksumming)
2012-03-31 11:38 ` RAID5 XOR speed vs RAID6 Q speed (was Re: AVX RAID5 xor checksumming) John Robinson
@ 2012-04-02 0:01 ` H. Peter Anvin
2012-04-02 22:48 ` Jim Kukunas
1 sibling, 0 replies; 12+ messages in thread
From: H. Peter Anvin @ 2012-04-02 0:01 UTC (permalink / raw)
To: John Robinson; +Cc: linux-raid
On 03/31/2012 04:38 AM, John Robinson wrote:
> On 29/03/2012 22:44, Jim Kukunas wrote:
>> Based on xor_speed, the AVX implementation appears to be ~32% faster
>> than the
>> SSE implementation on my i7 2600:
>>
>> generic_sse: 15088.000 MB/sec
>> avx: 19936.000 MB/sec
>
> I just noticed in my logs the other day (recent el5 kernel on a Core 2):
>
> raid5: automatically using best checksumming function: generic_sse
> generic_sse: 7805.000 MB/sec
> raid5: using function: generic_sse (7805.000 MB/sec)
> raid6: int64x1 2635 MB/s
> raid6: int64x2 3208 MB/s
> raid6: int64x4 3020 MB/s
> raid6: int64x8 2519 MB/s
> raid6: sse2x1 5099 MB/s
> raid6: sse2x2 5742 MB/s
> raid6: sse2x4 8237 MB/s
> raid6: using algorithm sse2x4 (8237 MB/s)
>
> I was just wondering how it's possible to do the RAID6 Q calculation
> faster than the RAID5 XOR calculation - or am I reading this log excerpt
> wrongly?
>
> It's probably academic, since the machine this is running on only has a
> maximum of about 4500 MB/s of memory throughput, and a lot of that would
> be consumed sending data to disc in amongst the calculations being done.
>
It *might* be a result of how these different algorithms are
benchmarked, but yes, that really looks a bit odd, especially since the
RAID6 code *also* computes the XOR checksum (it does P and Q in parallel
since it has to read the data anyway).
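To make the comparison concrete, here is a conceptual scalar model of the
combined P/Q pass described above: P is a plain xor of the data, while Q folds
in a GF(2^8) multiply-by-2 per data disk (Horner's scheme). The names and
structure are illustrative only; the kernel's lib/raid6 code performs the same
computation on wider SIMD registers.

#include <stdint.h>
#include <stddef.h>

/* Multiply each byte of a 64-bit word by 2 in GF(2^8), polynomial 0x11d,
 * using the byte-parallel mask trick. */
static inline uint64_t gf2_mul2(uint64_t v)
{
	uint64_t shifted = (v << 1) & 0xfefefefefefefefeULL;
	uint64_t mask = v & 0x8080808080808080ULL;

	mask = (mask << 1) - (mask >> 7);	/* 0xff where a byte overflowed */
	return shifted ^ (mask & 0x1d1d1d1d1d1d1d1dULL);
}

/* One pass over ndisks data blocks computes both P (xor parity) and Q
 * (the Reed-Solomon syndrome), so the Q benchmark also does the xor work. */
static void pq_syndrome(int ndisks, size_t bytes, uint64_t **data,
			uint64_t *p, uint64_t *q)
{
	size_t w;
	int z;

	for (w = 0; w < bytes / sizeof(uint64_t); w++) {
		uint64_t wp = data[ndisks - 1][w];	/* highest data disk first */
		uint64_t wq = wp;

		for (z = ndisks - 2; z >= 0; z--) {
			uint64_t wd = data[z][w];

			wp ^= wd;			/* P: plain xor */
			wq = gf2_mul2(wq) ^ wd;		/* Q: *2 in GF(2^8), then xor */
		}
		p[w] = wp;
		q[w] = wq;
	}
}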
-hpa
--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.
* Re: RAID5 XOR speed vs RAID6 Q speed (was Re: AVX RAID5 xor checksumming)
2012-03-31 11:38 ` RAID5 XOR speed vs RAID6 Q speed (was Re: AVX RAID5 xor checksumming) John Robinson
2012-04-02 0:01 ` H. Peter Anvin
@ 2012-04-02 22:48 ` Jim Kukunas
2012-04-03 10:23 ` John Robinson
1 sibling, 1 reply; 12+ messages in thread
From: Jim Kukunas @ 2012-04-02 22:48 UTC (permalink / raw)
To: linux-raid
On Sat, Mar 31, 2012 at 12:38:56PM +0100, John Robinson wrote:
> On 29/03/2012 22:44, Jim Kukunas wrote:
> > Based on xor_speed, the AVX implementation appears to be ~32% faster than the
> > SSE implementation on my i7 2600:
> >
> > generic_sse: 15088.000 MB/sec
> > avx: 19936.000 MB/sec
>
> I just noticed in my logs the other day (recent el5 kernel on a Core 2):
>
> raid5: automatically using best checksumming function: generic_sse
> generic_sse: 7805.000 MB/sec
> raid5: using function: generic_sse (7805.000 MB/sec)
> raid6: int64x1 2635 MB/s
> raid6: int64x2 3208 MB/s
> raid6: int64x4 3020 MB/s
> raid6: int64x8 2519 MB/s
> raid6: sse2x1 5099 MB/s
> raid6: sse2x2 5742 MB/s
> raid6: sse2x4 8237 MB/s
> raid6: using algorithm sse2x4 (8237 MB/s)
>
> I was just wondering how it's possible to do the RAID6 Q calculation
> faster than the RAID5 XOR calculation - or am I reading this log excerpt
> wrongly?
Out of curiosity, are you running with CONFIG_PREEMPT=y?
Thanks.
>
> It's probably academic, since the machine this is running on only has a
> maximum of about 4500 MB/s of memory throughput, and a lot of that would
> be consumed sending data to disc in amongst the calculations being done.
>
> Cheers,
>
> John.
>
--
Jim Kukunas
Intel Open Source Technology Center
* Re: RAID5 XOR speed vs RAID6 Q speed (was Re: AVX RAID5 xor checksumming)
2012-04-02 22:48 ` Jim Kukunas
@ 2012-04-03 10:23 ` John Robinson
2012-04-03 23:56 ` Jim Kukunas
0 siblings, 1 reply; 12+ messages in thread
From: John Robinson @ 2012-04-03 10:23 UTC (permalink / raw)
To: linux-raid
On 02/04/2012 23:48, Jim Kukunas wrote:
> On Sat, Mar 31, 2012 at 12:38:56PM +0100, John Robinson wrote:
[...]
>> I just noticed in my logs the other day (recent el5 kernel on a Core 2):
>>
>> raid5: automatically using best checksumming function: generic_sse
>> generic_sse: 7805.000 MB/sec
>> raid5: using function: generic_sse (7805.000 MB/sec)
[...]
>> raid6: using algorithm sse2x4 (8237 MB/s)
>>
>> I was just wondering how it's possible to do the RAID6 Q calculation
>> faster than the RAID5 XOR calculation - or am I reading this log excerpt
>> wrongly?
>
> Out of curiosity, are you running with CONFIG_PREEMPT=y?
No. Here's an excerpt from my .config:
# CONFIG_PREEMPT_NONE is not set
CONFIG_PREEMPT_VOLUNTARY=y
# CONFIG_PREEMPT is not set
CONFIG_PREEMPT_BKL=y
CONFIG_PREEMPT_NOTIFIERS=y
But this is a Xen dom0 kernel, 2.6.18-308.1.1.el5.centos.plusxen. Now, a
non-Xen kernel (2.6.18-308.1.1.el5) says:
raid5: automatically using best checksumming function: generic_sse
generic_sse: 11892.000 MB/sec
raid5: using function: generic_sse (11892.000 MB/sec)
raid6: int64x1 2644 MB/s
raid6: int64x2 3238 MB/s
raid6: int64x4 3011 MB/s
raid6: int64x8 2503 MB/s
raid6: sse2x1 5375 MB/s
raid6: sse2x2 5851 MB/s
raid6: sse2x4 9136 MB/s
raid6: using algorithm sse2x4 (9136 MB/s)
Looks like it loses a chunk of performance running as a Xen dom0.
Even still, 11892 MB/s for XOR vs 9136 MB/s for XOR+Q - it still seems
remarkable that the XOR can't be done several times faster than the Q.
Cheers,
John.
* Re: RAID5 XOR speed vs RAID6 Q speed (was Re: AVX RAID5 xor checksumming)
2012-04-03 10:23 ` John Robinson
@ 2012-04-03 23:56 ` Jim Kukunas
2012-04-03 23:56 ` [PATCH 1/2] crypto: wait for a full jiffy in do_xor_speed Jim Kukunas
` (2 more replies)
0 siblings, 3 replies; 12+ messages in thread
From: Jim Kukunas @ 2012-04-03 23:56 UTC (permalink / raw)
To: linux-raid; +Cc: linux-crypto
On Tue, Apr 03, 2012 at 11:23:16AM +0100, John Robinson wrote:
> On 02/04/2012 23:48, Jim Kukunas wrote:
> > On Sat, Mar 31, 2012 at 12:38:56PM +0100, John Robinson wrote:
> [...]
> >> I just noticed in my logs the other day (recent el5 kernel on a Core 2):
> >>
> >> raid5: automatically using best checksumming function: generic_sse
> >> generic_sse: 7805.000 MB/sec
> >> raid5: using function: generic_sse (7805.000 MB/sec)
> [...]
> >> raid6: using algorithm sse2x4 (8237 MB/s)
> >>
> >> I was just wondering how it's possible to do the RAID6 Q calculation
> >> faster than the RAID5 XOR calculation - or am I reading this log excerpt
> >> wrongly?
> >
> > Out of curiosity, are you running with CONFIG_PREEMPT=y?
>
> No. Here's an excerpt from my .config:
>
> # CONFIG_PREEMPT_NONE is not set
> CONFIG_PREEMPT_VOLUNTARY=y
> # CONFIG_PREEMPT is not set
> CONFIG_PREEMPT_BKL=y
> CONFIG_PREEMPT_NOTIFIERS=y
>
> But this is a Xen dom0 kernel, 2.6.18-308.1.1.el5.centos.plusxen. Now, a
> non-Xen kernel (2.6.18-308.1.1.el5) says:
> raid5: automatically using best checksumming function: generic_sse
> generic_sse: 11892.000 MB/sec
> raid5: using function: generic_sse (11892.000 MB/sec)
> raid6: int64x1 2644 MB/s
> raid6: int64x2 3238 MB/s
> raid6: int64x4 3011 MB/s
> raid6: int64x8 2503 MB/s
> raid6: sse2x1 5375 MB/s
> raid6: sse2x2 5851 MB/s
> raid6: sse2x4 9136 MB/s
> raid6: using algorithm sse2x4 (9136 MB/s)
>
> Looks like it loses a chunk of performance running as a Xen dom0.
>
> Even still, 11892 MB/s for XOR vs 9136 MB/s for XOR+Q - it still seems
> remarkable that the XOR can't be done several times faster than the Q.
Taking a look at do_xor_speed, I see two issues which might be the cause
of the disparity you reported.
0) In the RAID5 xor benchmark, we get the current jiffy, then run do_2() until
the jiffy increments. This means we could potentially be testing for less
than a full jiffy. The RAID6 benchmark handles this by obtaining the current
jiffy, then calling cpu_relax() until the jiffy increments, and then running
the test. This is addressed by my first patch.
1) The only way I could reproduce your findings of a higher throughput for
RAID6 than for RAID5 xor checksumming was with CONFIG_PREEMPT=y. It seems
that you encountered this while running as XEN dom0. Currently, we disable
preemption during the RAID6 benchmark, but don't in the RAID5 benchmark.
This is addressed by my second patch.
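For reference, a rough sketch of what the timing loop looks like with the first
patch applied, reconstructed from the diffs below; the surrounding
do_xor_speed() variables (max, count, j, now, tmpl, b1, b2) are taken from that
context:

	max = 0;
	for (i = 0; i < 5; i++) {
		j = jiffies;
		count = 0;
		while ((now = jiffies) == j)		/* sync to a jiffy boundary */
			cpu_relax();
		while (time_before(jiffies, now + 1)) {	/* measure one whole jiffy */
			mb();
			tmpl->do_2(BENCH_SIZE, b1, b2);
			mb();
			count++;
			mb();
		}
		if (count > max)
			max = count;
	}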
I've added linux-crypto to the discussion as both of these patches affect
code in crypto/
Thanks.
* [PATCH 1/2] crypto: wait for a full jiffy in do_xor_speed
2012-04-03 23:56 ` Jim Kukunas
@ 2012-04-03 23:56 ` Jim Kukunas
2012-04-03 23:56 ` [PATCH 2/2] crypto: disable preemption while benchmarking RAID5 xor checksumming Jim Kukunas
2012-04-06 20:43 ` RAID5 XOR speed vs RAID6 Q speed (was Re: AVX RAID5 xor checksumming) Dan Williams
2 siblings, 0 replies; 12+ messages in thread
From: Jim Kukunas @ 2012-04-03 23:56 UTC (permalink / raw)
To: linux-raid; +Cc: linux-crypto
In the existing do_xor_speed(), there is no guarantee that we actually
run do_2() for a full jiffy. We get the current jiffy, then run do_2()
until the next jiffy.
Instead, let's get the current jiffy, then wait until the next jiffy
to start our test.
Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>
---
crypto/xor.c | 8 +++++---
1 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/crypto/xor.c b/crypto/xor.c
index b75182d..8788443 100644
--- a/crypto/xor.c
+++ b/crypto/xor.c
@@ -63,7 +63,7 @@ static void
do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
{
int speed;
- unsigned long now;
+ unsigned long now, j;
int i, count, max;
tmpl->next = template_list;
@@ -76,9 +76,11 @@ do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
*/
max = 0;
for (i = 0; i < 5; i++) {
- now = jiffies;
+ j = jiffies;
count = 0;
- while (jiffies == now) {
+ while ((now = jiffies) == j)
+ cpu_relax();
+ while (time_before(jiffies, now + 1)) {
mb(); /* prevent loop optimzation */
tmpl->do_2(BENCH_SIZE, b1, b2);
mb();
--
1.7.8.5
* [PATCH 2/2] crypto: disable preemption while benchmarking RAID5 xor checksumming
2012-04-03 23:56 ` Jim Kukunas
2012-04-03 23:56 ` [PATCH 1/2] crypto: wait for a full jiffy in do_xor_speed Jim Kukunas
@ 2012-04-03 23:56 ` Jim Kukunas
2012-04-06 20:43 ` RAID5 XOR speed vs RAID6 Q speed (was Re: AVX RAID5 xor checksumming) Dan Williams
2 siblings, 0 replies; 12+ messages in thread
From: Jim Kukunas @ 2012-04-03 23:56 UTC (permalink / raw)
To: linux-raid; +Cc: linux-crypto
With CONFIG_PREEMPT=y, we need to disable preemption while benchmarking
RAID5 xor checksumming to ensure we're actually measuring what we think
we're measuring.
Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>
---
crypto/xor.c | 5 +++++
1 files changed, 5 insertions(+), 0 deletions(-)
diff --git a/crypto/xor.c b/crypto/xor.c
index 8788443..84daa11 100644
--- a/crypto/xor.c
+++ b/crypto/xor.c
@@ -21,6 +21,7 @@
#include <linux/gfp.h>
#include <linux/raid/xor.h>
#include <linux/jiffies.h>
+#include <linux/preempt.h>
#include <asm/xor.h>
/* The xor routines to use. */
@@ -69,6 +70,8 @@ do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
tmpl->next = template_list;
template_list = tmpl;
+ preempt_disable();
+
/*
* Count the number of XORs done during a whole jiffy, and use
* this to calculate the speed of checksumming. We use a 2-page
@@ -91,6 +94,8 @@ do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
max = count;
}
+ preempt_enable();
+
speed = max * (HZ * BENCH_SIZE / 1024);
tmpl->speed = speed;
--
1.7.8.5
* Re: RAID5 XOR speed vs RAID6 Q speed (was Re: AVX RAID5 xor checksumming)
2012-04-03 23:56 ` Jim Kukunas
2012-04-03 23:56 ` [PATCH 1/2] crypto: wait for a full jiffy in do_xor_speed Jim Kukunas
2012-04-03 23:56 ` [PATCH 2/2] crypto: disable preemption while benchmarking RAID5 xor checksumming Jim Kukunas
@ 2012-04-06 20:43 ` Dan Williams
2012-04-17 15:32 ` Boaz Harrosh
2 siblings, 1 reply; 12+ messages in thread
From: Dan Williams @ 2012-04-06 20:43 UTC (permalink / raw)
To: Jim Kukunas; +Cc: linux-raid, linux-crypto, bharrosh
[adding Boaz since he also made an attempt at fixing this]
http://marc.info/?l=linux-crypto-vger&m=131829241111450&w=2
...I had meant to follow up on this, but was buried in 'isci' issues.
On Tue, Apr 3, 2012 at 4:56 PM, Jim Kukunas
<james.t.kukunas@linux.intel.com> wrote:
> On Tue, Apr 03, 2012 at 11:23:16AM +0100, John Robinson wrote:
>> On 02/04/2012 23:48, Jim Kukunas wrote:
>> > On Sat, Mar 31, 2012 at 12:38:56PM +0100, John Robinson wrote:
>> [...]
>> >> I just noticed in my logs the other day (recent el5 kernel on a Core 2):
>> >>
>> >> raid5: automatically using best checksumming function: generic_sse
>> >> generic_sse: 7805.000 MB/sec
>> >> raid5: using function: generic_sse (7805.000 MB/sec)
>> [...]
>> >> raid6: using algorithm sse2x4 (8237 MB/s)
>> >>
>> >> I was just wondering how it's possible to do the RAID6 Q calculation
>> >> faster than the RAID5 XOR calculation - or am I reading this log excerpt
>> >> wrongly?
>> >
>> > Out of curiosity, are you running with CONFIG_PREEMPT=y?
>>
>> No. Here's an excerpt from my .config:
>>
>> # CONFIG_PREEMPT_NONE is not set
>> CONFIG_PREEMPT_VOLUNTARY=y
>> # CONFIG_PREEMPT is not set
>> CONFIG_PREEMPT_BKL=y
>> CONFIG_PREEMPT_NOTIFIERS=y
>>
>> But this is a Xen dom0 kernel, 2.6.18-308.1.1.el5.centos.plusxen. Now, a
>> non-Xen kernel (2.6.18-308.1.1.el5) says:
>> raid5: automatically using best checksumming function: generic_sse
>> generic_sse: 11892.000 MB/sec
>> raid5: using function: generic_sse (11892.000 MB/sec)
>> raid6: int64x1 2644 MB/s
>> raid6: int64x2 3238 MB/s
>> raid6: int64x4 3011 MB/s
>> raid6: int64x8 2503 MB/s
>> raid6: sse2x1 5375 MB/s
>> raid6: sse2x2 5851 MB/s
>> raid6: sse2x4 9136 MB/s
>> raid6: using algorithm sse2x4 (9136 MB/s)
>>
>> Looks like it loses a chunk of performance running as a Xen dom0.
>>
>> Even still, 11892 MB/s for XOR vs 9136 MB/s for XOR+Q - it still seems
>> remarkable that the XOR can't be done several times faster than the Q.
>
> Taking a look at do_xor_speed, I see two issues which might be the cause
> of the disparity you reported.
>
> 0) In the RAID5 xor benchmark, we get the current jiffy, then run do_2() until
> the jiffy increments. This means we could potentially be testing for less
> than a full jiffy. The RAID6 benchmark handles this by obtaining the current
> jiffy, then calling cpu_relax() until the jiffy increments, and then running
> the test. This is addressed by my first patch.
>
> 1) The only way I could reproduce your findings of a higher throughput for
> RAID6 than for RAID5 xor checksumming was with CONFIG_PREEMPT=y. It seems
> that you encountered this while running as XEN dom0. Currently, we disable
> preemption during the RAID6 benchmark, but don't in the RAID5 benchmark.
> This is addressed by my second patch.
>
> I've added linux-crypto to the discussion as both of these patches affect
> code in crypto/
>
> Thanks.
>
>
* Re: RAID5 XOR speed vs RAID6 Q speed (was Re: AVX RAID5 xor checksumming)
2012-04-06 20:43 ` RAID5 XOR speed vs RAID6 Q speed (was Re: AVX RAID5 xor checksumming) Dan Williams
@ 2012-04-17 15:32 ` Boaz Harrosh
0 siblings, 0 replies; 12+ messages in thread
From: Boaz Harrosh @ 2012-04-17 15:32 UTC (permalink / raw)
To: Dan Williams; +Cc: Jim Kukunas, linux-raid, linux-crypto
On 04/06/2012 11:43 PM, Dan Williams wrote:
> [adding Boaz since he also made an attempt at fixing this]
>
> http://marc.info/?l=linux-crypto-vger&m=131829241111450&w=2
>
> ...I had meant to follow up on this, but was buried in 'isci' issues.
>
>
Sorry, I was traveling.
Yes, I have an old fix for this, which I need to clean up and retest.
My original problem was a hang in UML, but I noticed the timing problems
as well.
Please give me until the end of the week to settle in and come up to speed.
[Current patch: http://marc.info/?l=linux-crypto-vger&m=131829242311458&w=2]
Thanks
Boaz
> On Tue, Apr 3, 2012 at 4:56 PM, Jim Kukunas
> <james.t.kukunas@linux.intel.com> wrote:
>> On Tue, Apr 03, 2012 at 11:23:16AM +0100, John Robinson wrote:
>>> On 02/04/2012 23:48, Jim Kukunas wrote:
>>>> On Sat, Mar 31, 2012 at 12:38:56PM +0100, John Robinson wrote:
>>> [...]
>>>>> I just noticed in my logs the other day (recent el5 kernel on a Core 2):
>>>>>
>>>>> raid5: automatically using best checksumming function: generic_sse
>>>>> generic_sse: 7805.000 MB/sec
>>>>> raid5: using function: generic_sse (7805.000 MB/sec)
>>> [...]
>>>>> raid6: using algorithm sse2x4 (8237 MB/s)
>>>>>
>>>>> I was just wondering how it's possible to do the RAID6 Q calculation
>>>>> faster than the RAID5 XOR calculation - or am I reading this log excerpt
>>>>> wrongly?
>>>>
>>>> Out of curiosity, are you running with CONFIG_PREEMPT=y?
>>>
>>> No. Here's an excerpt from my .config:
>>>
>>> # CONFIG_PREEMPT_NONE is not set
>>> CONFIG_PREEMPT_VOLUNTARY=y
>>> # CONFIG_PREEMPT is not set
>>> CONFIG_PREEMPT_BKL=y
>>> CONFIG_PREEMPT_NOTIFIERS=y
>>>
>>> But this is a Xen dom0 kernel, 2.6.18-308.1.1.el5.centos.plusxen. Now, a
>>> non-Xen kernel (2.6.18-308.1.1.el5) says:
>>> raid5: automatically using best checksumming function: generic_sse
>>> generic_sse: 11892.000 MB/sec
>>> raid5: using function: generic_sse (11892.000 MB/sec)
>>> raid6: int64x1 2644 MB/s
>>> raid6: int64x2 3238 MB/s
>>> raid6: int64x4 3011 MB/s
>>> raid6: int64x8 2503 MB/s
>>> raid6: sse2x1 5375 MB/s
>>> raid6: sse2x2 5851 MB/s
>>> raid6: sse2x4 9136 MB/s
>>> raid6: using algorithm sse2x4 (9136 MB/s)
>>>
>>> Looks like it loses a chunk of performance running as a Xen dom0.
>>>
>>> Even still, 11892 MB/s for XOR vs 9136 MB/s for XOR+Q - it still seems
>>> remarkable that the XOR can't be done several times faster than the Q.
>>
>> Taking a look at do_xor_speed, I see two issues which might be the cause
>> of the disparity you reported.
>>
>> 0) In the RAID5 xor benchmark, we get the current jiffy, then run do_2() until
>> the jiffy increments. This means we could potentially be testing for less
>> than a full jiffy. The RAID6 benchmark handles this by obtaining the current
>> jiffy, then calling cpu_relax() until the jiffy increments, and then running
>> the test. This is addressed by my first patch.
>>
>> 1) The only way I could reproduce your findings of a higher throughput for
>> RAID6 than for RAID5 xor checksumming was with CONFIG_PREEMPT=y. It seems
>> that you encountered this while running as XEN dom0. Currently, we disable
>> preemption during the RAID6 benchmark, but don't in the RAID5 benchmark.
>> This is addressed by my second patch.
>>
>> I've added linux-crypto to the discussion as both of these patches affect
>> code in crypto/
>>
>> Thanks.
>>
>>
* [PATCH] raid5: add AVX optimized RAID5 checksumming
2012-04-18 22:58 arch/86: AVX RAID5 xor checksumming v1 Jim Kukunas
@ 2012-04-18 22:58 ` Jim Kukunas
0 siblings, 0 replies; 12+ messages in thread
From: Jim Kukunas @ 2012-04-18 22:58 UTC (permalink / raw)
To: neilb; +Cc: hpa, linux-kernel, linux-raid
Optimize RAID5 xor checksumming by taking advantage of
256-bit YMM registers introduced in AVX.
Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>
---
arch/x86/Makefile | 5 +-
arch/x86/include/asm/xor_32.h | 6 +-
arch/x86/include/asm/xor_64.h | 8 ++-
arch/x86/include/asm/xor_avx.h | 215 ++++++++++++++++++++++++++++++++++++++++
4 files changed, 230 insertions(+), 4 deletions(-)
create mode 100644 arch/x86/include/asm/xor_avx.h
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 209ba12..803c76d 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -99,9 +99,10 @@ cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTI
# does binutils support specific instructions?
asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
+avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
-KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
-KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
+KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr)
+KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr)
LDFLAGS := -m elf_$(UTS_MACHINE)
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index 133b40a..4545708 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -861,6 +861,9 @@ static struct xor_block_template xor_block_pIII_sse = {
.do_5 = xor_sse_5,
};
+/* Also try the AVX routines */
+#include "xor_avx.h"
+
/* Also try the generic routines. */
#include <asm-generic/xor.h>
@@ -871,6 +874,7 @@ do { \
xor_speed(&xor_block_8regs_p); \
xor_speed(&xor_block_32regs); \
xor_speed(&xor_block_32regs_p); \
+ AVX_XOR_SPEED; \
if (cpu_has_xmm) \
xor_speed(&xor_block_pIII_sse); \
if (cpu_has_mmx) { \
@@ -883,6 +887,6 @@ do { \
We may also be able to load into the L1 only depending on how the cpu
deals with a load to a line that is being prefetched. */
#define XOR_SELECT_TEMPLATE(FASTEST) \
- (cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
+ AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
#endif /* _ASM_X86_XOR_32_H */
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
index 1549b5e..b9b2323 100644
--- a/arch/x86/include/asm/xor_64.h
+++ b/arch/x86/include/asm/xor_64.h
@@ -347,15 +347,21 @@ static struct xor_block_template xor_block_sse = {
.do_5 = xor_sse_5,
};
+
+/* Also try the AVX routines */
+#include "xor_avx.h"
+
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES \
do { \
+ AVX_XOR_SPEED; \
xor_speed(&xor_block_sse); \
} while (0)
/* We force the use of the SSE xor block because it can write around L2.
We may also be able to load into the L1 only depending on how the cpu
deals with a load to a line that is being prefetched. */
-#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
+#define XOR_SELECT_TEMPLATE(FASTEST) \
+ AVX_SELECT(&xor_block_sse)
#endif /* _ASM_X86_XOR_64_H */
diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h
new file mode 100644
index 0000000..a2c9aa5
--- /dev/null
+++ b/arch/x86/include/asm/xor_avx.h
@@ -0,0 +1,215 @@
+#ifndef _ASM_X86_XOR_AVX_H
+#define _ASM_X86_XOR_AVX_H
+
+/*
+ * Optimized RAID-5 checksumming functions for AVX
+ *
+ * Copyright (C) 2012 Intel Corporation
+ * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
+ *
+ * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#ifdef CONFIG_AS_AVX
+
+#include <linux/compiler.h>
+#include <asm/i387.h>
+
+#define ALIGN32 __aligned(32)
+
+#define YMM_SAVED_REGS 4
+
+#define YMMS_SAVE \
+do { \
+ preempt_disable(); \
+ cr0 = read_cr0(); \
+ clts(); \
+ asm volatile("vmovaps %%ymm0, %0" : "=m" (ymm_save[0]) : : "memory"); \
+ asm volatile("vmovaps %%ymm1, %0" : "=m" (ymm_save[32]) : : "memory"); \
+ asm volatile("vmovaps %%ymm2, %0" : "=m" (ymm_save[64]) : : "memory"); \
+ asm volatile("vmovaps %%ymm3, %0" : "=m" (ymm_save[96]) : : "memory"); \
+} while (0);
+
+#define YMMS_RESTORE \
+do { \
+ asm volatile("sfence" : : : "memory"); \
+ asm volatile("vmovaps %0, %%ymm3" : : "m" (ymm_save[96])); \
+ asm volatile("vmovaps %0, %%ymm2" : : "m" (ymm_save[64])); \
+ asm volatile("vmovaps %0, %%ymm1" : : "m" (ymm_save[32])); \
+ asm volatile("vmovaps %0, %%ymm0" : : "m" (ymm_save[0])); \
+ write_cr0(cr0); \
+ preempt_enable(); \
+} while (0);
+
+#define BLOCK4(i) \
+ BLOCK(32 * i, 0) \
+ BLOCK(32 * (i + 1), 1) \
+ BLOCK(32 * (i + 2), 2) \
+ BLOCK(32 * (i + 3), 3)
+
+#define BLOCK16() \
+ BLOCK4(0) \
+ BLOCK4(4) \
+ BLOCK4(8) \
+ BLOCK4(12)
+
+static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
+{
+ unsigned long cr0, lines = bytes >> 9;
+ char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+ YMMS_SAVE
+
+ while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+ asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p0[i / sizeof(*p0)])); \
+ asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+ "=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+ BLOCK16()
+
+ p0 = (unsigned long *)((uintptr_t)p0 + 512);
+ p1 = (unsigned long *)((uintptr_t)p1 + 512);
+ }
+
+ YMMS_RESTORE
+}
+
+static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+ unsigned long *p2)
+{
+ unsigned long cr0, lines = bytes >> 9;
+ char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+ YMMS_SAVE
+
+ while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+ asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p1[i / sizeof(*p1)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p0[i / sizeof(*p0)])); \
+ asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+ "=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+ BLOCK16()
+
+ p0 = (unsigned long *)((uintptr_t)p0 + 512);
+ p1 = (unsigned long *)((uintptr_t)p1 + 512);
+ p2 = (unsigned long *)((uintptr_t)p2 + 512);
+ }
+
+ YMMS_RESTORE
+}
+
+static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+ unsigned long *p2, unsigned long *p3)
+{
+ unsigned long cr0, lines = bytes >> 9;
+ char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+ YMMS_SAVE
+
+ while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+ asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p2[i / sizeof(*p2)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p1[i / sizeof(*p1)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p0[i / sizeof(*p0)])); \
+ asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+ "=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+ BLOCK16();
+
+ p0 = (unsigned long *)((uintptr_t)p0 + 512);
+ p1 = (unsigned long *)((uintptr_t)p1 + 512);
+ p2 = (unsigned long *)((uintptr_t)p2 + 512);
+ p3 = (unsigned long *)((uintptr_t)p3 + 512);
+ }
+
+ YMMS_RESTORE
+}
+
+static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+ unsigned long *p2, unsigned long *p3, unsigned long *p4)
+{
+ unsigned long cr0, lines = bytes >> 9;
+ char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+ YMMS_SAVE
+
+ while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+ asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p3[i / sizeof(*p3)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p2[i / sizeof(*p2)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p1[i / sizeof(*p1)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p0[i / sizeof(*p0)])); \
+ asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+ "=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+ BLOCK16()
+
+ p0 = (unsigned long *)((uintptr_t)p0 + 512);
+ p1 = (unsigned long *)((uintptr_t)p1 + 512);
+ p2 = (unsigned long *)((uintptr_t)p2 + 512);
+ p3 = (unsigned long *)((uintptr_t)p3 + 512);
+ p4 = (unsigned long *)((uintptr_t)p4 + 512);
+ }
+
+ YMMS_RESTORE
+}
+
+static struct xor_block_template xor_block_avx = {
+ .name = "avx",
+ .do_2 = xor_avx_2,
+ .do_3 = xor_avx_3,
+ .do_4 = xor_avx_4,
+ .do_5 = xor_avx_5,
+};
+
+#define AVX_XOR_SPEED \
+do { \
+ if (cpu_has_avx) \
+ xor_speed(&xor_block_avx); \
+} while (0)
+
+#define AVX_SELECT(FASTEST) \
+ (cpu_has_avx ? &xor_block_avx : FASTEST)
+
+#else
+
+#define AVX_XOR_SPEED {}
+
+#define AVX_SELECT(FASTEST) (FASTEST)
+
+#endif
+#endif
+
--
1.7.8.5