* [PATCH v3 1/2] arm64/neon: add workaround for ambiguous C99 stdint.h types
@ 2018-11-27 10:08 Jackie Liu
2018-11-27 10:08 ` [PATCH v3 2/2] arm64: crypto: add NEON accelerated XOR implementation Jackie Liu
` (2 more replies)
0 siblings, 3 replies; 13+ messages in thread
From: Jackie Liu @ 2018-11-27 10:08 UTC (permalink / raw)
To: linux-arm-kernel
In a way similar to ARM commit 09096f6a0ee2 ("ARM: 7822/1: add workaround
for ambiguous C99 stdint.h types"), this patch redefines the macros that
are used in stdint.h so its definitions of uint64_t and int64_t are
compatible with those of the kernel.
This patch is taken from: https://patchwork.kernel.org/patch/3540001/
Originally written by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

We keep this as a private header file, so there is no need to override
asm/types.h.
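A minimal standalone illustration of the mismatch (not part of the patch;
kernel_store() is a made-up example function):

    /*
     * On an LP64 AArch64 target, GCC's <stdint.h> makes uint64_t an
     * 'unsigned long', while the kernel's <asm-generic/int-ll64.h> makes
     * u64 an 'unsigned long long'. Same width, but distinct types.
     */
    #include <stdint.h>

    void kernel_store(unsigned long long *dst);   /* kernel-style u64 pointer */

    void example(uint64_t *v)
    {
            kernel_store(v);   /* warns: incompatible pointer type */
    }

Redefining __INT64_TYPE__/__UINT64_TYPE__ before <arm_neon.h> pulls in
stdint.h makes uint64_t/int64_t use 'long long', so the intrinsics
prototypes line up with the kernel's 64-bit types and this class of
warning goes away.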
Signed-off-by: Jackie Liu <liuyun01@kylinos.cn>
---
arch/arm64/include/asm/neon-intrinsics.h | 28 ++++++++++++++++++++++++++++
1 file changed, 28 insertions(+)
create mode 100644 arch/arm64/include/asm/neon-intrinsics.h
diff --git a/arch/arm64/include/asm/neon-intrinsics.h b/arch/arm64/include/asm/neon-intrinsics.h
new file mode 100644
index 0000000..96a3fda
--- /dev/null
+++ b/arch/arm64/include/asm/neon-intrinsics.h
@@ -0,0 +1,28 @@
+#ifndef _NEON_INTRINSICS_H
+#define _NEON_INTRINSICS_H
+
+#include <asm-generic/int-ll64.h>
+
+/*
+ * For Aarch64, there is some ambiguity in the definition of the types below
+ * between the kernel and GCC itself. This is usually not a big deal, but it
+ * causes trouble when including GCC's version of 'stdint.h' (this is the file
+ * that gets included when you #include <stdint.h> on a -ffreestanding build).
+ * As this file also gets included implicitly when including 'arm_neon.h' (the
+ * NEON intrinsics support header), we need the following to work around the
+ * issue if we want to use NEON intrinsics in the kernel.
+ */
+
+#ifdef __INT64_TYPE__
+#undef __INT64_TYPE__
+#define __INT64_TYPE__ __signed__ long long
+#endif
+
+#ifdef __UINT64_TYPE__
+#undef __UINT64_TYPE__
+#define __UINT64_TYPE__ unsigned long long
+#endif
+
+#include <arm_neon.h>
+
+#endif /* ! _NEON_INTRINSICS_H */
--
2.7.4
^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH v3 2/2] arm64: crypto: add NEON accelerated XOR implementation
  2018-11-27 10:08 [PATCH v3 1/2] arm64/neon: add workaround for ambiguous C99 stdint.h types Jackie Liu
@ 2018-11-27 10:08 ` Jackie Liu
  2018-11-27 11:49   ` Ard Biesheuvel
  2018-11-29 17:00   ` Dave Martin
  2018-11-27 11:42 ` [PATCH v3 1/2] arm64/neon: add workaround for ambiguous C99 stdint.h types Ard Biesheuvel
  2018-11-29 16:55 ` Dave Martin
  2 siblings, 2 replies; 13+ messages in thread
From: Jackie Liu @ 2018-11-27 10:08 UTC (permalink / raw)
  To: linux-arm-kernel

This is a NEON acceleration method that can improve
performance by approximately 20%. I got the following
data from the centos 7.5 on Huawei's HISI1616 chip:

[   93.837726] xor: measuring software checksum speed
[   93.874039]    8regs     :  7123.200 MB/sec
[   93.914038]    32regs    :  7180.300 MB/sec
[   93.954043]    arm64_neon:  9856.000 MB/sec
[   93.954047] xor: using function: arm64_neon (9856.000 MB/sec)

I believe this code can bring some optimization for
all arm64 platform.

That is patch version 3. Thanks for Ard Biesheuvel's
suggestions.

Signed-off-by: Jackie Liu <liuyun01@kylinos.cn>
---
 arch/arm64/include/asm/Kbuild |   1 -
 arch/arm64/include/asm/xor.h  |  73 +++++++++++++++++
 arch/arm64/lib/Makefile       |   6 ++
 arch/arm64/lib/xor-neon.c     | 184 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 263 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm64/include/asm/xor.h
 create mode 100644 arch/arm64/lib/xor-neon.c

diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild
index 6cd5d77..1877f29 100644
--- a/arch/arm64/include/asm/Kbuild
+++ b/arch/arm64/include/asm/Kbuild
@@ -27,4 +27,3 @@ generic-y += trace_clock.h
 generic-y += unaligned.h
 generic-y += user.h
 generic-y += vga.h
-generic-y += xor.h
diff --git a/arch/arm64/include/asm/xor.h b/arch/arm64/include/asm/xor.h
new file mode 100644
index 0000000..856386a
--- /dev/null
+++ b/arch/arm64/include/asm/xor.h
@@ -0,0 +1,73 @@
+/*
+ * arch/arm64/include/asm/xor.h
+ *
+ * Authors: Jackie Liu <liuyun01@kylinos.cn>
+ * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/hardirq.h>
+#include <asm-generic/xor.h>
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+
+#ifdef CONFIG_KERNEL_MODE_NEON
+
+extern struct xor_block_template const xor_block_inner_neon;
+
+static void
+xor_neon_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+	kernel_neon_begin();
+	xor_block_inner_neon.do_2(bytes, p1, p2);
+	kernel_neon_end();
+}
+
+static void
+xor_neon_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+	unsigned long *p3)
+{
+	kernel_neon_begin();
+	xor_block_inner_neon.do_3(bytes, p1, p2, p3);
+	kernel_neon_end();
+}
+
+static void
+xor_neon_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+	unsigned long *p3, unsigned long *p4)
+{
+	kernel_neon_begin();
+	xor_block_inner_neon.do_4(bytes, p1, p2, p3, p4);
+	kernel_neon_end();
+}
+
+static void
+xor_neon_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+	unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+	kernel_neon_begin();
+	xor_block_inner_neon.do_5(bytes, p1, p2, p3, p4, p5);
+	kernel_neon_end();
+}
+
+static struct xor_block_template xor_block_arm64 = {
+	.name   = "arm64_neon",
+	.do_2   = xor_neon_2,
+	.do_3   = xor_neon_3,
+	.do_4   = xor_neon_4,
+	.do_5   = xor_neon_5
+};
+#undef XOR_TRY_TEMPLATES
+#define XOR_TRY_TEMPLATES           \
+	do {        \
+		xor_speed(&xor_block_8regs);    \
+		xor_speed(&xor_block_32regs);   \
+		if (cpu_has_neon()) { \
+			xor_speed(&xor_block_arm64);\
+		} \
+	} while (0)
+
+#endif /* ! CONFIG_KERNEL_MODE_NEON */
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 69ff988..5540a16 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -5,6 +5,12 @@ lib-y		:= clear_user.o delay.o copy_from_user.o	\
 	   memcmp.o strcmp.o strncmp.o strlen.o strnlen.o	\
 	   strchr.o strrchr.o tishift.o
 
+ifeq ($(CONFIG_KERNEL_MODE_NEON), y)
+obj-$(CONFIG_XOR_BLOCKS)	+= xor-neon.o
+CFLAGS_REMOVE_xor-neon.o	+= -mgeneral-regs-only
+CFLAGS_xor-neon.o		+= -ffreestanding
+endif
+
 # Tell the compiler to treat all general purpose registers (with the
 # exception of the IP registers, which are already handled by the caller
 # in case of a PLT) as callee-saved, which allows for efficient runtime
diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c
new file mode 100644
index 0000000..131c60c2
--- /dev/null
+++ b/arch/arm64/lib/xor-neon.c
@@ -0,0 +1,184 @@
+/*
+ * arch/arm64/lib/xor-neon.c
+ *
+ * Authors: Jackie Liu <liuyun01@kylinos.cn>
+ * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/raid/xor.h>
+#include <linux/module.h>
+#include <asm/neon-intrinsics.h>
+
+void xor_arm64_neon_2(unsigned long bytes, unsigned long *p1,
+	unsigned long *p2)
+{
+	uint64_t *dp1 = (uint64_t *)p1;
+	uint64_t *dp2 = (uint64_t *)p2;
+
+	register uint64x2_t v0, v1, v2, v3;
+	long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+	do {
+		/* p1 ^= p2 */
+		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
+		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
+		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
+		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));
+
+		/* store */
+		vst1q_u64(dp1 + 0, v0);
+		vst1q_u64(dp1 + 2, v1);
+		vst1q_u64(dp1 + 4, v2);
+		vst1q_u64(dp1 + 6, v3);
+
+		dp1 += 8;
+		dp2 += 8;
+	} while (--lines > 0);
+}
+
+void xor_arm64_neon_3(unsigned long bytes, unsigned long *p1,
+	unsigned long *p2, unsigned long *p3)
+{
+	uint64_t *dp1 = (uint64_t *)p1;
+	uint64_t *dp2 = (uint64_t *)p2;
+	uint64_t *dp3 = (uint64_t *)p3;
+
+	register uint64x2_t v0, v1, v2, v3;
+	long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+	do {
+		/* p1 ^= p2 */
+		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
+		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
+		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
+		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));
+
+		/* p1 ^= p3 */
+		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
+		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
+		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
+		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));
+
+		/* store */
+		vst1q_u64(dp1 + 0, v0);
+		vst1q_u64(dp1 + 2, v1);
+		vst1q_u64(dp1 + 4, v2);
+		vst1q_u64(dp1 + 6, v3);
+
+		dp1 += 8;
+		dp2 += 8;
+		dp3 += 8;
+	} while (--lines > 0);
+}
+
+void xor_arm64_neon_4(unsigned long bytes, unsigned long *p1,
+	unsigned long *p2, unsigned long *p3, unsigned long *p4)
+{
+	uint64_t *dp1 = (uint64_t *)p1;
+	uint64_t *dp2 = (uint64_t *)p2;
+	uint64_t *dp3 = (uint64_t *)p3;
+	uint64_t *dp4 = (uint64_t *)p4;
+
+	register uint64x2_t v0, v1, v2, v3;
+	long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+	do {
+		/* p1 ^= p2 */
+		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
+		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
+		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
+		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));
+
+		/* p1 ^= p3 */
+		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
+		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
+		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
+		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));
+
+		/* p1 ^= p4 */
+		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
+		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
+		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
+		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));
+
+		/* store */
+		vst1q_u64(dp1 + 0, v0);
+		vst1q_u64(dp1 + 2, v1);
+		vst1q_u64(dp1 + 4, v2);
+		vst1q_u64(dp1 + 6, v3);
+
+		dp1 += 8;
+		dp2 += 8;
+		dp3 += 8;
+		dp4 += 8;
+	} while (--lines > 0);
+}
+
+void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
+	unsigned long *p2, unsigned long *p3,
+	unsigned long *p4, unsigned long *p5)
+{
+	uint64_t *dp1 = (uint64_t *)p1;
+	uint64_t *dp2 = (uint64_t *)p2;
+	uint64_t *dp3 = (uint64_t *)p3;
+	uint64_t *dp4 = (uint64_t *)p4;
+	uint64_t *dp5 = (uint64_t *)p5;
+
+	register uint64x2_t v0, v1, v2, v3;
+	long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+	do {
+		/* p1 ^= p2 */
+		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
+		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
+		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
+		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));
+
+		/* p1 ^= p3 */
+		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
+		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
+		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
+		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));
+
+		/* p1 ^= p4 */
+		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
+		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
+		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
+		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));
+
+		/* p1 ^= p5 */
+		v0 = veorq_u64(v0, vld1q_u64(dp5 + 0));
+		v1 = veorq_u64(v1, vld1q_u64(dp5 + 2));
+		v2 = veorq_u64(v2, vld1q_u64(dp5 + 4));
+		v3 = veorq_u64(v3, vld1q_u64(dp5 + 6));
+
+		/* store */
+		vst1q_u64(dp1 + 0, v0);
+		vst1q_u64(dp1 + 2, v1);
+		vst1q_u64(dp1 + 4, v2);
+		vst1q_u64(dp1 + 6, v3);
+
+		dp1 += 8;
+		dp2 += 8;
+		dp3 += 8;
+		dp4 += 8;
+		dp5 += 8;
+	} while (--lines > 0);
+}
+
+struct xor_block_template const xor_block_inner_neon = {
+	.name	= "__inner_neon__",
+	.do_2	= xor_arm64_neon_2,
+	.do_3	= xor_arm64_neon_3,
+	.do_4	= xor_arm64_neon_4,
+	.do_5	= xor_arm64_neon_5,
+};
+EXPORT_SYMBOL(xor_block_inner_neon);
+
+MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>");
+MODULE_DESCRIPTION("ARMv8 XOR Extensions");
+MODULE_LICENSE("GPL");
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 13+ messages in thread
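As background for the benchmark lines above: every template registered by
the XOR_TRY_TEMPLATES hook is timed with xor_speed() and the fastest one is
then selected at boot. A rough standalone sketch of that selection step
(numbers taken from the HISI1616 output above; illustrative only, not the
kernel's actual calibration code):

    #include <stdio.h>

    struct xor_tmpl { const char *name; double mb_s; };

    int main(void)
    {
            /* measured throughput of each candidate, in MB/sec */
            struct xor_tmpl t[] = {
                    { "8regs",      7123.200 },
                    { "32regs",     7180.300 },
                    { "arm64_neon", 9856.000 },
            };
            struct xor_tmpl *best = &t[0];

            for (unsigned int i = 1; i < sizeof(t) / sizeof(t[0]); i++)
                    if (t[i].mb_s > best->mb_s)
                            best = &t[i];

            /* prints: xor: using function: arm64_neon (9856.000 MB/sec) */
            printf("xor: using function: %s (%.3f MB/sec)\n",
                   best->name, best->mb_s);
            return 0;
    }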
* [PATCH v3 2/2] arm64: crypto: add NEON accelerated XOR implementation
  2018-11-27 10:08 ` [PATCH v3 2/2] arm64: crypto: add NEON accelerated XOR implementation Jackie Liu
@ 2018-11-27 11:49   ` Ard Biesheuvel
  2018-11-27 12:33     ` JackieLiu
  2018-11-27 12:46     ` Ard Biesheuvel
  2018-11-29 17:00   ` Dave Martin
  1 sibling, 2 replies; 13+ messages in thread
From: Ard Biesheuvel @ 2018-11-27 11:49 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, 27 Nov 2018 at 11:10, Jackie Liu <liuyun01@kylinos.cn> wrote:
>
> This is a NEON acceleration method that can improve
> performance by approximately 20%. I got the following
> data from the centos 7.5 on Huawei's HISI1616 chip:
>
> [   93.837726] xor: measuring software checksum speed
> [   93.874039]    8regs     :  7123.200 MB/sec
> [   93.914038]    32regs    :  7180.300 MB/sec
> [   93.954043]    arm64_neon:  9856.000 MB/sec

That looks more like 37% to me

Note that Cortex-A57 gives me

[    0.111543] xor: measuring software checksum speed
[    0.154874]    8regs     :  3782.000 MB/sec
[    0.195069]    32regs    :  6095.000 MB/sec
[    0.235145]    arm64_neon:  5924.000 MB/sec
[    0.236942] xor: using function: 32regs (6095.000 MB/sec)

so we fall back to the scalar code, which is fine.

> [   93.954047] xor: using function: arm64_neon (9856.000 MB/sec)
>
> I believe this code can bring some optimization for
> all arm64 platform.
>
> That is patch version 3. Thanks for Ard Biesheuvel's
> suggestions.
>
> Signed-off-by: Jackie Liu <liuyun01@kylinos.cn>

Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

> [...]

^ permalink raw reply	[flat|nested] 13+ messages in thread
* [PATCH v3 2/2] arm64: crypto: add NEON accelerated XOR implementation
  2018-11-27 11:49   ` Ard Biesheuvel
@ 2018-11-27 12:33     ` JackieLiu
  2018-11-27 12:46     ` Ard Biesheuvel
  1 sibling, 0 replies; 13+ messages in thread
From: JackieLiu @ 2018-11-27 12:33 UTC (permalink / raw)
  To: linux-arm-kernel

> On 27 Nov 2018, at 19:49, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
>
> On Tue, 27 Nov 2018 at 11:10, Jackie Liu <liuyun01@kylinos.cn> wrote:
>>
>> This is a NEON acceleration method that can improve
>> performance by approximately 20%. I got the following
>> data from the centos 7.5 on Huawei's HISI1616 chip:
>>
>> [   93.837726] xor: measuring software checksum speed
>> [   93.874039]    8regs     :  7123.200 MB/sec
>> [   93.914038]    32regs    :  7180.300 MB/sec
>> [   93.954043]    arm64_neon:  9856.000 MB/sec
>
> That looks more like 37% to me
>
> Note that Cortex-A57 gives me
>
> [    0.111543] xor: measuring software checksum speed
> [    0.154874]    8regs     :  3782.000 MB/sec
> [    0.195069]    32regs    :  6095.000 MB/sec
> [    0.235145]    arm64_neon:  5924.000 MB/sec
> [    0.236942] xor: using function: 32regs (6095.000 MB/sec)

CentOS 7.5 is running with a 64k page_size, which may be different from yours.

> so we fall back to the scalar code, which is fine.
>
> [...]

^ permalink raw reply	[flat|nested] 13+ messages in thread
* [PATCH v3 2/2] arm64: crypto: add NEON accelerated XOR implementation
  2018-11-27 11:49   ` Ard Biesheuvel
  2018-11-27 12:33     ` JackieLiu
@ 2018-11-27 12:46     ` Ard Biesheuvel
  2018-11-27 12:52       ` JackieLiu
  2018-11-27 18:03       ` Will Deacon
  1 sibling, 2 replies; 13+ messages in thread
From: Ard Biesheuvel @ 2018-11-27 12:46 UTC (permalink / raw)
  To: linux-arm-kernel

(add maintainers back to cc)

On Tue, 27 Nov 2018 at 12:49, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
>
> On Tue, 27 Nov 2018 at 11:10, Jackie Liu <liuyun01@kylinos.cn> wrote:
> >
> > This is a NEON acceleration method that can improve
> > performance by approximately 20%. I got the following
> > data from the centos 7.5 on Huawei's HISI1616 chip:
> > [...]
> >
> > Signed-off-by: Jackie Liu <liuyun01@kylinos.cn>
>
> Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
>

This goes with v4 of the NEON intrinsics patch.

Jackie: no need to resend these, but next time, please repost the
series entirely, not just a single patch, and keep the maintainers on
cc.

> > [...]

^ permalink raw reply	[flat|nested] 13+ messages in thread
* [PATCH v3 2/2] arm64: crypto: add NEON accelerated XOR implementation
  2018-11-27 12:46     ` Ard Biesheuvel
@ 2018-11-27 12:52       ` JackieLiu
  2018-11-27 18:03       ` Will Deacon
  1 sibling, 0 replies; 13+ messages in thread
From: JackieLiu @ 2018-11-27 12:52 UTC (permalink / raw)
  To: linux-arm-kernel

Yes, I got it. Thanks for reviewing these patches.

BR.
Jackie

> On 27 Nov 2018, at 20:46, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
>
> (add maintainers back to cc)
>
> On Tue, 27 Nov 2018 at 12:49, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
>>
>> On Tue, 27 Nov 2018 at 11:10, Jackie Liu <liuyun01@kylinos.cn> wrote:
>>>
>>> This is a NEON acceleration method that can improve
>>> performance by approximately 20%. I got the following
>>> data from the centos 7.5 on Huawei's HISI1616 chip:
>>> [...]
>>>
>>> Signed-off-by: Jackie Liu <liuyun01@kylinos.cn>
>>
>> Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
>
> This goes with v4 of the NEON intrinsics patch.
>
> Jackie: no need to resend these, but next time, please repost the
> series entirely, not just a single patch, and keep the maintainers on
> cc.
>
> [...]

^ permalink raw reply	[flat|nested] 13+ messages in thread
* [PATCH v3 2/2] arm64: crypto: add NEON accelerated XOR implementation
  2018-11-27 12:46 ` Ard Biesheuvel
  2018-11-27 12:52 ` JackieLiu
@ 2018-11-27 18:03 ` Will Deacon
  1 sibling, 0 replies; 13+ messages in thread
From: Will Deacon @ 2018-11-27 18:03 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Nov 27, 2018 at 01:46:48PM +0100, Ard Biesheuvel wrote:
> (add maintainers back to cc)
>
> On Tue, 27 Nov 2018 at 12:49, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
> >
> > On Tue, 27 Nov 2018 at 11:10, Jackie Liu <liuyun01@kylinos.cn> wrote:
> > >
> > > This is a NEON acceleration method that can improve
> > > performance by approximately 20%. I got the following
> > > data from the centos 7.5 on Huawei's HISI1616 chip:
> > >
> > > [ 93.837726] xor: measuring software checksum speed
> > > [ 93.874039] 8regs : 7123.200 MB/sec
> > > [ 93.914038] 32regs : 7180.300 MB/sec
> > > [ 93.954043] arm64_neon: 9856.000 MB/sec
> >
> > That looks more like 37% to me
> >
> > Note that Cortex-A57 gives me
> >
> > [ 0.111543] xor: measuring software checksum speed
> > [ 0.154874] 8regs : 3782.000 MB/sec
> > [ 0.195069] 32regs : 6095.000 MB/sec
> > [ 0.235145] arm64_neon: 5924.000 MB/sec
> > [ 0.236942] xor: using function: 32regs (6095.000 MB/sec)
> >
> > so we fall back to the scalar code, which is fine.
> >
> > > [ 93.954047] xor: using function: arm64_neon (9856.000 MB/sec)
> > >
> > > I believe this code can bring some optimization for
> > > all arm64 platform.
> > >
> > > That is patch version 3. Thanks for Ard Biesheuvel's
> > > suggestions.
> > >
> > > Signed-off-by: Jackie Liu <liuyun01@kylinos.cn>
> >
> > Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> >
> > This goes with v4 of the NEON intrinsics patch.
>
> Jackie: no need to resend these, but next time, please repost the
> series entirely, not just a single patch, and keep the maintainers on
> cc.

Actually, it would be helpful if they were resent since I'm currently
CC'd on a v4 1/1 and a v3 2/2 and don't really know what I'm supposed
to do with them :)

Will

^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v3 2/2] arm64: crypto: add NEON accelerated XOR implementation
  2018-11-27 10:08 ` [PATCH v3 2/2] arm64: crypto: add NEON accelerated XOR implementation Jackie Liu
  2018-11-27 11:49 ` Ard Biesheuvel
@ 2018-11-29 17:00 ` Dave Martin
  2018-11-29 18:09 ` Ard Biesheuvel
  1 sibling, 1 reply; 13+ messages in thread
From: Dave Martin @ 2018-11-29 17:00 UTC (permalink / raw)
  To: Jackie Liu; +Cc: linux-block, linux-arm-kernel, ard.biesheuvel

On Tue, Nov 27, 2018 at 06:08:58PM +0800, Jackie Liu wrote:

[...]

> diff --git a/arch/arm64/include/asm/xor.h b/arch/arm64/include/asm/xor.h
> new file mode 100644
> index 0000000..856386a
> --- /dev/null
> +++ b/arch/arm64/include/asm/xor.h

[...]

> +#include <linux/hardirq.h>
> +#include <asm-generic/xor.h>
> +#include <asm/hwcap.h>
> +#include <asm/neon.h>
> +
> +#ifdef CONFIG_KERNEL_MODE_NEON
> +
> +extern struct xor_block_template const xor_block_inner_neon;
> +
> +static void
> +xor_neon_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
> +{
> + kernel_neon_begin();
> + xor_block_inner_neon.do_2(bytes, p1, p2);
> + kernel_neon_end();
> +}
> +

[...]

> +static struct xor_block_template xor_block_arm64 = {
> + .name = "arm64_neon",
> + .do_2 = xor_neon_2,
> + .do_3 = xor_neon_3,
> + .do_4 = xor_neon_4,
> + .do_5 = xor_neon_5
> +};
> +#undef XOR_TRY_TEMPLATES
> +#define XOR_TRY_TEMPLATES \
> + do { \
> + xor_speed(&xor_block_8regs); \
> + xor_speed(&xor_block_32regs); \
> + if (cpu_has_neon()) { \
> + xor_speed(&xor_block_arm64);\
> + } \
> + } while (0)

Should there be a may_use_simd() check somewhere?

If we invoke this in a softirq I don't see what prevents us from
corrupting the task's NEON state.

(The check might be in some surrounding glue code that I missed...)

[...]

Cheers
---Dave

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v3 2/2] arm64: crypto: add NEON accelerated XOR implementation
  2018-11-29 17:00 ` Dave Martin
@ 2018-11-29 18:09 ` Ard Biesheuvel
  2018-11-29 18:20 ` Dave Martin
  0 siblings, 1 reply; 13+ messages in thread
From: Ard Biesheuvel @ 2018-11-29 18:09 UTC (permalink / raw)
  To: Dave Martin; +Cc: linux-block, liuyun01, linux-arm-kernel

On Thu, 29 Nov 2018 at 18:00, Dave Martin <Dave.Martin@arm.com> wrote:
>
> On Tue, Nov 27, 2018 at 06:08:58PM +0800, Jackie Liu wrote:

[...]

> > +static struct xor_block_template xor_block_arm64 = {
> > + .name = "arm64_neon",
> > + .do_2 = xor_neon_2,
> > + .do_3 = xor_neon_3,
> > + .do_4 = xor_neon_4,
> > + .do_5 = xor_neon_5
> > +};
> > +#undef XOR_TRY_TEMPLATES
> > +#define XOR_TRY_TEMPLATES \
> > + do { \
> > + xor_speed(&xor_block_8regs); \
> > + xor_speed(&xor_block_32regs); \
> > + if (cpu_has_neon()) { \
> > + xor_speed(&xor_block_arm64);\
> > + } \
> > + } while (0)
>
> Should there be a may_use_simd() check somewhere?
>
> If we invoke this in a softirq I don't see what prevents us from
> corrupting the task's NEON state.
>
> (The check might be in some surrounding glue code that I missed...)
>

There is no check. This code should simply not be called from
non-process context, same as the RAID56 code.

This is not terribly robust, obviously, but appears to be common
practice in this realm of the kernel.

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply [flat|nested] 13+ messages in thread
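For completeness, the check Dave is asking about is usually expressed with may_use_simd(). The sketch below is illustrative only and is not what this series does: it falls back to the generic 8regs routine from asm-generic/xor.h, which the quoted asm/xor.h already includes, when SIMD cannot be used. The identifiers follow the quoted patch.

#include <asm/simd.h>

static void
xor_neon_2_checked(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        if (!may_use_simd()) {
                /* scalar fallback, safe from softirq context */
                xor_8regs_2(bytes, p1, p2);
                return;
        }

        kernel_neon_begin();
        xor_block_inner_neon.do_2(bytes, p1, p2);
        kernel_neon_end();
}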
* Re: [PATCH v3 2/2] arm64: crypto: add NEON accelerated XOR implementation
  2018-11-29 18:09 ` Ard Biesheuvel
@ 2018-11-29 18:20 ` Dave Martin
  2018-11-30  1:15 ` JackieLiu
  0 siblings, 1 reply; 13+ messages in thread
From: Dave Martin @ 2018-11-29 18:20 UTC (permalink / raw)
  To: Ard Biesheuvel; +Cc: linux-block, liuyun01, linux-arm-kernel

On Thu, Nov 29, 2018 at 07:09:10PM +0100, Ard Biesheuvel wrote:
> On Thu, 29 Nov 2018 at 18:00, Dave Martin <Dave.Martin@arm.com> wrote:
> >
> > On Tue, Nov 27, 2018 at 06:08:58PM +0800, Jackie Liu wrote:

[...]

> > > +static struct xor_block_template xor_block_arm64 = {
> > > + .name = "arm64_neon",
> > > + .do_2 = xor_neon_2,
> > > + .do_3 = xor_neon_3,
> > > + .do_4 = xor_neon_4,
> > > + .do_5 = xor_neon_5
> > > +};
> > > +#undef XOR_TRY_TEMPLATES
> > > +#define XOR_TRY_TEMPLATES \
> > > + do { \
> > > + xor_speed(&xor_block_8regs); \
> > > + xor_speed(&xor_block_32regs); \
> > > + if (cpu_has_neon()) { \
> > > + xor_speed(&xor_block_arm64);\
> > > + } \
> > > + } while (0)
> >
> > Should there be a may_use_simd() check somewhere?
> >
> > If we invoke this in a softirq I don't see what prevents us from
> > corrupting the task's NEON state.
> >
> > (The check might be in some surrounding glue code that I missed...)
> >
>
> There is no check. This code should simply not be called from
> non-process context, same as the RAID56 code.
>
> This is not terribly robust, obviously, but appears to be common
> practice in this realm of the kernel.

Fair enough -- I was just curious.

If this goes wrong, we should get a clear splat in kernel_neon_begin()
anyway. I'd be more concerned if we could just end up scribbling over
the NEON state silently.

Cheers
---Dave

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply [flat|nested] 13+ messages in thread
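The "clear splat" referred to above comes from the entry check in arm64's kernel-mode NEON support. In kernels of this vintage, kernel_neon_begin() starts roughly as follows (paraphrased from arch/arm64/kernel/fpsimd.c, details elided), so misuse from a context where SIMD is not usable oopses immediately instead of silently corrupting user FPSIMD state:

void kernel_neon_begin(void)
{
        BUG_ON(!may_use_simd());

        /* ... save the task's FPSIMD state, disable softirqs, ... */
}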
* Re: [PATCH v3 2/2] arm64: crypto: add NEON accelerated XOR implementation
  2018-11-29 18:20 ` Dave Martin
@ 2018-11-30  1:15 ` JackieLiu
  0 siblings, 0 replies; 13+ messages in thread
From: JackieLiu @ 2018-11-30  1:15 UTC (permalink / raw)
  To: Dave Martin; +Cc: linux-block, linux-arm-kernel, Ard Biesheuvel

> On Nov 30, 2018, at 02:20, Dave Martin <Dave.Martin@arm.com> wrote:
>
> On Thu, Nov 29, 2018 at 07:09:10PM +0100, Ard Biesheuvel wrote:
>> On Thu, 29 Nov 2018 at 18:00, Dave Martin <Dave.Martin@arm.com> wrote:
>>>
>>> On Tue, Nov 27, 2018 at 06:08:58PM +0800, Jackie Liu wrote:
>
> [...]
>
>>>> +static struct xor_block_template xor_block_arm64 = {
>>>> + .name = "arm64_neon",
>>>> + .do_2 = xor_neon_2,
>>>> + .do_3 = xor_neon_3,
>>>> + .do_4 = xor_neon_4,
>>>> + .do_5 = xor_neon_5
>>>> +};
>>>> +#undef XOR_TRY_TEMPLATES
>>>> +#define XOR_TRY_TEMPLATES \
>>>> + do { \
>>>> + xor_speed(&xor_block_8regs); \
>>>> + xor_speed(&xor_block_32regs); \
>>>> + if (cpu_has_neon()) { \
>>>> + xor_speed(&xor_block_arm64);\
>>>> + } \
>>>> + } while (0)
>>>
>>> Should there be a may_use_simd() check somewhere?
>>>
>>> If we invoke this in a softirq I don't see what prevents us from
>>> corrupting the task's NEON state.
>>>
>>> (The check might be in some surrounding glue code that I missed...)
>>>
>>
>> There is no check. This code should simply not be called from
>> non-process context, same as the RAID56 code.
>>
>> This is not terribly robust, obviously, but appears to be common
>> practice in this realm of the kernel.
>
> Fair enough -- I was just curious.
>
> If this goes wrong, we should get a clear splat in kernel_neon_begin()
> anyway. I'd be more concerned if we could just end up scribbling over
> the NEON state silently.
>
> Cheers
> ---Dave

I checked the xor.h code for the other architectures: only arm32 has an
interrupt check; the others are kept simple. In fact, the xor algorithm
is currently only used by RAID456 (maybe?), so I hope this one can stay
simple too.

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply [flat|nested] 13+ messages in thread
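The 32-bit ARM interrupt check mentioned above looks roughly like this (paraphrased from arch/arm/include/asm/xor.h; the identifiers are the arm32 ones, not this patch's): it falls back to a scalar template when called with an interrupt context active.

static void
xor_neon_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        if (in_interrupt()) {
                xor_arm4regs_2(bytes, p1, p2);  /* scalar fallback */
        } else {
                kernel_neon_begin();
                xor_block_neon_inner.do_2(bytes, p1, p2);
                kernel_neon_end();
        }
}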
* [PATCH v3 1/2] arm64/neon: add workaround for ambiguous C99 stdint.h types
  2018-11-27 10:08 [PATCH v3 1/2] arm64/neon: add workaround for ambiguous C99 stdint.h types Jackie Liu
  2018-11-27 10:08 ` [PATCH v3 2/2] arm64: crypto: add NEON accelerated XOR implementation Jackie Liu
@ 2018-11-27 11:42 ` Ard Biesheuvel
  2018-11-29 16:55 ` Dave Martin
  2 siblings, 0 replies; 13+ messages in thread
From: Ard Biesheuvel @ 2018-11-27 11:42 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, 27 Nov 2018 at 11:10, Jackie Liu <liuyun01@kylinos.cn> wrote:
>
> In a way similar to ARM commit 09096f6a0ee2 ("ARM: 7822/1: add workaround
> for ambiguous C99 stdint.h types"), this patch redefines the macros that
> are used in stdint.h so its definitions of uint64_t and int64_t are
> compatible with those of the kernel.
>
> This patch comes from: https://patchwork.kernel.org/patch/3540001/
> Wrote by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
>
> We mark this file as a private file and don't have to override asm/types.h
>
> Signed-off-by: Jackie Liu <liuyun01@kylinos.cn>
> ---

[...]

> +#ifdef __UINT64_TYPE__
> +#undef __UINT64_TYPE__
> +#define __UINT64_TYPE__ unsigned long long
> +#endif
> +
> +#include <arm_neon.h>
> +

We should make this

/*
 * genksyms chokes on the ARM NEON instrinsics system header, but we don't export anything
 * it defines anyway, so just disregard when genksyms execute.
 */
#ifndef __GENKSYMS__
#include <arm_neon.h>
#endif

to work around the issue you reported with symbol versioning.

> +#endif /* ! _NEON_INTRINSICS_H */
> --
> 2.7.4
>
>
>
>

^ permalink raw reply [flat|nested] 13+ messages in thread
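With that suggestion applied, the tail of neon-intrinsics.h would look roughly like this (a sketch assembled from the comment above, not a quote of the final commit):

#ifdef __UINT64_TYPE__
#undef __UINT64_TYPE__
#define __UINT64_TYPE__ unsigned long long
#endif

/*
 * genksyms chokes on the ARM NEON intrinsics system header, but we don't
 * export anything it defines anyway, so just disregard when genksyms
 * executes.
 */
#ifndef __GENKSYMS__
#include <arm_neon.h>
#endif

#endif /* ! _NEON_INTRINSICS_H */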
* Re: [PATCH v3 1/2] arm64/neon: add workaround for ambiguous C99 stdint.h types
  2018-11-27 10:08 [PATCH v3 1/2] arm64/neon: add workaround for ambiguous C99 stdint.h types Jackie Liu
  2018-11-27 10:08 ` [PATCH v3 2/2] arm64: crypto: add NEON accelerated XOR implementation Jackie Liu
  2018-11-27 11:42 ` [PATCH v3 1/2] arm64/neon: add workaround for ambiguous C99 stdint.h types Ard Biesheuvel
@ 2018-11-29 16:55 ` Dave Martin
  2 siblings, 0 replies; 13+ messages in thread
From: Dave Martin @ 2018-11-29 16:55 UTC (permalink / raw)
  To: Jackie Liu; +Cc: linux-block, linux-arm-kernel, ard.biesheuvel

On Tue, Nov 27, 2018 at 06:08:57PM +0800, Jackie Liu wrote:
> In a way similar to ARM commit 09096f6a0ee2 ("ARM: 7822/1: add workaround
> for ambiguous C99 stdint.h types"), this patch redefines the macros that
> are used in stdint.h so its definitions of uint64_t and int64_t are
> compatible with those of the kernel.

[...]

> +#ifdef __INT64_TYPE__
> +#undef __INT64_TYPE__
> +#define __INT64_TYPE__ __signed__ long long

Minor query: Out of interest, why __signed__ here, and not signed?

Most similar headers do the same, but I haven't figured out why.

Cheers
---Dave

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply [flat|nested] 13+ messages in thread
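Possibly relevant context for the query above: the asm-generic header included at the top of the new file spells its own 64-bit typedefs the same way, and __signed__ is simply GCC's alternate keyword for signed, so matching it keeps the redefinition textually consistent with the kernel's types. Paraphrased from include/uapi/asm-generic/int-ll64.h:

#ifdef __GNUC__
__extension__ typedef __signed__ long long __s64;
__extension__ typedef unsigned long long __u64;
#else
typedef __signed__ long long __s64;
typedef unsigned long long __u64;
#endif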
end of thread, other threads:[~2018-11-30 1:16 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2018-11-27 10:08 [PATCH v3 1/2] arm64/neon: add workaround for ambiguous C99 stdint.h types Jackie Liu
2018-11-27 10:08 ` [PATCH v3 2/2] arm64: crypto: add NEON accelerated XOR implementation Jackie Liu
2018-11-27 11:49 ` Ard Biesheuvel
2018-11-27 12:33 ` JackieLiu
2018-11-27 12:46 ` Ard Biesheuvel
2018-11-27 12:52 ` JackieLiu
2018-11-27 18:03 ` Will Deacon
2018-11-29 17:00 ` Dave Martin
2018-11-29 18:09 ` Ard Biesheuvel
2018-11-29 18:20 ` Dave Martin
2018-11-30 1:15 ` JackieLiu
2018-11-27 11:42 ` [PATCH v3 1/2] arm64/neon: add workaround for ambiguous C99 stdint.h types Ard Biesheuvel
2018-11-29 16:55 ` Dave Martin