* [PATCH] arm64: lib: accelerate do_csum() with NEON instruction
@ 2018-11-21 9:21 huanglingyan
2018-11-21 14:41 ` Robin Murphy
2018-12-03 19:32 ` Will Deacon
0 siblings, 2 replies; 5+ messages in thread
From: huanglingyan @ 2018-11-21 9:21 UTC (permalink / raw)
To: linux-arm-kernel
From: Lingyan Huang <huanglingyan2@huawei.com>
Function do_csum() in lib/checksum.c is used to compute the checksum,
which turns out to be slow and to consume a lot of CPU resources.
Let's use NEON instructions to accelerate the checksum computation
for arm64.
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Lingyan Huang <huanglingyan2@huawei.com>
---
arch/arm64/include/asm/checksum.h | 8 ++
arch/arm64/lib/Makefile | 3 +
arch/arm64/lib/checksum.c | 30 +++++++
arch/arm64/lib/do_csum.S | 182 ++++++++++++++++++++++++++++++++++++++
lib/checksum.c | 6 +-
5 files changed, 226 insertions(+), 3 deletions(-)
create mode 100644 arch/arm64/lib/checksum.c
create mode 100644 arch/arm64/lib/do_csum.S
diff --git a/arch/arm64/include/asm/checksum.h b/arch/arm64/include/asm/checksum.h
index 0b6f5a7..9faf642 100644
--- a/arch/arm64/include/asm/checksum.h
+++ b/arch/arm64/include/asm/checksum.h
@@ -24,8 +24,16 @@ static inline __sum16 csum_fold(__wsum csum)
sum += (sum >> 16) | (sum << 16);
return ~(__force __sum16)(sum >> 16);
}
+
#define csum_fold csum_fold
+#ifdef CONFIG_KERNEL_MODE_NEON
+extern unsigned int do_csum_generic(const unsigned char *buff, int len);
+unsigned int do_csum_neon(const unsigned char *buff, unsigned int len);
+unsigned int do_csum(const unsigned char *buff, unsigned int len);
+#define do_csum do_csum
+#endif
+
static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
{
__uint128_t tmp;
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 69ff988..9596fd8 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -5,6 +5,9 @@ lib-y := clear_user.o delay.o copy_from_user.o \
memcmp.o strcmp.o strncmp.o strlen.o strnlen.o \
strchr.o strrchr.o tishift.o
+# If NEON mode is supported, compile this file to speed up do_csum.
+lib-$(CONFIG_KERNEL_MODE_NEON) += do_csum.o checksum.o
+
# Tell the compiler to treat all general purpose registers (with the
# exception of the IP registers, which are already handled by the caller
# in case of a PLT) as callee-saved, which allows for efficient runtime
diff --git a/arch/arm64/lib/checksum.c b/arch/arm64/lib/checksum.c
new file mode 100644
index 0000000..61dee8b
--- /dev/null
+++ b/arch/arm64/lib/checksum.c
@@ -0,0 +1,30 @@
+/*
+ * Generic C or neon implementation of do_csum operations.
+ * Choose faster neon instructions when NEON is supported.
+ *
+ * Copyright (C) 2018 Hisilicon, Inc. All Rights Reserved.
+ * Written by Lingyan Huang (huanglingyan2 at huawei.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <asm/checksum.h>
+#include <asm/byteorder.h>
+
+unsigned int do_csum(const unsigned char *buff, unsigned int len)
+{
+ if (may_use_simd()) {
+ unsigned int res;
+
+ kernel_neon_begin();
+ res = do_csum_neon(buff, len);
+ kernel_neon_end();
+ return res;
+ } else
+ return do_csum_generic(buff, len);
+}
diff --git a/arch/arm64/lib/do_csum.S b/arch/arm64/lib/do_csum.S
new file mode 100644
index 0000000..820302c
--- /dev/null
+++ b/arch/arm64/lib/do_csum.S
@@ -0,0 +1,182 @@
+/*
+ * Copyright (C) 2018 Huawei Inc.
+ *
+ * Optimized version of the standard do_csum() function
+ *
+ * Parameters:
+ * x0 - address of buffer to checksum (const unsigned char *)
+ * x1 - length of the buffer (int)
+ * Returns:
+ * x0 - the return checksum of the buffer
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+ENTRY(do_csum_neon)
+ ldr x13, =0xffff
+ eor x4, x4, x4
+ eor x5, x5, x5
+ eor v0.16b, v0.16b, v0.16b // clear v0,x4,x5
+
+ /*
+ * len is zero or negative
+ */
+ and x6, x1, #0x80000000
+ cmp x6, #0
+ b.gt out
+ cbz w1, out
+
+ tst x0, #1
+ b.eq addr_not_odd
+
+ /*
+ * addr is odd
+ */
+ mov x4, #1
+ ldr x6, [x0], #1
+#ifdef __AARCH64EB__
+ and x6, x6, #0xff
+#else
+ lsl x6, x6, #8
+ and x6, x6, x13
+#endif
+ add x5, x5, x6
+ sub x1, x1, #1
+
+addr_not_odd:
+ cmp x1, #32
+ b.lt len_4
+ cmp x1, #128
+ b.ge len_gt_128
+ b do_loop_16
+
+len_gt_128:
+ movi v0.4s, #0
+ movi v1.4s, #0
+ movi v2.4s, #0
+ movi v3.4s, #0
+
+do_loop_64:
+
+ ldp q5, q4, [x0], #32
+ ldp q7, q6, [x0], #32
+
+ uadalp v0.4s, v4.8h
+ uadalp v1.4s, v5.8h
+ uadalp v2.4s, v6.8h
+ uadalp v3.4s, v7.8h
+
+ sub x1, x1, #64
+ cmp x1, #64
+ b.ge do_loop_64
+
+ add v0.4s, v0.4s, v1.4s
+ add v2.4s, v2.4s, v3.4s
+ add v0.4s, v0.4s, v2.4s
+
+ cmp x1, #16
+ b.lt get_64
+
+
+do_loop_16:
+ ldr q6, [x0], #16
+
+ uaddl v24.4s, v0.4h, v6.4h
+ uaddl2 v25.4s, v0.8h, v6.8h
+ add v0.4s, v24.4s, v25.4s
+
+
+ sub x1, x1, #16
+ cmp x1, #16
+ b.ge do_loop_16
+
+get_64:
+ mov x6, v0.d[0]
+ add x5, x5, x6
+ mov x6, v0.d[1]
+
+ add x5, x5, x6
+ cmp x5, x6
+ b.ge len_4
+ add x5, x5, #1
+
+len_4:
+ cmp x1, #4
+ b.lt len_2
+
+ sub x1, x1, #4
+ ldr w6, [x0], #4
+ and x6, x6, #0xffffffff
+ add x5, x5, x6
+ b len_4
+
+len_2:
+ cmp x1, #2
+ b.lt len_1
+ sub x1, x1, #2
+ ldrh w6, [x0], #2
+ and x6, x6, x13
+ add x5, x5, x6
+
+len_1:
+ cmp x1, #1
+ b.lt fold_32
+ ldr x6, [x0], #1
+#ifdef __AARCH64EB__
+ lsl x6, x6, #8
+ and x6, x6, x13
+#else
+ and x6, x6, #0xff
+#endif
+ add x5, x5, x6
+
+fold_32:
+ and x9, x5, x13 /* [15:0] */
+ and x10, x13, x5, lsr #16 /* [31:16] */
+ and x11, x13, x5, lsr #32 /* [47:32] */
+ and x12, x13, x5, lsr #48 /* [63:48] */
+
+ add x9, x9, x10
+ add x11, x11, x12
+
+ add x9, x9, x11
+
+ and x10, x9, x13
+ and x11, x13, x9, lsr #16
+
+ add x5, x10, x11
+
+ and x9, x5, x13 /* add carry */
+ and x10, x13, x5, lsr #16
+ add x5, x9, x10
+
+ cbz x4, out /* addr isn't odd */
+
+ lsr x6, x5, #8
+ and x6, x6, #0xff
+ and x7, x5, #0xff
+ lsl x7, x7, #8
+
+ orr x5, x6, x7
+
+out:
+ mov x0, x5
+
+ /*
+ * pop neon register from stack
+ */
+/* ldp q24, q25, [sp], #0x20
+ ldp q22, q23, [sp], #0x20
+ ldp q20, q21, [sp], #0x20
+ ldp q18, q19, [sp], #0x20
+ ldp q16, q17, [sp], #0x20
+ ldp q14, q15, [sp], #0x20
+ ldp q12, q13, [sp], #0x20
+ ldp q10, q11, [sp], #0x20
+ ldp q8, q9, [sp], #0x20
+ ldp q6, q7, [sp], #0x20
+ ldp q4, q5, [sp], #0x20
+ ldp q2, q3, [sp], #0x20
+ ldp q0, q1, [sp], #0x20
+*/
+ ret
diff --git a/lib/checksum.c b/lib/checksum.c
index d3ec93f..422949c 100644
--- a/lib/checksum.c
+++ b/lib/checksum.c
@@ -34,10 +34,8 @@
#include <linux/export.h>
#include <net/checksum.h>
-
#include <asm/byteorder.h>
-#ifndef do_csum
static inline unsigned short from32to16(unsigned int x)
{
/* add up 16-bit and 16-bit for 16+c bit */
@@ -47,7 +45,7 @@ static inline unsigned short from32to16(unsigned int x)
return x;
}
-static unsigned int do_csum(const unsigned char *buff, int len)
+unsigned int do_csum_generic(const unsigned char *buff, int len)
{
int odd;
unsigned int result = 0;
@@ -100,6 +98,8 @@ static unsigned int do_csum(const unsigned char *buff, int len)
out:
return result;
}
+#ifndef do_csum
+#define do_csum do_csum_generic
#endif
#ifndef ip_fast_csum
--
2.7.4
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH] arm64: lib: accelerate do_csum() with NEON instruction
2018-11-21 9:21 [PATCH] arm64: lib: accelerate do_csum() with NEON instruction huanglingyan
@ 2018-11-21 14:41 ` Robin Murphy
2018-11-26 11:28 ` huanglingyan (A)
2018-11-28 1:47 ` huanglingyan (A)
2018-12-03 19:32 ` Will Deacon
1 sibling, 2 replies; 5+ messages in thread
From: Robin Murphy @ 2018-11-21 14:41 UTC (permalink / raw)
To: linux-arm-kernel
On 21/11/2018 09:21, huanglingyan wrote:
> From: Lingyan Huang <huanglingyan2@huawei.com>
>
> Function do_csum() in lib/checksum.c is used to compute checksum,
> which turns out to be slow and to consume a lot of CPU resources.
Can you say how slow exactly it is? I had been meaning to come back and
take a look at do_csum() since I did a rough perf profile on a little
Cortex-A53 box with ethernet checksum offloading disabled, but I've not
found the time for a proper analysis yet.
> Let's use neon instructions to accelerate the checksum computation
> for arm64.
How much improvement have you measured with this change? Ideally for a
range of different-sized workloads on more than one microarchitecture -
some CPUs have weaker SIMD pipelines than others, so any possible
benefit is still going to have some variance overall.
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will.deacon@arm.com>
> Signed-off-by: Lingyan Huang <huanglingyan2@huawei.com>
> ---
> arch/arm64/include/asm/checksum.h | 8 ++
> arch/arm64/lib/Makefile | 3 +
> arch/arm64/lib/checksum.c | 30 +++++++
> arch/arm64/lib/do_csum.S | 182 ++++++++++++++++++++++++++++++++++++++
> lib/checksum.c | 6 +-
> 5 files changed, 226 insertions(+), 3 deletions(-)
> create mode 100644 arch/arm64/lib/checksum.c
> create mode 100644 arch/arm64/lib/do_csum.S
>
> diff --git a/arch/arm64/include/asm/checksum.h b/arch/arm64/include/asm/checksum.h
> index 0b6f5a7..9faf642 100644
> --- a/arch/arm64/include/asm/checksum.h
> +++ b/arch/arm64/include/asm/checksum.h
> @@ -24,8 +24,16 @@ static inline __sum16 csum_fold(__wsum csum)
> sum += (sum >> 16) | (sum << 16);
> return ~(__force __sum16)(sum >> 16);
> }
> +
Please clean up unnecessary noise like this from your patches before
posting.
> #define csum_fold csum_fold
>
> +#ifdef CONFIG_KERNEL_MODE_NEON
> +extern unsigned int do_csum_generic(const unsigned char *buff, int len);
> +unsigned int do_csum_neon(const unsigned char *buff, unsigned int len);
> +unsigned int do_csum(const unsigned char *buff, unsigned int len);
> +#define do_csum do_csum
> +#endif
> +
> static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
> {
> __uint128_t tmp;
> diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
> index 69ff988..9596fd8 100644
> --- a/arch/arm64/lib/Makefile
> +++ b/arch/arm64/lib/Makefile
> @@ -5,6 +5,9 @@ lib-y := clear_user.o delay.o copy_from_user.o \
> memcmp.o strcmp.o strncmp.o strlen.o strnlen.o \
> strchr.o strrchr.o tishift.o
>
> +# If NEON mode is supported, compile this file to speed up do_csum.
> +lib-$(CONFIG_KERNEL_MODE_NEON) += do_csum.o checksum.o
> +
> # Tell the compiler to treat all general purpose registers (with the
> # exception of the IP registers, which are already handled by the caller
> # in case of a PLT) as callee-saved, which allows for efficient runtime
> diff --git a/arch/arm64/lib/checksum.c b/arch/arm64/lib/checksum.c
> new file mode 100644
> index 0000000..61dee8b
> --- /dev/null
> +++ b/arch/arm64/lib/checksum.c
> @@ -0,0 +1,30 @@
> +/*
> + * Generic C or neon implementation of do_csum operations.
> + * Choose faster neon instructions when NEON is supported.
> + *
> + * Copyright (C) 2018 Hisilicon, Inc. All Rights Reserved.
> + * Written by Lingyan Huang (huanglingyan2 at huawei.com)
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public Licence
> + * as published by the Free Software Foundation; either version
> + * 2 of the Licence, or (at your option) any later version.
> + */
> +
> +#include <asm/neon.h>
> +#include <asm/simd.h>
> +#include <asm/checksum.h>
> +#include <asm/byteorder.h>
> +
> +unsigned int do_csum(const unsigned char *buff, unsigned int len)
> +{
> + if (may_use_simd()) {
There's a significant overhead involved with kernel_neon_{begin,end}
which means that for sufficiently small values of len, taking this path
will almost certainly be slower than even the dumb generic C
implementation. For starters, with len<32 your code doesn't even use
SIMD anyway, so it's just pure waste.
> + unsigned int res;
> +
> + kernel_neon_begin();
Also note that you've got preemption disabled the whole time in here - I
don't know off-hand how large a single buffer might possibly be
checksummed in a single call, but the potential latency there is a
problem until proven otherwise, especially for RT.
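(One way to bound that latency would be to process the buffer in fixed-size
chunks and drop out of kernel-mode NEON between them. A minimal sketch of
that idea is below - the chunk size is a made-up placeholder, min_t()/SZ_8K
come from linux/kernel.h and linux/sizes.h, and the interaction with odd
start addresses is ignored here:)

        #define CSUM_NEON_CHUNK         SZ_8K   /* placeholder, needs tuning */

        unsigned int do_csum(const unsigned char *buff, unsigned int len)
        {
                unsigned int sum = 0;

                if (!may_use_simd())
                        return do_csum_generic(buff, len);

                while (len) {
                        unsigned int chunk = min_t(unsigned int, len,
                                                   CSUM_NEON_CHUNK);

                        /* Preemption is only disabled for one chunk at a time. */
                        kernel_neon_begin();
                        sum += do_csum_neon(buff, chunk); /* 16-bit folded sum */
                        kernel_neon_end();

                        buff += chunk;
                        len -= chunk;
                }

                /* Re-fold the carries that accumulated across chunks. */
                sum = (sum & 0xffff) + (sum >> 16);
                sum += sum >> 16;

                return sum & 0xffff;
        }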
> + res = do_csum_neon(buff, len);
> + kernel_neon_end();
> + return res;
> + } else
> + return do_csum_generic(buff, len);
> +}
> diff --git a/arch/arm64/lib/do_csum.S b/arch/arm64/lib/do_csum.S
> new file mode 100644
> index 0000000..820302c
> --- /dev/null
> +++ b/arch/arm64/lib/do_csum.S
> @@ -0,0 +1,182 @@
> +/*
> + * Copyright (C) 2018 Huawei Inc.
> + *
> + * Optmized version of the standard do_csum() function
> + *
> + * Parameters:
> + * x0 - address of buffer to checksum (const unsigned char *)
> + * x1 - length of the buffer (int)
> + * Returns:
> + * x0 - the return checksum of the buffer
> + */
> +
> +#include <linux/linkage.h>
> +#include <asm/assembler.h>
> +ENTRY(do_csum_neon)
> + ldr x13, =0xffff
> + eor x4, x4, x4
> + eor x5, x5, x5
> + eor v0.16b, v0.16b, v0.16b // clear v0,x4,x5
> +
> + /*
> + * len is zero or negative
> + */
> + and x6, x1, #0x80000000
> + cmp x6, #0
> + b.gt out
> + cbz w1, out
Um... how is that more optimal than
cmp x1, #0
b.le out
?
> +
> + tst x0, #1
> + b.eq addr_not_odd
> +
> + /*
> + * addr is odd
> + */
> + mov x4, #1
> + ldr x6, [x0], #1
> +#ifdef __AARCH64EB__
> + and x6, x6, #0xff
> +#else
> + lsl x6, x6, #8
> + and x6, x6, x13
> +#endif
Did you just manage to open-code an ldrb instruction? :/
AFAICS the aim here is to load a byte, and shift it left if
little-endian - there's no way that needs 4 instructions.
> + add x5, x5, x6
> + sub x1, x1, #1
> +
> +addr_not_odd:
> + cmp x1, #32
> + b.lt len_4
> + cmp x1, #128
> + b.ge len_gt_128
> + b do_loop_16
> +
Surely you want to align the source pointer to more than just even/odd
given that the subsequent loops load in chunks much larger than 2 bytes?
Also, are you actually tuning this for typical static branch prediction
on the assumption that len<128 is the likely case (which would really
warrant a comment), or is this just an unnecessarily long-winded way of
saying:
cmp x1, #128
b.lt do_loop_16
?
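(For the alignment part, the usual approach is just to peel 2-byte words off
the head until the pointer is 16-byte aligned before entering the vector
loops. A rough C sketch, with illustrative names and assuming the single odd
leading byte has already been handled:)

        /* buff is 2-byte aligned here; sum is the 64-bit running total. */
        while (((unsigned long)buff & 15) && len >= 2) {
                sum += *(const u16 *)buff; /* native-endian halfword, as in the main loop */
                buff += 2;
                len -= 2;
        }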
> +len_gt_128:
> + movi v0.4s, #0
We already zeroed v0 earlier (and frankly if we'd done it this way it
wouldn't have needed a comment there either).
> + movi v1.4s, #0
> + movi v2.4s, #0
> + movi v3.4s, #0
> +
> +do_loop_64:
> +
> + ldp q5, q4, [x0], #32
> + ldp q7, q6, [x0], #32
Using post-index writeback is liable to cause an unnecessary register
dependency stall between these two loads in at least some cases.
> +
> + uadalp v0.4s, v4.8h
> + uadalp v1.4s, v5.8h
> + uadalp v2.4s, v6.8h
> + uadalp v3.4s, v7.8h
What if we're checksumming a buffer larger than 4MB and lose the
carry-out when one or more of these accumulations overflow?
> +
> + sub x1, x1, #64
> + cmp x1, #64
> + b.ge do_loop_64
> +
> + add v0.4s, v0.4s, v1.4s
> + add v2.4s, v2.4s, v3.4s
> + add v0.4s, v0.4s, v2.4s
> +
> + cmp x1, #16
> + b.lt get_64
> +
> +
> +do_loop_16:
> + ldr q6, [x0], #16
> +
> + uaddl v24.4s, v0.4h, v6.4h
> + uaddl2 v25.4s, v0.8h, v6.8h
> + add v0.4s, v24.4s, v25.4s
> +
> +
> + sub x1, x1, #16
> + cmp x1, #16
> + b.ge do_loop_16
> +
> +get_64:
> + mov x6, v0.d[0]
> + add x5, x5, x6
> + mov x6, v0.d[1]
> +
> + add x5, x5, x6
Is that really more efficient than an addp (or addh) and extracting a
single element?
> + cmp x5, x6
> + b.ge len_4
> + add x5, x5, #1
Is this... manual carry logic without using adds/adc? :/
> +
> +len_4:
> + cmp x1, #4
> + b.lt len_2
> +
> + sub x1, x1, #4
> + ldr w6, [x0], #4
> + and x6, x6, #0xffffffff
What's that and for?
> + add x5, x5, x6
> + b len_4
> +
> +len_2:
> + cmp x1, #2
> + b.lt len_1
> + sub x1, x1, #2
> + ldrh w6, [x0], #2
> + and x6, x6, x13
> + add x5, x5, x6
> +
> +len_1:
> + cmp x1, #1
> + b.lt fold_32
> + ldr x6, [x0], #1
> +#ifdef __AARCH64EB__
> + lsl x6, x6, #8
> + and x6, x6, x13
> +#else
> + and x6, x6, #0xff
> +#endif
> + add x5, x5, x6
> +
> +fold_32:
> + and x9, x5, x13 /* [15:0] */
> + and x10, x13, x5, lsr #16 /* [31:16] */
> + and x11, x13, x5, lsr #32 /* [47:32] */
> + and x12, x13, x5, lsr #48 /* [47:32] */
> +
> + add x9, x9, x10
> + add x11, x11, x12
> +
> + add x9, x9, x11
> +
> + and x10, x9, x13
> + and x11, x13, x9, lsr #16
> +
> + add x5, x10, x11
> +
> + and x9, x5, x13 /* add carry */
> + and x10, x13, x5, lsr #16
> + add x5, x9, x10
> +
> + cbz x4, out /* addr isn't odd */
> +
> + lsr x6, x5, #8
> + and x6, x6, #0xff
> + and x7, x5, #0xff
> + lsl x7, x7, #8
> +
> + orr x5, x6, x7
I know folding a 32-bit partial sum to 16 bits needs at most 3
instructions (ror/add/lsr), and I can't imagine the additional odd-byte
correction can need more than about 4 on top of that. As it stands,
there's more code in this "optimised" fold alone than in the entire
ip_fast_csum() routine.
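(In C terms the whole fold-and-swap amounts to no more than something like
the sketch below, going straight from the 64-bit running total to the final
16-bit value - the names here are illustrative, not from either implementation:)

        static u16 csum_fold_and_swap(u64 sum, bool addr_was_odd)
        {
                u32 s = (u32)sum + (u32)(sum >> 32);

                if (s < (u32)sum)               /* end-around carry from 64->32 */
                        s++;
                s = (s & 0xffff) + (s >> 16);
                s = (s & 0xffff) + (s >> 16);   /* now at most 0xffff */
                if (addr_was_odd)
                        s = ((s & 0xff) << 8) | (s >> 8); /* swap the two bytes */
                return s;
        }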
> +
> +out:
> + mov x0, x5
> +
> + /*
> + * pop neon register from stack
> + */
> +/* ldp q24, q25, [sp], #0x20
> + ldp q22, q23, [sp], #0x20
> + ldp q20, q21, [sp], #0x20
> + ldp q18, q19, [sp], #0x20
> + ldp q16, q17, [sp], #0x20
> + ldp q14, q15, [sp], #0x20
> + ldp q12, q13, [sp], #0x20
> + ldp q10, q11, [sp], #0x20
> + ldp q8, q9, [sp], #0x20
> + ldp q6, q7, [sp], #0x20
> + ldp q4, q5, [sp], #0x20
> + ldp q2, q3, [sp], #0x20
> + ldp q0, q1, [sp], #0x20
> +*/
Why's this here?
> + ret
> diff --git a/lib/checksum.c b/lib/checksum.c
> index d3ec93f..422949c 100644
> --- a/lib/checksum.c
> +++ b/lib/checksum.c
> @@ -34,10 +34,8 @@
>
> #include <linux/export.h>
> #include <net/checksum.h>
> -
> #include <asm/byteorder.h>
>
> -#ifndef do_csum
> static inline unsigned short from32to16(unsigned int x)
> {
> /* add up 16-bit and 16-bit for 16+c bit */
> @@ -47,7 +45,7 @@ static inline unsigned short from32to16(unsigned int x)
> return x;
> }
>
> -static unsigned int do_csum(const unsigned char *buff, int len)
> +unsigned int do_csum_generic(const unsigned char *buff, int len)
> {
> int odd;
> unsigned int result = 0;
> @@ -100,6 +98,8 @@ static unsigned int do_csum(const unsigned char *buff, int len)
> out:
> return result;
> }
> +#ifndef do_csum
> +#define do_csum do_csum_generic
AFAICS this now means that at least one architecture (hexagon) gets the
generic version built in despite it being entirely redundant.
Robin.
> #endif
>
> #ifndef ip_fast_csum
>
^ permalink raw reply [flat|nested] 5+ messages in thread
* [PATCH] arm64: lib: accelerate do_csum() with NEON instruction
2018-11-21 14:41 ` Robin Murphy
@ 2018-11-26 11:28 ` huanglingyan (A)
2018-11-28 1:47 ` huanglingyan (A)
1 sibling, 0 replies; 5+ messages in thread
From: huanglingyan (A) @ 2018-11-26 11:28 UTC (permalink / raw)
To: linux-arm-kernel
On 2018/11/21 22:41, Robin Murphy wrote:
> On 21/11/2018 09:21, huanglingyan wrote:
>> From: Lingyan Huang <huanglingyan2@huawei.com>
>>
>> Function do_csum() in lib/checksum.c is used to compute checksum,
>> which turns out to be slow and to consume a lot of CPU resources.
>
> Can you say how slow exactly it is? I had been meaning to come back and take a look at do_csum() since I did a rough perf profile on a little Cortex-A53 box with ethernet checksum offloading disabled, but I've not found the time for a proper analysis yet.
Here are the comparison results for ip_compute_csum() using the generic
do_csum() versus the NEON do_csum():

  pkt_len, 1000           64      128      129      512      513     1024     1500
  gene_ip_cpt (ns)     55980    80730    81440   228330   228900   424930   607990
  neon_ip_cpt (ns)    117610   115110   116160   132440   131520   150910   169020

ip_compute_csum() is an exported function which calls do_csum():
__sum16 ip_compute_csum(const void *buff, int len)
{
return (__force __sum16)~do_csum(buff, len);
}
It seems that a packet-length threshold should be introduced: the NEON instructions would only be used when the packet length exceeds that threshold. The overhead is probably introduced by saving/restoring the NEON registers in kernel_neon_begin()/kernel_neon_end().
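Something like the sketch below is what I have in mind; the threshold value
is only a placeholder and would have to come from benchmarking on real
hardware (from the numbers above, the crossover is somewhere between 129 and
512 bytes on this machine):

        #define CSUM_NEON_THRESHOLD     512     /* bytes, placeholder only */

        unsigned int do_csum(const unsigned char *buff, unsigned int len)
        {
                unsigned int res;

                /* Small buffers are not worth the kernel_neon_begin() cost. */
                if (len < CSUM_NEON_THRESHOLD || !may_use_simd())
                        return do_csum_generic(buff, len);

                kernel_neon_begin();
                res = do_csum_neon(buff, len);
                kernel_neon_end();

                return res;
        }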
>> Let's use neon instructions to accelerate the checksum computation
>> for arm64.
>
> How much improvement have you measured with this change? Ideally for a range of different-sized workloads on more than one microarchitecture - some CPUs have weaker SIMD pipelines than others, so any possible benefit is still going to have some variance overall.
>
This sounds good. We will need help from others with the testing, since I only have access to one microarchitecture.
>> Cc: Catalin Marinas <catalin.marinas@arm.com>
>> Cc: Will Deacon <will.deacon@arm.com>
>> Signed-off-by: Lingyan Huang <huanglingyan2@huawei.com>
>> ---
>> arch/arm64/include/asm/checksum.h | 8 ++
>> arch/arm64/lib/Makefile | 3 +
>> arch/arm64/lib/checksum.c | 30 +++++++
>> arch/arm64/lib/do_csum.S | 182 ++++++++++++++++++++++++++++++++++++++
>> lib/checksum.c | 6 +-
>> 5 files changed, 226 insertions(+), 3 deletions(-)
>> create mode 100644 arch/arm64/lib/checksum.c
>> create mode 100644 arch/arm64/lib/do_csum.S
>>
>> diff --git a/arch/arm64/include/asm/checksum.h b/arch/arm64/include/asm/checksum.h
>> index 0b6f5a7..9faf642 100644
>> --- a/arch/arm64/include/asm/checksum.h
>> +++ b/arch/arm64/include/asm/checksum.h
>> @@ -24,8 +24,16 @@ static inline __sum16 csum_fold(__wsum csum)
>> sum += (sum >> 16) | (sum << 16);
>> return ~(__force __sum16)(sum >> 16);
>> }
>> +
>
> Please clean up unnecessary noise like this from your patches before posting.
>
>> #define csum_fold csum_fold
>> +#ifdef CONFIG_KERNEL_MODE_NEON
>> +extern unsigned int do_csum_generic(const unsigned char *buff, int len);
>> +unsigned int do_csum_neon(const unsigned char *buff, unsigned int len);
>> +unsigned int do_csum(const unsigned char *buff, unsigned int len);
>> +#define do_csum do_csum
>> +#endif
>> +
>> static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
>> {
>> __uint128_t tmp;
>> diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
>> index 69ff988..9596fd8 100644
>> --- a/arch/arm64/lib/Makefile
>> +++ b/arch/arm64/lib/Makefile
>> @@ -5,6 +5,9 @@ lib-y := clear_user.o delay.o copy_from_user.o \
>> memcmp.o strcmp.o strncmp.o strlen.o strnlen.o \
>> strchr.o strrchr.o tishift.o
>> +# If NEON mode is supported, compile this file to speed up do_csum.
>> +lib-$(CONFIG_KERNEL_MODE_NEON) += do_csum.o checksum.o
>> +
>> # Tell the compiler to treat all general purpose registers (with the
>> # exception of the IP registers, which are already handled by the caller
>> # in case of a PLT) as callee-saved, which allows for efficient runtime
>> diff --git a/arch/arm64/lib/checksum.c b/arch/arm64/lib/checksum.c
>> new file mode 100644
>> index 0000000..61dee8b
>> --- /dev/null
>> +++ b/arch/arm64/lib/checksum.c
>> @@ -0,0 +1,30 @@
>> +/*
>> + * Generic C or neon implementation of do_csum operations.
>> + * Choose faster neon instructions when NEON is supported.
>> + *
>> + * Copyright (C) 2018 Hisilicon, Inc. All Rights Reserved.
>> + * Written by Lingyan Huang (huanglingyan2 at huawei.com)
>> + *
>> + * This program is free software; you can redistribute it and/or
>> + * modify it under the terms of the GNU General Public Licence
>> + * as published by the Free Software Foundation; either version
>> + * 2 of the Licence, or (at your option) any later version.
>> + */
>> +
>> +#include <asm/neon.h>
>> +#include <asm/simd.h>
>> +#include <asm/checksum.h>
>> +#include <asm/byteorder.h>
>> +
>> +unsigned int do_csum(const unsigned char *buff, unsigned int len)
>> +{
>> + if (may_use_simd()) {
>
> There's a significant overhead involved with kernel_neon_{begin,end} which means that for sufficiently small values of len, taking this path will almost certainly be slower than even the dumb generic C implementation. For starters, with len<32 your code doesn't even use SIMD anyway, so it's just pure waste.
>
>> + unsigned int res;
>> +
>> + kernel_neon_begin();
>
> Also note that you've got preemption disabled the whole time in here - I don't know off-hand how large a single buffer might possibly be checksummed in a single call, but the potential latency there is a problem until proven otherwise, especially for RT.
>
>> + res = do_csum_neon(buff, len);
>> + kernel_neon_end();
>> + return res;
>> + } else
>> + return do_csum_generic(buff, len);
>> +}
>> diff --git a/arch/arm64/lib/do_csum.S b/arch/arm64/lib/do_csum.S
>> new file mode 100644
>> index 0000000..820302c
>> --- /dev/null
>> +++ b/arch/arm64/lib/do_csum.S
>> @@ -0,0 +1,182 @@
>> +/*
>> + * Copyright (C) 2018 Huawei Inc.
>> + *
>> + * Optmized version of the standard do_csum() function
>> + *
>> + * Parameters:
>> + * x0 - address of buffer to checksum (const unsigned char *)
>> + * x1 - length of the buffer (int)
>> + * Returns:
>> + * x0 - the return checksum of the buffer
>> + */
>> +
>> +#include <linux/linkage.h>
>> +#include <asm/assembler.h>
>> +ENTRY(do_csum_neon)
>> + ldr x13, =0xffff
>> + eor x4, x4, x4
>> + eor x5, x5, x5
>> + eor v0.16b, v0.16b, v0.16b // clear v0,x4,x5
>> +
>> + /*
>> + * len is zero or negative
>> + */
>> + and x6, x1, #0x80000000
>> + cmp x6, #0
>> + b.gt out
>> + cbz w1, out
>
> Um... how is that more optimal than
>
> cmp x1, #0
> b.le out
> ?
>
>> +
>> + tst x0, #1
>> + b.eq addr_not_odd
>> +
>> + /*
>> + * addr is odd
>> + */
>> + mov x4, #1
>> + ldr x6, [x0], #1
>> +#ifdef __AARCH64EB__
>> + and x6, x6, #0xff
>> +#else
>> + lsl x6, x6, #8
>> + and x6, x6, x13
>> +#endif
>
> Did you just manage to open-code an ldrb instruction? :/
>
> AFAICS the aim here is to load a byte, and shift it left if little-endian - there's no way that needs 4 instructions.
>
>> + add x5, x5, x6
>> + sub x1, x1, #1
>> +
>> +addr_not_odd:
>> + cmp x1, #32
>> + b.lt len_4
>> + cmp x1, #128
>> + b.ge len_gt_128
>> + b do_loop_16
>> +
>
> Surely you want to align the source pointer to more than just even/odd given that the subsequent loops load in chunks much larger than 2 bytes?
>
> Also, are you actually tuning this for typical static branch prediction on the assumption that len<128 is the likely case (which would really warrant a comment), or is this just an unnecessarily long-winded way of saying:
>
> cmp x1, #128
> b.lt do_loop_16
>
> ?
>
>> +len_gt_128:
>> + movi v0.4s, #0
>
> We already zeroed v0 earlier (and frankly if we'd done it this way it wouldn't have needed a comment there either).
>
>> + movi v1.4s, #0
>> + movi v2.4s, #0
>> + movi v3.4s, #0
>> +
>> +do_loop_64:
>> +
>> + ldp q5, q4, [x0], #32
>> + ldp q7, q6, [x0], #32
>
> Using post-index writeback is liable to cause an unnecessary register dependency stall between these two loads in at least some cases.
>
>> +
>> + uadalp v0.4s, v4.8h
>> + uadalp v1.4s, v5.8h
>> + uadalp v2.4s, v6.8h
>> + uadalp v3.4s, v7.8h
>
> What if we're checksumming a buffer larger than 4MB and lose the carry-out when one or more of these accumulations overflow?
>
>> +
>> + sub x1, x1, #64
>> + cmp x1, #64
>> + b.ge do_loop_64
>> +
>> + add v0.4s, v0.4s, v1.4s
>> + add v2.4s, v2.4s, v3.4s
>> + add v0.4s, v0.4s, v2.4s
>> +
>> + cmp x1, #16
>> + b.lt get_64
>> +
>> +
>> +do_loop_16:
>> + ldr q6, [x0], #16
>> +
>> + uaddl v24.4s, v0.4h, v6.4h
>> + uaddl2 v25.4s, v0.8h, v6.8h
>> + add v0.4s, v24.4s, v25.4s
>> +
>> +
>> + sub x1, x1, #16
>> + cmp x1, #16
>> + b.ge do_loop_16
>> +
>> +get_64:
>> + mov x6, v0.d[0]
>> + add x5, x5, x6
>> + mov x6, v0.d[1]
>> +
>> + add x5, x5, x6
>
> Is that really more efficient than an addp (or addh) and extracting a single element?
>
>> + cmp x5, x6
>> + b.ge len_4
>> + add x5, x5, #1
>
> Is this... manual carry logic without using adds/adc? :/
>
>> +
>> +len_4:
>> + cmp x1, #4
>> + b.lt len_2
>> +
>> + sub x1, x1, #4
>> + ldr w6, [x0], #4
>> + and x6, x6, #0xffffffff
>
> What's that and for?
>
>> + add x5, x5, x6
>> + b len_4
>> +
>> +len_2:
>> + cmp x1, #2
>> + b.lt len_1
>> + sub x1, x1, #2
>> + ldrh w6, [x0], #2
>> + and x6, x6, x13
>> + add x5, x5, x6
>> +
>> +len_1:
>> + cmp x1, #1
>> + b.lt fold_32
>> + ldr x6, [x0], #1
>> +#ifdef __AARCH64EB__
>> + lsl x6, x6, #8
>> + and x6, x6, x13
>> +#else
>> + and x6, x6, #0xff
>> +#endif
>> + add x5, x5, x6
>> +
>> +fold_32:
>> + and x9, x5, x13 /* [15:0] */
>> + and x10, x13, x5, lsr #16 /* [31:16] */
>> + and x11, x13, x5, lsr #32 /* [47:32] */
>> + and x12, x13, x5, lsr #48 /* [47:32] */
>> +
>> + add x9, x9, x10
>> + add x11, x11, x12
>> +
>> + add x9, x9, x11
>> +
>> + and x10, x9, x13
>> + and x11, x13, x9, lsr #16
>> +
>> + add x5, x10, x11
>> +
>> + and x9, x5, x13 /* add carry */
>> + and x10, x13, x5, lsr #16
>> + add x5, x9, x10
>> +
>> + cbz x4, out /* addr isn't odd */
>> +
>> + lsr x6, x5, #8
>> + and x6, x6, #0xff
>> + and x7, x5, #0xff
>> + lsl x7, x7, #8
>> +
>> + orr x5, x6, x7
>
> I know folding a 32-bit partial sum to 16 bits needs at most 3 instructions (ror/add/lsr), and I can't imagine the additional odd-byte correction can need more than about 4 on top of that. As it stands, there's more code in this "optimised" fold alone than in the entire ip_fast_csum() routine.
>
>> +
>> +out:
>> + mov x0, x5
>> +
>> + /*
>> + * pop neon register from stack
>> + */
>> +/* ldp q24, q25, [sp], #0x20
>> + ldp q22, q23, [sp], #0x20
>> + ldp q20, q21, [sp], #0x20
>> + ldp q18, q19, [sp], #0x20
>> + ldp q16, q17, [sp], #0x20
>> + ldp q14, q15, [sp], #0x20
>> + ldp q12, q13, [sp], #0x20
>> + ldp q10, q11, [sp], #0x20
>> + ldp q8, q9, [sp], #0x20
>> + ldp q6, q7, [sp], #0x20
>> + ldp q4, q5, [sp], #0x20
>> + ldp q2, q3, [sp], #0x20
>> + ldp q0, q1, [sp], #0x20
>> +*/
>
> Why's this here?
>
>> + ret
>> diff --git a/lib/checksum.c b/lib/checksum.c
>> index d3ec93f..422949c 100644
>> --- a/lib/checksum.c
>> +++ b/lib/checksum.c
>> @@ -34,10 +34,8 @@
>> #include <linux/export.h>
>> #include <net/checksum.h>
>> -
>> #include <asm/byteorder.h>
>> -#ifndef do_csum
>> static inline unsigned short from32to16(unsigned int x)
>> {
>> /* add up 16-bit and 16-bit for 16+c bit */
>> @@ -47,7 +45,7 @@ static inline unsigned short from32to16(unsigned int x)
>> return x;
>> }
>> -static unsigned int do_csum(const unsigned char *buff, int len)
>> +unsigned int do_csum_generic(const unsigned char *buff, int len)
>> {
>> int odd;
>> unsigned int result = 0;
>> @@ -100,6 +98,8 @@ static unsigned int do_csum(const unsigned char *buff, int len)
>> out:
>> return result;
>> }
>> +#ifndef do_csum
>> +#define do_csum do_csum_generic
>
> AFAICS this now means that at least one architecture (hexagon) gets the generic version built in despite it being entirely redundant.
>
> Robin.
>
>> #endif
>> #ifndef ip_fast_csum
>>
>
> .
>
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] arm64: lib: accelerate do_csum() with NEON instruction
2018-11-21 9:21 [PATCH] arm64: lib: accelerate do_csum() with NEON instruction huanglingyan
2018-11-21 14:41 ` Robin Murphy
@ 2018-12-03 19:32 ` Will Deacon
1 sibling, 0 replies; 5+ messages in thread
From: Will Deacon @ 2018-12-03 19:32 UTC (permalink / raw)
To: huanglingyan; +Cc: Catalin Marinas, liuyun01, linux-arm-kernel, ard.biesheuvel
[+ Ard and Jackie]
On Wed, Nov 21, 2018 at 05:21:05PM +0800, huanglingyan wrote:
> From: Lingyan Huang <huanglingyan2@huawei.com>
>
> Function do_csum() in lib/checksum.c is used to compute checksum,
> which turns out to be slow and to consume a lot of CPU resources.
> Let's use neon instructions to accelerate the checksum computation
> for arm64.
>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will.deacon@arm.com>
> Signed-off-by: Lingyan Huang <huanglingyan2@huawei.com>
> ---
> arch/arm64/include/asm/checksum.h | 8 ++
> arch/arm64/lib/Makefile | 3 +
> arch/arm64/lib/checksum.c | 30 +++++++
> arch/arm64/lib/do_csum.S | 182 ++++++++++++++++++++++++++++++++++++++
> lib/checksum.c | 6 +-
> 5 files changed, 226 insertions(+), 3 deletions(-)
> create mode 100644 arch/arm64/lib/checksum.c
> create mode 100644 arch/arm64/lib/do_csum.S
I think we can avoid dropping into assembly for this if we build on top of
arm_neon.h for the core of the loop:
> +do_loop_64:
> +
> + ldp q5, q4, [x0], #32
> + ldp q7, q6, [x0], #32
> +
> + uadalp v0.4s, v4.8h
> + uadalp v1.4s, v5.8h
> + uadalp v2.4s, v6.8h
> + uadalp v3.4s, v7.8h
So please look at Jackie's patch for XOR checksumming as inspiration:
http://lists.infradead.org/pipermail/linux-arm-kernel/2018-November/615625.html
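For a rough idea of what that could look like, the 64-byte inner loop might
translate to intrinsics along the lines of the sketch below. This is not
code from either series - the function name and types are made up, the tail
bytes and final fold are left to the caller, and Robin's point about the
u16->u32 accumulation overflowing on very large buffers still applies:

        #include <arm_neon.h>
        #include <linux/types.h>

        static u64 csum_neon_body(const u8 *buf, unsigned int len)
        {
                uint32x4_t s0 = vdupq_n_u32(0), s1 = vdupq_n_u32(0);
                uint32x4_t s2 = vdupq_n_u32(0), s3 = vdupq_n_u32(0);
                uint64x2_t t;

                while (len >= 64) {
                        /* Pairwise-widen each vector's u16 lanes and
                         * accumulate into u32 lanes (maps to UADALP). */
                        s0 = vpadalq_u16(s0, vreinterpretq_u16_u8(vld1q_u8(buf)));
                        s1 = vpadalq_u16(s1, vreinterpretq_u16_u8(vld1q_u8(buf + 16)));
                        s2 = vpadalq_u16(s2, vreinterpretq_u16_u8(vld1q_u8(buf + 32)));
                        s3 = vpadalq_u16(s3, vreinterpretq_u16_u8(vld1q_u8(buf + 48)));
                        buf += 64;
                        len -= 64;
                }

                /* Combine the accumulators and widen to 64-bit lanes for the
                 * final horizontal add. */
                t = vpaddlq_u32(vaddq_u32(vaddq_u32(s0, s1), vaddq_u32(s2, s3)));
                return vgetq_lane_u64(t, 0) + vgetq_lane_u64(t, 1);
        }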
Thanks,
Will
_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread
Thread overview: 5+ messages
2018-11-21 9:21 [PATCH] arm64: lib: accelerate do_csum() with NEON instruction huanglingyan
2018-11-21 14:41 ` Robin Murphy
2018-11-26 11:28 ` huanglingyan (A)
2018-11-28 1:47 ` huanglingyan (A)
2018-12-03 19:32 ` Will Deacon