* [PATCH v2] riscv: lib: optimize memcmp with ld insn
@ 2022-09-02 11:00 Yipeng Zou
2022-09-05 11:49 ` Andrew Jones
0 siblings, 1 reply; 4+ messages in thread
From: Yipeng Zou @ 2022-09-02 11:00 UTC (permalink / raw)
To: linux-riscv, paul.walmsley, palmer, aou, Conor.Dooley
Cc: zouyipeng, liaochang1, chris.zjh
Currently memcmp is implemented in C code (lib/string.c), which compares
memory byte by byte.
This patch uses the ld instruction to compare memory one word at a time,
which improves performance. From the test results, this is several times faster.
Alloc 8,4,1KB buffer to compare, each loop 10k times:
Size(B) Min(ns) AVG(ns) //before
8k 40800 46316
4k 26500 32302
1k 15600 17965
Size(B) Min(ns) AVG(ns) //after
8k 16100 21281
4k 14200 16446
1k 12400 14316
Signed-off-by: Yipeng Zou <zouyipeng@huawei.com>
Reviewed-by: Conor Dooley <conor.dooley@microchip.com>
---
V2: Patch test data into the commit message,and collect Reviewed-by Tags.
arch/riscv/include/asm/string.h | 3 ++
arch/riscv/lib/Makefile | 1 +
arch/riscv/lib/memcmp.S | 59 +++++++++++++++++++++++++++++++++
3 files changed, 63 insertions(+)
create mode 100644 arch/riscv/lib/memcmp.S
diff --git a/arch/riscv/include/asm/string.h b/arch/riscv/include/asm/string.h
index 909049366555..3337b43d3803 100644
--- a/arch/riscv/include/asm/string.h
+++ b/arch/riscv/include/asm/string.h
@@ -18,6 +18,9 @@ extern asmlinkage void *__memcpy(void *, const void *, size_t);
#define __HAVE_ARCH_MEMMOVE
extern asmlinkage void *memmove(void *, const void *, size_t);
extern asmlinkage void *__memmove(void *, const void *, size_t);
+#define __HAVE_ARCH_MEMCMP
+extern int memcmp(const void *, const void *, size_t);
+
/* For those files which don't want to check by kasan. */
#if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)
#define memcpy(dst, src, len) __memcpy(dst, src, len)
diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile
index 25d5c9664e57..70773bf0c471 100644
--- a/arch/riscv/lib/Makefile
+++ b/arch/riscv/lib/Makefile
@@ -3,6 +3,7 @@ lib-y += delay.o
lib-y += memcpy.o
lib-y += memset.o
lib-y += memmove.o
+lib-y += memcmp.o
lib-$(CONFIG_MMU) += uaccess.o
lib-$(CONFIG_64BIT) += tishift.o
diff --git a/arch/riscv/lib/memcmp.S b/arch/riscv/lib/memcmp.S
new file mode 100644
index 000000000000..83af1c433e6f
--- /dev/null
+++ b/arch/riscv/lib/memcmp.S
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2022 zouyipeng@huawei.com
+ */
+#include <linux/linkage.h>
+#include <asm-generic/export.h>
+#include <asm/asm.h>
+
+/* argrments:
+* a0: addr0
+* a1: addr1
+* a2: size
+*/
+#define addr0 a0
+#define addr1 a1
+#define limit a2
+
+#define data0 a3
+#define data1 a4
+#define tmp t3
+#define aaddr t4
+#define return a0
+
+/* load and compare */
+.macro LD_CMP op d0 d1 a0 a1 offset
+ \op \d0, 0(\a0)
+ \op \d1, 0(\a1)
+ addi \a0, \a0, \offset
+ addi \a1, \a1, \offset
+ sub tmp, \d0, \d1
+.endm
+
+ENTRY(memcmp)
+ /* test limit aligend with SZREG */
+ andi tmp, limit, SZREG - 1
+ /* load tail */
+ add aaddr, addr0, limit
+ sub aaddr, aaddr, tmp
+ add limit, addr0, limit
+
+.LloopWord:
+ sltu tmp, addr0, aaddr
+ beqz tmp, .LloopByte
+
+ LD_CMP REG_L data0 data1 addr0 addr1 SZREG
+ beqz tmp, .LloopWord
+ j .Lreturn
+
+.LloopByte:
+ sltu tmp, addr0, limit
+ beqz tmp, .Lreturn
+
+ LD_CMP lbu data0 data1 addr0 addr1 1
+ beqz tmp, .LloopByte
+.Lreturn:
+ mv return, tmp
+ ret
+END(memcmp)
+EXPORT_SYMBOL(memcmp);
--
2.17.1
_______________________________________________
linux-riscv mailing list
linux-riscv@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-riscv
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [PATCH v2] riscv: lib: optimize memcmp with ld insn
2022-09-02 11:00 [PATCH v2] riscv: lib: optimize memcmp with ld insn Yipeng Zou
@ 2022-09-05 11:49 ` Andrew Jones
2022-09-05 11:58 ` Conor.Dooley
2022-09-06 2:15 ` Yipeng Zou
0 siblings, 2 replies; 4+ messages in thread
From: Andrew Jones @ 2022-09-05 11:49 UTC (permalink / raw)
To: Yipeng Zou
Cc: linux-riscv, paul.walmsley, palmer, aou, Conor.Dooley, liaochang1,
chris.zjh
On Fri, Sep 02, 2022 at 07:00:39PM +0800, Yipeng Zou wrote:
> Currently memcmp was implemented in c code(lib/string.c), which compare
> memory per byte.
>
> This patch use ld insn compare memory per word to improve. From the test
> Results, this will take several times optimized.
>
> Alloc 8,4,1KB buffer to compare, each loop 10k times:
>
> Size(B) Min(ns) AVG(ns) //before
>
> 8k 40800 46316
> 4k 26500 32302
> 1k 15600 17965
>
> Size(B) Min(ns) AVG(ns) //after
>
> 8k 16100 21281
> 4k 14200 16446
> 1k 12400 14316
>
> Signed-off-by: Yipeng Zou <zouyipeng@huawei.com>
> Reviewed-by: Conor Dooley <conor.dooley@microchip.com>
> ---
> V2: Patch test data into the commit message,and collect Reviewed-by Tags.
>
> arch/riscv/include/asm/string.h | 3 ++
> arch/riscv/lib/Makefile | 1 +
> arch/riscv/lib/memcmp.S | 59 +++++++++++++++++++++++++++++++++
> 3 files changed, 63 insertions(+)
> create mode 100644 arch/riscv/lib/memcmp.S
>
> diff --git a/arch/riscv/include/asm/string.h b/arch/riscv/include/asm/string.h
> index 909049366555..3337b43d3803 100644
> --- a/arch/riscv/include/asm/string.h
> +++ b/arch/riscv/include/asm/string.h
> @@ -18,6 +18,9 @@ extern asmlinkage void *__memcpy(void *, const void *, size_t);
> #define __HAVE_ARCH_MEMMOVE
> extern asmlinkage void *memmove(void *, const void *, size_t);
> extern asmlinkage void *__memmove(void *, const void *, size_t);
> +#define __HAVE_ARCH_MEMCMP
> +extern int memcmp(const void *, const void *, size_t);
> +
> /* For those files which don't want to check by kasan. */
> #if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)
> #define memcpy(dst, src, len) __memcpy(dst, src, len)
> diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile
> index 25d5c9664e57..70773bf0c471 100644
> --- a/arch/riscv/lib/Makefile
> +++ b/arch/riscv/lib/Makefile
> @@ -3,6 +3,7 @@ lib-y += delay.o
> lib-y += memcpy.o
> lib-y += memset.o
> lib-y += memmove.o
> +lib-y += memcmp.o
> lib-$(CONFIG_MMU) += uaccess.o
> lib-$(CONFIG_64BIT) += tishift.o
>
> diff --git a/arch/riscv/lib/memcmp.S b/arch/riscv/lib/memcmp.S
> new file mode 100644
> index 000000000000..83af1c433e6f
> --- /dev/null
> +++ b/arch/riscv/lib/memcmp.S
> @@ -0,0 +1,59 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (C) 2022 zouyipeng@huawei.com
> + */
> +#include <linux/linkage.h>
> +#include <asm-generic/export.h>
> +#include <asm/asm.h>
> +
> +/* argrments:
arguments
and please use an opening wing (/*)
> +* a0: addr0
> +* a1: addr1
> +* a2: size
> +*/
> +#define addr0 a0
> +#define addr1 a1
> +#define limit a2
> +
> +#define data0 a3
> +#define data1 a4
> +#define tmp t3
> +#define aaddr t4
> +#define return a0
I don't see much value in these defines. Why is addr0 better than a0?
data0 and data1 are just temp registers. Defining 'return' seems really
unnecessary. And, while a define for "the word end address" does make
some sense, I can't figure out what the first 'a' "aaddr" is supposed to
mean such that it conveys "the word end address". In general, if we're
going to use defines let's not make them as cryptic or even more cryptic
than the assembler itself :-)
> +
> +/* load and compare */
> +.macro LD_CMP op d0 d1 a0 a1 offset
> + \op \d0, 0(\a0)
> + \op \d1, 0(\a1)
> + addi \a0, \a0, \offset
> + addi \a1, \a1, \offset
> + sub tmp, \d0, \d1
I'd prefer not to have a side-effect on tmp. Why not take this output
register as another input to the macro?
> +.endm
> +
> +ENTRY(memcmp)
> + /* test limit aligend with SZREG */
is aligned
> + andi tmp, limit, SZREG - 1
> + /* load tail */
> + add aaddr, addr0, limit
> + sub aaddr, aaddr, tmp
> + add limit, addr0, limit
> +
> +.LloopWord:
> + sltu tmp, addr0, aaddr
> + beqz tmp, .LloopByte
> +
> + LD_CMP REG_L data0 data1 addr0 addr1 SZREG
> + beqz tmp, .LloopWord
> + j .Lreturn
> +
> +.LloopByte:
> + sltu tmp, addr0, limit
> + beqz tmp, .Lreturn
> +
> + LD_CMP lbu data0 data1 addr0 addr1 1
> + beqz tmp, .LloopByte
> +.Lreturn:
> + mv return, tmp
> + ret
> +END(memcmp)
> +EXPORT_SYMBOL(memcmp);
> --
> 2.17.1
>
I'd prefer an assembly format where the operands are lined up
op r1, r2
op r1, r2
Unfortunately I don't see much consistency in how to do this among other
riscv .S files, though, so I'm not sure what to recommend if anything.
Thanks,
drew
_______________________________________________
linux-riscv mailing list
linux-riscv@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-riscv
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH v2] riscv: lib: optimize memcmp with ld insn
2022-09-05 11:49 ` Andrew Jones
@ 2022-09-05 11:58 ` Conor.Dooley
2022-09-06 2:15 ` Yipeng Zou
1 sibling, 0 replies; 4+ messages in thread
From: Conor.Dooley @ 2022-09-05 11:58 UTC (permalink / raw)
To: ajones, zouyipeng
Cc: linux-riscv, paul.walmsley, palmer, aou, liaochang1, chris.zjh
On 05/09/2022 12:49, Andrew Jones wrote:
>
> I'd prefer an assembly format were the operands are lined up
>
> op r1, r2
> op r1, r2
>
> Unfortunately I don't see much consistency in how to do this among other
> riscv .S files, though, so I'm not sure what to recommend if anything.
Gotta start somewhere for consistency & we can drift to something that
is actually consistent over time /shrug
_______________________________________________
linux-riscv mailing list
linux-riscv@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-riscv
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH v2] riscv: lib: optimize memcmp with ld insn
2022-09-05 11:49 ` Andrew Jones
2022-09-05 11:58 ` Conor.Dooley
@ 2022-09-06 2:15 ` Yipeng Zou
1 sibling, 0 replies; 4+ messages in thread
From: Yipeng Zou @ 2022-09-06 2:15 UTC (permalink / raw)
To: Andrew Jones
Cc: linux-riscv, paul.walmsley, palmer, aou, Conor.Dooley, liaochang1,
chris.zjh
在 2022/9/5 19:49, Andrew Jones 写道:
> On Fri, Sep 02, 2022 at 07:00:39PM +0800, Yipeng Zou wrote:
>> Currently memcmp was implemented in c code(lib/string.c), which compare
>> memory per byte.
>>
>> This patch use ld insn compare memory per word to improve. From the test
>> Results, this will take several times optimized.
>>
>> Alloc 8,4,1KB buffer to compare, each loop 10k times:
>>
>> Size(B) Min(ns) AVG(ns) //before
>>
>> 8k 40800 46316
>> 4k 26500 32302
>> 1k 15600 17965
>>
>> Size(B) Min(ns) AVG(ns) //after
>>
>> 8k 16100 21281
>> 4k 14200 16446
>> 1k 12400 14316
>>
>> Signed-off-by: Yipeng Zou <zouyipeng@huawei.com>
>> Reviewed-by: Conor Dooley <conor.dooley@microchip.com>
>> ---
>> V2: Patch test data into the commit message,and collect Reviewed-by Tags.
>>
>> arch/riscv/include/asm/string.h | 3 ++
>> arch/riscv/lib/Makefile | 1 +
>> arch/riscv/lib/memcmp.S | 59 +++++++++++++++++++++++++++++++++
>> 3 files changed, 63 insertions(+)
>> create mode 100644 arch/riscv/lib/memcmp.S
>>
>> diff --git a/arch/riscv/include/asm/string.h b/arch/riscv/include/asm/string.h
>> index 909049366555..3337b43d3803 100644
>> --- a/arch/riscv/include/asm/string.h
>> +++ b/arch/riscv/include/asm/string.h
>> @@ -18,6 +18,9 @@ extern asmlinkage void *__memcpy(void *, const void *, size_t);
>> #define __HAVE_ARCH_MEMMOVE
>> extern asmlinkage void *memmove(void *, const void *, size_t);
>> extern asmlinkage void *__memmove(void *, const void *, size_t);
>> +#define __HAVE_ARCH_MEMCMP
>> +extern int memcmp(const void *, const void *, size_t);
>> +
>> /* For those files which don't want to check by kasan. */
>> #if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)
>> #define memcpy(dst, src, len) __memcpy(dst, src, len)
>> diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile
>> index 25d5c9664e57..70773bf0c471 100644
>> --- a/arch/riscv/lib/Makefile
>> +++ b/arch/riscv/lib/Makefile
>> @@ -3,6 +3,7 @@ lib-y += delay.o
>> lib-y += memcpy.o
>> lib-y += memset.o
>> lib-y += memmove.o
>> +lib-y += memcmp.o
>> lib-$(CONFIG_MMU) += uaccess.o
>> lib-$(CONFIG_64BIT) += tishift.o
>>
>> diff --git a/arch/riscv/lib/memcmp.S b/arch/riscv/lib/memcmp.S
>> new file mode 100644
>> index 000000000000..83af1c433e6f
>> --- /dev/null
>> +++ b/arch/riscv/lib/memcmp.S
>> @@ -0,0 +1,59 @@
>> +/* SPDX-License-Identifier: GPL-2.0-only */
>> +/*
>> + * Copyright (C) 2022 zouyipeng@huawei.com
>> + */
>> +#include <linux/linkage.h>
>> +#include <asm-generic/export.h>
>> +#include <asm/asm.h>
>> +
>> +/* argrments:
> arguments
>
> and please use an opening wing (/*)
ok,
>
>
>> +* a0: addr0
>> +* a1: addr1
>> +* a2: size
>> +*/
>> +#define addr0 a0
>> +#define addr1 a1
>> +#define limit a2
>> +
>> +#define data0 a3
>> +#define data1 a4
>> +#define tmp t3
>> +#define aaddr t4
>> +#define return a0
> I don't see much value in these defines. Why is addr0 better than a0?
> data0 and data1 are just temp registers. Defining 'return' seems really
> unnecessary. And, while a define for "the word end address" does make
> some sense, I can't figure out what the first 'a' "aaddr" is supposed to
> mean such that it conveys "the word end address". In general, if we're
> going to use defines let's not make them as cryptic or even more cryptic
> than the assembler itself :-)
The naming of the registers does need improvement.
I will make it more reasonable on the next version.
>> +
>> +/* load and compare */
>> +.macro LD_CMP op d0 d1 a0 a1 offset
>> + \op \d0, 0(\a0)
>> + \op \d1, 0(\a1)
>> + addi \a0, \a0, \offset
>> + addi \a1, \a1, \offset
>> + sub tmp, \d0, \d1
> I'd prefer not to have a side-effect on tmp. Why not take this output
> register as another input to the macro?
oh, this definitely needs to be modified here.
>> +.endm
>> +
>> +ENTRY(memcmp)
>> + /* test limit aligend with SZREG */
> is aligned
ok,
>> + andi tmp, limit, SZREG - 1
>> + /* load tail */
>> + add aaddr, addr0, limit
>> + sub aaddr, aaddr, tmp
>> + add limit, addr0, limit
>> +
>> +.LloopWord:
>> + sltu tmp, addr0, aaddr
>> + beqz tmp, .LloopByte
>> +
>> + LD_CMP REG_L data0 data1 addr0 addr1 SZREG
>> + beqz tmp, .LloopWord
>> + j .Lreturn
>> +
>> +.LloopByte:
>> + sltu tmp, addr0, limit
>> + beqz tmp, .Lreturn
>> +
>> + LD_CMP lbu data0 data1 addr0 addr1 1
>> + beqz tmp, .LloopByte
>> +.Lreturn:
>> + mv return, tmp
>> + ret
>> +END(memcmp)
>> +EXPORT_SYMBOL(memcmp);
>> --
>> 2.17.1
>>
> I'd prefer an assembly format were the operands are lined up
>
> op r1, r2
> op r1, r2
>
> Unfortunately I don't see much consistency in how to do this among other
> riscv .S files, though, so I'm not sure what to recommend if anything.
I agree with that; maybe we should get started on this with this patch.
For above all , thanks for review.
Will send next version as soon as possible.
>
> Thanks,
> drew
--
Regards,
Yipeng Zou
_______________________________________________
linux-riscv mailing list
linux-riscv@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-riscv
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2022-09-06 2:15 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2022-09-02 11:00 [PATCH v2] riscv: lib: optimize memcmp with ld insn Yipeng Zou
2022-09-05 11:49 ` Andrew Jones
2022-09-05 11:58 ` Conor.Dooley
2022-09-06 2:15 ` Yipeng Zou
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox