* [PATCH] riscv: lib: optimize strchr() with Zbb extension
@ 2026-05-07 2:06 Zongmin Zhou
2026-05-07 10:22 ` David Laight
0 siblings, 1 reply; 3+ messages in thread
From: Zongmin Zhou @ 2026-05-07 2:06 UTC (permalink / raw)
To: pjw, palmer, aou, alex; +Cc: linux-riscv, linux-kernel, Zongmin Zhou
From: Zongmin Zhou <zhouzongmin@kylinos.cn>
Add a Zbb-powered optimization to the existing strchr() implementation
using the 'orc.b' instruction, following the same pattern established
by strnlen().
The Zbb variant processes data in word-sized chunks using orc.b to
detect both NUL terminators and target characters in parallel. On
systems without Zbb support, the original byte-by-byte implementation
is used as a fallback via the alternatives mechanism.
Benchmark results (QEMU TCG, rv64):
Length | zbb=off (MB/s) | zbb=on (MB/s) | Improvement
-------|----------------|---------------|------------
1 B | 27 | 25 | -7.4%
7 B | 147 | 128 | -12.9%
16 B | 216 | 372 | +72.2%
64 B | 378 | 958 | +153.4%
512 B | 480 | 1990 | +314.6%
4096 B | 501 | 2269 | +352.9%
The regression on very short strings (1-7 bytes) is due to the fixed
overhead of the word-level path: broadcasting the target character to
all byte lanes via multiplication and checking pointer alignment before
entering the main loop. For strings shorter than one machine word, this
setup cost outweighs the benefit of parallel comparison. As string
length increases beyond 16 bytes, the word-at-a-time processing shows
significant gains.
Signed-off-by: Zongmin Zhou <zhouzongmin@kylinos.cn>
---
arch/riscv/lib/strchr.S | 115 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 115 insertions(+)
diff --git a/arch/riscv/lib/strchr.S b/arch/riscv/lib/strchr.S
index 48c3a9da53e3..600b19452bc2 100644
--- a/arch/riscv/lib/strchr.S
+++ b/arch/riscv/lib/strchr.S
@@ -6,9 +6,15 @@
#include <linux/linkage.h>
#include <asm/asm.h>
+#include <asm/alternative-macros.h>
+#include <asm/hwcap.h>
/* char *strchr(const char *s, int c) */
SYM_FUNC_START(strchr)
+
+ __ALTERNATIVE_CFG("nop", "j strchr_zbb", 0, RISCV_ISA_EXT_ZBB,
+ IS_ENABLED(CONFIG_RISCV_ISA_ZBB) && IS_ENABLED(CONFIG_TOOLCHAIN_HAS_ZBB))
+
/*
* Parameters
* a0 - The string to be searched
@@ -29,6 +35,115 @@ SYM_FUNC_START(strchr)
li a0, 0
2:
ret
+
+/*
+ * Variant of strchr() that the alternative at function entry jumps to on Zbb.
+ *
+ * Each SZREG-sized word is tested with orc.b twice: once as loaded (finds NUL
+ * bytes) and once XORed with the broadcast target (finds matching bytes).
+ */
+#if defined(CONFIG_RISCV_ISA_ZBB) && defined(CONFIG_TOOLCHAIN_HAS_ZBB)
+strchr_zbb:
+
+#ifdef CONFIG_CPU_BIG_ENDIAN
+# define CZ clz
+#else
+# define CZ ctz
+#endif
+
+.option push
+.option arch,+zbb
+
+ /*
+ * Returns
+ * a0 - Address of first occurrence of 'c' or NULL
+ *
+ * Parameters
+ * a0 - String to search
+ * a1 - Character to find
+ *
+ * Clobbers
+ * a1 (masked to its low byte), t0, t1, t2, t3, t4
+ */
+
+ /*
+ * Build t2 = target byte replicated into every byte lane:
+ * (c & 0xff) * 0x0101...01, with the constant widened to 64 bits on rv64.
+ */
+ andi a1, a1, 0xff
+ li t1, 0x01010101
+#if __riscv_xlen == 64
+ slli t2, t1, 32
+ or t1, t1, t2
+#endif
+ mul t2, a1, t1
+
+ /* t4 = all ones: what orc.b yields when no byte of its input is zero. */
+ li t4, -1
+
+ /* Already SZREG-aligned? Then skip the byte-wise prologue. */
+ andi t0, a0, SZREG-1
+ beqz t0, 2f
+
+ /* Walk bytes up to alignment; match is tested before NUL so c == 0 hits. */
+1:
+ lbu t1, 0(a0)
+ beq t1, a1, 9f
+ beqz t1, 8f
+ addi a0, a0, 1
+ andi t0, a0, SZREG-1
+ bnez t0, 1b
+
+ /* Main loop: one aligned SZREG-sized load per iteration. */
+2:
+ REG_L t0, 0(a0)
+
+ /* Any zero byte in the word? Then the string ends inside this chunk. */
+ orc.b t1, t0
+ bne t1, t4, 3f
+
+ /* XOR zeroes the bytes equal to the target; orc.b then exposes them. */
+ xor t1, t0, t2
+ orc.b t1, t1
+ bne t1, t4, 4f
+
+ addi a0, a0, SZREG
+ j 2b
+
+3:
+ /* NUL in this chunk: t1 / t3 become bit indexes of first NUL / first match. */
+ not t1, t1
+
+ xor t3, t0, t2
+ orc.b t3, t3
+ not t3, t3
+
+ CZ t3, t3
+ CZ t1, t1
+
+ /* NUL strictly first (or no match, CZ(0) == XLEN): not found. A tie means c == 0. */
+ bltu t1, t3, 8f
+
+ srli t3, t3, 3
+ add a0, a0, t3
+ ret
+
+4:
+ /* Match in a NUL-free chunk: first-match bit index >> 3 is its byte offset. */
+ not t1, t1
+ CZ t1, t1
+ srli t1, t1, 3
+ add a0, a0, t1
+ ret
+
+8:
+ /* No match before the terminator: return NULL. */
+ li a0, 0
+9:
+ ret
+
+.option pop
+#endif
SYM_FUNC_END(strchr)
SYM_FUNC_ALIAS_WEAK(__pi_strchr, strchr)
--
2.43.0
^ permalink raw reply related [flat|nested] 3+ messages in thread
* Re: [PATCH] riscv: lib: optimize strchr() with Zbb extension
2026-05-07 2:06 [PATCH] riscv: lib: optimize strchr() with Zbb extension Zongmin Zhou
@ 2026-05-07 10:22 ` David Laight
2026-05-12 7:30 ` [PATCH v2] " Zongmin Zhou
0 siblings, 1 reply; 3+ messages in thread
From: David Laight @ 2026-05-07 10:22 UTC (permalink / raw)
To: Zongmin Zhou
Cc: pjw, palmer, aou, alex, linux-riscv, linux-kernel, Zongmin Zhou
On Thu, 7 May 2026 10:06:20 +0800
Zongmin Zhou <min_halo@163.com> wrote:
> From: Zongmin Zhou <zhouzongmin@kylinos.cn>
>
> Add a Zbb-powered optimization to the existing strchr() implementation
> using the 'orc.b' instruction, following the same pattern established
> by strnlen().
>
> The Zbb variant processes data in word-sized chunks using orc.b to
> detect both NUL terminators and target characters in parallel. On
> systems without Zbb support, the original byte-by-byte implementation
> is used as a fallback via the alternatives mechanism.
>
> Benchmark results (QEMU TCG, rv64):
> Length | zbb=off (MB/s) | zbb=on (MB/s) | Improvement
> -------|----------------|---------------|------------
> 1 B | 27 | 25 | -7.4%
> 7 B | 147 | 128 | -12.9%
> 16 B | 216 | 372 | +72.2%
> 64 B | 378 | 958 | +153.4%
> 512 B | 480 | 1990 | +314.6%
> 4096 B | 501 | 2269 | +352.9%
>
> The regression on very short strings (1-7 bytes) is due to the fixed
> overhead of the word-level path: broadcasting the target character to
> all byte lanes via multiplication and checking pointer alignment before
> entering the main loop. For strings shorter than one machine word, this
> setup cost outweighs the benefit of parallel comparison. As string
> length increases beyond 16 bytes, the word-at-a-time processing shows
> significant gains.
>
> Signed-off-by: Zongmin Zhou <zhouzongmin@kylinos.cn>
> ---
> arch/riscv/lib/strchr.S | 115 ++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 115 insertions(+)
>
> diff --git a/arch/riscv/lib/strchr.S b/arch/riscv/lib/strchr.S
> index 48c3a9da53e3..600b19452bc2 100644
> --- a/arch/riscv/lib/strchr.S
> +++ b/arch/riscv/lib/strchr.S
> @@ -6,9 +6,15 @@
>
> #include <linux/linkage.h>
> #include <asm/asm.h>
> +#include <asm/alternative-macros.h>
> +#include <asm/hwcap.h>
>
> /* char *strchr(const char *s, int c) */
> SYM_FUNC_START(strchr)
> +
> + __ALTERNATIVE_CFG("nop", "j strchr_zbb", 0, RISCV_ISA_EXT_ZBB,
> + IS_ENABLED(CONFIG_RISCV_ISA_ZBB) && IS_ENABLED(CONFIG_TOOLCHAIN_HAS_ZBB))
> +
> /*
> * Parameters
> * a0 - The string to be searched
> @@ -29,6 +35,115 @@ SYM_FUNC_START(strchr)
> li a0, 0
> 2:
> ret
> +
> +/*
> + * Variant of strchr using the ZBB extension if available
> + *
> + * This implementation uses orc.b to detect both NUL terminators and target
> + * characters in parallel, processing word-sized chunks for efficiency.
> + */
> +#if defined(CONFIG_RISCV_ISA_ZBB) && defined(CONFIG_TOOLCHAIN_HAS_ZBB)
> +strchr_zbb:
> +
> +#ifdef CONFIG_CPU_BIG_ENDIAN
> +# define CZ clz
> +#else
> +# define CZ ctz
> +#endif
> +
> +.option push
> +.option arch,+zbb
> +
> + /*
> + * Returns
> + * a0 - Address of first occurrence of 'c' or NULL
> + *
> + * Parameters
> + * a0 - String to search
> + * a1 - Character to find
> + *
> + * Clobbers
> + * t0, t1, t2, t3, t4
> + */
> +
> + /*
> + * Prepare target character mask.
> + * Broadcast target character to all bytes using multiply.
> + */
> + andi a1, a1, 0xff
> + li t1, 0x01010101
> +#if __riscv_xlen == 64
> + slli t2, t1, 32
> + or t1, t1, t2
> +#endif
> + mul t2, a1, t1
> +
> + /* All-ones mask for orc.b comparisons. */
> + li t4, -1
> +
> + /* Check alignment. */
> + andi t0, a0, SZREG-1
> + beqz t0, 2f
It is almost certainly faster to jump 'out of line' for misaligned
strings and fall through for aligned ones.
> +
> + /* Handle misaligned portion byte-by-byte. */
> +1:
> + lbu t1, 0(a0)
> + beq t1, a1, 9f
> + beqz t1, 8f
> + addi a0, a0, 1
> + andi t0, a0, SZREG-1
> + bnez t0, 1b
> +
> + /* Main loop: process word-sized chunks. */
Tweak to remove a branch from the loop:
addi a0, a0, -SZREG
> +2:
> + REG_L t0, 0(a0)
Do the read first for better instruction scheduling.
REG_L t0, SZREG(a0)
addi a0, a0, SZREG
> +
> + /* Check for NUL terminator. */
> + orc.b t1, t0
> + bne t1, t4, 3f
> +
> + /* Check for target character. */
> + xor t1, t0, t2
> + orc.b t1, t1
> + bne t1, t4, 4f
beq t1, t4, 2b
and move the code at '4:' here.
-- David
> +
> + addi a0, a0, SZREG
> + j 2b
> +
> +3:
> + /* NUL found in current chunk. Check if target appears before NUL. */
> + not t1, t1
> +
> + xor t3, t0, t2
> + orc.b t3, t3
> + not t3, t3
> +
> + CZ t3, t3
> + CZ t1, t1
> +
> + /* If NUL appears before target, character not found. */
> + bltu t1, t3, 8f
> +
> + srli t3, t3, 3
> + add a0, a0, t3
> + ret
> +
> +4:
> + /* Target found in chunk without NUL. */
> + not t1, t1
> + CZ t1, t1
> + srli t1, t1, 3
> + add a0, a0, t1
> + ret
> +
> +8:
> + /* Character not found, return NULL. */
> + li a0, 0
> +9:
> + ret
> +
> +.option pop
> +#endif
> SYM_FUNC_END(strchr)
>
> SYM_FUNC_ALIAS_WEAK(__pi_strchr, strchr)
^ permalink raw reply [flat|nested] 3+ messages in thread
* [PATCH v2] riscv: lib: optimize strchr() with Zbb extension
2026-05-07 10:22 ` David Laight
@ 2026-05-12 7:30 ` Zongmin Zhou
0 siblings, 0 replies; 3+ messages in thread
From: Zongmin Zhou @ 2026-05-12 7:30 UTC (permalink / raw)
To: david.laight.linux
Cc: alex, aou, linux-kernel, linux-riscv, palmer, pjw, Zongmin Zhou
From: Zongmin Zhou <zhouzongmin@kylinos.cn>
Add a Zbb-powered optimization to the existing strchr() implementation
using the 'orc.b' instruction, following the same pattern established
by strnlen().
The Zbb variant processes data in word-sized chunks using orc.b to
detect both NUL terminators and target characters in parallel. On
systems without Zbb support, the original byte-by-byte implementation
is used as a fallback via the alternatives mechanism.
Benchmark results (QEMU TCG, rv64):
Length | zbb=off (MB/s) | zbb=on (MB/s) | Improvement
-------|----------------|---------------|------------
1 B | 27 | 24 | -11.1%
7 B | 148 | 125 | -15.5%
16 B | 218 | 363 | +66.5%
64 B | 384 | 1044 | +171.9%
512 B | 424 | 2081 | +390.8%
4096 B | 498 | 2636 | +429.3%
The regression on very short strings (1-7 bytes) is due to the fixed
overhead of the word-level path: broadcasting the target character to
all byte lanes via multiplication and checking pointer alignment before
entering the main loop. For strings shorter than one machine word, this
setup cost outweighs the benefit of parallel comparison. As string
length increases beyond 16 bytes, the word-at-a-time processing shows
significant gains.
Suggested-by: David Laight <david.laight.linux@gmail.com>
Signed-off-by: Zongmin Zhou <zhouzongmin@kylinos.cn>
---
Changes in v2:
- Move alignment handling out-of-line (optimize hot path).
- Hide load latency by reordering REG_L and addi.
- Simplify loop branching.
arch/riscv/lib/strchr.S | 119 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 119 insertions(+)
diff --git a/arch/riscv/lib/strchr.S b/arch/riscv/lib/strchr.S
index 48c3a9da53e3..b7abf8b609b6 100644
--- a/arch/riscv/lib/strchr.S
+++ b/arch/riscv/lib/strchr.S
@@ -6,9 +6,15 @@
#include <linux/linkage.h>
#include <asm/asm.h>
+#include <asm/alternative-macros.h>
+#include <asm/hwcap.h>
/* char *strchr(const char *s, int c) */
SYM_FUNC_START(strchr)
+
+ __ALTERNATIVE_CFG("nop", "j strchr_zbb", 0, RISCV_ISA_EXT_ZBB,
+ IS_ENABLED(CONFIG_RISCV_ISA_ZBB) && IS_ENABLED(CONFIG_TOOLCHAIN_HAS_ZBB))
+
/*
* Parameters
* a0 - The string to be searched
@@ -29,6 +35,119 @@ SYM_FUNC_START(strchr)
li a0, 0
2:
ret
+
+/*
+ * Variant of strchr() that the alternative at function entry jumps to on Zbb.
+ *
+ * Each SZREG-sized word is tested with orc.b twice: once as loaded (finds NUL
+ * bytes) and once XORed with the broadcast target (finds matching bytes).
+ */
+#if defined(CONFIG_RISCV_ISA_ZBB) && defined(CONFIG_TOOLCHAIN_HAS_ZBB)
+strchr_zbb:
+
+#ifdef CONFIG_CPU_BIG_ENDIAN
+# define CZ clz
+#else
+# define CZ ctz
+#endif
+
+.option push
+.option arch,+zbb
+
+ /*
+ * Returns
+ * a0 - Address of first occurrence of 'c' or NULL
+ *
+ * Parameters
+ * a0 - String to search
+ * a1 - Character to find
+ *
+ * Clobbers
+ * a1 (masked to its low byte), t0, t1, t2, t3, t4
+ */
+
+ /*
+ * Build t2 = target byte replicated into every byte lane:
+ * (c & 0xff) * 0x0101...01, with the constant widened to 64 bits on rv64.
+ */
+ andi a1, a1, 0xff
+ li t1, 0x01010101
+#if __riscv_xlen == 64
+ slli t2, t1, 32
+ or t1, t1, t2
+#endif
+ mul t2, a1, t1
+
+ /* t4 = all ones: what orc.b yields when no byte of its input is zero. */
+ li t4, -1
+
+ /* Misaligned start is the rare case: handle it out of line at 10. */
+ andi t0, a0, SZREG-1
+ bnez t0, 10f
+
+ /* Aligned entry: pre-bias a0 by -SZREG because the loop loads SZREG(a0). */
+ addi a0, a0, -SZREG
+
+2:
+ /* Main loop: load the next chunk, then step a0 onto that chunk. */
+ REG_L t0, SZREG(a0)
+ addi a0, a0, SZREG
+
+ /* Any zero byte in the word? Then the string ends inside this chunk. */
+ orc.b t1, t0
+ bne t1, t4, 3f
+
+ /* XOR zeroes the bytes equal to the target; all ones means none matched. */
+ xor t1, t0, t2
+ orc.b t1, t1
+ beq t1, t4, 2b
+
+ /* Match in a NUL-free chunk: first-match bit index >> 3 is its byte offset. */
+ not t1, t1
+ CZ t1, t1
+ srli t1, t1, 3
+ add a0, a0, t1
+ ret
+
+3:
+ /* NUL in this chunk: t1 / t3 become bit indexes of first NUL / first match. */
+ not t1, t1
+
+ xor t3, t0, t2
+ orc.b t3, t3
+ not t3, t3
+
+ CZ t3, t3
+ CZ t1, t1
+
+ /* NUL strictly first (or no match, CZ(0) == XLEN): not found. A tie means c == 0. */
+ bltu t1, t3, 8f
+
+ srli t3, t3, 3
+ add a0, a0, t3
+ ret
+
+8:
+ /* No match before the terminator: return NULL. */
+ li a0, 0
+9:
+ ret
+
+ /* Out-of-line prologue: walk bytes to alignment; match is tested before NUL. */
+10:
+ lbu t1, 0(a0)
+ beq t1, a1, 9b
+ beqz t1, 8b
+ addi a0, a0, 1
+ andi t0, a0, SZREG-1
+ bnez t0, 10b
+
+ /* Alignment reached: apply the same -SZREG pre-bias and join the main loop. */
+ addi a0, a0, -SZREG
+ j 2b
+
+.option pop
+#endif
SYM_FUNC_END(strchr)
SYM_FUNC_ALIAS_WEAK(__pi_strchr, strchr)
--
2.43.0
^ permalink raw reply related [flat|nested] 3+ messages in thread
end of thread, other threads:[~2026-05-12 7:32 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-05-07 2:06 [PATCH] riscv: lib: optimize strchr() with Zbb extension Zongmin Zhou
2026-05-07 10:22 ` David Laight
2026-05-12 7:30 ` [PATCH v2] " Zongmin Zhou
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox