linux-riscv.lists.infradead.org archive mirror
* [PATCH 0/2] Implementing bitops rotate using riscv Zbb extension
@ 2025-06-20 11:16 cp0613
  2025-06-20 11:16 ` [PATCH 1/2] bitops: generic rotate cp0613
  2025-06-20 11:16 ` [PATCH 2/2] bitops: rotate: Add riscv implementation using Zbb extension cp0613
  0 siblings, 2 replies; 21+ messages in thread
From: cp0613 @ 2025-06-20 11:16 UTC (permalink / raw)
  To: yury.norov, linux, arnd, paul.walmsley, palmer, aou, alex
  Cc: linux-riscv, linux-arch, linux-kernel, Chen Pei

From: Chen Pei <cp0613@linux.alibaba.com>

This patch series moves the ror*/rol* functions from include/linux/bitops.h
to include/asm-generic/bitops/rotate.h as a generic implementation.

On top of that, an optimized implementation is added for RISC-V, based on the
bitwise rotation instructions provided by the Zbb extension[1].

I tested the performance of sha3_generic on a RISC-V XUANTIE C908 processor.
Compared with the generic implementation, the Zbb-based implementation improved
performance by 6.87% on average.
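
For context, sha3_generic spends most of its time in the Keccak permutation
(keccakf() in crypto/sha3_generic.c), whose hot loop is dominated by 64-bit
rotates. A simplified sketch of that kind of loop (illustrative only, with
made-up rotation amounts) looks like:

```
static void keccak_like_step(u64 state[25])
{
	unsigned int i;

	/* every lane is rotated each round, so rol64() is on the hot path */
	for (i = 1; i < 25; i++)
		state[i] = rol64(state[i], (7 * i) % 64);
}
```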

Test method:
1. CONFIG_CRYPTO_TEST=m
2. modprobe tcrypt mode=322 sec=3
The test exercises sha3-224 with a range of different parameters.

[1] https://github.com/riscv/riscv-bitmanip/

Chen Pei (2):
  bitops: generic rotate
  bitops: rotate: Add riscv implementation using Zbb extension

 arch/riscv/include/asm/bitops.h     | 127 ++++++++++++++++++++++++++++
 include/asm-generic/bitops.h        |   2 +-
 include/asm-generic/bitops/rotate.h |  97 +++++++++++++++++++++
 include/linux/bitops.h              |  80 ------------------
 tools/include/asm-generic/bitops.h  |   2 +-
 5 files changed, 226 insertions(+), 82 deletions(-)
 create mode 100644 include/asm-generic/bitops/rotate.h

-- 
2.49.0



^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH 1/2] bitops: generic rotate
  2025-06-20 11:16 [PATCH 0/2] Implementing bitops rotate using riscv Zbb extension cp0613
@ 2025-06-20 11:16 ` cp0613
  2025-06-20 15:47   ` kernel test robot
  2025-06-23 11:59   ` kernel test robot
  2025-06-20 11:16 ` [PATCH 2/2] bitops: rotate: Add riscv implementation using Zbb extension cp0613
  1 sibling, 2 replies; 21+ messages in thread
From: cp0613 @ 2025-06-20 11:16 UTC (permalink / raw)
  To: yury.norov, linux, arnd, paul.walmsley, palmer, aou, alex
  Cc: linux-riscv, linux-arch, linux-kernel, Chen Pei

From: Chen Pei <cp0613@linux.alibaba.com>

Introduce a generic header for the bitops rotate helpers, moving the ror*()
and rol*() functions from include/linux/bitops.h to
include/asm-generic/bitops/rotate.h so that architectures can override them.

Signed-off-by: Chen Pei <cp0613@linux.alibaba.com>
---
 include/asm-generic/bitops.h        |  2 +-
 include/asm-generic/bitops/rotate.h | 98 +++++++++++++++++++++++++++++
 include/linux/bitops.h              | 80 -----------------------
 tools/include/asm-generic/bitops.h  |  2 +-
 4 files changed, 100 insertions(+), 82 deletions(-)
 create mode 100644 include/asm-generic/bitops/rotate.h

diff --git a/include/asm-generic/bitops.h b/include/asm-generic/bitops.h
index a47b8a71d6fe..8f30aac8325c 100644
--- a/include/asm-generic/bitops.h
+++ b/include/asm-generic/bitops.h
@@ -29,7 +29,7 @@
 #include <asm-generic/bitops/ffs.h>
 #include <asm-generic/bitops/hweight.h>
 #include <asm-generic/bitops/lock.h>
-
+#include <asm-generic/bitops/rotate.h>
 #include <asm-generic/bitops/atomic.h>
 #include <asm-generic/bitops/non-atomic.h>
 #include <asm-generic/bitops/le.h>
diff --git a/include/asm-generic/bitops/rotate.h b/include/asm-generic/bitops/rotate.h
new file mode 100644
index 000000000000..65449fefb402
--- /dev/null
+++ b/include/asm-generic/bitops/rotate.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_GENERIC_BITOPS_ROTATE_H_
+#define _ASM_GENERIC_BITOPS_ROTATE_H_
+
+#include <asm/types.h>
+
+/**
+ * generic_rol64 - rotate a 64-bit value left
+ * @word: value to rotate
+ * @shift: bits to roll
+ */
+static __always_inline u64 generic_rol64(u64 word, unsigned int shift)
+{
+	return (word << (shift & 63)) | (word >> ((-shift) & 63));
+}
+
+/**
+ * generic_ror64 - rotate a 64-bit value right
+ * @word: value to rotate
+ * @shift: bits to roll
+ */
+static __always_inline u64 generic_ror64(u64 word, unsigned int shift)
+{
+	return (word >> (shift & 63)) | (word << ((-shift) & 63));
+}
+
+/**
+ * generic_rol32 - rotate a 32-bit value left
+ * @word: value to rotate
+ * @shift: bits to roll
+ */
+static __always_inline u32 generic_rol32(u32 word, unsigned int shift)
+{
+	return (word << (shift & 31)) | (word >> ((-shift) & 31));
+}
+
+/**
+ * generic_ror32 - rotate a 32-bit value right
+ * @word: value to rotate
+ * @shift: bits to roll
+ */
+static __always_inline u32 generic_ror32(u32 word, unsigned int shift)
+{
+	return (word >> (shift & 31)) | (word << ((-shift) & 31));
+}
+
+/**
+ * generic_rol16 - rotate a 16-bit value left
+ * @word: value to rotate
+ * @shift: bits to roll
+ */
+static __always_inline u16 generic_rol16(u16 word, unsigned int shift)
+{
+	return (word << (shift & 15)) | (word >> ((-shift) & 15));
+}
+
+/**
+ * generic_ror16 - rotate a 16-bit value right
+ * @word: value to rotate
+ * @shift: bits to roll
+ */
+static __always_inline u16 generic_ror16(u16 word, unsigned int shift)
+{
+	return (word >> (shift & 15)) | (word << ((-shift) & 15));
+}
+
+/**
+ * generic_rol8 - rotate an 8-bit value left
+ * @word: value to rotate
+ * @shift: bits to roll
+ */
+static __always_inline u8 generic_rol8(u8 word, unsigned int shift)
+{
+	return (word << (shift & 7)) | (word >> ((-shift) & 7));
+}
+
+/**
+ * generic_ror8 - rotate an 8-bit value right
+ * @word: value to rotate
+ * @shift: bits to roll
+ */
+static __always_inline u8 generic_ror8(u8 word, unsigned int shift)
+{
+	return (word >> (shift & 7)) | (word << ((-shift) & 7));
+}
+
+#ifndef __HAVE_ARCH_ROTATE
+#define rol64(word, shift) generic_rol64(word, shift)
+#define ror64(word, shift) generic_ror64(word, shift)
+#define rol32(word, shift) generic_rol32(word, shift)
+#define ror32(word, shift) generic_ror32(word, shift)
+#define rol16(word, shift) generic_rol16(word, shift)
+#define ror16(word, shift) generic_ror16(word, shift)
+#define rol8(word, shift)  generic_rol8(word, shift)
+#define ror8(word, shift)  generic_ror8(word, shift)
+#endif
+
+#endif /* _ASM_GENERIC_BITOPS_ROTATE_H_ */
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index c1cb53cf2f0f..1f8ef472cfb3 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -97,86 +97,6 @@ static __always_inline unsigned long hweight_long(unsigned long w)
 	return sizeof(w) == 4 ? hweight32(w) : hweight64((__u64)w);
 }
 
-/**
- * rol64 - rotate a 64-bit value left
- * @word: value to rotate
- * @shift: bits to roll
- */
-static inline __u64 rol64(__u64 word, unsigned int shift)
-{
-	return (word << (shift & 63)) | (word >> ((-shift) & 63));
-}
-
-/**
- * ror64 - rotate a 64-bit value right
- * @word: value to rotate
- * @shift: bits to roll
- */
-static inline __u64 ror64(__u64 word, unsigned int shift)
-{
-	return (word >> (shift & 63)) | (word << ((-shift) & 63));
-}
-
-/**
- * rol32 - rotate a 32-bit value left
- * @word: value to rotate
- * @shift: bits to roll
- */
-static inline __u32 rol32(__u32 word, unsigned int shift)
-{
-	return (word << (shift & 31)) | (word >> ((-shift) & 31));
-}
-
-/**
- * ror32 - rotate a 32-bit value right
- * @word: value to rotate
- * @shift: bits to roll
- */
-static inline __u32 ror32(__u32 word, unsigned int shift)
-{
-	return (word >> (shift & 31)) | (word << ((-shift) & 31));
-}
-
-/**
- * rol16 - rotate a 16-bit value left
- * @word: value to rotate
- * @shift: bits to roll
- */
-static inline __u16 rol16(__u16 word, unsigned int shift)
-{
-	return (word << (shift & 15)) | (word >> ((-shift) & 15));
-}
-
-/**
- * ror16 - rotate a 16-bit value right
- * @word: value to rotate
- * @shift: bits to roll
- */
-static inline __u16 ror16(__u16 word, unsigned int shift)
-{
-	return (word >> (shift & 15)) | (word << ((-shift) & 15));
-}
-
-/**
- * rol8 - rotate an 8-bit value left
- * @word: value to rotate
- * @shift: bits to roll
- */
-static inline __u8 rol8(__u8 word, unsigned int shift)
-{
-	return (word << (shift & 7)) | (word >> ((-shift) & 7));
-}
-
-/**
- * ror8 - rotate an 8-bit value right
- * @word: value to rotate
- * @shift: bits to roll
- */
-static inline __u8 ror8(__u8 word, unsigned int shift)
-{
-	return (word >> (shift & 7)) | (word << ((-shift) & 7));
-}
-
 /**
  * sign_extend32 - sign extend a 32-bit value using specified bit as sign-bit
  * @value: value to sign extend
diff --git a/tools/include/asm-generic/bitops.h b/tools/include/asm-generic/bitops.h
index 9ab313e93555..bfa06c6babe3 100644
--- a/tools/include/asm-generic/bitops.h
+++ b/tools/include/asm-generic/bitops.h
@@ -24,7 +24,7 @@
 #endif
 
 #include <asm-generic/bitops/hweight.h>
-
+#include <asm-generic/bitops/rotate.h>
 #include <asm-generic/bitops/atomic.h>
 #include <asm-generic/bitops/non-atomic.h>
 
-- 
2.49.0



^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 2/2] bitops: rotate: Add riscv implementation using Zbb extension
  2025-06-20 11:16 [PATCH 0/2] Implementing bitops rotate using riscv Zbb extension cp0613
  2025-06-20 11:16 ` [PATCH 1/2] bitops: generic rotate cp0613
@ 2025-06-20 11:16 ` cp0613
  2025-06-20 16:20   ` Yury Norov
  1 sibling, 1 reply; 21+ messages in thread
From: cp0613 @ 2025-06-20 11:16 UTC (permalink / raw)
  To: yury.norov, linux, arnd, paul.walmsley, palmer, aou, alex
  Cc: linux-riscv, linux-arch, linux-kernel, Chen Pei

From: Chen Pei <cp0613@linux.alibaba.com>

The RISC-V Zbb extension[1] defines bitwise rotation instructions,
which can be used to implement rotate related functions.

[1] https://github.com/riscv/riscv-bitmanip/

Signed-off-by: Chen Pei <cp0613@linux.alibaba.com>
---
 arch/riscv/include/asm/bitops.h | 172 ++++++++++++++++++++++++++++++++
 1 file changed, 172 insertions(+)

diff --git a/arch/riscv/include/asm/bitops.h b/arch/riscv/include/asm/bitops.h
index d59310f74c2b..be247ef9e686 100644
--- a/arch/riscv/include/asm/bitops.h
+++ b/arch/riscv/include/asm/bitops.h
@@ -20,17 +20,20 @@
 #include <asm-generic/bitops/__fls.h>
 #include <asm-generic/bitops/ffs.h>
 #include <asm-generic/bitops/fls.h>
+#include <asm-generic/bitops/rotate.h>
 
 #else
 #define __HAVE_ARCH___FFS
 #define __HAVE_ARCH___FLS
 #define __HAVE_ARCH_FFS
 #define __HAVE_ARCH_FLS
+#define __HAVE_ARCH_ROTATE
 
 #include <asm-generic/bitops/__ffs.h>
 #include <asm-generic/bitops/__fls.h>
 #include <asm-generic/bitops/ffs.h>
 #include <asm-generic/bitops/fls.h>
+#include <asm-generic/bitops/rotate.h>
 
 #include <asm/alternative-macros.h>
 #include <asm/hwcap.h>
@@ -175,6 +178,175 @@ static __always_inline int variable_fls(unsigned int x)
 	 variable_fls(x_);					\
 })
 
+static __always_inline u64 variable_rol64(u64 word, unsigned int shift)
+{
+	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+				      RISCV_ISA_EXT_ZBB, 1)
+			  : : : : legacy);
+
+	asm volatile(
+		".option push\n"
+		".option arch,+zbb\n"
+		"rol %0, %1, %2\n"
+		".option pop\n"
+		: "=r" (word) : "r" (word), "r" (shift) :);
+
+	return word;
+
+legacy:
+	return generic_rol64(word, shift);
+}
+
+static inline u64 variable_ror64(u64 word, unsigned int shift)
+{
+	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+				      RISCV_ISA_EXT_ZBB, 1)
+			  : : : : legacy);
+
+	asm volatile(
+		".option push\n"
+		".option arch,+zbb\n"
+		"ror %0, %1, %2\n"
+		".option pop\n"
+		: "=r" (word) : "r" (word), "r" (shift) :);
+
+	return word;
+
+legacy:
+	return generic_ror64(word, shift);
+}
+
+static inline u32 variable_rol32(u32 word, unsigned int shift)
+{
+	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+				      RISCV_ISA_EXT_ZBB, 1)
+			  : : : : legacy);
+
+	asm volatile(
+		".option push\n"
+		".option arch,+zbb\n"
+		"rolw %0, %1, %2\n"
+		".option pop\n"
+		: "=r" (word) : "r" (word), "r" (shift) :);
+
+	return word;
+
+legacy:
+	return generic_rol32(word, shift);
+}
+
+static inline u32 variable_ror32(u32 word, unsigned int shift)
+{
+	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+				      RISCV_ISA_EXT_ZBB, 1)
+			  : : : : legacy);
+
+	asm volatile(
+		".option push\n"
+		".option arch,+zbb\n"
+		"rorw %0, %1, %2\n"
+		".option pop\n"
+		: "=r" (word) : "r" (word), "r" (shift) :);
+
+	return word;
+
+legacy:
+	return generic_ror32(word, shift);
+}
+
+static inline u16 variable_rol16(u16 word, unsigned int shift)
+{
+	u32 word32 = ((u32)word << 16) | word;
+
+	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+				      RISCV_ISA_EXT_ZBB, 1)
+			  : : : : legacy);
+
+	asm volatile(
+		".option push\n"
+		".option arch,+zbb\n"
+		"rolw %0, %1, %2\n"
+		".option pop\n"
+		: "=r" (word32) : "r" (word32), "r" (shift) :);
+
+	return (u16)word32;
+
+legacy:
+	return generic_rol16(word, shift);
+}
+
+static inline u16 variable_ror16(u16 word, unsigned int shift)
+{
+	u32 word32 = ((u32)word << 16) | word;
+
+	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+				      RISCV_ISA_EXT_ZBB, 1)
+			  : : : : legacy);
+
+	asm volatile(
+		".option push\n"
+		".option arch,+zbb\n"
+		"rorw %0, %1, %2\n"
+		".option pop\n"
+		: "=r" (word32) : "r" (word32), "r" (shift) :);
+
+	return (u16)word32;
+
+legacy:
+	return generic_ror16(word, shift);
+}
+
+static inline u8 variable_rol8(u8 word, unsigned int shift)
+{
+	u32 word32 = ((u32)word << 24) | ((u32)word << 16) | ((u32)word << 8) | word;
+
+	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+				      RISCV_ISA_EXT_ZBB, 1)
+			  : : : : legacy);
+
+	asm volatile(
+		".option push\n"
+		".option arch,+zbb\n"
+		"rolw %0, %1, %2\n"
+		".option pop\n"
+		: "=r" (word32) : "r" (word32), "r" (shift) :);
+
+	return (u8)word32;
+
+legacy:
+	return generic_rol8(word, shift);
+}
+
+static inline u8 variable_ror8(u8 word, unsigned int shift)
+{
+	u32 word32 = ((u32)word << 24) | ((u32)word << 16) | ((u32)word << 8) | word;
+
+	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+				      RISCV_ISA_EXT_ZBB, 1)
+			  : : : : legacy);
+
+	asm volatile(
+		".option push\n"
+		".option arch,+zbb\n"
+		"rorw %0, %1, %2\n"
+		".option pop\n"
+		: "=r" (word32) : "r" (word32), "r" (shift) :);
+
+	return (u8)word32;
+
+legacy:
+	return generic_ror8(word, shift);
+}
+
+#define rol64(word, shift) variable_rol64(word, shift)
+#define ror64(word, shift) variable_ror64(word, shift)
+#define rol32(word, shift) variable_rol32(word, shift)
+#define ror32(word, shift) variable_ror32(word, shift)
+#define rol16(word, shift) variable_rol16(word, shift)
+#define ror16(word, shift) variable_ror16(word, shift)
+#define rol8(word, shift)  variable_rol8(word, shift)
+#define ror8(word, shift)  variable_ror8(word, shift)
+
 #endif /* !(defined(CONFIG_RISCV_ISA_ZBB) && defined(CONFIG_TOOLCHAIN_HAS_ZBB)) || defined(NO_ALTERNATIVE) */
 
 #include <asm-generic/bitops/ffz.h>
-- 
2.49.0



^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/2] bitops: generic rotate
  2025-06-20 11:16 ` [PATCH 1/2] bitops: generic rotate cp0613
@ 2025-06-20 15:47   ` kernel test robot
  2025-06-23 11:59   ` kernel test robot
  1 sibling, 0 replies; 21+ messages in thread
From: kernel test robot @ 2025-06-20 15:47 UTC (permalink / raw)
  To: cp0613, yury.norov, linux, arnd, paul.walmsley, palmer, aou, alex
  Cc: oe-kbuild-all, linux-riscv, linux-arch, linux-kernel, Chen Pei

Hi,

kernel test robot noticed the following build errors:

[auto build test ERROR on arnd-asm-generic/master]
[also build test ERROR on linus/master v6.16-rc2 next-20250620]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/cp0613-linux-alibaba-com/bitops-generic-rotate/20250620-192016
base:   https://git.kernel.org/pub/scm/linux/kernel/git/arnd/asm-generic.git master
patch link:    https://lore.kernel.org/r/20250620111610.52750-2-cp0613%40linux.alibaba.com
patch subject: [PATCH 1/2] bitops: generic rotate
reproduce: (https://download.01.org/0day-ci/archive/20250620/202506202300.dZGgBtbQ-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202506202300.dZGgBtbQ-lkp@intel.com/

All errors (new ones prefixed by >>):

   make[1]: *** [Makefile:1281: prepare0] Error 2
   In file included from weak.c:10:
   In file included from tools/objtool/include/objtool/objtool.h:11:
   In file included from tools/include/linux/hashtable.h:13:
   In file included from tools/include/linux/bitops.h:52:
>> tools/include/asm-generic/bitops.h:27:10: fatal error: 'asm-generic/bitops/rotate.h' file not found
      27 | #include <asm-generic/bitops/rotate.h>
         |          ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   In file included from arch/x86/orc.c:5:
   In file included from tools/objtool/include/objtool/check.h:11:
   In file included from tools/objtool/include/objtool/arch.h:11:
   In file included from tools/objtool/include/objtool/objtool.h:11:
   In file included from tools/include/linux/hashtable.h:13:
   In file included from tools/include/linux/bitops.h:52:
>> tools/include/asm-generic/bitops.h:27:10: fatal error: 'asm-generic/bitops/rotate.h' file not found
      27 | #include <asm-generic/bitops/rotate.h>
         |          ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   In file included from check.c:13:
   In file included from tools/objtool/include/objtool/arch.h:11:
   In file included from tools/objtool/include/objtool/objtool.h:11:
   In file included from tools/include/linux/hashtable.h:13:
   In file included from tools/include/linux/bitops.h:52:
>> tools/include/asm-generic/bitops.h:27:10: fatal error: 'asm-generic/bitops/rotate.h' file not found
      27 | #include <asm-generic/bitops/rotate.h>
         |          ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   In file included from orc_gen.c:12:
   In file included from tools/objtool/include/objtool/check.h:11:
   In file included from tools/objtool/include/objtool/arch.h:11:
   In file included from tools/objtool/include/objtool/objtool.h:11:
   In file included from tools/include/linux/hashtable.h:13:
   In file included from tools/include/linux/bitops.h:52:
>> tools/include/asm-generic/bitops.h:27:10: fatal error: 'asm-generic/bitops/rotate.h' file not found
      27 | #include <asm-generic/bitIn file included from orc_dump.c:8:
   In file included from tools/objtool/include/objtool/objtool.h:11:
   In file included from tools/include/linux/hashtable.h:13:
   In file included from tools/include/linux/bitops.h:52:
>> tools/include/asm-generic/bitops.h:27:10: fatal error: 'asm-generic/bitops/rotate.h' file not found
   o   27 | #include <asm-generic/bitops/rotate.h>
         |          ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   ps/rotate.h>
         |          ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   In file included from arch/x86/special.c:4:
--
   In file included from tools/objtool/include/objtool/check.h:11:
   In file included from tools/objtool/include/objtool/arch.h:11:
   In file included from tools/objtool/include/objtool/objtool.h:11:
   In file included from tools/include/linux/hashtable.h:13:
   In file included from tools/include/linux/bitops.h:52:
>> tools/include/asm-generic/bitops.h:27:10: fatal error: 'asm-generic/bitops/rotate.h' file not found
   :13   27 | #include <asm-generic/bitops/rotate.h>
         |          ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   :
   In file included from tools/include/linux/bitops.h:52:
>> tools/include/asm-generic/bitops.h:27:10: fatal error: 'asm-generic/bitops/rotate.h' file not found
      27 | #include <asm-generic/bitops/rotate.h>
         |          ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   1 error generated.
   make[4]: *** [tools/build/Makefile.build:86: tools/objtool/weak.o] Error 1
   In file included from elf.c:22:
   In file included from tools/objtool/include/objtool/elf.h:12:
   In file included from tools/include/linux/hashtable.h:13:
   In file included from tools/include/linux/bitops.h:52:
>> tools/include/asm-generic/bitops.h:27:10: fatal error: 'asm-generic/bitops/rotate.h' file not found
      27 | #include <asm-generic/bitops/rotate.h>
         |          ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   In file included from builtin-check.c:15:
   In file included from tools/objtool/include/objtool/objtool.h:11:
   In file included from tools/include/linux/hashtable.h:13:
   In file included from tools/include/linux/bitops.h:52:
>> tools/include/asm-generic/bitops.h:27:10: fatal error: 'asm-generic/bitops/rotate.h' file not found
      27 | #include <asm-generic/bitops/rotate.h>
         |          ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   In file included from objtool.c:16:
   In file included from tools/objtool/include/objtool/objtool.h:11:
   In file included from tools/include/linux/hashtable.h:13:
   In file included from tools/include/linux/bitops.h:52:
>> tools/include/asm-generic/bitops.h:27:10: fatal error: 'asm-generic/bitops/rotate.h' file not found
      27 | #include <asm-generic/bitops/rotate.h>
         |          ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   1 error generated.
   1 error generated.
   make[5]: *** [tools/build/Makefile.build:86: tools/objtool/arch/x86/orc.o] Error 1
--
   In file included from tools/objtool/include/objtool/check.h:11:
   In file included from tools/objtool/include/objtool/arch.h:11:
   In file included from tools/objtool/include/objtool/objtool.h:11:
   In file included from tools/include/linux/hashtable.h:13:
   In file included from tools/include/linux/bitops.h:52:
>> tools/include/asm-generic/bitops.h:27:10: fatal error: 'asm-generic/bitops/rotate.h' file not found
      27 | #include <asm-generic/bitops/rotate.h>
         |          ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   1 error generated.
   make[4]: *** [tools/build/Makefile.build:86: tools/objtool/check.o] Error 1
   1 error generated.


vim +27 tools/include/asm-generic/bitops.h

    25	
    26	#include <asm-generic/bitops/hweight.h>
  > 27	#include <asm-generic/bitops/rotate.h>
    28	#include <asm-generic/bitops/atomic.h>
    29	#include <asm-generic/bitops/non-atomic.h>
    30	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/2] bitops: rotate: Add riscv implementation using Zbb extension
  2025-06-20 11:16 ` [PATCH 2/2] bitops: rotate: Add riscv implementation using Zbb extension cp0613
@ 2025-06-20 16:20   ` Yury Norov
  2025-06-25 16:02     ` David Laight
  2025-06-28 11:13     ` cp0613
  0 siblings, 2 replies; 21+ messages in thread
From: Yury Norov @ 2025-06-20 16:20 UTC (permalink / raw)
  To: cp0613
  Cc: linux, arnd, paul.walmsley, palmer, aou, alex, linux-riscv,
	linux-arch, linux-kernel

On Fri, Jun 20, 2025 at 07:16:10PM +0800, cp0613@linux.alibaba.com wrote:
> From: Chen Pei <cp0613@linux.alibaba.com>
> 
> The RISC-V Zbb extension[1] defines bitwise rotation instructions,
> which can be used to implement rotate related functions.
> 
> [1] https://github.com/riscv/riscv-bitmanip/
> 
> Signed-off-by: Chen Pei <cp0613@linux.alibaba.com>
> ---
>  arch/riscv/include/asm/bitops.h | 172 ++++++++++++++++++++++++++++++++
>  1 file changed, 172 insertions(+)
> 
> diff --git a/arch/riscv/include/asm/bitops.h b/arch/riscv/include/asm/bitops.h
> index d59310f74c2b..be247ef9e686 100644
> --- a/arch/riscv/include/asm/bitops.h
> +++ b/arch/riscv/include/asm/bitops.h
> @@ -20,17 +20,20 @@
>  #include <asm-generic/bitops/__fls.h>
>  #include <asm-generic/bitops/ffs.h>
>  #include <asm-generic/bitops/fls.h>
> +#include <asm-generic/bitops/rotate.h>
>  
>  #else
>  #define __HAVE_ARCH___FFS
>  #define __HAVE_ARCH___FLS
>  #define __HAVE_ARCH_FFS
>  #define __HAVE_ARCH_FLS
> +#define __HAVE_ARCH_ROTATE
>  
>  #include <asm-generic/bitops/__ffs.h>
>  #include <asm-generic/bitops/__fls.h>
>  #include <asm-generic/bitops/ffs.h>
>  #include <asm-generic/bitops/fls.h>
> +#include <asm-generic/bitops/rotate.h>
>  
>  #include <asm/alternative-macros.h>
>  #include <asm/hwcap.h>
> @@ -175,6 +178,175 @@ static __always_inline int variable_fls(unsigned int x)
>  	 variable_fls(x_);					\
>  })

...

> +static inline u8 variable_ror8(u8 word, unsigned int shift)
> +{
> +	u32 word32 = ((u32)word << 24) | ((u32)word << 16) | ((u32)word << 8) | word;

Can you add a comment about what is happening here? Are you sure it's
optimized out in case of the 'legacy' alternative?

> +
> +	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
> +				      RISCV_ISA_EXT_ZBB, 1)
> +			  : : : : legacy);
> +
> +	asm volatile(
> +		".option push\n"
> +		".option arch,+zbb\n"
> +		"rorw %0, %1, %2\n"
> +		".option pop\n"
> +		: "=r" (word32) : "r" (word32), "r" (shift) :);
> +
> +	return (u8)word32;
> +
> +legacy:
> +	return generic_ror8(word, shift);
> +}
> +
> +#define rol64(word, shift) variable_rol64(word, shift)
> +#define ror64(word, shift) variable_ror64(word, shift)
> +#define rol32(word, shift) variable_rol32(word, shift)
> +#define ror32(word, shift) variable_ror32(word, shift)
> +#define rol16(word, shift) variable_rol16(word, shift)
> +#define ror16(word, shift) variable_ror16(word, shift)
> +#define rol8(word, shift)  variable_rol8(word, shift)
> +#define ror8(word, shift)  variable_ror8(word, shift)

Here you wire ror/rol() to the variable_ror/rol() unconditionally, and
that breaks compile-time rotation if the parameter is known at compile
time.

I believe, generic implementation will allow compiler to handle this
case better. Can you do a similar thing to what fls() does in the same
file?
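
Something like this (a rough, untested sketch based on the fls() wrapper
above) is what I have in mind:

```
#define ror32(word, shift)						\
	(__builtin_constant_p(word) && __builtin_constant_p(shift) ?	\
	 generic_ror32(word, shift) : variable_ror32(word, shift))
```

That way a constant rotation is still folded by the compiler, and only the
variable case goes through the alternative.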

Thanks,
Yury

> +
>  #endif /* !(defined(CONFIG_RISCV_ISA_ZBB) && defined(CONFIG_TOOLCHAIN_HAS_ZBB)) || defined(NO_ALTERNATIVE) */
>  
>  #include <asm-generic/bitops/ffz.h>
> -- 
> 2.49.0


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/2] bitops: generic rotate
  2025-06-20 11:16 ` [PATCH 1/2] bitops: generic rotate cp0613
  2025-06-20 15:47   ` kernel test robot
@ 2025-06-23 11:59   ` kernel test robot
  1 sibling, 0 replies; 21+ messages in thread
From: kernel test robot @ 2025-06-23 11:59 UTC (permalink / raw)
  To: cp0613, yury.norov, linux, arnd, paul.walmsley, palmer, aou, alex
  Cc: llvm, oe-kbuild-all, linux-riscv, linux-arch, linux-kernel,
	Chen Pei

Hi,

kernel test robot noticed the following build warnings:

[auto build test WARNING on arnd-asm-generic/master]
[also build test WARNING on linus/master v6.16-rc3 next-20250623]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/cp0613-linux-alibaba-com/bitops-generic-rotate/20250620-192016
base:   https://git.kernel.org/pub/scm/linux/kernel/git/arnd/asm-generic.git master
patch link:    https://lore.kernel.org/r/20250620111610.52750-2-cp0613%40linux.alibaba.com
patch subject: [PATCH 1/2] bitops: generic rotate
config: powerpc-allyesconfig (https://download.01.org/0day-ci/archive/20250623/202506231924.kPX5UiaD-lkp@intel.com/config)
compiler: clang version 21.0.0git (https://github.com/llvm/llvm-project 875b36a8742437b95f623bab1e0332562c7b4b3f)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250623/202506231924.kPX5UiaD-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202506231924.kPX5UiaD-lkp@intel.com/

All warnings (new ones prefixed by >>):

         |         ^
   In file included from drivers/net/ethernet/pensando/ionic/ionic_txrx.c:5:
   In file included from include/linux/ipv6.h:102:
   In file included from include/linux/tcp.h:19:
   In file included from include/net/sock.h:46:
   In file included from include/linux/netdevice.h:44:
   In file included from include/uapi/linux/neighbour.h:6:
   In file included from include/linux/netlink.h:9:
   In file included from include/net/scm.h:13:
   In file included from include/net/compat.h:8:
   include/linux/compat.h:458:22: warning: array index 1 is past the end of the array (that has type 'const unsigned long[1]') [-Warray-bounds]
     458 |         case 2: v.sig[3] = (set->sig[1] >> 32); v.sig[2] = set->sig[1];
         |                             ^        ~
   arch/powerpc/include/uapi/asm/signal.h:18:2: note: array 'sig' declared here
      18 |         unsigned long sig[_NSIG_WORDS];
         |         ^
   In file included from drivers/net/ethernet/pensando/ionic/ionic_txrx.c:5:
   In file included from include/linux/ipv6.h:102:
   In file included from include/linux/tcp.h:19:
   In file included from include/net/sock.h:46:
   In file included from include/linux/netdevice.h:44:
   In file included from include/uapi/linux/neighbour.h:6:
   In file included from include/linux/netlink.h:9:
   In file included from include/net/scm.h:13:
   In file included from include/net/compat.h:8:
   include/linux/compat.h:458:10: warning: array index 3 is past the end of the array (that has type 'compat_sigset_word[2]' (aka 'unsigned int[2]')) [-Warray-bounds]
     458 |         case 2: v.sig[3] = (set->sig[1] >> 32); v.sig[2] = set->sig[1];
         |                 ^     ~
   include/linux/compat.h:130:2: note: array 'sig' declared here
     130 |         compat_sigset_word      sig[_COMPAT_NSIG_WORDS];
         |         ^
   include/linux/compat.h:458:42: warning: array index 2 is past the end of the array (that has type 'compat_sigset_word[2]' (aka 'unsigned int[2]')) [-Warray-bounds]
     458 |         case 2: v.sig[3] = (set->sig[1] >> 32); v.sig[2] = set->sig[1];
         |                                                 ^     ~
   include/linux/compat.h:130:2: note: array 'sig' declared here
     130 |         compat_sigset_word      sig[_COMPAT_NSIG_WORDS];
         |         ^
   include/linux/compat.h:458:53: warning: array index 1 is past the end of the array (that has type 'const unsigned long[1]') [-Warray-bounds]
     458 |         case 2: v.sig[3] = (set->sig[1] >> 32); v.sig[2] = set->sig[1];
         |                                                            ^        ~
   arch/powerpc/include/uapi/asm/signal.h:18:2: note: array 'sig' declared here
      18 |         unsigned long sig[_NSIG_WORDS];
         |         ^
   In file included from drivers/net/ethernet/pensando/ionic/ionic_txrx.c:5:
   In file included from include/linux/ipv6.h:102:
   In file included from include/linux/tcp.h:20:
   In file included from include/net/inet_connection_sock.h:21:
   In file included from include/net/inet_sock.h:18:
   include/linux/jhash.h:83:3: error: call to undeclared function 'rol32'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
      83 |                 __jhash_mix(a, b, c);
         |                 ^
   include/linux/jhash.h:37:16: note: expanded from macro '__jhash_mix'
      37 |         a -= c;  a ^= rol32(c, 4);  c += b;     \
         |                       ^
   include/linux/jhash.h:101:4: error: call to undeclared function 'rol32'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
     101 |                  __jhash_final(a, b, c);
         |                  ^
   include/linux/jhash.h:48:15: note: expanded from macro '__jhash_final'
      48 |         c ^= b; c -= rol32(b, 14);              \
         |                      ^
   include/linux/jhash.h:129:3: error: call to undeclared function 'rol32'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
     129 |                 __jhash_mix(a, b, c);
         |                 ^
   include/linux/jhash.h:37:16: note: expanded from macro '__jhash_mix'
      37 |         a -= c;  a ^= rol32(c, 4);  c += b;     \
         |                       ^
   include/linux/jhash.h:139:3: error: call to undeclared function 'rol32'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
     139 |                 __jhash_final(a, b, c);
         |                 ^
   include/linux/jhash.h:48:15: note: expanded from macro '__jhash_final'
      48 |         c ^= b; c -= rol32(b, 14);              \
         |                      ^
   include/linux/jhash.h:156:2: error: call to undeclared function 'rol32'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
     156 |         __jhash_final(a, b, c);
         |         ^
   include/linux/jhash.h:48:15: note: expanded from macro '__jhash_final'
      48 |         c ^= b; c -= rol32(b, 14);              \
         |                      ^
   In file included from drivers/net/ethernet/pensando/ionic/ionic_txrx.c:7:
   In file included from include/net/ip6_checksum.h:27:
   In file included from include/net/ip.h:30:
   In file included from include/net/route.h:24:
   In file included from include/net/inetpeer.h:16:
   include/net/ipv6.h:975:9: error: call to undeclared function 'rol32'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
     975 |         hash = rol32(hash, 16);
         |                ^
   In file included from drivers/net/ethernet/pensando/ionic/ionic_txrx.c:7:
   In file included from include/net/ip6_checksum.h:27:
   include/net/ip.h:478:14: warning: default initialization of an object of type 'typeof (rt->dst.expires)' (aka 'const unsigned long') leaves the object uninitialized [-Wdefault-const-init-var-unsafe]
     478 |                 if (mtu && time_before(jiffies, rt->dst.expires))
         |                            ^
   include/linux/jiffies.h:138:26: note: expanded from macro 'time_before'
     138 | #define time_before(a,b)        time_after(b,a)
         |                                 ^
   include/linux/jiffies.h:128:3: note: expanded from macro 'time_after'
     128 |         (typecheck(unsigned long, a) && \
         |          ^
   include/linux/typecheck.h:11:12: note: expanded from macro 'typecheck'
      11 |         typeof(x) __dummy2; \
         |                   ^
>> drivers/net/ethernet/pensando/ionic/ionic_txrx.c:203:30: warning: implicit conversion from 'unsigned long' to 'u16' (aka 'unsigned short') changes value from 65536 to 0 [-Wconstant-conversion]
     203 |                 frag_len = min_t(u16, len, IONIC_PAGE_SIZE);
         |                            ~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~
   drivers/net/ethernet/pensando/ionic/ionic_dev.h:184:32: note: expanded from macro 'IONIC_PAGE_SIZE'
     184 | #define IONIC_PAGE_SIZE                         MIN(PAGE_SIZE, IONIC_MAX_BUF_LEN)
         |                                                     ^
   include/vdso/page.h:15:30: note: expanded from macro 'PAGE_SIZE'
      15 | #define PAGE_SIZE       (_AC(1,UL) << CONFIG_PAGE_SHIFT)
         |                                    ^
   include/linux/minmax.h:314:30: note: expanded from macro 'MIN'
     314 | #define MIN(a, b) __cmp(min, a, b)
         |                              ^
   note: (skipping 1 expansions in backtrace; use -fmacro-backtrace-limit=0 to see all)
   include/linux/minmax.h:161:52: note: expanded from macro 'min_t'
     161 | #define min_t(type, x, y) __cmp_once(min, type, x, y)
         |                           ~~~~~~~~~~~~~~~~~~~~~~~~~^~
   include/linux/minmax.h:89:33: note: expanded from macro '__cmp_once'
      89 |         __cmp_once_unique(op, type, x, y, __UNIQUE_ID(x_), __UNIQUE_ID(y_))
         |         ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/minmax.h:86:31: note: expanded from macro '__cmp_once_unique'
      86 |         ({ type ux = (x); type uy = (y); __cmp(op, ux, uy); })
         |                                ~~    ^
   drivers/net/ethernet/pensando/ionic/ionic_txrx.c:803:36: warning: implicit conversion from 'unsigned long' to 'u16' (aka 'unsigned short') changes value from 65536 to 0 [-Wconstant-conversion]
     803 |                 first_frag_len = min_t(u16, len, IONIC_PAGE_SIZE);
         |                                  ~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~
   drivers/net/ethernet/pensando/ionic/ionic_dev.h:184:32: note: expanded from macro 'IONIC_PAGE_SIZE'
     184 | #define IONIC_PAGE_SIZE                         MIN(PAGE_SIZE, IONIC_MAX_BUF_LEN)
         |                                                     ^
   include/vdso/page.h:15:30: note: expanded from macro 'PAGE_SIZE'
      15 | #define PAGE_SIZE       (_AC(1,UL) << CONFIG_PAGE_SHIFT)
         |                                    ^
   include/linux/minmax.h:314:30: note: expanded from macro 'MIN'
     314 | #define MIN(a, b) __cmp(min, a, b)
         |                              ^
   note: (skipping 1 expansions in backtrace; use -fmacro-backtrace-limit=0 to see all)
   include/linux/minmax.h:161:52: note: expanded from macro 'min_t'
     161 | #define min_t(type, x, y) __cmp_once(min, type, x, y)
         |                           ~~~~~~~~~~~~~~~~~~~~~~~~~^~
   include/linux/minmax.h:89:33: note: expanded from macro '__cmp_once'
      89 |         __cmp_once_unique(op, type, x, y, __UNIQUE_ID(x_), __UNIQUE_ID(y_))
         |         ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/minmax.h:86:31: note: expanded from macro '__cmp_once_unique'
      86 |         ({ type ux = (x); type uy = (y); __cmp(op, ux, uy); })
         |                                ~~    ^
   drivers/net/ethernet/pensando/ionic/ionic_txrx.c:838:38: warning: implicit conversion from 'unsigned long' to 'u16' (aka 'unsigned short') changes value from 65536 to 0 [-Wconstant-conversion]
     838 |                         frag_len = min_t(u16, remain_len, IONIC_PAGE_SIZE);
         |                                    ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~
   drivers/net/ethernet/pensando/ionic/ionic_dev.h:184:32: note: expanded from macro 'IONIC_PAGE_SIZE'
     184 | #define IONIC_PAGE_SIZE                         MIN(PAGE_SIZE, IONIC_MAX_BUF_LEN)
         |                                                     ^
   include/vdso/page.h:15:30: note: expanded from macro 'PAGE_SIZE'
      15 | #define PAGE_SIZE       (_AC(1,UL) << CONFIG_PAGE_SHIFT)
         |                                    ^
   include/linux/minmax.h:314:30: note: expanded from macro 'MIN'
     314 | #define MIN(a, b) __cmp(min, a, b)
         |                              ^
   note: (skipping 1 expansions in backtrace; use -fmacro-backtrace-limit=0 to see all)
   include/linux/minmax.h:161:52: note: expanded from macro 'min_t'
     161 | #define min_t(type, x, y) __cmp_once(min, type, x, y)
         |                           ~~~~~~~~~~~~~~~~~~~~~~~~~^~
   include/linux/minmax.h:89:33: note: expanded from macro '__cmp_once'
      89 |         __cmp_once_unique(op, type, x, y, __UNIQUE_ID(x_), __UNIQUE_ID(y_))
         |         ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/minmax.h:86:31: note: expanded from macro '__cmp_once_unique'
      86 |         ({ type ux = (x); type uy = (y); __cmp(op, ux, uy); })
         |                                ~~    ^
   16 warnings and 9 errors generated.


vim +203 drivers/net/ethernet/pensando/ionic/ionic_txrx.c

36a47c906b23240 Shannon Nelson 2024-03-06  174  
36a47c906b23240 Shannon Nelson 2024-03-06  175  static struct sk_buff *ionic_rx_build_skb(struct ionic_queue *q,
4dcd4575bfb17d0 Shannon Nelson 2024-03-06  176  					  struct ionic_rx_desc_info *desc_info,
f81da39bf4c0a54 Shannon Nelson 2024-02-14  177  					  unsigned int headroom,
f81da39bf4c0a54 Shannon Nelson 2024-02-14  178  					  unsigned int len,
f81da39bf4c0a54 Shannon Nelson 2024-02-14  179  					  unsigned int num_sg_elems,
180e35cdf035d1c Shannon Nelson 2024-02-14  180  					  bool synced)
0f3154e6bcb3549 Shannon Nelson 2019-09-03  181  {
4b0a7539a3728f0 Shannon Nelson 2021-03-10  182  	struct ionic_buf_info *buf_info;
08f2e4b2b2008ce Shannon Nelson 2019-10-23  183  	struct sk_buff *skb;
08f2e4b2b2008ce Shannon Nelson 2019-10-23  184  	unsigned int i;
08f2e4b2b2008ce Shannon Nelson 2019-10-23  185  	u16 frag_len;
08f2e4b2b2008ce Shannon Nelson 2019-10-23  186  
4b0a7539a3728f0 Shannon Nelson 2021-03-10  187  	buf_info = &desc_info->bufs[0];
e75ccac1d0644c9 Shannon Nelson 2021-07-27  188  	prefetchw(buf_info->page);
08f2e4b2b2008ce Shannon Nelson 2019-10-23  189  
89e572e7369fd9a Shannon Nelson 2021-03-10  190  	skb = napi_get_frags(&q_to_qcq(q)->napi);
89e572e7369fd9a Shannon Nelson 2021-03-10  191  	if (unlikely(!skb)) {
89e572e7369fd9a Shannon Nelson 2021-03-10  192  		net_warn_ratelimited("%s: SKB alloc failed on %s!\n",
36a47c906b23240 Shannon Nelson 2024-03-06  193  				     dev_name(q->dev), q->name);
2854242d23a7b3a Shannon Nelson 2024-03-06  194  		q_to_rx_stats(q)->alloc_err++;
08f2e4b2b2008ce Shannon Nelson 2019-10-23  195  		return NULL;
89e572e7369fd9a Shannon Nelson 2021-03-10  196  	}
ac8813c0ab7d281 Shannon Nelson 2024-09-06  197  	skb_mark_for_recycle(skb);
08f2e4b2b2008ce Shannon Nelson 2019-10-23  198  
f81da39bf4c0a54 Shannon Nelson 2024-02-14  199  	if (headroom)
36a47c906b23240 Shannon Nelson 2024-03-06  200  		frag_len = min_t(u16, len,
36a47c906b23240 Shannon Nelson 2024-03-06  201  				 IONIC_XDP_MAX_LINEAR_MTU + VLAN_ETH_HLEN);
f81da39bf4c0a54 Shannon Nelson 2024-02-14  202  	else
ac8813c0ab7d281 Shannon Nelson 2024-09-06 @203  		frag_len = min_t(u16, len, IONIC_PAGE_SIZE);
f81da39bf4c0a54 Shannon Nelson 2024-02-14  204  
36a47c906b23240 Shannon Nelson 2024-03-06  205  	if (unlikely(!buf_info->page))
36a47c906b23240 Shannon Nelson 2024-03-06  206  		goto err_bad_buf_page;
36a47c906b23240 Shannon Nelson 2024-03-06  207  	ionic_rx_add_skb_frag(q, skb, buf_info, headroom, frag_len, synced);
36a47c906b23240 Shannon Nelson 2024-03-06  208  	len -= frag_len;
4b0a7539a3728f0 Shannon Nelson 2021-03-10  209  	buf_info++;
4b0a7539a3728f0 Shannon Nelson 2021-03-10  210  
36a47c906b23240 Shannon Nelson 2024-03-06  211  	for (i = 0; i < num_sg_elems; i++, buf_info++) {
36a47c906b23240 Shannon Nelson 2024-03-06  212  		if (unlikely(!buf_info->page))
36a47c906b23240 Shannon Nelson 2024-03-06  213  			goto err_bad_buf_page;
ac8813c0ab7d281 Shannon Nelson 2024-09-06  214  		frag_len = min_t(u16, len, buf_info->len);
36a47c906b23240 Shannon Nelson 2024-03-06  215  		ionic_rx_add_skb_frag(q, skb, buf_info, 0, frag_len, synced);
36a47c906b23240 Shannon Nelson 2024-03-06  216  		len -= frag_len;
36a47c906b23240 Shannon Nelson 2024-03-06  217  	}
08f2e4b2b2008ce Shannon Nelson 2019-10-23  218  
08f2e4b2b2008ce Shannon Nelson 2019-10-23  219  	return skb;
36a47c906b23240 Shannon Nelson 2024-03-06  220  
36a47c906b23240 Shannon Nelson 2024-03-06  221  err_bad_buf_page:
36a47c906b23240 Shannon Nelson 2024-03-06  222  	dev_kfree_skb(skb);
36a47c906b23240 Shannon Nelson 2024-03-06  223  	return NULL;
0f3154e6bcb3549 Shannon Nelson 2019-09-03  224  }
0f3154e6bcb3549 Shannon Nelson 2019-09-03  225  

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/2] bitops: rotate: Add riscv implementation using Zbb extension
  2025-06-20 16:20   ` Yury Norov
@ 2025-06-25 16:02     ` David Laight
  2025-06-28 12:08       ` cp0613
  2025-06-28 11:13     ` cp0613
  1 sibling, 1 reply; 21+ messages in thread
From: David Laight @ 2025-06-25 16:02 UTC (permalink / raw)
  To: Yury Norov
  Cc: cp0613, linux, arnd, paul.walmsley, palmer, aou, alex,
	linux-riscv, linux-arch, linux-kernel

On Fri, 20 Jun 2025 12:20:47 -0400
Yury Norov <yury.norov@gmail.com> wrote:

> On Fri, Jun 20, 2025 at 07:16:10PM +0800, cp0613@linux.alibaba.com wrote:
> > From: Chen Pei <cp0613@linux.alibaba.com>
> > 
> > The RISC-V Zbb extension[1] defines bitwise rotation instructions,
> > which can be used to implement rotate related functions.
> > 
> > [1] https://github.com/riscv/riscv-bitmanip/
> > 
> > Signed-off-by: Chen Pei <cp0613@linux.alibaba.com>
> > ---
> >  arch/riscv/include/asm/bitops.h | 172 ++++++++++++++++++++++++++++++++
> >  1 file changed, 172 insertions(+)
> > 
> > diff --git a/arch/riscv/include/asm/bitops.h b/arch/riscv/include/asm/bitops.h
> > index d59310f74c2b..be247ef9e686 100644
> > --- a/arch/riscv/include/asm/bitops.h
> > +++ b/arch/riscv/include/asm/bitops.h
> > @@ -20,17 +20,20 @@
> >  #include <asm-generic/bitops/__fls.h>
> >  #include <asm-generic/bitops/ffs.h>
> >  #include <asm-generic/bitops/fls.h>
> > +#include <asm-generic/bitops/rotate.h>
> >  
> >  #else
> >  #define __HAVE_ARCH___FFS
> >  #define __HAVE_ARCH___FLS
> >  #define __HAVE_ARCH_FFS
> >  #define __HAVE_ARCH_FLS
> > +#define __HAVE_ARCH_ROTATE
> >  
> >  #include <asm-generic/bitops/__ffs.h>
> >  #include <asm-generic/bitops/__fls.h>
> >  #include <asm-generic/bitops/ffs.h>
> >  #include <asm-generic/bitops/fls.h>
> > +#include <asm-generic/bitops/rotate.h>
> >  
> >  #include <asm/alternative-macros.h>
> >  #include <asm/hwcap.h>
> > @@ -175,6 +178,175 @@ static __always_inline int variable_fls(unsigned int x)
> >  	 variable_fls(x_);					\
> >  })  
> 
> ...
> 
> > +static inline u8 variable_ror8(u8 word, unsigned int shift)
> > +{
> > +	u32 word32 = ((u32)word << 24) | ((u32)word << 16) | ((u32)word << 8) | word;  
> 
> Can you add a comment about what is happening here? Are you sure it's
> optimized out in case of the 'legacy' alternative?

Is it even a gain in the zbb case?
The "rorw" is only ever going to help full word rotates.
Here you might as well do ((word << 8 | word) >> shift).

For "rol8" you'd need ((word << 24 | word) 'rol' shift).
I still bet the generic code is faster (but see below).

Same for 16bit rotates.

Actually the generic version is (probably) horrid for everything except x86.
See https://www.godbolt.org/z/xTxYj57To

unsigned char rol(unsigned char v, unsigned int shift)
{
    return (v << 8 | v) << shift >> 8;
}

unsigned char ror(unsigned char v, unsigned int shift)
{
    return (v << 8 | v) >> shift;
}

	David


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/2] bitops: rotate: Add riscv implementation using Zbb extension
  2025-06-20 16:20   ` Yury Norov
  2025-06-25 16:02     ` David Laight
@ 2025-06-28 11:13     ` cp0613
  2025-06-29  1:48       ` Yury Norov
  1 sibling, 1 reply; 21+ messages in thread
From: cp0613 @ 2025-06-28 11:13 UTC (permalink / raw)
  To: yury.norov
  Cc: alex, aou, arnd, cp0613, linux-arch, linux-kernel, linux-riscv,
	linux, palmer, paul.walmsley

On Fri, 20 Jun 2025 12:20:47 -0400, yury.norov@gmail.com wrote:

> Can you add a comment about what is happening here? Are you sure it's
> optimized out in case of the 'legacy' alternative?

Thank you for your review. Yes, I referred to the existing variable__fls()
implementation, which should be fine.

> Here you wire ror/rol() to the variable_ror/rol() unconditionally, and
> that breaks compile-time rotation if the parameter is known at compile
> time.
> 
> I believe, generic implementation will allow compiler to handle this
> case better. Can you do a similar thing to what fls() does in the same
> file?

I did consider it, but I did not find any toolchain that provides an
implementation similar to __builtin_ror or __builtin_rol. If there is one,
please help point it out.
In addition, I did not consider it carefully before. If the rotate function
is to be genericized, all architectures need to include <asm-generic/bitops/rotate.h>.
I missed this step.
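
For an architecture with its own asm/bitops.h, that would just be an extra
line such as:

```
#include <asm-generic/bitops/rotate.h>
```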


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/2] bitops: rotate: Add riscv implementation using Zbb extension
  2025-06-25 16:02     ` David Laight
@ 2025-06-28 12:08       ` cp0613
  2025-06-29 10:38         ` David Laight
  0 siblings, 1 reply; 21+ messages in thread
From: cp0613 @ 2025-06-28 12:08 UTC (permalink / raw)
  To: david.laight.linux
  Cc: alex, aou, arnd, cp0613, linux-arch, linux-kernel, linux-riscv,
	linux, palmer, paul.walmsley, yury.norov

On Wed, 25 Jun 2025 17:02:34 +0100, david.laight.linux@gmail.com wrote:

> Is it even a gain in the zbb case?
> The "rorw" is only ever going to help full word rotates.
> Here you might as well do ((word << 8 | word) >> shift).
> 
> For "rol8" you'd need ((word << 24 | word) 'rol' shift).
> I still bet the generic code is faster (but see below).
> 
> Same for 16bit rotates.
> 
> Actually the generic version is (probably) horrid for everything except x86.
> See https://www.godbolt.org/z/xTxYj57To

Thanks for your suggestion, this website is very helpful. According to the
results, the generic version is indeed the most friendly to x86, and I think this
is also a reason why other architectures should be optimized. Taking the riscv64
ror32 implementation as an example, compare the number of assembly instructions
generated for the following two functions:
```
u32 zbb_opt_ror32(u32 word, unsigned int shift)
{
	asm volatile(
		".option push\n"
		".option arch,+zbb\n"
		"rorw %0, %1, %2\n"
		".option pop\n"
		: "=r" (word) : "r" (word), "r" (shift) :);

	return word;
}

u16 generic_ror32(u16 word, unsigned int shift)
{
	return (word >> (shift & 31)) | (word << ((-shift) & 31));
}
```
Their disassembly is:
```
zbb_opt_ror32:
<+0>:     addi    sp,sp,-16
<+2>:     sd      s0,0(sp)
<+4>:     sd      ra,8(sp)
<+6>:     addi    s0,sp,16
<+8>:     .insn   4, 0x60b5553b
<+12>:    ld      ra,8(sp)
<+14>:    ld      s0,0(sp)
<+16>:    sext.w  a0,a0
<+18>:    addi    sp,sp,16
<+20>:    ret

generic_ror32:
<+0>:     addi    sp,sp,-16
<+2>:     andi    a1,a1,31
<+4>:     sd      s0,0(sp)
<+6>:     sd      ra,8(sp)
<+8>:     addi    s0,sp,16
<+10>:    negw    a5,a1
<+14>:    sllw    a5,a0,a5
<+18>:    ld      ra,8(sp)
<+20>:    ld      s0,0(sp)
<+22>:    srlw    a0,a0,a1
<+26>:    or      a0,a0,a5
<+28>:    slli    a0,a0,0x30
<+30>:    srli    a0,a0,0x30
<+32>:    addi    sp,sp,16
<+34>:    ret
```
As can be seen, the Zbb-optimized implementation uses fewer instructions,
even for 16-bit and 8-bit data.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/2] bitops: rotate: Add riscv implementation using Zbb extension
  2025-06-28 11:13     ` cp0613
@ 2025-06-29  1:48       ` Yury Norov
  2025-06-30 12:04         ` cp0613
  0 siblings, 1 reply; 21+ messages in thread
From: Yury Norov @ 2025-06-29  1:48 UTC (permalink / raw)
  To: cp0613
  Cc: alex, aou, arnd, linux-arch, linux-kernel, linux-riscv, linux,
	palmer, paul.walmsley

On Sat, Jun 28, 2025 at 07:13:57PM +0800, cp0613@linux.alibaba.com wrote:
> On Fri, 20 Jun 2025 12:20:47 -0400, yury.norov@gmail.com wrote:
> 
> > Can you add a comment about what is happening here? Are you sure it's
> > optimized out in case of the 'legacy' alternative?
> 
> Thank you for your review. Yes, I referred to the existing variable__fls()
> implementation, which should be fine.

No, it's not fine. You trimmed your original email completely, so there's
no way to understand what I'm asking about; and you didn't answer my
question. So I'll ask again: what exactly are you doing in the line you've
trimmed out?
 
> > Here you wire ror/rol() to the variable_ror/rol() unconditionally, and
> > that breaks compile-time rotation if the parameter is known at compile
> > time.
> > 
> > I believe, generic implementation will allow compiler to handle this
> > case better. Can you do a similar thing to what fls() does in the same
> > file?
> 
> I did consider it, but I did not find any toolchain that provides an
> implementation similar to __builtin_ror or __builtin_rol. If there is one,
> please help point it out.

This is an example of the toolchain you're looking for:

  /**
   * rol64 - rotate a 64-bit value left
   * @word: value to rotate
   * @shift: bits to roll
   */
  static inline __u64 rol64(__u64 word, unsigned int shift)
  {
          return (word << (shift & 63)) | (word >> ((-shift) & 63));
  }

What I'm asking is: please show me that a rol/ror of constants is still
evaluated at compile time, i.e. that ror64(1234, 12) folds to a constant at
compile time.
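
An easy way to check is to compile something like this (untested, just to
illustrate) and confirm that the object code contains only the folded
constant, with no ror instruction:

```
u64 ror64_const_check(void)
{
	return ror64(1234, 12);
}
```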

> In addition, I did not consider it carefully before. If the rotate function
> is to be genericized, all architectures need to include <asm-generic/bitops/rotate.h>.
> I missed this step.

Sorry, I'm lost here about what you've considered and what not. I'm OK
with accelerating ror/rol, but I want to make sure that:

1. The most trivial compile-time case is actually evaluated at compile time; and
2. Any arch-specific code is well explained; and
3. The legacy case is optimized just as well as the non-legacy one.

Thanks,
Yury


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/2] bitops: rotate: Add riscv implementation using Zbb extension
  2025-06-28 12:08       ` cp0613
@ 2025-06-29 10:38         ` David Laight
  2025-06-30 12:14           ` cp0613
  0 siblings, 1 reply; 21+ messages in thread
From: David Laight @ 2025-06-29 10:38 UTC (permalink / raw)
  To: cp0613
  Cc: alex, aou, arnd, linux-arch, linux-kernel, linux-riscv, linux,
	palmer, paul.walmsley, yury.norov

On Sat, 28 Jun 2025 20:08:16 +0800
cp0613@linux.alibaba.com wrote:

> On Wed, 25 Jun 2025 17:02:34 +0100, david.laight.linux@gmail.com wrote:
> 
> > Is it even a gain in the zbb case?
> > The "rorw" is only ever going to help full word rotates.
> > Here you might as well do ((word << 8 | word) >> shift).
> > 
> > For "rol8" you'd need ((word << 24 | word) 'rol' shift).
> > I still bet the generic code is faster (but see below).
> > 
> > Same for 16bit rotates.
> > 
> > Actually the generic version is (probably) horrid for everything except x86.
> > See https://www.godbolt.org/z/xTxYj57To  
> 
> Thanks for your suggestion, this website is very helpful. According to the
> results, the generic version is indeed the most friendly to x86, and I think this
> is also a reason why other architectures should be optimized. Taking the riscv64
> ror32 implementation as an example, compare the number of assembly instructions
> generated for the following two functions:
> ```
> u32 zbb_opt_ror32(u32 word, unsigned int shift)
> {
> 	asm volatile(
> 		".option push\n"
> 		".option arch,+zbb\n"
> 		"rorw %0, %1, %2\n"
> 		".option pop\n"
> 		: "=r" (word) : "r" (word), "r" (shift) :);
> 
> 	return word;
> }
> 
> u16 generic_ror32(u16 word, unsigned int shift)
> {
> 	return (word >> (shift & 31)) | (word << ((-shift) & 31));
> }
> ```
> Their disassembly is:
> ```
> zbb_opt_ror32:
> <+0>:     addi    sp,sp,-16
> <+2>:     sd      s0,0(sp)
> <+4>:     sd      ra,8(sp)
> <+6>:     addi    s0,sp,16
> <+8>:     .insn   4, 0x60b5553b
> <+12>:    ld      ra,8(sp)
> <+14>:    ld      s0,0(sp)
> <+16>:    sext.w  a0,a0
> <+18>:    addi    sp,sp,16
> <+20>:    ret
> 
> generic_ror32:
> <+0>:     addi    sp,sp,-16
> <+2>:     andi    a1,a1,31
> <+4>:     sd      s0,0(sp)
> <+6>:     sd      ra,8(sp)
> <+8>:     addi    s0,sp,16
> <+10>:    negw    a5,a1
> <+14>:    sllw    a5,a0,a5
> <+18>:    ld      ra,8(sp)
> <+20>:    ld      s0,0(sp)
> <+22>:    srlw    a0,a0,a1
> <+26>:    or      a0,a0,a5
> <+28>:    slli    a0,a0,0x30
> <+30>:    srli    a0,a0,0x30
> <+32>:    addi    sp,sp,16
> <+34>:    ret
> ```
> It can be found that the zbb optimized implementation uses fewer instructions,
> even for 16-bit and 8-bit data.

Far too many register spills to stack.
I think you've forgotten to specify -O2

	David


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/2] bitops: rotate: Add riscv implementation using Zbb extension
  2025-06-29  1:48       ` Yury Norov
@ 2025-06-30 12:04         ` cp0613
  2025-06-30 16:53           ` Yury Norov
  0 siblings, 1 reply; 21+ messages in thread
From: cp0613 @ 2025-06-30 12:04 UTC (permalink / raw)
  To: yury.norov
  Cc: alex, aou, arnd, cp0613, linux-arch, linux-kernel, linux-riscv,
	linux, palmer, paul.walmsley

On Sat, 28 Jun 2025 21:48:02 -0400, yury.norov@gmail.com wrote:

> > > > +static inline u8 variable_ror8(u8 word, unsigned int shift)
> > > > +{
> > > > +	u32 word32 = ((u32)word << 24) | ((u32)word << 16) | ((u32)word << 8) | word;
> > > 
> > > Can you add a comment about what is happening here? Are you sure it's
> > > optimized out in case of the 'legacy' alternative?
> > 
> > Thank you for your review. Yes, I referred to the existing variable__fls()
> > implementation, which should be fine.
> 
> No, it's not fine. Because you trimmed your original email completely,
> so there's no way to understand what I'm asking about; and because you
> didn't answer my question. So I'll ask again: what exactly you are doing
> in the line you've trimmed out?

Sorry, I misunderstood your question. I have now restored the trimmed original
email. Here is my answer: the RISC-V Zbb extension only provides 64-bit rotate
instructions rol/ror and 32-bit rotate instructions rolw/rorw. Therefore, for
16-bit and 8-bit data, in order to use the rolw/rorw instructions, the value is
replicated to fill a 32-bit word, and the result is truncated back to the
original width afterwards.

This data preparation process does introduce additional operations. Compared with
the generic implementation, I used the web tool provided by David to illustrate.

The two functions that need to be compared are briefly summarized as follows:
```
unsigned char generic_ror8(unsigned char word, unsigned int shift)
{
	return (word >> (shift & 7)) | (word << ((-shift) & 7));
}

unsigned char zbb_opt_ror8(unsigned char word, unsigned int shift)
{
	unsigned int word32 = ((unsigned int)word << 24) | \
	    ((unsigned int)word << 16) | ((unsigned int)word << 8) | word;
#ifdef __riscv
	__asm__ volatile("nop"); // ALTERNATIVE(nop)

	__asm__ volatile(
		".option push\n"
		".option arch,+zbb\n"
		"rorw %0, %1, %2\n"
		".option pop\n"
		: "=r" (word32) : "r" (word32), "r" (shift) :);
#endif
	return (unsigned char)word32;
}
```
The disassembly obtained is:
```
generic_ror8:
    andi    a1,a1,7
    negw    a5,a1
    andi    a5,a5,7
    sllw    a5,a0,a5
    srlw    a0,a0,a1
    or      a0,a0,a5
    andi    a0,a0,0xff
    ret

zbb_opt_ror8:
    slli    a5,a0,8
    add     a0,a5,a0
    slliw   a5,a0,16
    addw    a5,a5,a0
    nop
    rorw a5, a5, a1
    andi    a0,a5,0xff
    ret
```
From the perspective of the total number of instructions, although zbb_opt_ror8 has
one more instruction, one of them is a nop, so the difference with generic_ror8 should
be very small; alternatively, the solution provided by David may be better for non-x86.

> > I did consider it, but I did not find any toolchain that provides an
> > implementation similar to __builtin_ror or __builtin_rol. If there is one,
> > please help point it out.
> 
> This is the example of the toolchain you're looking for:
> 
>   /**
>    * rol64 - rotate a 64-bit value left
>    * @word: value to rotate
>    * @shift: bits to roll
>    */
>   static inline __u64 rol64(__u64 word, unsigned int shift)
>   {
>           return (word << (shift & 63)) | (word >> ((-shift) & 63));
>   }
> 
> What I'm asking is: please show me that compile-time rol/ror is still
> calculated at compile time, i.e. ror64(1234, 12) is evaluated at
> compile time.

I see what you mean, I didn't consider the case of constants being evaluated
at compile time, as you pointed out earlier:
"you wire ror/rol() to the variable_ror/rol() unconditionally, and that breaks
compile-time rotation if the parameter is known at compile time."

In the absence of compiler built-in function support, I think it can be handled
like this:
```
#define rol16(word, shift) \
	(__builtin_constant_p(word) && __builtin_constant_p(shift) ? \
	generic_rol16(word, shift) : variable_rol16(word, shift))
```
What do you think?

> > In addition, I did not consider it carefully before. If the rotate function
> > is to be genericized, all archneed to include <asm-generic/bitops/rotate.h>.
> > I missed this step.
> 
> Sorry, I'm lost here about what you've considered and what not. I'm OK
> about accelerating ror/rol, but I want to make sure that;
> 
> 1. The most trivial compile-case is actually evaluated at compile time; and
> 2. Any arch-specific code is well explained; and
> 3. legacy case optimized just as well as non-legacy.

1. As in the above reply, use the generic implementation when compile-time evaluation
   is possible.
2. I will improve the comments later.
3. As mentioned before, only 8-bit rotation should have no optimization effect, and
   16-bit and above will have significant optimization.

Thanks,
Pei



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/2] bitops: rotate: Add riscv implementation using Zbb extension
  2025-06-29 10:38         ` David Laight
@ 2025-06-30 12:14           ` cp0613
  2025-06-30 17:35             ` David Laight
  0 siblings, 1 reply; 21+ messages in thread
From: cp0613 @ 2025-06-30 12:14 UTC (permalink / raw)
  To: david.laight.linux
  Cc: alex, aou, arnd, cp0613, linux-arch, linux-kernel, linux-riscv,
	linux, palmer, paul.walmsley, yury.norov

On Sun, 29 Jun 2025 11:38:40 +0100, david.laight.linux@gmail.com wrote:

> > It can be found that the zbb optimized implementation uses fewer instructions,
> > even for 16-bit and 8-bit data.
> 
> Far too many register spills to stack.
> I think you've forgotten to specify -O2

Yes, the earlier listing was extracted from the vmlinux disassembly rather than
compiled with -O2. I have now used the web tool you provided, as follows:
```
unsigned int generic_ror32(unsigned int word, unsigned int shift)
{
	return (word >> (shift & 31)) | (word << ((-shift) & 31));
}

unsigned int zbb_opt_ror32(unsigned int word, unsigned int shift)
{
#ifdef __riscv
	__asm__ volatile("nop"); // ALTERNATIVE(nop)

	__asm__ volatile(
		".option push\n"
		".option arch,+zbb\n"
		"rorw %0, %1, %2\n"
		".option pop\n"
		: "=r" (word) : "r" (word), "r" (shift) :);
#endif
	return word;
}

unsigned short generic_ror16(unsigned short word, unsigned int shift)
{
	return (word >> (shift & 15)) | (word << ((-shift) & 15));
}

unsigned short zbb_opt_ror16(unsigned short word, unsigned int shift)
{
	unsigned int word32 = ((unsigned int)word << 16) | word;
#ifdef __riscv
	__asm__ volatile("nop"); // ALTERNATIVE(nop)

	__asm__ volatile(
		".option push\n"
		".option arch,+zbb\n"
		"rorw %0, %1, %2\n"
		".option pop\n"
		: "=r" (word32) : "r" (word32), "r" (shift) :);
#endif
	return (unsigned short)word;
}
```
The disassembly obtained is:
```
generic_ror32:
    andi    a1,a1,31
    negw    a5,a1
    sllw    a5,a0,a5
    srlw    a0,a0,a1
    or      a0,a5,a0
    ret

zbb_opt_ror32:
    nop
    rorw a0, a0, a1
    sext.w  a0,a0
    ret

generic_ror16:
    andi    a1,a1,15
    negw    a5,a1
    andi    a5,a5,15
    sllw    a5,a0,a5
    srlw    a0,a0,a1
    or      a0,a0,a5
    slli    a0,a0,48
    srli    a0,a0,48
    ret

zbb_opt_ror16:
    slliw   a5,a0,16
    addw    a5,a5,a0
    nop
    rorw a5, a5, a1
    ret
```

Thanks,
Pei


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/2] bitops: rotate: Add riscv implementation using Zbb extension
  2025-06-30 12:04         ` cp0613
@ 2025-06-30 16:53           ` Yury Norov
  2025-07-01 12:47             ` cp0613
  0 siblings, 1 reply; 21+ messages in thread
From: Yury Norov @ 2025-06-30 16:53 UTC (permalink / raw)
  To: cp0613
  Cc: alex, aou, arnd, linux-arch, linux-kernel, linux-riscv, linux,
	palmer, paul.walmsley

On Mon, Jun 30, 2025 at 08:04:53PM +0800, cp0613@linux.alibaba.com wrote:
> On Sat, 28 Jun 2025 21:48:02 -0400, yury.norov@gmail.com wrote:
> 
> > > > > +static inline u8 variable_ror8(u8 word, unsigned int shift)
> > > > > +{
> > > > > +	u32 word32 = ((u32)word << 24) | ((u32)word << 16) | ((u32)word << 8) | word;
> > > > 
> > > > Can you add a comment about what is happening here? Are you sure it's
> > > > optimized out in case of the 'legacy' alternative?
> > > 
> > > Thank you for your review. Yes, I referred to the existing variable__fls()
> > > implementation, which should be fine.
> > 
> > No, it's not fine. Because you trimmed your original email completely,
> > so there's no way to understand what I'm asking about; and because you
> > didn't answer my question. So I'll ask again: what exactly you are doing
> > in the line you've trimmed out?
> 
> Sorry, I misunderstood your question. Now I have made up for the lost original
> email. This is my answer. The RISC-V Zbb extension only provides 64-bit data
> rotation instructions rol/ror and 32-bit data rotation instructions rolw/rorw.
> Therefore, for 16-bit and 8-bit data, in order to use the rolw/rorw instruction
> optimization, the data is cyclically spliced here, and the corresponding number
> of bits is truncated after processing to achieve the function.
> 
> This data preparation process does introduce additional operations. Compared with
> the generic implementation, I used the web tool provided by David to illustrate.
> 
> The two functions that need to be compared are briefly summarized as follows:
> ```
> unsigned char generic_ror8(unsigned char word, unsigned int shift)
> {
> 	return (word >> (shift & 7)) | (word << ((-shift) & 7));
> }
> 
> unsigned char zbb_opt_ror8(unsigned char word, unsigned int shift)
> {
> 	unsigned int word32 = ((unsigned int)word << 24) | \
> 	    ((unsigned int)word << 16) | ((unsigned int)word << 8) | word;
> #ifdef __riscv
> 	__asm__ volatile("nop"); // ALTERNATIVE(nop)
> 
> 	__asm__ volatile(
> 		".option push\n"
> 		".option arch,+zbb\n"
> 		"rorw %0, %1, %2\n"
> 		".option pop\n"
> 		: "=r" (word32) : "r" (word32), "r" (shift) :);
> #endif
> 	return (unsigned char)word32;
> }
> ```
> The disassembly obtained is:
> ```
> generic_ror8:
>     andi    a1,a1,7
>     negw    a5,a1
>     andi    a5,a5,7
>     sllw    a5,a0,a5
>     srlw    a0,a0,a1
>     or      a0,a0,a5
>     andi    a0,a0,0xff
>     ret
> 
> zbb_opt_ror8:
>     slli    a5,a0,8
>     add     a0,a5,a0
>     slliw   a5,a0,16
>     addw    a5,a5,a0
>     nop
>     rorw a5, a5, a1
>     andi    a0,a5,0xff
>     ret
> ```
> From the perspective of the total number of instructions, although zbb_opt_ror8 has
> one more instruction, one of them is a nop, so the difference with generic_ror8 should
> be very small, or using the solution provided by David would be better for non-x86.

And what about performance?

> > > I did consider it, but I did not find any toolchain that provides an
> > > implementation similar to __builtin_ror or __builtin_rol. If there is one,
> > > please help point it out.
> > 
> > This is the example of the toolchain you're looking for:
> > 
> >   /**
> >    * rol64 - rotate a 64-bit value left
> >    * @word: value to rotate
> >    * @shift: bits to roll
> >    */
> >   static inline __u64 rol64(__u64 word, unsigned int shift)
> >   {
> >           return (word << (shift & 63)) | (word >> ((-shift) & 63));
> >   }
> > 
> > What I'm asking is: please show me that compile-time rol/ror is still
> > calculated at compile time, i.e. ror64(1234, 12) is evaluated at
> > compile time.
> 
> I see what you mean, I didn't consider the case of constants being evaluated
> at compile time, as you pointed out earlier:
> "you wire ror/rol() to the variable_ror/rol() unconditionally, and that breaks
> compile-time rotation if the parameter is known at compile time."
> 
> In the absence of compiler built-in function support, I think it can be handled
> like this:
> ```
> #define rol16(word, shift) \
> 	(__builtin_constant_p(word) && __builtin_constant_p(shift) ? \
> 	generic_rol16(word, shift) : variable_rol16(word, shift))
> ```
> How do you see?

That's what I meant.
 
> > > In addition, I did not consider it carefully before. If the rotate function
> > > is to be genericized, all archneed to include <asm-generic/bitops/rotate.h>.
> > > I missed this step.
> > 
> > Sorry, I'm lost here about what you've considered and what not. I'm OK
> > about accelerating ror/rol, but I want to make sure that;
> > 
> > 1. The most trivial compile-case is actually evaluated at compile time; and
> > 2. Any arch-specific code is well explained; and
> > 3. legacy case optimized just as well as non-legacy.
> 
> 1. As in the above reply, use the generic implementation when compile-time evaluation
>    is possible.
> 2. I will improve the comments later.

I'm particularly interested in ror8/rol8 case:

        u32 word32 = ((u32)word << 24) | ((u32)word << 16) | ((u32)word << 8) | word;

When you expand it to a 32-bit word and want to rotate, you obviously
need to copy the lower quarterword to the higher one:

        0xab -> 0xab0000ab

That way generic (u8)ror32(0xab, shift) would work. But I don't understand
why you copy the lower 8 bits to inner quarterwords. Is that a hardware
requirement? Can you point to any arch documentation?

> 3. As mentioned before, only 8-bit rotation should have no optimization effect, and
>    16-bit and above will have significant optimization.

I asked you about the asm goto ("legacy") thing: you calculate that
complex word32 _before_ evaluating the goto. So this word32 may get
unused, and you waste cycles. I want to make sure this isn't the case.

Please find attached a test for compile-time ror/rol evaluation.
Please consider prepending your series with it.

Thanks,
Yury

From 5c5be22117a2bd0a656efae0efd6ed159d4168c5 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 30 Jun 2025 12:07:47 -0400
Subject: [PATCH] bitops: add compile-time test for ror() and rol()

If the parameters of these functions are known at compile time, the compiler
must calculate the result at compile time as well.

Now that architectures introduce accelerated implementations for bit
rotation, we must make sure that they don't break compile-time
evaluation.

This patch adds a test for it, similarly to test_bitmap_const_eval().

Tested on x86_64.

Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
---
 lib/test_bitops.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/lib/test_bitops.c b/lib/test_bitops.c
index 55669624bb28..aa4c06df34f7 100644
--- a/lib/test_bitops.c
+++ b/lib/test_bitops.c
@@ -76,6 +76,56 @@ static int __init test_fns(void)
 	return 0;
 }
 
+static void __init test_bitops_const_eval(void)
+{
+	/*
+	 * ror/rol operations on parameters known at compile-time must be
+	 * optimized to compile-time constants on any supported optimization
+	 * level (-O2, -Os) and all architectures. Otherwise, trigger a build
+	 * bug.
+	 */
+
+	u64 r64 = ror64(0x1234567890abcdefull, 24);
+
+	BUILD_BUG_ON(!__builtin_constant_p(r64));
+	BUILD_BUG_ON(r64 != 0xabcdef1234567890ull);
+
+	u64 l64 = rol64(0x1234567890abcdefull, 24);
+
+	BUILD_BUG_ON(!__builtin_constant_p(l64));
+	BUILD_BUG_ON(l64 != 0x7890abcdef123456ull);
+
+	u32 r32 = ror32(0x12345678, 24);
+
+	BUILD_BUG_ON(!__builtin_constant_p(r32));
+	BUILD_BUG_ON(r32 != 0x34567812);
+
+	u32 l32 = rol32(0x12345678, 24);
+
+	BUILD_BUG_ON(!__builtin_constant_p(l32));
+	BUILD_BUG_ON(l32 != 0x78123456);
+
+	u16 r16 = ror16(0x1234, 12);
+
+	BUILD_BUG_ON(!__builtin_constant_p(r16));
+	BUILD_BUG_ON(r16 != 0x2341);
+
+	u16 l16 = rol16(0x1234, 12);
+
+	BUILD_BUG_ON(!__builtin_constant_p(l16));
+	BUILD_BUG_ON(l16 != 0x4123);
+
+	u8 r8 = ror8(0x12, 6);
+
+	BUILD_BUG_ON(!__builtin_constant_p(r8));
+	BUILD_BUG_ON(r8 != 0x48);
+
+	u8 l8 = rol8(0x12, 6);
+
+	BUILD_BUG_ON(!__builtin_constant_p(l8));
+	BUILD_BUG_ON(l8 != 0x84);
+}
+
 static int __init test_bitops_startup(void)
 {
 	int i, bit_set;
@@ -121,6 +171,7 @@ static int __init test_bitops_startup(void)
 		pr_err("ERROR: FOUND SET BIT %d\n", bit_set);
 
 	test_fns();
+	test_bitops_const_eval();
 
 	pr_info("Completed bitops test\n");
 
-- 
2.43.0



^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/2] bitops: rotate: Add riscv implementation using Zbb extension
  2025-06-30 12:14           ` cp0613
@ 2025-06-30 17:35             ` David Laight
  2025-07-01 13:01               ` cp0613
  0 siblings, 1 reply; 21+ messages in thread
From: David Laight @ 2025-06-30 17:35 UTC (permalink / raw)
  To: cp0613
  Cc: alex, aou, arnd, linux-arch, linux-kernel, linux-riscv, linux,
	palmer, paul.walmsley, yury.norov

On Mon, 30 Jun 2025 20:14:30 +0800
cp0613@linux.alibaba.com wrote:

> On Sun, 29 Jun 2025 11:38:40 +0100, david.laight.linux@gmail.com wrote:
> 
> > > It can be found that the zbb optimized implementation uses fewer instructions,
> > > even for 16-bit and 8-bit data.  
> > 
> > Far too many register spills to stack.
> > I think you've forgotten to specify -O2  
> 
> Yes, I extracted it from the vmlinux disassembly, without compiling with -O2, and
> I used the web tool you provided as follows:
> ```
> unsigned int generic_ror32(unsigned int word, unsigned int shift)
> {
> 	return (word >> (shift & 31)) | (word << ((-shift) & 31));
> }
> 
> unsigned int zbb_opt_ror32(unsigned int word, unsigned int shift)
> {
> #ifdef __riscv
> 	__asm__ volatile("nop"); // ALTERNATIVE(nop)
> 
> 	__asm__ volatile(
> 		".option push\n"
> 		".option arch,+zbb\n"
> 		"rorw %0, %1, %2\n"
> 		".option pop\n"
> 		: "=r" (word) : "r" (word), "r" (shift) :);
> #endif
> 	return word;
> }
> 
> unsigned short generic_ror16(unsigned short word, unsigned int shift)
> {
> 	return (word >> (shift & 15)) | (word << ((-shift) & 15));
> }
> 
> unsigned short zbb_opt_ror16(unsigned short word, unsigned int shift)
> {
> 	unsigned int word32 = ((unsigned int)word << 16) | word;
> #ifdef __riscv
> 	__asm__ volatile("nop"); // ALTERNATIVE(nop)
> 
> 	__asm__ volatile(
> 		".option push\n"
> 		".option arch,+zbb\n"
> 		"rorw %0, %1, %2\n"
> 		".option pop\n"
> 		: "=r" (word32) : "r" (word32), "r" (shift) :);
> #endif
> 	return (unsigned short)word;
> }
> ```
> The disassembly obtained is:
> ```
> generic_ror32:
>     andi    a1,a1,31

The compiler shouldn't be generating that mask.
After all it knows the negated value doesn't need the same mask.
(I'd guess the cpu just ignores the high bits of the shift - most do.)

>     negw    a5,a1
>     sllw    a5,a0,a5
>     srlw    a0,a0,a1
>     or      a0,a5,a0
>     ret
> 
> zbb_opt_ror32:
>     nop
>     rorw a0, a0, a1
>     sext.w  a0,a0

Is that a sign extend?
Why is it there?
If it is related to the (broken) 'feature' of riscv-64 that 32bit results
are sign extended, why isn't there one in the example above.

You also need to consider the code for non-zbb cpu.

>     ret
> 
> generic_ror16:
>     andi    a1,a1,15
>     negw    a5,a1
>     andi    a5,a5,15
>     sllw    a5,a0,a5
>     srlw    a0,a0,a1
>     or      a0,a0,a5
>     slli    a0,a0,48
>     srli    a0,a0,48

The last two instructions mask the result with 0xffff.
If that is necessary it is missing from the zbb version below.

>     ret
> 
> zbb_opt_ror16:
>     slliw   a5,a0,16
>     addw    a5,a5,a0

At this point you can just do a 'shift right' on all cpu.
For rol16 you can do a variable shift left and a 16 bit
shift right on all cpu.
If the zbb version ends up with a nop (as below) then it is
likely to be much the same speed.
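
For example, a minimal sketch of that idea for ror16 (a sketch only, with a
made-up helper name, assuming the shift has already been reduced to 0..15):
```
static inline unsigned short sketch_ror16(unsigned short word, unsigned int shift)
{
	/* duplicate the halfword so bits 31:16 mirror bits 15:0 */
	unsigned int word32 = ((unsigned int)word << 16) | word;

	/* a plain logical shift right now leaves ror16() in the low 16 bits */
	return (unsigned short)(word32 >> shift);
}
```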

	David

>     nop
>     rorw a5, a5, a1
>     ret
> ```
> 
> Thanks,
> Pei



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/2] bitops: rotate: Add riscv implementation using Zbb extension
  2025-06-30 16:53           ` Yury Norov
@ 2025-07-01 12:47             ` cp0613
  2025-07-01 18:32               ` Yury Norov
  0 siblings, 1 reply; 21+ messages in thread
From: cp0613 @ 2025-07-01 12:47 UTC (permalink / raw)
  To: yury.norov
  Cc: alex, aou, arnd, cp0613, linux-arch, linux-kernel, linux-riscv,
	linux, palmer, paul.walmsley

On Mon, 30 Jun 2025 12:53:03 -0400, yury.norov@gmail.com wrote:

> > This data preparation process does introduce additional operations. Compared with
> > the generic implementation, I used the web tool provided by David to illustrate.
> > 
> > The two functions that need to be compared are briefly summarized as follows:
> > ```
> > unsigned char generic_ror8(unsigned char word, unsigned int shift)
> > {
> > 	return (word >> (shift & 7)) | (word << ((-shift) & 7));
> > }
> > 
> > unsigned char zbb_opt_ror8(unsigned char word, unsigned int shift)
> > {
> > 	unsigned int word32 = ((unsigned int)word << 24) | \
> > 	    ((unsigned int)word << 16) | ((unsigned int)word << 8) | word;
> > #ifdef __riscv
> > 	__asm__ volatile("nop"); // ALTERNATIVE(nop)
> > 
> > 	__asm__ volatile(
> > 		".option push\n"
> > 		".option arch,+zbb\n"
> > 		"rorw %0, %1, %2\n"
> > 		".option pop\n"
> > 		: "=r" (word32) : "r" (word32), "r" (shift) :);
> > #endif
> > 	return (unsigned char)word32;
> > }
> > ```
> > The disassembly obtained is:
> > ```
> > generic_ror8:
> >     andi    a1,a1,7
> >     negw    a5,a1
> >     andi    a5,a5,7
> >     sllw    a5,a0,a5
> >     srlw    a0,a0,a1
> >     or      a0,a0,a5
> >     andi    a0,a0,0xff
> >     ret
> > 
> > zbb_opt_ror8:
> >     slli    a5,a0,8
> >     add     a0,a5,a0
> >     slliw   a5,a0,16
> >     addw    a5,a5,a0
> >     nop
> >     rorw a5, a5, a1
> >     andi    a0,a5,0xff
> >     ret
> > ```
> > From the perspective of the total number of instructions, although zbb_opt_ror8 has
> > one more instruction, one of them is a nop, so the difference with generic_ror8 should
> > be very small, or using the solution provided by David would be better for non-x86.
> 
> And what about performance?

Please allow me to provide the performance numbers later in this reply, since the
implementation has changed.

> > > Sorry, I'm lost here about what you've considered and what not. I'm OK
> > > about accelerating ror/rol, but I want to make sure that;
> > > 
> > > 1. The most trivial compile-case is actually evaluated at compile time; and
> > > 2. Any arch-specific code is well explained; and
> > > 3. legacy case optimized just as well as non-legacy.
> > 
> > 1. As in the above reply, use the generic implementation when compile-time evaluation
> >    is possible.
> > 2. I will improve the comments later.
> 
> I'm particularly interested in ror8/rol8 case:
> 
>         u32 word32 = ((u32)word << 24) | ((u32)word << 16) | ((u32)word << 8) | word;
> 
> When you expand it to 32-bit word, and want to rotate, you obviously
> need to copy lower quarterword to the higher one:
> 
>         0xab >> 0xab0000ab
> 
> That way generic (u8)ror32(0xab, shift) would work. But I don't understand
> why you copy the lower 8 bits to inner quarterwords. Is that a hardware
> requirement? Can you point to any arch documentation 
> 
> > 3. As mentioned before, only 8-bit rotation should have no optimization effect, and
> >    16-bit and above will have significant optimization.
> 
> I asked you about the asm goto ("legacy") thing: you calculate that
> complex word32 _before_ evaluating the goto. So this word32 may get
> unused, and you waste cycles. I want to make sure this isn't the case.

Thank you for the suggestion and reminder. This is not a hardware requirement, but it
is related, because the Zbb instructions only support word-sized rotates [1]. If the
shift is not reduced modulo 8, shifts greater than 8 can only be handled by replicating
the 8-bit value across all four bytes of the 32-bit word.

So I considered reducing the shift modulo 8 instead, and found that this needs one
instruction less, so the final implementation is as follows:
```
static inline u8 variable_rol8(u8 word, unsigned int shift)
{
	u32 word32;

	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
				      RISCV_ISA_EXT_ZBB, 1)
			  : : : : legacy);

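	/*
	 * Copy the byte into bits 31:24 so that a 32-bit rolw by (shift % 8)
	 * brings the bits rotated out of the low byte back in at bit 0.
	 */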
	word32 = ((u32)word << 24) | word;
	shift = shift % 8;

	asm volatile(
		".option push\n"
		".option arch,+zbb\n"
		"rolw %0, %1, %2\n"
		".option pop\n"
		: "=r" (word32) : "r" (word32), "r" (shift) :);

	return (u8)word32;

legacy:
	return generic_rol8(word, shift);
}

static inline u8 variable_ror8(u8 word, unsigned int shift)
{
	u32 word32;

	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
				      RISCV_ISA_EXT_ZBB, 1)
			  : : : : legacy);

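	/*
	 * Copy the byte into bits 15:8 so that a 32-bit rorw by (shift % 8)
	 * brings the bits rotated out at bit 0 back in at the top of the low byte.
	 */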
	word32 = ((u32)word << 8) | word;
	shift = shift % 8;

	asm volatile(
		".option push\n"
		".option arch,+zbb\n"
		"rorw %0, %1, %2\n"
		".option pop\n"
		: "=r" (word32) : "r" (word32), "r" (shift) :);

	return (u8)word32;

legacy:
	return generic_ror8(word, shift);
}
```

I compared the performance of ror8 (zbb optimized) and generic_ror8 on the XUANTIE C908
by looping them. ror8 is better, and the advantage of ror8 becomes more obvious as the
number of iterations increases. The test code is as follows:
```
	u8 word = 0x5a;
	u32 shift = 9;
	u32 i, loop = 100;
	u8 ret1, ret2;

	u64 t1 = ktime_get_ns();
	for (i = 0; i < loop; i++) {
		ret2 = generic_ror8(word, shift);
	}
	u64 t2 = ktime_get_ns();
	for (i = 0; i < loop; i++) {
		ret1 = ror8(word, shift);
	}
	u64 t3 = ktime_get_ns();

	pr_info("t2-t1=%lld t3-t2=%lld\n", t2 - t1, t3 - t2);
```

> Please find attached a test for compile-time ror/rol evaluation.
> Please consider prepending your series with it.

Okay, I'll include it in the next series.

[1] https://github.com/riscv/riscv-bitmanip



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/2] bitops: rotate: Add riscv implementation using Zbb extension
  2025-06-30 17:35             ` David Laight
@ 2025-07-01 13:01               ` cp0613
  0 siblings, 0 replies; 21+ messages in thread
From: cp0613 @ 2025-07-01 13:01 UTC (permalink / raw)
  To: david.laight.linux
  Cc: alex, aou, arnd, cp0613, linux-arch, linux-kernel, linux-riscv,
	linux, palmer, paul.walmsley, yury.norov

On Mon, 30 Jun 2025 18:35:34 +0100, david.laight.linux@gmail.com wrote:

> > On Sun, 29 Jun 2025 11:38:40 +0100, david.laight.linux@gmail.com wrote:
> > 
> > > > It can be found that the zbb optimized implementation uses fewer instructions,
> > > > even for 16-bit and 8-bit data.  
> > > 
> > > Far too many register spills to stack.
> > > I think you've forgotten to specify -O2  
> > 
> > Yes, I extracted it from the vmlinux disassembly, without compiling with -O2, and
> > I used the web tool you provided as follows:
> > ```
> > unsigned int generic_ror32(unsigned int word, unsigned int shift)
> > {
> > 	return (word >> (shift & 31)) | (word << ((-shift) & 31));
> > }
> > 
> > unsigned int zbb_opt_ror32(unsigned int word, unsigned int shift)
> > {
> > #ifdef __riscv
> > 	__asm__ volatile("nop"); // ALTERNATIVE(nop)
> > 
> > 	__asm__ volatile(
> > 		".option push\n"
> > 		".option arch,+zbb\n"
> > 		"rorw %0, %1, %2\n"
> > 		".option pop\n"
> > 		: "=r" (word) : "r" (word), "r" (shift) :);
> > #endif
> > 	return word;
> > }
> > 
> > unsigned short generic_ror16(unsigned short word, unsigned int shift)
> > {
> > 	return (word >> (shift & 15)) | (word << ((-shift) & 15));
> > }
> > 
> > unsigned short zbb_opt_ror16(unsigned short word, unsigned int shift)
> > {
> > 	unsigned int word32 = ((unsigned int)word << 16) | word;
> > #ifdef __riscv
> > 	__asm__ volatile("nop"); // ALTERNATIVE(nop)
> > 
> > 	__asm__ volatile(
> > 		".option push\n"
> > 		".option arch,+zbb\n"
> > 		"rorw %0, %1, %2\n"
> > 		".option pop\n"
> > 		: "=r" (word32) : "r" (word32), "r" (shift) :);
> > #endif
> > 	return (unsigned short)word;
> > }
> > ```
> > The disassembly obtained is:
> > ```
> > generic_ror32:
> >     andi    a1,a1,31
> 
> The compiler shouldn't be generating that mask.
> After all it knows the negated value doesn't need the same mask.
> (I'd guess the cpu just ignores the high bits of the shift - most do.)
> 
> >     negw    a5,a1
> >     sllw    a5,a0,a5
> >     srlw    a0,a0,a1
> >     or      a0,a5,a0
> >     ret
> > 
> > zbb_opt_ror32:
> >     nop
> >     rorw a0, a0, a1
> >     sext.w  a0,a0
> 
> Is that a sign extend?
> Why is it there?
> If it is related to the (broken) 'feature' of riscv-64 that 32bit results
> are sign extended, why isn't there one in the example above.
> 
> You also need to consider the code for non-zbb cpu.
> 
> >     ret
> > 
> > generic_ror16:
> >     andi    a1,a1,15
> >     negw    a5,a1
> >     andi    a5,a5,15
> >     sllw    a5,a0,a5
> >     srlw    a0,a0,a1
> >     or      a0,a0,a5
> >     slli    a0,a0,48
> >     srli    a0,a0,48
> 
> The last two instructions mask the result with 0xffff.
> If that is necessary it is missing from the zbb version below.
> 
> >     ret
> > 
> > zbb_opt_ror16:
> >     slliw   a5,a0,16
> >     addw    a5,a5,a0
> 
> At this point you can just do a 'shift right' on all cpu.
> For rol16 you can do a variable shift left and a 16 bit
> shift right on all cpu.
> If the zbb version ends up with a nop (as below) then it is
> likely to be much the same speed.
> 
> 	David
> 
> >     nop
> >     rorw a5, a5, a1
> >     ret
> > ```

Sorry, please allow me to reply to these points together. I did not check the above
assembly for plausibility; I only generated it with the web tool you provided earlier.
In fact, I think disassembling vmlinux reflects the real situation more closely. In
addition, the code here is simplified; the complete implementation does take into
account processors that do not support, or have not enabled, Zbb.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/2] bitops: rotate: Add riscv implementation using Zbb extension
  2025-07-01 12:47             ` cp0613
@ 2025-07-01 18:32               ` Yury Norov
  2025-07-02 10:11                 ` David Laight
  2025-07-02 12:30                 ` cp0613
  0 siblings, 2 replies; 21+ messages in thread
From: Yury Norov @ 2025-07-01 18:32 UTC (permalink / raw)
  To: cp0613
  Cc: alex, aou, arnd, linux-arch, linux-kernel, linux-riscv, linux,
	palmer, paul.walmsley

On Tue, Jul 01, 2025 at 08:47:01PM +0800, cp0613@linux.alibaba.com wrote:
> On Mon, 30 Jun 2025 12:53:03 -0400, yury.norov@gmail.com wrote:
> 
> > > > 1. The most trivial compile-case is actually evaluated at compile time; and
> > > > 2. Any arch-specific code is well explained; and
> > > > 3. legacy case optimized just as well as non-legacy.
> > > 
> > > 1. As in the above reply, use the generic implementation when compile-time evaluation
> > >    is possible。
> > > 2. I will improve the comments later.
> > 
> > I'm particularly interested in ror8/rol8 case:
> > 
> >         u32 word32 = ((u32)word << 24) | ((u32)word << 16) | ((u32)word << 8) | word;
> > 
> > When you expand it to 32-bit word, and want to rotate, you obviously
> > need to copy lower quarterword to the higher one:
> > 
> >         0xab >> 0xab0000ab
> > 
> > That way generic (u8)ror32(0xab, shift) would work. But I don't understand
> > why you copy the lower 8 bits to inner quarterwords. Is that a hardware
> > requirement? Can you point to any arch documentation 
> > 
> > > 3. As mentioned before, only 8-bit rotation should have no optimization effect, and
> > >    16-bit and above will have significant optimization.
> > 
> > I asked you about the asm goto ("legacy") thing: you calculate that
> > complex word32 _before_ evaluating the goto. So this word32 may get
> > unused, and you waste cycles. I want to make sure this isn't the case.
> 
> Thank you for your suggestion and reminder. This is not a hardware requirement, but it

Sure. Please add

Suggested-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>

> is somewhat related because the zbb instruction only supports word-sized rotate [1].
> Considering the case where the shift is greater than 8, if the modulus of the shift is
> not taken, it is required to continuously concatenate 8-bit data into 32 bits.
> 
> So, I considered the case of taking the remainder of shift and found that this
> implementation would have one less instruction, so the final implementation is as follows:
> ```
> static inline u8 variable_rol8(u8 word, unsigned int shift)

Now that it has assembler inlines, would it help to add the __pure
qualifier?

> {
> 	u32 word32;
> 
> 	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
> 				      RISCV_ISA_EXT_ZBB, 1)
> 			  : : : : legacy);
> 
> 	word32 = ((u32)word << 24) | word;
> 	shift = shift % 8;

        shift %= 8;

> 
> 	asm volatile(
> 		".option push\n"
> 		".option arch,+zbb\n"
> 		"rolw %0, %1, %2\n"
> 		".option pop\n"
> 		: "=r" (word32) : "r" (word32), "r" (shift) :);
> 
> 	return (u8)word32;
> 
> legacy:
> 	return generic_rol8(word, shift);
> }
> 
> static inline u8 variable_ror8(u8 word, unsigned int shift)
> {
> 	u32 word32;
> 
> 	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
> 				      RISCV_ISA_EXT_ZBB, 1)
> 			  : : : : legacy);
> 
> 	word32 = ((u32)word << 8) | word;
> 	shift = shift % 8;
> 
> 	asm volatile(
> 		".option push\n"
> 		".option arch,+zbb\n"
> 		"rorw %0, %1, %2\n"
> 		".option pop\n"
> 		: "=r" (word32) : "r" (word32), "r" (shift) :);
> 
> 	return (u8)word32;
> 
> legacy:
> 	return generic_ror8(word, shift);
> }
> ```

OK, this looks better.

> I compared the performance of ror8 (zbb optimized) and generic_ror8 on the XUANTIE C908
> by looping them. ror8 is better, and the advantage of ror8 becomes more obvious as the
> number of iterations increases. The test code is as follows:
> ```
> 	u8 word = 0x5a;
> 	u32 shift = 9;
> 	u32 i, loop = 100;
> 	u8 ret1, ret2;
> 
> 	u64 t1 = ktime_get_ns();
> 	for (i = 0; i < loop; i++) {
> 		ret2 = generic_ror8(word, shift);
> 	}
> 	u64 t2 = ktime_get_ns();
> 	for (i = 0; i < loop; i++) {
> 		ret1 = ror8(word, shift);
> 	}
> 	u64 t3 = ktime_get_ns();
> 
> 	pr_info("t2-t1=%lld t3-t2=%lld\n", t2 - t1, t3 - t2);
> ```

Please do the following:

1. Drop the generic_ror8() and keep only ror/l8()
2. Add ror/l16, 32 and 64 tests.
3. Adjust the 'loop' so that each subtest will take 1-10 ms on your hw.
4. Name the function like test_rorl(), and put it next to the test_fns()
   in lib/test_bitops.c. Refer to 0a2c6664e56f0 for the implementation (a rough
   sketch of the shape is below).
5. Prepend the series with the benchmark patch, just as with the const-eval
   one, and report before/after numbers for your series.
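
A very rough sketch of the shape I mean (placeholder loop count and names only,
not what 0a2c6664e56f0 actually contains):
```
static int __init test_rorl(void)
{
	/* volatile shift and sinks keep the compiler from hoisting the
	 * rotates out of the loops or dropping the loops entirely */
	volatile unsigned int shift = 3;
	volatile u8 sink8;
	volatile u16 sink16;
	u64 t0, t1, t2;
	unsigned int i, loop = 10000;

	t0 = ktime_get_ns();
	for (i = 0; i < loop; i++)
		sink8 = ror8(0x5a, shift);
	t1 = ktime_get_ns();
	for (i = 0; i < loop; i++)
		sink16 = ror16(0x5a5a, shift);
	t2 = ktime_get_ns();

	pr_info("ror8: %llu ns, ror16: %llu ns\n", t1 - t0, t2 - t1);

	return 0;
}
```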

> > Please find attached a test for compile-time ror/rol evaluation.
> > Please consider prepending your series with it.
> 
> Okay, I'll bring it to the next series.

Still waiting for bloat-o-meter results.

Thanks,
Yury

> 
> [1] https://github.com/riscv/riscv-bitmanip


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/2] bitops: rotate: Add riscv implementation using Zbb extension
  2025-07-01 18:32               ` Yury Norov
@ 2025-07-02 10:11                 ` David Laight
  2025-07-03 16:58                   ` Yury Norov
  2025-07-02 12:30                 ` cp0613
  1 sibling, 1 reply; 21+ messages in thread
From: David Laight @ 2025-07-02 10:11 UTC (permalink / raw)
  To: Yury Norov
  Cc: cp0613, alex, aou, arnd, linux-arch, linux-kernel, linux-riscv,
	linux, palmer, paul.walmsley

On Tue, 1 Jul 2025 14:32:14 -0400
Yury Norov <yury.norov@gmail.com> wrote:

> On Tue, Jul 01, 2025 at 08:47:01PM +0800, cp0613@linux.alibaba.com wrote:
> > On Mon, 30 Jun 2025 12:53:03 -0400, yury.norov@gmail.com wrote:
> >   
> > > > > 1. The most trivial compile-case is actually evaluated at compile time; and
> > > > > 2. Any arch-specific code is well explained; and
> > > > > 3. legacy case optimized just as well as non-legacy.  
> > > > 
> > > > 1. As in the above reply, use the generic implementation when compile-time evaluation
> > > >    is possible.
> > > > 2. I will improve the comments later.  
> > > 
> > > I'm particularly interested in ror8/rol8 case:
> > > 
> > >         u32 word32 = ((u32)word << 24) | ((u32)word << 16) | ((u32)word << 8) | word;
> > > 
> > > When you expand it to 32-bit word, and want to rotate, you obviously
> > > need to copy lower quarterword to the higher one:
> > > 
> > >         0xab >> 0xab0000ab
> > > 
> > > That way generic (u8)ror32(0xab, shift) would work. But I don't understand
> > > why you copy the lower 8 bits to inner quarterwords. Is that a hardware
> > > requirement? Can you point to any arch documentation 
> > >   
> > > > 3. As mentioned before, only 8-bit rotation should have no optimization effect, and
> > > >    16-bit and above will have significant optimization.  
> > > 
> > > I asked you about the asm goto ("legacy") thing: you calculate that
> > > complex word32 _before_ evaluating the goto. So this word32 may get
> > > unused, and you waste cycles. I want to make sure this isn't the case.  
> > 
> > Thank you for your suggestion and reminder. This is not a hardware requirement, but it  
> 
> Sure. Please add
> 
> Suggested-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
> 
> > is somewhat related because the zbb instruction only supports word-sized rotate [1].
> > Considering the case where the shift is greater than 8, if the modulus of the shift is
> > not taken, it is required to continuously concatenate 8-bit data into 32 bits.

I'd not worry about rotates of 8 bits or more (for ror8).
They can be treated as 'undefined behaviour' under the assumption they don't happen.
The 'generic' version needs them to get gcc to generate a 'rorb' on x86.
The negated shift needs masking so that clang doesn't throw the code away when
the value is constant.

> > 
> > So, I considered the case of taking the remainder of shift and found that this
> > implementation would have one less instruction, so the final implementation is as follows:
> > ```
> > static inline u8 variable_rol8(u8 word, unsigned int shift)  
> 
> Now that it has assembler inlines, would it help to add the __pure
> qualifier?
> 
> > {
> > 	u32 word32;
> > 
> > 	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
> > 				      RISCV_ISA_EXT_ZBB, 1)
> > 			  : : : : legacy);
> > 
> > 	word32 = ((u32)word << 24) | word;
> > 	shift = shift % 8;  
> 
>         shift %= 8;
> 
> > 
> > 	asm volatile(
> > 		".option push\n"
> > 		".option arch,+zbb\n"
> > 		"rolw %0, %1, %2\n"
> > 		".option pop\n"
> > 		: "=r" (word32) : "r" (word32), "r" (shift) :);
> > 
> > 	return (u8)word32;
> > 
> > legacy:
> > 	return generic_rol8(word, shift);
> > }
> > 
> > static inline u8 variable_ror8(u8 word, unsigned int shift)
> > {
> > 	u32 word32;
> > 
> > 	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
> > 				      RISCV_ISA_EXT_ZBB, 1)
> > 			  : : : : legacy);
> > 
> > 	word32 = ((u32)word << 8) | word;
> > 	shift = shift % 8;
> > 
> > 	asm volatile(
> > 		".option push\n"
> > 		".option arch,+zbb\n"
> > 		"rorw %0, %1, %2\n"
> > 		".option pop\n"
> > 		: "=r" (word32) : "r" (word32), "r" (shift) :);
> > 
> > 	return (u8)word32;
> > 
> > legacy:
> > 	return generic_ror8(word, shift);
> > }
> > ```  
> 
> OK, this looks better.
> 
> > I compared the performance of ror8 (zbb optimized) and generic_ror8 on the XUANTIE C908
> > by looping them. ror8 is better, and the advantage of ror8 becomes more obvious as the
> > number of iterations increases. The test code is as follows:
> > ```
> > 	u8 word = 0x5a;
> > 	u32 shift = 9;
> > 	u32 i, loop = 100;
> > 	u8 ret1, ret2;
> > 
> > 	u64 t1 = ktime_get_ns();
> > 	for (i = 0; i < loop; i++) {
> > 		ret2 = generic_ror8(word, shift);
> > 	}
> > 	u64 t2 = ktime_get_ns();
> > 	for (i = 0; i < loop; i++) {
> > 		ret1 = ror8(word, shift);
> > 	}
> > 	u64 t3 = ktime_get_ns();
> > 
> > 	pr_info("t2-t1=%lld t3-t2=%lld\n", t2 - t1, t3 - t2);
> > ```  
> 
> Please do the following:
> 
> 1. Drop the generic_ror8() and keep only ror/l8()
> 2. Add ror/l16, 32 and 64 tests.
> 3. Adjust the 'loop' so that each subtest will take 1-10 ms on your hw.

That is far too many iterations.
You'll get interrupts dominating the tests.
The best thing is to do 'just enough' iterations to get a meaningful result,
and then repeat a few times and report the fastest (or average excluding
any large outliers).

You also need to ensure the compiler doesn't (or isn't allowed to) pull
the contents of the inlined function outside the loop - and then throw
the loop away.

The other question is whether any of it is worth the effort.
How many ror8() and ror16() calls are there?
I suspect not many.

Improving the generic ones might be worthwhile.
Perhaps moving the current versions to x86 only.
(I suspect the only other cpu with byte/short rotates is m68k)

	David



> 4. Name the function like test_rorl(), and put it next to the test_fns()
>    in lib/test_bitops.c. Refer the 0a2c6664e56f0 for implementation.
> 5. Prepend the series with the benchmark patch, just as with const-eval
>    one, and report before/after for your series. 
> 
> > > Please find attached a test for compile-time ror/rol evaluation.
> > > Please consider prepending your series with it.  
> > 
> > Okay, I'll bring it to the next series.  
> 
> Still waiting for bloat-o-meter results.
> 
> Thanks,
> Yury
> 
> > 
> > [1] https://github.com/riscv/riscv-bitmanip  
> 



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/2] bitops: rotate: Add riscv implementation using Zbb extension
  2025-07-01 18:32               ` Yury Norov
  2025-07-02 10:11                 ` David Laight
@ 2025-07-02 12:30                 ` cp0613
  1 sibling, 0 replies; 21+ messages in thread
From: cp0613 @ 2025-07-02 12:30 UTC (permalink / raw)
  To: yury.norov
  Cc: alex, aou, arnd, cp0613, linux-arch, linux-kernel, linux-riscv,
	linux, palmer, paul.walmsley

On Tue, 1 Jul 2025 14:32:14 -0400, yury.norov@gmail.com wrote:

> > static inline u8 variable_rol8(u8 word, unsigned int shift)
> 
> Now that it has assembler inlines, would it help to add the __pure
> qualifier?

I compared the disassembly with and without the __pure qualifier, passing the same
parameters in both cases, and found that it does not help.

> Please do the following:
> 
> 1. Drop the generic_ror8() and keep only ror/l8()
> 2. Add ror/l16, 32 and 64 tests.
> 3. Adjust the 'loop' so that each subtest will take 1-10 ms on your hw.
> 4. Name the function like test_rorl(), and put it next to the test_fns()
>    in lib/test_bitops.c. Refer the 0a2c6664e56f0 for implementation.
> 5. Prepend the series with the benchmark patch, just as with const-eval
>    one, and report before/after for your series. 
> 
> > > Please find attached a test for compile-time ror/rol evaluation.
> > > Please consider prepending your series with it.
> > 
> > Okay, I'll bring it to the next series.
> 
> Still waiting for bloat-o-meter results.

Ok, in the next series I will provide a patch for the benchmark, and then
prepend the const-eval and benchmark patches, and also provide the results
of the bloat-o-meter.

Thank you very much,
Pei


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/2] bitops: rotate: Add riscv implementation using Zbb extension
  2025-07-02 10:11                 ` David Laight
@ 2025-07-03 16:58                   ` Yury Norov
  0 siblings, 0 replies; 21+ messages in thread
From: Yury Norov @ 2025-07-03 16:58 UTC (permalink / raw)
  To: David Laight
  Cc: cp0613, alex, aou, arnd, linux-arch, linux-kernel, linux-riscv,
	linux, palmer, paul.walmsley

On Wed, Jul 02, 2025 at 11:11:35AM +0100, David Laight wrote:
> On Tue, 1 Jul 2025 14:32:14 -0400
> Yury Norov <yury.norov@gmail.com> wrote:
> 
> I'd not worry about rotates of 8 bits or more (for ror8).
> They can be treated as 'undefined behaviour' under the assumption they don't happen.

Good for you. But the generic implementation is safe against overflowing
the shift, so the arch must be safe as well.

> The 'generic' version needs them to get gcc to generate a 'rorb' on x86.
> The negated shift needs masking so that clang doesn't throw the code away when
> the value is constant.

...

> > > I compared the performance of ror8 (zbb optimized) and generic_ror8 on the XUANTIE C908
> > > by looping them. ror8 is better, and the advantage of ror8 becomes more obvious as the
> > > number of iterations increases. The test code is as follows:
> > > ```
> > > 	u8 word = 0x5a;
> > > 	u32 shift = 9;
> > > 	u32 i, loop = 100;
> > > 	u8 ret1, ret2;
> > > 
> > > 	u64 t1 = ktime_get_ns();
> > > 	for (i = 0; i < loop; i++) {
> > > 		ret2 = generic_ror8(word, shift);
> > > 	}
> > > 	u64 t2 = ktime_get_ns();
> > > 	for (i = 0; i < loop; i++) {
> > > 		ret1 = ror8(word, shift);
> > > 	}
> > > 	u64 t3 = ktime_get_ns();
> > > 
> > > 	pr_info("t2-t1=%lld t3-t2=%lld\n", t2 - t1, t3 - t2);
> > > ```  
> > 
> > Please do the following:
> > 
> > 1. Drop the generic_ror8() and keep only ror/l8()
> > 2. Add ror/l16, 32 and 64 tests.
> > 3. Adjust the 'loop' so that each subtest will take 1-10 ms on your hw.
> 
> That is far too many iterations.
> You'll get interrupts dominating the tests.

That's an interesting observation. Can you show numbers for your
hardware?

> The best thing is to do 'just enough' iterations to get a meaningful result,
> and then repeat a few times and report the fastest (or average excluding
> any large outliers).
> 
> You also need to ensure the compiler doesn't (or isn't allowed to) pull
> the contents of the inlined function outside the loop - and then throw
> the loop away,

Not me - Chen Pei needs. I wrote __always_used for it. It should
help.
 
> The other question is whether any of it is worth the effort.
> How many ror8() and ror16() calls are there?
> I suspect not many.

I'm not a RISC-V engineer, and I can't judge how they want to use the
functions. This doesn't bring significant extra burden on generic
side, so I don't object against arch ror8() on RISCs.

> Improving the generic ones might be worth while.
> Perhaps moving the current versions to x86 only.
> (I suspect the only other cpu with byte/short rotates is m68k)
> 
> 	David


^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2025-07-03 21:00 UTC | newest]

Thread overview: 21+ messages
-- links below jump to the message on this page --
2025-06-20 11:16 [PATCH 0/2] Implementing bitops rotate using riscv Zbb extension cp0613
2025-06-20 11:16 ` [PATCH 1/2] bitops: generic rotate cp0613
2025-06-20 15:47   ` kernel test robot
2025-06-23 11:59   ` kernel test robot
2025-06-20 11:16 ` [PATCH 2/2] bitops: rotate: Add riscv implementation using Zbb extension cp0613
2025-06-20 16:20   ` Yury Norov
2025-06-25 16:02     ` David Laight
2025-06-28 12:08       ` cp0613
2025-06-29 10:38         ` David Laight
2025-06-30 12:14           ` cp0613
2025-06-30 17:35             ` David Laight
2025-07-01 13:01               ` cp0613
2025-06-28 11:13     ` cp0613
2025-06-29  1:48       ` Yury Norov
2025-06-30 12:04         ` cp0613
2025-06-30 16:53           ` Yury Norov
2025-07-01 12:47             ` cp0613
2025-07-01 18:32               ` Yury Norov
2025-07-02 10:11                 ` David Laight
2025-07-03 16:58                   ` Yury Norov
2025-07-02 12:30                 ` cp0613
