dev.dpdk.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v1 0/2] Optimization Summary for RISC-V rte_memcpy
@ 2025-10-16  9:09 Qiguo Chen
  2025-10-16  9:09 ` [PATCH v1 1/2] riscv support rte_memcpy in vector Qiguo Chen
  2025-10-16  9:09 ` [PATCH v1 2/2] benchmark report for rte_memcpy Qiguo Chen
  0 siblings, 2 replies; 21+ messages in thread
From: Qiguo Chen @ 2025-10-16  9:09 UTC (permalink / raw)
  To: stanislaw.kardach, sunyuechi, stephen; +Cc: dev, bruce.richardson, Qiguo Chen


[-- Attachment #1.1.1: Type: text/plain, Size: 965 bytes --]

I've implemented optimizations to rte_memcpy targeting RISC-V architectures,
achieving an average 10%~15% reduction in execution time for data sizes between
129 and 1024 bytes (sizes of 1025~1600 bytes gain little).
These enhancements draw inspiration from the x86 implementation,
specifically focusing on:
1) Alignment handling for unaligned scenarios
2) Vector configuration tuning
3) Strategic prefetching

- Patch 0/2: Cover letter
- Patch 1/2: Base implementation
- Patch 2/2: Benchmark report


Tested on SG2044 (VLEN=128)


Qiguo Chen (2):
  riscv support rte_memcpy in vector
  benchmark report for rte_memcpy

 .mailmap                           |   1 +
 benchmark_report.txt               | 149 ++++++++++++++
 config/riscv/meson.build           |  14 ++
 lib/eal/riscv/include/rte_memcpy.h | 310 ++++++++++++++++++++++++++++-
 4 files changed, 472 insertions(+), 2 deletions(-)
 create mode 100644 benchmark_report.txt

-- 
2.21.0.windows.1

[-- Attachment #1.1.2: Type: text/html , Size: 1861 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v1 1/2] riscv support rte_memcpy in vector
  2025-10-16  9:09 [PATCH v1 0/2] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
@ 2025-10-16  9:09 ` Qiguo Chen
  2025-10-17  5:29   ` sunyuechi
  2025-10-17  9:36   ` [PATCH v2 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
  2025-10-16  9:09 ` [PATCH v1 2/2] benchmark report for rte_memcpy Qiguo Chen
  1 sibling, 2 replies; 21+ messages in thread
From: Qiguo Chen @ 2025-10-16  9:09 UTC (permalink / raw)
  To: stanislaw.kardach, sunyuechi, stephen; +Cc: dev, bruce.richardson, Qiguo Chen


[-- Attachment #1.1.1: Type: text/plain, Size: 9838 bytes --]

This patch implements RISC-V vector intrinsics
to accelerate memory copy operations for byte range (129~1600).

Signed-off-by: Qiguo Chen <chen.qiguo@zte.com.cn>
---
 .mailmap                           |   1 +
 config/riscv/meson.build           |  14 ++
 lib/eal/riscv/include/rte_memcpy.h | 310 ++++++++++++++++++++++++++++-
 3 files changed, 323 insertions(+), 2 deletions(-)

diff --git a/.mailmap b/.mailmap
index 08e5ec8560..178c5f44f4 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1285,6 +1285,7 @@ Qian Hao <qi_an_hao@126.com>
 Qian Xu <qian.q.xu@intel.com>
 Qiao Liu <qiao.liu@intel.com>
 Qi Fu <qi.fu@intel.com>
+Qiguo Chen <chen.qiguo@zte.com.cn>
 Qimai Xiao <qimaix.xiao@intel.com>
 Qiming Chen <chenqiming_huawei@163.com>
 Qiming Yang <qiming.yang@intel.com>
diff --git a/config/riscv/meson.build b/config/riscv/meson.build
index f3daea0c0e..abba474b5e 100644
--- a/config/riscv/meson.build
+++ b/config/riscv/meson.build
@@ -146,6 +146,20 @@ if (riscv_extension_macros and
     endif
 endif
 
+# detect extensions
+# Requires intrinsics available in GCC 14.1.0+ and Clang 18.1.0+
+if (riscv_extension_macros and
+    (cc.get_define('__riscv_zicbop', args: machine_args) != ''))
+  if ((cc.get_id() == 'gcc' and cc.version().version_compare('>=14.1.0'))
+      or (cc.get_id() == 'clang' and cc.version().version_compare('>=18.1.0')))
+      message('Compiling with the zicbop extension')
+      machine_args += ['-DRTE_RISCV_FEATURE_PREFETCH']
+  else
+    warning('Detected zicbop extension but cannot use because intrinsics are not available (present in GCC 14.1.0+ and Clang 18.1.0+)')
+  endif
+endif
+
+
 # apply flags
 foreach flag: dpdk_flags
     if flag.length() > 0
diff --git a/lib/eal/riscv/include/rte_memcpy.h b/lib/eal/riscv/include/rte_memcpy.h
index d8a942c5d2..6f8cb0d4a4 100644
--- a/lib/eal/riscv/include/rte_memcpy.h
+++ b/lib/eal/riscv/include/rte_memcpy.h
@@ -11,6 +11,7 @@
 #include <string.h>
 
 #include "rte_common.h"
+#include <rte_branch_prediction.h>
 
 #include "generic/rte_memcpy.h"
 
@@ -18,6 +19,290 @@
 extern "C" {
 #endif
 
+
+#if defined(RTE_RISCV_FEATURE_V) && !(defined(RTE_RISCV_FEATURE_PREFETCH))
+#undef RTE_RISCV_FEATURE_V
+#endif
+
+
+#if defined(RTE_RISCV_FEATURE_V)
+
+#include "rte_cpuflags.h"
+
+#define RISCV_VLENB   16
+#define MEMCPY_GLIBC       (1U << 0)
+#define MEMCPY_RISCV       (1U << 1)
+#define ALIGNMENT_MASK_128   0x7F
+#define ALIGNMENT_MASK_64    0x3F
+#define ALIGNMENT_MASK_16    0xF
+
+static uint8_t memcpy_alg = MEMCPY_GLIBC;
+
+
+static __rte_always_inline void
+memcpy_prefetch64_1(const uint8_t *src, uint8_t *dst)
+{
+	__asm__ (
+		"prefetch.r 64(%0)\n"
+		"prefetch.w 64(%1)"
+		:: "r"(src), "r"(dst)
+	);
+}
+
+static __rte_always_inline void
+memcpy_prefetch128_1(const uint8_t *src, uint8_t *dst)
+{
+	__asm__ (
+		"prefetch.r 128(%0)\n"
+		"prefetch.w 128(%1)"
+		:: "r"(src), "r"(dst)
+	);
+}
+
+static __rte_always_inline void
+memcpy_prefetch128_2(const uint8_t *src, uint8_t *dst)
+{
+	__asm__ (
+		"prefetch.r 128(%0);"
+		"prefetch.w 128(%1);"
+		"prefetch.r 192(%0);"
+		"prefetch.w 192(%1)"
+		:: "r"(src), "r"(dst)
+	);
+}
+
+
+static __rte_always_inline void
+_rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+	uint32_t n = 32;
+	asm volatile (
+	     "vsetvli t1, %2, e8, m2, ta, ma\n"
+	     "vle8.v v2, (%1)\n"
+	     "vse8.v v2, (%0)"
+	     :: "r"(dst), "r"(src), "r"(n)
+	     : "v2", "v3", "t1", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+	uint32_t n = 64;
+	asm volatile (
+		"vsetvli t3, %2, e8, m4, ta, ma\n"
+		"vle8.v v8, (%1)\n"
+		"vse8.v v8, (%0)"
+		:: "r"(dst), "r"(src), "r"(n)
+		:  "v8", "v9", "v10", "v11", "t3", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+	uint32_t n = 128;
+	asm volatile (
+		"vsetvli t4, %2, e8, m8, ta, ma\n"
+		"vle8.v v16, (%1)\n"
+		"vse8.v v16, (%0)"
+		:: "r"(dst), "r"(src), "r"(n)
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+	memcpy_prefetch128_2(src, dst);
+	_rte_mov128(dst, src);
+	_rte_mov128(dst + 128, src + 128);
+}
+
+static __rte_always_inline void
+_rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	asm volatile (
+		"prefetch.r 64(%1)\n"
+		"prefetch.w 64(%0)\n"
+		"prefetch.r 128(%1)\n"
+		"prefetch.w 128(%0)\n"
+		"prefetch.r 192(%1)\n"
+		"prefetch.w 192(%0)\n"
+		"prefetch.r 256(%1)\n"
+		"prefetch.w 256(%0)\n"
+		"prefetch.r 320(%1)\n"
+		"prefetch.w 320(%0)\n"
+		"prefetch.r 384(%1)\n"
+		"prefetch.w 384(%0)\n"
+		"prefetch.r 448(%1)\n"
+		"prefetch.w 448(%0)\n"
+		"prefetch.r 512(%1)\n"
+		"li t6, 512\n"
+		"3:\n"
+		"li t5, 128;"
+		"vsetvli zero, t5, e8, m8, ta, ma\n"
+		"1:;"
+		"bgt %2, t6, 4f\n"
+		"j 2f\n"
+		"4:\n"
+		"prefetch.r 576(%1)\n"
+		"prefetch.r 640(%1)\n"
+		"2:\n"
+		"vle8.v   v16, (%1)\n"
+		"add      %1, %1, t5\n"
+		"vse8.v   v16, (%0)\n"
+		"add      %0, %0, t5\n"
+		"sub      %2, %2, t5\n"
+		"bnez     %2, 1b"
+		: "+r"(dst), "+r"(src), "+r"(n)
+		:
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t5", "t6", "memory"
+	);
+}
+
+static __rte_always_inline void
+_rte_mov(uint8_t *dst, const uint8_t *src, uint32_t n)
+{
+	asm volatile (
+		"1:\n"
+		"vsetvli t4, %2, e8, m8, ta, ma\n"
+		"vle8.v v16, (%1)\n"
+		"add %1, %1, t4\n"
+		"vse8.v v16, (%0)\n"
+		"add %0, %0, t4\n"
+		"sub %2, %2, t4\n"
+		"bnez %2, 1b"
+		: "+r"(dst), "+r"(src), "+r"(n)
+		:
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov_aligned(uint8_t *dst, const uint8_t *src, uint32_t n)
+{
+	asm volatile (
+		"prefetch.r 128(%1)\n"
+		"prefetch.r 192(%1)\n"
+		"prefetch.r 256(%1)\n"
+		"prefetch.r 320(%1)\n"
+		"prefetch.r 384(%1)\n"
+		"prefetch.r 448(%1)\n"
+		"prefetch.r 512(%1)\n"
+		"prefetch.r 576(%1)\n"
+		"li t6, 640\n"
+		"1:\n"
+		"vsetvli t4, %2, e8, m8, ta, ma\n"
+		"vle8.v v16, (%1)\n"
+		"add %1, %1, t4\n"
+		"vse8.v v16, (%0)\n"
+		"add %0, %0, t4\n"
+		"sub %2, %2, t4\n"
+		"blt %2, t6, 3f\n"
+		"prefetch.r 512(%1)\n"
+		"prefetch.r 576(%1)\n"
+		"3:\n"
+		"bnez %2, 1b"
+		: "+r"(dst), "+r"(src), "+r"(n)
+		:
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "t6", "memory"
+	 );
+}
+
+static __rte_always_inline void *
+_rte_memcpy_generic(uint8_t       *dst, const uint8_t *src, size_t n)
+{
+	void *ret = dst;
+	size_t dstofss;
+	uint32_t bn;
+
+	if (n <= 384) {
+		if (n >= 256) {
+			memcpy_prefetch128_2(src, dst);
+			n -= 256;
+			_rte_mov128(dst, src);
+			_rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
+			src = (const uint8_t *)src + 256;
+			dst = (uint8_t *)dst + 256;
+		}
+		if (n >= 128) {
+			memcpy_prefetch128_1(src, dst);
+			n -= 128;
+			_rte_mov128(dst, src);
+			src = (const uint8_t *)src + 128;
+			dst = (uint8_t *)dst + 128;
+		}
+
+		if (n >= 64) {
+			memcpy_prefetch64_1(src, dst);
+			n -= 64;
+			_rte_mov64(dst, src);
+			src = (const uint8_t *)src + 64;
+			dst = (uint8_t *)dst + 64;
+		}
+
+		if (n > 32) {
+			_rte_mov32(dst, src);
+			_rte_mov32((uint8_t *)dst - 32 + n,
+					(const uint8_t *)src - 32 + n);
+			return ret;
+		}
+
+		if (n > 0) {
+			_rte_mov32((uint8_t *)dst - 32 + n,
+					(const uint8_t *)src - 32 + n);
+		}
+		return ret;
+	}
+
+	/**
+	 * Make store aligned when copy size exceeds 256 bytes.
+	 */
+	dstofss = (uintptr_t)dst & ALIGNMENT_MASK_64;
+	if (dstofss > 0) {
+		dstofss = 64 - dstofss;
+		n -= dstofss;
+		_rte_mov64(dst, src);
+		src = (const uint8_t *)src + dstofss;
+		dst = (uint8_t *)dst + dstofss;
+	}
+
+	/**
+	 * Copy 128-byte blocks
+	 */
+	if ((uintptr_t)src & ALIGNMENT_MASK_64)	{
+		bn = n - (n & ALIGNMENT_MASK_128);
+		_rte_mov128blocks(dst, src, bn);
+		n = n & ALIGNMENT_MASK_128;
+		src = (const uint8_t *)src + bn;
+		dst = (uint8_t *)dst + bn;
+		_rte_mov(dst, src, n);
+	} else
+		_rte_mov_aligned(dst, src, n);
+
+	return ret;
+}
+
+static __rte_always_inline void *
+_rte_memcpy(void *dst, const void *src, size_t n)
+{
+	return _rte_memcpy_generic((uint8_t *)dst, (const uint8_t *)src, n);
+}
+#endif
+
+/*----------------------api---------------------------------------------------*/
+static __rte_always_inline void *
+rte_memcpy(void *dst, const void *src, size_t n)
+{
+#if defined(RTE_RISCV_FEATURE_V)
+	if (likely((memcpy_alg == MEMCPY_RISCV) && (n >= 128) && (n < 2048)))
+		return _rte_memcpy(dst, src, n);
+	/*else*/
+#endif
+		return memcpy(dst, src, n);
+}
+
 static inline void
 rte_mov16(uint8_t *dst, const uint8_t *src)
 {
@@ -51,10 +336,31 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
 static inline void
 rte_mov256(uint8_t *dst, const uint8_t *src)
 {
-	memcpy(dst, src, 256);
+#if defined(RTE_RISCV_FEATURE_V)
+	if (likely(memcpy_alg == MEMCPY_RISCV))
+		_rte_mov256(dst, src);
+	else
+#endif
+		memcpy(dst, src, 256);
+}
+/*----------------------------------------------------------------------------*/
+#if defined(RTE_RISCV_FEATURE_V)
+static inline long
+riscv_vlenb(void)
+{
+	long vlenb;
+	asm ("csrr %0, 0xc22" : "=r"(vlenb));
+	return vlenb;
 }
 
-#define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
+RTE_INIT(rte_vect_memcpy_init)
+{
+	long vlenb = riscv_vlenb();
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_RISCV_ISA_V) && (vlenb >= RISCV_VLENB))
+		memcpy_alg = MEMCPY_RISCV;
+}
+#endif
+
 
 #ifdef __cplusplus
 }
-- 
2.21.0.windows.1

[-- Attachment #1.1.2: Type: text/html , Size: 26648 bytes --]

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH v1 2/2] benchmark report for rte_memcpy
  2025-10-16  9:09 [PATCH v1 0/2] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
  2025-10-16  9:09 ` [PATCH v1 1/2] riscv support rte_memcpy in vector Qiguo Chen
@ 2025-10-16  9:09 ` Qiguo Chen
  1 sibling, 0 replies; 21+ messages in thread
From: Qiguo Chen @ 2025-10-16  9:09 UTC (permalink / raw)
  To: stanislaw.kardach, sunyuechi, stephen; +Cc: dev, bruce.richardson, Qiguo Chen


[-- Attachment #1.1.1: Type: text/plain, Size: 12710 bytes --]

Benchmark results show a 10~15% reduction in execution time for
data sizes of 129~1024 bytes.

Signed-off-by: Qiguo Chen <chen.qiguo@zte.com.cn>
---
 benchmark_report.txt | 149 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 149 insertions(+)
 create mode 100644 benchmark_report.txt

diff --git a/benchmark_report.txt b/benchmark_report.txt
new file mode 100644
index 0000000000..499d3fc5f0
--- /dev/null
+++ b/benchmark_report.txt
@@ -0,0 +1,149 @@
+================================= 16B aligned =================================
+      1  0 -  0( 24.27%)   1 -  1( 13.14%)   2 -  2( -3.48%)   3 -  3(  2.70%) 
+      2  0 -  0( 21.92%)   1 -  1(  4.75%)   2 -  2( -3.58%)   3 -  3(  3.05%) 
+      3  0 -  0( 23.54%)   1 -  1(  9.74%)   2 -  2( -4.24%)   3 -  3(  2.26%) 
+      4  0 -  0( 22.54%)   1 -  1(  7.10%)   2 -  2( -3.96%)   3 -  3(  0.99%) 
+      5  0 -  0( 22.73%)   1 -  1(  9.02%)   2 -  2( -3.18%)   3 -  3(  1.60%) 
+      6  0 -  0( 56.22%)   1 -  1(  8.21%)   2 -  2( -3.65%)   3 -  3(  1.10%) 
+      7  0 -  0( 23.07%)   1 -  1(  6.82%)   2 -  2( -3.53%)   3 -  3(  3.46%) 
+      8  0 -  0( 23.49%)   1 -  1(  7.70%)   2 -  2( -0.26%)   3 -  3(  2.22%) 
+      9  0 -  0( 56.70%)   1 -  1(  7.04%)   2 -  2( -3.75%)   3 -  3(  2.52%) 
+     12  0 -  0( 23.87%)   1 -  1(  5.80%)   2 -  2( -3.76%)   3 -  3(  1.49%) 
+     15  0 -  0( 22.95%)   1 -  1(  5.01%)   2 -  2( -3.52%)   3 -  3(  2.82%) 
+     16  0 -  0( 57.49%)   1 -  1(  7.30%)   2 -  2(  0.19%)   3 -  3(  3.19%) 
+     17  0 -  0( 53.78%)   3 -  2( 51.65%)   4 -  3( 37.35%)   4 -  3( 23.94%) 
+     31  0 -  0( 27.02%)   3 -  2( 51.99%)   4 -  3( 37.34%)   4 -  3( 24.09%) 
+     32  0 -  0( 56.82%)   3 -  2( 50.42%)   4 -  3( 39.73%)   4 -  3( 25.04%) 
+     33  0 -  0( 30.60%)   3 -  3( 30.94%)   6 -  4( 46.89%)   6 -  5( 26.21%) 
+     63  0 -  0( 16.84%)   4 -  3( 21.57%)   6 -  5( 31.74%)   7 -  6( 18.01%) 
+     64  0 -  0( 21.98%)   4 -  3( 21.35%)   6 -  5( 36.13%)   7 -  6( 20.05%) 
+     65  0 -  0( 20.60%)   5 -  4( 31.05%)   6 -  5( 24.16%)   8 -  7(  5.69%) 
+    127  0 -  0( 18.22%)   6 -  6(  9.34%)   8 -  7(  9.72%)  11 - 11(  2.73%) 
+    128  0 -  0( 39.80%)   6 -  6( -0.93%)   8 -  7(  9.65%)  11 - 11(  4.63%) 
+    129  0 -  1(-50.92%)   6 -  7( -4.00%)   9 - 12(-28.67%)  11 - 16(-34.28%) 
+    191  1 -  1(-45.09%)   9 -  9(  5.04%)  12 - 13(-11.82%)  13 - 16(-15.66%) 
+    192  1 -  1(-43.44%)   7 -  9(-18.67%)  12 - 13( -5.92%)  13 - 15(-18.50%) 
+    193  1 -  1(-24.84%)   9 -  9( -5.60%)  12 - 13( -7.44%)  14 - 17(-14.15%) 
+    255  1 -  1(-23.65%)  11 - 11( -4.57%)  13 - 13( -3.46%)  16 - 18( -8.81%) 
+    256  1 -  1( 16.87%)   9 - 11(-13.78%)  14 - 13(  8.58%)  16 - 16(  5.20%) 
+    257  1 -  1(-15.41%)  12 - 13( -6.90%)  15 - 16( -6.71%)  18 - 19( -6.35%) 
+    319  1 -  1(-12.93%)  15 - 19(-18.96%)  17 - 17( -0.55%)  21 - 21( -1.25%) 
+    320  1 -  1(-16.38%)  10 - 17(-39.05%)  18 - 17(  4.65%)  20 - 20( -2.94%) 
+    321  1 -  1( -6.24%)  12 - 19(-36.30%)  18 - 17(  6.65%)  20 - 22( -8.86%) 
+    383  1 -  1( -4.06%)  16 - 20(-17.87%)  18 - 17(  9.18%)  23 - 23(  1.42%) 
+    384  1 -  1( 12.87%)  11 - 18(-36.31%)  18 - 18(  1.92%)  20 - 22( -8.22%) 
+    385  2 -  2( 26.46%)  11 - 20(-46.76%)  15 - 20(-22.07%)  19 - 24(-18.04%) 
+    447  2 -  1( 55.03%)  14 - 21(-34.10%)  15 - 20(-22.75%)  21 - 27(-23.99%) 
+    448  2 -  1( 18.00%)  12 - 20(-38.82%)  16 - 20(-20.82%)  21 - 25(-18.74%) 
+    449  4 -  2(141.90%)  13 - 22(-42.36%)  16 - 20(-22.84%)  21 - 26(-18.35%) 
+    511  3 -  2( 57.68%)  14 - 23(-37.60%)  16 - 20(-18.33%)  21 - 28(-22.10%) 
+    512  2 -  1( 27.98%)  12 - 21(-40.06%)  17 - 20(-15.21%)  21 - 26(-19.65%) 
+    513  2 -  2( 22.93%)  13 - 23(-43.25%)  18 - 22(-19.53%)  23 - 31(-26.70%) 
+    767  7 -  6( 29.60%)  21 - 29(-28.37%)  29 - 23( 29.04%)  32 - 35( -9.38%) 
+    768  6 -  3( 96.51%)  19 - 27(-29.32%)  23 - 21(  6.62%)  31 - 33( -6.22%) 
+    769  7 -  4( 94.30%)  21 - 28(-27.50%)  25 - 24(  3.23%)  32 - 37(-12.00%) 
+   1023  8 -  5( 72.12%)  25 - 34(-27.27%)  34 - 26( 33.59%)  37 - 42(-11.18%) 
+   1024  8 -  6( 41.80%)  23 - 32(-26.49%)  26 - 25(  4.23%)  37 - 40( -7.72%) 
+   1025  8 -  7(  9.36%)  25 - 34(-25.78%)  29 - 27(  7.68%)  38 - 42( -8.87%) 
+   1518  7 -  4( 71.47%)  34 - 45(-24.17%)  45 - 30( 47.69%)  51 - 53( -4.93%) 
+   1522 10 -  8( 19.45%)  35 - 45(-23.62%)  46 - 31( 47.81%)  51 - 52( -0.49%) 
+   1536 10 -  6( 62.55%)  32 - 42(-23.80%)  37 - 29( 29.19%)  50 - 51( -2.70%) 
+   1600 11 -  9( 20.69%)  34 - 43(-21.19%)  47 - 32( 45.63%)  49 - 53( -7.68%) 
+   2048 13 - 10( 26.67%)  53 - 53( -0.25%)  37 - 35(  7.16%)  61 - 62( -0.90%) 
+   2560 16 - 13( 25.07%)  62 - 59(  5.23%)  44 - 45( -0.71%)  71 - 70(  1.05%) 
+   3072 20 - 20(  1.91%)  72 - 71(  1.91%)  49 - 50( -3.36%)  82 - 82( -0.59%) 
+   3584 26 - 26( -0.81%)  81 - 81( -0.17%)  58 - 57(  1.17%)  92 - 91(  1.28%) 
+   4096 25 - 27( -9.39%)  90 - 90(  0.54%)  64 - 63(  0.67%) 102 -102(  0.70%) 
+   4608 31 - 27( 18.45%)  99 - 99( -0.00%)  70 - 70(  0.47%) 111 -111(  0.09%) 
+   5120 41 - 35( 16.65%) 108 -108( -0.28%)  78 - 77(  0.52%) 120 -120(  0.37%) 
+   5632 46 - 47( -2.05%) 117 -117(  0.12%)  85 - 85(  0.38%) 130 -130( -0.19%) 
+   6144 52 - 44( 18.06%) 126 -126(  0.01%)  94 - 93(  0.80%) 139 -138(  0.27%) 
+   6656 27 - 41(-33.88%) 135 -134(  0.33%) 102 -102(  0.52%) 149 -148(  1.11%) 
+   7168 56 - 27(104.91%) 143 -142(  0.33%) 110 -110(  0.15%) 157 -157(  0.07%) 
+   7680 66 - 70( -5.18%) 152 -152(  0.03%) 118 -117(  0.27%) 166 -166(  0.17%) 
+   8192 69 - 44( 57.50%) 161 -160(  0.45%) 125 -124(  0.35%) 176 -175(  0.41%) 
+------- ----------------- ----------------- ----------------- -----------------
+C     6  0 -  0( -1.10%)   1 -  1(  9.45%)   2 -  2( -0.19%)   3 -  3(  2.77%) 
+C    64  0 -  0(  0.60%)   3 -  3(  1.28%)   4 -  4( -0.18%)   6 -  6(  0.50%) 
+C   128  0 -  0( 35.46%)   6 -  6( -3.33%)   8 -  7(  7.02%)  11 - 11(  1.72%) 
+C   192  0 -  1(-48.74%)   7 -  8(-20.51%)  12 - 13(-12.42%)  12 - 15(-22.26%) 
+C   256  1 -  1( 11.88%)   9 - 11(-15.05%)  13 - 13(  0.17%)  15 - 16( -1.65%) 
+C   512  2 -  1( 27.80%)  13 - 22(-40.28%)  16 - 19(-12.48%)  22 - 25(-13.57%) 
+C   768  2 -  2( 11.66%)  18 - 26(-30.06%)  23 - 21(  5.93%)  31 - 33( -7.73%) 
+C  1024  6 -  4( 32.78%)  23 - 31(-25.36%)  26 - 24(  5.56%)  37 - 39( -6.05%) 
+C  1536  9 -  7( 33.48%)  32 - 43(-23.71%)  37 - 29( 26.46%)  50 - 50( -0.05%) 
+================================== Unaligned ==================================
+      1  0 -  0( 32.71%)   1 -  1(  7.91%)   2 -  2(  0.99%)   3 -  3(  3.36%) 
+      2  0 -  0( 33.59%)   1 -  1(  6.69%)   2 -  2(  1.04%)   3 -  3(  1.19%) 
+      3  0 -  0( 33.20%)   1 -  1(  8.36%)   2 -  2(  0.87%)   3 -  3(  3.03%) 
+      4  0 -  0( 33.41%)   1 -  1(  6.50%)   2 -  2(  1.03%)   3 -  3(  2.77%) 
+      5  0 -  0( 32.00%)   1 -  1(  6.83%)   2 -  2(  1.16%)   3 -  3(  2.28%) 
+      6  0 -  0( 33.29%)   1 -  1(  7.94%)   2 -  2(  0.93%)   3 -  3(  0.17%) 
+      7  0 -  0( 32.69%)   1 -  1(  6.01%)   2 -  2(  0.93%)   3 -  2(  4.20%) 
+      8  0 -  0( 33.99%)   1 -  1(  5.62%)   2 -  2(  0.92%)   3 -  3(  1.09%) 
+      9  0 -  0( 32.63%)   1 -  1(  6.33%)   2 -  2(  1.13%)   3 -  3(  2.01%) 
+     12  0 -  0( 33.10%)   1 -  1(  7.30%)   4 -  3( 47.16%)   5 -  3( 41.00%) 
+     15  0 -  0( 32.30%)   1 -  1(  6.96%)   4 -  3( 47.34%)   5 -  3( 43.19%) 
+     16  0 -  0( 18.41%)   3 -  2( 68.45%)   4 -  3( 62.20%)   5 -  3( 35.47%) 
+     17  0 -  0(  7.81%)   4 -  3( 37.51%)   5 -  3( 59.08%)   6 -  4( 40.54%) 
+     31  0 -  0( 33.54%)   4 -  3( 31.79%)   6 -  4( 47.27%)   6 -  4( 39.17%) 
+     32  0 -  0( 32.98%)   4 -  3( 29.22%)   6 -  4( 46.89%)   6 -  5( 35.76%) 
+     33  0 -  0( 27.50%)   4 -  4(  6.37%)   6 -  5( 34.85%)   7 -  6( 19.56%) 
+     63  0 -  0( 44.23%)   5 -  5( 19.68%)   7 -  7(  3.62%)   9 -  9(  7.96%) 
+     64  0 -  0( 29.92%)   5 -  5( 14.45%)   7 -  7(  3.11%)   9 -  9(  7.57%) 
+     65  0 -  0(  3.00%)   6 -  5(  6.09%)   8 -  7(  2.61%)  10 - 10(  4.75%) 
+    127  1 -  0( 16.12%)   9 -  8( 10.20%)  12 - 12( -0.66%)  14 - 14(  2.06%) 
+    128  1 -  1( 11.58%)   8 -  8(  2.75%)  12 - 12( -2.33%)  13 - 14( -7.63%) 
+    129  1 -  1(-48.77%)  10 - 12(-13.37%)  12 - 16(-22.85%)  14 - 22(-35.87%) 
+    191  1 -  1(-36.20%)  11 - 12( -4.61%)  13 - 18(-27.05%)  17 - 27(-39.94%) 
+    192  1 -  1(-31.62%)  11 - 12( -9.55%)  14 - 18(-18.64%)  18 - 28(-34.80%) 
+    193  1 -  2(-36.96%)  13 - 13(  0.19%)  15 - 17(-12.88%)  20 - 28(-29.62%) 
+    255  1 -  2(-35.46%)  16 - 18(-12.89%)  17 - 17(  0.23%)  22 - 28(-21.79%) 
+    256  1 -  1(  7.89%)  17 - 19(-10.33%)  17 - 18( -3.25%)  24 - 28(-16.62%) 
+    257  1 -  2(-28.10%)  16 - 19(-11.20%)  19 - 20( -6.80%)  23 - 32(-27.58%) 
+    319  1 -  2(-21.72%)  18 - 19( -6.08%)  21 - 21(  3.22%)  25 - 33(-22.94%) 
+    320  1 -  2(-23.13%)  16 - 21(-19.75%)  21 - 21(  2.12%)  26 - 33(-22.39%) 
+    321  1 -  2(-22.90%)  16 - 21(-22.21%)  21 - 20(  2.73%)  26 - 33(-22.90%) 
+    383  2 -  2(-22.35%)  19 - 20( -7.58%)  21 - 20(  0.49%)  29 - 33(-12.06%) 
+    384  2 -  2(  3.32%)  16 - 21(-22.26%)  20 - 20(  2.75%)  28 - 33(-13.58%) 
+    385  2 -  2(-36.41%)  14 - 21(-32.50%)  18 - 23(-22.20%)  27 - 35(-23.63%) 
+    447  2 -  2(  4.13%)  14 - 20(-28.61%)  16 - 23(-29.60%)  26 - 35(-23.79%) 
+    448  2 -  2(-21.37%)  14 - 22(-35.54%)  18 - 23(-21.17%)  27 - 36(-23.90%) 
+    449  2 -  2(-26.56%)  14 - 22(-36.19%)  18 - 22(-18.43%)  27 - 35(-22.43%) 
+    511  2 -  3(-31.11%)  14 - 22(-35.23%)  19 - 22(-16.50%)  29 - 35(-16.05%) 
+    512  2 -  2( -5.05%)  15 - 24(-37.63%)  19 - 22(-12.75%)  29 - 35(-15.81%) 
+    513  2 -  3(-27.14%)  15 - 24(-38.02%)  19 - 24(-20.39%)  30 - 36(-18.37%) 
+    767  3 -  4(-24.58%)  21 - 28(-26.97%)  23 - 25( -8.20%)  34 - 40(-13.70%) 
+    768  3 -  3( -0.56%)  21 - 29(-27.01%)  23 - 25( -5.67%)  34 - 39(-13.71%) 
+    769  3 -  3(-20.43%)  21 - 29(-26.40%)  23 - 27(-13.86%)  34 - 41(-15.93%) 
+   1023  5 -  5( -7.38%)  23 - 32(-27.22%)  27 - 28( -3.98%)  39 - 44(-11.72%) 
+   1024  5 -  6(-17.62%)  25 - 33(-25.40%)  27 - 28( -2.44%)  39 - 43(-11.14%) 
+   1025  5 -  4(  3.62%)  25 - 33(-25.57%)  27 - 29( -8.17%)  39 - 46(-16.26%) 
+   1518 10 - 10( -4.77%)  33 - 42(-20.47%)  36 - 34(  6.36%)  53 - 54( -2.01%) 
+   1522 10 - 11( -5.28%)  34 - 42(-18.86%)  36 - 33(  8.35%)  53 - 53( -1.57%) 
+   1536  7 -  8(-12.20%)  34 - 42(-19.11%)  39 - 33( 17.70%)  53 - 54( -0.54%) 
+   1600 11 -  9( 20.88%)  35 - 43(-18.54%)  31 - 35(-10.26%)  50 - 55( -9.91%) 
+   2048 15 -  8( 99.56%)  51 - 51(  0.24%)  40 - 39(  1.22%)  64 - 62(  3.14%) 
+   2560 17 - 16(  1.33%)  59 - 60( -0.76%)  47 - 47(  0.75%)  73 - 73( -0.56%) 
+   3072 22 - 20(  8.49%)  68 - 68(  0.32%)  53 - 54( -2.01%)  82 - 83( -0.37%) 
+   3584 30 - 32( -4.26%)  76 - 76(  0.19%)  61 - 60(  1.03%)  91 - 92( -0.92%) 
+   4096 34 - 28( 22.80%)  85 - 86( -0.61%)  67 - 67(  0.03%) 100 -100( -0.08%) 
+   4608 34 - 36( -4.01%)  93 - 93(  0.17%)  74 - 75( -0.47%) 109 -109(  0.44%) 
+   5120 35 - 29( 20.42%) 102 -102( -0.11%)  82 - 82( -0.08%) 119 -119(  0.53%) 
+   5632 44 - 41(  8.71%) 110 -110(  0.14%)  89 - 90( -0.16%) 128 -127(  0.16%) 
+   6144 40 - 48(-17.75%) 119 -119(  0.12%)  98 - 99( -0.31%) 138 -137(  0.56%) 
+   6656 53 - 54( -0.83%) 127 -127(  0.14%) 107 -107( -0.07%) 146 -145(  0.50%) 
+   7168 56 - 59( -5.16%) 136 -136(  0.18%) 115 -115( -0.13%) 155 -155( -0.34%) 
+   7680 71 - 68(  4.02%) 144 -144(  0.01%) 123 -123( -0.06%) 164 -163(  0.47%) 
+   8192 76 - 65( 17.61%) 152 -153( -0.36%) 130 -130( -0.04%) 174 -174(  0.13%) 
+------- ----------------- ----------------- ----------------- -----------------
+C     6  0 -  0(  1.10%)   1 -  1(  8.55%)   2 -  2(  0.06%)   3 -  3(  4.86%) 
+C    64  0 -  0( -3.20%)   5 -  5(  0.54%)   7 -  7(  0.27%)   9 -  9( -0.50%) 
+C   128  1 -  0( 25.53%)   9 -  8(  3.56%)  12 - 12( -3.53%)  13 - 14( -8.98%) 
+C   192  1 -  1(-37.27%)  11 - 12(-10.10%)  13 - 17(-23.33%)  17 - 28(-38.96%) 
+C   256  1 -  1(  3.35%)  17 - 19( -8.99%)  16 - 18( -7.62%)  23 - 29(-20.07%) 
+C   512  2 -  2( -6.31%)  14 - 24(-38.90%)  19 - 22(-13.61%)  29 - 35(-16.90%) 
+C   768  3 -  3( -0.45%)  21 - 29(-25.43%)  23 - 25( -6.64%)  34 - 40(-13.59%) 
+C  1024  6 -  6( -5.63%)  25 - 33(-24.23%)  27 - 28( -3.26%)  39 - 43(-10.94%) 
+C  1536  8 -  8(  3.04%)  34 - 43(-19.62%)  38 - 33( 15.48%)  53 - 53( -0.43%) 
+======= ================= ================= ================= =================
-- 
2.21.0.windows.1

[-- Attachment #1.1.2: Type: text/html , Size: 33307 bytes --]

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH v1 1/2] riscv support rte_memcpy in vector
  2025-10-16  9:09 ` [PATCH v1 1/2] riscv support rte_memcpy in vector Qiguo Chen
@ 2025-10-17  5:29   ` sunyuechi
  2025-10-17 10:10     ` chen.qiguo
  2025-10-17  9:36   ` [PATCH v2 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
  1 sibling, 1 reply; 21+ messages in thread
From: sunyuechi @ 2025-10-17  5:29 UTC (permalink / raw)
  To: Qiguo Chen; +Cc: stanislaw.kardach, stephen, dev, bruce.richardson

[-- Attachment #1: Type: text/plain, Size: 12951 bytes --]

> riscv support rte_memcpy in vector
> This patch implements RISC-V vector intrinsics


Please adjust the title and commit message to mention that Zicbop has been introduced, and that intrinsics are not currently being used.


config/riscv/meson.build


> # detect extensions
> # Requires intrinsics available in GCC 14.1.0+ and Clang 18.1.0+
> if (riscv_extension_macros and
>     (cc.get_define('__riscv_zicbop', args: machine_args) != ''))
>   if ((cc.get_id() == 'gcc' and cc.version().version_compare('>=14.1.0'))
>       or (cc.get_id() == 'clang' and cc.version().version_compare('>=18.1.0')))
>       message('Compiling with the zicbop extension')
>       machine_args += ['-DRTE_RISCV_FEATURE_PREFETCH']
>   else
>     warning('Detected zicbop extension but cannot use because intrinsics are not available (present in GCC 14.1.0+ and Clang 18.1.0+)')
>   endif
> endif


The implementation does not involve intrinsics


>     16  0 -  0( 57.49%)   1 -  1(  7.30%)   2 -  2(  0.19%)   3 -  3(  3.19%) 
>     17  0 -  0( 53.78%)   3 -  2( 51.65%)   4 -  3( 37.35%)   4 -  3( 23.94%) 
>     31  0 -  0( 27.02%)   3 -  2( 51.99%)   4 -  3( 37.34%)   4 -  3( 24.09%) 
>     32  0 -  0( 56.82%)   3 -  2( 50.42%)   4 -  3( 39.73%)   4 -  3( 25.04%) 
>     33  0 -  0( 30.60%)   3 -  3( 30.94%)   6 -  4( 46.89%)   6 -  5( 26.21%) 
>     63  0 -  0( 16.84%)   4 -  3( 21.57%)   6 -  5( 31.74%)   7 -  6( 18.01%) 
>     64  0 -  0( 21.98%)   4 -  3( 21.35%)   6 -  5( 36.13%)   7 -  6( 20.05%) 


It looks like there's a performance degradation in the 0-128 byte range. Can you fix it?


eal/riscv/include/rte_memcpy.h


> #define ALIGNMENT_MASK_16    0xF


unused


>/*else*/


Please remove /*else*/


> static __rte_always_inline void *
> _rte_memcpy(void *dst, const void *src, size_t n)
> {
> return _rte_memcpy_generic((uint8_t *)dst, (const uint8_t *)src, n);
> }


No need for an extra function call; you can write the implementation directly in the function






-----原始邮件-----
发件人:"Qiguo Chen" <chen.qiguo@zte.com.cn>
发送时间:2025-10-16 17:09:33 (星期四)
收件人: stanislaw.kardach@gmail.com, sunyuechi@iscas.ac.cn, stephen@networkplumber.org
抄送: dev@dpdk.org, bruce.richardson@intel.com, "Qiguo Chen" <chen.qiguo@zte.com.cn>
主题: [PATCH v1 1/2] riscv support rte_memcpy in vector

This patch implements RISC-V vector intrinsics
to accelerate memory copy operations for byte range (129~1600).

Signed-off-by: Qiguo Chen <chen.qiguo@zte.com.cn>
---
 .mailmap                           |   1 +
 config/riscv/meson.build           |  14 ++
 lib/eal/riscv/include/rte_memcpy.h | 310 ++++++++++++++++++++++++++++-
 3 files changed, 323 insertions(+), 2 deletions(-)

diff --git a/.mailmap b/.mailmap
index 08e5ec8560..178c5f44f4 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1285,6 +1285,7 @@ Qian Hao <qi_an_hao@126.com>
 Qian Xu <qian.q.xu@intel.com>
 Qiao Liu <qiao.liu@intel.com>
 Qi Fu <qi.fu@intel.com>
+Qiguo Chen <chen.qiguo@zte.com.cn>
 Qimai Xiao <qimaix.xiao@intel.com>
 Qiming Chen <chenqiming_huawei@163.com>
 Qiming Yang <qiming.yang@intel.com>
diff --git a/config/riscv/meson.build b/config/riscv/meson.build
index f3daea0c0e..abba474b5e 100644
--- a/config/riscv/meson.build
+++ b/config/riscv/meson.build
@@ -146,6 +146,20 @@ if (riscv_extension_macros and
     endif
 endif
 
+# detect extensions
+# Requires intrinsics available in GCC 14.1.0+ and Clang 18.1.0+
+if (riscv_extension_macros and
+    (cc.get_define('__riscv_zicbop', args: machine_args) != ''))
+  if ((cc.get_id() == 'gcc' and cc.version().version_compare('>=14.1.0'))
+      or (cc.get_id() == 'clang' and cc.version().version_compare('>=18.1.0')))
+      message('Compiling with the zicbop extension')
+      machine_args += ['-DRTE_RISCV_FEATURE_PREFETCH']
+  else
+    warning('Detected zicbop extension but cannot use because intrinsics are not available (present in GCC 14.1.0+ and Clang 18.1.0+)')
+  endif
+endif
+
+
 # apply flags
 foreach flag: dpdk_flags
     if flag.length() > 0
diff --git a/lib/eal/riscv/include/rte_memcpy.h b/lib/eal/riscv/include/rte_memcpy.h
index d8a942c5d2..6f8cb0d4a4 100644
--- a/lib/eal/riscv/include/rte_memcpy.h
+++ b/lib/eal/riscv/include/rte_memcpy.h
@@ -11,6 +11,7 @@
 #include <string.h>
 
 #include "rte_common.h"
+#include <rte_branch_prediction.h>
 
 #include "generic/rte_memcpy.h"
 
@@ -18,6 +19,290 @@
 extern "C" {
 #endif
 
+
+#if defined(RTE_RISCV_FEATURE_V) && !(defined(RTE_RISCV_FEATURE_PREFETCH))
+#undef RTE_RISCV_FEATURE_V
+#endif
+
+
+#if defined(RTE_RISCV_FEATURE_V)
+
+#include "rte_cpuflags.h"
+
+#define RISCV_VLENB   16
+#define MEMCPY_GLIBC       (1U << 0)
+#define MEMCPY_RISCV       (1U << 1)
+#define ALIGNMENT_MASK_128   0x7F
+#define ALIGNMENT_MASK_64    0x3F
+#define ALIGNMENT_MASK_16    0xF
+
+static uint8_t memcpy_alg = MEMCPY_GLIBC;
+
+
+static __rte_always_inline void
+memcpy_prefetch64_1(const uint8_t *src, uint8_t *dst)
+{
+    __asm__ (
+        "prefetch.r 64(%0)\n"
+        "prefetch.w 64(%1)"
+        :: "r"(src), "r"(dst)
+    );
+}
+
+static __rte_always_inline void
+memcpy_prefetch128_1(const uint8_t *src, uint8_t *dst)
+{
+    __asm__ (
+        "prefetch.r 128(%0)\n"
+        "prefetch.w 128(%1)"
+        :: "r"(src), "r"(dst)
+    );
+}
+
+static __rte_always_inline void
+memcpy_prefetch128_2(const uint8_t *src, uint8_t *dst)
+{
+    __asm__ (
+        "prefetch.r 128(%0);"
+        "prefetch.w 128(%1);"
+        "prefetch.r 192(%0);"
+        "prefetch.w 192(%1)"
+        :: "r"(src), "r"(dst)
+    );
+}
+
+
+static __rte_always_inline void
+_rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+    uint32_t n = 32;
+    asm volatile (
+         "vsetvli t1, %2, e8, m2, ta, ma\n"
+         "vle8.v v2, (%1)\n"
+         "vse8.v v2, (%0)"
+         :: "r"(dst), "r"(src), "r"(n)
+         : "v2", "v3", "t1", "memory"
+     );
+}
+
+static __rte_always_inline void
+_rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+    uint32_t n = 64;
+    asm volatile (
+        "vsetvli t3, %2, e8, m4, ta, ma\n"
+        "vle8.v v8, (%1)\n"
+        "vse8.v v8, (%0)"
+        :: "r"(dst), "r"(src), "r"(n)
+        :  "v8", "v9", "v10", "v11", "t3", "memory"
+     );
+}
+
+static __rte_always_inline void
+_rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+    uint32_t n = 128;
+    asm volatile (
+        "vsetvli t4, %2, e8, m8, ta, ma\n"
+        "vle8.v v16, (%1)\n"
+        "vse8.v v16, (%0)"
+        :: "r"(dst), "r"(src), "r"(n)
+        : "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "memory"
+     );
+}
+
+static __rte_always_inline void
+_rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+    memcpy_prefetch128_2(src, dst);
+    _rte_mov128(dst, src);
+    _rte_mov128(dst + 128, src + 128);
+}
+
+static __rte_always_inline void
+_rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+{
+    asm volatile (
+        "prefetch.r 64(%1)\n"
+        "prefetch.w 64(%0)\n"
+        "prefetch.r 128(%1)\n"
+        "prefetch.w 128(%0)\n"
+        "prefetch.r 192(%1)\n"
+        "prefetch.w 192(%0)\n"
+        "prefetch.r 256(%1)\n"
+        "prefetch.w 256(%0)\n"
+        "prefetch.r 320(%1)\n"
+        "prefetch.w 320(%0)\n"
+        "prefetch.r 384(%1)\n"
+        "prefetch.w 384(%0)\n"
+        "prefetch.r 448(%1)\n"
+        "prefetch.w 448(%0)\n"
+        "prefetch.r 512(%1)\n"
+        "li t6, 512\n"
+        "3:\n"
+        "li t5, 128;"
+        "vsetvli zero, t5, e8, m8, ta, ma\n"
+        "1:;"
+        "bgt %2, t6, 4f\n"
+        "j 2f\n"
+        "4:\n"
+        "prefetch.r 576(%1)\n"
+        "prefetch.r 640(%1)\n"
+        "2:\n"
+        "vle8.v   v16, (%1)\n"
+        "add      %1, %1, t5\n"
+        "vse8.v   v16, (%0)\n"
+        "add      %0, %0, t5\n"
+        "sub      %2, %2, t5\n"
+        "bnez     %2, 1b"
+        : "+r"(dst), "+r"(src), "+r"(n)
+        :
+        : "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t5", "t6", "memory"
+    );
+}
+
+static __rte_always_inline void
+_rte_mov(uint8_t *dst, const uint8_t *src, uint32_t n)
+{
+    asm volatile (
+        "1:\n"
+        "vsetvli t4, %2, e8, m8, ta, ma\n"
+        "vle8.v v16, (%1)\n"
+        "add %1, %1, t4\n"
+        "vse8.v v16, (%0)\n"
+        "add %0, %0, t4\n"
+        "sub %2, %2, t4\n"
+        "bnez %2, 1b"
+        : "+r"(dst), "+r"(src), "+r"(n)
+        :
+        : "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "memory"
+     );
+}
+
+static __rte_always_inline void
+_rte_mov_aligned(uint8_t *dst, const uint8_t *src, uint32_t n)
+{
+    asm volatile (
+        "prefetch.r 128(%1)\n"
+        "prefetch.r 192(%1)\n"
+        "prefetch.r 256(%1)\n"
+        "prefetch.r 320(%1)\n"
+        "prefetch.r 384(%1)\n"
+        "prefetch.r 448(%1)\n"
+        "prefetch.r 512(%1)\n"
+        "prefetch.r 576(%1)\n"
+        "li t6, 640\n"
+        "1:\n"
+        "vsetvli t4, %2, e8, m8, ta, ma\n"
+        "vle8.v v16, (%1)\n"
+        "add %1, %1, t4\n"
+        "vse8.v v16, (%0)\n"
+        "add %0, %0, t4\n"
+        "sub %2, %2, t4\n"
+        "blt %2, t6, 3f\n"
+        "prefetch.r 512(%1)\n"
+        "prefetch.r 576(%1)\n"
+        "3:\n"
+        "bnez %2, 1b"
+        : "+r"(dst), "+r"(src), "+r"(n)
+        :
+        : "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "t6", "memory"
+     );
+}
+
+static __rte_always_inline void *
+_rte_memcpy_generic(uint8_t       *dst, const uint8_t *src, size_t n)
+{
+    void *ret = dst;
+    size_t dstofss;
+    uint32_t bn;
+
+    if (n <= 384) {
+        if (n >= 256) {
+            memcpy_prefetch128_2(src, dst);
+            n -= 256;
+            _rte_mov128(dst, src);
+            _rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
+            src = (const uint8_t *)src + 256;
+            dst = (uint8_t *)dst + 256;
+        }
+        if (n >= 128) {
+            memcpy_prefetch128_1(src, dst);
+            n -= 128;
+            _rte_mov128(dst, src);
+            src = (const uint8_t *)src + 128;
+            dst = (uint8_t *)dst + 128;
+        }
+
+        if (n >= 64) {
+            memcpy_prefetch64_1(src, dst);
+            n -= 64;
+            _rte_mov64(dst, src);
+            src = (const uint8_t *)src + 64;
+            dst = (uint8_t *)dst + 64;
+        }
+
+        if (n > 32) {
+            _rte_mov32(dst, src);
+            _rte_mov32((uint8_t *)dst - 32 + n,
+                    (const uint8_t *)src - 32 + n);
+            return ret;
+        }
+
+        if (n > 0) {
+            _rte_mov32((uint8_t *)dst - 32 + n,
+                    (const uint8_t *)src - 32 + n);
+        }
+        return ret;
+    }
+
+    /**
+     * Make store aligned when copy size exceeds 256 bytes.
+     */
+    dstofss = (uintptr_t)dst & ALIGNMENT_MASK_64;
+    if (dstofss > 0) {
+        dstofss = 64 - dstofss;
+        n -= dstofss;
+        _rte_mov64(dst, src);
+        src = (const uint8_t *)src + dstofss;
+        dst = (uint8_t *)dst + dstofss;
+    }
+
+    /**
+     * Copy 128-byte blocks
+     */
+    if ((uintptr_t)src & ALIGNMENT_MASK_64)    {
+        bn = n - (n & ALIGNMENT_MASK_128);
+        _rte_mov128blocks(dst, src, bn);
+        n = n & ALIGNMENT_MASK_128;
+        src = (const uint8_t *)src + bn;
+        dst = (uint8_t *)dst + bn;
+        _rte_mov(dst, src, n);
+    } else
+        _rte_mov_aligned(dst, src, n);
+
+    return ret;
+}
+
+static __rte_always_inline void *
+_rte_memcpy(void *dst, const void *src, size_t n)
+{
+    return _rte_memcpy_generic((uint8_t *)dst, (const uint8_t *)src, n);
+}
+#endif
+
+/*----------------------api---------------------------------------------------*/
+static __rte_always_inline void *
+rte_memcpy(void *dst, const void *src, size_t n)
+{
+#if defined(RTE_RISCV_FEATURE_V)
+    if (likely((memcpy_alg == MEMCPY_RISCV) && (n >= 128) && (n < 2048)))
+        return _rte_memcpy(dst, src, n);
+    /*else*/
+#endif
+        return memcpy(dst, src, n);
+}
+
 static inline void
 rte_mov16(uint8_t *dst, const uint8_t *src)
 {
@@ -51,10 +336,31 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
 static inline void
 rte_mov256(uint8_t *dst, const uint8_t *src)
 {
-    memcpy(dst, src, 256);
+#if defined(RTE_RISCV_FEATURE_V)
+    if (likely(memcpy_alg == MEMCPY_RISCV))
+        _rte_mov256(dst, src);
+    else
+#endif
+        memcpy(dst, src, 256);
+}
+/*----------------------------------------------------------------------------*/
+#if defined(RTE_RISCV_FEATURE_V)
+static inline long
+riscv_vlenb(void)
+{
+    long vlenb;
+    asm ("csrr %0, 0xc22" : "=r"(vlenb));
+    return vlenb;
 }
 
-#define rte_memcpy(d, s, n)    memcpy((d), (s), (n))
+RTE_INIT(rte_vect_memcpy_init)
+{
+    long vlenb = riscv_vlenb();
+    if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_RISCV_ISA_V) && (vlenb >= RISCV_VLENB))
+        memcpy_alg = MEMCPY_RISCV;
+}
+#endif
+
 
 #ifdef __cplusplus
 }
-- 
2.21.0.windows.1

[-- Attachment #2: Type: text/html, Size: 30440 bytes --]

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH v2 0/1]  Optimization Summary for RISC-V rte_memcpy
  2025-10-16  9:09 ` [PATCH v1 1/2] riscv support rte_memcpy in vector Qiguo Chen
  2025-10-17  5:29   ` sunyuechi
@ 2025-10-17  9:36   ` Qiguo Chen
  2025-10-17  9:36     ` [PATCH v2 1/1] riscv support rte_memcpy in vector Qiguo Chen
  1 sibling, 1 reply; 21+ messages in thread
From: Qiguo Chen @ 2025-10-17  9:36 UTC (permalink / raw)
  To: sunyuechi, stanislaw.kardach, stephen; +Cc: dev, bruce.richardson, Qiguo Chen


[-- Attachment #1.1.1: Type: text/plain, Size: 1008 bytes --]

Changes in v2:
     1) Modified some code and descriptions according to Sunyuechi's suggestions.
     2) Removed benchmark_report.txt to avoid warnings.

[PATCH v1]
I've implemented optimizations to rte_memcpy targeting RISC-V architectures,
achieving an average 10%~15% reduction in execution time for data sizes from
129 to 1024 bytes (sizes from 1025 to 1600 bytes gain little).
These enhancements draw inspiration from the x86 implementations,
specifically focusing on:
1) Alignment Handling for Unaligned Scenarios
2) Vector Configuration Tuning
3) Strategic Prefetching with zicbop

- Patch 1: Cover letter
- Patch 2: Base implementation
- Patch 3: Benchmark report


Tested on SG2044 (VLEN=128)


Qiguo Chen (1):
  riscv support rte_memcpy in vector

 .mailmap                           |   1 +
 config/riscv/meson.build           |  14 ++
 lib/eal/riscv/include/rte_memcpy.h | 310 ++++++++++++++++++++++++++++-
 3 files changed, 323 insertions(+), 2 deletions(-)

-- 
2.21.0.windows.1

[-- Attachment #1.1.2: Type: text/html , Size: 1912 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v2 1/1] riscv support rte_memcpy in vector
  2025-10-17  9:36   ` [PATCH v2 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
@ 2025-10-17  9:36     ` Qiguo Chen
  2025-10-20  9:43       ` sunyuechi
  2025-10-20 12:08       ` [PATCH v3 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
  0 siblings, 2 replies; 21+ messages in thread
From: Qiguo Chen @ 2025-10-17  9:36 UTC (permalink / raw)
  To: sunyuechi, stanislaw.kardach, stephen; +Cc: dev, bruce.richardson, Qiguo Chen


[-- Attachment #1.1.1: Type: text/plain, Size: 9845 bytes --]

This patch uses RISC-V vector instructions and zicbop prefetching to
optimize memory copies in the 129~1600 byte range.

Signed-off-by: Qiguo Chen <chen.qiguo@zte.com.cn>
---
 .mailmap                           |   1 +
 config/riscv/meson.build           |  14 ++
 lib/eal/riscv/include/rte_memcpy.h | 310 ++++++++++++++++++++++++++++-
 3 files changed, 323 insertions(+), 2 deletions(-)

diff --git a/.mailmap b/.mailmap
index 08e5ec8560..178c5f44f4 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1285,6 +1285,7 @@ Qian Hao <qi_an_hao@126.com>
 Qian Xu <qian.q.xu@intel.com>
 Qiao Liu <qiao.liu@intel.com>
 Qi Fu <qi.fu@intel.com>
+Qiguo Chen <chen.qiguo@zte.com.cn>
 Qimai Xiao <qimaix.xiao@intel.com>
 Qiming Chen <chenqiming_huawei@163.com>
 Qiming Yang <qiming.yang@intel.com>
diff --git a/config/riscv/meson.build b/config/riscv/meson.build
index f3daea0c0e..abba474b5e 100644
--- a/config/riscv/meson.build
+++ b/config/riscv/meson.build
@@ -146,6 +146,20 @@ if (riscv_extension_macros and
     endif
 endif
 
+# detect extensions
+# Requires intrinsics available in GCC 14.1.0+ and Clang 18.1.0+
+if (riscv_extension_macros and
+    (cc.get_define('__riscv_zicbop', args: machine_args) != ''))
+  if ((cc.get_id() == 'gcc' and cc.version().version_compare('>=14.1.0'))
+      or (cc.get_id() == 'clang' and cc.version().version_compare('>=18.1.0')))
+      message('Compiling with the zicbop extension')
+      machine_args += ['-DRTE_RISCV_FEATURE_PREFETCH']
+  else
+    warning('Detected zicbop extension but cannot use because intrinsics are not available (present in GCC 14.1.0+ and Clang 18.1.0+)')
+  endif
+endif
+
+
 # apply flags
 foreach flag: dpdk_flags
     if flag.length() > 0
diff --git a/lib/eal/riscv/include/rte_memcpy.h b/lib/eal/riscv/include/rte_memcpy.h
index d8a942c5d2..6f8cb0d4a4 100644
--- a/lib/eal/riscv/include/rte_memcpy.h
+++ b/lib/eal/riscv/include/rte_memcpy.h
@@ -11,6 +11,7 @@
 #include <string.h>
 
 #include "rte_common.h"
+#include <rte_branch_prediction.h>
 
 #include "generic/rte_memcpy.h"
 
@@ -18,6 +19,290 @@
 extern "C" {
 #endif
 
+
+#if defined(RTE_RISCV_FEATURE_V) && !(defined(RTE_RISCV_FEATURE_PREFETCH))
+#undef RTE_RISCV_FEATURE_V
+#endif
+
+
+#if defined(RTE_RISCV_FEATURE_V)
+
+#include "rte_cpuflags.h"
+
+#define RISCV_VLENB   16
+#define MEMCPY_GLIBC       (1U << 0)
+#define MEMCPY_RISCV       (1U << 1)
+#define ALIGNMENT_MASK_128   0x7F
+#define ALIGNMENT_MASK_64    0x3F
+#define ALIGNMENT_MASK_16    0xF
+
+static uint8_t memcpy_alg = MEMCPY_GLIBC;
+
+
+static __rte_always_inline void
+memcpy_prefetch64_1(const uint8_t *src, uint8_t *dst)
+{
+	__asm__ (
+		"prefetch.r 64(%0)\n"
+		"prefetch.w 64(%1)"
+		:: "r"(src), "r"(dst)
+	);
+}
+
+static __rte_always_inline void
+memcpy_prefetch128_1(const uint8_t *src, uint8_t *dst)
+{
+	__asm__ (
+		"prefetch.r 128(%0)\n"
+		"prefetch.w 128(%1)"
+		:: "r"(src), "r"(dst)
+	);
+}
+
+static __rte_always_inline void
+memcpy_prefetch128_2(const uint8_t *src, uint8_t *dst)
+{
+	__asm__ (
+		"prefetch.r 128(%0);"
+		"prefetch.w 128(%1);"
+		"prefetch.r 192(%0);"
+		"prefetch.w 192(%1)"
+		:: "r"(src), "r"(dst)
+	);
+}
+
+
+static __rte_always_inline void
+_rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+	uint32_t n = 32;
+	asm volatile (
+	     "vsetvli t1, %2, e8, m2, ta, ma\n"
+	     "vle8.v v2, (%1)\n"
+	     "vse8.v v2, (%0)"
+	     :: "r"(dst), "r"(src), "r"(n)
+	     : "v2", "v3", "t1", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+	uint32_t n = 64;
+	asm volatile (
+		"vsetvli t3, %2, e8, m4, ta, ma\n"
+		"vle8.v v8, (%1)\n"
+		"vse8.v v8, (%0)"
+		:: "r"(dst), "r"(src), "r"(n)
+		:  "v8", "v9", "v10", "v11", "t3", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+	uint32_t n = 128;
+	asm volatile (
+		"vsetvli t4, %2, e8, m8, ta, ma\n"
+		"vle8.v v16, (%1)\n"
+		"vse8.v v16, (%0)"
+		:: "r"(dst), "r"(src), "r"(n)
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+	memcpy_prefetch128_2(src, dst);
+	_rte_mov128(dst, src);
+	_rte_mov128(dst + 128, src + 128);
+}
+
+static __rte_always_inline void
+_rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	asm volatile (
+		"prefetch.r 64(%1)\n"
+		"prefetch.w 64(%0)\n"
+		"prefetch.r 128(%1)\n"
+		"prefetch.w 128(%0)\n"
+		"prefetch.r 192(%1)\n"
+		"prefetch.w 192(%0)\n"
+		"prefetch.r 256(%1)\n"
+		"prefetch.w 256(%0)\n"
+		"prefetch.r 320(%1)\n"
+		"prefetch.w 320(%0)\n"
+		"prefetch.r 384(%1)\n"
+		"prefetch.w 384(%0)\n"
+		"prefetch.r 448(%1)\n"
+		"prefetch.w 448(%0)\n"
+		"prefetch.r 512(%1)\n"
+		"li t6, 512\n"
+		"3:\n"
+		"li t5, 128;"
+		"vsetvli zero, t5, e8, m8, ta, ma\n"
+		"1:;"
+		"bgt %2, t6, 4f\n"
+		"j 2f\n"
+		"4:\n"
+		"prefetch.r 576(%1)\n"
+		"prefetch.r 640(%1)\n"
+		"2:\n"
+		"vle8.v   v16, (%1)\n"
+		"add      %1, %1, t5\n"
+		"vse8.v   v16, (%0)\n"
+		"add      %0, %0, t5\n"
+		"sub      %2, %2, t5\n"
+		"bnez     %2, 1b"
+		: "+r"(dst), "+r"(src), "+r"(n)
+		:
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t5", "t6", "memory"
+	);
+}
+
+static __rte_always_inline void
+_rte_mov(uint8_t *dst, const uint8_t *src, uint32_t n)
+{
+	asm volatile (
+		"1:\n"
+		"vsetvli t4, %2, e8, m8, ta, ma\n"
+		"vle8.v v16, (%1)\n"
+		"add %1, %1, t4\n"
+		"vse8.v v16, (%0)\n"
+		"add %0, %0, t4\n"
+		"sub %2, %2, t4\n"
+		"bnez %2, 1b"
+		: "+r"(dst), "+r"(src), "+r"(n)
+		:
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov_aligned(uint8_t *dst, const uint8_t *src, uint32_t n)
+{
+	asm volatile (
+		"prefetch.r 128(%1)\n"
+		"prefetch.r 192(%1)\n"
+		"prefetch.r 256(%1)\n"
+		"prefetch.r 320(%1)\n"
+		"prefetch.r 384(%1)\n"
+		"prefetch.r 448(%1)\n"
+		"prefetch.r 512(%1)\n"
+		"prefetch.r 576(%1)\n"
+		"li t6, 640\n"
+		"1:\n"
+		"vsetvli t4, %2, e8, m8, ta, ma\n"
+		"vle8.v v16, (%1)\n"
+		"add %1, %1, t4\n"
+		"vse8.v v16, (%0)\n"
+		"add %0, %0, t4\n"
+		"sub %2, %2, t4\n"
+		"blt %2, t6, 3f\n"
+		"prefetch.r 512(%1)\n"
+		"prefetch.r 576(%1)\n"
+		"3:\n"
+		"bnez %2, 1b"
+		: "+r"(dst), "+r"(src), "+r"(n)
+		:
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "t6", "memory"
+	 );
+}
+
+static __rte_always_inline void *
+_rte_memcpy_generic(uint8_t       *dst, const uint8_t *src, size_t n)
+{
+	void *ret = dst;
+	size_t dstofss;
+	uint32_t bn;
+
+	if (n <= 384) {
+		if (n >= 256) {
+			memcpy_prefetch128_2(src, dst);
+			n -= 256;
+			_rte_mov128(dst, src);
+			_rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
+			src = (const uint8_t *)src + 256;
+			dst = (uint8_t *)dst + 256;
+		}
+		if (n >= 128) {
+			memcpy_prefetch128_1(src, dst);
+			n -= 128;
+			_rte_mov128(dst, src);
+			src = (const uint8_t *)src + 128;
+			dst = (uint8_t *)dst + 128;
+		}
+
+		if (n >= 64) {
+			memcpy_prefetch64_1(src, dst);
+			n -= 64;
+			_rte_mov64(dst, src);
+			src = (const uint8_t *)src + 64;
+			dst = (uint8_t *)dst + 64;
+		}
+
+		if (n > 32) {
+			_rte_mov32(dst, src);
+			_rte_mov32((uint8_t *)dst - 32 + n,
+					(const uint8_t *)src - 32 + n);
+			return ret;
+		}
+
+		if (n > 0) {
+			_rte_mov32((uint8_t *)dst - 32 + n,
+					(const uint8_t *)src - 32 + n);
+		}
+		return ret;
+	}
+
+	/**
+	 * Make store aligned when copy size exceeds 256 bytes.
+	 */
+	dstofss = (uintptr_t)dst & ALIGNMENT_MASK_64;
+	if (dstofss > 0) {
+		dstofss = 64 - dstofss;
+		n -= dstofss;
+		_rte_mov64(dst, src);
+		src = (const uint8_t *)src + dstofss;
+		dst = (uint8_t *)dst + dstofss;
+	}
+
+	/**
+	 * Copy 128-byte blocks
+	 */
+	if ((uintptr_t)src & ALIGNMENT_MASK_64)	{
+		bn = n - (n & ALIGNMENT_MASK_128);
+		_rte_mov128blocks(dst, src, bn);
+		n = n & ALIGNMENT_MASK_128;
+		src = (const uint8_t *)src + bn;
+		dst = (uint8_t *)dst + bn;
+		_rte_mov(dst, src, n);
+	} else
+		_rte_mov_aligned(dst, src, n);
+
+	return ret;
+}
+
+static __rte_always_inline void *
+_rte_memcpy(void *dst, const void *src, size_t n)
+{
+	return _rte_memcpy_generic((uint8_t *)dst, (const uint8_t *)src, n);
+}
+#endif
+
+/*----------------------api---------------------------------------------------*/
+static __rte_always_inline void *
+rte_memcpy(void *dst, const void *src, size_t n)
+{
+#if defined(RTE_RISCV_FEATURE_V)
+	if (likely((memcpy_alg == MEMCPY_RISCV) && (n >= 128) && (n < 2048)))
+		return _rte_memcpy(dst, src, n);
+	/*else*/
+#endif
+		return memcpy(dst, src, n);
+}
+
 static inline void
 rte_mov16(uint8_t *dst, const uint8_t *src)
 {
@@ -51,10 +336,31 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
 static inline void
 rte_mov256(uint8_t *dst, const uint8_t *src)
 {
-	memcpy(dst, src, 256);
+#if defined(RTE_RISCV_FEATURE_V)
+	if (likely(memcpy_alg == MEMCPY_RISCV))
+		_rte_mov256(dst, src);
+	else
+#endif
+		memcpy(dst, src, 256);
+}
+/*----------------------------------------------------------------------------*/
+#if defined(RTE_RISCV_FEATURE_V)
+static inline long
+riscv_vlenb(void)
+{
+	long vlenb;
+	asm ("csrr %0, 0xc22" : "=r"(vlenb));
+	return vlenb;
 }
 
-#define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
+RTE_INIT(rte_vect_memcpy_init)
+{
+	long vlenb = riscv_vlenb();
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_RISCV_ISA_V) && (vlenb >= RISCV_VLENB))
+		memcpy_alg = MEMCPY_RISCV;
+}
+#endif
+
 
 #ifdef __cplusplus
 }
-- 
2.21.0.windows.1

[-- Attachment #1.1.2: Type: text/html , Size: 26665 bytes --]

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH v1 1/2] riscv support rte_memcpy in vector
  2025-10-17  5:29   ` sunyuechi
@ 2025-10-17 10:10     ` chen.qiguo
  0 siblings, 0 replies; 21+ messages in thread
From: chen.qiguo @ 2025-10-17 10:10 UTC (permalink / raw)
  To: sunyuechi; +Cc: stanislaw.kardach, stephen, dev, bruce.richardson


[-- Attachment #1.1.1: Type: text/plain, Size: 15131 bytes --]

>     16  0 -  0( 57.49%)   1 -  1(  7.30%)   2 -  2(  0.19%)   3 -  3(  3.19%)
>     17  0 -  0( 53.78%)   3 -  2( 51.65%)   4 -  3( 37.35%)   4 -  3( 23.94%)
>     31  0 -  0( 27.02%)   3 -  2( 51.99%)   4 -  3( 37.34%)   4 -  3( 24.09%)
>     32  0 -  0( 56.82%)   3 -  2( 50.42%)   4 -  3( 39.73%)   4 -  3( 25.04%)
>     33  0 -  0( 30.60%)   3 -  3( 30.94%)   6 -  4( 46.89%)   6 -  5( 26.21%)
>     63  0 -  0( 16.84%)   4 -  3( 21.57%)   6 -  5( 31.74%)   7 -  6( 18.01%)
>     64  0 -  0( 21.98%)   4 -  3( 21.35%)   6 -  5( 36.13%)   7 -  6( 20.05%)
> It looks like there's a performance degradation in the 0-128 range, can you fix it?


For small copies, we can use memcpy directly. It seems that the condition check itself causes this result.



Original


From: sunyuechi@iscas.ac.cn <sunyuechi@iscas.ac.cn>
To: 陈其国10108961;
Cc: stanislaw.kardach@gmail.com <stanislaw.kardach@gmail.com>;stephen@networkplumber.org <stephen@networkplumber.org>;dev@dpdk.org <dev@dpdk.org>;bruce.richardson@intel.com <bruce.richardson@intel.com>;
Date: 2025年10月17日 13:29
Subject: Re: [PATCH v1 1/2] riscv support rte_memcpy in vector

> riscv support rte_memcpy in vector
 > This patch implements RISC-V vector intrinsics
 
 
 Please adjust the title and commit message to mention that zicbop has been introduced and that intrinsics are not currently being used.
 
 
 config/riscv/meson.build
 
 
 > # detect extensions
 > # Requires intrinsics available in GCC 14.1.0+ and Clang 18.1.0+
 > if (riscv_extension_macros and
 >     (cc.get_define('__riscv_zicbop', args: machine_args) != ''))
 >   if ((cc.get_id() == 'gcc' and cc.version().version_compare('>=14.1.0'))
 >       or (cc.get_id() == 'clang' and cc.version().version_compare('>=18.1.0')))
 >       message('Compiling with the zicbop extension')
 >       machine_args += ['-DRTE_RISCV_FEATURE_PREFETCH']
 >   else
 >     warning('Detected zicbop extension but cannot use because intrinsics are not available (present in GCC 14.1.0+ and Clang 18.1.0+)')
 >   endif
 > endif
 
 
 The implementation does not involve intrinsics
 
 
 >     16  0 -  0( 57.49%)   1 -  1(  7.30%)   2 -  2(  0.19%)   3 -  3(  3.19%) 
 >     17  0 -  0( 53.78%)   3 -  2( 51.65%)   4 -  3( 37.35%)   4 -  3( 23.94%) 
 >     31  0 -  0( 27.02%)   3 -  2( 51.99%)   4 -  3( 37.34%)   4 -  3( 24.09%) 
 >     32  0 -  0( 56.82%)   3 -  2( 50.42%)   4 -  3( 39.73%)   4 -  3( 25.04%) 
 >     33  0 -  0( 30.60%)   3 -  3( 30.94%)   6 -  4( 46.89%)   6 -  5( 26.21%) 
 >     63  0 -  0( 16.84%)   4 -  3( 21.57%)   6 -  5( 31.74%)   7 -  6( 18.01%) 
 >     64  0 -  0( 21.98%)   4 -  3( 21.35%)   6 -  5( 36.13%)   7 -  6( 20.05%) 
 
 
 It looks like there's a performance degradation in the 0-128 range, can you fix it?
 
 
 eal/riscv/include/rte_memcpy.h
 
 
 > #define ALIGNMENT_MASK_16    0xF
 
 
 unused
 
 
 >/*else*/
 
 
 Please remove /*else*/
 
 
 > static __rte_always_inline void *
 > _rte_memcpy(void *dst, const void *src, size_t n)
 > {
 > 	return _rte_memcpy_generic((uint8_t *)dst, (const uint8_t *)src, n);
 > }
 
 
 No need for an extra function call; you can write the implementation directly in the function
 	
 
 
 
 	-----原始邮件-----
 发件人:"Qiguo Chen" <chen.qiguo@zte.com.cn>
 发送时间:2025-10-16 17:09:33 (星期四)
 收件人: stanislaw.kardach@gmail.com, sunyuechi@iscas.ac.cn, stephen@networkplumber.org
 抄送: dev@dpdk.org, bruce.richardson@intel.com, "Qiguo Chen" <chen.qiguo@zte.com.cn>
 主题: [PATCH v1 1/2] riscv support rte_memcpy in vector
 
 This patch implements RISC-V vector intrinsics
 to accelerate memory copy operations for byte range (129~1600).
 
 Signed-off-by: Qiguo Chen <chen.qiguo@zte.com.cn> 
 ---
  .mailmap                           |   1 +
  config/riscv/meson.build           |  14 ++
  lib/eal/riscv/include/rte_memcpy.h | 310 ++++++++++++++++++++++++++++-
  3 files changed, 323 insertions(+), 2 deletions(-)
 
 diff --git a/.mailmap b/.mailmap
 index 08e5ec8560..178c5f44f4 100644
 --- a/.mailmap
 +++ b/.mailmap
 @@ -1285,6 +1285,7 @@ Qian Hao <qi_an_hao@126.com> 
  Qian Xu <qian.q.xu@intel.com> 
  Qiao Liu <qiao.liu@intel.com> 
  Qi Fu <qi.fu@intel.com> 
 +Qiguo Chen <chen.qiguo@zte.com.cn> 
  Qimai Xiao <qimaix.xiao@intel.com> 
  Qiming Chen <chenqiming_huawei@163.com> 
  Qiming Yang <qiming.yang@intel.com> 
 diff --git a/config/riscv/meson.build b/config/riscv/meson.build
 index f3daea0c0e..abba474b5e 100644
 --- a/config/riscv/meson.build
 +++ b/config/riscv/meson.build
 @@ -146,6 +146,20 @@ if (riscv_extension_macros and
      endif
  endif
   
 +# detect extensions
 +# Requires intrinsics available in GCC 14.1.0+ and Clang 18.1.0+
 +if (riscv_extension_macros and
 +    (cc.get_define('__riscv_zicbop', args: machine_args) != ''))
 +  if ((cc.get_id() == 'gcc' and cc.version().version_compare('>=14.1.0'))
 +      or (cc.get_id() == 'clang' and cc.version().version_compare('>=18.1.0')))
 +      message('Compiling with the zicbop extension')
 +      machine_args += ['-DRTE_RISCV_FEATURE_PREFETCH']
 +  else
 +    warning('Detected zicbop extension but cannot use because intrinsics are not available (present in GCC 14.1.0+ and Clang 18.1.0+)')
 +  endif
 +endif
 +
 +
  # apply flags
  foreach flag: dpdk_flags
      if flag.length() > 0
 diff --git a/lib/eal/riscv/include/rte_memcpy.h b/lib/eal/riscv/include/rte_memcpy.h
 index d8a942c5d2..6f8cb0d4a4 100644
 --- a/lib/eal/riscv/include/rte_memcpy.h
 +++ b/lib/eal/riscv/include/rte_memcpy.h
 @@ -11,6 +11,7 @@
  #include <string.h> 
   
  #include "rte_common.h" 
 +#include <rte_branch_prediction.h> 
   
  #include "generic/rte_memcpy.h" 
   
 @@ -18,6 +19,290 @@
  extern "C" {
  #endif
   
 +
 +#if defined(RTE_RISCV_FEATURE_V) && !(defined(RTE_RISCV_FEATURE_PREFETCH))
 +#undef RTE_RISCV_FEATURE_V
 +#endif
 +
 +
 +#if defined(RTE_RISCV_FEATURE_V)
 +
 +#include "rte_cpuflags.h" 
 +
 +#define RISCV_VLENB   16
 +#define MEMCPY_GLIBC       (1U << 0)
 +#define MEMCPY_RISCV       (1U << 1)
 +#define ALIGNMENT_MASK_128   0x7F
 +#define ALIGNMENT_MASK_64    0x3F
 +#define ALIGNMENT_MASK_16    0xF
 +
 +static uint8_t memcpy_alg = MEMCPY_GLIBC;
 +
 +
 +static __rte_always_inline void
 +memcpy_prefetch64_1(const uint8_t *src, uint8_t *dst)
 +{
 +    __asm__ (
 +        "prefetch.r 64(%0)\n" 
 +        "prefetch.w 64(%1)" 
 +        :: "r"(src), "r"(dst)
 +    );
 +}
 +
 +static __rte_always_inline void
 +memcpy_prefetch128_1(const uint8_t *src, uint8_t *dst)
 +{
 +    __asm__ (
 +        "prefetch.r 128(%0)\n" 
 +        "prefetch.w 128(%1)" 
 +        :: "r"(src), "r"(dst)
 +    );
 +}
 +
 +static __rte_always_inline void
 +memcpy_prefetch128_2(const uint8_t *src, uint8_t *dst)
 +{
 +    __asm__ (
 +        "prefetch.r 128(%0);" 
 +        "prefetch.w 128(%1);" 
 +        "prefetch.r 192(%0);" 
 +        "prefetch.w 192(%1)" 
 +        :: "r"(src), "r"(dst)
 +    );
 +}
 +
 +
 +static __rte_always_inline void
 +_rte_mov32(uint8_t *dst, const uint8_t *src)
 +{
 +    uint32_t n = 32;
 +    asm volatile (
 +         "vsetvli t1, %2, e8, m2, ta, ma\n" 
 +         "vle8.v v2, (%1)\n" 
 +         "vse8.v v2, (%0)" 
 +         :: "r"(dst), "r"(src), "r"(n)
 +         : "v2", "v3", "t1", "memory" 
 +     );
 +}
 +
 +static __rte_always_inline void
 +_rte_mov64(uint8_t *dst, const uint8_t *src)
 +{
 +    uint32_t n = 64;
 +    asm volatile (
 +        "vsetvli t3, %2, e8, m4, ta, ma\n" 
 +        "vle8.v v8, (%1)\n" 
 +        "vse8.v v8, (%0)" 
 +        :: "r"(dst), "r"(src), "r"(n)
 +        :  "v8", "v9", "v10", "v11", "t3", "memory" 
 +     );
 +}
 +
 +static __rte_always_inline void
 +_rte_mov128(uint8_t *dst, const uint8_t *src)
 +{
 +    uint32_t n = 128;
 +    asm volatile (
 +        "vsetvli t4, %2, e8, m8, ta, ma\n" 
 +        "vle8.v v16, (%1)\n" 
 +        "vse8.v v16, (%0)" 
 +        :: "r"(dst), "r"(src), "r"(n)
 +        : "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "memory" 
 +     );
 +}
 +
 +static __rte_always_inline void
 +_rte_mov256(uint8_t *dst, const uint8_t *src)
 +{
 +    memcpy_prefetch128_2(src, dst);
 +    _rte_mov128(dst, src);
 +    _rte_mov128(dst + 128, src + 128);
 +}
 +
 +static __rte_always_inline void
 +_rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 +{
 +    asm volatile (
 +        "prefetch.r 64(%1)\n" 
 +        "prefetch.w 64(%0)\n" 
 +        "prefetch.r 128(%1)\n" 
 +        "prefetch.w 128(%0)\n" 
 +        "prefetch.r 192(%1)\n" 
 +        "prefetch.w 192(%0)\n" 
 +        "prefetch.r 256(%1)\n" 
 +        "prefetch.w 256(%0)\n" 
 +        "prefetch.r 320(%1)\n" 
 +        "prefetch.w 320(%0)\n" 
 +        "prefetch.r 384(%1)\n" 
 +        "prefetch.w 384(%0)\n" 
 +        "prefetch.r 448(%1)\n" 
 +        "prefetch.w 448(%0)\n" 
 +        "prefetch.r 512(%1)\n" 
 +        "li t6, 512\n" 
 +        "3:\n" 
 +        "li t5, 128;" 
 +        "vsetvli zero, t5, e8, m8, ta, ma\n" 
 +        "1:;" 
 +        "bgt %2, t6, 4f\n" 
 +        "j 2f\n" 
 +        "4:\n" 
 +        "prefetch.r 576(%1)\n" 
 +        "prefetch.r 640(%1)\n" 
 +        "2:\n" 
 +        "vle8.v   v16, (%1)\n" 
 +        "add      %1, %1, t5\n" 
 +        "vse8.v   v16, (%0)\n" 
 +        "add      %0, %0, t5\n" 
 +        "sub      %2, %2, t5\n" 
 +        "bnez     %2, 1b" 
 +        : "+r"(dst), "+r"(src), "+r"(n)
 +        :
 +        : "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t5", "t6", "memory" 
 +    );
 +}
 +
 +static __rte_always_inline void
 +_rte_mov(uint8_t *dst, const uint8_t *src, uint32_t n)
 +{
 +    asm volatile (
 +        "1:\n" 
 +        "vsetvli t4, %2, e8, m8, ta, ma\n" 
 +        "vle8.v v16, (%1)\n" 
 +        "add %1, %1, t4\n" 
 +        "vse8.v v16, (%0)\n" 
 +        "add %0, %0, t4\n" 
 +        "sub %2, %2, t4\n" 
 +        "bnez %2, 1b" 
 +        : "+r"(dst), "+r"(src), "+r"(n)
 +        :
 +        : "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "memory" 
 +     );
 +}
 +
 +static __rte_always_inline void
 +_rte_mov_aligned(uint8_t *dst, const uint8_t *src, uint32_t n)
 +{
 +    asm volatile (
 +        "prefetch.r 128(%1)\n" 
 +        "prefetch.r 192(%1)\n" 
 +        "prefetch.r 256(%1)\n" 
 +        "prefetch.r 320(%1)\n" 
 +        "prefetch.r 384(%1)\n" 
 +        "prefetch.r 448(%1)\n" 
 +        "prefetch.r 512(%1)\n" 
 +        "prefetch.r 576(%1)\n" 
 +        "li t6, 640\n" 
 +        "1:\n" 
 +        "vsetvli t4, %2, e8, m8, ta, ma\n" 
 +        "vle8.v v16, (%1)\n" 
 +        "add %1, %1, t4\n" 
 +        "vse8.v v16, (%0)\n" 
 +        "add %0, %0, t4\n" 
 +        "sub %2, %2, t4\n" 
 +        "blt %2, t6, 3f\n" 
 +        "prefetch.r 512(%1)\n" 
 +        "prefetch.r 576(%1)\n" 
 +        "3:\n" 
 +        "bnez %2, 1b" 
 +        : "+r"(dst), "+r"(src), "+r"(n)
 +        :
 +        : "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "t6", "memory" 
 +     );
 +}
 +
 +static __rte_always_inline void *
 +_rte_memcpy_generic(uint8_t       *dst, const uint8_t *src, size_t n)
 +{
 +    void *ret = dst;
 +    size_t dstofss;
 +    uint32_t bn;
 +
 +    if (n <= 384) {
 +        if (n >= 256) {
 +            memcpy_prefetch128_2(src, dst);
 +            n -= 256;
 +            _rte_mov128(dst, src);
 +            _rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
 +            src = (const uint8_t *)src + 256;
 +            dst = (uint8_t *)dst + 256;
 +        }
 +        if (n >= 128) {
 +            memcpy_prefetch128_1(src, dst);
 +            n -= 128;
 +            _rte_mov128(dst, src);
 +            src = (const uint8_t *)src + 128;
 +            dst = (uint8_t *)dst + 128;
 +        }
 +
 +        if (n >= 64) {
 +            memcpy_prefetch64_1(src, dst);
 +            n -= 64;
 +            _rte_mov64(dst, src);
 +            src = (const uint8_t *)src + 64;
 +            dst = (uint8_t *)dst + 64;
 +        }
 +
 +        if (n > 32) {
 +            _rte_mov32(dst, src);
 +            _rte_mov32((uint8_t *)dst - 32 + n,
 +                    (const uint8_t *)src - 32 + n);
 +            return ret;
 +        }
 +
 +        if (n > 0) {
 +            _rte_mov32((uint8_t *)dst - 32 + n,
 +                    (const uint8_t *)src - 32 + n);
 +        }
 +        return ret;
 +    }
 +
 +    /**
 +     * Make store aligned when copy size exceeds 256 bytes.
 +     */
 +    dstofss = (uintptr_t)dst & ALIGNMENT_MASK_64;
 +    if (dstofss > 0) {
 +        dstofss = 64 - dstofss;
 +        n -= dstofss;
 +        _rte_mov64(dst, src);
 +        src = (const uint8_t *)src + dstofss;
 +        dst = (uint8_t *)dst + dstofss;
 +    }
 +
 +    /**
 +     * Copy 128-byte blocks
 +     */
 +    if ((uintptr_t)src & ALIGNMENT_MASK_64)    {
 +        bn = n - (n & ALIGNMENT_MASK_128);
 +        _rte_mov128blocks(dst, src, bn);
 +        n = n & ALIGNMENT_MASK_128;
 +        src = (const uint8_t *)src + bn;
 +        dst = (uint8_t *)dst + bn;
 +        _rte_mov(dst, src, n);
 +    } else
 +        _rte_mov_aligned(dst, src, n);
 +
 +    return ret;
 +}
 +
 +static __rte_always_inline void *
 +_rte_memcpy(void *dst, const void *src, size_t n)
 +{
 +    return _rte_memcpy_generic((uint8_t *)dst, (const uint8_t *)src, n);
 +}
 +#endif
 +
 +/*----------------------api---------------------------------------------------*/
 +static __rte_always_inline void *
 +rte_memcpy(void *dst, const void *src, size_t n)
 +{
 +#if defined(RTE_RISCV_FEATURE_V)
 +    if (likely((memcpy_alg == MEMCPY_RISCV) && (n >= 128) && (n < 2048)))
 +        return _rte_memcpy(dst, src, n);
 +    /*else*/
 +#endif
 +        return memcpy(dst, src, n);
 +}
 +
  static inline void
  rte_mov16(uint8_t *dst, const uint8_t *src)
  {
 @@ -51,10 +336,31 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
  static inline void
  rte_mov256(uint8_t *dst, const uint8_t *src)
  {
 -    memcpy(dst, src, 256);
 +#if defined(RTE_RISCV_FEATURE_V)
 +    if (likely(memcpy_alg == MEMCPY_RISCV))
 +        _rte_mov256(dst, src);
 +    else
 +#endif
 +        memcpy(dst, src, 256);
 +}
 +/*----------------------------------------------------------------------------*/
 +#if defined(RTE_RISCV_FEATURE_V)
 +static inline long
 +riscv_vlenb(void)
 +{
 +    long vlenb;
 +    asm ("csrr %0, 0xc22" : "=r"(vlenb));
 +    return vlenb;
  }
   
 -#define rte_memcpy(d, s, n)    memcpy((d), (s), (n))
 +RTE_INIT(rte_vect_memcpy_init)
 +{
 +    long vlenb = riscv_vlenb();
 +    if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_RISCV_ISA_V) && (vlenb >= RISCV_VLENB))
 +        memcpy_alg = MEMCPY_RISCV;
 +}
 +#endif
 +
   
  #ifdef __cplusplus
  }
 --  
 2.21.0.windows.1

[-- Attachment #1.1.2: Type: text/html , Size: 34249 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v2 1/1] riscv support rte_memcpy in vector
  2025-10-17  9:36     ` [PATCH v2 1/1] riscv support rte_memcpy in vector Qiguo Chen
@ 2025-10-20  9:43       ` sunyuechi
  2025-10-20 12:08       ` [PATCH v3 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
  1 sibling, 0 replies; 21+ messages in thread
From: sunyuechi @ 2025-10-20  9:43 UTC (permalink / raw)
  To: Qiguo Chen; +Cc: stanislaw.kardach, stephen, dev, bruce.richardson

[-- Attachment #1: Type: text/plain, Size: 11146 bytes --]

It looks like the commit message has changed, but it seems the commit title and code files haven't changed. Was the wrong version committed?


-----原始邮件-----
发件人:"Qiguo Chen" <chen.qiguo@zte.com.cn>
发送时间:2025-10-17 17:36:17 (星期五)
收件人: sunyuechi@iscas.ac.cn, stanislaw.kardach@gmail.com, stephen@networkplumber.org
抄送: dev@dpdk.org, bruce.richardson@intel.com, "Qiguo Chen" <chen.qiguo@zte.com.cn>
主题: [PATCH v2 1/1] riscv support rte_memcpy in vector

This patch uses RISC-V vector instructions and zicbop prefetching to
optimize memory copies for 129~1600 byte ranges.

Signed-off-by: Qiguo Chen <chen.qiguo@zte.com.cn>
---
 .mailmap                           |   1 +
 config/riscv/meson.build           |  14 ++
 lib/eal/riscv/include/rte_memcpy.h | 310 ++++++++++++++++++++++++++++-
 3 files changed, 323 insertions(+), 2 deletions(-)

diff --git a/.mailmap b/.mailmap
index 08e5ec8560..178c5f44f4 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1285,6 +1285,7 @@ Qian Hao <qi_an_hao@126.com>
 Qian Xu <qian.q.xu@intel.com>
 Qiao Liu <qiao.liu@intel.com>
 Qi Fu <qi.fu@intel.com>
+Qiguo Chen <chen.qiguo@zte.com.cn>
 Qimai Xiao <qimaix.xiao@intel.com>
 Qiming Chen <chenqiming_huawei@163.com>
 Qiming Yang <qiming.yang@intel.com>
diff --git a/config/riscv/meson.build b/config/riscv/meson.build
index f3daea0c0e..abba474b5e 100644
--- a/config/riscv/meson.build
+++ b/config/riscv/meson.build
@@ -146,6 +146,20 @@ if (riscv_extension_macros and
     endif
 endif
 
+# detect extensions
+# Requires intrinsics available in GCC 14.1.0+ and Clang 18.1.0+
+if (riscv_extension_macros and
+    (cc.get_define('__riscv_zicbop', args: machine_args) != ''))
+  if ((cc.get_id() == 'gcc' and cc.version().version_compare('>=14.1.0'))
+      or (cc.get_id() == 'clang' and cc.version().version_compare('>=18.1.0')))
+      message('Compiling with the zicbop extension')
+      machine_args += ['-DRTE_RISCV_FEATURE_PREFETCH']
+  else
+    warning('Detected zicbop extension but cannot use because intrinsics are not available (present in GCC 14.1.0+ and Clang 18.1.0+)')
+  endif
+endif
+
+
 # apply flags
 foreach flag: dpdk_flags
     if flag.length() > 0
diff --git a/lib/eal/riscv/include/rte_memcpy.h b/lib/eal/riscv/include/rte_memcpy.h
index d8a942c5d2..6f8cb0d4a4 100644
--- a/lib/eal/riscv/include/rte_memcpy.h
+++ b/lib/eal/riscv/include/rte_memcpy.h
@@ -11,6 +11,7 @@
 #include <string.h>
 
 #include "rte_common.h"
+#include <rte_branch_prediction.h>
 
 #include "generic/rte_memcpy.h"
 
@@ -18,6 +19,290 @@
 extern "C" {
 #endif
 
+
+#if defined(RTE_RISCV_FEATURE_V) && !(defined(RTE_RISCV_FEATURE_PREFETCH))
+#undef RTE_RISCV_FEATURE_V
+#endif
+
+
+#if defined(RTE_RISCV_FEATURE_V)
+
+#include "rte_cpuflags.h"
+
+#define RISCV_VLENB   16
+#define MEMCPY_GLIBC       (1U << 0)
+#define MEMCPY_RISCV       (1U << 1)
+#define ALIGNMENT_MASK_128   0x7F
+#define ALIGNMENT_MASK_64    0x3F
+#define ALIGNMENT_MASK_16    0xF
+
+static uint8_t memcpy_alg = MEMCPY_GLIBC;
+
+
+static __rte_always_inline void
+memcpy_prefetch64_1(const uint8_t *src, uint8_t *dst)
+{
+    __asm__ (
+        "prefetch.r 64(%0)\n"
+        "prefetch.w 64(%1)"
+        :: "r"(src), "r"(dst)
+    );
+}
+
+static __rte_always_inline void
+memcpy_prefetch128_1(const uint8_t *src, uint8_t *dst)
+{
+    __asm__ (
+        "prefetch.r 128(%0)\n"
+        "prefetch.w 128(%1)"
+        :: "r"(src), "r"(dst)
+    );
+}
+
+static __rte_always_inline void
+memcpy_prefetch128_2(const uint8_t *src, uint8_t *dst)
+{
+    __asm__ (
+        "prefetch.r 128(%0);"
+        "prefetch.w 128(%1);"
+        "prefetch.r 192(%0);"
+        "prefetch.w 192(%1)"
+        :: "r"(src), "r"(dst)
+    );
+}
+
+
+static __rte_always_inline void
+_rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+    uint32_t n = 32;
+    asm volatile (
+         "vsetvli t1, %2, e8, m2, ta, ma\n"
+         "vle8.v v2, (%1)\n"
+         "vse8.v v2, (%0)"
+         :: "r"(dst), "r"(src), "r"(n)
+         : "v2", "v3", "t1", "memory"
+     );
+}
+
+static __rte_always_inline void
+_rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+    uint32_t n = 64;
+    asm volatile (
+        "vsetvli t3, %2, e8, m4, ta, ma\n"
+        "vle8.v v8, (%1)\n"
+        "vse8.v v8, (%0)"
+        :: "r"(dst), "r"(src), "r"(n)
+        :  "v8", "v9", "v10", "v11", "t3", "memory"
+     );
+}
+
+static __rte_always_inline void
+_rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+    uint32_t n = 128;
+    asm volatile (
+        "vsetvli t4, %2, e8, m8, ta, ma\n"
+        "vle8.v v16, (%1)\n"
+        "vse8.v v16, (%0)"
+        :: "r"(dst), "r"(src), "r"(n)
+        : "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "memory"
+     );
+}
+
+static __rte_always_inline void
+_rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+    memcpy_prefetch128_2(src, dst);
+    _rte_mov128(dst, src);
+    _rte_mov128(dst + 128, src + 128);
+}
+
+static __rte_always_inline void
+_rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+{
+    asm volatile (
+        "prefetch.r 64(%1)\n"
+        "prefetch.w 64(%0)\n"
+        "prefetch.r 128(%1)\n"
+        "prefetch.w 128(%0)\n"
+        "prefetch.r 192(%1)\n"
+        "prefetch.w 192(%0)\n"
+        "prefetch.r 256(%1)\n"
+        "prefetch.w 256(%0)\n"
+        "prefetch.r 320(%1)\n"
+        "prefetch.w 320(%0)\n"
+        "prefetch.r 384(%1)\n"
+        "prefetch.w 384(%0)\n"
+        "prefetch.r 448(%1)\n"
+        "prefetch.w 448(%0)\n"
+        "prefetch.r 512(%1)\n"
+        "li t6, 512\n"
+        "3:\n"
+        "li t5, 128;"
+        "vsetvli zero, t5, e8, m8, ta, ma\n"
+        "1:;"
+        "bgt %2, t6, 4f\n"
+        "j 2f\n"
+        "4:\n"
+        "prefetch.r 576(%1)\n"
+        "prefetch.r 640(%1)\n"
+        "2:\n"
+        "vle8.v   v16, (%1)\n"
+        "add      %1, %1, t5\n"
+        "vse8.v   v16, (%0)\n"
+        "add      %0, %0, t5\n"
+        "sub      %2, %2, t5\n"
+        "bnez     %2, 1b"
+        : "+r"(dst), "+r"(src), "+r"(n)
+        :
+        : "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t5", "t6", "memory"
+    );
+}
+
+static __rte_always_inline void
+_rte_mov(uint8_t *dst, const uint8_t *src, uint32_t n)
+{
+    asm volatile (
+        "1:\n"
+        "vsetvli t4, %2, e8, m8, ta, ma\n"
+        "vle8.v v16, (%1)\n"
+        "add %1, %1, t4\n"
+        "vse8.v v16, (%0)\n"
+        "add %0, %0, t4\n"
+        "sub %2, %2, t4\n"
+        "bnez %2, 1b"
+        : "+r"(dst), "+r"(src), "+r"(n)
+        :
+        : "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "memory"
+     );
+}
+
+static __rte_always_inline void
+_rte_mov_aligned(uint8_t *dst, const uint8_t *src, uint32_t n)
+{
+    asm volatile (
+        "prefetch.r 128(%1)\n"
+        "prefetch.r 192(%1)\n"
+        "prefetch.r 256(%1)\n"
+        "prefetch.r 320(%1)\n"
+        "prefetch.r 384(%1)\n"
+        "prefetch.r 448(%1)\n"
+        "prefetch.r 512(%1)\n"
+        "prefetch.r 576(%1)\n"
+        "li t6, 640\n"
+        "1:\n"
+        "vsetvli t4, %2, e8, m8, ta, ma\n"
+        "vle8.v v16, (%1)\n"
+        "add %1, %1, t4\n"
+        "vse8.v v16, (%0)\n"
+        "add %0, %0, t4\n"
+        "sub %2, %2, t4\n"
+        "blt %2, t6, 3f\n"
+        "prefetch.r 512(%1)\n"
+        "prefetch.r 576(%1)\n"
+        "3:\n"
+        "bnez %2, 1b"
+        : "+r"(dst), "+r"(src), "+r"(n)
+        :
+        : "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "t6", "memory"
+     );
+}
+
+static __rte_always_inline void *
+_rte_memcpy_generic(uint8_t       *dst, const uint8_t *src, size_t n)
+{
+    void *ret = dst;
+    size_t dstofss;
+    uint32_t bn;
+
+    if (n <= 384) {
+        if (n >= 256) {
+            memcpy_prefetch128_2(src, dst);
+            n -= 256;
+            _rte_mov128(dst, src);
+            _rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
+            src = (const uint8_t *)src + 256;
+            dst = (uint8_t *)dst + 256;
+        }
+        if (n >= 128) {
+            memcpy_prefetch128_1(src, dst);
+            n -= 128;
+            _rte_mov128(dst, src);
+            src = (const uint8_t *)src + 128;
+            dst = (uint8_t *)dst + 128;
+        }
+
+        if (n >= 64) {
+            memcpy_prefetch64_1(src, dst);
+            n -= 64;
+            _rte_mov64(dst, src);
+            src = (const uint8_t *)src + 64;
+            dst = (uint8_t *)dst + 64;
+        }
+
+        if (n > 32) {
+            _rte_mov32(dst, src);
+            _rte_mov32((uint8_t *)dst - 32 + n,
+                    (const uint8_t *)src - 32 + n);
+            return ret;
+        }
+
+        if (n > 0) {
+            _rte_mov32((uint8_t *)dst - 32 + n,
+                    (const uint8_t *)src - 32 + n);
+        }
+        return ret;
+    }
+
+    /**
+     * Make store aligned when copy size exceeds 256 bytes.
+     */
+    dstofss = (uintptr_t)dst & ALIGNMENT_MASK_64;
+    if (dstofss > 0) {
+        dstofss = 64 - dstofss;
+        n -= dstofss;
+        _rte_mov64(dst, src);
+        src = (const uint8_t *)src + dstofss;
+        dst = (uint8_t *)dst + dstofss;
+    }
+
+    /**
+     * Copy 128-byte blocks
+     */
+    if ((uintptr_t)src & ALIGNMENT_MASK_64)    {
+        bn = n - (n & ALIGNMENT_MASK_128);
+        _rte_mov128blocks(dst, src, bn);
+        n = n & ALIGNMENT_MASK_128;
+        src = (const uint8_t *)src + bn;
+        dst = (uint8_t *)dst + bn;
+        _rte_mov(dst, src, n);
+    } else
+        _rte_mov_aligned(dst, src, n);
+
+    return ret;
+}
+
+static __rte_always_inline void *
+_rte_memcpy(void *dst, const void *src, size_t n)
+{
+    return _rte_memcpy_generic((uint8_t *)dst, (const uint8_t *)src, n);
+}
+#endif
+
+/*----------------------api---------------------------------------------------*/
+static __rte_always_inline void *
+rte_memcpy(void *dst, const void *src, size_t n)
+{
+#if defined(RTE_RISCV_FEATURE_V)
+    if (likely((memcpy_alg == MEMCPY_RISCV) && (n >= 128) && (n < 2048)))
+        return _rte_memcpy(dst, src, n);
+    /*else*/
+#endif
+        return memcpy(dst, src, n);
+}
+
 static inline void
 rte_mov16(uint8_t *dst, const uint8_t *src)
 {
@@ -51,10 +336,31 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
 static inline void
 rte_mov256(uint8_t *dst, const uint8_t *src)
 {
-    memcpy(dst, src, 256);
+#if defined(RTE_RISCV_FEATURE_V)
+    if (likely(memcpy_alg == MEMCPY_RISCV))
+        _rte_mov256(dst, src);
+    else
+#endif
+        memcpy(dst, src, 256);
+}
+/*----------------------------------------------------------------------------*/
+#if defined(RTE_RISCV_FEATURE_V)
+static inline long
+riscv_vlenb(void)
+{
+    long vlenb;
+    asm ("csrr %0, 0xc22" : "=r"(vlenb));
+    return vlenb;
 }
 
-#define rte_memcpy(d, s, n)    memcpy((d), (s), (n))
+RTE_INIT(rte_vect_memcpy_init)
+{
+    long vlenb = riscv_vlenb();
+    if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_RISCV_ISA_V) && (vlenb >= RISCV_VLENB))
+        memcpy_alg = MEMCPY_RISCV;
+}
+#endif
+
 
 #ifdef __cplusplus
 }
-- 
2.21.0.windows.1

[-- Attachment #2: Type: text/html, Size: 25605 bytes --]

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH v3 0/1] Optimization Summary for RISC-V rte_memcpy
  2025-10-17  9:36     ` [PATCH v2 1/1] riscv support rte_memcpy in vector Qiguo Chen
  2025-10-20  9:43       ` sunyuechi
@ 2025-10-20 12:08       ` Qiguo Chen
  2025-10-20 12:08         ` [PATCH v3 1/1] lib/eal/riscv: optimize rte_memcpy with RISCV vector and zicbop extensions Qiguo Chen
  1 sibling, 1 reply; 21+ messages in thread
From: Qiguo Chen @ 2025-10-20 12:08 UTC (permalink / raw)
  To: sunyuechi, stanislaw.kardach, stephen; +Cc: dev, bruce.richardson, Qiguo Chen


[-- Attachment #1.1.1: Type: text/plain, Size: 1201 bytes --]

Changes in v3:
     1)Change title for patch2.
     2)Apply correct patch version for patch2.
       Thanks to Sunyuechi for the reminder.

Changes in v2:
     1)Modify some code and descriptions according to Sunyuechi's
     suggestions.
     2)Removed benchmark_report.txt to avoid warnings.

[PATCH v1]
I've implemented optimizations to rte_memcpy targeting RISC-V
architectures, achieving an average 10%~15% reduction in execution time
for data sizes between 129 and 1024 bytes (1025~1600 bytes gain little).
These enhancements draw inspiration from x86 implementations,
specifically focusing on:
1)Alignment Handling for Unaligned Scenarios
2)Vector Configuration Tuning
3)Strategic Prefetching with zicbop

- Patch 1: Cover letter
- Patch 2: Base implementation
- Patch 3: Benchmark report


Tested on SG2044 (VLEN=128)

Qiguo Chen (1):
  lib/eal/riscv: optimize rte_memcpy with RISCV vector and zicbop
    extensions

 .mailmap                           |   1 +
 config/riscv/meson.build           |  14 ++
 lib/eal/riscv/include/rte_memcpy.h | 303 ++++++++++++++++++++++++++++-
 3 files changed, 316 insertions(+), 2 deletions(-)

-- 
2.21.0.windows.1

[-- Attachment #1.1.2: Type: text/html , Size: 2339 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v3 1/1] lib/eal/riscv: optimize rte_memcpy with RISCV vector and zicbop extensions
  2025-10-20 12:08       ` [PATCH v3 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
@ 2025-10-20 12:08         ` Qiguo Chen
  2025-10-21  6:56           ` [PATCH v4 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
  0 siblings, 1 reply; 21+ messages in thread
From: Qiguo Chen @ 2025-10-20 12:08 UTC (permalink / raw)
  To: sunyuechi, stanislaw.kardach, stephen; +Cc: dev, bruce.richardson, Qiguo Chen


[-- Attachment #1.1.1: Type: text/plain, Size: 9649 bytes --]

This patch uses RISC-V vector instructions and zicbop prefetching to
optimize memory copies for 129~1600 byte ranges.

Signed-off-by: Qiguo Chen <chen.qiguo@zte.com.cn>
---
 .mailmap                           |   1 +
 config/riscv/meson.build           |  14 ++
 lib/eal/riscv/include/rte_memcpy.h | 303 ++++++++++++++++++++++++++++-
 3 files changed, 316 insertions(+), 2 deletions(-)

diff --git a/.mailmap b/.mailmap
index 08e5ec8560..178c5f44f4 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1285,6 +1285,7 @@ Qian Hao <qi_an_hao@126.com>
 Qian Xu <qian.q.xu@intel.com>
 Qiao Liu <qiao.liu@intel.com>
 Qi Fu <qi.fu@intel.com>
+Qiguo Chen <chen.qiguo@zte.com.cn>
 Qimai Xiao <qimaix.xiao@intel.com>
 Qiming Chen <chenqiming_huawei@163.com>
 Qiming Yang <qiming.yang@intel.com>
diff --git a/config/riscv/meson.build b/config/riscv/meson.build
index f3daea0c0e..abba474b5e 100644
--- a/config/riscv/meson.build
+++ b/config/riscv/meson.build
@@ -146,6 +146,20 @@ if (riscv_extension_macros and
     endif
 endif
 
+# detect extensions
+# Requires intrinsics available in GCC 14.1.0+ and Clang 18.1.0+
+if (riscv_extension_macros and
+    (cc.get_define('__riscv_zicbop', args: machine_args) != ''))
+  if ((cc.get_id() == 'gcc' and cc.version().version_compare('>=14.1.0'))
+      or (cc.get_id() == 'clang' and cc.version().version_compare('>=18.1.0')))
+      message('Compiling with the zicbop extension')
+      machine_args += ['-DRTE_RISCV_FEATURE_PREFETCH']
+  else
+    warning('Detected zicbop extension but cannot use because intrinsics are not available (present in GCC 14.1.0+ and Clang 18.1.0+)')
+  endif
+endif
+
+
 # apply flags
 foreach flag: dpdk_flags
     if flag.length() > 0
diff --git a/lib/eal/riscv/include/rte_memcpy.h b/lib/eal/riscv/include/rte_memcpy.h
index d8a942c5d2..fb817e5f43 100644
--- a/lib/eal/riscv/include/rte_memcpy.h
+++ b/lib/eal/riscv/include/rte_memcpy.h
@@ -11,6 +11,7 @@
 #include <string.h>
 
 #include "rte_common.h"
+#include <rte_branch_prediction.h>
 
 #include "generic/rte_memcpy.h"
 
@@ -18,6 +19,283 @@
 extern "C" {
 #endif
 
+
+#if defined(RTE_RISCV_FEATURE_V) && !(defined(RTE_RISCV_FEATURE_PREFETCH))
+#undef RTE_RISCV_FEATURE_V
+#endif
+
+
+#if defined(RTE_RISCV_FEATURE_V)
+
+#include "rte_cpuflags.h"
+
+#define RISCV_VLENB         16
+#define MEMCPY_GLIBC        (1U << 0)
+#define MEMCPY_RISCV        (1U << 1)
+#define ALIGNMENT_MASK_128  0x7F
+#define ALIGNMENT_MASK_64   0x3F
+
+static uint8_t memcpy_alg = MEMCPY_GLIBC;
+
+
+static __rte_always_inline void
+memcpy_prefetch64_1(const uint8_t *src, uint8_t *dst)
+{
+	__asm__ (
+		"prefetch.r 64(%0)\n"
+		"prefetch.w 64(%1)"
+		:: "r"(src), "r"(dst)
+	);
+}
+
+static __rte_always_inline void
+memcpy_prefetch128_1(const uint8_t *src, uint8_t *dst)
+{
+	__asm__ (
+		"prefetch.r 128(%0)\n"
+		"prefetch.w 128(%1)"
+		:: "r"(src), "r"(dst)
+	);
+}
+
+static __rte_always_inline void
+memcpy_prefetch128_2(const uint8_t *src, uint8_t *dst)
+{
+	__asm__ (
+		"prefetch.r 128(%0);"
+		"prefetch.w 128(%1);"
+		"prefetch.r 192(%0);"
+		"prefetch.w 192(%1)"
+		:: "r"(src), "r"(dst)
+	);
+}
+
+
+static __rte_always_inline void
+_rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+	uint32_t n = 32;
+	asm volatile (
+	     "vsetvli t1, %2, e8, m2, ta, ma\n"
+	     "vle8.v v2, (%1)\n"
+	     "vse8.v v2, (%0)"
+	     :: "r"(dst), "r"(src), "r"(n)
+	     : "v2", "v3", "t1", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+	uint32_t n = 64;
+	asm volatile (
+		"vsetvli t3, %2, e8, m4, ta, ma\n"
+		"vle8.v v8, (%1)\n"
+		"vse8.v v8, (%0)"
+		:: "r"(dst), "r"(src), "r"(n)
+		:  "v8", "v9", "v10", "v11", "t3", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+	uint32_t n = 128;
+	asm volatile (
+		"vsetvli t4, %2, e8, m8, ta, ma\n"
+		"vle8.v v16, (%1)\n"
+		"vse8.v v16, (%0)"
+		:: "r"(dst), "r"(src), "r"(n)
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+	memcpy_prefetch128_2(src, dst);
+	_rte_mov128(dst, src);
+	_rte_mov128(dst + 128, src + 128);
+}
+
+static __rte_always_inline void
+_rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	asm volatile (
+		"prefetch.r 64(%1)\n"
+		"prefetch.w 64(%0)\n"
+		"prefetch.r 128(%1)\n"
+		"prefetch.w 128(%0)\n"
+		"prefetch.r 192(%1)\n"
+		"prefetch.w 192(%0)\n"
+		"prefetch.r 256(%1)\n"
+		"prefetch.w 256(%0)\n"
+		"prefetch.r 320(%1)\n"
+		"prefetch.w 320(%0)\n"
+		"prefetch.r 384(%1)\n"
+		"prefetch.w 384(%0)\n"
+		"prefetch.r 448(%1)\n"
+		"prefetch.w 448(%0)\n"
+		"prefetch.r 512(%1)\n"
+		"li t6, 512\n"
+		"3:\n"
+		"li t5, 128;"
+		"vsetvli zero, t5, e8, m8, ta, ma\n"
+		"1:;"
+		"bgt %2, t6, 4f\n"
+		"j 2f\n"
+		"4:\n"
+		"prefetch.r 576(%1)\n"
+		"prefetch.r 640(%1)\n"
+		"2:\n"
+		"vle8.v   v16, (%1)\n"
+		"add      %1, %1, t5\n"
+		"vse8.v   v16, (%0)\n"
+		"add      %0, %0, t5\n"
+		"sub      %2, %2, t5\n"
+		"bnez     %2, 1b"
+		: "+r"(dst), "+r"(src), "+r"(n)
+		:
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t5", "t6", "memory"
+	);
+}
+
+static __rte_always_inline void
+_rte_mov(uint8_t *dst, const uint8_t *src, uint32_t n)
+{
+	asm volatile (
+		"1:\n"
+		"vsetvli t4, %2, e8, m8, ta, ma\n"
+		"vle8.v v16, (%1)\n"
+		"add %1, %1, t4\n"
+		"vse8.v v16, (%0)\n"
+		"add %0, %0, t4\n"
+		"sub %2, %2, t4\n"
+		"bnez %2, 1b"
+		: "+r"(dst), "+r"(src), "+r"(n)
+		:
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov_aligned(uint8_t *dst, const uint8_t *src, uint32_t n)
+{
+	asm volatile (
+		"prefetch.r 128(%1)\n"
+		"prefetch.r 192(%1)\n"
+		"prefetch.r 256(%1)\n"
+		"prefetch.r 320(%1)\n"
+		"prefetch.r 384(%1)\n"
+		"prefetch.r 448(%1)\n"
+		"prefetch.r 512(%1)\n"
+		"prefetch.r 576(%1)\n"
+		"li t6, 640\n"
+		"1:\n"
+		"vsetvli t4, %2, e8, m8, ta, ma\n"
+		"vle8.v v16, (%1)\n"
+		"add %1, %1, t4\n"
+		"vse8.v v16, (%0)\n"
+		"add %0, %0, t4\n"
+		"sub %2, %2, t4\n"
+		"blt %2, t6, 3f\n"
+		"prefetch.r 512(%1)\n"
+		"prefetch.r 576(%1)\n"
+		"3:\n"
+		"bnez %2, 1b"
+		: "+r"(dst), "+r"(src), "+r"(n)
+		:
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "t6", "memory"
+	 );
+}
+
+static __rte_always_inline void *
+_rte_memcpy(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	void *ret = dst;
+	size_t dstofss;
+	uint32_t bn;
+
+	if (n <= 384) {
+		if (n >= 256) {
+			memcpy_prefetch128_2(src, dst);
+			n -= 256;
+			_rte_mov128(dst, src);
+			_rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
+			src = (const uint8_t *)src + 256;
+			dst = (uint8_t *)dst + 256;
+		}
+		if (n >= 128) {
+			memcpy_prefetch128_1(src, dst);
+			n -= 128;
+			_rte_mov128(dst, src);
+			src = (const uint8_t *)src + 128;
+			dst = (uint8_t *)dst + 128;
+		}
+
+		if (n >= 64) {
+			memcpy_prefetch64_1(src, dst);
+			n -= 64;
+			_rte_mov64(dst, src);
+			src = (const uint8_t *)src + 64;
+			dst = (uint8_t *)dst + 64;
+		}
+
+		if (n > 32) {
+			_rte_mov32(dst, src);
+			_rte_mov32((uint8_t *)dst - 32 + n,
+					(const uint8_t *)src - 32 + n);
+			return ret;
+		}
+
+		if (n > 0) {
+			_rte_mov32((uint8_t *)dst - 32 + n,
+					(const uint8_t *)src - 32 + n);
+		}
+		return ret;
+	}
+
+	/**
+	 * Make store aligned when copy size exceeds 256 bytes.
+	 */
+	dstofss = (uintptr_t)dst & ALIGNMENT_MASK_64;
+	if (dstofss > 0) {
+		dstofss = 64 - dstofss;
+		n -= dstofss;
+		_rte_mov64(dst, src);
+		src = (const uint8_t *)src + dstofss;
+		dst = (uint8_t *)dst + dstofss;
+	}
+
+	/**
+	 * Copy 128-byte blocks
+	 */
+	if ((uintptr_t)src & ALIGNMENT_MASK_64)	{
+		bn = n - (n & ALIGNMENT_MASK_128);
+		_rte_mov128blocks(dst, src, bn);
+		n = n & ALIGNMENT_MASK_128;
+		src = (const uint8_t *)src + bn;
+		dst = (uint8_t *)dst + bn;
+		_rte_mov(dst, src, n);
+	} else
+		_rte_mov_aligned(dst, src, n);
+
+	return ret;
+}
+
+#endif
+
+/*----------------------api---------------------------------------------------*/
+static __rte_always_inline void *
+rte_memcpy(void *dst, const void *src, size_t n)
+{
+#if defined(RTE_RISCV_FEATURE_V)
+	if (likely((memcpy_alg == MEMCPY_RISCV) && (n >= 128) && (n < 2048)))
+		return _rte_memcpy((uint8_t *)dst, (const uint8_t *)src, n);
+#endif
+	return memcpy(dst, src, n);
+}
+
 static inline void
 rte_mov16(uint8_t *dst, const uint8_t *src)
 {
@@ -51,10 +329,31 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
 static inline void
 rte_mov256(uint8_t *dst, const uint8_t *src)
 {
-	memcpy(dst, src, 256);
+#if defined(RTE_RISCV_FEATURE_V)
+	if (likely(memcpy_alg == MEMCPY_RISCV))
+		_rte_mov256(dst, src);
+	else
+#endif
+		memcpy(dst, src, 256);
+}
+/*----------------------------------------------------------------------------*/
+#if defined(RTE_RISCV_FEATURE_V)
+static inline long
+riscv_vlenb(void)
+{
+	long vlenb;
+	asm ("csrr %0, 0xc22" : "=r"(vlenb));
+	return vlenb;
 }
 
-#define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
+RTE_INIT(rte_vect_memcpy_init)
+{
+	long vlenb = riscv_vlenb();
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_RISCV_ISA_V) && (vlenb >= RISCV_VLENB))
+		memcpy_alg = MEMCPY_RISCV;
+}
+#endif
+
 
 #ifdef __cplusplus
 }
-- 
2.21.0.windows.1

[-- Attachment #1.1.2: Type: text/html , Size: 26287 bytes --]

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH v4 0/1] Optimization Summary for RISC-V rte_memcpy
  2025-10-20 12:08         ` [PATCH v3 1/1] lib/eal/riscv: optimize rte_memcpy with RISCV vector and zicbop extensions Qiguo Chen
@ 2025-10-21  6:56           ` Qiguo Chen
  2025-10-21  6:56             ` [PATCH v4 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions Qiguo Chen
  0 siblings, 1 reply; 21+ messages in thread
From: Qiguo Chen @ 2025-10-21  6:56 UTC (permalink / raw)
  To: sunyuechi, stanislaw.kardach, stephen; +Cc: dev, bruce.richardson, Qiguo Chen


[-- Attachment #1.1.1: Type: text/plain, Size: 1233 bytes --]

Changes in v4:
     1)rebase code only.

Changes in v3:
     1)Change title for patch2.
     2)Apply correct patch version for patch2.
       Thanks to Sunyuechi for the reminder.

Changes in v2:
     1)Modify some code and descriptions according to Sunyuechi's
     suggestions.
     2)Removed benchmark_report.txt to avoid warnings.

[PATCH v1]
I've implemented optimizations to rte_memcpy targeting RISC-V
architectures, achieving an average 10%~15% reduction in execution time
for data sizes between 129 and 1024 bytes (1025~1600 bytes gain little).
These enhancements draw inspiration from x86 implementations,
specifically focusing on:
1)Alignment Handling for Unaligned Scenarios
2)Vector Configuration Tuning
3)Strategic Prefetching with zicbop

- Patch 1: Cover letter
- Patch 2: Base implementation
- Patch 3: Benchmark report


Tested on SG2044 (VLEN=128)


Qiguo Chen (1):
  eal/riscv: optimize rte_memcpy with vector and zicbop extensions

 .mailmap                           |   1 +
 config/riscv/meson.build           |  14 ++
 lib/eal/riscv/include/rte_memcpy.h | 303 ++++++++++++++++++++++++++++-
 3 files changed, 316 insertions(+), 2 deletions(-)

-- 
2.21.0.windows.1

[-- Attachment #1.1.2: Type: text/html , Size: 2416 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v4 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions
  2025-10-21  6:56           ` [PATCH v4 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
@ 2025-10-21  6:56             ` Qiguo Chen
  2025-10-24  2:56               ` retest Qiguo Chen
                                 ` (4 more replies)
  0 siblings, 5 replies; 21+ messages in thread
From: Qiguo Chen @ 2025-10-21  6:56 UTC (permalink / raw)
  To: sunyuechi, stanislaw.kardach, stephen; +Cc: dev, bruce.richardson, Qiguo Chen


[-- Attachment #1.1.1: Type: text/plain, Size: 9649 bytes --]

This patch uses RISC-V vector instructions and zicbop prefetching to
optimize memory copies for 129~1600 byte ranges.

Signed-off-by: Qiguo Chen <chen.qiguo@zte.com.cn>
---
 .mailmap                           |   1 +
 config/riscv/meson.build           |  14 ++
 lib/eal/riscv/include/rte_memcpy.h | 303 ++++++++++++++++++++++++++++-
 3 files changed, 316 insertions(+), 2 deletions(-)

diff --git a/.mailmap b/.mailmap
index 3817bf7cdb..85f50bce87 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1283,6 +1283,7 @@ Qian Hao <qi_an_hao@126.com>
 Qian Xu <qian.q.xu@intel.com>
 Qiao Liu <qiao.liu@intel.com>
 Qi Fu <qi.fu@intel.com>
+Qiguo Chen <chen.qiguo@zte.com.cn>
 Qimai Xiao <qimaix.xiao@intel.com>
 Qiming Chen <chenqiming_huawei@163.com>
 Qiming Yang <qiming.yang@intel.com>
diff --git a/config/riscv/meson.build b/config/riscv/meson.build
index f3daea0c0e..abba474b5e 100644
--- a/config/riscv/meson.build
+++ b/config/riscv/meson.build
@@ -146,6 +146,20 @@ if (riscv_extension_macros and
     endif
 endif
 
+# detect extensions
+# Requires intrinsics available in GCC 14.1.0+ and Clang 18.1.0+
+if (riscv_extension_macros and
+    (cc.get_define('__riscv_zicbop', args: machine_args) != ''))
+  if ((cc.get_id() == 'gcc' and cc.version().version_compare('>=14.1.0'))
+      or (cc.get_id() == 'clang' and cc.version().version_compare('>=18.1.0')))
+      message('Compiling with the zicbop extension')
+      machine_args += ['-DRTE_RISCV_FEATURE_PREFETCH']
+  else
+    warning('Detected zicbop extension but cannot use because intrinsics are not available (present in GCC 14.1.0+ and Clang 18.1.0+)')
+  endif
+endif
+
+
 # apply flags
 foreach flag: dpdk_flags
     if flag.length() > 0
diff --git a/lib/eal/riscv/include/rte_memcpy.h b/lib/eal/riscv/include/rte_memcpy.h
index d8a942c5d2..fb817e5f43 100644
--- a/lib/eal/riscv/include/rte_memcpy.h
+++ b/lib/eal/riscv/include/rte_memcpy.h
@@ -11,6 +11,7 @@
 #include <string.h>
 
 #include "rte_common.h"
+#include <rte_branch_prediction.h>
 
 #include "generic/rte_memcpy.h"
 
@@ -18,6 +19,283 @@
 extern "C" {
 #endif
 
+
+#if defined(RTE_RISCV_FEATURE_V) && !(defined(RTE_RISCV_FEATURE_PREFETCH))
+#undef RTE_RISCV_FEATURE_V
+#endif
+
+
+#if defined(RTE_RISCV_FEATURE_V)
+
+#include "rte_cpuflags.h"
+
+#define RISCV_VLENB         16
+#define MEMCPY_GLIBC        (1U << 0)
+#define MEMCPY_RISCV        (1U << 1)
+#define ALIGNMENT_MASK_128  0x7F
+#define ALIGNMENT_MASK_64   0x3F
+
+static uint8_t memcpy_alg = MEMCPY_GLIBC;
+
+
+static __rte_always_inline void
+memcpy_prefetch64_1(const uint8_t *src, uint8_t *dst)
+{
+	__asm__ (
+		"prefetch.r 64(%0)\n"
+		"prefetch.w 64(%1)"
+		:: "r"(src), "r"(dst)
+	);
+}
+
+static __rte_always_inline void
+memcpy_prefetch128_1(const uint8_t *src, uint8_t *dst)
+{
+	__asm__ (
+		"prefetch.r 128(%0)\n"
+		"prefetch.w 128(%1)"
+		:: "r"(src), "r"(dst)
+	);
+}
+
+static __rte_always_inline void
+memcpy_prefetch128_2(const uint8_t *src, uint8_t *dst)
+{
+	__asm__ (
+		"prefetch.r 128(%0);"
+		"prefetch.w 128(%1);"
+		"prefetch.r 192(%0);"
+		"prefetch.w 192(%1)"
+		:: "r"(src), "r"(dst)
+	);
+}
+
+
+static __rte_always_inline void
+_rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+	uint32_t n = 32;
+	asm volatile (
+	     "vsetvli t1, %2, e8, m2, ta, ma\n"
+	     "vle8.v v2, (%1)\n"
+	     "vse8.v v2, (%0)"
+	     :: "r"(dst), "r"(src), "r"(n)
+	     : "v2", "v3", "t1", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+	uint32_t n = 64;
+	asm volatile (
+		"vsetvli t3, %2, e8, m4, ta, ma\n"
+		"vle8.v v8, (%1)\n"
+		"vse8.v v8, (%0)"
+		:: "r"(dst), "r"(src), "r"(n)
+		:  "v8", "v9", "v10", "v11", "t3", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+	uint32_t n = 128;
+	asm volatile (
+		"vsetvli t4, %2, e8, m8, ta, ma\n"
+		"vle8.v v16, (%1)\n"
+		"vse8.v v16, (%0)"
+		:: "r"(dst), "r"(src), "r"(n)
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+	memcpy_prefetch128_2(src, dst);
+	_rte_mov128(dst, src);
+	_rte_mov128(dst + 128, src + 128);
+}
+
+static __rte_always_inline void
+_rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	asm volatile (
+		"prefetch.r 64(%1)\n"
+		"prefetch.w 64(%0)\n"
+		"prefetch.r 128(%1)\n"
+		"prefetch.w 128(%0)\n"
+		"prefetch.r 192(%1)\n"
+		"prefetch.w 192(%0)\n"
+		"prefetch.r 256(%1)\n"
+		"prefetch.w 256(%0)\n"
+		"prefetch.r 320(%1)\n"
+		"prefetch.w 320(%0)\n"
+		"prefetch.r 384(%1)\n"
+		"prefetch.w 384(%0)\n"
+		"prefetch.r 448(%1)\n"
+		"prefetch.w 448(%0)\n"
+		"prefetch.r 512(%1)\n"
+		"li t6, 512\n"
+		"3:\n"
+		"li t5, 128;"
+		"vsetvli zero, t5, e8, m8, ta, ma\n"
+		"1:;"
+		"bgt %2, t6, 4f\n"
+		"j 2f\n"
+		"4:\n"
+		"prefetch.r 576(%1)\n"
+		"prefetch.r 640(%1)\n"
+		"2:\n"
+		"vle8.v   v16, (%1)\n"
+		"add      %1, %1, t5\n"
+		"vse8.v   v16, (%0)\n"
+		"add      %0, %0, t5\n"
+		"sub      %2, %2, t5\n"
+		"bnez     %2, 1b"
+		: "+r"(dst), "+r"(src), "+r"(n)
+		:
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t5", "t6", "memory"
+	);
+}
+
+static __rte_always_inline void
+_rte_mov(uint8_t *dst, const uint8_t *src, uint32_t n)
+{
+	asm volatile (
+		"1:\n"
+		"vsetvli t4, %2, e8, m8, ta, ma\n"
+		"vle8.v v16, (%1)\n"
+		"add %1, %1, t4\n"
+		"vse8.v v16, (%0)\n"
+		"add %0, %0, t4\n"
+		"sub %2, %2, t4\n"
+		"bnez %2, 1b"
+		: "+r"(dst), "+r"(src), "+r"(n)
+		:
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov_aligned(uint8_t *dst, const uint8_t *src, uint32_t n)
+{
+	asm volatile (
+		"prefetch.r 128(%1)\n"
+		"prefetch.r 192(%1)\n"
+		"prefetch.r 256(%1)\n"
+		"prefetch.r 320(%1)\n"
+		"prefetch.r 384(%1)\n"
+		"prefetch.r 448(%1)\n"
+		"prefetch.r 512(%1)\n"
+		"prefetch.r 576(%1)\n"
+		"li t6, 640\n"
+		"1:\n"
+		"vsetvli t4, %2, e8, m8, ta, ma\n"
+		"vle8.v v16, (%1)\n"
+		"add %1, %1, t4\n"
+		"vse8.v v16, (%0)\n"
+		"add %0, %0, t4\n"
+		"sub %2, %2, t4\n"
+		"blt %2, t6, 3f\n"
+		"prefetch.r 512(%1)\n"
+		"prefetch.r 576(%1)\n"
+		"3:\n"
+		"bnez %2, 1b"
+		: "+r"(dst), "+r"(src), "+r"(n)
+		:
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "t6", "memory"
+	 );
+}
+
+static __rte_always_inline void *
+_rte_memcpy(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	void *ret = dst;
+	size_t dstofss;
+	uint32_t bn;
+
+	if (n <= 384) {
+		if (n >= 256) {
+			memcpy_prefetch128_2(src, dst);
+			n -= 256;
+			_rte_mov128(dst, src);
+			_rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
+			src = (const uint8_t *)src + 256;
+			dst = (uint8_t *)dst + 256;
+		}
+		if (n >= 128) {
+			memcpy_prefetch128_1(src, dst);
+			n -= 128;
+			_rte_mov128(dst, src);
+			src = (const uint8_t *)src + 128;
+			dst = (uint8_t *)dst + 128;
+		}
+
+		if (n >= 64) {
+			memcpy_prefetch64_1(src, dst);
+			n -= 64;
+			_rte_mov64(dst, src);
+			src = (const uint8_t *)src + 64;
+			dst = (uint8_t *)dst + 64;
+		}
+
+		if (n > 32) {
+			_rte_mov32(dst, src);
+			_rte_mov32((uint8_t *)dst - 32 + n,
+					(const uint8_t *)src - 32 + n);
+			return ret;
+		}
+
+		if (n > 0) {
+			_rte_mov32((uint8_t *)dst - 32 + n,
+					(const uint8_t *)src - 32 + n);
+		}
+		return ret;
+	}
+
+	/**
+	 * Make store aligned when copy size exceeds 256 bytes.
+	 */
+	dstofss = (uintptr_t)dst & ALIGNMENT_MASK_64;
+	if (dstofss > 0) {
+		dstofss = 64 - dstofss;
+		n -= dstofss;
+		_rte_mov64(dst, src);
+		src = (const uint8_t *)src + dstofss;
+		dst = (uint8_t *)dst + dstofss;
+	}
+
+	/**
+	 * Copy 128-byte blocks
+	 */
+	if ((uintptr_t)src & ALIGNMENT_MASK_64)	{
+		bn = n - (n & ALIGNMENT_MASK_128);
+		_rte_mov128blocks(dst, src, bn);
+		n = n & ALIGNMENT_MASK_128;
+		src = (const uint8_t *)src + bn;
+		dst = (uint8_t *)dst + bn;
+		_rte_mov(dst, src, n);
+	} else
+		_rte_mov_aligned(dst, src, n);
+
+	return ret;
+}
+
+#endif
+
+/*----------------------api---------------------------------------------------*/
+static __rte_always_inline void *
+rte_memcpy(void *dst, const void *src, size_t n)
+{
+#if defined(RTE_RISCV_FEATURE_V)
+	if (likely((memcpy_alg == MEMCPY_RISCV) && (n >= 128) && (n < 2048)))
+		return _rte_memcpy((uint8_t *)dst, (const uint8_t *)src, n);
+#endif
+	return memcpy(dst, src, n);
+}
+
 static inline void
 rte_mov16(uint8_t *dst, const uint8_t *src)
 {
@@ -51,10 +329,31 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
 static inline void
 rte_mov256(uint8_t *dst, const uint8_t *src)
 {
-	memcpy(dst, src, 256);
+#if defined(RTE_RISCV_FEATURE_V)
+	if (likely(memcpy_alg == MEMCPY_RISCV))
+		_rte_mov256(dst, src);
+	else
+#endif
+		memcpy(dst, src, 256);
+}
+/*----------------------------------------------------------------------------*/
+#if defined(RTE_RISCV_FEATURE_V)
+static inline long
+riscv_vlenb(void)
+{
+	long vlenb;
+	asm ("csrr %0, 0xc22" : "=r"(vlenb));
+	return vlenb;
 }
 
-#define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
+RTE_INIT(rte_vect_memcpy_init)
+{
+	long vlenb = riscv_vlenb();
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_RISCV_ISA_V) && (vlenb >= RISCV_VLENB))
+		memcpy_alg = MEMCPY_RISCV;
+}
+#endif
+
 
 #ifdef __cplusplus
 }
-- 
2.21.0.windows.1

[-- Attachment #1.1.2: Type: text/html , Size: 26287 bytes --]

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* retest
  2025-10-21  6:56             ` [PATCH v4 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions Qiguo Chen
@ 2025-10-24  2:56               ` Qiguo Chen
  2025-10-24  3:04               ` retest Qiguo Chen
                                 ` (3 subsequent siblings)
  4 siblings, 0 replies; 21+ messages in thread
From: Qiguo Chen @ 2025-10-24  2:56 UTC (permalink / raw)
  To: dev


[-- Attachment #1.1: Type: multipart/alternative, Size: 1 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* retest
  2025-10-21  6:56             ` [PATCH v4 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions Qiguo Chen
  2025-10-24  2:56               ` retest Qiguo Chen
@ 2025-10-24  3:04               ` Qiguo Chen
  2025-10-24  3:12               ` retest Qiguo Chen
                                 ` (2 subsequent siblings)
  4 siblings, 0 replies; 21+ messages in thread
From: Qiguo Chen @ 2025-10-24  3:04 UTC (permalink / raw)
  To: dev


[-- Attachment #1.1.1: Type: text/plain, Size: 6 bytes --]

retest

[-- Attachment #1.1.2: Type: text/html , Size: 12 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* retest
  2025-10-21  6:56             ` [PATCH v4 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions Qiguo Chen
  2025-10-24  2:56               ` retest Qiguo Chen
  2025-10-24  3:04               ` retest Qiguo Chen
@ 2025-10-24  3:12               ` Qiguo Chen
  2025-10-24  5:04               ` retest Qiguo Chen
  2025-10-24  5:41               ` [PATCH v5 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
  4 siblings, 0 replies; 21+ messages in thread
From: Qiguo Chen @ 2025-10-24  3:12 UTC (permalink / raw)
  To: dev


[-- Attachment #1.1.1: Type: text/plain, Size: 6 bytes --]

retest

[-- Attachment #1.1.2: Type: text/html , Size: 12 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* retest
  2025-10-21  6:56             ` [PATCH v4 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions Qiguo Chen
                                 ` (2 preceding siblings ...)
  2025-10-24  3:12               ` retest Qiguo Chen
@ 2025-10-24  5:04               ` Qiguo Chen
  2025-10-24  5:41               ` [PATCH v5 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
  4 siblings, 0 replies; 21+ messages in thread
From: Qiguo Chen @ 2025-10-24  5:04 UTC (permalink / raw)
  To: dev


[-- Attachment #1.1.1: Type: text/plain, Size: 9 bytes --]

ci retest

[-- Attachment #1.1.2: Type: text/html , Size: 20 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v5 0/1] Optimization Summary for RISC-V rte_memcpy
  2025-10-21  6:56             ` [PATCH v4 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions Qiguo Chen
                                 ` (3 preceding siblings ...)
  2025-10-24  5:04               ` retest Qiguo Chen
@ 2025-10-24  5:41               ` Qiguo Chen
  2025-10-24  5:41                 ` [PATCH v5 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions Qiguo Chen
  4 siblings, 1 reply; 21+ messages in thread
From: Qiguo Chen @ 2025-10-24  5:41 UTC (permalink / raw)
  To: sunyuechi, stanislaw.kardach, stephen; +Cc: dev, bruce.richardson, Qiguo Chen


[-- Attachment #1.1.1: Type: text/plain, Size: 1276 bytes --]

Changes in v5:
     1)Re-sent to trigger CI only.

Changes in v4:
     1)rebase code only.

Changes in v3:
     1)Change title for patch2.
     2)Apply correct patch version for patch2.
       Thanks to Sunyuechi for the reminder.

Changes in v2:
     1)Modified some code and descriptions according to Sunyuechi's
     suggestions.
     2)Removed benchmark_report.txt to avoid warnings.

[PATCH v1]
I've implemented optimizations to rte_memcpy targeting RISC-V
architectures, achieving an average 10%~15% reduction in execution time
for data sizes from 129 to 1024 bytes (1025~1600 bytes gain little).
These enhancements draw inspiration from x86 implementations,
specifically focusing on:
1)Alignment Handling for Unaligned Scenarios
2)Vector Configuration Tuning
3)Strategic Prefetching with zicbop

- Patch 1: Cover letter
- Patch 2: Base implementation
- Patch 3: Benchmark report


Tested on SG2044 (VLEN=128)


Qiguo Chen (1):
  eal/riscv: optimize rte_memcpy with vector and zicbop extensions

 .mailmap                           |   1 +
 config/riscv/meson.build           |  14 ++
 lib/eal/riscv/include/rte_memcpy.h | 303 ++++++++++++++++++++++++++++-
 3 files changed, 316 insertions(+), 2 deletions(-)

-- 
2.21.0.windows.1

[-- Attachment #1.1.2: Type: text/html , Size: 2522 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v5 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions
  2025-10-24  5:41               ` [PATCH v5 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
@ 2025-10-24  5:41                 ` Qiguo Chen
  2025-10-24  7:27                   ` [PATCH v6 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
  2025-10-24 16:27                   ` [PATCH v5 " Stephen Hemminger
  0 siblings, 2 replies; 21+ messages in thread
From: Qiguo Chen @ 2025-10-24  5:41 UTC (permalink / raw)
  To: sunyuechi, stanislaw.kardach, stephen; +Cc: dev, bruce.richardson, Qiguo Chen


[-- Attachment #1.1.1: Type: text/plain, Size: 9649 bytes --]

This patch uses RISC-V vector instructions and zicbop prefetching to
optimize memory copies for 129~1600 byte ranges.

Signed-off-by: Qiguo Chen <chen.qiguo@zte.com.cn>
---
 .mailmap                           |   1 +
 config/riscv/meson.build           |  14 ++
 lib/eal/riscv/include/rte_memcpy.h | 303 ++++++++++++++++++++++++++++-
 3 files changed, 316 insertions(+), 2 deletions(-)

diff --git a/.mailmap b/.mailmap
index 3817bf7cdb..85f50bce87 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1283,6 +1283,7 @@ Qian Hao <qi_an_hao@126.com>
 Qian Xu <qian.q.xu@intel.com>
 Qiao Liu <qiao.liu@intel.com>
 Qi Fu <qi.fu@intel.com>
+Qiguo Chen <chen.qiguo@zte.com.cn>
 Qimai Xiao <qimaix.xiao@intel.com>
 Qiming Chen <chenqiming_huawei@163.com>
 Qiming Yang <qiming.yang@intel.com>
diff --git a/config/riscv/meson.build b/config/riscv/meson.build
index f3daea0c0e..abba474b5e 100644
--- a/config/riscv/meson.build
+++ b/config/riscv/meson.build
@@ -146,6 +146,20 @@ if (riscv_extension_macros and
     endif
 endif
 
+# detect extensions
+# Requires intrinsics available in GCC 14.1.0+ and Clang 18.1.0+
+if (riscv_extension_macros and
+    (cc.get_define('__riscv_zicbop', args: machine_args) != ''))
+  if ((cc.get_id() == 'gcc' and cc.version().version_compare('>=14.1.0'))
+      or (cc.get_id() == 'clang' and cc.version().version_compare('>=18.1.0')))
+      message('Compiling with the zicbop extension')
+      machine_args += ['-DRTE_RISCV_FEATURE_PREFETCH']
+  else
+    warning('Detected zicbop extension but cannot use because intrinsics are not available (present in GCC 14.1.0+ and Clang 18.1.0+)')
+  endif
+endif
+
+
 # apply flags
 foreach flag: dpdk_flags
     if flag.length() > 0
diff --git a/lib/eal/riscv/include/rte_memcpy.h b/lib/eal/riscv/include/rte_memcpy.h
index d8a942c5d2..fb817e5f43 100644
--- a/lib/eal/riscv/include/rte_memcpy.h
+++ b/lib/eal/riscv/include/rte_memcpy.h
@@ -11,6 +11,7 @@
 #include <string.h>
 
 #include "rte_common.h"
+#include <rte_branch_prediction.h>
 
 #include "generic/rte_memcpy.h"
 
@@ -18,6 +19,283 @@
 extern "C" {
 #endif
 
+
+#if defined(RTE_RISCV_FEATURE_V) && !(defined(RTE_RISCV_FEATURE_PREFETCH))
+#undef RTE_RISCV_FEATURE_V
+#endif
+
+
+#if defined(RTE_RISCV_FEATURE_V)
+
+#include "rte_cpuflags.h"
+
+#define RISCV_VLENB         16
+#define MEMCPY_GLIBC        (1U << 0)
+#define MEMCPY_RISCV        (1U << 1)
+#define ALIGNMENT_MASK_128  0x7F
+#define ALIGNMENT_MASK_64   0x3F
+
+static uint8_t memcpy_alg = MEMCPY_GLIBC;
+
+
+static __rte_always_inline void
+memcpy_prefetch64_1(const uint8_t *src, uint8_t *dst)
+{
+	__asm__ (
+		"prefetch.r 64(%0)\n"
+		"prefetch.w 64(%1)"
+		:: "r"(src), "r"(dst)
+	);
+}
+
+static __rte_always_inline void
+memcpy_prefetch128_1(const uint8_t *src, uint8_t *dst)
+{
+	__asm__ (
+		"prefetch.r 128(%0)\n"
+		"prefetch.w 128(%1)"
+		:: "r"(src), "r"(dst)
+	);
+}
+
+static __rte_always_inline void
+memcpy_prefetch128_2(const uint8_t *src, uint8_t *dst)
+{
+	__asm__ (
+		"prefetch.r 128(%0);"
+		"prefetch.w 128(%1);"
+		"prefetch.r 192(%0);"
+		"prefetch.w 192(%1)"
+		:: "r"(src), "r"(dst)
+	);
+}
+
+
+static __rte_always_inline void
+_rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+	uint32_t n = 32;
+	asm volatile (
+	     "vsetvli t1, %2, e8, m2, ta, ma\n"
+	     "vle8.v v2, (%1)\n"
+	     "vse8.v v2, (%0)"
+	     :: "r"(dst), "r"(src), "r"(n)
+	     : "v2", "v3", "t1", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+	uint32_t n = 64;
+	asm volatile (
+		"vsetvli t3, %2, e8, m4, ta, ma\n"
+		"vle8.v v8, (%1)\n"
+		"vse8.v v8, (%0)"
+		:: "r"(dst), "r"(src), "r"(n)
+		:  "v8", "v9", "v10", "v11", "t3", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+	uint32_t n = 128;
+	asm volatile (
+		"vsetvli t4, %2, e8, m8, ta, ma\n"
+		"vle8.v v16, (%1)\n"
+		"vse8.v v16, (%0)"
+		:: "r"(dst), "r"(src), "r"(n)
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+	memcpy_prefetch128_2(src, dst);
+	_rte_mov128(dst, src);
+	_rte_mov128(dst + 128, src + 128);
+}
+
+static __rte_always_inline void
+_rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	asm volatile (
+		"prefetch.r 64(%1)\n"
+		"prefetch.w 64(%0)\n"
+		"prefetch.r 128(%1)\n"
+		"prefetch.w 128(%0)\n"
+		"prefetch.r 192(%1)\n"
+		"prefetch.w 192(%0)\n"
+		"prefetch.r 256(%1)\n"
+		"prefetch.w 256(%0)\n"
+		"prefetch.r 320(%1)\n"
+		"prefetch.w 320(%0)\n"
+		"prefetch.r 384(%1)\n"
+		"prefetch.w 384(%0)\n"
+		"prefetch.r 448(%1)\n"
+		"prefetch.w 448(%0)\n"
+		"prefetch.r 512(%1)\n"
+		"li t6, 512\n"
+		"3:\n"
+		"li t5, 128;"
+		"vsetvli zero, t5, e8, m8, ta, ma\n"
+		"1:;"
+		"bgt %2, t6, 4f\n"
+		"j 2f\n"
+		"4:\n"
+		"prefetch.r 576(%1)\n"
+		"prefetch.r 640(%1)\n"
+		"2:\n"
+		"vle8.v   v16, (%1)\n"
+		"add      %1, %1, t5\n"
+		"vse8.v   v16, (%0)\n"
+		"add      %0, %0, t5\n"
+		"sub      %2, %2, t5\n"
+		"bnez     %2, 1b"
+		: "+r"(dst), "+r"(src), "+r"(n)
+		:
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t5", "t6", "memory"
+	);
+}
+
+static __rte_always_inline void
+_rte_mov(uint8_t *dst, const uint8_t *src, uint32_t n)
+{
+	asm volatile (
+		"1:\n"
+		"vsetvli t4, %2, e8, m8, ta, ma\n"
+		"vle8.v v16, (%1)\n"
+		"add %1, %1, t4\n"
+		"vse8.v v16, (%0)\n"
+		"add %0, %0, t4\n"
+		"sub %2, %2, t4\n"
+		"bnez %2, 1b"
+		: "+r"(dst), "+r"(src), "+r"(n)
+		:
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov_aligned(uint8_t *dst, const uint8_t *src, uint32_t n)
+{
+	asm volatile (
+		"prefetch.r 128(%1)\n"
+		"prefetch.r 192(%1)\n"
+		"prefetch.r 256(%1)\n"
+		"prefetch.r 320(%1)\n"
+		"prefetch.r 384(%1)\n"
+		"prefetch.r 448(%1)\n"
+		"prefetch.r 512(%1)\n"
+		"prefetch.r 576(%1)\n"
+		"li t6, 640\n"
+		"1:\n"
+		"vsetvli t4, %2, e8, m8, ta, ma\n"
+		"vle8.v v16, (%1)\n"
+		"add %1, %1, t4\n"
+		"vse8.v v16, (%0)\n"
+		"add %0, %0, t4\n"
+		"sub %2, %2, t4\n"
+		"blt %2, t6, 3f\n"
+		"prefetch.r 512(%1)\n"
+		"prefetch.r 576(%1)\n"
+		"3:\n"
+		"bnez %2, 1b"
+		: "+r"(dst), "+r"(src), "+r"(n)
+		:
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "t6", "memory"
+	 );
+}
+
+static __rte_always_inline void *
+_rte_memcpy(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	void *ret = dst;
+	size_t dstofss;
+	uint32_t bn;
+
+	if (n <= 384) {
+		if (n >= 256) {
+			memcpy_prefetch128_2(src, dst);
+			n -= 256;
+			_rte_mov128(dst, src);
+			_rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
+			src = (const uint8_t *)src + 256;
+			dst = (uint8_t *)dst + 256;
+		}
+		if (n >= 128) {
+			memcpy_prefetch128_1(src, dst);
+			n -= 128;
+			_rte_mov128(dst, src);
+			src = (const uint8_t *)src + 128;
+			dst = (uint8_t *)dst + 128;
+		}
+
+		if (n >= 64) {
+			memcpy_prefetch64_1(src, dst);
+			n -= 64;
+			_rte_mov64(dst, src);
+			src = (const uint8_t *)src + 64;
+			dst = (uint8_t *)dst + 64;
+		}
+
+		if (n > 32) {
+			_rte_mov32(dst, src);
+			_rte_mov32((uint8_t *)dst - 32 + n,
+					(const uint8_t *)src - 32 + n);
+			return ret;
+		}
+
+		if (n > 0) {
+			_rte_mov32((uint8_t *)dst - 32 + n,
+					(const uint8_t *)src - 32 + n);
+		}
+		return ret;
+	}
+
+	/**
+	 * Make store aligned when copy size exceeds 384 bytes.
+	 */
+	dstofss = (uintptr_t)dst & ALIGNMENT_MASK_64;
+	if (dstofss > 0) {
+		dstofss = 64 - dstofss;
+		n -= dstofss;
+		_rte_mov64(dst, src);
+		src = (const uint8_t *)src + dstofss;
+		dst = (uint8_t *)dst + dstofss;
+	}
+
+	/**
+	 * Copy 128-byte blocks
+	 */
+	if ((uintptr_t)src & ALIGNMENT_MASK_64)	{
+		bn = n - (n & ALIGNMENT_MASK_128);
+		_rte_mov128blocks(dst, src, bn);
+		n = n & ALIGNMENT_MASK_128;
+		src = (const uint8_t *)src + bn;
+		dst = (uint8_t *)dst + bn;
+		_rte_mov(dst, src, n);
+	} else
+		_rte_mov_aligned(dst, src, n);
+
+	return ret;
+}
+
+#endif
+
+/*----------------------api---------------------------------------------------*/
+static __rte_always_inline void *
+rte_memcpy(void *dst, const void *src, size_t n)
+{
+#if defined(RTE_RISCV_FEATURE_V)
+	if (likely((memcpy_alg == MEMCPY_RISCV) && (n >= 128) && (n < 2048)))
+		return _rte_memcpy((uint8_t *)dst, (const uint8_t *)src, n);
+#endif
+	return memcpy(dst, src, n);
+}
+
 static inline void
 rte_mov16(uint8_t *dst, const uint8_t *src)
 {
@@ -51,10 +329,31 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
 static inline void
 rte_mov256(uint8_t *dst, const uint8_t *src)
 {
-	memcpy(dst, src, 256);
+#if defined(RTE_RISCV_FEATURE_V)
+	if (likely(memcpy_alg == MEMCPY_RISCV))
+		_rte_mov256(dst, src);
+	else
+#endif
+		memcpy(dst, src, 256);
+}
+/*----------------------------------------------------------------------------*/
+#if defined(RTE_RISCV_FEATURE_V)
+static inline long
+riscv_vlenb(void)
+{
+	long vlenb;
+	asm ("csrr %0, 0xc22" : "=r"(vlenb));
+	return vlenb;
 }
 
-#define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
+RTE_INIT(rte_vect_memcpy_init)
+{
+	long vlenb = riscv_vlenb();
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_RISCV_ISA_V) && (vlenb >= RISCV_VLENB))
+		memcpy_alg = MEMCPY_RISCV;
+}
+#endif
+
 
 #ifdef __cplusplus
 }
-- 
2.21.0.windows.1

[-- Attachment #1.1.2: Type: text/html , Size: 26287 bytes --]

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH v6 0/1] Optimization Summary for RISC-V rte_memcpy
  2025-10-24  5:41                 ` [PATCH v5 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions Qiguo Chen
@ 2025-10-24  7:27                   ` Qiguo Chen
  2025-10-24  7:27                     ` [PATCH v6 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions Qiguo Chen
  2025-10-24 16:27                   ` [PATCH v5 " Stephen Hemminger
  1 sibling, 1 reply; 21+ messages in thread
From: Qiguo Chen @ 2025-10-24  7:27 UTC (permalink / raw)
  To: sunyuechi, stanislaw.kardach, stephen; +Cc: dev, bruce.richardson, Qiguo Chen


[-- Attachment #1.1.1: Type: text/plain, Size: 1325 bytes --]

Changes in v6:
     1)Resolved .mailmap conflict.

Changes in v5:
     1)Re-sent to trigger CI only.

Changes in v4:
     1)rebase code only.

Changes in v3:
     1)Change title for patch2.
     2)Apply correct patch version for patch2.
       Thanks to Sunyuechi for the reminder.

Changes in v2:
     1)Modified some code and descriptions according to Sunyuechi's
     suggestions.
     2)Removed benchmark_report.txt to avoid warnings.

[PATCH v1]
I've implemented optimizations to rte_memcpy targeting RISC-V
architectures, achieving an average 10%~15% reduction in execution time
for data sizes from 129 to 1024 bytes (1025~1600 bytes gain little).
These enhancements draw inspiration from x86 implementations,
specifically focusing on:
1)Alignment Handling for Unaligned Scenarios
2)Vector Configuration Tuning
3)Strategic Prefetching with zicbop

- Patch 1: Cover letter
- Patch 2: Base implementation
- Patch 3: Benchmark report


Tested on SG2044 (VLEN=128)

Qiguo Chen (1):
  eal/riscv: optimize rte_memcpy with vector and zicbop extensions

 .mailmap                           |   1 +
 config/riscv/meson.build           |  14 ++
 lib/eal/riscv/include/rte_memcpy.h | 303 ++++++++++++++++++++++++++++-
 3 files changed, 316 insertions(+), 2 deletions(-)

-- 
2.21.0.windows.1

[-- Attachment #1.1.2: Type: text/html , Size: 2624 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v6 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions
  2025-10-24  7:27                   ` [PATCH v6 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
@ 2025-10-24  7:27                     ` Qiguo Chen
  0 siblings, 0 replies; 21+ messages in thread
From: Qiguo Chen @ 2025-10-24  7:27 UTC (permalink / raw)
  To: sunyuechi, stanislaw.kardach, stephen; +Cc: dev, bruce.richardson, Qiguo Chen


[-- Attachment #1.1.1: Type: text/plain, Size: 9698 bytes --]

This patch uses RISC-V vector instructions and zicbop prefetching to
optimize memory copies for 129~1600 byte ranges.

Signed-off-by: Qiguo Chen <chen.qiguo@zte.com.cn>
---
 .mailmap                           |   1 +
 config/riscv/meson.build           |  14 ++
 lib/eal/riscv/include/rte_memcpy.h | 303 ++++++++++++++++++++++++++++-
 3 files changed, 316 insertions(+), 2 deletions(-)

diff --git a/.mailmap b/.mailmap
index e4d0590451..8fcdc518f9 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1291,6 +1291,7 @@ Qi Zhang <qi.z.zhang@intel.com>
 Qian Hao <qi_an_hao@126.com>
 Qian Xu <qian.q.xu@intel.com>
 Qiao Liu <qiao.liu@intel.com>
+Qiguo Chen <chen.qiguo@zte.com.cn>
 Qimai Xiao <qimaix.xiao@intel.com>
 Qiming Chen <chenqiming_huawei@163.com>
 Qiming Yang <qiming.yang@intel.com>
diff --git a/config/riscv/meson.build b/config/riscv/meson.build
index f3daea0c0e..abba474b5e 100644
--- a/config/riscv/meson.build
+++ b/config/riscv/meson.build
@@ -146,6 +146,20 @@ if (riscv_extension_macros and
     endif
 endif
 
+# detect extensions
+# Requires intrinsics available in GCC 14.1.0+ and Clang 18.1.0+
+if (riscv_extension_macros and
+    (cc.get_define('__riscv_zicbop', args: machine_args) != ''))
+  if ((cc.get_id() == 'gcc' and cc.version().version_compare('>=14.1.0'))
+      or (cc.get_id() == 'clang' and cc.version().version_compare('>=18.1.0')))
+      message('Compiling with the zicbop extension')
+      machine_args += ['-DRTE_RISCV_FEATURE_PREFETCH']
+  else
+    warning('Detected zicbop extension but cannot use because intrinsics are not available (present in GCC 14.1.0+ and Clang 18.1.0+)')
+  endif
+endif
+
+
 # apply flags
 foreach flag: dpdk_flags
     if flag.length() > 0
diff --git a/lib/eal/riscv/include/rte_memcpy.h b/lib/eal/riscv/include/rte_memcpy.h
index d8a942c5d2..1be3ad748a 100644
--- a/lib/eal/riscv/include/rte_memcpy.h
+++ b/lib/eal/riscv/include/rte_memcpy.h
@@ -11,13 +11,291 @@
 #include <string.h>
 
 #include "rte_common.h"
+#include <rte_branch_prediction.h>
 
 #include "generic/rte_memcpy.h"
 
+#if defined(RTE_RISCV_FEATURE_V) && !(defined(RTE_RISCV_FEATURE_PREFETCH))
+#undef RTE_RISCV_FEATURE_V
+#endif
+
+#if defined(RTE_RISCV_FEATURE_V)
+#include "rte_cpuflags.h"
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+#if defined(RTE_RISCV_FEATURE_V)
+
+#define RISCV_VLENB         16
+#define MEMCPY_GLIBC        (1U << 0)
+#define MEMCPY_RISCV        (1U << 1)
+#define ALIGNMENT_MASK_128  0x7F
+#define ALIGNMENT_MASK_64   0x3F
+
+static uint8_t memcpy_alg = MEMCPY_GLIBC;
+
+
+static __rte_always_inline void
+memcpy_prefetch64_1(const uint8_t *src, uint8_t *dst)
+{
+	__asm__ (
+		"prefetch.r 64(%0)\n"
+		"prefetch.w 64(%1)"
+		:: "r"(src), "r"(dst)
+	);
+}
+
+static __rte_always_inline void
+memcpy_prefetch128_1(const uint8_t *src, uint8_t *dst)
+{
+	__asm__ (
+		"prefetch.r 128(%0)\n"
+		"prefetch.w 128(%1)"
+		:: "r"(src), "r"(dst)
+	);
+}
+
+static __rte_always_inline void
+memcpy_prefetch128_2(const uint8_t *src, uint8_t *dst)
+{
+	__asm__ (
+		"prefetch.r 128(%0);"
+		"prefetch.w 128(%1);"
+		"prefetch.r 192(%0);"
+		"prefetch.w 192(%1)"
+		:: "r"(src), "r"(dst)
+	);
+}
+
+
+static __rte_always_inline void
+_rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+	uint32_t n = 32;
+	asm volatile (
+	     "vsetvli t1, %2, e8, m2, ta, ma\n"
+	     "vle8.v v2, (%1)\n"
+	     "vse8.v v2, (%0)"
+	     :: "r"(dst), "r"(src), "r"(n)
+	     : "v2", "v3", "t1", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+	uint32_t n = 64;
+	asm volatile (
+		"vsetvli t3, %2, e8, m4, ta, ma\n"
+		"vle8.v v8, (%1)\n"
+		"vse8.v v8, (%0)"
+		:: "r"(dst), "r"(src), "r"(n)
+		:  "v8", "v9", "v10", "v11", "t3", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+	uint32_t n = 128;
+	asm volatile (
+		"vsetvli t4, %2, e8, m8, ta, ma\n"
+		"vle8.v v16, (%1)\n"
+		"vse8.v v16, (%0)"
+		:: "r"(dst), "r"(src), "r"(n)
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+	memcpy_prefetch128_2(src, dst);
+	_rte_mov128(dst, src);
+	_rte_mov128(dst + 128, src + 128);
+}
+
+static __rte_always_inline void
+_rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	asm volatile (
+		"prefetch.r 64(%1)\n"
+		"prefetch.w 64(%0)\n"
+		"prefetch.r 128(%1)\n"
+		"prefetch.w 128(%0)\n"
+		"prefetch.r 192(%1)\n"
+		"prefetch.w 192(%0)\n"
+		"prefetch.r 256(%1)\n"
+		"prefetch.w 256(%0)\n"
+		"prefetch.r 320(%1)\n"
+		"prefetch.w 320(%0)\n"
+		"prefetch.r 384(%1)\n"
+		"prefetch.w 384(%0)\n"
+		"prefetch.r 448(%1)\n"
+		"prefetch.w 448(%0)\n"
+		"prefetch.r 512(%1)\n"
+		"li t6, 512\n"
+		"3:\n"
+		"li t5, 128;"
+		"vsetvli zero, t5, e8, m8, ta, ma\n"
+		"1:;"
+		"bgt %2, t6, 4f\n"
+		"j 2f\n"
+		"4:\n"
+		"prefetch.r 576(%1)\n"
+		"prefetch.r 640(%1)\n"
+		"2:\n"
+		"vle8.v   v16, (%1)\n"
+		"add      %1, %1, t5\n"
+		"vse8.v   v16, (%0)\n"
+		"add      %0, %0, t5\n"
+		"sub      %2, %2, t5\n"
+		"bnez     %2, 1b"
+		: "+r"(dst), "+r"(src), "+r"(n)
+		:
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t5", "t6", "memory"
+	);
+}
+
+static __rte_always_inline void
+_rte_mov(uint8_t *dst, const uint8_t *src, uint32_t n)
+{
+	asm volatile (
+		"1:\n"
+		"vsetvli t4, %2, e8, m8, ta, ma\n"
+		"vle8.v v16, (%1)\n"
+		"add %1, %1, t4\n"
+		"vse8.v v16, (%0)\n"
+		"add %0, %0, t4\n"
+		"sub %2, %2, t4\n"
+		"bnez %2, 1b"
+		: "+r"(dst), "+r"(src), "+r"(n)
+		:
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "memory"
+	 );
+}
+
+static __rte_always_inline void
+_rte_mov_aligned(uint8_t *dst, const uint8_t *src, uint32_t n)
+{
+	asm volatile (
+		"prefetch.r 128(%1)\n"
+		"prefetch.r 192(%1)\n"
+		"prefetch.r 256(%1)\n"
+		"prefetch.r 320(%1)\n"
+		"prefetch.r 384(%1)\n"
+		"prefetch.r 448(%1)\n"
+		"prefetch.r 512(%1)\n"
+		"prefetch.r 576(%1)\n"
+		"li t6, 640\n"
+		"1:\n"
+		"vsetvli t4, %2, e8, m8, ta, ma\n"
+		"vle8.v v16, (%1)\n"
+		"add %1, %1, t4\n"
+		"vse8.v v16, (%0)\n"
+		"add %0, %0, t4\n"
+		"sub %2, %2, t4\n"
+		"blt %2, t6, 3f\n"
+		"prefetch.r 512(%1)\n"
+		"prefetch.r 576(%1)\n"
+		"3:\n"
+		"bnez %2, 1b"
+		: "+r"(dst), "+r"(src), "+r"(n)
+		:
+		: "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "t4", "t6", "memory"
+	 );
+}
+
+static __rte_always_inline void *
+_rte_memcpy(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	void *ret = dst;
+	size_t dstofss;
+	uint32_t bn;
+
+	if (n <= 384) {
+		if (n >= 256) {
+			memcpy_prefetch128_2(src, dst);
+			n -= 256;
+			_rte_mov128(dst, src);
+			_rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
+			src = (const uint8_t *)src + 256;
+			dst = (uint8_t *)dst + 256;
+		}
+		if (n >= 128) {
+			memcpy_prefetch128_1(src, dst);
+			n -= 128;
+			_rte_mov128(dst, src);
+			src = (const uint8_t *)src + 128;
+			dst = (uint8_t *)dst + 128;
+		}
+
+		if (n >= 64) {
+			memcpy_prefetch64_1(src, dst);
+			n -= 64;
+			_rte_mov64(dst, src);
+			src = (const uint8_t *)src + 64;
+			dst = (uint8_t *)dst + 64;
+		}
+
+		if (n > 32) {
+			_rte_mov32(dst, src);
+			_rte_mov32((uint8_t *)dst - 32 + n,
+					(const uint8_t *)src - 32 + n);
+			return ret;
+		}
+
+		if (n > 0) {
+			_rte_mov32((uint8_t *)dst - 32 + n,
+					(const uint8_t *)src - 32 + n);
+		}
+		return ret;
+	}
+
+	/**
+	 * Make store aligned when copy size exceeds 384 bytes.
+	 */
+	dstofss = (uintptr_t)dst & ALIGNMENT_MASK_64;
+	if (dstofss > 0) {
+		dstofss = 64 - dstofss;
+		n -= dstofss;
+		_rte_mov64(dst, src);
+		src = (const uint8_t *)src + dstofss;
+		dst = (uint8_t *)dst + dstofss;
+	}
+
+	/**
+	 * Copy 128-byte blocks
+	 */
+	if ((uintptr_t)src & ALIGNMENT_MASK_64)	{
+		bn = n - (n & ALIGNMENT_MASK_128);
+		_rte_mov128blocks(dst, src, bn);
+		n = n & ALIGNMENT_MASK_128;
+		src = (const uint8_t *)src + bn;
+		dst = (uint8_t *)dst + bn;
+		_rte_mov(dst, src, n);
+	} else
+		_rte_mov_aligned(dst, src, n);
+
+	return ret;
+}
+
+#endif
+
+/*----------------------api---------------------------------------------------*/
+static __rte_always_inline void *
+rte_memcpy(void *dst, const void *src, size_t n)
+{
+#if defined(RTE_RISCV_FEATURE_V)
+	if (likely((memcpy_alg == MEMCPY_RISCV) && (n >= 128) && (n < 2048)))
+		return _rte_memcpy((uint8_t *)dst, (const uint8_t *)src, n);
+#endif
+	return memcpy(dst, src, n);
+}
+
 static inline void
 rte_mov16(uint8_t *dst, const uint8_t *src)
 {
@@ -51,10 +329,31 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
 static inline void
 rte_mov256(uint8_t *dst, const uint8_t *src)
 {
-	memcpy(dst, src, 256);
+#if defined(RTE_RISCV_FEATURE_V)
+	if (likely(memcpy_alg == MEMCPY_RISCV))
+		_rte_mov256(dst, src);
+	else
+#endif
+		memcpy(dst, src, 256);
+}
+/*----------------------------------------------------------------------------*/
+#if defined(RTE_RISCV_FEATURE_V)
+static inline long
+riscv_vlenb(void)
+{
+	long vlenb;
+	asm ("csrr %0, 0xc22" : "=r"(vlenb));
+	return vlenb;
 }
 
-#define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
+RTE_INIT(rte_vect_memcpy_init)
+{
+	long vlenb = riscv_vlenb();
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_RISCV_ISA_V) && (vlenb >= RISCV_VLENB))
+		memcpy_alg = MEMCPY_RISCV;
+}
+#endif
+
 
 #ifdef __cplusplus
 }
-- 
2.21.0.windows.1

[-- Attachment #1.1.2: Type: text/html , Size: 26336 bytes --]

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH v5 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions
  2025-10-24  5:41                 ` [PATCH v5 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions Qiguo Chen
  2025-10-24  7:27                   ` [PATCH v6 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
@ 2025-10-24 16:27                   ` Stephen Hemminger
  1 sibling, 0 replies; 21+ messages in thread
From: Stephen Hemminger @ 2025-10-24 16:27 UTC (permalink / raw)
  To: Qiguo Chen; +Cc: sunyuechi, stanislaw.kardach, dev, bruce.richardson

On Fri, 24 Oct 2025 13:41:28 +0800
Qiguo Chen <chen.qiguo@zte.com.cn> wrote:

> This patch uses RISC-V vector instructions and zicbop prefetching to
> optimize memory copies for 129~1600 byte ranges.
> 
> Signed-off-by: Qiguo Chen <chen.qiguo@zte.com.cn>
> ---

Is there any possibility of build environment being newer than
the run time? The Intel builds already have to deal with that problem.

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2025-10-24 16:27 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-10-16  9:09 [PATCH v1 0/2] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
2025-10-16  9:09 ` [PATCH v1 1/2] riscv support rte_memcpy in vector Qiguo Chen
2025-10-17  5:29   ` sunyuechi
2025-10-17 10:10     ` chen.qiguo
2025-10-17  9:36   ` [PATCH v2 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
2025-10-17  9:36     ` [PATCH v2 1/1] riscv support rte_memcpy in vector Qiguo Chen
2025-10-20  9:43       ` sunyuechi
2025-10-20 12:08       ` [PATCH v3 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
2025-10-20 12:08         ` [PATCH v3 1/1] lib/eal/riscv: optimize rte_memcpy with RISCV vector and zicbop extensions Qiguo Chen
2025-10-21  6:56           ` [PATCH v4 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
2025-10-21  6:56             ` [PATCH v4 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions Qiguo Chen
2025-10-24  2:56               ` retest Qiguo Chen
2025-10-24  3:04               ` retest Qiguo Chen
2025-10-24  3:12               ` retest Qiguo Chen
2025-10-24  5:04               ` retest Qiguo Chen
2025-10-24  5:41               ` [PATCH v5 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
2025-10-24  5:41                 ` [PATCH v5 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions Qiguo Chen
2025-10-24  7:27                   ` [PATCH v6 0/1] Optimization Summary for RISC-V rte_memcpy Qiguo Chen
2025-10-24  7:27                     ` [PATCH v6 1/1] eal/riscv: optimize rte_memcpy with vector and zicbop extensions Qiguo Chen
2025-10-24 16:27                   ` [PATCH v5 " Stephen Hemminger
2025-10-16  9:09 ` [PATCH v1 2/2] benchmark report for rte_memcpy Qiguo Chen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).