[PATCH] eal/x86: reduce memcpy code duplication

DPDK-dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH] eal/x86: reduce memcpy code duplication
@ 2025-11-20 11:45 Morten Brørup
  2025-11-21 10:35 ` [PATCH v2] eal/x86: optimize memcpy of small sizes Morten Brørup
                   ` (10 more replies)
  0 siblings, 11 replies; 55+ messages in thread
From: Morten Brørup @ 2025-11-20 11:45 UTC (permalink / raw)
  To: Bruce Richardson, Konstantin Ananyev, Vipin Varghese, dev
  Cc: Stephen Hemminger, Morten Brørup

The implementation for copying up to 64 bytes does not depend on address
alignment with the size of the CPU's vector registers, so the code
handling this was moved from the various implementations to the common
function.

This consolidation also has two minor benefits for the compiled output:
1. The memory footprint of the copy function is reduced.
Previously there were two instances of the compiled code to copy up to 64
bytes, one in the "aligned" code path, and one in the "generic" code path.
Now there is only one instance, in the "common" code path.
2. The performance cost of the alignment test and associated branching is
avoided when copying less than 65 bytes.

Furthermore, the missing implementation of rte_mov48() was added.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
 lib/eal/x86/include/rte_memcpy.h | 214 +++++++++++++++----------------
 1 file changed, 100 insertions(+), 114 deletions(-)

diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 46d34b8081..88820f50a9 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -132,6 +132,23 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
 #endif
 }
 
+/**
+ * Copy 48 bytes from one location to another,
+ * locations should not overlap.
+ */
+static __rte_always_inline void
+rte_mov48(uint8_t *dst, const uint8_t *src)
+{
+#if defined RTE_MEMCPY_AVX
+	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+	rte_mov32((uint8_t *)dst - 32 + 48, (const uint8_t *)src - 32 + 48);
+#else /* SSE implementation */
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+#endif
+}
+
 /**
  * Copy 64 bytes from one location to another,
  * locations should not overlap.
@@ -172,6 +189,38 @@ rte_mov256(uint8_t *dst, const uint8_t *src)
 	rte_mov128(dst + 1 * 128, src + 1 * 128);
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with 16 <= n <= 32, preferably 16 < n <= 32.
+ */
+static __rte_always_inline void *
+rte_mov16_to_32(void *dst, const void *src, size_t n)
+{
+	rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+	rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+	return dst;
+}
+
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with 32 <= n <= 64, preferably 32 < n <= 64.
+ */
+static __rte_always_inline void *
+rte_mov32_to_64(void *dst, const void *src, size_t n)
+{
+	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+#if (defined __AVX512F__ && defined RTE_MEMCPY_AVX512) || defined RTE_MEMCPY_AVX
+	rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
+#else /* SSE implementation */
+	if (n > 48)
+		rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
+	rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+#endif
+	return dst;
+}
+
 #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 
 /**
@@ -232,45 +281,21 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 	}
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
 {
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				  (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (__rte_constant(n) && n == 64) {
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				  (const uint8_t *)src - 32 + n);
-		return ret;
-	}
 	if (n <= 512) {
 		if (n >= 256) {
 			n -= 256;
@@ -381,41 +406,21 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 	}
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
 {
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 256 bytes
 	 */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				(const uint8_t *)src - 32 + n);
-		return ret;
-	}
 	if (n <= 256) {
 		if (n >= 128) {
 			n -= 128;
@@ -573,38 +578,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
     }                                                                 \
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
 {
 	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
 	void *ret = dst;
 	size_t dstofss;
 	size_t srcofs;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		if (n > 48)
-			rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
 	if (n <= 128) {
 		goto COPY_BLOCK_128_BACK15;
 	}
@@ -696,44 +685,16 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 
 #endif /* __AVX512F__ */
 
+/**
+ * Copy bytes from one vector register size aligned location to another,
+ * locations should not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_aligned(void *dst, const void *src, size_t n)
+rte_memcpy_aligned_more_than_64(void *dst, const void *src, size_t n)
 {
 	void *ret = dst;
 
-	/* Copy size < 16 bytes */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
-	/* Copy 16 <= size <= 32 bytes */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
-
-		return ret;
-	}
-
-	/* Copy 32 < size <= 64 bytes */
-	if (__rte_constant(n) && n == 64) {
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				(const uint8_t *)src - 32 + n);
-
-		return ret;
-	}
-
 	/* Copy 64 bytes blocks */
 	for (; n > 64; n -= 64) {
 		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
@@ -751,10 +712,35 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 static __rte_always_inline void *
 rte_memcpy(void *dst, const void *src, size_t n)
 {
+	if (__rte_constant(n) && n == 16) {
+		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		return dst;
+	}
+	if (__rte_constant(n) && n == 32) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		return dst;
+	}
+	if (__rte_constant(n) && n == 48) {
+		rte_mov48((uint8_t *)dst, (const uint8_t *)src);
+		return dst;
+	}
+	if (__rte_constant(n) && n == 64) {
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		return dst;
+	}
+
+	if (n < 16)
+		return rte_mov15_or_less(dst, src, n);
+	if (n <= 32)
+		return rte_mov16_to_32(dst, src, n);
+	if (n <= 64)
+		return rte_mov32_to_64(dst, src, n);
+
+	/* Implementation for size > 64 bytes depends on alignment with vector register size. */
 	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
-		return rte_memcpy_aligned(dst, src, n);
+		return rte_memcpy_aligned_more_than_64(dst, src, n);
 	else
-		return rte_memcpy_generic(dst, src, n);
+		return rte_memcpy_generic_more_than_64(dst, src, n);
 }
 
 #undef ALIGNMENT_MASK
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 55+ messages in thread

* [PATCH v2] eal/x86: optimize memcpy of small sizes
  2025-11-20 11:45 [PATCH] eal/x86: reduce memcpy code duplication Morten Brørup
@ 2025-11-21 10:35 ` Morten Brørup
  2025-11-21 16:57   ` Stephen Hemminger
  2025-11-21 10:40 ` Morten Brørup
                   ` (9 subsequent siblings)
  10 siblings, 1 reply; 55+ messages in thread
From: Morten Brørup @ 2025-11-21 10:35 UTC (permalink / raw)
  To: dev, Bruce Richardson, Konstantin Ananyev, Vipin Varghese
  Cc: Stephen Hemminger, Morten Brørup

The implementation for copying up to 64 bytes does not depend on address
alignment with the size of the CPU's vector registers, so the code
handling this was moved from the various implementations to the common
function.

Furthermore, the function for copying less than 16 bytes was replaced with
a smarter implementation using fewer branches and potentially fewer
load/store operations.
This function was also extended to handle copying of up to 16 bytes,
instead of up to 15 bytes. This small extension reduces the code path for
copying two pointers.

These changes provide two benefits:
1. The memory footprint of the copy function is reduced.
Previously there were two instances of the compiled code to copy up to 64
bytes, one in the "aligned" code path, and one in the "generic" code path.
Now there is only one instance, in the "common" code path.
2. The performance for copying up to 64 bytes is improved.
The memcpy performance test shows cache-to-cache copying of up to 32 bytes
now typically only takes 2 cycles (4 cycles for 64 bytes) versus
ca. 6.5 cycles before this patch.

And finally, the missing implementation of rte_mov48() was added.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
v2:
* Updated patch title to reflect that the performance is improved.
* Use the design pattern of two overlapping stores for small copies too.
* Expanded first branch from size < 16 to size <= 16.
* Handle more build time constant copy sizes.
---
 lib/eal/x86/include/rte_memcpy.h | 341 ++++++++++++++++---------------
 1 file changed, 181 insertions(+), 160 deletions(-)

diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 46d34b8081..665902ff62 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -55,52 +55,6 @@ extern "C" {
 static __rte_always_inline void *
 rte_memcpy(void *dst, const void *src, size_t n);
 
-/**
- * Copy bytes from one location to another,
- * locations should not overlap.
- * Use with n <= 15.
- */
-static __rte_always_inline void *
-rte_mov15_or_less(void *dst, const void *src, size_t n)
-{
-	/**
-	 * Use the following structs to avoid violating C standard
-	 * alignment requirements and to avoid strict aliasing bugs
-	 */
-	struct __rte_packed_begin rte_uint64_alias {
-		uint64_t val;
-	} __rte_packed_end __rte_may_alias;
-	struct __rte_packed_begin rte_uint32_alias {
-		uint32_t val;
-	} __rte_packed_end __rte_may_alias;
-	struct __rte_packed_begin rte_uint16_alias {
-		uint16_t val;
-	} __rte_packed_end __rte_may_alias;
-
-	void *ret = dst;
-	if (n & 8) {
-		((struct rte_uint64_alias *)dst)->val =
-			((const struct rte_uint64_alias *)src)->val;
-		src = (const uint64_t *)src + 1;
-		dst = (uint64_t *)dst + 1;
-	}
-	if (n & 4) {
-		((struct rte_uint32_alias *)dst)->val =
-			((const struct rte_uint32_alias *)src)->val;
-		src = (const uint32_t *)src + 1;
-		dst = (uint32_t *)dst + 1;
-	}
-	if (n & 2) {
-		((struct rte_uint16_alias *)dst)->val =
-			((const struct rte_uint16_alias *)src)->val;
-		src = (const uint16_t *)src + 1;
-		dst = (uint16_t *)dst + 1;
-	}
-	if (n & 1)
-		*(uint8_t *)dst = *(const uint8_t *)src;
-	return ret;
-}
-
 /**
  * Copy 16 bytes from one location to another,
  * locations should not overlap.
@@ -132,6 +86,23 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
 #endif
 }
 
+/**
+ * Copy 48 bytes from one location to another,
+ * locations should not overlap.
+ */
+static __rte_always_inline void
+rte_mov48(uint8_t *dst, const uint8_t *src)
+{
+#if defined RTE_MEMCPY_AVX
+	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+	rte_mov32((uint8_t *)dst - 32 + 48, (const uint8_t *)src - 32 + 48);
+#else /* SSE implementation */
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+#endif
+}
+
 /**
  * Copy 64 bytes from one location to another,
  * locations should not overlap.
@@ -172,6 +143,136 @@ rte_mov256(uint8_t *dst, const uint8_t *src)
 	rte_mov128(dst + 1 * 128, src + 1 * 128);
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n <= 16.
+ *
+ * Note: Copying uninitialized memory is perfectly acceptable.
+ * Using e.g. memcpy(dst, src, 8) instead of
+ * *(unaligned_uint64_t *)dst = *(const unaligned_uint64_t *)src
+ * avoids compiler warnings that the source data may be uninitialized
+ * [-Wmaybe-uninitialized].
+ *
+ * Note: Using "n & X" generates 3-byte "test" instructions,
+ * instead of "n >= X", which would generate 4-byte "cmp" instructions.
+ */
+static __rte_always_inline void *
+rte_mov16_or_less(void *dst, const void *src, size_t n)
+{
+	/* Faster way when size is known at build time. */
+	if (__rte_constant(n)) {
+		if (n == 2)
+			return memcpy(dst, src, 2);
+		if (n == 4)
+			return memcpy(dst, src, 4);
+		if (n == 6) /* 4 + 2 */
+			return memcpy(dst, src, 6);
+		if (n == 8)
+			return memcpy(dst, src, 8);
+		if (n == 10) /* 8 + 2 */
+			return memcpy(dst, src, 10);
+		if (n == 12) /* 8 + 4 */
+			return memcpy(dst, src, 12);
+		if (n == 16) {
+			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+	}
+
+	if (n & 0x18) { /* n >= 8 */
+		/* copy 8 ~ 16 bytes */
+		memcpy(dst, src, 8);
+		memcpy((uint8_t *)dst - 8 + n, (const uint8_t *)src - 8 + n, 8);
+	} else if (n & 0x4) {
+		/* copy 4 ~ 7 bytes */
+		memcpy(dst, src, 4);
+		memcpy((uint8_t *)dst - 4 + n, (const uint8_t *)src - 4 + n, 4);
+	} else if (n & 0x2) {
+		/* copy 2 ~ 3 bytes */
+		memcpy(dst, src, 2);
+		memcpy((uint8_t *)dst - 2 + n, (const uint8_t *)src - 2 + n, 2);
+	} else if (n & 0x1) {
+		/* copy 1 byte */
+		memcpy(dst, src, 1);
+	}
+	return dst;
+}
+
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with 17 (or 16) <= n <= 32.
+ */
+static __rte_always_inline void *
+rte_mov17_to_32(void *dst, const void *src, size_t n)
+{
+	/* Faster way when size is known at build time. */
+	if (__rte_constant(n)) {
+		if (n == 16) {
+			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+		if (n == 18) /* 16 + 2 */
+			return memcpy(dst, src, 18);
+		if (n == 20) /* 16 + 4 */
+			return memcpy(dst, src, 20);
+		if (n == 24) /* 16 + 8 */
+			return memcpy(dst, src, 24);
+		if (n == 32) {
+			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+	}
+
+	/* copy 17 (or 16) ~ 32 bytes */
+	rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+	rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+	return dst;
+}
+
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with 33 (or 32) <= n <= 64.
+ */
+static __rte_always_inline void *
+rte_mov33_to_64(void *dst, const void *src, size_t n)
+{
+	/* Faster way when size is known at build time. */
+	if (__rte_constant(n)) {
+		if (n == 32) {
+			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+		if (n == 36) /* 32 + 4 */
+			return memcpy(dst, src, 36);
+		if (n == 40) /* 32 + 8 */
+			return memcpy(dst, src, 40);
+		if (n == 48) {
+			rte_mov48((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+		if (n == 64) {
+			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+	}
+
+	/* copy 33 (or 32) ~ 64 bytes */
+#if (defined __AVX512F__ && defined RTE_MEMCPY_AVX512) || defined RTE_MEMCPY_AVX
+	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+	rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
+#else /* SSE implementation */
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	if (n > 48)
+		rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+	rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+#endif
+	return dst;
+}
+
 #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 
 /**
@@ -232,45 +333,21 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 	}
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
 {
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				  (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (__rte_constant(n) && n == 64) {
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				  (const uint8_t *)src - 32 + n);
-		return ret;
-	}
 	if (n <= 512) {
 		if (n >= 256) {
 			n -= 256;
@@ -381,41 +458,21 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 	}
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
 {
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 256 bytes
 	 */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				(const uint8_t *)src - 32 + n);
-		return ret;
-	}
 	if (n <= 256) {
 		if (n >= 128) {
 			n -= 128;
@@ -573,38 +630,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
     }                                                                 \
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
 {
 	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
 	void *ret = dst;
 	size_t dstofss;
 	size_t srcofs;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		if (n > 48)
-			rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
 	if (n <= 128) {
 		goto COPY_BLOCK_128_BACK15;
 	}
@@ -696,44 +737,16 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 
 #endif /* __AVX512F__ */
 
+/**
+ * Copy bytes from one vector register size aligned location to another,
+ * locations should not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_aligned(void *dst, const void *src, size_t n)
+rte_memcpy_aligned_more_than_64(void *dst, const void *src, size_t n)
 {
 	void *ret = dst;
 
-	/* Copy size < 16 bytes */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
-	/* Copy 16 <= size <= 32 bytes */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
-
-		return ret;
-	}
-
-	/* Copy 32 < size <= 64 bytes */
-	if (__rte_constant(n) && n == 64) {
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				(const uint8_t *)src - 32 + n);
-
-		return ret;
-	}
-
 	/* Copy 64 bytes blocks */
 	for (; n > 64; n -= 64) {
 		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
@@ -751,10 +764,18 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 static __rte_always_inline void *
 rte_memcpy(void *dst, const void *src, size_t n)
 {
+	if (n <= 16)
+		return rte_mov16_or_less(dst, src, n);
+	if (n <= 32)
+		return rte_mov17_to_32(dst, src, n);
+	if (n <= 64)
+		return rte_mov33_to_64(dst, src, n);
+
+	/* Implementation for size > 64 bytes depends on alignment with vector register size. */
 	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
-		return rte_memcpy_aligned(dst, src, n);
+		return rte_memcpy_aligned_more_than_64(dst, src, n);
 	else
-		return rte_memcpy_generic(dst, src, n);
+		return rte_memcpy_generic_more_than_64(dst, src, n);
 }
 
 #undef ALIGNMENT_MASK
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 55+ messages in thread

* [PATCH v2] eal/x86: optimize memcpy of small sizes
  2025-11-20 11:45 [PATCH] eal/x86: reduce memcpy code duplication Morten Brørup
  2025-11-21 10:35 ` [PATCH v2] eal/x86: optimize memcpy of small sizes Morten Brørup
@ 2025-11-21 10:40 ` Morten Brørup
  2025-11-21 10:40 ` [PATCH v3] " Morten Brørup
                   ` (8 subsequent siblings)
  10 siblings, 0 replies; 55+ messages in thread
From: Morten Brørup @ 2025-11-21 10:40 UTC (permalink / raw)
  To: dev, Bruce Richardson, Konstantin Ananyev, Vipin Varghese
  Cc: Stephen Hemminger, Morten Brørup

The implementation for copying up to 64 bytes does not depend on address
alignment with the size of the CPU's vector registers, so the code
handling this was moved from the various implementations to the common
function.

Furthermore, the function for copying less than 16 bytes was replaced with
a smarter implementation using fewer branches and potentially fewer
load/store operations.
This function was also extended to handle copying of up to 16 bytes,
instead of up to 15 bytes. This small extension reduces the code path for
copying two pointers.

These changes provide two benefits:
1. The memory footprint of the copy function is reduced.
Previously there were two instances of the compiled code to copy up to 64
bytes, one in the "aligned" code path, and one in the "generic" code path.
Now there is only one instance, in the "common" code path.
2. The performance for copying up to 64 bytes is improved.
The memcpy performance test shows cache-to-cache copying of up to 32 bytes
now typically only takes 2 cycles (4 cycles for 64 bytes) versus
ca. 6.5 cycles before this patch.

And finally, the missing implementation of rte_mov48() was added.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
v3:
* Fixed typo in comment.
v2:
* Updated patch title to reflect that the performance is improved.
* Use the design pattern of two overlapping stores for small copies too.
* Expanded first branch from size < 16 to size <= 16.
* Handle more build time constant copy sizes.
---
 lib/eal/x86/include/rte_memcpy.h | 341 ++++++++++++++++---------------
 1 file changed, 181 insertions(+), 160 deletions(-)

diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 46d34b8081..665902ff62 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -55,52 +55,6 @@ extern "C" {
 static __rte_always_inline void *
 rte_memcpy(void *dst, const void *src, size_t n);
 
-/**
- * Copy bytes from one location to another,
- * locations should not overlap.
- * Use with n <= 15.
- */
-static __rte_always_inline void *
-rte_mov15_or_less(void *dst, const void *src, size_t n)
-{
-	/**
-	 * Use the following structs to avoid violating C standard
-	 * alignment requirements and to avoid strict aliasing bugs
-	 */
-	struct __rte_packed_begin rte_uint64_alias {
-		uint64_t val;
-	} __rte_packed_end __rte_may_alias;
-	struct __rte_packed_begin rte_uint32_alias {
-		uint32_t val;
-	} __rte_packed_end __rte_may_alias;
-	struct __rte_packed_begin rte_uint16_alias {
-		uint16_t val;
-	} __rte_packed_end __rte_may_alias;
-
-	void *ret = dst;
-	if (n & 8) {
-		((struct rte_uint64_alias *)dst)->val =
-			((const struct rte_uint64_alias *)src)->val;
-		src = (const uint64_t *)src + 1;
-		dst = (uint64_t *)dst + 1;
-	}
-	if (n & 4) {
-		((struct rte_uint32_alias *)dst)->val =
-			((const struct rte_uint32_alias *)src)->val;
-		src = (const uint32_t *)src + 1;
-		dst = (uint32_t *)dst + 1;
-	}
-	if (n & 2) {
-		((struct rte_uint16_alias *)dst)->val =
-			((const struct rte_uint16_alias *)src)->val;
-		src = (const uint16_t *)src + 1;
-		dst = (uint16_t *)dst + 1;
-	}
-	if (n & 1)
-		*(uint8_t *)dst = *(const uint8_t *)src;
-	return ret;
-}
-
 /**
  * Copy 16 bytes from one location to another,
  * locations should not overlap.
@@ -132,6 +86,23 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
 #endif
 }
 
+/**
+ * Copy 48 bytes from one location to another,
+ * locations should not overlap.
+ */
+static __rte_always_inline void
+rte_mov48(uint8_t *dst, const uint8_t *src)
+{
+#if defined RTE_MEMCPY_AVX
+	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+	rte_mov32((uint8_t *)dst - 32 + 48, (const uint8_t *)src - 32 + 48);
+#else /* SSE implementation */
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+#endif
+}
+
 /**
  * Copy 64 bytes from one location to another,
  * locations should not overlap.
@@ -172,6 +143,136 @@ rte_mov256(uint8_t *dst, const uint8_t *src)
 	rte_mov128(dst + 1 * 128, src + 1 * 128);
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n <= 16.
+ *
+ * Note: Copying uninitialized memory is perfectly acceptable.
+ * Using e.g. memcpy(dst, src, 8) instead of
+ * *(unaligned_uint64_t *)dst = *(const unaligned_uint64_t *)src
+ * avoids compiler warnings that the source data may be uninitialized
+ * [-Wmaybe-uninitialized].
+ *
+ * Note: Using "n & X" generates 3-byte "test" instructions,
+ * instead of "n >= X", which would generate 4-byte "cmp" instructions.
+ */
+static __rte_always_inline void *
+rte_mov16_or_less(void *dst, const void *src, size_t n)
+{
+	/* Faster way when size is known at build time. */
+	if (__rte_constant(n)) {
+		if (n == 2)
+			return memcpy(dst, src, 2);
+		if (n == 4)
+			return memcpy(dst, src, 4);
+		if (n == 6) /* 4 + 2 */
+			return memcpy(dst, src, 6);
+		if (n == 8)
+			return memcpy(dst, src, 8);
+		if (n == 10) /* 8 + 2 */
+			return memcpy(dst, src, 10);
+		if (n == 12) /* 8 + 4 */
+			return memcpy(dst, src, 12);
+		if (n == 16) {
+			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+	}
+
+	if (n & 0x18) { /* n >= 8 */
+		/* copy 8 ~ 16 bytes */
+		memcpy(dst, src, 8);
+		memcpy((uint8_t *)dst - 8 + n, (const uint8_t *)src - 8 + n, 8);
+	} else if (n & 0x4) {
+		/* copy 4 ~ 7 bytes */
+		memcpy(dst, src, 4);
+		memcpy((uint8_t *)dst - 4 + n, (const uint8_t *)src - 4 + n, 4);
+	} else if (n & 0x2) {
+		/* copy 2 ~ 3 bytes */
+		memcpy(dst, src, 2);
+		memcpy((uint8_t *)dst - 2 + n, (const uint8_t *)src - 2 + n, 2);
+	} else if (n & 0x1) {
+		/* copy 1 byte */
+		memcpy(dst, src, 1);
+	}
+	return dst;
+}
+
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with 17 (or 16) <= n <= 32.
+ */
+static __rte_always_inline void *
+rte_mov17_to_32(void *dst, const void *src, size_t n)
+{
+	/* Faster way when size is known at build time. */
+	if (__rte_constant(n)) {
+		if (n == 16) {
+			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+		if (n == 18) /* 16 + 2 */
+			return memcpy(dst, src, 18);
+		if (n == 20) /* 16 + 4 */
+			return memcpy(dst, src, 20);
+		if (n == 24) /* 16 + 8 */
+			return memcpy(dst, src, 24);
+		if (n == 32) {
+			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+	}
+
+	/* copy 17 (or 16) ~ 32 bytes */
+	rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+	rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+	return dst;
+}
+
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with 33 (or 32) <= n <= 64.
+ */
+static __rte_always_inline void *
+rte_mov33_to_64(void *dst, const void *src, size_t n)
+{
+	/* Faster way when size is known at build time. */
+	if (__rte_constant(n)) {
+		if (n == 32) {
+			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+		if (n == 36) /* 32 + 4 */
+			return memcpy(dst, src, 36);
+		if (n == 40) /* 32 + 8 */
+			return memcpy(dst, src, 40);
+		if (n == 48) {
+			rte_mov48((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+		if (n == 64) {
+			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+	}
+
+	/* copy 33 (or 32) ~ 64 bytes */
+#if (defined __AVX512F__ && defined RTE_MEMCPY_AVX512) || defined RTE_MEMCPY_AVX
+	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+	rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
+#else /* SSE implementation */
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	if (n > 48)
+		rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+	rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+#endif
+	return dst;
+}
+
 #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 
 /**
@@ -232,45 +333,21 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 	}
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
 {
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				  (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (__rte_constant(n) && n == 64) {
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				  (const uint8_t *)src - 32 + n);
-		return ret;
-	}
 	if (n <= 512) {
 		if (n >= 256) {
 			n -= 256;
@@ -381,41 +458,21 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 	}
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
 {
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 256 bytes
 	 */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				(const uint8_t *)src - 32 + n);
-		return ret;
-	}
 	if (n <= 256) {
 		if (n >= 128) {
 			n -= 128;
@@ -573,38 +630,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
     }                                                                 \
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
 {
 	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
 	void *ret = dst;
 	size_t dstofss;
 	size_t srcofs;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		if (n > 48)
-			rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
 	if (n <= 128) {
 		goto COPY_BLOCK_128_BACK15;
 	}
@@ -696,44 +737,16 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 
 #endif /* __AVX512F__ */
 
+/**
+ * Copy bytes from one vector register size aligned location to another,
+ * locations should not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_aligned(void *dst, const void *src, size_t n)
+rte_memcpy_aligned_more_than_64(void *dst, const void *src, size_t n)
 {
 	void *ret = dst;
 
-	/* Copy size < 16 bytes */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
-	/* Copy 16 <= size <= 32 bytes */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
-
-		return ret;
-	}
-
-	/* Copy 32 < size <= 64 bytes */
-	if (__rte_constant(n) && n == 64) {
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				(const uint8_t *)src - 32 + n);
-
-		return ret;
-	}
-
 	/* Copy 64 bytes blocks */
 	for (; n > 64; n -= 64) {
 		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
@@ -751,10 +764,18 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 static __rte_always_inline void *
 rte_memcpy(void *dst, const void *src, size_t n)
 {
+	if (n <= 16)
+		return rte_mov16_or_less(dst, src, n);
+	if (n <= 32)
+		return rte_mov17_to_32(dst, src, n);
+	if (n <= 64)
+		return rte_mov33_to_64(dst, src, n);
+
+	/* Implementation for size > 64 bytes depends on alignment with vector register size. */
 	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
-		return rte_memcpy_aligned(dst, src, n);
+		return rte_memcpy_aligned_more_than_64(dst, src, n);
 	else
-		return rte_memcpy_generic(dst, src, n);
+		return rte_memcpy_generic_more_than_64(dst, src, n);
 }
 
 #undef ALIGNMENT_MASK
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 55+ messages in thread

* [PATCH v3] eal/x86: optimize memcpy of small sizes
  2025-11-20 11:45 [PATCH] eal/x86: reduce memcpy code duplication Morten Brørup
  2025-11-21 10:35 ` [PATCH v2] eal/x86: optimize memcpy of small sizes Morten Brørup
  2025-11-21 10:40 ` Morten Brørup
@ 2025-11-21 10:40 ` Morten Brørup
  2025-11-24 13:36   ` Morten Brørup
  2025-11-28 14:02   ` Konstantin Ananyev
  2025-11-24 20:31 ` [PATCH v4] " Morten Brørup
                   ` (7 subsequent siblings)
  10 siblings, 2 replies; 55+ messages in thread
From: Morten Brørup @ 2025-11-21 10:40 UTC (permalink / raw)
  To: dev, Bruce Richardson, Konstantin Ananyev, Vipin Varghese
  Cc: Stephen Hemminger, Morten Brørup

The implementation for copying up to 64 bytes does not depend on address
alignment with the size of the CPU's vector registers, so the code
handling this was moved from the various implementations to the common
function.

Furthermore, the function for copying less than 16 bytes was replaced with
a smarter implementation using fewer branches and potentially fewer
load/store operations.
This function was also extended to handle copying of up to 16 bytes,
instead of up to 15 bytes. This small extension reduces the code path for
copying two pointers.

These changes provide two benefits:
1. The memory footprint of the copy function is reduced.
Previously there were two instances of the compiled code to copy up to 64
bytes, one in the "aligned" code path, and one in the "generic" code path.
Now there is only one instance, in the "common" code path.
2. The performance for copying up to 64 bytes is improved.
The memcpy performance test shows cache-to-cache copying of up to 32 bytes
now typically only takes 2 cycles (4 cycles for 64 bytes) versus
ca. 6.5 cycles before this patch.

And finally, the missing implementation of rte_mov48() was added.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
v3:
* Fixed typo in comment.
v2:
* Updated patch title to reflect that the performance is improved.
* Use the design pattern of two overlapping stores for small copies too.
* Expanded first branch from size < 16 to size <= 16.
* Handle more build time constant copy sizes.
---
 lib/eal/x86/include/rte_memcpy.h | 341 ++++++++++++++++---------------
 1 file changed, 181 insertions(+), 160 deletions(-)

diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 46d34b8081..665902ff62 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -55,52 +55,6 @@ extern "C" {
 static __rte_always_inline void *
 rte_memcpy(void *dst, const void *src, size_t n);
 
-/**
- * Copy bytes from one location to another,
- * locations should not overlap.
- * Use with n <= 15.
- */
-static __rte_always_inline void *
-rte_mov15_or_less(void *dst, const void *src, size_t n)
-{
-	/**
-	 * Use the following structs to avoid violating C standard
-	 * alignment requirements and to avoid strict aliasing bugs
-	 */
-	struct __rte_packed_begin rte_uint64_alias {
-		uint64_t val;
-	} __rte_packed_end __rte_may_alias;
-	struct __rte_packed_begin rte_uint32_alias {
-		uint32_t val;
-	} __rte_packed_end __rte_may_alias;
-	struct __rte_packed_begin rte_uint16_alias {
-		uint16_t val;
-	} __rte_packed_end __rte_may_alias;
-
-	void *ret = dst;
-	if (n & 8) {
-		((struct rte_uint64_alias *)dst)->val =
-			((const struct rte_uint64_alias *)src)->val;
-		src = (const uint64_t *)src + 1;
-		dst = (uint64_t *)dst + 1;
-	}
-	if (n & 4) {
-		((struct rte_uint32_alias *)dst)->val =
-			((const struct rte_uint32_alias *)src)->val;
-		src = (const uint32_t *)src + 1;
-		dst = (uint32_t *)dst + 1;
-	}
-	if (n & 2) {
-		((struct rte_uint16_alias *)dst)->val =
-			((const struct rte_uint16_alias *)src)->val;
-		src = (const uint16_t *)src + 1;
-		dst = (uint16_t *)dst + 1;
-	}
-	if (n & 1)
-		*(uint8_t *)dst = *(const uint8_t *)src;
-	return ret;
-}
-
 /**
  * Copy 16 bytes from one location to another,
  * locations should not overlap.
@@ -132,6 +86,23 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
 #endif
 }
 
+/**
+ * Copy 48 bytes from one location to another,
+ * locations should not overlap.
+ */
+static __rte_always_inline void
+rte_mov48(uint8_t *dst, const uint8_t *src)
+{
+#if defined RTE_MEMCPY_AVX
+	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+	rte_mov32((uint8_t *)dst - 32 + 48, (const uint8_t *)src - 32 + 48);
+#else /* SSE implementation */
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+#endif
+}
+
 /**
  * Copy 64 bytes from one location to another,
  * locations should not overlap.
@@ -172,6 +143,136 @@ rte_mov256(uint8_t *dst, const uint8_t *src)
 	rte_mov128(dst + 1 * 128, src + 1 * 128);
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n <= 16.
+ *
+ * Note: Copying uninitialized memory is perfectly acceptable.
+ * Using e.g. memcpy(dst, src, 8) instead of
+ * *(unaligned_uint64_t *)dst = *(const unaligned_uint64_t *)src
+ * avoids compiler warnings that the source data may be uninitialized
+ * [-Wmaybe-uninitialized].
+ *
+ * Note: Using "n & X" generates 3-byte "test" instructions,
+ * instead of "n >= X", which would generate 4-byte "cmp" instructions.
+ */
+static __rte_always_inline void *
+rte_mov16_or_less(void *dst, const void *src, size_t n)
+{
+	/* Faster way when size is known at build time. */
+	if (__rte_constant(n)) {
+		if (n == 2)
+			return memcpy(dst, src, 2);
+		if (n == 4)
+			return memcpy(dst, src, 4);
+		if (n == 6) /* 4 + 2 */
+			return memcpy(dst, src, 6);
+		if (n == 8)
+			return memcpy(dst, src, 8);
+		if (n == 10) /* 8 + 2 */
+			return memcpy(dst, src, 10);
+		if (n == 12) /* 8 + 4 */
+			return memcpy(dst, src, 12);
+		if (n == 16) {
+			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+	}
+
+	if (n & 0x18) { /* n >= 8 */
+		/* copy 8 ~ 16 bytes */
+		memcpy(dst, src, 8);
+		memcpy((uint8_t *)dst - 8 + n, (const uint8_t *)src - 8 + n, 8);
+	} else if (n & 0x4) {
+		/* copy 4 ~ 7 bytes */
+		memcpy(dst, src, 4);
+		memcpy((uint8_t *)dst - 4 + n, (const uint8_t *)src - 4 + n, 4);
+	} else if (n & 0x2) {
+		/* copy 2 ~ 3 bytes */
+		memcpy(dst, src, 2);
+		memcpy((uint8_t *)dst - 2 + n, (const uint8_t *)src - 2 + n, 2);
+	} else if (n & 0x1) {
+		/* copy 1 byte */
+		memcpy(dst, src, 1);
+	}
+	return dst;
+}
+
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with 17 (or 16) <= n <= 32.
+ */
+static __rte_always_inline void *
+rte_mov17_to_32(void *dst, const void *src, size_t n)
+{
+	/* Faster way when size is known at build time. */
+	if (__rte_constant(n)) {
+		if (n == 16) {
+			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+		if (n == 18) /* 16 + 2 */
+			return memcpy(dst, src, 18);
+		if (n == 20) /* 16 + 4 */
+			return memcpy(dst, src, 20);
+		if (n == 24) /* 16 + 8 */
+			return memcpy(dst, src, 24);
+		if (n == 32) {
+			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+	}
+
+	/* copy 17 (or 16) ~ 32 bytes */
+	rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+	rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+	return dst;
+}
+
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with 33 (or 32) <= n <= 64.
+ */
+static __rte_always_inline void *
+rte_mov33_to_64(void *dst, const void *src, size_t n)
+{
+	/* Faster way when size is known at build time. */
+	if (__rte_constant(n)) {
+		if (n == 32) {
+			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+		if (n == 36) /* 32 + 4 */
+			return memcpy(dst, src, 36);
+		if (n == 40) /* 32 + 8 */
+			return memcpy(dst, src, 40);
+		if (n == 48) {
+			rte_mov48((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+		if (n == 64) {
+			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+	}
+
+	/* copy 33 (or 32) ~ 64 bytes */
+#if (defined __AVX512F__ && defined RTE_MEMCPY_AVX512) || defined RTE_MEMCPY_AVX
+	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+	rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
+#else /* SSE implementation */
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	if (n > 48)
+		rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+	rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+#endif
+	return dst;
+}
+
 #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 
 /**
@@ -232,45 +333,21 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 	}
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
 {
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				  (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (__rte_constant(n) && n == 64) {
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				  (const uint8_t *)src - 32 + n);
-		return ret;
-	}
 	if (n <= 512) {
 		if (n >= 256) {
 			n -= 256;
@@ -381,41 +458,21 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 	}
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
 {
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 256 bytes
 	 */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				(const uint8_t *)src - 32 + n);
-		return ret;
-	}
 	if (n <= 256) {
 		if (n >= 128) {
 			n -= 128;
@@ -573,38 +630,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
     }                                                                 \
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
 {
 	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
 	void *ret = dst;
 	size_t dstofss;
 	size_t srcofs;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		if (n > 48)
-			rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
 	if (n <= 128) {
 		goto COPY_BLOCK_128_BACK15;
 	}
@@ -696,44 +737,16 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 
 #endif /* __AVX512F__ */
 
+/**
+ * Copy bytes from one vector register size aligned location to another,
+ * locations should not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_aligned(void *dst, const void *src, size_t n)
+rte_memcpy_aligned_more_than_64(void *dst, const void *src, size_t n)
 {
 	void *ret = dst;
 
-	/* Copy size < 16 bytes */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
-	/* Copy 16 <= size <= 32 bytes */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
-
-		return ret;
-	}
-
-	/* Copy 32 < size <= 64 bytes */
-	if (__rte_constant(n) && n == 64) {
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				(const uint8_t *)src - 32 + n);
-
-		return ret;
-	}
-
 	/* Copy 64 bytes blocks */
 	for (; n > 64; n -= 64) {
 		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
@@ -751,10 +764,18 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 static __rte_always_inline void *
 rte_memcpy(void *dst, const void *src, size_t n)
 {
+	if (n <= 16)
+		return rte_mov16_or_less(dst, src, n);
+	if (n <= 32)
+		return rte_mov17_to_32(dst, src, n);
+	if (n <= 64)
+		return rte_mov33_to_64(dst, src, n);
+
+	/* Implementation for size > 64 bytes depends on alignment with vector register size. */
 	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
-		return rte_memcpy_aligned(dst, src, n);
+		return rte_memcpy_aligned_more_than_64(dst, src, n);
 	else
-		return rte_memcpy_generic(dst, src, n);
+		return rte_memcpy_generic_more_than_64(dst, src, n);
 }
 
 #undef ALIGNMENT_MASK
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 55+ messages in thread

* Re: [PATCH v2] eal/x86: optimize memcpy of small sizes
  2025-11-21 10:35 ` [PATCH v2] eal/x86: optimize memcpy of small sizes Morten Brørup
@ 2025-11-21 16:57   ` Stephen Hemminger
  2025-11-21 17:02     ` Bruce Richardson
  0 siblings, 1 reply; 55+ messages in thread
From: Stephen Hemminger @ 2025-11-21 16:57 UTC (permalink / raw)
  To: Morten Brørup
  Cc: dev, Bruce Richardson, Konstantin Ananyev, Vipin Varghese

On Fri, 21 Nov 2025 10:35:35 +0000
Morten Brørup <mb@smartsharesystems.com> wrote:

> The implementation for copying up to 64 bytes does not depend on address
> alignment with the size of the CPU's vector registers, so the code
> handling this was moved from the various implementations to the common
> function.
> 
> Furthermore, the function for copying less than 16 bytes was replaced with
> a smarter implementation using fewer branches and potentially fewer
> load/store operations.
> This function was also extended to handle copying of up to 16 bytes,
> instead of up to 15 bytes. This small extension reduces the code path for
> copying two pointers.
> 
> These changes provide two benefits:
> 1. The memory footprint of the copy function is reduced.
> Previously there were two instances of the compiled code to copy up to 64
> bytes, one in the "aligned" code path, and one in the "generic" code path.
> Now there is only one instance, in the "common" code path.
> 2. The performance for copying up to 64 bytes is improved.
> The memcpy performance test shows cache-to-cache copying of up to 32 bytes
> now typically only takes 2 cycles (4 cycles for 64 bytes) versus
> ca. 6.5 cycles before this patch.
> 
> And finally, the missing implementation of rte_mov48() was added.
> 
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>

As I have said before, I would rather that DPDK move away from having its
own specialized memcpy.  How does this compare to stock inline gcc?
The main motivation is that the glibc/gcc team does more testing across
multiple architectures and has a community with more expertise on CPU
special cases.

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH v2] eal/x86: optimize memcpy of small sizes
  2025-11-21 16:57   ` Stephen Hemminger
@ 2025-11-21 17:02     ` Bruce Richardson
  2025-11-21 17:11       ` Stephen Hemminger
  0 siblings, 1 reply; 55+ messages in thread
From: Bruce Richardson @ 2025-11-21 17:02 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Morten Brørup, dev, Konstantin Ananyev, Vipin Varghese

On Fri, Nov 21, 2025 at 08:57:30AM -0800, Stephen Hemminger wrote:
> On Fri, 21 Nov 2025 10:35:35 +0000
> Morten Brørup <mb@smartsharesystems.com> wrote:
> 
> > The implementation for copying up to 64 bytes does not depend on address
> > alignment with the size of the CPU's vector registers, so the code
> > handling this was moved from the various implementations to the common
> > function.
> > 
> > Furthermore, the function for copying less than 16 bytes was replaced with
> > a smarter implementation using fewer branches and potentially fewer
> > load/store operations.
> > This function was also extended to handle copying of up to 16 bytes,
> > instead of up to 15 bytes. This small extension reduces the code path for
> > copying two pointers.
> > 
> > These changes provide two benefits:
> > 1. The memory footprint of the copy function is reduced.
> > Previously there were two instances of the compiled code to copy up to 64
> > bytes, one in the "aligned" code path, and one in the "generic" code path.
> > Now there is only one instance, in the "common" code path.
> > 2. The performance for copying up to 64 bytes is improved.
> > The memcpy performance test shows cache-to-cache copying of up to 32 bytes
> > now typically only takes 2 cycles (4 cycles for 64 bytes) versus
> > ca. 6.5 cycles before this patch.
> > 
> > And finally, the missing implementation of rte_mov48() was added.
> > 
> > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> 
> As I have said before, I would rather that DPDK move away from having its
> own specialized memcpy.  How does this compare to stock inline gcc?
> The main motivation is that the glibc/gcc team does more testing across
> multiple architectures and has a community with more expertise on CPU
> special cases.

I would tend to agree. Even if we get rte_memcpy a few cycles faster, I
suspect many apps wouldn't notice the difference. However, I understand
that the virtio/vhost libraries gain from using rte_memcpy over standard
memcpy - or at least used to. Perhaps we can consider deprecating
rte_memcpy and just putting a vhost-specific memcpy in that library?

/Bruce

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH v2] eal/x86: optimize memcpy of small sizes
  2025-11-21 17:02     ` Bruce Richardson
@ 2025-11-21 17:11       ` Stephen Hemminger
  2025-11-21 21:36         ` Morten Brørup
  0 siblings, 1 reply; 55+ messages in thread
From: Stephen Hemminger @ 2025-11-21 17:11 UTC (permalink / raw)
  To: Bruce Richardson
  Cc: Morten Brørup, dev, Konstantin Ananyev, Vipin Varghese

On Fri, 21 Nov 2025 17:02:17 +0000
Bruce Richardson <bruce.richardson@intel.com> wrote:

> > As I have said before, I would rather that DPDK move away from having its
> > own specialized memcpy.  How does this compare to stock inline gcc?
> > The main motivation is that the glibc/gcc team does more testing across
> > multiple architectures and has a community with more expertise on CPU
> > special cases.  
> 
> I would tend to agree. Even if we get rte_memcpy a few cycles faster, I
> suspect many apps wouldn't notice the difference. However, I understand
> that the virtio/vhost libraries gain from using rte_memcpy over standard
> memcpy - or at least used to. Perhaps we can consider deprecating
> rte_memcpy and just putting a vhost-specific memcpy in that library?

It would be good to figure out why vhost is better with rte_memcpy,
maybe there is some alignment assumption that is in one and not the other?

^ permalink raw reply	[flat|nested] 55+ messages in thread

* RE: [PATCH v2] eal/x86: optimize memcpy of small sizes
  2025-11-21 17:11       ` Stephen Hemminger
@ 2025-11-21 21:36         ` Morten Brørup
  0 siblings, 0 replies; 55+ messages in thread
From: Morten Brørup @ 2025-11-21 21:36 UTC (permalink / raw)
  To: Stephen Hemminger, Bruce Richardson
  Cc: dev, Konstantin Ananyev, Vipin Varghese

> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> Sent: Friday, 21 November 2025 18.12
> 
> On Fri, 21 Nov 2025 17:02:17 +0000
> Bruce Richardson <bruce.richardson@intel.com> wrote:
> 
> > > As I have said before, I would rather that DPDK move away from having its
> > > own specialized memcpy.  How does this compare to stock inline gcc?

The "./build/app/test/dpdk-test memcpy_perf_autotest" compares to standard memcpy().

On my build system, copies up to 64 bytes (with size not known at build time) take 9 cycles using memcpy() vs. 2-4 cycles using rte_memcpy().

The general difference was probably worse with older compilers.
We should compare using the oldest compiler versions officially supported by DPDK. (GCC, Clang, MSVC, ...) And across the supported CPUs.

There are plenty of optimizations in DPDK, which were relevant at the time of addition, but have become obsolete over time.
I don't think rte_memcpy() is there yet. (Gut feeling, no data to back it up with!)
Until we get there, we should keep optimizing rte_memcpy().

For any per-packet operation, shaving off a few cycles is valuable.
And if the majority of an application's copy operations per packet are more than a few bytes, the application will not achieve high performance.
Thus, I think optimizing small copies is relevant: A normal DPDK application should perform many more small copies than large copies. (Measured by number of copy operations, not number of copied bytes.)

> > > The main motivation is that the glibc/gcc team does more testing across
> > > multiple architectures and has a community with more expertise on CPU
> > > special cases.
> >
> > I would tend to agree. Even if we get rte_memcpy a few cycles faster, I
> > suspect many apps wouldn't notice the difference. However, I understand
> > that the virtio/vhost libraries gain from using rte_memcpy over standard
> > memcpy - or at least used to. Perhaps we can consider deprecating
> > rte_memcpy and just putting a vhost-specific memcpy in that library?
> 
> It would be good to figure out why vhost is better with rte_memcpy,
> maybe there is some alignment assumption that is in one and not the other?

Looking at 1024 bytes copy on my build system,
cache-to-mem is 12 % faster with rte_memcpy(), and
mem-to-cache is 10 % slower.

Maybe the vhost library would benefit from having access to two rte_memcpy variants, respectively optimized for cache-to-mem and mem-to-cache.

There will always be some use cases where a generic "optimized" rte_memcpy() will be suboptimal.

Providing specific functions optimized for specific use cases makes really good sense.

^ permalink raw reply	[flat|nested] 55+ messages in thread

* RE: [PATCH v3] eal/x86: optimize memcpy of small sizes
  2025-11-21 10:40 ` [PATCH v3] " Morten Brørup
@ 2025-11-24 13:36   ` Morten Brørup
  2025-11-24 15:46     ` Patrick Robb
  2025-11-28 14:02   ` Konstantin Ananyev
  1 sibling, 1 reply; 55+ messages in thread
From: Morten Brørup @ 2025-11-24 13:36 UTC (permalink / raw)
  To: dev

Recheck-request: Intel-compilation, iol-mellanox-Functional


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH v3] eal/x86: optimize memcpy of small sizes
  2025-11-24 13:36   ` Morten Brørup
@ 2025-11-24 15:46     ` Patrick Robb
  0 siblings, 0 replies; 55+ messages in thread
From: Patrick Robb @ 2025-11-24 15:46 UTC (permalink / raw)
  To: Morten Brørup; +Cc: dev

[-- Attachment #1: Type: text/plain, Size: 355 bytes --]

Hi. There are some DTS failures on this testbed due to a misconfiguration
applied by us at UNH last week. I'm fixing it and rerunning the tests, which
will update the CI checks for your patch series.

On Mon, Nov 24, 2025 at 8:36 AM Morten Brørup <mb@smartsharesystems.com>
wrote:

> Recheck-request: Intel-compilation, iol-mellanox-Functional
>
>

[-- Attachment #2: Type: text/html, Size: 673 bytes --]

^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH v4] eal/x86: optimize memcpy of small sizes
  2025-11-20 11:45 [PATCH] eal/x86: reduce memcpy code duplication Morten Brørup
                   ` (2 preceding siblings ...)
  2025-11-21 10:40 ` [PATCH v3] " Morten Brørup
@ 2025-11-24 20:31 ` Morten Brørup
  2025-11-25  8:19   ` Morten Brørup
  2025-12-01 15:55 ` [PATCH v5] " Morten Brørup
                   ` (6 subsequent siblings)
  10 siblings, 1 reply; 55+ messages in thread
From: Morten Brørup @ 2025-11-24 20:31 UTC (permalink / raw)
  To: dev, Bruce Richardson, Konstantin Ananyev, Vipin Varghese
  Cc: Stephen Hemminger, Morten Brørup

The implementation for copying up to 64 bytes does not depend on address
alignment with the size of the CPU's vector registers, so the code
handling this was moved from the various implementations to the common
function.

Furthermore, the function for copying less than 16 bytes was replaced with
a smarter implementation using fewer branches and potentially fewer
load/store operations.
This function was also extended to handle copying of up to 16 bytes,
instead of up to 15 bytes. This small extension reduces the code path for
copying two pointers.

These changes provide two benefits:
1. The memory footprint of the copy function is reduced.
Previously there were two instances of the compiled code to copy up to 64
bytes, one in the "aligned" code path, and one in the "generic" code path.
Now there is only one instance, in the "common" code path.
2. The performance for copying up to 64 bytes is improved.
The memcpy performance test shows cache-to-cache copying of up to 32 bytes
now typically only takes 2 cycles (4 cycles for 64 bytes) versus
ca. 6.5 cycles before this patch.

Also, all uses of SSE2 _mm_loadu_si128() intrinsics were upgraded to SSE3
_mm_lddqu_si128().
The Intel Intrinsics Guide notes that it may perform better when the data
crosses a cache line boundary.

And finally, the missing implementation of rte_mov48() was added.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
v4:
* Replace SSE2 _mm_loadu_si128() with SSE3 _mm_lddqu_si128().
v3:
* Fixed typo in comment.
v2:
* Updated patch title to reflect that the performance is improved.
* Use the design pattern of two overlapping stores for small copies too.
* Expanded first branch from size < 16 to size <= 16.
* Handle more build time constant copy sizes.
---
 lib/eal/x86/include/rte_memcpy.h | 367 ++++++++++++++++---------------
 1 file changed, 194 insertions(+), 173 deletions(-)

diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 46d34b8081..7c0029cc80 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -55,52 +55,6 @@ extern "C" {
 static __rte_always_inline void *
 rte_memcpy(void *dst, const void *src, size_t n);
 
-/**
- * Copy bytes from one location to another,
- * locations should not overlap.
- * Use with n <= 15.
- */
-static __rte_always_inline void *
-rte_mov15_or_less(void *dst, const void *src, size_t n)
-{
-	/**
-	 * Use the following structs to avoid violating C standard
-	 * alignment requirements and to avoid strict aliasing bugs
-	 */
-	struct __rte_packed_begin rte_uint64_alias {
-		uint64_t val;
-	} __rte_packed_end __rte_may_alias;
-	struct __rte_packed_begin rte_uint32_alias {
-		uint32_t val;
-	} __rte_packed_end __rte_may_alias;
-	struct __rte_packed_begin rte_uint16_alias {
-		uint16_t val;
-	} __rte_packed_end __rte_may_alias;
-
-	void *ret = dst;
-	if (n & 8) {
-		((struct rte_uint64_alias *)dst)->val =
-			((const struct rte_uint64_alias *)src)->val;
-		src = (const uint64_t *)src + 1;
-		dst = (uint64_t *)dst + 1;
-	}
-	if (n & 4) {
-		((struct rte_uint32_alias *)dst)->val =
-			((const struct rte_uint32_alias *)src)->val;
-		src = (const uint32_t *)src + 1;
-		dst = (uint32_t *)dst + 1;
-	}
-	if (n & 2) {
-		((struct rte_uint16_alias *)dst)->val =
-			((const struct rte_uint16_alias *)src)->val;
-		src = (const uint16_t *)src + 1;
-		dst = (uint16_t *)dst + 1;
-	}
-	if (n & 1)
-		*(uint8_t *)dst = *(const uint8_t *)src;
-	return ret;
-}
-
 /**
  * Copy 16 bytes from one location to another,
  * locations should not overlap.
@@ -110,7 +64,7 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
 {
 	__m128i xmm0;
 
-	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
+	xmm0 = _mm_lddqu_si128((const __m128i *)(const void *)src);
 	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
 }
 
@@ -132,6 +86,23 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
 #endif
 }
 
+/**
+ * Copy 48 bytes from one location to another,
+ * locations should not overlap.
+ */
+static __rte_always_inline void
+rte_mov48(uint8_t *dst, const uint8_t *src)
+{
+#if defined RTE_MEMCPY_AVX
+	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+	rte_mov32((uint8_t *)dst - 32 + 48, (const uint8_t *)src - 32 + 48);
+#else /* SSE implementation */
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+#endif
+}
+
 /**
  * Copy 64 bytes from one location to another,
  * locations should not overlap.
@@ -172,6 +143,136 @@ rte_mov256(uint8_t *dst, const uint8_t *src)
 	rte_mov128(dst + 1 * 128, src + 1 * 128);
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n <= 16.
+ *
+ * Note: Copying uninitialized memory is perfectly acceptable.
+ * Using e.g. memcpy(dst, src, 8) instead of
+ * *(unaligned_uint64_t *)dst = *(const unaligned_uint64_t *)src
+ * avoids compiler warnings that the source data may be uninitialized
+ * [-Wmaybe-uninitialized].
+ *
+ * Note: Using "n & X" generates 3-byte "test" instructions,
+ * instead of "n >= X", which would generate 4-byte "cmp" instructions.
+ */
+static __rte_always_inline void *
+rte_mov16_or_less(void *dst, const void *src, size_t n)
+{
+	/* Faster way when size is known at build time. */
+	if (__rte_constant(n)) {
+		if (n == 2)
+			return memcpy(dst, src, 2);
+		if (n == 4)
+			return memcpy(dst, src, 4);
+		if (n == 6) /* 4 + 2 */
+			return memcpy(dst, src, 6);
+		if (n == 8)
+			return memcpy(dst, src, 8);
+		if (n == 10) /* 8 + 2 */
+			return memcpy(dst, src, 10);
+		if (n == 12) /* 8 + 4 */
+			return memcpy(dst, src, 12);
+		if (n == 16) {
+			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+	}
+
+	if (n & 0x18) { /* n >= 8 */
+		/* copy 8 ~ 16 bytes */
+		memcpy(dst, src, 8);
+		memcpy((uint8_t *)dst - 8 + n, (const uint8_t *)src - 8 + n, 8);
+	} else if (n & 0x4) {
+		/* copy 4 ~ 7 bytes */
+		memcpy(dst, src, 4);
+		memcpy((uint8_t *)dst - 4 + n, (const uint8_t *)src - 4 + n, 4);
+	} else if (n & 0x2) {
+		/* copy 2 ~ 3 bytes */
+		memcpy(dst, src, 2);
+		memcpy((uint8_t *)dst - 2 + n, (const uint8_t *)src - 2 + n, 2);
+	} else if (n & 0x1) {
+		/* copy 1 byte */
+		memcpy(dst, src, 1);
+	}
+	return dst;
+}
+
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with 17 (or 16) <= n <= 32.
+ */
+static __rte_always_inline void *
+rte_mov17_to_32(void *dst, const void *src, size_t n)
+{
+	/* Faster way when size is known at build time. */
+	if (__rte_constant(n)) {
+		if (n == 16) {
+			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+		if (n == 18) /* 16 + 2 */
+			return memcpy(dst, src, 18);
+		if (n == 20) /* 16 + 4 */
+			return memcpy(dst, src, 20);
+		if (n == 24) /* 16 + 8 */
+			return memcpy(dst, src, 24);
+		if (n == 32) {
+			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+	}
+
+	/* copy 17 (or 16) ~ 32 bytes */
+	rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+	rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+	return dst;
+}
+
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with 33 (or 32) <= n <= 64.
+ */
+static __rte_always_inline void *
+rte_mov33_to_64(void *dst, const void *src, size_t n)
+{
+	/* Faster way when size is known at build time. */
+	if (__rte_constant(n)) {
+		if (n == 32) {
+			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+		if (n == 36) /* 32 + 4 */
+			return memcpy(dst, src, 36);
+		if (n == 40) /* 32 + 8 */
+			return memcpy(dst, src, 40);
+		if (n == 48) {
+			rte_mov48((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+		if (n == 64) {
+			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+	}
+
+	/* copy 33 (or 32) ~ 64 bytes */
+#if (defined __AVX512F__ && defined RTE_MEMCPY_AVX512) || defined RTE_MEMCPY_AVX
+	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+	rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
+#else /* SSE implementation */
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	if (n > 48)
+		rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+	rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+#endif
+	return dst;
+}
+
 #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 
 /**
@@ -232,45 +333,21 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 	}
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
 {
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				  (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (__rte_constant(n) && n == 64) {
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				  (const uint8_t *)src - 32 + n);
-		return ret;
-	}
 	if (n <= 512) {
 		if (n >= 256) {
 			n -= 256;
@@ -381,41 +458,21 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 	}
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
 {
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 256 bytes
 	 */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				(const uint8_t *)src - 32 + n);
-		return ret;
-	}
 	if (n <= 256) {
 		if (n >= 128) {
 			n -= 128;
@@ -494,16 +551,16 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 {                                                                                            \
     size_t tmp;                                                                                                \
     while (len >= 128 + 16 - offset) {                                                                      \
-        xmm0 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 0 * 16));                  \
+        xmm0 = _mm_lddqu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 0 * 16));                  \
         len -= 128;                                                                                         \
-        xmm1 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 1 * 16));                  \
-        xmm2 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 2 * 16));                  \
-        xmm3 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 3 * 16));                  \
-        xmm4 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 4 * 16));                  \
-        xmm5 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 5 * 16));                  \
-        xmm6 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 6 * 16));                  \
-        xmm7 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 7 * 16));                  \
-        xmm8 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 8 * 16));                  \
+        xmm1 = _mm_lddqu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 1 * 16));                  \
+        xmm2 = _mm_lddqu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 2 * 16));                  \
+        xmm3 = _mm_lddqu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 3 * 16));                  \
+        xmm4 = _mm_lddqu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 4 * 16));                  \
+        xmm5 = _mm_lddqu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 5 * 16));                  \
+        xmm6 = _mm_lddqu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 6 * 16));                  \
+        xmm7 = _mm_lddqu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 7 * 16));                  \
+        xmm8 = _mm_lddqu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 8 * 16));                  \
         src = (const uint8_t *)src + 128;                                                                   \
         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
@@ -522,10 +579,10 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
     dst = (uint8_t *)dst + tmp;                                                                             \
     if (len >= 32 + 16 - offset) {                                                                          \
         while (len >= 32 + 16 - offset) {                                                                   \
-            xmm0 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 0 * 16));              \
+            xmm0 = _mm_lddqu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 0 * 16));              \
             len -= 32;                                                                                      \
-            xmm1 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 1 * 16));              \
-            xmm2 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 2 * 16));              \
+            xmm1 = _mm_lddqu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 1 * 16));              \
+            xmm2 = _mm_lddqu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 2 * 16));              \
             src = (const uint8_t *)src + 32;                                                                \
             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
@@ -573,38 +630,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
     }                                                                 \
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
 {
 	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
 	void *ret = dst;
 	size_t dstofss;
 	size_t srcofs;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		if (n > 48)
-			rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
 	if (n <= 128) {
 		goto COPY_BLOCK_128_BACK15;
 	}
@@ -696,44 +737,16 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 
 #endif /* __AVX512F__ */
 
+/**
+ * Copy bytes from one vector register size aligned location to another,
+ * locations should not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_aligned(void *dst, const void *src, size_t n)
+rte_memcpy_aligned_more_than_64(void *dst, const void *src, size_t n)
 {
 	void *ret = dst;
 
-	/* Copy size < 16 bytes */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
-	/* Copy 16 <= size <= 32 bytes */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
-
-		return ret;
-	}
-
-	/* Copy 32 < size <= 64 bytes */
-	if (__rte_constant(n) && n == 64) {
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				(const uint8_t *)src - 32 + n);
-
-		return ret;
-	}
-
 	/* Copy 64 bytes blocks */
 	for (; n > 64; n -= 64) {
 		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
@@ -751,10 +764,18 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 static __rte_always_inline void *
 rte_memcpy(void *dst, const void *src, size_t n)
 {
+	if (n <= 16)
+		return rte_mov16_or_less(dst, src, n);
+	if (n <= 32)
+		return rte_mov17_to_32(dst, src, n);
+	if (n <= 64)
+		return rte_mov33_to_64(dst, src, n);
+
+	/* Implementation for size > 64 bytes depends on alignment with vector register size. */
 	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
-		return rte_memcpy_aligned(dst, src, n);
+		return rte_memcpy_aligned_more_than_64(dst, src, n);
 	else
-		return rte_memcpy_generic(dst, src, n);
+		return rte_memcpy_generic_more_than_64(dst, src, n);
 }
 
 #undef ALIGNMENT_MASK
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 55+ messages in thread

* RE: [PATCH v4] eal/x86: optimize memcpy of small sizes
  2025-11-24 20:31 ` [PATCH v4] " Morten Brørup
@ 2025-11-25  8:19   ` Morten Brørup
  0 siblings, 0 replies; 55+ messages in thread
From: Morten Brørup @ 2025-11-25  8:19 UTC (permalink / raw)
  To: dev, Bruce Richardson, Konstantin Ananyev, Vipin Varghese,
	Stephen Hemminger

> Also, all uses of SSE2 _mm_loadu_si128() intrinsics were upgraded to
> SSE3 _mm_lddqu_si128().
> The Intel Intrinsics Guide notes that it may perform better when the
> data crosses a cache line boundary.

It turns out _mm_lddqu_si128() is much slower than _mm_loadu_si128().
Would have been nice if the Intel Intrinsics Guide mentioned that.

Marked v4 patch as Not Applicable, and changed v3 patch back to New.


^ permalink raw reply	[flat|nested] 55+ messages in thread

* RE: [PATCH v3] eal/x86: optimize memcpy of small sizes
  2025-11-21 10:40 ` [PATCH v3] " Morten Brørup
  2025-11-24 13:36   ` Morten Brørup
@ 2025-11-28 14:02   ` Konstantin Ananyev
  2025-11-28 15:55     ` Morten Brørup
  1 sibling, 1 reply; 55+ messages in thread
From: Konstantin Ananyev @ 2025-11-28 14:02 UTC (permalink / raw)
  To: Morten Brørup, dev@dpdk.org, Bruce Richardson,
	Konstantin Ananyev, Vipin Varghese
  Cc: Stephen Hemminger

> +/**
> + * Copy bytes from one location to another,
> + * locations should not overlap.
> + * Use with n <= 16.
> + *
> + * Note: Copying uninitialized memory is perfectly acceptable.
> + * Using e.g. memcpy(dst, src, 8) instead of
> + * *(unaligned_uint64_t*) = *(const unaligned_uint64_t *)src
> + * avoids compiler warnings about source data may be uninitialized
> + * [-Wmaybe-uninitialized].
> + *
> + * Note: Using "n & X" generates 3-byte "test" instructions,
> + * instead of "n >= X", which would generate 4-byte "cmp" instructions.
> + */
> +static __rte_always_inline void *
> +rte_mov16_or_less(void *dst, const void *src, size_t n)
> +{
> +	/* Faster way when size is known at build time. */
> +	if (__rte_constant(n)) {
> +		if (n == 2)
> +			return memcpy(dst, src, 2);
> +		if (n == 4)
> +			return memcpy(dst, src, 4);
> +		if (n == 6) /* 4 + 2 */
> +			return memcpy(dst, src, 6);
> +		if (n == 8)
> +			return memcpy(dst, src, 8);
> +		if (n == 10) /* 8 + 2 */
> +			return memcpy(dst, src, 10);
> +		if (n == 12) /* 8 + 4 */
> +			return memcpy(dst, src, 12);
> +		if (n == 16) {
> +			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> +			return dst;
> +		}
> +	}
> +
> +	if (n & 0x18) { /* n >= 8 */

Probably 'n & 0x8'?
 
> +		/* copy 8 ~ 16 bytes */
> +		memcpy(dst, src, 8);
> +		memcpy((uint8_t *)dst - 8 + n, (const uint8_t *)src - 8 + n, 8);
> +	} else if (n & 0x4) {
> +		/* copy 4 ~ 7 bytes */
> +		memcpy(dst, src, 4);
> +		memcpy((uint8_t *)dst - 4 + n, (const uint8_t *)src - 4 + n, 4);
> +	} else if (n & 0x2) {
> +		/* copy 2 ~ 3 bytes */
> +		memcpy(dst, src, 2);
> +		memcpy((uint8_t *)dst - 2 + n, (const uint8_t *)src - 2 + n, 2);
> +	} else if (n & 0x1) {
> +		/* copy 1 byte */
> +		memcpy(dst, src, 1);
> +	}
> +	return dst;
> +}

^ permalink raw reply	[flat|nested] 55+ messages in thread

* RE: [PATCH v3] eal/x86: optimize memcpy of small sizes
  2025-11-28 14:02   ` Konstantin Ananyev
@ 2025-11-28 15:55     ` Morten Brørup
  2025-11-28 18:10       ` Konstantin Ananyev
  0 siblings, 1 reply; 55+ messages in thread
From: Morten Brørup @ 2025-11-28 15:55 UTC (permalink / raw)
  To: Konstantin Ananyev, dev, Bruce Richardson, Konstantin Ananyev,
	Vipin Varghese
  Cc: Stephen Hemminger

> From: Konstantin Ananyev [mailto:konstantin.ananyev@huawei.com]
> Sent: Friday, 28 November 2025 15.03
> 
> > +/**
> > + * Copy bytes from one location to another,
> > + * locations should not overlap.
> > + * Use with n <= 16.
> > + *
> > + * Note: Copying uninitialized memory is perfectly acceptable.
> > + * Using e.g. memcpy(dst, src, 8) instead of
> > + * *(unaligned_uint64_t*) = *(const unaligned_uint64_t *)src
> > + * avoids compiler warnings about source data may be uninitialized
> > + * [-Wmaybe-uninitialized].
> > + *
> > + * Note: Using "n & X" generates 3-byte "test" instructions,
> > + * instead of "n >= X", which would generate 4-byte "cmp"
> instructions.
> > + */
> > +static __rte_always_inline void *
> > +rte_mov16_or_less(void *dst, const void *src, size_t n)
> > +{
> > +	/* Faster way when size is known at build time. */
> > +	if (__rte_constant(n)) {
> > +		if (n == 2)
> > +			return memcpy(dst, src, 2);
> > +		if (n == 4)
> > +			return memcpy(dst, src, 4);
> > +		if (n == 6) /* 4 + 2 */
> > +			return memcpy(dst, src, 6);
> > +		if (n == 8)
> > +			return memcpy(dst, src, 8);
> > +		if (n == 10) /* 8 + 2 */
> > +			return memcpy(dst, src, 10);
> > +		if (n == 12) /* 8 + 4 */
> > +			return memcpy(dst, src, 12);
> > +		if (n == 16) {
> > +			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > +			return dst;
> > +		}
> > +	}
> > +
> > +	if (n & 0x18) { /* n >= 8 */
> 
> Probably 'n & 0x8'?

It's intentional, to catch n == 0x10 too.
It seems the associated comment should be more verbose. How about:
if (n & 0x18) { /* n >= 8, including n == 0x10, hence n & 0x18 */

> 
> > +		/* copy 8 ~ 16 bytes */
> > +		memcpy(dst, src, 8);
> > +		memcpy((uint8_t *)dst - 8 + n, (const uint8_t *)src - 8 +
> n, 8);
> > +	} else if (n & 0x4) {
> > +		/* copy 4 ~ 7 bytes */
> > +		memcpy(dst, src, 4);
> > +		memcpy((uint8_t *)dst - 4 + n, (const uint8_t *)src - 4 +
> n, 4);
> > +	} else if (n & 0x2) {
> > +		/* copy 2 ~ 3 bytes */
> > +		memcpy(dst, src, 2);
> > +		memcpy((uint8_t *)dst - 2 + n, (const uint8_t *)src - 2 +
> n, 2);
> > +	} else if (n & 0x1) {
> > +		/* copy 1 byte */
> > +		memcpy(dst, src, 1);
> > +	}
> > +	return dst;
> > +}

^ permalink raw reply	[flat|nested] 55+ messages in thread

* RE: [PATCH v3] eal/x86: optimize memcpy of small sizes
  2025-11-28 15:55     ` Morten Brørup
@ 2025-11-28 18:10       ` Konstantin Ananyev
  2025-11-29  2:17         ` Morten Brørup
  0 siblings, 1 reply; 55+ messages in thread
From: Konstantin Ananyev @ 2025-11-28 18:10 UTC (permalink / raw)
  To: Morten Brørup, dev@dpdk.org, Bruce Richardson,
	Konstantin Ananyev, Vipin Varghese
  Cc: Stephen Hemminger



> > From: Konstantin Ananyev [mailto:konstantin.ananyev@huawei.com]
> > Sent: Friday, 28 November 2025 15.03
> >
> > > +/**
> > > + * Copy bytes from one location to another,
> > > + * locations should not overlap.
> > > + * Use with n <= 16.
> > > + *
> > > + * Note: Copying uninitialized memory is perfectly acceptable.
> > > + * Using e.g. memcpy(dst, src, 8) instead of
> > > + * *(unaligned_uint64_t*) = *(const unaligned_uint64_t *)src
> > > + * avoids compiler warnings about source data may be uninitialized
> > > + * [-Wmaybe-uninitialized].
> > > + *
> > > + * Note: Using "n & X" generates 3-byte "test" instructions,
> > > + * instead of "n >= X", which would generate 4-byte "cmp"
> > instructions.
> > > + */
> > > +static __rte_always_inline void *
> > > +rte_mov16_or_less(void *dst, const void *src, size_t n)
> > > +{
> > > +	/* Faster way when size is known at build time. */
> > > +	if (__rte_constant(n)) {
> > > +		if (n == 2)
> > > +			return memcpy(dst, src, 2);
> > > +		if (n == 4)
> > > +			return memcpy(dst, src, 4);
> > > +		if (n == 6) /* 4 + 2 */
> > > +			return memcpy(dst, src, 6);
> > > +		if (n == 8)
> > > +			return memcpy(dst, src, 8);
> > > +		if (n == 10) /* 8 + 2 */
> > > +			return memcpy(dst, src, 10);
> > > +		if (n == 12) /* 8 + 4 */
> > > +			return memcpy(dst, src, 12);
> > > +		if (n == 16) {
> > > +			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > > +			return dst;
> > > +		}
> > > +	}
> > > +
> > > +	if (n & 0x18) { /* n >= 8 */
> >
> > Probably 'n & 0x8'?
> 
> It's intentional, to catch n == 0x10 too.
> It seems the associated comment should be more verbose. How about:
> if (n & 0x18) { /* n >= 8, including n == 0x10, hence n & 0x18 */

Ok, why just not simply : if (n >= 8) then?

> >
> > > +		/* copy 8 ~ 16 bytes */
> > > +		memcpy(dst, src, 8);
> > > +		memcpy((uint8_t *)dst - 8 + n, (const uint8_t *)src - 8 +
> > n, 8);
> > > +	} else if (n & 0x4) {
> > > +		/* copy 4 ~ 7 bytes */
> > > +		memcpy(dst, src, 4);
> > > +		memcpy((uint8_t *)dst - 4 + n, (const uint8_t *)src - 4 +
> > n, 4);
> > > +	} else if (n & 0x2) {
> > > +		/* copy 2 ~ 3 bytes */
> > > +		memcpy(dst, src, 2);
> > > +		memcpy((uint8_t *)dst - 2 + n, (const uint8_t *)src - 2 +
> > n, 2);
> > > +	} else if (n & 0x1) {
> > > +		/* copy 1 byte */
> > > +		memcpy(dst, src, 1);
> > > +	}
> > > +	return dst;
> > > +}

^ permalink raw reply	[flat|nested] 55+ messages in thread

* RE: [PATCH v3] eal/x86: optimize memcpy of small sizes
  2025-11-28 18:10       ` Konstantin Ananyev
@ 2025-11-29  2:17         ` Morten Brørup
  2025-12-01  9:35           ` Konstantin Ananyev
  0 siblings, 1 reply; 55+ messages in thread
From: Morten Brørup @ 2025-11-29  2:17 UTC (permalink / raw)
  To: Konstantin Ananyev, dev, Bruce Richardson, Konstantin Ananyev,
	Vipin Varghese
  Cc: Stephen Hemminger

> From: Konstantin Ananyev [mailto:konstantin.ananyev@huawei.com]
> Sent: Friday, 28 November 2025 19.11
> 
> > > From: Konstantin Ananyev [mailto:konstantin.ananyev@huawei.com]
> > > Sent: Friday, 28 November 2025 15.03
> > >
> > > > +/**
> > > > + * Copy bytes from one location to another,
> > > > + * locations should not overlap.
> > > > + * Use with n <= 16.
> > > > + *
> > > > + * Note: Copying uninitialized memory is perfectly acceptable.
> > > > + * Using e.g. memcpy(dst, src, 8) instead of
> > > > + * *(unaligned_uint64_t*) = *(const unaligned_uint64_t *)src
> > > > + * avoids compiler warnings about source data may be
> uninitialized
> > > > + * [-Wmaybe-uninitialized].
> > > > + *
> > > > + * Note: Using "n & X" generates 3-byte "test" instructions,
> > > > + * instead of "n >= X", which would generate 4-byte "cmp"
> > > instructions.
> > > > + */
> > > > +static __rte_always_inline void *
> > > > +rte_mov16_or_less(void *dst, const void *src, size_t n)
> > > > +{
> > > > +	/* Faster way when size is known at build time. */
> > > > +	if (__rte_constant(n)) {
> > > > +		if (n == 2)
> > > > +			return memcpy(dst, src, 2);
> > > > +		if (n == 4)
> > > > +			return memcpy(dst, src, 4);
> > > > +		if (n == 6) /* 4 + 2 */
> > > > +			return memcpy(dst, src, 6);
> > > > +		if (n == 8)
> > > > +			return memcpy(dst, src, 8);
> > > > +		if (n == 10) /* 8 + 2 */
> > > > +			return memcpy(dst, src, 10);
> > > > +		if (n == 12) /* 8 + 4 */
> > > > +			return memcpy(dst, src, 12);
> > > > +		if (n == 16) {
> > > > +			rte_mov16((uint8_t *)dst, (const uint8_t
> *)src);
> > > > +			return dst;
> > > > +		}
> > > > +	}
> > > > +
> > > > +	if (n & 0x18) { /* n >= 8 */
> > >
> > > Probably 'n & 0x8'?
> >
> > It's intentional, to catch n == 0x10 too.
> > It seems the associated comment should be more verbose. How about:
> > if (n & 0x18) { /* n >= 8, including n == 0x10, hence n & 0x18 */
> 
> Ok, why just not simply : if (n >= 8) then?

The reason mentioned in the function description:
 * Note: Using "n & X" generates 3-byte "test" instructions,
 * instead of "n >= X", which would generate 4-byte "cmp" instructions.

I'll move that comment down here, just before the comparisons.

> 
> > >
> > > > +		/* copy 8 ~ 16 bytes */
> > > > +		memcpy(dst, src, 8);
> > > > +		memcpy((uint8_t *)dst - 8 + n, (const uint8_t *)src -
> 8 +
> > > n, 8);
> > > > +	} else if (n & 0x4) {
> > > > +		/* copy 4 ~ 7 bytes */
> > > > +		memcpy(dst, src, 4);
> > > > +		memcpy((uint8_t *)dst - 4 + n, (const uint8_t *)src -
> 4 +
> > > n, 4);
> > > > +	} else if (n & 0x2) {
> > > > +		/* copy 2 ~ 3 bytes */
> > > > +		memcpy(dst, src, 2);
> > > > +		memcpy((uint8_t *)dst - 2 + n, (const uint8_t *)src -
> 2 +
> > > n, 2);
> > > > +	} else if (n & 0x1) {
> > > > +		/* copy 1 byte */
> > > > +		memcpy(dst, src, 1);
> > > > +	}
> > > > +	return dst;
> > > > +}

^ permalink raw reply	[flat|nested] 55+ messages in thread

* RE: [PATCH v3] eal/x86: optimize memcpy of small sizes
  2025-11-29  2:17         ` Morten Brørup
@ 2025-12-01  9:35           ` Konstantin Ananyev
  2025-12-01 10:41             ` Morten Brørup
  0 siblings, 1 reply; 55+ messages in thread
From: Konstantin Ananyev @ 2025-12-01  9:35 UTC (permalink / raw)
  To: Morten Brørup, dev@dpdk.org, Bruce Richardson,
	Konstantin Ananyev, Vipin Varghese
  Cc: Stephen Hemminger



> > > > > +/**
> > > > > + * Copy bytes from one location to another,
> > > > > + * locations should not overlap.
> > > > > + * Use with n <= 16.
> > > > > + *
> > > > > + * Note: Copying uninitialized memory is perfectly acceptable.
> > > > > + * Using e.g. memcpy(dst, src, 8) instead of
> > > > > + * *(unaligned_uint64_t*) = *(const unaligned_uint64_t *)src
> > > > > + * avoids compiler warnings about source data may be
> > uninitialized
> > > > > + * [-Wmaybe-uninitialized].
> > > > > + *
> > > > > + * Note: Using "n & X" generates 3-byte "test" instructions,
> > > > > + * instead of "n >= X", which would generate 4-byte "cmp"
> > > > instructions.
> > > > > + */
> > > > > +static __rte_always_inline void *
> > > > > +rte_mov16_or_less(void *dst, const void *src, size_t n)
> > > > > +{
> > > > > +	/* Faster way when size is known at build time. */
> > > > > +	if (__rte_constant(n)) {
> > > > > +		if (n == 2)
> > > > > +			return memcpy(dst, src, 2);
> > > > > +		if (n == 4)
> > > > > +			return memcpy(dst, src, 4);
> > > > > +		if (n == 6) /* 4 + 2 */
> > > > > +			return memcpy(dst, src, 6);
> > > > > +		if (n == 8)
> > > > > +			return memcpy(dst, src, 8);
> > > > > +		if (n == 10) /* 8 + 2 */
> > > > > +			return memcpy(dst, src, 10);
> > > > > +		if (n == 12) /* 8 + 4 */
> > > > > +			return memcpy(dst, src, 12);
> > > > > +		if (n == 16) {
> > > > > +			rte_mov16((uint8_t *)dst, (const uint8_t
> > *)src);
> > > > > +			return dst;
> > > > > +		}
> > > > > +	}
> > > > > +
> > > > > +	if (n & 0x18) { /* n >= 8 */
> > > >
> > > > Probably 'n & 0x8'?
> > >
> > > It's intentional, to catch n == 0x10 too.
> > > It seems the associated comment should be more verbose. How about:
> > > if (n & 0x18) { /* n >= 8, including n == 0x10, hence n & 0x18 */
> >
> > Ok, why just not simply : if (n >= 8) then?
> 
> The reason mentioned in the function description:
>  * Note: Using "n & X" generates 3-byte "test" instructions,
>  * instead of "n >= X", which would generate 4-byte "cmp" instructions.
Ah, I see the comment now; sorry, I missed it the first time.
Indeed, 'cmp' will be one byte longer.
However, for non-constant 'n', gcc generates 90 bytes of code for both versions:
https://godbolt.org/z/K6861qGcr

 
> 
> I'll move that comment down here, just before the comparisons.
> 
> >
> > > >
> > > > > +		/* copy 8 ~ 16 bytes */
> > > > > +		memcpy(dst, src, 8);
> > > > > +		memcpy((uint8_t *)dst - 8 + n, (const uint8_t *)src -
> > 8 +
> > > > n, 8);
> > > > > +	} else if (n & 0x4) {
> > > > > +		/* copy 4 ~ 7 bytes */
> > > > > +		memcpy(dst, src, 4);
> > > > > +		memcpy((uint8_t *)dst - 4 + n, (const uint8_t *)src -
> > 4 +
> > > > n, 4);
> > > > > +	} else if (n & 0x2) {
> > > > > +		/* copy 2 ~ 3 bytes */
> > > > > +		memcpy(dst, src, 2);
> > > > > +		memcpy((uint8_t *)dst - 2 + n, (const uint8_t *)src -
> > 2 +
> > > > n, 2);
> > > > > +	} else if (n & 0x1) {
> > > > > +		/* copy 1 byte */
> > > > > +		memcpy(dst, src, 1);
> > > > > +	}
> > > > > +	return dst;
> > > > > +}

^ permalink raw reply	[flat|nested] 55+ messages in thread

* RE: [PATCH v3] eal/x86: optimize memcpy of small sizes
  2025-12-01  9:35           ` Konstantin Ananyev
@ 2025-12-01 10:41             ` Morten Brørup
  0 siblings, 0 replies; 55+ messages in thread
From: Morten Brørup @ 2025-12-01 10:41 UTC (permalink / raw)
  To: Konstantin Ananyev, dev, Bruce Richardson, Konstantin Ananyev,
	Vipin Varghese
  Cc: Stephen Hemminger

> > > > > > +/**
> > > > > > + * Copy bytes from one location to another,
> > > > > > + * locations should not overlap.
> > > > > > + * Use with n <= 16.
> > > > > > + *
> > > > > > + * Note: Copying uninitialized memory is perfectly
> acceptable.
> > > > > > + * Using e.g. memcpy(dst, src, 8) instead of
> > > > > > + * *(unaligned_uint64_t*) = *(const unaligned_uint64_t *)src
> > > > > > + * avoids compiler warnings about source data may be
> > > uninitialized
> > > > > > + * [-Wmaybe-uninitialized].
> > > > > > + *
> > > > > > + * Note: Using "n & X" generates 3-byte "test" instructions,
> > > > > > + * instead of "n >= X", which would generate 4-byte "cmp"
> > > > > instructions.
> > > > > > + */
> > > > > > +static __rte_always_inline void *
> > > > > > +rte_mov16_or_less(void *dst, const void *src, size_t n)
> > > > > > +{
> > > > > > +	/* Faster way when size is known at build time. */
> > > > > > +	if (__rte_constant(n)) {
> > > > > > +		if (n == 2)
> > > > > > +			return memcpy(dst, src, 2);
> > > > > > +		if (n == 4)
> > > > > > +			return memcpy(dst, src, 4);
> > > > > > +		if (n == 6) /* 4 + 2 */
> > > > > > +			return memcpy(dst, src, 6);
> > > > > > +		if (n == 8)
> > > > > > +			return memcpy(dst, src, 8);
> > > > > > +		if (n == 10) /* 8 + 2 */
> > > > > > +			return memcpy(dst, src, 10);
> > > > > > +		if (n == 12) /* 8 + 4 */
> > > > > > +			return memcpy(dst, src, 12);
> > > > > > +		if (n == 16) {
> > > > > > +			rte_mov16((uint8_t *)dst, (const uint8_t
> > > *)src);
> > > > > > +			return dst;
> > > > > > +		}
> > > > > > +	}
> > > > > > +
> > > > > > +	if (n & 0x18) { /* n >= 8 */
> > > > >
> > > > > Probably 'n & 0x8'?
> > > >
> > > > It's intentional, to catch n == 0x10 too.
> > > > It seems the associated comment should be more verbose. How
> about:
> > > > if (n & 0x18) { /* n >= 8, including n == 0x10, hence n & 0x18 */
> > >
> > > Ok, why just not simply : if (n >= 8) then?
> >
> > The reason mentioned in the function description:
> >  * Note: Using "n & X" generates 3-byte "test" instructions,
> >  * instead of "n >= X", which would generate 4-byte "cmp"
> instructions.
> Ah, I see the comment now, sorry missed it first time.
> Indeed 'cmp' Will be one byte longer.
> Though for non-constant 'n' gcc generates 90B of code for both
> versions:
> https://godbolt.org/z/K6861qGcr

True. The saved byte becomes an extra byte of NOP padding.

There might also be an opportunity to eliminate one more branch by reorganizing the outer comparison like so:

if (n <= 32)
	if (n <= 16)
		return rte_mov16_or_less (dst, src, n);
	else
		return rte_mov17_to_32(dst, src, n);
else if (n <= 64)
	return rte_mov33_to_64(dst, src, n);

Or:

if (n <= 16)
	return rte_mov16_or_less (dst, src, n);
else if (n <= 64)
	if (n <= 32)
		return rte_mov17_to_32(dst, src, n);
	else
		return rte_mov33_to_64(dst, src, n);

I'll play around with it some more and post a v4.

> 
> 
> >
> > I'll move that comment down here, just before the comparisons.
> >
> > >
> > > > >
> > > > > > +		/* copy 8 ~ 16 bytes */
> > > > > > +		memcpy(dst, src, 8);
> > > > > > +		memcpy((uint8_t *)dst - 8 + n, (const uint8_t *)src -
> > > 8 +
> > > > > n, 8);
> > > > > > +	} else if (n & 0x4) {
> > > > > > +		/* copy 4 ~ 7 bytes */
> > > > > > +		memcpy(dst, src, 4);
> > > > > > +		memcpy((uint8_t *)dst - 4 + n, (const uint8_t *)src -
> > > 4 +
> > > > > n, 4);
> > > > > > +	} else if (n & 0x2) {
> > > > > > +		/* copy 2 ~ 3 bytes */
> > > > > > +		memcpy(dst, src, 2);
> > > > > > +		memcpy((uint8_t *)dst - 2 + n, (const uint8_t *)src -
> > > 2 +
> > > > > n, 2);
> > > > > > +	} else if (n & 0x1) {
> > > > > > +		/* copy 1 byte */
> > > > > > +		memcpy(dst, src, 1);
> > > > > > +	}
> > > > > > +	return dst;
> > > > > > +}

^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH v5] eal/x86: optimize memcpy of small sizes
  2025-11-20 11:45 [PATCH] eal/x86: reduce memcpy code duplication Morten Brørup
                   ` (3 preceding siblings ...)
  2025-11-24 20:31 ` [PATCH v4] " Morten Brørup
@ 2025-12-01 15:55 ` Morten Brørup
  2025-12-03 13:29   ` Morten Brørup
  2026-01-03 17:53   ` Morten Brørup
  2026-01-12 12:03 ` [PATCH v6] " Morten Brørup
                   ` (5 subsequent siblings)
  10 siblings, 2 replies; 55+ messages in thread
From: Morten Brørup @ 2025-12-01 15:55 UTC (permalink / raw)
  To: dev, Bruce Richardson, Konstantin Ananyev, Vipin Varghese
  Cc: Stephen Hemminger, Morten Brørup

eal/x86: optimize memcpy of small sizes

The implementation for copying up to 64 bytes does not depend on address
alignment with the size of the CPU's vector registers, so the code
handling this was moved from the various implementations to the common
function.

Furthermore, the function for copying less than 16 bytes was replaced with
a smarter implementation using fewer branches and potentially fewer
load/store operations.
This function was also extended to handle copying of up to 16 bytes,
instead of up to 15 bytes. This small extension reduces the code path for
copying two pointers.

These changes provide two benefits:
1. The memory footprint of the copy function is reduced.
Previously there were two instances of the compiled code to copy up to 64
bytes, one in the "aligned" code path, and one in the "generic" code path.
Now there is only one instance, in the "common" code path.
2. The performance for copying up to 64 bytes is improved.
The memcpy performance test shows cache-to-cache copying of up to 32 bytes
now typically only takes 2 cycles (4 cycles for 64 bytes) versus
ca. 6.5 cycles before this patch.

And finally, the missing implementation of rte_mov48() was added.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
v5:
* Reverted v4: Replace SSE2 _mm_loadu_si128() with SSE3 _mm_lddqu_si128().
  It was slower.
* Improved some comments. (Konstantin Ananyev)
* Moved the size range 17..32 inside the size <= 64 branch, so when
  building for SSE, the generated code can start copying the first
  16 bytes before comparing if the size is greater than 32 or not.
* Just require RTE_MEMCPY_AVX for using rte_mov32() in rte_mov33_to_64().
v4:
* Replace SSE2 _mm_loadu_si128() with SSE3 _mm_lddqu_si128().
v3:
* Fixed typo in comment.
v2:
* Updated patch title to reflect that the performance is improved.
* Use the design pattern of two overlapping stores for small copies too.
* Expanded first branch from size < 16 to size <= 16.
* Handle more build time constant copy sizes.
---
 lib/eal/x86/include/rte_memcpy.h | 346 +++++++++++++++++--------------
 1 file changed, 186 insertions(+), 160 deletions(-)

diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 46d34b8081..04d1a474d1 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -55,52 +55,6 @@ extern "C" {
 static __rte_always_inline void *
 rte_memcpy(void *dst, const void *src, size_t n);
 
-/**
- * Copy bytes from one location to another,
- * locations should not overlap.
- * Use with n <= 15.
- */
-static __rte_always_inline void *
-rte_mov15_or_less(void *dst, const void *src, size_t n)
-{
-	/**
-	 * Use the following structs to avoid violating C standard
-	 * alignment requirements and to avoid strict aliasing bugs
-	 */
-	struct __rte_packed_begin rte_uint64_alias {
-		uint64_t val;
-	} __rte_packed_end __rte_may_alias;
-	struct __rte_packed_begin rte_uint32_alias {
-		uint32_t val;
-	} __rte_packed_end __rte_may_alias;
-	struct __rte_packed_begin rte_uint16_alias {
-		uint16_t val;
-	} __rte_packed_end __rte_may_alias;
-
-	void *ret = dst;
-	if (n & 8) {
-		((struct rte_uint64_alias *)dst)->val =
-			((const struct rte_uint64_alias *)src)->val;
-		src = (const uint64_t *)src + 1;
-		dst = (uint64_t *)dst + 1;
-	}
-	if (n & 4) {
-		((struct rte_uint32_alias *)dst)->val =
-			((const struct rte_uint32_alias *)src)->val;
-		src = (const uint32_t *)src + 1;
-		dst = (uint32_t *)dst + 1;
-	}
-	if (n & 2) {
-		((struct rte_uint16_alias *)dst)->val =
-			((const struct rte_uint16_alias *)src)->val;
-		src = (const uint16_t *)src + 1;
-		dst = (uint16_t *)dst + 1;
-	}
-	if (n & 1)
-		*(uint8_t *)dst = *(const uint8_t *)src;
-	return ret;
-}
-
 /**
  * Copy 16 bytes from one location to another,
  * locations should not overlap.
@@ -132,6 +86,23 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
 #endif
 }
 
+/**
+ * Copy 48 bytes from one location to another,
+ * locations should not overlap.
+ */
+static __rte_always_inline void
+rte_mov48(uint8_t *dst, const uint8_t *src)
+{
+#if defined RTE_MEMCPY_AVX
+	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+	rte_mov32((uint8_t *)dst - 32 + 48, (const uint8_t *)src - 32 + 48);
+#else /* SSE implementation */
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+#endif
+}
+
 /**
  * Copy 64 bytes from one location to another,
  * locations should not overlap.
@@ -172,6 +143,137 @@ rte_mov256(uint8_t *dst, const uint8_t *src)
 	rte_mov128(dst + 1 * 128, src + 1 * 128);
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n <= 16.
+ *
+ * Note: Copying uninitialized memory is perfectly acceptable.
+ * Using e.g. memcpy(dst, src, 8) instead of
+ * *(unaligned_uint64_t *)dst = *(const unaligned_uint64_t *)src
+ * avoids compiler warnings that the source data may be uninitialized
+ * [-Wmaybe-uninitialized].
+ */
+static __rte_always_inline void *
+rte_mov16_or_less(void *dst, const void *src, size_t n)
+{
+	/* Faster way when size is known at build time. */
+	if (__rte_constant(n)) {
+		if (n == 2)
+			return memcpy(dst, src, 2);
+		if (n == 4)
+			return memcpy(dst, src, 4);
+		if (n == 6) /* 4 + 2 */
+			return memcpy(dst, src, 6);
+		if (n == 8)
+			return memcpy(dst, src, 8);
+		if (n == 10) /* 8 + 2 */
+			return memcpy(dst, src, 10);
+		if (n == 12) /* 8 + 4 */
+			return memcpy(dst, src, 12);
+		if (n == 16) {
+			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+	}
+
+	/*
+	 * Note: Using "n & X" generates 3-byte "test" instructions,
+	 * instead of "n >= X", which would generate 4-byte "cmp" instructions.
+	 */
+	if (n & 0x18) { /* n >= 8, including n == 0x10, hence n & 0x18. */
+		/* Copy 8 ~ 16 bytes. */
+		memcpy(dst, src, 8);
+		memcpy((uint8_t *)dst - 8 + n, (const uint8_t *)src - 8 + n, 8);
+	} else if (n & 0x4) {
+		/* Copy 4 ~ 7 bytes. */
+		memcpy(dst, src, 4);
+		memcpy((uint8_t *)dst - 4 + n, (const uint8_t *)src - 4 + n, 4);
+	} else if (n & 0x2) {
+		/* Copy 2 ~ 3 bytes. */
+		memcpy(dst, src, 2);
+		memcpy((uint8_t *)dst - 2 + n, (const uint8_t *)src - 2 + n, 2);
+	} else if (n & 0x1) {
+		/* Copy 1 byte. */
+		memcpy(dst, src, 1);
+	}
+	return dst;
+}
+
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with 17 (or 16) <= n <= 32.
+ */
+static __rte_always_inline void *
+rte_mov17_to_32(void *dst, const void *src, size_t n)
+{
+	/* Faster way when size is known at build time. */
+	if (__rte_constant(n)) {
+		if (n == 16) {
+			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+		if (n == 18) /* 16 + 2 */
+			return memcpy(dst, src, 18);
+		if (n == 20) /* 16 + 4 */
+			return memcpy(dst, src, 20);
+		if (n == 24) /* 16 + 8 */
+			return memcpy(dst, src, 24);
+		if (n == 32) {
+			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+	}
+
+	/* Copy 17 (or 16) ~ 32 bytes. */
+	rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+	rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+	return dst;
+}
+
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with 33 (or 32) <= n <= 64.
+ */
+static __rte_always_inline void *
+rte_mov33_to_64(void *dst, const void *src, size_t n)
+{
+	/* Faster way when size is known at build time. */
+	if (__rte_constant(n)) {
+		if (n == 32) {
+			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+		if (n == 36) /* 32 + 4 */
+			return memcpy(dst, src, 36);
+		if (n == 40) /* 32 + 8 */
+			return memcpy(dst, src, 40);
+		if (n == 48) {
+			rte_mov48((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+		if (n == 64) {
+			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+	}
+
+	/* Copy 33 (or 32) ~ 64 bytes. */
+#if defined RTE_MEMCPY_AVX
+	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+	rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
+#else /* SSE implementation */
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	if (n > 48)
+		rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+	rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+#endif
+	return dst;
+}
+
 #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 
 /**
@@ -232,45 +334,21 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 	}
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
 {
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				  (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (__rte_constant(n) && n == 64) {
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				  (const uint8_t *)src - 32 + n);
-		return ret;
-	}
 	if (n <= 512) {
 		if (n >= 256) {
 			n -= 256;
@@ -381,41 +459,21 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 	}
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
 {
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 256 bytes
 	 */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				(const uint8_t *)src - 32 + n);
-		return ret;
-	}
 	if (n <= 256) {
 		if (n >= 128) {
 			n -= 128;
@@ -573,38 +631,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
     }                                                                 \
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
 {
 	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
 	void *ret = dst;
 	size_t dstofss;
 	size_t srcofs;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		if (n > 48)
-			rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
 	if (n <= 128) {
 		goto COPY_BLOCK_128_BACK15;
 	}
@@ -696,44 +738,16 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 
 #endif /* __AVX512F__ */
 
+/**
+ * Copy bytes from one vector register size aligned location to another,
+ * locations should not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_aligned(void *dst, const void *src, size_t n)
+rte_memcpy_aligned_more_than_64(void *dst, const void *src, size_t n)
 {
 	void *ret = dst;
 
-	/* Copy size < 16 bytes */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
-	/* Copy 16 <= size <= 32 bytes */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
-
-		return ret;
-	}
-
-	/* Copy 32 < size <= 64 bytes */
-	if (__rte_constant(n) && n == 64) {
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				(const uint8_t *)src - 32 + n);
-
-		return ret;
-	}
-
 	/* Copy 64 bytes blocks */
 	for (; n > 64; n -= 64) {
 		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
@@ -751,10 +765,22 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 static __rte_always_inline void *
 rte_memcpy(void *dst, const void *src, size_t n)
 {
+	/* Common implementation for size <= 64 bytes. */
+	if (n <= 16)
+		return rte_mov16_or_less(dst, src, n);
+	if (n <= 64) {
+		/* Copy 17 ~ 64 bytes using vector instructions. */
+		if (n <= 32)
+			return rte_mov17_to_32(dst, src, n);
+		else
+			return rte_mov33_to_64(dst, src, n);
+	}
+
+	/* Implementation for size > 64 bytes depends on alignment with vector register size. */
 	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
-		return rte_memcpy_aligned(dst, src, n);
+		return rte_memcpy_aligned_more_than_64(dst, src, n);
 	else
-		return rte_memcpy_generic(dst, src, n);
+		return rte_memcpy_generic_more_than_64(dst, src, n);
 }
 
 #undef ALIGNMENT_MASK
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 55+ messages in thread

* RE: [PATCH v5] eal/x86: optimize memcpy of small sizes
  2025-12-01 15:55 ` [PATCH v5] " Morten Brørup
@ 2025-12-03 13:29   ` Morten Brørup
  2026-01-03 17:53   ` Morten Brørup
  1 sibling, 0 replies; 55+ messages in thread
From: Morten Brørup @ 2025-12-03 13:29 UTC (permalink / raw)
  To: dev

Recheck-request: Intel-compilation


^ permalink raw reply	[flat|nested] 55+ messages in thread

* RE: [PATCH v5] eal/x86: optimize memcpy of small sizes
  2025-12-01 15:55 ` [PATCH v5] " Morten Brørup
  2025-12-03 13:29   ` Morten Brørup
@ 2026-01-03 17:53   ` Morten Brørup
  2026-01-09 15:05     ` Varghese, Vipin
  2026-01-11 15:52     ` Konstantin Ananyev
  1 sibling, 2 replies; 55+ messages in thread
From: Morten Brørup @ 2026-01-03 17:53 UTC (permalink / raw)
  To: dev, Bruce Richardson, Konstantin Ananyev, Vipin Varghese
  Cc: Stephen Hemminger

PING for review.

Regardless of whether you - for good and valid reasons - generally prefer memcpy() over rte_memcpy(), rte_memcpy() is not deprecated or being phased out, so improvements should be allowed.


> From: Morten Brørup [mailto:mb@smartsharesystems.com]
> Sent: Monday, 1 December 2025 16.55
> 
> eal/x86: optimize memcpy of small sizes
> 
> The implementation for copying up to 64 bytes does not depend on
> address
> alignment with the size of the CPU's vector registers, so the code
> handling this was moved from the various implementations to the common
> function.
> 
> Furthermore, the function for copying less than 16 bytes was replaced
> with
> a smarter implementation using fewer branches and potentially fewer
> load/store operations.
> This function was also extended to handle copying of up to 16 bytes,
> instead of up to 15 bytes. This small extension reduces the code path
> for
> copying two pointers.
> 
> These changes provide two benefits:
> 1. The memory footprint of the copy function is reduced.
> Previously there were two instances of the compiled code to copy up to
> 64
> bytes, one in the "aligned" code path, and one in the "generic" code
> path.
> Now there is only one instance, in the "common" code path.
> 2. The performance for copying up to 64 bytes is improved.
> The memcpy performance test shows cache-to-cache copying of up to 32
> bytes
> now typically only takes 2 cycles (4 cycles for 64 bytes) versus
> ca. 6.5 cycles before this patch.
> 
> And finally, the missing implementation of rte_mov48() was added.
> 
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> ---
> v5:
> * Reverted v4: Replace SSE2 _mm_loadu_si128() with SSE3
> _mm_lddqu_si128().
>   It was slower.
> * Improved some comments. (Konstantin Ananyev)
> * Moved the size range 17..32 inside the size <= 64 branch, so when
>   building for SSE, the generated code can start copying the first
>   16 bytes before comparing if the size is greater than 32 or not.
> * Just require RTE_MEMCPY_AVX for using rte_mov32() in
> rte_mov33_to_64().
> v4:
> * Replace SSE2 _mm_loadu_si128() with SSE3 _mm_lddqu_si128().
> v3:
> * Fixed typo in comment.
> v2:
> * Updated patch title to reflect that the performance is improved.
> * Use the design pattern of two overlapping stores for small copies
> too.
> * Expanded first branch from size < 16 to size <= 16.
> * Handle more build time constant copy sizes.
> ---
>  lib/eal/x86/include/rte_memcpy.h | 346 +++++++++++++++++--------------
>  1 file changed, 186 insertions(+), 160 deletions(-)
> 
> diff --git a/lib/eal/x86/include/rte_memcpy.h
> b/lib/eal/x86/include/rte_memcpy.h
> index 46d34b8081..04d1a474d1 100644
> --- a/lib/eal/x86/include/rte_memcpy.h
> +++ b/lib/eal/x86/include/rte_memcpy.h
> @@ -55,52 +55,6 @@ extern "C" {
>  static __rte_always_inline void *
>  rte_memcpy(void *dst, const void *src, size_t n);
> 
> -/**
> - * Copy bytes from one location to another,
> - * locations should not overlap.
> - * Use with n <= 15.
> - */
> -static __rte_always_inline void *
> -rte_mov15_or_less(void *dst, const void *src, size_t n)
> -{
> -	/**
> -	 * Use the following structs to avoid violating C standard
> -	 * alignment requirements and to avoid strict aliasing bugs
> -	 */
> -	struct __rte_packed_begin rte_uint64_alias {
> -		uint64_t val;
> -	} __rte_packed_end __rte_may_alias;
> -	struct __rte_packed_begin rte_uint32_alias {
> -		uint32_t val;
> -	} __rte_packed_end __rte_may_alias;
> -	struct __rte_packed_begin rte_uint16_alias {
> -		uint16_t val;
> -	} __rte_packed_end __rte_may_alias;
> -
> -	void *ret = dst;
> -	if (n & 8) {
> -		((struct rte_uint64_alias *)dst)->val =
> -			((const struct rte_uint64_alias *)src)->val;
> -		src = (const uint64_t *)src + 1;
> -		dst = (uint64_t *)dst + 1;
> -	}
> -	if (n & 4) {
> -		((struct rte_uint32_alias *)dst)->val =
> -			((const struct rte_uint32_alias *)src)->val;
> -		src = (const uint32_t *)src + 1;
> -		dst = (uint32_t *)dst + 1;
> -	}
> -	if (n & 2) {
> -		((struct rte_uint16_alias *)dst)->val =
> -			((const struct rte_uint16_alias *)src)->val;
> -		src = (const uint16_t *)src + 1;
> -		dst = (uint16_t *)dst + 1;
> -	}
> -	if (n & 1)
> -		*(uint8_t *)dst = *(const uint8_t *)src;
> -	return ret;
> -}
> -
>  /**
>   * Copy 16 bytes from one location to another,
>   * locations should not overlap.
> @@ -132,6 +86,23 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
>  #endif
>  }
> 
> +/**
> + * Copy 48 bytes from one location to another,
> + * locations should not overlap.
> + */
> +static __rte_always_inline void
> +rte_mov48(uint8_t *dst, const uint8_t *src)
> +{
> +#if defined RTE_MEMCPY_AVX
> +	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +	rte_mov32((uint8_t *)dst - 32 + 48, (const uint8_t *)src - 32 +
> 48);
> +#else /* SSE implementation */
> +	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 *
> 16);
> +	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 *
> 16);
> +	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 *
> 16);
> +#endif
> +}
> +
>  /**
>   * Copy 64 bytes from one location to another,
>   * locations should not overlap.
> @@ -172,6 +143,137 @@ rte_mov256(uint8_t *dst, const uint8_t *src)
>  	rte_mov128(dst + 1 * 128, src + 1 * 128);
>  }
> 
> +/**
> + * Copy bytes from one location to another,
> + * locations should not overlap.
> + * Use with n <= 16.
> + *
> + * Note: Copying uninitialized memory is perfectly acceptable.
> + * Using e.g. memcpy(dst, src, 8) instead of
> + * *(unaligned_uint64_t*) = *(const unaligned_uint64_t *)src
> + * avoids compiler warnings about source data may be uninitialized
> + * [-Wmaybe-uninitialized].
> + */
> +static __rte_always_inline void *
> +rte_mov16_or_less(void *dst, const void *src, size_t n)
> +{
> +	/* Faster way when size is known at build time. */
> +	if (__rte_constant(n)) {
> +		if (n == 2)
> +			return memcpy(dst, src, 2);
> +		if (n == 4)
> +			return memcpy(dst, src, 4);
> +		if (n == 6) /* 4 + 2 */
> +			return memcpy(dst, src, 6);
> +		if (n == 8)
> +			return memcpy(dst, src, 8);
> +		if (n == 10) /* 8 + 2 */
> +			return memcpy(dst, src, 10);
> +		if (n == 12) /* 8 + 4 */
> +			return memcpy(dst, src, 12);
> +		if (n == 16) {
> +			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> +			return dst;
> +		}
> +	}
> +
> +	/*
> +	 * Note: Using "n & X" generates 3-byte "test" instructions,
> +	 * instead of "n >= X", which would generate 4-byte "cmp"
> instructions.
> +	 */
> +	if (n & 0x18) { /* n >= 8, including n == 0x10, hence n & 0x18.
> */
> +		/* Copy 8 ~ 16 bytes. */
> +		memcpy(dst, src, 8);
> +		memcpy((uint8_t *)dst - 8 + n, (const uint8_t *)src - 8 +
> n, 8);
> +	} else if (n & 0x4) {
> +		/* Copy 4 ~ 7 bytes. */
> +		memcpy(dst, src, 4);
> +		memcpy((uint8_t *)dst - 4 + n, (const uint8_t *)src - 4 +
> n, 4);
> +	} else if (n & 0x2) {
> +		/* Copy 2 ~ 3 bytes. */
> +		memcpy(dst, src, 2);
> +		memcpy((uint8_t *)dst - 2 + n, (const uint8_t *)src - 2 +
> n, 2);
> +	} else if (n & 0x1) {
> +		/* Copy 1 byte. */
> +		memcpy(dst, src, 1);
> +	}
> +	return dst;
> +}
> +
> +/**
> + * Copy bytes from one location to another,
> + * locations should not overlap.
> + * Use with 17 (or 16) < n <= 32.
> + */
> +static __rte_always_inline void *
> +rte_mov17_to_32(void *dst, const void *src, size_t n)
> +{
> +	/* Faster way when size is known at build time. */
> +	if (__rte_constant(n)) {
> +		if (n == 16) {
> +			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> +			return dst;
> +		}
> +		if (n == 18) /* 16 + 2 */
> +			return memcpy(dst, src, 18);
> +		if (n == 20) /* 16 + 4 */
> +			return memcpy(dst, src, 20);
> +		if (n == 24) /* 16 + 8 */
> +			return memcpy(dst, src, 24);
> +		if (n == 32) {
> +			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +			return dst;
> +		}
> +	}
> +
> +	/* Copy 17 (or 16) ~ 32 bytes. */
> +	rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> +	rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 +
> n);
> +	return dst;
> +}
> +
> +/**
> + * Copy bytes from one location to another,
> + * locations should not overlap.
> + * Use with 33 (or 32) < n <= 64.
> + */
> +static __rte_always_inline void *
> +rte_mov33_to_64(void *dst, const void *src, size_t n)
> +{
> +	/* Faster way when size is known at build time. */
> +	if (__rte_constant(n)) {
> +		if (n == 32) {
> +			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +			return dst;
> +		}
> +		if (n == 36) /* 32 + 4 */
> +			return memcpy(dst, src, 36);
> +		if (n == 40) /* 32 + 8 */
> +			return memcpy(dst, src, 40);
> +		if (n == 48) {
> +			rte_mov48((uint8_t *)dst, (const uint8_t *)src);
> +			return dst;
> +		}
> +		if (n == 64) {
> +			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> +			return dst;
> +		}
> +	}
> +
> +	/* Copy 33 (or 32) ~ 64 bytes. */
> +#if defined RTE_MEMCPY_AVX
> +	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +	rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 +
> n);
> +#else /* SSE implementation */
> +	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 *
> 16);
> +	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 *
> 16);
> +	if (n > 48)
> +		rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2
> * 16);
> +	rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 +
> n);
> +#endif
> +	return dst;
> +}
> +
>  #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> 
>  /**
> @@ -232,45 +334,21 @@ rte_mov512blocks(uint8_t *dst, const uint8_t
> *src, size_t n)
>  	}
>  }
> 
> +/**
> + * Copy bytes from one location to another,
> + * locations should not overlap.
> + * Use with n > 64.
> + */
>  static __rte_always_inline void *
> -rte_memcpy_generic(void *dst, const void *src, size_t n)
> +rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
>  {
>  	void *ret = dst;
>  	size_t dstofss;
>  	size_t bits;
> 
> -	/**
> -	 * Copy less than 16 bytes
> -	 */
> -	if (n < 16) {
> -		return rte_mov15_or_less(dst, src, n);
> -	}
> -
>  	/**
>  	 * Fast way when copy size doesn't exceed 512 bytes
>  	 */
> -	if (__rte_constant(n) && n == 32) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		return ret;
> -	}
> -	if (n <= 32) {
> -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		if (__rte_constant(n) && n == 16)
> -			return ret; /* avoid (harmless) duplicate copy */
> -		rte_mov16((uint8_t *)dst - 16 + n,
> -				  (const uint8_t *)src - 16 + n);
> -		return ret;
> -	}
> -	if (__rte_constant(n) && n == 64) {
> -		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> -		return ret;
> -	}
> -	if (n <= 64) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov32((uint8_t *)dst - 32 + n,
> -				  (const uint8_t *)src - 32 + n);
> -		return ret;
> -	}
>  	if (n <= 512) {
>  		if (n >= 256) {
>  			n -= 256;
> @@ -381,41 +459,21 @@ rte_mov128blocks(uint8_t *dst, const uint8_t
> *src, size_t n)
>  	}
>  }
> 
> +/**
> + * Copy bytes from one location to another,
> + * locations should not overlap.
> + * Use with n > 64.
> + */
>  static __rte_always_inline void *
> -rte_memcpy_generic(void *dst, const void *src, size_t n)
> +rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
>  {
>  	void *ret = dst;
>  	size_t dstofss;
>  	size_t bits;
> 
> -	/**
> -	 * Copy less than 16 bytes
> -	 */
> -	if (n < 16) {
> -		return rte_mov15_or_less(dst, src, n);
> -	}
> -
>  	/**
>  	 * Fast way when copy size doesn't exceed 256 bytes
>  	 */
> -	if (__rte_constant(n) && n == 32) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		return ret;
> -	}
> -	if (n <= 32) {
> -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		if (__rte_constant(n) && n == 16)
> -			return ret; /* avoid (harmless) duplicate copy */
> -		rte_mov16((uint8_t *)dst - 16 + n,
> -				(const uint8_t *)src - 16 + n);
> -		return ret;
> -	}
> -	if (n <= 64) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov32((uint8_t *)dst - 32 + n,
> -				(const uint8_t *)src - 32 + n);
> -		return ret;
> -	}
>  	if (n <= 256) {
>  		if (n >= 128) {
>  			n -= 128;
> @@ -573,38 +631,22 @@ rte_memcpy_generic(void *dst, const void *src,
> size_t n)
>      }
> \
>  }
> 
> +/**
> + * Copy bytes from one location to another,
> + * locations should not overlap.
> + * Use with n > 64.
> + */
>  static __rte_always_inline void *
> -rte_memcpy_generic(void *dst, const void *src, size_t n)
> +rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
>  {
>  	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
>  	void *ret = dst;
>  	size_t dstofss;
>  	size_t srcofs;
> 
> -	/**
> -	 * Copy less than 16 bytes
> -	 */
> -	if (n < 16) {
> -		return rte_mov15_or_less(dst, src, n);
> -	}
> -
>  	/**
>  	 * Fast way when copy size doesn't exceed 512 bytes
>  	 */
> -	if (n <= 32) {
> -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		if (__rte_constant(n) && n == 16)
> -			return ret; /* avoid (harmless) duplicate copy */
> -		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src -
> 16 + n);
> -		return ret;
> -	}
> -	if (n <= 64) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		if (n > 48)
> -			rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src +
> 32);
> -		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src -
> 16 + n);
> -		return ret;
> -	}
>  	if (n <= 128) {
>  		goto COPY_BLOCK_128_BACK15;
>  	}
> @@ -696,44 +738,16 @@ rte_memcpy_generic(void *dst, const void *src,
> size_t n)
> 
>  #endif /* __AVX512F__ */
> 
> +/**
> + * Copy bytes from one vector register size aligned location to
> another,
> + * locations should not overlap.
> + * Use with n > 64.
> + */
>  static __rte_always_inline void *
> -rte_memcpy_aligned(void *dst, const void *src, size_t n)
> +rte_memcpy_aligned_more_than_64(void *dst, const void *src, size_t n)
>  {
>  	void *ret = dst;
> 
> -	/* Copy size < 16 bytes */
> -	if (n < 16) {
> -		return rte_mov15_or_less(dst, src, n);
> -	}
> -
> -	/* Copy 16 <= size <= 32 bytes */
> -	if (__rte_constant(n) && n == 32) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		return ret;
> -	}
> -	if (n <= 32) {
> -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		if (__rte_constant(n) && n == 16)
> -			return ret; /* avoid (harmless) duplicate copy */
> -		rte_mov16((uint8_t *)dst - 16 + n,
> -				(const uint8_t *)src - 16 + n);
> -
> -		return ret;
> -	}
> -
> -	/* Copy 32 < size <= 64 bytes */
> -	if (__rte_constant(n) && n == 64) {
> -		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> -		return ret;
> -	}
> -	if (n <= 64) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov32((uint8_t *)dst - 32 + n,
> -				(const uint8_t *)src - 32 + n);
> -
> -		return ret;
> -	}
> -
>  	/* Copy 64 bytes blocks */
>  	for (; n > 64; n -= 64) {
>  		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> @@ -751,10 +765,22 @@ rte_memcpy_aligned(void *dst, const void *src,
> size_t n)
>  static __rte_always_inline void *
>  rte_memcpy(void *dst, const void *src, size_t n)
>  {
> +	/* Common implementation for size <= 64 bytes. */
> +	if (n <= 16)
> +		return rte_mov16_or_less(dst, src, n);
> +	if (n <= 64) {
> +		/* Copy 17 ~ 64 bytes using vector instructions. */
> +		if (n <= 32)
> +			return rte_mov17_to_32(dst, src, n);
> +		else
> +			return rte_mov33_to_64(dst, src, n);
> +	}
> +
> +	/* Implementation for size > 64 bytes depends on alignment with
> vector register size. */
>  	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
> -		return rte_memcpy_aligned(dst, src, n);
> +		return rte_memcpy_aligned_more_than_64(dst, src, n);
>  	else
> -		return rte_memcpy_generic(dst, src, n);
> +		return rte_memcpy_generic_more_than_64(dst, src, n);
>  }
> 
>  #undef ALIGNMENT_MASK
> --
> 2.43.0


^ permalink raw reply	[flat|nested] 55+ messages in thread

* RE: [PATCH v5] eal/x86: optimize memcpy of small sizes
  2026-01-03 17:53   ` Morten Brørup
@ 2026-01-09 15:05     ` Varghese, Vipin
  2026-01-11 15:52     ` Konstantin Ananyev
  1 sibling, 0 replies; 55+ messages in thread
From: Varghese, Vipin @ 2026-01-09 15:05 UTC (permalink / raw)
  To: Morten Brørup, dev@dpdk.org, Bruce Richardson,
	Konstantin Ananyev
  Cc: Stephen Hemminger

[AMD Official Use Only - AMD Internal Distribution Only]

Hi All,

I was occupied with other work in December 2025. Let me look at this over the weekend.

> -----Original Message-----
> From: Morten Brørup <mb@smartsharesystems.com>
> Sent: Saturday, January 3, 2026 11:23 PM
> To: dev@dpdk.org; Bruce Richardson <bruce.richardson@intel.com>; Konstantin
> Ananyev <konstantin.v.ananyev@yandex.ru>; Varghese, Vipin
> <Vipin.Varghese@amd.com>
> Cc: Stephen Hemminger <stephen@networkplumber.org>
> Subject: RE: [PATCH v5] eal/x86: optimize memcpy of small sizes
>
> Caution: This message originated from an External Source. Use proper caution
> when opening attachments, clicking links, or responding.
>
>
> PING for review.
>
> Regardless if you - for good and valid reasons - generally prefer memcpy() over
> rte_memcpy(), rte_memcpy() is not deprecated or being phased out, so
> improvements should be allowed.
>
>
> > From: Morten Brørup [mailto:mb@smartsharesystems.com]
> > Sent: Monday, 1 December 2025 16.55
> >
> > eal/x86: optimize memcpy of small sizes
> >
> > The implementation for copying up to 64 bytes does not depend on
> > address alignment with the size of the CPU's vector registers, so the
> > code handling this was moved from the various implementations to the
> > common function.
> >
> > Furthermore, the function for copying less than 16 bytes was replaced
> > with a smarter implementation using fewer branches and potentially
> > fewer load/store operations.
> > This function was also extended to handle copying of up to 16 bytes,
> > instead of up to 15 bytes. This small extension reduces the code path
> > for copying two pointers.
> >
> > These changes provide two benefits:
> > 1. The memory footprint of the copy function is reduced.
> > Previously there were two instances of the compiled code to copy up to
> > 64
> > bytes, one in the "aligned" code path, and one in the "generic" code
> > path.
> > Now there is only one instance, in the "common" code path.
> > 2. The performance for copying up to 64 bytes is improved.
> > The memcpy performance test shows cache-to-cache copying of up to 32
> > bytes now typically only takes 2 cycles (4 cycles for 64 bytes) versus
> > ca. 6.5 cycles before this patch.
> >
> > And finally, the missing implementation of rte_mov48() was added.
> >
> > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> > ---
> > v5:
> > * Reverted v4: Replace SSE2 _mm_loadu_si128() with SSE3
> > _mm_lddqu_si128().
> >   It was slower.
> > * Improved some comments. (Konstantin Ananyev)
> > * Moved the size range 17..32 inside the size <= 64 branch, so when
> >   building for SSE, the generated code can start copying the first
> >   16 bytes before comparing if the size is greater than 32 or not.
> > * Just require RTE_MEMCPY_AVX for using rte_mov32() in
> > rte_mov33_to_64().
> > v4:
> > * Replace SSE2 _mm_loadu_si128() with SSE3 _mm_lddqu_si128().
> > v3:
> > * Fixed typo in comment.
> > v2:
> > * Updated patch title to reflect that the performance is improved.
> > * Use the design pattern of two overlapping stores for small copies
> > too.
> > * Expanded first branch from size < 16 to size <= 16.
> > * Handle more build time constant copy sizes.
> > ---
> >  lib/eal/x86/include/rte_memcpy.h | 346
> > +++++++++++++++++--------------
> >  1 file changed, 186 insertions(+), 160 deletions(-)
> >
> > diff --git a/lib/eal/x86/include/rte_memcpy.h
> > b/lib/eal/x86/include/rte_memcpy.h
> > index 46d34b8081..04d1a474d1 100644
> > --- a/lib/eal/x86/include/rte_memcpy.h
> > +++ b/lib/eal/x86/include/rte_memcpy.h
> > @@ -55,52 +55,6 @@ extern "C" {
> >  static __rte_always_inline void *
> >  rte_memcpy(void *dst, const void *src, size_t n);
> >
> > -/**
> > - * Copy bytes from one location to another,
> > - * locations should not overlap.
> > - * Use with n <= 15.
> > - */
> > -static __rte_always_inline void *
> > -rte_mov15_or_less(void *dst, const void *src, size_t n)
> > -{
> > -     /**
> > -      * Use the following structs to avoid violating C standard
> > -      * alignment requirements and to avoid strict aliasing bugs
> > -      */
> > -     struct __rte_packed_begin rte_uint64_alias {
> > -             uint64_t val;
> > -     } __rte_packed_end __rte_may_alias;
> > -     struct __rte_packed_begin rte_uint32_alias {
> > -             uint32_t val;
> > -     } __rte_packed_end __rte_may_alias;
> > -     struct __rte_packed_begin rte_uint16_alias {
> > -             uint16_t val;
> > -     } __rte_packed_end __rte_may_alias;
> > -
> > -     void *ret = dst;
> > -     if (n & 8) {
> > -             ((struct rte_uint64_alias *)dst)->val =
> > -                     ((const struct rte_uint64_alias *)src)->val;
> > -             src = (const uint64_t *)src + 1;
> > -             dst = (uint64_t *)dst + 1;
> > -     }
> > -     if (n & 4) {
> > -             ((struct rte_uint32_alias *)dst)->val =
> > -                     ((const struct rte_uint32_alias *)src)->val;
> > -             src = (const uint32_t *)src + 1;
> > -             dst = (uint32_t *)dst + 1;
> > -     }
> > -     if (n & 2) {
> > -             ((struct rte_uint16_alias *)dst)->val =
> > -                     ((const struct rte_uint16_alias *)src)->val;
> > -             src = (const uint16_t *)src + 1;
> > -             dst = (uint16_t *)dst + 1;
> > -     }
> > -     if (n & 1)
> > -             *(uint8_t *)dst = *(const uint8_t *)src;
> > -     return ret;
> > -}
> > -
> >  /**
> >   * Copy 16 bytes from one location to another,
> >   * locations should not overlap.
> > @@ -132,6 +86,23 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
> >  #endif
> >  }
> >
> > +/**
> > + * Copy 48 bytes from one location to another,
> > + * locations should not overlap.
> > + */
> > +static __rte_always_inline void
> > +rte_mov48(uint8_t *dst, const uint8_t *src)
> > +{
> > +#if defined RTE_MEMCPY_AVX
> > +     rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > +     rte_mov32((uint8_t *)dst - 32 + 48, (const uint8_t *)src - 32 +
> > 48);
> > +#else /* SSE implementation */
> > +     rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 *
> > 16);
> > +     rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 *
> > 16);
> > +     rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 *
> > 16);
> > +#endif
> > +}
> > +
> >  /**
> >   * Copy 64 bytes from one location to another,
> >   * locations should not overlap.
> > @@ -172,6 +143,137 @@ rte_mov256(uint8_t *dst, const uint8_t *src)
> >       rte_mov128(dst + 1 * 128, src + 1 * 128);
> >  }
> >
> > +/**
> > + * Copy bytes from one location to another,
> > + * locations should not overlap.
> > + * Use with n <= 16.
> > + *
> > + * Note: Copying uninitialized memory is perfectly acceptable.
> > + * Using e.g. memcpy(dst, src, 8) instead of
> > + * *(unaligned_uint64_t*) = *(const unaligned_uint64_t *)src
> > + * avoids compiler warnings about source data may be uninitialized
> > + * [-Wmaybe-uninitialized].
> > + */
> > +static __rte_always_inline void *
> > +rte_mov16_or_less(void *dst, const void *src, size_t n)
> > +{
> > +     /* Faster way when size is known at build time. */
> > +     if (__rte_constant(n)) {
> > +             if (n == 2)
> > +                     return memcpy(dst, src, 2);
> > +             if (n == 4)
> > +                     return memcpy(dst, src, 4);
> > +             if (n == 6) /* 4 + 2 */
> > +                     return memcpy(dst, src, 6);
> > +             if (n == 8)
> > +                     return memcpy(dst, src, 8);
> > +             if (n == 10) /* 8 + 2 */
> > +                     return memcpy(dst, src, 10);
> > +             if (n == 12) /* 8 + 4 */
> > +                     return memcpy(dst, src, 12);
> > +             if (n == 16) {
> > +                     rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > +                     return dst;
> > +             }
> > +     }
> > +
> > +     /*
> > +      * Note: Using "n & X" generates 3-byte "test" instructions,
> > +      * instead of "n >= X", which would generate 4-byte "cmp"
> > instructions.
> > +      */
> > +     if (n & 0x18) { /* n >= 8, including n == 0x10, hence n & 0x18.
> > */
> > +             /* Copy 8 ~ 16 bytes. */
> > +             memcpy(dst, src, 8);
> > +             memcpy((uint8_t *)dst - 8 + n, (const uint8_t *)src - 8 + n, 8);
> > +     } else if (n & 0x4) {
> > +             /* Copy 4 ~ 7 bytes. */
> > +             memcpy(dst, src, 4);
> > +             memcpy((uint8_t *)dst - 4 + n, (const uint8_t *)src - 4 + n, 4);
> > +     } else if (n & 0x2) {
> > +             /* Copy 2 ~ 3 bytes. */
> > +             memcpy(dst, src, 2);
> > +             memcpy((uint8_t *)dst - 2 + n, (const uint8_t *)src - 2 + n, 2);
> > +     } else if (n & 0x1) {
> > +             /* Copy 1 byte. */
> > +             memcpy(dst, src, 1);
> > +     }
> > +     return dst;
> > +}
> > +
> > +/**
> > + * Copy bytes from one location to another,
> > + * locations should not overlap.
> > + * Use with 17 (or 16) <= n <= 32.
> > + */
> > +static __rte_always_inline void *
> > +rte_mov17_to_32(void *dst, const void *src, size_t n)
> > +{
> > +     /* Faster way when size is known at build time. */
> > +     if (__rte_constant(n)) {
> > +             if (n == 16) {
> > +                     rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > +                     return dst;
> > +             }
> > +             if (n == 18) /* 16 + 2 */
> > +                     return memcpy(dst, src, 18);
> > +             if (n == 20) /* 16 + 4 */
> > +                     return memcpy(dst, src, 20);
> > +             if (n == 24) /* 16 + 8 */
> > +                     return memcpy(dst, src, 24);
> > +             if (n == 32) {
> > +                     rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > +                     return dst;
> > +             }
> > +     }
> > +
> > +     /* Copy 17 (or 16) ~ 32 bytes. */
> > +     rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > +     rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 +
> > n);
> > +     return dst;
> > +}
> > +
> > +/**
> > + * Copy bytes from one location to another,
> > + * locations should not overlap.
> > + * Use with 33 (or 32) <= n <= 64.
> > + */
> > +static __rte_always_inline void *
> > +rte_mov33_to_64(void *dst, const void *src, size_t n)
> > +{
> > +     /* Faster way when size is known at build time. */
> > +     if (__rte_constant(n)) {
> > +             if (n == 32) {
> > +                     rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > +                     return dst;
> > +             }
> > +             if (n == 36) /* 32 + 4 */
> > +                     return memcpy(dst, src, 36);
> > +             if (n == 40) /* 32 + 8 */
> > +                     return memcpy(dst, src, 40);
> > +             if (n == 48) {
> > +                     rte_mov48((uint8_t *)dst, (const uint8_t *)src);
> > +                     return dst;
> > +             }
> > +             if (n == 64) {
> > +                     rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> > +                     return dst;
> > +             }
> > +     }
> > +
> > +     /* Copy 33 (or 32) ~ 64 bytes. */
> > +#if defined RTE_MEMCPY_AVX
> > +     rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > +     rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 +
> > n);
> > +#else /* SSE implementation */
> > +     rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 *
> > 16);
> > +     rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 *
> > 16);
> > +     if (n > 48)
> > +             rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> > +     rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 +
> > n);
> > +#endif
> > +     return dst;
> > +}
> > +
> >  #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> >
> >  /**
> > @@ -232,45 +334,21 @@ rte_mov512blocks(uint8_t *dst, const uint8_t
> > *src, size_t n)
> >       }
> >  }
> >
> > +/**
> > + * Copy bytes from one location to another,
> > + * locations should not overlap.
> > + * Use with n > 64.
> > + */
> >  static __rte_always_inline void *
> > -rte_memcpy_generic(void *dst, const void *src, size_t n)
> > +rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
> >  {
> >       void *ret = dst;
> >       size_t dstofss;
> >       size_t bits;
> >
> > -     /**
> > -      * Copy less than 16 bytes
> > -      */
> > -     if (n < 16) {
> > -             return rte_mov15_or_less(dst, src, n);
> > -     }
> > -
> >       /**
> >        * Fast way when copy size doesn't exceed 512 bytes
> >        */
> > -     if (__rte_constant(n) && n == 32) {
> > -             rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -             return ret;
> > -     }
> > -     if (n <= 32) {
> > -             rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > -             if (__rte_constant(n) && n == 16)
> > -                     return ret; /* avoid (harmless) duplicate copy */
> > -             rte_mov16((uint8_t *)dst - 16 + n,
> > -                               (const uint8_t *)src - 16 + n);
> > -             return ret;
> > -     }
> > -     if (__rte_constant(n) && n == 64) {
> > -             rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> > -             return ret;
> > -     }
> > -     if (n <= 64) {
> > -             rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -             rte_mov32((uint8_t *)dst - 32 + n,
> > -                               (const uint8_t *)src - 32 + n);
> > -             return ret;
> > -     }
> >       if (n <= 512) {
> >               if (n >= 256) {
> >                       n -= 256;
> > @@ -381,41 +459,21 @@ rte_mov128blocks(uint8_t *dst, const uint8_t
> > *src, size_t n)
> >       }
> >  }
> >
> > +/**
> > + * Copy bytes from one location to another,
> > + * locations should not overlap.
> > + * Use with n > 64.
> > + */
> >  static __rte_always_inline void *
> > -rte_memcpy_generic(void *dst, const void *src, size_t n)
> > +rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
> >  {
> >       void *ret = dst;
> >       size_t dstofss;
> >       size_t bits;
> >
> > -     /**
> > -      * Copy less than 16 bytes
> > -      */
> > -     if (n < 16) {
> > -             return rte_mov15_or_less(dst, src, n);
> > -     }
> > -
> >       /**
> >        * Fast way when copy size doesn't exceed 256 bytes
> >        */
> > -     if (__rte_constant(n) && n == 32) {
> > -             rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -             return ret;
> > -     }
> > -     if (n <= 32) {
> > -             rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > -             if (__rte_constant(n) && n == 16)
> > -                     return ret; /* avoid (harmless) duplicate copy */
> > -             rte_mov16((uint8_t *)dst - 16 + n,
> > -                             (const uint8_t *)src - 16 + n);
> > -             return ret;
> > -     }
> > -     if (n <= 64) {
> > -             rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -             rte_mov32((uint8_t *)dst - 32 + n,
> > -                             (const uint8_t *)src - 32 + n);
> > -             return ret;
> > -     }
> >       if (n <= 256) {
> >               if (n >= 128) {
> >                       n -= 128;
> > @@ -573,38 +631,22 @@ rte_memcpy_generic(void *dst, const void *src,
> > size_t n)
> >      }
> > \
> >  }
> >
> > +/**
> > + * Copy bytes from one location to another,
> > + * locations should not overlap.
> > + * Use with n > 64.
> > + */
> >  static __rte_always_inline void *
> > -rte_memcpy_generic(void *dst, const void *src, size_t n)
> > +rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
> >  {
> >       __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
> >       void *ret = dst;
> >       size_t dstofss;
> >       size_t srcofs;
> >
> > -     /**
> > -      * Copy less than 16 bytes
> > -      */
> > -     if (n < 16) {
> > -             return rte_mov15_or_less(dst, src, n);
> > -     }
> > -
> >       /**
> >        * Fast way when copy size doesn't exceed 512 bytes
> >        */
> > -     if (n <= 32) {
> > -             rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > -             if (__rte_constant(n) && n == 16)
> > -                     return ret; /* avoid (harmless) duplicate copy */
> > -             rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src -
> > 16 + n);
> > -             return ret;
> > -     }
> > -     if (n <= 64) {
> > -             rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -             if (n > 48)
> > -                     rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src +
> > 32);
> > -             rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src -
> > 16 + n);
> > -             return ret;
> > -     }
> >       if (n <= 128) {
> >               goto COPY_BLOCK_128_BACK15;
> >       }
> > @@ -696,44 +738,16 @@ rte_memcpy_generic(void *dst, const void *src,
> > size_t n)
> >
> >  #endif /* __AVX512F__ */
> >
> > +/**
> > + * Copy bytes from one vector register size aligned location to
> > another,
> > + * locations should not overlap.
> > + * Use with n > 64.
> > + */
> >  static __rte_always_inline void *
> > -rte_memcpy_aligned(void *dst, const void *src, size_t n)
> > +rte_memcpy_aligned_more_than_64(void *dst, const void *src, size_t n)
> >  {
> >       void *ret = dst;
> >
> > -     /* Copy size < 16 bytes */
> > -     if (n < 16) {
> > -             return rte_mov15_or_less(dst, src, n);
> > -     }
> > -
> > -     /* Copy 16 <= size <= 32 bytes */
> > -     if (__rte_constant(n) && n == 32) {
> > -             rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -             return ret;
> > -     }
> > -     if (n <= 32) {
> > -             rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > -             if (__rte_constant(n) && n == 16)
> > -                     return ret; /* avoid (harmless) duplicate copy */
> > -             rte_mov16((uint8_t *)dst - 16 + n,
> > -                             (const uint8_t *)src - 16 + n);
> > -
> > -             return ret;
> > -     }
> > -
> > -     /* Copy 32 < size <= 64 bytes */
> > -     if (__rte_constant(n) && n == 64) {
> > -             rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> > -             return ret;
> > -     }
> > -     if (n <= 64) {
> > -             rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -             rte_mov32((uint8_t *)dst - 32 + n,
> > -                             (const uint8_t *)src - 32 + n);
> > -
> > -             return ret;
> > -     }
> > -
> >       /* Copy 64 bytes blocks */
> >       for (; n > 64; n -= 64) {
> >               rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> > @@ -751,10 +765,22 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
> >  static __rte_always_inline void *
> >  rte_memcpy(void *dst, const void *src, size_t n)
> >  {
> > +     /* Common implementation for size <= 64 bytes. */
> > +     if (n <= 16)
> > +             return rte_mov16_or_less(dst, src, n);
> > +     if (n <= 64) {
> > +             /* Copy 17 ~ 64 bytes using vector instructions. */
> > +             if (n <= 32)
> > +                     return rte_mov17_to_32(dst, src, n);
> > +             else
> > +                     return rte_mov33_to_64(dst, src, n);
> > +     }
> > +
> > +     /* Implementation for size > 64 bytes depends on alignment with
> > vector register size. */
> >       if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
> > -             return rte_memcpy_aligned(dst, src, n);
> > +             return rte_memcpy_aligned_more_than_64(dst, src, n);
> >       else
> > -             return rte_memcpy_generic(dst, src, n);
> > +             return rte_memcpy_generic_more_than_64(dst, src, n);
> >  }
> >
> >  #undef ALIGNMENT_MASK
> > --
> > 2.43.0


^ permalink raw reply	[flat|nested] 55+ messages in thread

* RE: [PATCH v5] eal/x86: optimize memcpy of small sizes
  2026-01-03 17:53   ` Morten Brørup
  2026-01-09 15:05     ` Varghese, Vipin
@ 2026-01-11 15:52     ` Konstantin Ananyev
  2026-01-11 16:01       ` Stephen Hemminger
  2026-01-12  8:02       ` Morten Brørup
  1 sibling, 2 replies; 55+ messages in thread
From: Konstantin Ananyev @ 2026-01-11 15:52 UTC (permalink / raw)
  To: Morten Brørup, dev@dpdk.org, Bruce Richardson,
	Konstantin Ananyev, Vipin Varghese
  Cc: Stephen Hemminger



> 
> PING for review.
> 
> Regardless if you - for good and valid reasons - generally prefer memcpy() over
> rte_memcpy(), rte_memcpy() is not deprecated or being phased out, so
> improvements should be allowed.
> 
> 
> > From: Morten Brørup [mailto:mb@smartsharesystems.com]
> > Sent: Monday, 1 December 2025 16.55
> >
> > eal/x86: optimize memcpy of small sizes
> >
> > The implementation for copying up to 64 bytes does not depend on
> > address
> > alignment with the size of the CPU's vector registers, so the code
> > handling this was moved from the various implementations to the common
> > function.
> >
> > Furthermore, the function for copying less than 16 bytes was replaced
> > with
> > a smarter implementation using fewer branches and potentially fewer
> > load/store operations.
> > This function was also extended to handle copying of up to 16 bytes,
> > instead of up to 15 bytes. This small extension reduces the code path
> > for
> > copying two pointers.
> >
> > These changes provide two benefits:
> > 1. The memory footprint of the copy function is reduced.
> > Previously there were two instances of the compiled code to copy up to
> > 64
> > bytes, one in the "aligned" code path, and one in the "generic" code
> > path.
> > Now there is only one instance, in the "common" code path.
> > 2. The performance for copying up to 64 bytes is improved.
> > The memcpy performance test shows cache-to-cache copying of up to 32
> > bytes
> > now typically only takes 2 cycles (4 cycles for 64 bytes) versus
> > ca. 6.5 cycles before this patch.
> >
> > And finally, the missing implementation of rte_mov48() was added.
> >
> > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> > ---
> > v5:
> > * Reverted v4: Replace SSE2 _mm_loadu_si128() with SSE3
> > _mm_lddqu_si128().
> >   It was slower.
> > * Improved some comments. (Konstantin Ananyev)
> > * Moved the size range 17..32 inside the size <= 64 branch, so when
> >   building for SSE, the generated code can start copying the first
> >   16 bytes before comparing if the size is greater than 32 or not.
> > * Just require RTE_MEMCPY_AVX for using rte_mov32() in
> > rte_mov33_to_64().
> > v4:
> > * Replace SSE2 _mm_loadu_si128() with SSE3 _mm_lddqu_si128().
> > v3:
> > * Fixed typo in comment.
> > v2:
> > * Updated patch title to reflect that the performance is improved.
> > * Use the design pattern of two overlapping stores for small copies
> > too.
> > * Expanded first branch from size < 16 to size <= 16.
> > * Handle more build time constant copy sizes.
> > ---
> >  lib/eal/x86/include/rte_memcpy.h | 346 +++++++++++++++++--------------
> >  1 file changed, 186 insertions(+), 160 deletions(-)
> >
> > diff --git a/lib/eal/x86/include/rte_memcpy.h
> > b/lib/eal/x86/include/rte_memcpy.h
> > index 46d34b8081..04d1a474d1 100644
> > --- a/lib/eal/x86/include/rte_memcpy.h
> > +++ b/lib/eal/x86/include/rte_memcpy.h
> > @@ -55,52 +55,6 @@ extern "C" {
> >  static __rte_always_inline void *
> >  rte_memcpy(void *dst, const void *src, size_t n);
> >
> > -/**
> > - * Copy bytes from one location to another,
> > - * locations should not overlap.
> > - * Use with n <= 15.
> > - */
> > -static __rte_always_inline void *
> > -rte_mov15_or_less(void *dst, const void *src, size_t n)
> > -{
> > -	/**
> > -	 * Use the following structs to avoid violating C standard
> > -	 * alignment requirements and to avoid strict aliasing bugs
> > -	 */
> > -	struct __rte_packed_begin rte_uint64_alias {
> > -		uint64_t val;
> > -	} __rte_packed_end __rte_may_alias;
> > -	struct __rte_packed_begin rte_uint32_alias {
> > -		uint32_t val;
> > -	} __rte_packed_end __rte_may_alias;
> > -	struct __rte_packed_begin rte_uint16_alias {
> > -		uint16_t val;
> > -	} __rte_packed_end __rte_may_alias;
> > -
> > -	void *ret = dst;
> > -	if (n & 8) {
> > -		((struct rte_uint64_alias *)dst)->val =
> > -			((const struct rte_uint64_alias *)src)->val;
> > -		src = (const uint64_t *)src + 1;
> > -		dst = (uint64_t *)dst + 1;
> > -	}
> > -	if (n & 4) {
> > -		((struct rte_uint32_alias *)dst)->val =
> > -			((const struct rte_uint32_alias *)src)->val;
> > -		src = (const uint32_t *)src + 1;
> > -		dst = (uint32_t *)dst + 1;
> > -	}
> > -	if (n & 2) {
> > -		((struct rte_uint16_alias *)dst)->val =
> > -			((const struct rte_uint16_alias *)src)->val;
> > -		src = (const uint16_t *)src + 1;
> > -		dst = (uint16_t *)dst + 1;
> > -	}
> > -	if (n & 1)
> > -		*(uint8_t *)dst = *(const uint8_t *)src;
> > -	return ret;
> > -}
> > -
> >  /**
> >   * Copy 16 bytes from one location to another,
> >   * locations should not overlap.
> > @@ -132,6 +86,23 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
> >  #endif
> >  }
> >
> > +/**
> > + * Copy 48 bytes from one location to another,
> > + * locations should not overlap.
> > + */
> > +static __rte_always_inline void
> > +rte_mov48(uint8_t *dst, const uint8_t *src)
> > +{
> > +#if defined RTE_MEMCPY_AVX
> > +	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > +	rte_mov32((uint8_t *)dst - 32 + 48, (const uint8_t *)src - 32 +
> > 48);

Just a thought: would the compiler and CPU be smart enough to realize
that there is no dependency between these 2 ops, and that they can be executed in any
order?
Maybe do mov32(); mov16() instead?
Again, I didn't test anything, just a thought.

> > +#else /* SSE implementation */
> > +	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 *
> > 16);
> > +	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 *
> > 16);
> > +	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 *
> > 16);
> > +#endif
> > +}
> > +
> >  /**
> >   * Copy 64 bytes from one location to another,
> >   * locations should not overlap.
> > @@ -172,6 +143,137 @@ rte_mov256(uint8_t *dst, const uint8_t *src)
> >  	rte_mov128(dst + 1 * 128, src + 1 * 128);
> >  }
> >
> > +/**
> > + * Copy bytes from one location to another,
> > + * locations should not overlap.
> > + * Use with n <= 16.
> > + *
> > + * Note: Copying uninitialized memory is perfectly acceptable.
> > + * Using e.g. memcpy(dst, src, 8) instead of
> > + * *(unaligned_uint64_t*) = *(const unaligned_uint64_t *)src
> > + * avoids compiler warnings about source data may be uninitialized
> > + * [-Wmaybe-uninitialized].
> > + */
> > +static __rte_always_inline void *
> > +rte_mov16_or_less(void *dst, const void *src, size_t n)
> > +{
> > +	/* Faster way when size is known at build time. */
> > +	if (__rte_constant(n)) {
> > +		if (n == 2)
> > +			return memcpy(dst, src, 2);
> > +		if (n == 4)
> > +			return memcpy(dst, src, 4);
> > +		if (n == 6) /* 4 + 2 */
> > +			return memcpy(dst, src, 6);
> > +		if (n == 8)
> > +			return memcpy(dst, src, 8);
> > +		if (n == 10) /* 8 + 2 */
> > +			return memcpy(dst, src, 10);
> > +		if (n == 12) /* 8 + 4 */
> > +			return memcpy(dst, src, 12);
> > +		if (n == 16) {
> > +			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > +			return dst;
> > +		}

If n is constant, wouldn't the compiler unroll such a memcpy itself?
Especially for such small (<=16) values?
I mean, can't we just do:
if (n < 16) memcpy(dst, src, n); else rte_mov16(dst, src);

> > +	}
> > +
> > +	/*
> > +	 * Note: Using "n & X" generates 3-byte "test" instructions,
> > +	 * instead of "n >= X", which would generate 4-byte "cmp"
> > instructions.
> > +	 */
> > +	if (n & 0x18) { /* n >= 8, including n == 0x10, hence n & 0x18.
> > */
> > +		/* Copy 8 ~ 16 bytes. */
> > +		memcpy(dst, src, 8);
> > +		memcpy((uint8_t *)dst - 8 + n, (const uint8_t *)src - 8 +
> > n, 8);
> > +	} else if (n & 0x4) {
> > +		/* Copy 4 ~ 7 bytes. */
> > +		memcpy(dst, src, 4);
> > +		memcpy((uint8_t *)dst - 4 + n, (const uint8_t *)src - 4 +
> > n, 4);
> > +	} else if (n & 0x2) {
> > +		/* Copy 2 ~ 3 bytes. */
> > +		memcpy(dst, src, 2);
> > +		memcpy((uint8_t *)dst - 2 + n, (const uint8_t *)src - 2 +
> > n, 2);
> > +	} else if (n & 0x1) {
> > +		/* Copy 1 byte. */
> > +		memcpy(dst, src, 1);
> > +	}
> > +	return dst;
> > +}
> > +
> > +/**
> > + * Copy bytes from one location to another,
> > + * locations should not overlap.
> > + * Use with 17 (or 16) < n <= 32.
> > + */
> > +static __rte_always_inline void *
> > +rte_mov17_to_32(void *dst, const void *src, size_t n)
> > +{
> > +	/* Faster way when size is known at build time. */
> > +	if (__rte_constant(n)) {
> > +		if (n == 16) {
> > +			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > +			return dst;
> > +		}
> > +		if (n == 18) /* 16 + 2 */
> > +			return memcpy(dst, src, 18);
> > +		if (n == 20) /* 16 + 4 */
> > +			return memcpy(dst, src, 20);
> > +		if (n == 24) /* 16 + 8 */
> > +			return memcpy(dst, src, 24);
> > +		if (n == 32) {
> > +			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > +			return dst;
> > +		}
> > +	}
> > +
> > +	/* Copy 17 (or 16) ~ 32 bytes. */
> > +	rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > +	rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 +
> > n);
> > +	return dst;
> > +}
> > +
> > +/**
> > + * Copy bytes from one location to another,
> > + * locations should not overlap.
> > + * Use with 33 (or 32) < n <= 64.
> > + */
> > +static __rte_always_inline void *
> > +rte_mov33_to_64(void *dst, const void *src, size_t n)
> > +{
> > +	/* Faster way when size is known at build time. */
> > +	if (__rte_constant(n)) {
> > +		if (n == 32) {
> > +			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > +			return dst;
> > +		}
> > +		if (n == 36) /* 32 + 4 */
> > +			return memcpy(dst, src, 36);
> > +		if (n == 40) /* 32 + 8 */
> > +			return memcpy(dst, src, 40);
> > +		if (n == 48) {
> > +			rte_mov48((uint8_t *)dst, (const uint8_t *)src);
> > +			return dst;
> > +		}
> > +		if (n == 64) {
> > +			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> > +			return dst;
> > +		}
> > +	}
> > +
> > +	/* Copy 33 (or 32) ~ 64 bytes. */
> > +#if defined RTE_MEMCPY_AVX
> > +	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > +	rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 +
> > n);
> > +#else /* SSE implementation */
> > +	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 *
> > 16);
> > +	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 *
> > 16);
> > +	if (n > 48)
> > +		rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2
> > * 16);
> > +	rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 +
> > n);
> > +#endif
> > +	return dst;
> > +}
> > +
> >  #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> >
> >  /**
> > @@ -232,45 +334,21 @@ rte_mov512blocks(uint8_t *dst, const uint8_t
> > *src, size_t n)
> >  	}
> >  }
> >
> > +/**
> > + * Copy bytes from one location to another,
> > + * locations should not overlap.
> > + * Use with n > 64.
> > + */
> >  static __rte_always_inline void *
> > -rte_memcpy_generic(void *dst, const void *src, size_t n)
> > +rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
> >  {
> >  	void *ret = dst;
> >  	size_t dstofss;
> >  	size_t bits;
> >
> > -	/**
> > -	 * Copy less than 16 bytes
> > -	 */
> > -	if (n < 16) {
> > -		return rte_mov15_or_less(dst, src, n);
> > -	}
> > -
> >  	/**
> >  	 * Fast way when copy size doesn't exceed 512 bytes
> >  	 */
> > -	if (__rte_constant(n) && n == 32) {
> > -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -		return ret;
> > -	}
> > -	if (n <= 32) {
> > -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > -		if (__rte_constant(n) && n == 16)
> > -			return ret; /* avoid (harmless) duplicate copy */
> > -		rte_mov16((uint8_t *)dst - 16 + n,
> > -				  (const uint8_t *)src - 16 + n);
> > -		return ret;
> > -	}
> > -	if (__rte_constant(n) && n == 64) {
> > -		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> > -		return ret;
> > -	}
> > -	if (n <= 64) {
> > -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -		rte_mov32((uint8_t *)dst - 32 + n,
> > -				  (const uint8_t *)src - 32 + n);
> > -		return ret;
> > -	}
> >  	if (n <= 512) {
> >  		if (n >= 256) {
> >  			n -= 256;
> > @@ -381,41 +459,21 @@ rte_mov128blocks(uint8_t *dst, const uint8_t
> > *src, size_t n)
> >  	}
> >  }
> >
> > +/**
> > + * Copy bytes from one location to another,
> > + * locations should not overlap.
> > + * Use with n > 64.
> > + */
> >  static __rte_always_inline void *
> > -rte_memcpy_generic(void *dst, const void *src, size_t n)
> > +rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
> >  {
> >  	void *ret = dst;
> >  	size_t dstofss;
> >  	size_t bits;
> >
> > -	/**
> > -	 * Copy less than 16 bytes
> > -	 */
> > -	if (n < 16) {
> > -		return rte_mov15_or_less(dst, src, n);
> > -	}
> > -
> >  	/**
> >  	 * Fast way when copy size doesn't exceed 256 bytes
> >  	 */
> > -	if (__rte_constant(n) && n == 32) {
> > -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -		return ret;
> > -	}
> > -	if (n <= 32) {
> > -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > -		if (__rte_constant(n) && n == 16)
> > -			return ret; /* avoid (harmless) duplicate copy */
> > -		rte_mov16((uint8_t *)dst - 16 + n,
> > -				(const uint8_t *)src - 16 + n);
> > -		return ret;
> > -	}
> > -	if (n <= 64) {
> > -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -		rte_mov32((uint8_t *)dst - 32 + n,
> > -				(const uint8_t *)src - 32 + n);
> > -		return ret;
> > -	}
> >  	if (n <= 256) {
> >  		if (n >= 128) {
> >  			n -= 128;
> > @@ -573,38 +631,22 @@ rte_memcpy_generic(void *dst, const void *src,
> > size_t n)
> >      }
> > \
> >  }
> >
> > +/**
> > + * Copy bytes from one location to another,
> > + * locations should not overlap.
> > + * Use with n > 64.
> > + */
> >  static __rte_always_inline void *
> > -rte_memcpy_generic(void *dst, const void *src, size_t n)
> > +rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
> >  {
> >  	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
> >  	void *ret = dst;
> >  	size_t dstofss;
> >  	size_t srcofs;
> >
> > -	/**
> > -	 * Copy less than 16 bytes
> > -	 */
> > -	if (n < 16) {
> > -		return rte_mov15_or_less(dst, src, n);
> > -	}
> > -
> >  	/**
> >  	 * Fast way when copy size doesn't exceed 512 bytes
> >  	 */
> > -	if (n <= 32) {
> > -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > -		if (__rte_constant(n) && n == 16)
> > -			return ret; /* avoid (harmless) duplicate copy */
> > -		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src -
> > 16 + n);
> > -		return ret;
> > -	}
> > -	if (n <= 64) {
> > -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -		if (n > 48)
> > -			rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src +
> > 32);
> > -		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src -
> > 16 + n);
> > -		return ret;
> > -	}
> >  	if (n <= 128) {
> >  		goto COPY_BLOCK_128_BACK15;
> >  	}
> > @@ -696,44 +738,16 @@ rte_memcpy_generic(void *dst, const void *src,
> > size_t n)
> >
> >  #endif /* __AVX512F__ */
> >
> > +/**
> > + * Copy bytes from one vector register size aligned location to
> > another,
> > + * locations should not overlap.
> > + * Use with n > 64.
> > + */
> >  static __rte_always_inline void *
> > -rte_memcpy_aligned(void *dst, const void *src, size_t n)
> > +rte_memcpy_aligned_more_than_64(void *dst, const void *src, size_t n)
> >  {
> >  	void *ret = dst;
> >
> > -	/* Copy size < 16 bytes */
> > -	if (n < 16) {
> > -		return rte_mov15_or_less(dst, src, n);
> > -	}
> > -
> > -	/* Copy 16 <= size <= 32 bytes */
> > -	if (__rte_constant(n) && n == 32) {
> > -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -		return ret;
> > -	}
> > -	if (n <= 32) {
> > -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > -		if (__rte_constant(n) && n == 16)
> > -			return ret; /* avoid (harmless) duplicate copy */
> > -		rte_mov16((uint8_t *)dst - 16 + n,
> > -				(const uint8_t *)src - 16 + n);
> > -
> > -		return ret;
> > -	}
> > -
> > -	/* Copy 32 < size <= 64 bytes */
> > -	if (__rte_constant(n) && n == 64) {
> > -		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> > -		return ret;
> > -	}
> > -	if (n <= 64) {
> > -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -		rte_mov32((uint8_t *)dst - 32 + n,
> > -				(const uint8_t *)src - 32 + n);
> > -
> > -		return ret;
> > -	}
> > -
> >  	/* Copy 64 bytes blocks */
> >  	for (; n > 64; n -= 64) {
> >  		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> > @@ -751,10 +765,22 @@ rte_memcpy_aligned(void *dst, const void *src,
> > size_t n)
> >  static __rte_always_inline void *
> >  rte_memcpy(void *dst, const void *src, size_t n)
> >  {
> > +	/* Common implementation for size <= 64 bytes. */
> > +	if (n <= 16)
> > +		return rte_mov16_or_less(dst, src, n);
> > +	if (n <= 64) {
> > +		/* Copy 17 ~ 64 bytes using vector instructions. */
> > +		if (n <= 32)
> > +			return rte_mov17_to_32(dst, src, n);
> > +		else
> > +			return rte_mov33_to_64(dst, src, n);
> > +	}
> > +
> > +	/* Implementation for size > 64 bytes depends on alignment with
> > vector register size. */
> >  	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
> > -		return rte_memcpy_aligned(dst, src, n);
> > +		return rte_memcpy_aligned_more_than_64(dst, src, n);
> >  	else
> > -		return rte_memcpy_generic(dst, src, n);
> > +		return rte_memcpy_generic_more_than_64(dst, src, n);
> >  }
> >
> >  #undef ALIGNMENT_MASK
> > --
> > 2.43.0


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH v5] eal/x86: optimize memcpy of small sizes
  2026-01-11 15:52     ` Konstantin Ananyev
@ 2026-01-11 16:01       ` Stephen Hemminger
  2026-01-12  8:02       ` Morten Brørup
  1 sibling, 0 replies; 55+ messages in thread
From: Stephen Hemminger @ 2026-01-11 16:01 UTC (permalink / raw)
  To: Konstantin Ananyev
  Cc: Morten Brørup, dev@dpdk.org, Bruce Richardson,
	Konstantin Ananyev, Vipin Varghese

On Sun, 11 Jan 2026 15:52:28 +0000
Konstantin Ananyev <konstantin.ananyev@huawei.com> wrote:

> > > +	/* Faster way when size is known at build time. */
> > > +	if (__rte_constant(n)) {
> > > +		if (n == 2)
> > > +			return memcpy(dst, src, 2);
> > > +		if (n == 4)
> > > +			return memcpy(dst, src, 4);
> > > +		if (n == 6) /* 4 + 2 */
> > > +			return memcpy(dst, src, 6);
> > > +		if (n == 8)
> > > +			return memcpy(dst, src, 8);
> > > +		if (n == 10) /* 8 + 2 */
> > > +			return memcpy(dst, src, 10);
> > > +		if (n == 12) /* 8 + 4 */
> > > +			return memcpy(dst, src, 12);
> > > +		if (n == 16) {
> > > +			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > > +			return dst;
> > > +		}  
> 
> If n is constant; wouldn't compiler unroll such memcpy itself?
> Specially for such small (<=16) values?
> I mean. can't we just:
> If (n < 16) memcpy(dst, src, n); else rte_mov16(dst, src); 

Yes, it is better to just let the compiler do its own inlining for all
constant sizes.

^ permalink raw reply	[flat|nested] 55+ messages in thread

* RE: [PATCH v5] eal/x86: optimize memcpy of small sizes
  2026-01-11 15:52     ` Konstantin Ananyev
  2026-01-11 16:01       ` Stephen Hemminger
@ 2026-01-12  8:02       ` Morten Brørup
  2026-01-12 16:00         ` Scott Mitchell
  1 sibling, 1 reply; 55+ messages in thread
From: Morten Brørup @ 2026-01-12  8:02 UTC (permalink / raw)
  To: Konstantin Ananyev, dev, Bruce Richardson, Konstantin Ananyev,
	Vipin Varghese
  Cc: Stephen Hemminger

> > > -	/**
> > > -	 * Use the following structs to avoid violating C standard
> > > -	 * alignment requirements and to avoid strict aliasing bugs
> > > -	 */
> > > -	struct __rte_packed_begin rte_uint64_alias {
> > > -		uint64_t val;
> > > -	} __rte_packed_end __rte_may_alias;
> > > -	struct __rte_packed_begin rte_uint32_alias {
> > > -		uint32_t val;
> > > -	} __rte_packed_end __rte_may_alias;
> > > -	struct __rte_packed_begin rte_uint16_alias {
> > > -		uint16_t val;
> > > -	} __rte_packed_end __rte_may_alias;

The discussion about the optimized checksum function [1] has shown us that memcpy() sometimes prevents Clang from optimizing (loop unrolling and vectorizing) and potentially causes strict aliasing bugs with GCC, so I will work on a new patch version that keeps using the above types, instead of introducing memcpy() inside rte_memcpy().

[1]: https://inbox.dpdk.org/dev/CAFn2buBzBLFLVN-K=u3MgBEbQ-hqbgJLVpDx3vSXVKJpa0yPNg@mail.gmail.com/

> > > +static __rte_always_inline void
> > > +rte_mov48(uint8_t *dst, const uint8_t *src)
> > > +{
> > > +#if defined RTE_MEMCPY_AVX
> > > +	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > > +	rte_mov32((uint8_t *)dst - 32 + 48, (const uint8_t *)src - 32 +
> > > 48);
> 
> Just a s thought: would compiler and CPU be smart enough to realize
> that there is no dependency between these 2 ops, and they can be
> executed in any
> order?
> Might be do mov32(); mov16() instead?
> Again' didn't test anything, just a thought.

Good idea.
I simply copied what the existing AVX code did for copying 48 bytes, but I agree with your suggestion.

> 
> > > +#else /* SSE implementation */
> > > +	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 *
> > > 16);
> > > +	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 *
> > > 16);
> > > +	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 *
> > > 16);
> > > +#endif
> > > +}
> > > +
> > >  /**
> > >   * Copy 64 bytes from one location to another,
> > >   * locations should not overlap.
> > > @@ -172,6 +143,137 @@ rte_mov256(uint8_t *dst, const uint8_t *src)
> > >  	rte_mov128(dst + 1 * 128, src + 1 * 128);
> > >  }
> > >
> > > +/**
> > > + * Copy bytes from one location to another,
> > > + * locations should not overlap.
> > > + * Use with n <= 16.
> > > + *
> > > + * Note: Copying uninitialized memory is perfectly acceptable.
> > > + * Using e.g. memcpy(dst, src, 8) instead of
> > > + * *(unaligned_uint64_t*) = *(const unaligned_uint64_t *)src
> > > + * avoids compiler warnings about source data may be uninitialized
> > > + * [-Wmaybe-uninitialized].
> > > + */
> > > +static __rte_always_inline void *
> > > +rte_mov16_or_less(void *dst, const void *src, size_t n)
> > > +{
> > > +	/* Faster way when size is known at build time. */
> > > +	if (__rte_constant(n)) {
> > > +		if (n == 2)
> > > +			return memcpy(dst, src, 2);
> > > +		if (n == 4)
> > > +			return memcpy(dst, src, 4);
> > > +		if (n == 6) /* 4 + 2 */
> > > +			return memcpy(dst, src, 6);
> > > +		if (n == 8)
> > > +			return memcpy(dst, src, 8);
> > > +		if (n == 10) /* 8 + 2 */
> > > +			return memcpy(dst, src, 10);
> > > +		if (n == 12) /* 8 + 4 */
> > > +			return memcpy(dst, src, 12);
> > > +		if (n == 16) {
> > > +			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > > +			return dst;
> > > +		}
> 
> If n is constant; wouldn't compiler unroll such memcpy itself?
> Specially for such small (<=16) values?
> I mean. can't we just:
> If (n < 16) memcpy(dst, src, n); else rte_mov16(dst, src);

Unfortunately not. For e.g. n == 13, we want to use the trick with the overlapping copies, requiring only two 8-byte copy operations instead of three copy operations (8-byte + 4-byte + 1-byte).

> 
> > > +	}
> > > +
> > > +	/*
> > > +	 * Note: Using "n & X" generates 3-byte "test" instructions,
> > > +	 * instead of "n >= X", which would generate 4-byte "cmp"
> > > instructions.
> > > +	 */
> > > +	if (n & 0x18) { /* n >= 8, including n == 0x10, hence n & 0x18.
> > > */
> > > +		/* Copy 8 ~ 16 bytes. */
> > > +		memcpy(dst, src, 8);
> > > +		memcpy((uint8_t *)dst - 8 + n, (const uint8_t *)src - 8 +
> > > n, 8);
> > > +	} else if (n & 0x4) {
> > > +		/* Copy 4 ~ 7 bytes. */
> > > +		memcpy(dst, src, 4);
> > > +		memcpy((uint8_t *)dst - 4 + n, (const uint8_t *)src - 4 +
> > > n, 4);
> > > +	} else if (n & 0x2) {
> > > +		/* Copy 2 ~ 3 bytes. */
> > > +		memcpy(dst, src, 2);
> > > +		memcpy((uint8_t *)dst - 2 + n, (const uint8_t *)src - 2 +
> > > n, 2);
> > > +	} else if (n & 0x1) {
> > > +		/* Copy 1 byte. */
> > > +		memcpy(dst, src, 1);
> > > +	}
> > > +	return dst;
> > > +}


^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH v6] eal/x86: optimize memcpy of small sizes
  2025-11-20 11:45 [PATCH] eal/x86: reduce memcpy code duplication Morten Brørup
                   ` (4 preceding siblings ...)
  2025-12-01 15:55 ` [PATCH v5] " Morten Brørup
@ 2026-01-12 12:03 ` Morten Brørup
  2026-01-13 23:19   ` Stephen Hemminger
  2026-02-20 11:08 ` [PATCH v7] " Morten Brørup
                   ` (4 subsequent siblings)
  10 siblings, 1 reply; 55+ messages in thread
From: Morten Brørup @ 2026-01-12 12:03 UTC (permalink / raw)
  To: dev, Bruce Richardson, Konstantin Ananyev, Vipin Varghese
  Cc: Stephen Hemminger, Morten Brørup

The implementation for copying up to 64 bytes does not depend on address
alignment with the size of the CPU's vector registers, so the code
handling this was moved from the various implementations to the common
function.

Furthermore, the function for copying less than 16 bytes was replaced with
a smarter implementation using fewer branches and potentially fewer
load/store operations.
This function was also extended to handle copying of up to 16 bytes,
instead of up to 15 bytes. This small extension reduces the code path for
copying two pointers.

These changes provide two benefits:
1. The memory footprint of the copy function is reduced.
Previously there were two instances of the compiled code to copy up to 64
bytes, one in the "aligned" code path, and one in the "generic" code path.
Now there is only one instance, in the "common" code path.
2. The performance for copying up to 64 bytes is improved.
The memcpy performance test shows cache-to-cache copying of up to 32 bytes
now only takes 2 cycles (5 cycles for 64 bytes) versus ca. 6.5 cycles
before this patch.

And finally, the missing implementation of rte_mov48() was added.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
v6:
* Went back to using rte_uintN_alias structures for copying instead of
  using memcpy(). They were there for a reason.
  (Inspired by the discussion about optimizing the checksum function.)
* Removed note about copying uninitialized data.
* Added __rte_restrict to source and destination addresses.
  Updated function descriptions from "should" to "must" not overlap.
* Changed rte_mov48() AVX implementation to copy 32+16 bytes instead of
  copying 32 + 32 overlapping bytes. (Konstantin)
* Ignoring "-Wstringop-overflow" is not needed, so it was removed.
v5:
* Reverted v4: Replace SSE2 _mm_loadu_si128() with SSE3 _mm_lddqu_si128().
  It was slower.
* Improved some comments. (Konstantin Ananyev)
* Moved the size range 17..32 inside the size <= 64 branch, so when
  building for SSE, the generated code can start copying the first
  16 bytes before comparing if the size is greater than 32 or not.
* Just require RTE_MEMCPY_AVX for using rte_mov32() in rte_mov33_to_64().
v4:
* Replace SSE2 _mm_loadu_si128() with SSE3 _mm_lddqu_si128().
v3:
* Fixed typo in comment.
v2:
* Updated patch title to reflect that the performance is improved.
* Use the design pattern of two overlapping stores for small copies too.
* Expanded first branch from size < 16 to size <= 16.
* Handle more build time constant copy sizes.
---
 lib/eal/x86/include/rte_memcpy.h | 527 ++++++++++++++++++++-----------
 1 file changed, 349 insertions(+), 178 deletions(-)

diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 46d34b8081..e429865d21 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -22,11 +22,6 @@
 extern "C" {
 #endif
 
-#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wstringop-overflow"
-#endif
-
 /*
  * GCC older than version 11 doesn't compile AVX properly, so use SSE instead.
  * There are no problems with AVX2.
@@ -40,9 +35,6 @@ extern "C" {
 /**
  * Copy bytes from one location to another. The locations must not overlap.
  *
- * @note This is implemented as a macro, so it's address should not be taken
- * and care is needed as parameter expressions may be evaluated multiple times.
- *
  * @param dst
  *   Pointer to the destination of the data.
  * @param src
@@ -53,60 +45,78 @@ extern "C" {
  *   Pointer to the destination data.
  */
 static __rte_always_inline void *
-rte_memcpy(void *dst, const void *src, size_t n);
+rte_memcpy(void *__rte_restrict dst, const void *__rte_restrict src, size_t n);
 
 /**
- * Copy bytes from one location to another,
- * locations should not overlap.
- * Use with n <= 15.
+ * Copy 1 byte from one location to another,
+ * locations must not overlap.
  */
-static __rte_always_inline void *
-rte_mov15_or_less(void *dst, const void *src, size_t n)
+static __rte_always_inline void
+rte_mov1(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
+{
+	*dst = *src;
+}
+
+/**
+ * Copy 2 bytes from one location to another,
+ * locations must not overlap.
+ */
+static __rte_always_inline void
+rte_mov2(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
 {
 	/**
-	 * Use the following structs to avoid violating C standard
+	 * Use the following struct to avoid violating C standard
 	 * alignment requirements and to avoid strict aliasing bugs
 	 */
-	struct __rte_packed_begin rte_uint64_alias {
-		uint64_t val;
+	struct __rte_packed_begin rte_uint16_alias {
+		uint16_t val;
 	} __rte_packed_end __rte_may_alias;
+
+	((struct rte_uint16_alias *)dst)->val = ((const struct rte_uint16_alias *)src)->val;
+}
+
+/**
+ * Copy 4 bytes from one location to another,
+ * locations must not overlap.
+ */
+static __rte_always_inline void
+rte_mov4(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
+{
+	/**
+	 * Use the following struct to avoid violating C standard
+	 * alignment requirements and to avoid strict aliasing bugs
+	 */
 	struct __rte_packed_begin rte_uint32_alias {
 		uint32_t val;
 	} __rte_packed_end __rte_may_alias;
-	struct __rte_packed_begin rte_uint16_alias {
-		uint16_t val;
+
+	((struct rte_uint32_alias *)dst)->val = ((const struct rte_uint32_alias *)src)->val;
+}
+
+/**
+ * Copy 8 bytes from one location to another,
+ * locations must not overlap.
+ */
+static __rte_always_inline void
+rte_mov8(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
+{
+	/**
+	 * Use the following struct to avoid violating C standard
+	 * alignment requirements and to avoid strict aliasing bugs
+	 */
+	struct __rte_packed_begin rte_uint64_alias {
+		uint64_t val;
 	} __rte_packed_end __rte_may_alias;
 
-	void *ret = dst;
-	if (n & 8) {
-		((struct rte_uint64_alias *)dst)->val =
-			((const struct rte_uint64_alias *)src)->val;
-		src = (const uint64_t *)src + 1;
-		dst = (uint64_t *)dst + 1;
-	}
-	if (n & 4) {
-		((struct rte_uint32_alias *)dst)->val =
-			((const struct rte_uint32_alias *)src)->val;
-		src = (const uint32_t *)src + 1;
-		dst = (uint32_t *)dst + 1;
-	}
-	if (n & 2) {
-		((struct rte_uint16_alias *)dst)->val =
-			((const struct rte_uint16_alias *)src)->val;
-		src = (const uint16_t *)src + 1;
-		dst = (uint16_t *)dst + 1;
-	}
-	if (n & 1)
-		*(uint8_t *)dst = *(const uint8_t *)src;
-	return ret;
+	((struct rte_uint64_alias *)dst)->val = ((const struct rte_uint64_alias *)src)->val;
 }
 
 /**
  * Copy 16 bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
+rte_mov16(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
 {
 	__m128i xmm0;
 
@@ -116,10 +126,10 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
 
 /**
  * Copy 32 bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
+rte_mov32(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
 {
 #if defined RTE_MEMCPY_AVX
 	__m256i ymm0;
@@ -132,12 +142,29 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
 #endif
 }
 
+/**
+ * Copy 48 bytes from one location to another (32 + 16 bytes with AVX, 3 x 16 bytes otherwise),
+ * locations must not overlap.
+ */
+static __rte_always_inline void
+rte_mov48(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
+{
+#if defined RTE_MEMCPY_AVX
+	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+	rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
+#else /* SSE implementation */
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+#endif
+}
+
 /**
  * Copy 64 bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
+rte_mov64(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
 {
 #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 	__m512i zmm0;
@@ -152,10 +179,10 @@ rte_mov64(uint8_t *dst, const uint8_t *src)
 
 /**
  * Copy 128 bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
+rte_mov128(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
 {
 	rte_mov64(dst + 0 * 64, src + 0 * 64);
 	rte_mov64(dst + 1 * 64, src + 1 * 64);
@@ -163,15 +190,235 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
 
 /**
  * Copy 256 bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
+rte_mov256(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
 {
 	rte_mov128(dst + 0 * 128, src + 0 * 128);
 	rte_mov128(dst + 1 * 128, src + 1 * 128);
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations must not overlap.
+ * Use with n <= 16. Returns dst.
+ */
+static __rte_always_inline void *
+rte_mov16_or_less(void *__rte_restrict dst, const void *__rte_restrict src, size_t n)
+{
+	/*
+	 * Faster way when size is known at build time.
+	 * Sizes requiring three copy operations are not handled here,
+	 * but proceed to the method using two overlapping copy operations.
+	 */
+	if (__rte_constant(n)) {
+		if (n == 2) {
+			rte_mov2((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+		if (n == 3) {
+			rte_mov2((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov1((uint8_t *)dst + 2, (const uint8_t *)src + 2);
+			return dst;
+		}
+		if (n == 4) {
+			rte_mov4((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+		if (n == 5) {
+			rte_mov4((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov1((uint8_t *)dst + 4, (const uint8_t *)src + 4);
+			return dst;
+		}
+		if (n == 6) {
+			rte_mov4((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov2((uint8_t *)dst + 4, (const uint8_t *)src + 4);
+			return dst;
+		}
+		if (n == 8) {
+			rte_mov8((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+		if (n == 9) {
+			rte_mov8((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov1((uint8_t *)dst + 8, (const uint8_t *)src + 8);
+			return dst;
+		}
+		if (n == 10) {
+			rte_mov8((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov2((uint8_t *)dst + 8, (const uint8_t *)src + 8);
+			return dst;
+		}
+		if (n == 12) {
+			rte_mov8((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov4((uint8_t *)dst + 8, (const uint8_t *)src + 8);
+			return dst;
+		}
+		if (n == 16) {
+			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+	}
+
+	/*
+	 * Note: Using "n & X" generates 3-byte "test" instructions,
+	 * instead of "n >= X", which would generate 4-byte "cmp" instructions.
+	 */
+	if (n & 0x18) { /* n >= 8, including n == 0x10, hence n & 0x18. */
+		/* Copy 8 ~ 16 bytes: the first 8 and the last 8 bytes, overlapping when n < 16. */
+		rte_mov8((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov8((uint8_t *)dst - 8 + n, (const uint8_t *)src - 8 + n);
+	} else if (n & 0x4) {
+		/* Copy 4 ~ 7 bytes: the first 4 and the last 4 bytes, overlapping when n < 8. */
+		rte_mov4((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov4((uint8_t *)dst - 4 + n, (const uint8_t *)src - 4 + n);
+	} else if (n & 0x2) {
+		/* Copy 2 ~ 3 bytes: the first 2 and the last 2 bytes, overlapping when n == 3. */
+		rte_mov2((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov2((uint8_t *)dst - 2 + n, (const uint8_t *)src - 2 + n);
+	} else if (n & 0x1) {
+		/* Copy 1 byte. With n == 0 no branch is taken and nothing is copied. */
+		rte_mov1((uint8_t *)dst, (const uint8_t *)src);
+	}
+	return dst;
+}
+
+/**
+ * Copy bytes from one location to another,
+ * locations must not overlap.
+ * Use with 17 <= n <= 32 (n == 16 is also handled). Returns dst.
+ */
+static __rte_always_inline void *
+rte_mov17_to_32(void *__rte_restrict dst, const void *__rte_restrict src, size_t n)
+{
+	/*
+	 * Faster way when size is known at build time.
+	 * Sizes requiring three copy operations are not handled here,
+	 * but proceed to the method using two overlapping copy operations.
+	 */
+	if (__rte_constant(n)) {
+		if (n == 16) {
+			/* Exactly one 16-byte copy; copying a 17th byte would overrun both buffers. */
+			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+		if (n == 17) {
+			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov1((uint8_t *)dst + 16, (const uint8_t *)src + 16);
+			return dst;
+		}
+		if (n == 18) {
+			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov2((uint8_t *)dst + 16, (const uint8_t *)src + 16);
+			return dst;
+		}
+		if (n == 20) {
+			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov4((uint8_t *)dst + 16, (const uint8_t *)src + 16);
+			return dst;
+		}
+		if (n == 24) {
+			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov8((uint8_t *)dst + 16, (const uint8_t *)src + 16);
+			return dst;
+		}
+		if (n == 32) {
+			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+	}
+
+	/* Copy 17 (or 16) ~ 32 bytes: the first 16 and the last 16 bytes, possibly overlapping. */
+	rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+	rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+	return dst;
+}
+
+/**
+ * Copy bytes from one location to another,
+ * locations must not overlap.
+ * Use with 33 <= n <= 64 (n == 32 is also handled). Returns dst.
+ */
+static __rte_always_inline void *
+rte_mov33_to_64(void *__rte_restrict dst, const void *__rte_restrict src, size_t n)
+{
+	/*
+	 * Faster way when size is known at build time.
+	 * Sizes requiring more copy operations are not handled here,
+	 * but proceed to the method using overlapping copy operations.
+	 */
+	if (__rte_constant(n)) {
+		if (n == 32) {
+			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+		if (n == 33) {
+			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov1((uint8_t *)dst + 32, (const uint8_t *)src + 32);
+			return dst;
+		}
+		if (n == 34) {
+			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov2((uint8_t *)dst + 32, (const uint8_t *)src + 32);
+			return dst;
+		}
+		if (n == 36) {
+			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov4((uint8_t *)dst + 32, (const uint8_t *)src + 32);
+			return dst;
+		}
+		if (n == 40) {
+			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov8((uint8_t *)dst + 32, (const uint8_t *)src + 32);
+			return dst;
+		}
+		if (n == 48) {
+			rte_mov48((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+#if !defined RTE_MEMCPY_AVX /* SSE specific implementation */
+		if (n == 49) {
+			rte_mov48((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov1((uint8_t *)dst + 48, (const uint8_t *)src + 48);
+			return dst;
+		}
+		if (n == 50) {
+			rte_mov48((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov2((uint8_t *)dst + 48, (const uint8_t *)src + 48);
+			return dst;
+		}
+		if (n == 52) {
+			rte_mov48((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov4((uint8_t *)dst + 48, (const uint8_t *)src + 48);
+			return dst;
+		}
+		if (n == 56) {
+			rte_mov48((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov8((uint8_t *)dst + 48, (const uint8_t *)src + 48);
+			return dst;
+		}
+#endif
+		if (n == 64) {
+			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+	}
+
+	/* Copy 33 (or 32) ~ 64 bytes; the last copy ends at byte n and may overlap earlier ones. */
+#if defined RTE_MEMCPY_AVX
+	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+	rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
+#else /* SSE implementation */
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	if (n > 48)
+		rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+	rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+#endif
+	return dst;
+}
+
 #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 
 /**
@@ -182,10 +429,10 @@ rte_mov256(uint8_t *dst, const uint8_t *src)
 
 /**
  * Copy 128-byte blocks from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+rte_mov128blocks(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src, size_t n)
 {
 	__m512i zmm0, zmm1;
 
@@ -202,10 +449,10 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 
 /**
  * Copy 512-byte blocks from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static inline void
-rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
+rte_mov512blocks(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src, size_t n)
 {
 	__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
 
@@ -232,45 +479,22 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 	}
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations must not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *__rte_restrict dst, const void *__rte_restrict src,
+		size_t n)
 {
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				  (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (__rte_constant(n) && n == 64) {
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				  (const uint8_t *)src - 32 + n);
-		return ret;
-	}
 	if (n <= 512) {
 		if (n >= 256) {
 			n -= 256;
@@ -351,10 +575,10 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 
 /**
  * Copy 128-byte blocks from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+rte_mov128blocks(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src, size_t n)
 {
 	__m256i ymm0, ymm1, ymm2, ymm3;
 
@@ -381,41 +605,22 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 	}
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations must not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *__rte_restrict dst, const void *__rte_restrict src,
+		size_t n)
 {
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 256 bytes
 	 */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				(const uint8_t *)src - 32 + n);
-		return ret;
-	}
 	if (n <= 256) {
 		if (n >= 128) {
 			n -= 128;
@@ -482,7 +687,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 /**
  * Macro for copying unaligned block from one location to another with constant load offset,
  * 47 bytes leftover maximum,
- * locations should not overlap.
+ * locations must not overlap.
  * Requirements:
  * - Store is aligned
  * - Load offset is <offset>, which must be immediate value within [1, 15]
@@ -542,7 +747,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 /**
  * Macro for copying unaligned block from one location to another,
  * 47 bytes leftover maximum,
- * locations should not overlap.
+ * locations must not overlap.
  * Use switch here because the aligning instruction requires immediate value for shift count.
  * Requirements:
  * - Store is aligned
@@ -573,38 +778,23 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
     }                                                                 \
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations must not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *__rte_restrict dst, const void *__rte_restrict src,
+		size_t n)
 {
 	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
 	void *ret = dst;
 	size_t dstofss;
 	size_t srcofs;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		if (n > 48)
-			rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
 	if (n <= 128) {
 		goto COPY_BLOCK_128_BACK15;
 	}
@@ -696,44 +886,17 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 
 #endif /* __AVX512F__ */
 
+/**
+ * Copy bytes from one vector register size aligned location to another,
+ * locations must not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_aligned(void *dst, const void *src, size_t n)
+rte_memcpy_aligned_more_than_64(void *__rte_restrict dst, const void *__rte_restrict src,
+		size_t n)
 {
 	void *ret = dst;
 
-	/* Copy size < 16 bytes */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
-	/* Copy 16 <= size <= 32 bytes */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
-
-		return ret;
-	}
-
-	/* Copy 32 < size <= 64 bytes */
-	if (__rte_constant(n) && n == 64) {
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				(const uint8_t *)src - 32 + n);
-
-		return ret;
-	}
-
 	/* Copy 64 bytes blocks */
 	for (; n > 64; n -= 64) {
 		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
@@ -749,20 +912,28 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 }
 
 static __rte_always_inline void *
-rte_memcpy(void *dst, const void *src, size_t n)
+rte_memcpy(void *__rte_restrict dst, const void *__rte_restrict src, size_t n)
 {
+	/* Common implementation for size <= 64 bytes. */
+	if (n <= 16)
+		return rte_mov16_or_less(dst, src, n);
+	if (n <= 64) {
+		/* Copy 17 ~ 64 bytes using vector instructions. */
+		if (n <= 32)
+			return rte_mov17_to_32(dst, src, n);
+		else
+			return rte_mov33_to_64(dst, src, n);
+	}
+
+	/* Implementation for size > 64 bytes depends on alignment with vector register size. */
 	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
-		return rte_memcpy_aligned(dst, src, n);
+		return rte_memcpy_aligned_more_than_64(dst, src, n);
 	else
-		return rte_memcpy_generic(dst, src, n);
+		return rte_memcpy_generic_more_than_64(dst, src, n);
 }
 
 #undef ALIGNMENT_MASK
 
-#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
-#pragma GCC diagnostic pop
-#endif
-
 #ifdef __cplusplus
 }
 #endif
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 55+ messages in thread

* Re: [PATCH v5] eal/x86: optimize memcpy of small sizes
  2026-01-12  8:02       ` Morten Brørup
@ 2026-01-12 16:00         ` Scott Mitchell
  2026-01-13  0:39           ` Stephen Hemminger
  0 siblings, 1 reply; 55+ messages in thread
From: Scott Mitchell @ 2026-01-12 16:00 UTC (permalink / raw)
  To: Morten Brørup
  Cc: Konstantin Ananyev, dev, Bruce Richardson, Konstantin Ananyev,
	Vipin Varghese, Stephen Hemminger

>
> The discussion about the optimized checksum function [1] has shown us that memcpy() sometimes prevents Clang from optimizing (loop unrolling and vectorizing) and potentially causes strict aliasing bugs with GCC, so I will work on a new patch version that keeps using the above types, instead of introducing memcpy() inside rte_memcpy().
>
> [1]: https://inbox.dpdk.org/dev/CAFn2buBzBLFLVN-K=u3MgBEbQ-hqbgJLVpDx3vSXVKJpa0yPNg@mail.gmail.com/
>

Great timing for this thread :)

My observation:
- clang is unable to apply optimizations with RTE_PTR_[ADD,SUB]
like loop unrolling and vectorization (e.g. cksum)
- Even when clang/gcc do apply optimizations the assembly can be non-optimal
- direct usage of unaligned_NN_t types can cause incorrect results
(due to gcc bugs)

I don't think "rte_NN_alias" structs are safe on architectures that don't allow
unaligned access, because the inner "val" needs to indicate that it may be used
for unaligned access.

My suggestion:
1. Fix unaligned_NN_t types to ensure compiler doesn't aggressively
apply strict-alias
optimizations resulting in incorrect results
(https://patches.dpdk.org/project/dpdk/patch/20260112120411.27314-2-scott.k.mitch1@gmail.com/).
Intermediate structs rte_NN_alias are then unnecessary and we can directly use
unaligned_NN_t instead (e.g.
https://patches.dpdk.org/project/dpdk/patch/20260112120411.27314-3-scott.k.mitch1@gmail.com/)

2. Improve RTE_PTR_[ADD,SUB] to be more compiler friendly
(https://patches.dpdk.org/project/dpdk/patch/20260112154059.36879-1-scott.k.mitch1@gmail.com/)

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH v5] eal/x86: optimize memcpy of small sizes
  2026-01-12 16:00         ` Scott Mitchell
@ 2026-01-13  0:39           ` Stephen Hemminger
  0 siblings, 0 replies; 55+ messages in thread
From: Stephen Hemminger @ 2026-01-13  0:39 UTC (permalink / raw)
  To: Scott Mitchell
  Cc: Morten Brørup, Konstantin Ananyev, dev, Bruce Richardson,
	Konstantin Ananyev, Vipin Varghese

On Mon, 12 Jan 2026 11:00:36 -0500
Scott Mitchell <scott.k.mitch1@gmail.com> wrote:

> >
> > The discussion about the optimized checksum function [1] has shown us that memcpy() sometimes prevents Clang from optimizing (loop unrolling and vectorizing) and potentially causes strict aliasing bugs with GCC, so I will work on a new patch version that keeps using the above types, instead of introducing memcpy() inside rte_memcpy().
> >
> > [1]: https://inbox.dpdk.org/dev/CAFn2buBzBLFLVN-K=u3MgBEbQ-hqbgJLVpDx3vSXVKJpa0yPNg@mail.gmail.com/
> >  
> 
> Great timing for this thread :)
> 
> My observation:
> - clang is unable to apply optimizations with RTE_PTR_[ADD,SUB]
> like loop unrolling and vectorization (e.g. cksum)
> - Even when clang/gcc do apply optimizations the assembly can be non-optimal
> - direct usage of unaligned_NN_t types can cause incorrect results
> (due to gcc bugs)
> 
> I don't think "rte_NN_alias" structs are safe on architectures that don't allow
> unaligned access bcz the inner "val" needs to indicate it maybe for
> unaligned access.
> 
> My suggestion:
> 1. Fix unaligned_NN_t types to ensure compiler doesn't aggressively
> apply strict-alias
> optimizations resulting in incorrect results
> (https://patches.dpdk.org/project/dpdk/patch/20260112120411.27314-2-scott.k.mitch1@gmail.com/).
> Intermediate structs rte_NN_alias are then unnecessary and we can directly use
> unaligned_NN_t instead (e.g.
> https://patches.dpdk.org/project/dpdk/patch/20260112120411.27314-3-scott.k.mitch1@gmail.com/)
> 
> 2. Improve RTE_PTR_[ADD,SUB] to be more compiler friendly
> (https://patches.dpdk.org/project/dpdk/patch/20260112154059.36879-1-scott.k.mitch1@gmail.com/)

FYI the Linux kernel avoids the memcpy silliness.
Mostly by identifying architectures where unaligned access is non-issue.
On x86, unaligned access works fine. As I remember it works on ARM as well.
The only place where unaligned can break badly is when this is an atomic operation.

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH v6] eal/x86: optimize memcpy of small sizes
  2026-01-12 12:03 ` [PATCH v6] " Morten Brørup
@ 2026-01-13 23:19   ` Stephen Hemminger
  2026-01-20 11:00     ` Varghese, Vipin
  0 siblings, 1 reply; 55+ messages in thread
From: Stephen Hemminger @ 2026-01-13 23:19 UTC (permalink / raw)
  To: Morten Brørup
  Cc: dev, Bruce Richardson, Konstantin Ananyev, Vipin Varghese

On Mon, 12 Jan 2026 12:03:37 +0000
Morten Brørup <mb@smartsharesystems.com> wrote:

> The implementation for copying up to 64 bytes does not depend on address
> alignment with the size of the CPU's vector registers, so the code
> handling this was moved from the various implementations to the common
> function.
> 
> Furthermore, the function for copying less than 16 bytes was replaced with
> a smarter implementation using fewer branches and potentially fewer
> load/store operations.
> This function was also extended to handle copying of up to 16 bytes,
> instead of up to 15 bytes. This small extension reduces the code path for
> copying two pointers.
> 
> These changes provide two benefits:
> 1. The memory footprint of the copy function is reduced.
> Previously there were two instances of the compiled code to copy up to 64
> bytes, one in the "aligned" code path, and one in the "generic" code path.
> Now there is only one instance, in the "common" code path.
> 2. The performance for copying up to 64 bytes is improved.
> The memcpy performance test shows cache-to-cache copying of up to 32 bytes
> now only takes 2 cycles (5 cycles for 64 bytes) versus ca. 6.5 cycles
> before this patch.
> 
> And finally, the missing implementation of rte_mov48() was added.
> 
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> ---

Not sure all this is really needed.
Would rather get the patch series in that used cocci script to
replace all the fixed size rte_memcpy() with memcpy().
The compiler inlining is safer and as fast or faster.

Be careful, there have been and still are bugs in DPDK rte_memcpy()
where it reads past the size of the source bytes.

^ permalink raw reply	[flat|nested] 55+ messages in thread

* RE: [PATCH v6] eal/x86: optimize memcpy of small sizes
  2026-01-13 23:19   ` Stephen Hemminger
@ 2026-01-20 11:00     ` Varghese, Vipin
  2026-01-20 11:19       ` Varghese, Vipin
  0 siblings, 1 reply; 55+ messages in thread
From: Varghese, Vipin @ 2026-01-20 11:00 UTC (permalink / raw)
  To: Stephen Hemminger, Morten Brørup
  Cc: dev@dpdk.org, Bruce Richardson, Konstantin Ananyev

[Public]

Hi @Morten Brørup, we tried the changes on zen4 `AMD EPYC 8534P 64-Core Processor` using `dpdk-test` with option `memcpy_perf_autotest`. Following are the observations

1. There is a reduction of 1 or 2 cycles, especially for the smaller byte sizes, in both aligned and unaligned cases.
2. The overall test run for aligned and unaligned cases did not change.
3. Improvements are seen more in the aligned than in the unaligned cases.

Some caveats:
1. Zen4 and zen5 both support avx512, but on zen4 the load-store is 32B at the backend of the uarch. This might explain why there is no change for odd sizes > 64B.
2. We need to test with virtio or memif in copy mode to see actual results. (We will test and share the results separately.)
3. In function rte_mov48, since zen4 uses 32B load|store, we need to recheck whether write-combining is causing stalls (which we could speed up by forcing the higher address first, then the lower address).

Note: we need some more time to cross-check items 2 and 3 above.

^ permalink raw reply	[flat|nested] 55+ messages in thread

* RE: [PATCH v6] eal/x86: optimize memcpy of small sizes
  2026-01-20 11:00     ` Varghese, Vipin
@ 2026-01-20 11:19       ` Varghese, Vipin
  2026-01-20 11:22         ` Morten Brørup
  0 siblings, 1 reply; 55+ messages in thread
From: Varghese, Vipin @ 2026-01-20 11:19 UTC (permalink / raw)
  To: Varghese, Vipin, Stephen Hemminger, Morten Brørup
  Cc: dev@dpdk.org, Bruce Richardson, Konstantin Ananyev

[Public]

Hi @Morten Brørup, quick query in function `rte_mov17_to_32`

```
+       if (__rte_constant(n)) {
+               if (n == 16) {
+                       rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+                       rte_mov1((uint8_t *)dst + 16, (const uint8_t *)src + 16);
+                       return dst;
+               }
+               if (n == 17) {
+                       rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+                       rte_mov1((uint8_t *)dst + 16, (const uint8_t *)src + 16);
+                       return dst;
+               }
```

For `size 16` do we need to call ` rte_mov1((uint8_t *)dst + 16, (const uint8_t *)src + 16);`? Can you please help me understand the corner case for the same.

^ permalink raw reply	[flat|nested] 55+ messages in thread

* RE: [PATCH v6] eal/x86: optimize memcpy of small sizes
  2026-01-20 11:19       ` Varghese, Vipin
@ 2026-01-20 11:22         ` Morten Brørup
  2026-01-21 11:48           ` Varghese, Vipin
  0 siblings, 1 reply; 55+ messages in thread
From: Morten Brørup @ 2026-01-20 11:22 UTC (permalink / raw)
  To: Varghese, Vipin, Stephen Hemminger
  Cc: dev, Bruce Richardson, Konstantin Ananyev

> From: Varghese, Vipin [mailto:Vipin.Varghese@amd.com]
> Sent: Tuesday, 20 January 2026 12.19
> 
> [Public]
> 
> Hi @Morten Brørup, quick query in function `rte_mov17_to_32`
> 
> ```
> +       if (__rte_constant(n)) {
> +               if (n == 16) {
> +                       rte_mov16((uint8_t *)dst, (const uint8_t
> *)src);
> +                       rte_mov1((uint8_t *)dst + 16, (const uint8_t
> *)src + 16);
> +                       return dst;
> +               }
> +               if (n == 17) {
> +                       rte_mov16((uint8_t *)dst, (const uint8_t
> *)src);
> +                       rte_mov1((uint8_t *)dst + 16, (const uint8_t
> *)src + 16);
> +                       return dst;
> +               }
> ```
> 
> For `size 16` do we need to call ` rte_mov1((uint8_t *)dst + 16, (const
> uint8_t *)src + 16);`? Can you please help me understand the corner
> case for the same.

Good catch!
Copy-paste bug.
Will fix in next version, awaiting other feedback.


^ permalink raw reply	[flat|nested] 55+ messages in thread

* RE: [PATCH v6] eal/x86: optimize memcpy of small sizes
  2026-01-20 11:22         ` Morten Brørup
@ 2026-01-21 11:48           ` Varghese, Vipin
  2026-01-22  6:59             ` Varghese, Vipin
  0 siblings, 1 reply; 55+ messages in thread
From: Varghese, Vipin @ 2026-01-21 11:48 UTC (permalink / raw)
  To: Morten Brørup, Stephen Hemminger
  Cc: dev@dpdk.org, Bruce Richardson, Konstantin Ananyev

[Public]

Hi @Morten Brørup, please find our observation running testpmd with memif in zero-copy mode disabled (rte_memcpy enabled).

1. DPDK baseline version: 25.11 we tested with testpmd in io & flowgen mode
2. using no cargs for memcpy (rtemov32) and with patch 64B & 65B we get `15.5Mpps`
3. using cargs ` -DRTE_MEMCPY_AVX512` for memcpy (rtemov64) and with patch 64B & 65B we get `14.8Mpps`

We will run with dma-perf application for payload sizes of 1,2,3,4,5,...etc

Regards
Vipin Varghese

^ permalink raw reply	[flat|nested] 55+ messages in thread

* RE: [PATCH v6] eal/x86: optimize memcpy of small sizes
  2026-01-21 11:48           ` Varghese, Vipin
@ 2026-01-22  6:59             ` Varghese, Vipin
  2026-01-22  7:28               ` Liangxing Wang
  2026-01-23  6:58               ` Varghese, Vipin
  0 siblings, 2 replies; 55+ messages in thread
From: Varghese, Vipin @ 2026-01-22  6:59 UTC (permalink / raw)
  To: Varghese, Vipin, Morten Brørup, Stephen Hemminger,
	P, Thiyagarajan, Murali Krishna, Bala
  Cc: dev@dpdk.org, Bruce Richardson, Konstantin Ananyev

[Public]

Hi @Morten Brørup,

We (@P, Thiyagarajan @Murali Krishna, Bala and myself) have used dma-perf to validate the performance from 1B to 17B payload.
Following are our observations

With c_args `-DRTE_MEMCPY_AVX512` enabled on zen4, we observe around 25% performance regression for payload size 1B to 15B and 17B.
While in case of 16B we see improvement in Mops by 40%.

Without c_args `-DRTE_MEMCPY_AVX512` enabled on zen4, we observe +-4% variation from 1B to 17B.

`We are investigating why the variation is more prominent with avx512 memcpy.`

Note:
1. in zen4 ld|str is broken to 32B. While in zen5 ld|str is 64B.
2. we tested memif copy on zen5 with patch (without -DRTE_MEMCPY_AVX512) on 64B and 65B payload. It is same as zen4 observation (shared in previous email).



> -----Original Message-----
> From: Varghese, Vipin <Vipin.Varghese@amd.com>
> Sent: Wednesday, January 21, 2026 5:19 PM
> To: Morten Brørup <mb@smartsharesystems.com>; Stephen Hemminger
> <stephen@networkplumber.org>
> Cc: dev@dpdk.org; Bruce Richardson <bruce.richardson@intel.com>; Konstantin
> Ananyev <konstantin.v.ananyev@yandex.ru>
> Subject: RE: [PATCH v6] eal/x86: optimize memcpy of small sizes
>
> Caution: This message originated from an External Source. Use proper caution
> when opening attachments, clicking links, or responding.
>
>
> [Public]
>
> Hi @Morten Brørup, please find our observation running testpmd with memif in
> zero-copy mode disabled (rte_memcpy enabled).
>
> 1. DPDK baseline version: 25.11 we tested with testpmd in io & flowgen mode 2.
> using no cargs for memcpy (rtemov32) and with patch 64B & 65B we get
> `15.5Mpps` 3. using cargs ` -DRTE_MEMCPY_AVX512` for memcpy (rtemov64)
> and with patch 64B & 65B we get `14.8Mpps`
>
> We will run with dma-perf application for payload sizes of 1,2,3,4,5,...etc
>
> Regards
> Vipin Varghese

^ permalink raw reply	[flat|nested] 55+ messages in thread

* RE: [PATCH v6] eal/x86: optimize memcpy of small sizes
  2026-01-22  6:59             ` Varghese, Vipin
@ 2026-01-22  7:28               ` Liangxing Wang
  2026-01-23  6:58               ` Varghese, Vipin
  1 sibling, 0 replies; 55+ messages in thread
From: Liangxing Wang @ 2026-01-22  7:28 UTC (permalink / raw)
  To: Varghese, Vipin, Morten Brørup, Stephen Hemminger,
	P, Thiyagarajan, Murali Krishna, Bala
  Cc: dev@dpdk.org, Bruce Richardson, Konstantin Ananyev

Hi @ Varghese, Vipin,

I am interested in the performance of this rte_memcpy patch on our x86 platform too. Could you please share the configuration file and the steps for your dma-perf test, so that I can run the same test on our platform? Thanks.

Regards,
Liangxing
> -----Original Message-----
> From: Varghese, Vipin <Vipin.Varghese@amd.com>
> Sent: 2026年1月22日 15:00
> To: Varghese, Vipin <Vipin.Varghese@amd.com>; Morten Brørup
> <mb@smartsharesystems.com>; Stephen Hemminger
> <stephen@networkplumber.org>; P, Thiyagarajan <Thiyagarajan.P@amd.com>;
> Murali Krishna, Bala <Bala.MuraliKrishna@amd.com>
> Cc: dev@dpdk.org; Bruce Richardson <bruce.richardson@intel.com>;
> Konstantin Ananyev <konstantin.v.ananyev@yandex.ru>
> Subject: RE: [PATCH v6] eal/x86: optimize memcpy of small sizes
> 
> [Public]
> 
> Hi @Morten Brørup,
> 
> We (@P, Thiyagarajan @Murali Krishna, Bala and myself) have used dma-perf
> to validate the performance from 1B to 17B payload.
> Following are our observations
> 
> With c_args `-DRTE_MEMCPY_AVX512` enabled on zen4, we observe around
> 25% performance regression for payload size 1B to 15B and 17B.
> While in case of 16B we see improvement in Mops by 40%.
> 
> Without c_args `-DRTE_MEMCPY_AVX512` enabled on zen4, we observe +-4%
> variation from 1B to 17B.
> 
> `We are investigating the variation is more prominent with avx512 memcpy.`
> 
> Note:
> 1. in zen4 ld|str is broken to 32B. While in zen5 ld|str is 64B.
> 2. we tested memif copy on zen5 with patch (without -DRTE_MEMCPY_AVX512)
> on 64B and 65B payload. It is same as zen4 observation (shared in previous
> email).
> 
> 
> 
> > -----Original Message-----
> > From: Varghese, Vipin <Vipin.Varghese@amd.com>
> > Sent: Wednesday, January 21, 2026 5:19 PM
> > To: Morten Brørup <mb@smartsharesystems.com>; Stephen Hemminger
> > <stephen@networkplumber.org>
> > Cc: dev@dpdk.org; Bruce Richardson <bruce.richardson@intel.com>;
> > Konstantin Ananyev <konstantin.v.ananyev@yandex.ru>
> > Subject: RE: [PATCH v6] eal/x86: optimize memcpy of small sizes
> >
> > Caution: This message originated from an External Source. Use proper
> > caution when opening attachments, clicking links, or responding.
> >
> >
> > [Public]
> >
> > Hi @Morten Brørup, please find our observation running testpmd with
> > memif in zero-copy mode disabled (rte_memcpy enabled).
> >
> > 1. DPDK baseline version: 25.11; we tested with testpmd in io & flowgen
> >    mode.
> > 2. Using no c_args for memcpy (rte_mov32) and with the patch, for 64B & 65B
> >    we get `15.5Mpps`.
> > 3. Using c_args `-DRTE_MEMCPY_AVX512` for memcpy (rte_mov64) and with the
> >    patch, for 64B & 65B we get `14.8Mpps`.
> >
> > We will run with dma-perf application for payload sizes of
> > 1,2,3,4,5,...etc
> >
> > Regards
> > Vipin Varghese

^ permalink raw reply	[flat|nested] 55+ messages in thread

* RE: [PATCH v6] eal/x86: optimize memcpy of small sizes
  2026-01-22  6:59             ` Varghese, Vipin
  2026-01-22  7:28               ` Liangxing Wang
@ 2026-01-23  6:58               ` Varghese, Vipin
  1 sibling, 0 replies; 55+ messages in thread
From: Varghese, Vipin @ 2026-01-23  6:58 UTC (permalink / raw)
  To: wangliangxing@hygon.cn
  Cc: dev@dpdk.org, Bruce Richardson, Konstantin Ananyev,
	Morten Brørup, Stephen Hemminger, P, Thiyagarajan,
	Murali Krishna, Bala

[Public]

Hi Wang,

We use dpdk dma-perf application for testing memcpy via cpu.
Please find the ini configuration shared below

```
[GLOBAL]
eal_args=--in-memory --file-prefix=test -l 34,35
cache_flush=0
test_seconds=10

[case1]
type=CPU_MEM_COPY
mem_size=100
buf_size=1,17,1,ADD
src_numa_node=0
dst_numa_node=0
lcore=35
```

Please adapt eal_args and lcore for the machine on which you are running.
Command used is `dpdk-vanilla/build-no512/app/dpdk-test-dma-perf --config ../config.ini --result /tmp/memcpy.log`

After running, we rebuild dpdk to use the avx512 MEMCPY using `-DRTE_MEMCPY_AVX512` and then rerun dma-perf.

Regards
Vipin Varghese

^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH v7] eal/x86: optimize memcpy of small sizes
  2025-11-20 11:45 [PATCH] eal/x86: reduce memcpy code duplication Morten Brørup
                   ` (5 preceding siblings ...)
  2026-01-12 12:03 ` [PATCH v6] " Morten Brørup
@ 2026-02-20 11:08 ` Morten Brørup
  2026-03-11  7:28   ` Morten Brørup
                     ` (3 more replies)
  2026-04-29  9:36 ` [PATCH v8] " Morten Brørup
                   ` (3 subsequent siblings)
  10 siblings, 4 replies; 55+ messages in thread
From: Morten Brørup @ 2026-02-20 11:08 UTC (permalink / raw)
  To: dev, Bruce Richardson, Konstantin Ananyev, Vipin Varghese,
	Stephen Hemminger, Liangxing Wang
  Cc: Thiyagarajan P, Bala Murali Krishna, Morten Brørup

The implementation for copying up to 64 bytes does not depend on address
alignment with the size of the CPU's vector registers. Nonetheless, the
exact same code for copying up to 64 bytes was present in both the aligned
copy function and all the CPU vector register size specific variants of
the unaligned copy functions.
With this patch, the implementation for copying up to 64 bytes was
consolidated into one instance, located in the common copy function,
before checking alignment requirements.
This provides three benefits:
1. No copy-paste in the source code.
2. A performance gain for copying up to 64 bytes, because the
address alignment check is avoided in this case.
3. Reduced instruction memory footprint, because the compiler only
generates one instance of the function for copying up to 64 bytes, instead
of two instances (one in the unaligned copy function, and one in the
aligned copy function).

Furthermore, the function for copying less than 16 bytes was replaced with
a smarter implementation using fewer branches and potentially fewer
load/store operations.
This function was also extended to handle copying of up to 16 bytes,
instead of up to 15 bytes.
This small extension reduces the code path, and thus improves the
performance, for copying two pointers on 64-bit architectures and four
pointers on 32-bit architectures.

Also, __rte_restrict was added to source and destination addresses.

And finally, the missing implementation of rte_mov48() was added.

Regarding performance, the memcpy performance test showed cache-to-cache
copying of up to 32 bytes now takes 2 cycles, versus ca. 6.5 cycles before
this patch.
Copying 64 bytes now takes 4 cycles, versus 7 cycles before.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
v7:
* Updated patch description. Mainly to clarify that the changes related to
  copying up to 64 bytes simply replace multiple instances of copy-pasted
  code with one common instance.
* Fixed copy of build time known 16 bytes in rte_mov17_to_32(). (Vipin)
* Rebased.
v6:
* Went back to using rte_uintN_alias structures for copying instead of
  using memcpy(). They were there for a reason.
  (Inspired by the discussion about optimizing the checksum function.)
* Removed note about copying uninitialized data.
* Added __rte_restrict to source and destination addresses.
  Updated function descriptions from "should" to "must" not overlap.
* Changed rte_mov48() AVX implementation to copy 32+16 bytes instead of
  copying 32 + 32 overlapping bytes. (Konstantin)
* Ignoring "-Wstringop-overflow" is not needed, so it was removed.
v5:
* Reverted v4: Replace SSE2 _mm_loadu_si128() with SSE3 _mm_lddqu_si128().
  It was slower.
* Improved some comments. (Konstantin Ananyev)
* Moved the size range 17..32 inside the size <= 64 branch, so when
  building for SSE, the generated code can start copying the first
  16 bytes before comparing if the size is greater than 32 or not.
* Just require RTE_MEMCPY_AVX for using rte_mov32() in rte_mov33_to_64().
v4:
* Replace SSE2 _mm_loadu_si128() with SSE3 _mm_lddqu_si128().
v3:
* Fixed typo in comment.
v2:
* Updated patch title to reflect that the performance is improved.
* Use the design pattern of two overlapping stores for small copies too.
* Expanded first branch from size < 16 to size <= 16.
* Handle more build time constant copy sizes.
---
 lib/eal/x86/include/rte_memcpy.h | 526 ++++++++++++++++++++-----------
 1 file changed, 348 insertions(+), 178 deletions(-)

diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 46d34b8081..ed8e5f8dc4 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -22,11 +22,6 @@
 extern "C" {
 #endif
 
-#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wstringop-overflow"
-#endif
-
 /*
  * GCC older than version 11 doesn't compile AVX properly, so use SSE instead.
  * There are no problems with AVX2.
@@ -40,9 +35,6 @@ extern "C" {
 /**
  * Copy bytes from one location to another. The locations must not overlap.
  *
- * @note This is implemented as a macro, so it's address should not be taken
- * and care is needed as parameter expressions may be evaluated multiple times.
- *
  * @param dst
  *   Pointer to the destination of the data.
  * @param src
@@ -53,60 +45,78 @@ extern "C" {
  *   Pointer to the destination data.
  */
 static __rte_always_inline void *
-rte_memcpy(void *dst, const void *src, size_t n);
+rte_memcpy(void *__rte_restrict dst, const void *__rte_restrict src, size_t n);
 
 /**
- * Copy bytes from one location to another,
- * locations should not overlap.
- * Use with n <= 15.
+ * Copy 1 byte from one location to another,
+ * locations must not overlap.
  */
-static __rte_always_inline void *
-rte_mov15_or_less(void *dst, const void *src, size_t n)
+static __rte_always_inline void
+rte_mov1(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
+{
+	*dst = *src;
+}
+
+/**
+ * Copy 2 bytes from one location to another,
+ * locations must not overlap.
+ */
+static __rte_always_inline void
+rte_mov2(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
 {
 	/**
-	 * Use the following structs to avoid violating C standard
+	 * Use the following struct to avoid violating C standard
 	 * alignment requirements and to avoid strict aliasing bugs
 	 */
-	struct __rte_packed_begin rte_uint64_alias {
-		uint64_t val;
+	struct __rte_packed_begin rte_uint16_alias {
+		uint16_t val;
 	} __rte_packed_end __rte_may_alias;
+
+	((struct rte_uint16_alias *)dst)->val = ((const struct rte_uint16_alias *)src)->val;
+}
+
+/**
+ * Copy 4 bytes from one location to another,
+ * locations must not overlap.
+ */
+static __rte_always_inline void
+rte_mov4(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
+{
+	/**
+	 * Use the following struct to avoid violating C standard
+	 * alignment requirements and to avoid strict aliasing bugs
+	 */
 	struct __rte_packed_begin rte_uint32_alias {
 		uint32_t val;
 	} __rte_packed_end __rte_may_alias;
-	struct __rte_packed_begin rte_uint16_alias {
-		uint16_t val;
+
+	((struct rte_uint32_alias *)dst)->val = ((const struct rte_uint32_alias *)src)->val;
+}
+
+/**
+ * Copy 8 bytes from one location to another,
+ * locations must not overlap.
+ */
+static __rte_always_inline void
+rte_mov8(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
+{
+	/**
+	 * Use the following struct to avoid violating C standard
+	 * alignment requirements and to avoid strict aliasing bugs
+	 */
+	struct __rte_packed_begin rte_uint64_alias {
+		uint64_t val;
 	} __rte_packed_end __rte_may_alias;
 
-	void *ret = dst;
-	if (n & 8) {
-		((struct rte_uint64_alias *)dst)->val =
-			((const struct rte_uint64_alias *)src)->val;
-		src = (const uint64_t *)src + 1;
-		dst = (uint64_t *)dst + 1;
-	}
-	if (n & 4) {
-		((struct rte_uint32_alias *)dst)->val =
-			((const struct rte_uint32_alias *)src)->val;
-		src = (const uint32_t *)src + 1;
-		dst = (uint32_t *)dst + 1;
-	}
-	if (n & 2) {
-		((struct rte_uint16_alias *)dst)->val =
-			((const struct rte_uint16_alias *)src)->val;
-		src = (const uint16_t *)src + 1;
-		dst = (uint16_t *)dst + 1;
-	}
-	if (n & 1)
-		*(uint8_t *)dst = *(const uint8_t *)src;
-	return ret;
+	((struct rte_uint64_alias *)dst)->val = ((const struct rte_uint64_alias *)src)->val;
 }
 
 /**
  * Copy 16 bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
+rte_mov16(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
 {
 	__m128i xmm0;
 
@@ -116,10 +126,10 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
 
 /**
  * Copy 32 bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
+rte_mov32(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
 {
 #if defined RTE_MEMCPY_AVX
 	__m256i ymm0;
@@ -132,12 +142,29 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
 #endif
 }
 
+/**
+ * Copy 48 bytes from one location to another,
+ * locations must not overlap.
+ */
+static __rte_always_inline void
+rte_mov48(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
+{
+#if defined RTE_MEMCPY_AVX
+	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+	rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
+#else /* SSE implementation */
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+#endif
+}
+
 /**
  * Copy 64 bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
+rte_mov64(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
 {
 #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 	__m512i zmm0;
@@ -152,10 +179,10 @@ rte_mov64(uint8_t *dst, const uint8_t *src)
 
 /**
  * Copy 128 bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
+rte_mov128(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
 {
 	rte_mov64(dst + 0 * 64, src + 0 * 64);
 	rte_mov64(dst + 1 * 64, src + 1 * 64);
@@ -163,15 +190,234 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
 
 /**
  * Copy 256 bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
+rte_mov256(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
 {
 	rte_mov128(dst + 0 * 128, src + 0 * 128);
 	rte_mov128(dst + 1 * 128, src + 1 * 128);
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations must not overlap.
+ * Use with n <= 16.
+ */
+static __rte_always_inline void *
+rte_mov16_or_less(void *__rte_restrict dst, const void *__rte_restrict src, size_t n)
+{
+	/*
+	 * Faster way when size is known at build time.
+	 * Sizes requiring three copy operations are not handled here,
+	 * but proceed to the method using two overlapping copy operations.
+	 */
+	if (__rte_constant(n)) {
+		if (n == 2) {
+			rte_mov2((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+		if (n == 3) {
+			rte_mov2((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov1((uint8_t *)dst + 2, (const uint8_t *)src + 2);
+			return dst;
+		}
+		if (n == 4) {
+			rte_mov4((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+		if (n == 5) {
+			rte_mov4((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov1((uint8_t *)dst + 4, (const uint8_t *)src + 4);
+			return dst;
+		}
+		if (n == 6) {
+			rte_mov4((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov2((uint8_t *)dst + 4, (const uint8_t *)src + 4);
+			return dst;
+		}
+		if (n == 8) {
+			rte_mov8((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+		if (n == 9) {
+			rte_mov8((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov1((uint8_t *)dst + 8, (const uint8_t *)src + 8);
+			return dst;
+		}
+		if (n == 10) {
+			rte_mov8((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov2((uint8_t *)dst + 8, (const uint8_t *)src + 8);
+			return dst;
+		}
+		if (n == 12) {
+			rte_mov8((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov4((uint8_t *)dst + 8, (const uint8_t *)src + 8);
+			return dst;
+		}
+		if (n == 16) {
+			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+	}
+
+	/*
+	 * Note: Using "n & X" generates 3-byte "test" instructions,
+	 * instead of "n >= X", which would generate 4-byte "cmp" instructions.
+	 */
+	if (n & 0x18) { /* n >= 8, including n == 0x10, hence n & 0x18. */
+		/* Copy 8 ~ 16 bytes. */
+		rte_mov8((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov8((uint8_t *)dst - 8 + n, (const uint8_t *)src - 8 + n);
+	} else if (n & 0x4) {
+		/* Copy 4 ~ 7 bytes. */
+		rte_mov4((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov4((uint8_t *)dst - 4 + n, (const uint8_t *)src - 4 + n);
+	} else if (n & 0x2) {
+		/* Copy 2 ~ 3 bytes. */
+		rte_mov2((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov2((uint8_t *)dst - 2 + n, (const uint8_t *)src - 2 + n);
+	} else if (n & 0x1) {
+		/* Copy 1 byte. */
+		rte_mov1((uint8_t *)dst, (const uint8_t *)src);
+	}
+	return dst;
+}
+
+/**
+ * Copy bytes from one location to another,
+ * locations must not overlap.
+ * Use with 17 (or 16) <= n <= 32.
+ */
+static __rte_always_inline void *
+rte_mov17_to_32(void *__rte_restrict dst, const void *__rte_restrict src, size_t n)
+{
+	/*
+	 * Faster way when size is known at build time.
+	 * Sizes requiring three copy operations are not handled here,
+	 * but proceed to the method using two overlapping copy operations.
+	 */
+	if (__rte_constant(n)) {
+		if (n == 16) {
+			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+		if (n == 17) {
+			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov1((uint8_t *)dst + 16, (const uint8_t *)src + 16);
+			return dst;
+		}
+		if (n == 18) {
+			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov2((uint8_t *)dst + 16, (const uint8_t *)src + 16);
+			return dst;
+		}
+		if (n == 20) {
+			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov4((uint8_t *)dst + 16, (const uint8_t *)src + 16);
+			return dst;
+		}
+		if (n == 24) {
+			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov8((uint8_t *)dst + 16, (const uint8_t *)src + 16);
+			return dst;
+		}
+		if (n == 32) {
+			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+	}
+
+	/* Copy 17 (or 16) ~ 32 bytes. */
+	rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+	rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+	return dst;
+}
+
+/**
+ * Copy bytes from one location to another,
+ * locations must not overlap.
+ * Use with 33 (or 32) <= n <= 64.
+ */
+static __rte_always_inline void *
+rte_mov33_to_64(void *__rte_restrict dst, const void *__rte_restrict src, size_t n)
+{
+	/*
+	 * Faster way when size is known at build time.
+	 * Sizes requiring more copy operations are not handled here,
+	 * but proceed to the method using overlapping copy operations.
+	 */
+	if (__rte_constant(n)) {
+		if (n == 32) {
+			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+		if (n == 33) {
+			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov1((uint8_t *)dst + 32, (const uint8_t *)src + 32);
+			return dst;
+		}
+		if (n == 34) {
+			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov2((uint8_t *)dst + 32, (const uint8_t *)src + 32);
+			return dst;
+		}
+		if (n == 36) {
+			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov4((uint8_t *)dst + 32, (const uint8_t *)src + 32);
+			return dst;
+		}
+		if (n == 40) {
+			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov8((uint8_t *)dst + 32, (const uint8_t *)src + 32);
+			return dst;
+		}
+		if (n == 48) {
+			rte_mov48((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+#if !defined RTE_MEMCPY_AVX /* SSE specific implementation */
+		if (n == 49) {
+			rte_mov48((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov1((uint8_t *)dst + 48, (const uint8_t *)src + 48);
+			return dst;
+		}
+		if (n == 50) {
+			rte_mov48((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov2((uint8_t *)dst + 48, (const uint8_t *)src + 48);
+			return dst;
+		}
+		if (n == 52) {
+			rte_mov48((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov4((uint8_t *)dst + 48, (const uint8_t *)src + 48);
+			return dst;
+		}
+		if (n == 56) {
+			rte_mov48((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov8((uint8_t *)dst + 48, (const uint8_t *)src + 48);
+			return dst;
+		}
+#endif
+		if (n == 64) {
+			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+	}
+
+	/* Copy 33 (or 32) ~ 64 bytes. */
+#if defined RTE_MEMCPY_AVX
+	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+	rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
+#else /* SSE implementation */
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	if (n > 48)
+		rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+	rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+#endif
+	return dst;
+}
+
 #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 
 /**
@@ -182,10 +428,10 @@ rte_mov256(uint8_t *dst, const uint8_t *src)
 
 /**
  * Copy 128-byte blocks from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+rte_mov128blocks(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src, size_t n)
 {
 	__m512i zmm0, zmm1;
 
@@ -202,10 +448,10 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 
 /**
  * Copy 512-byte blocks from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static inline void
-rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
+rte_mov512blocks(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src, size_t n)
 {
 	__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
 
@@ -232,45 +478,22 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 	}
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations must not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *__rte_restrict dst, const void *__rte_restrict src,
+		size_t n)
 {
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				  (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (__rte_constant(n) && n == 64) {
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				  (const uint8_t *)src - 32 + n);
-		return ret;
-	}
 	if (n <= 512) {
 		if (n >= 256) {
 			n -= 256;
@@ -351,10 +574,10 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 
 /**
  * Copy 128-byte blocks from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+rte_mov128blocks(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src, size_t n)
 {
 	__m256i ymm0, ymm1, ymm2, ymm3;
 
@@ -381,41 +604,22 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 	}
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations must not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *__rte_restrict dst, const void *__rte_restrict src,
+		size_t n)
 {
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 256 bytes
 	 */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				(const uint8_t *)src - 32 + n);
-		return ret;
-	}
 	if (n <= 256) {
 		if (n >= 128) {
 			n -= 128;
@@ -482,7 +686,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 /**
  * Macro for copying unaligned block from one location to another with constant load offset,
  * 47 bytes leftover maximum,
- * locations should not overlap.
+ * locations must not overlap.
  * Requirements:
  * - Store is aligned
  * - Load offset is <offset>, which must be immediate value within [1, 15]
@@ -542,7 +746,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 /**
  * Macro for copying unaligned block from one location to another,
  * 47 bytes leftover maximum,
- * locations should not overlap.
+ * locations must not overlap.
  * Use switch here because the aligning instruction requires immediate value for shift count.
  * Requirements:
  * - Store is aligned
@@ -573,38 +777,23 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
     }                                                                 \
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations must not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *__rte_restrict dst, const void *__rte_restrict src,
+		size_t n)
 {
 	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
 	void *ret = dst;
 	size_t dstofss;
 	size_t srcofs;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		if (n > 48)
-			rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
 	if (n <= 128) {
 		goto COPY_BLOCK_128_BACK15;
 	}
@@ -696,44 +885,17 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 
 #endif /* __AVX512F__ */
 
+/**
+ * Copy bytes from one vector register size aligned location to another,
+ * locations must not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_aligned(void *dst, const void *src, size_t n)
+rte_memcpy_aligned_more_than_64(void *__rte_restrict dst, const void *__rte_restrict src,
+		size_t n)
 {
 	void *ret = dst;
 
-	/* Copy size < 16 bytes */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
-	/* Copy 16 <= size <= 32 bytes */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
-
-		return ret;
-	}
-
-	/* Copy 32 < size <= 64 bytes */
-	if (__rte_constant(n) && n == 64) {
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				(const uint8_t *)src - 32 + n);
-
-		return ret;
-	}
-
 	/* Copy 64 bytes blocks */
 	for (; n > 64; n -= 64) {
 		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
@@ -749,20 +911,28 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 }
 
 static __rte_always_inline void *
-rte_memcpy(void *dst, const void *src, size_t n)
+rte_memcpy(void *__rte_restrict dst, const void *__rte_restrict src, size_t n)
 {
+	/* Common implementation for size <= 64 bytes. */
+	if (n <= 16)
+		return rte_mov16_or_less(dst, src, n);
+	if (n <= 64) {
+		/* Copy 17 ~ 64 bytes using vector instructions. */
+		if (n <= 32)
+			return rte_mov17_to_32(dst, src, n);
+		else
+			return rte_mov33_to_64(dst, src, n);
+	}
+
+	/* Implementation for size > 64 bytes depends on alignment with vector register size. */
 	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
-		return rte_memcpy_aligned(dst, src, n);
+		return rte_memcpy_aligned_more_than_64(dst, src, n);
 	else
-		return rte_memcpy_generic(dst, src, n);
+		return rte_memcpy_generic_more_than_64(dst, src, n);
 }
 
 #undef ALIGNMENT_MASK
 
-#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
-#pragma GCC diagnostic pop
-#endif
-
 #ifdef __cplusplus
 }
 #endif
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 55+ messages in thread

* RE: [PATCH v7] eal/x86: optimize memcpy of small sizes
  2026-02-20 11:08 ` [PATCH v7] " Morten Brørup
@ 2026-03-11  7:28   ` Morten Brørup
  2026-03-11 16:58   ` Bruce Richardson
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 55+ messages in thread
From: Morten Brørup @ 2026-03-11  7:28 UTC (permalink / raw)
  To: Bruce Richardson, Konstantin Ananyev, Vipin Varghese
  Cc: Thiyagarajan P, Bala Murali Krishna, Stephen Hemminger,
	Liangxing Wang, dev

Bruce, Konstantin, Vipin (as x86 maintainers),

PING for final review/ack.
This patch speeds up small copies, e.g. putting 1~8 mbufs into a mempool cache, or copying a 64-byte packet, so let's get it in.

Venlig hilsen / Kind regards,
-Morten Brørup

> -----Original Message-----
> From: Morten Brørup [mailto:mb@smartsharesystems.com]
> Sent: Friday, 20 February 2026 12.08
> To: dev@dpdk.org; Bruce Richardson; Konstantin Ananyev; Vipin Varghese;
> Stephen Hemminger; Liangxing Wang
> Cc: Thiyagarajan P; Bala Murali Krishna; Morten Brørup
> Subject: [PATCH v7] eal/x86: optimize memcpy of small sizes
> 
> The implementation for copying up to 64 bytes does not depend on
> address
> alignment with the size of the CPU's vector registers. Nonetheless, the
> exact same code for copying up to 64 bytes was present in both the
> aligned
> copy function and all the CPU vector register size specific variants of
> the unaligned copy functions.
> With this patch, the implementation for copying up to 64 bytes was
> consolidated into one instance, located in the common copy function,
> before checking alignment requirements.
> This provides three benefits:
> 1. No copy-paste in the source code.
> 2. A performance gain for copying up to 64 bytes, because the
> address alignment check is avoided in this case.
> 3. Reduced instruction memory footprint, because the compiler only
> generates one instance of the function for copying up to 64 bytes,
> instead
> of two instances (one in the unaligned copy function, and one in the
> aligned copy function).
> 
> Furthermore, the function for copying less than 16 bytes was replaced
> with
> a smarter implementation using fewer branches and potentially fewer
> load/store operations.
> This function was also extended to handle copying of up to 16 bytes,
> instead of up to 15 bytes.
> This small extension reduces the code path, and thus improves the
> performance, for copying two pointers on 64-bit architectures and four
> pointers on 32-bit architectures.
> 
> Also, __rte_restrict was added to source and destination addresses.
> 
> And finally, the missing implementation of rte_mov48() was added.
> 
> Regarding performance, the memcpy performance test showed cache-to-
> cache
> copying of up to 32 bytes now takes 2 cycles, versus ca. 6.5 cycles
> before
> this patch.
> Copying 64 bytes now takes 4 cycles, versus 7 cycles before.
> 
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> ---
> v7:
> * Updated patch description. Mainly to clarify that the changes related
> to
>   copying up to 64 bytes simply replace multiple instances of copy-
> pasted
>   code with one common instance.
> * Fixed copy of build time known 16 bytes in rte_mov17_to_32(). (Vipin)
> * Rebased.
> v6:
> * Went back to using rte_uintN_alias structures for copying instead of
>   using memcpy(). They were there for a reason.
>   (Inspired by the discussion about optimizing the checksum function.)
> * Removed note about copying uninitialized data.
> * Added __rte_restrict to source and destination addresses.
>   Updated function descriptions from "should" to "must" not overlap.
> * Changed rte_mov48() AVX implementation to copy 32+16 bytes instead of
>   copying 32 + 32 overlapping bytes. (Konstantin)
> * Ignoring "-Wstringop-overflow" is not needed, so it was removed.
> v5:
> * Reverted v4: Replace SSE2 _mm_loadu_si128() with SSE3
> _mm_lddqu_si128().
>   It was slower.
> * Improved some comments. (Konstantin Ananyev)
> * Moved the size range 17..32 inside the size <= 64 branch, so when
>   building for SSE, the generated code can start copying the first
>   16 bytes before comparing if the size is greater than 32 or not.
> * Just require RTE_MEMCPY_AVX for using rte_mov32() in
> rte_mov33_to_64().
> v4:
> * Replace SSE2 _mm_loadu_si128() with SSE3 _mm_lddqu_si128().
> v3:
> * Fixed typo in comment.
> v2:
> * Updated patch title to reflect that the performance is improved.
> * Use the design pattern of two overlapping stores for small copies
> too.
> * Expanded first branch from size < 16 to size <= 16.
> * Handle more build time constant copy sizes.
> ---
>  lib/eal/x86/include/rte_memcpy.h | 526 ++++++++++++++++++++-----------
>  1 file changed, 348 insertions(+), 178 deletions(-)
> 
> diff --git a/lib/eal/x86/include/rte_memcpy.h
> b/lib/eal/x86/include/rte_memcpy.h
> index 46d34b8081..ed8e5f8dc4 100644
> --- a/lib/eal/x86/include/rte_memcpy.h
> +++ b/lib/eal/x86/include/rte_memcpy.h
> @@ -22,11 +22,6 @@
>  extern "C" {
>  #endif
> 
> -#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
> -#pragma GCC diagnostic push
> -#pragma GCC diagnostic ignored "-Wstringop-overflow"
> -#endif
> -
>  /*
>   * GCC older than version 11 doesn't compile AVX properly, so use SSE
> instead.
>   * There are no problems with AVX2.
> @@ -40,9 +35,6 @@ extern "C" {
>  /**
>   * Copy bytes from one location to another. The locations must not
> overlap.
>   *
> - * @note This is implemented as a macro, so it's address should not be
> taken
> - * and care is needed as parameter expressions may be evaluated
> multiple times.
> - *
>   * @param dst
>   *   Pointer to the destination of the data.
>   * @param src
> @@ -53,60 +45,78 @@ extern "C" {
>   *   Pointer to the destination data.
>   */
>  static __rte_always_inline void *
> -rte_memcpy(void *dst, const void *src, size_t n);
> +rte_memcpy(void *__rte_restrict dst, const void *__rte_restrict src,
> size_t n);
> 
>  /**
> - * Copy bytes from one location to another,
> - * locations should not overlap.
> - * Use with n <= 15.
> + * Copy 1 byte from one location to another,
> + * locations must not overlap.
>   */
> -static __rte_always_inline void *
> -rte_mov15_or_less(void *dst, const void *src, size_t n)
> +static __rte_always_inline void
> +rte_mov1(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict
> src)
> +{
> +	*dst = *src;
> +}
> +
> +/**
> + * Copy 2 bytes from one location to another,
> + * locations must not overlap.
> + */
> +static __rte_always_inline void
> +rte_mov2(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict
> src)
>  {
>  	/**
> -	 * Use the following structs to avoid violating C standard
> +	 * Use the following struct to avoid violating C standard
>  	 * alignment requirements and to avoid strict aliasing bugs
>  	 */
> -	struct __rte_packed_begin rte_uint64_alias {
> -		uint64_t val;
> +	struct __rte_packed_begin rte_uint16_alias {
> +		uint16_t val;
>  	} __rte_packed_end __rte_may_alias;
> +
> +	((struct rte_uint16_alias *)dst)->val = ((const struct
> rte_uint16_alias *)src)->val;
> +}
> +
> +/**
> + * Copy 4 bytes from one location to another,
> + * locations must not overlap.
> + */
> +static __rte_always_inline void
> +rte_mov4(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict
> src)
> +{
> +	/**
> +	 * Use the following struct to avoid violating C standard
> +	 * alignment requirements and to avoid strict aliasing bugs
> +	 */
>  	struct __rte_packed_begin rte_uint32_alias {
>  		uint32_t val;
>  	} __rte_packed_end __rte_may_alias;
> -	struct __rte_packed_begin rte_uint16_alias {
> -		uint16_t val;
> +
> +	((struct rte_uint32_alias *)dst)->val = ((const struct
> rte_uint32_alias *)src)->val;
> +}
> +
> +/**
> + * Copy 8 bytes from one location to another,
> + * locations must not overlap.
> + */
> +static __rte_always_inline void
> +rte_mov8(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict
> src)
> +{
> +	/**
> +	 * Use the following struct to avoid violating C standard
> +	 * alignment requirements and to avoid strict aliasing bugs
> +	 */
> +	struct __rte_packed_begin rte_uint64_alias {
> +		uint64_t val;
>  	} __rte_packed_end __rte_may_alias;
> 
> -	void *ret = dst;
> -	if (n & 8) {
> -		((struct rte_uint64_alias *)dst)->val =
> -			((const struct rte_uint64_alias *)src)->val;
> -		src = (const uint64_t *)src + 1;
> -		dst = (uint64_t *)dst + 1;
> -	}
> -	if (n & 4) {
> -		((struct rte_uint32_alias *)dst)->val =
> -			((const struct rte_uint32_alias *)src)->val;
> -		src = (const uint32_t *)src + 1;
> -		dst = (uint32_t *)dst + 1;
> -	}
> -	if (n & 2) {
> -		((struct rte_uint16_alias *)dst)->val =
> -			((const struct rte_uint16_alias *)src)->val;
> -		src = (const uint16_t *)src + 1;
> -		dst = (uint16_t *)dst + 1;
> -	}
> -	if (n & 1)
> -		*(uint8_t *)dst = *(const uint8_t *)src;
> -	return ret;
> +	((struct rte_uint64_alias *)dst)->val = ((const struct
> rte_uint64_alias *)src)->val;
>  }
> 
>  /**
>   * Copy 16 bytes from one location to another,
> - * locations should not overlap.
> + * locations must not overlap.
>   */
>  static __rte_always_inline void
> -rte_mov16(uint8_t *dst, const uint8_t *src)
> +rte_mov16(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict
> src)
>  {
>  	__m128i xmm0;
> 
> @@ -116,10 +126,10 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
> 
>  /**
>   * Copy 32 bytes from one location to another,
> - * locations should not overlap.
> + * locations must not overlap.
>   */
>  static __rte_always_inline void
> -rte_mov32(uint8_t *dst, const uint8_t *src)
> +rte_mov32(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict
> src)
>  {
>  #if defined RTE_MEMCPY_AVX
>  	__m256i ymm0;
> @@ -132,12 +142,29 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
>  #endif
>  }
> 
> +/**
> + * Copy 48 bytes from one location to another,
> + * locations must not overlap.
> + */
> +static __rte_always_inline void
> +rte_mov48(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict
> src)
> +{
> +#if defined RTE_MEMCPY_AVX
> +	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +	rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
> +#else /* SSE implementation */
> +	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 *
> 16);
> +	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 *
> 16);
> +	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 *
> 16);
> +#endif
> +}
> +
>  /**
>   * Copy 64 bytes from one location to another,
> - * locations should not overlap.
> + * locations must not overlap.
>   */
>  static __rte_always_inline void
> -rte_mov64(uint8_t *dst, const uint8_t *src)
> +rte_mov64(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict
> src)
>  {
>  #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
>  	__m512i zmm0;
> @@ -152,10 +179,10 @@ rte_mov64(uint8_t *dst, const uint8_t *src)
> 
>  /**
>   * Copy 128 bytes from one location to another,
> - * locations should not overlap.
> + * locations must not overlap.
>   */
>  static __rte_always_inline void
> -rte_mov128(uint8_t *dst, const uint8_t *src)
> +rte_mov128(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict
> src)
>  {
>  	rte_mov64(dst + 0 * 64, src + 0 * 64);
>  	rte_mov64(dst + 1 * 64, src + 1 * 64);
> @@ -163,15 +190,234 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
> 
>  /**
>   * Copy 256 bytes from one location to another,
> - * locations should not overlap.
> + * locations must not overlap.
>   */
>  static __rte_always_inline void
> -rte_mov256(uint8_t *dst, const uint8_t *src)
> +rte_mov256(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict
> src)
>  {
>  	rte_mov128(dst + 0 * 128, src + 0 * 128);
>  	rte_mov128(dst + 1 * 128, src + 1 * 128);
>  }
> 
> +/**
> + * Copy bytes from one location to another,
> + * locations must not overlap.
> + * Use with n <= 16.
> + */
> +static __rte_always_inline void *
> +rte_mov16_or_less(void *__rte_restrict dst, const void *__rte_restrict
> src, size_t n)
> +{
> +	/*
> +	 * Faster way when size is known at build time.
> +	 * Sizes requiring three copy operations are not handled here,
> +	 * but proceed to the method using two overlapping copy
> operations.
> +	 */
> +	if (__rte_constant(n)) {
> +		if (n == 2) {
> +			rte_mov2((uint8_t *)dst, (const uint8_t *)src);
> +			return dst;
> +		}
> +		if (n == 3) {
> +			rte_mov2((uint8_t *)dst, (const uint8_t *)src);
> +			rte_mov1((uint8_t *)dst + 2, (const uint8_t *)src +
> 2);
> +			return dst;
> +		}
> +		if (n == 4) {
> +			rte_mov4((uint8_t *)dst, (const uint8_t *)src);
> +			return dst;
> +		}
> +		if (n == 5) {
> +			rte_mov4((uint8_t *)dst, (const uint8_t *)src);
> +			rte_mov1((uint8_t *)dst + 4, (const uint8_t *)src +
> 4);
> +			return dst;
> +		}
> +		if (n == 6) {
> +			rte_mov4((uint8_t *)dst, (const uint8_t *)src);
> +			rte_mov2((uint8_t *)dst + 4, (const uint8_t *)src +
> 4);
> +			return dst;
> +		}
> +		if (n == 8) {
> +			rte_mov8((uint8_t *)dst, (const uint8_t *)src);
> +			return dst;
> +		}
> +		if (n == 9) {
> +			rte_mov8((uint8_t *)dst, (const uint8_t *)src);
> +			rte_mov1((uint8_t *)dst + 8, (const uint8_t *)src +
> 8);
> +			return dst;
> +		}
> +		if (n == 10) {
> +			rte_mov8((uint8_t *)dst, (const uint8_t *)src);
> +			rte_mov2((uint8_t *)dst + 8, (const uint8_t *)src +
> 8);
> +			return dst;
> +		}
> +		if (n == 12) {
> +			rte_mov8((uint8_t *)dst, (const uint8_t *)src);
> +			rte_mov4((uint8_t *)dst + 8, (const uint8_t *)src +
> 8);
> +			return dst;
> +		}
> +		if (n == 16) {
> +			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> +			return dst;
> +		}
> +	}
> +
> +	/*
> +	 * Note: Using "n & X" generates 3-byte "test" instructions,
> +	 * instead of "n >= X", which would generate 4-byte "cmp"
> instructions.
> +	 */
> +	if (n & 0x18) { /* n >= 8, including n == 0x10, hence n & 0x18.
> */
> +		/* Copy 8 ~ 16 bytes. */
> +		rte_mov8((uint8_t *)dst, (const uint8_t *)src);
> +		rte_mov8((uint8_t *)dst - 8 + n, (const uint8_t *)src - 8 +
> n);
> +	} else if (n & 0x4) {
> +		/* Copy 4 ~ 7 bytes. */
> +		rte_mov4((uint8_t *)dst, (const uint8_t *)src);
> +		rte_mov4((uint8_t *)dst - 4 + n, (const uint8_t *)src - 4 +
> n);
> +	} else if (n & 0x2) {
> +		/* Copy 2 ~ 3 bytes. */
> +		rte_mov2((uint8_t *)dst, (const uint8_t *)src);
> +		rte_mov2((uint8_t *)dst - 2 + n, (const uint8_t *)src - 2 +
> n);
> +	} else if (n & 0x1) {
> +		/* Copy 1 byte. */
> +		rte_mov1((uint8_t *)dst, (const uint8_t *)src);
> +	}
> +	return dst;
> +}
> +
> +/**
> + * Copy bytes from one location to another,
> + * locations must not overlap.
> + * Use with 17 (or 16) <= n <= 32.
> + */
> +static __rte_always_inline void *
> +rte_mov17_to_32(void *__rte_restrict dst, const void *__rte_restrict
> src, size_t n)
> +{
> +	/*
> +	 * Faster way when size is known at build time.
> +	 * Sizes requiring three copy operations are not handled here,
> +	 * but proceed to the method using two overlapping copy
> operations.
> +	 */
> +	if (__rte_constant(n)) {
> +		if (n == 16) {
> +			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> +			return dst;
> +		}
> +		if (n == 17) {
> +			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> +			rte_mov1((uint8_t *)dst + 16, (const uint8_t *)src +
> 16);
> +			return dst;
> +		}
> +		if (n == 18) {
> +			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> +			rte_mov2((uint8_t *)dst + 16, (const uint8_t *)src +
> 16);
> +			return dst;
> +		}
> +		if (n == 20) {
> +			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> +			rte_mov4((uint8_t *)dst + 16, (const uint8_t *)src +
> 16);
> +			return dst;
> +		}
> +		if (n == 24) {
> +			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> +			rte_mov8((uint8_t *)dst + 16, (const uint8_t *)src +
> 16);
> +			return dst;
> +		}
> +		if (n == 32) {
> +			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +			return dst;
> +		}
> +	}
> +
> +	/* Copy 17 (or 16) ~ 32 bytes. */
> +	rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> +	rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 +
> n);
> +	return dst;
> +}
> +
> +/**
> + * Copy bytes from one location to another,
> + * locations must not overlap.
> + * Use with 33 (or 32) <= n <= 64.
> + */
> +static __rte_always_inline void *
> +rte_mov33_to_64(void *__rte_restrict dst, const void *__rte_restrict
> src, size_t n)
> +{
> +	/*
> +	 * Faster way when size is known at build time.
> +	 * Sizes requiring more copy operations are not handled here,
> +	 * but proceed to the method using overlapping copy operations.
> +	 */
> +	if (__rte_constant(n)) {
> +		if (n == 32) {
> +			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +			return dst;
> +		}
> +		if (n == 33) {
> +			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +			rte_mov1((uint8_t *)dst + 32, (const uint8_t *)src +
> 32);
> +			return dst;
> +		}
> +		if (n == 34) {
> +			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +			rte_mov2((uint8_t *)dst + 32, (const uint8_t *)src +
> 32);
> +			return dst;
> +		}
> +		if (n == 36) {
> +			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +			rte_mov4((uint8_t *)dst + 32, (const uint8_t *)src +
> 32);
> +			return dst;
> +		}
> +		if (n == 40) {
> +			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +			rte_mov8((uint8_t *)dst + 32, (const uint8_t *)src +
> 32);
> +			return dst;
> +		}
> +		if (n == 48) {
> +			rte_mov48((uint8_t *)dst, (const uint8_t *)src);
> +			return dst;
> +		}
> +#if !defined RTE_MEMCPY_AVX /* SSE specific implementation */
> +		if (n == 49) {
> +			rte_mov48((uint8_t *)dst, (const uint8_t *)src);
> +			rte_mov1((uint8_t *)dst + 48, (const uint8_t *)src +
> 48);
> +			return dst;
> +		}
> +		if (n == 50) {
> +			rte_mov48((uint8_t *)dst, (const uint8_t *)src);
> +			rte_mov2((uint8_t *)dst + 48, (const uint8_t *)src +
> 48);
> +			return dst;
> +		}
> +		if (n == 52) {
> +			rte_mov48((uint8_t *)dst, (const uint8_t *)src);
> +			rte_mov4((uint8_t *)dst + 48, (const uint8_t *)src +
> 48);
> +			return dst;
> +		}
> +		if (n == 56) {
> +			rte_mov48((uint8_t *)dst, (const uint8_t *)src);
> +			rte_mov8((uint8_t *)dst + 48, (const uint8_t *)src +
> 48);
> +			return dst;
> +		}
> +#endif
> +		if (n == 64) {
> +			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> +			return dst;
> +		}
> +	}
> +
> +	/* Copy 33 (or 32) ~ 64 bytes. */
> +#if defined RTE_MEMCPY_AVX
> +	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +	rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 +
> n);
> +#else /* SSE implementation */
> +	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 *
> 16);
> +	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 *
> 16);
> +	if (n > 48)
> +		rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2
> * 16);
> +	rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 +
> n);
> +#endif
> +	return dst;
> +}
> +
>  #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> 
>  /**
> @@ -182,10 +428,10 @@ rte_mov256(uint8_t *dst, const uint8_t *src)
> 
>  /**
>   * Copy 128-byte blocks from one location to another,
> - * locations should not overlap.
> + * locations must not overlap.
>   */
>  static __rte_always_inline void
> -rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
> +rte_mov128blocks(uint8_t *__rte_restrict dst, const uint8_t
> *__rte_restrict src, size_t n)
>  {
>  	__m512i zmm0, zmm1;
> 
> @@ -202,10 +448,10 @@ rte_mov128blocks(uint8_t *dst, const uint8_t
> *src, size_t n)
> 
>  /**
>   * Copy 512-byte blocks from one location to another,
> - * locations should not overlap.
> + * locations must not overlap.
>   */
>  static inline void
> -rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
> +rte_mov512blocks(uint8_t *__rte_restrict dst, const uint8_t
> *__rte_restrict src, size_t n)
>  {
>  	__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
> 
> @@ -232,45 +478,22 @@ rte_mov512blocks(uint8_t *dst, const uint8_t
> *src, size_t n)
>  	}
>  }
> 
> +/**
> + * Copy bytes from one location to another,
> + * locations must not overlap.
> + * Use with n > 64.
> + */
>  static __rte_always_inline void *
> -rte_memcpy_generic(void *dst, const void *src, size_t n)
> +rte_memcpy_generic_more_than_64(void *__rte_restrict dst, const void
> *__rte_restrict src,
> +		size_t n)
>  {
>  	void *ret = dst;
>  	size_t dstofss;
>  	size_t bits;
> 
> -	/**
> -	 * Copy less than 16 bytes
> -	 */
> -	if (n < 16) {
> -		return rte_mov15_or_less(dst, src, n);
> -	}
> -
>  	/**
>  	 * Fast way when copy size doesn't exceed 512 bytes
>  	 */
> -	if (__rte_constant(n) && n == 32) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		return ret;
> -	}
> -	if (n <= 32) {
> -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		if (__rte_constant(n) && n == 16)
> -			return ret; /* avoid (harmless) duplicate copy */
> -		rte_mov16((uint8_t *)dst - 16 + n,
> -				  (const uint8_t *)src - 16 + n);
> -		return ret;
> -	}
> -	if (__rte_constant(n) && n == 64) {
> -		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> -		return ret;
> -	}
> -	if (n <= 64) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov32((uint8_t *)dst - 32 + n,
> -				  (const uint8_t *)src - 32 + n);
> -		return ret;
> -	}
>  	if (n <= 512) {
>  		if (n >= 256) {
>  			n -= 256;
> @@ -351,10 +574,10 @@ rte_memcpy_generic(void *dst, const void *src,
> size_t n)
> 
>  /**
>   * Copy 128-byte blocks from one location to another,
> - * locations should not overlap.
> + * locations must not overlap.
>   */
>  static __rte_always_inline void
> -rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
> +rte_mov128blocks(uint8_t *__rte_restrict dst, const uint8_t
> *__rte_restrict src, size_t n)
>  {
>  	__m256i ymm0, ymm1, ymm2, ymm3;
> 
> @@ -381,41 +604,22 @@ rte_mov128blocks(uint8_t *dst, const uint8_t
> *src, size_t n)
>  	}
>  }
> 
> +/**
> + * Copy bytes from one location to another,
> + * locations must not overlap.
> + * Use with n > 64.
> + */
>  static __rte_always_inline void *
> -rte_memcpy_generic(void *dst, const void *src, size_t n)
> +rte_memcpy_generic_more_than_64(void *__rte_restrict dst, const void
> *__rte_restrict src,
> +		size_t n)
>  {
>  	void *ret = dst;
>  	size_t dstofss;
>  	size_t bits;
> 
> -	/**
> -	 * Copy less than 16 bytes
> -	 */
> -	if (n < 16) {
> -		return rte_mov15_or_less(dst, src, n);
> -	}
> -
>  	/**
>  	 * Fast way when copy size doesn't exceed 256 bytes
>  	 */
> -	if (__rte_constant(n) && n == 32) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		return ret;
> -	}
> -	if (n <= 32) {
> -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		if (__rte_constant(n) && n == 16)
> -			return ret; /* avoid (harmless) duplicate copy */
> -		rte_mov16((uint8_t *)dst - 16 + n,
> -				(const uint8_t *)src - 16 + n);
> -		return ret;
> -	}
> -	if (n <= 64) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov32((uint8_t *)dst - 32 + n,
> -				(const uint8_t *)src - 32 + n);
> -		return ret;
> -	}
>  	if (n <= 256) {
>  		if (n >= 128) {
>  			n -= 128;
> @@ -482,7 +686,7 @@ rte_memcpy_generic(void *dst, const void *src,
> size_t n)
>  /**
>   * Macro for copying unaligned block from one location to another with
> constant load offset,
>   * 47 bytes leftover maximum,
> - * locations should not overlap.
> + * locations must not overlap.
>   * Requirements:
>   * - Store is aligned
>   * - Load offset is <offset>, which must be immediate value within [1,
> 15]
> @@ -542,7 +746,7 @@ rte_memcpy_generic(void *dst, const void *src,
> size_t n)
>  /**
>   * Macro for copying unaligned block from one location to another,
>   * 47 bytes leftover maximum,
> - * locations should not overlap.
> + * locations must not overlap.
>   * Use switch here because the aligning instruction requires immediate
> value for shift count.
>   * Requirements:
>   * - Store is aligned
> @@ -573,38 +777,23 @@ rte_memcpy_generic(void *dst, const void *src,
> size_t n)
>      }
> \
>  }
> 
> +/**
> + * Copy bytes from one location to another,
> + * locations must not overlap.
> + * Use with n > 64.
> + */
>  static __rte_always_inline void *
> -rte_memcpy_generic(void *dst, const void *src, size_t n)
> +rte_memcpy_generic_more_than_64(void *__rte_restrict dst, const void
> *__rte_restrict src,
> +		size_t n)
>  {
>  	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
>  	void *ret = dst;
>  	size_t dstofss;
>  	size_t srcofs;
> 
> -	/**
> -	 * Copy less than 16 bytes
> -	 */
> -	if (n < 16) {
> -		return rte_mov15_or_less(dst, src, n);
> -	}
> -
>  	/**
>  	 * Fast way when copy size doesn't exceed 512 bytes
>  	 */
> -	if (n <= 32) {
> -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		if (__rte_constant(n) && n == 16)
> -			return ret; /* avoid (harmless) duplicate copy */
> -		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src -
> 16 + n);
> -		return ret;
> -	}
> -	if (n <= 64) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		if (n > 48)
> -			rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src +
> 32);
> -		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src -
> 16 + n);
> -		return ret;
> -	}
>  	if (n <= 128) {
>  		goto COPY_BLOCK_128_BACK15;
>  	}
> @@ -696,44 +885,17 @@ rte_memcpy_generic(void *dst, const void *src,
> size_t n)
> 
>  #endif /* __AVX512F__ */
> 
> +/**
> + * Copy bytes from one vector register size aligned location to
> another,
> + * locations must not overlap.
> + * Use with n > 64.
> + */
>  static __rte_always_inline void *
> -rte_memcpy_aligned(void *dst, const void *src, size_t n)
> +rte_memcpy_aligned_more_than_64(void *__rte_restrict dst, const void
> *__rte_restrict src,
> +		size_t n)
>  {
>  	void *ret = dst;
> 
> -	/* Copy size < 16 bytes */
> -	if (n < 16) {
> -		return rte_mov15_or_less(dst, src, n);
> -	}
> -
> -	/* Copy 16 <= size <= 32 bytes */
> -	if (__rte_constant(n) && n == 32) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		return ret;
> -	}
> -	if (n <= 32) {
> -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		if (__rte_constant(n) && n == 16)
> -			return ret; /* avoid (harmless) duplicate copy */
> -		rte_mov16((uint8_t *)dst - 16 + n,
> -				(const uint8_t *)src - 16 + n);
> -
> -		return ret;
> -	}
> -
> -	/* Copy 32 < size <= 64 bytes */
> -	if (__rte_constant(n) && n == 64) {
> -		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> -		return ret;
> -	}
> -	if (n <= 64) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov32((uint8_t *)dst - 32 + n,
> -				(const uint8_t *)src - 32 + n);
> -
> -		return ret;
> -	}
> -
>  	/* Copy 64 bytes blocks */
>  	for (; n > 64; n -= 64) {
>  		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> @@ -749,20 +911,28 @@ rte_memcpy_aligned(void *dst, const void *src,
> size_t n)
>  }
> 
>  static __rte_always_inline void *
> -rte_memcpy(void *dst, const void *src, size_t n)
> +rte_memcpy(void *__rte_restrict dst, const void *__rte_restrict src,
> size_t n)
>  {
> +	/* Common implementation for size <= 64 bytes. */
> +	if (n <= 16)
> +		return rte_mov16_or_less(dst, src, n);
> +	if (n <= 64) {
> +		/* Copy 17 ~ 64 bytes using vector instructions. */
> +		if (n <= 32)
> +			return rte_mov17_to_32(dst, src, n);
> +		else
> +			return rte_mov33_to_64(dst, src, n);
> +	}
> +
> +	/* Implementation for size > 64 bytes depends on alignment with
> vector register size. */
>  	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
> -		return rte_memcpy_aligned(dst, src, n);
> +		return rte_memcpy_aligned_more_than_64(dst, src, n);
>  	else
> -		return rte_memcpy_generic(dst, src, n);
> +		return rte_memcpy_generic_more_than_64(dst, src, n);
>  }
> 
>  #undef ALIGNMENT_MASK
> 
> -#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
> -#pragma GCC diagnostic pop
> -#endif
> -
>  #ifdef __cplusplus
>  }
>  #endif
> --
> 2.43.0


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH v7] eal/x86: optimize memcpy of small sizes
  2026-02-20 11:08 ` [PATCH v7] " Morten Brørup
  2026-03-11  7:28   ` Morten Brørup
@ 2026-03-11 16:58   ` Bruce Richardson
  2026-03-11 18:29     ` Morten Brørup
  2026-03-12  8:33   ` Konstantin Ananyev
  2026-03-19 15:55   ` Morten Brørup
  3 siblings, 1 reply; 55+ messages in thread
From: Bruce Richardson @ 2026-03-11 16:58 UTC (permalink / raw)
  To: Morten Brørup
  Cc: dev, Konstantin Ananyev, Vipin Varghese, Stephen Hemminger,
	Liangxing Wang, Thiyagarajan P, Bala Murali Krishna

On Fri, Feb 20, 2026 at 11:08:24AM +0000, Morten Brørup wrote:
> The implementation for copying up to 64 bytes does not depend on address
> alignment with the size of the CPU's vector registers. Nonetheless, the
> exact same code for copying up to 64 bytes was present in both the aligned
> copy function and all the CPU vector register size specific variants of
> the unaligned copy functions.
> With this patch, the implementation for copying up to 64 bytes was
> consolidated into one instance, located in the common copy function,
> before checking alignment requirements.
> This provides three benefits:
> 1. No copy-paste in the source code.
> 2. A performance gain for copying up to 64 bytes, because the
> address alignment check is avoided in this case.
> 3. Reduced instruction memory footprint, because the compiler only
> generates one instance of the function for copying up to 64 bytes, instead
> of two instances (one in the unaligned copy function, and one in the
> aligned copy function).
> 
> Furthermore, the function for copying less than 16 bytes was replaced with
> a smarter implementation using fewer branches and potentially fewer
> load/store operations.
> This function was also extended to handle copying of up to 16 bytes,
> instead of up to 15 bytes.
> This small extension reduces the code path, and thus improves the
> performance, for copying two pointers on 64-bit architectures and four
> pointers on 32-bit architectures.
> 
> Also, __rte_restrict was added to source and destination addresses.
> 
> And finally, the missing implementation of rte_mov48() was added.
> 
> Regarding performance, the memcpy performance test showed cache-to-cache
> copying of up to 32 bytes now takes 2 cycles, versus ca. 6.5 cycles before
> this patch.
> Copying 64 bytes now takes 4 cycles, versus 7 cycles before.
> 
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> ---
> v7:
> * Updated patch description. Mainly to clarify that the changes related to
>   copying up to 64 bytes simply replace multiple instances of copy-pasted
>   code with one common instance.
> * Fixed copy of build time known 16 bytes in rte_mov17_to_32(). (Vipin)
> * Rebased.
> v6:
> * Went back to using rte_uintN_alias structures for copying instead of
>   using memcpy(). They were there for a reason.
>   (Inspired by the discussion about optimizing the checksum function.)
> * Removed note about copying uninitialized data.
> * Added __rte_restrict to source and destination addresses.
>   Updated function descriptions from "should" to "must" not overlap.
> * Changed rte_mov48() AVX implementation to copy 32+16 bytes instead of
>   copying 32 + 32 overlapping bytes. (Konstantin)
> * Ignoring "-Wstringop-overflow" is not needed, so it was removed.
> v5:
> * Reverted v4: Replace SSE2 _mm_loadu_si128() with SSE3 _mm_lddqu_si128().
>   It was slower.
> * Improved some comments. (Konstantin Ananyev)
> * Moved the size range 17..32 inside the size <= 64 branch, so when
>   building for SSE, the generated code can start copying the first
>   16 bytes before comparing if the size is greater than 32 or not.
> * Just require RTE_MEMCPY_AVX for using rte_mov32() in rte_mov33_to_64().
> v4:
> * Replace SSE2 _mm_loadu_si128() with SSE3 _mm_lddqu_si128().
> v3:
> * Fixed typo in comment.
> v2:
> * Updated patch title to reflect that the performance is improved.
> * Use the design pattern of two overlapping stores for small copies too.
> * Expanded first branch from size < 16 to size <= 16.
> * Handle more build time constant copy sizes.
> ---
>  lib/eal/x86/include/rte_memcpy.h | 526 ++++++++++++++++++++-----------
>  1 file changed, 348 insertions(+), 178 deletions(-)
> 

I'm a little unhappy to see the amount of memcpy code growing rather than
shrinking, but since it improves performance I'm ok with it. We should keep
it under constant review though.

> diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
> index 46d34b8081..ed8e5f8dc4 100644
> --- a/lib/eal/x86/include/rte_memcpy.h
> +++ b/lib/eal/x86/include/rte_memcpy.h
> @@ -22,11 +22,6 @@
>  extern "C" {
>  #endif
>  
> -#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
> -#pragma GCC diagnostic push
> -#pragma GCC diagnostic ignored "-Wstringop-overflow"
> -#endif
> -
>  /*
>   * GCC older than version 11 doesn't compile AVX properly, so use SSE instead.
>   * There are no problems with AVX2.
> @@ -40,9 +35,6 @@ extern "C" {
>  /**
>   * Copy bytes from one location to another. The locations must not overlap.
>   *
> - * @note This is implemented as a macro, so it's address should not be taken
> - * and care is needed as parameter expressions may be evaluated multiple times.
> - *

I'd be wary about completely removing this comment, as we may well want to
go back to a macro in the future, e.g. if we decide to remove the custom
rte_memcpy altogether. Therefore, rather than removing the comment, can we
tweak it to say "This may be implemented as a macro..."?


Acked-by: Bruce Richardson <bruce.richardson@intel.com>

PS: If we want a little further cleanup, I'd consider removing the
RTE_MEMCPY_AVX macro and replacing it with a straight check for __AVX2__.
CPUs with AVX2 were introduced in 2013, and checking Claude and Wikipedia
says that AMD parts started having it in 2015, meaning that there were only
a few generations of CPUs >10 years ago which had AVX but not AVX2. [There
were later CPUs e.g. lower-end parts, which didn't have AVX2, but they
didn't have AVX1 either, so SSE is the only choice there]
Not a big cleanup if we did remove it, but sometimes every little helps!

^ permalink raw reply	[flat|nested] 55+ messages in thread

* RE: [PATCH v7] eal/x86: optimize memcpy of small sizes
  2026-03-11 16:58   ` Bruce Richardson
@ 2026-03-11 18:29     ` Morten Brørup
  2026-03-11 19:09       ` Bruce Richardson
  0 siblings, 1 reply; 55+ messages in thread
From: Morten Brørup @ 2026-03-11 18:29 UTC (permalink / raw)
  To: Bruce Richardson
  Cc: dev, Konstantin Ananyev, Vipin Varghese, Stephen Hemminger,
	Liangxing Wang, Thiyagarajan P, Bala Murali Krishna

> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> Sent: Wednesday, 11 March 2026 17.59
> 
> On Fri, Feb 20, 2026 at 11:08:24AM +0000, Morten Brørup wrote:
> > The implementation for copying up to 64 bytes does not depend on
> address
> > alignment with the size of the CPU's vector registers. Nonetheless,
> the
> > exact same code for copying up to 64 bytes was present in both the
> aligned
> > copy function and all the CPU vector register size specific variants
> of
> > the unaligned copy functions.
> > With this patch, the implementation for copying up to 64 bytes was
> > consolidated into one instance, located in the common copy function,
> > before checking alignment requirements.
> > This provides three benefits:
> > 1. No copy-paste in the source code.
> > 2. A performance gain for copying up to 64 bytes, because the
> > address alignment check is avoided in this case.
> > 3. Reduced instruction memory footprint, because the compiler only
> > generates one instance of the function for copying up to 64 bytes,
> instead
> > of two instances (one in the unaligned copy function, and one in the
> > aligned copy function).
> >
> > Furthermore, the function for copying less than 16 bytes was replaced
> with
> > a smarter implementation using fewer branches and potentially fewer
> > load/store operations.
> > This function was also extended to handle copying of up to 16 bytes,
> > instead of up to 15 bytes.
> > This small extension reduces the code path, and thus improves the
> > performance, for copying two pointers on 64-bit architectures and
> four
> > pointers on 32-bit architectures.
> >
> > Also, __rte_restrict was added to source and destination addresses.
> >
> > And finally, the missing implementation of rte_mov48() was added.
> >
> > Regarding performance, the memcpy performance test showed cache-to-
> cache
> > copying of up to 32 bytes now takes 2 cycles, versus ca. 6.5 cycles
> before
> > this patch.
> > Copying 64 bytes now takes 4 cycles, versus 7 cycles before.
> >
> > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> > ---
> > v7:
> > * Updated patch description. Mainly to clarify that the changes
> related to
> >   copying up to 64 bytes simply replaces multiple instances of copy-
> pasted
> >   code with one common instance.
> > * Fixed copy of build time known 16 bytes in rte_mov17_to_32().
> (Vipin)
> > * Rebased.
> > v6:
> > * Went back to using rte_uintN_alias structures for copying instead
> of
> >   using memcpy(). They were there for a reason.
> >   (Inspired by the discussion about optimizing the checksum
> function.)
> > * Removed note about copying uninitialized data.
> > * Added __rte_restrict to source and destination addresses.
> >   Updated function descriptions from "should" to "must" not overlap.
> > * Changed rte_mov48() AVX implementation to copy 32+16 bytes instead
> of
> >   copying 32 + 32 overlapping bytes. (Konstantin)
> > * Ignoring "-Wstringop-overflow" is not needed, so it was removed.
> > v5:
> > * Reverted v4: Replace SSE2 _mm_loadu_si128() with SSE3
> _mm_lddqu_si128().
> >   It was slower.
> > * Improved some comments. (Konstantin Ananyev)
> > * Moved the size range 17..32 inside the size <= 64 branch, so when
> >   building for SSE, the generated code can start copying the first
> >   16 bytes before comparing if the size is greater than 32 or not.
> > * Just require RTE_MEMCPY_AVX for using rte_mov32() in
> rte_mov33_to_64().
> > v4:
> > * Replace SSE2 _mm_loadu_si128() with SSE3 _mm_lddqu_si128().
> > v3:
> > * Fixed typo in comment.
> > v2:
> > * Updated patch title to reflect that the performance is improved.
> > * Use the design pattern of two overlapping stores for small copies
> too.
> > * Expanded first branch from size < 16 to size <= 16.
> > * Handle more build time constant copy sizes.
> > ---
> >  lib/eal/x86/include/rte_memcpy.h | 526 ++++++++++++++++++++---------
> --
> >  1 file changed, 348 insertions(+), 178 deletions(-)
> >
> 
> I'm a little unhappy to see the amount of memcpy code growing rather
> than
> shrinking, but since it improves performance I'm ok with it. We should
> keep
> it under constant review though.

Agree!

I just counted; 149 of the added lines are for handling __rte_constant(n). So it's not as bad as it looks.
But still growing, which was not the intention.
When I started working on this patch, the intention was to consolidate the copy-pasted instances for handling up to 64 bytes into one instance. This should have reduced the amount of code.
But then it somehow grew anyway.

> 
> > diff --git a/lib/eal/x86/include/rte_memcpy.h
> b/lib/eal/x86/include/rte_memcpy.h
> > index 46d34b8081..ed8e5f8dc4 100644
> > --- a/lib/eal/x86/include/rte_memcpy.h
> > +++ b/lib/eal/x86/include/rte_memcpy.h
> > @@ -22,11 +22,6 @@
> >  extern "C" {
> >  #endif
> >
> > -#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
> > -#pragma GCC diagnostic push
> > -#pragma GCC diagnostic ignored "-Wstringop-overflow"
> > -#endif
> > -
> >  /*
> >   * GCC older than version 11 doesn't compile AVX properly, so use
> SSE instead.
> >   * There are no problems with AVX2.
> > @@ -40,9 +35,6 @@ extern "C" {
> >  /**
> >   * Copy bytes from one location to another. The locations must not
> overlap.
> >   *
> > - * @note This is implemented as a macro, so it's address should not
> be taken
> > - * and care is needed as parameter expressions may be evaluated
> multiple times.
> > - *
> 
> I'd be wary about completely removing this comment, as we may well want
> to
> go back to a macro in the future, e.g. if we decide to remove the
> custom
> rte_memcpy altogether. Therefore, rather than removing the comment, can
> we
> tweak it to say "This may be implemented as a macro..."

The comment is still present in the "generic" header file used for the Doxygen documentation:
https://elixir.bootlin.com/dpdk/v26.03-rc1/source/lib/eal/include/generic/rte_memcpy.h#L99

All other architectures rely on the "generic" header file, and have no Doxygen comments at all.
We could also remove them from the x86 implementation.
That would shrink the file even more. ;-)
But I'd rather keep the comments - at least for now.

> 
> 
> Acked-by: Bruce Richardson <bruce.richardson@intel.com>

Thank you for the quick response, Bruce.

> 
> PS: If we want a little further cleanup, I'd consider removing the
> RTE_MEMCPY_AVX macro and replacing it with a straight check for
> __AVX2__.
> CPUs with AVX2 was introduced in 2013, and checking Claude and
> Wikipedia
> says that AMD parts started having it in 2015, meaning that there were
> only
> a few generations of CPUs >10 years ago which had AVX but not AVX2.
> [There
> were later CPUs e.g. lower-end parts, which didn't have AVX2, but they
> didn't have AVX1 either, so SSE is the only choice there]
> Not a big cleanup if we did remove it, but sometimes every little
> helps!

Good idea. But let's not do it now.


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH v7] eal/x86: optimize memcpy of small sizes
  2026-03-11 18:29     ` Morten Brørup
@ 2026-03-11 19:09       ` Bruce Richardson
  0 siblings, 0 replies; 55+ messages in thread
From: Bruce Richardson @ 2026-03-11 19:09 UTC (permalink / raw)
  To: Morten Brørup
  Cc: dev, Konstantin Ananyev, Vipin Varghese, Stephen Hemminger,
	Liangxing Wang, Thiyagarajan P, Bala Murali Krishna

On Wed, Mar 11, 2026 at 07:29:38PM +0100, Morten Brørup wrote:
> > From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > Sent: Wednesday, 11 March 2026 17.59
> > 
> > On Fri, Feb 20, 2026 at 11:08:24AM +0000, Morten Brørup wrote:
> > > The implementation for copying up to 64 bytes does not depend on
> > address
> > > alignment with the size of the CPU's vector registers. Nonetheless,
> > the
> > > exact same code for copying up to 64 bytes was present in both the
> > aligned
> > > copy function and all the CPU vector register size specific variants
> > of
> > > the unaligned copy functions.
> > > With this patch, the implementation for copying up to 64 bytes was
> > > consolidated into one instance, located in the common copy function,
> > > before checking alignment requirements.
> > > This provides three benefits:
> > > 1. No copy-paste in the source code.
> > > 2. A performance gain for copying up to 64 bytes, because the
> > > address alignment check is avoided in this case.
> > > 3. Reduced instruction memory footprint, because the compiler only
> > > generates one instance of the function for copying up to 64 bytes,
> > instead
> > > of two instances (one in the unaligned copy function, and one in the
> > > aligned copy function).
> > >
> > > Furthermore, the function for copying less than 16 bytes was replaced
> > with
> > > a smarter implementation using fewer branches and potentially fewer
> > > load/store operations.
> > > This function was also extended to handle copying of up to 16 bytes,
> > > instead of up to 15 bytes.
> > > This small extension reduces the code path, and thus improves the
> > > performance, for copying two pointers on 64-bit architectures and
> > four
> > > pointers on 32-bit architectures.
> > >
> > > Also, __rte_restrict was added to source and destination addresses.
> > >
> > > And finally, the missing implementation of rte_mov48() was added.
> > >
> > > Regarding performance, the memcpy performance test showed cache-to-
> > cache
> > > copying of up to 32 bytes now takes 2 cycles, versus ca. 6.5 cycles
> > before
> > > this patch.
> > > Copying 64 bytes now takes 4 cycles, versus 7 cycles before.
> > >
> > > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> > > ---
> > > v7:
> > > * Updated patch description. Mainly to clarify that the changes
> > related to
> > >   copying up to 64 bytes simply replaces multiple instances of copy-
> > pasted
> > >   code with one common instance.
> > > * Fixed copy of build time known 16 bytes in rte_mov17_to_32().
> > (Vipin)
> > > * Rebased.
> > > v6:
> > > * Went back to using rte_uintN_alias structures for copying instead
> > of
> > >   using memcpy(). They were there for a reason.
> > >   (Inspired by the discussion about optimizing the checksum
> > function.)
> > > * Removed note about copying uninitialized data.
> > > * Added __rte_restrict to source and destination addresses.
> > >   Updated function descriptions from "should" to "must" not overlap.
> > > * Changed rte_mov48() AVX implementation to copy 32+16 bytes instead
> > of
> > >   copying 32 + 32 overlapping bytes. (Konstantin)
> > > * Ignoring "-Wstringop-overflow" is not needed, so it was removed.
> > > v5:
> > > * Reverted v4: Replace SSE2 _mm_loadu_si128() with SSE3
> > _mm_lddqu_si128().
> > >   It was slower.
> > > * Improved some comments. (Konstantin Ananyev)
> > > * Moved the size range 17..32 inside the size <= 64 branch, so when
> > >   building for SSE, the generated code can start copying the first
> > >   16 bytes before comparing if the size is greater than 32 or not.
> > > * Just require RTE_MEMCPY_AVX for using rte_mov32() in
> > rte_mov33_to_64().
> > > v4:
> > > * Replace SSE2 _mm_loadu_si128() with SSE3 _mm_lddqu_si128().
> > > v3:
> > > * Fixed typo in comment.
> > > v2:
> > > * Updated patch title to reflect that the performance is improved.
> > > * Use the design pattern of two overlapping stores for small copies
> > too.
> > > * Expanded first branch from size < 16 to size <= 16.
> > > * Handle more build time constant copy sizes.
> > > ---
> > >  lib/eal/x86/include/rte_memcpy.h | 526 ++++++++++++++++++++---------
> > --
> > >  1 file changed, 348 insertions(+), 178 deletions(-)
> > >
> > 
> > I'm a little unhappy to see the amount of memcpy code growing rather
> > than
> > shrinking, but since it improves performance I'm ok with it. We should
> > keep
> > it under constant review though.
> 
> Agree!
> 
> I just counted; 149 of the added lines are for handling __rte_constant(n). So it's not as bad as it looks.
> But still growing, which was not the intention.
> When I started working on this patch, the intention was to consolidate the copy-pasted instances for handling up to 64 bytes into one instance. This should have reduced the amount of code.
> But then it somehow grew anyway.
> 
> > 
> > > diff --git a/lib/eal/x86/include/rte_memcpy.h
> > b/lib/eal/x86/include/rte_memcpy.h
> > > index 46d34b8081..ed8e5f8dc4 100644
> > > --- a/lib/eal/x86/include/rte_memcpy.h
> > > +++ b/lib/eal/x86/include/rte_memcpy.h
> > > @@ -22,11 +22,6 @@
> > >  extern "C" {
> > >  #endif
> > >
> > > -#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
> > > -#pragma GCC diagnostic push
> > > -#pragma GCC diagnostic ignored "-Wstringop-overflow"
> > > -#endif
> > > -
> > >  /*
> > >   * GCC older than version 11 doesn't compile AVX properly, so use
> > SSE instead.
> > >   * There are no problems with AVX2.
> > > @@ -40,9 +35,6 @@ extern "C" {
> > >  /**
> > >   * Copy bytes from one location to another. The locations must not
> > overlap.
> > >   *
> > > - * @note This is implemented as a macro, so it's address should not
> > be taken
> > > - * and care is needed as parameter expressions may be evaluated
> > multiple times.
> > > - *
> > 
> > I'd be wary about completely removing this comment, as we may well want
> > to
> > go back to a macro in the future, e.g. if we decide to remove the
> > custom
> > rte_memcpy altogether. Therefore, rather than removing the comment, can
> > we
> > tweak it to say "This may be implemented as a macro..."
> 
> The comment is still present in the "generic" header file used for the Doxygen documentation:
> https://elixir.bootlin.com/dpdk/v26.03-rc1/source/lib/eal/include/generic/rte_memcpy.h#L99
> 
> All other architectures rely on the "generic" header file, and have no Doxygen comments at all.
> We could also remove them from the x86 implementation.
> That would shrink the file even more. ;-)
> But I'd rather keep the comments - at least for now.
> 
> > 
> > 
> > Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> 
> Thank you for quick response, Bruce.
> 
> > 
> > PS: If we want a little further cleanup, I'd consider removing the
> > RTE_MEMCPY_AVX macro and replacing it with a straight check for
> > __AVX2__.
> > CPUs with AVX2 was introduced in 2013, and checking Claude and
> > Wikipedia
> > says that AMD parts started having it in 2015, meaning that there were
> > only
> > a few generations of CPUs >10 years ago which had AVX but not AVX2.
> > [There
> > were later CPUs e.g. lower-end parts, which didn't have AVX2, but they
> > didn't have AVX1 either, so SSE is the only choice there]
> > Not a big cleanup if we did remove it, but sometimes every little
> > helps!
> 
> Good idea. But let's not do it now.
>
Agree on all counts. 

^ permalink raw reply	[flat|nested] 55+ messages in thread

* RE: [PATCH v7] eal/x86: optimize memcpy of small sizes
  2026-02-20 11:08 ` [PATCH v7] " Morten Brørup
  2026-03-11  7:28   ` Morten Brørup
  2026-03-11 16:58   ` Bruce Richardson
@ 2026-03-12  8:33   ` Konstantin Ananyev
  2026-03-19 15:55   ` Morten Brørup
  3 siblings, 0 replies; 55+ messages in thread
From: Konstantin Ananyev @ 2026-03-12  8:33 UTC (permalink / raw)
  To: Morten Brørup, dev@dpdk.org, Bruce Richardson,
	Konstantin Ananyev, Vipin Varghese, Stephen Hemminger,
	Liangxing Wang
  Cc: Thiyagarajan P, Bala Murali Krishna



> 
> The implementation for copying up to 64 bytes does not depend on address
> alignment with the size of the CPU's vector registers. Nonetheless, the
> exact same code for copying up to 64 bytes was present in both the aligned
> copy function and all the CPU vector register size specific variants of
> the unaligned copy functions.
> With this patch, the implementation for copying up to 64 bytes was
> consolidated into one instance, located in the common copy function,
> before checking alignment requirements.
> This provides three benefits:
> 1. No copy-paste in the source code.
> 2. A performance gain for copying up to 64 bytes, because the
> address alignment check is avoided in this case.
> 3. Reduced instruction memory footprint, because the compiler only
> generates one instance of the function for copying up to 64 bytes, instead
> of two instances (one in the unaligned copy function, and one in the
> aligned copy function).
> 
> Furthermore, the function for copying less than 16 bytes was replaced with
> a smarter implementation using fewer branches and potentially fewer
> load/store operations.
> This function was also extended to handle copying of up to 16 bytes,
> instead of up to 15 bytes.
> This small extension reduces the code path, and thus improves the
> performance, for copying two pointers on 64-bit architectures and four
> pointers on 32-bit architectures.
> 
> Also, __rte_restrict was added to source and destination addresses.
> 
> And finally, the missing implementation of rte_mov48() was added.
> 
> Regarding performance, the memcpy performance test showed cache-to-cache
> copying of up to 32 bytes now takes 2 cycles, versus ca. 6.5 cycles before
> this patch.
> Copying 64 bytes now takes 4 cycles, versus 7 cycles before.
> 
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> ---

Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>

> --
> 2.43.0


^ permalink raw reply	[flat|nested] 55+ messages in thread

* RE: [PATCH v7] eal/x86: optimize memcpy of small sizes
  2026-02-20 11:08 ` [PATCH v7] " Morten Brørup
                     ` (2 preceding siblings ...)
  2026-03-12  8:33   ` Konstantin Ananyev
@ 2026-03-19 15:55   ` Morten Brørup
  3 siblings, 0 replies; 55+ messages in thread
From: Morten Brørup @ 2026-03-19 15:55 UTC (permalink / raw)
  To: dev

Recheck-request: github-robot


^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH v8] eal/x86: optimize memcpy of small sizes
  2025-11-20 11:45 [PATCH] eal/x86: reduce memcpy code duplication Morten Brørup
                   ` (6 preceding siblings ...)
  2026-02-20 11:08 ` [PATCH v7] " Morten Brørup
@ 2026-04-29  9:36 ` Morten Brørup
  2026-04-29 10:35 ` [PATCH v9] " Morten Brørup
                   ` (2 subsequent siblings)
  10 siblings, 0 replies; 55+ messages in thread
From: Morten Brørup @ 2026-04-29  9:36 UTC (permalink / raw)
  To: dev, Bruce Richardson, Konstantin Ananyev, Vipin Varghese,
	Stephen Hemminger, Liangxing Wang
  Cc: Thiyagarajan P, Bala Murali Krishna, Morten Brørup,
	Konstantin Ananyev

The implementation for copying up to 64 bytes does not depend on address
alignment with the size of the CPU's vector registers. Nonetheless, the
exact same code for copying up to 64 bytes was present in both the aligned
copy function and all the CPU vector register size specific variants of
the unaligned copy functions.
With this patch, the implementation for copying up to 64 bytes was
consolidated into one instance, located in the common copy function,
before checking alignment requirements.
This provides three benefits:
1. No copy-paste in the source code.
2. A performance gain for copying up to 64 bytes, because the
address alignment check is avoided in this case.
3. Reduced instruction memory footprint, because the compiler only
generates one instance of the function for copying up to 64 bytes, instead
of two instances (one in the unaligned copy function, and one in the
aligned copy function).

Furthermore, __rte_restrict was added to source and destination addresses.

And finally, the missing implementation of rte_mov48() was added.

Regarding performance...
The memcpy performance test (cache-to-cache copy) shows:
Copying up to 15 bytes takes ca. 4.5 cycles, versus ca. 6.5 cycles before.
Copying 8 bytes takes 4 cycles, versus 7 cycles before.
Copying 16 bytes takes 2 cycles, versus 4 cycles before.
Copying 64 bytes takes 4 cycles, versus 7 cycles before.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
---
v8:
* Reverted the first branch from size <= 16 back to size < 16, restored
  the original rte_mov15_or_less() function, and removed the new
  rte_mov16_or_less() function.
  When rte_memcpy() is used for copying an array of pointers, and the
  number of pointers to copy is low (size <= 64 bytes), it is more likely
  that the number of pointers to copy is 1 than 2.
  The rte_mov15_or_less() implementation handles copying 8 bytes more
  efficiently than the rte_mov16_or_less() implementation, which copied
  the 8-byte pointer twice.
  Also note that with rte_mov15_or_less(), the compiler can optimize away
  the branches handling n & 1, n & 2 and n & 4 when it is known at build
  time that (8-byte) pointers are being copied. (For 32-bit architecture,
  the n & 4 will not be optimized away when copying pointers.)
  This reversion also makes the patch less revolutionary and more
  incremental.
* Removed a lot of code for handling compile time known sizes. (Bruce)
  The rte_memcpy() function should not be used for small copies with
  compile time known sizes, so handling it is considered superfluous.
  Removing it improves source code readability. And reduces the size of
  the patch.
* Kept acks from Bruce and Konstantin (both given to v7).
v7:
* Updated patch description. Mainly to clarify that the changes related to
  copying up to 64 bytes simply replace multiple instances of copy-pasted
  code with one common instance.
* Fixed copy of build time known 16 bytes in rte_mov17_to_32(). (Vipin)
* Rebased.
v6:
* Went back to using rte_uintN_alias structures for copying instead of
  using memcpy(). They were there for a reason.
  (Inspired by the discussion about optimizing the checksum function.)
* Removed note about copying uninitialized data.
* Added __rte_restrict to source and destination addresses.
  Updated function descriptions from "should" to "must" not overlap.
* Changed rte_mov48() AVX implementation to copy 32+16 bytes instead of
  copying 32 + 32 overlapping bytes. (Konstantin)
* Ignoring "-Wstringop-overflow" is not needed, so it was removed.
v5:
* Reverted v4: Replace SSE2 _mm_loadu_si128() with SSE3 _mm_lddqu_si128().
  It was slower.
* Improved some comments. (Konstantin Ananyev)
* Moved the size range 17..32 inside the size <= 64 branch, so when
  building for SSE, the generated code can start copying the first
  16 bytes before comparing if the size is greater than 32 or not.
* Just require RTE_MEMCPY_AVX for using rte_mov32() in rte_mov33_to_64().
v4:
* Replace SSE2 _mm_loadu_si128() with SSE3 _mm_lddqu_si128().
v3:
* Fixed typo in comment.
v2:
* Updated patch title to reflect that the performance is improved.
* Use the design pattern of two overlapping stores for small copies too.
* Expanded first branch from size < 16 to size <= 16.
* Handle more build time constant copy sizes.
---
 lib/eal/x86/include/rte_memcpy.h | 274 ++++++++++++++-----------------
 1 file changed, 126 insertions(+), 148 deletions(-)

diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 46d34b8081..e0645db2f1 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -22,11 +22,6 @@
 extern "C" {
 #endif
 
-#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wstringop-overflow"
-#endif
-
 /*
  * GCC older than version 11 doesn't compile AVX properly, so use SSE instead.
  * There are no problems with AVX2.
@@ -40,9 +35,6 @@ extern "C" {
 /**
  * Copy bytes from one location to another. The locations must not overlap.
  *
- * @note This is implemented as a macro, so it's address should not be taken
- * and care is needed as parameter expressions may be evaluated multiple times.
- *
  * @param dst
  *   Pointer to the destination of the data.
  * @param src
@@ -53,15 +45,15 @@ extern "C" {
  *   Pointer to the destination data.
  */
 static __rte_always_inline void *
-rte_memcpy(void *dst, const void *src, size_t n);
+rte_memcpy(void *__rte_restrict dst, const void *__rte_restrict src, size_t n);
 
 /**
  * Copy bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  * Use with n <= 15.
  */
 static __rte_always_inline void *
-rte_mov15_or_less(void *dst, const void *src, size_t n)
+rte_mov15_or_less(void *__rte_restrict dst, const void *__rte_restrict src, size_t n)
 {
 	/**
 	 * Use the following structs to avoid violating C standard
@@ -103,10 +95,10 @@ rte_mov15_or_less(void *dst, const void *src, size_t n)
 
 /**
  * Copy 16 bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
+rte_mov16(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
 {
 	__m128i xmm0;
 
@@ -116,10 +108,10 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
 
 /**
  * Copy 32 bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
+rte_mov32(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
 {
 #if defined RTE_MEMCPY_AVX
 	__m256i ymm0;
@@ -132,12 +124,29 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
 #endif
 }
 
+/**
+ * Copy 48 bytes from one location to another,
+ * locations must not overlap.
+ */
+static __rte_always_inline void
+rte_mov48(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
+{
+#if defined RTE_MEMCPY_AVX
+	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+	rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
+#else /* SSE implementation */
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+#endif
+}
+
 /**
  * Copy 64 bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
+rte_mov64(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
 {
 #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 	__m512i zmm0;
@@ -152,10 +161,10 @@ rte_mov64(uint8_t *dst, const uint8_t *src)
 
 /**
  * Copy 128 bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
+rte_mov128(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
 {
 	rte_mov64(dst + 0 * 64, src + 0 * 64);
 	rte_mov64(dst + 1 * 64, src + 1 * 64);
@@ -163,15 +172,63 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
 
 /**
  * Copy 256 bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
+rte_mov256(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
 {
 	rte_mov128(dst + 0 * 128, src + 0 * 128);
 	rte_mov128(dst + 1 * 128, src + 1 * 128);
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations must not overlap.
+ * Use with 16 <= n <= 32.
+ */
+static __rte_always_inline void *
+rte_mov16_to_32(void *__rte_restrict dst, const void *__rte_restrict src, size_t n)
+{
+	if (__rte_constant(n) && n == 32) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		return dst;
+	}
+	rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+	if (__rte_constant(n) && n == 16)
+		return dst; /* avoid (harmless) duplicate copy */
+	rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+	return dst;
+}
+
+/**
+ * Copy bytes from one location to another,
+ * locations must not overlap.
+ * Use with 32 <= n <= 64.
+ */
+static __rte_always_inline void *
+rte_mov32_to_64(void *__rte_restrict dst, const void *__rte_restrict src, size_t n)
+{
+	if (__rte_constant(n) && n == 64) {
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		return dst;
+	}
+#if defined RTE_MEMCPY_AVX
+	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+	if (__rte_constant(n) && n == 32)
+		return dst; /* avoid (harmless) duplicate copy */
+	rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
+#else /* SSE implementation */
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	if (__rte_constant(n) && n == 32)
+		return dst; /* avoid (harmless) duplicate copy */
+	if (n > 48)
+		rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+	rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+#endif
+	return dst;
+}
+
 #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 
 /**
@@ -182,10 +239,10 @@ rte_mov256(uint8_t *dst, const uint8_t *src)
 
 /**
  * Copy 128-byte blocks from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+rte_mov128blocks(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src, size_t n)
 {
 	__m512i zmm0, zmm1;
 
@@ -202,10 +259,10 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 
 /**
  * Copy 512-byte blocks from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static inline void
-rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
+rte_mov512blocks(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src, size_t n)
 {
 	__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
 
@@ -232,45 +289,22 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 	}
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations must not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *__rte_restrict dst, const void *__rte_restrict src,
+		size_t n)
 {
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				  (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (__rte_constant(n) && n == 64) {
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				  (const uint8_t *)src - 32 + n);
-		return ret;
-	}
 	if (n <= 512) {
 		if (n >= 256) {
 			n -= 256;
@@ -351,10 +385,10 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 
 /**
  * Copy 128-byte blocks from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+rte_mov128blocks(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src, size_t n)
 {
 	__m256i ymm0, ymm1, ymm2, ymm3;
 
@@ -381,41 +415,22 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 	}
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations must not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *__rte_restrict dst, const void *__rte_restrict src,
+		size_t n)
 {
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 256 bytes
 	 */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				(const uint8_t *)src - 32 + n);
-		return ret;
-	}
 	if (n <= 256) {
 		if (n >= 128) {
 			n -= 128;
@@ -482,7 +497,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 /**
  * Macro for copying unaligned block from one location to another with constant load offset,
  * 47 bytes leftover maximum,
- * locations should not overlap.
+ * locations must not overlap.
  * Requirements:
  * - Store is aligned
  * - Load offset is <offset>, which must be immediate value within [1, 15]
@@ -542,7 +557,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 /**
  * Macro for copying unaligned block from one location to another,
  * 47 bytes leftover maximum,
- * locations should not overlap.
+ * locations must not overlap.
  * Use switch here because the aligning instruction requires immediate value for shift count.
  * Requirements:
  * - Store is aligned
@@ -573,38 +588,23 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
     }                                                                 \
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations must not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *__rte_restrict dst, const void *__rte_restrict src,
+		size_t n)
 {
 	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
 	void *ret = dst;
 	size_t dstofss;
 	size_t srcofs;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		if (n > 48)
-			rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
 	if (n <= 128) {
 		goto COPY_BLOCK_128_BACK15;
 	}
@@ -696,44 +696,17 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 
 #endif /* __AVX512F__ */
 
+/**
+ * Copy bytes from one vector register size aligned location to another,
+ * locations must not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_aligned(void *dst, const void *src, size_t n)
+rte_memcpy_aligned_more_than_64(void *__rte_restrict dst, const void *__rte_restrict src,
+		size_t n)
 {
 	void *ret = dst;
 
-	/* Copy size < 16 bytes */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
-	/* Copy 16 <= size <= 32 bytes */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
-
-		return ret;
-	}
-
-	/* Copy 32 < size <= 64 bytes */
-	if (__rte_constant(n) && n == 64) {
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				(const uint8_t *)src - 32 + n);
-
-		return ret;
-	}
-
 	/* Copy 64 bytes blocks */
 	for (; n > 64; n -= 64) {
 		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
@@ -749,20 +722,25 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 }
 
 static __rte_always_inline void *
-rte_memcpy(void *dst, const void *src, size_t n)
+rte_memcpy(void *__rte_restrict dst, const void *__rte_restrict src, size_t n)
 {
+	/* Common implementation for size <= 64 bytes. */
+	if (n < 16)
+		return rte_mov15_or_less(dst, src, n);
+	if (n <= 32)
+		return rte_mov16_to_32(dst, src, n);
+	if (n <= 64)
+		return rte_mov32_to_64(dst, src, n);
+
+	/* Implementation for size > 64 bytes depends on alignment with vector register size. */
 	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
-		return rte_memcpy_aligned(dst, src, n);
+		return rte_memcpy_aligned_more_than_64(dst, src, n);
 	else
-		return rte_memcpy_generic(dst, src, n);
+		return rte_memcpy_generic_more_than_64(dst, src, n);
 }
 
 #undef ALIGNMENT_MASK
 
-#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
-#pragma GCC diagnostic pop
-#endif
-
 #ifdef __cplusplus
 }
 #endif
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 55+ messages in thread

* [PATCH v9] eal/x86: optimize memcpy of small sizes
  2025-11-20 11:45 [PATCH] eal/x86: reduce memcpy code duplication Morten Brørup
                   ` (7 preceding siblings ...)
  2026-04-29  9:36 ` [PATCH v8] " Morten Brørup
@ 2026-04-29 10:35 ` Morten Brørup
  2026-04-29 11:24   ` Morten Brørup
                     ` (2 more replies)
  2026-05-08  9:58 ` [PATCH v10] " Morten Brørup
  2026-05-21 18:56 ` [PATCH v11] " Morten Brørup
  10 siblings, 3 replies; 55+ messages in thread
From: Morten Brørup @ 2026-04-29 10:35 UTC (permalink / raw)
  To: dev, Bruce Richardson, Konstantin Ananyev, Vipin Varghese,
	Stephen Hemminger, Liangxing Wang
  Cc: Thiyagarajan P, Bala Murali Krishna, Morten Brørup,
	Konstantin Ananyev

The implementation for copying up to 64 bytes does not depend on address
alignment with the size of the CPU's vector registers. Nonetheless, the
exact same code for copying up to 64 bytes was present in both the aligned
copy function and all the CPU vector register size specific variants of
the unaligned copy functions.
With this patch, the implementation for copying up to 64 bytes was
consolidated into one instance, located in the common copy function,
before checking alignment requirements.
This provides three benefits:
1. No copy-paste in the source code.
2. A performance gain for copying up to 64 bytes, because the
address alignment check is avoided in this case.
3. Reduced instruction memory footprint, because the compiler only
generates one instance of the function for copying up to 64 bytes, instead
of two instances (one in the unaligned copy function, and one in the
aligned copy function).

Furthermore, __rte_restrict was added to source and destination addresses.

And finally, the missing implementation of rte_mov48() was added.

Regarding performance...
The memcpy performance test (cache-to-cache copy) shows:
Copying up to 15 bytes takes ca. 4.5 cycles, versus ca. 6.5 cycles before.
Copying 8 bytes takes 4 cycles, versus 7 cycles before.
Copying 16 bytes takes 2 cycles, versus 4 cycles before.
Copying 64 bytes takes 4 cycles, versus 7 cycles before.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
---
v9:
* Removed new functions rte_mov16_to_32() and rte_mov32_to_64(), and moved
  their implementations into rte_memcpy() instead.
  There is no need for such public functions, and having them separate did
  not improve source code readability.
* Kept acks from Bruce and Konstantin (both given to v7).
v8:
* Reverted the first branch from size <= 16 back to size < 16, restored
  the original rte_mov15_or_less() function, and removed the new
  rte_mov16_or_less() function.
  When rte_memcpy() is used for copying an array of pointers, and the
  number of pointers to copy is low (size <= 64 bytes), it is more likely
  that the number of pointers to copy is 1 than 2.
  The rte_mov15_or_less() implementation handles copying 8 bytes more
  efficiently than the rte_mov16_or_less() implementation, which copied
  the 8-byte pointer twice.
  Also note that with rte_mov15_or_less(), the compiler can optimize away
  the branches handling n & 1, n & 2 and n & 4 when it is known at compile
  time that (8-byte) pointers are being copied. (For 32-bit architecture,
  the n & 4 will not be optimized away when copying pointers.)
  This reversion also makes the patch less revolutionary and more
  incremental.
* Removed a lot of code for handling compile time known sizes. (Bruce)
  The rte_memcpy() function should not be used for small copies with
  compile time known sizes, so handling it is considered superfluous.
  Removing it improves source code readability and reduces the size of
  the patch.
* Kept acks from Bruce and Konstantin (both given to v7).
v7:
* Updated patch description. Mainly to clarify that the changes related to
  copying up to 64 bytes simply replaces multiple instances of copy-pasted
  code with one common instance.
* Fixed copy of compile time known 16 bytes in rte_mov17_to_32(). (Vipin)
* Rebased.
v6:
* Went back to using rte_uintN_alias structures for copying instead of
  using memcpy(). They were there for a reason.
  (Inspired by the discussion about optimizing the checksum function.)
* Removed note about copying uninitialized data.
* Added __rte_restrict to source and destination addresses.
  Updated function descriptions from "should" to "must" not overlap.
* Changed rte_mov48() AVX implementation to copy 32+16 bytes instead of
  copying 32 + 32 overlapping bytes. (Konstantin)
* Ignoring "-Wstringop-overflow" is not needed, so it was removed.
v5:
* Reverted v4: Replace SSE2 _mm_loadu_si128() with SSE3 _mm_lddqu_si128().
  It was slower.
* Improved some comments. (Konstantin Ananyev)
* Moved the size range 17..32 inside the size <= 64 branch, so when
  building for SSE, the generated code can start copying the first
  16 bytes before comparing if the size is greater than 32 or not.
* Just require RTE_MEMCPY_AVX for using rte_mov32() in rte_mov33_to_64().
v4:
* Replace SSE2 _mm_loadu_si128() with SSE3 _mm_lddqu_si128().
v3:
* Fixed typo in comment.
v2:
* Updated patch title to reflect that the performance is improved.
* Use the design pattern of two overlapping stores for small copies too.
* Expanded first branch from size < 16 to size <= 16.
* Handle more compile time constant copy sizes.
---
 lib/eal/x86/include/rte_memcpy.h | 250 +++++++++++++------------------
 1 file changed, 102 insertions(+), 148 deletions(-)

diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 46d34b8081..8ed8c55010 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -22,11 +22,6 @@
 extern "C" {
 #endif
 
-#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wstringop-overflow"
-#endif
-
 /*
  * GCC older than version 11 doesn't compile AVX properly, so use SSE instead.
  * There are no problems with AVX2.
@@ -40,9 +35,6 @@ extern "C" {
 /**
  * Copy bytes from one location to another. The locations must not overlap.
  *
- * @note This is implemented as a macro, so it's address should not be taken
- * and care is needed as parameter expressions may be evaluated multiple times.
- *
  * @param dst
  *   Pointer to the destination of the data.
  * @param src
@@ -53,15 +45,15 @@ extern "C" {
  *   Pointer to the destination data.
  */
 static __rte_always_inline void *
-rte_memcpy(void *dst, const void *src, size_t n);
+rte_memcpy(void *__rte_restrict dst, const void *__rte_restrict src, size_t n);
 
 /**
  * Copy bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  * Use with n <= 15.
  */
 static __rte_always_inline void *
-rte_mov15_or_less(void *dst, const void *src, size_t n)
+rte_mov15_or_less(void *__rte_restrict dst, const void *__rte_restrict src, size_t n)
 {
 	/**
 	 * Use the following structs to avoid violating C standard
@@ -103,10 +95,10 @@ rte_mov15_or_less(void *dst, const void *src, size_t n)
 
 /**
  * Copy 16 bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
+rte_mov16(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
 {
 	__m128i xmm0;
 
@@ -116,10 +108,10 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
 
 /**
  * Copy 32 bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
+rte_mov32(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
 {
 #if defined RTE_MEMCPY_AVX
 	__m256i ymm0;
@@ -132,12 +124,29 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
 #endif
 }
 
+/**
+ * Copy 48 bytes from one location to another,
+ * locations must not overlap.
+ */
+static __rte_always_inline void
+rte_mov48(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
+{
+#if defined RTE_MEMCPY_AVX
+	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+	rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
+#else /* SSE implementation */
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+#endif
+}
+
 /**
  * Copy 64 bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
+rte_mov64(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
 {
 #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 	__m512i zmm0;
@@ -152,10 +161,10 @@ rte_mov64(uint8_t *dst, const uint8_t *src)
 
 /**
  * Copy 128 bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
+rte_mov128(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
 {
 	rte_mov64(dst + 0 * 64, src + 0 * 64);
 	rte_mov64(dst + 1 * 64, src + 1 * 64);
@@ -163,10 +172,10 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
 
 /**
  * Copy 256 bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
+rte_mov256(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
 {
 	rte_mov128(dst + 0 * 128, src + 0 * 128);
 	rte_mov128(dst + 1 * 128, src + 1 * 128);
@@ -182,10 +191,10 @@ rte_mov256(uint8_t *dst, const uint8_t *src)
 
 /**
  * Copy 128-byte blocks from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+rte_mov128blocks(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src, size_t n)
 {
 	__m512i zmm0, zmm1;
 
@@ -202,10 +211,10 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 
 /**
  * Copy 512-byte blocks from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static inline void
-rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
+rte_mov512blocks(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src, size_t n)
 {
 	__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
 
@@ -232,45 +241,22 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 	}
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations must not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *__rte_restrict dst, const void *__rte_restrict src,
+		size_t n)
 {
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				  (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (__rte_constant(n) && n == 64) {
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				  (const uint8_t *)src - 32 + n);
-		return ret;
-	}
 	if (n <= 512) {
 		if (n >= 256) {
 			n -= 256;
@@ -351,10 +337,10 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 
 /**
  * Copy 128-byte blocks from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+rte_mov128blocks(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src, size_t n)
 {
 	__m256i ymm0, ymm1, ymm2, ymm3;
 
@@ -381,41 +367,22 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 	}
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations must not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *__rte_restrict dst, const void *__rte_restrict src,
+		size_t n)
 {
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 256 bytes
 	 */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				(const uint8_t *)src - 32 + n);
-		return ret;
-	}
 	if (n <= 256) {
 		if (n >= 128) {
 			n -= 128;
@@ -482,7 +449,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 /**
  * Macro for copying unaligned block from one location to another with constant load offset,
  * 47 bytes leftover maximum,
- * locations should not overlap.
+ * locations must not overlap.
  * Requirements:
  * - Store is aligned
  * - Load offset is <offset>, which must be immediate value within [1, 15]
@@ -542,7 +509,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 /**
  * Macro for copying unaligned block from one location to another,
  * 47 bytes leftover maximum,
- * locations should not overlap.
+ * locations must not overlap.
  * Use switch here because the aligning instruction requires immediate value for shift count.
  * Requirements:
  * - Store is aligned
@@ -573,38 +540,23 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
     }                                                                 \
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations must not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *__rte_restrict dst, const void *__rte_restrict src,
+		size_t n)
 {
 	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
 	void *ret = dst;
 	size_t dstofss;
 	size_t srcofs;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		if (n > 48)
-			rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
 	if (n <= 128) {
 		goto COPY_BLOCK_128_BACK15;
 	}
@@ -696,44 +648,17 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 
 #endif /* __AVX512F__ */
 
+/**
+ * Copy bytes from one vector register size aligned location to another,
+ * locations must not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_aligned(void *dst, const void *src, size_t n)
+rte_memcpy_aligned_more_than_64(void *__rte_restrict dst, const void *__rte_restrict src,
+		size_t n)
 {
 	void *ret = dst;
 
-	/* Copy size < 16 bytes */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
-	/* Copy 16 <= size <= 32 bytes */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
-
-		return ret;
-	}
-
-	/* Copy 32 < size <= 64 bytes */
-	if (__rte_constant(n) && n == 64) {
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				(const uint8_t *)src - 32 + n);
-
-		return ret;
-	}
-
 	/* Copy 64 bytes blocks */
 	for (; n > 64; n -= 64) {
 		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
@@ -749,20 +674,49 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 }
 
 static __rte_always_inline void *
-rte_memcpy(void *dst, const void *src, size_t n)
+rte_memcpy(void *__rte_restrict dst, const void *__rte_restrict src, size_t n)
 {
+	/* Fast way when copy size doesn't exceed 64 bytes. */
+	if (n < 16)
+		return rte_mov15_or_less(dst, src, n);
+	if (n <= 32) {
+		if (__rte_constant(n) && n == 32) {
+			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		if (__rte_constant(n) && n == 16)
+			return dst; /* avoid (harmless) duplicate copy */
+		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+		return dst;
+	}
+	if (n <= 64) {
+		if (__rte_constant(n) && n == 64) {
+			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+#if defined RTE_MEMCPY_AVX
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
+#else /* SSE implementation */
+		rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+		rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+		if (n > 48)
+			rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+#endif
+		return dst;
+	}
+
+	/* Implementation for size > 64 bytes depends on alignment with vector register size. */
 	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
-		return rte_memcpy_aligned(dst, src, n);
+		return rte_memcpy_aligned_more_than_64(dst, src, n);
 	else
-		return rte_memcpy_generic(dst, src, n);
+		return rte_memcpy_generic_more_than_64(dst, src, n);
 }
 
 #undef ALIGNMENT_MASK
 
-#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
-#pragma GCC diagnostic pop
-#endif
-
 #ifdef __cplusplus
 }
 #endif
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 55+ messages in thread

* RE: [PATCH v9] eal/x86: optimize memcpy of small sizes
  2026-04-29 10:35 ` [PATCH v9] " Morten Brørup
@ 2026-04-29 11:24   ` Morten Brørup
  2026-05-08  6:32   ` Morten Brørup
  2026-05-21 10:54   ` [TEST PATCH " Morten Brørup
  2 siblings, 0 replies; 55+ messages in thread
From: Morten Brørup @ 2026-04-29 11:24 UTC (permalink / raw)
  To: dev

Recheck-request: iol-intel-Performance


^ permalink raw reply	[flat|nested] 55+ messages in thread

* RE: [PATCH v9] eal/x86: optimize memcpy of small sizes
  2026-04-29 10:35 ` [PATCH v9] " Morten Brørup
  2026-04-29 11:24   ` Morten Brørup
@ 2026-05-08  6:32   ` Morten Brørup
  2026-05-21 10:54   ` [TEST PATCH " Morten Brørup
  2 siblings, 0 replies; 55+ messages in thread
From: Morten Brørup @ 2026-05-08  6:32 UTC (permalink / raw)
  To: dev

Recheck-request: github-robot, iol-unit-arm64-testing, iol-sample-apps-testing


^ permalink raw reply	[flat|nested] 55+ messages in thread

* [PATCH v10] eal/x86: optimize memcpy of small sizes
  2025-11-20 11:45 [PATCH] eal/x86: reduce memcpy code duplication Morten Brørup
                   ` (8 preceding siblings ...)
  2026-04-29 10:35 ` [PATCH v9] " Morten Brørup
@ 2026-05-08  9:58 ` Morten Brørup
  2026-05-21 18:56 ` [PATCH v11] " Morten Brørup
  10 siblings, 0 replies; 55+ messages in thread
From: Morten Brørup @ 2026-05-08  9:58 UTC (permalink / raw)
  To: dev, Bruce Richardson, Konstantin Ananyev, Vipin Varghese,
	Stephen Hemminger, Liangxing Wang
  Cc: Thiyagarajan P, Bala Murali Krishna, Anatoly Burakov,
	Vladimir Medvedkin, Morten Brørup, Konstantin Ananyev

The implementation for copying up to 64 bytes does not depend on address
alignment with the size of the CPU's vector registers. Nonetheless, the
exact same code for copying up to 64 bytes was present in both the aligned
copy function and all the CPU vector register size specific variants of
the unaligned copy functions.
With this patch, the implementation for copying up to 64 bytes was
consolidated into one instance, located in the common copy function,
before checking alignment requirements.
This provides three benefits:
1. No copy-paste in the source code.
2. A performance gain for copying up to 64 bytes, because the
address alignment check is avoided in this case.
3. Reduced instruction memory footprint, because the compiler only
generates one instance of the function for copying up to 64 bytes, instead
of two instances (one in the unaligned copy function, and one in the
aligned copy function).

Furthermore, __rte_restrict was added to source and destination addresses.

And finally, the missing implementation of rte_mov48() was added.

Regarding performance...
The memcpy performance test (cache-to-cache copy) shows:
Copying up to 15 bytes takes ca. 4.5 cycles, versus ca. 6.5 cycles before.
Copying 8 bytes takes 4 cycles, versus 7 cycles before.
Copying 16 bytes takes 2 cycles, versus 4 cycles before.
Copying 64 bytes takes 4 cycles, versus 7 cycles before.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
---
v10:
* Reverted removal of ignoring stringop-overflow warnings.
  Instead, added a comment about the reason for ignoring them.
  Some drivers still use elems[1] instead of elems[] for structures with
  flexible arrays.
  IMO, the drivers should be fixed, or the warnings should be ignored
  there; but I'm picking the easy solution, and not changing this.
  If they were using standard memcpy(), warnings would also be emitted.
v9:
* Removed new functions rte_mov16_to_32() and rte_mov32_to_64(), and moved
  their implementations into rte_memcpy() instead.
  There is no need for such public functions, and having them separate did
  not improve source code readability.
* Kept acks from Bruce and Konstantin (both given to v7).
v8:
* Reverted the first branch from size <= 16 back to size < 16, restored
  the original rte_mov15_or_less() function, and removed the new
  rte_mov16_or_less() function.
  When rte_memcpy() is used for copying an array of pointers, and the
  number of pointers to copy is low (size <= 64 bytes), it is more likely
  that the number of pointers to copy is 1 than 2.
  The rte_mov15_or_less() implementation handles copying 8 bytes more
  efficiently than the rte_mov16_or_less() implementation, which copied
  the 8-byte pointer twice.
  Also note that with rte_mov15_or_less(), the compiler can optimize away
  the branches handling n & 1, n & 2 and n & 4 when it is known at compile
  time that (8-byte) pointers are being copied. (For 32-bit architecture,
  the n & 4 will not be optimized away when copying pointers.)
  This reversion also makes the patch less revolutionary and more
  incremental.
* Removed a lot of code for handling compile time known sizes. (Bruce)
  The rte_memcpy() function should not be used for small copies with
  compile time known sizes, so handling it is considered superfluous.
  Removing it improves source code readability and reduces the size of
  the patch.
* Kept acks from Bruce and Konstantin (both given to v7).
v7:
* Updated patch description. Mainly to clarify that the changes related to
  copying up to 64 bytes simply replaces multiple instances of copy-pasted
  code with one common instance.
* Fixed copy of compile time known 16 bytes in rte_mov17_to_32(). (Vipin)
* Rebased.
v6:
* Went back to using rte_uintN_alias structures for copying instead of
  using memcpy(). They were there for a reason.
  (Inspired by the discussion about optimizing the checksum function.)
* Removed note about copying uninitialized data.
* Added __rte_restrict to source and destination addresses.
  Updated function descriptions from "should" to "must" not overlap.
* Changed rte_mov48() AVX implementation to copy 32+16 bytes instead of
  copying 32 + 32 overlapping bytes. (Konstantin)
* Ignoring "-Wstringop-overflow" is not needed, so it was removed.
v5:
* Reverted v4: Replace SSE2 _mm_loadu_si128() with SSE3 _mm_lddqu_si128().
  It was slower.
* Improved some comments. (Konstantin Ananyev)
* Moved the size range 17..32 inside the size <= 64 branch, so when
  building for SSE, the generated code can start copying the first
  16 bytes before comparing if the size is greater than 32 or not.
* Just require RTE_MEMCPY_AVX for using rte_mov32() in rte_mov33_to_64().
v4:
* Replace SSE2 _mm_loadu_si128() with SSE3 _mm_lddqu_si128().
v3:
* Fixed typo in comment.
v2:
* Updated patch title to reflect that the performance is improved.
* Use the design pattern of two overlapping stores for small copies too.
* Expanded first branch from size < 16 to size <= 16.
* Handle more compile time constant copy sizes.
---
 lib/eal/x86/include/rte_memcpy.h | 242 +++++++++++++------------------
 1 file changed, 103 insertions(+), 139 deletions(-)

diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 46d34b8081..d6dfac19aa 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -22,6 +22,7 @@
 extern "C" {
 #endif
 
+/* Workaround for drivers using elems[1] instead of elems[] for flexible arrays. */
 #if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wstringop-overflow"
@@ -40,9 +41,6 @@ extern "C" {
 /**
  * Copy bytes from one location to another. The locations must not overlap.
  *
- * @note This is implemented as a macro, so it's address should not be taken
- * and care is needed as parameter expressions may be evaluated multiple times.
- *
  * @param dst
  *   Pointer to the destination of the data.
  * @param src
@@ -53,15 +51,15 @@ extern "C" {
  *   Pointer to the destination data.
  */
 static __rte_always_inline void *
-rte_memcpy(void *dst, const void *src, size_t n);
+rte_memcpy(void *__rte_restrict dst, const void *__rte_restrict src, size_t n);
 
 /**
  * Copy bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  * Use with n <= 15.
  */
 static __rte_always_inline void *
-rte_mov15_or_less(void *dst, const void *src, size_t n)
+rte_mov15_or_less(void *__rte_restrict dst, const void *__rte_restrict src, size_t n)
 {
 	/**
 	 * Use the following structs to avoid violating C standard
@@ -103,10 +101,10 @@ rte_mov15_or_less(void *dst, const void *src, size_t n)
 
 /**
  * Copy 16 bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
+rte_mov16(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
 {
 	__m128i xmm0;
 
@@ -116,10 +114,10 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
 
 /**
  * Copy 32 bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
+rte_mov32(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
 {
 #if defined RTE_MEMCPY_AVX
 	__m256i ymm0;
@@ -132,12 +130,29 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
 #endif
 }
 
+/**
+ * Copy 48 bytes from one location to another,
+ * locations must not overlap.
+ */
+static __rte_always_inline void
+rte_mov48(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
+{
+#if defined RTE_MEMCPY_AVX
+	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+	rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
+#else /* SSE implementation */
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+#endif
+}
+
 /**
  * Copy 64 bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
+rte_mov64(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
 {
 #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 	__m512i zmm0;
@@ -152,10 +167,10 @@ rte_mov64(uint8_t *dst, const uint8_t *src)
 
 /**
  * Copy 128 bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
+rte_mov128(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
 {
 	rte_mov64(dst + 0 * 64, src + 0 * 64);
 	rte_mov64(dst + 1 * 64, src + 1 * 64);
@@ -163,10 +178,10 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
 
 /**
  * Copy 256 bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
+rte_mov256(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
 {
 	rte_mov128(dst + 0 * 128, src + 0 * 128);
 	rte_mov128(dst + 1 * 128, src + 1 * 128);
@@ -182,10 +197,10 @@ rte_mov256(uint8_t *dst, const uint8_t *src)
 
 /**
  * Copy 128-byte blocks from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+rte_mov128blocks(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src, size_t n)
 {
 	__m512i zmm0, zmm1;
 
@@ -202,10 +217,10 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 
 /**
  * Copy 512-byte blocks from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static inline void
-rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
+rte_mov512blocks(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src, size_t n)
 {
 	__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
 
@@ -232,45 +247,22 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 	}
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations must not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *__rte_restrict dst, const void *__rte_restrict src,
+		size_t n)
 {
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				  (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (__rte_constant(n) && n == 64) {
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				  (const uint8_t *)src - 32 + n);
-		return ret;
-	}
 	if (n <= 512) {
 		if (n >= 256) {
 			n -= 256;
@@ -351,10 +343,10 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 
 /**
  * Copy 128-byte blocks from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+rte_mov128blocks(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src, size_t n)
 {
 	__m256i ymm0, ymm1, ymm2, ymm3;
 
@@ -381,41 +373,22 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 	}
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations must not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *__rte_restrict dst, const void *__rte_restrict src,
+		size_t n)
 {
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 256 bytes
 	 */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				(const uint8_t *)src - 32 + n);
-		return ret;
-	}
 	if (n <= 256) {
 		if (n >= 128) {
 			n -= 128;
@@ -482,7 +455,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 /**
  * Macro for copying unaligned block from one location to another with constant load offset,
  * 47 bytes leftover maximum,
- * locations should not overlap.
+ * locations must not overlap.
  * Requirements:
  * - Store is aligned
  * - Load offset is <offset>, which must be immediate value within [1, 15]
@@ -542,7 +515,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 /**
  * Macro for copying unaligned block from one location to another,
  * 47 bytes leftover maximum,
- * locations should not overlap.
+ * locations must not overlap.
  * Use switch here because the aligning instruction requires immediate value for shift count.
  * Requirements:
  * - Store is aligned
@@ -573,38 +546,23 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
     }                                                                 \
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations must not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *__rte_restrict dst, const void *__rte_restrict src,
+		size_t n)
 {
 	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
 	void *ret = dst;
 	size_t dstofss;
 	size_t srcofs;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		if (n > 48)
-			rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
 	if (n <= 128) {
 		goto COPY_BLOCK_128_BACK15;
 	}
@@ -696,44 +654,17 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 
 #endif /* __AVX512F__ */
 
+/**
+ * Copy bytes from one vector register size aligned location to another,
+ * locations must not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_aligned(void *dst, const void *src, size_t n)
+rte_memcpy_aligned_more_than_64(void *__rte_restrict dst, const void *__rte_restrict src,
+		size_t n)
 {
 	void *ret = dst;
 
-	/* Copy size < 16 bytes */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
-	/* Copy 16 <= size <= 32 bytes */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
-
-		return ret;
-	}
-
-	/* Copy 32 < size <= 64 bytes */
-	if (__rte_constant(n) && n == 64) {
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				(const uint8_t *)src - 32 + n);
-
-		return ret;
-	}
-
 	/* Copy 64 bytes blocks */
 	for (; n > 64; n -= 64) {
 		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
@@ -749,12 +680,45 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 }
 
 static __rte_always_inline void *
-rte_memcpy(void *dst, const void *src, size_t n)
+rte_memcpy(void *__rte_restrict dst, const void *__rte_restrict src, size_t n)
 {
+	/* Fast way when copy size doesn't exceed 64 bytes. */
+	if (n < 16)
+		return rte_mov15_or_less(dst, src, n);
+	if (n <= 32) {
+		if (__rte_constant(n) && n == 32) {
+			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		if (__rte_constant(n) && n == 16)
+			return dst; /* avoid (harmless) duplicate copy */
+		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+		return dst;
+	}
+	if (n <= 64) {
+		if (__rte_constant(n) && n == 64) {
+			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+#if defined RTE_MEMCPY_AVX
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
+#else /* SSE implementation */
+		rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+		rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+		if (n > 48)
+			rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+#endif
+		return dst;
+	}
+
+	/* Implementation for size > 64 bytes depends on alignment with vector register size. */
 	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
-		return rte_memcpy_aligned(dst, src, n);
+		return rte_memcpy_aligned_more_than_64(dst, src, n);
 	else
-		return rte_memcpy_generic(dst, src, n);
+		return rte_memcpy_generic_more_than_64(dst, src, n);
 }
 
 #undef ALIGNMENT_MASK
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 55+ messages in thread

* [TEST PATCH v9] eal/x86: optimize memcpy of small sizes
  2026-04-29 10:35 ` [PATCH v9] " Morten Brørup
  2026-04-29 11:24   ` Morten Brørup
  2026-05-08  6:32   ` Morten Brørup
@ 2026-05-21 10:54   ` Morten Brørup
  2 siblings, 0 replies; 55+ messages in thread
From: Morten Brørup @ 2026-05-21 10:54 UTC (permalink / raw)
  To: dev; +Cc: Bruce Richardson

TEST: Were the Intel drivers the only ones triggering the warnings with MinGW on GitHub?

Depends-on: series-38174 ("remove use of rte_memcpy from net/intel")

Venlig hilsen / Kind regards,
-Morten Brørup


The implementation for copying up to 64 bytes does not depend on address
alignment with the size of the CPU's vector registers. Nonetheless, the
exact same code for copying up to 64 bytes was present in both the aligned
copy function and all the CPU vector register size specific variants of
the unaligned copy functions.
With this patch, the implementation for copying up to 64 bytes was
consolidated into one instance, located in the common copy function,
before checking alignment requirements.
This provides three benefits:
1. No copy-paste in the source code.
2. A performance gain for copying up to 64 bytes, because the
address alignment check is avoided in this case.
3. Reduced instruction memory footprint, because the compiler only
generates one instance of the function for copying up to 64 bytes, instead
of two instances (one in the unaligned copy function, and one in the
aligned copy function).

Furthermore, __rte_restrict was added to source and destination addresses.

And finally, the missing implementation of rte_mov48() was added.

Regarding performance...
The memcpy performance test (cache-to-cache copy) shows:
Copying up to 15 bytes takes ca. 4.5 cycles, versus ca. 6.5 cycles before.
Copying 8 bytes takes 4 cycles, versus 7 cycles before.
Copying 16 bytes takes 2 cycles, versus 4 cycles before.
Copying 64 bytes takes 4 cycles, versus 7 cycles before.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
---
v9:
* Removed new functions rte_mov16_to_32() and rte_mov32_to_64(), and moved
  their implementations into rte_memcpy() instead.
  There is no need for such public functions, and having them separate did
  not improve source code readability.
* Kept acks from Bruce and Konstantin (both given to v7).
v8:
* Reverted the first branch from size <= 16 back to size < 16, restored
  the original rte_mov15_or_less() function, and removed the new
  rte_mov16_or_less() function.
  When rte_memcpy() is used for copying an array of pointers, and the
  number of pointers to copy is low (size <= 64 bytes), it is more likely
  that the number of pointers to copy is 1 than 2.
  The rte_mov15_or_less() implementation handles copying 8 bytes more
  efficiently than the rte_mov16_or_less() implementation, which copied
  the 8-byte pointer twice.
  Also note that with rte_mov15_or_less(), the compiler can optimize away
  the branches handling n & 1, n & 2 and n & 4 when it is known at compile
  time that (8-byte) pointers are being copied. (For 32-bit architecture,
  the n & 4 will not be optimized away when copying pointers.)
  This reversion also makes the patch less revolutionary and more
  incremental.
* Removed a lot of code for handling compile time known sizes. (Bruce)
  The rte_memcpy() function should not be used for small copies with
  compile time known sizes, so handling it is considered superfluous.
  Removing it improves source code readability and reduces the size of
  the patch.
* Kept acks from Bruce and Konstantin (both given to v7).
v7:
* Updated patch description. Mainly to clarify that the changes related to
  copying up to 64 bytes simply replaces multiple instances of copy-pasted
  code with one common instance.
* Fixed copy of compile time known 16 bytes in rte_mov17_to_32(). (Vipin)
* Rebased.
v6:
* Went back to using rte_uintN_alias structures for copying instead of
  using memcpy(). They were there for a reason.
  (Inspired by the discussion about optimizing the checksum function.)
* Removed note about copying uninitialized data.
* Added __rte_restrict to source and destination addresses.
  Updated function descriptions from "should" to "must" not overlap.
* Changed rte_mov48() AVX implementation to copy 32+16 bytes instead of
  copying 32 + 32 overlapping bytes. (Konstantin)
* Ignoring "-Wstringop-overflow" is not needed, so it was removed.
v5:
* Reverted v4: Replace SSE2 _mm_loadu_si128() with SSE3 _mm_lddqu_si128().
  It was slower.
* Improved some comments. (Konstantin Ananyev)
* Moved the size range 17..32 inside the size <= 64 branch, so when
  building for SSE, the generated code can start copying the first
  16 bytes before comparing if the size is greater than 32 or not.
* Just require RTE_MEMCPY_AVX for using rte_mov32() in rte_mov33_to_64().
v4:
* Replace SSE2 _mm_loadu_si128() with SSE3 _mm_lddqu_si128().
v3:
* Fixed typo in comment.
v2:
* Updated patch title to reflect that the performance is improved.
* Use the design pattern of two overlapping stores for small copies too.
* Expanded first branch from size < 16 to size <= 16.
* Handle more compile time constant copy sizes.
---
 lib/eal/x86/include/rte_memcpy.h | 250 +++++++++++++------------------
 1 file changed, 102 insertions(+), 148 deletions(-)

diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 46d34b8081..8ed8c55010 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -22,11 +22,6 @@
 extern "C" {
 #endif
 
-#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wstringop-overflow"
-#endif
-
 /*
  * GCC older than version 11 doesn't compile AVX properly, so use SSE instead.
  * There are no problems with AVX2.
@@ -40,9 +35,6 @@ extern "C" {
 /**
  * Copy bytes from one location to another. The locations must not overlap.
  *
- * @note This is implemented as a macro, so it's address should not be taken
- * and care is needed as parameter expressions may be evaluated multiple times.
- *
  * @param dst
  *   Pointer to the destination of the data.
  * @param src
@@ -53,15 +45,15 @@ extern "C" {
  *   Pointer to the destination data.
  */
 static __rte_always_inline void *
-rte_memcpy(void *dst, const void *src, size_t n);
+rte_memcpy(void *__rte_restrict dst, const void *__rte_restrict src, size_t n);
 
 /**
  * Copy bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  * Use with n <= 15.
  */
 static __rte_always_inline void *
-rte_mov15_or_less(void *dst, const void *src, size_t n)
+rte_mov15_or_less(void *__rte_restrict dst, const void *__rte_restrict src, size_t n)
 {
 	/**
 	 * Use the following structs to avoid violating C standard
@@ -103,10 +95,10 @@ rte_mov15_or_less(void *dst, const void *src, size_t n)
 
 /**
  * Copy 16 bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
+rte_mov16(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
 {
 	__m128i xmm0;
 
@@ -116,10 +108,10 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
 
 /**
  * Copy 32 bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
+rte_mov32(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
 {
 #if defined RTE_MEMCPY_AVX
 	__m256i ymm0;
@@ -132,12 +124,29 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
 #endif
 }
 
+/**
+ * Copy 48 bytes from one location to another,
+ * locations must not overlap.
+ */
+static __rte_always_inline void
+rte_mov48(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
+{
+#if defined RTE_MEMCPY_AVX
+	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+	rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
+#else /* SSE implementation */
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+#endif
+}
+
 /**
  * Copy 64 bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
+rte_mov64(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
 {
 #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 	__m512i zmm0;
@@ -152,10 +161,10 @@ rte_mov64(uint8_t *dst, const uint8_t *src)
 
 /**
  * Copy 128 bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
+rte_mov128(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
 {
 	rte_mov64(dst + 0 * 64, src + 0 * 64);
 	rte_mov64(dst + 1 * 64, src + 1 * 64);
@@ -163,10 +172,10 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
 
 /**
  * Copy 256 bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
+rte_mov256(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
 {
 	rte_mov128(dst + 0 * 128, src + 0 * 128);
 	rte_mov128(dst + 1 * 128, src + 1 * 128);
@@ -182,10 +191,10 @@ rte_mov256(uint8_t *dst, const uint8_t *src)
 
 /**
  * Copy 128-byte blocks from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+rte_mov128blocks(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src, size_t n)
 {
 	__m512i zmm0, zmm1;
 
@@ -202,10 +211,10 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 
 /**
  * Copy 512-byte blocks from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static inline void
-rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
+rte_mov512blocks(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src, size_t n)
 {
 	__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
 
@@ -232,45 +241,22 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 	}
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations must not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *__rte_restrict dst, const void *__rte_restrict src,
+		size_t n)
 {
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				  (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (__rte_constant(n) && n == 64) {
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				  (const uint8_t *)src - 32 + n);
-		return ret;
-	}
 	if (n <= 512) {
 		if (n >= 256) {
 			n -= 256;
@@ -351,10 +337,10 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 
 /**
  * Copy 128-byte blocks from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+rte_mov128blocks(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src, size_t n)
 {
 	__m256i ymm0, ymm1, ymm2, ymm3;
 
@@ -381,41 +367,22 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 	}
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations must not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *__rte_restrict dst, const void *__rte_restrict src,
+		size_t n)
 {
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 256 bytes
 	 */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				(const uint8_t *)src - 32 + n);
-		return ret;
-	}
 	if (n <= 256) {
 		if (n >= 128) {
 			n -= 128;
@@ -482,7 +449,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 /**
  * Macro for copying unaligned block from one location to another with constant load offset,
  * 47 bytes leftover maximum,
- * locations should not overlap.
+ * locations must not overlap.
  * Requirements:
  * - Store is aligned
  * - Load offset is <offset>, which must be immediate value within [1, 15]
@@ -542,7 +509,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 /**
  * Macro for copying unaligned block from one location to another,
  * 47 bytes leftover maximum,
- * locations should not overlap.
+ * locations must not overlap.
  * Use switch here because the aligning instruction requires immediate value for shift count.
  * Requirements:
  * - Store is aligned
@@ -573,38 +540,23 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
     }                                                                 \
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations must not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *__rte_restrict dst, const void *__rte_restrict src,
+		size_t n)
 {
 	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
 	void *ret = dst;
 	size_t dstofss;
 	size_t srcofs;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		if (n > 48)
-			rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
 	if (n <= 128) {
 		goto COPY_BLOCK_128_BACK15;
 	}
@@ -696,44 +648,17 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 
 #endif /* __AVX512F__ */
 
+/**
+ * Copy bytes from one vector register size aligned location to another,
+ * locations must not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_aligned(void *dst, const void *src, size_t n)
+rte_memcpy_aligned_more_than_64(void *__rte_restrict dst, const void *__rte_restrict src,
+		size_t n)
 {
 	void *ret = dst;
 
-	/* Copy size < 16 bytes */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
-	/* Copy 16 <= size <= 32 bytes */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
-
-		return ret;
-	}
-
-	/* Copy 32 < size <= 64 bytes */
-	if (__rte_constant(n) && n == 64) {
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				(const uint8_t *)src - 32 + n);
-
-		return ret;
-	}
-
 	/* Copy 64 bytes blocks */
 	for (; n > 64; n -= 64) {
 		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
@@ -749,20 +674,49 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 }
 
 static __rte_always_inline void *
-rte_memcpy(void *dst, const void *src, size_t n)
+rte_memcpy(void *__rte_restrict dst, const void *__rte_restrict src, size_t n)
 {
+	/* Fast way when copy size doesn't exceed 64 bytes. */
+	if (n < 16)
+		return rte_mov15_or_less(dst, src, n);
+	if (n <= 32) {
+		if (__rte_constant(n) && n == 32) {
+			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		if (__rte_constant(n) && n == 16)
+			return dst; /* avoid (harmless) duplicate copy */
+		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+		return dst;
+	}
+	if (n <= 64) {
+		if (__rte_constant(n) && n == 64) {
+			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+#if defined RTE_MEMCPY_AVX
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
+#else /* SSE implementation */
+		rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+		rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+		if (n > 48)
+			rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+#endif
+		return dst;
+	}
+
+	/* Implementation for size > 64 bytes depends on alignment with vector register size. */
 	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
-		return rte_memcpy_aligned(dst, src, n);
+		return rte_memcpy_aligned_more_than_64(dst, src, n);
 	else
-		return rte_memcpy_generic(dst, src, n);
+		return rte_memcpy_generic_more_than_64(dst, src, n);
 }
 
 #undef ALIGNMENT_MASK
 
-#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
-#pragma GCC diagnostic pop
-#endif
-
 #ifdef __cplusplus
 }
 #endif
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 55+ messages in thread

* [PATCH v11] eal/x86: optimize memcpy of small sizes
  2025-11-20 11:45 [PATCH] eal/x86: reduce memcpy code duplication Morten Brørup
                   ` (9 preceding siblings ...)
  2026-05-08  9:58 ` [PATCH v10] " Morten Brørup
@ 2026-05-21 18:56 ` Morten Brørup
  2026-05-21 19:48   ` Stephen Hemminger
                     ` (2 more replies)
  10 siblings, 3 replies; 55+ messages in thread
From: Morten Brørup @ 2026-05-21 18:56 UTC (permalink / raw)
  To: dev, Bruce Richardson, Konstantin Ananyev, Vipin Varghese,
	Stephen Hemminger, Liangxing Wang
  Cc: Thiyagarajan P, Bala Murali Krishna, Anatoly Burakov,
	Vladimir Medvedkin, Morten Brørup, Konstantin Ananyev

The implementation for copying up to 64 bytes does not depend on address
alignment with the size of the CPU's vector registers. Nonetheless, the
exact same code for copying up to 64 bytes was present in both the aligned
copy function and all the CPU vector register size specific variants of
the unaligned copy functions.
With this patch, the implementation for copying up to 64 bytes was
consolidated into one instance, located in the common copy function,
before checking alignment requirements.
This provides three benefits:
1. No copy-paste in the source code.
2. A performance gain for copying up to 64 bytes, because the
address alignment check is avoided in this case.
3. Reduced instruction memory footprint, because the compiler only
generates one instance of the function for copying up to 64 bytes, instead
of two instances (one in the unaligned copy function, and one in the
aligned copy function).

Furthermore, __rte_restrict was added to source and destination addresses.

Also, the missing implementation of rte_mov48() was added.

Until recently, some drivers required disabling stringop-overflow warnings
when using rte_memcpy().
For some strange reason, these warnings were disabled in the rte_memcpy
header file, instead of in the problematic drivers.
With series-38174 ("remove use of rte_memcpy from net/intel"), the
problematic drivers were updated to use memcpy() instead of rte_memcpy(),
so disabling these warnings is no longer required, and was removed.

Regarding performance...
The memcpy performance test (cache-to-cache copy) shows:
Copying up to 15 bytes takes ca. 4.5 cycles, versus ca. 6.5 cycles before.
Copying 8 bytes takes 4 cycles, versus 7 cycles before.
Copying 16 bytes takes 2 cycles, versus 4 cycles before.
Copying 64 bytes takes 4 cycles, versus 7 cycles before.

Depends-on: series-38174 ("remove use of rte_memcpy from net/intel")

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
---
v11:
* Removed ignoring stringop-overflow warnings.
  The drivers requiring this have been updated to use memcpy() instead.
  Added note about it in the cover letter, and a depends-on tag.
v10:
* Reverted removal of ignoring stringop-overflow warnings.
  Instead, added a comment about the reason for ignoring them.
  Some drivers still use elems[1] instead of elems[] for structures with
  flexible arrays.
  IMO, the drivers should be fixed, or the warnings should be ignored
  there; but I'm picking the easy solution, and not changing this.
  If they were using standard memcpy(), warnings would also be emitted.
v9:
* Removed new functions rte_mov16_to_32() and rte_mov32_to_64(), and moved
  their implementations into rte_memcpy() instead.
  There is no need for such public functions, and having them separate did
  not improve source code readability.
* Kept acks from Bruce and Konstantin (both given to v7).
v8:
* Reverted the first branch from size <= 16 back to size < 16, restored
  the original rte_mov15_or_less() function, and removed the new
  rte_mov16_or_less() function.
  When rte_memcpy() is used for copying an array of pointers, and the
  number of pointers to copy is low (size <= 64 bytes), it is more likely
  that the number of pointers to copy is 1 than 2.
  The rte_mov15_or_less() implementation handles copying 8 bytes more
  efficiently than the rte_mov16_or_less() implementation, which copied
  the 8-byte pointer twice.
  Also note that with rte_mov15_or_less(), the compiler can optimize away
  the branches handling n & 1, n & 2 and n & 4 when it is known at compile
  time that (8-byte) pointers are being copied. (For 32-bit architecture,
  the n & 4 will not be optimized away when copying pointers.)
  This reversion also makes the patch less revolutionary and more
  incremental.
* Removed a lot of code for handling compile time known sizes. (Bruce)
  The rte_memcpy() function should not be used for small copies with
  compile time known sizes, so handling it is considered superfluous.
  Removing it improves source code readability. And reduces the size of
  the patch.
* Kept acks from Bruce and Konstantin (both given to v7).
v7:
* Updated patch description. Mainly to clarify that the changes related to
  copying up to 64 bytes simply replaces multiple instances of copy-pasted
  code with one common instance.
* Fixed copy of compile time known 16 bytes in rte_mov17_to_32(). (Vipin)
* Rebased.
v6:
* Went back to using rte_uintN_alias structures for copying instead of
  using memcpy(). They were there for a reason.
  (Inspired by the discussion about optimizing the checksum function.)
* Removed note about copying uninitialized data.
* Added __rte_restrict to source and destination addresses.
  Updated function descriptions from "should" to "must" not overlap.
* Changed rte_mov48() AVX implementation to copy 32+16 bytes instead of
  copying 32 + 32 overlapping bytes. (Konstantin)
* Ignoring "-Wstringop-overflow" is not needed, so it was removed.
v5:
* Reverted v4: Replace SSE2 _mm_loadu_si128() with SSE3 _mm_lddqu_si128().
  It was slower.
* Improved some comments. (Konstantin Ananyev)
* Moved the size range 17..32 inside the size <= 64 branch, so when
  building for SSE, the generated code can start copying the first
  16 bytes before comparing if the size is greater than 32 or not.
* Just require RTE_MEMCPY_AVX for using rte_mov32() in rte_mov33_to_64().
v4:
* Replace SSE2 _mm_loadu_si128() with SSE3 _mm_lddqu_si128().
v3:
* Fixed typo in comment.
v2:
* Updated patch title to reflect that the performance is improved.
* Use the design pattern of two overlapping stores for small copies too.
* Expanded first branch from size < 16 to size <= 16.
* Handle more compile time constant copy sizes.
---
 lib/eal/x86/include/rte_memcpy.h | 250 +++++++++++++------------------
 1 file changed, 102 insertions(+), 148 deletions(-)

diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 46d34b8081..8ed8c55010 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -22,11 +22,6 @@
 extern "C" {
 #endif
 
-#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wstringop-overflow"
-#endif
-
 /*
  * GCC older than version 11 doesn't compile AVX properly, so use SSE instead.
  * There are no problems with AVX2.
@@ -40,9 +35,6 @@ extern "C" {
 /**
  * Copy bytes from one location to another. The locations must not overlap.
  *
- * @note This is implemented as a macro, so it's address should not be taken
- * and care is needed as parameter expressions may be evaluated multiple times.
- *
  * @param dst
  *   Pointer to the destination of the data.
  * @param src
@@ -53,15 +45,15 @@ extern "C" {
  *   Pointer to the destination data.
  */
 static __rte_always_inline void *
-rte_memcpy(void *dst, const void *src, size_t n);
+rte_memcpy(void *__rte_restrict dst, const void *__rte_restrict src, size_t n);
 
 /**
  * Copy bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  * Use with n <= 15.
  */
 static __rte_always_inline void *
-rte_mov15_or_less(void *dst, const void *src, size_t n)
+rte_mov15_or_less(void *__rte_restrict dst, const void *__rte_restrict src, size_t n)
 {
 	/**
 	 * Use the following structs to avoid violating C standard
@@ -103,10 +95,10 @@ rte_mov15_or_less(void *dst, const void *src, size_t n)
 
 /**
  * Copy 16 bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
+rte_mov16(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
 {
 	__m128i xmm0;
 
@@ -116,10 +108,10 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
 
 /**
  * Copy 32 bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
+rte_mov32(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
 {
 #if defined RTE_MEMCPY_AVX
 	__m256i ymm0;
@@ -132,12 +124,29 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
 #endif
 }
 
+/**
+ * Copy 48 bytes from one location to another,
+ * locations must not overlap.
+ */
+static __rte_always_inline void
+rte_mov48(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
+{
+#if defined RTE_MEMCPY_AVX
+	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+	rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
+#else /* SSE implementation */
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+#endif
+}
+
 /**
  * Copy 64 bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
+rte_mov64(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
 {
 #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 	__m512i zmm0;
@@ -152,10 +161,10 @@ rte_mov64(uint8_t *dst, const uint8_t *src)
 
 /**
  * Copy 128 bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
+rte_mov128(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
 {
 	rte_mov64(dst + 0 * 64, src + 0 * 64);
 	rte_mov64(dst + 1 * 64, src + 1 * 64);
@@ -163,10 +172,10 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
 
 /**
  * Copy 256 bytes from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
+rte_mov256(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
 {
 	rte_mov128(dst + 0 * 128, src + 0 * 128);
 	rte_mov128(dst + 1 * 128, src + 1 * 128);
@@ -182,10 +191,10 @@ rte_mov256(uint8_t *dst, const uint8_t *src)
 
 /**
  * Copy 128-byte blocks from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+rte_mov128blocks(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src, size_t n)
 {
 	__m512i zmm0, zmm1;
 
@@ -202,10 +211,10 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 
 /**
  * Copy 512-byte blocks from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static inline void
-rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
+rte_mov512blocks(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src, size_t n)
 {
 	__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
 
@@ -232,45 +241,22 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 	}
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations must not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *__rte_restrict dst, const void *__rte_restrict src,
+		size_t n)
 {
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				  (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (__rte_constant(n) && n == 64) {
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				  (const uint8_t *)src - 32 + n);
-		return ret;
-	}
 	if (n <= 512) {
 		if (n >= 256) {
 			n -= 256;
@@ -351,10 +337,10 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 
 /**
  * Copy 128-byte blocks from one location to another,
- * locations should not overlap.
+ * locations must not overlap.
  */
 static __rte_always_inline void
-rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+rte_mov128blocks(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src, size_t n)
 {
 	__m256i ymm0, ymm1, ymm2, ymm3;
 
@@ -381,41 +367,22 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 	}
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations must not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *__rte_restrict dst, const void *__rte_restrict src,
+		size_t n)
 {
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 256 bytes
 	 */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				(const uint8_t *)src - 32 + n);
-		return ret;
-	}
 	if (n <= 256) {
 		if (n >= 128) {
 			n -= 128;
@@ -482,7 +449,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 /**
  * Macro for copying unaligned block from one location to another with constant load offset,
  * 47 bytes leftover maximum,
- * locations should not overlap.
+ * locations must not overlap.
  * Requirements:
  * - Store is aligned
  * - Load offset is <offset>, which must be immediate value within [1, 15]
@@ -542,7 +509,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 /**
  * Macro for copying unaligned block from one location to another,
  * 47 bytes leftover maximum,
- * locations should not overlap.
+ * locations must not overlap.
  * Use switch here because the aligning instruction requires immediate value for shift count.
  * Requirements:
  * - Store is aligned
@@ -573,38 +540,23 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
     }                                                                 \
 }
 
+/**
+ * Copy bytes from one location to another,
+ * locations must not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_more_than_64(void *__rte_restrict dst, const void *__rte_restrict src,
+		size_t n)
 {
 	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
 	void *ret = dst;
 	size_t dstofss;
 	size_t srcofs;
 
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		if (n > 48)
-			rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
 	if (n <= 128) {
 		goto COPY_BLOCK_128_BACK15;
 	}
@@ -696,44 +648,17 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 
 #endif /* __AVX512F__ */
 
+/**
+ * Copy bytes from one vector register size aligned location to another,
+ * locations must not overlap.
+ * Use with n > 64.
+ */
 static __rte_always_inline void *
-rte_memcpy_aligned(void *dst, const void *src, size_t n)
+rte_memcpy_aligned_more_than_64(void *__rte_restrict dst, const void *__rte_restrict src,
+		size_t n)
 {
 	void *ret = dst;
 
-	/* Copy size < 16 bytes */
-	if (n < 16) {
-		return rte_mov15_or_less(dst, src, n);
-	}
-
-	/* Copy 16 <= size <= 32 bytes */
-	if (__rte_constant(n) && n == 32) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		if (__rte_constant(n) && n == 16)
-			return ret; /* avoid (harmless) duplicate copy */
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
-
-		return ret;
-	}
-
-	/* Copy 32 < size <= 64 bytes */
-	if (__rte_constant(n) && n == 64) {
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				(const uint8_t *)src - 32 + n);
-
-		return ret;
-	}
-
 	/* Copy 64 bytes blocks */
 	for (; n > 64; n -= 64) {
 		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
@@ -749,20 +674,49 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 }
 
 static __rte_always_inline void *
-rte_memcpy(void *dst, const void *src, size_t n)
+rte_memcpy(void *__rte_restrict dst, const void *__rte_restrict src, size_t n)
 {
+	/* Fast way when copy size doesn't exceed 64 bytes. */
+	if (n < 16)
+		return rte_mov15_or_less(dst, src, n);
+	if (n <= 32) {
+		if (__rte_constant(n) && n == 32) {
+			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		if (__rte_constant(n) && n == 16)
+			return dst; /* avoid (harmless) duplicate copy */
+		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+		return dst;
+	}
+	if (n <= 64) {
+		if (__rte_constant(n) && n == 64) {
+			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+			return dst;
+		}
+#if defined RTE_MEMCPY_AVX
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
+#else /* SSE implementation */
+		rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+		rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+		if (n > 48)
+			rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+#endif
+		return dst;
+	}
+
+	/* Implementation for size > 64 bytes depends on alignment with vector register size. */
 	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
-		return rte_memcpy_aligned(dst, src, n);
+		return rte_memcpy_aligned_more_than_64(dst, src, n);
 	else
-		return rte_memcpy_generic(dst, src, n);
+		return rte_memcpy_generic_more_than_64(dst, src, n);
 }
 
 #undef ALIGNMENT_MASK
 
-#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
-#pragma GCC diagnostic pop
-#endif
-
 #ifdef __cplusplus
 }
 #endif
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 55+ messages in thread

* Re: [PATCH v11] eal/x86: optimize memcpy of small sizes
  2026-05-21 18:56 ` [PATCH v11] " Morten Brørup
@ 2026-05-21 19:48   ` Stephen Hemminger
  2026-05-21 22:42   ` Stephen Hemminger
  2026-06-01 19:48   ` Thomas Monjalon
  2 siblings, 0 replies; 55+ messages in thread
From: Stephen Hemminger @ 2026-05-21 19:48 UTC (permalink / raw)
  To: Morten Brørup
  Cc: dev, Bruce Richardson, Konstantin Ananyev, Vipin Varghese,
	Liangxing Wang, Thiyagarajan P, Bala Murali Krishna,
	Anatoly Burakov, Vladimir Medvedkin, Konstantin Ananyev

On Thu, 21 May 2026 18:56:31 +0000
Morten Brørup <mb@smartsharesystems.com> wrote:

> The implementation for copying up to 64 bytes does not depend on address
> alignment with the size of the CPU's vector registers. Nonetheless, the
> exact same code for copying up to 64 bytes was present in both the aligned
> copy function and all the CPU vector register size specific variants of
> the unaligned copy functions.
> With this patch, the implementation for copying up to 64 bytes was
> consolidated into one instance, located in the common copy function,
> before checking alignment requirements.
> This provides three benefits:
> 1. No copy-paste in the source code.
> 2. A performance gain for copying up to 64 bytes, because the
> address alignment check is avoided in this case.
> 3. Reduced instruction memory footprint, because the compiler only
> generates one instance of the function for copying up to 64 bytes, instead
> of two instances (one in the unaligned copy function, and one in the
> aligned copy function).
> 
> Furthermore, __rte_restrict was added to source and destination addresses.
> 
> Also, the missing implementation of rte_mov48() was added.
> 
> Until recently, some drivers required disabling stringop-overflow warnings
> when using rte_memcpy().
> For some strange reason, these warnings were disabled in the rte_memcpy
> header file, instead of in the problematic drivers.
> With series-38174 ("remove use of rte_memcpy from net/intel"), the
> problematic drivers were updated to use memcpy() instead of rte_memcpy(),
> so disabling these warnings is no longer required, and was removed.
> 
> Regarding performance...
> The memcpy performance test (cache-to-cache copy) shows:
> Copying up to 15 bytes takes ca. 4.5 cycles, versus ca. 6.5 cycles before.
> Copying 8 bytes takes 4 cycles, versus 7 cycles before.
> Copying 16 bytes takes 2 cycles, versus 4 cycles before.
> Copying 64 bytes takes 4 cycles, versus 7 cycles before.
> 
> Depends-on: series-38174 ("remove use of rte_memcpy from net/intel")
> 
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
> ---

Used this as a test case for comparing AI reviews.

Anthropic saw:

## Warnings

### 1. Missing rte_mov48() test coverage

**Issue:** The patch adds a new `rte_mov48()` function but does not mention adding tests for it.

**Why it matters:** New utility functions should have test coverage to verify correctness, especially for low-level copy operations that could silently corrupt data if implemented incorrectly.

**Suggested fix:** Add a test case in `app/test/test_memcpy.c` (or equivalent) that verifies `rte_mov48()` copies 48 bytes correctly and preserves data beyond the copy range.

### 2. Documentation comment inconsistency

**Issue:** The main `rte_memcpy()` function description comment was removed (lines 37-39 of original), but the opening documentation text about overlapping locations remains inconsistent across helper functions.

The original stated:
```c
/**
 * Copy bytes from one location to another. The locations must not overlap.
 *
 * @note This is implemented as a macro, so it's address should not be taken
 * and care is needed as parameter expressions may be evaluated multiple times.
```

The new version removes the `@note` about macro implementation (which is correct since it's now a function), but does not update the main function comment to include the same `@param` / `@return` documentation that other memory functions have.

**Suggested fix:** Add complete Doxygen documentation to `rte_memcpy()`:
```c
/**
 * Copy bytes from one location to another. The locations must not overlap.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 * @param n
 *   Number of bytes to copy.
 * @return
 *   Pointer to the destination data.
 */
```

xAI (Grok) saw:
**Error**

- `rte_mov48()` implementation casts away `__rte_restrict` qualifiers (lines 137-140 and 143-147).  
  Casting a restrict-qualified pointer to a non-restrict pointer is undefined behavior under the C standard and can break compiler alias analysis.  
  Fix: either make the callees (`rte_mov32`/`rte_mov16`) accept the restrict qualifiers, or use a local non-restrict pointer only inside `rte_mov48` after proper validation.

**Error**

- Multiple functions (`rte_mov16`, `rte_mov32`, `rte_mov48`, `rte_mov64`, `rte_mov128`, `rte_mov256`, `rte_mov128blocks`, `rte_mov512blocks`, `rte_memcpy_generic_more_than_64`, etc.) now take `uint8_t *__rte_restrict` parameters, but their callers inside the same file (e.g., the new `rte_memcpy` implementation and the various block-copy macros) pass `uint8_t *` without the restrict qualifier.  
  This creates implicit non-restrict paths that defeat the purpose of adding `__rte_restrict` and may trigger compiler warnings or incorrect optimizations.  
  Recommended fix: either propagate `__rte_restrict` consistently through all internal call sites or keep internal helpers non-restrict and only expose restrict at the public `rte_memcpy` boundary.

**Warning**

- `rte_memcpy_generic_more_than_64` and `rte_memcpy_aligned_more_than_64` are now the only implementations for >64-byte copies, but the old `rte_memcpy_generic` and `rte_memcpy_aligned` names have been removed.  
  Any external code (or test) that took the address of the old functions will break.  
  The patch should either keep the old names as aliases or document the removal in release notes.

**Warning**

- The new `rte_memcpy` implementation for 17–64 bytes contains a compile-time constant check (`if (__rte_constant(n) && n == 64)`) followed by a fallback path that still executes `rte_mov16` three times for SSE when `n > 48`.  
  This path is correct but duplicates logic that already exists in `rte_mov48`.  
  Consider calling `rte_mov48` for the AVX case and a single `rte_mov48` + tail for SSE to reduce code duplication.

**Info**

- The removal of the `#pragma GCC diagnostic` block for `-Wstringop-overflow` is correct per the patch description (drivers were updated). No further action needed.

**Info**

- All other style, API-tag, and documentation requirements appear satisfied. No additional correctness bugs (leaks, use-after-free, races, or forbidden tokens) were identified.




^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH v11] eal/x86: optimize memcpy of small sizes
  2026-05-21 18:56 ` [PATCH v11] " Morten Brørup
  2026-05-21 19:48   ` Stephen Hemminger
@ 2026-05-21 22:42   ` Stephen Hemminger
  2026-06-01 13:38     ` Thomas Monjalon
  2026-06-01 19:48   ` Thomas Monjalon
  2 siblings, 1 reply; 55+ messages in thread
From: Stephen Hemminger @ 2026-05-21 22:42 UTC (permalink / raw)
  To: Morten Brørup
  Cc: dev, Bruce Richardson, Konstantin Ananyev, Vipin Varghese,
	Liangxing Wang, Thiyagarajan P, Bala Murali Krishna,
	Anatoly Burakov, Vladimir Medvedkin, Konstantin Ananyev

On Thu, 21 May 2026 18:56:31 +0000
Morten Brørup <mb@smartsharesystems.com> wrote:

> The implementation for copying up to 64 bytes does not depend on address
> alignment with the size of the CPU's vector registers. Nonetheless, the
> exact same code for copying up to 64 bytes was present in both the aligned
> copy function and all the CPU vector register size specific variants of
> the unaligned copy functions.
> With this patch, the implementation for copying up to 64 bytes was
> consolidated into one instance, located in the common copy function,
> before checking alignment requirements.
> This provides three benefits:
> 1. No copy-paste in the source code.
> 2. A performance gain for copying up to 64 bytes, because the
> address alignment check is avoided in this case.
> 3. Reduced instruction memory footprint, because the compiler only
> generates one instance of the function for copying up to 64 bytes, instead
> of two instances (one in the unaligned copy function, and one in the
> aligned copy function).
> 
> Furthermore, __rte_restrict was added to source and destination addresses.
> 
> Also, the missing implementation of rte_mov48() was added.
> 
> Until recently, some drivers required disabling stringop-overflow warnings
> when using rte_memcpy().
> For some strange reason, these warnings were disabled in the rte_memcpy
> header file, instead of in the problematic drivers.
> With series-38174 ("remove use of rte_memcpy from net/intel"), the
> problematic drivers were updated to use memcpy() instead of rte_memcpy(),
> so disabling these warnings is no longer required, and was removed.
> 
> Regarding performance...
> The memcpy performance test (cache-to-cache copy) shows:
> Copying up to 15 bytes takes ca. 4.5 cycles, versus ca. 6.5 cycles before.
> Copying 8 bytes takes 4 cycles, versus 7 cycles before.
> Copying 16 bytes takes 2 cycles, versus 4 cycles before.
> Copying 64 bytes takes 4 cycles, versus 7 cycles before.
> 
> Depends-on: series-38174 ("remove use of rte_memcpy from net/intel")
> 
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
> ---

Here are the full, wordy reviews from all providers.
Mostly this is a test to make sure the script works.

$ ./devtools/ai/compare-patch-reviews.sh ~/Downloads/v11-eal-x86-optimize-memcpy-of-small-sizes.patch 
Reviewing patch: v11-eal-x86-optimize-memcpy-of-small-sizes.patch
Providers: anthropic,openai,xai,google
Format: text
========================================

>>> Running review with: anthropic

=== Patch Review: v11-eal-x86-optimize-memcpy-of-small-sizes.patch (via Claude) ===
Review date: 2026-05-21

# Review of v11-eal-x86-optimize-memcpy-of-small-sizes.patch

## Correctness Issues

### Errors

**Use-after-free pattern in overlapping copy optimization (lines 672-673, 684, 704, 707)**

The pattern `(uint8_t *)dst - 16 + n` performs pointer arithmetic that, when `n < 16`, results in accessing memory before the `dst` pointer. This is used in an optimization technique (two overlapping stores) that is correct when `n >= 16`, but the code paths in question guarantee `n` is in specific ranges where this is safe. However, the readability is poor and the intent is not obvious from the code.

**Confidence: 80%** - This pattern is widely used in memcpy implementations and appears intentional, but deserves a comment explaining the technique to avoid confusion in future reviews.

---

## Warnings

**Missing documentation for new function rte_mov48() (line 130)**

The patch adds `rte_mov48()` but does not mention it in the commit message's "Also, the missing implementation of rte_mov48() was added." The implementation itself is correct, but there's no context about why it was missing or whether it's used elsewhere in DPDK.

**Suggestion:** Verify that `rte_mov48()` is actually used somewhere, or document why it's being added speculatively.

---

**Inconsistent handling of compile-time constant size (lines 682-683)**

The code checks `__rte_constant(n) && n == 16` to avoid a "harmless duplicate copy" but does not apply this optimization consistently. For example:
- Line 682: avoids duplicate for `n == 16`
- Line 690: avoids duplicate for `n == 32`
- Line 693: avoids duplicate for `n == 64`

But in the 33-64 byte range (lines 695-707), there's no similar optimization. This is not a bug, but the inconsistency in optimization strategy could be noted.

**Suggestion:** Add a comment explaining the rationale for when this optimization matters vs when the "harmless duplicate" is acceptable.

---

**Potential alignment assumption in AVX path (line 699)**

The AVX path at line 699 uses two overlapping `rte_mov32()` calls for sizes 33-64 bytes:
```c
rte_mov32((uint8_t *)dst, (const uint8_t *)src);
rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
```

For `n = 33`, the second call accesses `dst + 1` and `src + 1`, which may be misaligned for the 32-byte `__m256i` operations. However, `_mm256_loadu_si256()` and `_mm256_storeu_si256()` are documented as supporting unaligned access, so this is safe.

**Confidence: 90%** - Not a bug, but worth noting that the AVX path relies on unaligned load/store support.

---

## Style and Process Issues

### Warnings

**Unconventional comment style in rte_mov15_or_less() (lines 67-72)**

The multi-line comment explaining the rationale for the `rte_uint*_alias` structures does not follow the standard DPDK comment block format with the `*` on each line. However, this may be intentional for a technical explanation.

---

**Return value inconsistency (lines 682, 690-691, 693, 707-708)**

The function returns `dst` in some branches but the original `ret` pointer in others:
- Lines 682, 690, 707: return `dst`
- Line 714: return from `rte_memcpy_aligned_more_than_64(dst, src, n)` which returns `ret = dst`
- Line 716: return from `rte_memcpy_generic_more_than_64(dst, src, n)` which returns `ret = dst`

Both `ret` and `dst` point to the same location, so this is harmless, but using `dst` consistently would be clearer. The patch description states "Pointer to the destination data" is returned, and both satisfy this.

**Confidence: 95%** - Not an error, but inconsistent style within the same function.

---

## Documentation Issues

### Warnings

**Missing documentation update for removed warning suppression (lines removed at top)**

The patch removes:
```c
#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstringop-overflow"
#endif
```

The commit message explains this is because the problematic drivers were updated, with a `Depends-on: series-38174`. However, there's no comment in the code explaining why the suppression was removed or a note in `doc/guides/rel_notes/` documenting this change.

**Suggestion:** Add a release note entry documenting the removal of the warning suppression and the dependency on the driver updates.

---

## Summary

**Priority findings:**

1. **(Info)** The overlapping copy optimization `dst - 16 + n` is correct but deserves a comment explaining the technique.
2. **(Warning)** Missing documentation for why `rte_mov48()` was added.
3. **(Warning)** Missing release notes entry for removal of stringop-overflow warning suppression.
4. **(Info)** Return value uses both `dst` and `ret` inconsistently (harmless but could be unified).

**Overall assessment:** The patch appears functionally correct. The main concerns are documentation completeness (missing context for `rte_mov48()`, missing release notes for warning removal) and code clarity (overlapping copy pattern could use explanatory comments). No blocking correctness issues were identified.
(anthropic: review reported errors)

========================================

>>> Running review with: openai

=== Patch Review: v11-eal-x86-optimize-memcpy-of-small-sizes.patch (via ChatGPT) ===
Review date: 2026-05-21

**DPDK Patch Review: v11-eal-x86-optimize-memcpy-of-small-sizes.patch**

---

### Errors

None found.

- The patch does not introduce resource leaks, use-after-free, race conditions, or missing synchronization.
- There are no buffer overflows or out-of-bounds accesses in the memcpy routines; all copy size calculations and code paths for small copies (<64 bytes) are explicit and conservative.
- No uninitialized or stale variable use detected.
- All error paths are clean (these are computational functions; return values are always valid).

---

### Warnings

1. **Potential inconsistent comment phrasing regarding overlap requirements.**

   - Some Doxygen comments use "locations should not overlap" while others now correctly say "locations must not overlap".
   - While this isn't a correctness bug, consistent strong wording clarifies requirement for users.

   **Suggested fix:** Standardize all relevant Doxygen comments to use "locations must not overlap."

   ```c
   /**
    * Copy bytes from one location to another,
    * locations must not overlap.
    */
   ```

2. **Inconsistent Doxygen param descriptions in function headers.**

   - Some parameter descriptions ("Pointer to the destination of the data"/"Pointer to the destination data") remain a bit unclear or inconsistent. If updating comments, consider making param descriptions exact and explicit.

   **Suggested fix:** For each `@param` clarify if it's the destination or source pointer. For `rte_memcpy`, `"Pointer to the destination."` and `"Pointer to the source."` suffice.

---

### Info

- Use of `__rte_restrict` is correct and improves optimization; matches DPDK style for pointer restrict qualifiers.
- Removal of `#pragma GCC diagnostic ignored "-Wstringop-overflow"` is correct because the problematic drivers now use standard `memcpy()`.
- All new `rte_mov*` functions properly use `static __rte_always_inline` and restrict-qualified pointers.
- Macro duplication is avoided by consolidation, in line with maintainability guidelines.
- No forbidden tokens, banned constructs, or use of deprecated APIs/macros were added.
- No symbol namespace pollution or missing `rte_`/`RTE_` prefix issues present.
- Coding style is maintained in indentation, tabs use, and function/variable names.
- No API, public header, or C11 atomic/volatile/race issues are present.
- No documentation or API tag requirements are triggered (no public API change).
- Patch is self-contained and does not require changes to release notes or test infrastructure.
- No RST or external documentation is involved in this patch.

---

### Summary

**No correctness/blocking issues found.**
Minor documentation consistency updates suggested for comment clarity, but these do not affect correctness.

**This patch is in good shape and complies with DPDK coding and architectural guidelines.**
(openai: review reported errors)

========================================

>>> Running review with: xai

=== Patch Review: v11-eal-x86-optimize-memcpy-of-small-sizes.patch (via Grok) ===
Review date: 2026-05-21

**Review of patch: v11-eal-x86-optimize-memcpy-of-small-sizes.patch**

### Errors

**1. Missing `__rte_restrict` in rte_mov48() implementation (consistency / correctness risk)**  
The new `rte_mov48()` function is declared with `__rte_restrict`, but the implementation casts away the restrict qualifier when calling `rte_mov32()` and `rte_mov16()`:

```c
rte_mov32((uint8_t *)dst, (const uint8_t *)src);
rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
```

**Suggested fix**: Keep the restrict-qualified parameters in the implementation or cast only after documenting the aliasing contract. This is a minor style inconsistency but worth fixing for uniformity.

**2. Potential sign-extension issue on 64-bit offset calculations (left-shift of narrow type)**  
In `rte_mov15_or_less()`, the expression `(n & 8)` is `size_t` (ok), but the subsequent pointer arithmetic:

```c
*(uint64_t *)((uint8_t *)dst + n - 8) = *(const uint64_t *)((const uint8_t *)src + n - 8);
```

relies on implicit promotion. While currently safe, the pattern of adding a `uint8_t`/`size_t` value to a pointer after a narrow mask (`n & 7`) could trigger Coverity SIGN_EXTENSION warnings on some toolchains. Consider an explicit cast:

```c
uint8_t *d = (uint8_t *)dst + (size_t)(n - 8);
```

**3. Use of `__rte_constant` inside hot path without documented behavior**  
The code repeatedly uses `__rte_constant(n)` to optimize compile-time known sizes. This is a DPDK internal macro. While acceptable for performance-critical code, it should be documented (or at least noted) that the optimization only triggers for constant `n`. No correctness bug, but worth a comment.

### Warnings

**1. Duplicate function definitions for AVX/SSE variants**  
The patch introduces three nearly identical `rte_memcpy_generic_more_than_64()` and `rte_memcpy_aligned_more_than_64()` functions (one per vector size). This increases maintenance burden and instruction cache pressure. Consider factoring the common >64-byte logic into a single helper.

**2. Hard-coded cast chains in rte_mov48()**  
The SSE implementation of `rte_mov48()` uses explicit `(uint8_t *)` casts on restrict-qualified pointers. While functionally correct, it reduces readability. A cleaner pattern would be:

```c
rte_mov16(dst + 0, src + 0);
rte_mov16(dst + 16, src + 16);
rte_mov16(dst + 32, src + 32);
```

**3. Inconsistent use of `return dst` vs `return ret`**  
In the new `rte_memcpy()` implementation the fast path returns `dst`, while the old `rte_memcpy_generic()` returned the saved `ret`. Although `dst` and `ret` are identical, using `ret` consistently would avoid any future divergence.

**4. Minimal Doxygen for new rte_mov48()**  
The new function has a Doxygen comment, but the description is minimal. Consider adding a brief note that it is an internal helper (or mark it `__rte_internal` if it should not be public).

### Info

- The removal of the `#pragma GCC diagnostic` block for `-Wstringop-overflow` is correct and aligns with the series that removed `rte_memcpy` usage from drivers.
- Adding `__rte_restrict` to `rte_memcpy()` and helper functions is a good improvement and matches modern DPDK style.
- Performance claims in the commit message are outside the scope of this review (they are not code).

### Summary

No critical correctness bugs (use-after-free, leaks, races, or missing error checks) were identified. The main issues are minor style/consistency items around restrict qualifiers, pointer arithmetic, and code duplication. The patch is otherwise clean and follows DPDK coding guidelines for the x86 EAL memcpy implementation.
(xai: review reported errors)

========================================

>>> Running review with: google

=== Patch Review: v11-eal-x86-optimize-memcpy-of-small-sizes.patch (via Gemini) ===
Review date: 2026-05-21

The following is a review of the patch against DPDK coding standards and guidelines.

### Warnings

**Implicit integer comparison in rte_memcpy.**
DPDK coding style requires explicit comparison against 0 for integer types, rather than using truthiness/logical negation.
```c
/* Current implementation */
if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))

/* Suggested fix */
if ((((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK) == 0)
```

### Info

**Use of bitwise logic in rte_mov15_or_less.**
While this patch only adds `__rte_restrict` to the signature, the existing
(google: review reported warnings)

========================================

Review comparison complete.
Summary across 4 provider(s): clean=0 warnings=1 errors=3 failed=0

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH v11] eal/x86: optimize memcpy of small sizes
  2026-05-21 22:42   ` Stephen Hemminger
@ 2026-06-01 13:38     ` Thomas Monjalon
  2026-06-01 14:19       ` Morten Brørup
  0 siblings, 1 reply; 55+ messages in thread
From: Thomas Monjalon @ 2026-06-01 13:38 UTC (permalink / raw)
  To: Morten Brørup
  Cc: dev, Bruce Richardson, Konstantin Ananyev, Vipin Varghese,
	Liangxing Wang, Thiyagarajan P, Bala Murali Krishna,
	Anatoly Burakov, Vladimir Medvedkin, Konstantin Ananyev,
	Stephen Hemminger

22/05/2026 00:42, Stephen Hemminger:
> On Thu, 21 May 2026 18:56:31 +0000
> Morten Brørup <mb@smartsharesystems.com> wrote:
> 
> > The implementation for copying up to 64 bytes does not depend on address
> > alignment with the size of the CPU's vector registers. Nonetheless, the
> > exact same code for copying up to 64 bytes was present in both the aligned
> > copy function and all the CPU vector register size specific variants of
> > the unaligned copy functions.
> > With this patch, the implementation for copying up to 64 bytes was
> > consolidated into one instance, located in the common copy function,
> > before checking alignment requirements.
> > This provides three benefits:
> > 1. No copy-paste in the source code.
> > 2. A performance gain for copying up to 64 bytes, because the
> > address alignment check is avoided in this case.
> > 3. Reduced instruction memory footprint, because the compiler only
> > generates one instance of the function for copying up to 64 bytes, instead
> > of two instances (one in the unaligned copy function, and one in the
> > aligned copy function).
> > 
> > Furthermore, __rte_restrict was added to source and destination addresses.
> > 
> > Also, the missing implementation of rte_mov48() was added.
> > 
> > Until recently, some drivers required disabling stringop-overflow warnings
> > when using rte_memcpy().
> > For some strange reason, these warnings were disabled in the rte_memcpy
> > header file, instead of in the problematic drivers.
> > With series-38174 ("remove use of rte_memcpy from net/intel"), the
> > problematic drivers were updated to use memcpy() instead of rte_memcpy(),
> > so disabling these warnings is no longer required, and was removed.
> > 
> > Regarding performance...
> > The memcpy performance test (cache-to-cache copy) shows:
> > Copying up to 15 bytes takes ca. 4.5 cycles, versus ca. 6.5 cycles before.
> > Copying 8 bytes takes 4 cycles, versus 7 cycles before.
> > Copying 16 bytes takes 2 cycles, versus 4 cycles before.
> > Copying 64 bytes takes 4 cycles, versus 7 cycles before.
> > 
> > Depends-on: series-38174 ("remove use of rte_memcpy from net/intel")
> > 
> > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> > Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> > Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
> 
> Here is the full wordy all providers reviews.
[...]
> Summary across 4 provider(s): clean=0 warnings=1 errors=3 failed=0

What is the followup?
Do we target DPDK 26.07?



^ permalink raw reply	[flat|nested] 55+ messages in thread

* RE: [PATCH v11] eal/x86: optimize memcpy of small sizes
  2026-06-01 13:38     ` Thomas Monjalon
@ 2026-06-01 14:19       ` Morten Brørup
  0 siblings, 0 replies; 55+ messages in thread
From: Morten Brørup @ 2026-06-01 14:19 UTC (permalink / raw)
  To: Thomas Monjalon, Stephen Hemminger, Bruce Richardson,
	Konstantin Ananyev
  Cc: dev, Konstantin Ananyev, Vipin Varghese, Liangxing Wang,
	Thiyagarajan P, Bala Murali Krishna, Anatoly Burakov,
	Vladimir Medvedkin

> From: Thomas Monjalon [mailto:thomas@monjalon.net]
> Sent: Monday, 1 June 2026 15.38
> 
> 22/05/2026 00:42, Stephen Hemminger:
> > On Thu, 21 May 2026 18:56:31 +0000
> > Morten Brørup <mb@smartsharesystems.com> wrote:
> >
> > > The implementation for copying up to 64 bytes does not depend on
> address
> > > alignment with the size of the CPU's vector registers. Nonetheless,
> the
> > > exact same code for copying up to 64 bytes was present in both the
> aligned
> > > copy function and all the CPU vector register size specific
> variants of
> > > the unaligned copy functions.
> > > With this patch, the implementation for copying up to 64 bytes was
> > > consolidated into one instance, located in the common copy
> function,
> > > before checking alignment requirements.
> > > This provides three benefits:
> > > 1. No copy-paste in the source code.
> > > 2. A performance gain for copying up to 64 bytes, because the
> > > address alignment check is avoided in this case.
> > > 3. Reduced instruction memory footprint, because the compiler only
> > > generates one instance of the function for copying up to 64 bytes,
> instead
> > > of two instances (one in the unaligned copy function, and one in
> the
> > > aligned copy function).
> > >
> > > Furthermore, __rte_restrict was added to source and destination
> addresses.
> > >
> > > Also, the missing implementation of rte_mov48() was added.
> > >
> > > Until recently, some drivers required disabling stringop-overflow
> warnings
> > > when using rte_memcpy().
> > > For some strange reason, these warnings were disabled in the
> rte_memcpy
> > > header file, instead of in the problematic drivers.
> > > With series-38174 ("remove use of rte_memcpy from net/intel"), the
> > > problematic drivers were updated to use memcpy() instead of
> rte_memcpy(),
> > > so disabling these warnings is no longer required, and was removed.
> > >
> > > Regarding performance...
> > > The memcpy performance test (cache-to-cache copy) shows:
> > > Copying up to 15 bytes takes ca. 4.5 cycles, versus ca. 6.5 cycles
> before.
> > > Copying 8 bytes takes 4 cycles, versus 7 cycles before.
> > > Copying 16 bytes takes 2 cycles, versus 4 cycles before.
> > > Copying 64 bytes takes 4 cycles, versus 7 cycles before.
> > >
> > > Depends-on: series-38174 ("remove use of rte_memcpy from
> net/intel")
> > >
> > > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> > > Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> > > Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
> >
> > Here is the full wordy all providers reviews.
> [...]
> > Summary across 4 provider(s): clean=0 warnings=1 errors=3 failed=0
> 
> What is the followup?

AI wants me to fix existing code.
I had chosen to stick to the file's existing coding style etc., including some unnecessary type casts.
So AI also complains about my code doing things the same way existing code in the file does it.
Fixing existing code is out of scope for this patch. And using a different style for my changes would be confusing.

The patch description mentions that stringop-overflow warnings are no longer disabled for rte_memcpy().
AI wants this to go into the release notes (although it is x86 architecture only).
But IMO, this is far below the threshold for what should go into the release notes.

> Do we target DPDK 26.07?

IMO, yes, this v11 patch is good.


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [PATCH v11] eal/x86: optimize memcpy of small sizes
  2026-05-21 18:56 ` [PATCH v11] " Morten Brørup
  2026-05-21 19:48   ` Stephen Hemminger
  2026-05-21 22:42   ` Stephen Hemminger
@ 2026-06-01 19:48   ` Thomas Monjalon
  2 siblings, 0 replies; 55+ messages in thread
From: Thomas Monjalon @ 2026-06-01 19:48 UTC (permalink / raw)
  To: Morten Brørup
  Cc: dev, Bruce Richardson, Konstantin Ananyev, Vipin Varghese,
	Stephen Hemminger, Liangxing Wang, Thiyagarajan P,
	Bala Murali Krishna, Anatoly Burakov, Vladimir Medvedkin,
	Morten Brørup, Konstantin Ananyev

21/05/2026 20:56, Morten Brørup:
> The implementation for copying up to 64 bytes does not depend on address
> alignment with the size of the CPU's vector registers. Nonetheless, the
> exact same code for copying up to 64 bytes was present in both the aligned
> copy function and all the CPU vector register size specific variants of
> the unaligned copy functions.
> With this patch, the implementation for copying up to 64 bytes was
> consolidated into one instance, located in the common copy function,
> before checking alignment requirements.
> This provides three benefits:
> 1. No copy-paste in the source code.
> 2. A performance gain for copying up to 64 bytes, because the
> address alignment check is avoided in this case.
> 3. Reduced instruction memory footprint, because the compiler only
> generates one instance of the function for copying up to 64 bytes, instead
> of two instances (one in the unaligned copy function, and one in the
> aligned copy function).
> 
> Furthermore, __rte_restrict was added to source and destination addresses.
> 
> Also, the missing implementation of rte_mov48() was added.
> 
> Until recently, some drivers required disabling stringop-overflow warnings
> when using rte_memcpy().
> For some strange reason, these warnings were disabled in the rte_memcpy
> header file, instead of in the problematic drivers.
> With series-38174 ("remove use of rte_memcpy from net/intel"), the
> problematic drivers were updated to use memcpy() instead of rte_memcpy(),
> so disabling these warnings is no longer required, and was removed.
> 
> Regarding performance...
> The memcpy performance test (cache-to-cache copy) shows:
> Copying up to 15 bytes takes ca. 4.5 cycles, versus ca. 6.5 cycles before.
> Copying 8 bytes takes 4 cycles, versus 7 cycles before.
> Copying 16 bytes takes 2 cycles, versus 4 cycles before.
> Copying 64 bytes takes 4 cycles, versus 7 cycles before.
> 
> Depends-on: series-38174 ("remove use of rte_memcpy from net/intel")
> 
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>

Applied, thanks.




^ permalink raw reply	[flat|nested] 55+ messages in thread

end of thread, other threads:[~2026-06-01 19:49 UTC | newest]

Thread overview: 55+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-11-20 11:45 [PATCH] eal/x86: reduce memcpy code duplication Morten Brørup
2025-11-21 10:35 ` [PATCH v2] eal/x86: optimize memcpy of small sizes Morten Brørup
2025-11-21 16:57   ` Stephen Hemminger
2025-11-21 17:02     ` Bruce Richardson
2025-11-21 17:11       ` Stephen Hemminger
2025-11-21 21:36         ` Morten Brørup
2025-11-21 10:40 ` Morten Brørup
2025-11-21 10:40 ` [PATCH v3] " Morten Brørup
2025-11-24 13:36   ` Morten Brørup
2025-11-24 15:46     ` Patrick Robb
2025-11-28 14:02   ` Konstantin Ananyev
2025-11-28 15:55     ` Morten Brørup
2025-11-28 18:10       ` Konstantin Ananyev
2025-11-29  2:17         ` Morten Brørup
2025-12-01  9:35           ` Konstantin Ananyev
2025-12-01 10:41             ` Morten Brørup
2025-11-24 20:31 ` [PATCH v4] " Morten Brørup
2025-11-25  8:19   ` Morten Brørup
2025-12-01 15:55 ` [PATCH v5] " Morten Brørup
2025-12-03 13:29   ` Morten Brørup
2026-01-03 17:53   ` Morten Brørup
2026-01-09 15:05     ` Varghese, Vipin
2026-01-11 15:52     ` Konstantin Ananyev
2026-01-11 16:01       ` Stephen Hemminger
2026-01-12  8:02       ` Morten Brørup
2026-01-12 16:00         ` Scott Mitchell
2026-01-13  0:39           ` Stephen Hemminger
2026-01-12 12:03 ` [PATCH v6] " Morten Brørup
2026-01-13 23:19   ` Stephen Hemminger
2026-01-20 11:00     ` Varghese, Vipin
2026-01-20 11:19       ` Varghese, Vipin
2026-01-20 11:22         ` Morten Brørup
2026-01-21 11:48           ` Varghese, Vipin
2026-01-22  6:59             ` Varghese, Vipin
2026-01-22  7:28               ` Liangxing Wang
2026-01-23  6:58               ` Varghese, Vipin
2026-02-20 11:08 ` [PATCH v7] " Morten Brørup
2026-03-11  7:28   ` Morten Brørup
2026-03-11 16:58   ` Bruce Richardson
2026-03-11 18:29     ` Morten Brørup
2026-03-11 19:09       ` Bruce Richardson
2026-03-12  8:33   ` Konstantin Ananyev
2026-03-19 15:55   ` Morten Brørup
2026-04-29  9:36 ` [PATCH v8] " Morten Brørup
2026-04-29 10:35 ` [PATCH v9] " Morten Brørup
2026-04-29 11:24   ` Morten Brørup
2026-05-08  6:32   ` Morten Brørup
2026-05-21 10:54   ` [TEST PATCH " Morten Brørup
2026-05-08  9:58 ` [PATCH v10] " Morten Brørup
2026-05-21 18:56 ` [PATCH v11] " Morten Brørup
2026-05-21 19:48   ` Stephen Hemminger
2026-05-21 22:42   ` Stephen Hemminger
2026-06-01 13:38     ` Thomas Monjalon
2026-06-01 14:19       ` Morten Brørup
2026-06-01 19:48   ` Thomas Monjalon

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox