[PATCH 1/2] net: ethernet address comparison optimizations

public inbox for dev@dpdk.org
 help / color / mirror / Atom feed

* [PATCH 1/2] net: ethernet address comparison optimizations
@ 2026-01-30 10:46 Morten Brørup
  2026-01-30 10:46 ` [PATCH 2/2] [RFC] net: introduce fast ethernet address comparison function Morten Brørup
                   ` (2 more replies)
  0 siblings, 3 replies; 14+ messages in thread
From: Morten Brørup @ 2026-01-30 10:46 UTC (permalink / raw)
  To: dev; +Cc: Morten Brørup

For CPU architectures without strict alignment requirements, operations on
6-byte Ethernet addresses using three 2-byte operations were replaced by a
4-byte and a 2-byte operation, i.e. two operations instead of three.

Comparison functions are pure, so added __rte_pure.

Removed superfluous parentheses. (No functional change.)

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
 lib/net/rte_ether.h | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/lib/net/rte_ether.h b/lib/net/rte_ether.h
index c9a0b536c3..5552d3c1f6 100644
--- a/lib/net/rte_ether.h
+++ b/lib/net/rte_ether.h
@@ -99,13 +99,19 @@ static_assert(alignof(struct rte_ether_addr) == 2,
  *  True  (1) if the given two ethernet address are the same;
  *  False (0) otherwise.
  */
+__rte_pure
 static inline int rte_is_same_ether_addr(const struct rte_ether_addr *ea1,
 				     const struct rte_ether_addr *ea2)
 {
+#if !defined(RTE_ARCH_STRICT_ALIGN)
+	return ((((const unaligned_uint32_t *)ea1)[0] ^ ((const unaligned_uint32_t *)ea2)[0]) |
+			(((const uint16_t *)ea1)[2] ^ ((const uint16_t *)ea2)[2])) == 0;
+#else
 	const uint16_t *w1 = (const uint16_t *)ea1;
 	const uint16_t *w2 = (const uint16_t *)ea2;
 
 	return ((w1[0] ^ w2[0]) | (w1[1] ^ w2[1]) | (w1[2] ^ w2[2])) == 0;
+#endif
 }
 
 /**
@@ -118,11 +124,16 @@ static inline int rte_is_same_ether_addr(const struct rte_ether_addr *ea1,
  *   True  (1) if the given ethernet address is filled with zeros;
  *   false (0) otherwise.
  */
+__rte_pure
 static inline int rte_is_zero_ether_addr(const struct rte_ether_addr *ea)
 {
+#if !defined(RTE_ARCH_STRICT_ALIGN)
+	return (((const unaligned_uint32_t *)ea)[0] | ((const uint16_t *)ea)[2]) == 0;
+#else
 	const uint16_t *w = (const uint16_t *)ea;
 
 	return (w[0] | w[1] | w[2]) == 0;
+#endif
 }
 
 /**
@@ -135,6 +146,7 @@ static inline int rte_is_zero_ether_addr(const struct rte_ether_addr *ea)
  *   True  (1) if the given ethernet address is a unicast address;
  *   false (0) otherwise.
  */
+__rte_pure
 static inline int rte_is_unicast_ether_addr(const struct rte_ether_addr *ea)
 {
 	return (ea->addr_bytes[0] & RTE_ETHER_GROUP_ADDR) == 0;
@@ -150,6 +162,7 @@ static inline int rte_is_unicast_ether_addr(const struct rte_ether_addr *ea)
  *   True  (1) if the given ethernet address is a multicast address;
  *   false (0) otherwise.
  */
+__rte_pure
 static inline int rte_is_multicast_ether_addr(const struct rte_ether_addr *ea)
 {
 	return ea->addr_bytes[0] & RTE_ETHER_GROUP_ADDR;
@@ -165,6 +178,7 @@ static inline int rte_is_multicast_ether_addr(const struct rte_ether_addr *ea)
  *   True  (1) if the given ethernet address is a broadcast address;
  *   false (0) otherwise.
  */
+__rte_pure
 static inline int rte_is_broadcast_ether_addr(const struct rte_ether_addr *ea)
 {
 	const uint16_t *w = (const uint16_t *)ea;
@@ -182,6 +196,7 @@ static inline int rte_is_broadcast_ether_addr(const struct rte_ether_addr *ea)
  *   True  (1) if the given ethernet address is a universally assigned address;
  *   false (0) otherwise.
  */
+__rte_pure
 static inline int rte_is_universal_ether_addr(const struct rte_ether_addr *ea)
 {
 	return (ea->addr_bytes[0] & RTE_ETHER_LOCAL_ADMIN_ADDR) == 0;
@@ -197,6 +212,7 @@ static inline int rte_is_universal_ether_addr(const struct rte_ether_addr *ea)
  *   True  (1) if the given ethernet address is a locally assigned address;
  *   false (0) otherwise.
  */
+__rte_pure
 static inline int rte_is_local_admin_ether_addr(const struct rte_ether_addr *ea)
 {
 	return (ea->addr_bytes[0] & RTE_ETHER_LOCAL_ADMIN_ADDR) != 0;
@@ -213,9 +229,10 @@ static inline int rte_is_local_admin_ether_addr(const struct rte_ether_addr *ea)
  *   True  (1) if the given ethernet address is valid;
  *   false (0) otherwise.
  */
+__rte_pure
 static inline int rte_is_valid_assigned_ether_addr(const struct rte_ether_addr *ea)
 {
-	return rte_is_unicast_ether_addr(ea) && (!rte_is_zero_ether_addr(ea));
+	return rte_is_unicast_ether_addr(ea) && !rte_is_zero_ether_addr(ea);
 }
 
 /**
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 2/2] [RFC] net: introduce fast ethernet address comparison function
  2026-01-30 10:46 [PATCH 1/2] net: ethernet address comparison optimizations Morten Brørup
@ 2026-01-30 10:46 ` Morten Brørup
  2026-01-30 14:03   ` Morten Brørup
  2026-01-30 10:52 ` [PATCH 1/2] net: ethernet address comparison optimizations Bruce Richardson
  2026-01-30 16:20 ` Stephen Hemminger
  2 siblings, 1 reply; 14+ messages in thread
From: Morten Brørup @ 2026-01-30 10:46 UTC (permalink / raw)
  To: dev; +Cc: Morten Brørup

Added a fast ethernet address comparison function for 64-bit CPU
architectures without strict alignment requirements, loading the ethernet
addresses as 64-bit words and comparing the relevant 6 bytes.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
 lib/net/rte_ether.h | 46 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/lib/net/rte_ether.h b/lib/net/rte_ether.h
index 5552d3c1f6..1b640d81c2 100644
--- a/lib/net/rte_ether.h
+++ b/lib/net/rte_ether.h
@@ -114,6 +114,52 @@ static inline int rte_is_same_ether_addr(const struct rte_ether_addr *ea1,
 #endif
 }
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Check if two Ethernet addresses are the same, performance optimized.
+ *
+ * @warning
+ * Intentional buffer overrun:
+ * The Ethernet addresses are loaded as 64-bit integers, i.e.
+ * two bytes past the memory holding the Ethernet addresses are loaded.
+ * The caller must ensure that this does not cause problems.
+ * If an Ethernet address 'ea' is a field in a structure 'S', it can be verified as follows:
+ * \code{.c}
+ *   static_assert(sizeof(struct S) >= offsetof(struct S, ea) + sizeof(uint64_t));
+ * \endcode
+ *
+ * @param ea1
+ *   A pointer to the first ether_addr structure containing the Ethernet address.
+ * @param ea2
+ *   A pointer to the second ether_addr structure containing the Ethernet address.
+ *
+ * @return
+ *   - true if the given two Ethernet addresses are the same;
+ *   - false otherwise.
+ */
+__rte_experimental
+__rte_pure
+static inline bool
+rte_is_same_ether_addr_fast(const struct rte_ether_addr *ea1,
+		const struct rte_ether_addr *ea2)
+{
+#if defined(RTE_ARCH_64) && !defined(RTE_ARCH_STRICT_ALIGN)
+	const unaligned_uint64_t * const a1 = (const unaligned_uint64_t *)ea1;
+	const unaligned_uint64_t * const a2 = (const unaligned_uint64_t *)ea2;
+#if RTE_BYTE_ORDER == RTE_BIG_ENDIAN
+	return (*a1 ^ *a2) >> 16 == 0;
+#elif RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
+	return (*a1 ^ *a2) << 16 == 0;
+#else
+#error "Unknown byte order."
+#endif /* RTE_BYTE_ORDER */
+#else
+	return rte_is_same_ether_addr(ea1, ea2);
+#endif
+}
+
 /**
  * Check if an Ethernet address is filled with zeros.
  *
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] net: ethernet address comparison optimizations
  2026-01-30 10:46 [PATCH 1/2] net: ethernet address comparison optimizations Morten Brørup
  2026-01-30 10:46 ` [PATCH 2/2] [RFC] net: introduce fast ethernet address comparison function Morten Brørup
@ 2026-01-30 10:52 ` Bruce Richardson
  2026-01-30 11:16   ` Morten Brørup
  2026-01-30 16:20 ` Stephen Hemminger
  2 siblings, 1 reply; 14+ messages in thread
From: Bruce Richardson @ 2026-01-30 10:52 UTC (permalink / raw)
  To: Morten Brørup; +Cc: dev

On Fri, Jan 30, 2026 at 10:46:16AM +0000, Morten Brørup wrote:
> For CPU architectures without strict alignment requirements, operations on
> 6-byte Ethernet addresses using three 2-byte operations were replaced by a
> 4-byte and a 2-byte operation, i.e. two operations instead of three.
> 
> Comparison functions are pure, so added __rte_pure.
> 
> Removed superfluous parentheses. (No functional change.)
> 
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> ---
>  lib/net/rte_ether.h | 19 ++++++++++++++++++-
>  1 file changed, 18 insertions(+), 1 deletion(-)
> 
> diff --git a/lib/net/rte_ether.h b/lib/net/rte_ether.h
> index c9a0b536c3..5552d3c1f6 100644
> --- a/lib/net/rte_ether.h
> +++ b/lib/net/rte_ether.h
> @@ -99,13 +99,19 @@ static_assert(alignof(struct rte_ether_addr) == 2,
>   *  True  (1) if the given two ethernet address are the same;
>   *  False (0) otherwise.
>   */
> +__rte_pure
>  static inline int rte_is_same_ether_addr(const struct rte_ether_addr *ea1,
>  				     const struct rte_ether_addr *ea2)
>  {
> +#if !defined(RTE_ARCH_STRICT_ALIGN)
> +	return ((((const unaligned_uint32_t *)ea1)[0] ^ ((const unaligned_uint32_t *)ea2)[0]) |
> +			(((const uint16_t *)ea1)[2] ^ ((const uint16_t *)ea2)[2])) == 0;
> +#else
>  	const uint16_t *w1 = (const uint16_t *)ea1;
>  	const uint16_t *w2 = (const uint16_t *)ea2;
>  
>  	return ((w1[0] ^ w2[0]) | (w1[1] ^ w2[1]) | (w1[2] ^ w2[2])) == 0;
> +#endif
>  }

Is this actually faster? For architectures that support unaligned access,
this looks like something that the compilers should be doing using proper
cost-benefit evaluation based on target architecture, rather than us doing
it in our code.

/Bruce

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH 1/2] net: ethernet address comparison optimizations
  2026-01-30 10:52 ` [PATCH 1/2] net: ethernet address comparison optimizations Bruce Richardson
@ 2026-01-30 11:16   ` Morten Brørup
  2026-01-30 11:26     ` Bruce Richardson
  0 siblings, 1 reply; 14+ messages in thread
From: Morten Brørup @ 2026-01-30 11:16 UTC (permalink / raw)
  To: Bruce Richardson; +Cc: dev

> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> Sent: Friday, 30 January 2026 11.53
> 
> On Fri, Jan 30, 2026 at 10:46:16AM +0000, Morten Brørup wrote:
> > For CPU architectures without strict alignment requirements,
> operations on
> > 6-byte Ethernet addresses using three 2-byte operations were replaced
> by a
> > 4-byte and a 2-byte operation, i.e. two operations instead of three.
> >
> > Comparison functions are pure, so added __rte_pure.
> >
> > Removed superfluous parentheses. (No functional change.)
> >
> > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> > ---
> >  lib/net/rte_ether.h | 19 ++++++++++++++++++-
> >  1 file changed, 18 insertions(+), 1 deletion(-)
> >
> > diff --git a/lib/net/rte_ether.h b/lib/net/rte_ether.h
> > index c9a0b536c3..5552d3c1f6 100644
> > --- a/lib/net/rte_ether.h
> > +++ b/lib/net/rte_ether.h
> > @@ -99,13 +99,19 @@ static_assert(alignof(struct rte_ether_addr) ==
> 2,
> >   *  True  (1) if the given two ethernet address are the same;
> >   *  False (0) otherwise.
> >   */
> > +__rte_pure
> >  static inline int rte_is_same_ether_addr(const struct rte_ether_addr
> *ea1,
> >  				     const struct rte_ether_addr *ea2)
> >  {
> > +#if !defined(RTE_ARCH_STRICT_ALIGN)
> > +	return ((((const unaligned_uint32_t *)ea1)[0] ^ ((const
> unaligned_uint32_t *)ea2)[0]) |
> > +			(((const uint16_t *)ea1)[2] ^ ((const uint16_t
> *)ea2)[2])) == 0;
> > +#else
> >  	const uint16_t *w1 = (const uint16_t *)ea1;
> >  	const uint16_t *w2 = (const uint16_t *)ea2;
> >
> >  	return ((w1[0] ^ w2[0]) | (w1[1] ^ w2[1]) | (w1[2] ^ w2[2])) ==
> 0;
> > +#endif
> >  }
> 
> Is this actually faster?

It's a simple micro-optimization, so I haven't benchmarked it.
On x86, the compiled function is simplified and reduced in size from 34 to 24 bytes:

00000000004ed650 <review_rte_is_same_ether_addr>:
  4ed650:	0f b7 07             	movzwl (%rdi),%eax
  4ed653:	0f b7 57 02          	movzwl 0x2(%rdi),%edx
  4ed657:	66 33 06             	xor    (%rsi),%ax
  4ed65a:	66 33 56 02          	xor    0x2(%rsi),%dx
  4ed65e:	09 d0                	or     %edx,%eax
  4ed660:	0f b7 57 04          	movzwl 0x4(%rdi),%edx
  4ed664:	66 33 56 04          	xor    0x4(%rsi),%dx
  4ed668:	66 09 d0             	or     %dx,%ax
  4ed66b:	0f 94 c0             	sete   %al
  4ed66e:	0f b6 c0             	movzbl %al,%eax
  4ed671:	c3                   	ret
  4ed672:	66 66 2e 0f 1f 84 00 	data16 cs nopw 0x0(%rax,%rax,1)
  4ed679:	00 00 00 00 
  4ed67d:	0f 1f 00             	nopl   (%rax)

00000000004ed680 <rte_is_same_ether_addr_improved>:
  4ed680:	0f b7 47 04          	movzwl 0x4(%rdi),%eax
  4ed684:	66 33 46 04          	xor    0x4(%rsi),%ax
  4ed688:	8b 17                	mov    (%rdi),%edx
  4ed68a:	33 16                	xor    (%rsi),%edx
  4ed68c:	0f b7 c0             	movzwl %ax,%eax
  4ed68f:	09 c2                	or     %eax,%edx
  4ed691:	0f 94 c0             	sete   %al
  4ed694:	0f b6 c0             	movzbl %al,%eax
  4ed697:	c3                   	ret
  4ed698:	0f 1f 84 00 00 00 00 	nopl   0x0(%rax,%rax,1)
  4ed69f:	00

For reference, memcpy() of 6 bytes (compile time constant) also compiles to a 4-byte and a 2-byte operation, not three 2-byte operations.

> For architectures that support strict alignment,
> this looks like something that the compilers should be doing using
> proper
> cost-benefit evaluation based on target architecture, rather than us
> doing
> it in our code.

I agree with the high level message in your comment.
DPDK contains some manual optimizations from back in the day, and the evolution of compilers has made some of them obsolete.

In this case, GCC doesn't optimize it, so I did it manually.
I haven't checked if other compilers are clever enough to do it.


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] net: ethernet address comparison optimizations
  2026-01-30 11:16   ` Morten Brørup
@ 2026-01-30 11:26     ` Bruce Richardson
  2026-01-30 13:54       ` Morten Brørup
  0 siblings, 1 reply; 14+ messages in thread
From: Bruce Richardson @ 2026-01-30 11:26 UTC (permalink / raw)
  To: Morten Brørup; +Cc: dev

On Fri, Jan 30, 2026 at 12:16:43PM +0100, Morten Brørup wrote:
> > From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > Sent: Friday, 30 January 2026 11.53
> > 
> > On Fri, Jan 30, 2026 at 10:46:16AM +0000, Morten Brørup wrote:
> > > For CPU architectures without strict alignment requirements,
> > operations on
> > > 6-byte Ethernet addresses using three 2-byte operations were replaced
> > by a
> > > 4-byte and a 2-byte operation, i.e. two operations instead of three.
> > >
> > > Comparison functions are pure, so added __rte_pure.
> > >
> > > Removed superfluous parentheses. (No functional change.)
> > >
> > > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> > > ---
> > >  lib/net/rte_ether.h | 19 ++++++++++++++++++-
> > >  1 file changed, 18 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/lib/net/rte_ether.h b/lib/net/rte_ether.h
> > > index c9a0b536c3..5552d3c1f6 100644
> > > --- a/lib/net/rte_ether.h
> > > +++ b/lib/net/rte_ether.h
> > > @@ -99,13 +99,19 @@ static_assert(alignof(struct rte_ether_addr) ==
> > 2,
> > >   *  True  (1) if the given two ethernet address are the same;
> > >   *  False (0) otherwise.
> > >   */
> > > +__rte_pure
> > >  static inline int rte_is_same_ether_addr(const struct rte_ether_addr
> > *ea1,
> > >  				     const struct rte_ether_addr *ea2)
> > >  {
> > > +#if !defined(RTE_ARCH_STRICT_ALIGN)
> > > +	return ((((const unaligned_uint32_t *)ea1)[0] ^ ((const
> > unaligned_uint32_t *)ea2)[0]) |
> > > +			(((const uint16_t *)ea1)[2] ^ ((const uint16_t
> > *)ea2)[2])) == 0;
> > > +#else
> > >  	const uint16_t *w1 = (const uint16_t *)ea1;
> > >  	const uint16_t *w2 = (const uint16_t *)ea2;
> > >
> > >  	return ((w1[0] ^ w2[0]) | (w1[1] ^ w2[1]) | (w1[2] ^ w2[2])) ==
> > 0;
> > > +#endif
> > >  }
> > 
> > Is this actually faster?
> 
> It's a simple micro-optimization, so I haven't benchmarked it.
> On x86, the compiled function is simplified and reduced in size from 34 to 24 bytes:
> 
> 00000000004ed650 <review_rte_is_same_ether_addr>:
>   4ed650:	0f b7 07             	movzwl (%rdi),%eax
>   4ed653:	0f b7 57 02          	movzwl 0x2(%rdi),%edx
>   4ed657:	66 33 06             	xor    (%rsi),%ax
>   4ed65a:	66 33 56 02          	xor    0x2(%rsi),%dx
>   4ed65e:	09 d0                	or     %edx,%eax
>   4ed660:	0f b7 57 04          	movzwl 0x4(%rdi),%edx
>   4ed664:	66 33 56 04          	xor    0x4(%rsi),%dx
>   4ed668:	66 09 d0             	or     %dx,%ax
>   4ed66b:	0f 94 c0             	sete   %al
>   4ed66e:	0f b6 c0             	movzbl %al,%eax
>   4ed671:	c3                   	ret
>   4ed672:	66 66 2e 0f 1f 84 00 	data16 cs nopw 0x0(%rax,%rax,1)
>   4ed679:	00 00 00 00 
>   4ed67d:	0f 1f 00             	nopl   (%rax)
> 
> 00000000004ed680 <rte_is_same_ether_addr_improved>:
>   4ed680:	0f b7 47 04          	movzwl 0x4(%rdi),%eax
>   4ed684:	66 33 46 04          	xor    0x4(%rsi),%ax
>   4ed688:	8b 17                	mov    (%rdi),%edx
>   4ed68a:	33 16                	xor    (%rsi),%edx
>   4ed68c:	0f b7 c0             	movzwl %ax,%eax
>   4ed68f:	09 c2                	or     %eax,%edx
>   4ed691:	0f 94 c0             	sete   %al
>   4ed694:	0f b6 c0             	movzbl %al,%eax
>   4ed697:	c3                   	ret
>   4ed698:	0f 1f 84 00 00 00 00 	nopl   0x0(%rax,%rax,1)
>   4ed69f:	00
> 
> For reference, memcpy() of 6 bytes (compile time constant) also compiles to a 4-byte and a 2-byte operation, not three 2-byte operations.
> 
What about memcmp? Does it compile similarly? Before we start adding ifdefs
like this to the code, I'd like to see some measured performance benefits
from it. While the code may be 10 bytes shorter, does that actually
translate into a measurable difference in some app?

/Bruce

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH 1/2] net: ethernet address comparison optimizations
  2026-01-30 11:26     ` Bruce Richardson
@ 2026-01-30 13:54       ` Morten Brørup
  2026-01-30 14:02         ` Bruce Richardson
  0 siblings, 1 reply; 14+ messages in thread
From: Morten Brørup @ 2026-01-30 13:54 UTC (permalink / raw)
  To: Bruce Richardson; +Cc: dev

> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> Sent: Friday, 30 January 2026 12.27
> 
> On Fri, Jan 30, 2026 at 12:16:43PM +0100, Morten Brørup wrote:
> > > From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > > Sent: Friday, 30 January 2026 11.53
> > >
> > > On Fri, Jan 30, 2026 at 10:46:16AM +0000, Morten Brørup wrote:
> > > > For CPU architectures without strict alignment requirements,
> > > operations on
> > > > 6-byte Ethernet addresses using three 2-byte operations were
> replaced
> > > by a
> > > > 4-byte and a 2-byte operation, i.e. two operations instead of
> three.
> > > >
> > > > Comparison functions are pure, so added __rte_pure.
> > > >
> > > > Removed superfluous parentheses. (No functional change.)
> > > >
> > > > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> > > > ---
> > > >  lib/net/rte_ether.h | 19 ++++++++++++++++++-
> > > >  1 file changed, 18 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git a/lib/net/rte_ether.h b/lib/net/rte_ether.h
> > > > index c9a0b536c3..5552d3c1f6 100644
> > > > --- a/lib/net/rte_ether.h
> > > > +++ b/lib/net/rte_ether.h
> > > > @@ -99,13 +99,19 @@ static_assert(alignof(struct rte_ether_addr)
> ==
> > > 2,
> > > >   *  True  (1) if the given two ethernet address are the same;
> > > >   *  False (0) otherwise.
> > > >   */
> > > > +__rte_pure
> > > >  static inline int rte_is_same_ether_addr(const struct
> rte_ether_addr
> > > *ea1,
> > > >  				     const struct rte_ether_addr *ea2)
> > > >  {
> > > > +#if !defined(RTE_ARCH_STRICT_ALIGN)
> > > > +	return ((((const unaligned_uint32_t *)ea1)[0] ^ ((const
> > > unaligned_uint32_t *)ea2)[0]) |
> > > > +			(((const uint16_t *)ea1)[2] ^ ((const uint16_t
> > > *)ea2)[2])) == 0;
> > > > +#else
> > > >  	const uint16_t *w1 = (const uint16_t *)ea1;
> > > >  	const uint16_t *w2 = (const uint16_t *)ea2;
> > > >
> > > >  	return ((w1[0] ^ w2[0]) | (w1[1] ^ w2[1]) | (w1[2] ^
> w2[2])) ==
> > > 0;
> > > > +#endif
> > > >  }
> > >
> > > Is this actually faster?
> >
> > It's a simple micro-optimization, so I haven't benchmarked it.
> > On x86, the compiled function is simplified and reduced in size from
> 34 to 24 bytes:
> >
> > 00000000004ed650 <review_rte_is_same_ether_addr>:
> >   4ed650:	0f b7 07             	movzwl (%rdi),%eax
> >   4ed653:	0f b7 57 02          	movzwl 0x2(%rdi),%edx
> >   4ed657:	66 33 06             	xor    (%rsi),%ax
> >   4ed65a:	66 33 56 02          	xor    0x2(%rsi),%dx
> >   4ed65e:	09 d0                	or     %edx,%eax
> >   4ed660:	0f b7 57 04          	movzwl 0x4(%rdi),%edx
> >   4ed664:	66 33 56 04          	xor    0x4(%rsi),%dx
> >   4ed668:	66 09 d0             	or     %dx,%ax
> >   4ed66b:	0f 94 c0             	sete   %al
> >   4ed66e:	0f b6 c0             	movzbl %al,%eax
> >   4ed671:	c3                   	ret
> >   4ed672:	66 66 2e 0f 1f 84 00 	data16 cs nopw 0x0(%rax,%rax,1)
> >   4ed679:	00 00 00 00
> >   4ed67d:	0f 1f 00             	nopl   (%rax)
> >
> > 00000000004ed680 <rte_is_same_ether_addr_improved>:
> >   4ed680:	0f b7 47 04          	movzwl 0x4(%rdi),%eax
> >   4ed684:	66 33 46 04          	xor    0x4(%rsi),%ax
> >   4ed688:	8b 17                	mov    (%rdi),%edx
> >   4ed68a:	33 16                	xor    (%rsi),%edx
> >   4ed68c:	0f b7 c0             	movzwl %ax,%eax
> >   4ed68f:	09 c2                	or     %eax,%edx
> >   4ed691:	0f 94 c0             	sete   %al
> >   4ed694:	0f b6 c0             	movzbl %al,%eax
> >   4ed697:	c3                   	ret
> >   4ed698:	0f 1f 84 00 00 00 00 	nopl   0x0(%rax,%rax,1)
> >   4ed69f:	00
> >
> > For reference, memcpy() of 6 bytes (compile time constant) also
> compiles to a 4-byte and a 2-byte operation, not three 2-byte
> operations.
> >
> What about memcmp? Does it compile similarly?

memcmp(a,b,6) on Clang compiles into something very similar.
memcmp(a,b,6) on GCC compiles into something with a branch after the first 4-byte comparison, with the assumption (regarding static branch prediction) that they are likely to differ.
I guess GCC's counterproductive behavior was the reason for originally implementing a manual comparison, instead of simply using memcmp().

BTW, GCC is clever enough to compile 8-byte and 16-byte comparisons into code without branches.
I guess that's why rte_ipv6_addr_eq() is implemented using memcmp() [1].

[1]: https://elixir.bootlin.com/dpdk/v25.11/source/lib/net/rte_ip6.h#L68

> Before we start adding ifdefs
> like this to the code, I'd like to see some measured performance
> benefits
> from it. While the code may be 10 bytes shorter, does that actually
> translate into a measurable difference in some app?

Excellent question!
Some quick rudimentary testing shows that it seems to be ~4 cycles slower than what it's replacing.
Reality beats expectations.

I'll drop this patch.


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] net: ethernet address comparison optimizations
  2026-01-30 13:54       ` Morten Brørup
@ 2026-01-30 14:02         ` Bruce Richardson
  2026-01-30 14:25           ` Morten Brørup
  0 siblings, 1 reply; 14+ messages in thread
From: Bruce Richardson @ 2026-01-30 14:02 UTC (permalink / raw)
  To: Morten Brørup; +Cc: dev

On Fri, Jan 30, 2026 at 02:54:52PM +0100, Morten Brørup wrote:
> > From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > Sent: Friday, 30 January 2026 12.27
> > 
> > On Fri, Jan 30, 2026 at 12:16:43PM +0100, Morten Brørup wrote:
> > > > From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > > > Sent: Friday, 30 January 2026 11.53
> > > >
> > > > On Fri, Jan 30, 2026 at 10:46:16AM +0000, Morten Brørup wrote:
> > > > > For CPU architectures without strict alignment requirements,
> > > > operations on
> > > > > 6-byte Ethernet addresses using three 2-byte operations were
> > replaced
> > > > by a
> > > > > 4-byte and a 2-byte operation, i.e. two operations instead of
> > three.
> > > > >
> > > > > Comparison functions are pure, so added __rte_pure.
> > > > >
> > > > > Removed superfluous parentheses. (No functional change.)
> > > > >
> > > > > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> > > > > ---
> > > > >  lib/net/rte_ether.h | 19 ++++++++++++++++++-
> > > > >  1 file changed, 18 insertions(+), 1 deletion(-)
> > > > >
> > > > > diff --git a/lib/net/rte_ether.h b/lib/net/rte_ether.h
> > > > > index c9a0b536c3..5552d3c1f6 100644
> > > > > --- a/lib/net/rte_ether.h
> > > > > +++ b/lib/net/rte_ether.h
> > > > > @@ -99,13 +99,19 @@ static_assert(alignof(struct rte_ether_addr)
> > ==
> > > > 2,
> > > > >   *  True  (1) if the given two ethernet address are the same;
> > > > >   *  False (0) otherwise.
> > > > >   */
> > > > > +__rte_pure
> > > > >  static inline int rte_is_same_ether_addr(const struct
> > rte_ether_addr
> > > > *ea1,
> > > > >  				     const struct rte_ether_addr *ea2)
> > > > >  {
> > > > > +#if !defined(RTE_ARCH_STRICT_ALIGN)
> > > > > +	return ((((const unaligned_uint32_t *)ea1)[0] ^ ((const
> > > > unaligned_uint32_t *)ea2)[0]) |
> > > > > +			(((const uint16_t *)ea1)[2] ^ ((const uint16_t
> > > > *)ea2)[2])) == 0;
> > > > > +#else
> > > > >  	const uint16_t *w1 = (const uint16_t *)ea1;
> > > > >  	const uint16_t *w2 = (const uint16_t *)ea2;
> > > > >
> > > > >  	return ((w1[0] ^ w2[0]) | (w1[1] ^ w2[1]) | (w1[2] ^
> > w2[2])) ==
> > > > 0;
> > > > > +#endif
> > > > >  }
> > > >
> > > > Is this actually faster?
> > >
> > > It's a simple micro-optimization, so I haven't benchmarked it.
> > > On x86, the compiled function is simplified and reduced in size from
> > 34 to 24 bytes:
> > >
> > > 00000000004ed650 <review_rte_is_same_ether_addr>:
> > >   4ed650:	0f b7 07             	movzwl (%rdi),%eax
> > >   4ed653:	0f b7 57 02          	movzwl 0x2(%rdi),%edx
> > >   4ed657:	66 33 06             	xor    (%rsi),%ax
> > >   4ed65a:	66 33 56 02          	xor    0x2(%rsi),%dx
> > >   4ed65e:	09 d0                	or     %edx,%eax
> > >   4ed660:	0f b7 57 04          	movzwl 0x4(%rdi),%edx
> > >   4ed664:	66 33 56 04          	xor    0x4(%rsi),%dx
> > >   4ed668:	66 09 d0             	or     %dx,%ax
> > >   4ed66b:	0f 94 c0             	sete   %al
> > >   4ed66e:	0f b6 c0             	movzbl %al,%eax
> > >   4ed671:	c3                   	ret
> > >   4ed672:	66 66 2e 0f 1f 84 00 	data16 cs nopw 0x0(%rax,%rax,1)
> > >   4ed679:	00 00 00 00
> > >   4ed67d:	0f 1f 00             	nopl   (%rax)
> > >
> > > 00000000004ed680 <rte_is_same_ether_addr_improved>:
> > >   4ed680:	0f b7 47 04          	movzwl 0x4(%rdi),%eax
> > >   4ed684:	66 33 46 04          	xor    0x4(%rsi),%ax
> > >   4ed688:	8b 17                	mov    (%rdi),%edx
> > >   4ed68a:	33 16                	xor    (%rsi),%edx
> > >   4ed68c:	0f b7 c0             	movzwl %ax,%eax
> > >   4ed68f:	09 c2                	or     %eax,%edx
> > >   4ed691:	0f 94 c0             	sete   %al
> > >   4ed694:	0f b6 c0             	movzbl %al,%eax
> > >   4ed697:	c3                   	ret
> > >   4ed698:	0f 1f 84 00 00 00 00 	nopl   0x0(%rax,%rax,1)
> > >   4ed69f:	00
> > >
> > > For reference, memcpy() of 6 bytes (compile time constant) also
> > compiles to a 4-byte and a 2-byte operation, not three 2-byte
> > operations.
> > >
> > What about memcmp? Does it compile similarly?
> 
> memcmp(a,b,6) on Clang compiles into something very similar.
> memcmp(a,b,6) on GCC compiles into something with a branch after the first 4-byte comparison, with the assumption (regarding static branch prediction) that they are likely to differ.
> I guess GCC's counterproductive behavior was the reason for originally implementing a manual comparison, instead of simply using memcmp().
> 
> BTW, GCC is clever enough to compile 8-byte and 16-byte comparisons into code without branches.
> I guess that's why rte_ipv6_addr_eq() is implemented using memcmp() [1].
> 
> [1]: https://elixir.bootlin.com/dpdk/v25.11/source/lib/net/rte_ip6.h#L68
> 
> > Before we start adding ifdefs
> > like this to the code, I'd like to see some measured performance
> > benefits
> > from it. While the code may be 10 bytes shorter, does that actually
> > translate into a measurable difference in some app?
> 
> Excellent question!
> Some quick rudimentary testing shows that it seems to be ~4 cycles slower than what it's replacing.
> Reality beats expectations.
> 
> I'll drop this patch.
>
If you have the test-case already prepared, can you also check what
memcmp() performs like? Replacing the whole function by memcmp and punting
the optimization to the compiler would be a nice, though small, code
improvement.

/Bruce 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH 2/2] [RFC] net: introduce fast ethernet address comparison function
  2026-01-30 10:46 ` [PATCH 2/2] [RFC] net: introduce fast ethernet address comparison function Morten Brørup
@ 2026-01-30 14:03   ` Morten Brørup
  0 siblings, 0 replies; 14+ messages in thread
From: Morten Brørup @ 2026-01-30 14:03 UTC (permalink / raw)
  To: Bruce Richardson, dev

> From: Morten Brørup [mailto:mb@smartsharesystems.com]
> Sent: Friday, 30 January 2026 11.46
> 
> Added a fast ethernet address comparison function for 64-bit CPU
> architectures without strict alignment requirements, loading the
> ethernet
> addresses as 64-bit words and comparing the relevant 6 bytes.

Some quick testing in a real application shows this is ~2 cycles faster than the standard rte_is_same_ether_addr() implementation (without the patch 1/2 changes).

> 
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> ---
>  lib/net/rte_ether.h | 46 +++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 46 insertions(+)
> 
> diff --git a/lib/net/rte_ether.h b/lib/net/rte_ether.h
> index 5552d3c1f6..1b640d81c2 100644
> --- a/lib/net/rte_ether.h
> +++ b/lib/net/rte_ether.h
> @@ -114,6 +114,52 @@ static inline int rte_is_same_ether_addr(const
> struct rte_ether_addr *ea1,
>  #endif
>  }
> 
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change, or be removed, without prior
> notice
> + *
> + * Check if two Ethernet addresses are the same, performance
> optimized.
> + *
> + * @warning
> + * Intentional buffer overrun:
> + * The Ethernet addresses are loaded as 64-bit integers, i.e.
> + * two bytes past the memory holding the Ethernet addresses are
> loaded.
> + * The caller must ensure that this does not cause problems.
> + * If an Ethernet address 'ea' is a field in a structure 'S', it can
> be verified as follows:
> + * \code{.c}
> + *   static_assert(sizeof(struct S) >= offsetof(struct S, ea) +
> sizeof(uint64_t));
> + * \endcode
> + *
> + * @param ea1
> + *   A pointer to the first ether_addr structure containing the
> Ethernet address.
> + * @param ea2
> + *   A pointer to the second ether_addr structure containing the
> Ethernet address.
> + *
> + * @return
> + *   - true if the given two Ethernet addresses are the same;
> + *   - false otherwise.
> + */
> +__rte_experimental
> +__rte_pure
> +static inline bool
> +rte_is_same_ether_addr_fast(const struct rte_ether_addr *ea1,
> +		const struct rte_ether_addr *ea2)
> +{
> +#if defined(RTE_ARCH_64) && !defined(RTE_ARCH_STRICT_ALIGN)
> +	const unaligned_uint64_t * const a1 = (const unaligned_uint64_t
> *)ea1;
> +	const unaligned_uint64_t * const a2 = (const unaligned_uint64_t
> *)ea2;
> +#if RTE_BYTE_ORDER == RTE_BIG_ENDIAN
> +	return (*a1 ^ *a2) >> 16 == 0;
> +#elif RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
> +	return (*a1 ^ *a2) << 16 == 0;
> +#else
> +#error "Unknown byte order."
> +#endif /* RTE_BYTE_ORDER */
> +#else
> +	return rte_is_same_ether_addr(ea1, ea2);
> +#endif
> +}
> +
>  /**
>   * Check if an Ethernet address is filled with zeros.
>   *
> --
> 2.43.0


^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH 1/2] net: ethernet address comparison optimizations
  2026-01-30 14:02         ` Bruce Richardson
@ 2026-01-30 14:25           ` Morten Brørup
  2026-01-30 14:32             ` Bruce Richardson
  0 siblings, 1 reply; 14+ messages in thread
From: Morten Brørup @ 2026-01-30 14:25 UTC (permalink / raw)
  To: Bruce Richardson; +Cc: dev

> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> Sent: Friday, 30 January 2026 15.03
> 
> On Fri, Jan 30, 2026 at 02:54:52PM +0100, Morten Brørup wrote:
> > > From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > > Sent: Friday, 30 January 2026 12.27
> > >
> > > On Fri, Jan 30, 2026 at 12:16:43PM +0100, Morten Brørup wrote:
> > > > > From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > > > > Sent: Friday, 30 January 2026 11.53
> > > > >
> > > > > On Fri, Jan 30, 2026 at 10:46:16AM +0000, Morten Brørup wrote:
> > > > > > For CPU architectures without strict alignment requirements,
> > > > > operations on
> > > > > > 6-byte Ethernet addresses using three 2-byte operations were
> > > replaced
> > > > > by a
> > > > > > 4-byte and a 2-byte operation, i.e. two operations instead of
> > > three.
> > > > > >
> > > > > > Comparison functions are pure, so added __rte_pure.
> > > > > >
> > > > > > Removed superfluous parentheses. (No functional change.)
> > > > > >
> > > > > > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> > > > > > ---
> > > > > >  lib/net/rte_ether.h | 19 ++++++++++++++++++-
> > > > > >  1 file changed, 18 insertions(+), 1 deletion(-)
> > > > > >
> > > > > > diff --git a/lib/net/rte_ether.h b/lib/net/rte_ether.h
> > > > > > index c9a0b536c3..5552d3c1f6 100644
> > > > > > --- a/lib/net/rte_ether.h
> > > > > > +++ b/lib/net/rte_ether.h
> > > > > > @@ -99,13 +99,19 @@ static_assert(alignof(struct
> rte_ether_addr)
> > > ==
> > > > > 2,
> > > > > >   *  True  (1) if the given two ethernet address are the
> same;
> > > > > >   *  False (0) otherwise.
> > > > > >   */
> > > > > > +__rte_pure
> > > > > >  static inline int rte_is_same_ether_addr(const struct
> > > rte_ether_addr
> > > > > *ea1,
> > > > > >  				     const struct rte_ether_addr *ea2)
> > > > > >  {
> > > > > > +#if !defined(RTE_ARCH_STRICT_ALIGN)
> > > > > > +	return ((((const unaligned_uint32_t *)ea1)[0] ^ ((const
> > > > > unaligned_uint32_t *)ea2)[0]) |
> > > > > > +			(((const uint16_t *)ea1)[2] ^ ((const uint16_t
> > > > > *)ea2)[2])) == 0;
> > > > > > +#else
> > > > > >  	const uint16_t *w1 = (const uint16_t *)ea1;
> > > > > >  	const uint16_t *w2 = (const uint16_t *)ea2;
> > > > > >
> > > > > >  	return ((w1[0] ^ w2[0]) | (w1[1] ^ w2[1]) | (w1[2] ^
> > > w2[2])) ==
> > > > > 0;
> > > > > > +#endif
> > > > > >  }
> > > > >
> > > > > Is this actually faster?
> > > >
> > > > It's a simple micro-optimization, so I haven't benchmarked it.
> > > > On x86, the compiled function is simplified and reduced in size
> from
> > > 34 to 24 bytes:
> > > >
> > > > 00000000004ed650 <review_rte_is_same_ether_addr>:
> > > >   4ed650:	0f b7 07             	movzwl (%rdi),%eax
> > > >   4ed653:	0f b7 57 02          	movzwl 0x2(%rdi),%edx
> > > >   4ed657:	66 33 06             	xor    (%rsi),%ax
> > > >   4ed65a:	66 33 56 02          	xor    0x2(%rsi),%dx
> > > >   4ed65e:	09 d0                	or     %edx,%eax
> > > >   4ed660:	0f b7 57 04          	movzwl 0x4(%rdi),%edx
> > > >   4ed664:	66 33 56 04          	xor    0x4(%rsi),%dx
> > > >   4ed668:	66 09 d0             	or     %dx,%ax
> > > >   4ed66b:	0f 94 c0             	sete   %al
> > > >   4ed66e:	0f b6 c0             	movzbl %al,%eax
> > > >   4ed671:	c3                   	ret
> > > >   4ed672:	66 66 2e 0f 1f 84 00 	data16 cs nopw
> 0x0(%rax,%rax,1)
> > > >   4ed679:	00 00 00 00
> > > >   4ed67d:	0f 1f 00             	nopl   (%rax)
> > > >
> > > > 00000000004ed680 <rte_is_same_ether_addr_improved>:
> > > >   4ed680:	0f b7 47 04          	movzwl 0x4(%rdi),%eax
> > > >   4ed684:	66 33 46 04          	xor    0x4(%rsi),%ax
> > > >   4ed688:	8b 17                	mov    (%rdi),%edx
> > > >   4ed68a:	33 16                	xor    (%rsi),%edx
> > > >   4ed68c:	0f b7 c0             	movzwl %ax,%eax
> > > >   4ed68f:	09 c2                	or     %eax,%edx
> > > >   4ed691:	0f 94 c0             	sete   %al
> > > >   4ed694:	0f b6 c0             	movzbl %al,%eax
> > > >   4ed697:	c3                   	ret
> > > >   4ed698:	0f 1f 84 00 00 00 00 	nopl   0x0(%rax,%rax,1)
> > > >   4ed69f:	00
> > > >
> > > > For reference, memcpy() of 6 bytes (compile time constant) also
> > > compiles to a 4-byte and a 2-byte operation, not three 2-byte
> > > operations.
> > > >
> > > What about memcmp? Does it compile similarly?
> >
> > memcmp(a,b,6) on Clang compiles into something very similar.
> > memcmp(a,b,6) on GCC compiles into something with a branch after the
> first 4-byte comparison, with the assumption (regarding static branch
> prediction) that they are likely to differ.
> > I guess GCC's counterproductive behavior was the reason for
> originally implementing a manual comparison, instead of simply using
> memcmp().
> >
> > BTW, GCC is clever enough to compile 8-byte and 16-byte comparisons
> into code without branches.
> > > I guess that's why rte_ipv6_addr_eq() is implemented using memcmp()
> [1].
> >
> > [1]:
> https://elixir.bootlin.com/dpdk/v25.11/source/lib/net/rte_ip6.h#L68
> >
> > > Before we start adding ifdefs
> > > like this to the code, I'd like to see some measured performance
> > > benefits
> > > from it. While the code may be 10 bytes shorter, does that actually
> > > translate into a measurable difference in some app?
> >
> > Excellent question!
> > Some quick rudimentary testing shows that it seems to be ~4 cycles
> slower than what it's replacing.
> > Reality beats expectations.
> >
> > I'll drop this patch.
> >
> If you have the test-case already prepared, can you also check what
> memcmp() performs like? Replacing the whole function by memcmp and
> punting
> the optimization to the compiler would be a nice, though small, code
> improvement.

Good you asked!

While setting up the test for memcmp(), I noticed that I had been testing my improved function without "inline".
With inline (like the original), it's ~1 cycle faster than the original.
I have restored the patch status to "New".

The memcmp() test (not forgetting "inline") performs very close to the original.


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] net: ethernet address comparison optimizations
  2026-01-30 14:25           ` Morten Brørup
@ 2026-01-30 14:32             ` Bruce Richardson
  2026-01-30 14:59               ` Morten Brørup
  0 siblings, 1 reply; 14+ messages in thread
From: Bruce Richardson @ 2026-01-30 14:32 UTC (permalink / raw)
  To: Morten Brørup; +Cc: dev

On Fri, Jan 30, 2026 at 03:25:34PM +0100, Morten Brørup wrote:
> > From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > Sent: Friday, 30 January 2026 15.03
> > 
> > On Fri, Jan 30, 2026 at 02:54:52PM +0100, Morten Brørup wrote:
> > > > From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > > > Sent: Friday, 30 January 2026 12.27
> > > >
> > > > On Fri, Jan 30, 2026 at 12:16:43PM +0100, Morten Brørup wrote:
> > > > > > From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > > > > > Sent: Friday, 30 January 2026 11.53
> > > > > >
> > > > > > On Fri, Jan 30, 2026 at 10:46:16AM +0000, Morten Brørup wrote:
> > > > > > > For CPU architectures without strict alignment requirements,
> > > > > > operations on
> > > > > > > 6-byte Ethernet addresses using three 2-byte operations were
> > > > replaced
> > > > > > by a
> > > > > > > 4-byte and a 2-byte operation, i.e. two operations instead of
> > > > three.
> > > > > > >
> > > > > > > Comparison functions are pure, so added __rte_pure.
> > > > > > >
> > > > > > > Removed superfluous parentheses. (No functional change.)
> > > > > > >
> > > > > > > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> > > > > > > ---
> > > > > > >  lib/net/rte_ether.h | 19 ++++++++++++++++++-
> > > > > > >  1 file changed, 18 insertions(+), 1 deletion(-)
> > > > > > >
> > > > > > > diff --git a/lib/net/rte_ether.h b/lib/net/rte_ether.h
> > > > > > > index c9a0b536c3..5552d3c1f6 100644
> > > > > > > --- a/lib/net/rte_ether.h
> > > > > > > +++ b/lib/net/rte_ether.h
> > > > > > > @@ -99,13 +99,19 @@ static_assert(alignof(struct
> > rte_ether_addr)
> > > > ==
> > > > > > 2,
> > > > > > >   *  True  (1) if the given two ethernet address are the
> > same;
> > > > > > >   *  False (0) otherwise.
> > > > > > >   */
> > > > > > > +__rte_pure
> > > > > > >  static inline int rte_is_same_ether_addr(const struct
> > > > rte_ether_addr
> > > > > > *ea1,
> > > > > > >  				     const struct rte_ether_addr *ea2)
> > > > > > >  {
> > > > > > > +#if !defined(RTE_ARCH_STRICT_ALIGN)
> > > > > > > +	return ((((const unaligned_uint32_t *)ea1)[0] ^ ((const
> > > > > > unaligned_uint32_t *)ea2)[0]) |
> > > > > > > +			(((const uint16_t *)ea1)[2] ^ ((const uint16_t
> > > > > > *)ea2)[2])) == 0;
> > > > > > > +#else
> > > > > > >  	const uint16_t *w1 = (const uint16_t *)ea1;
> > > > > > >  	const uint16_t *w2 = (const uint16_t *)ea2;
> > > > > > >
> > > > > > >  	return ((w1[0] ^ w2[0]) | (w1[1] ^ w2[1]) | (w1[2] ^
> > > > w2[2])) ==
> > > > > > 0;
> > > > > > > +#endif
> > > > > > >  }
> > > > > >
> > > > > > Is this actually faster?
> > > > >
> > > > > It's a simple micro-optimization, so I haven't benchmarked it.
> > > > > On x86, the compiled function is simplified and reduced in size
> > from
> > > > 34 to 24 bytes:
> > > > >
> > > > > 00000000004ed650 <review_rte_is_same_ether_addr>:
> > > > >   4ed650:	0f b7 07             	movzwl (%rdi),%eax
> > > > >   4ed653:	0f b7 57 02          	movzwl 0x2(%rdi),%edx
> > > > >   4ed657:	66 33 06             	xor    (%rsi),%ax
> > > > >   4ed65a:	66 33 56 02          	xor    0x2(%rsi),%dx
> > > > >   4ed65e:	09 d0                	or     %edx,%eax
> > > > >   4ed660:	0f b7 57 04          	movzwl 0x4(%rdi),%edx
> > > > >   4ed664:	66 33 56 04          	xor    0x4(%rsi),%dx
> > > > >   4ed668:	66 09 d0             	or     %dx,%ax
> > > > >   4ed66b:	0f 94 c0             	sete   %al
> > > > >   4ed66e:	0f b6 c0             	movzbl %al,%eax
> > > > >   4ed671:	c3                   	ret
> > > > >   4ed672:	66 66 2e 0f 1f 84 00 	data16 cs nopw
> > 0x0(%rax,%rax,1)
> > > > >   4ed679:	00 00 00 00
> > > > >   4ed67d:	0f 1f 00             	nopl   (%rax)
> > > > >
> > > > > 00000000004ed680 <rte_is_same_ether_addr_improved>:
> > > > >   4ed680:	0f b7 47 04          	movzwl 0x4(%rdi),%eax
> > > > >   4ed684:	66 33 46 04          	xor    0x4(%rsi),%ax
> > > > >   4ed688:	8b 17                	mov    (%rdi),%edx
> > > > >   4ed68a:	33 16                	xor    (%rsi),%edx
> > > > >   4ed68c:	0f b7 c0             	movzwl %ax,%eax
> > > > >   4ed68f:	09 c2                	or     %eax,%edx
> > > > >   4ed691:	0f 94 c0             	sete   %al
> > > > >   4ed694:	0f b6 c0             	movzbl %al,%eax
> > > > >   4ed697:	c3                   	ret
> > > > >   4ed698:	0f 1f 84 00 00 00 00 	nopl   0x0(%rax,%rax,1)
> > > > >   4ed69f:	00
> > > > >
> > > > > For reference, memcpy() of 6 bytes (compile time constant) also
> > > > compiles to a 4-byte and a 2-byte operation, not three 2-byte
> > > > operations.
> > > > >
> > > > What about memcmp? Does it compile similarly?
> > >
> > > memcmp(a,b,6) on Clang compiles into something very similar.
> > > memcmp(a,b,6) on GCC compiles into something with a branch after the
> > first 4-byte comparison, with the assumption (regarding static branch
> > prediction) that they are likely to differ.
> > > I guess GCC's counterproductive behavior was the reason for
> > originally implementing a manual comparison, instead of simply using
> > memcmp().
> > >
> > > BTW, GCC is clever enough to compile 8-byte and 16-byte comparisons
> > into code without branches.
> > > > I guess that's why rte_ipv6_addr_eq() is implemented using memcmp()
> > [1].
> > >
> > > [1]:
> > https://elixir.bootlin.com/dpdk/v25.11/source/lib/net/rte_ip6.h#L68
> > >
> > > > Before we start adding ifdefs
> > > > like this to the code, I'd like to see some measured performance
> > > > benefits
> > > > from it. While the code may be 10 bytes shorter, does that actually
> > > > translate into a measurable difference in some app?
> > >
> > > Excellent question!
> > > Some quick rudimentary testing shows that it seems to be ~4 cycles
> > slower than what it's replacing.
> > > Reality beats expectations.
> > >
> > > I'll drop this patch.
> > >
> > If you have the test-case already prepared, can you also check what
> > memcmp() performs like? Replacing the whole function by memcmp and
> > punting
> > the optimization to the compiler would be a nice, though small, code
> > improvement.
> 
> Good you asked!
> 
> While setting up the test for memcmp(), I noticed that I had been testing my improved function without "inline".
> With inline (like the original), it's ~1 cycle faster than the original.
> I have restored the patch status to "New".
> 
> The memcmp() test (not forgetting "inline") performs very close to the original.
> 

If memcmp performs like the original, I'd be tempted to forgo the 1-cycle
benefit just to have the shortest, simplest code.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH 1/2] net: ethernet address comparison optimizations
  2026-01-30 14:32             ` Bruce Richardson
@ 2026-01-30 14:59               ` Morten Brørup
  0 siblings, 0 replies; 14+ messages in thread
From: Morten Brørup @ 2026-01-30 14:59 UTC (permalink / raw)
  To: Bruce Richardson; +Cc: dev

> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> Sent: Friday, 30 January 2026 15.33
> 
> On Fri, Jan 30, 2026 at 03:25:34PM +0100, Morten Brørup wrote:
> > > From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > > Sent: Friday, 30 January 2026 15.03
> > >
> > > On Fri, Jan 30, 2026 at 02:54:52PM +0100, Morten Brørup wrote:
> > > > > From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > > > > Sent: Friday, 30 January 2026 12.27
> > > > >
> > > > > On Fri, Jan 30, 2026 at 12:16:43PM +0100, Morten Brørup wrote:
> > > > > > > From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > > > > > > Sent: Friday, 30 January 2026 11.53
> > > > > > >
> > > > > > > On Fri, Jan 30, 2026 at 10:46:16AM +0000, Morten Brørup
> wrote:
> > > > > > > > For CPU architectures without strict alignment
> requirements,
> > > > > > > operations on
> > > > > > > > 6-byte Ethernet addresses using three 2-byte operations
> were
> > > > > replaced
> > > > > > > by a
> > > > > > > > 4-byte and a 2-byte operation, i.e. two operations
> instead of
> > > > > three.
> > > > > > > >
> > > > > > > > Comparison functions are pure, so added __rte_pure.
> > > > > > > >
> > > > > > > > Removed superfluous parentheses. (No functional change.)
> > > > > > > >
> > > > > > > > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> > > > > > > > ---
> > > > > > > >  lib/net/rte_ether.h | 19 ++++++++++++++++++-
> > > > > > > >  1 file changed, 18 insertions(+), 1 deletion(-)
> > > > > > > >
> > > > > > > > diff --git a/lib/net/rte_ether.h b/lib/net/rte_ether.h
> > > > > > > > index c9a0b536c3..5552d3c1f6 100644
> > > > > > > > --- a/lib/net/rte_ether.h
> > > > > > > > +++ b/lib/net/rte_ether.h
> > > > > > > > @@ -99,13 +99,19 @@ static_assert(alignof(struct
> > > rte_ether_addr)
> > > > > ==
> > > > > > > 2,
> > > > > > > >   *  True  (1) if the given two ethernet address are the
> > > same;
> > > > > > > >   *  False (0) otherwise.
> > > > > > > >   */
> > > > > > > > +__rte_pure
> > > > > > > >  static inline int rte_is_same_ether_addr(const struct
> > > > > rte_ether_addr
> > > > > > > *ea1,
> > > > > > > >  				     const struct rte_ether_addr
> *ea2)
> > > > > > > >  {
> > > > > > > > +#if !defined(RTE_ARCH_STRICT_ALIGN)
> > > > > > > > +	return ((((const unaligned_uint32_t *)ea1)[0] ^
> ((const
> > > > > > > unaligned_uint32_t *)ea2)[0]) |
> > > > > > > > +			(((const uint16_t *)ea1)[2] ^ ((const
> uint16_t
> > > > > > > *)ea2)[2])) == 0;
> > > > > > > > +#else
> > > > > > > >  	const uint16_t *w1 = (const uint16_t *)ea1;
> > > > > > > >  	const uint16_t *w2 = (const uint16_t *)ea2;
> > > > > > > >
> > > > > > > >  	return ((w1[0] ^ w2[0]) | (w1[1] ^ w2[1]) | (w1[2] ^
> > > > > w2[2])) ==
> > > > > > > 0;
> > > > > > > > +#endif
> > > > > > > >  }
> > > > > > >
> > > > > > > Is this actually faster?
> > > > > >
> > > > > > It's a simple micro-optimization, so I haven't benchmarked
> it.
> > > > > > On x86, the compiled function is simplified and reduced in
> size
> > > from
> > > > > 34 to 24 bytes:
> > > > > >
> > > > > > 00000000004ed650 <review_rte_is_same_ether_addr>:
> > > > > >   4ed650:	0f b7 07             	movzwl (%rdi),%eax
> > > > > >   4ed653:	0f b7 57 02          	movzwl 0x2(%rdi),%edx
> > > > > >   4ed657:	66 33 06             	xor    (%rsi),%ax
> > > > > >   4ed65a:	66 33 56 02          	xor    0x2(%rsi),%dx
> > > > > >   4ed65e:	09 d0                	or     %edx,%eax
> > > > > >   4ed660:	0f b7 57 04          	movzwl 0x4(%rdi),%edx
> > > > > >   4ed664:	66 33 56 04          	xor    0x4(%rsi),%dx
> > > > > >   4ed668:	66 09 d0             	or     %dx,%ax
> > > > > >   4ed66b:	0f 94 c0             	sete   %al
> > > > > >   4ed66e:	0f b6 c0             	movzbl %al,%eax
> > > > > >   4ed671:	c3                   	ret
> > > > > >   4ed672:	66 66 2e 0f 1f 84 00 	data16 cs nopw
> > > 0x0(%rax,%rax,1)
> > > > > >   4ed679:	00 00 00 00
> > > > > >   4ed67d:	0f 1f 00             	nopl   (%rax)
> > > > > >
> > > > > > 00000000004ed680 <rte_is_same_ether_addr_improved>:
> > > > > >   4ed680:	0f b7 47 04          	movzwl 0x4(%rdi),%eax
> > > > > >   4ed684:	66 33 46 04          	xor    0x4(%rsi),%ax
> > > > > >   4ed688:	8b 17                	mov    (%rdi),%edx
> > > > > >   4ed68a:	33 16                	xor    (%rsi),%edx
> > > > > >   4ed68c:	0f b7 c0             	movzwl %ax,%eax
> > > > > >   4ed68f:	09 c2                	or     %eax,%edx
> > > > > >   4ed691:	0f 94 c0             	sete   %al
> > > > > >   4ed694:	0f b6 c0             	movzbl %al,%eax
> > > > > >   4ed697:	c3                   	ret
> > > > > >   4ed698:	0f 1f 84 00 00 00 00 	nopl   0x0(%rax,%rax,1)
> > > > > >   4ed69f:	00
> > > > > >
> > > > > > For reference, memcpy() of 6 bytes (compile time constant)
> also
> > > > > compiles to a 4-byte and a 2-byte operation, not three 2-byte
> > > > > operations.
> > > > > >
> > > > > What about memcmp? Does it compile similarly?
> > > >
> > > > memcmp(a,b,6) on Clang compiles into something very similar.
> > > > memcmp(a,b,6) on GCC compiles into something with a branch after
> the
> > > first 4-byte comparison, with the assumption (regarding static
> branch
> > > prediction) that they are likely to differ.
> > > > I guess GCC's counterproductive behavior was the reason for
> > > originally implementing a manual comparison, instead of simply
> using
> > > memcmp().
> > > >
> > > > BTW, GCC is clever enough to compile 8-byte and 16-byte
> comparisons
> > > into code without branches.
> > > > I guess that's why rte_ipv6_addr_eq() is implemented using
> memcmp()
> > > [1].
> > > >
> > > > [1]:
> > > https://elixir.bootlin.com/dpdk/v25.11/source/lib/net/rte_ip6.h#L68
> > > >
> > > > > Before we start adding ifdefs
> > > > > like this to the code, I'd like to see some measured
> performance
> > > > > benefits
> > > > > from it. While the code may be 10 bytes shorter, does that
> actually
> > > > > translate into a measurable difference in some app?
> > > >
> > > > Excellent question!
> > > > Some quick rudimentary testing shows that it seems to be ~4
> cycles
> > > slower than what it's replacing.
> > > > Reality beats expectations.
> > > >
> > > > I'll drop this patch.
> > > >
> > > If you have the test-case already prepared, can you also check what
> > > memcmp() performs like? Replacing the whole function by memcmp and
> > > punting
> > > the optimization to the compiler would be a nice, though small,
> code
> > > improvement.
> >
> > Good you asked!
> >
> > While setting up the test for memcmp(), I noticed that I had been
> testing my improved function without "inline".
> > With inline (like the original), it's ~1 cycle faster than the
> original.
> > I have restored the patch status to "New".
> >
> > The memcmp() test (not forgetting "inline") performs very close to
> the original.
> >
> 
> If memcmp performs like the original, I'd be tempted to forgo the
> 1cycle
> benefit just to have the shortest simplest code.

Yes, I tend to agree.
A modern C compiler should know how to compile memcmp(a,b,6) into something efficient on the architecture it is compiling for.
E.g. Clang would do it the way I proposed in this patch.

For GCC, the cost of the branch was probably eliminated by the branch predictor when running my test.


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] net: ethernet address comparison optimizations
  2026-01-30 10:46 [PATCH 1/2] net: ethernet address comparison optimizations Morten Brørup
  2026-01-30 10:46 ` [PATCH 2/2] [RFC] net: introduce fast ethernet address comparison function Morten Brørup
  2026-01-30 10:52 ` [PATCH 1/2] net: ethernet address comparison optimizations Bruce Richardson
@ 2026-01-30 16:20 ` Stephen Hemminger
  2026-01-30 16:24   ` Bruce Richardson
  2 siblings, 1 reply; 14+ messages in thread
From: Stephen Hemminger @ 2026-01-30 16:20 UTC (permalink / raw)
  To: Morten Brørup; +Cc: dev

On Fri, 30 Jan 2026 10:46:16 +0000
Morten Brørup <mb@smartsharesystems.com> wrote:

> +__rte_pure
>  static inline int rte_is_same_ether_addr(const struct rte_ether_addr *ea1,
>  				     const struct rte_ether_addr *ea2)
>  {
> +#if !defined(RTE_ARCH_STRICT_ALIGN)
> +	return ((((const unaligned_uint32_t *)ea1)[0] ^ ((const unaligned_uint32_t *)ea2)[0]) |
> +			(((const uint16_t *)ea1)[2] ^ ((const uint16_t *)ea2)[2])) == 0;
> +#else
>  	const uint16_t *w1 = (const uint16_t *)ea1;
>  	const uint16_t *w2 = (const uint16_t *)ea2;
>  
>  	return ((w1[0] ^ w2[0]) | (w1[1] ^ w2[1]) | (w1[2] ^ w2[2])) == 0;
> +#endif
>  }
>  

FYI in Linux:

static inline bool ether_addr_equal(const u8 *addr1, const u8 *addr2)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
	u32 fold = ((*(const u32 *)addr1) ^ (*(const u32 *)addr2)) |
		   ((*(const u16 *)(addr1 + 4)) ^ (*(const u16 *)(addr2 + 4)));

	return fold == 0;
#else
	const u16 *a = (const u16 *)addr1;
	const u16 *b = (const u16 *)addr2;

	return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | (a[2] ^ b[2])) == 0;
#endif
}

In the FreeBSD kernel, there is no helper; they just use memcmp().

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] net: ethernet address comparison optimizations
  2026-01-30 16:20 ` Stephen Hemminger
@ 2026-01-30 16:24   ` Bruce Richardson
  2026-01-30 16:31     ` Konstantin Ananyev
  0 siblings, 1 reply; 14+ messages in thread
From: Bruce Richardson @ 2026-01-30 16:24 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Morten Brørup, dev

On Fri, Jan 30, 2026 at 08:20:28AM -0800, Stephen Hemminger wrote:
> On Fri, 30 Jan 2026 10:46:16 +0000
> Morten Brørup <mb@smartsharesystems.com> wrote:
> 
> > +__rte_pure
> >  static inline int rte_is_same_ether_addr(const struct rte_ether_addr *ea1,
> >  				     const struct rte_ether_addr *ea2)
> >  {
> > +#if !defined(RTE_ARCH_STRICT_ALIGN)
> > +	return ((((const unaligned_uint32_t *)ea1)[0] ^ ((const unaligned_uint32_t *)ea2)[0]) |
> > +			(((const uint16_t *)ea1)[2] ^ ((const uint16_t *)ea2)[2])) == 0;
> > +#else
> >  	const uint16_t *w1 = (const uint16_t *)ea1;
> >  	const uint16_t *w2 = (const uint16_t *)ea2;
> >  
> >  	return ((w1[0] ^ w2[0]) | (w1[1] ^ w2[1]) | (w1[2] ^ w2[2])) == 0;
> > +#endif
> >  }
> >  
> 
> FYI in Linux:
> 
> static inline bool ether_addr_equal(const u8 *addr1, const u8 *addr2)
> {
> #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
> 	u32 fold = ((*(const u32 *)addr1) ^ (*(const u32 *)addr2)) |
> 		   ((*(const u16 *)(addr1 + 4)) ^ (*(const u16 *)(addr2 + 4)));
> 
> 	return fold == 0;
> #else
> 	const u16 *a = (const u16 *)addr1;
> 	const u16 *b = (const u16 *)addr2;
> 
> 	return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | (a[2] ^ b[2])) == 0;
> #endif
> }
> 
> In FreeBSD kernel, there is no helper they just use memcmp

+1 for just memcmp :-)

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH 1/2] net: ethernet address comparison optimizations
  2026-01-30 16:24   ` Bruce Richardson
@ 2026-01-30 16:31     ` Konstantin Ananyev
  0 siblings, 0 replies; 14+ messages in thread
From: Konstantin Ananyev @ 2026-01-30 16:31 UTC (permalink / raw)
  To: Bruce Richardson, Stephen Hemminger; +Cc: Morten Brørup, dev@dpdk.org



> >
> > > +__rte_pure
> > >  static inline int rte_is_same_ether_addr(const struct rte_ether_addr *ea1,
> > >  				     const struct rte_ether_addr *ea2)
> > >  {
> > > +#if !defined(RTE_ARCH_STRICT_ALIGN)
> > > +	return ((((const unaligned_uint32_t *)ea1)[0] ^ ((const
> unaligned_uint32_t *)ea2)[0]) |
> > > +			(((const uint16_t *)ea1)[2] ^ ((const uint16_t *)ea2)[2]))
> == 0;
> > > +#else
> > >  	const uint16_t *w1 = (const uint16_t *)ea1;
> > >  	const uint16_t *w2 = (const uint16_t *)ea2;
> > >
> > >  	return ((w1[0] ^ w2[0]) | (w1[1] ^ w2[1]) | (w1[2] ^ w2[2])) == 0;
> > > +#endif
> > >  }
> > >
> >
> > FYI in Linux:
> >
> > static inline bool ether_addr_equal(const u8 *addr1, const u8 *addr2)
> > {
> > #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
> > 	u32 fold = ((*(const u32 *)addr1) ^ (*(const u32 *)addr2)) |
> > 		   ((*(const u16 *)(addr1 + 4)) ^ (*(const u16 *)(addr2 + 4)));
> >
> > 	return fold == 0;
> > #else
> > 	const u16 *a = (const u16 *)addr1;
> > 	const u16 *b = (const u16 *)addr2;
> >
> > 	return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | (a[2] ^ b[2])) == 0;
> > #endif
> > }
> >
> > In FreeBSD kernel, there is no helper they just use memcmp
> 
> +1 for just memcmp :-)

Same thoughts :)

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2026-01-30 16:31 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-01-30 10:46 [PATCH 1/2] net: ethernet address comparison optimizations Morten Brørup
2026-01-30 10:46 ` [PATCH 2/2] [RFC] net: introduce fast ethernet address comparison function Morten Brørup
2026-01-30 14:03   ` Morten Brørup
2026-01-30 10:52 ` [PATCH 1/2] net: ethernet address comparison optimizations Bruce Richardson
2026-01-30 11:16   ` Morten Brørup
2026-01-30 11:26     ` Bruce Richardson
2026-01-30 13:54       ` Morten Brørup
2026-01-30 14:02         ` Bruce Richardson
2026-01-30 14:25           ` Morten Brørup
2026-01-30 14:32             ` Bruce Richardson
2026-01-30 14:59               ` Morten Brørup
2026-01-30 16:20 ` Stephen Hemminger
2026-01-30 16:24   ` Bruce Richardson
2026-01-30 16:31     ` Konstantin Ananyev

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox