[PATCH] net/crc: add 4x folding loop for x86 SSE implementation

DPDK-dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH] net/crc: add 4x folding loop for x86 SSE implementation
@ 2026-06-09  7:57 Shreesh Adiga
  2026-06-11 17:06 ` Stephen Hemminger
  0 siblings, 1 reply; 3+ messages in thread
From: Shreesh Adiga @ 2026-06-09  7:57 UTC (permalink / raw)
  To: Jasvinder Singh, Bruce Richardson, Konstantin Ananyev; +Cc: dev

Add a 64-byte loop that maintains 4 fold registers and processes
64 bytes at a time. The 4 fold registers are then reduced to a single
16-byte fold, similar to the AVX512 implementation. This technique is
described in the paper by Intel:
"Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"

This results in roughly 50% performance improvement due to better ILP
for large input sizes like 1024.

Signed-off-by: Shreesh Adiga <16567adigashreesh@gmail.com>
---
 lib/net/net_crc_sse.c | 59 +++++++++++++++++++++++++++++++++++--------
 1 file changed, 48 insertions(+), 11 deletions(-)

diff --git a/lib/net/net_crc_sse.c b/lib/net/net_crc_sse.c
index 3b6fbfecac..dfef8ecc59 100644
--- a/lib/net/net_crc_sse.c
+++ b/lib/net/net_crc_sse.c
@@ -14,6 +14,7 @@
 /** PCLMULQDQ CRC computation context structure */
 struct crc_pclmulqdq_ctx {
 	__m128i rk1_rk2;
+	__m128i rk3_rk4;
 	__m128i rk5_rk6;
 	__m128i rk7_rk8;
 };
@@ -150,9 +151,36 @@ crc32_eth_calc_pclmulqdq(
 	temp = _mm_insert_epi32(_mm_setzero_si128(), crc, 0);
 
 	/**
-	 * Folding all data into single 16 byte data block
-	 * Assumes: fold holds first 16 bytes of data
+	 * Folding all data into 4 parallel 16 byte data block
+	 * Later folds 4 parallel blocks into single fold block
 	 */
+	if (likely(data_len >= 64)) {
+		__m128i fold1, fold2, fold3, fold4;
+		__m128i temp1, temp2, temp3, temp4;
+		fold1 = _mm_loadu_si128((const __m128i *)(data +  0));
+		fold2 = _mm_loadu_si128((const __m128i *)(data + 16));
+		fold3 = _mm_loadu_si128((const __m128i *)(data + 32));
+		fold4 = _mm_loadu_si128((const __m128i *)(data + 48));
+		fold1 = _mm_xor_si128(fold1, temp);
+		k = params->rk1_rk2;
+
+		for (n = 64; (n + 64) <= data_len; n += 64) {
+			temp1 = _mm_loadu_si128((const __m128i *)&data[n]);
+			temp2 = _mm_loadu_si128((const __m128i *)&data[n + 16]);
+			temp3 = _mm_loadu_si128((const __m128i *)&data[n + 32]);
+			temp4 = _mm_loadu_si128((const __m128i *)&data[n + 48]);
+			fold1 = crcr32_folding_round(temp1, k, fold1);
+			fold2 = crcr32_folding_round(temp2, k, fold2);
+			fold3 = crcr32_folding_round(temp3, k, fold3);
+			fold4 = crcr32_folding_round(temp4, k, fold4);
+		}
+
+		k = params->rk3_rk4;
+		fold1 = crcr32_folding_round(fold2, k, fold1);
+		fold1 = crcr32_folding_round(fold3, k, fold1);
+		fold = crcr32_folding_round(fold4, k, fold1);
+		goto single_fold_loop;
+	}
 
 	if (unlikely(data_len < 32)) {
 		if (unlikely(data_len == 16)) {
@@ -182,7 +210,7 @@ crc32_eth_calc_pclmulqdq(
 		fold = _mm_loadu_si128((const __m128i *)data);
 		fold = _mm_xor_si128(fold, temp);
 		n = 16;
-		k = params->rk1_rk2;
+		k = params->rk3_rk4;
 		goto partial_bytes;
 	}
 
@@ -191,9 +219,12 @@ crc32_eth_calc_pclmulqdq(
 	fold = _mm_loadu_si128((const __m128i *)data);
 	fold = _mm_xor_si128(fold, temp);
 
-	/** Main folding loop - the last 16 bytes is processed separately */
-	k = params->rk1_rk2;
-	for (n = 16; (n + 16) <= data_len; n += 16) {
+	/** Single folding loop - the last 16 bytes is processed separately */
+	k = params->rk3_rk4;
+	n = 16;
+
+single_fold_loop:
+	for (; (n + 16) <= data_len; n += 16) {
 		temp = _mm_loadu_si128((const __m128i *)&data[n]);
 		fold = crcr32_folding_round(temp, k, fold);
 	}
@@ -236,12 +267,14 @@ crc32_eth_calc_pclmulqdq(
 void
 rte_net_crc_sse42_init(void)
 {
-	uint64_t k1, k2, k5, k6;
+	uint64_t k1, k2, k3, k4, k5, k6;
 	uint64_t p = 0, q = 0;
 
 	/** Initialize CRC16 data */
-	k1 = 0x189aeLLU;
-	k2 = 0x8e10LLU;
+	k1 = 0x14ff2LLU;
+	k2 = 0x19a3cLLU;
+	k3 = 0x189aeLLU;
+	k4 = 0x8e10LLU;
 	k5 = 0x189aeLLU;
 	k6 = 0x114aaLLU;
 	q =  0x11c581910LLU;
@@ -249,12 +282,15 @@ rte_net_crc_sse42_init(void)
 
 	/** Save the params in context structure */
 	crc16_ccitt_pclmulqdq.rk1_rk2 = _mm_set_epi64x(k2, k1);
+	crc16_ccitt_pclmulqdq.rk3_rk4 = _mm_set_epi64x(k4, k3);
 	crc16_ccitt_pclmulqdq.rk5_rk6 = _mm_set_epi64x(k6, k5);
 	crc16_ccitt_pclmulqdq.rk7_rk8 = _mm_set_epi64x(p, q);
 
 	/** Initialize CRC32 data */
-	k1 = 0xccaa009eLLU;
-	k2 = 0x1751997d0LLU;
+	k1 = 0x1c6e41596LLU;
+	k2 = 0x154442bd4LLU;
+	k3 = 0xccaa009eLLU;
+	k4 = 0x1751997d0LLU;
 	k5 = 0xccaa009eLLU;
 	k6 = 0x163cd6124LLU;
 	q =  0x1f7011640LLU;
@@ -262,6 +298,7 @@ rte_net_crc_sse42_init(void)
 
 	/** Save the params in context structure */
 	crc32_eth_pclmulqdq.rk1_rk2 = _mm_set_epi64x(k2, k1);
+	crc32_eth_pclmulqdq.rk3_rk4 = _mm_set_epi64x(k4, k3);
 	crc32_eth_pclmulqdq.rk5_rk6 = _mm_set_epi64x(k6, k5);
 	crc32_eth_pclmulqdq.rk7_rk8 = _mm_set_epi64x(p, q);
 }
-- 
2.53.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH] net/crc: add 4x folding loop for x86 SSE implementation
  2026-06-09  7:57 [PATCH] net/crc: add 4x folding loop for x86 SSE implementation Shreesh Adiga
@ 2026-06-11 17:06 ` Stephen Hemminger
  2026-06-12  3:02   ` Shreesh Adiga
  0 siblings, 1 reply; 3+ messages in thread
From: Stephen Hemminger @ 2026-06-11 17:06 UTC (permalink / raw)
  To: Shreesh Adiga; +Cc: Jasvinder Singh, Bruce Richardson, Konstantin Ananyev, dev

On Tue,  9 Jun 2026 13:27:12 +0530
Shreesh Adiga <16567adigashreesh@gmail.com> wrote:

> Add a 64-byte loop that maintains 4 fold registers and processes
> 64 bytes at a time. The 4x fold registers is then reduced to 16 byte
> single fold, similar to AVX512 implementation. This technique is
> described in the paper by Intel:
> "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
> 
> This results in roughly 50% performance improvement due to better ILP
> for large input sizes like 1024.
> 
> Signed-off-by: Shreesh Adiga <16567adigashreesh@gmail.com>
> ---

Looks good; applied to next-net.

A couple of nits from a more detailed AI review that you might still want to look at:

The current crc_autotest does not exercise the new 64-byte CRC16 path.
Its CRC32 vectors are 1512 and 348 bytes, so the CRC32 4x loop is
covered — but the largest CRC16 vector is 32 bytes, all three CRC16
tests being ≤32. So the new CRC16 rk1_rk2 (64-byte fold) constants ship
untested in CI. My exhaustive test confirms they're correct, but a
future regression there wouldn't be caught. Suggest adding a CRC16
vector ≥64 bytes, ideally a non-multiple of 64 (e.g. 80 or 100) so it
hits the 4x loop, the single-fold tail, and the partial-bytes path
together.

In partial_bytes the comment /* k = rk1 & rk2 */ is now stale
 — after the patch k holds rk3_rk4 on every path reaching it.
Not introduced by this patch, but the patch is what made it wrong;
worth fixing in passing.

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] net/crc: add 4x folding loop for x86 SSE implementation
  2026-06-11 17:06 ` Stephen Hemminger
@ 2026-06-12  3:02   ` Shreesh Adiga
  0 siblings, 0 replies; 3+ messages in thread
From: Shreesh Adiga @ 2026-06-12  3:02 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Jasvinder Singh, Bruce Richardson, Konstantin Ananyev, dev

[-- Attachment #1: Type: text/plain, Size: 1955 bytes --]

On Thu, Jun 11, 2026 at 10:36 PM Stephen Hemminger <
stephen@networkplumber.org> wrote:

> On Tue,  9 Jun 2026 13:27:12 +0530
> Shreesh Adiga <16567adigashreesh@gmail.com> wrote:
>
> > Add a 64-byte loop that maintains 4 fold registers and processes
> > 64 bytes at a time. The 4x fold registers is then reduced to 16 byte
> > single fold, similar to AVX512 implementation. This technique is
> > described in the paper by Intel:
> > "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ
> Instruction"
> >
> > This results in roughly 50% performance improvement due to better ILP
> > for large input sizes like 1024.
> >
> > Signed-off-by: Shreesh Adiga <16567adigashreesh@gmail.com>
> > ---
>
> Looks good applied to next-net.
>
> A couple of nits from more detailed AI review, that you still might want
> to look at:
>
> The current crc_autotest does not exercise the new 64-byte CRC16 path.
> Its CRC32 vectors are 1512 and 348 bytes, so the CRC32 4x loop is
> covered — but the largest CRC16 vector is 32 bytes, all three CRC16
> tests being ≤32. So the new CRC16 rk1_rk2 (64-byte fold) constants ship
> untested in CI. My exhaustive test confirms they're correct, but a
> future regression there wouldn't be caught. Suggest adding a CRC16
> vector ≥64 bytes, ideally a non-multiple of 64 (e.g. 80 or 100) so it
> hits the 4x loop, the single-fold tail, and the partial-bytes path
> together.
>
> In partial_bytes the comment /* k = rk1 & rk2 */ is now stale
>  — after the patch k holds rk3_rk4 on every path reaching it.
> Not introduced by this patch, but the patch is what made it wrong;
> worth fixing in passing.
>
I've submitted a couple of follow-up patches that should address the above:
https://patches.dpdk.org/project/dpdk/patch/20260612023745.275608-1-16567adigashreesh@gmail.com/
https://patches.dpdk.org/project/dpdk/patch/20260612025135.298226-1-16567adigashreesh@gmail.com/

[-- Attachment #2: Type: text/html, Size: 2806 bytes --]

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2026-06-12  3:02 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-06-09  7:57 [PATCH] net/crc: add 4x folding loop for x86 SSE implementation Shreesh Adiga
2026-06-11 17:06 ` Stephen Hemminger
2026-06-12  3:02   ` Shreesh Adiga

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox