* [PATCH] net/crc: add 4x folding loop for x86 SSE implementation
@ 2026-06-09 7:57 Shreesh Adiga
2026-06-11 17:06 ` Stephen Hemminger
0 siblings, 1 reply; 3+ messages in thread
From: Shreesh Adiga @ 2026-06-09 7:57 UTC (permalink / raw)
To: Jasvinder Singh, Bruce Richardson, Konstantin Ananyev; +Cc: dev
Add a 64-byte loop that maintains 4 fold registers and processes
64 bytes at a time. The 4 fold registers are then reduced to a single
16-byte fold, similar to the AVX512 implementation. This technique is
described in the paper by Intel:
"Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
This results in roughly 50% performance improvement due to better ILP
for large input sizes like 1024.
Signed-off-by: Shreesh Adiga <16567adigashreesh@gmail.com>
---
lib/net/net_crc_sse.c | 59 +++++++++++++++++++++++++++++++++++--------
1 file changed, 48 insertions(+), 11 deletions(-)
diff --git a/lib/net/net_crc_sse.c b/lib/net/net_crc_sse.c
index 3b6fbfecac..dfef8ecc59 100644
--- a/lib/net/net_crc_sse.c
+++ b/lib/net/net_crc_sse.c
@@ -14,6 +14,7 @@
/** PCLMULQDQ CRC computation context structure */
struct crc_pclmulqdq_ctx {
__m128i rk1_rk2;
+ __m128i rk3_rk4;
__m128i rk5_rk6;
__m128i rk7_rk8;
};
@@ -150,9 +151,36 @@ crc32_eth_calc_pclmulqdq(
temp = _mm_insert_epi32(_mm_setzero_si128(), crc, 0);
/**
- * Folding all data into single 16 byte data block
- * Assumes: fold holds first 16 bytes of data
+ * Folding all data into 4 parallel 16 byte data block
+ * Later folds 4 parallel blocks into single fold block
*/
+ if (likely(data_len >= 64)) {
+ __m128i fold1, fold2, fold3, fold4;
+ __m128i temp1, temp2, temp3, temp4;
+ fold1 = _mm_loadu_si128((const __m128i *)(data + 0));
+ fold2 = _mm_loadu_si128((const __m128i *)(data + 16));
+ fold3 = _mm_loadu_si128((const __m128i *)(data + 32));
+ fold4 = _mm_loadu_si128((const __m128i *)(data + 48));
+ fold1 = _mm_xor_si128(fold1, temp);
+ k = params->rk1_rk2;
+
+ for (n = 64; (n + 64) <= data_len; n += 64) {
+ temp1 = _mm_loadu_si128((const __m128i *)&data[n]);
+ temp2 = _mm_loadu_si128((const __m128i *)&data[n + 16]);
+ temp3 = _mm_loadu_si128((const __m128i *)&data[n + 32]);
+ temp4 = _mm_loadu_si128((const __m128i *)&data[n + 48]);
+ fold1 = crcr32_folding_round(temp1, k, fold1);
+ fold2 = crcr32_folding_round(temp2, k, fold2);
+ fold3 = crcr32_folding_round(temp3, k, fold3);
+ fold4 = crcr32_folding_round(temp4, k, fold4);
+ }
+
+ k = params->rk3_rk4;
+ fold1 = crcr32_folding_round(fold2, k, fold1);
+ fold1 = crcr32_folding_round(fold3, k, fold1);
+ fold = crcr32_folding_round(fold4, k, fold1);
+ goto single_fold_loop;
+ }
if (unlikely(data_len < 32)) {
if (unlikely(data_len == 16)) {
@@ -182,7 +210,7 @@ crc32_eth_calc_pclmulqdq(
fold = _mm_loadu_si128((const __m128i *)data);
fold = _mm_xor_si128(fold, temp);
n = 16;
- k = params->rk1_rk2;
+ k = params->rk3_rk4;
goto partial_bytes;
}
@@ -191,9 +219,12 @@ crc32_eth_calc_pclmulqdq(
fold = _mm_loadu_si128((const __m128i *)data);
fold = _mm_xor_si128(fold, temp);
- /** Main folding loop - the last 16 bytes is processed separately */
- k = params->rk1_rk2;
- for (n = 16; (n + 16) <= data_len; n += 16) {
+ /** Single folding loop - the last 16 bytes is processed separately */
+ k = params->rk3_rk4;
+ n = 16;
+
+single_fold_loop:
+ for (; (n + 16) <= data_len; n += 16) {
temp = _mm_loadu_si128((const __m128i *)&data[n]);
fold = crcr32_folding_round(temp, k, fold);
}
@@ -236,12 +267,14 @@ crc32_eth_calc_pclmulqdq(
void
rte_net_crc_sse42_init(void)
{
- uint64_t k1, k2, k5, k6;
+ uint64_t k1, k2, k3, k4, k5, k6;
uint64_t p = 0, q = 0;
/** Initialize CRC16 data */
- k1 = 0x189aeLLU;
- k2 = 0x8e10LLU;
+ k1 = 0x14ff2LLU;
+ k2 = 0x19a3cLLU;
+ k3 = 0x189aeLLU;
+ k4 = 0x8e10LLU;
k5 = 0x189aeLLU;
k6 = 0x114aaLLU;
q = 0x11c581910LLU;
@@ -249,12 +282,15 @@ rte_net_crc_sse42_init(void)
/** Save the params in context structure */
crc16_ccitt_pclmulqdq.rk1_rk2 = _mm_set_epi64x(k2, k1);
+ crc16_ccitt_pclmulqdq.rk3_rk4 = _mm_set_epi64x(k4, k3);
crc16_ccitt_pclmulqdq.rk5_rk6 = _mm_set_epi64x(k6, k5);
crc16_ccitt_pclmulqdq.rk7_rk8 = _mm_set_epi64x(p, q);
/** Initialize CRC32 data */
- k1 = 0xccaa009eLLU;
- k2 = 0x1751997d0LLU;
+ k1 = 0x1c6e41596LLU;
+ k2 = 0x154442bd4LLU;
+ k3 = 0xccaa009eLLU;
+ k4 = 0x1751997d0LLU;
k5 = 0xccaa009eLLU;
k6 = 0x163cd6124LLU;
q = 0x1f7011640LLU;
@@ -262,6 +298,7 @@ rte_net_crc_sse42_init(void)
/** Save the params in context structure */
crc32_eth_pclmulqdq.rk1_rk2 = _mm_set_epi64x(k2, k1);
+ crc32_eth_pclmulqdq.rk3_rk4 = _mm_set_epi64x(k4, k3);
crc32_eth_pclmulqdq.rk5_rk6 = _mm_set_epi64x(k6, k5);
crc32_eth_pclmulqdq.rk7_rk8 = _mm_set_epi64x(p, q);
}
--
2.53.0
^ permalink raw reply related [flat|nested] 3+ messages in thread
* Re: [PATCH] net/crc: add 4x folding loop for x86 SSE implementation
2026-06-09 7:57 [PATCH] net/crc: add 4x folding loop for x86 SSE implementation Shreesh Adiga
@ 2026-06-11 17:06 ` Stephen Hemminger
2026-06-12 3:02 ` Shreesh Adiga
0 siblings, 1 reply; 3+ messages in thread
From: Stephen Hemminger @ 2026-06-11 17:06 UTC (permalink / raw)
To: Shreesh Adiga; +Cc: Jasvinder Singh, Bruce Richardson, Konstantin Ananyev, dev
On Tue, 9 Jun 2026 13:27:12 +0530
Shreesh Adiga <16567adigashreesh@gmail.com> wrote:
> Add a 64-byte loop that maintains 4 fold registers and processes
> 64 bytes at a time. The 4x fold registers is then reduced to 16 byte
> single fold, similar to AVX512 implementation. This technique is
> described in the paper by Intel:
> "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
>
> This results in roughly 50% performance improvement due to better ILP
> for large input sizes like 1024.
>
> Signed-off-by: Shreesh Adiga <16567adigashreesh@gmail.com>
> ---
Looks good, applied to next-net.
A couple of nits from more detailed AI review, that you still might want to look at:
The current crc_autotest does not exercise the new 64-byte CRC16 path.
Its CRC32 vectors are 1512 and 348 bytes, so the CRC32 4x loop is
covered — but the largest CRC16 vector is 32 bytes, all three CRC16
tests being ≤32. So the new CRC16 rk1_rk2 (64-byte fold) constants ship
untested in CI. My exhaustive test confirms they're correct, but a
future regression there wouldn't be caught. Suggest adding a CRC16
vector ≥64 bytes, ideally a non-multiple of 64 (e.g. 80 or 100) so it
hits the 4x loop, the single-fold tail, and the partial-bytes path
together.
In partial_bytes the comment /* k = rk1 & rk2 */ is now stale
— after the patch k holds rk3_rk4 on every path reaching it.
Not introduced by this patch, but the patch is what made it wrong;
worth fixing in passing.
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PATCH] net/crc: add 4x folding loop for x86 SSE implementation
2026-06-11 17:06 ` Stephen Hemminger
@ 2026-06-12 3:02 ` Shreesh Adiga
0 siblings, 0 replies; 3+ messages in thread
From: Shreesh Adiga @ 2026-06-12 3:02 UTC (permalink / raw)
To: Stephen Hemminger
Cc: Jasvinder Singh, Bruce Richardson, Konstantin Ananyev, dev
[-- Attachment #1: Type: text/plain, Size: 1955 bytes --]
On Thu, Jun 11, 2026 at 10:36 PM Stephen Hemminger <
stephen@networkplumber.org> wrote:
> On Tue, 9 Jun 2026 13:27:12 +0530
> Shreesh Adiga <16567adigashreesh@gmail.com> wrote:
>
> > Add a 64-byte loop that maintains 4 fold registers and processes
> > 64 bytes at a time. The 4x fold registers is then reduced to 16 byte
> > single fold, similar to AVX512 implementation. This technique is
> > described in the paper by Intel:
> > "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
> >
> > This results in roughly 50% performance improvement due to better ILP
> > for large input sizes like 1024.
> >
> > Signed-off-by: Shreesh Adiga <16567adigashreesh@gmail.com>
> > ---
>
> Looks good applied to next-net.
>
> A couple of nits from more detailed AI review, that you still might want
> to look at:
>
> The current crc_autotest does not exercise the new 64-byte CRC16 path.
> Its CRC32 vectors are 1512 and 348 bytes, so the CRC32 4x loop is
> covered — but the largest CRC16 vector is 32 bytes, all three CRC16
> tests being ≤32. So the new CRC16 rk1_rk2 (64-byte fold) constants ship
> untested in CI. My exhaustive test confirms they're correct, but a
> future regression there wouldn't be caught. Suggest adding a CRC16
> vector ≥64 bytes, ideally a non-multiple of 64 (e.g. 80 or 100) so it
> hits the 4x loop, the single-fold tail, and the partial-bytes path
> together.
>
> In partial_bytes the comment /* k = rk1 & rk2 */ is now stale
> — after the patch k holds rk3_rk4 on every path reaching it.
> Not introduced by this patch, but the patch is what made it wrong;
> worth fixing in passing.
>
I've submitted a couple of follow-up patches that should address the above:
https://patches.dpdk.org/project/dpdk/patch/20260612023745.275608-1-16567adigashreesh@gmail.com/
https://patches.dpdk.org/project/dpdk/patch/20260612025135.298226-1-16567adigashreesh@gmail.com/
[-- Attachment #2: Type: text/html, Size: 2806 bytes --]
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2026-06-12 3:02 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-06-09 7:57 [PATCH] net/crc: add 4x folding loop for x86 SSE implementation Shreesh Adiga
2026-06-11 17:06 ` Stephen Hemminger
2026-06-12 3:02 ` Shreesh Adiga
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox