From: Shreesh Adiga <16567adigashreesh@gmail.com>
To: Wathsala Vithanage <wathsala.vithanage@arm.com>
Cc: dev@dpdk.org
Subject: [PATCH] net/crc: add 4x folding loop for aarch64 NEON implementation
Date: Tue, 16 Jun 2026 14:41:58 +0530 [thread overview]
Message-ID: <20260616091158.731075-1-16567adigashreesh@gmail.com> (raw)
Add a loop that maintains 4 fold registers and processes 64 bytes
per iteration. The 4 fold registers are then reduced to a single
16-byte fold, similar to the x86 SSE implementation. This technique
is described in the following paper by Intel:
"Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
For large input sizes such as 1024 bytes, this gives a roughly 2x
performance improvement on Cortex-X925, due to better instruction-level
parallelism (ILP).
Signed-off-by: Shreesh Adiga <16567adigashreesh@gmail.com>
---
lib/net/net_crc_neon.c | 51 +++++++++++++++++++++++++++++++++++-------
1 file changed, 43 insertions(+), 8 deletions(-)
diff --git a/lib/net/net_crc_neon.c b/lib/net/net_crc_neon.c
index cee75ddd31..fc817e54f5 100644
--- a/lib/net/net_crc_neon.c
+++ b/lib/net/net_crc_neon.c
@@ -16,6 +16,7 @@
/** PMULL CRC computation context structure */
struct crc_pmull_ctx {
uint64x2_t rk1_rk2;
+ uint64x2_t rk3_rk4;
uint64x2_t rk5_rk6;
uint64x2_t rk7_rk8;
};
@@ -136,9 +137,36 @@ crc32_eth_calc_pmull(
temp = vreinterpretq_u64_u32(vsetq_lane_u32(crc, vmovq_n_u32(0), 0));
/**
- * Folding all data into single 16 byte data block
- * Assumes: fold holds first 16 bytes of data
+ * Folding all data into 4 parallel 16 byte data block
+ * Later folds 4 parallel blocks into single fold block
*/
+ if (likely(data_len >= 64)) {
+ uint64x2_t fold1, fold2, fold3, fold4;
+ uint64x2_t temp1, temp2, temp3, temp4;
+ fold1 = vld1q_u64((const uint64_t *)(data + 0));
+ fold2 = vld1q_u64((const uint64_t *)(data + 16));
+ fold3 = vld1q_u64((const uint64_t *)(data + 32));
+ fold4 = vld1q_u64((const uint64_t *)(data + 48));
+ fold1 = veorq_u64(fold1, temp);
+ k = params->rk1_rk2;
+
+ for (n = 64; (n + 64) <= data_len; n += 64) {
+ temp1 = vld1q_u64((const uint64_t *)&data[n + 0]);
+ temp2 = vld1q_u64((const uint64_t *)&data[n + 16]);
+ temp3 = vld1q_u64((const uint64_t *)&data[n + 32]);
+ temp4 = vld1q_u64((const uint64_t *)&data[n + 48]);
+ fold1 = crcr32_folding_round(temp1, k, fold1);
+ fold2 = crcr32_folding_round(temp2, k, fold2);
+ fold3 = crcr32_folding_round(temp3, k, fold3);
+ fold4 = crcr32_folding_round(temp4, k, fold4);
+ }
+ k = params->rk3_rk4;
+ fold1 = crcr32_folding_round(fold2, k, fold1);
+ fold1 = crcr32_folding_round(fold3, k, fold1);
+ fold = crcr32_folding_round(fold4, k, fold1);
+ goto single_fold_loop;
+ }
+
if (unlikely(data_len < 32)) {
if (unlikely(data_len == 16)) {
/* 16 bytes */
@@ -176,9 +204,12 @@ crc32_eth_calc_pmull(
fold = vld1q_u64((const uint64_t *)data);
fold = veorq_u64(fold, temp);
- /** Main folding loop - the last 16 bytes is processed separately */
- k = params->rk1_rk2;
- for (n = 16; (n + 16) <= data_len; n += 16) {
+ /** Single folding loop - the last 16 bytes is processed separately */
+ k = params->rk3_rk4;
+ n = 16;
+
+single_fold_loop:
+ for (; (n + 16) <= data_len; n += 16) {
temp = vld1q_u64((const uint64_t *)&data[n]);
fold = crcr32_folding_round(temp, k, fold);
}
@@ -194,7 +225,7 @@ crc32_eth_calc_pmull(
mask = vshift_bytes_left(vdupq_n_u64(-1), 16 - rem);
b = vorrq_u64(b, vandq_u64(mask, last16));
- /* k = rk1 & rk2 */
+ /* k = rk3 & rk4 */
temp = vreinterpretq_u64_p128(vmull_p64(
vgetq_lane_p64(vreinterpretq_p64_u64(a), 1),
vgetq_lane_p64(vreinterpretq_p64_u64(k), 0)));
@@ -221,22 +252,26 @@ void
rte_net_crc_neon_init(void)
{
/* Initialize CRC16 data */
- uint64_t ccitt_k1_k2[2] = {0x189aeLLU, 0x8e10LLU};
+ uint64_t ccitt_k1_k2[2] = {0x14ff2LLU, 0x19a3cLLU};
+ uint64_t ccitt_k3_k4[2] = {0x189aeLLU, 0x8e10LLU};
uint64_t ccitt_k5_k6[2] = {0x189aeLLU, 0x114aaLLU};
uint64_t ccitt_k7_k8[2] = {0x11c581910LLU, 0x10811LLU};
/* Initialize CRC32 data */
- uint64_t eth_k1_k2[2] = {0xccaa009eLLU, 0x1751997d0LLU};
+ uint64_t eth_k1_k2[2] = {0x1c6e41596LLU, 0x154442bd4LLU};
+ uint64_t eth_k3_k4[2] = {0xccaa009eLLU, 0x1751997d0LLU};
uint64_t eth_k5_k6[2] = {0xccaa009eLLU, 0x163cd6124LLU};
uint64_t eth_k7_k8[2] = {0x1f7011640LLU, 0x1db710641LLU};
/** Save the params in context structure */
crc16_ccitt_pmull.rk1_rk2 = vld1q_u64(ccitt_k1_k2);
+ crc16_ccitt_pmull.rk3_rk4 = vld1q_u64(ccitt_k3_k4);
crc16_ccitt_pmull.rk5_rk6 = vld1q_u64(ccitt_k5_k6);
crc16_ccitt_pmull.rk7_rk8 = vld1q_u64(ccitt_k7_k8);
/** Save the params in context structure */
crc32_eth_pmull.rk1_rk2 = vld1q_u64(eth_k1_k2);
+ crc32_eth_pmull.rk3_rk4 = vld1q_u64(eth_k3_k4);
crc32_eth_pmull.rk5_rk6 = vld1q_u64(eth_k5_k6);
crc32_eth_pmull.rk7_rk8 = vld1q_u64(eth_k7_k8);
}
--
2.53.0
reply other threads:[~2026-06-16 9:12 UTC|newest]
Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260616091158.731075-1-16567adigashreesh@gmail.com \
--to=16567adigashreesh@gmail.com \
--cc=dev@dpdk.org \
--cc=wathsala.vithanage@arm.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox