[PATCH 1/2] net: add arm64 neon version of CRC compute APIs

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 1/2] net: add arm64 neon version of CRC compute APIs
@ 2017-04-27 14:06 Ashwin Sekhar T K
  2017-05-04  6:56 ` [PATCH v3 1/4] mk: add crypto capability for generic armv8a and thunderx Ashwin Sekhar T K
                   ` (3 more replies)
  0 siblings, 4 replies; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-04-27 14:06 UTC (permalink / raw)
  To: thomas, jasvinder.singh, viktorin, jerin.jacob, jianbo.liu
  Cc: dev, Ashwin Sekhar T K

* Added CRC compute APIs for arm64 utilizing the pmull capability
* Added new file net_crc_neon.h to hold the arm64 pmull CRC
  implementation
* Added crypto capability in compilation of generic armv8 and
  thunderx targets
* pmull CRC version is used only after checking the pmull capability
  at runtime
* Verified the changes with crc_autotest unit test case

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
---
 MAINTAINERS                                       |   1 +
 lib/librte_eal/common/include/arch/arm/rte_vect.h |  45 +++
 lib/librte_net/net_crc_neon.h                     | 357 ++++++++++++++++++++++
 lib/librte_net/rte_net_crc.c                      |  32 +-
 lib/librte_net/rte_net_crc.h                      |   2 +
 mk/machine/armv8a/rte.vars.mk                     |   2 +-
 mk/machine/thunderx/rte.vars.mk                   |   2 +-
 mk/rte.cpuflags.mk                                |   3 +
 mk/toolchain/gcc/rte.toolchain-compat.mk          |   1 +
 9 files changed, 438 insertions(+), 7 deletions(-)
 create mode 100644 lib/librte_net/net_crc_neon.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 576d60a..283743e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -149,6 +149,7 @@ F: lib/librte_lpm/rte_lpm_neon.h
 F: lib/librte_hash/rte*_arm64.h
 F: lib/librte_efd/rte*_arm64.h
 F: lib/librte_table/rte*_arm64.h
+F: lib/librte_net/net_crc_neon.h
 F: drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
 F: drivers/net/i40e/i40e_rxtx_vec_neon.c
 F: drivers/net/virtio/virtio_rxtx_simple_neon.c
diff --git a/lib/librte_eal/common/include/arch/arm/rte_vect.h b/lib/librte_eal/common/include/arch/arm/rte_vect.h
index 4107c99..9a3dfdf 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_vect.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_vect.h
@@ -34,9 +34,18 @@
 #define _RTE_VECT_ARM_H_
 
 #include <stdint.h>
+#include <assert.h>
+
 #include "generic/rte_vect.h"
 #include "arm_neon.h"
 
+#ifdef GCC_VERSION
+#undef GCC_VERSION
+#endif
+
+#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 \
+			+ __GNUC_PATCHLEVEL__)
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -78,6 +87,42 @@ vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
 }
 #endif
 
+#if (GCC_VERSION < 70000)
+/*
+ * NEON intrinsic vreinterpretq_u64_p128() is not supported
+ * in GCC versions < 7
+ */
+static inline uint64x2_t
+vreinterpretq_u64_p128(poly128_t x)
+{
+	return (uint64x2_t)x;
+}
+
+/*
+ * NEON intrinsic vreinterpretq_p64_u64() is not supported
+ * in GCC versions < 7
+ */
+static inline poly64x2_t
+vreinterpretq_p64_u64(uint64x2_t x)
+{
+	return (poly64x2_t)x;
+}
+
+/*
+ * NEON intrinsic vgetq_lane_p64() is not supported
+ * in GCC versions < 7
+ */
+static inline poly64_t
+vgetq_lane_p64(poly64x2_t x, const int lane)
+{
+	assert(lane >= 0 && lane <= 1);
+
+	poly64_t *p = (poly64_t *)&x;
+
+	return p[lane];
+}
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_net/net_crc_neon.h b/lib/librte_net/net_crc_neon.h
new file mode 100644
index 0000000..05120a7
--- /dev/null
+++ b/lib/librte_net/net_crc_neon.h
@@ -0,0 +1,357 @@
+/*
+ *   BSD LICENSE
+ *
+ *   Copyright (C) Cavium networks Ltd. 2017.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Cavium networks nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _NET_CRC_NEON_H_
+#define _NET_CRC_NEON_H_
+
+#include <rte_branch_prediction.h>
+#include <rte_net_crc.h>
+#include <rte_vect.h>
+#include <rte_cpuflags.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** PMULL CRC computation context structure */
+struct crc_pmull_ctx {
+	uint64x2_t rk1_rk2;
+	uint64x2_t rk5_rk6;
+	uint64x2_t rk7_rk8;
+};
+
+struct crc_pmull_ctx crc32_eth_pmull __rte_aligned(16);
+struct crc_pmull_ctx crc16_ccitt_pmull __rte_aligned(16);
+
+static inline uint8x16_t
+extract_vector(uint8x16_t v0, uint8x16_t v1, const int n)
+{
+	switch (n) {
+	case 0: return vextq_u8(v0, v1, 0);
+	case 1: return vextq_u8(v0, v1, 1);
+	case 2: return vextq_u8(v0, v1, 2);
+	case 3: return vextq_u8(v0, v1, 3);
+	case 4: return vextq_u8(v0, v1, 4);
+	case 5: return vextq_u8(v0, v1, 5);
+	case 6: return vextq_u8(v0, v1, 6);
+	case 7: return vextq_u8(v0, v1, 7);
+	case 8: return vextq_u8(v0, v1, 8);
+	case 9: return vextq_u8(v0, v1, 9);
+	case 10: return vextq_u8(v0, v1, 10);
+	case 11: return vextq_u8(v0, v1, 11);
+	case 12: return vextq_u8(v0, v1, 12);
+	case 13: return vextq_u8(v0, v1, 13);
+	case 14: return vextq_u8(v0, v1, 14);
+	case 15: return vextq_u8(v0, v1, 15);
+	}
+	return v1;
+}
+
+/**
+ * Shifts right 128 bit register by specified number of bytes
+ *
+ * @param reg 128 bit value
+ * @param num number of bytes to shift reg by (0-16)
+ *
+ * @return reg << (num * 8)
+ */
+static inline uint64x2_t
+shift_bytes_right(uint64x2_t reg, const unsigned int num)
+{
+	/* Right Shift */
+	return vreinterpretq_u64_u8(extract_vector(
+				vreinterpretq_u8_u64(reg),
+				vdupq_n_u8(0),
+				num));
+}
+
+/**
+ * Shifts left 128 bit register by specified number of bytes
+ *
+ * @param reg 128 bit value
+ * @param num number of bytes to shift reg by (0-16)
+ *
+ * @return reg << (num * 8)
+ */
+static inline uint64x2_t
+shift_bytes_left(uint64x2_t reg, const unsigned int num)
+{
+	/* Left Shift */
+	return vreinterpretq_u64_u8(extract_vector(
+				vdupq_n_u8(0),
+				vreinterpretq_u8_u64(reg),
+				16 - num));
+}
+
+/**
+ * @brief Performs one folding round
+ *
+ * Logically function operates as follows:
+ *     DATA = READ_NEXT_16BYTES();
+ *     F1 = LSB8(FOLD)
+ *     F2 = MSB8(FOLD)
+ *     T1 = CLMUL(F1, RK1)
+ *     T2 = CLMUL(F2, RK2)
+ *     FOLD = XOR(T1, T2, DATA)
+ *
+ * @param data_block 16 byte data block
+ * @param precomp precomputed rk1 constanst
+ * @param fold running 16 byte folded data
+ *
+ * @return New 16 byte folded data
+ */
+static inline uint64x2_t
+crcr32_folding_round(uint64x2_t data_block, uint64x2_t precomp,
+	uint64x2_t fold)
+{
+	uint64x2_t tmp0 = vreinterpretq_u64_p128(vmull_p64(
+			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 1),
+			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+
+	uint64x2_t tmp1 = vreinterpretq_u64_p128(vmull_p64(
+			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 0),
+			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+
+	return veorq_u64(tmp1, veorq_u64(data_block, tmp0));
+}
+
+/**
+ * Performs reduction from 128 bits to 64 bits
+ *
+ * @param data128 128 bits data to be reduced
+ * @param precomp rk5 and rk6 precomputed constants
+ *
+ * @return data reduced to 64 bits
+ */
+static inline uint64x2_t
+crcr32_reduce_128_to_64(uint64x2_t data128,
+	uint64x2_t precomp)
+{
+	uint64x2_t tmp0, tmp1, tmp2;
+
+	/* 64b fold */
+	tmp0 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(data128), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+	tmp1 = shift_bytes_right(data128, 8);
+	tmp0 = veorq_u64(tmp0, tmp1);
+
+	/* 32b fold */
+	tmp2 = shift_bytes_left(tmp0, 4);
+	tmp1 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp2), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+
+	return veorq_u64(tmp1, tmp0);
+}
+
+/**
+ * Performs Barret's reduction from 64 bits to 32 bits
+ *
+ * @param data64 64 bits data to be reduced
+ * @param precomp rk7 precomputed constant
+ *
+ * @return data reduced to 32 bits
+ */
+static inline uint32_t
+crcr32_reduce_64_to_32(uint64x2_t data64,
+	uint64x2_t precomp)
+{
+	static uint32_t mask1[4] __rte_aligned(16) = {
+		0xffffffff, 0xffffffff, 0x00000000, 0x00000000
+	};
+	static uint32_t mask2[4] __rte_aligned(16) = {
+		0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
+	};
+	uint64x2_t tmp0, tmp1, tmp2;
+
+	tmp0 = vandq_u64(data64, vld1q_u64((uint64_t *)mask2));
+
+	tmp1 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp0), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+	tmp1 = veorq_u64(tmp1, tmp0);
+	tmp1 = vandq_u64(tmp1, vld1q_u64((uint64_t *)mask1));
+
+	tmp2 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp1), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+	tmp2 = veorq_u64(tmp2, tmp1);
+	tmp2 = veorq_u64(tmp2, tmp0);
+
+	return vgetq_lane_u32(vreinterpretq_u32_u64(tmp2), 2);
+}
+
+static inline uint32_t
+crc32_eth_calc_pmull(
+	const uint8_t *data,
+	uint32_t data_len,
+	uint32_t crc,
+	const struct crc_pmull_ctx *params)
+{
+	uint64x2_t temp, fold, k;
+	uint32_t n;
+
+	/* Get CRC init value */
+	temp = vreinterpretq_u64_u32(vsetq_lane_u32(crc, vmovq_n_u32(0), 0));
+
+	/**
+	 * Folding all data into single 16 byte data block
+	 * Assumes: fold holds first 16 bytes of data
+	 */
+	if (unlikely(data_len < 32)) {
+		if (unlikely(data_len == 16)) {
+			/* 16 bytes */
+			fold = vld1q_u64((const uint64_t *)data);
+			fold = veorq_u64(fold, temp);
+			goto reduction_128_64;
+		}
+
+		if (unlikely(data_len < 16)) {
+			/* 0 to 15 bytes */
+			uint8_t buffer[16] __rte_aligned(16);
+
+			memset(buffer, 0, sizeof(buffer));
+			memcpy(buffer, data, data_len);
+
+			fold = vld1q_u64((uint64_t *)buffer);
+			fold = veorq_u64(fold, temp);
+			if (unlikely(data_len < 4)) {
+				fold = shift_bytes_left(fold, 8 - data_len);
+				goto barret_reduction;
+			}
+			fold = shift_bytes_left(fold, 16 - data_len);
+			goto reduction_128_64;
+		}
+		/* 17 to 31 bytes */
+		fold = vld1q_u64((const uint64_t *)data);
+		fold = veorq_u64(fold, temp);
+		n = 16;
+		k = params->rk1_rk2;
+		goto partial_bytes;
+	}
+
+	/** At least 32 bytes in the buffer */
+	/** Apply CRC initial value */
+	fold = vld1q_u64((const uint64_t *)data);
+	fold = veorq_u64(fold, temp);
+
+	/** Main folding loop - the last 16 bytes is processed separately */
+	k = params->rk1_rk2;
+	for (n = 16; (n + 16) <= data_len; n += 16) {
+		temp = vld1q_u64((const uint64_t *)&data[n]);
+		fold = crcr32_folding_round(temp, k, fold);
+	}
+
+partial_bytes:
+	if (likely(n < data_len)) {
+		uint64x2_t last16, a, b, mask;
+		uint32_t rem = data_len & 15;
+
+		last16 = vld1q_u64((const uint64_t *)&data[data_len - 16]);
+		a = shift_bytes_left(fold, 16 - rem);
+		b = shift_bytes_right(fold, rem);
+		mask = shift_bytes_left(vdupq_n_u64(-1), 16 - rem);
+		b = vorrq_u64(b, vandq_u64(mask, last16));
+
+		/* k = rk1 & rk2 */
+		temp = vreinterpretq_u64_p128(vmull_p64(
+				vgetq_lane_p64(vreinterpretq_p64_u64(a), 1),
+				vgetq_lane_p64(vreinterpretq_p64_u64(k), 0)));
+		fold = vreinterpretq_u64_p128(vmull_p64(
+				vgetq_lane_p64(vreinterpretq_p64_u64(a), 0),
+				vgetq_lane_p64(vreinterpretq_p64_u64(k), 1)));
+		fold = veorq_u64(fold, temp);
+		fold = veorq_u64(fold, b);
+	}
+
+	/** Reduction 128 -> 32 Assumes: fold holds 128bit folded data */
+reduction_128_64:
+	k = params->rk5_rk6;
+	fold = crcr32_reduce_128_to_64(fold, k);
+
+barret_reduction:
+	k = params->rk7_rk8;
+	n = crcr32_reduce_64_to_32(fold, k);
+
+	return n;
+}
+
+static inline void
+rte_net_crc_neon_init(void)
+{
+	/* Initialize CRC16 data */
+	uint64_t ccitt_k1_k2[2] = {0x189aeLLU, 0x8e10LLU};
+	uint64_t ccitt_k5_k6[2] = {0x189aeLLU, 0x114aaLLU};
+	uint64_t ccitt_k7_k8[2] = {0x11c581910LLU, 0x10811LLU};
+
+	/* Initialize CRC32 data */
+	uint64_t eth_k1_k2[2] = {0xccaa009eLLU, 0x1751997d0LLU};
+	uint64_t eth_k5_k6[2] = {0xccaa009eLLU, 0x163cd6124LLU};
+	uint64_t eth_k7_k8[2] = {0x1f7011640LLU, 0x1db710641LLU};
+
+	/** Save the params in context structure */
+	crc16_ccitt_pmull.rk1_rk2 = vld1q_u64(ccitt_k1_k2);
+	crc16_ccitt_pmull.rk5_rk6 = vld1q_u64(ccitt_k5_k6);
+	crc16_ccitt_pmull.rk7_rk8 = vld1q_u64(ccitt_k7_k8);
+
+	/** Save the params in context structure */
+	crc32_eth_pmull.rk1_rk2 = vld1q_u64(eth_k1_k2);
+	crc32_eth_pmull.rk5_rk6 = vld1q_u64(eth_k5_k6);
+	crc32_eth_pmull.rk7_rk8 = vld1q_u64(eth_k7_k8);
+}
+
+static inline uint32_t
+rte_crc16_ccitt_neon_handler(const uint8_t *data,
+	uint32_t data_len)
+{
+	return (uint16_t)~crc32_eth_calc_pmull(data,
+		data_len,
+		0xffff,
+		&crc16_ccitt_pmull);
+}
+
+static inline uint32_t
+rte_crc32_eth_neon_handler(const uint8_t *data,
+	uint32_t data_len)
+{
+	return ~crc32_eth_calc_pmull(data,
+		data_len,
+		0xffffffffUL,
+		&crc32_eth_pmull);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _NET_CRC_NEON_H_ */
diff --git a/lib/librte_net/rte_net_crc.c b/lib/librte_net/rte_net_crc.c
index e8326fe..be65f34 100644
--- a/lib/librte_net/rte_net_crc.c
+++ b/lib/librte_net/rte_net_crc.c
@@ -43,10 +43,16 @@
 	&& defined(RTE_MACHINE_CPUFLAG_SSE4_2)		\
 	&& defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ)
 #define X86_64_SSE42_PCLMULQDQ     1
+#elif defined(RTE_ARCH_ARM64)
+#if defined(RTE_MACHINE_CPUFLAG_PMULL)
+#define ARM64_NEON_PMULL           1
+#endif
 #endif
 
 #ifdef X86_64_SSE42_PCLMULQDQ
 #include <net_crc_sse.h>
+#elif defined(ARM64_NEON_PMULL)
+#include <net_crc_neon.h>
 #endif
 
 /* crc tables */
@@ -74,6 +80,11 @@ static rte_net_crc_handler handlers_sse42[] = {
 	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_sse42_handler,
 	[RTE_NET_CRC32_ETH] = rte_crc32_eth_sse42_handler,
 };
+#elif defined(ARM64_NEON_PMULL)
+static rte_net_crc_handler handlers_neon[] = {
+	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_neon_handler,
+	[RTE_NET_CRC32_ETH] = rte_crc32_eth_neon_handler,
+};
 #endif
 
 /**
@@ -162,14 +173,20 @@ void
 rte_net_crc_set_alg(enum rte_net_crc_alg alg)
 {
 	switch (alg) {
-	case RTE_NET_CRC_SSE42:
 #ifdef X86_64_SSE42_PCLMULQDQ
+	case RTE_NET_CRC_SSE42:
 		handlers = handlers_sse42;
-#else
-		alg = RTE_NET_CRC_SCALAR;
 		break;
+#elif defined(ARM64_NEON_PMULL)
+	case RTE_NET_CRC_NEON:
+		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
+			handlers = handlers_neon;
+			break;
+		}
+		//-fallthrough
 #endif
 	case RTE_NET_CRC_SCALAR:
+		//-fallthrough
 	default:
 		handlers = handlers_scalar;
 		break;
@@ -199,8 +216,13 @@ rte_net_crc_init(void)
 	rte_net_crc_scalar_init();
 
 #ifdef X86_64_SSE42_PCLMULQDQ
-		alg = RTE_NET_CRC_SSE42;
-		rte_net_crc_sse42_init();
+	alg = RTE_NET_CRC_SSE42;
+	rte_net_crc_sse42_init();
+#elif defined(ARM64_NEON_PMULL)
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
+		alg = RTE_NET_CRC_NEON;
+		rte_net_crc_neon_init();
+	}
 #endif
 
 	rte_net_crc_set_alg(alg);
diff --git a/lib/librte_net/rte_net_crc.h b/lib/librte_net/rte_net_crc.h
index 76fd129..1daed30 100644
--- a/lib/librte_net/rte_net_crc.h
+++ b/lib/librte_net/rte_net_crc.h
@@ -55,6 +55,7 @@ enum rte_net_crc_type {
 enum rte_net_crc_alg {
 	RTE_NET_CRC_SCALAR = 0,
 	RTE_NET_CRC_SSE42,
+	RTE_NET_CRC_NEON,
 };
 
 /**
@@ -66,6 +67,7 @@ enum rte_net_crc_alg {
  *   This parameter is used to select the CRC implementation version.
  *   - RTE_NET_CRC_SCALAR
  *   - RTE_NET_CRC_SSE42 (Use 64-bit SSE4.2 intrinsic)
+ *   - RTE_NET_CRC_NEON (Use ARM Neon intrinsic)
  */
 void
 rte_net_crc_set_alg(enum rte_net_crc_alg alg);
diff --git a/mk/machine/armv8a/rte.vars.mk b/mk/machine/armv8a/rte.vars.mk
index d5049e1..51966a5 100644
--- a/mk/machine/armv8a/rte.vars.mk
+++ b/mk/machine/armv8a/rte.vars.mk
@@ -55,4 +55,4 @@
 # CPU_LDFLAGS =
 # CPU_ASFLAGS =
 
-MACHINE_CFLAGS += -march=armv8-a+crc
+MACHINE_CFLAGS += -march=armv8-a+crc+crypto
diff --git a/mk/machine/thunderx/rte.vars.mk b/mk/machine/thunderx/rte.vars.mk
index ad5a379..6784105 100644
--- a/mk/machine/thunderx/rte.vars.mk
+++ b/mk/machine/thunderx/rte.vars.mk
@@ -55,4 +55,4 @@
 # CPU_LDFLAGS =
 # CPU_ASFLAGS =
 
-MACHINE_CFLAGS += -march=armv8-a+crc -mcpu=thunderx
+MACHINE_CFLAGS += -march=armv8-a+crc+crypto -mcpu=thunderx
diff --git a/mk/rte.cpuflags.mk b/mk/rte.cpuflags.mk
index e634abc..6bbd742 100644
--- a/mk/rte.cpuflags.mk
+++ b/mk/rte.cpuflags.mk
@@ -119,6 +119,9 @@ ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRC32),)
 CPUFLAGS += CRC32
 endif
 
+ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRYPTO),)
+CPUFLAGS += PMULL
+endif
 
 MACHINE_CFLAGS += $(addprefix -DRTE_MACHINE_CPUFLAG_,$(CPUFLAGS))
 
diff --git a/mk/toolchain/gcc/rte.toolchain-compat.mk b/mk/toolchain/gcc/rte.toolchain-compat.mk
index 280dde2..01ac7e2 100644
--- a/mk/toolchain/gcc/rte.toolchain-compat.mk
+++ b/mk/toolchain/gcc/rte.toolchain-compat.mk
@@ -60,6 +60,7 @@ else
 #
 	ifeq ($(shell test $(GCC_VERSION) -le 49 && echo 1), 1)
 		MACHINE_CFLAGS := $(patsubst -march=armv8-a+crc,-march=armv8-a+crc -D__ARM_FEATURE_CRC32=1,$(MACHINE_CFLAGS))
+		MACHINE_CFLAGS := $(patsubst -march=armv8-a+crc+crypto,-march=armv8-a+crc+crypto -D__ARM_FEATURE_CRC32=1,$(MACHINE_CFLAGS))
 	endif
 	ifeq ($(shell test $(GCC_VERSION) -le 47 && echo 1), 1)
 		MACHINE_CFLAGS := $(patsubst -march=core-avx-i,-march=corei7-avx,$(MACHINE_CFLAGS))
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH v3 1/4] mk: add crypto capability for generic armv8a and thunderx
  2017-04-27 14:06 [PATCH 1/2] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
@ 2017-05-04  6:56 ` Ashwin Sekhar T K
  2017-05-04  6:57   ` [PATCH v3 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
                     ` (3 more replies)
  2017-05-09  9:53 ` [PATCH v4 " Ashwin Sekhar T K
                   ` (2 subsequent siblings)
  3 siblings, 4 replies; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-05-04  6:56 UTC (permalink / raw)
  To: cristian.dumitrescu, thomas, jasvinder.singh, viktorin,
	jerin.jacob, jianbo.liu
  Cc: dev, Ashwin Sekhar T K

armv8-a has optional CRYPTO extension which adds the
AES, PMULL, SHA1 and SHA2 capabilities. -march=armv8-a+crypto
enables code generation for the ARMv8-A architecture together
with the optional CRYPTO extensions.

added the following flags to detect the corresponding
capability at compile time
 * RTE_MACHINE_CPUFLAG_AES
 * RTE_MACHINE_CPUFLAG_PMULL
 * RTE_MACHINE_CPUFLAG_SHA1
 * RTE_MACHINE_CPUFLAG_SHA2

at run-time, the following flags can be used to detect these
capabilities
 * RTE_CPUFLAG_AES
 * RTE_CPUFLAG_PMULL
 * RTE_CPUFLAG_SHA1
 * RTE_CPUFLAG_SHA2

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
---
v3:
* Moved the feature detection changes into separate commit
* Added the AES, SHA1, SHA2 capabilities also under the CRYPTO flag
  along with PMULL

 mk/machine/armv8a/rte.vars.mk            | 2 +-
 mk/machine/thunderx/rte.vars.mk          | 2 +-
 mk/rte.cpuflags.mk                       | 6 ++++++
 mk/toolchain/gcc/rte.toolchain-compat.mk | 1 +
 4 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/mk/machine/armv8a/rte.vars.mk b/mk/machine/armv8a/rte.vars.mk
index d5049e1..51966a5 100644
--- a/mk/machine/armv8a/rte.vars.mk
+++ b/mk/machine/armv8a/rte.vars.mk
@@ -55,4 +55,4 @@
 # CPU_LDFLAGS =
 # CPU_ASFLAGS =
 
-MACHINE_CFLAGS += -march=armv8-a+crc
+MACHINE_CFLAGS += -march=armv8-a+crc+crypto
diff --git a/mk/machine/thunderx/rte.vars.mk b/mk/machine/thunderx/rte.vars.mk
index ad5a379..6784105 100644
--- a/mk/machine/thunderx/rte.vars.mk
+++ b/mk/machine/thunderx/rte.vars.mk
@@ -55,4 +55,4 @@
 # CPU_LDFLAGS =
 # CPU_ASFLAGS =
 
-MACHINE_CFLAGS += -march=armv8-a+crc -mcpu=thunderx
+MACHINE_CFLAGS += -march=armv8-a+crc+crypto -mcpu=thunderx
diff --git a/mk/rte.cpuflags.mk b/mk/rte.cpuflags.mk
index 4288c14..a813c91 100644
--- a/mk/rte.cpuflags.mk
+++ b/mk/rte.cpuflags.mk
@@ -125,6 +125,12 @@ ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRC32),)
 CPUFLAGS += CRC32
 endif
 
+ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRYPTO),)
+CPUFLAGS += AES
+CPUFLAGS += PMULL
+CPUFLAGS += SHA1
+CPUFLAGS += SHA2
+endif
 
 MACHINE_CFLAGS += $(addprefix -DRTE_MACHINE_CPUFLAG_,$(CPUFLAGS))
 
diff --git a/mk/toolchain/gcc/rte.toolchain-compat.mk b/mk/toolchain/gcc/rte.toolchain-compat.mk
index 280dde2..01ac7e2 100644
--- a/mk/toolchain/gcc/rte.toolchain-compat.mk
+++ b/mk/toolchain/gcc/rte.toolchain-compat.mk
@@ -60,6 +60,7 @@ else
 #
 	ifeq ($(shell test $(GCC_VERSION) -le 49 && echo 1), 1)
 		MACHINE_CFLAGS := $(patsubst -march=armv8-a+crc,-march=armv8-a+crc -D__ARM_FEATURE_CRC32=1,$(MACHINE_CFLAGS))
+		MACHINE_CFLAGS := $(patsubst -march=armv8-a+crc+crypto,-march=armv8-a+crc+crypto -D__ARM_FEATURE_CRC32=1,$(MACHINE_CFLAGS))
 	endif
 	ifeq ($(shell test $(GCC_VERSION) -le 47 && echo 1), 1)
 		MACHINE_CFLAGS := $(patsubst -march=core-avx-i,-march=corei7-avx,$(MACHINE_CFLAGS))
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH v3 2/4] eal: move gcc version definition to common header
  2017-05-04  6:56 ` [PATCH v3 1/4] mk: add crypto capability for generic armv8a and thunderx Ashwin Sekhar T K
@ 2017-05-04  6:57   ` Ashwin Sekhar T K
  2017-05-04 15:22     ` Jan Viktorin
  2017-05-04  6:57   ` [PATCH v3 3/4] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-05-04  6:57 UTC (permalink / raw)
  To: cristian.dumitrescu, thomas, jasvinder.singh, viktorin,
	jerin.jacob, jianbo.liu
  Cc: dev, Ashwin Sekhar T K

moved the definition of GCC_VERSION from lib/librte_table/rte_lru.h
to lib/librte_eal/common/include/rte_common.h

Tested compilation on arm64 with gcc

Tested compilation on x86 with gcc and clang

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
---
v3:
* Moved changes for GCC_VERSION into a separate commit
* Moved GCC_VERSION definition to common header
* Removed the same from rte_lru.h

 lib/librte_eal/common/include/rte_common.h | 8 ++++++++
 lib/librte_table/rte_lru.h                 | 6 ------
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/lib/librte_eal/common/include/rte_common.h b/lib/librte_eal/common/include/rte_common.h
index e057f6e..717b445 100644
--- a/lib/librte_eal/common/include/rte_common.h
+++ b/lib/librte_eal/common/include/rte_common.h
@@ -66,6 +66,14 @@ extern "C" {
 #define RTE_STD_C11
 #endif
 
+/** Define GCC_VERSION **/
+#ifdef __GNUC__
+#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 +	\
+		__GNUC_PATCHLEVEL__)
+#else
+#define GCC_VERSION (0)
+#endif
+
 #ifdef RTE_ARCH_STRICT_ALIGN
 typedef uint64_t unaligned_uint64_t __attribute__ ((aligned(1)));
 typedef uint32_t unaligned_uint32_t __attribute__ ((aligned(1)));
diff --git a/lib/librte_table/rte_lru.h b/lib/librte_table/rte_lru.h
index e87e062..3c5aca7 100644
--- a/lib/librte_table/rte_lru.h
+++ b/lib/librte_table/rte_lru.h
@@ -40,12 +40,6 @@ extern "C" {
 
 #include <stdint.h>
 
-#ifdef __INTEL_COMPILER
-#define GCC_VERSION (0)
-#else
-#define GCC_VERSION (__GNUC__ * 10000+__GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
-#endif
-
 #ifndef RTE_TABLE_HASH_LRU_STRATEGY
 #ifdef __SSE4_2__
 #define RTE_TABLE_HASH_LRU_STRATEGY                        2
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* Re: [PATCH v3 2/4] eal: move gcc version definition to common header
  2017-05-04  6:57   ` [PATCH v3 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
@ 2017-05-04 15:22     ` Jan Viktorin
  0 siblings, 0 replies; 33+ messages in thread
From: Jan Viktorin @ 2017-05-04 15:22 UTC (permalink / raw)
  To: Ashwin Sekhar T K
  Cc: cristian.dumitrescu, thomas, jasvinder.singh, jerin.jacob,
	jianbo.liu, dev

On Wed,  3 May 2017 23:57:00 -0700
Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com> wrote:

> moved the definition of GCC_VERSION from lib/librte_table/rte_lru.h

s/moved/Moved/

> to lib/librte_eal/common/include/rte_common.h

dot after the sentence

> 
> Tested compilation on arm64 with gcc
> 
> Tested compilation on x86 with gcc and clang

Tested compilation on:

* arm64 with gcc
* x86 with gcc and clang

> 
> Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>

Reviewed-by: Jan Viktorin <viktorin@rehivetech.com>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH v3 3/4] net: add arm64 neon version of CRC compute APIs
  2017-05-04  6:56 ` [PATCH v3 1/4] mk: add crypto capability for generic armv8a and thunderx Ashwin Sekhar T K
  2017-05-04  6:57   ` [PATCH v3 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
@ 2017-05-04  6:57   ` Ashwin Sekhar T K
  2017-05-04  6:57   ` [PATCH v3 4/4] test: add tests for arm64 CRC neon versions Ashwin Sekhar T K
  2017-05-04 15:20   ` [PATCH v3 1/4] mk: add crypto capability for generic armv8a and thunderx Jan Viktorin
  3 siblings, 0 replies; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-05-04  6:57 UTC (permalink / raw)
  To: cristian.dumitrescu, thomas, jasvinder.singh, viktorin,
	jerin.jacob, jianbo.liu
  Cc: dev, Ashwin Sekhar T K

Added CRC compute APIs for arm64 utilizing the pmull
capability

Added new file net_crc_neon.h to hold the arm64 pmull
CRC implementation

Verified the changes with crc_autotest unit test case

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
---
v2:
* Fixed merge conflict in MAINTAINERS

v3:
* Moved feature detection changes and GCC_VERSION definition
  changes to separate commit
* Replaced usage of assert() with RTE_ASSERT()
* Made the comments in rte_vect.h more positive in sense

 MAINTAINERS                                       |   1 +
 lib/librte_eal/common/include/arch/arm/rte_vect.h |  29 ++
 lib/librte_net/net_crc_neon.h                     | 357 ++++++++++++++++++++++
 lib/librte_net/rte_net_crc.c                      |  32 +-
 lib/librte_net/rte_net_crc.h                      |   2 +
 5 files changed, 416 insertions(+), 5 deletions(-)
 create mode 100644 lib/librte_net/net_crc_neon.h

diff --git a/MAINTAINERS b/MAINTAINERS
index b6495d2..66d64c2 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -147,6 +147,7 @@ F: lib/librte_eal/common/include/arch/arm/*_64.h
 F: lib/librte_acl/acl_run_neon.*
 F: lib/librte_lpm/rte_lpm_neon.h
 F: lib/librte_hash/rte*_arm64.h
+F: lib/librte_net/net_crc_neon.h
 F: drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
 F: drivers/net/i40e/i40e_rxtx_vec_neon.c
 F: drivers/net/virtio/virtio_rxtx_simple_neon.c
diff --git a/lib/librte_eal/common/include/arch/arm/rte_vect.h b/lib/librte_eal/common/include/arch/arm/rte_vect.h
index 4107c99..f28a19d 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_vect.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_vect.h
@@ -34,7 +34,9 @@
 #define _RTE_VECT_ARM_H_
 
 #include <stdint.h>
+
 #include "generic/rte_vect.h"
+#include "rte_debug.h"
 #include "arm_neon.h"
 
 #ifdef __cplusplus
@@ -78,6 +80,33 @@ vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
 }
 #endif
 
+#if (GCC_VERSION < 70000)
+/* NEON intrinsic vreinterpretq_u64_p128() is supported since GCC version 7 */
+static inline uint64x2_t
+vreinterpretq_u64_p128(poly128_t x)
+{
+	return (uint64x2_t)x;
+}
+
+/* NEON intrinsic vreinterpretq_p64_u64() is supported since GCC version 7 */
+static inline poly64x2_t
+vreinterpretq_p64_u64(uint64x2_t x)
+{
+	return (poly64x2_t)x;
+}
+
+/* NEON intrinsic vgetq_lane_p64() is supported since GCC version 7 */
+static inline poly64_t
+vgetq_lane_p64(poly64x2_t x, const int lane)
+{
+	RTE_ASSERT(lane >= 0 && lane <= 1);
+
+	poly64_t *p = (poly64_t *)&x;
+
+	return p[lane];
+}
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_net/net_crc_neon.h b/lib/librte_net/net_crc_neon.h
new file mode 100644
index 0000000..05120a7
--- /dev/null
+++ b/lib/librte_net/net_crc_neon.h
@@ -0,0 +1,357 @@
+/*
+ *   BSD LICENSE
+ *
+ *   Copyright (C) Cavium networks Ltd. 2017.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Cavium networks nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _NET_CRC_NEON_H_
+#define _NET_CRC_NEON_H_
+
+#include <rte_branch_prediction.h>
+#include <rte_net_crc.h>
+#include <rte_vect.h>
+#include <rte_cpuflags.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** PMULL CRC computation context structure */
+struct crc_pmull_ctx {
+	uint64x2_t rk1_rk2;
+	uint64x2_t rk5_rk6;
+	uint64x2_t rk7_rk8;
+};
+
+struct crc_pmull_ctx crc32_eth_pmull __rte_aligned(16);
+struct crc_pmull_ctx crc16_ccitt_pmull __rte_aligned(16);
+
+static inline uint8x16_t
+extract_vector(uint8x16_t v0, uint8x16_t v1, const int n)
+{
+	switch (n) {
+	case 0: return vextq_u8(v0, v1, 0);
+	case 1: return vextq_u8(v0, v1, 1);
+	case 2: return vextq_u8(v0, v1, 2);
+	case 3: return vextq_u8(v0, v1, 3);
+	case 4: return vextq_u8(v0, v1, 4);
+	case 5: return vextq_u8(v0, v1, 5);
+	case 6: return vextq_u8(v0, v1, 6);
+	case 7: return vextq_u8(v0, v1, 7);
+	case 8: return vextq_u8(v0, v1, 8);
+	case 9: return vextq_u8(v0, v1, 9);
+	case 10: return vextq_u8(v0, v1, 10);
+	case 11: return vextq_u8(v0, v1, 11);
+	case 12: return vextq_u8(v0, v1, 12);
+	case 13: return vextq_u8(v0, v1, 13);
+	case 14: return vextq_u8(v0, v1, 14);
+	case 15: return vextq_u8(v0, v1, 15);
+	}
+	return v1;
+}
+
+/**
+ * Shifts right 128 bit register by specified number of bytes
+ *
+ * @param reg 128 bit value
+ * @param num number of bytes to shift reg by (0-16)
+ *
+ * @return reg << (num * 8)
+ */
+static inline uint64x2_t
+shift_bytes_right(uint64x2_t reg, const unsigned int num)
+{
+	/* Right Shift */
+	return vreinterpretq_u64_u8(extract_vector(
+				vreinterpretq_u8_u64(reg),
+				vdupq_n_u8(0),
+				num));
+}
+
+/**
+ * Shifts left 128 bit register by specified number of bytes
+ *
+ * @param reg 128 bit value
+ * @param num number of bytes to shift reg by (0-16)
+ *
+ * @return reg << (num * 8)
+ */
+static inline uint64x2_t
+shift_bytes_left(uint64x2_t reg, const unsigned int num)
+{
+	/* Left Shift */
+	return vreinterpretq_u64_u8(extract_vector(
+				vdupq_n_u8(0),
+				vreinterpretq_u8_u64(reg),
+				16 - num));
+}
+
+/**
+ * @brief Performs one folding round
+ *
+ * Logically function operates as follows:
+ *     DATA = READ_NEXT_16BYTES();
+ *     F1 = LSB8(FOLD)
+ *     F2 = MSB8(FOLD)
+ *     T1 = CLMUL(F1, RK1)
+ *     T2 = CLMUL(F2, RK2)
+ *     FOLD = XOR(T1, T2, DATA)
+ *
+ * @param data_block 16 byte data block
+ * @param precomp precomputed rk1 constanst
+ * @param fold running 16 byte folded data
+ *
+ * @return New 16 byte folded data
+ */
+static inline uint64x2_t
+crcr32_folding_round(uint64x2_t data_block, uint64x2_t precomp,
+	uint64x2_t fold)
+{
+	uint64x2_t tmp0 = vreinterpretq_u64_p128(vmull_p64(
+			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 1),
+			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+
+	uint64x2_t tmp1 = vreinterpretq_u64_p128(vmull_p64(
+			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 0),
+			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+
+	return veorq_u64(tmp1, veorq_u64(data_block, tmp0));
+}
+
+/**
+ * Performs reduction from 128 bits to 64 bits
+ *
+ * @param data128 128 bits data to be reduced
+ * @param precomp rk5 and rk6 precomputed constants
+ *
+ * @return data reduced to 64 bits
+ */
+static inline uint64x2_t
+crcr32_reduce_128_to_64(uint64x2_t data128,
+	uint64x2_t precomp)
+{
+	uint64x2_t tmp0, tmp1, tmp2;
+
+	/* 64b fold */
+	tmp0 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(data128), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+	tmp1 = shift_bytes_right(data128, 8);
+	tmp0 = veorq_u64(tmp0, tmp1);
+
+	/* 32b fold */
+	tmp2 = shift_bytes_left(tmp0, 4);
+	tmp1 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp2), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+
+	return veorq_u64(tmp1, tmp0);
+}
+
+/**
+ * Performs Barret's reduction from 64 bits to 32 bits
+ *
+ * @param data64 64 bits data to be reduced
+ * @param precomp rk7 precomputed constant
+ *
+ * @return data reduced to 32 bits
+ */
+static inline uint32_t
+crcr32_reduce_64_to_32(uint64x2_t data64,
+	uint64x2_t precomp)
+{
+	static uint32_t mask1[4] __rte_aligned(16) = {
+		0xffffffff, 0xffffffff, 0x00000000, 0x00000000
+	};
+	static uint32_t mask2[4] __rte_aligned(16) = {
+		0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
+	};
+	uint64x2_t tmp0, tmp1, tmp2;
+
+	tmp0 = vandq_u64(data64, vld1q_u64((uint64_t *)mask2));
+
+	tmp1 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp0), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+	tmp1 = veorq_u64(tmp1, tmp0);
+	tmp1 = vandq_u64(tmp1, vld1q_u64((uint64_t *)mask1));
+
+	tmp2 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp1), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+	tmp2 = veorq_u64(tmp2, tmp1);
+	tmp2 = veorq_u64(tmp2, tmp0);
+
+	return vgetq_lane_u32(vreinterpretq_u32_u64(tmp2), 2);
+}
+
+static inline uint32_t
+crc32_eth_calc_pmull(
+	const uint8_t *data,
+	uint32_t data_len,
+	uint32_t crc,
+	const struct crc_pmull_ctx *params)
+{
+	uint64x2_t temp, fold, k;
+	uint32_t n;
+
+	/* Get CRC init value */
+	temp = vreinterpretq_u64_u32(vsetq_lane_u32(crc, vmovq_n_u32(0), 0));
+
+	/**
+	 * Folding all data into single 16 byte data block
+	 * Assumes: fold holds first 16 bytes of data
+	 */
+	if (unlikely(data_len < 32)) {
+		if (unlikely(data_len == 16)) {
+			/* 16 bytes */
+			fold = vld1q_u64((const uint64_t *)data);
+			fold = veorq_u64(fold, temp);
+			goto reduction_128_64;
+		}
+
+		if (unlikely(data_len < 16)) {
+			/* 0 to 15 bytes */
+			uint8_t buffer[16] __rte_aligned(16);
+
+			memset(buffer, 0, sizeof(buffer));
+			memcpy(buffer, data, data_len);
+
+			fold = vld1q_u64((uint64_t *)buffer);
+			fold = veorq_u64(fold, temp);
+			if (unlikely(data_len < 4)) {
+				fold = shift_bytes_left(fold, 8 - data_len);
+				goto barret_reduction;
+			}
+			fold = shift_bytes_left(fold, 16 - data_len);
+			goto reduction_128_64;
+		}
+		/* 17 to 31 bytes */
+		fold = vld1q_u64((const uint64_t *)data);
+		fold = veorq_u64(fold, temp);
+		n = 16;
+		k = params->rk1_rk2;
+		goto partial_bytes;
+	}
+
+	/** At least 32 bytes in the buffer */
+	/** Apply CRC initial value */
+	fold = vld1q_u64((const uint64_t *)data);
+	fold = veorq_u64(fold, temp);
+
+	/** Main folding loop - the last 16 bytes is processed separately */
+	k = params->rk1_rk2;
+	for (n = 16; (n + 16) <= data_len; n += 16) {
+		temp = vld1q_u64((const uint64_t *)&data[n]);
+		fold = crcr32_folding_round(temp, k, fold);
+	}
+
+partial_bytes:
+	if (likely(n < data_len)) {
+		uint64x2_t last16, a, b, mask;
+		uint32_t rem = data_len & 15;
+
+		last16 = vld1q_u64((const uint64_t *)&data[data_len - 16]);
+		a = shift_bytes_left(fold, 16 - rem);
+		b = shift_bytes_right(fold, rem);
+		mask = shift_bytes_left(vdupq_n_u64(-1), 16 - rem);
+		b = vorrq_u64(b, vandq_u64(mask, last16));
+
+		/* k = rk1 & rk2 */
+		temp = vreinterpretq_u64_p128(vmull_p64(
+				vgetq_lane_p64(vreinterpretq_p64_u64(a), 1),
+				vgetq_lane_p64(vreinterpretq_p64_u64(k), 0)));
+		fold = vreinterpretq_u64_p128(vmull_p64(
+				vgetq_lane_p64(vreinterpretq_p64_u64(a), 0),
+				vgetq_lane_p64(vreinterpretq_p64_u64(k), 1)));
+		fold = veorq_u64(fold, temp);
+		fold = veorq_u64(fold, b);
+	}
+
+	/** Reduction 128 -> 32 Assumes: fold holds 128bit folded data */
+reduction_128_64:
+	k = params->rk5_rk6;
+	fold = crcr32_reduce_128_to_64(fold, k);
+
+barret_reduction:
+	k = params->rk7_rk8;
+	n = crcr32_reduce_64_to_32(fold, k);
+
+	return n;
+}
+
+static inline void
+rte_net_crc_neon_init(void)
+{
+	/* Initialize CRC16 data */
+	uint64_t ccitt_k1_k2[2] = {0x189aeLLU, 0x8e10LLU};
+	uint64_t ccitt_k5_k6[2] = {0x189aeLLU, 0x114aaLLU};
+	uint64_t ccitt_k7_k8[2] = {0x11c581910LLU, 0x10811LLU};
+
+	/* Initialize CRC32 data */
+	uint64_t eth_k1_k2[2] = {0xccaa009eLLU, 0x1751997d0LLU};
+	uint64_t eth_k5_k6[2] = {0xccaa009eLLU, 0x163cd6124LLU};
+	uint64_t eth_k7_k8[2] = {0x1f7011640LLU, 0x1db710641LLU};
+
+	/** Save the params in context structure */
+	crc16_ccitt_pmull.rk1_rk2 = vld1q_u64(ccitt_k1_k2);
+	crc16_ccitt_pmull.rk5_rk6 = vld1q_u64(ccitt_k5_k6);
+	crc16_ccitt_pmull.rk7_rk8 = vld1q_u64(ccitt_k7_k8);
+
+	/** Save the params in context structure */
+	crc32_eth_pmull.rk1_rk2 = vld1q_u64(eth_k1_k2);
+	crc32_eth_pmull.rk5_rk6 = vld1q_u64(eth_k5_k6);
+	crc32_eth_pmull.rk7_rk8 = vld1q_u64(eth_k7_k8);
+}
+
+static inline uint32_t
+rte_crc16_ccitt_neon_handler(const uint8_t *data,
+	uint32_t data_len)
+{
+	return (uint16_t)~crc32_eth_calc_pmull(data,
+		data_len,
+		0xffff,
+		&crc16_ccitt_pmull);
+}
+
+static inline uint32_t
+rte_crc32_eth_neon_handler(const uint8_t *data,
+	uint32_t data_len)
+{
+	return ~crc32_eth_calc_pmull(data,
+		data_len,
+		0xffffffffUL,
+		&crc32_eth_pmull);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _NET_CRC_NEON_H_ */
diff --git a/lib/librte_net/rte_net_crc.c b/lib/librte_net/rte_net_crc.c
index e8326fe..be65f34 100644
--- a/lib/librte_net/rte_net_crc.c
+++ b/lib/librte_net/rte_net_crc.c
@@ -43,10 +43,16 @@
 	&& defined(RTE_MACHINE_CPUFLAG_SSE4_2)		\
 	&& defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ)
 #define X86_64_SSE42_PCLMULQDQ     1
+#elif defined(RTE_ARCH_ARM64)
+#if defined(RTE_MACHINE_CPUFLAG_PMULL)
+#define ARM64_NEON_PMULL           1
+#endif
 #endif
 
 #ifdef X86_64_SSE42_PCLMULQDQ
 #include <net_crc_sse.h>
+#elif defined(ARM64_NEON_PMULL)
+#include <net_crc_neon.h>
 #endif
 
 /* crc tables */
@@ -74,6 +80,11 @@ static rte_net_crc_handler handlers_sse42[] = {
 	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_sse42_handler,
 	[RTE_NET_CRC32_ETH] = rte_crc32_eth_sse42_handler,
 };
+#elif defined(ARM64_NEON_PMULL)
+static rte_net_crc_handler handlers_neon[] = {
+	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_neon_handler,
+	[RTE_NET_CRC32_ETH] = rte_crc32_eth_neon_handler,
+};
 #endif
 
 /**
@@ -162,14 +173,20 @@ void
 rte_net_crc_set_alg(enum rte_net_crc_alg alg)
 {
 	switch (alg) {
-	case RTE_NET_CRC_SSE42:
 #ifdef X86_64_SSE42_PCLMULQDQ
+	case RTE_NET_CRC_SSE42:
 		handlers = handlers_sse42;
-#else
-		alg = RTE_NET_CRC_SCALAR;
 		break;
+#elif defined(ARM64_NEON_PMULL)
+	case RTE_NET_CRC_NEON:
+		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
+			handlers = handlers_neon;
+			break;
+		}
+		//-fallthrough
 #endif
 	case RTE_NET_CRC_SCALAR:
+		//-fallthrough
 	default:
 		handlers = handlers_scalar;
 		break;
@@ -199,8 +216,13 @@ rte_net_crc_init(void)
 	rte_net_crc_scalar_init();
 
 #ifdef X86_64_SSE42_PCLMULQDQ
-		alg = RTE_NET_CRC_SSE42;
-		rte_net_crc_sse42_init();
+	alg = RTE_NET_CRC_SSE42;
+	rte_net_crc_sse42_init();
+#elif defined(ARM64_NEON_PMULL)
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
+		alg = RTE_NET_CRC_NEON;
+		rte_net_crc_neon_init();
+	}
 #endif
 
 	rte_net_crc_set_alg(alg);
diff --git a/lib/librte_net/rte_net_crc.h b/lib/librte_net/rte_net_crc.h
index d22286c..d01cf4b 100644
--- a/lib/librte_net/rte_net_crc.h
+++ b/lib/librte_net/rte_net_crc.h
@@ -57,6 +57,7 @@ enum rte_net_crc_type {
 enum rte_net_crc_alg {
 	RTE_NET_CRC_SCALAR = 0,
 	RTE_NET_CRC_SSE42,
+	RTE_NET_CRC_NEON,
 };
 
 /**
@@ -68,6 +69,7 @@ enum rte_net_crc_alg {
  *   This parameter is used to select the CRC implementation version.
  *   - RTE_NET_CRC_SCALAR
  *   - RTE_NET_CRC_SSE42 (Use 64-bit SSE4.2 intrinsic)
+ *   - RTE_NET_CRC_NEON (Use ARM Neon intrinsic)
  */
 void
 rte_net_crc_set_alg(enum rte_net_crc_alg alg);
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH v3 4/4] test: add tests for arm64 CRC neon versions
  2017-05-04  6:56 ` [PATCH v3 1/4] mk: add crypto capability for generic armv8a and thunderx Ashwin Sekhar T K
  2017-05-04  6:57   ` [PATCH v3 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
  2017-05-04  6:57   ` [PATCH v3 3/4] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
@ 2017-05-04  6:57   ` Ashwin Sekhar T K
  2017-05-04 15:20   ` [PATCH v3 1/4] mk: add crypto capability for generic armv8a and thunderx Jan Viktorin
  3 siblings, 0 replies; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-05-04  6:57 UTC (permalink / raw)
  To: cristian.dumitrescu, thomas, jasvinder.singh, viktorin,
	jerin.jacob, jianbo.liu
  Cc: dev, Ashwin Sekhar T K

Verified the changes with crc_autotest unit test case

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
---
v2:
* Fixed checkpatch errors/warnings

 test/test/test_crc.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/test/test/test_crc.c b/test/test/test_crc.c
index cd5af69..9f2a17d 100644
--- a/test/test/test_crc.c
+++ b/test/test/test_crc.c
@@ -178,6 +178,15 @@ test_crc(void)
 		return ret;
 	}
 
+	/* set CRC neon mode */
+	rte_net_crc_set_alg(RTE_NET_CRC_NEON);
+
+	ret = test_crc_calc();
+	if (ret < 0) {
+		printf("test crc (arm64 neon pmull): failed (%d)\n", ret);
+		return ret;
+	}
+
 	return 0;
 }
 
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* Re: [PATCH v3 1/4] mk: add crypto capability for generic armv8a and thunderx
  2017-05-04  6:56 ` [PATCH v3 1/4] mk: add crypto capability for generic armv8a and thunderx Ashwin Sekhar T K
                     ` (2 preceding siblings ...)
  2017-05-04  6:57   ` [PATCH v3 4/4] test: add tests for arm64 CRC neon versions Ashwin Sekhar T K
@ 2017-05-04 15:20   ` Jan Viktorin
  2017-05-04 22:10     ` Thomas Monjalon
  3 siblings, 1 reply; 33+ messages in thread
From: Jan Viktorin @ 2017-05-04 15:20 UTC (permalink / raw)
  To: Ashwin Sekhar T K
  Cc: cristian.dumitrescu, thomas, jasvinder.singh, jerin.jacob,
	jianbo.liu, dev

On Wed,  3 May 2017 23:56:59 -0700
Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com> wrote:

> armv8-a has optional CRYPTO extension which adds the
> AES, PMULL, SHA1 and SHA2 capabilities. -march=armv8-a+crypto
> enables code generation for the ARMv8-A architecture together
> with the optional CRYPTO extensions.
> 
> added the following flags to detect the corresponding
> capability at compile time
>  * RTE_MACHINE_CPUFLAG_AES
>  * RTE_MACHINE_CPUFLAG_PMULL
>  * RTE_MACHINE_CPUFLAG_SHA1
>  * RTE_MACHINE_CPUFLAG_SHA2
> 
> at run-time, the following flags can be used to detect these
> capabilities
>  * RTE_CPUFLAG_AES
>  * RTE_CPUFLAG_PMULL
>  * RTE_CPUFLAG_SHA1
>  * RTE_CPUFLAG_SHA2
> 
> Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>

Reviewed-by: Jan Viktorin <viktorin@rehivetech.com>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v3 1/4] mk: add crypto capability for generic armv8a and thunderx
  2017-05-04 15:20   ` [PATCH v3 1/4] mk: add crypto capability for generic armv8a and thunderx Jan Viktorin
@ 2017-05-04 22:10     ` Thomas Monjalon
  0 siblings, 0 replies; 33+ messages in thread
From: Thomas Monjalon @ 2017-05-04 22:10 UTC (permalink / raw)
  To: Jan Viktorin, Ashwin Sekhar T K, jerin.jacob, jianbo.liu; +Cc: dev

04/05/2017 17:20, Jan Viktorin:
> On Wed,  3 May 2017 23:56:59 -0700
> Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com> wrote:
> 
> > armv8-a has optional CRYPTO extension which adds the
> > AES, PMULL, SHA1 and SHA2 capabilities. -march=armv8-a+crypto
> > enables code generation for the ARMv8-A architecture together
> > with the optional CRYPTO extensions.
> > 
> > added the following flags to detect the corresponding
> > capability at compile time
> >  * RTE_MACHINE_CPUFLAG_AES
> >  * RTE_MACHINE_CPUFLAG_PMULL
> >  * RTE_MACHINE_CPUFLAG_SHA1
> >  * RTE_MACHINE_CPUFLAG_SHA2
> > 
> > at run-time, the following flags can be used to detect these
> > capabilities
> >  * RTE_CPUFLAG_AES
> >  * RTE_CPUFLAG_PMULL
> >  * RTE_CPUFLAG_SHA1
> >  * RTE_CPUFLAG_SHA2
> > 
> > Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
> 
> Reviewed-by: Jan Viktorin <viktorin@rehivetech.com>

Do you agree that this series, and others bringing NEON optimizations,
are not candidates for 17.05?
If you see an urgent fix in all these NEON patches, please shout now.

Thanks

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH v4 1/4] mk: add crypto capability for generic armv8a and thunderx
  2017-04-27 14:06 [PATCH 1/2] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
  2017-05-04  6:56 ` [PATCH v3 1/4] mk: add crypto capability for generic armv8a and thunderx Ashwin Sekhar T K
@ 2017-05-09  9:53 ` Ashwin Sekhar T K
  2017-05-09  9:53   ` [PATCH v4 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
                     ` (2 more replies)
  2017-05-12 10:15 ` [PATCH v5 0/4] add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
  2017-07-04  9:24 ` [PATCH v6 0/4] add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
  3 siblings, 3 replies; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-05-09  9:53 UTC (permalink / raw)
  To: cristian.dumitrescu, thomas, jasvinder.singh, viktorin,
	jerin.jacob, jianbo.liu
  Cc: dev, Ashwin Sekhar T K

armv8-a has optional CRYPTO extension which adds the
AES, PMULL, SHA1 and SHA2 capabilities. -march=armv8-a+crypto
enables code generation for the ARMv8-A architecture together
with the optional CRYPTO extensions.

added the following flags to detect the corresponding
capability at compile time
 * RTE_MACHINE_CPUFLAG_AES
 * RTE_MACHINE_CPUFLAG_PMULL
 * RTE_MACHINE_CPUFLAG_SHA1
 * RTE_MACHINE_CPUFLAG_SHA2

at run-time, the following flags can be used to detect these
capabilities
 * RTE_CPUFLAG_AES
 * RTE_CPUFLAG_PMULL
 * RTE_CPUFLAG_SHA1
 * RTE_CPUFLAG_SHA2

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
Reviewed-by: Jan Viktorin <viktorin@rehivetech.com>
---
v3:
* Moved the feature detection changes into separate commit
* Added the AES, SHA1, SHA2 capabilities also under the CRYPTO flag
  along with PMULL

 mk/machine/armv8a/rte.vars.mk            | 2 +-
 mk/machine/thunderx/rte.vars.mk          | 2 +-
 mk/rte.cpuflags.mk                       | 6 ++++++
 mk/toolchain/gcc/rte.toolchain-compat.mk | 1 +
 4 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/mk/machine/armv8a/rte.vars.mk b/mk/machine/armv8a/rte.vars.mk
index d5049e1..51966a5 100644
--- a/mk/machine/armv8a/rte.vars.mk
+++ b/mk/machine/armv8a/rte.vars.mk
@@ -55,4 +55,4 @@
 # CPU_LDFLAGS =
 # CPU_ASFLAGS =
 
-MACHINE_CFLAGS += -march=armv8-a+crc
+MACHINE_CFLAGS += -march=armv8-a+crc+crypto
diff --git a/mk/machine/thunderx/rte.vars.mk b/mk/machine/thunderx/rte.vars.mk
index ad5a379..6784105 100644
--- a/mk/machine/thunderx/rte.vars.mk
+++ b/mk/machine/thunderx/rte.vars.mk
@@ -55,4 +55,4 @@
 # CPU_LDFLAGS =
 # CPU_ASFLAGS =
 
-MACHINE_CFLAGS += -march=armv8-a+crc -mcpu=thunderx
+MACHINE_CFLAGS += -march=armv8-a+crc+crypto -mcpu=thunderx
diff --git a/mk/rte.cpuflags.mk b/mk/rte.cpuflags.mk
index 4288c14..a813c91 100644
--- a/mk/rte.cpuflags.mk
+++ b/mk/rte.cpuflags.mk
@@ -125,6 +125,12 @@ ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRC32),)
 CPUFLAGS += CRC32
 endif
 
+ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRYPTO),)
+CPUFLAGS += AES
+CPUFLAGS += PMULL
+CPUFLAGS += SHA1
+CPUFLAGS += SHA2
+endif
 
 MACHINE_CFLAGS += $(addprefix -DRTE_MACHINE_CPUFLAG_,$(CPUFLAGS))
 
diff --git a/mk/toolchain/gcc/rte.toolchain-compat.mk b/mk/toolchain/gcc/rte.toolchain-compat.mk
index 280dde2..01ac7e2 100644
--- a/mk/toolchain/gcc/rte.toolchain-compat.mk
+++ b/mk/toolchain/gcc/rte.toolchain-compat.mk
@@ -60,6 +60,7 @@ else
 #
 	ifeq ($(shell test $(GCC_VERSION) -le 49 && echo 1), 1)
 		MACHINE_CFLAGS := $(patsubst -march=armv8-a+crc,-march=armv8-a+crc -D__ARM_FEATURE_CRC32=1,$(MACHINE_CFLAGS))
+		MACHINE_CFLAGS := $(patsubst -march=armv8-a+crc+crypto,-march=armv8-a+crc+crypto -D__ARM_FEATURE_CRC32=1,$(MACHINE_CFLAGS))
 	endif
 	ifeq ($(shell test $(GCC_VERSION) -le 47 && echo 1), 1)
 		MACHINE_CFLAGS := $(patsubst -march=core-avx-i,-march=corei7-avx,$(MACHINE_CFLAGS))
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH v4 2/4] eal: move gcc version definition to common header
  2017-05-09  9:53 ` [PATCH v4 " Ashwin Sekhar T K
@ 2017-05-09  9:53   ` Ashwin Sekhar T K
  2017-05-09  9:53   ` [PATCH v4 3/4] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
  2017-05-09  9:53   ` [PATCH v4 4/4] test: add tests for arm64 CRC neon versions Ashwin Sekhar T K
  2 siblings, 0 replies; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-05-09  9:53 UTC (permalink / raw)
  To: cristian.dumitrescu, thomas, jasvinder.singh, viktorin,
	jerin.jacob, jianbo.liu
  Cc: dev, Ashwin Sekhar T K

Moved the definition of GCC_VERSION from lib/librte_table/rte_lru.h
to lib/librte_eal/common/include/rte_common.h.

Tested compilation on:
 * arm64 with gcc
 * x86 with gcc and clang

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
Reviewed-by: Jan Viktorin <viktorin@rehivetech.com>
---
v3:
* Moved changes for GCC_VERSION into a separate commit
* Moved GCC_VERSION definition to common header
* Removed the same from rte_lru.h

v4:
* Edited the commit message body according to comments
* Moved definition and usage of GCC_VERSION under RTE_TOOLCHAIN_GCC flag

 lib/librte_eal/common/include/rte_common.h |  6 ++++++
 lib/librte_table/rte_lru.h                 | 10 ++--------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/lib/librte_eal/common/include/rte_common.h b/lib/librte_eal/common/include/rte_common.h
index e057f6e..ff4a12b 100644
--- a/lib/librte_eal/common/include/rte_common.h
+++ b/lib/librte_eal/common/include/rte_common.h
@@ -66,6 +66,12 @@ extern "C" {
 #define RTE_STD_C11
 #endif
 
+/** Define GCC_VERSION **/
+#ifdef RTE_TOOLCHAIN_GCC
+#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 +	\
+		__GNUC_PATCHLEVEL__)
+#endif
+
 #ifdef RTE_ARCH_STRICT_ALIGN
 typedef uint64_t unaligned_uint64_t __attribute__ ((aligned(1)));
 typedef uint32_t unaligned_uint32_t __attribute__ ((aligned(1)));
diff --git a/lib/librte_table/rte_lru.h b/lib/librte_table/rte_lru.h
index e87e062..5cc5966 100644
--- a/lib/librte_table/rte_lru.h
+++ b/lib/librte_table/rte_lru.h
@@ -40,12 +40,6 @@ extern "C" {
 
 #include <stdint.h>
 
-#ifdef __INTEL_COMPILER
-#define GCC_VERSION (0)
-#else
-#define GCC_VERSION (__GNUC__ * 10000+__GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
-#endif
-
 #ifndef RTE_TABLE_HASH_LRU_STRATEGY
 #ifdef __SSE4_2__
 #define RTE_TABLE_HASH_LRU_STRATEGY                        2
@@ -120,7 +114,7 @@ do {									\
 
 #elif RTE_TABLE_HASH_LRU_STRATEGY == 2
 
-#if GCC_VERSION > 40306
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION > 40306)
 #include <x86intrin.h>
 #else
 #include <emmintrin.h>
@@ -166,7 +160,7 @@ do {									\
 
 #elif RTE_TABLE_HASH_LRU_STRATEGY == 3
 
-#if GCC_VERSION > 40306
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION > 40306)
 #include <x86intrin.h>
 #else
 #include <emmintrin.h>
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH v4 3/4] net: add arm64 neon version of CRC compute APIs
  2017-05-09  9:53 ` [PATCH v4 " Ashwin Sekhar T K
  2017-05-09  9:53   ` [PATCH v4 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
@ 2017-05-09  9:53   ` Ashwin Sekhar T K
  2017-05-12  5:51     ` Jianbo Liu
  2017-05-09  9:53   ` [PATCH v4 4/4] test: add tests for arm64 CRC neon versions Ashwin Sekhar T K
  2 siblings, 1 reply; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-05-09  9:53 UTC (permalink / raw)
  To: cristian.dumitrescu, thomas, jasvinder.singh, viktorin,
	jerin.jacob, jianbo.liu
  Cc: dev, Ashwin Sekhar T K

Added CRC compute APIs for arm64 utilizing the pmull
capability

Added new file net_crc_neon.h to hold the arm64 pmull
CRC implementation

Verified the changes with crc_autotest unit test case

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
---
v2:
* Fixed merge conflict in MAINTAINERS

v3:
* Moved feature detection changes and GCC_VERSION definition
  changes to separate commit
* Replaced usage of assert() with RTE_ASSERT()
* Made the comments in rte_vect.h more positive in sense

v4:
* Rebased on top of latest commit

 MAINTAINERS                                       |   1 +
 lib/librte_eal/common/include/arch/arm/rte_vect.h |  28 ++
 lib/librte_net/net_crc_neon.h                     | 357 ++++++++++++++++++++++
 lib/librte_net/rte_net_crc.c                      |  34 ++-
 lib/librte_net/rte_net_crc.h                      |   2 +
 5 files changed, 416 insertions(+), 6 deletions(-)
 create mode 100644 lib/librte_net/net_crc_neon.h

diff --git a/MAINTAINERS b/MAINTAINERS
index b6495d2..66d64c2 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -147,6 +147,7 @@ F: lib/librte_eal/common/include/arch/arm/*_64.h
 F: lib/librte_acl/acl_run_neon.*
 F: lib/librte_lpm/rte_lpm_neon.h
 F: lib/librte_hash/rte*_arm64.h
+F: lib/librte_net/net_crc_neon.h
 F: drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
 F: drivers/net/i40e/i40e_rxtx_vec_neon.c
 F: drivers/net/virtio/virtio_rxtx_simple_neon.c
diff --git a/lib/librte_eal/common/include/arch/arm/rte_vect.h b/lib/librte_eal/common/include/arch/arm/rte_vect.h
index 4107c99..b215cc9 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_vect.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_vect.h
@@ -35,6 +35,7 @@
 
 #include <stdint.h>
 #include "generic/rte_vect.h"
+#include "rte_debug.h"
 #include "arm_neon.h"
 
 #ifdef __cplusplus
@@ -78,6 +79,33 @@ vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
 }
 #endif
 
+#if (GCC_VERSION < 70000)
+/* NEON intrinsic vreinterpretq_u64_p128() is supported since GCC version 7 */
+static inline uint64x2_t
+vreinterpretq_u64_p128(poly128_t x)
+{
+	return (uint64x2_t)x;
+}
+
+/* NEON intrinsic vreinterpretq_p64_u64() is supported since GCC version 7 */
+static inline poly64x2_t
+vreinterpretq_p64_u64(uint64x2_t x)
+{
+	return (poly64x2_t)x;
+}
+
+/* NEON intrinsic vgetq_lane_p64() is supported since GCC version 7 */
+static inline poly64_t
+vgetq_lane_p64(poly64x2_t x, const int lane)
+{
+	RTE_ASSERT(lane >= 0 && lane <= 1);
+
+	poly64_t *p = (poly64_t *)&x;
+
+	return p[lane];
+}
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_net/net_crc_neon.h b/lib/librte_net/net_crc_neon.h
new file mode 100644
index 0000000..05120a7
--- /dev/null
+++ b/lib/librte_net/net_crc_neon.h
@@ -0,0 +1,357 @@
+/*
+ *   BSD LICENSE
+ *
+ *   Copyright (C) Cavium networks Ltd. 2017.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Cavium networks nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _NET_CRC_NEON_H_
+#define _NET_CRC_NEON_H_
+
+#include <rte_branch_prediction.h>
+#include <rte_net_crc.h>
+#include <rte_vect.h>
+#include <rte_cpuflags.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** PMULL CRC computation context structure */
+struct crc_pmull_ctx {
+	uint64x2_t rk1_rk2;
+	uint64x2_t rk5_rk6;
+	uint64x2_t rk7_rk8;
+};
+
+struct crc_pmull_ctx crc32_eth_pmull __rte_aligned(16);
+struct crc_pmull_ctx crc16_ccitt_pmull __rte_aligned(16);
+
+static inline uint8x16_t
+extract_vector(uint8x16_t v0, uint8x16_t v1, const int n)
+{
+	switch (n) {
+	case 0: return vextq_u8(v0, v1, 0);
+	case 1: return vextq_u8(v0, v1, 1);
+	case 2: return vextq_u8(v0, v1, 2);
+	case 3: return vextq_u8(v0, v1, 3);
+	case 4: return vextq_u8(v0, v1, 4);
+	case 5: return vextq_u8(v0, v1, 5);
+	case 6: return vextq_u8(v0, v1, 6);
+	case 7: return vextq_u8(v0, v1, 7);
+	case 8: return vextq_u8(v0, v1, 8);
+	case 9: return vextq_u8(v0, v1, 9);
+	case 10: return vextq_u8(v0, v1, 10);
+	case 11: return vextq_u8(v0, v1, 11);
+	case 12: return vextq_u8(v0, v1, 12);
+	case 13: return vextq_u8(v0, v1, 13);
+	case 14: return vextq_u8(v0, v1, 14);
+	case 15: return vextq_u8(v0, v1, 15);
+	}
+	return v1;
+}
+
+/**
+ * Shifts right 128 bit register by specified number of bytes
+ *
+ * @param reg 128 bit value
+ * @param num number of bytes to shift reg by (0-16)
+ *
+ * @return reg << (num * 8)
+ */
+static inline uint64x2_t
+shift_bytes_right(uint64x2_t reg, const unsigned int num)
+{
+	/* Right Shift */
+	return vreinterpretq_u64_u8(extract_vector(
+				vreinterpretq_u8_u64(reg),
+				vdupq_n_u8(0),
+				num));
+}
+
+/**
+ * Shifts left 128 bit register by specified number of bytes
+ *
+ * @param reg 128 bit value
+ * @param num number of bytes to shift reg by (0-16)
+ *
+ * @return reg << (num * 8)
+ */
+static inline uint64x2_t
+shift_bytes_left(uint64x2_t reg, const unsigned int num)
+{
+	/* Left Shift */
+	return vreinterpretq_u64_u8(extract_vector(
+				vdupq_n_u8(0),
+				vreinterpretq_u8_u64(reg),
+				16 - num));
+}
+
+/**
+ * @brief Performs one folding round
+ *
+ * Logically function operates as follows:
+ *     DATA = READ_NEXT_16BYTES();
+ *     F1 = LSB8(FOLD)
+ *     F2 = MSB8(FOLD)
+ *     T1 = CLMUL(F1, RK1)
+ *     T2 = CLMUL(F2, RK2)
+ *     FOLD = XOR(T1, T2, DATA)
+ *
+ * @param data_block 16 byte data block
+ * @param precomp precomputed rk1 constanst
+ * @param fold running 16 byte folded data
+ *
+ * @return New 16 byte folded data
+ */
+static inline uint64x2_t
+crcr32_folding_round(uint64x2_t data_block, uint64x2_t precomp,
+	uint64x2_t fold)
+{
+	uint64x2_t tmp0 = vreinterpretq_u64_p128(vmull_p64(
+			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 1),
+			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+
+	uint64x2_t tmp1 = vreinterpretq_u64_p128(vmull_p64(
+			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 0),
+			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+
+	return veorq_u64(tmp1, veorq_u64(data_block, tmp0));
+}
+
+/**
+ * Performs reduction from 128 bits to 64 bits
+ *
+ * @param data128 128 bits data to be reduced
+ * @param precomp rk5 and rk6 precomputed constants
+ *
+ * @return data reduced to 64 bits
+ */
+static inline uint64x2_t
+crcr32_reduce_128_to_64(uint64x2_t data128,
+	uint64x2_t precomp)
+{
+	uint64x2_t tmp0, tmp1, tmp2;
+
+	/* 64b fold */
+	tmp0 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(data128), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+	tmp1 = shift_bytes_right(data128, 8);
+	tmp0 = veorq_u64(tmp0, tmp1);
+
+	/* 32b fold */
+	tmp2 = shift_bytes_left(tmp0, 4);
+	tmp1 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp2), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+
+	return veorq_u64(tmp1, tmp0);
+}
+
+/**
+ * Performs Barret's reduction from 64 bits to 32 bits
+ *
+ * @param data64 64 bits data to be reduced
+ * @param precomp rk7 precomputed constant
+ *
+ * @return data reduced to 32 bits
+ */
+static inline uint32_t
+crcr32_reduce_64_to_32(uint64x2_t data64,
+	uint64x2_t precomp)
+{
+	static uint32_t mask1[4] __rte_aligned(16) = {
+		0xffffffff, 0xffffffff, 0x00000000, 0x00000000
+	};
+	static uint32_t mask2[4] __rte_aligned(16) = {
+		0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
+	};
+	uint64x2_t tmp0, tmp1, tmp2;
+
+	tmp0 = vandq_u64(data64, vld1q_u64((uint64_t *)mask2));
+
+	tmp1 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp0), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+	tmp1 = veorq_u64(tmp1, tmp0);
+	tmp1 = vandq_u64(tmp1, vld1q_u64((uint64_t *)mask1));
+
+	tmp2 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp1), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+	tmp2 = veorq_u64(tmp2, tmp1);
+	tmp2 = veorq_u64(tmp2, tmp0);
+
+	return vgetq_lane_u32(vreinterpretq_u32_u64(tmp2), 2);
+}
+
+static inline uint32_t
+crc32_eth_calc_pmull(
+	const uint8_t *data,
+	uint32_t data_len,
+	uint32_t crc,
+	const struct crc_pmull_ctx *params)
+{
+	uint64x2_t temp, fold, k;
+	uint32_t n;
+
+	/* Get CRC init value */
+	temp = vreinterpretq_u64_u32(vsetq_lane_u32(crc, vmovq_n_u32(0), 0));
+
+	/**
+	 * Folding all data into single 16 byte data block
+	 * Assumes: fold holds first 16 bytes of data
+	 */
+	if (unlikely(data_len < 32)) {
+		if (unlikely(data_len == 16)) {
+			/* 16 bytes */
+			fold = vld1q_u64((const uint64_t *)data);
+			fold = veorq_u64(fold, temp);
+			goto reduction_128_64;
+		}
+
+		if (unlikely(data_len < 16)) {
+			/* 0 to 15 bytes */
+			uint8_t buffer[16] __rte_aligned(16);
+
+			memset(buffer, 0, sizeof(buffer));
+			memcpy(buffer, data, data_len);
+
+			fold = vld1q_u64((uint64_t *)buffer);
+			fold = veorq_u64(fold, temp);
+			if (unlikely(data_len < 4)) {
+				fold = shift_bytes_left(fold, 8 - data_len);
+				goto barret_reduction;
+			}
+			fold = shift_bytes_left(fold, 16 - data_len);
+			goto reduction_128_64;
+		}
+		/* 17 to 31 bytes */
+		fold = vld1q_u64((const uint64_t *)data);
+		fold = veorq_u64(fold, temp);
+		n = 16;
+		k = params->rk1_rk2;
+		goto partial_bytes;
+	}
+
+	/** At least 32 bytes in the buffer */
+	/** Apply CRC initial value */
+	fold = vld1q_u64((const uint64_t *)data);
+	fold = veorq_u64(fold, temp);
+
+	/** Main folding loop - the last 16 bytes is processed separately */
+	k = params->rk1_rk2;
+	for (n = 16; (n + 16) <= data_len; n += 16) {
+		temp = vld1q_u64((const uint64_t *)&data[n]);
+		fold = crcr32_folding_round(temp, k, fold);
+	}
+
+partial_bytes:
+	if (likely(n < data_len)) {
+		uint64x2_t last16, a, b, mask;
+		uint32_t rem = data_len & 15;
+
+		last16 = vld1q_u64((const uint64_t *)&data[data_len - 16]);
+		a = shift_bytes_left(fold, 16 - rem);
+		b = shift_bytes_right(fold, rem);
+		mask = shift_bytes_left(vdupq_n_u64(-1), 16 - rem);
+		b = vorrq_u64(b, vandq_u64(mask, last16));
+
+		/* k = rk1 & rk2 */
+		temp = vreinterpretq_u64_p128(vmull_p64(
+				vgetq_lane_p64(vreinterpretq_p64_u64(a), 1),
+				vgetq_lane_p64(vreinterpretq_p64_u64(k), 0)));
+		fold = vreinterpretq_u64_p128(vmull_p64(
+				vgetq_lane_p64(vreinterpretq_p64_u64(a), 0),
+				vgetq_lane_p64(vreinterpretq_p64_u64(k), 1)));
+		fold = veorq_u64(fold, temp);
+		fold = veorq_u64(fold, b);
+	}
+
+	/** Reduction 128 -> 32 Assumes: fold holds 128bit folded data */
+reduction_128_64:
+	k = params->rk5_rk6;
+	fold = crcr32_reduce_128_to_64(fold, k);
+
+barret_reduction:
+	k = params->rk7_rk8;
+	n = crcr32_reduce_64_to_32(fold, k);
+
+	return n;
+}
+
+static inline void
+rte_net_crc_neon_init(void)
+{
+	/* Initialize CRC16 data */
+	uint64_t ccitt_k1_k2[2] = {0x189aeLLU, 0x8e10LLU};
+	uint64_t ccitt_k5_k6[2] = {0x189aeLLU, 0x114aaLLU};
+	uint64_t ccitt_k7_k8[2] = {0x11c581910LLU, 0x10811LLU};
+
+	/* Initialize CRC32 data */
+	uint64_t eth_k1_k2[2] = {0xccaa009eLLU, 0x1751997d0LLU};
+	uint64_t eth_k5_k6[2] = {0xccaa009eLLU, 0x163cd6124LLU};
+	uint64_t eth_k7_k8[2] = {0x1f7011640LLU, 0x1db710641LLU};
+
+	/** Save the params in context structure */
+	crc16_ccitt_pmull.rk1_rk2 = vld1q_u64(ccitt_k1_k2);
+	crc16_ccitt_pmull.rk5_rk6 = vld1q_u64(ccitt_k5_k6);
+	crc16_ccitt_pmull.rk7_rk8 = vld1q_u64(ccitt_k7_k8);
+
+	/** Save the params in context structure */
+	crc32_eth_pmull.rk1_rk2 = vld1q_u64(eth_k1_k2);
+	crc32_eth_pmull.rk5_rk6 = vld1q_u64(eth_k5_k6);
+	crc32_eth_pmull.rk7_rk8 = vld1q_u64(eth_k7_k8);
+}
+
+static inline uint32_t
+rte_crc16_ccitt_neon_handler(const uint8_t *data,
+	uint32_t data_len)
+{
+	return (uint16_t)~crc32_eth_calc_pmull(data,
+		data_len,
+		0xffff,
+		&crc16_ccitt_pmull);
+}
+
+static inline uint32_t
+rte_crc32_eth_neon_handler(const uint8_t *data,
+	uint32_t data_len)
+{
+	return ~crc32_eth_calc_pmull(data,
+		data_len,
+		0xffffffffUL,
+		&crc32_eth_pmull);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _NET_CRC_NEON_H_ */
diff --git a/lib/librte_net/rte_net_crc.c b/lib/librte_net/rte_net_crc.c
index 9d1ee63..be65f34 100644
--- a/lib/librte_net/rte_net_crc.c
+++ b/lib/librte_net/rte_net_crc.c
@@ -43,10 +43,16 @@
 	&& defined(RTE_MACHINE_CPUFLAG_SSE4_2)		\
 	&& defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ)
 #define X86_64_SSE42_PCLMULQDQ     1
+#elif defined(RTE_ARCH_ARM64)
+#if defined(RTE_MACHINE_CPUFLAG_PMULL)
+#define ARM64_NEON_PMULL           1
+#endif
 #endif
 
 #ifdef X86_64_SSE42_PCLMULQDQ
 #include <net_crc_sse.h>
+#elif defined(ARM64_NEON_PMULL)
+#include <net_crc_neon.h>
 #endif
 
 /* crc tables */
@@ -74,6 +80,11 @@ static rte_net_crc_handler handlers_sse42[] = {
 	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_sse42_handler,
 	[RTE_NET_CRC32_ETH] = rte_crc32_eth_sse42_handler,
 };
+#elif defined(ARM64_NEON_PMULL)
+static rte_net_crc_handler handlers_neon[] = {
+	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_neon_handler,
+	[RTE_NET_CRC32_ETH] = rte_crc32_eth_neon_handler,
+};
 #endif
 
 /**
@@ -162,14 +173,20 @@ void
 rte_net_crc_set_alg(enum rte_net_crc_alg alg)
 {
 	switch (alg) {
-	case RTE_NET_CRC_SSE42:
 #ifdef X86_64_SSE42_PCLMULQDQ
+	case RTE_NET_CRC_SSE42:
 		handlers = handlers_sse42;
-#else
-		alg = RTE_NET_CRC_SCALAR;
-#endif
 		break;
+#elif defined(ARM64_NEON_PMULL)
+	case RTE_NET_CRC_NEON:
+		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
+			handlers = handlers_neon;
+			break;
+		}
+		//-fallthrough
+#endif
 	case RTE_NET_CRC_SCALAR:
+		//-fallthrough
 	default:
 		handlers = handlers_scalar;
 		break;
@@ -199,8 +216,13 @@ rte_net_crc_init(void)
 	rte_net_crc_scalar_init();
 
 #ifdef X86_64_SSE42_PCLMULQDQ
-		alg = RTE_NET_CRC_SSE42;
-		rte_net_crc_sse42_init();
+	alg = RTE_NET_CRC_SSE42;
+	rte_net_crc_sse42_init();
+#elif defined(ARM64_NEON_PMULL)
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
+		alg = RTE_NET_CRC_NEON;
+		rte_net_crc_neon_init();
+	}
 #endif
 
 	rte_net_crc_set_alg(alg);
diff --git a/lib/librte_net/rte_net_crc.h b/lib/librte_net/rte_net_crc.h
index d22286c..d01cf4b 100644
--- a/lib/librte_net/rte_net_crc.h
+++ b/lib/librte_net/rte_net_crc.h
@@ -57,6 +57,7 @@ enum rte_net_crc_type {
 enum rte_net_crc_alg {
 	RTE_NET_CRC_SCALAR = 0,
 	RTE_NET_CRC_SSE42,
+	RTE_NET_CRC_NEON,
 };
 
 /**
@@ -68,6 +69,7 @@ enum rte_net_crc_alg {
  *   This parameter is used to select the CRC implementation version.
  *   - RTE_NET_CRC_SCALAR
  *   - RTE_NET_CRC_SSE42 (Use 64-bit SSE4.2 intrinsic)
+ *   - RTE_NET_CRC_NEON (Use ARM Neon intrinsic)
  */
 void
 rte_net_crc_set_alg(enum rte_net_crc_alg alg);
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* Re: [PATCH v4 3/4] net: add arm64 neon version of CRC compute APIs
  2017-05-09  9:53   ` [PATCH v4 3/4] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
@ 2017-05-12  5:51     ` Jianbo Liu
  2017-05-12  7:25       ` Sekhar, Ashwin
  0 siblings, 1 reply; 33+ messages in thread
From: Jianbo Liu @ 2017-05-12  5:51 UTC (permalink / raw)
  To: Ashwin Sekhar T K
  Cc: cristian.dumitrescu, thomas, jasvinder.singh, Jan Viktorin,
	Jerin Jacob, dev

On 9 May 2017 at 17:53, Ashwin Sekhar T K
<ashwin.sekhar@caviumnetworks.com> wrote:
> Added CRC compute APIs for arm64 utilizing the pmull
> capability
>
> Added new file net_crc_neon.h to hold the arm64 pmull
> CRC implementation
>
> Verified the changes with crc_autotest unit test case
>
> Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
> ---
> v2:
> * Fixed merge conflict in MAINTAINERS
>
> v3:
> * Moved feature detection changes and GCC_VERSION definition
>   changes to separate commit
> * Replaced usage of assert() with RTE_ASSERT()
> * Made the comments in rte_vect.h more positive in sense
>
> v4:
> * Rebased on top of latest commit
>
>  MAINTAINERS                                       |   1 +
>  lib/librte_eal/common/include/arch/arm/rte_vect.h |  28 ++
>  lib/librte_net/net_crc_neon.h                     | 357 ++++++++++++++++++++++
>  lib/librte_net/rte_net_crc.c                      |  34 ++-
>  lib/librte_net/rte_net_crc.h                      |   2 +
>  5 files changed, 416 insertions(+), 6 deletions(-)
>  create mode 100644 lib/librte_net/net_crc_neon.h
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index b6495d2..66d64c2 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -147,6 +147,7 @@ F: lib/librte_eal/common/include/arch/arm/*_64.h
>  F: lib/librte_acl/acl_run_neon.*
>  F: lib/librte_lpm/rte_lpm_neon.h
>  F: lib/librte_hash/rte*_arm64.h
> +F: lib/librte_net/net_crc_neon.h
>  F: drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
>  F: drivers/net/i40e/i40e_rxtx_vec_neon.c
>  F: drivers/net/virtio/virtio_rxtx_simple_neon.c
> diff --git a/lib/librte_eal/common/include/arch/arm/rte_vect.h b/lib/librte_eal/common/include/arch/arm/rte_vect.h
> index 4107c99..b215cc9 100644
> --- a/lib/librte_eal/common/include/arch/arm/rte_vect.h
> +++ b/lib/librte_eal/common/include/arch/arm/rte_vect.h
> @@ -35,6 +35,7 @@
>
>  #include <stdint.h>
>  #include "generic/rte_vect.h"
> +#include "rte_debug.h"
>  #include "arm_neon.h"
>
>  #ifdef __cplusplus
> @@ -78,6 +79,33 @@ vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
>  }
>  #endif
>
> +#if (GCC_VERSION < 70000)
> +/* NEON intrinsic vreinterpretq_u64_p128() is supported since GCC version 7 */
> +static inline uint64x2_t
> +vreinterpretq_u64_p128(poly128_t x)
> +{
> +       return (uint64x2_t)x;
> +}
> +
> +/* NEON intrinsic vreinterpretq_p64_u64() is supported since GCC version 7 */
> +static inline poly64x2_t
> +vreinterpretq_p64_u64(uint64x2_t x)
> +{
> +       return (poly64x2_t)x;
> +}
> +
> +/* NEON intrinsic vgetq_lane_p64() is supported since GCC version 7 */
> +static inline poly64_t
> +vgetq_lane_p64(poly64x2_t x, const int lane)
> +{
> +       RTE_ASSERT(lane >= 0 && lane <= 1);
> +
> +       poly64_t *p = (poly64_t *)&x;
> +
> +       return p[lane];
> +}
> +#endif
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/lib/librte_net/net_crc_neon.h b/lib/librte_net/net_crc_neon.h
> new file mode 100644
> index 0000000..05120a7
> --- /dev/null
> +++ b/lib/librte_net/net_crc_neon.h
> @@ -0,0 +1,357 @@
> +/*
> + *   BSD LICENSE
> + *
> + *   Copyright (C) Cavium networks Ltd. 2017.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Cavium networks nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#ifndef _NET_CRC_NEON_H_
> +#define _NET_CRC_NEON_H_
> +
> +#include <rte_branch_prediction.h>
> +#include <rte_net_crc.h>
> +#include <rte_vect.h>
> +#include <rte_cpuflags.h>
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +/** PMULL CRC computation context structure */
> +struct crc_pmull_ctx {
> +       uint64x2_t rk1_rk2;
> +       uint64x2_t rk5_rk6;
> +       uint64x2_t rk7_rk8;
> +};
> +
> +struct crc_pmull_ctx crc32_eth_pmull __rte_aligned(16);
> +struct crc_pmull_ctx crc16_ccitt_pmull __rte_aligned(16);
> +
> +static inline uint8x16_t
> +extract_vector(uint8x16_t v0, uint8x16_t v1, const int n)
> +{
> +       switch (n) {
> +       case 0: return vextq_u8(v0, v1, 0);
> +       case 1: return vextq_u8(v0, v1, 1);
> +       case 2: return vextq_u8(v0, v1, 2);
> +       case 3: return vextq_u8(v0, v1, 3);
> +       case 4: return vextq_u8(v0, v1, 4);
> +       case 5: return vextq_u8(v0, v1, 5);
> +       case 6: return vextq_u8(v0, v1, 6);
> +       case 7: return vextq_u8(v0, v1, 7);
> +       case 8: return vextq_u8(v0, v1, 8);
> +       case 9: return vextq_u8(v0, v1, 9);
> +       case 10: return vextq_u8(v0, v1, 10);
> +       case 11: return vextq_u8(v0, v1, 11);
> +       case 12: return vextq_u8(v0, v1, 12);
> +       case 13: return vextq_u8(v0, v1, 13);
> +       case 14: return vextq_u8(v0, v1, 14);
> +       case 15: return vextq_u8(v0, v1, 15);
> +       }
> +       return v1;
> +}
> +
> +/**
> + * Shifts right 128 bit register by specified number of bytes
> + *
> + * @param reg 128 bit value
> + * @param num number of bytes to shift reg by (0-16)
> + *
> + * @return reg << (num * 8)
> + */
> +static inline uint64x2_t
> +shift_bytes_right(uint64x2_t reg, const unsigned int num)
> +{
> +       /* Right Shift */
> +       return vreinterpretq_u64_u8(extract_vector(
> +                               vreinterpretq_u8_u64(reg),
> +                               vdupq_n_u8(0),
> +                               num));
> +}
> +
> +/**
> + * Shifts left 128 bit register by specified number of bytes
> + *
> + * @param reg 128 bit value
> + * @param num number of bytes to shift reg by (0-16)
> + *
> + * @return reg << (num * 8)
> + */
> +static inline uint64x2_t
> +shift_bytes_left(uint64x2_t reg, const unsigned int num)
> +{
> +       /* Left Shift */
> +       return vreinterpretq_u64_u8(extract_vector(
> +                               vdupq_n_u8(0),
> +                               vreinterpretq_u8_u64(reg),
> +                               16 - num));
> +}
> +

Can you move shift_bytes_right/shift_bytes_left to rte_vect.h because
they are common functions?

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v4 3/4] net: add arm64 neon version of CRC compute APIs
  2017-05-12  5:51     ` Jianbo Liu
@ 2017-05-12  7:25       ` Sekhar, Ashwin
  2017-05-12  8:49         ` Jianbo Liu
  0 siblings, 1 reply; 33+ messages in thread
From: Sekhar, Ashwin @ 2017-05-12  7:25 UTC (permalink / raw)
  To: jianbo.liu@linaro.org
  Cc: thomas@monjalon.net, jasvinder.singh@intel.com,
	cristian.dumitrescu@intel.com, viktorin@rehivetech.com,
	Jacob,  Jerin, dev@dpdk.org

On Fri, 2017-05-12 at 13:51 +0800, Jianbo Liu wrote:
> On 9 May 2017 at 17:53, Ashwin Sekhar T K
> <ashwin.sekhar@caviumnetworks.com> wrote:
> > 
> > Added CRC compute APIs for arm64 utilizing the pmull
> > capability
> > 
> > Added new file net_crc_neon.h to hold the arm64 pmull
> > CRC implementation
> > 
> > Verified the changes with crc_autotest unit test case
> > 
> > Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
> > ---
> > v2:
> > * Fixed merge conflict in MAINTAINERS
> > 
> > v3:
> > * Moved feature detection changes and GCC_VERSION definition
> >   changes to separate commit
> > * Replaced usage of assert() with RTE_ASSERT()
> > * Made the comments in rte_vect.h more positive in sense
> > 
> > v4:
> > * Rebased on top of latest commit
> > 
> >  MAINTAINERS                                       |   1 +
> >  lib/librte_eal/common/include/arch/arm/rte_vect.h |  28 ++
> >  lib/librte_net/net_crc_neon.h                     | 357
> > ++++++++++++++++++++++
> >  lib/librte_net/rte_net_crc.c                      |  34 ++-
> >  lib/librte_net/rte_net_crc.h                      |   2 +
> >  5 files changed, 416 insertions(+), 6 deletions(-)
> >  create mode 100644 lib/librte_net/net_crc_neon.h
> > 
> > 
...
> > +
> > +struct crc_pmull_ctx crc32_eth_pmull __rte_aligned(16);
> > +struct crc_pmull_ctx crc16_ccitt_pmull __rte_aligned(16);
> > +
> > +static inline uint8x16_t
> > +extract_vector(uint8x16_t v0, uint8x16_t v1, const int n)
> > +{
> > +       switch (n) {
> > +       case 0: return vextq_u8(v0, v1, 0);
> > +       case 1: return vextq_u8(v0, v1, 1);
> > +       case 2: return vextq_u8(v0, v1, 2);
> > +       case 3: return vextq_u8(v0, v1, 3);
> > +       case 4: return vextq_u8(v0, v1, 4);
> > +       case 5: return vextq_u8(v0, v1, 5);
> > +       case 6: return vextq_u8(v0, v1, 6);
> > +       case 7: return vextq_u8(v0, v1, 7);
> > +       case 8: return vextq_u8(v0, v1, 8);
> > +       case 9: return vextq_u8(v0, v1, 9);
> > +       case 10: return vextq_u8(v0, v1, 10);
> > +       case 11: return vextq_u8(v0, v1, 11);
> > +       case 12: return vextq_u8(v0, v1, 12);
> > +       case 13: return vextq_u8(v0, v1, 13);
> > +       case 14: return vextq_u8(v0, v1, 14);
> > +       case 15: return vextq_u8(v0, v1, 15);
> > +       }
> > +       return v1;
> > +}
> > +
> > +/**
> > + * Shifts right 128 bit register by specified number of bytes
> > + *
> > + * @param reg 128 bit value
> > + * @param num number of bytes to shift reg by (0-16)
> > + *
> > + * @return reg << (num * 8)
> > + */
> > +static inline uint64x2_t
> > +shift_bytes_right(uint64x2_t reg, const unsigned int num)
> > +{
> > +       /* Right Shift */
> > +       return vreinterpretq_u64_u8(extract_vector(
> > +                               vreinterpretq_u8_u64(reg),
> > +                               vdupq_n_u8(0),
> > +                               num));
> > +}
> > +
> > +/**
> > + * Shifts left 128 bit register by specified number of bytes
> > + *
> > + * @param reg 128 bit value
> > + * @param num number of bytes to shift reg by (0-16)
> > + *
> > + * @return reg << (num * 8)
> > + */
> > +static inline uint64x2_t
> > +shift_bytes_left(uint64x2_t reg, const unsigned int num)
> > +{
> > +       /* Left Shift */
> > +       return vreinterpretq_u64_u8(extract_vector(
> > +                               vdupq_n_u8(0),
> > +                               vreinterpretq_u8_u64(reg),
> > +                               16 - num));
> > +}
> > +
> Can you move shift_bytes_right/shift_bytes_left to rte_vect.h because
> they are common functions?
These are not really common functions. I dont think it will have a
wider usage as its shifting by bytes and not by bits.

In x86 case also, xmm_shift_left is not made a common function.

Moreover, I have not tested the behaviour of these functions when the
shift amt is (< 0) or (> 16) as these cases will never arise in the CRC
code.

Thanks
Ashwin

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v4 3/4] net: add arm64 neon version of CRC compute APIs
  2017-05-12  7:25       ` Sekhar, Ashwin
@ 2017-05-12  8:49         ` Jianbo Liu
  2017-05-12  8:56           ` Sekhar, Ashwin
  0 siblings, 1 reply; 33+ messages in thread
From: Jianbo Liu @ 2017-05-12  8:49 UTC (permalink / raw)
  To: Sekhar, Ashwin
  Cc: thomas@monjalon.net, jasvinder.singh@intel.com,
	cristian.dumitrescu@intel.com, viktorin@rehivetech.com,
	Jacob, Jerin, dev@dpdk.org

On 12 May 2017 at 15:25, Sekhar, Ashwin <Ashwin.Sekhar@cavium.com> wrote:
> On Fri, 2017-05-12 at 13:51 +0800, Jianbo Liu wrote:
>> On 9 May 2017 at 17:53, Ashwin Sekhar T K
>> <ashwin.sekhar@caviumnetworks.com> wrote:
>> >
>> > Added CRC compute APIs for arm64 utilizing the pmull
>> > capability
>> >
>> > Added new file net_crc_neon.h to hold the arm64 pmull
>> > CRC implementation
>> >
>> > Verified the changes with crc_autotest unit test case
>> >
>> > Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
>> > ---
>> > v2:
>> > * Fixed merge conflict in MAINTAINERS
>> >
>> > v3:
>> > * Moved feature detection changes and GCC_VERSION definition
>> >   changes to separate commit
>> > * Replaced usage of assert() with RTE_ASSERT()
>> > * Made the comments in rte_vect.h more positive in sense
>> >
>> > v4:
>> > * Rebased on top of latest commit
>> >
>> >  MAINTAINERS                                       |   1 +
>> >  lib/librte_eal/common/include/arch/arm/rte_vect.h |  28 ++
>> >  lib/librte_net/net_crc_neon.h                     | 357
>> > ++++++++++++++++++++++
>> >  lib/librte_net/rte_net_crc.c                      |  34 ++-
>> >  lib/librte_net/rte_net_crc.h                      |   2 +
>> >  5 files changed, 416 insertions(+), 6 deletions(-)
>> >  create mode 100644 lib/librte_net/net_crc_neon.h
>> >
>> >
> ...
>> > +
>> > +struct crc_pmull_ctx crc32_eth_pmull __rte_aligned(16);
>> > +struct crc_pmull_ctx crc16_ccitt_pmull __rte_aligned(16);
>> > +
>> > +static inline uint8x16_t
>> > +extract_vector(uint8x16_t v0, uint8x16_t v1, const int n)
>> > +{
>> > +       switch (n) {
>> > +       case 0: return vextq_u8(v0, v1, 0);
>> > +       case 1: return vextq_u8(v0, v1, 1);
>> > +       case 2: return vextq_u8(v0, v1, 2);
>> > +       case 3: return vextq_u8(v0, v1, 3);
>> > +       case 4: return vextq_u8(v0, v1, 4);
>> > +       case 5: return vextq_u8(v0, v1, 5);
>> > +       case 6: return vextq_u8(v0, v1, 6);
>> > +       case 7: return vextq_u8(v0, v1, 7);
>> > +       case 8: return vextq_u8(v0, v1, 8);
>> > +       case 9: return vextq_u8(v0, v1, 9);
>> > +       case 10: return vextq_u8(v0, v1, 10);
>> > +       case 11: return vextq_u8(v0, v1, 11);
>> > +       case 12: return vextq_u8(v0, v1, 12);
>> > +       case 13: return vextq_u8(v0, v1, 13);
>> > +       case 14: return vextq_u8(v0, v1, 14);
>> > +       case 15: return vextq_u8(v0, v1, 15);
>> > +       }
>> > +       return v1;
>> > +}
>> > +
>> > +/**
>> > + * Shifts right 128 bit register by specified number of bytes
>> > + *
>> > + * @param reg 128 bit value
>> > + * @param num number of bytes to shift reg by (0-16)
>> > + *
>> > + * @return reg << (num * 8)
>> > + */
>> > +static inline uint64x2_t
>> > +shift_bytes_right(uint64x2_t reg, const unsigned int num)
>> > +{
>> > +       /* Right Shift */
>> > +       return vreinterpretq_u64_u8(extract_vector(
>> > +                               vreinterpretq_u8_u64(reg),
>> > +                               vdupq_n_u8(0),
>> > +                               num));
>> > +}
>> > +
>> > +/**
>> > + * Shifts left 128 bit register by specified number of bytes
>> > + *
>> > + * @param reg 128 bit value
>> > + * @param num number of bytes to shift reg by (0-16)
>> > + *
>> > + * @return reg << (num * 8)
>> > + */
>> > +static inline uint64x2_t
>> > +shift_bytes_left(uint64x2_t reg, const unsigned int num)
>> > +{
>> > +       /* Left Shift */
>> > +       return vreinterpretq_u64_u8(extract_vector(
>> > +                               vdupq_n_u8(0),
>> > +                               vreinterpretq_u8_u64(reg),
>> > +                               16 - num));
>> > +}
>> > +
>> Can you move shift_bytes_right/shift_bytes_left to rte_vect.h because
>> they are common functions?
> These are not really common functions. I dont think it will have a
> wider usage as its shifting by bytes and not by bits.
>

I think these shifting may be used by other functions.
For example, to replace  _mm_srli_si128.

> In x86 case also, xmm_shift_left is not made a common function.
>

But its counterpart right shifting (_mm_srli_si128) is...

> Moreover, I have not tested the behaviour of these functions when the
> shift amt is (< 0) or (> 16) as these cases will never arise in the CRC
> code.
>

You can define thee functions according to current requirement.
And I don't think this parameter can be <0 or > 16.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v4 3/4] net: add arm64 neon version of CRC compute APIs
  2017-05-12  8:49         ` Jianbo Liu
@ 2017-05-12  8:56           ` Sekhar, Ashwin
  0 siblings, 0 replies; 33+ messages in thread
From: Sekhar, Ashwin @ 2017-05-12  8:56 UTC (permalink / raw)
  To: Sekhar, Ashwin, jianbo.liu@linaro.org
  Cc: Jacob,  Jerin, thomas@monjalon.net, jasvinder.singh@intel.com,
	cristian.dumitrescu@intel.com, viktorin@rehivetech.com,
	dev@dpdk.org

On Fri, 2017-05-12 at 16:49 +0800, Jianbo Liu wrote:
> On 12 May 2017 at 15:25, Sekhar, Ashwin <Ashwin.Sekhar@cavium.com>
> wrote:
> > 
> > On Fri, 2017-05-12 at 13:51 +0800, Jianbo Liu wrote:
> > > 
> > > On 9 May 2017 at 17:53, Ashwin Sekhar T K
> > > <ashwin.sekhar@caviumnetworks.com> wrote:
> > > > 
> > > > 
> > > > Added CRC compute APIs for arm64 utilizing the pmull
> > > > capability
> > > > 
> > > > Added new file net_crc_neon.h to hold the arm64 pmull
> > > > CRC implementation
> > > > 
> > > > Verified the changes with crc_autotest unit test case
> > > > 
> > > > Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.
> > > > com>
> > > > ---
> > > > v2:
> > > > * Fixed merge conflict in MAINTAINERS
> > > > 
> > > > v3:
> > > > * Moved feature detection changes and GCC_VERSION definition
> > > >   changes to separate commit
> > > > * Replaced usage of assert() with RTE_ASSERT()
> > > > * Made the comments in rte_vect.h more positive in sense
> > > > 
> > > > v4:
> > > > * Rebased on top of latest commit
> > > > 
> > > >  MAINTAINERS                                       |   1 +
> > > >  lib/librte_eal/common/include/arch/arm/rte_vect.h |  28 ++
> > > >  lib/librte_net/net_crc_neon.h                     | 357
> > > > ++++++++++++++++++++++
> > > >  lib/librte_net/rte_net_crc.c                      |  34 ++-
> > > >  lib/librte_net/rte_net_crc.h                      |   2 +
> > > >  5 files changed, 416 insertions(+), 6 deletions(-)
> > > >  create mode 100644 lib/librte_net/net_crc_neon.h
> > > > 
> > > > 
> > ...
> > > 
> > > > 
> > > > +
> > > > +struct crc_pmull_ctx crc32_eth_pmull __rte_aligned(16);
> > > > +struct crc_pmull_ctx crc16_ccitt_pmull __rte_aligned(16);
> > > > +
> > > > +static inline uint8x16_t
> > > > +extract_vector(uint8x16_t v0, uint8x16_t v1, const int n)
> > > > +{
> > > > +       switch (n) {
> > > > +       case 0: return vextq_u8(v0, v1, 0);
> > > > +       case 1: return vextq_u8(v0, v1, 1);
> > > > +       case 2: return vextq_u8(v0, v1, 2);
> > > > +       case 3: return vextq_u8(v0, v1, 3);
> > > > +       case 4: return vextq_u8(v0, v1, 4);
> > > > +       case 5: return vextq_u8(v0, v1, 5);
> > > > +       case 6: return vextq_u8(v0, v1, 6);
> > > > +       case 7: return vextq_u8(v0, v1, 7);
> > > > +       case 8: return vextq_u8(v0, v1, 8);
> > > > +       case 9: return vextq_u8(v0, v1, 9);
> > > > +       case 10: return vextq_u8(v0, v1, 10);
> > > > +       case 11: return vextq_u8(v0, v1, 11);
> > > > +       case 12: return vextq_u8(v0, v1, 12);
> > > > +       case 13: return vextq_u8(v0, v1, 13);
> > > > +       case 14: return vextq_u8(v0, v1, 14);
> > > > +       case 15: return vextq_u8(v0, v1, 15);
> > > > +       }
> > > > +       return v1;
> > > > +}
> > > > +
> > > > +/**
> > > > + * Shifts right 128 bit register by specified number of bytes
> > > > + *
> > > > + * @param reg 128 bit value
> > > > + * @param num number of bytes to shift reg by (0-16)
> > > > + *
> > > > + * @return reg << (num * 8)
> > > > + */
> > > > +static inline uint64x2_t
> > > > +shift_bytes_right(uint64x2_t reg, const unsigned int num)
> > > > +{
> > > > +       /* Right Shift */
> > > > +       return vreinterpretq_u64_u8(extract_vector(
> > > > +                               vreinterpretq_u8_u64(reg),
> > > > +                               vdupq_n_u8(0),
> > > > +                               num));
> > > > +}
> > > > +
> > > > +/**
> > > > + * Shifts left 128 bit register by specified number of bytes
> > > > + *
> > > > + * @param reg 128 bit value
> > > > + * @param num number of bytes to shift reg by (0-16)
> > > > + *
> > > > + * @return reg << (num * 8)
> > > > + */
> > > > +static inline uint64x2_t
> > > > +shift_bytes_left(uint64x2_t reg, const unsigned int num)
> > > > +{
> > > > +       /* Left Shift */
> > > > +       return vreinterpretq_u64_u8(extract_vector(
> > > > +                               vdupq_n_u8(0),
> > > > +                               vreinterpretq_u8_u64(reg),
> > > > +                               16 - num));
> > > > +}
> > > > +
> > > Can you move shift_bytes_right/shift_bytes_left to rte_vect.h
> > > because
> > > they are common functions?
> > These are not really common functions. I dont think it will have a
> > wider usage as its shifting by bytes and not by bits.
> > 
> I think these shifting may be used by other functions.
> For example, to replace  _mm_srli_si128.
> 
> > 
> > In x86 case also, xmm_shift_left is not made a common function.
> > 
> But its counterpart right shifting (_mm_srli_si128) is...
> 
> > 
> > Moreover, I have not tested the behaviour of these functions when
> > the
> > shift amt is (< 0) or (> 16) as these cases will never arise in the
> > CRC
> > code.
> > 
> You can define thee functions according to current requirement.
> And I don't think this parameter can be <0 or > 16.

Okay. In that case, I will move it to rte_vect.h.

Ashwin

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH v4 4/4] test: add tests for arm64 CRC neon versions
  2017-05-09  9:53 ` [PATCH v4 " Ashwin Sekhar T K
  2017-05-09  9:53   ` [PATCH v4 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
  2017-05-09  9:53   ` [PATCH v4 3/4] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
@ 2017-05-09  9:53   ` Ashwin Sekhar T K
  2 siblings, 0 replies; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-05-09  9:53 UTC (permalink / raw)
  To: cristian.dumitrescu, thomas, jasvinder.singh, viktorin,
	jerin.jacob, jianbo.liu
  Cc: dev, Ashwin Sekhar T K

Verified the changes with crc_autotest unit test case

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
---
v2:
* Fixed checkpatch errors/warnings

 test/test/test_crc.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/test/test/test_crc.c b/test/test/test_crc.c
index cd5af69..9f2a17d 100644
--- a/test/test/test_crc.c
+++ b/test/test/test_crc.c
@@ -178,6 +178,15 @@ test_crc(void)
 		return ret;
 	}
 
+	/* set CRC neon mode */
+	rte_net_crc_set_alg(RTE_NET_CRC_NEON);
+
+	ret = test_crc_calc();
+	if (ret < 0) {
+		printf("test crc (arm64 neon pmull): failed (%d)\n", ret);
+		return ret;
+	}
+
 	return 0;
 }
 
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH v5 0/4] add arm64 neon version of CRC compute APIs
  2017-04-27 14:06 [PATCH 1/2] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
  2017-05-04  6:56 ` [PATCH v3 1/4] mk: add crypto capability for generic armv8a and thunderx Ashwin Sekhar T K
  2017-05-09  9:53 ` [PATCH v4 " Ashwin Sekhar T K
@ 2017-05-12 10:15 ` Ashwin Sekhar T K
  2017-05-12 10:15   ` [PATCH v5 1/4] mk: add crypto capability for generic armv8a and thunderx Ashwin Sekhar T K
                     ` (3 more replies)
  2017-07-04  9:24 ` [PATCH v6 0/4] add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
  3 siblings, 4 replies; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-05-12 10:15 UTC (permalink / raw)
  To: jerin.jacob, thomas, viktorin, jianbo.liu, jasvinder.singh
  Cc: dev, Ashwin Sekhar T K

This patch series adds arm64 neon version of CRC compute APIs utilizing
the pmull capability (which is available as part of crypto extensions).

 * Patch 1 adds crypto capability in compilation of generic armv8a
   and thunderx targets.
 * Patch 2 moves GCC_VERSION defintion to a more common location as
   it will be used in the Patch 3.
 * Patch 3 adds the arm64 neon implementation of the CRC compute APIs.
 * Patch 4 adds the test case for testing arm64 neon implementation of the
   CRC compute APIs.

v5:
* Moved APIs shift_bytes_left, shift_bytes_right and extract_vector from
  net_crc_neon.h to rte_vect.h and renamed them to vshift_bytes_left,
  vshift_bytes_right and vextract respectively.

v4:
* Rebased on top of latest commit
* Edited the Patch 2 commit message body according to comments
* Moved definition and usage of GCC_VERSION under RTE_TOOLCHAIN_GCC flag

v3:
* Moved feature detection changes and GCC_VERSION definition changes
  to separate commits.
* Replaced usage of assert() with RTE_ASSERT()
* Made the comments in rte_vect.h more positive in sense
* Moved GCC_VERSION definition to common header and removed the same from
  rte_lru.h

v2:
* Fixed merge conflict in MAINTAINERS
* Fixed checkpatch errors/warnings

Ashwin Sekhar T K (4):
  mk: add crypto capability for generic armv8a and thunderx
  eal: move gcc version definition to common header
  net: add arm64 neon version of CRC compute APIs
  test: add tests for arm64 CRC neon versions

 MAINTAINERS                                       |   1 +
 lib/librte_eal/common/include/arch/arm/rte_vect.h |  88 +++++++
 lib/librte_eal/common/include/rte_common.h        |   6 +
 lib/librte_net/net_crc_neon.h                     | 297 ++++++++++++++++++++++
 lib/librte_net/rte_net_crc.c                      |  34 ++-
 lib/librte_net/rte_net_crc.h                      |   2 +
 lib/librte_table/rte_lru.h                        |  10 +-
 mk/machine/armv8a/rte.vars.mk                     |   2 +-
 mk/machine/thunderx/rte.vars.mk                   |   2 +-
 mk/rte.cpuflags.mk                                |   6 +
 mk/toolchain/gcc/rte.toolchain-compat.mk          |   1 +
 test/test/test_crc.c                              |   9 +
 12 files changed, 442 insertions(+), 16 deletions(-)
 create mode 100644 lib/librte_net/net_crc_neon.h

-- 
2.12.2

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH v5 1/4] mk: add crypto capability for generic armv8a and thunderx
  2017-05-12 10:15 ` [PATCH v5 0/4] add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
@ 2017-05-12 10:15   ` Ashwin Sekhar T K
  2017-05-12 10:15   ` [PATCH v5 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-05-12 10:15 UTC (permalink / raw)
  To: jerin.jacob, thomas, viktorin, jianbo.liu, jasvinder.singh
  Cc: dev, Ashwin Sekhar T K

armv8-a has optional CRYPTO extension which adds the
AES, PMULL, SHA1 and SHA2 capabilities. -march=armv8-a+crypto
enables code generation for the ARMv8-A architecture together
with the optional CRYPTO extensions.

Added the following flags to detect the corresponding
capability at compile time.
 * RTE_MACHINE_CPUFLAG_AES
 * RTE_MACHINE_CPUFLAG_PMULL
 * RTE_MACHINE_CPUFLAG_SHA1
 * RTE_MACHINE_CPUFLAG_SHA2

At run-time, the following flags can be used to detect the
capabilities.
 * RTE_CPUFLAG_AES
 * RTE_CPUFLAG_PMULL
 * RTE_CPUFLAG_SHA1
 * RTE_CPUFLAG_SHA2

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
Reviewed-by: Jan Viktorin <viktorin@rehivetech.com>
---
 mk/machine/armv8a/rte.vars.mk            | 2 +-
 mk/machine/thunderx/rte.vars.mk          | 2 +-
 mk/rte.cpuflags.mk                       | 6 ++++++
 mk/toolchain/gcc/rte.toolchain-compat.mk | 1 +
 4 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/mk/machine/armv8a/rte.vars.mk b/mk/machine/armv8a/rte.vars.mk
index d5049e1f1..51966a5b6 100644
--- a/mk/machine/armv8a/rte.vars.mk
+++ b/mk/machine/armv8a/rte.vars.mk
@@ -55,4 +55,4 @@
 # CPU_LDFLAGS =
 # CPU_ASFLAGS =
 
-MACHINE_CFLAGS += -march=armv8-a+crc
+MACHINE_CFLAGS += -march=armv8-a+crc+crypto
diff --git a/mk/machine/thunderx/rte.vars.mk b/mk/machine/thunderx/rte.vars.mk
index ad5a379b0..678410581 100644
--- a/mk/machine/thunderx/rte.vars.mk
+++ b/mk/machine/thunderx/rte.vars.mk
@@ -55,4 +55,4 @@
 # CPU_LDFLAGS =
 # CPU_ASFLAGS =
 
-MACHINE_CFLAGS += -march=armv8-a+crc -mcpu=thunderx
+MACHINE_CFLAGS += -march=armv8-a+crc+crypto -mcpu=thunderx
diff --git a/mk/rte.cpuflags.mk b/mk/rte.cpuflags.mk
index 4288c1470..a813c91f4 100644
--- a/mk/rte.cpuflags.mk
+++ b/mk/rte.cpuflags.mk
@@ -125,6 +125,12 @@ ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRC32),)
 CPUFLAGS += CRC32
 endif
 
+ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRYPTO),)
+CPUFLAGS += AES
+CPUFLAGS += PMULL
+CPUFLAGS += SHA1
+CPUFLAGS += SHA2
+endif
 
 MACHINE_CFLAGS += $(addprefix -DRTE_MACHINE_CPUFLAG_,$(CPUFLAGS))
 
diff --git a/mk/toolchain/gcc/rte.toolchain-compat.mk b/mk/toolchain/gcc/rte.toolchain-compat.mk
index 280dde2a6..01ac7e232 100644
--- a/mk/toolchain/gcc/rte.toolchain-compat.mk
+++ b/mk/toolchain/gcc/rte.toolchain-compat.mk
@@ -60,6 +60,7 @@ else
 #
 	ifeq ($(shell test $(GCC_VERSION) -le 49 && echo 1), 1)
 		MACHINE_CFLAGS := $(patsubst -march=armv8-a+crc,-march=armv8-a+crc -D__ARM_FEATURE_CRC32=1,$(MACHINE_CFLAGS))
+		MACHINE_CFLAGS := $(patsubst -march=armv8-a+crc+crypto,-march=armv8-a+crc+crypto -D__ARM_FEATURE_CRC32=1,$(MACHINE_CFLAGS))
 	endif
 	ifeq ($(shell test $(GCC_VERSION) -le 47 && echo 1), 1)
 		MACHINE_CFLAGS := $(patsubst -march=core-avx-i,-march=corei7-avx,$(MACHINE_CFLAGS))
-- 
2.12.2

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH v5 2/4] eal: move gcc version definition to common header
  2017-05-12 10:15 ` [PATCH v5 0/4] add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
  2017-05-12 10:15   ` [PATCH v5 1/4] mk: add crypto capability for generic armv8a and thunderx Ashwin Sekhar T K
@ 2017-05-12 10:15   ` Ashwin Sekhar T K
  2017-05-15  2:07     ` Jianbo Liu
  2017-07-03 20:51     ` Thomas Monjalon
  2017-05-12 10:15   ` [PATCH v5 3/4] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
  2017-05-12 10:15   ` [PATCH v5 4/4] test: add tests for arm64 CRC neon versions Ashwin Sekhar T K
  3 siblings, 2 replies; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-05-12 10:15 UTC (permalink / raw)
  To: jerin.jacob, thomas, viktorin, jianbo.liu, jasvinder.singh
  Cc: dev, Ashwin Sekhar T K

Moved the definition of GCC_VERSION from lib/librte_table/rte_lru.h
to lib/librte_eal/common/include/rte_common.h.

Tested compilation on:
 * arm64 with gcc
 * x86 with gcc and clang

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
Reviewed-by: Jan Viktorin <viktorin@rehivetech.com>
---
 lib/librte_eal/common/include/rte_common.h |  6 ++++++
 lib/librte_table/rte_lru.h                 | 10 ++--------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/lib/librte_eal/common/include/rte_common.h b/lib/librte_eal/common/include/rte_common.h
index e057f6e21..ff4a12bbe 100644
--- a/lib/librte_eal/common/include/rte_common.h
+++ b/lib/librte_eal/common/include/rte_common.h
@@ -66,6 +66,12 @@ extern "C" {
 #define RTE_STD_C11
 #endif
 
+/** Define GCC_VERSION **/
+#ifdef RTE_TOOLCHAIN_GCC
+#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 +	\
+		__GNUC_PATCHLEVEL__)
+#endif
+
 #ifdef RTE_ARCH_STRICT_ALIGN
 typedef uint64_t unaligned_uint64_t __attribute__ ((aligned(1)));
 typedef uint32_t unaligned_uint32_t __attribute__ ((aligned(1)));
diff --git a/lib/librte_table/rte_lru.h b/lib/librte_table/rte_lru.h
index e87e062d0..5cc596613 100644
--- a/lib/librte_table/rte_lru.h
+++ b/lib/librte_table/rte_lru.h
@@ -40,12 +40,6 @@ extern "C" {
 
 #include <stdint.h>
 
-#ifdef __INTEL_COMPILER
-#define GCC_VERSION (0)
-#else
-#define GCC_VERSION (__GNUC__ * 10000+__GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
-#endif
-
 #ifndef RTE_TABLE_HASH_LRU_STRATEGY
 #ifdef __SSE4_2__
 #define RTE_TABLE_HASH_LRU_STRATEGY                        2
@@ -120,7 +114,7 @@ do {									\
 
 #elif RTE_TABLE_HASH_LRU_STRATEGY == 2
 
-#if GCC_VERSION > 40306
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION > 40306)
 #include <x86intrin.h>
 #else
 #include <emmintrin.h>
@@ -166,7 +160,7 @@ do {									\
 
 #elif RTE_TABLE_HASH_LRU_STRATEGY == 3
 
-#if GCC_VERSION > 40306
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION > 40306)
 #include <x86intrin.h>
 #else
 #include <emmintrin.h>
-- 
2.12.2

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* Re: [PATCH v5 2/4] eal: move gcc version definition to common header
  2017-05-12 10:15   ` [PATCH v5 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
@ 2017-05-15  2:07     ` Jianbo Liu
  2017-07-03 20:51     ` Thomas Monjalon
  1 sibling, 0 replies; 33+ messages in thread
From: Jianbo Liu @ 2017-05-15  2:07 UTC (permalink / raw)
  To: Ashwin Sekhar T K; +Cc: Jerin Jacob, thomas, Jan Viktorin, jasvinder.singh, dev

On 12 May 2017 at 18:15, Ashwin Sekhar T K
<ashwin.sekhar@caviumnetworks.com> wrote:
> Moved the definition of GCC_VERSION from lib/librte_table/rte_lru.h
> to lib/librte_eal/common/include/rte_common.h.
>
> Tested compilation on:
>  * arm64 with gcc
>  * x86 with gcc and clang
>
> Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
> Reviewed-by: Jan Viktorin <viktorin@rehivetech.com>
> ---
>  lib/librte_eal/common/include/rte_common.h |  6 ++++++
>  lib/librte_table/rte_lru.h                 | 10 ++--------
>  2 files changed, 8 insertions(+), 8 deletions(-)
>

Acked-by: Jianbo Liu <jianbo.liu@linaro.org>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v5 2/4] eal: move gcc version definition to common header
  2017-05-12 10:15   ` [PATCH v5 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
  2017-05-15  2:07     ` Jianbo Liu
@ 2017-07-03 20:51     ` Thomas Monjalon
  2017-07-04  8:48       ` Sekhar, Ashwin
  1 sibling, 1 reply; 33+ messages in thread
From: Thomas Monjalon @ 2017-07-03 20:51 UTC (permalink / raw)
  To: Ashwin Sekhar T K; +Cc: dev, jerin.jacob, viktorin, jianbo.liu, jasvinder.singh

12/05/2017 12:15, Ashwin Sekhar T K:
> Moved the definition of GCC_VERSION from lib/librte_table/rte_lru.h
> to lib/librte_eal/common/include/rte_common.h.
> 
> Tested compilation on:
>  * arm64 with gcc
>  * x86 with gcc and clang
> 
> Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
> Reviewed-by: Jan Viktorin <viktorin@rehivetech.com>
> ---
> --- a/lib/librte_eal/common/include/rte_common.h
> +++ b/lib/librte_eal/common/include/rte_common.h
> +/** Define GCC_VERSION **/
> +#ifdef RTE_TOOLCHAIN_GCC
> +#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 +	\
> +		__GNUC_PATCHLEVEL__)
> +#endif
[...]
> --- a/lib/librte_table/rte_lru.h
> +++ b/lib/librte_table/rte_lru.h
> -#ifdef __INTEL_COMPILER
> -#define GCC_VERSION (0)
> -#else
> -#define GCC_VERSION (__GNUC__ * 10000+__GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
> -#endif

The ICC check is lost when moving in rte_common.h.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v5 2/4] eal: move gcc version definition to common header
  2017-07-03 20:51     ` Thomas Monjalon
@ 2017-07-04  8:48       ` Sekhar, Ashwin
  0 siblings, 0 replies; 33+ messages in thread
From: Sekhar, Ashwin @ 2017-07-04  8:48 UTC (permalink / raw)
  To: thomas@monjalon.net
  Cc: jasvinder.singh@intel.com, Jacob,  Jerin, viktorin@rehivetech.com,
	dev@dpdk.org, jianbo.liu@linaro.org

On Mon, 2017-07-03 at 22:51 +0200, Thomas Monjalon wrote:
> 12/05/2017 12:15, Ashwin Sekhar T K:
> > 
> > Moved the definition of GCC_VERSION from lib/librte_table/rte_lru.h
> > to lib/librte_eal/common/include/rte_common.h.
> > 
> > Tested compilation on:
> >  * arm64 with gcc
> >  * x86 with gcc and clang
> > 
> > Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
> > Reviewed-by: Jan Viktorin <viktorin@rehivetech.com>
> > ---
> > --- a/lib/librte_eal/common/include/rte_common.h
> > +++ b/lib/librte_eal/common/include/rte_common.h
> > +/** Define GCC_VERSION **/
> > +#ifdef RTE_TOOLCHAIN_GCC
> > +#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 +	
> > \
> > +		__GNUC_PATCHLEVEL__)
> > +#endif
> [...]
> > 
> > --- a/lib/librte_table/rte_lru.h
> > +++ b/lib/librte_table/rte_lru.h
> > -#ifdef __INTEL_COMPILER
> > -#define GCC_VERSION (0)
> > -#else
> > -#define GCC_VERSION (__GNUC__ * 10000+__GNUC_MINOR__*100 +
> > __GNUC_PATCHLEVEL__)
> > -#endif
> The ICC check is lost when moving in rte_common.h.

All usage of GCC_VERSION is kept under #ifdef RTE_TOOLCHAIN_GCC. So the
ICC check is not required.

Ashwin

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH v5 3/4] net: add arm64 neon version of CRC compute APIs
  2017-05-12 10:15 ` [PATCH v5 0/4] add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
  2017-05-12 10:15   ` [PATCH v5 1/4] mk: add crypto capability for generic armv8a and thunderx Ashwin Sekhar T K
  2017-05-12 10:15   ` [PATCH v5 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
@ 2017-05-12 10:15   ` Ashwin Sekhar T K
  2017-05-15  2:32     ` Jianbo Liu
  2017-07-03 21:06     ` Thomas Monjalon
  2017-05-12 10:15   ` [PATCH v5 4/4] test: add tests for arm64 CRC neon versions Ashwin Sekhar T K
  3 siblings, 2 replies; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-05-12 10:15 UTC (permalink / raw)
  To: jerin.jacob, thomas, viktorin, jianbo.liu, jasvinder.singh
  Cc: dev, Ashwin Sekhar T K

Added CRC compute APIs for arm64 utilizing the pmull
capability.

Added new file net_crc_neon.h to hold the arm64 pmull
CRC implementation.

Added wrappers in rte_vect.h for those neon intrinsics
which are not supported in GCC version < 7.

Verified the changes with crc_autotest unit test case

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
---
 MAINTAINERS                                       |   1 +
 lib/librte_eal/common/include/arch/arm/rte_vect.h |  88 +++++++
 lib/librte_net/net_crc_neon.h                     | 297 ++++++++++++++++++++++
 lib/librte_net/rte_net_crc.c                      |  34 ++-
 lib/librte_net/rte_net_crc.h                      |   2 +
 5 files changed, 416 insertions(+), 6 deletions(-)
 create mode 100644 lib/librte_net/net_crc_neon.h

diff --git a/MAINTAINERS b/MAINTAINERS
index b6495d2b9..66d64c2c9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -147,6 +147,7 @@ F: lib/librte_eal/common/include/arch/arm/*_64.h
 F: lib/librte_acl/acl_run_neon.*
 F: lib/librte_lpm/rte_lpm_neon.h
 F: lib/librte_hash/rte*_arm64.h
+F: lib/librte_net/net_crc_neon.h
 F: drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
 F: drivers/net/i40e/i40e_rxtx_vec_neon.c
 F: drivers/net/virtio/virtio_rxtx_simple_neon.c
diff --git a/lib/librte_eal/common/include/arch/arm/rte_vect.h b/lib/librte_eal/common/include/arch/arm/rte_vect.h
index 4107c9988..55e228a77 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_vect.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_vect.h
@@ -35,6 +35,7 @@
 
 #include <stdint.h>
 #include "generic/rte_vect.h"
+#include "rte_debug.h"
 #include "arm_neon.h"
 
 #ifdef __cplusplus
@@ -78,6 +79,93 @@ vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
 }
 #endif
 
+#if defined(RTE_ARCH_ARM64)
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70000)
+/* NEON intrinsic vreinterpretq_u64_p128() is supported since GCC version 7 */
+static inline uint64x2_t
+vreinterpretq_u64_p128(poly128_t x)
+{
+	return (uint64x2_t)x;
+}
+
+/* NEON intrinsic vreinterpretq_p64_u64() is supported since GCC version 7 */
+static inline poly64x2_t
+vreinterpretq_p64_u64(uint64x2_t x)
+{
+	return (poly64x2_t)x;
+}
+
+/* NEON intrinsic vgetq_lane_p64() is supported since GCC version 7 */
+static inline poly64_t
+vgetq_lane_p64(poly64x2_t x, const int lane)
+{
+	RTE_ASSERT(lane >= 0 && lane <= 1);
+
+	poly64_t *p = (poly64_t *)&x;
+
+	return p[lane];
+}
+#endif
+#endif
+
+/*
+ * If (0 <= index <= 15), then call the ASIMD ext intruction on the
+ * 128 bit regs v0 and v1 with the appropriate index.
+ *
+ * Else returns a zero vector.
+ */
+static inline uint8x16_t
+vextract(uint8x16_t v0, uint8x16_t v1, const int index)
+{
+	switch (index) {
+	case 0: return vextq_u8(v0, v1, 0);
+	case 1: return vextq_u8(v0, v1, 1);
+	case 2: return vextq_u8(v0, v1, 2);
+	case 3: return vextq_u8(v0, v1, 3);
+	case 4: return vextq_u8(v0, v1, 4);
+	case 5: return vextq_u8(v0, v1, 5);
+	case 6: return vextq_u8(v0, v1, 6);
+	case 7: return vextq_u8(v0, v1, 7);
+	case 8: return vextq_u8(v0, v1, 8);
+	case 9: return vextq_u8(v0, v1, 9);
+	case 10: return vextq_u8(v0, v1, 10);
+	case 11: return vextq_u8(v0, v1, 11);
+	case 12: return vextq_u8(v0, v1, 12);
+	case 13: return vextq_u8(v0, v1, 13);
+	case 14: return vextq_u8(v0, v1, 14);
+	case 15: return vextq_u8(v0, v1, 15);
+	}
+	return vdupq_n_u8(0);
+}
+
+/**
+ * Shifts right 128 bit register by specified number of bytes
+ *
+ * Value of shift parameter must be in range 0 - 16
+ */
+static inline uint64x2_t
+vshift_bytes_right(uint64x2_t reg, const unsigned int shift)
+{
+	return vreinterpretq_u64_u8(vextract(
+				vreinterpretq_u8_u64(reg),
+				vdupq_n_u8(0),
+				shift));
+}
+
+/**
+ * Shifts left 128 bit register by specified number of bytes
+ *
+ * Value of shift parameter must be in range 0 - 16
+ */
+static inline uint64x2_t
+vshift_bytes_left(uint64x2_t reg, const unsigned int shift)
+{
+	return vreinterpretq_u64_u8(vextract(
+				vdupq_n_u8(0),
+				vreinterpretq_u8_u64(reg),
+				16 - shift));
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_net/net_crc_neon.h b/lib/librte_net/net_crc_neon.h
new file mode 100644
index 000000000..2be579d6b
--- /dev/null
+++ b/lib/librte_net/net_crc_neon.h
@@ -0,0 +1,297 @@
+/*
+ *   BSD LICENSE
+ *
+ *   Copyright (C) Cavium networks Ltd. 2017.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Cavium networks nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _NET_CRC_NEON_H_
+#define _NET_CRC_NEON_H_
+
+#include <rte_branch_prediction.h>
+#include <rte_net_crc.h>
+#include <rte_vect.h>
+#include <rte_cpuflags.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** PMULL CRC computation context structure */
+struct crc_pmull_ctx {
+	uint64x2_t rk1_rk2;
+	uint64x2_t rk5_rk6;
+	uint64x2_t rk7_rk8;
+};
+
+struct crc_pmull_ctx crc32_eth_pmull __rte_aligned(16);
+struct crc_pmull_ctx crc16_ccitt_pmull __rte_aligned(16);
+
+/**
+ * @brief Performs one folding round
+ *
+ * Logically function operates as follows:
+ *     DATA = READ_NEXT_16BYTES();
+ *     F1 = LSB8(FOLD)
+ *     F2 = MSB8(FOLD)
+ *     T1 = CLMUL(F1, RK1)
+ *     T2 = CLMUL(F2, RK2)
+ *     FOLD = XOR(T1, T2, DATA)
+ *
+ * @param data_block 16 byte data block
+ * @param precomp precomputed rk1 constanst
+ * @param fold running 16 byte folded data
+ *
+ * @return New 16 byte folded data
+ */
+static inline uint64x2_t
+crcr32_folding_round(uint64x2_t data_block, uint64x2_t precomp,
+	uint64x2_t fold)
+{
+	uint64x2_t tmp0 = vreinterpretq_u64_p128(vmull_p64(
+			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 1),
+			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+
+	uint64x2_t tmp1 = vreinterpretq_u64_p128(vmull_p64(
+			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 0),
+			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+
+	return veorq_u64(tmp1, veorq_u64(data_block, tmp0));
+}
+
+/**
+ * Performs reduction from 128 bits to 64 bits
+ *
+ * @param data128 128 bits data to be reduced
+ * @param precomp rk5 and rk6 precomputed constants
+ *
+ * @return data reduced to 64 bits
+ */
+static inline uint64x2_t
+crcr32_reduce_128_to_64(uint64x2_t data128,
+	uint64x2_t precomp)
+{
+	uint64x2_t tmp0, tmp1, tmp2;
+
+	/* 64b fold */
+	tmp0 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(data128), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+	tmp1 = vshift_bytes_right(data128, 8);
+	tmp0 = veorq_u64(tmp0, tmp1);
+
+	/* 32b fold */
+	tmp2 = vshift_bytes_left(tmp0, 4);
+	tmp1 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp2), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+
+	return veorq_u64(tmp1, tmp0);
+}
+
+/**
+ * Performs Barret's reduction from 64 bits to 32 bits
+ *
+ * @param data64 64 bits data to be reduced
+ * @param precomp rk7 precomputed constant
+ *
+ * @return data reduced to 32 bits
+ */
+static inline uint32_t
+crcr32_reduce_64_to_32(uint64x2_t data64,
+	uint64x2_t precomp)
+{
+	static uint32_t mask1[4] __rte_aligned(16) = {
+		0xffffffff, 0xffffffff, 0x00000000, 0x00000000
+	};
+	static uint32_t mask2[4] __rte_aligned(16) = {
+		0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
+	};
+	uint64x2_t tmp0, tmp1, tmp2;
+
+	tmp0 = vandq_u64(data64, vld1q_u64((uint64_t *)mask2));
+
+	tmp1 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp0), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+	tmp1 = veorq_u64(tmp1, tmp0);
+	tmp1 = vandq_u64(tmp1, vld1q_u64((uint64_t *)mask1));
+
+	tmp2 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp1), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+	tmp2 = veorq_u64(tmp2, tmp1);
+	tmp2 = veorq_u64(tmp2, tmp0);
+
+	return vgetq_lane_u32(vreinterpretq_u32_u64(tmp2), 2);
+}
+
+static inline uint32_t
+crc32_eth_calc_pmull(
+	const uint8_t *data,
+	uint32_t data_len,
+	uint32_t crc,
+	const struct crc_pmull_ctx *params)
+{
+	uint64x2_t temp, fold, k;
+	uint32_t n;
+
+	/* Get CRC init value */
+	temp = vreinterpretq_u64_u32(vsetq_lane_u32(crc, vmovq_n_u32(0), 0));
+
+	/**
+	 * Folding all data into single 16 byte data block
+	 * Assumes: fold holds first 16 bytes of data
+	 */
+	if (unlikely(data_len < 32)) {
+		if (unlikely(data_len == 16)) {
+			/* 16 bytes */
+			fold = vld1q_u64((const uint64_t *)data);
+			fold = veorq_u64(fold, temp);
+			goto reduction_128_64;
+		}
+
+		if (unlikely(data_len < 16)) {
+			/* 0 to 15 bytes */
+			uint8_t buffer[16] __rte_aligned(16);
+
+			memset(buffer, 0, sizeof(buffer));
+			memcpy(buffer, data, data_len);
+
+			fold = vld1q_u64((uint64_t *)buffer);
+			fold = veorq_u64(fold, temp);
+			if (unlikely(data_len < 4)) {
+				fold = vshift_bytes_left(fold, 8 - data_len);
+				goto barret_reduction;
+			}
+			fold = vshift_bytes_left(fold, 16 - data_len);
+			goto reduction_128_64;
+		}
+		/* 17 to 31 bytes */
+		fold = vld1q_u64((const uint64_t *)data);
+		fold = veorq_u64(fold, temp);
+		n = 16;
+		k = params->rk1_rk2;
+		goto partial_bytes;
+	}
+
+	/** At least 32 bytes in the buffer */
+	/** Apply CRC initial value */
+	fold = vld1q_u64((const uint64_t *)data);
+	fold = veorq_u64(fold, temp);
+
+	/** Main folding loop - the last 16 bytes is processed separately */
+	k = params->rk1_rk2;
+	for (n = 16; (n + 16) <= data_len; n += 16) {
+		temp = vld1q_u64((const uint64_t *)&data[n]);
+		fold = crcr32_folding_round(temp, k, fold);
+	}
+
+partial_bytes:
+	if (likely(n < data_len)) {
+		uint64x2_t last16, a, b, mask;
+		uint32_t rem = data_len & 15;
+
+		last16 = vld1q_u64((const uint64_t *)&data[data_len - 16]);
+		a = vshift_bytes_left(fold, 16 - rem);
+		b = vshift_bytes_right(fold, rem);
+		mask = vshift_bytes_left(vdupq_n_u64(-1), 16 - rem);
+		b = vorrq_u64(b, vandq_u64(mask, last16));
+
+		/* k = rk1 & rk2 */
+		temp = vreinterpretq_u64_p128(vmull_p64(
+				vgetq_lane_p64(vreinterpretq_p64_u64(a), 1),
+				vgetq_lane_p64(vreinterpretq_p64_u64(k), 0)));
+		fold = vreinterpretq_u64_p128(vmull_p64(
+				vgetq_lane_p64(vreinterpretq_p64_u64(a), 0),
+				vgetq_lane_p64(vreinterpretq_p64_u64(k), 1)));
+		fold = veorq_u64(fold, temp);
+		fold = veorq_u64(fold, b);
+	}
+
+	/** Reduction 128 -> 32 Assumes: fold holds 128bit folded data */
+reduction_128_64:
+	k = params->rk5_rk6;
+	fold = crcr32_reduce_128_to_64(fold, k);
+
+barret_reduction:
+	k = params->rk7_rk8;
+	n = crcr32_reduce_64_to_32(fold, k);
+
+	return n;
+}
+
+static inline void
+rte_net_crc_neon_init(void)
+{
+	/* Initialize CRC16 data */
+	uint64_t ccitt_k1_k2[2] = {0x189aeLLU, 0x8e10LLU};
+	uint64_t ccitt_k5_k6[2] = {0x189aeLLU, 0x114aaLLU};
+	uint64_t ccitt_k7_k8[2] = {0x11c581910LLU, 0x10811LLU};
+
+	/* Initialize CRC32 data */
+	uint64_t eth_k1_k2[2] = {0xccaa009eLLU, 0x1751997d0LLU};
+	uint64_t eth_k5_k6[2] = {0xccaa009eLLU, 0x163cd6124LLU};
+	uint64_t eth_k7_k8[2] = {0x1f7011640LLU, 0x1db710641LLU};
+
+	/** Save the params in context structure */
+	crc16_ccitt_pmull.rk1_rk2 = vld1q_u64(ccitt_k1_k2);
+	crc16_ccitt_pmull.rk5_rk6 = vld1q_u64(ccitt_k5_k6);
+	crc16_ccitt_pmull.rk7_rk8 = vld1q_u64(ccitt_k7_k8);
+
+	/** Save the params in context structure */
+	crc32_eth_pmull.rk1_rk2 = vld1q_u64(eth_k1_k2);
+	crc32_eth_pmull.rk5_rk6 = vld1q_u64(eth_k5_k6);
+	crc32_eth_pmull.rk7_rk8 = vld1q_u64(eth_k7_k8);
+}
+
+static inline uint32_t
+rte_crc16_ccitt_neon_handler(const uint8_t *data,
+	uint32_t data_len)
+{
+	return (uint16_t)~crc32_eth_calc_pmull(data,
+		data_len,
+		0xffff,
+		&crc16_ccitt_pmull);
+}
+
+static inline uint32_t
+rte_crc32_eth_neon_handler(const uint8_t *data,
+	uint32_t data_len)
+{
+	return ~crc32_eth_calc_pmull(data,
+		data_len,
+		0xffffffffUL,
+		&crc32_eth_pmull);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _NET_CRC_NEON_H_ */
diff --git a/lib/librte_net/rte_net_crc.c b/lib/librte_net/rte_net_crc.c
index 9d1ee63fa..be65f34bb 100644
--- a/lib/librte_net/rte_net_crc.c
+++ b/lib/librte_net/rte_net_crc.c
@@ -43,10 +43,16 @@
 	&& defined(RTE_MACHINE_CPUFLAG_SSE4_2)		\
 	&& defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ)
 #define X86_64_SSE42_PCLMULQDQ     1
+#elif defined(RTE_ARCH_ARM64)
+#if defined(RTE_MACHINE_CPUFLAG_PMULL)
+#define ARM64_NEON_PMULL           1
+#endif
 #endif
 
 #ifdef X86_64_SSE42_PCLMULQDQ
 #include <net_crc_sse.h>
+#elif defined(ARM64_NEON_PMULL)
+#include <net_crc_neon.h>
 #endif
 
 /* crc tables */
@@ -74,6 +80,11 @@ static rte_net_crc_handler handlers_sse42[] = {
 	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_sse42_handler,
 	[RTE_NET_CRC32_ETH] = rte_crc32_eth_sse42_handler,
 };
+#elif defined(ARM64_NEON_PMULL)
+static rte_net_crc_handler handlers_neon[] = {
+	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_neon_handler,
+	[RTE_NET_CRC32_ETH] = rte_crc32_eth_neon_handler,
+};
 #endif
 
 /**
@@ -162,14 +173,20 @@ void
 rte_net_crc_set_alg(enum rte_net_crc_alg alg)
 {
 	switch (alg) {
-	case RTE_NET_CRC_SSE42:
 #ifdef X86_64_SSE42_PCLMULQDQ
+	case RTE_NET_CRC_SSE42:
 		handlers = handlers_sse42;
-#else
-		alg = RTE_NET_CRC_SCALAR;
-#endif
 		break;
+#elif defined(ARM64_NEON_PMULL)
+	case RTE_NET_CRC_NEON:
+		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
+			handlers = handlers_neon;
+			break;
+		}
+		//-fallthrough
+#endif
 	case RTE_NET_CRC_SCALAR:
+		//-fallthrough
 	default:
 		handlers = handlers_scalar;
 		break;
@@ -199,8 +216,13 @@ rte_net_crc_init(void)
 	rte_net_crc_scalar_init();
 
 #ifdef X86_64_SSE42_PCLMULQDQ
-		alg = RTE_NET_CRC_SSE42;
-		rte_net_crc_sse42_init();
+	alg = RTE_NET_CRC_SSE42;
+	rte_net_crc_sse42_init();
+#elif defined(ARM64_NEON_PMULL)
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
+		alg = RTE_NET_CRC_NEON;
+		rte_net_crc_neon_init();
+	}
 #endif
 
 	rte_net_crc_set_alg(alg);
diff --git a/lib/librte_net/rte_net_crc.h b/lib/librte_net/rte_net_crc.h
index d22286c6e..d01cf4b47 100644
--- a/lib/librte_net/rte_net_crc.h
+++ b/lib/librte_net/rte_net_crc.h
@@ -57,6 +57,7 @@ enum rte_net_crc_type {
 enum rte_net_crc_alg {
 	RTE_NET_CRC_SCALAR = 0,
 	RTE_NET_CRC_SSE42,
+	RTE_NET_CRC_NEON,
 };
 
 /**
@@ -68,6 +69,7 @@ enum rte_net_crc_alg {
  *   This parameter is used to select the CRC implementation version.
  *   - RTE_NET_CRC_SCALAR
  *   - RTE_NET_CRC_SSE42 (Use 64-bit SSE4.2 intrinsic)
+ *   - RTE_NET_CRC_NEON (Use ARM Neon intrinsic)
  */
 void
 rte_net_crc_set_alg(enum rte_net_crc_alg alg);
-- 
2.12.2

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* Re: [PATCH v5 3/4] net: add arm64 neon version of CRC compute APIs
  2017-05-12 10:15   ` [PATCH v5 3/4] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
@ 2017-05-15  2:32     ` Jianbo Liu
  2017-07-03 21:06     ` Thomas Monjalon
  1 sibling, 0 replies; 33+ messages in thread
From: Jianbo Liu @ 2017-05-15  2:32 UTC (permalink / raw)
  To: Ashwin Sekhar T K; +Cc: Jerin Jacob, thomas, Jan Viktorin, jasvinder.singh, dev

On 12 May 2017 at 18:15, Ashwin Sekhar T K
<ashwin.sekhar@caviumnetworks.com> wrote:
> Added CRC compute APIs for arm64 utilizing the pmull
> capability.
>
> Added new file net_crc_neon.h to hold the arm64 pmull
> CRC implementation.
>
> Added wrappers in rte_vect.h for those neon intrinsics
> which are not supported in GCC version < 7.
>
> Verified the changes with crc_autotest unit test case
>
> Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
> ---
>  MAINTAINERS                                       |   1 +
>  lib/librte_eal/common/include/arch/arm/rte_vect.h |  88 +++++++
>  lib/librte_net/net_crc_neon.h                     | 297 ++++++++++++++++++++++
>  lib/librte_net/rte_net_crc.c                      |  34 ++-
>  lib/librte_net/rte_net_crc.h                      |   2 +
>  5 files changed, 416 insertions(+), 6 deletions(-)
>  create mode 100644 lib/librte_net/net_crc_neon.h
>

Acked-by: Jianbo Liu <jianbo.liu@linaro.org>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v5 3/4] net: add arm64 neon version of CRC compute APIs
  2017-05-12 10:15   ` [PATCH v5 3/4] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
  2017-05-15  2:32     ` Jianbo Liu
@ 2017-07-03 21:06     ` Thomas Monjalon
  1 sibling, 0 replies; 33+ messages in thread
From: Thomas Monjalon @ 2017-07-03 21:06 UTC (permalink / raw)
  To: Ashwin Sekhar T K; +Cc: dev, jerin.jacob, viktorin, jianbo.liu, jasvinder.singh

12/05/2017 12:15, Ashwin Sekhar T K:
> +#elif defined(ARM64_NEON_PMULL)
> +       case RTE_NET_CRC_NEON:
> +               if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
> +                       handlers = handlers_neon;
> +                       break;
> +               }
> +               //-fallthrough
> +#endif
>         case RTE_NET_CRC_SCALAR:
> +               //-fallthrough
>         default:

These fallthrough comments are not in the right coding style.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH v5 4/4] test: add tests for arm64 CRC neon versions
  2017-05-12 10:15 ` [PATCH v5 0/4] add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
                     ` (2 preceding siblings ...)
  2017-05-12 10:15   ` [PATCH v5 3/4] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
@ 2017-05-12 10:15   ` Ashwin Sekhar T K
  3 siblings, 0 replies; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-05-12 10:15 UTC (permalink / raw)
  To: jerin.jacob, thomas, viktorin, jianbo.liu, jasvinder.singh
  Cc: dev, Ashwin Sekhar T K

Verified the changes with crc_autotest unit test case

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
---
 test/test/test_crc.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/test/test/test_crc.c b/test/test/test_crc.c
index cd5af69a2..9f2a17d49 100644
--- a/test/test/test_crc.c
+++ b/test/test/test_crc.c
@@ -178,6 +178,15 @@ test_crc(void)
 		return ret;
 	}
 
+	/* set CRC neon mode */
+	rte_net_crc_set_alg(RTE_NET_CRC_NEON);
+
+	ret = test_crc_calc();
+	if (ret < 0) {
+		printf("test crc (arm64 neon pmull): failed (%d)\n", ret);
+		return ret;
+	}
+
 	return 0;
 }
 
-- 
2.12.2

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH v6 0/4] add arm64 neon version of CRC compute APIs
  2017-04-27 14:06 [PATCH 1/2] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
                   ` (2 preceding siblings ...)
  2017-05-12 10:15 ` [PATCH v5 0/4] add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
@ 2017-07-04  9:24 ` Ashwin Sekhar T K
  2017-07-04  9:24   ` [PATCH v6 1/4] mk: add crypto capability for generic armv8a and thunderx Ashwin Sekhar T K
                     ` (4 more replies)
  3 siblings, 5 replies; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-07-04  9:24 UTC (permalink / raw)
  To: jerin.jacob, viktorin, jianbo.liu, jasvinder.singh, thomas
  Cc: dev, Ashwin Sekhar T K

This patch series adds arm64 neon version of CRC compute APIs utilizing
the pmull capability (which is available as part of crypto extensions).

 * Patch 1 adds crypto capability in compilation of generic armv8a
   and thunderx targets.
 * Patch 2 moves GCC_VERSION defintion to a more common location as
   it will be used in the Patch 3.
 * Patch 3 adds the arm64 neon implementation of the CRC compute APIs.
 * Patch 4 adds the test case for testing arm64 neon implementation of the
   CRC compute APIs.

v6:
* Corrected the fallthrough comment style.
* Rebased to DPDK tip.

v5:
* Moved APIs shift_bytes_left, shift_bytes_right and extract_vector from
  net_crc_neon.h to rte_vect.h and renamed them to vshift_bytes_left,
  vshift_bytes_right and vextract respectively.

v4:
* Rebased on top of latest commit
* Edited the Patch 2 commit message body according to comments
* Moved definition and usage of GCC_VERSION under RTE_TOOLCHAIN_GCC flag

v3:
* Moved feature detection changes and GCC_VERSION definition changes
  to separate commits.
* Replaced usage of assert() with RTE_ASSERT()
* Made the comments in rte_vect.h more positive in sense
* Moved GCC_VERSION definition to common header and removed the same from
  rte_lru.h

v2:
* Fixed merge conflict in MAINTAINERS
* Fixed checkpatch errors/warnings

Ashwin Sekhar T K (4):
  mk: add crypto capability for generic armv8a and thunderx
  eal: move gcc version definition to common header
  net: add arm64 neon version of CRC compute APIs
  test: add tests for arm64 CRC neon versions

 MAINTAINERS                                       |   1 +
 lib/librte_eal/common/include/arch/arm/rte_vect.h |  88 +++++++
 lib/librte_eal/common/include/rte_common.h        |   6 +
 lib/librte_net/net_crc_neon.h                     | 297 ++++++++++++++++++++++
 lib/librte_net/rte_net_crc.c                      |  34 ++-
 lib/librte_net/rte_net_crc.h                      |   2 +
 lib/librte_table/rte_lru_x86.h                    |  10 +-
 mk/machine/armv8a/rte.vars.mk                     |   2 +-
 mk/machine/thunderx/rte.vars.mk                   |   2 +-
 mk/rte.cpuflags.mk                                |   6 +
 mk/toolchain/gcc/rte.toolchain-compat.mk          |   1 +
 test/test/test_crc.c                              |   9 +
 12 files changed, 442 insertions(+), 16 deletions(-)
 create mode 100644 lib/librte_net/net_crc_neon.h

-- 
2.12.2

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH v6 1/4] mk: add crypto capability for generic armv8a and thunderx
  2017-07-04  9:24 ` [PATCH v6 0/4] add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
@ 2017-07-04  9:24   ` Ashwin Sekhar T K
  2017-07-04  9:24   ` [PATCH v6 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-07-04  9:24 UTC (permalink / raw)
  To: jerin.jacob, viktorin, jianbo.liu, jasvinder.singh, thomas
  Cc: dev, Ashwin Sekhar T K

armv8-a has optional CRYPTO extension which adds the
AES, PMULL, SHA1 and SHA2 capabilities. -march=armv8-a+crypto
enables code generation for the ARMv8-A architecture together
with the optional CRYPTO extensions.

Added the following flags to detect the corresponding
capability at compile time.
 * RTE_MACHINE_CPUFLAG_AES
 * RTE_MACHINE_CPUFLAG_PMULL
 * RTE_MACHINE_CPUFLAG_SHA1
 * RTE_MACHINE_CPUFLAG_SHA2

At run-time, the following flags can be used to detect the
capabilities.
 * RTE_CPUFLAG_AES
 * RTE_CPUFLAG_PMULL
 * RTE_CPUFLAG_SHA1
 * RTE_CPUFLAG_SHA2

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
Reviewed-by: Jan Viktorin <viktorin@rehivetech.com>
---
 mk/machine/armv8a/rte.vars.mk            | 2 +-
 mk/machine/thunderx/rte.vars.mk          | 2 +-
 mk/rte.cpuflags.mk                       | 6 ++++++
 mk/toolchain/gcc/rte.toolchain-compat.mk | 1 +
 4 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/mk/machine/armv8a/rte.vars.mk b/mk/machine/armv8a/rte.vars.mk
index d5049e1f1..51966a5b6 100644
--- a/mk/machine/armv8a/rte.vars.mk
+++ b/mk/machine/armv8a/rte.vars.mk
@@ -55,4 +55,4 @@
 # CPU_LDFLAGS =
 # CPU_ASFLAGS =
 
-MACHINE_CFLAGS += -march=armv8-a+crc
+MACHINE_CFLAGS += -march=armv8-a+crc+crypto
diff --git a/mk/machine/thunderx/rte.vars.mk b/mk/machine/thunderx/rte.vars.mk
index ad5a379b0..678410581 100644
--- a/mk/machine/thunderx/rte.vars.mk
+++ b/mk/machine/thunderx/rte.vars.mk
@@ -55,4 +55,4 @@
 # CPU_LDFLAGS =
 # CPU_ASFLAGS =
 
-MACHINE_CFLAGS += -march=armv8-a+crc -mcpu=thunderx
+MACHINE_CFLAGS += -march=armv8-a+crc+crypto -mcpu=thunderx
diff --git a/mk/rte.cpuflags.mk b/mk/rte.cpuflags.mk
index 4288c1470..a813c91f4 100644
--- a/mk/rte.cpuflags.mk
+++ b/mk/rte.cpuflags.mk
@@ -125,6 +125,12 @@ ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRC32),)
 CPUFLAGS += CRC32
 endif
 
+ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRYPTO),)
+CPUFLAGS += AES
+CPUFLAGS += PMULL
+CPUFLAGS += SHA1
+CPUFLAGS += SHA2
+endif
 
 MACHINE_CFLAGS += $(addprefix -DRTE_MACHINE_CPUFLAG_,$(CPUFLAGS))
 
diff --git a/mk/toolchain/gcc/rte.toolchain-compat.mk b/mk/toolchain/gcc/rte.toolchain-compat.mk
index 280dde2a6..01ac7e232 100644
--- a/mk/toolchain/gcc/rte.toolchain-compat.mk
+++ b/mk/toolchain/gcc/rte.toolchain-compat.mk
@@ -60,6 +60,7 @@ else
 #
 	ifeq ($(shell test $(GCC_VERSION) -le 49 && echo 1), 1)
 		MACHINE_CFLAGS := $(patsubst -march=armv8-a+crc,-march=armv8-a+crc -D__ARM_FEATURE_CRC32=1,$(MACHINE_CFLAGS))
+		MACHINE_CFLAGS := $(patsubst -march=armv8-a+crc+crypto,-march=armv8-a+crc+crypto -D__ARM_FEATURE_CRC32=1,$(MACHINE_CFLAGS))
 	endif
 	ifeq ($(shell test $(GCC_VERSION) -le 47 && echo 1), 1)
 		MACHINE_CFLAGS := $(patsubst -march=core-avx-i,-march=corei7-avx,$(MACHINE_CFLAGS))
-- 
2.12.2

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH v6 2/4] eal: move gcc version definition to common header
  2017-07-04  9:24 ` [PATCH v6 0/4] add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
  2017-07-04  9:24   ` [PATCH v6 1/4] mk: add crypto capability for generic armv8a and thunderx Ashwin Sekhar T K
@ 2017-07-04  9:24   ` Ashwin Sekhar T K
  2017-07-04  9:24   ` [PATCH v6 3/4] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
                     ` (2 subsequent siblings)
  4 siblings, 0 replies; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-07-04  9:24 UTC (permalink / raw)
  To: jerin.jacob, viktorin, jianbo.liu, jasvinder.singh, thomas
  Cc: dev, Ashwin Sekhar T K

Moved the definition of GCC_VERSION from lib/librte_table/rte_lru.h
to lib/librte_eal/common/include/rte_common.h.

Tested compilation on:
 * arm64 with gcc
 * x86 with gcc and clang

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
Reviewed-by: Jan Viktorin <viktorin@rehivetech.com>
Acked-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 lib/librte_eal/common/include/rte_common.h |  6 ++++++
 lib/librte_table/rte_lru_x86.h             | 10 ++--------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/lib/librte_eal/common/include/rte_common.h b/lib/librte_eal/common/include/rte_common.h
index 99596de0a..1afc66e3f 100644
--- a/lib/librte_eal/common/include/rte_common.h
+++ b/lib/librte_eal/common/include/rte_common.h
@@ -66,6 +66,12 @@ extern "C" {
 #define RTE_STD_C11
 #endif
 
+/** Define GCC_VERSION **/
+#ifdef RTE_TOOLCHAIN_GCC
+#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 +	\
+		__GNUC_PATCHLEVEL__)
+#endif
+
 #ifdef RTE_ARCH_STRICT_ALIGN
 typedef uint64_t unaligned_uint64_t __attribute__ ((aligned(1)));
 typedef uint32_t unaligned_uint32_t __attribute__ ((aligned(1)));
diff --git a/lib/librte_table/rte_lru_x86.h b/lib/librte_table/rte_lru_x86.h
index 041b538f1..ec9082343 100644
--- a/lib/librte_table/rte_lru_x86.h
+++ b/lib/librte_table/rte_lru_x86.h
@@ -40,12 +40,6 @@ extern "C" {
 
 #include <stdint.h>
 
-#ifdef __INTEL_COMPILER
-#define GCC_VERSION (0)
-#else
-#define GCC_VERSION (__GNUC__ * 10000+__GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
-#endif
-
 #ifndef RTE_TABLE_HASH_LRU_STRATEGY
 #ifdef __SSE4_2__
 #define RTE_TABLE_HASH_LRU_STRATEGY                        2
@@ -56,7 +50,7 @@ extern "C" {
 
 #if RTE_TABLE_HASH_LRU_STRATEGY == 2
 
-#if GCC_VERSION > 40306
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION > 40306)
 #include <x86intrin.h>
 #else
 #include <emmintrin.h>
@@ -100,7 +94,7 @@ do {									\
 
 #elif RTE_TABLE_HASH_LRU_STRATEGY == 3
 
-#if GCC_VERSION > 40306
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION > 40306)
 #include <x86intrin.h>
 #else
 #include <emmintrin.h>
-- 
2.12.2

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH v6 3/4] net: add arm64 neon version of CRC compute APIs
  2017-07-04  9:24 ` [PATCH v6 0/4] add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
  2017-07-04  9:24   ` [PATCH v6 1/4] mk: add crypto capability for generic armv8a and thunderx Ashwin Sekhar T K
  2017-07-04  9:24   ` [PATCH v6 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
@ 2017-07-04  9:24   ` Ashwin Sekhar T K
  2017-07-04 13:53     ` Thomas Monjalon
  2017-07-04  9:24   ` [PATCH v6 4/4] test: add tests for arm64 CRC neon versions Ashwin Sekhar T K
  2017-07-04 13:55   ` [PATCH v6 0/4] add arm64 neon version of CRC compute APIs Thomas Monjalon
  4 siblings, 1 reply; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-07-04  9:24 UTC (permalink / raw)
  To: jerin.jacob, viktorin, jianbo.liu, jasvinder.singh, thomas
  Cc: dev, Ashwin Sekhar T K

Added CRC compute APIs for arm64 utilizing the pmull
capability.

Added new file net_crc_neon.h to hold the arm64 pmull
CRC implementation.

Added wrappers in rte_vect.h for those neon intrinsics
which are not supported in GCC version < 7.

Verified the changes with crc_autotest unit test case

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
Acked-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 MAINTAINERS                                       |   1 +
 lib/librte_eal/common/include/arch/arm/rte_vect.h |  88 +++++++
 lib/librte_net/net_crc_neon.h                     | 297 ++++++++++++++++++++++
 lib/librte_net/rte_net_crc.c                      |  34 ++-
 lib/librte_net/rte_net_crc.h                      |   2 +
 5 files changed, 416 insertions(+), 6 deletions(-)
 create mode 100644 lib/librte_net/net_crc_neon.h

diff --git a/MAINTAINERS b/MAINTAINERS
index c14cbb90f..33921f721 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -149,6 +149,7 @@ F: lib/librte_lpm/rte_lpm_neon.h
 F: lib/librte_hash/rte*_arm64.h
 F: lib/librte_efd/rte*_arm64.h
 F: lib/librte_table/rte*_arm64.h
+F: lib/librte_net/net_crc_neon.h
 F: drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
 F: drivers/net/i40e/i40e_rxtx_vec_neon.c
 F: drivers/net/virtio/virtio_rxtx_simple_neon.c
diff --git a/lib/librte_eal/common/include/arch/arm/rte_vect.h b/lib/librte_eal/common/include/arch/arm/rte_vect.h
index 4107c9988..55e228a77 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_vect.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_vect.h
@@ -35,6 +35,7 @@
 
 #include <stdint.h>
 #include "generic/rte_vect.h"
+#include "rte_debug.h"
 #include "arm_neon.h"
 
 #ifdef __cplusplus
@@ -78,6 +79,93 @@ vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
 }
 #endif
 
+#if defined(RTE_ARCH_ARM64)
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70000)
+/* NEON intrinsic vreinterpretq_u64_p128() is supported since GCC version 7 */
+static inline uint64x2_t
+vreinterpretq_u64_p128(poly128_t x)
+{
+	return (uint64x2_t)x;
+}
+
+/* NEON intrinsic vreinterpretq_p64_u64() is supported since GCC version 7 */
+static inline poly64x2_t
+vreinterpretq_p64_u64(uint64x2_t x)
+{
+	return (poly64x2_t)x;
+}
+
+/* NEON intrinsic vgetq_lane_p64() is supported since GCC version 7 */
+static inline poly64_t
+vgetq_lane_p64(poly64x2_t x, const int lane)
+{
+	RTE_ASSERT(lane >= 0 && lane <= 1);
+
+	poly64_t *p = (poly64_t *)&x;
+
+	return p[lane];
+}
+#endif
+#endif
+
+/*
+ * If (0 <= index <= 15), then call the ASIMD ext intruction on the
+ * 128 bit regs v0 and v1 with the appropriate index.
+ *
+ * Else returns a zero vector.
+ */
+static inline uint8x16_t
+vextract(uint8x16_t v0, uint8x16_t v1, const int index)
+{
+	switch (index) {
+	case 0: return vextq_u8(v0, v1, 0);
+	case 1: return vextq_u8(v0, v1, 1);
+	case 2: return vextq_u8(v0, v1, 2);
+	case 3: return vextq_u8(v0, v1, 3);
+	case 4: return vextq_u8(v0, v1, 4);
+	case 5: return vextq_u8(v0, v1, 5);
+	case 6: return vextq_u8(v0, v1, 6);
+	case 7: return vextq_u8(v0, v1, 7);
+	case 8: return vextq_u8(v0, v1, 8);
+	case 9: return vextq_u8(v0, v1, 9);
+	case 10: return vextq_u8(v0, v1, 10);
+	case 11: return vextq_u8(v0, v1, 11);
+	case 12: return vextq_u8(v0, v1, 12);
+	case 13: return vextq_u8(v0, v1, 13);
+	case 14: return vextq_u8(v0, v1, 14);
+	case 15: return vextq_u8(v0, v1, 15);
+	}
+	return vdupq_n_u8(0);
+}
+
+/**
+ * Shifts right 128 bit register by specified number of bytes
+ *
+ * Value of shift parameter must be in range 0 - 16
+ */
+static inline uint64x2_t
+vshift_bytes_right(uint64x2_t reg, const unsigned int shift)
+{
+	return vreinterpretq_u64_u8(vextract(
+				vreinterpretq_u8_u64(reg),
+				vdupq_n_u8(0),
+				shift));
+}
+
+/**
+ * Shifts left 128 bit register by specified number of bytes
+ *
+ * Value of shift parameter must be in range 0 - 16
+ */
+static inline uint64x2_t
+vshift_bytes_left(uint64x2_t reg, const unsigned int shift)
+{
+	return vreinterpretq_u64_u8(vextract(
+				vdupq_n_u8(0),
+				vreinterpretq_u8_u64(reg),
+				16 - shift));
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_net/net_crc_neon.h b/lib/librte_net/net_crc_neon.h
new file mode 100644
index 000000000..2be579d6b
--- /dev/null
+++ b/lib/librte_net/net_crc_neon.h
@@ -0,0 +1,297 @@
+/*
+ *   BSD LICENSE
+ *
+ *   Copyright (C) Cavium networks Ltd. 2017.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Cavium networks nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _NET_CRC_NEON_H_
+#define _NET_CRC_NEON_H_
+
+#include <rte_branch_prediction.h>
+#include <rte_net_crc.h>
+#include <rte_vect.h>
+#include <rte_cpuflags.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** PMULL CRC computation context structure */
+struct crc_pmull_ctx {
+	uint64x2_t rk1_rk2;
+	uint64x2_t rk5_rk6;
+	uint64x2_t rk7_rk8;
+};
+
+struct crc_pmull_ctx crc32_eth_pmull __rte_aligned(16);
+struct crc_pmull_ctx crc16_ccitt_pmull __rte_aligned(16);
+
+/**
+ * @brief Performs one folding round
+ *
+ * Logically function operates as follows:
+ *     DATA = READ_NEXT_16BYTES();
+ *     F1 = LSB8(FOLD)
+ *     F2 = MSB8(FOLD)
+ *     T1 = CLMUL(F1, RK1)
+ *     T2 = CLMUL(F2, RK2)
+ *     FOLD = XOR(T1, T2, DATA)
+ *
+ * @param data_block 16 byte data block
+ * @param precomp precomputed rk1 constanst
+ * @param fold running 16 byte folded data
+ *
+ * @return New 16 byte folded data
+ */
+static inline uint64x2_t
+crcr32_folding_round(uint64x2_t data_block, uint64x2_t precomp,
+	uint64x2_t fold)
+{
+	uint64x2_t tmp0 = vreinterpretq_u64_p128(vmull_p64(
+			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 1),
+			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+
+	uint64x2_t tmp1 = vreinterpretq_u64_p128(vmull_p64(
+			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 0),
+			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+
+	return veorq_u64(tmp1, veorq_u64(data_block, tmp0));
+}
+
+/**
+ * Performs reduction from 128 bits to 64 bits
+ *
+ * @param data128 128 bits data to be reduced
+ * @param precomp rk5 and rk6 precomputed constants
+ *
+ * @return data reduced to 64 bits
+ */
+static inline uint64x2_t
+crcr32_reduce_128_to_64(uint64x2_t data128,
+	uint64x2_t precomp)
+{
+	uint64x2_t tmp0, tmp1, tmp2;
+
+	/* 64b fold */
+	tmp0 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(data128), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+	tmp1 = vshift_bytes_right(data128, 8);
+	tmp0 = veorq_u64(tmp0, tmp1);
+
+	/* 32b fold */
+	tmp2 = vshift_bytes_left(tmp0, 4);
+	tmp1 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp2), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+
+	return veorq_u64(tmp1, tmp0);
+}
+
+/**
+ * Performs Barret's reduction from 64 bits to 32 bits
+ *
+ * @param data64 64 bits data to be reduced
+ * @param precomp rk7 precomputed constant
+ *
+ * @return data reduced to 32 bits
+ */
+static inline uint32_t
+crcr32_reduce_64_to_32(uint64x2_t data64,
+	uint64x2_t precomp)
+{
+	static uint32_t mask1[4] __rte_aligned(16) = {
+		0xffffffff, 0xffffffff, 0x00000000, 0x00000000
+	};
+	static uint32_t mask2[4] __rte_aligned(16) = {
+		0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
+	};
+	uint64x2_t tmp0, tmp1, tmp2;
+
+	tmp0 = vandq_u64(data64, vld1q_u64((uint64_t *)mask2));
+
+	tmp1 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp0), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+	tmp1 = veorq_u64(tmp1, tmp0);
+	tmp1 = vandq_u64(tmp1, vld1q_u64((uint64_t *)mask1));
+
+	tmp2 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp1), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+	tmp2 = veorq_u64(tmp2, tmp1);
+	tmp2 = veorq_u64(tmp2, tmp0);
+
+	return vgetq_lane_u32(vreinterpretq_u32_u64(tmp2), 2);
+}
+
+static inline uint32_t
+crc32_eth_calc_pmull(
+	const uint8_t *data,
+	uint32_t data_len,
+	uint32_t crc,
+	const struct crc_pmull_ctx *params)
+{
+	uint64x2_t temp, fold, k;
+	uint32_t n;
+
+	/* Get CRC init value */
+	temp = vreinterpretq_u64_u32(vsetq_lane_u32(crc, vmovq_n_u32(0), 0));
+
+	/**
+	 * Folding all data into single 16 byte data block
+	 * Assumes: fold holds first 16 bytes of data
+	 */
+	if (unlikely(data_len < 32)) {
+		if (unlikely(data_len == 16)) {
+			/* 16 bytes */
+			fold = vld1q_u64((const uint64_t *)data);
+			fold = veorq_u64(fold, temp);
+			goto reduction_128_64;
+		}
+
+		if (unlikely(data_len < 16)) {
+			/* 0 to 15 bytes */
+			uint8_t buffer[16] __rte_aligned(16);
+
+			memset(buffer, 0, sizeof(buffer));
+			memcpy(buffer, data, data_len);
+
+			fold = vld1q_u64((uint64_t *)buffer);
+			fold = veorq_u64(fold, temp);
+			if (unlikely(data_len < 4)) {
+				fold = vshift_bytes_left(fold, 8 - data_len);
+				goto barret_reduction;
+			}
+			fold = vshift_bytes_left(fold, 16 - data_len);
+			goto reduction_128_64;
+		}
+		/* 17 to 31 bytes */
+		fold = vld1q_u64((const uint64_t *)data);
+		fold = veorq_u64(fold, temp);
+		n = 16;
+		k = params->rk1_rk2;
+		goto partial_bytes;
+	}
+
+	/** At least 32 bytes in the buffer */
+	/** Apply CRC initial value */
+	fold = vld1q_u64((const uint64_t *)data);
+	fold = veorq_u64(fold, temp);
+
+	/** Main folding loop - the last 16 bytes is processed separately */
+	k = params->rk1_rk2;
+	for (n = 16; (n + 16) <= data_len; n += 16) {
+		temp = vld1q_u64((const uint64_t *)&data[n]);
+		fold = crcr32_folding_round(temp, k, fold);
+	}
+
+partial_bytes:
+	if (likely(n < data_len)) {
+		uint64x2_t last16, a, b, mask;
+		uint32_t rem = data_len & 15;
+
+		last16 = vld1q_u64((const uint64_t *)&data[data_len - 16]);
+		a = vshift_bytes_left(fold, 16 - rem);
+		b = vshift_bytes_right(fold, rem);
+		mask = vshift_bytes_left(vdupq_n_u64(-1), 16 - rem);
+		b = vorrq_u64(b, vandq_u64(mask, last16));
+
+		/* k = rk1 & rk2 */
+		temp = vreinterpretq_u64_p128(vmull_p64(
+				vgetq_lane_p64(vreinterpretq_p64_u64(a), 1),
+				vgetq_lane_p64(vreinterpretq_p64_u64(k), 0)));
+		fold = vreinterpretq_u64_p128(vmull_p64(
+				vgetq_lane_p64(vreinterpretq_p64_u64(a), 0),
+				vgetq_lane_p64(vreinterpretq_p64_u64(k), 1)));
+		fold = veorq_u64(fold, temp);
+		fold = veorq_u64(fold, b);
+	}
+
+	/** Reduction 128 -> 32 Assumes: fold holds 128bit folded data */
+reduction_128_64:
+	k = params->rk5_rk6;
+	fold = crcr32_reduce_128_to_64(fold, k);
+
+barret_reduction:
+	k = params->rk7_rk8;
+	n = crcr32_reduce_64_to_32(fold, k);
+
+	return n;
+}
+
+static inline void
+rte_net_crc_neon_init(void)
+{
+	/* Initialize CRC16 data */
+	uint64_t ccitt_k1_k2[2] = {0x189aeLLU, 0x8e10LLU};
+	uint64_t ccitt_k5_k6[2] = {0x189aeLLU, 0x114aaLLU};
+	uint64_t ccitt_k7_k8[2] = {0x11c581910LLU, 0x10811LLU};
+
+	/* Initialize CRC32 data */
+	uint64_t eth_k1_k2[2] = {0xccaa009eLLU, 0x1751997d0LLU};
+	uint64_t eth_k5_k6[2] = {0xccaa009eLLU, 0x163cd6124LLU};
+	uint64_t eth_k7_k8[2] = {0x1f7011640LLU, 0x1db710641LLU};
+
+	/** Save the params in context structure */
+	crc16_ccitt_pmull.rk1_rk2 = vld1q_u64(ccitt_k1_k2);
+	crc16_ccitt_pmull.rk5_rk6 = vld1q_u64(ccitt_k5_k6);
+	crc16_ccitt_pmull.rk7_rk8 = vld1q_u64(ccitt_k7_k8);
+
+	/** Save the params in context structure */
+	crc32_eth_pmull.rk1_rk2 = vld1q_u64(eth_k1_k2);
+	crc32_eth_pmull.rk5_rk6 = vld1q_u64(eth_k5_k6);
+	crc32_eth_pmull.rk7_rk8 = vld1q_u64(eth_k7_k8);
+}
+
+static inline uint32_t
+rte_crc16_ccitt_neon_handler(const uint8_t *data,
+	uint32_t data_len)
+{
+	return (uint16_t)~crc32_eth_calc_pmull(data,
+		data_len,
+		0xffff,
+		&crc16_ccitt_pmull);
+}
+
+static inline uint32_t
+rte_crc32_eth_neon_handler(const uint8_t *data,
+	uint32_t data_len)
+{
+	return ~crc32_eth_calc_pmull(data,
+		data_len,
+		0xffffffffUL,
+		&crc32_eth_pmull);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _NET_CRC_NEON_H_ */
diff --git a/lib/librte_net/rte_net_crc.c b/lib/librte_net/rte_net_crc.c
index 0391c7209..331737abe 100644
--- a/lib/librte_net/rte_net_crc.c
+++ b/lib/librte_net/rte_net_crc.c
@@ -43,10 +43,16 @@
 	&& defined(RTE_MACHINE_CPUFLAG_SSE4_2)		\
 	&& defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ)
 #define X86_64_SSE42_PCLMULQDQ     1
+#elif defined(RTE_ARCH_ARM64)
+#if defined(RTE_MACHINE_CPUFLAG_PMULL)
+#define ARM64_NEON_PMULL           1
+#endif
 #endif
 
 #ifdef X86_64_SSE42_PCLMULQDQ
 #include <net_crc_sse.h>
+#elif defined(ARM64_NEON_PMULL)
+#include <net_crc_neon.h>
 #endif
 
 /* crc tables */
@@ -74,6 +80,11 @@ static rte_net_crc_handler handlers_sse42[] = {
 	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_sse42_handler,
 	[RTE_NET_CRC32_ETH] = rte_crc32_eth_sse42_handler,
 };
+#elif defined(ARM64_NEON_PMULL)
+static rte_net_crc_handler handlers_neon[] = {
+	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_neon_handler,
+	[RTE_NET_CRC32_ETH] = rte_crc32_eth_neon_handler,
+};
 #endif
 
 /**
@@ -162,14 +173,20 @@ void
 rte_net_crc_set_alg(enum rte_net_crc_alg alg)
 {
 	switch (alg) {
-	case RTE_NET_CRC_SSE42:
 #ifdef X86_64_SSE42_PCLMULQDQ
+	case RTE_NET_CRC_SSE42:
 		handlers = handlers_sse42;
-#else
-		alg = RTE_NET_CRC_SCALAR;
-#endif
 		break;
+#elif defined(ARM64_NEON_PMULL)
+	case RTE_NET_CRC_NEON:
+		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
+			handlers = handlers_neon;
+			break;
+		}
+		/* fall-through */
+#endif
 	case RTE_NET_CRC_SCALAR:
+		/* fall-through */
 	default:
 		handlers = handlers_scalar;
 		break;
@@ -199,8 +216,13 @@ rte_net_crc_init(void)
 	rte_net_crc_scalar_init();
 
 #ifdef X86_64_SSE42_PCLMULQDQ
-		alg = RTE_NET_CRC_SSE42;
-		rte_net_crc_sse42_init();
+	alg = RTE_NET_CRC_SSE42;
+	rte_net_crc_sse42_init();
+#elif defined(ARM64_NEON_PMULL)
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
+		alg = RTE_NET_CRC_NEON;
+		rte_net_crc_neon_init();
+	}
 #endif
 
 	rte_net_crc_set_alg(alg);
diff --git a/lib/librte_net/rte_net_crc.h b/lib/librte_net/rte_net_crc.h
index d22286c6e..d01cf4b47 100644
--- a/lib/librte_net/rte_net_crc.h
+++ b/lib/librte_net/rte_net_crc.h
@@ -57,6 +57,7 @@ enum rte_net_crc_type {
 enum rte_net_crc_alg {
 	RTE_NET_CRC_SCALAR = 0,
 	RTE_NET_CRC_SSE42,
+	RTE_NET_CRC_NEON,
 };
 
 /**
@@ -68,6 +69,7 @@ enum rte_net_crc_alg {
  *   This parameter is used to select the CRC implementation version.
  *   - RTE_NET_CRC_SCALAR
  *   - RTE_NET_CRC_SSE42 (Use 64-bit SSE4.2 intrinsic)
+ *   - RTE_NET_CRC_NEON (Use ARM Neon intrinsic)
  */
 void
 rte_net_crc_set_alg(enum rte_net_crc_alg alg);
-- 
2.12.2

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* Re: [PATCH v6 3/4] net: add arm64 neon version of CRC compute APIs
  2017-07-04  9:24   ` [PATCH v6 3/4] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
@ 2017-07-04 13:53     ` Thomas Monjalon
  0 siblings, 0 replies; 33+ messages in thread
From: Thomas Monjalon @ 2017-07-04 13:53 UTC (permalink / raw)
  To: Ashwin Sekhar T K; +Cc: dev, jerin.jacob, viktorin, jianbo.liu, jasvinder.singh

04/07/2017 11:24, Ashwin Sekhar T K:
>  rte_net_crc_set_alg(enum rte_net_crc_alg alg)
>  {
>         switch (alg) {
> -       case RTE_NET_CRC_SSE42:
>  #ifdef X86_64_SSE42_PCLMULQDQ
> +       case RTE_NET_CRC_SSE42:
>                 handlers = handlers_sse42;
> -#else
> -               alg = RTE_NET_CRC_SCALAR;
> -#endif
>                 break;
> +#elif defined(ARM64_NEON_PMULL)
> +       case RTE_NET_CRC_NEON:
> +               if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
> +                       handlers = handlers_neon;
> +                       break;
> +               }
> +               /* fall-through */
> +#endif
>         case RTE_NET_CRC_SCALAR:
> +               /* fall-through */
>         default:
>                 handlers = handlers_scalar;
>                 break;
> 

I'm moving the fall-through comment outside of #ifdef
to fix warning.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH v6 4/4] test: add tests for arm64 CRC neon versions
  2017-07-04  9:24 ` [PATCH v6 0/4] add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
                     ` (2 preceding siblings ...)
  2017-07-04  9:24   ` [PATCH v6 3/4] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
@ 2017-07-04  9:24   ` Ashwin Sekhar T K
  2017-07-04 13:55   ` [PATCH v6 0/4] add arm64 neon version of CRC compute APIs Thomas Monjalon
  4 siblings, 0 replies; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-07-04  9:24 UTC (permalink / raw)
  To: jerin.jacob, viktorin, jianbo.liu, jasvinder.singh, thomas
  Cc: dev, Ashwin Sekhar T K

Verified the changes with crc_autotest unit test case

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
---
 test/test/test_crc.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/test/test/test_crc.c b/test/test/test_crc.c
index cd5af69a2..9f2a17d49 100644
--- a/test/test/test_crc.c
+++ b/test/test/test_crc.c
@@ -178,6 +178,15 @@ test_crc(void)
 		return ret;
 	}
 
+	/* set CRC neon mode */
+	rte_net_crc_set_alg(RTE_NET_CRC_NEON);
+
+	ret = test_crc_calc();
+	if (ret < 0) {
+		printf("test crc (arm64 neon pmull): failed (%d)\n", ret);
+		return ret;
+	}
+
 	return 0;
 }
 
-- 
2.12.2

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* Re: [PATCH v6 0/4] add arm64 neon version of CRC compute APIs
  2017-07-04  9:24 ` [PATCH v6 0/4] add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
                     ` (3 preceding siblings ...)
  2017-07-04  9:24   ` [PATCH v6 4/4] test: add tests for arm64 CRC neon versions Ashwin Sekhar T K
@ 2017-07-04 13:55   ` Thomas Monjalon
  4 siblings, 0 replies; 33+ messages in thread
From: Thomas Monjalon @ 2017-07-04 13:55 UTC (permalink / raw)
  To: Ashwin Sekhar T K; +Cc: dev, jerin.jacob, viktorin, jianbo.liu, jasvinder.singh

> Ashwin Sekhar T K (4):
>   mk: add crypto capability for generic armv8a and thunderx
>   eal: move gcc version definition to common header
>   net: add arm64 neon version of CRC compute APIs
>   test: add tests for arm64 CRC neon versions

Applied with minor changes, thanks

^ permalink raw reply	[flat|nested] 33+ messages in thread

end of thread, other threads:[~2017-07-04 13:55 UTC | newest]

Thread overview: 33+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2017-04-27 14:06 [PATCH 1/2] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
2017-05-04  6:56 ` [PATCH v3 1/4] mk: add crypto capability for generic armv8a and thunderx Ashwin Sekhar T K
2017-05-04  6:57   ` [PATCH v3 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
2017-05-04 15:22     ` Jan Viktorin
2017-05-04  6:57   ` [PATCH v3 3/4] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
2017-05-04  6:57   ` [PATCH v3 4/4] test: add tests for arm64 CRC neon versions Ashwin Sekhar T K
2017-05-04 15:20   ` [PATCH v3 1/4] mk: add crypto capability for generic armv8a and thunderx Jan Viktorin
2017-05-04 22:10     ` Thomas Monjalon
2017-05-09  9:53 ` [PATCH v4 " Ashwin Sekhar T K
2017-05-09  9:53   ` [PATCH v4 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
2017-05-09  9:53   ` [PATCH v4 3/4] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
2017-05-12  5:51     ` Jianbo Liu
2017-05-12  7:25       ` Sekhar, Ashwin
2017-05-12  8:49         ` Jianbo Liu
2017-05-12  8:56           ` Sekhar, Ashwin
2017-05-09  9:53   ` [PATCH v4 4/4] test: add tests for arm64 CRC neon versions Ashwin Sekhar T K
2017-05-12 10:15 ` [PATCH v5 0/4] add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
2017-05-12 10:15   ` [PATCH v5 1/4] mk: add crypto capability for generic armv8a and thunderx Ashwin Sekhar T K
2017-05-12 10:15   ` [PATCH v5 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
2017-05-15  2:07     ` Jianbo Liu
2017-07-03 20:51     ` Thomas Monjalon
2017-07-04  8:48       ` Sekhar, Ashwin
2017-05-12 10:15   ` [PATCH v5 3/4] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
2017-05-15  2:32     ` Jianbo Liu
2017-07-03 21:06     ` Thomas Monjalon
2017-05-12 10:15   ` [PATCH v5 4/4] test: add tests for arm64 CRC neon versions Ashwin Sekhar T K
2017-07-04  9:24 ` [PATCH v6 0/4] add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
2017-07-04  9:24   ` [PATCH v6 1/4] mk: add crypto capability for generic armv8a and thunderx Ashwin Sekhar T K
2017-07-04  9:24   ` [PATCH v6 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
2017-07-04  9:24   ` [PATCH v6 3/4] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
2017-07-04 13:53     ` Thomas Monjalon
2017-07-04  9:24   ` [PATCH v6 4/4] test: add tests for arm64 CRC neon versions Ashwin Sekhar T K
2017-07-04 13:55   ` [PATCH v6 0/4] add arm64 neon version of CRC compute APIs Thomas Monjalon

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.