All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] net: add arm64 neon version of CRC compute APIs
@ 2017-04-27 14:06 Ashwin Sekhar T K
  2017-05-04  6:56 ` [PATCH v3 1/4] mk: add crypto capability for generic armv8a and thunderx Ashwin Sekhar T K
                   ` (3 more replies)
  0 siblings, 4 replies; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-04-27 14:06 UTC (permalink / raw)
  To: thomas, jasvinder.singh, viktorin, jerin.jacob, jianbo.liu
  Cc: dev, Ashwin Sekhar T K

* Added CRC compute APIs for arm64 utilizing the pmull capability
* Added new file net_crc_neon.h to hold the arm64 pmull CRC
  implementation
* Added crypto capability in compilation of generic armv8 and
  thunderx targets
* pmull CRC version is used only after checking the pmull capability
  at runtime
* Verified the changes with crc_autotest unit test case

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
---
 MAINTAINERS                                       |   1 +
 lib/librte_eal/common/include/arch/arm/rte_vect.h |  45 +++
 lib/librte_net/net_crc_neon.h                     | 357 ++++++++++++++++++++++
 lib/librte_net/rte_net_crc.c                      |  32 +-
 lib/librte_net/rte_net_crc.h                      |   2 +
 mk/machine/armv8a/rte.vars.mk                     |   2 +-
 mk/machine/thunderx/rte.vars.mk                   |   2 +-
 mk/rte.cpuflags.mk                                |   3 +
 mk/toolchain/gcc/rte.toolchain-compat.mk          |   1 +
 9 files changed, 438 insertions(+), 7 deletions(-)
 create mode 100644 lib/librte_net/net_crc_neon.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 576d60a..283743e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -149,6 +149,7 @@ F: lib/librte_lpm/rte_lpm_neon.h
 F: lib/librte_hash/rte*_arm64.h
 F: lib/librte_efd/rte*_arm64.h
 F: lib/librte_table/rte*_arm64.h
+F: lib/librte_net/net_crc_neon.h
 F: drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
 F: drivers/net/i40e/i40e_rxtx_vec_neon.c
 F: drivers/net/virtio/virtio_rxtx_simple_neon.c
diff --git a/lib/librte_eal/common/include/arch/arm/rte_vect.h b/lib/librte_eal/common/include/arch/arm/rte_vect.h
index 4107c99..9a3dfdf 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_vect.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_vect.h
@@ -34,9 +34,18 @@
 #define _RTE_VECT_ARM_H_
 
 #include <stdint.h>
+#include <assert.h>
+
 #include "generic/rte_vect.h"
 #include "arm_neon.h"
 
+#ifdef GCC_VERSION
+#undef GCC_VERSION
+#endif
+
+#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 \
+			+ __GNUC_PATCHLEVEL__)
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -78,6 +87,42 @@ vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
 }
 #endif
 
+#if (GCC_VERSION < 70000)
+/*
+ * NEON intrinsic vreinterpretq_u64_p128() is not supported
+ * in GCC versions < 7
+ */
+static inline uint64x2_t
+vreinterpretq_u64_p128(poly128_t x)
+{
+	return (uint64x2_t)x;
+}
+
+/*
+ * NEON intrinsic vreinterpretq_p64_u64() is not supported
+ * in GCC versions < 7
+ */
+static inline poly64x2_t
+vreinterpretq_p64_u64(uint64x2_t x)
+{
+	return (poly64x2_t)x;
+}
+
+/*
+ * NEON intrinsic vgetq_lane_p64() is not supported
+ * in GCC versions < 7
+ */
+static inline poly64_t
+vgetq_lane_p64(poly64x2_t x, const int lane)
+{
+	assert(lane >= 0 && lane <= 1);
+
+	poly64_t *p = (poly64_t *)&x;
+
+	return p[lane];
+}
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_net/net_crc_neon.h b/lib/librte_net/net_crc_neon.h
new file mode 100644
index 0000000..05120a7
--- /dev/null
+++ b/lib/librte_net/net_crc_neon.h
@@ -0,0 +1,357 @@
+/*
+ *   BSD LICENSE
+ *
+ *   Copyright (C) Cavium networks Ltd. 2017.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Cavium networks nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _NET_CRC_NEON_H_
+#define _NET_CRC_NEON_H_
+
+#include <rte_branch_prediction.h>
+#include <rte_net_crc.h>
+#include <rte_vect.h>
+#include <rte_cpuflags.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** PMULL CRC computation context structure */
+struct crc_pmull_ctx {
+	uint64x2_t rk1_rk2;
+	uint64x2_t rk5_rk6;
+	uint64x2_t rk7_rk8;
+};
+
+struct crc_pmull_ctx crc32_eth_pmull __rte_aligned(16);
+struct crc_pmull_ctx crc16_ccitt_pmull __rte_aligned(16);
+
+static inline uint8x16_t
+extract_vector(uint8x16_t v0, uint8x16_t v1, const int n)
+{
+	switch (n) {
+	case 0: return vextq_u8(v0, v1, 0);
+	case 1: return vextq_u8(v0, v1, 1);
+	case 2: return vextq_u8(v0, v1, 2);
+	case 3: return vextq_u8(v0, v1, 3);
+	case 4: return vextq_u8(v0, v1, 4);
+	case 5: return vextq_u8(v0, v1, 5);
+	case 6: return vextq_u8(v0, v1, 6);
+	case 7: return vextq_u8(v0, v1, 7);
+	case 8: return vextq_u8(v0, v1, 8);
+	case 9: return vextq_u8(v0, v1, 9);
+	case 10: return vextq_u8(v0, v1, 10);
+	case 11: return vextq_u8(v0, v1, 11);
+	case 12: return vextq_u8(v0, v1, 12);
+	case 13: return vextq_u8(v0, v1, 13);
+	case 14: return vextq_u8(v0, v1, 14);
+	case 15: return vextq_u8(v0, v1, 15);
+	}
+	return v1;
+}
+
+/**
+ * Shifts right 128 bit register by specified number of bytes
+ *
+ * @param reg 128 bit value
+ * @param num number of bytes to shift reg by (0-16)
+ *
+ * @return reg << (num * 8)
+ */
+static inline uint64x2_t
+shift_bytes_right(uint64x2_t reg, const unsigned int num)
+{
+	/* Right Shift */
+	return vreinterpretq_u64_u8(extract_vector(
+				vreinterpretq_u8_u64(reg),
+				vdupq_n_u8(0),
+				num));
+}
+
+/**
+ * Shifts left 128 bit register by specified number of bytes
+ *
+ * @param reg 128 bit value
+ * @param num number of bytes to shift reg by (0-16)
+ *
+ * @return reg << (num * 8)
+ */
+static inline uint64x2_t
+shift_bytes_left(uint64x2_t reg, const unsigned int num)
+{
+	/* Left Shift */
+	return vreinterpretq_u64_u8(extract_vector(
+				vdupq_n_u8(0),
+				vreinterpretq_u8_u64(reg),
+				16 - num));
+}
+
+/**
+ * @brief Performs one folding round
+ *
+ * Logically function operates as follows:
+ *     DATA = READ_NEXT_16BYTES();
+ *     F1 = LSB8(FOLD)
+ *     F2 = MSB8(FOLD)
+ *     T1 = CLMUL(F1, RK1)
+ *     T2 = CLMUL(F2, RK2)
+ *     FOLD = XOR(T1, T2, DATA)
+ *
+ * @param data_block 16 byte data block
+ * @param precomp precomputed rk1 constanst
+ * @param fold running 16 byte folded data
+ *
+ * @return New 16 byte folded data
+ */
+static inline uint64x2_t
+crcr32_folding_round(uint64x2_t data_block, uint64x2_t precomp,
+	uint64x2_t fold)
+{
+	uint64x2_t tmp0 = vreinterpretq_u64_p128(vmull_p64(
+			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 1),
+			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+
+	uint64x2_t tmp1 = vreinterpretq_u64_p128(vmull_p64(
+			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 0),
+			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+
+	return veorq_u64(tmp1, veorq_u64(data_block, tmp0));
+}
+
+/**
+ * Performs reduction from 128 bits to 64 bits
+ *
+ * @param data128 128 bits data to be reduced
+ * @param precomp rk5 and rk6 precomputed constants
+ *
+ * @return data reduced to 64 bits
+ */
+static inline uint64x2_t
+crcr32_reduce_128_to_64(uint64x2_t data128,
+	uint64x2_t precomp)
+{
+	uint64x2_t tmp0, tmp1, tmp2;
+
+	/* 64b fold */
+	tmp0 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(data128), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+	tmp1 = shift_bytes_right(data128, 8);
+	tmp0 = veorq_u64(tmp0, tmp1);
+
+	/* 32b fold */
+	tmp2 = shift_bytes_left(tmp0, 4);
+	tmp1 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp2), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+
+	return veorq_u64(tmp1, tmp0);
+}
+
+/**
+ * Performs Barret's reduction from 64 bits to 32 bits
+ *
+ * @param data64 64 bits data to be reduced
+ * @param precomp rk7 precomputed constant
+ *
+ * @return data reduced to 32 bits
+ */
+static inline uint32_t
+crcr32_reduce_64_to_32(uint64x2_t data64,
+	uint64x2_t precomp)
+{
+	static uint32_t mask1[4] __rte_aligned(16) = {
+		0xffffffff, 0xffffffff, 0x00000000, 0x00000000
+	};
+	static uint32_t mask2[4] __rte_aligned(16) = {
+		0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
+	};
+	uint64x2_t tmp0, tmp1, tmp2;
+
+	tmp0 = vandq_u64(data64, vld1q_u64((uint64_t *)mask2));
+
+	tmp1 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp0), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+	tmp1 = veorq_u64(tmp1, tmp0);
+	tmp1 = vandq_u64(tmp1, vld1q_u64((uint64_t *)mask1));
+
+	tmp2 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp1), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+	tmp2 = veorq_u64(tmp2, tmp1);
+	tmp2 = veorq_u64(tmp2, tmp0);
+
+	return vgetq_lane_u32(vreinterpretq_u32_u64(tmp2), 2);
+}
+
+static inline uint32_t
+crc32_eth_calc_pmull(
+	const uint8_t *data,
+	uint32_t data_len,
+	uint32_t crc,
+	const struct crc_pmull_ctx *params)
+{
+	uint64x2_t temp, fold, k;
+	uint32_t n;
+
+	/* Get CRC init value */
+	temp = vreinterpretq_u64_u32(vsetq_lane_u32(crc, vmovq_n_u32(0), 0));
+
+	/**
+	 * Folding all data into single 16 byte data block
+	 * Assumes: fold holds first 16 bytes of data
+	 */
+	if (unlikely(data_len < 32)) {
+		if (unlikely(data_len == 16)) {
+			/* 16 bytes */
+			fold = vld1q_u64((const uint64_t *)data);
+			fold = veorq_u64(fold, temp);
+			goto reduction_128_64;
+		}
+
+		if (unlikely(data_len < 16)) {
+			/* 0 to 15 bytes */
+			uint8_t buffer[16] __rte_aligned(16);
+
+			memset(buffer, 0, sizeof(buffer));
+			memcpy(buffer, data, data_len);
+
+			fold = vld1q_u64((uint64_t *)buffer);
+			fold = veorq_u64(fold, temp);
+			if (unlikely(data_len < 4)) {
+				fold = shift_bytes_left(fold, 8 - data_len);
+				goto barret_reduction;
+			}
+			fold = shift_bytes_left(fold, 16 - data_len);
+			goto reduction_128_64;
+		}
+		/* 17 to 31 bytes */
+		fold = vld1q_u64((const uint64_t *)data);
+		fold = veorq_u64(fold, temp);
+		n = 16;
+		k = params->rk1_rk2;
+		goto partial_bytes;
+	}
+
+	/** At least 32 bytes in the buffer */
+	/** Apply CRC initial value */
+	fold = vld1q_u64((const uint64_t *)data);
+	fold = veorq_u64(fold, temp);
+
+	/** Main folding loop - the last 16 bytes is processed separately */
+	k = params->rk1_rk2;
+	for (n = 16; (n + 16) <= data_len; n += 16) {
+		temp = vld1q_u64((const uint64_t *)&data[n]);
+		fold = crcr32_folding_round(temp, k, fold);
+	}
+
+partial_bytes:
+	if (likely(n < data_len)) {
+		uint64x2_t last16, a, b, mask;
+		uint32_t rem = data_len & 15;
+
+		last16 = vld1q_u64((const uint64_t *)&data[data_len - 16]);
+		a = shift_bytes_left(fold, 16 - rem);
+		b = shift_bytes_right(fold, rem);
+		mask = shift_bytes_left(vdupq_n_u64(-1), 16 - rem);
+		b = vorrq_u64(b, vandq_u64(mask, last16));
+
+		/* k = rk1 & rk2 */
+		temp = vreinterpretq_u64_p128(vmull_p64(
+				vgetq_lane_p64(vreinterpretq_p64_u64(a), 1),
+				vgetq_lane_p64(vreinterpretq_p64_u64(k), 0)));
+		fold = vreinterpretq_u64_p128(vmull_p64(
+				vgetq_lane_p64(vreinterpretq_p64_u64(a), 0),
+				vgetq_lane_p64(vreinterpretq_p64_u64(k), 1)));
+		fold = veorq_u64(fold, temp);
+		fold = veorq_u64(fold, b);
+	}
+
+	/** Reduction 128 -> 32 Assumes: fold holds 128bit folded data */
+reduction_128_64:
+	k = params->rk5_rk6;
+	fold = crcr32_reduce_128_to_64(fold, k);
+
+barret_reduction:
+	k = params->rk7_rk8;
+	n = crcr32_reduce_64_to_32(fold, k);
+
+	return n;
+}
+
+static inline void
+rte_net_crc_neon_init(void)
+{
+	/* Initialize CRC16 data */
+	uint64_t ccitt_k1_k2[2] = {0x189aeLLU, 0x8e10LLU};
+	uint64_t ccitt_k5_k6[2] = {0x189aeLLU, 0x114aaLLU};
+	uint64_t ccitt_k7_k8[2] = {0x11c581910LLU, 0x10811LLU};
+
+	/* Initialize CRC32 data */
+	uint64_t eth_k1_k2[2] = {0xccaa009eLLU, 0x1751997d0LLU};
+	uint64_t eth_k5_k6[2] = {0xccaa009eLLU, 0x163cd6124LLU};
+	uint64_t eth_k7_k8[2] = {0x1f7011640LLU, 0x1db710641LLU};
+
+	/** Save the params in context structure */
+	crc16_ccitt_pmull.rk1_rk2 = vld1q_u64(ccitt_k1_k2);
+	crc16_ccitt_pmull.rk5_rk6 = vld1q_u64(ccitt_k5_k6);
+	crc16_ccitt_pmull.rk7_rk8 = vld1q_u64(ccitt_k7_k8);
+
+	/** Save the params in context structure */
+	crc32_eth_pmull.rk1_rk2 = vld1q_u64(eth_k1_k2);
+	crc32_eth_pmull.rk5_rk6 = vld1q_u64(eth_k5_k6);
+	crc32_eth_pmull.rk7_rk8 = vld1q_u64(eth_k7_k8);
+}
+
+static inline uint32_t
+rte_crc16_ccitt_neon_handler(const uint8_t *data,
+	uint32_t data_len)
+{
+	return (uint16_t)~crc32_eth_calc_pmull(data,
+		data_len,
+		0xffff,
+		&crc16_ccitt_pmull);
+}
+
+static inline uint32_t
+rte_crc32_eth_neon_handler(const uint8_t *data,
+	uint32_t data_len)
+{
+	return ~crc32_eth_calc_pmull(data,
+		data_len,
+		0xffffffffUL,
+		&crc32_eth_pmull);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _NET_CRC_NEON_H_ */
diff --git a/lib/librte_net/rte_net_crc.c b/lib/librte_net/rte_net_crc.c
index e8326fe..be65f34 100644
--- a/lib/librte_net/rte_net_crc.c
+++ b/lib/librte_net/rte_net_crc.c
@@ -43,10 +43,16 @@
 	&& defined(RTE_MACHINE_CPUFLAG_SSE4_2)		\
 	&& defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ)
 #define X86_64_SSE42_PCLMULQDQ     1
+#elif defined(RTE_ARCH_ARM64)
+#if defined(RTE_MACHINE_CPUFLAG_PMULL)
+#define ARM64_NEON_PMULL           1
+#endif
 #endif
 
 #ifdef X86_64_SSE42_PCLMULQDQ
 #include <net_crc_sse.h>
+#elif defined(ARM64_NEON_PMULL)
+#include <net_crc_neon.h>
 #endif
 
 /* crc tables */
@@ -74,6 +80,11 @@ static rte_net_crc_handler handlers_sse42[] = {
 	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_sse42_handler,
 	[RTE_NET_CRC32_ETH] = rte_crc32_eth_sse42_handler,
 };
+#elif defined(ARM64_NEON_PMULL)
+static rte_net_crc_handler handlers_neon[] = {
+	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_neon_handler,
+	[RTE_NET_CRC32_ETH] = rte_crc32_eth_neon_handler,
+};
 #endif
 
 /**
@@ -162,14 +173,20 @@ void
 rte_net_crc_set_alg(enum rte_net_crc_alg alg)
 {
 	switch (alg) {
-	case RTE_NET_CRC_SSE42:
 #ifdef X86_64_SSE42_PCLMULQDQ
+	case RTE_NET_CRC_SSE42:
 		handlers = handlers_sse42;
-#else
-		alg = RTE_NET_CRC_SCALAR;
 		break;
+#elif defined(ARM64_NEON_PMULL)
+	case RTE_NET_CRC_NEON:
+		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
+			handlers = handlers_neon;
+			break;
+		}
+		//-fallthrough
 #endif
 	case RTE_NET_CRC_SCALAR:
+		//-fallthrough
 	default:
 		handlers = handlers_scalar;
 		break;
@@ -199,8 +216,13 @@ rte_net_crc_init(void)
 	rte_net_crc_scalar_init();
 
 #ifdef X86_64_SSE42_PCLMULQDQ
-		alg = RTE_NET_CRC_SSE42;
-		rte_net_crc_sse42_init();
+	alg = RTE_NET_CRC_SSE42;
+	rte_net_crc_sse42_init();
+#elif defined(ARM64_NEON_PMULL)
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
+		alg = RTE_NET_CRC_NEON;
+		rte_net_crc_neon_init();
+	}
 #endif
 
 	rte_net_crc_set_alg(alg);
diff --git a/lib/librte_net/rte_net_crc.h b/lib/librte_net/rte_net_crc.h
index 76fd129..1daed30 100644
--- a/lib/librte_net/rte_net_crc.h
+++ b/lib/librte_net/rte_net_crc.h
@@ -55,6 +55,7 @@ enum rte_net_crc_type {
 enum rte_net_crc_alg {
 	RTE_NET_CRC_SCALAR = 0,
 	RTE_NET_CRC_SSE42,
+	RTE_NET_CRC_NEON,
 };
 
 /**
@@ -66,6 +67,7 @@ enum rte_net_crc_alg {
  *   This parameter is used to select the CRC implementation version.
  *   - RTE_NET_CRC_SCALAR
  *   - RTE_NET_CRC_SSE42 (Use 64-bit SSE4.2 intrinsic)
+ *   - RTE_NET_CRC_NEON (Use ARM Neon intrinsic)
  */
 void
 rte_net_crc_set_alg(enum rte_net_crc_alg alg);
diff --git a/mk/machine/armv8a/rte.vars.mk b/mk/machine/armv8a/rte.vars.mk
index d5049e1..51966a5 100644
--- a/mk/machine/armv8a/rte.vars.mk
+++ b/mk/machine/armv8a/rte.vars.mk
@@ -55,4 +55,4 @@
 # CPU_LDFLAGS =
 # CPU_ASFLAGS =
 
-MACHINE_CFLAGS += -march=armv8-a+crc
+MACHINE_CFLAGS += -march=armv8-a+crc+crypto
diff --git a/mk/machine/thunderx/rte.vars.mk b/mk/machine/thunderx/rte.vars.mk
index ad5a379..6784105 100644
--- a/mk/machine/thunderx/rte.vars.mk
+++ b/mk/machine/thunderx/rte.vars.mk
@@ -55,4 +55,4 @@
 # CPU_LDFLAGS =
 # CPU_ASFLAGS =
 
-MACHINE_CFLAGS += -march=armv8-a+crc -mcpu=thunderx
+MACHINE_CFLAGS += -march=armv8-a+crc+crypto -mcpu=thunderx
diff --git a/mk/rte.cpuflags.mk b/mk/rte.cpuflags.mk
index e634abc..6bbd742 100644
--- a/mk/rte.cpuflags.mk
+++ b/mk/rte.cpuflags.mk
@@ -119,6 +119,9 @@ ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRC32),)
 CPUFLAGS += CRC32
 endif
 
+ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRYPTO),)
+CPUFLAGS += PMULL
+endif
 
 MACHINE_CFLAGS += $(addprefix -DRTE_MACHINE_CPUFLAG_,$(CPUFLAGS))
 
diff --git a/mk/toolchain/gcc/rte.toolchain-compat.mk b/mk/toolchain/gcc/rte.toolchain-compat.mk
index 280dde2..01ac7e2 100644
--- a/mk/toolchain/gcc/rte.toolchain-compat.mk
+++ b/mk/toolchain/gcc/rte.toolchain-compat.mk
@@ -60,6 +60,7 @@ else
 #
 	ifeq ($(shell test $(GCC_VERSION) -le 49 && echo 1), 1)
 		MACHINE_CFLAGS := $(patsubst -march=armv8-a+crc,-march=armv8-a+crc -D__ARM_FEATURE_CRC32=1,$(MACHINE_CFLAGS))
+		MACHINE_CFLAGS := $(patsubst -march=armv8-a+crc+crypto,-march=armv8-a+crc+crypto -D__ARM_FEATURE_CRC32=1,$(MACHINE_CFLAGS))
 	endif
 	ifeq ($(shell test $(GCC_VERSION) -le 47 && echo 1), 1)
 		MACHINE_CFLAGS := $(patsubst -march=core-avx-i,-march=corei7-avx,$(MACHINE_CFLAGS))
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 33+ messages in thread

end of thread, other threads:[~2017-07-04 13:55 UTC | newest]

Thread overview: 33+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2017-04-27 14:06 [PATCH 1/2] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
2017-05-04  6:56 ` [PATCH v3 1/4] mk: add crypto capability for generic armv8a and thunderx Ashwin Sekhar T K
2017-05-04  6:57   ` [PATCH v3 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
2017-05-04 15:22     ` Jan Viktorin
2017-05-04  6:57   ` [PATCH v3 3/4] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
2017-05-04  6:57   ` [PATCH v3 4/4] test: add tests for arm64 CRC neon versions Ashwin Sekhar T K
2017-05-04 15:20   ` [PATCH v3 1/4] mk: add crypto capability for generic armv8a and thunderx Jan Viktorin
2017-05-04 22:10     ` Thomas Monjalon
2017-05-09  9:53 ` [PATCH v4 " Ashwin Sekhar T K
2017-05-09  9:53   ` [PATCH v4 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
2017-05-09  9:53   ` [PATCH v4 3/4] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
2017-05-12  5:51     ` Jianbo Liu
2017-05-12  7:25       ` Sekhar, Ashwin
2017-05-12  8:49         ` Jianbo Liu
2017-05-12  8:56           ` Sekhar, Ashwin
2017-05-09  9:53   ` [PATCH v4 4/4] test: add tests for arm64 CRC neon versions Ashwin Sekhar T K
2017-05-12 10:15 ` [PATCH v5 0/4] add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
2017-05-12 10:15   ` [PATCH v5 1/4] mk: add crypto capability for generic armv8a and thunderx Ashwin Sekhar T K
2017-05-12 10:15   ` [PATCH v5 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
2017-05-15  2:07     ` Jianbo Liu
2017-07-03 20:51     ` Thomas Monjalon
2017-07-04  8:48       ` Sekhar, Ashwin
2017-05-12 10:15   ` [PATCH v5 3/4] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
2017-05-15  2:32     ` Jianbo Liu
2017-07-03 21:06     ` Thomas Monjalon
2017-05-12 10:15   ` [PATCH v5 4/4] test: add tests for arm64 CRC neon versions Ashwin Sekhar T K
2017-07-04  9:24 ` [PATCH v6 0/4] add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
2017-07-04  9:24   ` [PATCH v6 1/4] mk: add crypto capability for generic armv8a and thunderx Ashwin Sekhar T K
2017-07-04  9:24   ` [PATCH v6 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
2017-07-04  9:24   ` [PATCH v6 3/4] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
2017-07-04 13:53     ` Thomas Monjalon
2017-07-04  9:24   ` [PATCH v6 4/4] test: add tests for arm64 CRC neon versions Ashwin Sekhar T K
2017-07-04 13:55   ` [PATCH v6 0/4] add arm64 neon version of CRC compute APIs Thomas Monjalon

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.