DPDK-dev Archive on lore.kernel.org
* [PATCH v1 0/3] net/iavf: add NEON support for Rx/Tx paths
@ 2026-04-17 13:08 Jay Wang
  2026-04-17 13:08 ` [PATCH v1 1/3] net/iavf: add Rx scattered function for 32B desc Jay Wang
                   ` (3 more replies)
  0 siblings, 4 replies; 13+ messages in thread
From: Jay Wang @ 2026-04-17 13:08 UTC (permalink / raw)
  Cc: dev, nd, Jay Wang

- Add the scattered Rx burst function for 32B legacy descriptor
- Add the NEON-optimised Tx burst function
- Add the NEON-optimised Rx burst functions for flexible descriptor
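
A quick way to exercise the new 128-bit paths (a hypothetical
invocation; the PCI address and core list are assumptions) is to pin
testpmd to them with the EAL SIMD-bitwidth override:

  dpdk-testpmd -l 0-1 -a 0000:01:00.1 --force-max-simd-bitwidth=128 -- \
      --forward-mode=io --rxq=1 --txq=1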

Jay Wang (3):
  net/iavf: add Rx scattered function for 32B desc
  net/iavf: add NEON-optimised Tx burst function
  net/iavf: add NEON support for Rx flex desc

 drivers/net/intel/iavf/iavf.h               |   4 +
 drivers/net/intel/iavf/iavf_rxtx.c          |  52 +-
 drivers/net/intel/iavf/iavf_rxtx.h          |   2 -
 drivers/net/intel/iavf/iavf_rxtx_vec_neon.c | 751 +++++++++++++++++++-
 drivers/net/intel/iavf/meson.build          |   2 +-
 5 files changed, 795 insertions(+), 16 deletions(-)

-- 
2.43.0


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH v1 1/3] net/iavf: add Rx scattered function for 32B desc
  2026-04-17 13:08 [PATCH v1 0/3] net/iavf: add NEON support for Rx/Tx paths Jay Wang
@ 2026-04-17 13:08 ` Jay Wang
  2026-04-17 13:08 ` [PATCH v1 2/3] net/iavf: add NEON-optimised Tx burst function Jay Wang
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 13+ messages in thread
From: Jay Wang @ 2026-04-17 13:08 UTC (permalink / raw)
  To: Vladimir Medvedkin; +Cc: dev, nd, Jay Wang

Add the scattered burst function on AArch64 so that we can leverage
the NEON-optimised Rx raw burst function to handle scattered packets for
the legacy 32B descriptor.
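
As a reading aid (not part of the change), here is a scalar sketch of
the split-flag convention the vector code relies on: the raw burst
routine records an inverted end-of-packet bit per descriptor, and any
non-zero flag hands the buffers to the common reassembly helper. Bit
positions follow the legacy descriptor status field as used by the
patch (DD = bit 0, EOP = bit 1); desc_status() is a hypothetical
accessor:

  for (i = 0; i < nb_bufs; i++) {
      uint16_t status = desc_status(i); /* hypothetical accessor */
      /* non-zero means "this buffer does NOT end a packet" */
      split_flags[i] = (status & (1 << 1)) ? 0 : 0x02;
  }
  /* any non-zero flag sends the burst through ci_rx_reassemble_packets() */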

Signed-off-by: Jay Wang <jay.wang2@arm.com>
---
 drivers/net/intel/iavf/iavf.h               |   1 +
 drivers/net/intel/iavf/iavf_rxtx.c          |  16 ++-
 drivers/net/intel/iavf/iavf_rxtx_vec_neon.c | 110 +++++++++++++++++++-
 drivers/net/intel/iavf/meson.build          |   2 +-
 4 files changed, 122 insertions(+), 7 deletions(-)

diff --git a/drivers/net/intel/iavf/iavf.h b/drivers/net/intel/iavf/iavf.h
index 403c61e2e8..e4936f3566 100644
--- a/drivers/net/intel/iavf/iavf.h
+++ b/drivers/net/intel/iavf/iavf.h
@@ -334,6 +334,7 @@ enum iavf_rx_func_type {
 	IAVF_RX_BULK_ALLOC,
 	IAVF_RX_BULK_ALLOC_FLEX_RXD,
 	IAVF_RX_NEON,
+	IAVF_RX_NEON_SCATTERED,
 	IAVF_RX_AVX2,
 	IAVF_RX_AVX2_SCATTERED,
 	IAVF_RX_AVX2_OFFLOAD,
diff --git a/drivers/net/intel/iavf/iavf_rxtx.c b/drivers/net/intel/iavf/iavf_rxtx.c
index 4ff6c18dc4..15566a0e18 100644
--- a/drivers/net/intel/iavf/iavf_rxtx.c
+++ b/drivers/net/intel/iavf/iavf_rxtx.c
@@ -3551,16 +3551,26 @@ static const struct ci_rx_path_info iavf_rx_path_infos[] = {
 		}
 	},
 #endif
-#elif defined RTE_ARCH_ARM
+#elif defined(RTE_ARCH_ARM64)
 	[IAVF_RX_NEON] = {
 		.pkt_burst = iavf_recv_pkts_vec,
 		.info = "Vector Neon",
 		.features = {
-			.rx_offloads = IAVF_RX_SCALAR_OFFLOADS,
+			.rx_offloads = IAVF_RX_VECTOR_OFFLOADS,
 			.simd_width = RTE_VECT_SIMD_128,
 			.bulk_alloc = true
 		}
 	},
+	[IAVF_RX_NEON_SCATTERED] = {
+		.pkt_burst = iavf_recv_scattered_pkts_vec,
+		.info = "Vector Scattered Neon",
+		.features = {
+			.rx_offloads = IAVF_RX_VECTOR_OFFLOADS | RTE_ETH_RX_OFFLOAD_SCATTER,
+			.simd_width = RTE_VECT_SIMD_128,
+			.scattered = true,
+			.bulk_alloc = true
+		}
+	},
 #endif
 };
 
@@ -3839,7 +3849,7 @@ iavf_set_rx_function(struct rte_eth_dev *dev)
 	if (adapter->rx_bulk_alloc_allowed) {
 		req_features.bulk_alloc = true;
 		default_path = IAVF_RX_BULK_ALLOC;
-#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM)
+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
 		if (iavf_rx_vec_dev_check(dev) != -1)
 			req_features.simd_width = iavf_get_max_simd_bitwidth();
 #endif
diff --git a/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c b/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
index 28c90b2a72..45e377d728 100644
--- a/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
+++ b/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2022 Intel Corporation
- * Copyright(c) 2022 Arm Limited
+ * Copyright(c) 2022-2026 Arm Limited
  */
 
 #include <stdint.h>
@@ -145,8 +145,6 @@ _recv_raw_pkts_vec(struct ci_rx_queue *__rte_restrict rxq,
 		   struct rte_mbuf **__rte_restrict rx_pkts,
 		   uint16_t nb_pkts, uint8_t *split_packet)
 {
-	RTE_SET_USED(split_packet);
-
 	volatile union ci_rx_desc *rxdp;
 	struct ci_rx_entry *sw_ring;
 	uint16_t nb_pkts_recd;
@@ -164,6 +162,13 @@ _recv_raw_pkts_vec(struct ci_rx_queue *__rte_restrict rxq,
 		4, 5, 6, 7    /* octet 4~7, 32bits rss */
 		};
 
+	uint8x16_t eop_check = {
+		0x02, 0x00, 0x02, 0x00,
+		0x02, 0x00, 0x02, 0x00,
+		0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00
+	};
+
 	uint16x8_t crc_adjust = {
 		0, 0,         /* ignore pkt_type field */
 		rxq->crc_len, /* sub crc on pkt_len */
@@ -238,6 +243,13 @@ _recv_raw_pkts_vec(struct ci_rx_queue *__rte_restrict rxq,
 		vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);
 		vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);
 
+		if (split_packet) {
+			rte_mbuf_prefetch_part2(rx_pkts[pos]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
+		}
+
 		/* pkts shift the pktlen field to be 16-bit aligned*/
 		uint32x4_t len3 = vshlq_u32(vreinterpretq_u32_u64(descs[3]),
 					    len_shl);
@@ -306,6 +318,32 @@ _recv_raw_pkts_vec(struct ci_rx_queue *__rte_restrict rxq,
 		staterr = vzipq_u16(sterr_tmp1.val[1],
 				    sterr_tmp2.val[1]).val[0];
 
+		/* C* extract and record EOP bit */
+		if (split_packet) {
+			uint8x16_t eop_shuf_mask = {
+				0x00, 0x02, 0x04, 0x06,
+				0xFF, 0xFF, 0xFF, 0xFF,
+				0xFF, 0xFF, 0xFF, 0xFF,
+				0xFF, 0xFF, 0xFF, 0xFF
+			};
+			uint8x16_t eop_bits;
+
+			/* and with mask to extract bits, flipping 1-0 */
+			eop_bits = vmvnq_u8(vreinterpretq_u8_u16(staterr));
+			eop_bits = vandq_u8(eop_bits, eop_check);
+			/* the staterr values are not in order; ordering does
+			 * not matter when counting DD bits, but it does for
+			 * end-of-packet tracking, so shuffle. This also
+			 * compresses the 32-bit values to 8-bit
+			 */
+			eop_bits = vqtbl1q_u8(eop_bits, eop_shuf_mask);
+
+			/* store the resulting 32-bit value */
+			vst1q_lane_u32((uint32_t *)split_packet,
+				vreinterpretq_u32_u8(eop_bits), 0);
+			split_packet += IAVF_VPMD_DESCS_PER_LOOP;
+		}
+
 		staterr = vshlq_n_u16(staterr, IAVF_UINT16_BIT - 1);
 		staterr = vreinterpretq_u16_s16(
 				vshrq_n_s16(vreinterpretq_s16_u16(staterr),
@@ -341,6 +379,72 @@ iavf_recv_pkts_vec(void *__rte_restrict rx_queue,
 	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
+/*
+ * vPMD receive routine that reassembles a single burst of 32 scattered
+ * packets.
+ *
+ * Notice:
+ * - if nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, no packets are returned
+ */
+static __rte_always_inline uint16_t
+iavf_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+		uint16_t nb_pkts)
+{
+	struct ci_rx_queue *rxq = rx_queue;
+	uint8_t split_flags[IAVF_VPMD_RX_BURST] = {0};
+
+	/* get some new buffers */
+	uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
+						split_flags);
+
+	if (nb_bufs == 0)
+		return 0;
+
+	/* happy day case, full burst + no packets to be assembled */
+	const uint64_t *split_fl64 = (uint64_t *)split_flags;
+	if (!rxq->pkt_first_seg &&
+			split_fl64[0] == 0 && split_fl64[1] == 0 &&
+			split_fl64[2] == 0 && split_fl64[3] == 0)
+		return nb_bufs;
+
+	/* reassemble any packets that need reassembly */
+	unsigned int i = 0;
+	if (!rxq->pkt_first_seg) {
+		/* find the first split flag, and only reassemble from there */
+		while (i < nb_bufs && !split_flags[i])
+			i++;
+		if (i == nb_bufs)
+			return nb_bufs;
+		rxq->pkt_first_seg = rx_pkts[i];
+	}
+	return i + ci_rx_reassemble_packets(&rx_pkts[i], nb_bufs - i,
+			&split_flags[i], &rxq->pkt_first_seg, &rxq->pkt_last_seg,
+			rxq->crc_len);
+}
+
+/*
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+iavf_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+		uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > IAVF_VPMD_RX_BURST) {
+		uint16_t burst;
+		burst = iavf_recv_scattered_burst_vec(rx_queue,
+				rx_pkts + retval, IAVF_VPMD_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < IAVF_VPMD_RX_BURST)
+			return retval;
+	}
+	/* handle the final burst, where nb_pkts <= IAVF_VPMD_RX_BURST */
+	return retval + iavf_recv_scattered_burst_vec(rx_queue,
+			rx_pkts + retval, nb_pkts);
+}
+
 void __rte_cold
 iavf_rx_queue_release_mbufs_neon(struct ci_rx_queue *rxq)
 {
diff --git a/drivers/net/intel/iavf/meson.build b/drivers/net/intel/iavf/meson.build
index f9576586f6..50630a88c8 100644
--- a/drivers/net/intel/iavf/meson.build
+++ b/drivers/net/intel/iavf/meson.build
@@ -29,7 +29,7 @@ sources = files(
 if arch_subdir == 'x86'
     sources_avx2 += files('iavf_rxtx_vec_avx2.c')
     sources_avx512 += files('iavf_rxtx_vec_avx512.c')
-elif arch_subdir == 'arm'
+elif arch_subdir == 'arm' and dpdk_conf.get('RTE_ARCH_64')
     sources += files('iavf_rxtx_vec_neon.c')
 endif
 
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH v1 2/3] net/iavf: add NEON-optimised Tx burst function
  2026-04-17 13:08 [PATCH v1 0/3] net/iavf: add NEON support for Rx/Tx paths Jay Wang
  2026-04-17 13:08 ` [PATCH v1 1/3] net/iavf: add Rx scattered function for 32B desc Jay Wang
@ 2026-04-17 13:08 ` Jay Wang
  2026-04-17 13:08 ` [PATCH v1 3/3] net/iavf: add NEON support for Rx flex desc Jay Wang
  2026-04-20 10:30 ` [PATCH v2 0/3] net/iavf: add NEON support for Rx/Tx paths Jay Wang
  3 siblings, 0 replies; 13+ messages in thread
From: Jay Wang @ 2026-04-17 13:08 UTC (permalink / raw)
  To: Vladimir Medvedkin; +Cc: dev, nd, Jay Wang

This patch adds the NEON-optimised Tx burst function for the Intel IAVF
driver on AArch64.
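
Applications are unaffected: the NEON path is chosen internally by
iavf_set_tx_function() when the vector checks pass, so the usual burst
API is all that is needed (port_id/queue_id are assumed to be
configured elsewhere):

  uint16_t sent = rte_eth_tx_burst(port_id, queue_id, tx_pkts, nb_pkts);

Each descriptor write then packs the command flags and buffer length
into the high quadword, as iavf_vtx1() in the diff below shows.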

Signed-off-by: Jay Wang <jay.wang2@arm.com>
---
 drivers/net/intel/iavf/iavf.h               |   1 +
 drivers/net/intel/iavf/iavf_rxtx.c          |  15 ++-
 drivers/net/intel/iavf/iavf_rxtx.h          |   2 -
 drivers/net/intel/iavf/iavf_rxtx_vec_neon.c | 120 ++++++++++++++++++++
 4 files changed, 133 insertions(+), 5 deletions(-)

diff --git a/drivers/net/intel/iavf/iavf.h b/drivers/net/intel/iavf/iavf.h
index e4936f3566..3e71d345a9 100644
--- a/drivers/net/intel/iavf/iavf.h
+++ b/drivers/net/intel/iavf/iavf.h
@@ -356,6 +356,7 @@ enum iavf_rx_func_type {
 enum iavf_tx_func_type {
 	IAVF_TX_DISABLED,
 	IAVF_TX_DEFAULT,
+	IAVF_TX_NEON,
 	IAVF_TX_AVX2,
 	IAVF_TX_AVX2_OFFLOAD,
 	IAVF_TX_AVX512,
diff --git a/drivers/net/intel/iavf/iavf_rxtx.c b/drivers/net/intel/iavf/iavf_rxtx.c
index 15566a0e18..645bc5ccf6 100644
--- a/drivers/net/intel/iavf/iavf_rxtx.c
+++ b/drivers/net/intel/iavf/iavf_rxtx.c
@@ -3662,6 +3662,15 @@ static const struct ci_tx_path_info iavf_tx_path_infos[] = {
 		}
 	},
 #endif
+#elif defined(RTE_ARCH_ARM64)
+	[IAVF_TX_NEON] = {
+		.pkt_burst = iavf_xmit_pkts_vec,
+		.info = "Vector Neon",
+		.features = {
+			.tx_offloads = IAVF_TX_VECTOR_OFFLOADS,
+			.simd_width = RTE_VECT_SIMD_128
+		}
+	},
 #endif
 };
 
@@ -3878,7 +3887,7 @@ iavf_set_tx_function(struct rte_eth_dev *dev)
 		IAVF_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
 	int mbuf_check = adapter->devargs.mbuf_check;
 	int no_poll_on_link_down = adapter->devargs.no_poll_on_link_down;
-#ifdef RTE_ARCH_X86
+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
 	struct ci_tx_queue *txq;
 	int i;
 	const struct ci_tx_path_features *selected_features;
@@ -3892,7 +3901,7 @@ iavf_set_tx_function(struct rte_eth_dev *dev)
 	if (dev->data->dev_started)
 		goto out;
 
-#ifdef RTE_ARCH_X86
+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
 	if (iavf_tx_vec_dev_check(dev) != -1)
 		req_features.simd_width = iavf_get_max_simd_bitwidth();
 
@@ -3915,7 +3924,7 @@ iavf_set_tx_function(struct rte_eth_dev *dev)
 						IAVF_TX_DEFAULT);
 
 out:
-#ifdef RTE_ARCH_X86
+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
 	selected_features = &iavf_tx_path_infos[adapter->tx_func_type].features;
 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
 		txq = dev->data->tx_queues[i];
diff --git a/drivers/net/intel/iavf/iavf_rxtx.h b/drivers/net/intel/iavf/iavf_rxtx.h
index 80b06518b0..8b8e55e66f 100644
--- a/drivers/net/intel/iavf/iavf_rxtx.h
+++ b/drivers/net/intel/iavf/iavf_rxtx.h
@@ -558,8 +558,6 @@ uint16_t iavf_recv_scattered_pkts_vec(void *rx_queue,
 uint16_t iavf_recv_scattered_pkts_vec_flex_rxd(void *rx_queue,
 					       struct rte_mbuf **rx_pkts,
 					       uint16_t nb_pkts);
-uint16_t iavf_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
-				  uint16_t nb_pkts);
 uint16_t iavf_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 				 uint16_t nb_pkts);
 uint16_t iavf_recv_pkts_vec_avx2_offload(void *rx_queue, struct rte_mbuf **rx_pkts,
diff --git a/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c b/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
index 45e377d728..9c91b6bac1 100644
--- a/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
+++ b/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
@@ -445,6 +445,120 @@ iavf_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 			rx_pkts + retval, nb_pkts);
 }
 
+static __rte_always_inline void
+iavf_vtx1(volatile struct ci_tx_desc *txdp, struct rte_mbuf *pkt,
+	 uint64_t flags)
+{
+	uint64_t high_qw = (CI_TX_DESC_DTYPE_DATA |
+		((uint64_t)flags << CI_TXD_QW1_CMD_S) |
+		((uint64_t)pkt->data_len << CI_TXD_QW1_TX_BUF_SZ_S));
+
+	uint64x2_t descriptor = {rte_pktmbuf_iova(pkt), high_qw};
+	vst1q_u64(RTE_CAST_PTR(uint64_t *, txdp), descriptor);
+}
+
+static __rte_always_inline void
+iavf_vtx(volatile struct ci_tx_desc *txdp, struct rte_mbuf **pkt,
+	uint16_t nb_pkts, uint64_t flags)
+{
+	int i;
+
+	for (i = 0; i < nb_pkts; ++i, ++txdp, ++pkt)
+		iavf_vtx1(txdp, *pkt, flags);
+}
+
+static __rte_always_inline uint16_t
+iavf_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
+		uint16_t nb_pkts)
+{
+	struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
+	volatile struct ci_tx_desc *txdp;
+	struct ci_tx_entry_vec *txep;
+	uint16_t n, nb_commit, tx_id;
+	uint64_t flags = CI_TX_DESC_CMD_DEFAULT;
+	uint64_t rs = CI_TX_DESC_CMD_RS | CI_TX_DESC_CMD_DEFAULT;
+	int i;
+
+	/* crossing the tx_rs_thresh boundary is not allowed */
+	nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+
+	if (txq->nb_tx_free < txq->tx_free_thresh)
+		ci_tx_free_bufs_vec(txq, iavf_tx_desc_done, false);
+
+	nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
+	nb_commit = nb_pkts;
+	if (unlikely(nb_pkts == 0))
+		return 0;
+
+	tx_id = txq->tx_tail;
+	txdp = &txq->ci_tx_ring[tx_id];
+	txep = &txq->sw_ring_vec[tx_id];
+
+	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
+
+	n = (uint16_t)(txq->nb_tx_desc - tx_id);
+	if (nb_commit >= n) {
+		ci_tx_backlog_entry_vec(txep, tx_pkts, n);
+
+		for (i = 0; i < n - 1; ++i, ++tx_pkts, ++txdp)
+			iavf_vtx1(txdp, *tx_pkts, flags);
+
+		/* write with RS for the last descriptor in the segment */
+		iavf_vtx1(txdp, *tx_pkts++, rs);
+
+		nb_commit = (uint16_t)(nb_commit - n);
+
+		tx_id = 0;
+		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
+
+		/* avoid reaching the end of the ring */
+		txdp = &txq->ci_tx_ring[tx_id];
+		txep = &txq->sw_ring_vec[tx_id];
+	}
+
+	ci_tx_backlog_entry_vec(txep, tx_pkts, nb_commit);
+
+	iavf_vtx(txdp, tx_pkts, nb_commit, flags);
+
+	tx_id = (uint16_t)(tx_id + nb_commit);
+	if (tx_id > txq->tx_next_rs) {
+		txq->ci_tx_ring[txq->tx_next_rs].cmd_type_offset_bsz |=
+			rte_cpu_to_le_64(((uint64_t)CI_TX_DESC_CMD_RS) <<
+					 CI_TXD_QW1_CMD_S);
+		txq->tx_next_rs =
+			(uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh);
+	}
+
+	txq->tx_tail = tx_id;
+
+	IAVF_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);
+
+	return nb_pkts;
+}
+
+uint16_t
+iavf_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
+		   uint16_t nb_pkts)
+{
+	uint16_t nb_tx = 0;
+	struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
+
+	while (nb_pkts) {
+		uint16_t ret, num;
+
+		/* crossing the rs_thresh boundary is not allowed */
+		num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+		ret = iavf_xmit_fixed_burst_vec(tx_queue, &tx_pkts[nb_tx],
+				num);
+		nb_tx += ret;
+		nb_pkts -= ret;
+		if (ret < num)
+			break;
+	}
+
+	return nb_tx;
+}
+
 void __rte_cold
 iavf_rx_queue_release_mbufs_neon(struct ci_rx_queue *rxq)
 {
@@ -465,6 +579,12 @@ iavf_rx_vec_dev_check(struct rte_eth_dev *dev)
 	return iavf_rx_vec_dev_check_default(dev);
 }
 
+int __rte_cold
+iavf_tx_vec_dev_check(struct rte_eth_dev *dev)
+{
+	return iavf_tx_vec_dev_check_default(dev);
+}
+
 enum rte_vect_max_simd
 iavf_get_max_simd_bitwidth(void)
 {
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH v1 3/3] net/iavf: add NEON support for Rx flex desc
  2026-04-17 13:08 [PATCH v1 0/3] net/iavf: add NEON support for Rx/Tx paths Jay Wang
  2026-04-17 13:08 ` [PATCH v1 1/3] net/iavf: add Rx scattered function for 32B desc Jay Wang
  2026-04-17 13:08 ` [PATCH v1 2/3] net/iavf: add NEON-optimised Tx burst function Jay Wang
@ 2026-04-17 13:08 ` Jay Wang
  2026-04-20 10:30 ` [PATCH v2 0/3] net/iavf: add NEON support for Rx/Tx paths Jay Wang
  3 siblings, 0 replies; 13+ messages in thread
From: Jay Wang @ 2026-04-17 13:08 UTC (permalink / raw)
  To: Vladimir Medvedkin; +Cc: dev, nd, Jay Wang

This patch adds the NEON-optimised Rx paths for processing the flexible
receive descriptor format.
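
For orientation, here is a scalar sketch of the flag extraction that
the NEON code vectorises (simplified: the outer-L4 flag repositioning
done by the vector code is omitted, and the lookup arrays stand in for
the cksum_flags/rss_vlan_flags tables defined in the patch):

  uint16_t se0 = rte_le_to_cpu_16(rxdp->wb.status_error0) & 0x30f0;
  uint64_t ol  = (uint64_t)cksum_flags[(se0 >> 4) & 0xf] << 1;
  ol |= rss_vlan_flags[(se0 >> 12) & 0x3]; /* bit 12 RSS, bit 13 VLAN */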

Signed-off-by: Jay Wang <jay.wang2@arm.com>
---
 drivers/net/intel/iavf/iavf.h               |   2 +
 drivers/net/intel/iavf/iavf_rxtx.c          |  21 +
 drivers/net/intel/iavf/iavf_rxtx_vec_neon.c | 521 +++++++++++++++++++-
 3 files changed, 540 insertions(+), 4 deletions(-)

diff --git a/drivers/net/intel/iavf/iavf.h b/drivers/net/intel/iavf/iavf.h
index 3e71d345a9..360d728f3a 100644
--- a/drivers/net/intel/iavf/iavf.h
+++ b/drivers/net/intel/iavf/iavf.h
@@ -335,6 +335,8 @@ enum iavf_rx_func_type {
 	IAVF_RX_BULK_ALLOC_FLEX_RXD,
 	IAVF_RX_NEON,
 	IAVF_RX_NEON_SCATTERED,
+	IAVF_RX_NEON_FLEX_RXD,
+	IAVF_RX_NEON_SCATTERED_FLEX_RXD,
 	IAVF_RX_AVX2,
 	IAVF_RX_AVX2_SCATTERED,
 	IAVF_RX_AVX2_OFFLOAD,
diff --git a/drivers/net/intel/iavf/iavf_rxtx.c b/drivers/net/intel/iavf/iavf_rxtx.c
index 645bc5ccf6..ae005b0648 100644
--- a/drivers/net/intel/iavf/iavf_rxtx.c
+++ b/drivers/net/intel/iavf/iavf_rxtx.c
@@ -3571,6 +3571,27 @@ static const struct ci_rx_path_info iavf_rx_path_infos[] = {
 			.bulk_alloc = true
 		}
 	},
+	[IAVF_RX_NEON_FLEX_RXD] = {
+		.pkt_burst = iavf_recv_pkts_vec_flex_rxd,
+		.info = "Vector Neon Flex",
+		.features = {
+			.rx_offloads = IAVF_RX_VECTOR_FLEX_OFFLOADS,
+			.simd_width = RTE_VECT_SIMD_128,
+			.flex_desc = true,
+			.bulk_alloc = true
+		}
+	},
+	[IAVF_RX_NEON_SCATTERED_FLEX_RXD] = {
+		.pkt_burst = iavf_recv_scattered_pkts_vec_flex_rxd,
+		.info = "Vector Scattered Neon Flex",
+		.features = {
+			.rx_offloads = IAVF_RX_VECTOR_FLEX_OFFLOADS | RTE_ETH_RX_OFFLOAD_SCATTER,
+			.simd_width = RTE_VECT_SIMD_128,
+			.scattered = true,
+			.flex_desc = true,
+			.bulk_alloc = true
+		}
+	},
 #endif
 };
 
diff --git a/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c b/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
index 9c91b6bac1..9d7281e172 100644
--- a/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
+++ b/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
@@ -16,12 +16,445 @@
 
 #include "../common/rx_vec_arm.h"
 
-static inline void
+#define PKTLEN_SHIFT     10
+#define IAVF_UINT16_BIT (CHAR_BIT * sizeof(uint16_t))
+
+static __rte_always_inline void
 iavf_rxq_rearm(struct ci_rx_queue *rxq)
 {
 	ci_rxq_rearm(rxq);
 }
 
+static __rte_always_inline uint32x4_t
+iavf_flex_rxd_to_fdir_flags_vec(const uint32x4_t fdir_id0_3)
+{
+#define FDID_MIS_MAGIC	0xFFFFFFFFu
+	RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR != (1u << 2));
+	RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR_ID != (1u << 13));
+
+	const uint32x4_t pkt_fdir_bit = vdupq_n_u32((uint32_t)(RTE_MBUF_F_RX_FDIR |
+			RTE_MBUF_F_RX_FDIR_ID));
+	const uint32x4_t fdir_mis_mask = vdupq_n_u32(FDID_MIS_MAGIC);
+
+	/* desc->flow_id field == 0xFFFFFFFF means fdir mismatch */
+	uint32x4_t fdir_mask = vceqq_u32(fdir_id0_3, fdir_mis_mask);
+
+	/* xor with 0xFFFFFFFF bit-reverses the mask */
+	fdir_mask = veorq_u32(fdir_mask, fdir_mis_mask);
+	const uint32x4_t fdir_flags = vandq_u32(fdir_mask, pkt_fdir_bit);
+
+	return fdir_flags;
+}
+
+static __rte_always_inline void
+iavf_flex_rxd_to_olflags_v(struct ci_rx_queue *rxq, uint64x2_t descs[4],
+		struct rte_mbuf **rx_pkts)
+{
+	const uint64x2_t mbuf_init = {rxq->mbuf_initializer, 0};
+	uint64x2_t rearm0, rearm1, rearm2, rearm3;
+
+	uint32x4_t tmp_desc, flags, rss_vlan;
+
+	/* mask everything except checksum, RSS, and VLAN flags
+	 * bit fields defined in enum iavf_rx_flex_desc_status_error_0_bits
+	 * bit 7:4 for checksum
+	 * bit 12 for RSS indication
+	 * bit 13 for VLAN indication
+	 */
+	const uint32x4_t desc_mask = {0x30f0, 0x30f0, 0x30f0, 0x30f0};
+	const uint32x4_t cksum_mask = {
+		RTE_MBUF_F_RX_IP_CKSUM_MASK | RTE_MBUF_F_RX_L4_CKSUM_MASK |
+		RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
+		RTE_MBUF_F_RX_IP_CKSUM_MASK | RTE_MBUF_F_RX_L4_CKSUM_MASK |
+		RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
+		RTE_MBUF_F_RX_IP_CKSUM_MASK | RTE_MBUF_F_RX_L4_CKSUM_MASK |
+		RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
+		RTE_MBUF_F_RX_IP_CKSUM_MASK | RTE_MBUF_F_RX_L4_CKSUM_MASK |
+		RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
+	};
+
+	/* map the checksum, rss and vlan fields to the checksum, rss
+	 * and vlan flags.
+	 */
+	const uint8x16_t cksum_flags = {
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
+		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
+		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
+		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
+		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1
+	};
+
+	const uint8x16_t rss_vlan_flags = {
+		0, RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
+		RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
+		0, 0, 0, 0,
+		0, 0, 0, 0,
+		0, 0, 0, 0,
+	};
+
+	/* extract status_error0 field from 4 descriptors,
+	 * and mask out everything else not in desc_mask
+	 */
+	flags = vzip2q_u32(vreinterpretq_u32_u64(descs[0]),
+				vreinterpretq_u32_u64(descs[1]));
+	tmp_desc = vzip2q_u32(vreinterpretq_u32_u64(descs[2]),
+				vreinterpretq_u32_u64(descs[3]));
+	tmp_desc = vreinterpretq_u32_u64(vzip1q_u64(vreinterpretq_u64_u32(flags),
+				vreinterpretq_u64_u32(tmp_desc)));
+
+	tmp_desc = vandq_u32(tmp_desc, desc_mask);
+
+	/* shift each 32-bit lane right by 4 so that we can use
+	 * the checksum bit as an index into cksum_flags
+	 */
+	tmp_desc = vshrq_n_u32(tmp_desc, 4);
+	flags = vreinterpretq_u32_u8(vqtbl1q_u8(cksum_flags,
+				vreinterpretq_u8_u32(tmp_desc)));
+	/* shift left by 1 bit since we shift right by 1 bit
+	 * in cksum_flags
+	 */
+	flags = vshlq_n_u32(flags, 1);
+
+	/* first check the outer L4 checksum */
+	uint32x4_t l4_outer_mask = {0x6, 0x6, 0x6, 0x6};
+	uint32x4_t l4_outer_flags = vandq_u32(flags, l4_outer_mask);
+	l4_outer_flags = vshlq_n_u32(l4_outer_flags, 20);
+
+	/* then check the rest of cksum bits */
+	uint32x4_t l3_l4_mask = {~0x6, ~0x6, ~0x6, ~0x6};
+	uint32x4_t l3_l4_flags = vandq_u32(flags, l3_l4_mask);
+	flags = vorrq_u32(l3_l4_flags, l4_outer_flags);
+
+	/* only keep the cksum flags in flags */
+	flags = vandq_u32(flags, cksum_mask);
+
+	/* map RSS, VLAN flags in HW desc to RTE_MBUF */
+	tmp_desc = vshrq_n_u32(tmp_desc, 8);
+	rss_vlan = vreinterpretq_u32_u8(vqtbl1q_u8(rss_vlan_flags,
+				vreinterpretq_u8_u32(tmp_desc)));
+
+	/* merge the flags */
+	flags = vorrq_u32(flags, rss_vlan);
+
+	/* check the additional fdir_flags if fdir is enabled */
+	if (rxq->fdir_enabled) {
+		const uint32x4_t fdir_id0_1 =
+			vzip2q_u32(vreinterpretq_u32_u64(descs[0]),
+				vreinterpretq_u32_u64(descs[1]));
+		const uint32x4_t fdir_id2_3 =
+			vzip2q_u32(vreinterpretq_u32_u64(descs[2]),
+				vreinterpretq_u32_u64(descs[3]));
+		const uint32x4_t fdir_id0_3 =
+			vreinterpretq_u32_u64(vzip2q_u64(vreinterpretq_u64_u32(fdir_id0_1),
+				vreinterpretq_u64_u32(fdir_id2_3)));
+		const uint32x4_t fdir_flags =
+			iavf_flex_rxd_to_fdir_flags_vec(fdir_id0_3);
+
+		/* merge with fdir_flags */
+		flags = vorrq_u32(flags, fdir_flags);
+
+		/* write fdir_id to mbuf */
+		rx_pkts[0]->hash.fdir.hi = vgetq_lane_u32(fdir_id0_3, 0);
+		rx_pkts[1]->hash.fdir.hi = vgetq_lane_u32(fdir_id0_3, 1);
+		rx_pkts[2]->hash.fdir.hi = vgetq_lane_u32(fdir_id0_3, 2);
+		rx_pkts[3]->hash.fdir.hi = vgetq_lane_u32(fdir_id0_3, 3);
+	}
+
+	/* At this point, we have the 4 sets of flags in the low-16-bits
+	 * of each 32-bit value in flags.
+	 * We want to extract these, and merge them with the mbuf init data
+	 * so we can do a single 16-byte write to the mbuf to set the flags
+	 * and all the other initialization fields. Extracting the appropriate
+	 * flags means that we have to do a shift and blend for each mbuf
+	 * before we do the write.
+	 */
+	rearm0 = vsetq_lane_u64(vgetq_lane_u32(flags, 0), mbuf_init, 1);
+	rearm1 = vsetq_lane_u64(vgetq_lane_u32(flags, 1), mbuf_init, 1);
+	rearm2 = vsetq_lane_u64(vgetq_lane_u32(flags, 2), mbuf_init, 1);
+	rearm3 = vsetq_lane_u64(vgetq_lane_u32(flags, 3), mbuf_init, 1);
+
+	/* compile time check */
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
+					 offsetof(struct rte_mbuf, rearm_data) + 8);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
+					 RTE_ALIGN(offsetof(struct rte_mbuf, rearm_data), 16));
+
+	/* write the rearm data and the olflags in one write */
+	vst1q_u64((uint64_t *)&rx_pkts[0]->rearm_data, rearm0);
+	vst1q_u64((uint64_t *)&rx_pkts[1]->rearm_data, rearm1);
+	vst1q_u64((uint64_t *)&rx_pkts[2]->rearm_data, rearm2);
+	vst1q_u64((uint64_t *)&rx_pkts[3]->rearm_data, rearm3);
+}
+
+static __rte_always_inline void
+iavf_flex_rxd_to_ptype_v(uint64x2_t descs[4], struct rte_mbuf **rx_pkts,
+	uint32_t *ptype_tbl)
+{
+	const uint16x8_t ptype_mask = {
+		0, IAVF_RX_FLEX_DESC_PTYPE_M,
+		0, IAVF_RX_FLEX_DESC_PTYPE_M,
+		0, IAVF_RX_FLEX_DESC_PTYPE_M,
+		0, IAVF_RX_FLEX_DESC_PTYPE_M
+	};
+
+	uint32x4_t ptype_01 = vzip1q_u32(vreinterpretq_u32_u64(descs[0]),
+				vreinterpretq_u32_u64(descs[1]));
+	uint32x4_t ptype_23 = vzip1q_u32(vreinterpretq_u32_u64(descs[2]),
+				vreinterpretq_u32_u64(descs[3]));
+	uint32x4_t ptype_all_u32 =
+				vreinterpretq_u32_u64(vzip1q_u64(vreinterpretq_u64_u32(ptype_01),
+					vreinterpretq_u64_u32(ptype_23)));
+	uint16x8_t ptype_all = vreinterpretq_u16_u32(ptype_all_u32);
+
+	ptype_all = vandq_u16(ptype_all, ptype_mask);
+
+	rx_pkts[0]->packet_type = ptype_tbl[vgetq_lane_u16(ptype_all, 1)];
+	rx_pkts[1]->packet_type = ptype_tbl[vgetq_lane_u16(ptype_all, 3)];
+	rx_pkts[2]->packet_type = ptype_tbl[vgetq_lane_u16(ptype_all, 5)];
+	rx_pkts[3]->packet_type = ptype_tbl[vgetq_lane_u16(ptype_all, 7)];
+}
+
+/**
+ * vPMD raw receive routine for the flex RxD;
+ * only accepts nb_pkts >= IAVF_VPMD_DESCS_PER_LOOP
+ *
+ * Notice:
+ * - if nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, no packets are returned
+ * - nb_pkts is floor-aligned to a multiple of IAVF_VPMD_DESCS_PER_LOOP
+ */
+static inline uint16_t
+_recv_raw_pkts_vec_flex_rxd(struct ci_rx_queue *__rte_restrict rxq,
+		   struct rte_mbuf **__rte_restrict rx_pkts,
+		   uint16_t nb_pkts, uint8_t *split_packet)
+{
+	struct iavf_adapter *adapter = rxq->iavf_vsi->adapter;
+	uint32_t *ptype_tbl = adapter->ptype_tbl;
+	volatile union ci_rx_flex_desc *rxdp;
+	struct ci_rx_entry *sw_ring;
+	uint16_t nb_pkts_recd;
+	int pos;
+
+	uint16x8_t crc_adjust = {
+		0, 0,			/* ignore pkt_type field */
+		rxq->crc_len,	/* sub crc on pkt_len */
+		0,				/* ignore high 16 bits of pkt_len */
+		rxq->crc_len,	/* sub crc on data_len */
+		0, 0, 0			/* ignore non-length fields */
+	};
+
+	/* mask to shuffle from flex descriptor to mbuf */
+	const uint8x16_t shuf_msk = {
+		0xFF, 0xFF,		/* pkt_type set as unknown */
+		0xFF, 0xFF,		/* pkt_type set as unknown */
+		4, 5,			/* octet 4~5, low bits pkt_len */
+		0xFF, 0xFF,		/* skip high 16 bits pkt_len, zero out */
+		4, 5,			/* octet 4~5, 16 bits data_len */
+		10, 11,			/* octet 10~11, 16 bits vlan_macip */
+		0xFF, 0xFF,		/* rss hash parsed separately */
+		0xFF, 0xFF,
+	};
+
+	/* compile-time check the above crc_adjust layout is correct */
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
+
+	/* 4 packets DD mask */
+	const uint16x8_t dd_check = {
+		0x0001, 0x0001, 0x0001, 0x0001,
+		0, 0, 0, 0
+	};
+
+	/* 4 packets EOP mask */
+	const uint8x16_t eop_check = {
+		0x2, 0, 0x2, 0, 0x2, 0, 0x2, 0,
+		0, 0, 0, 0, 0, 0, 0, 0
+	};
+
+	/* nb_pkts has to be floor-aligned to IAVF_VPMD_DESCS_PER_LOOP */
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_VPMD_DESCS_PER_LOOP);
+
+	rxdp = rxq->rx_flex_ring + rxq->rx_tail;
+	rte_prefetch0(rxdp);
+
+	/* see if we need to rearm the Rx queue */
+	if (rxq->rxrearm_nb > rxq->rx_free_thresh)
+		iavf_rxq_rearm(rxq);
+
+	/* check if there is actually a packet available */
+	if (!(rxdp->wb.status_error0 &
+			rte_cpu_to_le_32(1 << IAVF_RX_FLEX_DESC_STATUS0_DD_S)))
+		return 0;
+
+	/* move the next 'n' mbufs into the cache */
+	sw_ring = &rxq->sw_ring[rxq->rx_tail];
+
+	/* A. load 4 packets in a loop
+	 * [A*. mask out the unused dirty fields in flex desc]
+	 * B. copy 4 mbuf pointers from sw_ring to rx_pkts
+	 * C. count the number of DD bits in the 4 packets
+	 * [C*. extract the end-of-packet bit, if requested]
+	 * D. fill info from flex desc to mbuf
+	 */
+	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
+		 pos += IAVF_VPMD_DESCS_PER_LOOP,
+		 rxdp += IAVF_VPMD_DESCS_PER_LOOP) {
+		uint64x2_t descs[IAVF_VPMD_DESCS_PER_LOOP];
+		uint8x16_t pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3;
+		uint16x8_t sterr_tmp1, sterr_tmp2;
+		uint64x2_t mbp1, mbp2;
+		uint16x8_t staterr;
+		uint16x8_t tmp;
+		uint64_t stat;
+
+		/* A.1 load descs[3-0] */
+		descs[3] = vld1q_u64(RTE_CAST_PTR(uint64_t *, rxdp + 3));
+		descs[2] = vld1q_u64(RTE_CAST_PTR(uint64_t *, rxdp + 2));
+		descs[1] = vld1q_u64(RTE_CAST_PTR(uint64_t *, rxdp + 1));
+		descs[0] = vld1q_u64(RTE_CAST_PTR(uint64_t *, rxdp + 0));
+
+		/* use acquire fence to order loads of descriptor qwords */
+		rte_atomic_thread_fence(rte_memory_order_acquire);
+		/* A.2 reload qword0 to make it ordered after qword1 load */
+		descs[3] = vld1q_lane_u64(RTE_CAST_PTR(uint64_t *, rxdp + 3),
+					descs[3], 0);
+		descs[2] = vld1q_lane_u64(RTE_CAST_PTR(uint64_t *, rxdp + 2),
+					descs[2], 0);
+		descs[1] = vld1q_lane_u64(RTE_CAST_PTR(uint64_t *, rxdp + 1),
+					descs[1], 0);
+		descs[0] = vld1q_lane_u64(RTE_CAST_PTR(uint64_t *, rxdp),
+					descs[0], 0);
+
+		/* B.1 load 4 mbuf pointers */
+		mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]);
+		mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);
+
+		/* B.2 copy 4 mbuf pointers into rx_pkts */
+		vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);
+		vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);
+
+		/* prefetch the second mbuf cache line when packets may be chained */
+		if (split_packet) {
+			rte_mbuf_prefetch_part2(rx_pkts[pos]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
+		}
+
+		iavf_flex_rxd_to_olflags_v(rxq, descs, &rx_pkts[pos]);
+
+		/* D.1 pkts convert format from desc to pktmbuf */
+		pkt_mb3 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[3]), shuf_msk);
+		pkt_mb2 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[2]), shuf_msk);
+		pkt_mb1 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[1]), shuf_msk);
+		pkt_mb0 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[0]), shuf_msk);
+
+		/* D.2 adjust pkt_len and data_len to remove the CRC */
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb3), crc_adjust);
+		pkt_mb3 = vreinterpretq_u8_u16(tmp);
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb2), crc_adjust);
+		pkt_mb2 = vreinterpretq_u8_u16(tmp);
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb1), crc_adjust);
+		pkt_mb1 = vreinterpretq_u8_u16(tmp);
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb0), crc_adjust);
+		pkt_mb0 = vreinterpretq_u8_u16(tmp);
+
+		/* D.3 copy final data to rx_pkts */
+		vst1q_u8((void *)&rx_pkts[pos + 3]->rx_descriptor_fields1, pkt_mb3);
+		vst1q_u8((void *)&rx_pkts[pos + 2]->rx_descriptor_fields1, pkt_mb2);
+		vst1q_u8((void *)&rx_pkts[pos + 1]->rx_descriptor_fields1, pkt_mb1);
+		vst1q_u8((void *)&rx_pkts[pos + 0]->rx_descriptor_fields1, pkt_mb0);
+
+		iavf_flex_rxd_to_ptype_v(descs, &rx_pkts[pos], ptype_tbl);
+
+		/* C.1 filter staterr info only */
+		sterr_tmp2 = vzip2q_u16(vreinterpretq_u16_u64(descs[2]),
+						vreinterpretq_u16_u64(descs[3]));
+		sterr_tmp1 = vzip2q_u16(vreinterpretq_u16_u64(descs[0]),
+						vreinterpretq_u16_u64(descs[1]));
+
+		/* C.2 get 4 pkts status_error0 value */
+		staterr = vzip1q_u16(sterr_tmp1, sterr_tmp2);
+
+		/* C* extract and record EOP bits */
+		if (split_packet) {
+			uint8x16_t eop_bits;
+
+			/* and with mask to extract bits, flipping 1-0 */
+			eop_bits = vmvnq_u8(vreinterpretq_u8_u16(staterr));
+			eop_bits = vandq_u8(eop_bits, eop_check);
+
+			/* store the 32-bit value */
+			vst1q_lane_u32((uint32_t *)split_packet,
+							vreinterpretq_u32_u8(eop_bits), 0);
+			split_packet += IAVF_VPMD_DESCS_PER_LOOP;
+		}
+
+		/* C.3 count available number of descriptors */
+		/* mask everything except DD bit */
+		staterr = vandq_u16(staterr, dd_check);
+		/* move the status bit (bit0) into the sign bit (bit15)
+		 * of each 16-bit lane
+		 */
+		staterr = vshlq_n_u16(staterr, IAVF_UINT16_BIT - 1);
+
+		/* reinterpret staterr as a signed 16-bit and
+		 * arithmetic-shift-right by 15
+		 * each lane becomes 0xFFFF if original DD bit was 1, otherwise 0.
+		 * then interpret back to unsigned u16 vector
+		 */
+		staterr = vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(staterr),
+					IAVF_UINT16_BIT - 1));
+
+		/* reinterpret u16x8 vector as u64x2, and fetch the low u64
+		 * which contains the first four 16-bit lanes, and invert all bits
+		 */
+		stat = ~vgetq_lane_u64(vreinterpretq_u64_u16(staterr), 0);
+
+		if (unlikely(stat == 0)) {
+			nb_pkts_recd += IAVF_VPMD_DESCS_PER_LOOP;
+		} else {
+			nb_pkts_recd += rte_ctz64(stat) / IAVF_UINT16_BIT;
+			break;
+		}
+	}
+
+	/* Update our internal tail pointer */
+	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
+	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
+	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);
+
+	return nb_pkts_recd;
+}
+
 static inline void
 desc_to_olflags_v(struct ci_rx_queue *rxq, volatile union ci_rx_desc *rxdp,
 		  uint64x2_t descs[4], struct rte_mbuf **rx_pkts)
@@ -115,9 +548,6 @@ desc_to_olflags_v(struct ci_rx_queue *rxq, volatile union ci_rx_desc *rxdp,
 	vst1q_u64((uint64_t *)&rx_pkts[3]->rearm_data, rearm3);
 }
 
-#define PKTLEN_SHIFT     10
-#define IAVF_UINT16_BIT (CHAR_BIT * sizeof(uint16_t))
-
 static inline void
 desc_to_ptype_v(uint64x2_t descs[4], struct rte_mbuf **__rte_restrict rx_pkts,
 		uint32_t *__rte_restrict ptype_tbl)
@@ -379,6 +809,19 @@ iavf_recv_pkts_vec(void *__rte_restrict rx_queue,
 	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
+/*
+ * Notice:
+ * - if nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, no packets are returned
+ * - if nb_pkts > IAVF_VPMD_RX_BURST, only IAVF_VPMD_RX_BURST
+ *   descriptors' DD bits are scanned
+ */
+uint16_t
+iavf_recv_pkts_vec_flex_rxd(void *__rte_restrict rx_queue,
+		struct rte_mbuf **__rte_restrict rx_pkts, uint16_t nb_pkts)
+{
+	return _recv_raw_pkts_vec_flex_rxd(rx_queue, rx_pkts, nb_pkts, NULL);
+}
+
 /*
  * vPMD receive routine that reassembles single burst of 32 scattered
  * packets.
@@ -445,6 +888,76 @@ iavf_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 			rx_pkts + retval, nb_pkts);
 }
 
+/**
+ * vPMD receive routine that reassembles a single burst of 32 scattered
+ * packets for the flex RxD
+ *
+ * Notice:
+ * - if nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, no packets are returned
+ */
+static uint16_t
+iavf_recv_scattered_burst_vec_flex_rxd(void *rx_queue,
+				       struct rte_mbuf **rx_pkts,
+				       uint16_t nb_pkts)
+{
+	struct ci_rx_queue *rxq = rx_queue;
+	uint8_t split_flags[IAVF_VPMD_RX_BURST] = {0};
+	unsigned int i = 0;
+
+	/* get some new buffers */
+	uint16_t nb_bufs = _recv_raw_pkts_vec_flex_rxd(rxq, rx_pkts, nb_pkts,
+					      split_flags);
+	if (nb_bufs == 0)
+		return 0;
+
+	/* happy day case, full burst + no packets to be joined */
+	const uint64_t *split_fl64 = (uint64_t *)split_flags;
+
+	if (!rxq->pkt_first_seg &&
+	    split_fl64[0] == 0 && split_fl64[1] == 0 &&
+	    split_fl64[2] == 0 && split_fl64[3] == 0)
+		return nb_bufs;
+
+	/* reassemble any packets that need reassembly */
+	if (!rxq->pkt_first_seg) {
+		/* find the first split flag, and only reassemble from there */
+		while (i < nb_bufs && !split_flags[i])
+			i++;
+		if (i == nb_bufs)
+			return nb_bufs;
+		rxq->pkt_first_seg = rx_pkts[i];
+	}
+	return i + ci_rx_reassemble_packets(&rx_pkts[i], nb_bufs - i, &split_flags[i],
+			&rxq->pkt_first_seg, &rxq->pkt_last_seg, rxq->crc_len);
+}
+
+/**
+ * vPMD receive routine that reassembles scattered packets for flex RxD
+ */
+uint16_t
+iavf_recv_scattered_pkts_vec_flex_rxd(void *rx_queue,
+				      struct rte_mbuf **rx_pkts,
+				      uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > IAVF_VPMD_RX_BURST) {
+		uint16_t burst;
+
+		burst = iavf_recv_scattered_burst_vec_flex_rxd(rx_queue,
+						rx_pkts + retval,
+						IAVF_VPMD_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < IAVF_VPMD_RX_BURST)
+			return retval;
+	}
+
+	return retval + iavf_recv_scattered_burst_vec_flex_rxd(rx_queue,
+						      rx_pkts + retval,
+						      nb_pkts);
+}
+
 static __rte_always_inline void
 iavf_vtx1(volatile struct ci_tx_desc *txdp, struct rte_mbuf *pkt,
 	 uint64_t flags)
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH v2 0/3] net/iavf: add NEON support for Rx/Tx paths
  2026-04-17 13:08 [PATCH v1 0/3] net/iavf: add NEON support for Rx/Tx paths Jay Wang
                   ` (2 preceding siblings ...)
  2026-04-17 13:08 ` [PATCH v1 3/3] net/iavf: add NEON support for Rx flex desc Jay Wang
@ 2026-04-20 10:30 ` Jay Wang
  2026-04-20 10:30   ` [PATCH v2 1/3] net/iavf: add Rx scattered function for 32B desc Jay Wang
                     ` (3 more replies)
  3 siblings, 4 replies; 13+ messages in thread
From: Jay Wang @ 2026-04-20 10:30 UTC (permalink / raw)
  Cc: dev, nd, Jay Wang

- Add the scattered Rx burst function for 32B legacy descriptor
- Add the NEON-optimised Tx burst function
- Add the NEON-optimised Rx burst functions for flexible descriptor

---
v2:
- Fixed the AArch32 cross-building issue on AArch64 by restricting the
  call of iavf_rxq_vec_setup to AArch64 only
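
  In essence (see the first hunk of patch 3/3 below), the guard around
  the vector-conflict check in iavf_dev_rx_queue_setup() becomes:

    #if defined RTE_ARCH_X86 || defined RTE_ARCH_ARM64
        /* check vector conflict */
        if (ci_rxq_vec_capable(rxq->nb_rx_desc, rxq->rx_free_thresh) &&
                iavf_rxq_vec_setup(rxq)) {
            ...
        }
    #endif

  so 32-bit Arm builds no longer reference the AArch64-only setup code.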

Jay Wang (3):
  net/iavf: add Rx scattered function for 32B desc
  net/iavf: add NEON-optimised Tx burst function
  net/iavf: add NEON support for Rx flex desc

 drivers/net/intel/iavf/iavf.h               |   4 +
 drivers/net/intel/iavf/iavf_rxtx.c          |  54 +-
 drivers/net/intel/iavf/iavf_rxtx.h          |   2 -
 drivers/net/intel/iavf/iavf_rxtx_vec_neon.c | 751 +++++++++++++++++++-
 drivers/net/intel/iavf/meson.build          |   2 +-
 5 files changed, 796 insertions(+), 17 deletions(-)

-- 
2.43.0


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH v2 1/3] net/iavf: add Rx scattered function for 32B desc
  2026-04-20 10:30 ` [PATCH v2 0/3] net/iavf: add NEON support for Rx/Tx paths Jay Wang
@ 2026-04-20 10:30   ` Jay Wang
  2026-04-20 10:30   ` [PATCH v2 2/3] net/iavf: add NEON-optimised Tx burst function Jay Wang
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 13+ messages in thread
From: Jay Wang @ 2026-04-20 10:30 UTC (permalink / raw)
  To: Vladimir Medvedkin; +Cc: dev, nd, Jay Wang

Add the scattered burst function on AArch64 so that we can leverage
the NEON-optimised Rx raw burst function to handle scattered packets for
the legacy 32B descriptor.

Signed-off-by: Jay Wang <jay.wang2@arm.com>
---
 drivers/net/intel/iavf/iavf.h               |   1 +
 drivers/net/intel/iavf/iavf_rxtx.c          |  16 ++-
 drivers/net/intel/iavf/iavf_rxtx_vec_neon.c | 110 +++++++++++++++++++-
 drivers/net/intel/iavf/meson.build          |   2 +-
 4 files changed, 122 insertions(+), 7 deletions(-)

diff --git a/drivers/net/intel/iavf/iavf.h b/drivers/net/intel/iavf/iavf.h
index 403c61e2e8..e4936f3566 100644
--- a/drivers/net/intel/iavf/iavf.h
+++ b/drivers/net/intel/iavf/iavf.h
@@ -334,6 +334,7 @@ enum iavf_rx_func_type {
 	IAVF_RX_BULK_ALLOC,
 	IAVF_RX_BULK_ALLOC_FLEX_RXD,
 	IAVF_RX_NEON,
+	IAVF_RX_NEON_SCATTERED,
 	IAVF_RX_AVX2,
 	IAVF_RX_AVX2_SCATTERED,
 	IAVF_RX_AVX2_OFFLOAD,
diff --git a/drivers/net/intel/iavf/iavf_rxtx.c b/drivers/net/intel/iavf/iavf_rxtx.c
index 4ff6c18dc4..15566a0e18 100644
--- a/drivers/net/intel/iavf/iavf_rxtx.c
+++ b/drivers/net/intel/iavf/iavf_rxtx.c
@@ -3551,16 +3551,26 @@ static const struct ci_rx_path_info iavf_rx_path_infos[] = {
 		}
 	},
 #endif
-#elif defined RTE_ARCH_ARM
+#elif defined(RTE_ARCH_ARM64)
 	[IAVF_RX_NEON] = {
 		.pkt_burst = iavf_recv_pkts_vec,
 		.info = "Vector Neon",
 		.features = {
-			.rx_offloads = IAVF_RX_SCALAR_OFFLOADS,
+			.rx_offloads = IAVF_RX_VECTOR_OFFLOADS,
 			.simd_width = RTE_VECT_SIMD_128,
 			.bulk_alloc = true
 		}
 	},
+	[IAVF_RX_NEON_SCATTERED] = {
+		.pkt_burst = iavf_recv_scattered_pkts_vec,
+		.info = "Vector Scattered Neon",
+		.features = {
+			.rx_offloads = IAVF_RX_VECTOR_OFFLOADS | RTE_ETH_RX_OFFLOAD_SCATTER,
+			.simd_width = RTE_VECT_SIMD_128,
+			.scattered = true,
+			.bulk_alloc = true
+		}
+	},
 #endif
 };
 
@@ -3839,7 +3849,7 @@ iavf_set_rx_function(struct rte_eth_dev *dev)
 	if (adapter->rx_bulk_alloc_allowed) {
 		req_features.bulk_alloc = true;
 		default_path = IAVF_RX_BULK_ALLOC;
-#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM)
+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
 		if (iavf_rx_vec_dev_check(dev) != -1)
 			req_features.simd_width = iavf_get_max_simd_bitwidth();
 #endif
diff --git a/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c b/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
index 28c90b2a72..45e377d728 100644
--- a/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
+++ b/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2022 Intel Corporation
- * Copyright(c) 2022 Arm Limited
+ * Copyright(c) 2022-2026 Arm Limited
  */
 
 #include <stdint.h>
@@ -145,8 +145,6 @@ _recv_raw_pkts_vec(struct ci_rx_queue *__rte_restrict rxq,
 		   struct rte_mbuf **__rte_restrict rx_pkts,
 		   uint16_t nb_pkts, uint8_t *split_packet)
 {
-	RTE_SET_USED(split_packet);
-
 	volatile union ci_rx_desc *rxdp;
 	struct ci_rx_entry *sw_ring;
 	uint16_t nb_pkts_recd;
@@ -164,6 +162,13 @@ _recv_raw_pkts_vec(struct ci_rx_queue *__rte_restrict rxq,
 		4, 5, 6, 7    /* octet 4~7, 32bits rss */
 		};
 
+	uint8x16_t eop_check = {
+		0x02, 0x00, 0x02, 0x00,
+		0x02, 0x00, 0x02, 0x00,
+		0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00
+	};
+
 	uint16x8_t crc_adjust = {
 		0, 0,         /* ignore pkt_type field */
 		rxq->crc_len, /* sub crc on pkt_len */
@@ -238,6 +243,13 @@ _recv_raw_pkts_vec(struct ci_rx_queue *__rte_restrict rxq,
 		vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);
 		vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);
 
+		if (split_packet) {
+			rte_mbuf_prefetch_part2(rx_pkts[pos]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
+		}
+
 		/* pkts shift the pktlen field to be 16-bit aligned*/
 		uint32x4_t len3 = vshlq_u32(vreinterpretq_u32_u64(descs[3]),
 					    len_shl);
@@ -306,6 +318,32 @@ _recv_raw_pkts_vec(struct ci_rx_queue *__rte_restrict rxq,
 		staterr = vzipq_u16(sterr_tmp1.val[1],
 				    sterr_tmp2.val[1]).val[0];
 
+		/* C* extract and record EOP bit */
+		if (split_packet) {
+			uint8x16_t eop_shuf_mask = {
+				0x00, 0x02, 0x04, 0x06,
+				0xFF, 0xFF, 0xFF, 0xFF,
+				0xFF, 0xFF, 0xFF, 0xFF,
+				0xFF, 0xFF, 0xFF, 0xFF
+			};
+			uint8x16_t eop_bits;
+
+			/* and with mask to extract bits, flipping 1-0 */
+			eop_bits = vmvnq_u8(vreinterpretq_u8_u16(staterr));
+			eop_bits = vandq_u8(eop_bits, eop_check);
+			/* the staterr values are not in order; ordering does
+			 * not matter when counting DD bits, but it does for
+			 * end-of-packet tracking, so shuffle. This also
+			 * compresses the 32-bit values to 8-bit
+			 */
+			eop_bits = vqtbl1q_u8(eop_bits, eop_shuf_mask);
+
+			/* store the resulting 32-bit value */
+			vst1q_lane_u32((uint32_t *)split_packet,
+				vreinterpretq_u32_u8(eop_bits), 0);
+			split_packet += IAVF_VPMD_DESCS_PER_LOOP;
+		}
+
 		staterr = vshlq_n_u16(staterr, IAVF_UINT16_BIT - 1);
 		staterr = vreinterpretq_u16_s16(
 				vshrq_n_s16(vreinterpretq_s16_u16(staterr),
@@ -341,6 +379,72 @@ iavf_recv_pkts_vec(void *__rte_restrict rx_queue,
 	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
+/*
+ * vPMD receive routine that reassembles a single burst of 32 scattered
+ * packets.
+ *
+ * Notice:
+ * - if nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, no packets are returned
+ */
+static __rte_always_inline uint16_t
+iavf_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+		uint16_t nb_pkts)
+{
+	struct ci_rx_queue *rxq = rx_queue;
+	uint8_t split_flags[IAVF_VPMD_RX_BURST] = {0};
+
+	/* get some new buffers */
+	uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
+						split_flags);
+
+	if (nb_bufs == 0)
+		return 0;
+
+	/* happy day case, full burst + no packets to be assembled */
+	const uint64_t *split_fl64 = (uint64_t *)split_flags;
+	if (!rxq->pkt_first_seg &&
+			split_fl64[0] == 0 && split_fl64[1] == 0 &&
+			split_fl64[2] == 0 && split_fl64[3] == 0)
+		return nb_bufs;
+
+	/* reassemble any packets that need reassembly */
+	unsigned int i = 0;
+	if (!rxq->pkt_first_seg) {
+		/* find the first split flag, and only reassemble from there */
+		while (i < nb_bufs && !split_flags[i])
+			i++;
+		if (i == nb_bufs)
+			return nb_bufs;
+		rxq->pkt_first_seg = rx_pkts[i];
+	}
+	return i + ci_rx_reassemble_packets(&rx_pkts[i], nb_bufs - i,
+			&split_flags[i], &rxq->pkt_first_seg, &rxq->pkt_last_seg,
+			rxq->crc_len);
+}
+
+/*
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+iavf_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+		uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > IAVF_VPMD_RX_BURST) {
+		uint16_t burst;
+		burst = iavf_recv_scattered_burst_vec(rx_queue,
+				rx_pkts + retval, IAVF_VPMD_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < IAVF_VPMD_RX_BURST)
+			return retval;
+	}
+	/* handle the final burst, where nb_pkts <= IAVF_VPMD_RX_BURST */
+	return retval + iavf_recv_scattered_burst_vec(rx_queue,
+			rx_pkts + retval, nb_pkts);
+}
+
 void __rte_cold
 iavf_rx_queue_release_mbufs_neon(struct ci_rx_queue *rxq)
 {
diff --git a/drivers/net/intel/iavf/meson.build b/drivers/net/intel/iavf/meson.build
index f9576586f6..50630a88c8 100644
--- a/drivers/net/intel/iavf/meson.build
+++ b/drivers/net/intel/iavf/meson.build
@@ -29,7 +29,7 @@ sources = files(
 if arch_subdir == 'x86'
     sources_avx2 += files('iavf_rxtx_vec_avx2.c')
     sources_avx512 += files('iavf_rxtx_vec_avx512.c')
-elif arch_subdir == 'arm'
+elif arch_subdir == 'arm' and dpdk_conf.get('RTE_ARCH_64')
     sources += files('iavf_rxtx_vec_neon.c')
 endif
 
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH v2 2/3] net/iavf: add NEON-optimised Tx burst function
  2026-04-20 10:30 ` [PATCH v2 0/3] net/iavf: add NEON support for Rx/Tx paths Jay Wang
  2026-04-20 10:30   ` [PATCH v2 1/3] net/iavf: add Rx scattered function for 32B desc Jay Wang
@ 2026-04-20 10:30   ` Jay Wang
  2026-04-20 10:30   ` [PATCH v2 3/3] net/iavf: add NEON support for Rx flex desc Jay Wang
  2026-05-05 11:07   ` [PATCH v3 0/3] net/iavf: add NEON support for Rx/Tx paths Jay Wang
  3 siblings, 0 replies; 13+ messages in thread
From: Jay Wang @ 2026-04-20 10:30 UTC (permalink / raw)
  To: Vladimir Medvedkin; +Cc: dev, nd, Jay Wang

This patch adds the NEON-optimised Tx burst function for the Intel IAVF
driver on AArch64.

Signed-off-by: Jay Wang <jay.wang2@arm.com>
---
 drivers/net/intel/iavf/iavf.h               |   1 +
 drivers/net/intel/iavf/iavf_rxtx.c          |  15 ++-
 drivers/net/intel/iavf/iavf_rxtx.h          |   2 -
 drivers/net/intel/iavf/iavf_rxtx_vec_neon.c | 120 ++++++++++++++++++++
 4 files changed, 133 insertions(+), 5 deletions(-)

diff --git a/drivers/net/intel/iavf/iavf.h b/drivers/net/intel/iavf/iavf.h
index e4936f3566..3e71d345a9 100644
--- a/drivers/net/intel/iavf/iavf.h
+++ b/drivers/net/intel/iavf/iavf.h
@@ -356,6 +356,7 @@ enum iavf_rx_func_type {
 enum iavf_tx_func_type {
 	IAVF_TX_DISABLED,
 	IAVF_TX_DEFAULT,
+	IAVF_TX_NEON,
 	IAVF_TX_AVX2,
 	IAVF_TX_AVX2_OFFLOAD,
 	IAVF_TX_AVX512,
diff --git a/drivers/net/intel/iavf/iavf_rxtx.c b/drivers/net/intel/iavf/iavf_rxtx.c
index 15566a0e18..645bc5ccf6 100644
--- a/drivers/net/intel/iavf/iavf_rxtx.c
+++ b/drivers/net/intel/iavf/iavf_rxtx.c
@@ -3662,6 +3662,15 @@ static const struct ci_tx_path_info iavf_tx_path_infos[] = {
 		}
 	},
 #endif
+#elif defined(RTE_ARCH_ARM64)
+	[IAVF_TX_NEON] = {
+		.pkt_burst = iavf_xmit_pkts_vec,
+		.info = "Vector Neon",
+		.features = {
+			.tx_offloads = IAVF_TX_VECTOR_OFFLOADS,
+			.simd_width = RTE_VECT_SIMD_128
+		}
+	},
 #endif
 };
 
@@ -3878,7 +3887,7 @@ iavf_set_tx_function(struct rte_eth_dev *dev)
 		IAVF_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
 	int mbuf_check = adapter->devargs.mbuf_check;
 	int no_poll_on_link_down = adapter->devargs.no_poll_on_link_down;
-#ifdef RTE_ARCH_X86
+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
 	struct ci_tx_queue *txq;
 	int i;
 	const struct ci_tx_path_features *selected_features;
@@ -3892,7 +3901,7 @@ iavf_set_tx_function(struct rte_eth_dev *dev)
 	if (dev->data->dev_started)
 		goto out;
 
-#ifdef RTE_ARCH_X86
+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
 	if (iavf_tx_vec_dev_check(dev) != -1)
 		req_features.simd_width = iavf_get_max_simd_bitwidth();
 
@@ -3915,7 +3924,7 @@ iavf_set_tx_function(struct rte_eth_dev *dev)
 						IAVF_TX_DEFAULT);
 
 out:
-#ifdef RTE_ARCH_X86
+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
 	selected_features = &iavf_tx_path_infos[adapter->tx_func_type].features;
 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
 		txq = dev->data->tx_queues[i];
diff --git a/drivers/net/intel/iavf/iavf_rxtx.h b/drivers/net/intel/iavf/iavf_rxtx.h
index 80b06518b0..8b8e55e66f 100644
--- a/drivers/net/intel/iavf/iavf_rxtx.h
+++ b/drivers/net/intel/iavf/iavf_rxtx.h
@@ -558,8 +558,6 @@ uint16_t iavf_recv_scattered_pkts_vec(void *rx_queue,
 uint16_t iavf_recv_scattered_pkts_vec_flex_rxd(void *rx_queue,
 					       struct rte_mbuf **rx_pkts,
 					       uint16_t nb_pkts);
-uint16_t iavf_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
-				  uint16_t nb_pkts);
 uint16_t iavf_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 				 uint16_t nb_pkts);
 uint16_t iavf_recv_pkts_vec_avx2_offload(void *rx_queue, struct rte_mbuf **rx_pkts,
diff --git a/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c b/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
index 45e377d728..9c91b6bac1 100644
--- a/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
+++ b/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
@@ -445,6 +445,120 @@ iavf_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 			rx_pkts + retval, nb_pkts);
 }
 
+static __rte_always_inline void
+iavf_vtx1(volatile struct ci_tx_desc *txdp, struct rte_mbuf *pkt,
+	 uint64_t flags)
+{
+	uint64_t high_qw = (CI_TX_DESC_DTYPE_DATA |
+		((uint64_t)flags << CI_TXD_QW1_CMD_S) |
+		((uint64_t)pkt->data_len << CI_TXD_QW1_TX_BUF_SZ_S));
+
+	uint64x2_t descriptor = {rte_pktmbuf_iova(pkt), high_qw};
+	vst1q_u64(RTE_CAST_PTR(uint64_t *, txdp), descriptor);
+}
+
+static __rte_always_inline void
+iavf_vtx(volatile struct ci_tx_desc *txdp, struct rte_mbuf **pkt,
+	uint16_t nb_pkts, uint64_t flags)
+{
+	int i;
+
+	for (i = 0; i < nb_pkts; ++i, ++txdp, ++pkt)
+		iavf_vtx1(txdp, *pkt, flags);
+}
+
+static __rte_always_inline uint16_t
+iavf_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
+		uint16_t nb_pkts)
+{
+	struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
+	volatile struct ci_tx_desc *txdp;
+	struct ci_tx_entry_vec *txep;
+	uint16_t n, nb_commit, tx_id;
+	uint64_t flags = CI_TX_DESC_CMD_DEFAULT;
+	uint64_t rs = CI_TX_DESC_CMD_RS | CI_TX_DESC_CMD_DEFAULT;
+	int i;
+
+	/* crossing the tx_rs_thresh boundary is not allowed */
+	nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+
+	if (txq->nb_tx_free < txq->tx_free_thresh)
+		ci_tx_free_bufs_vec(txq, iavf_tx_desc_done, false);
+
+	nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
+	nb_commit = nb_pkts;
+	if (unlikely(nb_pkts == 0))
+		return 0;
+
+	tx_id = txq->tx_tail;
+	txdp = &txq->ci_tx_ring[tx_id];
+	txep = &txq->sw_ring_vec[tx_id];
+
+	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
+
+	n = (uint16_t)(txq->nb_tx_desc - tx_id);
+	if (nb_commit >= n) {
+		ci_tx_backlog_entry_vec(txep, tx_pkts, n);
+
+		for (i = 0; i < n - 1; ++i, ++tx_pkts, ++txdp)
+			iavf_vtx1(txdp, *tx_pkts, flags);
+
+		/* write with RS for the last descriptor in the segment */
+		iavf_vtx1(txdp, *tx_pkts++, rs);
+
+		nb_commit = (uint16_t)(nb_commit - n);
+
+		tx_id = 0;
+		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
+
+		/* avoid reaching the end of the ring */
+		txdp = &txq->ci_tx_ring[tx_id];
+		txep = &txq->sw_ring_vec[tx_id];
+	}
+
+	ci_tx_backlog_entry_vec(txep, tx_pkts, nb_commit);
+
+	iavf_vtx(txdp, tx_pkts, nb_commit, flags);
+
+	tx_id = (uint16_t)(tx_id + nb_commit);
+	if (tx_id > txq->tx_next_rs) {
+		txq->ci_tx_ring[txq->tx_next_rs].cmd_type_offset_bsz |=
+			rte_cpu_to_le_64(((uint64_t)CI_TX_DESC_CMD_RS) <<
+					 CI_TXD_QW1_CMD_S);
+		txq->tx_next_rs =
+			(uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh);
+	}
+
+	txq->tx_tail = tx_id;
+
+	IAVF_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);
+
+	return nb_pkts;
+}
+
+uint16_t
+iavf_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
+		   uint16_t nb_pkts)
+{
+	uint16_t nb_tx = 0;
+	struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
+
+	while (nb_pkts) {
+		uint16_t ret, num;
+
+		/* crossing the rs_thresh boundary is not allowed */
+		num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+		ret = iavf_xmit_fixed_burst_vec(tx_queue, &tx_pkts[nb_tx],
+				num);
+		nb_tx += ret;
+		nb_pkts -= ret;
+		if (ret < num)
+			break;
+	}
+
+	return nb_tx;
+}
+
 void __rte_cold
 iavf_rx_queue_release_mbufs_neon(struct ci_rx_queue *rxq)
 {
@@ -465,6 +579,12 @@ iavf_rx_vec_dev_check(struct rte_eth_dev *dev)
 	return iavf_rx_vec_dev_check_default(dev);
 }
 
+int __rte_cold
+iavf_tx_vec_dev_check(struct rte_eth_dev *dev)
+{
+	return iavf_tx_vec_dev_check_default(dev);
+}
+
 enum rte_vect_max_simd
 iavf_get_max_simd_bitwidth(void)
 {
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 13+ messages in thread
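The Tx patch above packs each 16-byte data descriptor with a single 128-bit
NEON store. Below is a minimal standalone sketch of that packing, outside the
driver; the shift values and the zero dtype are illustrative assumptions
standing in for CI_TX_DESC_DTYPE_DATA, CI_TXD_QW1_CMD_S and
CI_TXD_QW1_TX_BUF_SZ_S, whose real values live in the common Intel descriptor
headers.

    #include <stdint.h>
    #include <arm_neon.h>

    /* hypothetical stand-ins for the CI_TX_DESC_* / CI_TXD_QW1_* macros */
    #define DTYPE_DATA    0x0ULL
    #define QW1_CMD_S     4
    #define QW1_BUF_SZ_S  34

    /* pack one data descriptor: qword0 carries the buffer DMA address,
     * qword1 carries dtype | command flags | buffer size; both qwords
     * are then written with one 128-bit store, as iavf_vtx1() does
     */
    static inline void pack_tx_desc(uint64_t *desc, uint64_t buf_iova,
                                    uint16_t data_len, uint64_t cmd_flags)
    {
        uint64_t high_qw = DTYPE_DATA |
                           (cmd_flags << QW1_CMD_S) |
                           ((uint64_t)data_len << QW1_BUF_SZ_S);
        uint64x2_t d = {buf_iova, high_qw};

        vst1q_u64(desc, d);
    }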

* [PATCH v2 3/3] net/iavf: add NEON support for Rx flex desc
  2026-04-20 10:30 ` [PATCH v2 0/3] net/iavf: add NEON support for Rx/Tx paths Jay Wang
  2026-04-20 10:30   ` [PATCH v2 1/3] net/iavf: add Rx scattered function for 32B desc Jay Wang
  2026-04-20 10:30   ` [PATCH v2 2/3] net/iavf: add NEON-optimised Tx burst function Jay Wang
@ 2026-04-20 10:30   ` Jay Wang
  2026-05-05 11:07   ` [PATCH v3 0/3] net/iavf: add NEON support for Rx/Tx paths Jay Wang
  3 siblings, 0 replies; 13+ messages in thread
From: Jay Wang @ 2026-04-20 10:30 UTC (permalink / raw)
  To: Vladimir Medvedkin; +Cc: dev, nd, Jay Wang

This patch adds the NEON-optimised Rx paths that process the receive
flex descriptor.

Signed-off-by: Jay Wang <jay.wang2@arm.com>
---
 drivers/net/intel/iavf/iavf.h               |   2 +
 drivers/net/intel/iavf/iavf_rxtx.c          |  23 +-
 drivers/net/intel/iavf/iavf_rxtx_vec_neon.c | 521 +++++++++++++++++++-
 3 files changed, 541 insertions(+), 5 deletions(-)

diff --git a/drivers/net/intel/iavf/iavf.h b/drivers/net/intel/iavf/iavf.h
index 3e71d345a9..360d728f3a 100644
--- a/drivers/net/intel/iavf/iavf.h
+++ b/drivers/net/intel/iavf/iavf.h
@@ -335,6 +335,8 @@ enum iavf_rx_func_type {
 	IAVF_RX_BULK_ALLOC_FLEX_RXD,
 	IAVF_RX_NEON,
 	IAVF_RX_NEON_SCATTERED,
+	IAVF_RX_NEON_FLEX_RXD,
+	IAVF_RX_NEON_SCATTERED_FLEX_RXD,
 	IAVF_RX_AVX2,
 	IAVF_RX_AVX2_SCATTERED,
 	IAVF_RX_AVX2_OFFLOAD,
diff --git a/drivers/net/intel/iavf/iavf_rxtx.c b/drivers/net/intel/iavf/iavf_rxtx.c
index 645bc5ccf6..8e711950ff 100644
--- a/drivers/net/intel/iavf/iavf_rxtx.c
+++ b/drivers/net/intel/iavf/iavf_rxtx.c
@@ -713,7 +713,7 @@ iavf_dev_rx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx,
 		ad->rx_bulk_alloc_allowed = false;
 	}
 
-#if defined RTE_ARCH_X86 || defined RTE_ARCH_ARM
+#if defined RTE_ARCH_X86 || defined RTE_ARCH_ARM64
 	/* check vector conflict */
 	if (ci_rxq_vec_capable(rxq->nb_rx_desc, rxq->rx_free_thresh) &&
 			iavf_rxq_vec_setup(rxq)) {
@@ -3571,6 +3571,27 @@ static const struct ci_rx_path_info iavf_rx_path_infos[] = {
 			.bulk_alloc = true
 		}
 	},
+	[IAVF_RX_NEON_FLEX_RXD] = {
+		.pkt_burst = iavf_recv_pkts_vec_flex_rxd,
+		.info = "Vector Neon Flex",
+		.features = {
+			.rx_offloads = IAVF_RX_VECTOR_FLEX_OFFLOADS,
+			.simd_width = RTE_VECT_SIMD_128,
+			.flex_desc = true,
+			.bulk_alloc = true
+		}
+	},
+	[IAVF_RX_NEON_SCATTERED_FLEX_RXD] = {
+		.pkt_burst = iavf_recv_scattered_pkts_vec_flex_rxd,
+		.info = "Vector Scattered Neon Flex",
+		.features = {
+			.rx_offloads = IAVF_RX_VECTOR_FLEX_OFFLOADS | RTE_ETH_RX_OFFLOAD_SCATTER,
+			.simd_width = RTE_VECT_SIMD_128,
+			.scattered = true,
+			.flex_desc = true,
+			.bulk_alloc = true
+		}
+	},
 #endif
 };
 
diff --git a/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c b/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
index 9c91b6bac1..9d7281e172 100644
--- a/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
+++ b/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
@@ -16,12 +16,445 @@
 
 #include "../common/rx_vec_arm.h"
 
-static inline void
+#define PKTLEN_SHIFT     10
+#define IAVF_UINT16_BIT (CHAR_BIT * sizeof(uint16_t))
+
+static __rte_always_inline void
 iavf_rxq_rearm(struct ci_rx_queue *rxq)
 {
 	ci_rxq_rearm(rxq);
 }
 
+static __rte_always_inline uint32x4_t
+iavf_flex_rxd_to_fdir_flags_vec(const uint32x4_t fdir_id0_3)
+{
+#define FDID_MIS_MAGIC	0xFFFFFFFFu
+	RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR != (1u << 2));
+	RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR_ID != (1u << 13));
+
+	const uint32x4_t pkt_fdir_bit = vdupq_n_u32((uint32_t)(RTE_MBUF_F_RX_FDIR |
+			RTE_MBUF_F_RX_FDIR_ID));
+	const uint32x4_t fdir_mis_mask = vdupq_n_u32(FDID_MIS_MAGIC);
+
+	/* desc->flow_id field == 0xFFFFFFFF means fdir mismatch */
+	uint32x4_t fdir_mask = vceqq_u32(fdir_id0_3, fdir_mis_mask);
+
+	/* xor with 0xFFFFFFFF bit-reverses the mask */
+	fdir_mask = veorq_u32(fdir_mask, fdir_mis_mask);
+	const uint32x4_t fdir_flags = vandq_u32(fdir_mask, pkt_fdir_bit);
+
+	return fdir_flags;
+}
+
+static __rte_always_inline void
+iavf_flex_rxd_to_olflags_v(struct ci_rx_queue *rxq, uint64x2_t descs[4],
+		struct rte_mbuf **rx_pkts)
+{
+	const uint64x2_t mbuf_init = {rxq->mbuf_initializer, 0};
+	uint64x2_t rearm0, rearm1, rearm2, rearm3;
+
+	uint32x4_t tmp_desc, flags, rss_vlan;
+
+	/* mask everything except checksum, RSS, and VLAN flags
+	 * bit fields defined in enum iavf_rx_flex_desc_status_error_0_bits
+	 * bit 7:4 for checksum
+	 * bit 12 for RSS indication
+	 * bit 13 for VLAN indication
+	 */
+	const uint32x4_t desc_mask = {0x30f0, 0x30f0, 0x30f0, 0x30f0};
+	const uint32x4_t cksum_mask = {
+		RTE_MBUF_F_RX_IP_CKSUM_MASK | RTE_MBUF_F_RX_L4_CKSUM_MASK |
+		RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
+		RTE_MBUF_F_RX_IP_CKSUM_MASK | RTE_MBUF_F_RX_L4_CKSUM_MASK |
+		RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
+		RTE_MBUF_F_RX_IP_CKSUM_MASK | RTE_MBUF_F_RX_L4_CKSUM_MASK |
+		RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
+		RTE_MBUF_F_RX_IP_CKSUM_MASK | RTE_MBUF_F_RX_L4_CKSUM_MASK |
+		RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
+	};
+
+	/* map the checksum, rss and vlan fields to the checksum, rss
+	 * and vlan flags.
+	 */
+	const uint8x16_t cksum_flags = {
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
+		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
+		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
+		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
+		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1
+	};
+
+	const uint8x16_t rss_vlan_flags = {
+		0, RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
+		RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
+		0, 0, 0, 0,
+		0, 0, 0, 0,
+		0, 0, 0, 0,
+	};
+
+	/* extract status_error0 field from 4 descriptors,
+	 * and mask out everything else not in desc_mask
+	 */
+	flags = vzip2q_u32(vreinterpretq_u32_u64(descs[0]),
+				vreinterpretq_u32_u64(descs[1]));
+	tmp_desc = vzip2q_u32(vreinterpretq_u32_u64(descs[2]),
+				vreinterpretq_u32_u64(descs[3]));
+	tmp_desc = vreinterpretq_u32_u64(vzip1q_u64(vreinterpretq_u64_u32(flags),
+				vreinterpretq_u64_u32(tmp_desc)));
+
+	tmp_desc = vandq_u32(tmp_desc, desc_mask);
+
+	/* shift each 32-bit lane right by 4 so that we can use
+	 * the checksum bit as an index into cksum_flags
+	 */
+	tmp_desc = vshrq_n_u32(tmp_desc, 4);
+	flags = vreinterpretq_u32_u8(vqtbl1q_u8(cksum_flags,
+				vreinterpretq_u8_u32(tmp_desc)));
+	/* shift left by 1 bit since we shift right by 1 bit
+	 * in cksum_flags
+	 */
+	flags = vshlq_n_u32(flags, 1);
+
+	/* first check the outer L4 checksum */
+	uint32x4_t l4_outer_mask = {0x6, 0x6, 0x6, 0x6};
+	uint32x4_t l4_outer_flags = vandq_u32(flags, l4_outer_mask);
+	l4_outer_flags = vshlq_n_u32(l4_outer_flags, 20);
+
+	/* then check the rest of cksum bits */
+	uint32x4_t l3_l4_mask = {~0x6, ~0x6, ~0x6, ~0x6};
+	uint32x4_t l3_l4_flags = vandq_u32(flags, l3_l4_mask);
+	flags = vorrq_u32(l3_l4_flags, l4_outer_flags);
+
+	/* only keep the cksum flags in flags */
+	flags = vandq_u32(flags, cksum_mask);
+
+	/* map RSS, VLAN flags in HW desc to RTE_MBUF */
+	tmp_desc = vshrq_n_u32(tmp_desc, 8);
+	rss_vlan = vreinterpretq_u32_u8(vqtbl1q_u8(rss_vlan_flags,
+				vreinterpretq_u8_u32(tmp_desc)));
+
+	/* merge the flags */
+	flags = vorrq_u32(flags, rss_vlan);
+
+	/* check the additional fdir_flags if fdir is enabled */
+	if (rxq->fdir_enabled) {
+		const uint32x4_t fdir_id0_1 =
+			vzip2q_u32(vreinterpretq_u32_u64(descs[0]),
+				vreinterpretq_u32_u64(descs[1]));
+		const uint32x4_t fdir_id2_3 =
+			vzip2q_u32(vreinterpretq_u32_u64(descs[2]),
+				vreinterpretq_u32_u64(descs[3]));
+		const uint32x4_t fdir_id0_3 =
+			vreinterpretq_u32_u64(vzip2q_u64(vreinterpretq_u64_u32(fdir_id0_1),
+				vreinterpretq_u64_u32(fdir_id2_3)));
+		const uint32x4_t fdir_flags =
+			iavf_flex_rxd_to_fdir_flags_vec(fdir_id0_3);
+
+		/* merge with fdir_flags */
+		flags = vorrq_u32(flags, fdir_flags);
+
+		/* write fdir_id to mbuf */
+		rx_pkts[0]->hash.fdir.hi = vgetq_lane_u32(fdir_id0_3, 0);
+		rx_pkts[1]->hash.fdir.hi = vgetq_lane_u32(fdir_id0_3, 1);
+		rx_pkts[2]->hash.fdir.hi = vgetq_lane_u32(fdir_id0_3, 2);
+		rx_pkts[3]->hash.fdir.hi = vgetq_lane_u32(fdir_id0_3, 3);
+	}
+
+	/* At this point, we have the 4 sets of flags in the low-16-bits
+	 * of each 32-bit value in flags.
+	 * We want to extract these, and merge them with the mbuf init data
+	 * so we can do a single 16-byte write to the mbuf to set the flags
+	 * and all the other initialization fields. Extracting the appropriate
+	 * flags means that we have to do a shift and blend for each mbuf
+	 * before we do the write.
+	 */
+	rearm0 = vsetq_lane_u64(vgetq_lane_u32(flags, 0), mbuf_init, 1);
+	rearm1 = vsetq_lane_u64(vgetq_lane_u32(flags, 1), mbuf_init, 1);
+	rearm2 = vsetq_lane_u64(vgetq_lane_u32(flags, 2), mbuf_init, 1);
+	rearm3 = vsetq_lane_u64(vgetq_lane_u32(flags, 3), mbuf_init, 1);
+
+	/* compile time check */
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
+					 offsetof(struct rte_mbuf, rearm_data) + 8);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
+					 RTE_ALIGN(offsetof(struct rte_mbuf, rearm_data), 16));
+
+	/* write the rearm data and the olflags in one write */
+	vst1q_u64((uint64_t *)&rx_pkts[0]->rearm_data, rearm0);
+	vst1q_u64((uint64_t *)&rx_pkts[1]->rearm_data, rearm1);
+	vst1q_u64((uint64_t *)&rx_pkts[2]->rearm_data, rearm2);
+	vst1q_u64((uint64_t *)&rx_pkts[3]->rearm_data, rearm3);
+}
+
+static __rte_always_inline void
+iavf_flex_rxd_to_ptype_v(uint64x2_t descs[4], struct rte_mbuf **rx_pkts,
+	uint32_t *ptype_tbl)
+{
+	const uint16x8_t ptype_mask = {
+		0, IAVF_RX_FLEX_DESC_PTYPE_M,
+		0, IAVF_RX_FLEX_DESC_PTYPE_M,
+		0, IAVF_RX_FLEX_DESC_PTYPE_M,
+		0, IAVF_RX_FLEX_DESC_PTYPE_M
+	};
+
+	uint32x4_t ptype_01 = vzip1q_u32(vreinterpretq_u32_u64(descs[0]),
+				vreinterpretq_u32_u64(descs[1]));
+	uint32x4_t ptype_23 = vzip1q_u32(vreinterpretq_u32_u64(descs[2]),
+				vreinterpretq_u32_u64(descs[3]));
+	uint32x4_t ptype_all_u32 =
+				vreinterpretq_u32_u64(vzip1q_u64(vreinterpretq_u64_u32(ptype_01),
+					vreinterpretq_u64_u32(ptype_23)));
+	uint16x8_t ptype_all = vreinterpretq_u16_u32(ptype_all_u32);
+
+	ptype_all = vandq_u16(ptype_all, ptype_mask);
+
+	rx_pkts[0]->packet_type = ptype_tbl[vgetq_lane_u16(ptype_all, 1)];
+	rx_pkts[1]->packet_type = ptype_tbl[vgetq_lane_u16(ptype_all, 3)];
+	rx_pkts[2]->packet_type = ptype_tbl[vgetq_lane_u16(ptype_all, 5)];
+	rx_pkts[3]->packet_type = ptype_tbl[vgetq_lane_u16(ptype_all, 7)];
+}
+
+/**
+ * vPMD raw receive routine for flex RxD,
+ * only accepts (nb_pkts >= IAVF_VPMD_DESCS_PER_LOOP)
+ *
+ * Notice:
+ * - if nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, no packets are returned
+ * - nb_pkts is floor-aligned to a multiple of IAVF_VPMD_DESCS_PER_LOOP
+ */
+static inline uint16_t
+_recv_raw_pkts_vec_flex_rxd(struct ci_rx_queue *__rte_restrict rxq,
+		   struct rte_mbuf **__rte_restrict rx_pkts,
+		   uint16_t nb_pkts, uint8_t *split_packet)
+{
+	struct iavf_adapter *adapter = rxq->iavf_vsi->adapter;
+	uint32_t *ptype_tbl = adapter->ptype_tbl;
+	volatile union ci_rx_flex_desc *rxdp;
+	struct ci_rx_entry *sw_ring;
+	uint16_t nb_pkts_recd;
+	int pos;
+
+	uint16x8_t crc_adjust = {
+		0, 0,			/* ignore pkt_type field */
+		rxq->crc_len,	/* sub crc on pkt_len */
+		0,				/* ignore high 16 bits of pkt_len */
+		rxq->crc_len,	/* sub crc on data_len */
+		0, 0, 0			/* ignore non-length fields */
+	};
+
+	/* mask to shuffle from flex descriptor to mbuf */
+	const uint8x16_t shuf_msk = {
+		0xFF, 0xFF,		/* pkt_type set as unknown */
+		0xFF, 0xFF,		/* pkt_type set as unknown */
+		4, 5,			/* octet 4~5, low bits pkt_len */
+		0xFF, 0xFF,		/* skip high 16 bits pkt_len, zero out */
+		4, 5,			/* octet 4~5, 16 bits data_len */
+		10, 11,			/* octet 10~11, 16 bits vlan_macip */
+		0xFF, 0xFF,		/* rss hash parsed separately */
+		0xFF, 0xFF,
+	};
+
+	/* compile-time check the above crc_adjust layout is correct */
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
+
+	/* 4 packets DD mask */
+	const uint16x8_t dd_check = {
+		0x0001, 0x0001, 0x0001, 0x0001,
+		0, 0, 0, 0
+	};
+
+	/* 4 packets EOP mask */
+	const uint8x16_t eop_check = {
+		0x2, 0, 0x2, 0, 0x2, 0, 0x2, 0,
+		0, 0, 0, 0, 0, 0, 0, 0
+	};
+
+	/* nb_pkts has to be floor-aligned to IAVF_VPMD_DESCS_PER_LOOP */
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_VPMD_DESCS_PER_LOOP);
+
+	rxdp = rxq->rx_flex_ring + rxq->rx_tail;
+	rte_prefetch0(rxdp);
+
+	/* see if we need to rearm the Rx queue */
+	if (rxq->rxrearm_nb > rxq->rx_free_thresh)
+		iavf_rxq_rearm(rxq);
+
+	/* check if there is actually a packet available */
+	if (!(rxdp->wb.status_error0 &
+			rte_cpu_to_le_32(1 << IAVF_RX_FLEX_DESC_STATUS0_DD_S)))
+		return 0;
+
+	/* move the next 'n' mbufs into the cache */
+	sw_ring = &rxq->sw_ring[rxq->rx_tail];
+
+	/* A. load 4 packets in a loop
+	 * [A*. mask out the unused dirty fields in flex desc]
+	 * B. copy 4 mbuf pointers from sw_ring to rx_pkts
+	 * C. count the number of DD bits in the 4 packets
+	 * [C*. extract the end-of-packet bit, if requested]
+	 * D. fill info from flex desc to mbuf
+	 */
+	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
+		 pos += IAVF_VPMD_DESCS_PER_LOOP,
+		 rxdp += IAVF_VPMD_DESCS_PER_LOOP) {
+		uint64x2_t descs[IAVF_VPMD_DESCS_PER_LOOP];
+		uint8x16_t pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3;
+		uint16x8_t sterr_tmp1, sterr_tmp2;
+		uint64x2_t mbp1, mbp2;
+		uint16x8_t staterr;
+		uint16x8_t tmp;
+		uint64_t stat;
+
+		/* A.1 load descs[3-0] */
+		descs[3] = vld1q_u64(RTE_CAST_PTR(uint64_t *, rxdp + 3));
+		descs[2] = vld1q_u64(RTE_CAST_PTR(uint64_t *, rxdp + 2));
+		descs[1] = vld1q_u64(RTE_CAST_PTR(uint64_t *, rxdp + 1));
+		descs[0] = vld1q_u64(RTE_CAST_PTR(uint64_t *, rxdp + 0));
+
+		/* use acquire fence to order loads of descriptor qwords */
+		rte_atomic_thread_fence(rte_memory_order_acquire);
+		/* A.2 reload qword0 to make it ordered after qword1 load */
+		descs[3] = vld1q_lane_u64(RTE_CAST_PTR(uint64_t *, rxdp + 3),
+					descs[3], 0);
+		descs[2] = vld1q_lane_u64(RTE_CAST_PTR(uint64_t *, rxdp + 2),
+					descs[2], 0);
+		descs[1] = vld1q_lane_u64(RTE_CAST_PTR(uint64_t *, rxdp + 1),
+					descs[1], 0);
+		descs[0] = vld1q_lane_u64(RTE_CAST_PTR(uint64_t *, rxdp),
+					descs[0], 0);
+
+		/* B.1 load 4 mbuf pointers */
+		mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]);
+		mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);
+
+		/* B.2 copy 4 mbuf pointers into rx_pkts */
+		vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);
+		vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);
+
+		/* prefetch the mbufs when packets may be chained */
+		if (split_packet) {
+			rte_mbuf_prefetch_part2(rx_pkts[pos]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
+		}
+
+		iavf_flex_rxd_to_olflags_v(rxq, descs, &rx_pkts[pos]);
+
+		/* D.1 pkts convert format from desc to pktmbuf */
+		pkt_mb3 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[3]), shuf_msk);
+		pkt_mb2 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[2]), shuf_msk);
+		pkt_mb1 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[1]), shuf_msk);
+		pkt_mb0 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[0]), shuf_msk);
+
+		/* D.2 set in_port/nb_seg and remove crc */
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb3), crc_adjust);
+		pkt_mb3 = vreinterpretq_u8_u16(tmp);
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb2), crc_adjust);
+		pkt_mb2 = vreinterpretq_u8_u16(tmp);
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb1), crc_adjust);
+		pkt_mb1 = vreinterpretq_u8_u16(tmp);
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb0), crc_adjust);
+		pkt_mb0 = vreinterpretq_u8_u16(tmp);
+
+		/* D.3 copy final data to rx_pkts */
+		vst1q_u8((void *)&rx_pkts[pos + 3]->rx_descriptor_fields1, pkt_mb3);
+		vst1q_u8((void *)&rx_pkts[pos + 2]->rx_descriptor_fields1, pkt_mb2);
+		vst1q_u8((void *)&rx_pkts[pos + 1]->rx_descriptor_fields1, pkt_mb1);
+		vst1q_u8((void *)&rx_pkts[pos + 0]->rx_descriptor_fields1, pkt_mb0);
+
+		iavf_flex_rxd_to_ptype_v(descs, &rx_pkts[pos], ptype_tbl);
+
+		/* C.1 filter staterr info only */
+		sterr_tmp2 = vzip2q_u16(vreinterpretq_u16_u64(descs[2]),
+						vreinterpretq_u16_u64(descs[3]));
+		sterr_tmp1 = vzip2q_u16(vreinterpretq_u16_u64(descs[0]),
+						vreinterpretq_u16_u64(descs[1]));
+
+		/* C.2 get 4 pkts status_error0 value */
+		staterr = vzip1q_u16(sterr_tmp1, sterr_tmp2);
+
+		/* C* extract and record EOP bits */
+		if (split_packet) {
+			uint8x16_t eop_bits;
+
+			/* and with mask to extract bits, flipping 1-0 */
+			eop_bits = vmvnq_u8(vreinterpretq_u8_u16(staterr));
+			eop_bits = vandq_u8(eop_bits, eop_check);
+
+			/* store the 32-bit value */
+			vst1q_lane_u32((uint32_t *)split_packet,
+							vreinterpretq_u32_u8(eop_bits), 0);
+			split_packet += IAVF_VPMD_DESCS_PER_LOOP;
+		}
+
+		/* C.3 count available number of descriptors */
+		/* mask everything except DD bit */
+		staterr = vandq_u16(staterr, dd_check);
+		/* move the status bit (bit0) into the sign bit (bit15)
+		 * of each 16-bit lane
+		 */
+		staterr = vshlq_n_u16(staterr, IAVF_UINT16_BIT - 1);
+
+		/* reinterpret staterr as a signed 16-bit and
+		 * arithmetic-shift-right by 15
+		 * each lane becomes 0xFFFF if original DD bit was 1, otherwise 0.
+		 * then interpret back to unsigned u16 vector
+		 */
+		staterr = vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(staterr),
+					IAVF_UINT16_BIT - 1));
+
+		/* reinterpret the u16x8 vector as u64x2 and fetch the low u64,
+		 * which contains the first four 16-bit lanes, then invert all bits
+		 */
+		stat = ~vgetq_lane_u64(vreinterpretq_u64_u16(staterr), 0);
+
+		if (unlikely(stat == 0)) {
+			nb_pkts_recd += IAVF_VPMD_DESCS_PER_LOOP;
+		} else {
+			nb_pkts_recd += rte_ctz64(stat) / IAVF_UINT16_BIT;
+			break;
+		}
+	}
+
+	/* Update our internal tail pointer */
+	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
+	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
+	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);
+
+	return nb_pkts_recd;
+}
+
 static inline void
 desc_to_olflags_v(struct ci_rx_queue *rxq, volatile union ci_rx_desc *rxdp,
 		  uint64x2_t descs[4], struct rte_mbuf **rx_pkts)
@@ -115,9 +548,6 @@ desc_to_olflags_v(struct ci_rx_queue *rxq, volatile union ci_rx_desc *rxdp,
 	vst1q_u64((uint64_t *)&rx_pkts[3]->rearm_data, rearm3);
 }
 
-#define PKTLEN_SHIFT     10
-#define IAVF_UINT16_BIT (CHAR_BIT * sizeof(uint16_t))
-
 static inline void
 desc_to_ptype_v(uint64x2_t descs[4], struct rte_mbuf **__rte_restrict rx_pkts,
 		uint32_t *__rte_restrict ptype_tbl)
@@ -379,6 +809,19 @@ iavf_recv_pkts_vec(void *__rte_restrict rx_queue,
 	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
+/*
+ * Notice:
+ * - if nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, no packets are returned
+ * - if nb_pkts > IAVF_VPMD_RX_BURST, only the DD bits of the first
+ *   IAVF_VPMD_RX_BURST descriptors are scanned
+ */
+uint16_t
+iavf_recv_pkts_vec_flex_rxd(void *__rte_restrict rx_queue,
+		struct rte_mbuf **__rte_restrict rx_pkts, uint16_t nb_pkts)
+{
+	return _recv_raw_pkts_vec_flex_rxd(rx_queue, rx_pkts, nb_pkts, NULL);
+}
+
 /*
  * vPMD receive routine that reassembles single burst of 32 scattered
  * packets.
@@ -445,6 +888,76 @@ iavf_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 			rx_pkts + retval, nb_pkts);
 }
 
+/**
+ * vPMD receive routine that reassembles a single burst of 32 scattered packets
+ * for flex RxD
+ *
+ * Notice:
+ * - if nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, no packets are returned
+ */
+static uint16_t
+iavf_recv_scattered_burst_vec_flex_rxd(void *rx_queue,
+				       struct rte_mbuf **rx_pkts,
+				       uint16_t nb_pkts)
+{
+	struct ci_rx_queue *rxq = rx_queue;
+	uint8_t split_flags[IAVF_VPMD_RX_BURST] = {0};
+	unsigned int i = 0;
+
+	/* get some new buffers */
+	uint16_t nb_bufs = _recv_raw_pkts_vec_flex_rxd(rxq, rx_pkts, nb_pkts,
+					      split_flags);
+	if (nb_bufs == 0)
+		return 0;
+
+	/* happy day case, full burst + no packets to be joined */
+	const uint64_t *split_fl64 = (uint64_t *)split_flags;
+
+	if (!rxq->pkt_first_seg &&
+	    split_fl64[0] == 0 && split_fl64[1] == 0 &&
+	    split_fl64[2] == 0 && split_fl64[3] == 0)
+		return nb_bufs;
+
+	/* reassemble any packets that need reassembly */
+	if (!rxq->pkt_first_seg) {
+		/* find the first split flag, and only reassemble from there */
+		while (i < nb_bufs && !split_flags[i])
+			i++;
+		if (i == nb_bufs)
+			return nb_bufs;
+		rxq->pkt_first_seg = rx_pkts[i];
+	}
+	return i + ci_rx_reassemble_packets(&rx_pkts[i], nb_bufs - i, &split_flags[i],
+			&rxq->pkt_first_seg, &rxq->pkt_last_seg, rxq->crc_len);
+}
+
+/**
+ * vPMD receive routine that reassembles scattered packets for flex RxD
+ */
+uint16_t
+iavf_recv_scattered_pkts_vec_flex_rxd(void *rx_queue,
+				      struct rte_mbuf **rx_pkts,
+				      uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > IAVF_VPMD_RX_BURST) {
+		uint16_t burst;
+
+		burst = iavf_recv_scattered_burst_vec_flex_rxd(rx_queue,
+						rx_pkts + retval,
+						IAVF_VPMD_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < IAVF_VPMD_RX_BURST)
+			return retval;
+	}
+
+	return retval + iavf_recv_scattered_burst_vec_flex_rxd(rx_queue,
+						      rx_pkts + retval,
+						      nb_pkts);
+}
+
 static __rte_always_inline void
 iavf_vtx1(volatile struct ci_tx_desc *txdp, struct rte_mbuf *pkt,
 	 uint64_t flags)
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 13+ messages in thread
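A note on iavf_flex_rxd_to_fdir_flags_vec() above: it turns the 0xFFFFFFFF
"mismatch" sentinel into per-packet flag words without any branches, via a
compare / xor / and sequence. Below is a hedged standalone sketch of the same
trick; the two flag values are the bit positions the patch asserts with
RTE_BUILD_BUG_ON, hard-coded here only for illustration.

    #include <stdint.h>
    #include <arm_neon.h>

    #define FLAG_FDIR    (1u << 2)    /* mirrors RTE_MBUF_F_RX_FDIR */
    #define FLAG_FDIR_ID (1u << 13)   /* mirrors RTE_MBUF_F_RX_FDIR_ID */

    static inline uint32x4_t fdir_flags(uint32x4_t fdir_id)
    {
        const uint32x4_t sentinel  = vdupq_n_u32(0xFFFFFFFFu);
        const uint32x4_t flag_bits = vdupq_n_u32(FLAG_FDIR | FLAG_FDIR_ID);

        /* lanes holding the mismatch sentinel become all-ones */
        uint32x4_t mask = vceqq_u32(fdir_id, sentinel);

        /* xor with all-ones inverts the mask: valid IDs -> all-ones */
        mask = veorq_u32(mask, sentinel);

        /* keep the flag bits only in the valid lanes */
        return vandq_u32(mask, flag_bits);
    }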

* [PATCH v3 0/3] net/iavf: add NEON support for Rx/Tx paths
  2026-04-20 10:30 ` [PATCH v2 0/3] net/iavf: add NEON support for Rx/Tx paths Jay Wang
                     ` (2 preceding siblings ...)
  2026-04-20 10:30   ` [PATCH v2 3/3] net/iavf: add NEON support for Rx flex desc Jay Wang
@ 2026-05-05 11:07   ` Jay Wang
  2026-05-05 11:07     ` [PATCH v3 1/3] net/iavf: add Rx scattered function for 32B desc Jay Wang
                       ` (3 more replies)
  3 siblings, 4 replies; 13+ messages in thread
From: Jay Wang @ 2026-05-05 11:07 UTC (permalink / raw)
  Cc: dev, nd, Jay Wang

- Add the scattered Rx burst function for 32B legacy descriptor
- Add the NEON-optimised Tx burst function
- Add the NEON-optimised Rx burst functions for flexible descriptor

Single-core benchmark results on a Grace server with an Intel E810:

| buffer sz (B) | prev (MPPS) | optimised (MPPS) | uplift |
| ------------- | ----------- | ---------------- | ------ |
| 64            | 62.241      | 86.010           | 38.2%  |
| 128           | 60.813      | 81.453           | 33.9%  |
| 256           | 23.730      | 24.633           | 3.8%   |
| 512           | 10.268      | 10.380           | 1.1%   |
| 1024          | 4.588       | 4.628            | 0.9%   |
| 1518          | 4.601       | 4.669            | 1.5%   |


---
v3:
- Optimised the branch prediction hint in the packet-counting step by
  changing unlikely() to likely()
- Added the single-core performance data from Grace + E810, collected by
  Andrew

v2:
- Fixed the AArch32 cross-build failure by restricting the call of
  iavf_rxq_vec_setup to AArch64 only


Jay Wang (3):
  net/iavf: add Rx scattered function for 32B desc
  net/iavf: add NEON-optimised Tx burst function
  net/iavf: add NEON support for Rx flex desc

 drivers/net/intel/iavf/iavf.h               |   4 +
 drivers/net/intel/iavf/iavf_rxtx.c          |  54 +-
 drivers/net/intel/iavf/iavf_rxtx.h          |   2 -
 drivers/net/intel/iavf/iavf_rxtx_vec_neon.c | 753 +++++++++++++++++++-
 drivers/net/intel/iavf/meson.build          |   2 +-
 5 files changed, 797 insertions(+), 18 deletions(-)

-- 
2.43.0


^ permalink raw reply	[flat|nested] 13+ messages in thread
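The first v3 changelog item refers to the C.3 counting step in the Rx loops.
A minimal sketch of that step under the v3 branch hint, with __builtin_ctzll
and __builtin_expect standing in for DPDK's rte_ctz64 and likely(); the input
vector is assumed to already be masked down to the DD bits of its first four
lanes.

    #include <stdint.h>
    #include <arm_neon.h>

    #define likely(x) __builtin_expect(!!(x), 1)

    static inline unsigned count_done(uint16x8_t staterr)
    {
        /* broadcast DD (bit 0) across each 16-bit lane: move it into
         * the sign bit, then arithmetic-shift right to fill the lane
         */
        staterr = vshlq_n_u16(staterr, 15);
        staterr = vreinterpretq_u16_s16(
                vshrq_n_s16(vreinterpretq_s16_u16(staterr), 15));

        /* the low u64 holds the first four lanes; inverting makes the
         * first not-done descriptor produce the first set bit
         */
        uint64_t stat = ~vgetq_lane_u64(vreinterpretq_u64_u16(staterr), 0);

        /* under full load all four descriptors are usually done */
        if (likely(stat == 0))
            return 4;
        return __builtin_ctzll(stat) / 16;
    }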

* [PATCH v3 1/3] net/iavf: add Rx scattered function for 32B desc
  2026-05-05 11:07   ` [PATCH v3 0/3] net/iavf: add NEON support for Rx/Tx paths Jay Wang
@ 2026-05-05 11:07     ` Jay Wang
  2026-05-05 11:07     ` [PATCH v3 2/3] net/iavf: add NEON-optimised Tx burst function Jay Wang
                       ` (2 subsequent siblings)
  3 siblings, 0 replies; 13+ messages in thread
From: Jay Wang @ 2026-05-05 11:07 UTC (permalink / raw)
  To: Vladimir Medvedkin; +Cc: dev, nd, Jay Wang

Added the scattered burst function on AArch64 so that we can leverage
the NEON-optimised Rx raw burst function to handle scattered packets for
the legacy 32B descriptor.

Signed-off-by: Jay Wang <jay.wang2@arm.com>
---
 drivers/net/intel/iavf/iavf.h               |   1 +
 drivers/net/intel/iavf/iavf_rxtx.c          |  16 ++-
 drivers/net/intel/iavf/iavf_rxtx_vec_neon.c | 110 +++++++++++++++++++-
 drivers/net/intel/iavf/meson.build          |   2 +-
 4 files changed, 122 insertions(+), 7 deletions(-)

diff --git a/drivers/net/intel/iavf/iavf.h b/drivers/net/intel/iavf/iavf.h
index 403c61e2e8..e4936f3566 100644
--- a/drivers/net/intel/iavf/iavf.h
+++ b/drivers/net/intel/iavf/iavf.h
@@ -334,6 +334,7 @@ enum iavf_rx_func_type {
 	IAVF_RX_BULK_ALLOC,
 	IAVF_RX_BULK_ALLOC_FLEX_RXD,
 	IAVF_RX_NEON,
+	IAVF_RX_NEON_SCATTERED,
 	IAVF_RX_AVX2,
 	IAVF_RX_AVX2_SCATTERED,
 	IAVF_RX_AVX2_OFFLOAD,
diff --git a/drivers/net/intel/iavf/iavf_rxtx.c b/drivers/net/intel/iavf/iavf_rxtx.c
index 4ff6c18dc4..15566a0e18 100644
--- a/drivers/net/intel/iavf/iavf_rxtx.c
+++ b/drivers/net/intel/iavf/iavf_rxtx.c
@@ -3551,16 +3551,26 @@ static const struct ci_rx_path_info iavf_rx_path_infos[] = {
 		}
 	},
 #endif
-#elif defined RTE_ARCH_ARM
+#elif defined(RTE_ARCH_ARM64)
 	[IAVF_RX_NEON] = {
 		.pkt_burst = iavf_recv_pkts_vec,
 		.info = "Vector Neon",
 		.features = {
-			.rx_offloads = IAVF_RX_SCALAR_OFFLOADS,
+			.rx_offloads = IAVF_RX_VECTOR_OFFLOADS,
 			.simd_width = RTE_VECT_SIMD_128,
 			.bulk_alloc = true
 		}
 	},
+	[IAVF_RX_NEON_SCATTERED] = {
+		.pkt_burst = iavf_recv_scattered_pkts_vec,
+		.info = "Vector Scattered Neon",
+		.features = {
+			.rx_offloads = IAVF_RX_VECTOR_OFFLOADS | RTE_ETH_RX_OFFLOAD_SCATTER,
+			.simd_width = RTE_VECT_SIMD_128,
+			.scattered = true,
+			.bulk_alloc = true
+		}
+	},
 #endif
 };
 
@@ -3839,7 +3849,7 @@ iavf_set_rx_function(struct rte_eth_dev *dev)
 	if (adapter->rx_bulk_alloc_allowed) {
 		req_features.bulk_alloc = true;
 		default_path = IAVF_RX_BULK_ALLOC;
-#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM)
+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
 		if (iavf_rx_vec_dev_check(dev) != -1)
 			req_features.simd_width = iavf_get_max_simd_bitwidth();
 #endif
diff --git a/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c b/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
index 28c90b2a72..45e377d728 100644
--- a/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
+++ b/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2022 Intel Corporation
- * Copyright(c) 2022 Arm Limited
+ * Copyright(c) 2022-2026 Arm Limited
  */
 
 #include <stdint.h>
@@ -145,8 +145,6 @@ _recv_raw_pkts_vec(struct ci_rx_queue *__rte_restrict rxq,
 		   struct rte_mbuf **__rte_restrict rx_pkts,
 		   uint16_t nb_pkts, uint8_t *split_packet)
 {
-	RTE_SET_USED(split_packet);
-
 	volatile union ci_rx_desc *rxdp;
 	struct ci_rx_entry *sw_ring;
 	uint16_t nb_pkts_recd;
@@ -164,6 +162,13 @@ _recv_raw_pkts_vec(struct ci_rx_queue *__rte_restrict rxq,
 		4, 5, 6, 7    /* octet 4~7, 32bits rss */
 		};
 
+	uint8x16_t eop_check = {
+		0x02, 0x00, 0x02, 0x00,
+		0x02, 0x00, 0x02, 0x00,
+		0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00
+	};
+
 	uint16x8_t crc_adjust = {
 		0, 0,         /* ignore pkt_type field */
 		rxq->crc_len, /* sub crc on pkt_len */
@@ -238,6 +243,13 @@ _recv_raw_pkts_vec(struct ci_rx_queue *__rte_restrict rxq,
 		vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);
 		vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);
 
+		if (split_packet) {
+			rte_mbuf_prefetch_part2(rx_pkts[pos]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
+		}
+
 		/* pkts shift the pktlen field to be 16-bit aligned*/
 		uint32x4_t len3 = vshlq_u32(vreinterpretq_u32_u64(descs[3]),
 					    len_shl);
@@ -306,6 +318,32 @@ _recv_raw_pkts_vec(struct ci_rx_queue *__rte_restrict rxq,
 		staterr = vzipq_u16(sterr_tmp1.val[1],
 				    sterr_tmp2.val[1]).val[0];
 
+		/* C* extract and record EOP bit */
+		if (split_packet) {
+			uint8x16_t eop_shuf_mask = {
+				0x00, 0x02, 0x04, 0x06,
+				0xFF, 0xFF, 0xFF, 0xFF,
+				0xFF, 0xFF, 0xFF, 0xFF,
+				0xFF, 0xFF, 0xFF, 0xFF
+			};
+			uint8x16_t eop_bits;
+
+			/* and with mask to extract bits, flipping 1-0 */
+			eop_bits = vmvnq_u8(vreinterpretq_u8_u16(staterr));
+			eop_bits = vandq_u8(eop_bits, eop_check);
+			/* the staterr values are not in order: counting the
+			 * DD bits doesn't care, but end-of-packet tracking
+			 * does, so shuffle. This also compresses the 32-bit
+			 * values to 8-bit
+			 */
+			eop_bits = vqtbl1q_u8(eop_bits, eop_shuf_mask);
+
+			/* store the resulting 32-bit value */
+			vst1q_lane_u32((uint32_t *)split_packet,
+				vreinterpretq_u32_u8(eop_bits), 0);
+			split_packet += IAVF_VPMD_DESCS_PER_LOOP;
+		}
+
 		staterr = vshlq_n_u16(staterr, IAVF_UINT16_BIT - 1);
 		staterr = vreinterpretq_u16_s16(
 				vshrq_n_s16(vreinterpretq_s16_u16(staterr),
@@ -341,6 +379,72 @@ iavf_recv_pkts_vec(void *__rte_restrict rx_queue,
 	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
+/*
+ * vPMD receive routine that reassembles a single burst of 32 scattered
+ * packets.
+ *
+ * Notice:
+ * - if nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, no packets are returned
+ */
+static __rte_always_inline uint16_t
+iavf_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+		uint16_t nb_pkts)
+{
+	struct ci_rx_queue *rxq = rx_queue;
+	uint8_t split_flags[IAVF_VPMD_RX_BURST] = {0};
+
+	/* get some new buffers */
+	uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
+						split_flags);
+
+	if (nb_bufs == 0)
+		return 0;
+
+	/* happy day case, full burst + no packets to be assembled */
+	const uint64_t *split_fl64 = (uint64_t *)split_flags;
+	if (!rxq->pkt_first_seg &&
+			split_fl64[0] == 0 && split_fl64[1] == 0 &&
+			split_fl64[2] == 0 && split_fl64[3] == 0)
+		return nb_bufs;
+
+	/* reassemble any packets that need reassembly */
+	unsigned int i = 0;
+	if (!rxq->pkt_first_seg) {
+		/* find the first split flag, and only reassemble from there */
+		while (i < nb_bufs && !split_flags[i])
+			i++;
+		if (i == nb_bufs)
+			return nb_bufs;
+		rxq->pkt_first_seg = rx_pkts[i];
+	}
+	return i + ci_rx_reassemble_packets(&rx_pkts[i], nb_bufs - i,
+			&split_flags[i], &rxq->pkt_first_seg, &rxq->pkt_last_seg,
+			rxq->crc_len);
+}
+
+/*
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+iavf_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+		uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > IAVF_VPMD_RX_BURST) {
+		uint16_t burst;
+		burst = iavf_recv_scattered_burst_vec(rx_queue,
+				rx_pkts + retval, IAVF_VPMD_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < IAVF_VPMD_RX_BURST)
+			return retval;
+	}
+	/* the final burst, where nb_pkts <= IAVF_VPMD_RX_BURST */
+	return retval + iavf_recv_scattered_burst_vec(rx_queue,
+			rx_pkts + retval, nb_pkts);
+}
+
 void __rte_cold
 iavf_rx_queue_release_mbufs_neon(struct ci_rx_queue *rxq)
 {
diff --git a/drivers/net/intel/iavf/meson.build b/drivers/net/intel/iavf/meson.build
index f9576586f6..50630a88c8 100644
--- a/drivers/net/intel/iavf/meson.build
+++ b/drivers/net/intel/iavf/meson.build
@@ -29,7 +29,7 @@ sources = files(
 if arch_subdir == 'x86'
     sources_avx2 += files('iavf_rxtx_vec_avx2.c')
     sources_avx512 += files('iavf_rxtx_vec_avx512.c')
-elif arch_subdir == 'arm'
+elif arch_subdir == 'arm' and dpdk_conf.get('RTE_ARCH_64')
     sources += files('iavf_rxtx_vec_neon.c')
 endif
 
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 13+ messages in thread
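The "happy day" test in iavf_recv_scattered_burst_vec() above checks all 32
split flags with four 64-bit reads instead of 32 byte compares. A hedged
standalone equivalent follows; the driver reads the array through a cast,
whereas memcpy is used here only to stay strict-aliasing clean in isolation.

    #include <stdint.h>
    #include <string.h>
    #include <stdbool.h>

    /* return true when none of the 32 per-packet split flags are set */
    static inline bool no_split_packets(const uint8_t split_flags[32])
    {
        uint64_t w[4];

        memcpy(w, split_flags, sizeof(w));
        return (w[0] | w[1] | w[2] | w[3]) == 0;
    }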

* [PATCH v3 2/3] net/iavf: add NEON-optimised Tx burst function
  2026-05-05 11:07   ` [PATCH v3 0/3] net/iavf: add NEON support for Rx/Tx paths Jay Wang
  2026-05-05 11:07     ` [PATCH v3 1/3] net/iavf: add Rx scattered function for 32B desc Jay Wang
@ 2026-05-05 11:07     ` Jay Wang
  2026-05-05 11:07     ` [PATCH v3 3/3] net/iavf: add NEON support for Rx flex desc Jay Wang
  2026-05-05 14:06     ` [PATCH v3 0/3] net/iavf: add NEON support for Rx/Tx paths Bruce Richardson
  3 siblings, 0 replies; 13+ messages in thread
From: Jay Wang @ 2026-05-05 11:07 UTC (permalink / raw)
  To: Vladimir Medvedkin; +Cc: dev, nd, Jay Wang

This patch adds the NEON-optimised Tx burst function for the Intel IAVF
driver on AArch64.

Signed-off-by: Jay Wang <jay.wang2@arm.com>
---
 drivers/net/intel/iavf/iavf.h               |   1 +
 drivers/net/intel/iavf/iavf_rxtx.c          |  15 ++-
 drivers/net/intel/iavf/iavf_rxtx.h          |   2 -
 drivers/net/intel/iavf/iavf_rxtx_vec_neon.c | 120 ++++++++++++++++++++
 4 files changed, 133 insertions(+), 5 deletions(-)

diff --git a/drivers/net/intel/iavf/iavf.h b/drivers/net/intel/iavf/iavf.h
index e4936f3566..3e71d345a9 100644
--- a/drivers/net/intel/iavf/iavf.h
+++ b/drivers/net/intel/iavf/iavf.h
@@ -356,6 +356,7 @@ enum iavf_rx_func_type {
 enum iavf_tx_func_type {
 	IAVF_TX_DISABLED,
 	IAVF_TX_DEFAULT,
+	IAVF_TX_NEON,
 	IAVF_TX_AVX2,
 	IAVF_TX_AVX2_OFFLOAD,
 	IAVF_TX_AVX512,
diff --git a/drivers/net/intel/iavf/iavf_rxtx.c b/drivers/net/intel/iavf/iavf_rxtx.c
index 15566a0e18..645bc5ccf6 100644
--- a/drivers/net/intel/iavf/iavf_rxtx.c
+++ b/drivers/net/intel/iavf/iavf_rxtx.c
@@ -3662,6 +3662,15 @@ static const struct ci_tx_path_info iavf_tx_path_infos[] = {
 		}
 	},
 #endif
+#elif defined(RTE_ARCH_ARM64)
+	[IAVF_TX_NEON] = {
+		.pkt_burst = iavf_xmit_pkts_vec,
+		.info = "Vector Neon",
+		.features = {
+			.tx_offloads = IAVF_TX_VECTOR_OFFLOADS,
+			.simd_width = RTE_VECT_SIMD_128
+		}
+	},
 #endif
 };
 
@@ -3878,7 +3887,7 @@ iavf_set_tx_function(struct rte_eth_dev *dev)
 		IAVF_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
 	int mbuf_check = adapter->devargs.mbuf_check;
 	int no_poll_on_link_down = adapter->devargs.no_poll_on_link_down;
-#ifdef RTE_ARCH_X86
+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
 	struct ci_tx_queue *txq;
 	int i;
 	const struct ci_tx_path_features *selected_features;
@@ -3892,7 +3901,7 @@ iavf_set_tx_function(struct rte_eth_dev *dev)
 	if (dev->data->dev_started)
 		goto out;
 
-#ifdef RTE_ARCH_X86
+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
 	if (iavf_tx_vec_dev_check(dev) != -1)
 		req_features.simd_width = iavf_get_max_simd_bitwidth();
 
@@ -3915,7 +3924,7 @@ iavf_set_tx_function(struct rte_eth_dev *dev)
 						IAVF_TX_DEFAULT);
 
 out:
-#ifdef RTE_ARCH_X86
+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
 	selected_features = &iavf_tx_path_infos[adapter->tx_func_type].features;
 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
 		txq = dev->data->tx_queues[i];
diff --git a/drivers/net/intel/iavf/iavf_rxtx.h b/drivers/net/intel/iavf/iavf_rxtx.h
index 80b06518b0..8b8e55e66f 100644
--- a/drivers/net/intel/iavf/iavf_rxtx.h
+++ b/drivers/net/intel/iavf/iavf_rxtx.h
@@ -558,8 +558,6 @@ uint16_t iavf_recv_scattered_pkts_vec(void *rx_queue,
 uint16_t iavf_recv_scattered_pkts_vec_flex_rxd(void *rx_queue,
 					       struct rte_mbuf **rx_pkts,
 					       uint16_t nb_pkts);
-uint16_t iavf_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
-				  uint16_t nb_pkts);
 uint16_t iavf_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 				 uint16_t nb_pkts);
 uint16_t iavf_recv_pkts_vec_avx2_offload(void *rx_queue, struct rte_mbuf **rx_pkts,
diff --git a/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c b/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
index 45e377d728..9c91b6bac1 100644
--- a/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
+++ b/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
@@ -445,6 +445,120 @@ iavf_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 			rx_pkts + retval, nb_pkts);
 }
 
+static __rte_always_inline void
+iavf_vtx1(volatile struct ci_tx_desc *txdp, struct rte_mbuf *pkt,
+	 uint64_t flags)
+{
+	uint64_t high_qw = (CI_TX_DESC_DTYPE_DATA |
+		((uint64_t)flags << CI_TXD_QW1_CMD_S) |
+		((uint64_t)pkt->data_len << CI_TXD_QW1_TX_BUF_SZ_S));
+
+	uint64x2_t descriptor = {rte_pktmbuf_iova(pkt), high_qw};
+	vst1q_u64(RTE_CAST_PTR(uint64_t *, txdp), descriptor);
+}
+
+static __rte_always_inline void
+iavf_vtx(volatile struct ci_tx_desc *txdp, struct rte_mbuf **pkt,
+	uint16_t nb_pkts, uint64_t flags)
+{
+	int i;
+
+	for (i = 0; i < nb_pkts; ++i, ++txdp, ++pkt)
+		iavf_vtx1(txdp, *pkt, flags);
+}
+
+static __rte_always_inline uint16_t
+iavf_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
+		uint16_t nb_pkts)
+{
+	struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
+	volatile struct ci_tx_desc *txdp;
+	struct ci_tx_entry_vec *txep;
+	uint16_t n, nb_commit, tx_id;
+	uint64_t flags = CI_TX_DESC_CMD_DEFAULT;
+	uint64_t rs = CI_TX_DESC_CMD_RS | CI_TX_DESC_CMD_DEFAULT;
+	int i;
+
+	/* crossing the tx_rs_thresh boundary is not allowed */
+	nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+
+	if (txq->nb_tx_free < txq->tx_free_thresh)
+		ci_tx_free_bufs_vec(txq, iavf_tx_desc_done, false);
+
+	nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
+	nb_commit = nb_pkts;
+	if (unlikely(nb_pkts == 0))
+		return 0;
+
+	tx_id = txq->tx_tail;
+	txdp = &txq->ci_tx_ring[tx_id];
+	txep = &txq->sw_ring_vec[tx_id];
+
+	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
+
+	n = (uint16_t)(txq->nb_tx_desc - tx_id);
+	if (nb_commit >= n) {
+		ci_tx_backlog_entry_vec(txep, tx_pkts, n);
+
+		for (i = 0; i < n - 1; ++i, ++tx_pkts, ++txdp)
+			iavf_vtx1(txdp, *tx_pkts, flags);
+
+		/* write with RS for the last descriptor in the segment */
+		iavf_vtx1(txdp, *tx_pkts++, rs);
+
+		nb_commit = (uint16_t)(nb_commit - n);
+
+		tx_id = 0;
+		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
+
+		/* avoid reaching the end of the ring */
+		txdp = &txq->ci_tx_ring[tx_id];
+		txep = &txq->sw_ring_vec[tx_id];
+	}
+
+	ci_tx_backlog_entry_vec(txep, tx_pkts, nb_commit);
+
+	iavf_vtx(txdp, tx_pkts, nb_commit, flags);
+
+	tx_id = (uint16_t)(tx_id + nb_commit);
+	if (tx_id > txq->tx_next_rs) {
+		txq->ci_tx_ring[txq->tx_next_rs].cmd_type_offset_bsz |=
+			rte_cpu_to_le_64(((uint64_t)CI_TX_DESC_CMD_RS) <<
+					 CI_TXD_QW1_CMD_S);
+		txq->tx_next_rs =
+			(uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh);
+	}
+
+	txq->tx_tail = tx_id;
+
+	IAVF_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);
+
+	return nb_pkts;
+}
+
+uint16_t
+iavf_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
+		   uint16_t nb_pkts)
+{
+	uint16_t nb_tx = 0;
+	struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
+
+	while (nb_pkts) {
+		uint16_t ret, num;
+
+		/* crossing the tx_rs_thresh boundary is not allowed */
+		num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+		ret = iavf_xmit_fixed_burst_vec(tx_queue, &tx_pkts[nb_tx],
+				num);
+		nb_tx += ret;
+		nb_pkts -= ret;
+		if (ret < num)
+			break;
+	}
+
+	return nb_tx;
+}
+
 void __rte_cold
 iavf_rx_queue_release_mbufs_neon(struct ci_rx_queue *rxq)
 {
@@ -465,6 +579,12 @@ iavf_rx_vec_dev_check(struct rte_eth_dev *dev)
 	return iavf_rx_vec_dev_check_default(dev);
 }
 
+int __rte_cold
+iavf_tx_vec_dev_check(struct rte_eth_dev *dev)
+{
+	return iavf_tx_vec_dev_check_default(dev);
+}
+
 enum rte_vect_max_simd
 iavf_get_max_simd_bitwidth(void)
 {
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 13+ messages in thread
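The wrapper pattern in iavf_xmit_pkts_vec() above, splitting a large burst
into tx_rs_thresh-sized chunks and stopping early once the ring fills, is
generic. A minimal sketch with a hypothetical fixed-burst callback; none of
these names exist in the driver.

    #include <stdint.h>

    /* hypothetical fixed-size burst function: sends at most 'n' packets
     * and may send fewer when descriptors run out
     */
    typedef uint16_t (*fixed_burst_fn)(void *q, void **pkts, uint16_t n);

    static uint16_t xmit_chunked(void *q, void **pkts, uint16_t nb_pkts,
                                 uint16_t chunk, fixed_burst_fn burst)
    {
        uint16_t sent = 0;

        while (nb_pkts) {
            uint16_t num = nb_pkts < chunk ? nb_pkts : chunk;
            uint16_t ret = burst(q, &pkts[sent], num);

            sent += ret;
            nb_pkts -= ret;
            /* a short return means the ring is full: stop early */
            if (ret < num)
                break;
        }
        return sent;
    }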

* [PATCH v3 3/3] net/iavf: add NEON support for Rx flex desc
  2026-05-05 11:07   ` [PATCH v3 0/3] net/iavf: add NEON support for Rx/Tx paths Jay Wang
  2026-05-05 11:07     ` [PATCH v3 1/3] net/iavf: add Rx scattered function for 32B desc Jay Wang
  2026-05-05 11:07     ` [PATCH v3 2/3] net/iavf: add NEON-optimised Tx burst function Jay Wang
@ 2026-05-05 11:07     ` Jay Wang
  2026-05-05 14:06     ` [PATCH v3 0/3] net/iavf: add NEON support for Rx/Tx paths Bruce Richardson
  3 siblings, 0 replies; 13+ messages in thread
From: Jay Wang @ 2026-05-05 11:07 UTC (permalink / raw)
  To: Vladimir Medvedkin; +Cc: dev, nd, Jay Wang, Andrew Bailey

This patch adds the NEON-optimised Rx paths that process the receive
flex descriptor.

The single-core testpmd benchmarking was conducted by Andrew Bailey on
an Arm Grace system with an Intel E810 100G NIC. The test was run with
txq/rxq=1 and 2048 descriptors. The throughput performance uplift is
shown in the following table.

| buffer sz (B) | prev (MPPS) | optimised (MPPS) | uplift |
| ------------- | ----------- | ---------------- | ------ |
| 64            | 62.241      | 86.010           | 38.2%  |
| 128           | 60.813      | 81.453           | 33.9%  |
| 256           | 23.730      | 24.633           | 3.8%   |
| 512           | 10.268      | 10.380           | 1.1%   |
| 1024          | 4.588       | 4.628            | 0.9%   |
| 1518          | 4.601       | 4.669            | 1.5%   |

Signed-off-by: Jay Wang <jay.wang2@arm.com>
Tested-by: Andrew Bailey <abailey@iol.unh.edu>
---
 drivers/net/intel/iavf/iavf.h               |   2 +
 drivers/net/intel/iavf/iavf_rxtx.c          |  23 +-
 drivers/net/intel/iavf/iavf_rxtx_vec_neon.c | 523 +++++++++++++++++++-
 3 files changed, 542 insertions(+), 6 deletions(-)

diff --git a/drivers/net/intel/iavf/iavf.h b/drivers/net/intel/iavf/iavf.h
index 3e71d345a9..360d728f3a 100644
--- a/drivers/net/intel/iavf/iavf.h
+++ b/drivers/net/intel/iavf/iavf.h
@@ -335,6 +335,8 @@ enum iavf_rx_func_type {
 	IAVF_RX_BULK_ALLOC_FLEX_RXD,
 	IAVF_RX_NEON,
 	IAVF_RX_NEON_SCATTERED,
+	IAVF_RX_NEON_FLEX_RXD,
+	IAVF_RX_NEON_SCATTERED_FLEX_RXD,
 	IAVF_RX_AVX2,
 	IAVF_RX_AVX2_SCATTERED,
 	IAVF_RX_AVX2_OFFLOAD,
diff --git a/drivers/net/intel/iavf/iavf_rxtx.c b/drivers/net/intel/iavf/iavf_rxtx.c
index 645bc5ccf6..8e711950ff 100644
--- a/drivers/net/intel/iavf/iavf_rxtx.c
+++ b/drivers/net/intel/iavf/iavf_rxtx.c
@@ -713,7 +713,7 @@ iavf_dev_rx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx,
 		ad->rx_bulk_alloc_allowed = false;
 	}
 
-#if defined RTE_ARCH_X86 || defined RTE_ARCH_ARM
+#if defined RTE_ARCH_X86 || defined RTE_ARCH_ARM64
 	/* check vector conflict */
 	if (ci_rxq_vec_capable(rxq->nb_rx_desc, rxq->rx_free_thresh) &&
 			iavf_rxq_vec_setup(rxq)) {
@@ -3571,6 +3571,27 @@ static const struct ci_rx_path_info iavf_rx_path_infos[] = {
 			.bulk_alloc = true
 		}
 	},
+	[IAVF_RX_NEON_FLEX_RXD] = {
+		.pkt_burst = iavf_recv_pkts_vec_flex_rxd,
+		.info = "Vector Neon Flex",
+		.features = {
+			.rx_offloads = IAVF_RX_VECTOR_FLEX_OFFLOADS,
+			.simd_width = RTE_VECT_SIMD_128,
+			.flex_desc = true,
+			.bulk_alloc = true
+		}
+	},
+	[IAVF_RX_NEON_SCATTERED_FLEX_RXD] = {
+		.pkt_burst = iavf_recv_scattered_pkts_vec_flex_rxd,
+		.info = "Vector Scattered Neon Flex",
+		.features = {
+			.rx_offloads = IAVF_RX_VECTOR_FLEX_OFFLOADS | RTE_ETH_RX_OFFLOAD_SCATTER,
+			.simd_width = RTE_VECT_SIMD_128,
+			.scattered = true,
+			.flex_desc = true,
+			.bulk_alloc = true
+		}
+	},
 #endif
 };
 
diff --git a/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c b/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
index 9c91b6bac1..e4c3f073cb 100644
--- a/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
+++ b/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
@@ -16,12 +16,445 @@
 
 #include "../common/rx_vec_arm.h"
 
-static inline void
+#define PKTLEN_SHIFT     10
+#define IAVF_UINT16_BIT (CHAR_BIT * sizeof(uint16_t))
+
+static __rte_always_inline void
 iavf_rxq_rearm(struct ci_rx_queue *rxq)
 {
 	ci_rxq_rearm(rxq);
 }
 
+static __rte_always_inline uint32x4_t
+iavf_flex_rxd_to_fdir_flags_vec(const uint32x4_t fdir_id0_3)
+{
+#define FDID_MIS_MAGIC	0xFFFFFFFFu
+	RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR != (1u << 2));
+	RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR_ID != (1u << 13));
+
+	const uint32x4_t pkt_fdir_bit = vdupq_n_u32((uint32_t)(RTE_MBUF_F_RX_FDIR |
+			RTE_MBUF_F_RX_FDIR_ID));
+	const uint32x4_t fdir_mis_mask = vdupq_n_u32(FDID_MIS_MAGIC);
+
+	/* desc->flow_id field == 0xFFFFFFFF means fdir mismatch */
+	uint32x4_t fdir_mask = vceqq_u32(fdir_id0_3, fdir_mis_mask);
+
+	/* xor with 0xFFFFFFFF bit-reverses the mask */
+	fdir_mask = veorq_u32(fdir_mask, fdir_mis_mask);
+	const uint32x4_t fdir_flags = vandq_u32(fdir_mask, pkt_fdir_bit);
+
+	return fdir_flags;
+}
+
+static __rte_always_inline void
+iavf_flex_rxd_to_olflags_v(struct ci_rx_queue *rxq, uint64x2_t descs[4],
+		struct rte_mbuf **rx_pkts)
+{
+	const uint64x2_t mbuf_init = {rxq->mbuf_initializer, 0};
+	uint64x2_t rearm0, rearm1, rearm2, rearm3;
+
+	uint32x4_t tmp_desc, flags, rss_vlan;
+
+	/* mask everything except checksum, RSS, and VLAN flags
+	 * bit fields defined in enum iavf_rx_flex_desc_status_error_0_bits
+	 * bit 7:4 for checksum
+	 * bit 12 for RSS indication
+	 * bit 13 for VLAN indication
+	 */
+	const uint32x4_t desc_mask = {0x30f0, 0x30f0, 0x30f0, 0x30f0};
+	const uint32x4_t cksum_mask = {
+		RTE_MBUF_F_RX_IP_CKSUM_MASK | RTE_MBUF_F_RX_L4_CKSUM_MASK |
+		RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
+		RTE_MBUF_F_RX_IP_CKSUM_MASK | RTE_MBUF_F_RX_L4_CKSUM_MASK |
+		RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
+		RTE_MBUF_F_RX_IP_CKSUM_MASK | RTE_MBUF_F_RX_L4_CKSUM_MASK |
+		RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
+		RTE_MBUF_F_RX_IP_CKSUM_MASK | RTE_MBUF_F_RX_L4_CKSUM_MASK |
+		RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
+	};
+
+	/* map the checksum, rss and vlan fields to the checksum, rss
+	 * and vlan flags.
+	 */
+	const uint8x16_t cksum_flags = {
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
+		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
+		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
+		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
+		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1
+	};
+
+	const uint8x16_t rss_vlan_flags = {
+		0, RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
+		RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
+		0, 0, 0, 0,
+		0, 0, 0, 0,
+		0, 0, 0, 0,
+	};
+
+	/* extract status_error0 field from 4 descriptors,
+	 * and mask out everything else not in desc_mask
+	 */
+	flags = vzip2q_u32(vreinterpretq_u32_u64(descs[0]),
+				vreinterpretq_u32_u64(descs[1]));
+	tmp_desc = vzip2q_u32(vreinterpretq_u32_u64(descs[2]),
+				vreinterpretq_u32_u64(descs[3]));
+	tmp_desc = vreinterpretq_u32_u64(vzip1q_u64(vreinterpretq_u64_u32(flags),
+				vreinterpretq_u64_u32(tmp_desc)));
+
+	tmp_desc = vandq_u32(tmp_desc, desc_mask);
+
+	/* shift each 32-bit lane right by 4 so that we can use
+	 * the checksum bit as an index into cksum_flags
+	 */
+	tmp_desc = vshrq_n_u32(tmp_desc, 4);
+	flags = vreinterpretq_u32_u8(vqtbl1q_u8(cksum_flags,
+				vreinterpretq_u8_u32(tmp_desc)));
+	/* shift left by 1 bit since we shift right by 1 bit
+	 * in cksum_flags
+	 */
+	flags = vshlq_n_u32(flags, 1);
+
+	/* first check the outer L4 checksum */
+	uint32x4_t l4_outer_mask = {0x6, 0x6, 0x6, 0x6};
+	uint32x4_t l4_outer_flags = vandq_u32(flags, l4_outer_mask);
+	l4_outer_flags = vshlq_n_u32(l4_outer_flags, 20);
+
+	/* then check the rest of cksum bits */
+	uint32x4_t l3_l4_mask = {~0x6, ~0x6, ~0x6, ~0x6};
+	uint32x4_t l3_l4_flags = vandq_u32(flags, l3_l4_mask);
+	flags = vorrq_u32(l3_l4_flags, l4_outer_flags);
+
+	/* only keep the cksum flags in flags */
+	flags = vandq_u32(flags, cksum_mask);
+
+	/* map RSS, VLAN flags in HW desc to RTE_MBUF */
+	tmp_desc = vshrq_n_u32(tmp_desc, 8);
+	rss_vlan = vreinterpretq_u32_u8(vqtbl1q_u8(rss_vlan_flags,
+				vreinterpretq_u8_u32(tmp_desc)));
+
+	/* merge the flags */
+	flags = vorrq_u32(flags, rss_vlan);
+
+	/* check the additional fdir_flags if fdir is enabled */
+	if (rxq->fdir_enabled) {
+		const uint32x4_t fdir_id0_1 =
+			vzip2q_u32(vreinterpretq_u32_u64(descs[0]),
+				vreinterpretq_u32_u64(descs[1]));
+		const uint32x4_t fdir_id2_3 =
+			vzip2q_u32(vreinterpretq_u32_u64(descs[2]),
+				vreinterpretq_u32_u64(descs[3]));
+		const uint32x4_t fdir_id0_3 =
+			vreinterpretq_u32_u64(vzip2q_u64(vreinterpretq_u64_u32(fdir_id0_1),
+				vreinterpretq_u64_u32(fdir_id2_3)));
+		const uint32x4_t fdir_flags =
+			iavf_flex_rxd_to_fdir_flags_vec(fdir_id0_3);
+
+		/* merge with fdir_flags */
+		flags = vorrq_u32(flags, fdir_flags);
+
+		/* write fdir_id to mbuf */
+		rx_pkts[0]->hash.fdir.hi = vgetq_lane_u32(fdir_id0_3, 0);
+		rx_pkts[1]->hash.fdir.hi = vgetq_lane_u32(fdir_id0_3, 1);
+		rx_pkts[2]->hash.fdir.hi = vgetq_lane_u32(fdir_id0_3, 2);
+		rx_pkts[3]->hash.fdir.hi = vgetq_lane_u32(fdir_id0_3, 3);
+	}
+
+	/* At this point, we have the 4 sets of flags in the low-16-bits
+	 * of each 32-bit value in flags.
+	 * We want to extract these, and merge them with the mbuf init data
+	 * so we can do a single 16-byte write to the mbuf to set the flags
+	 * and all the other initialization fields. Extracting the appropriate
+	 * flags means that we have to do a shift and blend for each mbuf
+	 * before we do the write.
+	 */
+	rearm0 = vsetq_lane_u64(vgetq_lane_u32(flags, 0), mbuf_init, 1);
+	rearm1 = vsetq_lane_u64(vgetq_lane_u32(flags, 1), mbuf_init, 1);
+	rearm2 = vsetq_lane_u64(vgetq_lane_u32(flags, 2), mbuf_init, 1);
+	rearm3 = vsetq_lane_u64(vgetq_lane_u32(flags, 3), mbuf_init, 1);
+
+	/* compile time check */
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
+					 offsetof(struct rte_mbuf, rearm_data) + 8);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
+					 RTE_ALIGN(offsetof(struct rte_mbuf, rearm_data), 16));
+
+	/* write the rearm data and the olflags in one write */
+	vst1q_u64((uint64_t *)&rx_pkts[0]->rearm_data, rearm0);
+	vst1q_u64((uint64_t *)&rx_pkts[1]->rearm_data, rearm1);
+	vst1q_u64((uint64_t *)&rx_pkts[2]->rearm_data, rearm2);
+	vst1q_u64((uint64_t *)&rx_pkts[3]->rearm_data, rearm3);
+}
+
+static __rte_always_inline void
+iavf_flex_rxd_to_ptype_v(uint64x2_t descs[4], struct rte_mbuf **rx_pkts,
+	uint32_t *ptype_tbl)
+{
+	const uint16x8_t ptype_mask = {
+		0, IAVF_RX_FLEX_DESC_PTYPE_M,
+		0, IAVF_RX_FLEX_DESC_PTYPE_M,
+		0, IAVF_RX_FLEX_DESC_PTYPE_M,
+		0, IAVF_RX_FLEX_DESC_PTYPE_M
+	};
+
+	uint32x4_t ptype_01 = vzip1q_u32(vreinterpretq_u32_u64(descs[0]),
+				vreinterpretq_u32_u64(descs[1]));
+	uint32x4_t ptype_23 = vzip1q_u32(vreinterpretq_u32_u64(descs[2]),
+				vreinterpretq_u32_u64(descs[3]));
+	uint32x4_t ptype_all_u32 =
+				vreinterpretq_u32_u64(vzip1q_u64(vreinterpretq_u64_u32(ptype_01),
+					vreinterpretq_u64_u32(ptype_23)));
+	uint16x8_t ptype_all = vreinterpretq_u16_u32(ptype_all_u32);
+
+	ptype_all = vandq_u16(ptype_all, ptype_mask);
+
+	rx_pkts[0]->packet_type = ptype_tbl[vgetq_lane_u16(ptype_all, 1)];
+	rx_pkts[1]->packet_type = ptype_tbl[vgetq_lane_u16(ptype_all, 3)];
+	rx_pkts[2]->packet_type = ptype_tbl[vgetq_lane_u16(ptype_all, 5)];
+	rx_pkts[3]->packet_type = ptype_tbl[vgetq_lane_u16(ptype_all, 7)];
+}
+
+/**
+ * vPMD raw receive routine for flex RxD,
+ * only accepts (nb_pkts >= IAVF_VPMD_DESCS_PER_LOOP)
+ *
+ * Notice:
+ * - if nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, no packets are returned
+ * - nb_pkts is floor-aligned to a multiple of IAVF_VPMD_DESCS_PER_LOOP
+ */
+static inline uint16_t
+_recv_raw_pkts_vec_flex_rxd(struct ci_rx_queue *__rte_restrict rxq,
+		   struct rte_mbuf **__rte_restrict rx_pkts,
+		   uint16_t nb_pkts, uint8_t *split_packet)
+{
+	struct iavf_adapter *adapter = rxq->iavf_vsi->adapter;
+	uint32_t *ptype_tbl = adapter->ptype_tbl;
+	volatile union ci_rx_flex_desc *rxdp;
+	struct ci_rx_entry *sw_ring;
+	uint16_t nb_pkts_recd;
+	int pos;
+
+	uint16x8_t crc_adjust = {
+		0, 0,			/* ignore pkt_type field */
+		rxq->crc_len,	/* sub crc on pkt_len */
+		0,				/* ignore high 16 bits of pkt_len */
+		rxq->crc_len,	/* sub crc on data_len */
+		0, 0, 0			/* ignore non-length fields */
+	};
+
+	/* mask to shuffle from flex descriptor to mbuf */
+	const uint8x16_t shuf_msk = {
+		0xFF, 0xFF,		/* pkt_type set as unknown */
+		0xFF, 0xFF,		/* pkt_type set as unknown */
+		4, 5,			/* octet 4~5, low bits pkt_len */
+		0xFF, 0xFF,		/* skip high 16 bits pkt_len, zero out */
+		4, 5,			/* octet 4~5, 16 bits data_len */
+		10, 11,			/* octet 10~11, 16 bits vlan_macip */
+		0xFF, 0xFF,		/* rss hash parsed separately */
+		0xFF, 0xFF,
+	};
+
+	/* compile-time check the above crc_adjust layout is correct */
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
+
+	/* 4 packets DD mask */
+	const uint16x8_t dd_check = {
+		0x0001, 0x0001, 0x0001, 0x0001,
+		0, 0, 0, 0
+	};
+
+	/* 4 packets EOP mask */
+	const uint8x16_t eop_check = {
+		0x2, 0, 0x2, 0, 0x2, 0, 0x2, 0,
+		0, 0, 0, 0, 0, 0, 0, 0
+	};
+
+	/* nb_pkts has to be floor-aligned to IAVF_VPMD_DESCS_PER_LOOP */
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_VPMD_DESCS_PER_LOOP);
+
+	rxdp = rxq->rx_flex_ring + rxq->rx_tail;
+	rte_prefetch0(rxdp);
+
+	/* see if we need to rearm the Rx queue */
+	if (rxq->rxrearm_nb > rxq->rx_free_thresh)
+		iavf_rxq_rearm(rxq);
+
+	/* check if there is actually a packet available */
+	if (!(rxdp->wb.status_error0 &
+			rte_cpu_to_le_32(1 << IAVF_RX_FLEX_DESC_STATUS0_DD_S)))
+		return 0;
+
+	/* take a pointer to the software ring entries for this burst */
+	sw_ring = &rxq->sw_ring[rxq->rx_tail];
+
+	/* A. load 4 packets in a loop
+	 * [A*. mask out the unused dirty fields in flex desc]
+	 * B. copy 4 mbuf pointers from sw_ring to rx_pkts
+	 * C. count the number of DD bits in the 4 packets
+	 * [C*. extract the end-of-packet bit, if requested]
+	 * D. fill info from flex desc to mbuf
+	 */
+	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
+		 pos += IAVF_VPMD_DESCS_PER_LOOP,
+		 rxdp += IAVF_VPMD_DESCS_PER_LOOP) {
+		uint64x2_t descs[IAVF_VPMD_DESCS_PER_LOOP];
+		uint8x16_t pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3;
+		uint16x8_t sterr_tmp1, sterr_tmp2;
+		uint64x2_t mbp1, mbp2;
+		uint16x8_t staterr;
+		uint16x8_t tmp;
+		uint64_t stat;
+
+		/* A.1 load descs[3-0] */
+		descs[3] = vld1q_u64(RTE_CAST_PTR(uint64_t *, rxdp + 3));
+		descs[2] = vld1q_u64(RTE_CAST_PTR(uint64_t *, rxdp + 2));
+		descs[1] = vld1q_u64(RTE_CAST_PTR(uint64_t *, rxdp + 1));
+		descs[0] = vld1q_u64(RTE_CAST_PTR(uint64_t *, rxdp + 0));
+
+		/* use acquire fence to order loads of descriptor qwords */
+		rte_atomic_thread_fence(rte_memory_order_acquire);
+		/* A.2 reload qword0 to make it ordered after qword1 load */
+		descs[3] = vld1q_lane_u64(RTE_CAST_PTR(uint64_t *, rxdp + 3),
+					descs[3], 0);
+		descs[2] = vld1q_lane_u64(RTE_CAST_PTR(uint64_t *, rxdp + 2),
+					descs[2], 0);
+		descs[1] = vld1q_lane_u64(RTE_CAST_PTR(uint64_t *, rxdp + 1),
+					descs[1], 0);
+		descs[0] = vld1q_lane_u64(RTE_CAST_PTR(uint64_t *, rxdp),
+					descs[0], 0);
+
+		/* B.1 load 4 mbuf pointers */
+		mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]);
+		mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);
+
+		/* B.2 copy 4 mbuf pointers into rx_pkts */
+		vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);
+		vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);
+
+		/* prefetch the second cacheline of the mbufs, needed for segment chaining */
+		if (split_packet) {
+			rte_mbuf_prefetch_part2(rx_pkts[pos]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
+		}
+
+		iavf_flex_rxd_to_olflags_v(rxq, descs, &rx_pkts[pos]);
+
+		/* D.1 pkts convert format from desc to pktmbuf */
+		pkt_mb3 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[3]), shuf_msk);
+		pkt_mb2 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[2]), shuf_msk);
+		pkt_mb1 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[1]), shuf_msk);
+		pkt_mb0 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[0]), shuf_msk);
+
+		/* D.2 adjust pkt_len/data_len to remove the CRC */
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb3), crc_adjust);
+		pkt_mb3 = vreinterpretq_u8_u16(tmp);
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb2), crc_adjust);
+		pkt_mb2 = vreinterpretq_u8_u16(tmp);
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb1), crc_adjust);
+		pkt_mb1 = vreinterpretq_u8_u16(tmp);
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb0), crc_adjust);
+		pkt_mb0 = vreinterpretq_u8_u16(tmp);
+
+		/* D.3 copy final data to rx_pkts */
+		vst1q_u8((void *)&rx_pkts[pos + 3]->rx_descriptor_fields1, pkt_mb3);
+		vst1q_u8((void *)&rx_pkts[pos + 2]->rx_descriptor_fields1, pkt_mb2);
+		vst1q_u8((void *)&rx_pkts[pos + 1]->rx_descriptor_fields1, pkt_mb1);
+		vst1q_u8((void *)&rx_pkts[pos + 0]->rx_descriptor_fields1, pkt_mb0);
+
+		iavf_flex_rxd_to_ptype_v(descs, &rx_pkts[pos], ptype_tbl);
+
+		/* C.1 filter staterr info only */
+		sterr_tmp2 = vzip2q_u16(vreinterpretq_u16_u64(descs[1]),
+						vreinterpretq_u16_u64(descs[3]));
+		sterr_tmp1 = vzip2q_u16(vreinterpretq_u16_u64(descs[0]),
+						vreinterpretq_u16_u64(descs[2]));
+
+		/* C.2 get 4 pkts status_error0 value */
+		staterr = vzip1q_u16(sterr_tmp1, sterr_tmp2);
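+		/* u16 lanes 0-3 now hold status_error0 of descs 0-3 in order */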
+
+		/* C* extract and record EOP bits */
+		if (split_packet) {
+			const uint8x16_t eop_shuf_mask = {
+					0x00, 0x02, 0x04, 0x06,
+					0xFF, 0xFF, 0xFF, 0xFF,
+					0xFF, 0xFF, 0xFF, 0xFF,
+					0xFF, 0xFF, 0xFF, 0xFF};
+			uint8x16_t eop_bits;
+
+			/* and with mask to extract bits, flipping 1-0 */
+			eop_bits = vmvnq_u8(vreinterpretq_u8_u16(staterr));
+			eop_bits = vandq_u8(eop_bits, eop_check);
+
+			/* the inverted EOP bits sit in the low byte of each
+			 * 16-bit lane; compress them into the first four
+			 * bytes, one byte per packet
+			 */
+			eop_bits = vqtbl1q_u8(eop_bits, eop_shuf_mask);
+
+			/* store the 32-bit value, one split flag per packet */
+			vst1q_lane_u32((uint32_t *)split_packet,
+							vreinterpretq_u32_u8(eop_bits), 0);
+			split_packet += IAVF_VPMD_DESCS_PER_LOOP;
+		}
+
+		/* C.3 count available number of descriptors */
+		/* mask everything except DD bit */
+		staterr = vandq_u16(staterr, dd_check);
+		/* move the status bit (bit0) into the sign bit (bit15)
+		 * of each 16-bit lane
+		 */
+		staterr = vshlq_n_u16(staterr, IAVF_UINT16_BIT - 1);
+
+		/* reinterpret staterr as signed 16-bit and arithmetic shift
+		 * right by 15: each lane becomes 0xFFFF if its DD bit was
+		 * set and 0 otherwise; then interpret the result back as
+		 * an unsigned u16 vector
+		 */
+		staterr = vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(staterr),
+					IAVF_UINT16_BIT - 1));
+
+		/* reinterpret u16x8 vector as u64x2, and fetch the low u64
+		 * which contains the first four 16-bit lanes, and invert all bits
+		 */
+		stat = ~vgetq_lane_u64(vreinterpretq_u64_u16(staterr), 0);
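+		/* stat == 0 iff all four descriptors had their DD bit set */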
+
+		if (likely(stat == 0)) {
+			nb_pkts_recd += IAVF_VPMD_DESCS_PER_LOOP;
+		} else {
+			nb_pkts_recd += rte_ctz64(stat) / IAVF_UINT16_BIT;
+			break;
+		}
+	}
+
+	/* Update our internal tail pointer */
+	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
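+	/* the vector path requires a power-of-two ring size, so the mask wraps the tail */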
+	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
+	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);
+
+	return nb_pkts_recd;
+}
+
 static inline void
 desc_to_olflags_v(struct ci_rx_queue *rxq, volatile union ci_rx_desc *rxdp,
 		  uint64x2_t descs[4], struct rte_mbuf **rx_pkts)
@@ -115,9 +548,6 @@ desc_to_olflags_v(struct ci_rx_queue *rxq, volatile union ci_rx_desc *rxdp,
 	vst1q_u64((uint64_t *)&rx_pkts[3]->rearm_data, rearm3);
 }
 
-#define PKTLEN_SHIFT     10
-#define IAVF_UINT16_BIT (CHAR_BIT * sizeof(uint16_t))
-
 static inline void
 desc_to_ptype_v(uint64x2_t descs[4], struct rte_mbuf **__rte_restrict rx_pkts,
 		uint32_t *__rte_restrict ptype_tbl)
@@ -351,7 +781,7 @@ _recv_raw_pkts_vec(struct ci_rx_queue *__rte_restrict rxq,
 		stat = ~vgetq_lane_u64(vreinterpretq_u64_u16(staterr), 0);
 
 		/* C.4 calc available number of desc */
-		if (unlikely(stat == 0)) {
+		if (likely(stat == 0)) {
 			nb_pkts_recd += IAVF_VPMD_DESCS_PER_LOOP;
 		} else {
 			nb_pkts_recd += rte_ctz64(stat) / IAVF_UINT16_BIT;
@@ -379,6 +809,19 @@ iavf_recv_pkts_vec(void *__rte_restrict rx_queue,
 	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
+/*
+ * Notice:
+ * - if nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, no packets are returned
+ * - if nb_pkts > IAVF_VPMD_RX_BURST, only the DD bits of the first
+ *   IAVF_VPMD_RX_BURST descriptors are scanned
+ */
+uint16_t
+iavf_recv_pkts_vec_flex_rxd(void *__rte_restrict rx_queue,
+		struct rte_mbuf **__rte_restrict rx_pkts, uint16_t nb_pkts)
+{
+	return _recv_raw_pkts_vec_flex_rxd(rx_queue, rx_pkts, nb_pkts, NULL);
+}
+
 /*
  * vPMD receive routine that reassembles single burst of 32 scattered
  * packets.
@@ -445,6 +888,76 @@ iavf_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 			rx_pkts + retval, nb_pkts);
 }
 
+/**
+ * vPMD receive routine that reassembles a single burst of 32 scattered
+ * packets for flex RxD
+ *
+ * Notice:
+ * - if nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, no packets are returned
+ */
+static uint16_t
+iavf_recv_scattered_burst_vec_flex_rxd(void *rx_queue,
+				       struct rte_mbuf **rx_pkts,
+				       uint16_t nb_pkts)
+{
+	struct ci_rx_queue *rxq = rx_queue;
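+	/* one flag per packet: non-zero means the descriptor had no EOP
+	 * bit, i.e. the packet continues in the following descriptor
+	 */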
+	uint8_t split_flags[IAVF_VPMD_RX_BURST] = {0};
+	unsigned int i = 0;
+
+	/* get some new buffers */
+	uint16_t nb_bufs = _recv_raw_pkts_vec_flex_rxd(rxq, rx_pkts, nb_pkts,
+					      split_flags);
+	if (nb_bufs == 0)
+		return 0;
+
+	/* happy day case, full burst + no packets to be joined */
+	const uint64_t *split_fl64 = (uint64_t *)split_flags;
+
+	if (!rxq->pkt_first_seg &&
+	    split_fl64[0] == 0 && split_fl64[1] == 0 &&
+	    split_fl64[2] == 0 && split_fl64[3] == 0)
+		return nb_bufs;
+
+	/* reassemble any packets that need reassembly */
+	if (!rxq->pkt_first_seg) {
+		/* find the first split flag, and only reassemble from there */
+		while (i < nb_bufs && !split_flags[i])
+			i++;
+		if (i == nb_bufs)
+			return nb_bufs;
+		rxq->pkt_first_seg = rx_pkts[i];
+	}
+	return i + ci_rx_reassemble_packets(&rx_pkts[i], nb_bufs - i, &split_flags[i],
+			&rxq->pkt_first_seg, &rxq->pkt_last_seg, rxq->crc_len);
+}
+
+/**
+ * vPMD receive routine that reassembles scattered packets for flex RxD
+ */
+uint16_t
+iavf_recv_scattered_pkts_vec_flex_rxd(void *rx_queue,
+				      struct rte_mbuf **rx_pkts,
+				      uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > IAVF_VPMD_RX_BURST) {
+		uint16_t burst;
+
+		burst = iavf_recv_scattered_burst_vec_flex_rxd(rx_queue,
+						rx_pkts + retval,
+						IAVF_VPMD_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < IAVF_VPMD_RX_BURST)
+			return retval;
+	}
+
+	return retval + iavf_recv_scattered_burst_vec_flex_rxd(rx_queue,
+						      rx_pkts + retval,
+						      nb_pkts);
+}
+
 static __rte_always_inline void
 iavf_vtx1(volatile struct ci_tx_desc *txdp, struct rte_mbuf *pkt,
 	 uint64_t flags)
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH v3 0/3] net/iavf: add NEON support for Rx/Tx paths
  2026-05-05 11:07   ` [PATCH v3 0/3] net/iavf: add NEON support for Rx/Tx paths Jay Wang
                       ` (2 preceding siblings ...)
  2026-05-05 11:07     ` [PATCH v3 3/3] net/iavf: add NEON support for Rx flex desc Jay Wang
@ 2026-05-05 14:06     ` Bruce Richardson
  3 siblings, 0 replies; 13+ messages in thread
From: Bruce Richardson @ 2026-05-05 14:06 UTC (permalink / raw)
  To: Jay Wang; +Cc: dev, nd

On Tue, May 05, 2026 at 11:07:41AM +0000, Jay Wang wrote:
> - Add the scattered Rx burst function for 32B legacy descriptor
> - Add the NEON-optimised Tx burst function
> - Add the NEON-optimised Rx burst functions for flexible descriptor
> 
> Single-core benchmark result on a Grace server with an Intel E810:
> 
> | buffer sz (B) | prev (MPPS) | optimised (MPPS) | uplift |
> | ------------- | ----------- | ---------------- | ------ |
> | 64            | 62.241      | 86.010           | 38.2%  |
> | 128           | 60.813      | 81.453           | 33.9%  |
> | 256           | 23.730      | 24.633           | 3.8%   |
> | 512           | 10.268      | 10.380           | 1.1%   |
> | 1024          | 4.588       | 4.628            | 0.9%   |
> | 1518          | 4.601       | 4.669            | 1.5%   |
> 
> 
Series applied to dpdk-next-net-intel

Thanks,
/Bruce

^ permalink raw reply	[flat|nested] 13+ messages in thread
