* [PATCH 0/2] net/idpf: enable AVX2 for split queue Rx/Tx
@ 2025-09-17 5:26 Shaiq Wani
2025-09-17 5:26 ` [PATCH 1/2] net/idpf: enable AVX2 for split queue Tx Shaiq Wani
` (10 more replies)
0 siblings, 11 replies; 42+ messages in thread
From: Shaiq Wani @ 2025-09-17 5:26 UTC (permalink / raw)
To: dev, bruce.richardson, aman.deep.singh
In case some CPUs don't support AVX512, enable AVX2 for them to
get better per-core performance.
In the single queue model, the same descriptor queue is used by SW
to post descriptors to the device and used by the device to report completed
descriptors to SW, whereas the split queue model separates them into
different queues for parallel processing and improved performance.
Shaiq Wani (2):
net/idpf: enable AVX2 for split queue Tx
net/idpf: enable AVX2 for split queue Rx
drivers/net/intel/idpf/idpf_common_rxtx.h | 6 +
.../net/intel/idpf/idpf_common_rxtx_avx2.c | 448 ++++++++++++++++++
drivers/net/intel/idpf/idpf_rxtx.c | 20 +-
3 files changed, 472 insertions(+), 2 deletions(-)
--
2.34.1
^ permalink raw reply [flat|nested] 42+ messages in thread
* [PATCH 1/2] net/idpf: enable AVX2 for split queue Tx
2025-09-17 5:26 [PATCH 0/2] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
@ 2025-09-17 5:26 ` Shaiq Wani
2025-09-17 5:26 ` [PATCH 2/2] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
` (9 subsequent siblings)
10 siblings, 0 replies; 42+ messages in thread
From: Shaiq Wani @ 2025-09-17 5:26 UTC (permalink / raw)
To: dev, bruce.richardson, aman.deep.singh
In case some CPUs don't support AVX512, enable AVX2 for them to
get better per-core performance.
In the single queue model, the same descriptor queue is used by SW
to post descriptors to the device and used by the device to report completed
descriptors to SW, whereas the split queue model separates them into
different queues for parallel processing and improved performance.
Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
---
drivers/net/intel/idpf/idpf_common_rxtx.h | 3 +
.../net/intel/idpf/idpf_common_rxtx_avx2.c | 202 ++++++++++++++++++
drivers/net/intel/idpf/idpf_rxtx.c | 9 +
3 files changed, 214 insertions(+)
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.h b/drivers/net/intel/idpf/idpf_common_rxtx.h
index f84a760334..82ddcf3310 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.h
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.h
@@ -249,6 +249,9 @@ uint16_t idpf_dp_singleq_recv_pkts_avx2(void *rx_queue,
struct rte_mbuf **rx_pkts,
uint16_t nb_pkts);
__rte_internal
+uint16_t idpf_dp_splitq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts);
+__rte_internal
uint16_t idpf_dp_singleq_xmit_pkts_avx2(void *tx_queue,
struct rte_mbuf **tx_pkts,
uint16_t nb_pkts);
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
index 1babc5114b..d0c37cbfc7 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
@@ -640,3 +640,205 @@ idpf_dp_singleq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
return nb_tx;
}
+
+static __rte_always_inline void
+idpf_splitq_scan_cq_ring(struct ci_tx_queue *cq)
+{
+ struct idpf_splitq_tx_compl_desc *compl_ring;
+ struct ci_tx_queue *txq;
+ uint16_t genid, txq_qid, cq_qid, i;
+ uint8_t ctype;
+
+ cq_qid = cq->tx_tail;
+
+ for (i = 0; i < IDPD_TXQ_SCAN_CQ_THRESH; i++) {
+ if (cq_qid == cq->nb_tx_desc) {
+ cq_qid = 0;
+ cq->expected_gen_id ^= 1; /* toggle generation bit */
+ }
+
+ compl_ring = &cq->compl_ring[cq_qid];
+
+ genid = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
+ IDPF_TXD_COMPLQ_GEN_M) >> IDPF_TXD_COMPLQ_GEN_S;
+
+ if (genid != cq->expected_gen_id)
+ break;
+
+ ctype = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
+ IDPF_TXD_COMPLQ_COMPL_TYPE_M) >> IDPF_TXD_COMPLQ_COMPL_TYPE_S;
+
+ txq_qid = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
+ IDPF_TXD_COMPLQ_QID_M) >> IDPF_TXD_COMPLQ_QID_S;
+
+ txq = cq->txqs[txq_qid - cq->tx_start_qid];
+ if (ctype == IDPF_TXD_COMPLT_RS)
+ txq->rs_compl_count++;
+
+ cq_qid++;
+ }
+
+ cq->tx_tail = cq_qid;
+}
+
+static __rte_always_inline void
+idpf_splitq_vtx1_avx2(volatile struct idpf_flex_tx_sched_desc *txdp,
+ struct rte_mbuf *pkt, uint64_t flags)
+{
+ uint64_t high_qw =
+ IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE |
+ ((uint64_t)flags) |
+ ((uint64_t)pkt->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+
+ __m128i descriptor = _mm_set_epi64x(high_qw,
+ pkt->buf_iova + pkt->data_off);
+ _mm_storeu_si128(RTE_CAST_PTR(__m128i *, txdp), descriptor);
+}
+
+
+static inline void
+idpf_splitq_vtx_avx2(volatile struct idpf_flex_tx_sched_desc *txdp,
+ struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags)
+{
+ const uint64_t hi_qw_tmpl = IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE |
+ ((uint64_t)flags);
+
+ /* align if needed */
+ if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
+ idpf_splitq_vtx1_avx2(txdp, *pkt, flags);
+ txdp++, pkt++, nb_pkts--;
+ }
+
+ for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
+ uint64_t hi_qw3 = hi_qw_tmpl |
+ ((uint64_t)pkt[3]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+ uint64_t hi_qw2 = hi_qw_tmpl |
+ ((uint64_t)pkt[2]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+ uint64_t hi_qw1 = hi_qw_tmpl |
+ ((uint64_t)pkt[1]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+ uint64_t hi_qw0 = hi_qw_tmpl |
+ ((uint64_t)pkt[0]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+
+ __m256i desc2_3 = _mm256_set_epi64x(hi_qw3,
+ pkt[3]->buf_iova + pkt[3]->data_off,
+ hi_qw2,
+ pkt[2]->buf_iova + pkt[2]->data_off);
+
+ __m256i desc0_1 = _mm256_set_epi64x(hi_qw1,
+ pkt[1]->buf_iova + pkt[1]->data_off,
+ hi_qw0,
+ pkt[0]->buf_iova + pkt[0]->data_off);
+
+ _mm256_storeu_si256(RTE_CAST_PTR(__m256i *, txdp + 2), desc2_3);
+ _mm256_storeu_si256(RTE_CAST_PTR(__m256i *, txdp), desc0_1);
+ }
+
+ while (nb_pkts--) {
+ idpf_splitq_vtx1_avx2(txdp, *pkt, flags);
+ txdp++;
+ pkt++;
+ }
+}
+
+static inline uint16_t
+idpf_splitq_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts)
+{
+ struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
+ volatile struct idpf_flex_tx_sched_desc *txdp;
+ struct ci_tx_entry_vec *txep;
+ uint16_t n, nb_commit, tx_id;
+ uint64_t cmd_dtype = IDPF_TXD_FLEX_FLOW_CMD_EOP;
+
+ tx_id = txq->tx_tail;
+
+ /* restrict to max burst size */
+ nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+
+ /* make sure we have enough free space */
+ if (txq->nb_tx_free < txq->tx_free_thresh)
+ ci_tx_free_bufs_vec(txq, idpf_tx_desc_done, false);
+
+ nb_commit = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
+ nb_pkts = nb_commit;
+ if (unlikely(nb_pkts == 0))
+ return 0;
+
+ txdp = &txq->desc_ring[tx_id];
+ txep = (void *)txq->sw_ring;
+ txep += tx_id;
+
+ txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
+
+ n = (uint16_t)(txq->nb_tx_desc - tx_id);
+ if (nb_commit >= n) {
+ ci_tx_backlog_entry_vec(txep, tx_pkts, n);
+
+ idpf_splitq_vtx_avx2(txdp, tx_pkts, n - 1, cmd_dtype);
+ tx_pkts += (n - 1);
+ txdp += (n - 1);
+
+ idpf_splitq_vtx1_avx2(txdp, *tx_pkts++, cmd_dtype);
+
+ nb_commit = (uint16_t)(nb_commit - n);
+
+ tx_id = 0;
+ txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
+
+ txdp = &txq->desc_ring[tx_id];
+ txep = (void *)txq->sw_ring;
+ txep += tx_id;
+ }
+
+ ci_tx_backlog_entry_vec(txep, tx_pkts, nb_commit);
+
+ idpf_splitq_vtx_avx2(txdp, tx_pkts, nb_commit, cmd_dtype);
+
+ tx_id = (uint16_t)(tx_id + nb_commit);
+ if (tx_id > txq->tx_next_rs)
+ txq->tx_next_rs =
+ (uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh);
+
+ txq->tx_tail = tx_id;
+
+ IDPF_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+
+ return nb_pkts;
+}
+
+static __rte_always_inline uint16_t
+idpf_splitq_xmit_pkts_vec_avx2_cmn(void *tx_queue, struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts)
+{
+ struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
+ uint16_t nb_tx = 0;
+
+ while (nb_pkts) {
+ uint16_t ret, num;
+ idpf_splitq_scan_cq_ring(txq->complq);
+
+ if (txq->rs_compl_count > txq->tx_free_thresh) {
+ ci_tx_free_bufs_vec(txq, idpf_tx_desc_done, false);
+ txq->rs_compl_count -= txq->tx_rs_thresh;
+ }
+
+ num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+ ret = idpf_splitq_xmit_fixed_burst_vec_avx2(tx_queue,
+ &tx_pkts[nb_tx],
+ num);
+ nb_tx += ret;
+ nb_pkts -= ret;
+ if (ret < num)
+ break;
+ }
+
+ return nb_tx;
+}
+
+RTE_EXPORT_INTERNAL_SYMBOL(idpf_dp_splitq_xmit_pkts_avx2)
+uint16_t
+idpf_dp_splitq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts)
+{
+ return idpf_splitq_xmit_pkts_vec_avx2_cmn(tx_queue, tx_pkts, nb_pkts);
+}
diff --git a/drivers/net/intel/idpf/idpf_rxtx.c b/drivers/net/intel/idpf/idpf_rxtx.c
index 5510cbd30a..7d5d8b9c48 100644
--- a/drivers/net/intel/idpf/idpf_rxtx.c
+++ b/drivers/net/intel/idpf/idpf_rxtx.c
@@ -919,6 +919,15 @@ idpf_set_tx_function(struct rte_eth_dev *dev)
return;
}
#endif /* CC_AVX512_SUPPORT */
+ if (vport->tx_use_avx2) {
+ PMD_DRV_LOG(NOTICE,
+ "Using Split AVX2 Vector Tx (port %d).",
+ dev->data->port_id);
+ dev->tx_pkt_burst = idpf_dp_splitq_xmit_pkts_avx2;
+ dev->tx_pkt_prepare = idpf_dp_prep_pkts;
+ return;
+ }
+
}
PMD_DRV_LOG(NOTICE,
"Using Split Scalar Tx (port %d).",
--
2.34.1
^ permalink raw reply related [flat|nested] 42+ messages in thread
* [PATCH 2/2] net/idpf: enable AVX2 for split queue Rx
2025-09-17 5:26 [PATCH 0/2] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
2025-09-17 5:26 ` [PATCH 1/2] net/idpf: enable AVX2 for split queue Tx Shaiq Wani
@ 2025-09-17 5:26 ` Shaiq Wani
2025-09-17 9:51 ` [PATCH 0/2] net/idpf: enable AVX2 for split queue Rx/Tx Bruce Richardson
` (8 subsequent siblings)
10 siblings, 0 replies; 42+ messages in thread
From: Shaiq Wani @ 2025-09-17 5:26 UTC (permalink / raw)
To: dev, bruce.richardson, aman.deep.singh
In case some CPUs don't support AVX512, enable AVX2 for them to
get better per-core performance.
In the single queue model, the same descriptor queue is used by SW
to post descriptors to the device and used by the device to report completed
descriptors to SW, whereas the split queue model separates them into
different queues for parallel processing and improved performance.
Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
---
drivers/net/intel/idpf/idpf_common_rxtx.h | 3 +
.../net/intel/idpf/idpf_common_rxtx_avx2.c | 246 ++++++++++++++++++
drivers/net/intel/idpf/idpf_rxtx.c | 11 +-
3 files changed, 258 insertions(+), 2 deletions(-)
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.h b/drivers/net/intel/idpf/idpf_common_rxtx.h
index 82ddcf3310..d7c0e91256 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.h
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.h
@@ -242,6 +242,9 @@ __rte_internal
uint16_t idpf_dp_splitq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
uint16_t nb_pkts);
__rte_internal
+uint16_t idpf_dp_splitq_recv_pkts_avx2(void *rxq, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts);
+__rte_internal
uint16_t idpf_dp_singleq_recv_scatter_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
uint16_t nb_pkts);
__rte_internal
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
index d0c37cbfc7..cef13b3249 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
@@ -482,6 +482,252 @@ idpf_dp_singleq_recv_pkts_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, uint16
return _idpf_singleq_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts);
}
+static __rte_always_inline void
+idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq)
+{
+ int i;
+ uint16_t rx_id;
+ volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
+ struct rte_mbuf **rxep = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
+
+ rxdp += rx_bufq->rxrearm_start;
+
+ /* Try to bulk allocate mbufs from mempool */
+ if (rte_mempool_get_bulk(rx_bufq->mp,
+ (void **)rxep,
+ IDPF_RXQ_REARM_THRESH) < 0) {
+ if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >= rx_bufq->nb_rx_desc) {
+ __m128i zero_dma = _mm_setzero_si128();
+
+ for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
+ rxep[i] = &rx_bufq->fake_mbuf;
+ _mm_storeu_si128((__m128i *)(uintptr_t)&rxdp[i], zero_dma);
+ }
+ }
+ rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
+ IDPF_RXQ_REARM_THRESH,
+ rte_memory_order_relaxed);
+ return;
+ }
+
+ __m128i headroom = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM, RTE_PKTMBUF_HEADROOM);
+
+ for (i = 0; i < IDPF_RXQ_REARM_THRESH; i += 2, rxep += 2, rxdp += 2) {
+ struct rte_mbuf *mb0 = rxep[0];
+ struct rte_mbuf *mb1 = rxep[1];
+
+ __m128i buf_addr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+ __m128i buf_addr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
+
+ __m128i dma_addr0 = _mm_unpackhi_epi64(buf_addr0, buf_addr0);
+ __m128i dma_addr1 = _mm_unpackhi_epi64(buf_addr1, buf_addr1);
+
+ dma_addr0 = _mm_add_epi64(dma_addr0, headroom);
+ dma_addr1 = _mm_add_epi64(dma_addr1, headroom);
+
+ rxdp[0].split_rd.pkt_addr = _mm_cvtsi128_si64(dma_addr0);
+ rxdp[1].split_rd.pkt_addr = _mm_cvtsi128_si64(dma_addr1);
+ }
+
+ rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
+ if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
+ rx_bufq->rxrearm_start = 0;
+
+ rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
+
+ rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
+ (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
+
+ IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
+}
+
+static __rte_always_inline void
+idpf_splitq_rearm_avx2(struct idpf_rx_queue *rx_bufq)
+{
+ int i;
+ uint16_t rx_id;
+ volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
+ struct rte_mempool_cache *cache =
+ rte_mempool_default_cache(rx_bufq->mp, rte_lcore_id());
+ struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
+
+ rxdp += rx_bufq->rxrearm_start;
+
+ if (unlikely(!cache)) {
+ idpf_splitq_rearm_common(rx_bufq);
+ return;
+ }
+
+ if (cache->len < IDPF_RXQ_REARM_THRESH) {
+ uint32_t req = IDPF_RXQ_REARM_THRESH + (cache->size - cache->len);
+ int ret = rte_mempool_ops_dequeue_bulk(rx_bufq->mp,
+ &cache->objs[cache->len], req);
+ if (ret == 0) {
+ cache->len += req;
+ } else {
+ if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
+ rx_bufq->nb_rx_desc) {
+ __m128i dma_addr0 = _mm_setzero_si128();
+ for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
+ rxp[i] = &rx_bufq->fake_mbuf;
+ _mm_storeu_si128(RTE_CAST_PTR(__m128i *, &rxdp[i]),
+ dma_addr0);
+ }
+ }
+ rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
+ IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
+ return;
+ }
+ }
+ __m128i headroom = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM, RTE_PKTMBUF_HEADROOM);
+ const int step = 2;
+
+ for (i = 0; i < IDPF_RXQ_REARM_THRESH; i += step, rxp += step, rxdp += step) {
+ struct rte_mbuf *mb0 = (struct rte_mbuf *)cache->objs[--cache->len];
+ struct rte_mbuf *mb1 = (struct rte_mbuf *)cache->objs[--cache->len];
+ rxp[0] = mb0;
+ rxp[1] = mb1;
+
+ __m128i buf_addr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+ __m128i buf_addr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
+
+ __m128i dma_addr0 = _mm_unpackhi_epi64(buf_addr0, buf_addr0);
+ __m128i dma_addr1 = _mm_unpackhi_epi64(buf_addr1, buf_addr1);
+
+ dma_addr0 = _mm_add_epi64(dma_addr0, headroom);
+ dma_addr1 = _mm_add_epi64(dma_addr1, headroom);
+
+ rxdp[0].split_rd.pkt_addr = _mm_cvtsi128_si64(dma_addr0);
+ rxdp[1].split_rd.pkt_addr = _mm_cvtsi128_si64(dma_addr1);
+ }
+
+ rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
+ if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
+ rx_bufq->rxrearm_start = 0;
+
+ rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
+
+ rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
+ (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
+
+ IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
+}
+static __rte_always_inline uint16_t
+_idpf_splitq_recv_raw_pkts_vec_avx2(struct idpf_rx_queue *rxq,
+ struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+ const uint32_t *ptype_tbl = rxq->adapter->ptype_tbl;
+ struct rte_mbuf **sw_ring = &rxq->bufq2->sw_ring[rxq->rx_tail];
+ volatile union virtchnl2_rx_desc *rxdp =
+ (volatile union virtchnl2_rx_desc *)rxq->rx_ring + rxq->rx_tail;
+
+ rte_prefetch0(rxdp);
+ nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, 4); /* 4 desc per AVX2 iteration */
+
+ if (rxq->bufq2->rxrearm_nb > IDPF_RXQ_REARM_THRESH)
+ idpf_splitq_rearm_avx2(rxq->bufq2);
+
+ uint64_t head_gen = rxdp->flex_adv_nic_3_wb.pktlen_gen_bufq_id;
+ if (((head_gen >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
+ VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) != rxq->expected_gen_id)
+ return 0;
+
+ const __m128i gen_mask =
+ _mm_set1_epi64x(((uint64_t)rxq->expected_gen_id) << 46);
+
+ uint16_t received = 0;
+ for (uint16_t i = 0; i < nb_pkts; i += 4, rxdp += 4) {
+ /* Step 1: pull mbufs */
+ __m128i ptrs = _mm_loadu_si128((__m128i *)&sw_ring[i]);
+ _mm_storeu_si128((__m128i *)&rx_pkts[i], ptrs);
+
+ /* Step 2: load descriptors */
+ __m128i d0 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[0]));
+ rte_compiler_barrier();
+ __m128i d1 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[1]));
+ rte_compiler_barrier();
+ __m128i d2 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[2]));
+ rte_compiler_barrier();
+ __m128i d3 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[3]));
+
+ /* Step 3: shuffle out pkt_len, data_len, vlan, rss */
+ const __m256i shuf = _mm256_set_epi8(
+ /* descriptor 3 */
+ 0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4,
+ 0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF,
+ /* descriptor 2 */
+ 0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4,
+ 0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF
+ );
+ __m128i d01_lo = d0, d01_hi = d1;
+ __m128i d23_lo = d2, d23_hi = d3;
+
+ __m256i m23 = _mm256_shuffle_epi8(_mm256_set_m128i(d23_hi, d23_lo), shuf);
+ __m256i m01 = _mm256_shuffle_epi8(_mm256_set_m128i(d01_hi, d01_lo), shuf);
+
+ /* Step 4: extract ptypes */
+ const __m256i ptype_mask = _mm256_set1_epi16(VIRTCHNL2_RX_FLEX_DESC_PTYPE_M);
+ __m256i pt23 = _mm256_and_si256(_mm256_set_m128i(d23_hi, d23_lo), ptype_mask);
+ __m256i pt01 = _mm256_and_si256(_mm256_set_m128i(d01_hi, d01_lo), ptype_mask);
+
+ uint16_t ptype2 = _mm256_extract_epi16(pt23, 1);
+ uint16_t ptype3 = _mm256_extract_epi16(pt23, 9);
+ uint16_t ptype0 = _mm256_extract_epi16(pt01, 1);
+ uint16_t ptype1 = _mm256_extract_epi16(pt01, 9);
+
+ m23 = _mm256_insert_epi32(m23, ptype_tbl[ptype3], 2);
+ m23 = _mm256_insert_epi32(m23, ptype_tbl[ptype2], 0);
+ m01 = _mm256_insert_epi32(m01, ptype_tbl[ptype1], 2);
+ m01 = _mm256_insert_epi32(m01, ptype_tbl[ptype0], 0);
+
+ /* Step 5: extract gen bits */
+ __m128i sts0 = _mm_srli_epi64(d0, 46);
+ __m128i sts1 = _mm_srli_epi64(d1, 46);
+ __m128i sts2 = _mm_srli_epi64(d2, 46);
+ __m128i sts3 = _mm_srli_epi64(d3, 46);
+
+ __m128i merged_lo = _mm_unpacklo_epi64(sts0, sts2);
+ __m128i merged_hi = _mm_unpacklo_epi64(sts1, sts3);
+ __m128i valid = _mm_and_si128(_mm_and_si128(merged_lo, merged_hi),
+ _mm_unpacklo_epi64(gen_mask, gen_mask));
+ __m128i cmp = _mm_cmpeq_epi64(valid, _mm_unpacklo_epi64(gen_mask, gen_mask));
+ int burst = _mm_movemask_pd(_mm_castsi128_pd(cmp));
+
+ /* Step 6: write rearm_data safely */
+ __m128i m01_lo = _mm256_castsi256_si128(m01);
+ __m128i m23_lo = _mm256_castsi256_si128(m23);
+
+ *(uint64_t *)&rx_pkts[i]->rearm_data = _mm_extract_epi64(m01_lo, 0);
+ *(uint64_t *)&rx_pkts[i + 1]->rearm_data = _mm_extract_epi64(m01_lo, 1);
+ *(uint64_t *)&rx_pkts[i + 2]->rearm_data = _mm_extract_epi64(m23_lo, 0);
+ *(uint64_t *)&rx_pkts[i + 3]->rearm_data = _mm_extract_epi64(m23_lo, 1);
+
+ received += burst;
+ if (burst != 4)
+ break;
+ }
+
+ rxq->rx_tail += received;
+ if (received & 1) {
+ rxq->rx_tail &= ~(uint16_t)1;
+ received--;
+ }
+ rxq->rx_tail &= (rxq->nb_rx_desc - 1);
+ rxq->expected_gen_id ^= ((rxq->rx_tail & rxq->nb_rx_desc) != 0);
+ rxq->bufq2->rxrearm_nb += received;
+
+ return received;
+}
+
+RTE_EXPORT_INTERNAL_SYMBOL(idpf_dp_splitq_recv_pkts_avx2)
+uint16_t
+idpf_dp_splitq_recv_pkts_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
+{
+ return _idpf_splitq_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts);
+}
+
+
static inline void
idpf_singleq_vtx1(volatile struct idpf_base_tx_desc *txdp,
struct rte_mbuf *pkt, uint64_t flags)
diff --git a/drivers/net/intel/idpf/idpf_rxtx.c b/drivers/net/intel/idpf/idpf_rxtx.c
index 7d5d8b9c48..413902ca21 100644
--- a/drivers/net/intel/idpf/idpf_rxtx.c
+++ b/drivers/net/intel/idpf/idpf_rxtx.c
@@ -803,10 +803,17 @@ idpf_set_rx_function(struct rte_eth_dev *dev)
return;
}
#endif /* CC_AVX512_SUPPORT */
+ if (vport->rx_use_avx2) {
+ PMD_DRV_LOG(NOTICE,
+ "Using Split AVX2 Vector Rx (port %d).",
+ dev->data->port_id);
+ dev->rx_pkt_burst = idpf_dp_splitq_recv_pkts_avx2;
+ return;
+ }
}
PMD_DRV_LOG(NOTICE,
- "Using Split Scalar Rx (port %d).",
- dev->data->port_id);
+ "Using Split Scalar Rx (port %d).",
+ dev->data->port_id);
dev->rx_pkt_burst = idpf_dp_splitq_recv_pkts;
} else {
if (vport->rx_vec_allowed) {
--
2.34.1
^ permalink raw reply related [flat|nested] 42+ messages in thread
* Re: [PATCH 0/2] net/idpf: enable AVX2 for split queue Rx/Tx
2025-09-17 5:26 [PATCH 0/2] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
2025-09-17 5:26 ` [PATCH 1/2] net/idpf: enable AVX2 for split queue Tx Shaiq Wani
2025-09-17 5:26 ` [PATCH 2/2] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
@ 2025-09-17 9:51 ` Bruce Richardson
2025-10-17 10:34 ` [PATCH v7 0/3] " Shaiq Wani
` (7 subsequent siblings)
10 siblings, 0 replies; 42+ messages in thread
From: Bruce Richardson @ 2025-09-17 9:51 UTC (permalink / raw)
To: Shaiq Wani; +Cc: dev, aman.deep.singh
On Wed, Sep 17, 2025 at 10:56:56AM +0530, Shaiq Wani wrote:
> In case some CPUs don't support AVX512, enable AVX2 for them to
> get better per-core performance.
>
> In the single queue model, the same descriptor queue is used by SW
> to post descriptors to the device and used by the device to report completed
> descriptors to SW, whereas the split queue model separates them into
> different queues for parallel processing and improved performance.
>
> Shaiq Wani (2):
> net/idpf: enable AVX2 for split queue Tx
> net/idpf: enable AVX2 for split queue Rx
>
The code changes in some of these patches look to conflict with changes
made in [1]. Can you please review those changes to check they are ok for
idpf/cpfl, and then rebase this series on top of them?
Thanks,
/Bruce
[1] https://patches.dpdk.org/project/dpdk/list/?series=36148
^ permalink raw reply [flat|nested] 42+ messages in thread
* [PATCH v7 0/3] net/idpf: enable AVX2 for split queue Rx/Tx
2025-09-17 5:26 [PATCH 0/2] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
` (2 preceding siblings ...)
2025-09-17 9:51 ` [PATCH 0/2] net/idpf: enable AVX2 for split queue Rx/Tx Bruce Richardson
@ 2025-10-17 10:34 ` Shaiq Wani
2025-10-17 10:34 ` [PATCH v7 1/3] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
` (2 more replies)
2025-10-27 8:07 ` [PATCH v7 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
` (6 subsequent siblings)
10 siblings, 3 replies; 42+ messages in thread
From: Shaiq Wani @ 2025-10-17 10:34 UTC (permalink / raw)
To: dev, bruce.richardson; +Cc: aman.deep.singh
In case some CPUs don't support AVX512, enable AVX2 for them to
get better per-core performance.
In the single queue model, the same descriptor queue is used by SW
to post descriptors to the device and used by the device to report completed
descriptors to SW, whereas the split queue model separates them into
different queues for parallel processing and improved performance.
v7:
*Removed (char) casts, ifdef block as suggested.
*Fixed indentation, line wrapping.
*Did blind write of data to avoid branching.
*changed desc building and writing order for Tx.
v6:
*used single load/store in splitq_recv_pkts function.
*removed x86-specific intrinsics from common code.
v5:
*Fixed CI errors.
*Used defined constants instead of numbers.
v4:
*moved splitq_rearm_common to a common location.
*reduced duplication of code.
*fixed splitq_recv_pkts function.
v3:
*Fixed some indentation issues.
*Collapsed wrapper and core function into one.
*Fixed some pointer casting and naming inconsistency issues.
v2:
*Fixed CI build related issues.
*Rebased on top of idpf/cpfl rx path selection simplification patch.
Shaiq Wani (3):
net/idpf: enable AVX2 for split queue Rx
net/idpf: enable AVX2 for split queue Tx
doc: note on unsupported completion queue sharing
doc/guides/nics/idpf.rst | 5 +
drivers/net/intel/idpf/idpf_common_device.h | 1 +
drivers/net/intel/idpf/idpf_common_rxtx.c | 59 ++++
drivers/net/intel/idpf/idpf_common_rxtx.h | 8 +
.../net/intel/idpf/idpf_common_rxtx_avx2.c | 303 +++++++++++++++++-
.../net/intel/idpf/idpf_common_rxtx_avx512.c | 56 ----
drivers/net/intel/idpf/idpf_rxtx.c | 9 +
7 files changed, 383 insertions(+), 58 deletions(-)
--
2.34.1
^ permalink raw reply [flat|nested] 42+ messages in thread
* [PATCH v7 1/3] net/idpf: enable AVX2 for split queue Rx
2025-10-17 10:34 ` [PATCH v7 0/3] " Shaiq Wani
@ 2025-10-17 10:34 ` Shaiq Wani
2025-10-17 10:34 ` [PATCH v7 2/3] net/idpf: enable AVX2 for split queue Tx Shaiq Wani
2025-10-17 10:34 ` [PATCH v7 3/3] doc: note on unsupported completion queue sharing Shaiq Wani
2 siblings, 0 replies; 42+ messages in thread
From: Shaiq Wani @ 2025-10-17 10:34 UTC (permalink / raw)
To: dev, bruce.richardson; +Cc: aman.deep.singh
In case some CPUs don't support AVX512, enable AVX2 for them to
get better per-core performance.
In the single queue model, the same descriptor queue is used by SW
to post descriptors to the device and used by the device to report completed
descriptors to SW, whereas the split queue model separates them into
different queues for parallel processing and improved performance.
Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
---
drivers/net/intel/idpf/idpf_common_device.h | 1 +
drivers/net/intel/idpf/idpf_common_rxtx.c | 59 +++++++++
drivers/net/intel/idpf/idpf_common_rxtx.h | 5 +
.../net/intel/idpf/idpf_common_rxtx_avx2.c | 123 ++++++++++++++++++
.../net/intel/idpf/idpf_common_rxtx_avx512.c | 56 --------
5 files changed, 188 insertions(+), 56 deletions(-)
diff --git a/drivers/net/intel/idpf/idpf_common_device.h b/drivers/net/intel/idpf/idpf_common_device.h
index b12bb8d9fc..fc911bf2e5 100644
--- a/drivers/net/intel/idpf/idpf_common_device.h
+++ b/drivers/net/intel/idpf/idpf_common_device.h
@@ -52,6 +52,7 @@ enum idpf_rx_func_type {
IDPF_RX_SINGLEQ,
IDPF_RX_SINGLEQ_SCATTERED,
IDPF_RX_SINGLEQ_AVX2,
+ IDPF_RX_AVX2,
IDPF_RX_AVX512,
IDPF_RX_SINGLEQ_AVX512,
IDPF_RX_MAX
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.c b/drivers/net/intel/idpf/idpf_common_rxtx.c
index a5d0795057..ab3d088899 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.c
@@ -250,6 +250,58 @@ idpf_qc_split_tx_complq_reset(struct ci_tx_queue *cq)
cq->expected_gen_id = 1;
}
+RTE_EXPORT_INTERNAL_SYMBOL(idpf_splitq_rearm_common)
+void
+idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq)
+{
+ struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
+ volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
+ uint16_t rx_id;
+ int i;
+
+ rxdp += rx_bufq->rxrearm_start;
+
+ /* Pull 'n' more MBUFs into the software ring */
+ if (rte_mbuf_raw_alloc_bulk(rx_bufq->mp,
+ (void *)rxp, IDPF_RXQ_REARM_THRESH) < 0) {
+ if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
+ rx_bufq->nb_rx_desc) {
+ for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
+ rxp[i] = &rx_bufq->fake_mbuf;
+ rxdp[i] = (union virtchnl2_rx_buf_desc){0};
+ }
+ }
+ rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
+ IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
+ return;
+ }
+
+ /* Initialize the mbufs in vector, process 8 mbufs in one loop */
+ for (i = 0; i < IDPF_RXQ_REARM_THRESH;
+ i += 8, rxp += 8, rxdp += 8) {
+ rxdp[0].split_rd.pkt_addr = rxp[0]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[1].split_rd.pkt_addr = rxp[1]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[2].split_rd.pkt_addr = rxp[2]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[3].split_rd.pkt_addr = rxp[3]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[4].split_rd.pkt_addr = rxp[4]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[5].split_rd.pkt_addr = rxp[5]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[6].split_rd.pkt_addr = rxp[6]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[7].split_rd.pkt_addr = rxp[7]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ }
+
+ rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
+ if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
+ rx_bufq->rxrearm_start = 0;
+
+ rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
+
+ rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
+ (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
+
+ /* Update the tail pointer on the NIC */
+ IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
+}
+
RTE_EXPORT_INTERNAL_SYMBOL(idpf_qc_single_tx_queue_reset)
void
idpf_qc_single_tx_queue_reset(struct ci_tx_queue *txq)
@@ -1684,6 +1736,13 @@ const struct ci_rx_path_info idpf_rx_path_infos[] = {
.rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
.simd_width = RTE_VECT_SIMD_256,
.extra.single_queue = true}},
+ [IDPF_RX_AVX2] = {
+ .pkt_burst = idpf_dp_splitq_recv_pkts_avx2,
+ .info = "Split AVX2 Vector",
+ .features = {
+ .rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
+ .simd_width = RTE_VECT_SIMD_256,
+ }},
#ifdef CC_AVX512_SUPPORT
[IDPF_RX_AVX512] = {
.pkt_burst = idpf_dp_splitq_recv_pkts_avx512,
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.h b/drivers/net/intel/idpf/idpf_common_rxtx.h
index 3bc3323af4..87f6895c4c 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.h
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.h
@@ -203,6 +203,8 @@ void idpf_qc_split_tx_descq_reset(struct ci_tx_queue *txq);
__rte_internal
void idpf_qc_split_tx_complq_reset(struct ci_tx_queue *cq);
__rte_internal
+void idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq);
+__rte_internal
void idpf_qc_single_tx_queue_reset(struct ci_tx_queue *txq);
__rte_internal
void idpf_qc_rx_queue_release(void *rxq);
@@ -252,6 +254,9 @@ __rte_internal
uint16_t idpf_dp_splitq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
uint16_t nb_pkts);
__rte_internal
+uint16_t idpf_dp_splitq_recv_pkts_avx2(void *rxq, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts);
+__rte_internal
uint16_t idpf_dp_singleq_recv_scatter_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
uint16_t nb_pkts);
__rte_internal
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
index 21c8f79254..dbbb09afd7 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
@@ -482,6 +482,129 @@ idpf_dp_singleq_recv_pkts_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, uint16
return _idpf_singleq_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts);
}
+uint16_t
+idpf_dp_splitq_recv_pkts_avx2(void *rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+ struct idpf_rx_queue *queue = (struct idpf_rx_queue *)rxq;
+ const uint32_t *ptype_tbl = queue->adapter->ptype_tbl;
+ struct rte_mbuf **sw_ring = &queue->bufq2->sw_ring[queue->rx_tail];
+ volatile union virtchnl2_rx_desc *rxdp =
+ (volatile union virtchnl2_rx_desc *)queue->rx_ring + queue->rx_tail;
+ const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0, queue->mbuf_initializer);
+
+ rte_prefetch0(rxdp);
+ nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, 4); /* 4 desc per AVX2 iteration */
+
+ if (queue->bufq2->rxrearm_nb > IDPF_RXQ_REARM_THRESH)
+ idpf_splitq_rearm_common(queue->bufq2);
+
+ /* head gen check */
+ uint64_t head_gen = rxdp->flex_adv_nic_3_wb.pktlen_gen_bufq_id;
+ if (((head_gen >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
+ VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) != queue->expected_gen_id)
+ return 0;
+
+ uint16_t received = 0;
+
+ /* Shuffle mask: picks fields from each 16-byte descriptor pair into the
+ * layout that will be merged into mbuf->rearm_data candidates.
+ */
+ const __m256i shuf = _mm256_set_epi8(
+ /* high 128 bits (desc 3 then desc 2 lanes) */
+ 0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4,
+ 0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF,
+ /* low 128 bits (desc 1 then desc 0 lanes) */
+ 0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4,
+ 0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF
+ );
+
+ /* mask that clears bits 14 and 15 of the packet length word */
+ const __m256i len_mask = _mm256_set_epi32(
+ 0xffffffff, 0xffffffff, 0xffff3fff, 0xffffffff,
+ 0xffffffff, 0xffffffff, 0xffff3fff, 0xffffffff
+ );
+
+ const __m256i ptype_mask = _mm256_set1_epi16(VIRTCHNL2_RX_FLEX_DESC_PTYPE_M);
+
+ for (uint16_t i = 0; i < nb_pkts;
+ i += IDPF_VPMD_DESCS_PER_LOOP,
+ rxdp += IDPF_VPMD_DESCS_PER_LOOP) {
+ /* copy 4 mbuf pointers into rx_pkts[] */
+ memcpy(&rx_pkts[i], &sw_ring[i], sizeof(rx_pkts[i]) * IDPF_VPMD_DESCS_PER_LOOP);
+ /* load four 128-bit descriptors */
+ __m128i d0 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[0]));
+ rte_compiler_barrier();
+ __m128i d1 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[1]));
+ rte_compiler_barrier();
+ __m128i d2 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[2]));
+ rte_compiler_barrier();
+ __m128i d3 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[3]));
+
+ /* Build 256-bit descriptor-pairs */
+ __m256i d01 = _mm256_set_m128i(d1, d0); /* low lane: d0, d1 */
+ __m256i d23 = _mm256_set_m128i(d3, d2); /* high lane: d2, d3 */
+
+ /* mask off high pkt_len bits */
+ __m256i desc01 = _mm256_and_si256(d01, len_mask);
+ __m256i desc23 = _mm256_and_si256(d23, len_mask);
+
+ /* shuffle relevant bytes into mbuf rearm candidates */
+ __m256i mb01 = _mm256_shuffle_epi8(desc01, shuf);
+ __m256i mb23 = _mm256_shuffle_epi8(desc23, shuf);
+
+ /* extract ptypes from descriptors and translate via table */
+ __m256i pt01 = _mm256_and_si256(d01, ptype_mask);
+ __m256i pt23 = _mm256_and_si256(d23, ptype_mask);
+
+ uint16_t ptype0 = (uint16_t)_mm256_extract_epi16(pt01, 1);
+ uint16_t ptype1 = (uint16_t)_mm256_extract_epi16(pt01, 9);
+ uint16_t ptype2 = (uint16_t)_mm256_extract_epi16(pt23, 1);
+ uint16_t ptype3 = (uint16_t)_mm256_extract_epi16(pt23, 9);
+
+ mb01 = _mm256_insert_epi32(mb01, (int)ptype_tbl[ptype1], 2);
+ mb01 = _mm256_insert_epi32(mb01, (int)ptype_tbl[ptype0], 0);
+ mb23 = _mm256_insert_epi32(mb23, (int)ptype_tbl[ptype3], 2);
+ mb23 = _mm256_insert_epi32(mb23, (int)ptype_tbl[ptype2], 0);
+
+ /* build rearm vectors using mb01 and mb23 directly */
+ __m256i rearm0 = _mm256_permute2f128_si256(mbuf_init, mb01, 0x20);
+ __m256i rearm1 = _mm256_blend_epi32(mbuf_init, mb01, 0xF0);
+ __m256i rearm2 = _mm256_permute2f128_si256(mbuf_init, mb23, 0x20);
+ __m256i rearm3 = _mm256_blend_epi32(mbuf_init, mb23, 0xF0);
+
+ /* blind write rearm_data for all mbufs*/
+ _mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data, rearm0);
+ _mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data, rearm1);
+ _mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data, rearm2);
+ _mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data, rearm3);
+
+ /* Check DD and GEN bits for all 4 descriptors */
+ uint16_t burst = 0;
+ for (int j = 0; j < IDPF_VPMD_DESCS_PER_LOOP; ++j) {
+ uint64_t g = rxdp[j].flex_adv_nic_3_wb.pktlen_gen_bufq_id;
+ bool dd = (g & 1ULL) != 0ULL;
+ uint64_t gen = (g >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
+ VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M;
+ if (dd && gen == queue->expected_gen_id)
+ burst++;
+ }
+ received += burst;
+ if (burst != IDPF_VPMD_DESCS_PER_LOOP)
+ break;
+ }
+ queue->rx_tail += received;
+ queue->expected_gen_id ^= ((queue->rx_tail & queue->nb_rx_desc) != 0);
+ queue->rx_tail &= (queue->nb_rx_desc - 1);
+ if ((queue->rx_tail & 1) == 1 && received > 1) {
+ queue->rx_tail--;
+ received--;
+ }
+ queue->bufq2->rxrearm_nb += received;
+ return received;
+}
+
+RTE_EXPORT_INTERNAL_SYMBOL(idpf_dp_splitq_recv_pkts_avx2)
+
static inline void
idpf_singleq_vtx1(volatile struct idpf_base_tx_desc *txdp,
struct rte_mbuf *pkt, uint64_t flags)
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c
index bc2cadd738..d3a161c763 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c
@@ -540,62 +540,6 @@ idpf_dp_singleq_recv_pkts_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
return _idpf_singleq_recv_raw_pkts_avx512(rx_queue, rx_pkts, nb_pkts);
}
-static __rte_always_inline void
-idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq)
-{
- struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
- volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
- uint16_t rx_id;
- int i;
-
- rxdp += rx_bufq->rxrearm_start;
-
- /* Pull 'n' more MBUFs into the software ring */
- if (rte_mbuf_raw_alloc_bulk(rx_bufq->mp,
- (void *)rxp,
- IDPF_RXQ_REARM_THRESH) < 0) {
- if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
- rx_bufq->nb_rx_desc) {
- __m128i dma_addr0;
-
- dma_addr0 = _mm_setzero_si128();
- for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
- rxp[i] = &rx_bufq->fake_mbuf;
- _mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp[i]),
- dma_addr0);
- }
- }
- rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
- IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
- return;
- }
-
- /* Initialize the mbufs in vector, process 8 mbufs in one loop */
- for (i = 0; i < IDPF_RXQ_REARM_THRESH;
- i += 8, rxp += 8, rxdp += 8) {
- rxdp[0].split_rd.pkt_addr = rxp[0]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[1].split_rd.pkt_addr = rxp[1]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[2].split_rd.pkt_addr = rxp[2]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[3].split_rd.pkt_addr = rxp[3]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[4].split_rd.pkt_addr = rxp[4]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[5].split_rd.pkt_addr = rxp[5]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[6].split_rd.pkt_addr = rxp[6]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[7].split_rd.pkt_addr = rxp[7]->buf_iova + RTE_PKTMBUF_HEADROOM;
- }
-
- rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
- if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
- rx_bufq->rxrearm_start = 0;
-
- rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
-
- rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
- (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
-
- /* Update the tail pointer on the NIC */
- IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
-}
-
static __rte_always_inline void
idpf_splitq_rearm(struct idpf_rx_queue *rx_bufq)
{
--
2.34.1
^ permalink raw reply related [flat|nested] 42+ messages in thread
* [PATCH v7 2/3] net/idpf: enable AVX2 for split queue Tx
2025-10-17 10:34 ` [PATCH v7 0/3] " Shaiq Wani
2025-10-17 10:34 ` [PATCH v7 1/3] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
@ 2025-10-17 10:34 ` Shaiq Wani
2025-10-17 10:34 ` [PATCH v7 3/3] doc: note on unsupported completion queue sharing Shaiq Wani
2 siblings, 0 replies; 42+ messages in thread
From: Shaiq Wani @ 2025-10-17 10:34 UTC (permalink / raw)
To: dev, bruce.richardson; +Cc: aman.deep.singh
Some CPUs do not support AVX512; enable AVX2 for them to
get better per-core performance.
In the single queue model, the same descriptor queue is used by SW
to post descriptors to the device and by the device to report completed
descriptors to SW, whereas the split queue model separates them into
different queues for parallel processing and improved performance.
Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
---
drivers/net/intel/idpf/idpf_common_rxtx.h | 3 +
.../net/intel/idpf/idpf_common_rxtx_avx2.c | 179 +++++++++++++++++-
drivers/net/intel/idpf/idpf_rxtx.c | 9 +
3 files changed, 189 insertions(+), 2 deletions(-)
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.h b/drivers/net/intel/idpf/idpf_common_rxtx.h
index 87f6895c4c..3636d55272 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.h
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.h
@@ -264,6 +264,9 @@ uint16_t idpf_dp_singleq_recv_pkts_avx2(void *rx_queue,
struct rte_mbuf **rx_pkts,
uint16_t nb_pkts);
__rte_internal
+uint16_t idpf_dp_splitq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts);
+__rte_internal
uint16_t idpf_dp_singleq_xmit_pkts_avx2(void *tx_queue,
struct rte_mbuf **tx_pkts,
uint16_t nb_pkts);
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
index dbbb09afd7..5dba7f8782 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
@@ -681,7 +681,7 @@ idpf_singleq_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts
struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
volatile struct idpf_base_tx_desc *txdp;
struct ci_tx_entry_vec *txep;
- uint16_t n, nb_commit, tx_id;
+ uint16_t n, nb_commit;
uint64_t flags = IDPF_TX_DESC_CMD_EOP;
uint64_t rs = IDPF_TX_DESC_CMD_RS | flags;
@@ -695,7 +695,7 @@ idpf_singleq_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts
if (unlikely(nb_pkts == 0))
return 0;
- tx_id = txq->tx_tail;
+ uint16_t tx_id = txq->tx_tail;
txdp = &txq->idpf_tx_ring[tx_id];
txep = &txq->sw_ring_vec[tx_id];
@@ -763,3 +763,178 @@ idpf_dp_singleq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
return nb_tx;
}
+
+static __rte_always_inline void
+idpf_splitq_scan_cq_ring(struct ci_tx_queue *cq)
+{
+ struct idpf_splitq_tx_compl_desc *compl_ring;
+ struct ci_tx_queue *txq;
+ uint16_t genid, txq_qid, cq_qid, i;
+ uint8_t ctype;
+
+ cq_qid = cq->tx_tail;
+
+ for (i = 0; i < IDPD_TXQ_SCAN_CQ_THRESH; i++) {
+ if (cq_qid == cq->nb_tx_desc) {
+ cq_qid = 0;
+ cq->expected_gen_id ^= 1; /* toggle generation bit */
+ }
+
+ compl_ring = &cq->compl_ring[cq_qid];
+
+ genid = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
+ IDPF_TXD_COMPLQ_GEN_M) >> IDPF_TXD_COMPLQ_GEN_S;
+
+ if (genid != cq->expected_gen_id)
+ break;
+
+ ctype = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
+ IDPF_TXD_COMPLQ_COMPL_TYPE_M) >> IDPF_TXD_COMPLQ_COMPL_TYPE_S;
+
+ txq_qid = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
+ IDPF_TXD_COMPLQ_QID_M) >> IDPF_TXD_COMPLQ_QID_S;
+
+ txq = cq->txqs[txq_qid - cq->tx_start_qid];
+ if (ctype == IDPF_TXD_COMPLT_RS)
+ txq->rs_compl_count++;
+
+ cq_qid++;
+ }
+
+ cq->tx_tail = cq_qid;
+}
+
+static __rte_always_inline void
+idpf_splitq_vtx1_avx2(struct idpf_flex_tx_sched_desc *txdp,
+ struct rte_mbuf *pkt, uint64_t flags)
+{
+ uint64_t high_qw =
+ IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE |
+ ((uint64_t)flags) |
+ ((uint64_t)pkt->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+
+ __m128i descriptor = _mm_set_epi64x(high_qw,
+ pkt->buf_iova + pkt->data_off);
+ _mm_storeu_si128((__m128i *)txdp, descriptor);
+}
+
+static inline void
+idpf_splitq_vtx_avx2(struct idpf_flex_tx_sched_desc *txdp,
+ struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags)
+{
+ const uint64_t hi_qw_tmpl = IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE | ((uint64_t)flags);
+
+ /* align if needed */
+ if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
+ idpf_splitq_vtx1_avx2(txdp, *pkt, flags);
+ txdp++, pkt++, nb_pkts--;
+ }
+
+ for (; nb_pkts >= IDPF_VPMD_DESCS_PER_LOOP; txdp += IDPF_VPMD_DESCS_PER_LOOP,
+ pkt += IDPF_VPMD_DESCS_PER_LOOP, nb_pkts -= IDPF_VPMD_DESCS_PER_LOOP) {
+ uint64_t hi_qw0 = hi_qw_tmpl |
+ ((uint64_t)pkt[0]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+ uint64_t hi_qw1 = hi_qw_tmpl |
+ ((uint64_t)pkt[1]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+ uint64_t hi_qw2 = hi_qw_tmpl |
+ ((uint64_t)pkt[2]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+ uint64_t hi_qw3 = hi_qw_tmpl |
+ ((uint64_t)pkt[3]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+
+ __m256i desc0_1 = _mm256_set_epi64x(hi_qw1,
+ pkt[1]->buf_iova + pkt[1]->data_off,
+ hi_qw0,
+ pkt[0]->buf_iova + pkt[0]->data_off);
+ __m256i desc2_3 = _mm256_set_epi64x(hi_qw3,
+ pkt[3]->buf_iova + pkt[3]->data_off,
+ hi_qw2,
+ pkt[2]->buf_iova + pkt[2]->data_off);
+
+ _mm256_storeu_si256((__m256i *)(txdp + 0), desc0_1);
+ _mm256_storeu_si256((__m256i *)(txdp + 2), desc2_3);
+ }
+
+ while (nb_pkts--) {
+ idpf_splitq_vtx1_avx2(txdp, *pkt, flags);
+ txdp++;
+ pkt++;
+ }
+}
+
+static inline uint16_t
+idpf_splitq_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+ struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
+ struct idpf_flex_tx_sched_desc *txdp;
+ struct ci_tx_entry_vec *txep;
+ uint16_t n, nb_commit;
+ uint64_t cmd_dtype = IDPF_TXD_FLEX_FLOW_CMD_EOP;
+ uint16_t tx_id = txq->tx_tail;
+
+ nb_commit = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
+ nb_pkts = nb_commit;
+ if (unlikely(nb_pkts == 0))
+ return 0;
+
+ txdp = (struct idpf_flex_tx_sched_desc *)&txq->desc_ring[tx_id];
+ txep = &txq->sw_ring_vec[tx_id];
+
+ txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
+
+ n = (uint16_t)(txq->nb_tx_desc - tx_id);
+ if (nb_commit >= n) {
+ ci_tx_backlog_entry_vec(txep, tx_pkts, n);
+
+ idpf_splitq_vtx_avx2(txdp, tx_pkts, n - 1, cmd_dtype);
+ tx_pkts += (n - 1);
+ txdp += (n - 1);
+
+ idpf_splitq_vtx1_avx2(txdp, *tx_pkts++, cmd_dtype);
+
+ nb_commit = (uint16_t)(nb_commit - n);
+ tx_id = 0;
+
+ txdp = &txq->desc_ring[tx_id];
+ txep = (void *)txq->sw_ring;
+ }
+
+ ci_tx_backlog_entry_vec(txep, tx_pkts, nb_commit);
+
+ idpf_splitq_vtx_avx2(txdp, tx_pkts, nb_commit, cmd_dtype);
+
+ tx_id = (uint16_t)(tx_id + nb_commit);
+ txq->tx_tail = tx_id;
+
+ IDPF_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+
+ return nb_pkts;
+}
+
+uint16_t
+idpf_dp_splitq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts)
+{
+ struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
+ uint16_t nb_tx = 0;
+
+ while (nb_pkts) {
+ uint16_t ret, num;
+ idpf_splitq_scan_cq_ring(txq->complq);
+
+ if (txq->rs_compl_count > txq->tx_free_thresh) {
+ ci_tx_free_bufs_vec(txq, idpf_tx_desc_done, false);
+ txq->rs_compl_count -= txq->tx_rs_thresh;
+ }
+
+ num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+ ret = idpf_splitq_xmit_fixed_burst_vec_avx2(tx_queue, &tx_pkts[nb_tx], num);
+ nb_tx += ret;
+ nb_pkts -= ret;
+ if (ret < num)
+ break;
+ }
+
+ return nb_tx;
+}
+
+RTE_EXPORT_INTERNAL_SYMBOL(idpf_dp_splitq_xmit_pkts_avx2)
diff --git a/drivers/net/intel/idpf/idpf_rxtx.c b/drivers/net/intel/idpf/idpf_rxtx.c
index 1c725065df..6950fabb49 100644
--- a/drivers/net/intel/idpf/idpf_rxtx.c
+++ b/drivers/net/intel/idpf/idpf_rxtx.c
@@ -850,6 +850,15 @@ idpf_set_tx_function(struct rte_eth_dev *dev)
return;
}
#endif /* CC_AVX512_SUPPORT */
+ if (tx_simd_width == RTE_VECT_SIMD_256) {
+ PMD_DRV_LOG(NOTICE,
+ "Using Split AVX2 Vector Tx (port %d).",
+ dev->data->port_id);
+ dev->tx_pkt_burst = idpf_dp_splitq_xmit_pkts_avx2;
+ dev->tx_pkt_prepare = idpf_dp_prep_pkts;
+ return;
+ }
+
}
PMD_DRV_LOG(NOTICE,
"Using Split Scalar Tx (port %d).",
--
2.34.1
^ permalink raw reply related [flat|nested] 42+ messages in thread
* [PATCH v7 3/3] doc: note on unsupported completion queue sharing
2025-10-17 10:34 ` [PATCH v7 0/3] " Shaiq Wani
2025-10-17 10:34 ` [PATCH v7 1/3] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
2025-10-17 10:34 ` [PATCH v7 2/3] net/idpf: enable AVX2 for split queue Tx Shaiq Wani
@ 2025-10-17 10:34 ` Shaiq Wani
2 siblings, 0 replies; 42+ messages in thread
From: Shaiq Wani @ 2025-10-17 10:34 UTC (permalink / raw)
To: dev, bruce.richardson; +Cc: aman.deep.singh
Added a note in the IDPF Poll Mode Driver documentation to clarify
that sharing a completion queue among multiple TX queues serviced
by different CPU cores is not supported in split queue mode.
Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
---
doc/guides/nics/idpf.rst | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/doc/guides/nics/idpf.rst b/doc/guides/nics/idpf.rst
index b99cc18188..47e38fcb13 100644
--- a/doc/guides/nics/idpf.rst
+++ b/doc/guides/nics/idpf.rst
@@ -79,6 +79,11 @@ Runtime Configuration
Then the PMD will configure Tx queue with single queue mode.
Otherwise, split queue mode is chosen by default.
+.. note::
+
+ In split queue mode, sharing a completion queue among multiple TX queues that are
+ serviced by different CPU cores is not supported.
+
Driver compilation and testing
------------------------------
--
2.34.1
^ permalink raw reply related [flat|nested] 42+ messages in thread
* [PATCH v7 0/3] net/idpf: enable AVX2 for split queue Rx/Tx
2025-09-17 5:26 [PATCH 0/2] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
` (3 preceding siblings ...)
2025-10-17 10:34 ` [PATCH v7 0/3] " Shaiq Wani
@ 2025-10-27 8:07 ` Shaiq Wani
2025-10-27 8:07 ` [PATCH v8 1/3] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
` (2 more replies)
2025-10-28 5:29 ` [PATCH v9 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
` (5 subsequent siblings)
10 siblings, 3 replies; 42+ messages in thread
From: Shaiq Wani @ 2025-10-27 8:07 UTC (permalink / raw)
To: dev, bruce.richardson; +Cc: aman.deep.singh
Some CPUs do not support AVX512; enable AVX2 for them to
get better per-core performance.
In the single queue model, the same descriptor queue is used by SW
to post descriptors to the device and by the device to report completed
descriptors to SW, whereas the split queue model separates them into
different queues for parallel processing and improved performance.
v8:
*Reworked the receive func to avoid race conditions in Rx path.
*Removed unnecessary compiler barriers.
v7:
*Removed (char) casts, ifdef block as suggested.
*Fixed indentation, line wrapping.
*Did blind write of data to avoid branching.
*changed desc building and writing order for Tx.
v6:
*used single load/store in splitq_recv_pkts function.
*removed x86-specific intrinsics from common code.
v5:
*Fixed CI errors.
*Used defined constants instead of numbers.
v4:
*moved splitq_rearm_common to a common location.
*reduced duplication of code.
*fixed splitq_recv_pkts function.
v3:
*Fixed some indentation issues.
*Collapsed wrapper and core function into one.
*Fixed some pointer casting and naming inconsistency issues.
v2:
*Fixed CI build related issues.
*Rebased on top of idpf/cpfl rx path selection simplification patch.
Shaiq Wani (3):
net/idpf: enable AVX2 for split queue Rx
net/idpf: enable AVX2 for split queue Tx
doc: note on unsupported completion queue sharing
doc/guides/nics/idpf.rst | 5 +
drivers/net/intel/idpf/idpf_common_device.h | 1 +
drivers/net/intel/idpf/idpf_common_rxtx.c | 59 ++++
drivers/net/intel/idpf/idpf_common_rxtx.h | 8 +
.../net/intel/idpf/idpf_common_rxtx_avx2.c | 303 +++++++++++++++++-
.../net/intel/idpf/idpf_common_rxtx_avx512.c | 56 ----
drivers/net/intel/idpf/idpf_rxtx.c | 9 +
7 files changed, 383 insertions(+), 58 deletions(-)
--
2.34.1
^ permalink raw reply [flat|nested] 42+ messages in thread
* [PATCH v8 1/3] net/idpf: enable AVX2 for split queue Rx
2025-10-27 8:07 ` [PATCH v7 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
@ 2025-10-27 8:07 ` Shaiq Wani
2025-10-27 8:07 ` [PATCH v8 2/3] net/idpf: enable AVX2 for split queue Tx Shaiq Wani
2025-10-27 8:07 ` [PATCH v8 3/3] doc: note on unsupported completion queue sharing Shaiq Wani
2 siblings, 0 replies; 42+ messages in thread
From: Shaiq Wani @ 2025-10-27 8:07 UTC (permalink / raw)
To: dev, bruce.richardson; +Cc: aman.deep.singh
Some CPUs do not support AVX512; enable AVX2 for them to
get better per-core performance.
In the single queue model, the same descriptor queue is used by SW
to post descriptors to the device and by the device to report completed
descriptors to SW, whereas the split queue model separates them into
different queues for parallel processing and improved performance.
Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
---
drivers/net/intel/idpf/idpf_common_device.h | 1 +
drivers/net/intel/idpf/idpf_common_rxtx.c | 59 ++++
drivers/net/intel/idpf/idpf_common_rxtx.h | 5 +
.../net/intel/idpf/idpf_common_rxtx_avx2.c | 299 +++++++++++++++++-
.../net/intel/idpf/idpf_common_rxtx_avx512.c | 56 ----
5 files changed, 362 insertions(+), 58 deletions(-)
diff --git a/drivers/net/intel/idpf/idpf_common_device.h b/drivers/net/intel/idpf/idpf_common_device.h
index b12bb8d9fc..fc911bf2e5 100644
--- a/drivers/net/intel/idpf/idpf_common_device.h
+++ b/drivers/net/intel/idpf/idpf_common_device.h
@@ -52,6 +52,7 @@ enum idpf_rx_func_type {
IDPF_RX_SINGLEQ,
IDPF_RX_SINGLEQ_SCATTERED,
IDPF_RX_SINGLEQ_AVX2,
+ IDPF_RX_AVX2,
IDPF_RX_AVX512,
IDPF_RX_SINGLEQ_AVX512,
IDPF_RX_MAX
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.c b/drivers/net/intel/idpf/idpf_common_rxtx.c
index a5d0795057..ab3d088899 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.c
@@ -250,6 +250,58 @@ idpf_qc_split_tx_complq_reset(struct ci_tx_queue *cq)
cq->expected_gen_id = 1;
}
+RTE_EXPORT_INTERNAL_SYMBOL(idpf_splitq_rearm_common)
+void
+idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq)
+{
+ struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
+ volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
+ uint16_t rx_id;
+ int i;
+
+ rxdp += rx_bufq->rxrearm_start;
+
+ /* Pull 'n' more MBUFs into the software ring */
+ if (rte_mbuf_raw_alloc_bulk(rx_bufq->mp,
+ (void *)rxp, IDPF_RXQ_REARM_THRESH) < 0) {
+ if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
+ rx_bufq->nb_rx_desc) {
+ for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
+ rxp[i] = &rx_bufq->fake_mbuf;
+ rxdp[i] = (union virtchnl2_rx_buf_desc){0};
+ }
+ }
+ rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
+ IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
+ return;
+ }
+
+ /* Initialize the mbufs in vector, process 8 mbufs in one loop */
+ for (i = 0; i < IDPF_RXQ_REARM_THRESH;
+ i += 8, rxp += 8, rxdp += 8) {
+ rxdp[0].split_rd.pkt_addr = rxp[0]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[1].split_rd.pkt_addr = rxp[1]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[2].split_rd.pkt_addr = rxp[2]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[3].split_rd.pkt_addr = rxp[3]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[4].split_rd.pkt_addr = rxp[4]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[5].split_rd.pkt_addr = rxp[5]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[6].split_rd.pkt_addr = rxp[6]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[7].split_rd.pkt_addr = rxp[7]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ }
+
+ rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
+ if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
+ rx_bufq->rxrearm_start = 0;
+
+ rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
+
+ rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
+ (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
+
+ /* Update the tail pointer on the NIC */
+ IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
+}
+
RTE_EXPORT_INTERNAL_SYMBOL(idpf_qc_single_tx_queue_reset)
void
idpf_qc_single_tx_queue_reset(struct ci_tx_queue *txq)
@@ -1684,6 +1736,13 @@ const struct ci_rx_path_info idpf_rx_path_infos[] = {
.rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
.simd_width = RTE_VECT_SIMD_256,
.extra.single_queue = true}},
+ [IDPF_RX_AVX2] = {
+ .pkt_burst = idpf_dp_splitq_recv_pkts_avx2,
+ .info = "Split AVX2 Vector",
+ .features = {
+ .rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
+ .simd_width = RTE_VECT_SIMD_256,
+ }},
#ifdef CC_AVX512_SUPPORT
[IDPF_RX_AVX512] = {
.pkt_burst = idpf_dp_splitq_recv_pkts_avx512,
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.h b/drivers/net/intel/idpf/idpf_common_rxtx.h
index 3bc3323af4..87f6895c4c 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.h
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.h
@@ -203,6 +203,8 @@ void idpf_qc_split_tx_descq_reset(struct ci_tx_queue *txq);
__rte_internal
void idpf_qc_split_tx_complq_reset(struct ci_tx_queue *cq);
__rte_internal
+void idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq);
+__rte_internal
void idpf_qc_single_tx_queue_reset(struct ci_tx_queue *txq);
__rte_internal
void idpf_qc_rx_queue_release(void *rxq);
@@ -252,6 +254,9 @@ __rte_internal
uint16_t idpf_dp_splitq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
uint16_t nb_pkts);
__rte_internal
+uint16_t idpf_dp_splitq_recv_pkts_avx2(void *rxq, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts);
+__rte_internal
uint16_t idpf_dp_singleq_recv_scatter_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
uint16_t nb_pkts);
__rte_internal
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
index 21c8f79254..d95752c97a 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
@@ -482,6 +482,126 @@ idpf_dp_singleq_recv_pkts_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, uint16
return _idpf_singleq_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts);
}
+uint16_t
+idpf_dp_splitq_recv_pkts_avx2(void *rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+ struct idpf_rx_queue *queue = (struct idpf_rx_queue *)rxq;
+ const uint32_t *ptype_tbl = queue->adapter->ptype_tbl;
+ struct rte_mbuf **sw_ring = &queue->bufq2->sw_ring[queue->rx_tail];
+ volatile union virtchnl2_rx_desc *rxdp =
+ (volatile union virtchnl2_rx_desc *)queue->rx_ring + queue->rx_tail;
+ const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0, queue->mbuf_initializer);
+
+ rte_prefetch0(rxdp);
+ nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, 4); /* 4 desc per AVX2 iteration */
+
+ if (queue->bufq2->rxrearm_nb > IDPF_RXQ_REARM_THRESH)
+ idpf_splitq_rearm_common(queue->bufq2);
+
+ /* head gen check */
+ uint64_t head_gen = rxdp->flex_adv_nic_3_wb.pktlen_gen_bufq_id;
+ if (((head_gen >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
+ VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) != queue->expected_gen_id)
+ return 0;
+
+ uint16_t received = 0;
+
+ /* Shuffle mask: picks fields from each 16-byte descriptor pair into the
+ * layout that will be merged into mbuf->rearm_data candidates.
+ */
+ const __m256i shuf = _mm256_set_epi8(
+ /* high 128 bits (desc 3 then desc 2 lanes) */
+ 0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4,
+ 0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF,
+ /* low 128 bits (desc 1 then desc 0 lanes) */
+ 0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4,
+ 0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF
+ );
+
+ /* mask that clears bits 14 and 15 of the packet length word */
+ const __m256i len_mask = _mm256_set_epi32(
+ 0xffffffff, 0xffffffff, 0xffff3fff, 0xffffffff,
+ 0xffffffff, 0xffffffff, 0xffff3fff, 0xffffffff
+ );
+
+ const __m256i ptype_mask = _mm256_set1_epi16(VIRTCHNL2_RX_FLEX_DESC_PTYPE_M);
+
+ for (int i = nb_pkts; i >= IDPF_VPMD_DESCS_PER_LOOP; i -= IDPF_VPMD_DESCS_PER_LOOP) {
+ rxdp -= IDPF_VPMD_DESCS_PER_LOOP;
+
+ /* Check DD bits */
+ bool dd0 = (rxdp[0].flex_adv_nic_3_wb.status_err0_qw1 &
+ (1U << VIRTCHNL2_RX_FLEX_DESC_ADV_STATUS0_DD_S)) != 0;
+ bool dd1 = (rxdp[1].flex_adv_nic_3_wb.status_err0_qw1 &
+ (1U << VIRTCHNL2_RX_FLEX_DESC_ADV_STATUS0_DD_S)) != 0;
+ bool dd2 = (rxdp[2].flex_adv_nic_3_wb.status_err0_qw1 &
+ (1U << VIRTCHNL2_RX_FLEX_DESC_ADV_STATUS0_DD_S)) != 0;
+ bool dd3 = (rxdp[3].flex_adv_nic_3_wb.status_err0_qw1 &
+ (1U << VIRTCHNL2_RX_FLEX_DESC_ADV_STATUS0_DD_S)) != 0;
+
+ if (!(dd0 && dd1 && dd2 && dd3))
+ break;
+
+ /* copy mbuf pointers */
+ memcpy(&rx_pkts[i - IDPF_VPMD_DESCS_PER_LOOP],
+ &sw_ring[i - IDPF_VPMD_DESCS_PER_LOOP],
+ sizeof(rx_pkts[0]) * IDPF_VPMD_DESCS_PER_LOOP);
+
+ __m128i d3 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[3]));
+ __m128i d2 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[2]));
+ __m128i d1 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[1]));
+ __m128i d0 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[0]));
+
+ __m256i d23 = _mm256_set_m128i(d3, d2);
+ __m256i d01 = _mm256_set_m128i(d1, d0);
+
+ /* mask length and shuffle to build mbuf rearm data */
+ __m256i desc01 = _mm256_and_si256(d01, len_mask);
+ __m256i desc23 = _mm256_and_si256(d23, len_mask);
+ __m256i mb01 = _mm256_shuffle_epi8(desc01, shuf);
+ __m256i mb23 = _mm256_shuffle_epi8(desc23, shuf);
+
+ /* ptype extraction */
+ __m256i pt01 = _mm256_and_si256(d01, ptype_mask);
+ __m256i pt23 = _mm256_and_si256(d23, ptype_mask);
+
+ uint16_t ptype0 = (uint16_t)_mm256_extract_epi16(pt01, 1);
+ uint16_t ptype1 = (uint16_t)_mm256_extract_epi16(pt01, 9);
+ uint16_t ptype2 = (uint16_t)_mm256_extract_epi16(pt23, 1);
+ uint16_t ptype3 = (uint16_t)_mm256_extract_epi16(pt23, 9);
+
+ mb01 = _mm256_insert_epi32(mb01, (int)ptype_tbl[ptype1], 2);
+ mb01 = _mm256_insert_epi32(mb01, (int)ptype_tbl[ptype0], 0);
+ mb23 = _mm256_insert_epi32(mb23, (int)ptype_tbl[ptype3], 2);
+ mb23 = _mm256_insert_epi32(mb23, (int)ptype_tbl[ptype2], 0);
+
+ /* build rearm data for each mbuf */
+ __m256i rearm0 = _mm256_permute2f128_si256(mbuf_init, mb01, 0x20);
+ __m256i rearm1 = _mm256_blend_epi32(mbuf_init, mb01, 0xF0);
+ __m256i rearm2 = _mm256_permute2f128_si256(mbuf_init, mb23, 0x20);
+ __m256i rearm3 = _mm256_blend_epi32(mbuf_init, mb23, 0xF0);
+
+ _mm256_storeu_si256((__m256i *)&rx_pkts[i - 4]->rearm_data, rearm0);
+ _mm256_storeu_si256((__m256i *)&rx_pkts[i - 3]->rearm_data, rearm1);
+ _mm256_storeu_si256((__m256i *)&rx_pkts[i - 2]->rearm_data, rearm2);
+ _mm256_storeu_si256((__m256i *)&rx_pkts[i - 1]->rearm_data, rearm3);
+
+ received += IDPF_VPMD_DESCS_PER_LOOP;
+ }
+
+queue->rx_tail += received;
+queue->expected_gen_id ^= ((queue->rx_tail & queue->nb_rx_desc) != 0);
+queue->rx_tail &= (queue->nb_rx_desc - 1);
+if ((queue->rx_tail & 1) == 1 && received > 1) {
+ queue->rx_tail--;
+ received--;
+}
+queue->bufq2->rxrearm_nb += received;
+return received;
+}
+
+RTE_EXPORT_INTERNAL_SYMBOL(idpf_dp_splitq_recv_pkts_avx2)
+
static inline void
idpf_singleq_vtx1(volatile struct idpf_base_tx_desc *txdp,
struct rte_mbuf *pkt, uint64_t flags)
@@ -558,7 +678,7 @@ idpf_singleq_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts
struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
volatile struct idpf_base_tx_desc *txdp;
struct ci_tx_entry_vec *txep;
- uint16_t n, nb_commit, tx_id;
+ uint16_t n, nb_commit;
uint64_t flags = IDPF_TX_DESC_CMD_EOP;
uint64_t rs = IDPF_TX_DESC_CMD_RS | flags;
@@ -572,7 +692,7 @@ idpf_singleq_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts
if (unlikely(nb_pkts == 0))
return 0;
- tx_id = txq->tx_tail;
+ uint16_t tx_id = txq->tx_tail;
txdp = &txq->idpf_tx_ring[tx_id];
txep = &txq->sw_ring_vec[tx_id];
@@ -640,3 +760,178 @@ idpf_dp_singleq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
return nb_tx;
}
+
+static __rte_always_inline void
+idpf_splitq_scan_cq_ring(struct ci_tx_queue *cq)
+{
+ struct idpf_splitq_tx_compl_desc *compl_ring;
+ struct ci_tx_queue *txq;
+ uint16_t genid, txq_qid, cq_qid, i;
+ uint8_t ctype;
+
+ cq_qid = cq->tx_tail;
+
+ for (i = 0; i < IDPD_TXQ_SCAN_CQ_THRESH; i++) {
+ if (cq_qid == cq->nb_tx_desc) {
+ cq_qid = 0;
+ cq->expected_gen_id ^= 1; /* toggle generation bit */
+ }
+
+ compl_ring = &cq->compl_ring[cq_qid];
+
+ genid = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
+ IDPF_TXD_COMPLQ_GEN_M) >> IDPF_TXD_COMPLQ_GEN_S;
+
+ if (genid != cq->expected_gen_id)
+ break;
+
+ ctype = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
+ IDPF_TXD_COMPLQ_COMPL_TYPE_M) >> IDPF_TXD_COMPLQ_COMPL_TYPE_S;
+
+ txq_qid = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
+ IDPF_TXD_COMPLQ_QID_M) >> IDPF_TXD_COMPLQ_QID_S;
+
+ txq = cq->txqs[txq_qid - cq->tx_start_qid];
+ if (ctype == IDPF_TXD_COMPLT_RS)
+ txq->rs_compl_count++;
+
+ cq_qid++;
+ }
+
+ cq->tx_tail = cq_qid;
+}
+
+static __rte_always_inline void
+idpf_splitq_vtx1_avx2(struct idpf_flex_tx_sched_desc *txdp,
+ struct rte_mbuf *pkt, uint64_t flags)
+{
+ uint64_t high_qw =
+ IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE |
+ ((uint64_t)flags) |
+ ((uint64_t)pkt->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+
+ __m128i descriptor = _mm_set_epi64x(high_qw,
+ pkt->buf_iova + pkt->data_off);
+ _mm_storeu_si128((__m128i *)txdp, descriptor);
+}
+
+static inline void
+idpf_splitq_vtx_avx2(struct idpf_flex_tx_sched_desc *txdp,
+ struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags)
+{
+ const uint64_t hi_qw_tmpl = IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE | ((uint64_t)flags);
+
+ /* align if needed */
+ if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
+ idpf_splitq_vtx1_avx2(txdp, *pkt, flags);
+ txdp++, pkt++, nb_pkts--;
+ }
+
+ for (; nb_pkts >= IDPF_VPMD_DESCS_PER_LOOP; txdp += IDPF_VPMD_DESCS_PER_LOOP,
+ pkt += IDPF_VPMD_DESCS_PER_LOOP, nb_pkts -= IDPF_VPMD_DESCS_PER_LOOP) {
+ uint64_t hi_qw0 = hi_qw_tmpl |
+ ((uint64_t)pkt[0]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+ uint64_t hi_qw1 = hi_qw_tmpl |
+ ((uint64_t)pkt[1]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+ uint64_t hi_qw2 = hi_qw_tmpl |
+ ((uint64_t)pkt[2]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+ uint64_t hi_qw3 = hi_qw_tmpl |
+ ((uint64_t)pkt[3]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+
+ __m256i desc0_1 = _mm256_set_epi64x(hi_qw1,
+ pkt[1]->buf_iova + pkt[1]->data_off,
+ hi_qw0,
+ pkt[0]->buf_iova + pkt[0]->data_off);
+ __m256i desc2_3 = _mm256_set_epi64x(hi_qw3,
+ pkt[3]->buf_iova + pkt[3]->data_off,
+ hi_qw2,
+ pkt[2]->buf_iova + pkt[2]->data_off);
+
+ _mm256_storeu_si256((__m256i *)(txdp + 0), desc0_1);
+ _mm256_storeu_si256((__m256i *)(txdp + 2), desc2_3);
+ }
+
+ while (nb_pkts--) {
+ idpf_splitq_vtx1_avx2(txdp, *pkt, flags);
+ txdp++;
+ pkt++;
+ }
+}
+
+static inline uint16_t
+idpf_splitq_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+ struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
+ struct idpf_flex_tx_sched_desc *txdp;
+ struct ci_tx_entry_vec *txep;
+ uint16_t n, nb_commit;
+ uint64_t cmd_dtype = IDPF_TXD_FLEX_FLOW_CMD_EOP;
+ uint16_t tx_id = txq->tx_tail;
+
+ nb_commit = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
+ nb_pkts = nb_commit;
+ if (unlikely(nb_pkts == 0))
+ return 0;
+
+ txdp = (struct idpf_flex_tx_sched_desc *)&txq->desc_ring[tx_id];
+ txep = &txq->sw_ring_vec[tx_id];
+
+ txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
+
+ n = (uint16_t)(txq->nb_tx_desc - tx_id);
+ if (nb_commit >= n) {
+ ci_tx_backlog_entry_vec(txep, tx_pkts, n);
+
+ idpf_splitq_vtx_avx2(txdp, tx_pkts, n - 1, cmd_dtype);
+ tx_pkts += (n - 1);
+ txdp += (n - 1);
+
+ idpf_splitq_vtx1_avx2(txdp, *tx_pkts++, cmd_dtype);
+
+ nb_commit = (uint16_t)(nb_commit - n);
+ tx_id = 0;
+
+ txdp = &txq->desc_ring[tx_id];
+ txep = (void *)txq->sw_ring;
+ }
+
+ ci_tx_backlog_entry_vec(txep, tx_pkts, nb_commit);
+
+ idpf_splitq_vtx_avx2(txdp, tx_pkts, nb_commit, cmd_dtype);
+
+ tx_id = (uint16_t)(tx_id + nb_commit);
+ txq->tx_tail = tx_id;
+
+ IDPF_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+
+ return nb_pkts;
+}
+
+uint16_t
+idpf_dp_splitq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts)
+{
+ struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
+ uint16_t nb_tx = 0;
+
+ while (nb_pkts) {
+ uint16_t ret, num;
+ idpf_splitq_scan_cq_ring(txq->complq);
+
+ if (txq->rs_compl_count > txq->tx_free_thresh) {
+ ci_tx_free_bufs_vec(txq, idpf_tx_desc_done, false);
+ txq->rs_compl_count -= txq->tx_rs_thresh;
+ }
+
+ num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+ ret = idpf_splitq_xmit_fixed_burst_vec_avx2(tx_queue, &tx_pkts[nb_tx], num);
+ nb_tx += ret;
+ nb_pkts -= ret;
+ if (ret < num)
+ break;
+ }
+
+ return nb_tx;
+}
+
+RTE_EXPORT_INTERNAL_SYMBOL(idpf_dp_splitq_xmit_pkts_avx2)
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c
index bc2cadd738..d3a161c763 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c
@@ -540,62 +540,6 @@ idpf_dp_singleq_recv_pkts_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
return _idpf_singleq_recv_raw_pkts_avx512(rx_queue, rx_pkts, nb_pkts);
}
-static __rte_always_inline void
-idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq)
-{
- struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
- volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
- uint16_t rx_id;
- int i;
-
- rxdp += rx_bufq->rxrearm_start;
-
- /* Pull 'n' more MBUFs into the software ring */
- if (rte_mbuf_raw_alloc_bulk(rx_bufq->mp,
- (void *)rxp,
- IDPF_RXQ_REARM_THRESH) < 0) {
- if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
- rx_bufq->nb_rx_desc) {
- __m128i dma_addr0;
-
- dma_addr0 = _mm_setzero_si128();
- for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
- rxp[i] = &rx_bufq->fake_mbuf;
- _mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp[i]),
- dma_addr0);
- }
- }
- rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
- IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
- return;
- }
-
- /* Initialize the mbufs in vector, process 8 mbufs in one loop */
- for (i = 0; i < IDPF_RXQ_REARM_THRESH;
- i += 8, rxp += 8, rxdp += 8) {
- rxdp[0].split_rd.pkt_addr = rxp[0]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[1].split_rd.pkt_addr = rxp[1]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[2].split_rd.pkt_addr = rxp[2]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[3].split_rd.pkt_addr = rxp[3]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[4].split_rd.pkt_addr = rxp[4]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[5].split_rd.pkt_addr = rxp[5]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[6].split_rd.pkt_addr = rxp[6]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[7].split_rd.pkt_addr = rxp[7]->buf_iova + RTE_PKTMBUF_HEADROOM;
- }
-
- rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
- if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
- rx_bufq->rxrearm_start = 0;
-
- rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
-
- rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
- (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
-
- /* Update the tail pointer on the NIC */
- IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
-}
-
static __rte_always_inline void
idpf_splitq_rearm(struct idpf_rx_queue *rx_bufq)
{
--
2.34.1
^ permalink raw reply related [flat|nested] 42+ messages in thread
* [PATCH v8 2/3] net/idpf: enable AVX2 for split queue Tx
2025-10-27 8:07 ` [PATCH v7 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
2025-10-27 8:07 ` [PATCH v8 1/3] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
@ 2025-10-27 8:07 ` Shaiq Wani
2025-10-27 8:07 ` [PATCH v8 3/3] doc: note on unsupported completion queue sharing Shaiq Wani
2 siblings, 0 replies; 42+ messages in thread
From: Shaiq Wani @ 2025-10-27 8:07 UTC (permalink / raw)
To: dev, bruce.richardson; +Cc: aman.deep.singh
Some CPUs do not support AVX512; enable AVX2 for them to
get better per-core performance.
In the single queue model, the same descriptor queue is used by SW
to post descriptors to the device and by the device to report completed
descriptors to SW. The split queue model, by contrast, separates them into
different queues for parallel processing and improved performance.
Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
---
drivers/net/intel/idpf/idpf_common_rxtx.h | 3 +++
drivers/net/intel/idpf/idpf_rxtx.c | 9 +++++++++
2 files changed, 12 insertions(+)
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.h b/drivers/net/intel/idpf/idpf_common_rxtx.h
index 87f6895c4c..3636d55272 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.h
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.h
@@ -264,6 +264,9 @@ uint16_t idpf_dp_singleq_recv_pkts_avx2(void *rx_queue,
struct rte_mbuf **rx_pkts,
uint16_t nb_pkts);
__rte_internal
+uint16_t idpf_dp_splitq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts);
+__rte_internal
uint16_t idpf_dp_singleq_xmit_pkts_avx2(void *tx_queue,
struct rte_mbuf **tx_pkts,
uint16_t nb_pkts);
diff --git a/drivers/net/intel/idpf/idpf_rxtx.c b/drivers/net/intel/idpf/idpf_rxtx.c
index 1c725065df..6950fabb49 100644
--- a/drivers/net/intel/idpf/idpf_rxtx.c
+++ b/drivers/net/intel/idpf/idpf_rxtx.c
@@ -850,6 +850,15 @@ idpf_set_tx_function(struct rte_eth_dev *dev)
return;
}
#endif /* CC_AVX512_SUPPORT */
+ if (tx_simd_width == RTE_VECT_SIMD_256) {
+ PMD_DRV_LOG(NOTICE,
+ "Using Split AVX2 Vector Tx (port %d).",
+ dev->data->port_id);
+ dev->tx_pkt_burst = idpf_dp_splitq_xmit_pkts_avx2;
+ dev->tx_pkt_prepare = idpf_dp_prep_pkts;
+ return;
+ }
+
}
PMD_DRV_LOG(NOTICE,
"Using Split Scalar Tx (port %d).",
--
2.34.1
^ permalink raw reply related [flat|nested] 42+ messages in thread
* [PATCH v8 3/3] doc: note on unsupported completion queue sharing
2025-10-27 8:07 ` [PATCH v7 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
2025-10-27 8:07 ` [PATCH v8 1/3] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
2025-10-27 8:07 ` [PATCH v8 2/3] net/idpf: enable AVX2 for split queue Tx Shaiq Wani
@ 2025-10-27 8:07 ` Shaiq Wani
2 siblings, 0 replies; 42+ messages in thread
From: Shaiq Wani @ 2025-10-27 8:07 UTC (permalink / raw)
To: dev, bruce.richardson; +Cc: aman.deep.singh
Added a note in the IDPF Poll Mode Driver documentation to clarify
that sharing a completion queue among multiple TX queues serviced
by different CPU cores is not supported in split queue mode.
Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
---
doc/guides/nics/idpf.rst | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/doc/guides/nics/idpf.rst b/doc/guides/nics/idpf.rst
index b99cc18188..47e38fcb13 100644
--- a/doc/guides/nics/idpf.rst
+++ b/doc/guides/nics/idpf.rst
@@ -79,6 +79,11 @@ Runtime Configuration
Then the PMD will configure Tx queue with single queue mode.
Otherwise, split queue mode is chosen by default.
+.. note::
+
+ In split queue mode, sharing a completion queue among multiple TX queues that are
+ serviced by different CPU cores is not supported.
+
Driver compilation and testing
------------------------------
--
2.34.1
^ permalink raw reply related [flat|nested] 42+ messages in thread
* [PATCH v9 0/3] net/idpf: enable AVX2 for split queue Rx/Tx
2025-09-17 5:26 [PATCH 0/2] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
` (4 preceding siblings ...)
2025-10-27 8:07 ` [PATCH v7 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
@ 2025-10-28 5:29 ` Shaiq Wani
2025-10-28 5:29 ` [PATCH v9 1/3] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
` (2 more replies)
2025-10-28 14:21 ` [PATCH v9 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
` (4 subsequent siblings)
10 siblings, 3 replies; 42+ messages in thread
From: Shaiq Wani @ 2025-10-28 5:29 UTC (permalink / raw)
To: dev, bruce.richardson; +Cc: aman.deep.singh
Some CPUs do not support AVX512; enable AVX2 for them to
get better per-core performance.
In the single queue model, the same descriptor queue is used by SW
to post descriptors to the device and by the device to report completed
descriptors to SW. The split queue model, by contrast, separates them into
different queues for parallel processing and improved performance.
v9:
*Some minor fixes.
v8:
*Reworked the receive func to avoid race conditions in Rx path.
*Removed unnecessary compiler barriers.
v7:
*Removed (char) casts, ifdef block as suggested.
*Fixed indentation, line wrapping.
*Did blind write of data to avoid branching.
*changed desc building and writing order for Tx.
v6:
*used single load/store in splitq_recv_pkts function.
*removed x86-specific intrinsics from common code.
v5:
*Fixed CI errors.
*Used defined constants instead of numbers.
v4:
*moved splitq_rearm_common to a common location.
*reduced duplication of code.
*fixed splitq_recv_pkts function.
v3:
*Fixed some indentation issues.
*Collapsed wrapper and core function into one.
*Fixed some pointer casting and naming inconsistency issues.
v2:
*Fixed CI build related issues.
*Rebased on top of idpf/cpfl rx path selection simplification patch.
Shaiq Wani (3):
net/idpf: enable AVX2 for split queue Rx
net/idpf: enable AVX2 for split queue Tx
doc: note on unsupported completion queue sharing
doc/guides/nics/idpf.rst | 5 +
drivers/net/intel/idpf/idpf_common_device.h | 1 +
drivers/net/intel/idpf/idpf_common_rxtx.c | 59 ++++
drivers/net/intel/idpf/idpf_common_rxtx.h | 8 +
.../net/intel/idpf/idpf_common_rxtx_avx2.c | 299 +++++++++++++++++-
.../net/intel/idpf/idpf_common_rxtx_avx512.c | 56 ----
drivers/net/intel/idpf/idpf_rxtx.c | 9 +
7 files changed, 379 insertions(+), 58 deletions(-)
--
2.34.1
^ permalink raw reply [flat|nested] 42+ messages in thread
* [PATCH v9 1/3] net/idpf: enable AVX2 for split queue Rx
2025-10-28 5:29 ` [PATCH v9 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
@ 2025-10-28 5:29 ` Shaiq Wani
2025-10-28 5:29 ` [PATCH v9 2/3] net/idpf: enable AVX2 for split queue Tx Shaiq Wani
2025-10-28 5:29 ` [PATCH v9 3/3] doc: note on unsupported completion queue sharing Shaiq Wani
2 siblings, 0 replies; 42+ messages in thread
From: Shaiq Wani @ 2025-10-28 5:29 UTC (permalink / raw)
To: dev, bruce.richardson; +Cc: aman.deep.singh
Some CPUs do not support AVX512; enable AVX2 for them to
get better per-core performance.
In the single queue model, the same descriptor queue is used by SW
to post descriptors to the device and by the device to report completed
descriptors to SW. The split queue model, by contrast, separates them into
different queues for parallel processing and improved performance.
Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
---
drivers/net/intel/idpf/idpf_common_device.h | 1 +
drivers/net/intel/idpf/idpf_common_rxtx.c | 59 +++++++++
drivers/net/intel/idpf/idpf_common_rxtx.h | 5 +
.../net/intel/idpf/idpf_common_rxtx_avx2.c | 121 ++++++++++++++++++
.../net/intel/idpf/idpf_common_rxtx_avx512.c | 56 --------
5 files changed, 186 insertions(+), 56 deletions(-)
diff --git a/drivers/net/intel/idpf/idpf_common_device.h b/drivers/net/intel/idpf/idpf_common_device.h
index c32dcfbb12..74a5495d59 100644
--- a/drivers/net/intel/idpf/idpf_common_device.h
+++ b/drivers/net/intel/idpf/idpf_common_device.h
@@ -70,6 +70,7 @@ enum idpf_rx_func_type {
IDPF_RX_SINGLEQ,
IDPF_RX_SINGLEQ_SCATTERED,
IDPF_RX_SINGLEQ_AVX2,
+ IDPF_RX_AVX2,
IDPF_RX_AVX512,
IDPF_RX_SINGLEQ_AVX512,
IDPF_RX_MAX
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.c b/drivers/net/intel/idpf/idpf_common_rxtx.c
index a5d0795057..ab3d088899 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.c
@@ -250,6 +250,58 @@ idpf_qc_split_tx_complq_reset(struct ci_tx_queue *cq)
cq->expected_gen_id = 1;
}
+RTE_EXPORT_INTERNAL_SYMBOL(idpf_splitq_rearm_common)
+void
+idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq)
+{
+ struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
+ volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
+ uint16_t rx_id;
+ int i;
+
+ rxdp += rx_bufq->rxrearm_start;
+
+ /* Pull 'n' more MBUFs into the software ring */
+ if (rte_mbuf_raw_alloc_bulk(rx_bufq->mp,
+ (void *)rxp, IDPF_RXQ_REARM_THRESH) < 0) {
+ if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
+ rx_bufq->nb_rx_desc) {
+ for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
+ rxp[i] = &rx_bufq->fake_mbuf;
+ rxdp[i] = (union virtchnl2_rx_buf_desc){0};
+ }
+ }
+ rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
+ IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
+ return;
+ }
+
+ /* Initialize the mbufs in vector, process 8 mbufs in one loop */
+ for (i = 0; i < IDPF_RXQ_REARM_THRESH;
+ i += 8, rxp += 8, rxdp += 8) {
+ rxdp[0].split_rd.pkt_addr = rxp[0]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[1].split_rd.pkt_addr = rxp[1]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[2].split_rd.pkt_addr = rxp[2]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[3].split_rd.pkt_addr = rxp[3]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[4].split_rd.pkt_addr = rxp[4]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[5].split_rd.pkt_addr = rxp[5]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[6].split_rd.pkt_addr = rxp[6]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[7].split_rd.pkt_addr = rxp[7]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ }
+
+ rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
+ if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
+ rx_bufq->rxrearm_start = 0;
+
+ rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
+
+ rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
+ (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
+
+ /* Update the tail pointer on the NIC */
+ IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
+}
+
RTE_EXPORT_INTERNAL_SYMBOL(idpf_qc_single_tx_queue_reset)
void
idpf_qc_single_tx_queue_reset(struct ci_tx_queue *txq)
@@ -1684,6 +1736,13 @@ const struct ci_rx_path_info idpf_rx_path_infos[] = {
.rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
.simd_width = RTE_VECT_SIMD_256,
.extra.single_queue = true}},
+ [IDPF_RX_AVX2] = {
+ .pkt_burst = idpf_dp_splitq_recv_pkts_avx2,
+ .info = "Split AVX2 Vector",
+ .features = {
+ .rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
+ .simd_width = RTE_VECT_SIMD_256,
+ }},
#ifdef CC_AVX512_SUPPORT
[IDPF_RX_AVX512] = {
.pkt_burst = idpf_dp_splitq_recv_pkts_avx512,
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.h b/drivers/net/intel/idpf/idpf_common_rxtx.h
index 3bc3323af4..87f6895c4c 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.h
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.h
@@ -203,6 +203,8 @@ void idpf_qc_split_tx_descq_reset(struct ci_tx_queue *txq);
__rte_internal
void idpf_qc_split_tx_complq_reset(struct ci_tx_queue *cq);
__rte_internal
+void idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq);
+__rte_internal
void idpf_qc_single_tx_queue_reset(struct ci_tx_queue *txq);
__rte_internal
void idpf_qc_rx_queue_release(void *rxq);
@@ -252,6 +254,9 @@ __rte_internal
uint16_t idpf_dp_splitq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
uint16_t nb_pkts);
__rte_internal
+uint16_t idpf_dp_splitq_recv_pkts_avx2(void *rxq, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts);
+__rte_internal
uint16_t idpf_dp_singleq_recv_scatter_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
uint16_t nb_pkts);
__rte_internal
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
index 21c8f79254..e78a453a8b 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
@@ -482,6 +482,127 @@ idpf_dp_singleq_recv_pkts_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, uint16
return _idpf_singleq_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts);
}
+uint16_t
+idpf_dp_splitq_recv_pkts_avx2(void *rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+ struct idpf_rx_queue *queue = (struct idpf_rx_queue *)rxq;
+ const uint32_t *ptype_tbl = queue->adapter->ptype_tbl;
+ struct rte_mbuf **sw_ring = &queue->bufq2->sw_ring[queue->rx_tail];
+ volatile union virtchnl2_rx_desc *rxdp =
+ (volatile union virtchnl2_rx_desc *)queue->rx_ring + queue->rx_tail;
+ const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0, queue->mbuf_initializer);
+
+ rte_prefetch0(rxdp);
+ nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, 4); /* 4 desc per AVX2 iteration */
+
+ if (queue->bufq2->rxrearm_nb > IDPF_RXQ_REARM_THRESH)
+ idpf_splitq_rearm_common(queue->bufq2);
+
+ /* head gen check */
+ uint64_t head_gen = rxdp->flex_adv_nic_3_wb.pktlen_gen_bufq_id;
+ if (((head_gen >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
+ VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) != queue->expected_gen_id)
+ return 0;
+
+ uint16_t received = 0;
+
+ /* Shuffle mask: picks fields from each 16-byte descriptor pair into the
+ * layout that will be merged into mbuf->rearm_data candidates.
+ */
+
+ const __m256i shuf = _mm256_set_epi8(
+ /* high 128 bits (desc 3 then desc 2 lanes) */
+ 0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4,
+ 0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF,
+ /* low 128 bits (desc 1 then desc 0 lanes) */
+ 0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4,
+ 0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF
+ );
+
+ /* mask that clears bits 14 and 15 of the packet length word */
+ const __m256i len_mask = _mm256_set_epi32(
+ 0xffffffff, 0xffffffff, 0xffff3fff, 0xffffffff,
+ 0xffffffff, 0xffffffff, 0xffff3fff, 0xffffffff
+ );
+
+ const __m256i ptype_mask = _mm256_set1_epi16(VIRTCHNL2_RX_FLEX_DESC_PTYPE_M);
+
+ for (int i = nb_pkts; i >= IDPF_VPMD_DESCS_PER_LOOP; i -= IDPF_VPMD_DESCS_PER_LOOP) {
+ rxdp -= IDPF_VPMD_DESCS_PER_LOOP;
+
+ /* Check DD bits */
+ bool dd0 = (rxdp[0].flex_adv_nic_3_wb.status_err0_qw1 &
+ (1U << VIRTCHNL2_RX_FLEX_DESC_ADV_STATUS0_DD_S)) != 0;
+ bool dd1 = (rxdp[1].flex_adv_nic_3_wb.status_err0_qw1 &
+ (1U << VIRTCHNL2_RX_FLEX_DESC_ADV_STATUS0_DD_S)) != 0;
+ bool dd2 = (rxdp[2].flex_adv_nic_3_wb.status_err0_qw1 &
+ (1U << VIRTCHNL2_RX_FLEX_DESC_ADV_STATUS0_DD_S)) != 0;
+ bool dd3 = (rxdp[3].flex_adv_nic_3_wb.status_err0_qw1 &
+ (1U << VIRTCHNL2_RX_FLEX_DESC_ADV_STATUS0_DD_S)) != 0;
+
+ if (!(dd0 && dd1 && dd2 && dd3))
+ break;
+
+ /* copy mbuf pointers */
+ memcpy(&rx_pkts[i - IDPF_VPMD_DESCS_PER_LOOP],
+ &sw_ring[i - IDPF_VPMD_DESCS_PER_LOOP],
+ sizeof(rx_pkts[0]) * IDPF_VPMD_DESCS_PER_LOOP);
+
+ __m128i d3 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[3]));
+ __m128i d2 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[2]));
+ __m128i d1 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[1]));
+ __m128i d0 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[0]));
+
+ __m256i d23 = _mm256_set_m128i(d3, d2);
+ __m256i d01 = _mm256_set_m128i(d1, d0);
+
+ /* mask length and shuffle to build mbuf rearm data */
+ __m256i desc01 = _mm256_and_si256(d01, len_mask);
+ __m256i desc23 = _mm256_and_si256(d23, len_mask);
+ __m256i mb01 = _mm256_shuffle_epi8(desc01, shuf);
+ __m256i mb23 = _mm256_shuffle_epi8(desc23, shuf);
+
+ /* ptype extraction */
+ __m256i pt01 = _mm256_and_si256(d01, ptype_mask);
+ __m256i pt23 = _mm256_and_si256(d23, ptype_mask);
+
+ uint16_t ptype0 = (uint16_t)_mm256_extract_epi16(pt01, 1);
+ uint16_t ptype1 = (uint16_t)_mm256_extract_epi16(pt01, 9);
+ uint16_t ptype2 = (uint16_t)_mm256_extract_epi16(pt23, 1);
+ uint16_t ptype3 = (uint16_t)_mm256_extract_epi16(pt23, 9);
+
+ mb01 = _mm256_insert_epi32(mb01, (int)ptype_tbl[ptype1], 2);
+ mb01 = _mm256_insert_epi32(mb01, (int)ptype_tbl[ptype0], 0);
+ mb23 = _mm256_insert_epi32(mb23, (int)ptype_tbl[ptype3], 2);
+ mb23 = _mm256_insert_epi32(mb23, (int)ptype_tbl[ptype2], 0);
+
+ /* build rearm data for each mbuf */
+ __m256i rearm0 = _mm256_permute2f128_si256(mbuf_init, mb01, 0x20);
+ __m256i rearm1 = _mm256_blend_epi32(mbuf_init, mb01, 0xF0);
+ __m256i rearm2 = _mm256_permute2f128_si256(mbuf_init, mb23, 0x20);
+ __m256i rearm3 = _mm256_blend_epi32(mbuf_init, mb23, 0xF0);
+
+ _mm256_storeu_si256((__m256i *)&rx_pkts[i - 4]->rearm_data, rearm0);
+ _mm256_storeu_si256((__m256i *)&rx_pkts[i - 3]->rearm_data, rearm1);
+ _mm256_storeu_si256((__m256i *)&rx_pkts[i - 2]->rearm_data, rearm2);
+ _mm256_storeu_si256((__m256i *)&rx_pkts[i - 1]->rearm_data, rearm3);
+
+ received += IDPF_VPMD_DESCS_PER_LOOP;
+ }
+
+queue->rx_tail += received;
+queue->expected_gen_id ^= ((queue->rx_tail & queue->nb_rx_desc) != 0);
+queue->rx_tail &= (queue->nb_rx_desc - 1);
+if ((queue->rx_tail & 1) == 1 && received > 1) {
+ queue->rx_tail--;
+ received--;
+}
+queue->bufq2->rxrearm_nb += received;
+return received;
+}
+
+RTE_EXPORT_INTERNAL_SYMBOL(idpf_dp_splitq_recv_pkts_avx2)
+
static inline void
idpf_singleq_vtx1(volatile struct idpf_base_tx_desc *txdp,
struct rte_mbuf *pkt, uint64_t flags)
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c
index bc2cadd738..d3a161c763 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c
@@ -540,62 +540,6 @@ idpf_dp_singleq_recv_pkts_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
return _idpf_singleq_recv_raw_pkts_avx512(rx_queue, rx_pkts, nb_pkts);
}
-static __rte_always_inline void
-idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq)
-{
- struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
- volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
- uint16_t rx_id;
- int i;
-
- rxdp += rx_bufq->rxrearm_start;
-
- /* Pull 'n' more MBUFs into the software ring */
- if (rte_mbuf_raw_alloc_bulk(rx_bufq->mp,
- (void *)rxp,
- IDPF_RXQ_REARM_THRESH) < 0) {
- if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
- rx_bufq->nb_rx_desc) {
- __m128i dma_addr0;
-
- dma_addr0 = _mm_setzero_si128();
- for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
- rxp[i] = &rx_bufq->fake_mbuf;
- _mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp[i]),
- dma_addr0);
- }
- }
- rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
- IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
- return;
- }
-
- /* Initialize the mbufs in vector, process 8 mbufs in one loop */
- for (i = 0; i < IDPF_RXQ_REARM_THRESH;
- i += 8, rxp += 8, rxdp += 8) {
- rxdp[0].split_rd.pkt_addr = rxp[0]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[1].split_rd.pkt_addr = rxp[1]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[2].split_rd.pkt_addr = rxp[2]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[3].split_rd.pkt_addr = rxp[3]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[4].split_rd.pkt_addr = rxp[4]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[5].split_rd.pkt_addr = rxp[5]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[6].split_rd.pkt_addr = rxp[6]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[7].split_rd.pkt_addr = rxp[7]->buf_iova + RTE_PKTMBUF_HEADROOM;
- }
-
- rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
- if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
- rx_bufq->rxrearm_start = 0;
-
- rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
-
- rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
- (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
-
- /* Update the tail pointer on the NIC */
- IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
-}
-
static __rte_always_inline void
idpf_splitq_rearm(struct idpf_rx_queue *rx_bufq)
{
--
2.34.1
^ permalink raw reply related [flat|nested] 42+ messages in thread
* [PATCH v9 2/3] net/idpf: enable AVX2 for split queue Tx
2025-10-28 5:29 ` [PATCH v9 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
2025-10-28 5:29 ` [PATCH v9 1/3] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
@ 2025-10-28 5:29 ` Shaiq Wani
2025-10-28 5:29 ` [PATCH v9 3/3] doc: note on unsupported completion queue sharing Shaiq Wani
2 siblings, 0 replies; 42+ messages in thread
From: Shaiq Wani @ 2025-10-28 5:29 UTC (permalink / raw)
To: dev, bruce.richardson; +Cc: aman.deep.singh
Some CPUs do not support AVX512; enable AVX2 for them to
get better per-core performance.
In the single queue model, the same descriptor queue is used by SW
to post descriptors to the device and by the device to report completed
descriptors to SW. The split queue model, by contrast, separates them into
different queues for parallel processing and improved performance.
Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
---
drivers/net/intel/idpf/idpf_common_rxtx.h | 3 +
.../net/intel/idpf/idpf_common_rxtx_avx2.c | 178 +++++++++++++++++-
drivers/net/intel/idpf/idpf_rxtx.c | 9 +
3 files changed, 188 insertions(+), 2 deletions(-)
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.h b/drivers/net/intel/idpf/idpf_common_rxtx.h
index 87f6895c4c..3636d55272 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.h
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.h
@@ -264,6 +264,9 @@ uint16_t idpf_dp_singleq_recv_pkts_avx2(void *rx_queue,
struct rte_mbuf **rx_pkts,
uint16_t nb_pkts);
__rte_internal
+uint16_t idpf_dp_splitq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts);
+__rte_internal
uint16_t idpf_dp_singleq_xmit_pkts_avx2(void *tx_queue,
struct rte_mbuf **tx_pkts,
uint16_t nb_pkts);
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
index e78a453a8b..feb98eceff 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
@@ -679,7 +679,7 @@ idpf_singleq_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts
struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
volatile struct idpf_base_tx_desc *txdp;
struct ci_tx_entry_vec *txep;
- uint16_t n, nb_commit, tx_id;
+ uint16_t n, nb_commit;
uint64_t flags = IDPF_TX_DESC_CMD_EOP;
uint64_t rs = IDPF_TX_DESC_CMD_RS | flags;
@@ -693,7 +693,7 @@ idpf_singleq_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts
if (unlikely(nb_pkts == 0))
return 0;
- tx_id = txq->tx_tail;
+ uint16_t tx_id = txq->tx_tail;
txdp = &txq->idpf_tx_ring[tx_id];
txep = &txq->sw_ring_vec[tx_id];
@@ -761,3 +761,177 @@ idpf_dp_singleq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
return nb_tx;
}
+
+static __rte_always_inline void
+idpf_splitq_scan_cq_ring(struct ci_tx_queue *cq)
+{
+ struct idpf_splitq_tx_compl_desc *compl_ring;
+ struct ci_tx_queue *txq;
+ uint16_t genid, txq_qid, cq_qid, i;
+ uint8_t ctype;
+
+ cq_qid = cq->tx_tail;
+
+ for (i = 0; i < IDPD_TXQ_SCAN_CQ_THRESH; i++) {
+ if (cq_qid == cq->nb_tx_desc) {
+ cq_qid = 0;
+ cq->expected_gen_id ^= 1; /* toggle generation bit */
+ }
+
+ compl_ring = &cq->compl_ring[cq_qid];
+
+ genid = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
+ IDPF_TXD_COMPLQ_GEN_M) >> IDPF_TXD_COMPLQ_GEN_S;
+
+ if (genid != cq->expected_gen_id)
+ break;
+
+ ctype = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
+ IDPF_TXD_COMPLQ_COMPL_TYPE_M) >> IDPF_TXD_COMPLQ_COMPL_TYPE_S;
+ txq_qid = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
+ IDPF_TXD_COMPLQ_QID_M) >> IDPF_TXD_COMPLQ_QID_S;
+
+ txq = cq->txqs[txq_qid - cq->tx_start_qid];
+ if (ctype == IDPF_TXD_COMPLT_RS)
+ txq->rs_compl_count++;
+
+ cq_qid++;
+ }
+
+ cq->tx_tail = cq_qid;
+}
+
+static __rte_always_inline void
+idpf_splitq_vtx1_avx2(struct idpf_flex_tx_sched_desc *txdp,
+ struct rte_mbuf *pkt, uint64_t flags)
+{
+ uint64_t high_qw =
+ IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE |
+ ((uint64_t)flags) |
+ ((uint64_t)pkt->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+
+ __m128i descriptor = _mm_set_epi64x(high_qw,
+ pkt->buf_iova + pkt->data_off);
+ _mm_storeu_si128((__m128i *)txdp, descriptor);
+}
+
+static inline void
+idpf_splitq_vtx_avx2(struct idpf_flex_tx_sched_desc *txdp,
+ struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags)
+{
+ const uint64_t hi_qw_tmpl = IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE | ((uint64_t)flags);
+
+ /* align if needed */
+ if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
+ idpf_splitq_vtx1_avx2(txdp, *pkt, flags);
+ txdp++, pkt++, nb_pkts--;
+ }
+
+ for (; nb_pkts >= IDPF_VPMD_DESCS_PER_LOOP; txdp += IDPF_VPMD_DESCS_PER_LOOP,
+ pkt += IDPF_VPMD_DESCS_PER_LOOP, nb_pkts -= IDPF_VPMD_DESCS_PER_LOOP) {
+ uint64_t hi_qw0 = hi_qw_tmpl |
+ ((uint64_t)pkt[0]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+ uint64_t hi_qw1 = hi_qw_tmpl |
+ ((uint64_t)pkt[1]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+ uint64_t hi_qw2 = hi_qw_tmpl |
+ ((uint64_t)pkt[2]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+ uint64_t hi_qw3 = hi_qw_tmpl |
+ ((uint64_t)pkt[3]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+
+ __m256i desc0_1 = _mm256_set_epi64x(hi_qw1,
+ pkt[1]->buf_iova + pkt[1]->data_off,
+ hi_qw0,
+ pkt[0]->buf_iova + pkt[0]->data_off);
+ __m256i desc2_3 = _mm256_set_epi64x(hi_qw3,
+ pkt[3]->buf_iova + pkt[3]->data_off,
+ hi_qw2,
+ pkt[2]->buf_iova + pkt[2]->data_off);
+
+ _mm256_storeu_si256((__m256i *)(txdp + 0), desc0_1);
+ _mm256_storeu_si256((__m256i *)(txdp + 2), desc2_3);
+ }
+
+ while (nb_pkts--) {
+ idpf_splitq_vtx1_avx2(txdp, *pkt, flags);
+ txdp++;
+ pkt++;
+ }
+}
+
+static inline uint16_t
+idpf_splitq_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+ struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
+ struct idpf_flex_tx_sched_desc *txdp;
+ struct ci_tx_entry_vec *txep;
+ uint16_t n, nb_commit;
+ uint64_t cmd_dtype = IDPF_TXD_FLEX_FLOW_CMD_EOP;
+ uint16_t tx_id = txq->tx_tail;
+
+ nb_commit = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
+ nb_pkts = nb_commit;
+ if (unlikely(nb_pkts == 0))
+ return 0;
+
+ txdp = (struct idpf_flex_tx_sched_desc *)&txq->desc_ring[tx_id];
+ txep = &txq->sw_ring_vec[tx_id];
+
+ txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
+
+ n = (uint16_t)(txq->nb_tx_desc - tx_id);
+ if (nb_commit >= n) {
+ ci_tx_backlog_entry_vec(txep, tx_pkts, n);
+
+ idpf_splitq_vtx_avx2(txdp, tx_pkts, n - 1, cmd_dtype);
+ tx_pkts += (n - 1);
+ txdp += (n - 1);
+
+ idpf_splitq_vtx1_avx2(txdp, *tx_pkts++, cmd_dtype);
+
+ nb_commit = (uint16_t)(nb_commit - n);
+ tx_id = 0;
+
+ txdp = &txq->desc_ring[tx_id];
+ txep = (void *)txq->sw_ring;
+ }
+
+ ci_tx_backlog_entry_vec(txep, tx_pkts, nb_commit);
+
+ idpf_splitq_vtx_avx2(txdp, tx_pkts, nb_commit, cmd_dtype);
+
+ tx_id = (uint16_t)(tx_id + nb_commit);
+ txq->tx_tail = tx_id;
+
+ IDPF_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+
+ return nb_pkts;
+}
+
+uint16_t
+idpf_dp_splitq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts)
+{
+ struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
+ uint16_t nb_tx = 0;
+
+ while (nb_pkts) {
+ uint16_t ret, num;
+ idpf_splitq_scan_cq_ring(txq->complq);
+
+ if (txq->rs_compl_count > txq->tx_free_thresh) {
+ ci_tx_free_bufs_vec(txq, idpf_tx_desc_done, false);
+ txq->rs_compl_count -= txq->tx_rs_thresh;
+ }
+
+ num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+ ret = idpf_splitq_xmit_fixed_burst_vec_avx2(tx_queue, &tx_pkts[nb_tx], num);
+ nb_tx += ret;
+ nb_pkts -= ret;
+ if (ret < num)
+ break;
+ }
+
+ return nb_tx;
+}
+
+RTE_EXPORT_INTERNAL_SYMBOL(idpf_dp_splitq_xmit_pkts_avx2)
diff --git a/drivers/net/intel/idpf/idpf_rxtx.c b/drivers/net/intel/idpf/idpf_rxtx.c
index 1c725065df..6950fabb49 100644
--- a/drivers/net/intel/idpf/idpf_rxtx.c
+++ b/drivers/net/intel/idpf/idpf_rxtx.c
@@ -850,6 +850,15 @@ idpf_set_tx_function(struct rte_eth_dev *dev)
return;
}
#endif /* CC_AVX512_SUPPORT */
+ if (tx_simd_width == RTE_VECT_SIMD_256) {
+ PMD_DRV_LOG(NOTICE,
+ "Using Split AVX2 Vector Tx (port %d).",
+ dev->data->port_id);
+ dev->tx_pkt_burst = idpf_dp_splitq_xmit_pkts_avx2;
+ dev->tx_pkt_prepare = idpf_dp_prep_pkts;
+ return;
+ }
+
}
PMD_DRV_LOG(NOTICE,
"Using Split Scalar Tx (port %d).",
--
2.34.1
^ permalink raw reply related [flat|nested] 42+ messages in thread
* [PATCH v9 3/3] doc: note on unsupported completion queue sharing
2025-10-28 5:29 ` [PATCH v9 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
2025-10-28 5:29 ` [PATCH v9 1/3] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
2025-10-28 5:29 ` [PATCH v9 2/3] net/idpf: enable AVX2 for split queue Tx Shaiq Wani
@ 2025-10-28 5:29 ` Shaiq Wani
2 siblings, 0 replies; 42+ messages in thread
From: Shaiq Wani @ 2025-10-28 5:29 UTC (permalink / raw)
To: dev, bruce.richardson; +Cc: aman.deep.singh
Added a note in the IDPF Poll Mode Driver documentation to clarify
that sharing a completion queue among multiple TX queues serviced
by different CPU cores is not supported in split queue mode.
Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
---
doc/guides/nics/idpf.rst | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/doc/guides/nics/idpf.rst b/doc/guides/nics/idpf.rst
index b99cc18188..47e38fcb13 100644
--- a/doc/guides/nics/idpf.rst
+++ b/doc/guides/nics/idpf.rst
@@ -79,6 +79,11 @@ Runtime Configuration
Then the PMD will configure Tx queue with single queue mode.
Otherwise, split queue mode is chosen by default.
+.. note::
+
+ In split queue mode, sharing a completion queue among multiple TX queues that are
+ serviced by different CPU cores is not supported.
+
Driver compilation and testing
------------------------------
--
2.34.1
^ permalink raw reply related [flat|nested] 42+ messages in thread
* [PATCH v9 0/3] net/idpf: enable AVX2 for split queue Rx/Tx
2025-09-17 5:26 [PATCH 0/2] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
` (5 preceding siblings ...)
2025-10-28 5:29 ` [PATCH v9 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
@ 2025-10-28 14:21 ` Shaiq Wani
2025-10-28 14:22 ` [PATCH v10 1/3] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
` (2 more replies)
2026-01-05 6:22 ` [PATCH v11 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
` (3 subsequent siblings)
10 siblings, 3 replies; 42+ messages in thread
From: Shaiq Wani @ 2025-10-28 14:21 UTC (permalink / raw)
To: dev, bruce.richardson; +Cc: aman.deep.singh
In case some CPUs don't support AVX512, enable AVX2 for them to
get better per-core performance.
In the single queue model, the same descriptor queue is used by SW
to post descriptors to the device and by the device to report completed
descriptors to SW, whereas the split queue model separates them into
different queues for parallel processing and improved performance.
v10:
*Changed desc reading style in Rx path.
v9:
*Some minor fixes.
v8:
*Reworked the receive func to avoid race conditions in Rx path.
*Removed unnecessary compiler barriers.
v7:
*Removed (char) casts, ifdef block as suggested.
*Fixed indentation, line wrapping.
*Did blind write of data to avoid branching.
*changed desc building and writing order for Tx.
v6:
*used single load/store in splitq_recv_pkts function.
*removed x86-specific intrinsics from common code.
v5:
*Fixed CI errors.
*Used defined constants instead of numbers.
v4:
*moved splitq_rearm_common to a common location.
*reduced duplication of code.
*fixed splitq_recv_pkts function.
v3:
*Fixed some indentation issues.
*Collapsed wrapper and core function into one.
*Fixed some pointer casting and naming inconsistency issues.
v2:
*Fixed CI build related issues.
*Rebased on top of idpf/cpfl rx path selection simplification patch.
Shaiq Wani (3):
net/idpf: enable AVX2 for split queue Rx
net/idpf: enable AVX2 for split queue Tx
doc: note on unsupported completion queue sharing
doc/guides/nics/idpf.rst | 5 +
drivers/net/intel/idpf/idpf_common_device.h | 1 +
drivers/net/intel/idpf/idpf_common_rxtx.c | 59 ++++
drivers/net/intel/idpf/idpf_common_rxtx.h | 8 +
.../net/intel/idpf/idpf_common_rxtx_avx2.c | 299 +++++++++++++++++-
.../net/intel/idpf/idpf_common_rxtx_avx512.c | 56 ----
drivers/net/intel/idpf/idpf_rxtx.c | 9 +
7 files changed, 379 insertions(+), 58 deletions(-)
--
2.34.1
^ permalink raw reply [flat|nested] 42+ messages in thread
* [PATCH v10 1/3] net/idpf: enable AVX2 for split queue Rx
2025-10-28 14:21 ` [PATCH v9 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
@ 2025-10-28 14:22 ` Shaiq Wani
2025-12-11 12:53 ` Bruce Richardson
2025-10-28 14:22 ` [PATCH v10 2/3] net/idpf: enable AVX2 for split queue Tx Shaiq Wani
2025-10-28 14:22 ` [PATCH v10 3/3] doc: note on unsupported completion queue sharing Shaiq Wani
2 siblings, 1 reply; 42+ messages in thread
From: Shaiq Wani @ 2025-10-28 14:22 UTC (permalink / raw)
To: dev, bruce.richardson; +Cc: aman.deep.singh
In case some CPUs don't support AVX512, enable AVX2 for them to
get better per-core performance.
In the single queue model, the same descriptor queue is used by SW
to post descriptors to the device and by the device to report completed
descriptors to SW, whereas the split queue model separates them into
different queues for parallel processing and improved performance.
Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
---
drivers/net/intel/idpf/idpf_common_device.h | 1 +
drivers/net/intel/idpf/idpf_common_rxtx.c | 59 ++++++++
drivers/net/intel/idpf/idpf_common_rxtx.h | 5 +
.../net/intel/idpf/idpf_common_rxtx_avx2.c | 139 ++++++++++++++++++
.../net/intel/idpf/idpf_common_rxtx_avx512.c | 56 -------
5 files changed, 204 insertions(+), 56 deletions(-)
diff --git a/drivers/net/intel/idpf/idpf_common_device.h b/drivers/net/intel/idpf/idpf_common_device.h
index c32dcfbb12..74a5495d59 100644
--- a/drivers/net/intel/idpf/idpf_common_device.h
+++ b/drivers/net/intel/idpf/idpf_common_device.h
@@ -70,6 +70,7 @@ enum idpf_rx_func_type {
IDPF_RX_SINGLEQ,
IDPF_RX_SINGLEQ_SCATTERED,
IDPF_RX_SINGLEQ_AVX2,
+ IDPF_RX_AVX2,
IDPF_RX_AVX512,
IDPF_RX_SINGLEQ_AVX512,
IDPF_RX_MAX
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.c b/drivers/net/intel/idpf/idpf_common_rxtx.c
index a5d0795057..ab3d088899 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.c
@@ -250,6 +250,58 @@ idpf_qc_split_tx_complq_reset(struct ci_tx_queue *cq)
cq->expected_gen_id = 1;
}
+RTE_EXPORT_INTERNAL_SYMBOL(idpf_splitq_rearm_common)
+void
+idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq)
+{
+ struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
+ volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
+ uint16_t rx_id;
+ int i;
+
+ rxdp += rx_bufq->rxrearm_start;
+
+ /* Pull 'n' more MBUFs into the software ring */
+ if (rte_mbuf_raw_alloc_bulk(rx_bufq->mp,
+ (void *)rxp, IDPF_RXQ_REARM_THRESH) < 0) {
+ if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
+ rx_bufq->nb_rx_desc) {
+ for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
+ rxp[i] = &rx_bufq->fake_mbuf;
+ rxdp[i] = (union virtchnl2_rx_buf_desc){0};
+ }
+ }
+ rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
+ IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
+ return;
+ }
+
+ /* Initialize the mbufs in vector, process 8 mbufs in one loop */
+ for (i = 0; i < IDPF_RXQ_REARM_THRESH;
+ i += 8, rxp += 8, rxdp += 8) {
+ rxdp[0].split_rd.pkt_addr = rxp[0]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[1].split_rd.pkt_addr = rxp[1]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[2].split_rd.pkt_addr = rxp[2]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[3].split_rd.pkt_addr = rxp[3]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[4].split_rd.pkt_addr = rxp[4]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[5].split_rd.pkt_addr = rxp[5]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[6].split_rd.pkt_addr = rxp[6]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[7].split_rd.pkt_addr = rxp[7]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ }
+
+ rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
+ if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
+ rx_bufq->rxrearm_start = 0;
+
+ rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
+
+ rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
+ (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
+
+ /* Update the tail pointer on the NIC */
+ IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
+}
+
RTE_EXPORT_INTERNAL_SYMBOL(idpf_qc_single_tx_queue_reset)
void
idpf_qc_single_tx_queue_reset(struct ci_tx_queue *txq)
@@ -1684,6 +1736,13 @@ const struct ci_rx_path_info idpf_rx_path_infos[] = {
.rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
.simd_width = RTE_VECT_SIMD_256,
.extra.single_queue = true}},
+ [IDPF_RX_AVX2] = {
+ .pkt_burst = idpf_dp_splitq_recv_pkts_avx2,
+ .info = "Split AVX2 Vector",
+ .features = {
+ .rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
+ .simd_width = RTE_VECT_SIMD_256,
+ }},
#ifdef CC_AVX512_SUPPORT
[IDPF_RX_AVX512] = {
.pkt_burst = idpf_dp_splitq_recv_pkts_avx512,
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.h b/drivers/net/intel/idpf/idpf_common_rxtx.h
index 3bc3323af4..87f6895c4c 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.h
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.h
@@ -203,6 +203,8 @@ void idpf_qc_split_tx_descq_reset(struct ci_tx_queue *txq);
__rte_internal
void idpf_qc_split_tx_complq_reset(struct ci_tx_queue *cq);
__rte_internal
+void idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq);
+__rte_internal
void idpf_qc_single_tx_queue_reset(struct ci_tx_queue *txq);
__rte_internal
void idpf_qc_rx_queue_release(void *rxq);
@@ -252,6 +254,9 @@ __rte_internal
uint16_t idpf_dp_splitq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
uint16_t nb_pkts);
__rte_internal
+uint16_t idpf_dp_splitq_recv_pkts_avx2(void *rxq, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts);
+__rte_internal
uint16_t idpf_dp_singleq_recv_scatter_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
uint16_t nb_pkts);
__rte_internal
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
index 21c8f79254..2ea957ded8 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
@@ -482,6 +482,145 @@ idpf_dp_singleq_recv_pkts_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, uint16
return _idpf_singleq_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts);
}
+uint16_t
+idpf_dp_splitq_recv_pkts_avx2(void *rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+ struct idpf_rx_queue *queue = (struct idpf_rx_queue *)rxq;
+ const uint32_t *ptype_tbl = queue->adapter->ptype_tbl;
+ struct rte_mbuf **sw_ring = &queue->bufq2->sw_ring[queue->rx_tail];
+ volatile union virtchnl2_rx_desc *rxdp =
+ (volatile union virtchnl2_rx_desc *)queue->rx_ring + queue->rx_tail;
+ const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0, queue->mbuf_initializer);
+
+ rte_prefetch0(rxdp);
+ nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, 4); /* 4 desc per AVX2 iteration */
+
+ if (queue->bufq2->rxrearm_nb > IDPF_RXQ_REARM_THRESH)
+ idpf_splitq_rearm_common(queue->bufq2);
+
+ /* head gen check */
+ uint64_t head_gen = rxdp->flex_adv_nic_3_wb.pktlen_gen_bufq_id;
+ if (((head_gen >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
+ VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) != queue->expected_gen_id)
+ return 0;
+
+ uint16_t received = 0;
+
+ /* Shuffle mask: picks fields from each 16-byte descriptor pair into the
+ * layout that will be merged into mbuf->rearm_data candidates.
+ */
+
+ const __m256i shuf = _mm256_set_epi8(
+ /* high 128 bits (desc 3 then desc 2 lanes) */
+ 0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4,
+ 0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF,
+ /* low 128 bits (desc 1 then desc 0 lanes) */
+ 0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4,
+ 0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF
+ );
+
+ /* mask that clears bits 14 and 15 of the packet length word */
+ const __m256i len_mask = _mm256_set_epi32(
+ 0xffffffff, 0xffffffff, 0xffff3fff, 0xffffffff,
+ 0xffffffff, 0xffffffff, 0xffff3fff, 0xffffffff
+ );
+
+ const __m256i ptype_mask = _mm256_set1_epi16(VIRTCHNL2_RX_FLEX_DESC_PTYPE_M);
+
+ for (int i = nb_pkts; i >= IDPF_VPMD_DESCS_PER_LOOP; i -= IDPF_VPMD_DESCS_PER_LOOP) {
+ rxdp -= IDPF_VPMD_DESCS_PER_LOOP;
+
+ uint64_t g3 = rxdp[3].flex_adv_nic_3_wb.pktlen_gen_bufq_id;
+ uint64_t g2 = rxdp[2].flex_adv_nic_3_wb.pktlen_gen_bufq_id;
+ uint64_t g1 = rxdp[1].flex_adv_nic_3_wb.pktlen_gen_bufq_id;
+ uint64_t g0 = rxdp[0].flex_adv_nic_3_wb.pktlen_gen_bufq_id;
+
+ /* Extract DD bits */
+ bool dd3 = (g3 & 1ULL) != 0ULL;
+ bool dd2 = (g2 & 1ULL) != 0ULL;
+ bool dd1 = (g1 & 1ULL) != 0ULL;
+ bool dd0 = (g0 & 1ULL) != 0ULL;
+
+ /* Extract generation bits */
+ uint64_t gen3 = (g3 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
+ VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M;
+ uint64_t gen2 = (g2 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
+ VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M;
+ uint64_t gen1 = (g1 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
+ VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M;
+ uint64_t gen0 = (g0 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
+ VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M;
+
+ /* Validate descriptors */
+ bool valid3 = dd3 && (gen3 == queue->expected_gen_id);
+ bool valid2 = dd2 && (gen2 == queue->expected_gen_id);
+ bool valid1 = dd1 && (gen1 == queue->expected_gen_id);
+ bool valid0 = dd0 && (gen0 == queue->expected_gen_id);
+
+ if (!(valid0 && valid1 && valid2 && valid3))
+ break;
+
+ /* copy mbuf pointers */
+ memcpy(&rx_pkts[i - IDPF_VPMD_DESCS_PER_LOOP],
+ &sw_ring[i - IDPF_VPMD_DESCS_PER_LOOP],
+ sizeof(rx_pkts[0]) * IDPF_VPMD_DESCS_PER_LOOP);
+
+ __m128i d3 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[3]));
+ __m128i d2 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[2]));
+ __m128i d1 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[1]));
+ __m128i d0 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[0]));
+
+ __m256i d23 = _mm256_set_m128i(d3, d2);
+ __m256i d01 = _mm256_set_m128i(d1, d0);
+
+ /* mask length and shuffle to build mbuf rearm data */
+ __m256i desc01 = _mm256_and_si256(d01, len_mask);
+ __m256i desc23 = _mm256_and_si256(d23, len_mask);
+ __m256i mb10 = _mm256_shuffle_epi8(desc01, shuf);
+ __m256i mb32 = _mm256_shuffle_epi8(desc23, shuf);
+
+ /* Extract ptypes */
+ __m256i pt10 = _mm256_and_si256(d01, ptype_mask);
+ __m256i pt32 = _mm256_and_si256(d23, ptype_mask);
+
+ uint16_t ptype0 = (uint16_t)_mm256_extract_epi16(pt10, 1);
+ uint16_t ptype1 = (uint16_t)_mm256_extract_epi16(pt10, 9);
+ uint16_t ptype2 = (uint16_t)_mm256_extract_epi16(pt32, 1);
+ uint16_t ptype3 = (uint16_t)_mm256_extract_epi16(pt32, 9);
+
+ mb10 = _mm256_insert_epi32(mb10, (int)ptype_tbl[ptype1], 2);
+ mb10 = _mm256_insert_epi32(mb10, (int)ptype_tbl[ptype0], 0);
+ mb32 = _mm256_insert_epi32(mb32, (int)ptype_tbl[ptype3], 2);
+ mb32 = _mm256_insert_epi32(mb32, (int)ptype_tbl[ptype2], 0);
+
+ /* Build rearm data for each mbuf */
+ __m256i rearm0 = _mm256_permute2f128_si256(mbuf_init, mb10, 0x20);
+ __m256i rearm1 = _mm256_blend_epi32(mbuf_init, mb10, 0xF0);
+ __m256i rearm2 = _mm256_permute2f128_si256(mbuf_init, mb32, 0x20);
+ __m256i rearm3 = _mm256_blend_epi32(mbuf_init, mb32, 0xF0);
+
+ /* Write out mbuf rearm data */
+ _mm256_storeu_si256((__m256i *)&rx_pkts[i - 1]->rearm_data, rearm3);
+ _mm256_storeu_si256((__m256i *)&rx_pkts[i - 2]->rearm_data, rearm2);
+ _mm256_storeu_si256((__m256i *)&rx_pkts[i - 3]->rearm_data, rearm1);
+ _mm256_storeu_si256((__m256i *)&rx_pkts[i - 4]->rearm_data, rearm0);
+
+ received += IDPF_VPMD_DESCS_PER_LOOP;
+ }
+
+queue->rx_tail += received;
+queue->expected_gen_id ^= ((queue->rx_tail & queue->nb_rx_desc) != 0);
+queue->rx_tail &= (queue->nb_rx_desc - 1);
+if ((queue->rx_tail & 1) == 1 && received > 1) {
+ queue->rx_tail--;
+ received--;
+}
+queue->bufq2->rxrearm_nb += received;
+return received;
+}
+
+RTE_EXPORT_INTERNAL_SYMBOL(idpf_dp_splitq_recv_pkts_avx2)
+
static inline void
idpf_singleq_vtx1(volatile struct idpf_base_tx_desc *txdp,
struct rte_mbuf *pkt, uint64_t flags)
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c
index bc2cadd738..d3a161c763 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c
@@ -540,62 +540,6 @@ idpf_dp_singleq_recv_pkts_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
return _idpf_singleq_recv_raw_pkts_avx512(rx_queue, rx_pkts, nb_pkts);
}
-static __rte_always_inline void
-idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq)
-{
- struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
- volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
- uint16_t rx_id;
- int i;
-
- rxdp += rx_bufq->rxrearm_start;
-
- /* Pull 'n' more MBUFs into the software ring */
- if (rte_mbuf_raw_alloc_bulk(rx_bufq->mp,
- (void *)rxp,
- IDPF_RXQ_REARM_THRESH) < 0) {
- if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
- rx_bufq->nb_rx_desc) {
- __m128i dma_addr0;
-
- dma_addr0 = _mm_setzero_si128();
- for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
- rxp[i] = &rx_bufq->fake_mbuf;
- _mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp[i]),
- dma_addr0);
- }
- }
- rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
- IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
- return;
- }
-
- /* Initialize the mbufs in vector, process 8 mbufs in one loop */
- for (i = 0; i < IDPF_RXQ_REARM_THRESH;
- i += 8, rxp += 8, rxdp += 8) {
- rxdp[0].split_rd.pkt_addr = rxp[0]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[1].split_rd.pkt_addr = rxp[1]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[2].split_rd.pkt_addr = rxp[2]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[3].split_rd.pkt_addr = rxp[3]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[4].split_rd.pkt_addr = rxp[4]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[5].split_rd.pkt_addr = rxp[5]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[6].split_rd.pkt_addr = rxp[6]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[7].split_rd.pkt_addr = rxp[7]->buf_iova + RTE_PKTMBUF_HEADROOM;
- }
-
- rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
- if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
- rx_bufq->rxrearm_start = 0;
-
- rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
-
- rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
- (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
-
- /* Update the tail pointer on the NIC */
- IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
-}
-
static __rte_always_inline void
idpf_splitq_rearm(struct idpf_rx_queue *rx_bufq)
{
--
2.34.1
^ permalink raw reply related [flat|nested] 42+ messages in thread
* [PATCH v10 2/3] net/idpf: enable AVX2 for split queue Tx
2025-10-28 14:21 ` [PATCH v9 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
2025-10-28 14:22 ` [PATCH v10 1/3] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
@ 2025-10-28 14:22 ` Shaiq Wani
2025-10-28 14:22 ` [PATCH v10 3/3] doc: note on unsupported completion queue sharing Shaiq Wani
2 siblings, 0 replies; 42+ messages in thread
From: Shaiq Wani @ 2025-10-28 14:22 UTC (permalink / raw)
To: dev, bruce.richardson; +Cc: aman.deep.singh
In case some CPUs don't support AVX512, enable AVX2 for them to
get better per-core performance.
In the single queue model, the same descriptor queue is used by SW
to post descriptors to the device and by the device to report completed
descriptors to SW, whereas the split queue model separates them into
different queues for parallel processing and improved performance.
Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
---
drivers/net/intel/idpf/idpf_common_rxtx.h | 3 +
.../net/intel/idpf/idpf_common_rxtx_avx2.c | 178 +++++++++++++++++-
drivers/net/intel/idpf/idpf_rxtx.c | 9 +
3 files changed, 188 insertions(+), 2 deletions(-)
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.h b/drivers/net/intel/idpf/idpf_common_rxtx.h
index 87f6895c4c..3636d55272 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.h
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.h
@@ -264,6 +264,9 @@ uint16_t idpf_dp_singleq_recv_pkts_avx2(void *rx_queue,
struct rte_mbuf **rx_pkts,
uint16_t nb_pkts);
__rte_internal
+uint16_t idpf_dp_splitq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts);
+__rte_internal
uint16_t idpf_dp_singleq_xmit_pkts_avx2(void *tx_queue,
struct rte_mbuf **tx_pkts,
uint16_t nb_pkts);
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
index 2ea957ded8..4df9c560e0 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
@@ -697,7 +697,7 @@ idpf_singleq_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts
struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
volatile struct idpf_base_tx_desc *txdp;
struct ci_tx_entry_vec *txep;
- uint16_t n, nb_commit, tx_id;
+ uint16_t n, nb_commit;
uint64_t flags = IDPF_TX_DESC_CMD_EOP;
uint64_t rs = IDPF_TX_DESC_CMD_RS | flags;
@@ -711,7 +711,7 @@ idpf_singleq_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts
if (unlikely(nb_pkts == 0))
return 0;
- tx_id = txq->tx_tail;
+ uint16_t tx_id = txq->tx_tail;
txdp = &txq->idpf_tx_ring[tx_id];
txep = &txq->sw_ring_vec[tx_id];
@@ -779,3 +779,177 @@ idpf_dp_singleq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
return nb_tx;
}
+
+static __rte_always_inline void
+idpf_splitq_scan_cq_ring(struct ci_tx_queue *cq)
+{
+ struct idpf_splitq_tx_compl_desc *compl_ring;
+ struct ci_tx_queue *txq;
+ uint16_t genid, txq_qid, cq_qid, i;
+ uint8_t ctype;
+
+ cq_qid = cq->tx_tail;
+
+ for (i = 0; i < IDPD_TXQ_SCAN_CQ_THRESH; i++) {
+ if (cq_qid == cq->nb_tx_desc) {
+ cq_qid = 0;
+ cq->expected_gen_id ^= 1; /* toggle generation bit */
+ }
+
+ compl_ring = &cq->compl_ring[cq_qid];
+
+ genid = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
+ IDPF_TXD_COMPLQ_GEN_M) >> IDPF_TXD_COMPLQ_GEN_S;
+
+ if (genid != cq->expected_gen_id)
+ break;
+
+ ctype = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
+ IDPF_TXD_COMPLQ_COMPL_TYPE_M) >> IDPF_TXD_COMPLQ_COMPL_TYPE_S;
+ txq_qid = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
+ IDPF_TXD_COMPLQ_QID_M) >> IDPF_TXD_COMPLQ_QID_S;
+
+ txq = cq->txqs[txq_qid - cq->tx_start_qid];
+ if (ctype == IDPF_TXD_COMPLT_RS)
+ txq->rs_compl_count++;
+
+ cq_qid++;
+ }
+
+ cq->tx_tail = cq_qid;
+}
+
+static __rte_always_inline void
+idpf_splitq_vtx1_avx2(struct idpf_flex_tx_sched_desc *txdp,
+ struct rte_mbuf *pkt, uint64_t flags)
+{
+ uint64_t high_qw =
+ IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE |
+ ((uint64_t)flags) |
+ ((uint64_t)pkt->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+
+ __m128i descriptor = _mm_set_epi64x(high_qw,
+ pkt->buf_iova + pkt->data_off);
+ _mm_storeu_si128((__m128i *)txdp, descriptor);
+}
+
+static inline void
+idpf_splitq_vtx_avx2(struct idpf_flex_tx_sched_desc *txdp,
+ struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags)
+{
+ const uint64_t hi_qw_tmpl = IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE | ((uint64_t)flags);
+
+ /* align if needed */
+ if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
+ idpf_splitq_vtx1_avx2(txdp, *pkt, flags);
+ txdp++, pkt++, nb_pkts--;
+ }
+
+ for (; nb_pkts >= IDPF_VPMD_DESCS_PER_LOOP; txdp += IDPF_VPMD_DESCS_PER_LOOP,
+ pkt += IDPF_VPMD_DESCS_PER_LOOP, nb_pkts -= IDPF_VPMD_DESCS_PER_LOOP) {
+ uint64_t hi_qw0 = hi_qw_tmpl |
+ ((uint64_t)pkt[0]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+ uint64_t hi_qw1 = hi_qw_tmpl |
+ ((uint64_t)pkt[1]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+ uint64_t hi_qw2 = hi_qw_tmpl |
+ ((uint64_t)pkt[2]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+ uint64_t hi_qw3 = hi_qw_tmpl |
+ ((uint64_t)pkt[3]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+
+ __m256i desc0_1 = _mm256_set_epi64x(hi_qw1,
+ pkt[1]->buf_iova + pkt[1]->data_off,
+ hi_qw0,
+ pkt[0]->buf_iova + pkt[0]->data_off);
+ __m256i desc2_3 = _mm256_set_epi64x(hi_qw3,
+ pkt[3]->buf_iova + pkt[3]->data_off,
+ hi_qw2,
+ pkt[2]->buf_iova + pkt[2]->data_off);
+
+ _mm256_storeu_si256((__m256i *)(txdp + 0), desc0_1);
+ _mm256_storeu_si256((__m256i *)(txdp + 2), desc2_3);
+ }
+
+ while (nb_pkts--) {
+ idpf_splitq_vtx1_avx2(txdp, *pkt, flags);
+ txdp++;
+ pkt++;
+ }
+}
+
+static inline uint16_t
+idpf_splitq_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+ struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
+ struct idpf_flex_tx_sched_desc *txdp;
+ struct ci_tx_entry_vec *txep;
+ uint16_t n, nb_commit;
+ uint64_t cmd_dtype = IDPF_TXD_FLEX_FLOW_CMD_EOP;
+ uint16_t tx_id = txq->tx_tail;
+
+ nb_commit = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
+ nb_pkts = nb_commit;
+ if (unlikely(nb_pkts == 0))
+ return 0;
+
+ txdp = (struct idpf_flex_tx_sched_desc *)&txq->desc_ring[tx_id];
+ txep = &txq->sw_ring_vec[tx_id];
+
+ txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
+
+ n = (uint16_t)(txq->nb_tx_desc - tx_id);
+ if (nb_commit >= n) {
+ ci_tx_backlog_entry_vec(txep, tx_pkts, n);
+
+ idpf_splitq_vtx_avx2(txdp, tx_pkts, n - 1, cmd_dtype);
+ tx_pkts += (n - 1);
+ txdp += (n - 1);
+
+ idpf_splitq_vtx1_avx2(txdp, *tx_pkts++, cmd_dtype);
+
+ nb_commit = (uint16_t)(nb_commit - n);
+ tx_id = 0;
+
+ txdp = &txq->desc_ring[tx_id];
+ txep = (void *)txq->sw_ring;
+ }
+
+ ci_tx_backlog_entry_vec(txep, tx_pkts, nb_commit);
+
+ idpf_splitq_vtx_avx2(txdp, tx_pkts, nb_commit, cmd_dtype);
+
+ tx_id = (uint16_t)(tx_id + nb_commit);
+ txq->tx_tail = tx_id;
+
+ IDPF_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+
+ return nb_pkts;
+}
+
+uint16_t
+idpf_dp_splitq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts)
+{
+ struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
+ uint16_t nb_tx = 0;
+
+ while (nb_pkts) {
+ uint16_t ret, num;
+ idpf_splitq_scan_cq_ring(txq->complq);
+
+ if (txq->rs_compl_count > txq->tx_free_thresh) {
+ ci_tx_free_bufs_vec(txq, idpf_tx_desc_done, false);
+ txq->rs_compl_count -= txq->tx_rs_thresh;
+ }
+
+ num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+ ret = idpf_splitq_xmit_fixed_burst_vec_avx2(tx_queue, &tx_pkts[nb_tx], num);
+ nb_tx += ret;
+ nb_pkts -= ret;
+ if (ret < num)
+ break;
+ }
+
+ return nb_tx;
+}
+
+RTE_EXPORT_INTERNAL_SYMBOL(idpf_dp_splitq_xmit_pkts_avx2)
diff --git a/drivers/net/intel/idpf/idpf_rxtx.c b/drivers/net/intel/idpf/idpf_rxtx.c
index 1c725065df..6950fabb49 100644
--- a/drivers/net/intel/idpf/idpf_rxtx.c
+++ b/drivers/net/intel/idpf/idpf_rxtx.c
@@ -850,6 +850,15 @@ idpf_set_tx_function(struct rte_eth_dev *dev)
return;
}
#endif /* CC_AVX512_SUPPORT */
+ if (tx_simd_width == RTE_VECT_SIMD_256) {
+ PMD_DRV_LOG(NOTICE,
+ "Using Split AVX2 Vector Tx (port %d).",
+ dev->data->port_id);
+ dev->tx_pkt_burst = idpf_dp_splitq_xmit_pkts_avx2;
+ dev->tx_pkt_prepare = idpf_dp_prep_pkts;
+ return;
+ }
+
}
PMD_DRV_LOG(NOTICE,
"Using Split Scalar Tx (port %d).",
--
2.34.1
^ permalink raw reply related [flat|nested] 42+ messages in thread
* [PATCH v10 3/3] doc: note on unsupported completion queue sharing
2025-10-28 14:21 ` [PATCH v9 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
2025-10-28 14:22 ` [PATCH v10 1/3] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
2025-10-28 14:22 ` [PATCH v10 2/3] net/idpf: enable AVX2 for split queue Tx Shaiq Wani
@ 2025-10-28 14:22 ` Shaiq Wani
2025-12-11 13:00 ` Bruce Richardson
2 siblings, 1 reply; 42+ messages in thread
From: Shaiq Wani @ 2025-10-28 14:22 UTC (permalink / raw)
To: dev, bruce.richardson; +Cc: aman.deep.singh
Added a note in the IDPF Poll Mode Driver documentation to clarify
that sharing a completion queue among multiple TX queues serviced
by different CPU cores is not supported in split queue mode.
Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
---
doc/guides/nics/idpf.rst | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/doc/guides/nics/idpf.rst b/doc/guides/nics/idpf.rst
index b99cc18188..47e38fcb13 100644
--- a/doc/guides/nics/idpf.rst
+++ b/doc/guides/nics/idpf.rst
@@ -79,6 +79,11 @@ Runtime Configuration
Then the PMD will configure Tx queue with single queue mode.
Otherwise, split queue mode is chosen by default.
+.. note::
+
+ In split queue mode, sharing a completion queue among multiple TX queues that are
+ serviced by different CPU cores is not supported.
+
Driver compilation and testing
------------------------------
--
2.34.1
^ permalink raw reply related [flat|nested] 42+ messages in thread
* Re: [PATCH v10 1/3] net/idpf: enable AVX2 for split queue Rx
2025-10-28 14:22 ` [PATCH v10 1/3] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
@ 2025-12-11 12:53 ` Bruce Richardson
0 siblings, 0 replies; 42+ messages in thread
From: Bruce Richardson @ 2025-12-11 12:53 UTC (permalink / raw)
To: Shaiq Wani; +Cc: dev, aman.deep.singh
On Tue, Oct 28, 2025 at 07:52:00PM +0530, Shaiq Wani wrote:
> In case some CPUs don't support AVX512. Enable AVX2 for them to
> get better per-core performance.
>
> In the single queue model, the same descriptor queue is used by SW
> to post descriptors to the device and used by device to report completed
> descriptors to SW. While as the split queue model separates them into
> different queues for parallel processing and improved performance.
>
> Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
Haven't reviewed the code again in detail - I assume it's correct at this
point (V10). However, some indentation issues are present below.
Regards,
/Bruce
> ---
> drivers/net/intel/idpf/idpf_common_device.h | 1 +
> drivers/net/intel/idpf/idpf_common_rxtx.c | 59 ++++++++
> drivers/net/intel/idpf/idpf_common_rxtx.h | 5 +
> .../net/intel/idpf/idpf_common_rxtx_avx2.c | 139 ++++++++++++++++++
> .../net/intel/idpf/idpf_common_rxtx_avx512.c | 56 -------
> 5 files changed, 204 insertions(+), 56 deletions(-)
>
> diff --git a/drivers/net/intel/idpf/idpf_common_device.h b/drivers/net/intel/idpf/idpf_common_device.h
> index c32dcfbb12..74a5495d59 100644
> --- a/drivers/net/intel/idpf/idpf_common_device.h
> +++ b/drivers/net/intel/idpf/idpf_common_device.h
> @@ -70,6 +70,7 @@ enum idpf_rx_func_type {
> IDPF_RX_SINGLEQ,
> IDPF_RX_SINGLEQ_SCATTERED,
> IDPF_RX_SINGLEQ_AVX2,
> + IDPF_RX_AVX2,
> IDPF_RX_AVX512,
> IDPF_RX_SINGLEQ_AVX512,
> IDPF_RX_MAX
> diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.c b/drivers/net/intel/idpf/idpf_common_rxtx.c
> index a5d0795057..ab3d088899 100644
> --- a/drivers/net/intel/idpf/idpf_common_rxtx.c
> +++ b/drivers/net/intel/idpf/idpf_common_rxtx.c
> @@ -250,6 +250,58 @@ idpf_qc_split_tx_complq_reset(struct ci_tx_queue *cq)
> cq->expected_gen_id = 1;
> }
>
> +RTE_EXPORT_INTERNAL_SYMBOL(idpf_splitq_rearm_common)
> +void
> +idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq)
> +{
> + struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
> + volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
> + uint16_t rx_id;
> + int i;
> +
> + rxdp += rx_bufq->rxrearm_start;
> +
> + /* Pull 'n' more MBUFs into the software ring */
> + if (rte_mbuf_raw_alloc_bulk(rx_bufq->mp,
> + (void *)rxp, IDPF_RXQ_REARM_THRESH) < 0) {
> + if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
> + rx_bufq->nb_rx_desc) {
> + for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
> + rxp[i] = &rx_bufq->fake_mbuf;
> + rxdp[i] = (union virtchnl2_rx_buf_desc){0};
> + }
> + }
> + rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
> + IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
> + return;
> + }
> +
> + /* Initialize the mbufs in vector, process 8 mbufs in one loop */
> + for (i = 0; i < IDPF_RXQ_REARM_THRESH;
> + i += 8, rxp += 8, rxdp += 8) {
> + rxdp[0].split_rd.pkt_addr = rxp[0]->buf_iova + RTE_PKTMBUF_HEADROOM;
> + rxdp[1].split_rd.pkt_addr = rxp[1]->buf_iova + RTE_PKTMBUF_HEADROOM;
> + rxdp[2].split_rd.pkt_addr = rxp[2]->buf_iova + RTE_PKTMBUF_HEADROOM;
> + rxdp[3].split_rd.pkt_addr = rxp[3]->buf_iova + RTE_PKTMBUF_HEADROOM;
> + rxdp[4].split_rd.pkt_addr = rxp[4]->buf_iova + RTE_PKTMBUF_HEADROOM;
> + rxdp[5].split_rd.pkt_addr = rxp[5]->buf_iova + RTE_PKTMBUF_HEADROOM;
> + rxdp[6].split_rd.pkt_addr = rxp[6]->buf_iova + RTE_PKTMBUF_HEADROOM;
> + rxdp[7].split_rd.pkt_addr = rxp[7]->buf_iova + RTE_PKTMBUF_HEADROOM;
> + }
> +
> + rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
> + if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
> + rx_bufq->rxrearm_start = 0;
> +
> + rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
> +
> + rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
> + (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
> +
> + /* Update the tail pointer on the NIC */
> + IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
> +}
> +
> RTE_EXPORT_INTERNAL_SYMBOL(idpf_qc_single_tx_queue_reset)
> void
> idpf_qc_single_tx_queue_reset(struct ci_tx_queue *txq)
> @@ -1684,6 +1736,13 @@ const struct ci_rx_path_info idpf_rx_path_infos[] = {
> .rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
> .simd_width = RTE_VECT_SIMD_256,
> .extra.single_queue = true}},
> + [IDPF_RX_AVX2] = {
> + .pkt_burst = idpf_dp_splitq_recv_pkts_avx2,
> + .info = "Split AVX2 Vector",
> + .features = {
> + .rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
> + .simd_width = RTE_VECT_SIMD_256,
> + }},
Watch your indentation. You have a mix of spaces and tabs here.
> #ifdef CC_AVX512_SUPPORT
> [IDPF_RX_AVX512] = {
> .pkt_burst = idpf_dp_splitq_recv_pkts_avx512,
> diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.h b/drivers/net/intel/idpf/idpf_common_rxtx.h
> index 3bc3323af4..87f6895c4c 100644
> --- a/drivers/net/intel/idpf/idpf_common_rxtx.h
> +++ b/drivers/net/intel/idpf/idpf_common_rxtx.h
> @@ -203,6 +203,8 @@ void idpf_qc_split_tx_descq_reset(struct ci_tx_queue *txq);
> __rte_internal
> void idpf_qc_split_tx_complq_reset(struct ci_tx_queue *cq);
> __rte_internal
> +void idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq);
> +__rte_internal
> void idpf_qc_single_tx_queue_reset(struct ci_tx_queue *txq);
> __rte_internal
> void idpf_qc_rx_queue_release(void *rxq);
> @@ -252,6 +254,9 @@ __rte_internal
> uint16_t idpf_dp_splitq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
> uint16_t nb_pkts);
> __rte_internal
> +uint16_t idpf_dp_splitq_recv_pkts_avx2(void *rxq, struct rte_mbuf **rx_pkts,
> + uint16_t nb_pkts);
> +__rte_internal
> uint16_t idpf_dp_singleq_recv_scatter_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
> uint16_t nb_pkts);
> __rte_internal
> diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
> index 21c8f79254..2ea957ded8 100644
> --- a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
> +++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
> @@ -482,6 +482,145 @@ idpf_dp_singleq_recv_pkts_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, uint16
> return _idpf_singleq_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts);
> }
>
> +uint16_t
> +idpf_dp_splitq_recv_pkts_avx2(void *rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
> +{
> + struct idpf_rx_queue *queue = (struct idpf_rx_queue *)rxq;
> + const uint32_t *ptype_tbl = queue->adapter->ptype_tbl;
> + struct rte_mbuf **sw_ring = &queue->bufq2->sw_ring[queue->rx_tail];
> + volatile union virtchnl2_rx_desc *rxdp =
> + (volatile union virtchnl2_rx_desc *)queue->rx_ring + queue->rx_tail;
> + const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0, queue->mbuf_initializer);
> +
> + rte_prefetch0(rxdp);
> + nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, 4); /* 4 desc per AVX2 iteration */
> +
> + if (queue->bufq2->rxrearm_nb > IDPF_RXQ_REARM_THRESH)
> + idpf_splitq_rearm_common(queue->bufq2);
> +
> + /* head gen check */
> + uint64_t head_gen = rxdp->flex_adv_nic_3_wb.pktlen_gen_bufq_id;
> + if (((head_gen >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
> + VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) != queue->expected_gen_id)
> + return 0;
> +
> + uint16_t received = 0;
> +
> + /* Shuffle mask: picks fields from each 16-byte descriptor pair into the
> + * layout that will be merged into mbuf->rearm_data candidates.
> + */
> +
> + const __m256i shuf = _mm256_set_epi8(
> + /* high 128 bits (desc 3 then desc 2 lanes) */
> + 0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4,
> + 0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF,
> + /* low 128 bits (desc 1 then desc 0 lanes) */
> + 0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4,
> + 0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF
> + );
> +
> + /* mask that clears bits 14 and 15 of the packet length word */
> + const __m256i len_mask = _mm256_set_epi32(
> + 0xffffffff, 0xffffffff, 0xffff3fff, 0xffffffff,
> + 0xffffffff, 0xffffffff, 0xffff3fff, 0xffffffff
> + );
> +
> + const __m256i ptype_mask = _mm256_set1_epi16(VIRTCHNL2_RX_FLEX_DESC_PTYPE_M);
> +
> + for (int i = nb_pkts; i >= IDPF_VPMD_DESCS_PER_LOOP; i -= IDPF_VPMD_DESCS_PER_LOOP) {
> + rxdp -= IDPF_VPMD_DESCS_PER_LOOP;
> +
> + uint64_t g3 = rxdp[3].flex_adv_nic_3_wb.pktlen_gen_bufq_id;
> + uint64_t g2 = rxdp[2].flex_adv_nic_3_wb.pktlen_gen_bufq_id;
> + uint64_t g1 = rxdp[1].flex_adv_nic_3_wb.pktlen_gen_bufq_id;
> + uint64_t g0 = rxdp[0].flex_adv_nic_3_wb.pktlen_gen_bufq_id;
> +
> + /* Extract DD bits */
> + bool dd3 = (g3 & 1ULL) != 0ULL;
> + bool dd2 = (g2 & 1ULL) != 0ULL;
> + bool dd1 = (g1 & 1ULL) != 0ULL;
> + bool dd0 = (g0 & 1ULL) != 0ULL;
> +
> + /* Extract generation bits */
> + uint64_t gen3 = (g3 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
> + VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M;
> + uint64_t gen2 = (g2 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
> + VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M;
> + uint64_t gen1 = (g1 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
> + VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M;
> + uint64_t gen0 = (g0 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
> + VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M;
> +
> + /* Validate descriptors */
> + bool valid3 = dd3 && (gen3 == queue->expected_gen_id);
> + bool valid2 = dd2 && (gen2 == queue->expected_gen_id);
> + bool valid1 = dd1 && (gen1 == queue->expected_gen_id);
> + bool valid0 = dd0 && (gen0 == queue->expected_gen_id);
> +
> + if (!(valid0 && valid1 && valid2 && valid3))
> + break;
> +
> + /* copy mbuf pointers */
> + memcpy(&rx_pkts[i - IDPF_VPMD_DESCS_PER_LOOP],
> + &sw_ring[i - IDPF_VPMD_DESCS_PER_LOOP],
> + sizeof(rx_pkts[0]) * IDPF_VPMD_DESCS_PER_LOOP);
> +
> + __m128i d3 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[3]));
> + __m128i d2 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[2]));
> + __m128i d1 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[1]));
> + __m128i d0 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[0]));
> +
> + __m256i d23 = _mm256_set_m128i(d3, d2);
> + __m256i d01 = _mm256_set_m128i(d1, d0);
> +
> + /* mask length and shuffle to build mbuf rearm data */
> + __m256i desc01 = _mm256_and_si256(d01, len_mask);
> + __m256i desc23 = _mm256_and_si256(d23, len_mask);
> + __m256i mb10 = _mm256_shuffle_epi8(desc01, shuf);
> + __m256i mb32 = _mm256_shuffle_epi8(desc23, shuf);
> +
> + /* Extract ptypes */
> + __m256i pt10 = _mm256_and_si256(d01, ptype_mask);
> + __m256i pt32 = _mm256_and_si256(d23, ptype_mask);
> +
> + uint16_t ptype0 = (uint16_t)_mm256_extract_epi16(pt10, 1);
> + uint16_t ptype1 = (uint16_t)_mm256_extract_epi16(pt10, 9);
> + uint16_t ptype2 = (uint16_t)_mm256_extract_epi16(pt32, 1);
> + uint16_t ptype3 = (uint16_t)_mm256_extract_epi16(pt32, 9);
> +
> + mb10 = _mm256_insert_epi32(mb10, (int)ptype_tbl[ptype1], 2);
> + mb10 = _mm256_insert_epi32(mb10, (int)ptype_tbl[ptype0], 0);
> + mb32 = _mm256_insert_epi32(mb32, (int)ptype_tbl[ptype3], 2);
> + mb32 = _mm256_insert_epi32(mb32, (int)ptype_tbl[ptype2], 0);
> +
> + /* Build rearm data for each mbuf */
> + __m256i rearm0 = _mm256_permute2f128_si256(mbuf_init, mb10, 0x20);
> + __m256i rearm1 = _mm256_blend_epi32(mbuf_init, mb10, 0xF0);
> + __m256i rearm2 = _mm256_permute2f128_si256(mbuf_init, mb32, 0x20);
> + __m256i rearm3 = _mm256_blend_epi32(mbuf_init, mb32, 0xF0);
> +
> + /* Write out mbuf rearm data */
> + _mm256_storeu_si256((__m256i *)&rx_pkts[i - 1]->rearm_data, rearm3);
> + _mm256_storeu_si256((__m256i *)&rx_pkts[i - 2]->rearm_data, rearm2);
> + _mm256_storeu_si256((__m256i *)&rx_pkts[i - 3]->rearm_data, rearm1);
> + _mm256_storeu_si256((__m256i *)&rx_pkts[i - 4]->rearm_data, rearm0);
> +
> + received += IDPF_VPMD_DESCS_PER_LOOP;
> + }
> +
> +queue->rx_tail += received;
> +queue->expected_gen_id ^= ((queue->rx_tail & queue->nb_rx_desc) != 0);
> +queue->rx_tail &= (queue->nb_rx_desc - 1);
> +if ((queue->rx_tail & 1) == 1 && received > 1) {
> + queue->rx_tail--;
> + received--;
> +}
> +queue->bufq2->rxrearm_nb += received;
> +return received;
> +}
> +
Missing indentation.
> +RTE_EXPORT_INTERNAL_SYMBOL(idpf_dp_splitq_recv_pkts_avx2)
> +
> static inline void
> idpf_singleq_vtx1(volatile struct idpf_base_tx_desc *txdp,
> struct rte_mbuf *pkt, uint64_t flags)
> diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c
> index bc2cadd738..d3a161c763 100644
> --- a/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c
> +++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c
> @@ -540,62 +540,6 @@ idpf_dp_singleq_recv_pkts_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
> return _idpf_singleq_recv_raw_pkts_avx512(rx_queue, rx_pkts, nb_pkts);
> }
>
> -static __rte_always_inline void
> -idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq)
> -{
> - struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
> - volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
> - uint16_t rx_id;
> - int i;
> -
> - rxdp += rx_bufq->rxrearm_start;
> -
> - /* Pull 'n' more MBUFs into the software ring */
> - if (rte_mbuf_raw_alloc_bulk(rx_bufq->mp,
> - (void *)rxp,
> - IDPF_RXQ_REARM_THRESH) < 0) {
> - if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
> - rx_bufq->nb_rx_desc) {
> - __m128i dma_addr0;
> -
> - dma_addr0 = _mm_setzero_si128();
> - for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
> - rxp[i] = &rx_bufq->fake_mbuf;
> - _mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp[i]),
> - dma_addr0);
> - }
> - }
> - rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
> - IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
> - return;
> - }
> -
> - /* Initialize the mbufs in vector, process 8 mbufs in one loop */
> - for (i = 0; i < IDPF_RXQ_REARM_THRESH;
> - i += 8, rxp += 8, rxdp += 8) {
> - rxdp[0].split_rd.pkt_addr = rxp[0]->buf_iova + RTE_PKTMBUF_HEADROOM;
> - rxdp[1].split_rd.pkt_addr = rxp[1]->buf_iova + RTE_PKTMBUF_HEADROOM;
> - rxdp[2].split_rd.pkt_addr = rxp[2]->buf_iova + RTE_PKTMBUF_HEADROOM;
> - rxdp[3].split_rd.pkt_addr = rxp[3]->buf_iova + RTE_PKTMBUF_HEADROOM;
> - rxdp[4].split_rd.pkt_addr = rxp[4]->buf_iova + RTE_PKTMBUF_HEADROOM;
> - rxdp[5].split_rd.pkt_addr = rxp[5]->buf_iova + RTE_PKTMBUF_HEADROOM;
> - rxdp[6].split_rd.pkt_addr = rxp[6]->buf_iova + RTE_PKTMBUF_HEADROOM;
> - rxdp[7].split_rd.pkt_addr = rxp[7]->buf_iova + RTE_PKTMBUF_HEADROOM;
> - }
> -
> - rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
> - if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
> - rx_bufq->rxrearm_start = 0;
> -
> - rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
> -
> - rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
> - (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
> -
> - /* Update the tail pointer on the NIC */
> - IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
> -}
> -
> static __rte_always_inline void
> idpf_splitq_rearm(struct idpf_rx_queue *rx_bufq)
> {
> --
> 2.34.1
>
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [PATCH v10 3/3] doc: note on unsupported completion queue sharing
2025-10-28 14:22 ` [PATCH v10 3/3] doc: note on unsupported completion queue sharing Shaiq Wani
@ 2025-12-11 13:00 ` Bruce Richardson
0 siblings, 0 replies; 42+ messages in thread
From: Bruce Richardson @ 2025-12-11 13:00 UTC (permalink / raw)
To: Shaiq Wani; +Cc: dev, aman.deep.singh
On Tue, Oct 28, 2025 at 07:52:02PM +0530, Shaiq Wani wrote:
> Added a note in the IDPF Poll Mode Driver documentation to clarify
> that sharing a completion queue among multiple TX queues serviced
> by different CPU cores is not supported in split queue mode.
>
> Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
> ---
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
^ permalink raw reply [flat|nested] 42+ messages in thread
* [PATCH v11 0/3] net/idpf: enable AVX2 for split queue Rx/Tx
2025-09-17 5:26 [PATCH 0/2] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
` (6 preceding siblings ...)
2025-10-28 14:21 ` [PATCH v9 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
@ 2026-01-05 6:22 ` Shaiq Wani
2026-01-05 6:22 ` [PATCH v11 1/3] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
` (4 more replies)
2026-02-24 7:47 ` [PATCH v12 " Shaiq Wani
` (2 subsequent siblings)
10 siblings, 5 replies; 42+ messages in thread
From: Shaiq Wani @ 2026-01-05 6:22 UTC (permalink / raw)
To: dev, bruce.richardson, aman.deep.singh
In case some CPUs don't support AVX512, enable AVX2 for them to
get better per-core performance.
In the single queue model, the same descriptor queue is used by SW
to post descriptors to the device and used by the device to report completed
descriptors to SW, whereas the split queue model separates them into
different queues for parallel processing and improved performance.
v11:
*Fixed indentation issues.
v10:
*Changed desc reading style in Rx path.
v9:
*Some minor fixes.
v8:
*Reworked the receive func to avoid race conditions in Rx path.
*Removed unnecessary compiler barriers.
v7:
*Removed (char) casts, ifdef block as suggested.
*Fixed indentation, line wrapping.
*Did blind write of data to avoid branching.
*changed desc building and writing order for Tx.
v6:
*used single load/store in splitq_recv_pkts function.
*removed x86-specific intrinsics from common code.
v5:
*Fixed CI errors.
*Used defined constants instead of numbers.
v4:
*moved splitq_rearm_common to a common location.
*reduced duplication of code.
*fixed splitq_recv_pkts function.
v3:
*Fixed some indentation issues.
*Collapsed wrapper and core function into one.
*Fixed some pointer casting and naming inconsistency issues.
v2:
*Fixed CI build related issues.
*Rebased on top of idpf/cpfl rx path selection simplification patch.
Shaiq Wani (3):
net/idpf: enable AVX2 for split queue Rx
net/idpf: enable AVX2 for split queue Tx
doc: note on unsupported completion queue sharing
doc/guides/nics/idpf.rst | 5 +
drivers/net/intel/idpf/idpf_common_device.h | 1 +
drivers/net/intel/idpf/idpf_common_rxtx.c | 59 ++++
drivers/net/intel/idpf/idpf_common_rxtx.h | 8 +
.../net/intel/idpf/idpf_common_rxtx_avx2.c | 317 +++++++++++++++++-
.../net/intel/idpf/idpf_common_rxtx_avx512.c | 56 ----
drivers/net/intel/idpf/idpf_rxtx.c | 9 +
7 files changed, 397 insertions(+), 58 deletions(-)
--
2.34.1
^ permalink raw reply [flat|nested] 42+ messages in thread
* [PATCH v11 1/3] net/idpf: enable AVX2 for split queue Rx
2026-01-05 6:22 ` [PATCH v11 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
@ 2026-01-05 6:22 ` Shaiq Wani
2026-01-05 6:22 ` [PATCH v11 2/3] net/idpf: enable AVX2 for split queue Tx Shaiq Wani
` (3 subsequent siblings)
4 siblings, 0 replies; 42+ messages in thread
From: Shaiq Wani @ 2026-01-05 6:22 UTC (permalink / raw)
To: dev, bruce.richardson, aman.deep.singh
In case some CPUs don't support AVX512, enable AVX2 for them to
get better per-core performance.
In the single queue model, the same descriptor queue is used by SW
to post descriptors to the device and used by the device to report completed
descriptors to SW, whereas the split queue model separates them into
different queues for parallel processing and improved performance.
Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
---
drivers/net/intel/idpf/idpf_common_device.h | 1 +
drivers/net/intel/idpf/idpf_common_rxtx.c | 59 ++++++++
drivers/net/intel/idpf/idpf_common_rxtx.h | 5 +
.../net/intel/idpf/idpf_common_rxtx_avx2.c | 139 ++++++++++++++++++
.../net/intel/idpf/idpf_common_rxtx_avx512.c | 56 -------
5 files changed, 204 insertions(+), 56 deletions(-)
diff --git a/drivers/net/intel/idpf/idpf_common_device.h b/drivers/net/intel/idpf/idpf_common_device.h
index c32dcfbb12..74a5495d59 100644
--- a/drivers/net/intel/idpf/idpf_common_device.h
+++ b/drivers/net/intel/idpf/idpf_common_device.h
@@ -70,6 +70,7 @@ enum idpf_rx_func_type {
IDPF_RX_SINGLEQ,
IDPF_RX_SINGLEQ_SCATTERED,
IDPF_RX_SINGLEQ_AVX2,
+ IDPF_RX_AVX2,
IDPF_RX_AVX512,
IDPF_RX_SINGLEQ_AVX512,
IDPF_RX_MAX
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.c b/drivers/net/intel/idpf/idpf_common_rxtx.c
index a5d0795057..617a432fe4 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.c
@@ -250,6 +250,58 @@ idpf_qc_split_tx_complq_reset(struct ci_tx_queue *cq)
cq->expected_gen_id = 1;
}
+RTE_EXPORT_INTERNAL_SYMBOL(idpf_splitq_rearm_common)
+void
+idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq)
+{
+ struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
+ volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
+ uint16_t rx_id;
+ int i;
+
+ rxdp += rx_bufq->rxrearm_start;
+
+ /* Pull 'n' more MBUFs into the software ring */
+ if (rte_mbuf_raw_alloc_bulk(rx_bufq->mp,
+ (void *)rxp, IDPF_RXQ_REARM_THRESH) < 0) {
+ if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
+ rx_bufq->nb_rx_desc) {
+ for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
+ rxp[i] = &rx_bufq->fake_mbuf;
+ rxdp[i] = (union virtchnl2_rx_buf_desc){0};
+ }
+ }
+ rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
+ IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
+ return;
+ }
+
+ /* Initialize the mbufs in vector, process 8 mbufs in one loop */
+ for (i = 0; i < IDPF_RXQ_REARM_THRESH;
+ i += 8, rxp += 8, rxdp += 8) {
+ rxdp[0].split_rd.pkt_addr = rxp[0]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[1].split_rd.pkt_addr = rxp[1]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[2].split_rd.pkt_addr = rxp[2]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[3].split_rd.pkt_addr = rxp[3]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[4].split_rd.pkt_addr = rxp[4]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[5].split_rd.pkt_addr = rxp[5]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[6].split_rd.pkt_addr = rxp[6]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[7].split_rd.pkt_addr = rxp[7]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ }
+
+ rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
+ if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
+ rx_bufq->rxrearm_start = 0;
+
+ rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
+
+ rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
+ (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
+
+ /* Update the tail pointer on the NIC */
+ IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
+}
+
RTE_EXPORT_INTERNAL_SYMBOL(idpf_qc_single_tx_queue_reset)
void
idpf_qc_single_tx_queue_reset(struct ci_tx_queue *txq)
@@ -1684,6 +1736,13 @@ const struct ci_rx_path_info idpf_rx_path_infos[] = {
.rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
.simd_width = RTE_VECT_SIMD_256,
.extra.single_queue = true}},
+ [IDPF_RX_AVX2] = {
+ .pkt_burst = idpf_dp_splitq_recv_pkts_avx2,
+ .info = "Split AVX2 Vector",
+ .features = {
+ .rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
+ .simd_width = RTE_VECT_SIMD_256,
+ }},
#ifdef CC_AVX512_SUPPORT
[IDPF_RX_AVX512] = {
.pkt_burst = idpf_dp_splitq_recv_pkts_avx512,
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.h b/drivers/net/intel/idpf/idpf_common_rxtx.h
index 3bc3323af4..87f6895c4c 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.h
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.h
@@ -203,6 +203,8 @@ void idpf_qc_split_tx_descq_reset(struct ci_tx_queue *txq);
__rte_internal
void idpf_qc_split_tx_complq_reset(struct ci_tx_queue *cq);
__rte_internal
+void idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq);
+__rte_internal
void idpf_qc_single_tx_queue_reset(struct ci_tx_queue *txq);
__rte_internal
void idpf_qc_rx_queue_release(void *rxq);
@@ -252,6 +254,9 @@ __rte_internal
uint16_t idpf_dp_splitq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
uint16_t nb_pkts);
__rte_internal
+uint16_t idpf_dp_splitq_recv_pkts_avx2(void *rxq, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts);
+__rte_internal
uint16_t idpf_dp_singleq_recv_scatter_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
uint16_t nb_pkts);
__rte_internal
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
index 21c8f79254..51a7e79dd5 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
@@ -482,6 +482,145 @@ idpf_dp_singleq_recv_pkts_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, uint16
return _idpf_singleq_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts);
}
+uint16_t
+idpf_dp_splitq_recv_pkts_avx2(void *rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+ struct idpf_rx_queue *queue = (struct idpf_rx_queue *)rxq;
+ const uint32_t *ptype_tbl = queue->adapter->ptype_tbl;
+ struct rte_mbuf **sw_ring = &queue->bufq2->sw_ring[queue->rx_tail];
+ volatile union virtchnl2_rx_desc *rxdp =
+ (volatile union virtchnl2_rx_desc *)queue->rx_ring + queue->rx_tail;
+ const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0, queue->mbuf_initializer);
+
+ rte_prefetch0(rxdp);
+ nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, 4); /* 4 desc per AVX2 iteration */
+
+ if (queue->bufq2->rxrearm_nb > IDPF_RXQ_REARM_THRESH)
+ idpf_splitq_rearm_common(queue->bufq2);
+
+ /* head gen check */
+ uint64_t head_gen = rxdp->flex_adv_nic_3_wb.pktlen_gen_bufq_id;
+ if (((head_gen >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
+ VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) != queue->expected_gen_id)
+ return 0;
+
+ uint16_t received = 0;
+
+ /* Shuffle mask: picks fields from each 16-byte descriptor pair into the
+ * layout that will be merged into mbuf->rearm_data candidates.
+ */
+
+ const __m256i shuf = _mm256_set_epi8(
+ /* high 128 bits (desc 3 then desc 2 lanes) */
+ 0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4,
+ 0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF,
+ /* low 128 bits (desc 1 then desc 0 lanes) */
+ 0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4,
+ 0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF
+ );
+
+ /* mask that clears bits 14 and 15 of the packet length word */
+ const __m256i len_mask = _mm256_set_epi32(
+ 0xffffffff, 0xffffffff, 0xffff3fff, 0xffffffff,
+ 0xffffffff, 0xffffffff, 0xffff3fff, 0xffffffff
+ );
+
+ const __m256i ptype_mask = _mm256_set1_epi16(VIRTCHNL2_RX_FLEX_DESC_PTYPE_M);
+
+ for (int i = nb_pkts; i >= IDPF_VPMD_DESCS_PER_LOOP; i -= IDPF_VPMD_DESCS_PER_LOOP) {
+ rxdp -= IDPF_VPMD_DESCS_PER_LOOP;
+
+ uint64_t g3 = rxdp[3].flex_adv_nic_3_wb.pktlen_gen_bufq_id;
+ uint64_t g2 = rxdp[2].flex_adv_nic_3_wb.pktlen_gen_bufq_id;
+ uint64_t g1 = rxdp[1].flex_adv_nic_3_wb.pktlen_gen_bufq_id;
+ uint64_t g0 = rxdp[0].flex_adv_nic_3_wb.pktlen_gen_bufq_id;
+
+ /* Extract DD bits */
+ bool dd3 = (g3 & 1ULL) != 0ULL;
+ bool dd2 = (g2 & 1ULL) != 0ULL;
+ bool dd1 = (g1 & 1ULL) != 0ULL;
+ bool dd0 = (g0 & 1ULL) != 0ULL;
+
+ /* Extract generation bits */
+ uint64_t gen3 = (g3 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
+ VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M;
+ uint64_t gen2 = (g2 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
+ VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M;
+ uint64_t gen1 = (g1 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
+ VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M;
+ uint64_t gen0 = (g0 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
+ VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M;
+
+ /* Validate descriptors */
+ bool valid3 = dd3 && (gen3 == queue->expected_gen_id);
+ bool valid2 = dd2 && (gen2 == queue->expected_gen_id);
+ bool valid1 = dd1 && (gen1 == queue->expected_gen_id);
+ bool valid0 = dd0 && (gen0 == queue->expected_gen_id);
+
+ if (!(valid0 && valid1 && valid2 && valid3))
+ break;
+
+ /* copy mbuf pointers */
+ memcpy(&rx_pkts[i - IDPF_VPMD_DESCS_PER_LOOP],
+ &sw_ring[i - IDPF_VPMD_DESCS_PER_LOOP],
+ sizeof(rx_pkts[0]) * IDPF_VPMD_DESCS_PER_LOOP);
+
+ __m128i d3 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[3]));
+ __m128i d2 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[2]));
+ __m128i d1 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[1]));
+ __m128i d0 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[0]));
+
+ __m256i d23 = _mm256_set_m128i(d3, d2);
+ __m256i d01 = _mm256_set_m128i(d1, d0);
+
+ /* mask length and shuffle to build mbuf rearm data */
+ __m256i desc01 = _mm256_and_si256(d01, len_mask);
+ __m256i desc23 = _mm256_and_si256(d23, len_mask);
+ __m256i mb10 = _mm256_shuffle_epi8(desc01, shuf);
+ __m256i mb32 = _mm256_shuffle_epi8(desc23, shuf);
+
+ /* Extract ptypes */
+ __m256i pt10 = _mm256_and_si256(d01, ptype_mask);
+ __m256i pt32 = _mm256_and_si256(d23, ptype_mask);
+
+ uint16_t ptype0 = (uint16_t)_mm256_extract_epi16(pt10, 1);
+ uint16_t ptype1 = (uint16_t)_mm256_extract_epi16(pt10, 9);
+ uint16_t ptype2 = (uint16_t)_mm256_extract_epi16(pt32, 1);
+ uint16_t ptype3 = (uint16_t)_mm256_extract_epi16(pt32, 9);
+
+ mb10 = _mm256_insert_epi32(mb10, (int)ptype_tbl[ptype1], 2);
+ mb10 = _mm256_insert_epi32(mb10, (int)ptype_tbl[ptype0], 0);
+ mb32 = _mm256_insert_epi32(mb32, (int)ptype_tbl[ptype3], 2);
+ mb32 = _mm256_insert_epi32(mb32, (int)ptype_tbl[ptype2], 0);
+
+ /* Build rearm data for each mbuf */
+ __m256i rearm0 = _mm256_permute2f128_si256(mbuf_init, mb10, 0x20);
+ __m256i rearm1 = _mm256_blend_epi32(mbuf_init, mb10, 0xF0);
+ __m256i rearm2 = _mm256_permute2f128_si256(mbuf_init, mb32, 0x20);
+ __m256i rearm3 = _mm256_blend_epi32(mbuf_init, mb32, 0xF0);
+
+ /* Write out mbuf rearm data */
+ _mm256_storeu_si256((__m256i *)&rx_pkts[i - 1]->rearm_data, rearm3);
+ _mm256_storeu_si256((__m256i *)&rx_pkts[i - 2]->rearm_data, rearm2);
+ _mm256_storeu_si256((__m256i *)&rx_pkts[i - 3]->rearm_data, rearm1);
+ _mm256_storeu_si256((__m256i *)&rx_pkts[i - 4]->rearm_data, rearm0);
+
+ received += IDPF_VPMD_DESCS_PER_LOOP;
+ }
+
+ queue->rx_tail += received;
+ queue->expected_gen_id ^= ((queue->rx_tail & queue->nb_rx_desc) != 0);
+ queue->rx_tail &= (queue->nb_rx_desc - 1);
+ if ((queue->rx_tail & 1) == 1 && received > 1) {
+ queue->rx_tail--;
+ received--;
+ }
+ queue->bufq2->rxrearm_nb += received;
+ return received;
+}
+
+RTE_EXPORT_INTERNAL_SYMBOL(idpf_dp_splitq_recv_pkts_avx2)
+
static inline void
idpf_singleq_vtx1(volatile struct idpf_base_tx_desc *txdp,
struct rte_mbuf *pkt, uint64_t flags)
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c
index bc2cadd738..d3a161c763 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c
@@ -540,62 +540,6 @@ idpf_dp_singleq_recv_pkts_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
return _idpf_singleq_recv_raw_pkts_avx512(rx_queue, rx_pkts, nb_pkts);
}
-static __rte_always_inline void
-idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq)
-{
- struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
- volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
- uint16_t rx_id;
- int i;
-
- rxdp += rx_bufq->rxrearm_start;
-
- /* Pull 'n' more MBUFs into the software ring */
- if (rte_mbuf_raw_alloc_bulk(rx_bufq->mp,
- (void *)rxp,
- IDPF_RXQ_REARM_THRESH) < 0) {
- if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
- rx_bufq->nb_rx_desc) {
- __m128i dma_addr0;
-
- dma_addr0 = _mm_setzero_si128();
- for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
- rxp[i] = &rx_bufq->fake_mbuf;
- _mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp[i]),
- dma_addr0);
- }
- }
- rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
- IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
- return;
- }
-
- /* Initialize the mbufs in vector, process 8 mbufs in one loop */
- for (i = 0; i < IDPF_RXQ_REARM_THRESH;
- i += 8, rxp += 8, rxdp += 8) {
- rxdp[0].split_rd.pkt_addr = rxp[0]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[1].split_rd.pkt_addr = rxp[1]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[2].split_rd.pkt_addr = rxp[2]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[3].split_rd.pkt_addr = rxp[3]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[4].split_rd.pkt_addr = rxp[4]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[5].split_rd.pkt_addr = rxp[5]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[6].split_rd.pkt_addr = rxp[6]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[7].split_rd.pkt_addr = rxp[7]->buf_iova + RTE_PKTMBUF_HEADROOM;
- }
-
- rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
- if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
- rx_bufq->rxrearm_start = 0;
-
- rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
-
- rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
- (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
-
- /* Update the tail pointer on the NIC */
- IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
-}
-
static __rte_always_inline void
idpf_splitq_rearm(struct idpf_rx_queue *rx_bufq)
{
--
2.34.1
^ permalink raw reply related [flat|nested] 42+ messages in thread
* [PATCH v11 2/3] net/idpf: enable AVX2 for split queue Tx
2026-01-05 6:22 ` [PATCH v11 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
2026-01-05 6:22 ` [PATCH v11 1/3] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
@ 2026-01-05 6:22 ` Shaiq Wani
2026-01-05 6:22 ` [PATCH v11 3/3] doc: note on unsupported completion queue sharing Shaiq Wani
` (2 subsequent siblings)
4 siblings, 0 replies; 42+ messages in thread
From: Shaiq Wani @ 2026-01-05 6:22 UTC (permalink / raw)
To: dev, bruce.richardson, aman.deep.singh
In case some CPUs don't support AVX512, enable AVX2 for them to
get better per-core performance.
In the single queue model, the same descriptor queue is used by SW
to post descriptors to the device and used by device to report completed
descriptors to SW, whereas the split queue model separates them into
different queues for parallel processing and improved performance.
Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
---
drivers/net/intel/idpf/idpf_common_rxtx.h | 3 +
.../net/intel/idpf/idpf_common_rxtx_avx2.c | 178 +++++++++++++++++-
drivers/net/intel/idpf/idpf_rxtx.c | 9 +
3 files changed, 188 insertions(+), 2 deletions(-)
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.h b/drivers/net/intel/idpf/idpf_common_rxtx.h
index 87f6895c4c..3636d55272 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.h
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.h
@@ -264,6 +264,9 @@ uint16_t idpf_dp_singleq_recv_pkts_avx2(void *rx_queue,
struct rte_mbuf **rx_pkts,
uint16_t nb_pkts);
__rte_internal
+uint16_t idpf_dp_splitq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts);
+__rte_internal
uint16_t idpf_dp_singleq_xmit_pkts_avx2(void *tx_queue,
struct rte_mbuf **tx_pkts,
uint16_t nb_pkts);
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
index 51a7e79dd5..02cc26778e 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
@@ -697,7 +697,7 @@ idpf_singleq_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts
struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
volatile struct idpf_base_tx_desc *txdp;
struct ci_tx_entry_vec *txep;
- uint16_t n, nb_commit, tx_id;
+ uint16_t n, nb_commit;
uint64_t flags = IDPF_TX_DESC_CMD_EOP;
uint64_t rs = IDPF_TX_DESC_CMD_RS | flags;
@@ -711,7 +711,7 @@ idpf_singleq_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts
if (unlikely(nb_pkts == 0))
return 0;
- tx_id = txq->tx_tail;
+ uint16_t tx_id = txq->tx_tail;
txdp = &txq->idpf_tx_ring[tx_id];
txep = &txq->sw_ring_vec[tx_id];
@@ -779,3 +779,177 @@ idpf_dp_singleq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
return nb_tx;
}
+
+static __rte_always_inline void
+idpf_splitq_scan_cq_ring(struct ci_tx_queue *cq)
+{
+ struct idpf_splitq_tx_compl_desc *compl_ring;
+ struct ci_tx_queue *txq;
+ uint16_t genid, txq_qid, cq_qid, i;
+ uint8_t ctype;
+
+ cq_qid = cq->tx_tail;
+
+ for (i = 0; i < IDPD_TXQ_SCAN_CQ_THRESH; i++) {
+ if (cq_qid == cq->nb_tx_desc) {
+ cq_qid = 0;
+ cq->expected_gen_id ^= 1; /* toggle generation bit */
+ }
+
+ compl_ring = &cq->compl_ring[cq_qid];
+
+ genid = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
+ IDPF_TXD_COMPLQ_GEN_M) >> IDPF_TXD_COMPLQ_GEN_S;
+
+ if (genid != cq->expected_gen_id)
+ break;
+
+ ctype = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
+ IDPF_TXD_COMPLQ_COMPL_TYPE_M) >> IDPF_TXD_COMPLQ_COMPL_TYPE_S;
+ txq_qid = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
+ IDPF_TXD_COMPLQ_QID_M) >> IDPF_TXD_COMPLQ_QID_S;
+
+ txq = cq->txqs[txq_qid - cq->tx_start_qid];
+ if (ctype == IDPF_TXD_COMPLT_RS)
+ txq->rs_compl_count++;
+
+ cq_qid++;
+ }
+
+ cq->tx_tail = cq_qid;
+}
+
+static __rte_always_inline void
+idpf_splitq_vtx1_avx2(struct idpf_flex_tx_sched_desc *txdp,
+ struct rte_mbuf *pkt, uint64_t flags)
+{
+ uint64_t high_qw =
+ IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE |
+ ((uint64_t)flags) |
+ ((uint64_t)pkt->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+
+ __m128i descriptor = _mm_set_epi64x(high_qw,
+ pkt->buf_iova + pkt->data_off);
+ _mm_storeu_si128((__m128i *)txdp, descriptor);
+}
+
+static inline void
+idpf_splitq_vtx_avx2(struct idpf_flex_tx_sched_desc *txdp,
+ struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags)
+{
+ const uint64_t hi_qw_tmpl = IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE | ((uint64_t)flags);
+
+ /* align if needed */
+ if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
+ idpf_splitq_vtx1_avx2(txdp, *pkt, flags);
+ txdp++, pkt++, nb_pkts--;
+ }
+
+ for (; nb_pkts >= IDPF_VPMD_DESCS_PER_LOOP; txdp += IDPF_VPMD_DESCS_PER_LOOP,
+ pkt += IDPF_VPMD_DESCS_PER_LOOP, nb_pkts -= IDPF_VPMD_DESCS_PER_LOOP) {
+ uint64_t hi_qw0 = hi_qw_tmpl |
+ ((uint64_t)pkt[0]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+ uint64_t hi_qw1 = hi_qw_tmpl |
+ ((uint64_t)pkt[1]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+ uint64_t hi_qw2 = hi_qw_tmpl |
+ ((uint64_t)pkt[2]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+ uint64_t hi_qw3 = hi_qw_tmpl |
+ ((uint64_t)pkt[3]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+
+ __m256i desc0_1 = _mm256_set_epi64x(hi_qw1,
+ pkt[1]->buf_iova + pkt[1]->data_off,
+ hi_qw0,
+ pkt[0]->buf_iova + pkt[0]->data_off);
+ __m256i desc2_3 = _mm256_set_epi64x(hi_qw3,
+ pkt[3]->buf_iova + pkt[3]->data_off,
+ hi_qw2,
+ pkt[2]->buf_iova + pkt[2]->data_off);
+
+ _mm256_storeu_si256((__m256i *)(txdp + 0), desc0_1);
+ _mm256_storeu_si256((__m256i *)(txdp + 2), desc2_3);
+ }
+
+ while (nb_pkts--) {
+ idpf_splitq_vtx1_avx2(txdp, *pkt, flags);
+ txdp++;
+ pkt++;
+ }
+}
+
+static inline uint16_t
+idpf_splitq_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+ struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
+ struct idpf_flex_tx_sched_desc *txdp;
+ struct ci_tx_entry_vec *txep;
+ uint16_t n, nb_commit;
+ uint64_t cmd_dtype = IDPF_TXD_FLEX_FLOW_CMD_EOP;
+ uint16_t tx_id = txq->tx_tail;
+
+ nb_commit = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
+ nb_pkts = nb_commit;
+ if (unlikely(nb_pkts == 0))
+ return 0;
+
+ txdp = (struct idpf_flex_tx_sched_desc *)&txq->desc_ring[tx_id];
+ txep = &txq->sw_ring_vec[tx_id];
+
+ txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
+
+ n = (uint16_t)(txq->nb_tx_desc - tx_id);
+ if (nb_commit >= n) {
+ ci_tx_backlog_entry_vec(txep, tx_pkts, n);
+
+ idpf_splitq_vtx_avx2(txdp, tx_pkts, n - 1, cmd_dtype);
+ tx_pkts += (n - 1);
+ txdp += (n - 1);
+
+ idpf_splitq_vtx1_avx2(txdp, *tx_pkts++, cmd_dtype);
+
+ nb_commit = (uint16_t)(nb_commit - n);
+ tx_id = 0;
+
+ txdp = &txq->desc_ring[tx_id];
+ txep = (void *)txq->sw_ring;
+ }
+
+ ci_tx_backlog_entry_vec(txep, tx_pkts, nb_commit);
+
+ idpf_splitq_vtx_avx2(txdp, tx_pkts, nb_commit, cmd_dtype);
+
+ tx_id = (uint16_t)(tx_id + nb_commit);
+ txq->tx_tail = tx_id;
+
+ IDPF_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+
+ return nb_pkts;
+}
+
+uint16_t
+idpf_dp_splitq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts)
+{
+ struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
+ uint16_t nb_tx = 0;
+
+ while (nb_pkts) {
+ uint16_t ret, num;
+ idpf_splitq_scan_cq_ring(txq->complq);
+
+ if (txq->rs_compl_count > txq->tx_free_thresh) {
+ ci_tx_free_bufs_vec(txq, idpf_tx_desc_done, false);
+ txq->rs_compl_count -= txq->tx_rs_thresh;
+ }
+
+ num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+ ret = idpf_splitq_xmit_fixed_burst_vec_avx2(tx_queue, &tx_pkts[nb_tx], num);
+ nb_tx += ret;
+ nb_pkts -= ret;
+ if (ret < num)
+ break;
+ }
+
+ return nb_tx;
+}
+
+RTE_EXPORT_INTERNAL_SYMBOL(idpf_dp_splitq_xmit_pkts_avx2)
diff --git a/drivers/net/intel/idpf/idpf_rxtx.c b/drivers/net/intel/idpf/idpf_rxtx.c
index 4796d8b862..b799706033 100644
--- a/drivers/net/intel/idpf/idpf_rxtx.c
+++ b/drivers/net/intel/idpf/idpf_rxtx.c
@@ -852,6 +852,15 @@ idpf_set_tx_function(struct rte_eth_dev *dev)
return;
}
#endif /* CC_AVX512_SUPPORT */
+ if (tx_simd_width == RTE_VECT_SIMD_256) {
+ PMD_DRV_LOG(NOTICE,
+ "Using Split AVX2 Vector Tx (port %d).",
+ dev->data->port_id);
+ dev->tx_pkt_burst = idpf_dp_splitq_xmit_pkts_avx2;
+ dev->tx_pkt_prepare = idpf_dp_prep_pkts;
+ return;
+ }
+
}
PMD_DRV_LOG(NOTICE,
"Using Split Scalar Tx (port %d).",
--
2.34.1
^ permalink raw reply related [flat|nested] 42+ messages in thread
* [PATCH v11 3/3] doc: note on unsupported completion queue sharing
2026-01-05 6:22 ` [PATCH v11 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
2026-01-05 6:22 ` [PATCH v11 1/3] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
2026-01-05 6:22 ` [PATCH v11 2/3] net/idpf: enable AVX2 for split queue Tx Shaiq Wani
@ 2026-01-05 6:22 ` Shaiq Wani
2026-01-13 18:13 ` [PATCH v11 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Stephen Hemminger
2026-01-26 17:14 ` Bruce Richardson
4 siblings, 0 replies; 42+ messages in thread
From: Shaiq Wani @ 2026-01-05 6:22 UTC (permalink / raw)
To: dev, bruce.richardson, aman.deep.singh
Added a note in the IDPF Poll Mode Driver documentation to clarify
that sharing a completion queue among multiple TX queues serviced
by different CPU cores is not supported in split queue mode.
Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
---
doc/guides/nics/idpf.rst | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/doc/guides/nics/idpf.rst b/doc/guides/nics/idpf.rst
index b99cc18188..47e38fcb13 100644
--- a/doc/guides/nics/idpf.rst
+++ b/doc/guides/nics/idpf.rst
@@ -79,6 +79,11 @@ Runtime Configuration
Then the PMD will configure Tx queue with single queue mode.
Otherwise, split queue mode is chosen by default.
+.. note::
+
+ In split queue mode, sharing a completion queue among multiple TX queues that are
+ serviced by different CPU cores is not supported.
+
Driver compilation and testing
------------------------------
--
2.34.1
^ permalink raw reply related [flat|nested] 42+ messages in thread
* Re: [PATCH v11 0/3] net/idpf: enable AVX2 for split queue Rx/Tx
2026-01-05 6:22 ` [PATCH v11 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
` (2 preceding siblings ...)
2026-01-05 6:22 ` [PATCH v11 3/3] doc: note on unsupported completion queue sharing Shaiq Wani
@ 2026-01-13 18:13 ` Stephen Hemminger
2026-01-26 17:14 ` Bruce Richardson
4 siblings, 0 replies; 42+ messages in thread
From: Stephen Hemminger @ 2026-01-13 18:13 UTC (permalink / raw)
To: Shaiq Wani; +Cc: dev, bruce.richardson, aman.deep.singh
On Mon, 5 Jan 2026 11:52:54 +0530
Shaiq Wani <shaiq.wani@intel.com> wrote:
> In case some CPUs don't support AVX512. Enable AVX2 for them to
> get better per-core performance.
>
> In the single queue model, the same descriptor queue is used by SW
> to post descriptors to the device and used by device to report completed
> descriptors to SW. While as the split queue model separates them into
> different queues for parallel processing and improved performance.
>
> v11:
> *Fixed indentation issues.
>
> v10:
> *Changed desc reading style in Rx path.
>
> v9:
> *Some minor fixes.
>
> v8:
> *Reworked the receive func to avoid race conditions in Rx path.
> *Removed unnecessary compiler barriers.
>
> v7:
> *Removed (char) casts, ifdef block as suggested.
> *Fixed indentation, line wrapping.
> *Did blind write of data to avoid branching.
> *changed desc building and writing order for Tx.
>
> v6:
> *used single load/store in splitq_recv_pkts function.
> *removed x86-specific intrinsics from common code.
>
> v5:
> *Fixed CI errors.
> *Used defined constants instead of numbers.
>
> v4:
> *moved splitq_rearm_common to a common location.
> *reduced duplication of code.
> *fixed splitq_recv_pkts function.
>
> v3:
> *Fixed some indentation issues.
> *Collapsed wrapper and core function into one.
> *Fixed some pointer casting and naming inconsistency issues.
>
> v2:
> *Fixed CI build related issues.
> *Rebased on top of idpf/cpfl rx path selection simplication patch.
>
>
>
> Shaiq Wani (3):
> net/idpf: enable AVX2 for split queue Rx
> net/idpf: enable AVX2 for split queue Tx
> doc: note on unsupported completion queue sharing
>
> doc/guides/nics/idpf.rst | 5 +
> drivers/net/intel/idpf/idpf_common_device.h | 1 +
> drivers/net/intel/idpf/idpf_common_rxtx.c | 59 ++++
> drivers/net/intel/idpf/idpf_common_rxtx.h | 8 +
> .../net/intel/idpf/idpf_common_rxtx_avx2.c | 317 +++++++++++++++++-
> .../net/intel/idpf/idpf_common_rxtx_avx512.c | 56 ----
> drivers/net/intel/idpf/idpf_rxtx.c | 9 +
> 7 files changed, 397 insertions(+), 58 deletions(-)
>
Thanks for sticking with this patchset.
It is worth noting that AI had some review comments.
Summary
Errors (Must Fix)
None critical.
Warnings (Should Fix)
Patch 1 & 2: Commit body text is identical and doesn't specifically describe the Tx patch. Consider making the body patch-specific.
Patch 1: "While as" grammatical error in commit body → "Whereas" or "While".
Patch 2: Contains unrelated style change (moving tx_id declaration). Should be mentioned in commit message or separated.
Patch 2: Verify IDPD_TXQ_SCAN_CQ_THRESH spelling—appears it might be a typo for IDPF_.
Patch 1 & 2: Inconsistent placement of RTE_EXPORT_INTERNAL_SYMBOL() (before vs. after function).
Patch 3: "Added" is past tense; imperative "Add" is preferred.
Info (Consider)
The series is at v11, indicating significant iteration. The core implementation looks reasonable for AVX2 vectorized packet processing.
Mid-block variable declarations are used inconsistently—some at function top, some mid-block. Consider consistent placement.
Unnecessary initialization of received = 0 since it's always assigned before use.
Recommendation
Acceptable with minor revisions. The code appears functionally correct for enabling AVX2 split queue support. Address the warnings above, particularly:
Fix the "IDPD" vs "IDPF" typo if confirmed
Update commit messages to be patch-specific
Consider separating unrelated style changes
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [PATCH v11 0/3] net/idpf: enable AVX2 for split queue Rx/Tx
2026-01-05 6:22 ` [PATCH v11 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
` (3 preceding siblings ...)
2026-01-13 18:13 ` [PATCH v11 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Stephen Hemminger
@ 2026-01-26 17:14 ` Bruce Richardson
4 siblings, 0 replies; 42+ messages in thread
From: Bruce Richardson @ 2026-01-26 17:14 UTC (permalink / raw)
To: Shaiq Wani; +Cc: dev, aman.deep.singh
On Mon, Jan 05, 2026 at 11:52:54AM +0530, Shaiq Wani wrote:
> In case some CPUs don't support AVX512. Enable AVX2 for them to
> get better per-core performance.
>
> In the single queue model, the same descriptor queue is used by SW
> to post descriptors to the device and used by device to report completed
> descriptors to SW. While as the split queue model separates them into
> different queues for parallel processing and improved performance.
>
> v11:
> *Fixed indentation issues.
>
> v10:
> *Changed desc reading style in Rx path.
>
> v9:
> *Some minor fixes.
>
> v8:
> *Reworked the receive func to avoid race conditions in Rx path.
> *Removed unnecessary compiler barriers.
>
> v7:
> *Removed (char) casts, ifdef block as suggested.
> *Fixed indentation, line wrapping.
> *Did blind write of data to avoid branching.
> *changed desc building and writing order for Tx.
>
> v6:
> *used single load/store in splitq_recv_pkts function.
> *removed x86-specific intrinsics from common code.
>
> v5:
> *Fixed CI errors.
> *Used defined constants instead of numbers.
>
> v4:
> *moved splitq_rearm_common to a common location.
> *reduced duplication of code.
> *fixed splitq_recv_pkts function.
>
> v3:
> *Fixed some indentation issues.
> *Collapsed wrapper and core function into one.
> *Fixed some pointer casting and naming inconsistency issues.
>
> v2:
> *Fixed CI build related issues.
> *Rebased on top of idpf/cpfl rx path selection simplication patch.
>
>
>
> Shaiq Wani (3):
> net/idpf: enable AVX2 for split queue Rx
> net/idpf: enable AVX2 for split queue Tx
> doc: note on unsupported completion queue sharing
>
> doc/guides/nics/idpf.rst | 5 +
> drivers/net/intel/idpf/idpf_common_device.h | 1 +
> drivers/net/intel/idpf/idpf_common_rxtx.c | 59 ++++
> drivers/net/intel/idpf/idpf_common_rxtx.h | 8 +
> .../net/intel/idpf/idpf_common_rxtx_avx2.c | 317 +++++++++++++++++-
> .../net/intel/idpf/idpf_common_rxtx_avx512.c | 56 ----
> drivers/net/intel/idpf/idpf_rxtx.c | 9 +
> 7 files changed, 397 insertions(+), 58 deletions(-)
>
Beyond the AI review comments posted by Stephen, which does seem to flag
some issues that could do with being addressed, this set (mainly patch 2)
also needs a rebase to apply cleanly on top of latest next-net-intel tree.
It's missing support for the new Tx path selection logic.
Also, it would be good to have a release note entry for the new AVX2 code
paths added in either patch 1 or patch 2.
Thanks,
/Bruce
^ permalink raw reply [flat|nested] 42+ messages in thread
* [PATCH v12 0/3] net/idpf: enable AVX2 for split queue Rx/Tx
2025-09-17 5:26 [PATCH 0/2] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
` (7 preceding siblings ...)
2026-01-05 6:22 ` [PATCH v11 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
@ 2026-02-24 7:47 ` Shaiq Wani
2026-02-24 7:47 ` [PATCH v12 1/3] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
` (2 more replies)
2026-02-26 6:52 ` [PATCH v13 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
2026-02-26 9:42 ` [PATCH v14 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
10 siblings, 3 replies; 42+ messages in thread
From: Shaiq Wani @ 2026-02-24 7:47 UTC (permalink / raw)
To: dev, bruce.richardson, aman.deep.singh
In case some CPUs don't support AVX512, enable AVX2 for them to
get better per-core performance.
In the single queue model, the same descriptor queue is used by SW
to post descriptors to the device and used by device to report completed
descriptors to SW, whereas the split queue model separates them into
different queues for parallel processing and improved performance.
v12:
*Addressed AI generated comments.
*Rebased on latest next-net-intel tree.
*Included a release note entry.
v11:
*Fixed indentation issues.
v10:
*Changed desc reading style in Rx path.
v9:
*Some minor fixes.
v8:
*Reworked the receive func to avoid race conditions in Rx path.
*Removed unnecessary compiler barriers.
v7:
*Removed (char) casts, ifdef block as suggested.
*Fixed indentation, line wrapping.
*Did blind write of data to avoid branching.
*changed desc building and writing order for Tx.
v6:
*used single load/store in splitq_recv_pkts function.
*removed x86-specific intrinsics from common code.
v5:
*Fixed CI errors.
*Used defined constants instead of numbers.
v4:
*moved splitq_rearm_common to a common location.
*reduced duplication of code.
*fixed splitq_recv_pkts function.
v3:
*Fixed some indentation issues.
*Collapsed wrapper and core function into one.
*Fixed some pointer casting and naming inconsistency issues.
v2:
*Fixed CI build related issues.
*Rebased on top of idpf/cpfl rx path selection simplification patch.
Shaiq Wani (3):
net/idpf: enable AVX2 for split queue Rx
net/idpf: enable AVX2 for split queue Tx
doc: add note on unsupported completion queue sharing
doc/guides/nics/idpf.rst | 5 +
doc/guides/rel_notes/release_26_03.rst | 4 +
drivers/net/intel/idpf/idpf_common_device.h | 2 +
drivers/net/intel/idpf/idpf_common_rxtx.c | 67 ++++
drivers/net/intel/idpf/idpf_common_rxtx.h | 8 +
.../net/intel/idpf/idpf_common_rxtx_avx2.c | 311 ++++++++++++++++++
.../net/intel/idpf/idpf_common_rxtx_avx512.c | 56 ----
7 files changed, 397 insertions(+), 56 deletions(-)
--
2.34.1
^ permalink raw reply [flat|nested] 42+ messages in thread
* [PATCH v12 1/3] net/idpf: enable AVX2 for split queue Rx
2026-02-24 7:47 ` [PATCH v12 " Shaiq Wani
@ 2026-02-24 7:47 ` Shaiq Wani
2026-02-24 14:27 ` Bruce Richardson
2026-02-24 7:47 ` [PATCH v12 2/3] net/idpf: enable AVX2 for split queue Tx Shaiq Wani
2026-02-24 7:47 ` [PATCH v12 3/3] doc: add note on unsupported completion queue sharing Shaiq Wani
2 siblings, 1 reply; 42+ messages in thread
From: Shaiq Wani @ 2026-02-24 7:47 UTC (permalink / raw)
To: dev, bruce.richardson, aman.deep.singh
Add AVX2 vectorized split queue Rx path.
In case some CPUs don't support AVX512, enable AVX2 for them to
get better per-core performance.
Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
---
drivers/net/intel/idpf/idpf_common_device.h | 1 +
drivers/net/intel/idpf/idpf_common_rxtx.c | 59 ++++++++
drivers/net/intel/idpf/idpf_common_rxtx.h | 5 +
.../net/intel/idpf/idpf_common_rxtx_avx2.c | 138 ++++++++++++++++++
.../net/intel/idpf/idpf_common_rxtx_avx512.c | 56 -------
5 files changed, 203 insertions(+), 56 deletions(-)
diff --git a/drivers/net/intel/idpf/idpf_common_device.h b/drivers/net/intel/idpf/idpf_common_device.h
index bbc969c734..1424046a16 100644
--- a/drivers/net/intel/idpf/idpf_common_device.h
+++ b/drivers/net/intel/idpf/idpf_common_device.h
@@ -70,6 +70,7 @@ enum idpf_rx_func_type {
IDPF_RX_SINGLEQ,
IDPF_RX_SINGLEQ_SCATTERED,
IDPF_RX_SINGLEQ_AVX2,
+ IDPF_RX_AVX2,
IDPF_RX_AVX512,
IDPF_RX_SINGLEQ_AVX512,
IDPF_RX_MAX
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.c b/drivers/net/intel/idpf/idpf_common_rxtx.c
index b8f6418d4a..ead31fd0f8 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.c
@@ -253,6 +253,58 @@ idpf_qc_split_tx_complq_reset(struct ci_tx_queue *cq)
cq->expected_gen_id = 1;
}
+RTE_EXPORT_INTERNAL_SYMBOL(idpf_splitq_rearm_common)
+void
+idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq)
+{
+ struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
+ volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
+ uint16_t rx_id;
+ int i;
+
+ rxdp += rx_bufq->rxrearm_start;
+
+ /* Pull 'n' more MBUFs into the software ring */
+ if (rte_mbuf_raw_alloc_bulk(rx_bufq->mp,
+ (void *)rxp, IDPF_RXQ_REARM_THRESH) < 0) {
+ if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
+ rx_bufq->nb_rx_desc) {
+ for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
+ rxp[i] = &rx_bufq->fake_mbuf;
+ rxdp[i] = (union virtchnl2_rx_buf_desc){0};
+ }
+ }
+ rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
+ IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
+ return;
+ }
+
+ /* Initialize the mbufs in vector, process 8 mbufs in one loop */
+ for (i = 0; i < IDPF_RXQ_REARM_THRESH;
+ i += 8, rxp += 8, rxdp += 8) {
+ rxdp[0].split_rd.pkt_addr = rxp[0]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[1].split_rd.pkt_addr = rxp[1]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[2].split_rd.pkt_addr = rxp[2]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[3].split_rd.pkt_addr = rxp[3]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[4].split_rd.pkt_addr = rxp[4]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[5].split_rd.pkt_addr = rxp[5]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[6].split_rd.pkt_addr = rxp[6]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[7].split_rd.pkt_addr = rxp[7]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ }
+
+ rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
+ if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
+ rx_bufq->rxrearm_start = 0;
+
+ rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
+
+ rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
+ (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
+
+ /* Update the tail pointer on the NIC */
+ IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
+}
+
RTE_EXPORT_INTERNAL_SYMBOL(idpf_qc_single_tx_queue_reset)
void
idpf_qc_single_tx_queue_reset(struct ci_tx_queue *txq)
@@ -1506,6 +1558,13 @@ const struct ci_rx_path_info idpf_rx_path_infos[] = {
.rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
.simd_width = RTE_VECT_SIMD_256,
.single_queue = true}},
+ [IDPF_RX_AVX2] = {
+ .pkt_burst = idpf_dp_splitq_recv_pkts_avx2,
+ .info = "Split AVX2 Vector",
+ .features = {
+ .rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
+ .simd_width = RTE_VECT_SIMD_256,
+ }},
#ifdef CC_AVX512_SUPPORT
[IDPF_RX_AVX512] = {
.pkt_burst = idpf_dp_splitq_recv_pkts_avx512,
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.h b/drivers/net/intel/idpf/idpf_common_rxtx.h
index 914cab0f25..256e9ff54c 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.h
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.h
@@ -197,6 +197,8 @@ void idpf_qc_split_tx_descq_reset(struct ci_tx_queue *txq);
__rte_internal
void idpf_qc_split_tx_complq_reset(struct ci_tx_queue *cq);
__rte_internal
+void idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq);
+__rte_internal
void idpf_qc_single_tx_queue_reset(struct ci_tx_queue *txq);
__rte_internal
void idpf_qc_rx_queue_release(void *rxq);
@@ -249,6 +251,9 @@ __rte_internal
uint16_t idpf_dp_splitq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
uint16_t nb_pkts);
__rte_internal
+uint16_t idpf_dp_splitq_recv_pkts_avx2(void *rxq, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts);
+__rte_internal
uint16_t idpf_dp_singleq_recv_scatter_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
uint16_t nb_pkts);
__rte_internal
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
index e228b72fa5..c2f41db9f6 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
@@ -482,6 +482,144 @@ idpf_dp_singleq_recv_pkts_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, uint16
return _idpf_singleq_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts);
}
+RTE_EXPORT_INTERNAL_SYMBOL(idpf_dp_splitq_recv_pkts_avx2)
+uint16_t
+idpf_dp_splitq_recv_pkts_avx2(void *rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+ struct idpf_rx_queue *queue = (struct idpf_rx_queue *)rxq;
+ const uint32_t *ptype_tbl = queue->adapter->ptype_tbl;
+ struct rte_mbuf **sw_ring = &queue->bufq2->sw_ring[queue->rx_tail];
+ volatile union virtchnl2_rx_desc *rxdp =
+ (volatile union virtchnl2_rx_desc *)queue->rx_ring + queue->rx_tail;
+ const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0, queue->mbuf_initializer);
+ uint64_t head_gen;
+ uint16_t received = 0;
+ int i;
+
+ /* Shuffle mask: picks fields from each 16-byte descriptor pair into the
+ * layout that will be merged into mbuf->rearm_data candidates.
+ */
+ const __m256i shuf = _mm256_set_epi8(
+ /* high 128 bits (desc 3 then desc 2 lanes) */
+ 0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4,
+ 0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF,
+ /* low 128 bits (desc 1 then desc 0 lanes) */
+ 0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4,
+ 0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF
+ );
+
+ /* mask that clears bits 14 and 15 of the packet length word */
+ const __m256i len_mask = _mm256_set_epi32(
+ 0xffffffff, 0xffffffff, 0xffff3fff, 0xffffffff,
+ 0xffffffff, 0xffffffff, 0xffff3fff, 0xffffffff
+ );
+
+ const __m256i ptype_mask = _mm256_set1_epi16(VIRTCHNL2_RX_FLEX_DESC_PTYPE_M);
+
+ rte_prefetch0(rxdp);
+ nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, 4); /* 4 desc per AVX2 iteration */
+
+ if (queue->bufq2->rxrearm_nb > IDPF_RXQ_REARM_THRESH)
+ idpf_splitq_rearm_common(queue->bufq2);
+
+ /* head gen check */
+ head_gen = rxdp->flex_adv_nic_3_wb.pktlen_gen_bufq_id;
+ if (((head_gen >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
+ VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) != queue->expected_gen_id)
+ return 0;
+
+ for (i = nb_pkts; i >= IDPF_VPMD_DESCS_PER_LOOP; i -= IDPF_VPMD_DESCS_PER_LOOP) {
+ rxdp -= IDPF_VPMD_DESCS_PER_LOOP;
+
+ uint64_t g3 = rxdp[3].flex_adv_nic_3_wb.pktlen_gen_bufq_id;
+ uint64_t g2 = rxdp[2].flex_adv_nic_3_wb.pktlen_gen_bufq_id;
+ uint64_t g1 = rxdp[1].flex_adv_nic_3_wb.pktlen_gen_bufq_id;
+ uint64_t g0 = rxdp[0].flex_adv_nic_3_wb.pktlen_gen_bufq_id;
+
+ /* Extract DD bits */
+ bool dd3 = (g3 & 1ULL) != 0ULL;
+ bool dd2 = (g2 & 1ULL) != 0ULL;
+ bool dd1 = (g1 & 1ULL) != 0ULL;
+ bool dd0 = (g0 & 1ULL) != 0ULL;
+
+ /* Extract generation bits */
+ uint64_t gen3 = (g3 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
+ VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M;
+ uint64_t gen2 = (g2 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
+ VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M;
+ uint64_t gen1 = (g1 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
+ VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M;
+ uint64_t gen0 = (g0 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
+ VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M;
+
+ /* Validate descriptors */
+ bool valid3 = dd3 && (gen3 == queue->expected_gen_id);
+ bool valid2 = dd2 && (gen2 == queue->expected_gen_id);
+ bool valid1 = dd1 && (gen1 == queue->expected_gen_id);
+ bool valid0 = dd0 && (gen0 == queue->expected_gen_id);
+
+ if (!(valid0 && valid1 && valid2 && valid3))
+ break;
+
+ /* copy mbuf pointers */
+ memcpy(&rx_pkts[i - IDPF_VPMD_DESCS_PER_LOOP],
+ &sw_ring[i - IDPF_VPMD_DESCS_PER_LOOP],
+ sizeof(rx_pkts[0]) * IDPF_VPMD_DESCS_PER_LOOP);
+
+ __m128i d3 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[3]));
+ __m128i d2 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[2]));
+ __m128i d1 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[1]));
+ __m128i d0 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[0]));
+
+ __m256i d23 = _mm256_set_m128i(d3, d2);
+ __m256i d01 = _mm256_set_m128i(d1, d0);
+
+ /* mask length and shuffle to build mbuf rearm data */
+ __m256i desc01 = _mm256_and_si256(d01, len_mask);
+ __m256i desc23 = _mm256_and_si256(d23, len_mask);
+ __m256i mb10 = _mm256_shuffle_epi8(desc01, shuf);
+ __m256i mb32 = _mm256_shuffle_epi8(desc23, shuf);
+
+ /* Extract ptypes */
+ __m256i pt10 = _mm256_and_si256(d01, ptype_mask);
+ __m256i pt32 = _mm256_and_si256(d23, ptype_mask);
+
+ uint16_t ptype0 = (uint16_t)_mm256_extract_epi16(pt10, 1);
+ uint16_t ptype1 = (uint16_t)_mm256_extract_epi16(pt10, 9);
+ uint16_t ptype2 = (uint16_t)_mm256_extract_epi16(pt32, 1);
+ uint16_t ptype3 = (uint16_t)_mm256_extract_epi16(pt32, 9);
+
+ mb10 = _mm256_insert_epi32(mb10, (int)ptype_tbl[ptype1], 2);
+ mb10 = _mm256_insert_epi32(mb10, (int)ptype_tbl[ptype0], 0);
+ mb32 = _mm256_insert_epi32(mb32, (int)ptype_tbl[ptype3], 2);
+ mb32 = _mm256_insert_epi32(mb32, (int)ptype_tbl[ptype2], 0);
+
+ /* Build rearm data for each mbuf */
+ __m256i rearm0 = _mm256_permute2f128_si256(mbuf_init, mb10, 0x20);
+ __m256i rearm1 = _mm256_blend_epi32(mbuf_init, mb10, 0xF0);
+ __m256i rearm2 = _mm256_permute2f128_si256(mbuf_init, mb32, 0x20);
+ __m256i rearm3 = _mm256_blend_epi32(mbuf_init, mb32, 0xF0);
+
+ /* Write out mbuf rearm data */
+ _mm256_storeu_si256((__m256i *)&rx_pkts[i - 1]->rearm_data, rearm3);
+ _mm256_storeu_si256((__m256i *)&rx_pkts[i - 2]->rearm_data, rearm2);
+ _mm256_storeu_si256((__m256i *)&rx_pkts[i - 3]->rearm_data, rearm1);
+ _mm256_storeu_si256((__m256i *)&rx_pkts[i - 4]->rearm_data, rearm0);
+
+ received += IDPF_VPMD_DESCS_PER_LOOP;
+ }
+
+ queue->rx_tail += received;
+ queue->expected_gen_id ^= ((queue->rx_tail & queue->nb_rx_desc) != 0);
+ queue->rx_tail &= (queue->nb_rx_desc - 1);
+ if ((queue->rx_tail & 1) == 1 && received > 1) {
+ queue->rx_tail--;
+ received--;
+ }
+ queue->bufq2->rxrearm_nb += received;
+ return received;
+}
+
static inline void
idpf_singleq_vtx1(volatile struct ci_tx_desc *txdp,
struct rte_mbuf *pkt, uint64_t flags)
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c
index fe870617bc..eda5f929cf 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c
@@ -540,62 +540,6 @@ idpf_dp_singleq_recv_pkts_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
return _idpf_singleq_recv_raw_pkts_avx512(rx_queue, rx_pkts, nb_pkts);
}
-static __rte_always_inline void
-idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq)
-{
- struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
- volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
- uint16_t rx_id;
- int i;
-
- rxdp += rx_bufq->rxrearm_start;
-
- /* Pull 'n' more MBUFs into the software ring */
- if (rte_mbuf_raw_alloc_bulk(rx_bufq->mp,
- (void *)rxp,
- IDPF_RXQ_REARM_THRESH) < 0) {
- if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
- rx_bufq->nb_rx_desc) {
- __m128i dma_addr0;
-
- dma_addr0 = _mm_setzero_si128();
- for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
- rxp[i] = &rx_bufq->fake_mbuf;
- _mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp[i]),
- dma_addr0);
- }
- }
- rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
- IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
- return;
- }
-
- /* Initialize the mbufs in vector, process 8 mbufs in one loop */
- for (i = 0; i < IDPF_RXQ_REARM_THRESH;
- i += 8, rxp += 8, rxdp += 8) {
- rxdp[0].split_rd.pkt_addr = rxp[0]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[1].split_rd.pkt_addr = rxp[1]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[2].split_rd.pkt_addr = rxp[2]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[3].split_rd.pkt_addr = rxp[3]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[4].split_rd.pkt_addr = rxp[4]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[5].split_rd.pkt_addr = rxp[5]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[6].split_rd.pkt_addr = rxp[6]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[7].split_rd.pkt_addr = rxp[7]->buf_iova + RTE_PKTMBUF_HEADROOM;
- }
-
- rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
- if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
- rx_bufq->rxrearm_start = 0;
-
- rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
-
- rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
- (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
-
- /* Update the tail pointer on the NIC */
- IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
-}
-
static __rte_always_inline void
idpf_splitq_rearm(struct idpf_rx_queue *rx_bufq)
{
--
2.34.1
^ permalink raw reply related [flat|nested] 42+ messages in thread
* [PATCH v12 2/3] net/idpf: enable AVX2 for split queue Tx
2026-02-24 7:47 ` [PATCH v12 " Shaiq Wani
2026-02-24 7:47 ` [PATCH v12 1/3] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
@ 2026-02-24 7:47 ` Shaiq Wani
2026-02-24 7:47 ` [PATCH v12 3/3] doc: add note on unsupported completion queue sharing Shaiq Wani
2 siblings, 0 replies; 42+ messages in thread
From: Shaiq Wani @ 2026-02-24 7:47 UTC (permalink / raw)
To: dev, bruce.richardson, aman.deep.singh
Add AVX2 vectorized split queue Tx path with
completion queue scanning support.
Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
---
doc/guides/rel_notes/release_26_03.rst | 4 +
drivers/net/intel/idpf/idpf_common_device.h | 1 +
drivers/net/intel/idpf/idpf_common_rxtx.c | 8 +
drivers/net/intel/idpf/idpf_common_rxtx.h | 3 +
.../net/intel/idpf/idpf_common_rxtx_avx2.c | 173 ++++++++++++++++++
5 files changed, 189 insertions(+)
diff --git a/doc/guides/rel_notes/release_26_03.rst b/doc/guides/rel_notes/release_26_03.rst
index b4499ec066..2c78b62632 100644
--- a/doc/guides/rel_notes/release_26_03.rst
+++ b/doc/guides/rel_notes/release_26_03.rst
@@ -77,6 +77,10 @@ New Features
* Added support for pre and post VF reset callbacks.
+* **Updated Intel idpf driver.**
+
+ * Added AVX2 vectorized split queue Rx and Tx paths.
+
* **Updated Marvell cnxk net driver.**
* Added out-of-place support for CN20K SoC.
diff --git a/drivers/net/intel/idpf/idpf_common_device.h b/drivers/net/intel/idpf/idpf_common_device.h
index 1424046a16..6f3dfbc3fc 100644
--- a/drivers/net/intel/idpf/idpf_common_device.h
+++ b/drivers/net/intel/idpf/idpf_common_device.h
@@ -81,6 +81,7 @@ enum idpf_tx_func_type {
IDPF_TX_SINGLEQ,
IDPF_TX_SINGLEQ_SIMPLE,
IDPF_TX_SINGLEQ_AVX2,
+ IDPF_TX_AVX2,
IDPF_TX_AVX512,
IDPF_TX_SINGLEQ_AVX512,
/* Need a max value defined as array values in are defined
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.c b/drivers/net/intel/idpf/idpf_common_rxtx.c
index ead31fd0f8..8e964bef5c 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.c
@@ -1620,6 +1620,14 @@ const struct ci_tx_path_info idpf_tx_path_infos[] = {
.single_queue = true
}
},
+ [IDPF_TX_AVX2] = {
+ .pkt_burst = idpf_dp_splitq_xmit_pkts_avx2,
+ .info = "Split AVX2",
+ .features = {
+ .tx_offloads = IDPF_TX_VECTOR_OFFLOADS,
+ .simd_width = RTE_VECT_SIMD_256
+ }
+ },
#ifdef CC_AVX512_SUPPORT
[IDPF_TX_AVX512] = {
.pkt_burst = idpf_dp_splitq_xmit_pkts_avx512,
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.h b/drivers/net/intel/idpf/idpf_common_rxtx.h
index 256e9ff54c..2c4ebb21e4 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.h
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.h
@@ -257,6 +257,9 @@ __rte_internal
uint16_t idpf_dp_singleq_recv_scatter_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
uint16_t nb_pkts);
__rte_internal
+uint16_t idpf_dp_splitq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts);
+__rte_internal
uint16_t idpf_dp_singleq_recv_pkts_avx2(void *rx_queue,
struct rte_mbuf **rx_pkts,
uint16_t nb_pkts);
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
index c2f41db9f6..076c7cfd6a 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
@@ -767,3 +767,176 @@ idpf_dp_singleq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
return nb_tx;
}
+
+static __rte_always_inline void
+idpf_splitq_scan_cq_ring(struct ci_tx_queue *cq)
+{
+ struct idpf_splitq_tx_compl_desc *compl_ring;
+ struct ci_tx_queue *txq;
+ uint16_t genid, txq_qid, cq_qid, i;
+ uint8_t ctype;
+
+ cq_qid = cq->tx_tail;
+
+ for (i = 0; i < IDPD_TXQ_SCAN_CQ_THRESH; i++) {
+ if (cq_qid == cq->nb_tx_desc) {
+ cq_qid = 0;
+ cq->expected_gen_id ^= 1; /* toggle generation bit */
+ }
+
+ compl_ring = &cq->compl_ring[cq_qid];
+
+ genid = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
+ IDPF_TXD_COMPLQ_GEN_M) >> IDPF_TXD_COMPLQ_GEN_S;
+
+ if (genid != cq->expected_gen_id)
+ break;
+
+ ctype = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
+ IDPF_TXD_COMPLQ_COMPL_TYPE_M) >> IDPF_TXD_COMPLQ_COMPL_TYPE_S;
+ txq_qid = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
+ IDPF_TXD_COMPLQ_QID_M) >> IDPF_TXD_COMPLQ_QID_S;
+
+ txq = cq->txqs[txq_qid - cq->tx_start_qid];
+ if (ctype == IDPF_TXD_COMPLT_RS)
+ txq->rs_compl_count++;
+
+ cq_qid++;
+ }
+
+ cq->tx_tail = cq_qid;
+}
+
+static __rte_always_inline void
+idpf_splitq_vtx1_avx2(struct idpf_flex_tx_sched_desc *txdp,
+ struct rte_mbuf *pkt, uint64_t flags)
+{
+ uint64_t high_qw =
+ IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE |
+ ((uint64_t)flags) |
+ ((uint64_t)pkt->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+
+ __m128i descriptor = _mm_set_epi64x(high_qw,
+ pkt->buf_iova + pkt->data_off);
+ _mm_storeu_si128((__m128i *)txdp, descriptor);
+}
+
+static inline void
+idpf_splitq_vtx_avx2(struct idpf_flex_tx_sched_desc *txdp,
+ struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags)
+{
+ const uint64_t hi_qw_tmpl = IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE | ((uint64_t)flags);
+
+ /* align if needed */
+ if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
+ idpf_splitq_vtx1_avx2(txdp, *pkt, flags);
+ txdp++, pkt++, nb_pkts--;
+ }
+
+ for (; nb_pkts >= IDPF_VPMD_DESCS_PER_LOOP; txdp += IDPF_VPMD_DESCS_PER_LOOP,
+ pkt += IDPF_VPMD_DESCS_PER_LOOP, nb_pkts -= IDPF_VPMD_DESCS_PER_LOOP) {
+ uint64_t hi_qw0 = hi_qw_tmpl |
+ ((uint64_t)pkt[0]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+ uint64_t hi_qw1 = hi_qw_tmpl |
+ ((uint64_t)pkt[1]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+ uint64_t hi_qw2 = hi_qw_tmpl |
+ ((uint64_t)pkt[2]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+ uint64_t hi_qw3 = hi_qw_tmpl |
+ ((uint64_t)pkt[3]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+
+ __m256i desc0_1 = _mm256_set_epi64x(hi_qw1,
+ pkt[1]->buf_iova + pkt[1]->data_off,
+ hi_qw0,
+ pkt[0]->buf_iova + pkt[0]->data_off);
+ __m256i desc2_3 = _mm256_set_epi64x(hi_qw3,
+ pkt[3]->buf_iova + pkt[3]->data_off,
+ hi_qw2,
+ pkt[2]->buf_iova + pkt[2]->data_off);
+
+ _mm256_storeu_si256((__m256i *)(txdp + 0), desc0_1);
+ _mm256_storeu_si256((__m256i *)(txdp + 2), desc2_3);
+ }
+
+ while (nb_pkts--) {
+ idpf_splitq_vtx1_avx2(txdp, *pkt, flags);
+ txdp++;
+ pkt++;
+ }
+}
+
+static inline uint16_t
+idpf_splitq_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+ struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
+ struct idpf_flex_tx_sched_desc *txdp;
+ struct ci_tx_entry_vec *txep;
+ uint16_t n, nb_commit;
+ uint64_t cmd_dtype = IDPF_TXD_FLEX_FLOW_CMD_EOP;
+ uint16_t tx_id = txq->tx_tail;
+
+ nb_commit = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
+ nb_pkts = nb_commit;
+ if (unlikely(nb_pkts == 0))
+ return 0;
+
+ txdp = (struct idpf_flex_tx_sched_desc *)&txq->desc_ring[tx_id];
+ txep = &txq->sw_ring_vec[tx_id];
+
+ txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
+
+ n = (uint16_t)(txq->nb_tx_desc - tx_id);
+ if (nb_commit >= n) {
+ ci_tx_backlog_entry_vec(txep, tx_pkts, n);
+
+ idpf_splitq_vtx_avx2(txdp, tx_pkts, n - 1, cmd_dtype);
+ tx_pkts += (n - 1);
+ txdp += (n - 1);
+
+ idpf_splitq_vtx1_avx2(txdp, *tx_pkts++, cmd_dtype);
+
+ nb_commit = (uint16_t)(nb_commit - n);
+ tx_id = 0;
+
+ txdp = &txq->desc_ring[tx_id];
+ txep = (void *)txq->sw_ring;
+ }
+
+ ci_tx_backlog_entry_vec(txep, tx_pkts, nb_commit);
+
+ idpf_splitq_vtx_avx2(txdp, tx_pkts, nb_commit, cmd_dtype);
+
+ tx_id = (uint16_t)(tx_id + nb_commit);
+ txq->tx_tail = tx_id;
+
+ IDPF_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+
+ return nb_pkts;
+}
+
+RTE_EXPORT_INTERNAL_SYMBOL(idpf_dp_splitq_xmit_pkts_avx2)
+uint16_t
+idpf_dp_splitq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts)
+{
+ struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
+ uint16_t nb_tx = 0;
+
+ while (nb_pkts) {
+ uint16_t ret, num;
+ idpf_splitq_scan_cq_ring(txq->complq);
+
+ if (txq->rs_compl_count > txq->tx_free_thresh) {
+ ci_tx_free_bufs_vec(txq, idpf_tx_desc_done, false);
+ txq->rs_compl_count -= txq->tx_rs_thresh;
+ }
+
+ num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+ ret = idpf_splitq_xmit_fixed_burst_vec_avx2(tx_queue, &tx_pkts[nb_tx], num);
+ nb_tx += ret;
+ nb_pkts -= ret;
+ if (ret < num)
+ break;
+ }
+
+ return nb_tx;
+}
--
2.34.1
^ permalink raw reply related [flat|nested] 42+ messages in thread
* [PATCH v12 3/3] doc: add note on unsupported completion queue sharing
2026-02-24 7:47 ` [PATCH v12 " Shaiq Wani
2026-02-24 7:47 ` [PATCH v12 1/3] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
2026-02-24 7:47 ` [PATCH v12 2/3] net/idpf: enable AVX2 for split queue Tx Shaiq Wani
@ 2026-02-24 7:47 ` Shaiq Wani
2 siblings, 0 replies; 42+ messages in thread
From: Shaiq Wani @ 2026-02-24 7:47 UTC (permalink / raw)
To: dev, bruce.richardson, aman.deep.singh
Added a note in the IDPF Poll Mode Driver documentation to clarify
that sharing a completion queue among multiple TX queues serviced
by different CPU cores is not supported in split queue mode.
Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
---
doc/guides/nics/idpf.rst | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/doc/guides/nics/idpf.rst b/doc/guides/nics/idpf.rst
index b99cc18188..47e38fcb13 100644
--- a/doc/guides/nics/idpf.rst
+++ b/doc/guides/nics/idpf.rst
@@ -79,6 +79,11 @@ Runtime Configuration
Then the PMD will configure Tx queue with single queue mode.
Otherwise, split queue mode is chosen by default.
+.. note::
+
+ In split queue mode, sharing a completion queue among multiple TX queues that are
+ serviced by different CPU cores is not supported.
+
Driver compilation and testing
------------------------------
--
2.34.1
^ permalink raw reply related [flat|nested] 42+ messages in thread
* Re: [PATCH v12 1/3] net/idpf: enable AVX2 for split queue Rx
2026-02-24 7:47 ` [PATCH v12 1/3] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
@ 2026-02-24 14:27 ` Bruce Richardson
0 siblings, 0 replies; 42+ messages in thread
From: Bruce Richardson @ 2026-02-24 14:27 UTC (permalink / raw)
To: Shaiq Wani; +Cc: dev, aman.deep.singh
On Tue, Feb 24, 2026 at 01:17:24PM +0530, Shaiq Wani wrote:
> Add AVX2 vectorized split queue Rx path.
> In case some CPUs don't support AVX512. Enable AVX2 for them to
> get better per-core performance.
>
> Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
> ---
> drivers/net/intel/idpf/idpf_common_device.h | 1 +
> drivers/net/intel/idpf/idpf_common_rxtx.c | 59 ++++++++
> drivers/net/intel/idpf/idpf_common_rxtx.h | 5 +
> .../net/intel/idpf/idpf_common_rxtx_avx2.c | 138 ++++++++++++++++++
> .../net/intel/idpf/idpf_common_rxtx_avx512.c | 56 -------
> 5 files changed, 203 insertions(+), 56 deletions(-)
>
> diff --git a/drivers/net/intel/idpf/idpf_common_device.h b/drivers/net/intel/idpf/idpf_common_device.h
> index bbc969c734..1424046a16 100644
> --- a/drivers/net/intel/idpf/idpf_common_device.h
> +++ b/drivers/net/intel/idpf/idpf_common_device.h
> @@ -70,6 +70,7 @@ enum idpf_rx_func_type {
> IDPF_RX_SINGLEQ,
> IDPF_RX_SINGLEQ_SCATTERED,
> IDPF_RX_SINGLEQ_AVX2,
> + IDPF_RX_AVX2,
> IDPF_RX_AVX512,
> IDPF_RX_SINGLEQ_AVX512,
> IDPF_RX_MAX
> diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.c b/drivers/net/intel/idpf/idpf_common_rxtx.c
> index b8f6418d4a..ead31fd0f8 100644
> --- a/drivers/net/intel/idpf/idpf_common_rxtx.c
> +++ b/drivers/net/intel/idpf/idpf_common_rxtx.c
> @@ -253,6 +253,58 @@ idpf_qc_split_tx_complq_reset(struct ci_tx_queue *cq)
> cq->expected_gen_id = 1;
> }
>
> +RTE_EXPORT_INTERNAL_SYMBOL(idpf_splitq_rearm_common)
> +void
> +idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq)
> +{
> + struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
> + volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
> + uint16_t rx_id;
> + int i;
> +
> + rxdp += rx_bufq->rxrearm_start;
> +
> + /* Pull 'n' more MBUFs into the software ring */
> + if (rte_mbuf_raw_alloc_bulk(rx_bufq->mp,
> + (void *)rxp, IDPF_RXQ_REARM_THRESH) < 0) {
> + if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
> + rx_bufq->nb_rx_desc) {
> + for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
> + rxp[i] = &rx_bufq->fake_mbuf;
> + rxdp[i] = (union virtchnl2_rx_buf_desc){0};
> + }
> + }
> + rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
> + IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
> + return;
> + }
> +
> + /* Initialize the mbufs in vector, process 8 mbufs in one loop */
> + for (i = 0; i < IDPF_RXQ_REARM_THRESH;
> + i += 8, rxp += 8, rxdp += 8) {
> + rxdp[0].split_rd.pkt_addr = rxp[0]->buf_iova + RTE_PKTMBUF_HEADROOM;
> + rxdp[1].split_rd.pkt_addr = rxp[1]->buf_iova + RTE_PKTMBUF_HEADROOM;
> + rxdp[2].split_rd.pkt_addr = rxp[2]->buf_iova + RTE_PKTMBUF_HEADROOM;
> + rxdp[3].split_rd.pkt_addr = rxp[3]->buf_iova + RTE_PKTMBUF_HEADROOM;
> + rxdp[4].split_rd.pkt_addr = rxp[4]->buf_iova + RTE_PKTMBUF_HEADROOM;
> + rxdp[5].split_rd.pkt_addr = rxp[5]->buf_iova + RTE_PKTMBUF_HEADROOM;
> + rxdp[6].split_rd.pkt_addr = rxp[6]->buf_iova + RTE_PKTMBUF_HEADROOM;
> + rxdp[7].split_rd.pkt_addr = rxp[7]->buf_iova + RTE_PKTMBUF_HEADROOM;
> + }
> +
> + rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
> + if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
> + rx_bufq->rxrearm_start = 0;
> +
> + rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
> +
> + rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
> + (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
> +
> + /* Update the tail pointer on the NIC */
> + IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
> +}
> +
> RTE_EXPORT_INTERNAL_SYMBOL(idpf_qc_single_tx_queue_reset)
> void
> idpf_qc_single_tx_queue_reset(struct ci_tx_queue *txq)
> @@ -1506,6 +1558,13 @@ const struct ci_rx_path_info idpf_rx_path_infos[] = {
> .rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
> .simd_width = RTE_VECT_SIMD_256,
> .single_queue = true}},
> + [IDPF_RX_AVX2] = {
> + .pkt_burst = idpf_dp_splitq_recv_pkts_avx2,
> + .info = "Split AVX2 Vector",
> + .features = {
> + .rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
> + .simd_width = RTE_VECT_SIMD_256,
> + }},
> #ifdef CC_AVX512_SUPPORT
> [IDPF_RX_AVX512] = {
> .pkt_burst = idpf_dp_splitq_recv_pkts_avx512,
> diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.h b/drivers/net/intel/idpf/idpf_common_rxtx.h
> index 914cab0f25..256e9ff54c 100644
> --- a/drivers/net/intel/idpf/idpf_common_rxtx.h
> +++ b/drivers/net/intel/idpf/idpf_common_rxtx.h
> @@ -197,6 +197,8 @@ void idpf_qc_split_tx_descq_reset(struct ci_tx_queue *txq);
> __rte_internal
> void idpf_qc_split_tx_complq_reset(struct ci_tx_queue *cq);
> __rte_internal
> +void idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq);
> +__rte_internal
> void idpf_qc_single_tx_queue_reset(struct ci_tx_queue *txq);
> __rte_internal
> void idpf_qc_rx_queue_release(void *rxq);
> @@ -249,6 +251,9 @@ __rte_internal
> uint16_t idpf_dp_splitq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
> uint16_t nb_pkts);
> __rte_internal
> +uint16_t idpf_dp_splitq_recv_pkts_avx2(void *rxq, struct rte_mbuf **rx_pkts,
> + uint16_t nb_pkts);
> +__rte_internal
> uint16_t idpf_dp_singleq_recv_scatter_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
> uint16_t nb_pkts);
> __rte_internal
> diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
> index e228b72fa5..c2f41db9f6 100644
> --- a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
> +++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
> @@ -482,6 +482,144 @@ idpf_dp_singleq_recv_pkts_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, uint16
> return _idpf_singleq_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts);
> }
>
> +RTE_EXPORT_INTERNAL_SYMBOL(idpf_dp_splitq_recv_pkts_avx2)
> +uint16_t
> +idpf_dp_splitq_recv_pkts_avx2(void *rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
> +{
> + struct idpf_rx_queue *queue = (struct idpf_rx_queue *)rxq;
> + const uint32_t *ptype_tbl = queue->adapter->ptype_tbl;
> + struct rte_mbuf **sw_ring = &queue->bufq2->sw_ring[queue->rx_tail];
> + volatile union virtchnl2_rx_desc *rxdp =
> + (volatile union virtchnl2_rx_desc *)queue->rx_ring + queue->rx_tail;
> + const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0, queue->mbuf_initializer);
> + uint64_t head_gen;
> + uint16_t received = 0;
> + int i;
> +
> + /* Shuffle mask: picks fields from each 16-byte descriptor pair into the
> + * layout that will be merged into mbuf->rearm_data candidates.
> + */
> + const __m256i shuf = _mm256_set_epi8(
> + /* high 128 bits (desc 3 then desc 2 lanes) */
> + 0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4,
> + 0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF,
> + /* low 128 bits (desc 1 then desc 0 lanes) */
> + 0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4,
> + 0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF
> + );
> +
> + /* mask that clears bits 14 and 15 of the packet length word */
> + const __m256i len_mask = _mm256_set_epi32(
> + 0xffffffff, 0xffffffff, 0xffff3fff, 0xffffffff,
> + 0xffffffff, 0xffffffff, 0xffff3fff, 0xffffffff
> + );
> +
> + const __m256i ptype_mask = _mm256_set1_epi16(VIRTCHNL2_RX_FLEX_DESC_PTYPE_M);
> +
> + rte_prefetch0(rxdp);
> + nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, 4); /* 4 desc per AVX2 iteration */
> +
> + if (queue->bufq2->rxrearm_nb > IDPF_RXQ_REARM_THRESH)
> + idpf_splitq_rearm_common(queue->bufq2);
> +
> + /* head gen check */
> + head_gen = rxdp->flex_adv_nic_3_wb.pktlen_gen_bufq_id;
> + if (((head_gen >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
> + VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) != queue->expected_gen_id)
> + return 0;
> +
> + for (i = nb_pkts; i >= IDPF_VPMD_DESCS_PER_LOOP; i -= IDPF_VPMD_DESCS_PER_LOOP) {
> + rxdp -= IDPF_VPMD_DESCS_PER_LOOP;
> +
> + uint64_t g3 = rxdp[3].flex_adv_nic_3_wb.pktlen_gen_bufq_id;
> + uint64_t g2 = rxdp[2].flex_adv_nic_3_wb.pktlen_gen_bufq_id;
> + uint64_t g1 = rxdp[1].flex_adv_nic_3_wb.pktlen_gen_bufq_id;
> + uint64_t g0 = rxdp[0].flex_adv_nic_3_wb.pktlen_gen_bufq_id;
> +
> + /* Extract DD bits */
> + bool dd3 = (g3 & 1ULL) != 0ULL;
> + bool dd2 = (g2 & 1ULL) != 0ULL;
> + bool dd1 = (g1 & 1ULL) != 0ULL;
> + bool dd0 = (g0 & 1ULL) != 0ULL;
> +
> + /* Extract generation bits */
> + uint64_t gen3 = (g3 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
> + VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M;
> + uint64_t gen2 = (g2 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
> + VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M;
> + uint64_t gen1 = (g1 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
> + VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M;
> + uint64_t gen0 = (g0 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
> + VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M;
> +
> + /* Validate descriptors */
> + bool valid3 = dd3 && (gen3 == queue->expected_gen_id);
> + bool valid2 = dd2 && (gen2 == queue->expected_gen_id);
> + bool valid1 = dd1 && (gen1 == queue->expected_gen_id);
> + bool valid0 = dd0 && (gen0 == queue->expected_gen_id);
> +
> + if (!(valid0 && valid1 && valid2 && valid3))
> + break;
> +
This looks wrong. It means that if one packet is received by the NIC and
then the link goes down, for example, that one packet will never actually
be received by software.
Driver datapaths must always be able to receive single packets or whatever
number is available. However - based on past precedent - they are allowed
to request that the input buffer is a multiple of 4 or 8, but they cannot
require that packets are received in bursts of a given multiple. This will
never pass any RFC2544 test, unless, by a co-incidence, the number of sent
packets is a multiple of 4.
/Bruce
^ permalink raw reply [flat|nested] 42+ messages in thread
* [PATCH v13 0/3] net/idpf: enable AVX2 for split queue Rx/Tx
2025-09-17 5:26 [PATCH 0/2] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
` (8 preceding siblings ...)
2026-02-24 7:47 ` [PATCH v12 " Shaiq Wani
@ 2026-02-26 6:52 ` Shaiq Wani
2026-02-26 6:52 ` [PATCH v13 1/3] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
` (2 more replies)
2026-02-26 9:42 ` [PATCH v14 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
10 siblings, 3 replies; 42+ messages in thread
From: Shaiq Wani @ 2026-02-26 6:52 UTC (permalink / raw)
To: dev, bruce.richardson, aman.deep.singh
In case some CPUs don't support AVX512, enable AVX2 for them to
get better per-core performance.
In the single queue model, the same descriptor queue is used by SW
to post descriptors to the device and used by the device to report
completed descriptors to SW, whereas the split queue model separates
them into different queues for parallel processing and improved
performance.
v13:
*reworked the recv_pkts() function:
the driver datapath can now receive any number
of packets available, not just multiples of 4.
v12:
*Addressed AI generated comments.
*Rebased on latest next-net-intel tree.
*Included a release note entry.
v11:
*Fixed indentation issues.
v10:
*Changed desc reading style in Rx path.
v9:
*Some minor fixes.
v8:
*Reworked the receive func to avoid race conditions in Rx path.
*Removed unnecessary compiler barriers.
v7:
*Removed (char) casts, ifdef block as suggested.
*Fixed indentation, line wrapping.
*Did blind write of data to avoid branching.
*changed desc building and writing order for Tx.
v6:
*used single load/store in splitq_recv_pkts function.
*removed x86-specific intrinsics from common code.
v5:
*Fixed CI errors.
*Used defined constants instead of numbers.
v4:
*moved splitq_rearm_common to a common location.
*reduced duplication of code.
*fixed splitq_recv_pkts function.
v3:
*Fixed some indentation issues.
*Collapsed wrapper and core function into one.
*Fixed some pointer casting and naming inconsistency issues.
v2:
*Fixed CI build related issues.
*Rebased on top of idpf/cpfl rx path selection simplification patch.
Shaiq Wani (3):
net/idpf: enable AVX2 for split queue Rx
net/idpf: enable AVX2 for split queue Tx
doc: add note on unsupported completion queue sharing
doc/guides/nics/idpf.rst | 5 +
doc/guides/rel_notes/release_26_03.rst | 4 +
drivers/net/intel/idpf/idpf_common_device.h | 2 +
drivers/net/intel/idpf/idpf_common_rxtx.c | 67 ++++
drivers/net/intel/idpf/idpf_common_rxtx.h | 8 +
.../net/intel/idpf/idpf_common_rxtx_avx2.c | 324 ++++++++++++++++++
.../net/intel/idpf/idpf_common_rxtx_avx512.c | 56 ---
7 files changed, 410 insertions(+), 56 deletions(-)
--
2.34.1
^ permalink raw reply [flat|nested] 42+ messages in thread
* [PATCH v13 1/3] net/idpf: enable AVX2 for split queue Rx
2026-02-26 6:52 ` [PATCH v13 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
@ 2026-02-26 6:52 ` Shaiq Wani
2026-02-26 6:52 ` [PATCH v13 2/3] net/idpf: enable AVX2 for split queue Tx Shaiq Wani
2026-02-26 6:52 ` [PATCH v13 3/3] doc: add note on unsupported completion queue sharing Shaiq Wani
2 siblings, 0 replies; 42+ messages in thread
From: Shaiq Wani @ 2026-02-26 6:52 UTC (permalink / raw)
To: dev, bruce.richardson, aman.deep.singh
Add AVX2 vectorized split queue Rx path.
In case some CPUs don't support AVX512, enable AVX2 for
them to get better per-core performance.
Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
---
drivers/net/intel/idpf/idpf_common_device.h | 1 +
drivers/net/intel/idpf/idpf_common_rxtx.c | 59 +++++++
drivers/net/intel/idpf/idpf_common_rxtx.h | 5 +
.../net/intel/idpf/idpf_common_rxtx_avx2.c | 151 ++++++++++++++++++
.../net/intel/idpf/idpf_common_rxtx_avx512.c | 56 -------
5 files changed, 216 insertions(+), 56 deletions(-)
diff --git a/drivers/net/intel/idpf/idpf_common_device.h b/drivers/net/intel/idpf/idpf_common_device.h
index bbc969c734..1424046a16 100644
--- a/drivers/net/intel/idpf/idpf_common_device.h
+++ b/drivers/net/intel/idpf/idpf_common_device.h
@@ -70,6 +70,7 @@ enum idpf_rx_func_type {
IDPF_RX_SINGLEQ,
IDPF_RX_SINGLEQ_SCATTERED,
IDPF_RX_SINGLEQ_AVX2,
+ IDPF_RX_AVX2,
IDPF_RX_AVX512,
IDPF_RX_SINGLEQ_AVX512,
IDPF_RX_MAX
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.c b/drivers/net/intel/idpf/idpf_common_rxtx.c
index b8f6418d4a..ead31fd0f8 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.c
@@ -253,6 +253,58 @@ idpf_qc_split_tx_complq_reset(struct ci_tx_queue *cq)
cq->expected_gen_id = 1;
}
+RTE_EXPORT_INTERNAL_SYMBOL(idpf_splitq_rearm_common)
+void
+idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq)
+{
+ struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
+ volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
+ uint16_t rx_id;
+ int i;
+
+ rxdp += rx_bufq->rxrearm_start;
+
+ /* Pull 'n' more MBUFs into the software ring */
+ if (rte_mbuf_raw_alloc_bulk(rx_bufq->mp,
+ (void *)rxp, IDPF_RXQ_REARM_THRESH) < 0) {
+ if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
+ rx_bufq->nb_rx_desc) {
+ for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
+ rxp[i] = &rx_bufq->fake_mbuf;
+ rxdp[i] = (union virtchnl2_rx_buf_desc){0};
+ }
+ }
+ rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
+ IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
+ return;
+ }
+
+ /* Initialize the mbufs in vector, process 8 mbufs in one loop */
+ for (i = 0; i < IDPF_RXQ_REARM_THRESH;
+ i += 8, rxp += 8, rxdp += 8) {
+ rxdp[0].split_rd.pkt_addr = rxp[0]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[1].split_rd.pkt_addr = rxp[1]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[2].split_rd.pkt_addr = rxp[2]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[3].split_rd.pkt_addr = rxp[3]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[4].split_rd.pkt_addr = rxp[4]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[5].split_rd.pkt_addr = rxp[5]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[6].split_rd.pkt_addr = rxp[6]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[7].split_rd.pkt_addr = rxp[7]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ }
+
+ rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
+ if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
+ rx_bufq->rxrearm_start = 0;
+
+ rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
+
+ rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
+ (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
+
+ /* Update the tail pointer on the NIC */
+ IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
+}
+
RTE_EXPORT_INTERNAL_SYMBOL(idpf_qc_single_tx_queue_reset)
void
idpf_qc_single_tx_queue_reset(struct ci_tx_queue *txq)
@@ -1506,6 +1558,13 @@ const struct ci_rx_path_info idpf_rx_path_infos[] = {
.rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
.simd_width = RTE_VECT_SIMD_256,
.single_queue = true}},
+ [IDPF_RX_AVX2] = {
+ .pkt_burst = idpf_dp_splitq_recv_pkts_avx2,
+ .info = "Split AVX2 Vector",
+ .features = {
+ .rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
+ .simd_width = RTE_VECT_SIMD_256,
+ }},
#ifdef CC_AVX512_SUPPORT
[IDPF_RX_AVX512] = {
.pkt_burst = idpf_dp_splitq_recv_pkts_avx512,
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.h b/drivers/net/intel/idpf/idpf_common_rxtx.h
index 914cab0f25..256e9ff54c 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.h
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.h
@@ -197,6 +197,8 @@ void idpf_qc_split_tx_descq_reset(struct ci_tx_queue *txq);
__rte_internal
void idpf_qc_split_tx_complq_reset(struct ci_tx_queue *cq);
__rte_internal
+void idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq);
+__rte_internal
void idpf_qc_single_tx_queue_reset(struct ci_tx_queue *txq);
__rte_internal
void idpf_qc_rx_queue_release(void *rxq);
@@ -249,6 +251,9 @@ __rte_internal
uint16_t idpf_dp_splitq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
uint16_t nb_pkts);
__rte_internal
+uint16_t idpf_dp_splitq_recv_pkts_avx2(void *rxq, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts);
+__rte_internal
uint16_t idpf_dp_singleq_recv_scatter_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
uint16_t nb_pkts);
__rte_internal
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
index e228b72fa5..0122c82951 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
@@ -482,6 +482,157 @@ idpf_dp_singleq_recv_pkts_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, uint16
return _idpf_singleq_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts);
}
+RTE_EXPORT_INTERNAL_SYMBOL(idpf_dp_splitq_recv_pkts_avx2)
+uint16_t
+idpf_dp_splitq_recv_pkts_avx2(void *rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+ struct idpf_rx_queue *queue = (struct idpf_rx_queue *)rxq;
+ const uint32_t *ptype_tbl = queue->adapter->ptype_tbl;
+ struct rte_mbuf **sw_ring = &queue->bufq2->sw_ring[queue->rx_tail];
+ volatile union virtchnl2_rx_desc *rxdp =
+ (volatile union virtchnl2_rx_desc *)queue->rx_ring + queue->rx_tail;
+ const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0, queue->mbuf_initializer);
+ uint64_t head_gen;
+ uint16_t received = 0;
+ int i;
+
+ /* Shuffle mask: picks fields from each 16-byte descriptor pair into the
+ * layout that will be merged into mbuf->rearm_data candidates.
+ */
+ const __m256i shuf = _mm256_set_epi8(
+ /* high 128 bits (desc 3 then desc 2 lanes) */
+ 0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4,
+ 0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF,
+ /* low 128 bits (desc 1 then desc 0 lanes) */
+ 0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4,
+ 0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF
+ );
+
+ /* mask that clears bits 14 and 15 of the packet length word */
+ const __m256i len_mask = _mm256_set_epi32(
+ 0xffffffff, 0xffffffff, 0xffff3fff, 0xffffffff,
+ 0xffffffff, 0xffffffff, 0xffff3fff, 0xffffffff
+ );
+
+ const __m256i ptype_mask = _mm256_set1_epi16(VIRTCHNL2_RX_FLEX_DESC_PTYPE_M);
+
+ rte_prefetch0(rxdp);
+ nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IDPF_VPMD_DESCS_PER_LOOP);
+
+ if (queue->bufq2->rxrearm_nb > IDPF_RXQ_REARM_THRESH)
+ idpf_splitq_rearm_common(queue->bufq2);
+
+ /* check if there is at least one packet available */
+ head_gen = rxdp->flex_adv_nic_3_wb.pktlen_gen_bufq_id;
+ if (((head_gen >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
+ VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) != queue->expected_gen_id)
+ return 0;
+
+ for (i = 0; i < nb_pkts;
+ i += IDPF_VPMD_DESCS_PER_LOOP,
+ rxdp += IDPF_VPMD_DESCS_PER_LOOP) {
+ uint16_t pktlen_gen0, pktlen_gen1, pktlen_gen2, pktlen_gen3;
+ uint8_t stat0, stat1, stat2, stat3;
+ bool valid0, valid1, valid2, valid3;
+ uint16_t burst;
+ uint16_t ptype0, ptype1, ptype2, ptype3;
+ __m128i d0, d1, d2, d3;
+ __m256i d01, d23, desc01, desc23;
+ __m256i mb10, mb32, pt10, pt32;
+ __m256i rearm0, rearm1, rearm2, rearm3;
+
+ /* copy mbuf pointers (harmless for invalid descs) */
+ memcpy(&rx_pkts[i], &sw_ring[i],
+ sizeof(rx_pkts[0]) * IDPF_VPMD_DESCS_PER_LOOP);
+ d3 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[3]));
+ rte_compiler_barrier();
+ d2 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[2]));
+ rte_compiler_barrier();
+ d1 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[1]));
+ rte_compiler_barrier();
+ d0 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[0]));
+
+ d23 = _mm256_set_m128i(d3, d2);
+ d01 = _mm256_set_m128i(d1, d0);
+
+ /* mask length and shuffle to build mbuf rearm data */
+ desc01 = _mm256_and_si256(d01, len_mask);
+ desc23 = _mm256_and_si256(d23, len_mask);
+ mb10 = _mm256_shuffle_epi8(desc01, shuf);
+ mb32 = _mm256_shuffle_epi8(desc23, shuf);
+
+ /* Extract ptypes */
+ pt10 = _mm256_and_si256(d01, ptype_mask);
+ pt32 = _mm256_and_si256(d23, ptype_mask);
+
+ ptype0 = (uint16_t)_mm256_extract_epi16(pt10, 1);
+ ptype1 = (uint16_t)_mm256_extract_epi16(pt10, 9);
+ ptype2 = (uint16_t)_mm256_extract_epi16(pt32, 1);
+ ptype3 = (uint16_t)_mm256_extract_epi16(pt32, 9);
+
+ mb10 = _mm256_insert_epi32(mb10, (int)ptype_tbl[ptype1], 2);
+ mb10 = _mm256_insert_epi32(mb10, (int)ptype_tbl[ptype0], 0);
+ mb32 = _mm256_insert_epi32(mb32, (int)ptype_tbl[ptype3], 2);
+ mb32 = _mm256_insert_epi32(mb32, (int)ptype_tbl[ptype2], 0);
+
+ /* Build rearm data for each mbuf */
+ rearm0 = _mm256_permute2f128_si256(mbuf_init, mb10, 0x20);
+ rearm1 = _mm256_blend_epi32(mbuf_init, mb10, 0xF0);
+ rearm2 = _mm256_permute2f128_si256(mbuf_init, mb32, 0x20);
+ rearm3 = _mm256_blend_epi32(mbuf_init, mb32, 0xF0);
+
+ /* Write out mbuf rearm data */
+ _mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data, rearm0);
+ _mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data, rearm1);
+ _mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data, rearm2);
+ _mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data, rearm3);
+
+ /* Extract DD and generation bits from the already-loaded
+ * descriptor data (d0-d3) */
+ stat0 = (uint8_t)_mm_extract_epi8(d0, 1);
+ stat1 = (uint8_t)_mm_extract_epi8(d1, 1);
+ stat2 = (uint8_t)_mm_extract_epi8(d2, 1);
+ stat3 = (uint8_t)_mm_extract_epi8(d3, 1);
+
+ pktlen_gen0 = (uint16_t)_mm_extract_epi16(d0, 2);
+ pktlen_gen1 = (uint16_t)_mm_extract_epi16(d1, 2);
+ pktlen_gen2 = (uint16_t)_mm_extract_epi16(d2, 2);
+ pktlen_gen3 = (uint16_t)_mm_extract_epi16(d3, 2);
+
+ valid0 = (stat0 & 1) &&
+ (((pktlen_gen0 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
+ VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) == queue->expected_gen_id);
+ valid1 = (stat1 & 1) &&
+ (((pktlen_gen1 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
+ VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) == queue->expected_gen_id);
+ valid2 = (stat2 & 1) &&
+ (((pktlen_gen2 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
+ VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) == queue->expected_gen_id);
+ valid3 = (stat3 & 1) &&
+ (((pktlen_gen3 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
+ VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) == queue->expected_gen_id);
+
+ /* count valid descriptors (holes are impossible because
+ * descriptors are read in reverse order while the NIC
+ * completes them in forward order)
+ */
+ burst = valid0 + valid1 + valid2 + valid3;
+ received += burst;
+ if (burst != IDPF_VPMD_DESCS_PER_LOOP)
+ break;
+ }
+
+ queue->rx_tail += received;
+ queue->expected_gen_id ^= ((queue->rx_tail & queue->nb_rx_desc) != 0);
+ queue->rx_tail &= (queue->nb_rx_desc - 1);
+ if ((queue->rx_tail & 1) == 1 && received > 1) {
+ queue->rx_tail--;
+ received--;
+ }
+ queue->bufq2->rxrearm_nb += received;
+ return received;
+}
+
static inline void
idpf_singleq_vtx1(volatile struct ci_tx_desc *txdp,
struct rte_mbuf *pkt, uint64_t flags)
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c
index fe870617bc..eda5f929cf 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c
@@ -540,62 +540,6 @@ idpf_dp_singleq_recv_pkts_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
return _idpf_singleq_recv_raw_pkts_avx512(rx_queue, rx_pkts, nb_pkts);
}
-static __rte_always_inline void
-idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq)
-{
- struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
- volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
- uint16_t rx_id;
- int i;
-
- rxdp += rx_bufq->rxrearm_start;
-
- /* Pull 'n' more MBUFs into the software ring */
- if (rte_mbuf_raw_alloc_bulk(rx_bufq->mp,
- (void *)rxp,
- IDPF_RXQ_REARM_THRESH) < 0) {
- if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
- rx_bufq->nb_rx_desc) {
- __m128i dma_addr0;
-
- dma_addr0 = _mm_setzero_si128();
- for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
- rxp[i] = &rx_bufq->fake_mbuf;
- _mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp[i]),
- dma_addr0);
- }
- }
- rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
- IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
- return;
- }
-
- /* Initialize the mbufs in vector, process 8 mbufs in one loop */
- for (i = 0; i < IDPF_RXQ_REARM_THRESH;
- i += 8, rxp += 8, rxdp += 8) {
- rxdp[0].split_rd.pkt_addr = rxp[0]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[1].split_rd.pkt_addr = rxp[1]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[2].split_rd.pkt_addr = rxp[2]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[3].split_rd.pkt_addr = rxp[3]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[4].split_rd.pkt_addr = rxp[4]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[5].split_rd.pkt_addr = rxp[5]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[6].split_rd.pkt_addr = rxp[6]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[7].split_rd.pkt_addr = rxp[7]->buf_iova + RTE_PKTMBUF_HEADROOM;
- }
-
- rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
- if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
- rx_bufq->rxrearm_start = 0;
-
- rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
-
- rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
- (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
-
- /* Update the tail pointer on the NIC */
- IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
-}
-
static __rte_always_inline void
idpf_splitq_rearm(struct idpf_rx_queue *rx_bufq)
{
--
2.34.1
^ permalink raw reply related [flat|nested] 42+ messages in thread
* [PATCH v13 2/3] net/idpf: enable AVX2 for split queue Tx
2026-02-26 6:52 ` [PATCH v13 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
2026-02-26 6:52 ` [PATCH v13 1/3] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
@ 2026-02-26 6:52 ` Shaiq Wani
2026-02-26 6:52 ` [PATCH v13 3/3] doc: add note on unsupported completion queue sharing Shaiq Wani
2 siblings, 0 replies; 42+ messages in thread
From: Shaiq Wani @ 2026-02-26 6:52 UTC (permalink / raw)
To: dev, bruce.richardson, aman.deep.singh
Add AVX2 vectorized split queue Tx path with
completion queue scanning support.
Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
---
doc/guides/rel_notes/release_26_03.rst | 4 +
drivers/net/intel/idpf/idpf_common_device.h | 1 +
drivers/net/intel/idpf/idpf_common_rxtx.c | 8 +
drivers/net/intel/idpf/idpf_common_rxtx.h | 3 +
.../net/intel/idpf/idpf_common_rxtx_avx2.c | 173 ++++++++++++++++++
5 files changed, 189 insertions(+)
diff --git a/doc/guides/rel_notes/release_26_03.rst b/doc/guides/rel_notes/release_26_03.rst
index b4499ec066..2c78b62632 100644
--- a/doc/guides/rel_notes/release_26_03.rst
+++ b/doc/guides/rel_notes/release_26_03.rst
@@ -77,6 +77,10 @@ New Features
* Added support for pre and post VF reset callbacks.
+* **Updated Intel idpf driver.**
+
+ * Added AVX2 vectorized split queue Rx and Tx paths.
+
* **Updated Marvell cnxk net driver.**
* Added out-of-place support for CN20K SoC.
diff --git a/drivers/net/intel/idpf/idpf_common_device.h b/drivers/net/intel/idpf/idpf_common_device.h
index 1424046a16..6f3dfbc3fc 100644
--- a/drivers/net/intel/idpf/idpf_common_device.h
+++ b/drivers/net/intel/idpf/idpf_common_device.h
@@ -81,6 +81,7 @@ enum idpf_tx_func_type {
IDPF_TX_SINGLEQ,
IDPF_TX_SINGLEQ_SIMPLE,
IDPF_TX_SINGLEQ_AVX2,
+ IDPF_TX_AVX2,
IDPF_TX_AVX512,
IDPF_TX_SINGLEQ_AVX512,
/* Need a max value defined as array values in are defined
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.c b/drivers/net/intel/idpf/idpf_common_rxtx.c
index ead31fd0f8..8e964bef5c 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.c
@@ -1620,6 +1620,14 @@ const struct ci_tx_path_info idpf_tx_path_infos[] = {
.single_queue = true
}
},
+ [IDPF_TX_AVX2] = {
+ .pkt_burst = idpf_dp_splitq_xmit_pkts_avx2,
+ .info = "Split AVX2",
+ .features = {
+ .tx_offloads = IDPF_TX_VECTOR_OFFLOADS,
+ .simd_width = RTE_VECT_SIMD_256
+ }
+ },
#ifdef CC_AVX512_SUPPORT
[IDPF_TX_AVX512] = {
.pkt_burst = idpf_dp_splitq_xmit_pkts_avx512,
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.h b/drivers/net/intel/idpf/idpf_common_rxtx.h
index 256e9ff54c..2c4ebb21e4 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.h
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.h
@@ -257,6 +257,9 @@ __rte_internal
uint16_t idpf_dp_singleq_recv_scatter_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
uint16_t nb_pkts);
__rte_internal
+uint16_t idpf_dp_splitq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts);
+__rte_internal
uint16_t idpf_dp_singleq_recv_pkts_avx2(void *rx_queue,
struct rte_mbuf **rx_pkts,
uint16_t nb_pkts);
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
index 0122c82951..7e0c3adcc9 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
@@ -780,3 +780,176 @@ idpf_dp_singleq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
return nb_tx;
}
+
+static __rte_always_inline void
+idpf_splitq_scan_cq_ring(struct ci_tx_queue *cq)
+{
+ struct idpf_splitq_tx_compl_desc *compl_ring;
+ struct ci_tx_queue *txq;
+ uint16_t genid, txq_qid, cq_qid, i;
+ uint8_t ctype;
+
+ cq_qid = cq->tx_tail;
+
+ for (i = 0; i < IDPD_TXQ_SCAN_CQ_THRESH; i++) {
+ if (cq_qid == cq->nb_tx_desc) {
+ cq_qid = 0;
+ cq->expected_gen_id ^= 1; /* toggle generation bit */
+ }
+
+ compl_ring = &cq->compl_ring[cq_qid];
+
+ genid = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
+ IDPF_TXD_COMPLQ_GEN_M) >> IDPF_TXD_COMPLQ_GEN_S;
+
+ if (genid != cq->expected_gen_id)
+ break;
+
+ ctype = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
+ IDPF_TXD_COMPLQ_COMPL_TYPE_M) >> IDPF_TXD_COMPLQ_COMPL_TYPE_S;
+ txq_qid = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
+ IDPF_TXD_COMPLQ_QID_M) >> IDPF_TXD_COMPLQ_QID_S;
+
+ txq = cq->txqs[txq_qid - cq->tx_start_qid];
+ if (ctype == IDPF_TXD_COMPLT_RS)
+ txq->rs_compl_count++;
+
+ cq_qid++;
+ }
+
+ cq->tx_tail = cq_qid;
+}
+
+static __rte_always_inline void
+idpf_splitq_vtx1_avx2(struct idpf_flex_tx_sched_desc *txdp,
+ struct rte_mbuf *pkt, uint64_t flags)
+{
+ uint64_t high_qw =
+ IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE |
+ ((uint64_t)flags) |
+ ((uint64_t)pkt->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+
+ __m128i descriptor = _mm_set_epi64x(high_qw,
+ pkt->buf_iova + pkt->data_off);
+ _mm_storeu_si128((__m128i *)txdp, descriptor);
+}
+
+static inline void
+idpf_splitq_vtx_avx2(struct idpf_flex_tx_sched_desc *txdp,
+ struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags)
+{
+ const uint64_t hi_qw_tmpl = IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE | ((uint64_t)flags);
+
+ /* align if needed */
+ if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
+ idpf_splitq_vtx1_avx2(txdp, *pkt, flags);
+ txdp++, pkt++, nb_pkts--;
+ }
+
+ for (; nb_pkts >= IDPF_VPMD_DESCS_PER_LOOP; txdp += IDPF_VPMD_DESCS_PER_LOOP,
+ pkt += IDPF_VPMD_DESCS_PER_LOOP, nb_pkts -= IDPF_VPMD_DESCS_PER_LOOP) {
+ uint64_t hi_qw0 = hi_qw_tmpl |
+ ((uint64_t)pkt[0]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+ uint64_t hi_qw1 = hi_qw_tmpl |
+ ((uint64_t)pkt[1]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+ uint64_t hi_qw2 = hi_qw_tmpl |
+ ((uint64_t)pkt[2]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+ uint64_t hi_qw3 = hi_qw_tmpl |
+ ((uint64_t)pkt[3]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+
+ __m256i desc0_1 = _mm256_set_epi64x(hi_qw1,
+ pkt[1]->buf_iova + pkt[1]->data_off,
+ hi_qw0,
+ pkt[0]->buf_iova + pkt[0]->data_off);
+ __m256i desc2_3 = _mm256_set_epi64x(hi_qw3,
+ pkt[3]->buf_iova + pkt[3]->data_off,
+ hi_qw2,
+ pkt[2]->buf_iova + pkt[2]->data_off);
+
+ _mm256_storeu_si256((__m256i *)(txdp + 0), desc0_1);
+ _mm256_storeu_si256((__m256i *)(txdp + 2), desc2_3);
+ }
+
+ while (nb_pkts--) {
+ idpf_splitq_vtx1_avx2(txdp, *pkt, flags);
+ txdp++;
+ pkt++;
+ }
+}
+
+static inline uint16_t
+idpf_splitq_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+ struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
+ struct idpf_flex_tx_sched_desc *txdp;
+ struct ci_tx_entry_vec *txep;
+ uint16_t n, nb_commit;
+ uint64_t cmd_dtype = IDPF_TXD_FLEX_FLOW_CMD_EOP;
+ uint16_t tx_id = txq->tx_tail;
+
+ nb_commit = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
+ nb_pkts = nb_commit;
+ if (unlikely(nb_pkts == 0))
+ return 0;
+
+ txdp = (struct idpf_flex_tx_sched_desc *)&txq->desc_ring[tx_id];
+ txep = &txq->sw_ring_vec[tx_id];
+
+ txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
+
+ n = (uint16_t)(txq->nb_tx_desc - tx_id);
+ if (nb_commit >= n) {
+ ci_tx_backlog_entry_vec(txep, tx_pkts, n);
+
+ idpf_splitq_vtx_avx2(txdp, tx_pkts, n - 1, cmd_dtype);
+ tx_pkts += (n - 1);
+ txdp += (n - 1);
+
+ idpf_splitq_vtx1_avx2(txdp, *tx_pkts++, cmd_dtype);
+
+ nb_commit = (uint16_t)(nb_commit - n);
+ tx_id = 0;
+
+ txdp = &txq->desc_ring[tx_id];
+ txep = (void *)txq->sw_ring;
+ }
+
+ ci_tx_backlog_entry_vec(txep, tx_pkts, nb_commit);
+
+ idpf_splitq_vtx_avx2(txdp, tx_pkts, nb_commit, cmd_dtype);
+
+ tx_id = (uint16_t)(tx_id + nb_commit);
+ txq->tx_tail = tx_id;
+
+ IDPF_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+
+ return nb_pkts;
+}
+
+RTE_EXPORT_INTERNAL_SYMBOL(idpf_dp_splitq_xmit_pkts_avx2)
+uint16_t
+idpf_dp_splitq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts)
+{
+ struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
+ uint16_t nb_tx = 0;
+
+ while (nb_pkts) {
+ uint16_t ret, num;
+ idpf_splitq_scan_cq_ring(txq->complq);
+
+ if (txq->rs_compl_count > txq->tx_free_thresh) {
+ ci_tx_free_bufs_vec(txq, idpf_tx_desc_done, false);
+ txq->rs_compl_count -= txq->tx_rs_thresh;
+ }
+
+ num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+ ret = idpf_splitq_xmit_fixed_burst_vec_avx2(tx_queue, &tx_pkts[nb_tx], num);
+ nb_tx += ret;
+ nb_pkts -= ret;
+ if (ret < num)
+ break;
+ }
+
+ return nb_tx;
+}
--
2.34.1
^ permalink raw reply related [flat|nested] 42+ messages in thread
* [PATCH v13 3/3] doc: add note on unsupported completion queue sharing
2026-02-26 6:52 ` [PATCH v13 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
2026-02-26 6:52 ` [PATCH v13 1/3] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
2026-02-26 6:52 ` [PATCH v13 2/3] net/idpf: enable AVX2 for split queue Tx Shaiq Wani
@ 2026-02-26 6:52 ` Shaiq Wani
2 siblings, 0 replies; 42+ messages in thread
From: Shaiq Wani @ 2026-02-26 6:52 UTC (permalink / raw)
To: dev, bruce.richardson, aman.deep.singh
Added a note in the IDPF Poll Mode Driver documentation
to clarify that sharing a completion queue among multiple
TX queues serviced by different CPU cores is not supported
in split queue mode.
Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
---
doc/guides/nics/idpf.rst | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/doc/guides/nics/idpf.rst b/doc/guides/nics/idpf.rst
index b99cc18188..47e38fcb13 100644
--- a/doc/guides/nics/idpf.rst
+++ b/doc/guides/nics/idpf.rst
@@ -79,6 +79,11 @@ Runtime Configuration
Then the PMD will configure Tx queue with single queue mode.
Otherwise, split queue mode is chosen by default.
+.. note::
+
+ In split queue mode, sharing a completion queue among multiple TX queues that are
+ serviced by different CPU cores is not supported.
+
Driver compilation and testing
------------------------------
--
2.34.1
^ permalink raw reply related [flat|nested] 42+ messages in thread
* [PATCH v14 0/3] net/idpf: enable AVX2 for split queue Rx/Tx
2025-09-17 5:26 [PATCH 0/2] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
` (9 preceding siblings ...)
2026-02-26 6:52 ` [PATCH v13 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
@ 2026-02-26 9:42 ` Shaiq Wani
2026-02-26 9:42 ` [PATCH v14 1/3] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
` (3 more replies)
10 siblings, 4 replies; 42+ messages in thread
From: Shaiq Wani @ 2026-02-26 9:42 UTC (permalink / raw)
To: dev, bruce.richardson, aman.deep.singh
In case some CPUs don't support AVX512, enable AVX2 for them to
get better per-core performance.
In the single queue model, the same descriptor queue is used by SW
to post descriptors to the device and by the device to report completed
descriptors to SW, whereas the split queue model separates them into
different queues for parallel processing and improved performance.
v14:
*rebased on latest next-net-intel.
v13:
*reworked the recv_pkts() function: the driver datapath can now
receive any number of packets available, not just multiples of 4.
v12:
*Addressed AI generated comments.
*Rebased on latest next-net-intel tree.
*Included a release note entry.
v11:
*Fixed indentation issues.
v10:
*Changed desc reading style in Rx path.
v9:
*Some minor fixes.
v8:
*Reworked the receive func to avoid race conditions in Rx path.
*Removed unnecessary compiler barriers.
v7:
*Removed (char) casts, ifdef block as suggested.
*Fixed indentation, line wrapping.
*Did blind write of data to avoid branching.
*changed desc building and writing order for Tx.
v6:
*used single load/store in splitq_recv_pkts function.
*removed x86-specific intrinsics from common code.
v5:
*Fixed CI errors.
*Used defined constants instead of numbers.
v4:
*moved splitq_rearm_common to a common location.
*reduced duplication of code.
*fixed splitq_recv_pkts function.
v3:
*Fixed some indentation issues.
*Collapsed wrapper and core function into one.
*Fixed some pointer casting and naming inconsistency issues.
v2:
*Fixed CI build related issues.
*Rebased on top of idpf/cpfl rx path selection simplification patch.
Shaiq Wani (3):
net/idpf: enable AVX2 for split queue Rx
net/idpf: enable AVX2 for split queue Tx
doc: add note on unsupported completion queue sharing
doc/guides/nics/idpf.rst | 5 +
doc/guides/rel_notes/release_26_03.rst | 4 +
drivers/net/intel/idpf/idpf_common_device.h | 2 +
drivers/net/intel/idpf/idpf_common_rxtx.c | 67 ++++
drivers/net/intel/idpf/idpf_common_rxtx.h | 8 +
.../net/intel/idpf/idpf_common_rxtx_avx2.c | 324 ++++++++++++++++++
.../net/intel/idpf/idpf_common_rxtx_avx512.c | 56 ---
7 files changed, 410 insertions(+), 56 deletions(-)
--
2.34.1
^ permalink raw reply [flat|nested] 42+ messages in thread
* [PATCH v14 1/3] net/idpf: enable AVX2 for split queue Rx
2026-02-26 9:42 ` [PATCH v14 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
@ 2026-02-26 9:42 ` Shaiq Wani
2026-02-26 9:42 ` [PATCH v14 2/3] net/idpf: enable AVX2 for split queue Tx Shaiq Wani
` (2 subsequent siblings)
3 siblings, 0 replies; 42+ messages in thread
From: Shaiq Wani @ 2026-02-26 9:42 UTC (permalink / raw)
To: dev, bruce.richardson, aman.deep.singh
Add AVX2 vectorized split queue Rx path.
In case some CPUs don't support AVX512, enable AVX2 for
them to get better per-core performance.
Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
---
drivers/net/intel/idpf/idpf_common_device.h | 1 +
drivers/net/intel/idpf/idpf_common_rxtx.c | 59 +++++++
drivers/net/intel/idpf/idpf_common_rxtx.h | 5 +
.../net/intel/idpf/idpf_common_rxtx_avx2.c | 151 ++++++++++++++++++
.../net/intel/idpf/idpf_common_rxtx_avx512.c | 56 -------
5 files changed, 216 insertions(+), 56 deletions(-)
diff --git a/drivers/net/intel/idpf/idpf_common_device.h b/drivers/net/intel/idpf/idpf_common_device.h
index bbc969c734..1424046a16 100644
--- a/drivers/net/intel/idpf/idpf_common_device.h
+++ b/drivers/net/intel/idpf/idpf_common_device.h
@@ -70,6 +70,7 @@ enum idpf_rx_func_type {
IDPF_RX_SINGLEQ,
IDPF_RX_SINGLEQ_SCATTERED,
IDPF_RX_SINGLEQ_AVX2,
+ IDPF_RX_AVX2,
IDPF_RX_AVX512,
IDPF_RX_SINGLEQ_AVX512,
IDPF_RX_MAX
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.c b/drivers/net/intel/idpf/idpf_common_rxtx.c
index b8f6418d4a..ead31fd0f8 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.c
@@ -253,6 +253,58 @@ idpf_qc_split_tx_complq_reset(struct ci_tx_queue *cq)
cq->expected_gen_id = 1;
}
+RTE_EXPORT_INTERNAL_SYMBOL(idpf_splitq_rearm_common)
+void
+idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq)
+{
+ struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
+ volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
+ uint16_t rx_id;
+ int i;
+
+ rxdp += rx_bufq->rxrearm_start;
+
+ /* Pull 'n' more MBUFs into the software ring */
+ if (rte_mbuf_raw_alloc_bulk(rx_bufq->mp,
+ (void *)rxp, IDPF_RXQ_REARM_THRESH) < 0) {
+ if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
+ rx_bufq->nb_rx_desc) {
+ for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
+ rxp[i] = &rx_bufq->fake_mbuf;
+ rxdp[i] = (union virtchnl2_rx_buf_desc){0};
+ }
+ }
+ rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
+ IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
+ return;
+ }
+
+ /* Initialize the mbufs in vector, process 8 mbufs in one loop */
+ for (i = 0; i < IDPF_RXQ_REARM_THRESH;
+ i += 8, rxp += 8, rxdp += 8) {
+ rxdp[0].split_rd.pkt_addr = rxp[0]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[1].split_rd.pkt_addr = rxp[1]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[2].split_rd.pkt_addr = rxp[2]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[3].split_rd.pkt_addr = rxp[3]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[4].split_rd.pkt_addr = rxp[4]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[5].split_rd.pkt_addr = rxp[5]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[6].split_rd.pkt_addr = rxp[6]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ rxdp[7].split_rd.pkt_addr = rxp[7]->buf_iova + RTE_PKTMBUF_HEADROOM;
+ }
+
+ rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
+ if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
+ rx_bufq->rxrearm_start = 0;
+
+ rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
+
+ rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
+ (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
+
+ /* Update the tail pointer on the NIC */
+ IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
+}
+
RTE_EXPORT_INTERNAL_SYMBOL(idpf_qc_single_tx_queue_reset)
void
idpf_qc_single_tx_queue_reset(struct ci_tx_queue *txq)
@@ -1506,6 +1558,13 @@ const struct ci_rx_path_info idpf_rx_path_infos[] = {
.rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
.simd_width = RTE_VECT_SIMD_256,
.single_queue = true}},
+ [IDPF_RX_AVX2] = {
+ .pkt_burst = idpf_dp_splitq_recv_pkts_avx2,
+ .info = "Split AVX2 Vector",
+ .features = {
+ .rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
+ .simd_width = RTE_VECT_SIMD_256,
+ }},
#ifdef CC_AVX512_SUPPORT
[IDPF_RX_AVX512] = {
.pkt_burst = idpf_dp_splitq_recv_pkts_avx512,
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.h b/drivers/net/intel/idpf/idpf_common_rxtx.h
index 914cab0f25..256e9ff54c 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.h
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.h
@@ -197,6 +197,8 @@ void idpf_qc_split_tx_descq_reset(struct ci_tx_queue *txq);
__rte_internal
void idpf_qc_split_tx_complq_reset(struct ci_tx_queue *cq);
__rte_internal
+void idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq);
+__rte_internal
void idpf_qc_single_tx_queue_reset(struct ci_tx_queue *txq);
__rte_internal
void idpf_qc_rx_queue_release(void *rxq);
@@ -249,6 +251,9 @@ __rte_internal
uint16_t idpf_dp_splitq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
uint16_t nb_pkts);
__rte_internal
+uint16_t idpf_dp_splitq_recv_pkts_avx2(void *rxq, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts);
+__rte_internal
uint16_t idpf_dp_singleq_recv_scatter_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
uint16_t nb_pkts);
__rte_internal
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
index e228b72fa5..0122c82951 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
@@ -482,6 +482,157 @@ idpf_dp_singleq_recv_pkts_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, uint16
return _idpf_singleq_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts);
}
+RTE_EXPORT_INTERNAL_SYMBOL(idpf_dp_splitq_recv_pkts_avx2)
+uint16_t
+idpf_dp_splitq_recv_pkts_avx2(void *rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+ struct idpf_rx_queue *queue = (struct idpf_rx_queue *)rxq;
+ const uint32_t *ptype_tbl = queue->adapter->ptype_tbl;
+ struct rte_mbuf **sw_ring = &queue->bufq2->sw_ring[queue->rx_tail];
+ volatile union virtchnl2_rx_desc *rxdp =
+ (volatile union virtchnl2_rx_desc *)queue->rx_ring + queue->rx_tail;
+ const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0, queue->mbuf_initializer);
+ uint64_t head_gen;
+ uint16_t received = 0;
+ int i;
+
+ /* Shuffle mask: picks fields from each 16-byte descriptor pair into the
+ * layout that will be merged into mbuf->rearm_data candidates.
+ */
+ const __m256i shuf = _mm256_set_epi8(
+ /* high 128 bits (desc 3 then desc 2 lanes) */
+ 0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4,
+ 0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF,
+ /* low 128 bits (desc 1 then desc 0 lanes) */
+ 0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4,
+ 0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF
+ );
+
+ /* mask that clears bits 14 and 15 of the packet length word */
+ const __m256i len_mask = _mm256_set_epi32(
+ 0xffffffff, 0xffffffff, 0xffff3fff, 0xffffffff,
+ 0xffffffff, 0xffffffff, 0xffff3fff, 0xffffffff
+ );
+
+ const __m256i ptype_mask = _mm256_set1_epi16(VIRTCHNL2_RX_FLEX_DESC_PTYPE_M);
+
+ rte_prefetch0(rxdp);
+ nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IDPF_VPMD_DESCS_PER_LOOP);
+
+ if (queue->bufq2->rxrearm_nb > IDPF_RXQ_REARM_THRESH)
+ idpf_splitq_rearm_common(queue->bufq2);
+
+ /* check if there is at least one packet available */
+ head_gen = rxdp->flex_adv_nic_3_wb.pktlen_gen_bufq_id;
+ if (((head_gen >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
+ VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) != queue->expected_gen_id)
+ return 0;
+
+ for (i = 0; i < nb_pkts;
+ i += IDPF_VPMD_DESCS_PER_LOOP,
+ rxdp += IDPF_VPMD_DESCS_PER_LOOP) {
+ uint16_t pktlen_gen0, pktlen_gen1, pktlen_gen2, pktlen_gen3;
+ uint8_t stat0, stat1, stat2, stat3;
+ bool valid0, valid1, valid2, valid3;
+ uint16_t burst;
+ uint16_t ptype0, ptype1, ptype2, ptype3;
+ __m128i d0, d1, d2, d3;
+ __m256i d01, d23, desc01, desc23;
+ __m256i mb10, mb32, pt10, pt32;
+ __m256i rearm0, rearm1, rearm2, rearm3;
+
+ /* copy mbuf pointers (harmless for invalid descs) */
+ memcpy(&rx_pkts[i], &sw_ring[i],
+ sizeof(rx_pkts[0]) * IDPF_VPMD_DESCS_PER_LOOP);
+ d3 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[3]));
+ rte_compiler_barrier();
+ d2 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[2]));
+ rte_compiler_barrier();
+ d1 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[1]));
+ rte_compiler_barrier();
+ d0 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[0]));
+
+ d23 = _mm256_set_m128i(d3, d2);
+ d01 = _mm256_set_m128i(d1, d0);
+
+ /* mask length and shuffle to build mbuf rearm data */
+ desc01 = _mm256_and_si256(d01, len_mask);
+ desc23 = _mm256_and_si256(d23, len_mask);
+ mb10 = _mm256_shuffle_epi8(desc01, shuf);
+ mb32 = _mm256_shuffle_epi8(desc23, shuf);
+
+ /* Extract ptypes */
+ pt10 = _mm256_and_si256(d01, ptype_mask);
+ pt32 = _mm256_and_si256(d23, ptype_mask);
+
+ ptype0 = (uint16_t)_mm256_extract_epi16(pt10, 1);
+ ptype1 = (uint16_t)_mm256_extract_epi16(pt10, 9);
+ ptype2 = (uint16_t)_mm256_extract_epi16(pt32, 1);
+ ptype3 = (uint16_t)_mm256_extract_epi16(pt32, 9);
+
+ mb10 = _mm256_insert_epi32(mb10, (int)ptype_tbl[ptype1], 2);
+ mb10 = _mm256_insert_epi32(mb10, (int)ptype_tbl[ptype0], 0);
+ mb32 = _mm256_insert_epi32(mb32, (int)ptype_tbl[ptype3], 2);
+ mb32 = _mm256_insert_epi32(mb32, (int)ptype_tbl[ptype2], 0);
+
+ /* Build rearm data for each mbuf */
+ rearm0 = _mm256_permute2f128_si256(mbuf_init, mb10, 0x20);
+ rearm1 = _mm256_blend_epi32(mbuf_init, mb10, 0xF0);
+ rearm2 = _mm256_permute2f128_si256(mbuf_init, mb32, 0x20);
+ rearm3 = _mm256_blend_epi32(mbuf_init, mb32, 0xF0);
+
+ /* Write out mbuf rearm data */
+ _mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data, rearm0);
+ _mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data, rearm1);
+ _mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data, rearm2);
+ _mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data, rearm3);
+
+ /* Extract DD and generation bits from the already-loaded
+ * descriptor data (d0-d3) */
+ stat0 = (uint8_t)_mm_extract_epi8(d0, 1);
+ stat1 = (uint8_t)_mm_extract_epi8(d1, 1);
+ stat2 = (uint8_t)_mm_extract_epi8(d2, 1);
+ stat3 = (uint8_t)_mm_extract_epi8(d3, 1);
+
+ pktlen_gen0 = (uint16_t)_mm_extract_epi16(d0, 2);
+ pktlen_gen1 = (uint16_t)_mm_extract_epi16(d1, 2);
+ pktlen_gen2 = (uint16_t)_mm_extract_epi16(d2, 2);
+ pktlen_gen3 = (uint16_t)_mm_extract_epi16(d3, 2);
+
+ valid0 = (stat0 & 1) &&
+ (((pktlen_gen0 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
+ VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) == queue->expected_gen_id);
+ valid1 = (stat1 & 1) &&
+ (((pktlen_gen1 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
+ VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) == queue->expected_gen_id);
+ valid2 = (stat2 & 1) &&
+ (((pktlen_gen2 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
+ VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) == queue->expected_gen_id);
+ valid3 = (stat3 & 1) &&
+ (((pktlen_gen3 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
+ VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) == queue->expected_gen_id);
+
+ /* count valid descriptors (holes are impossible because
+ * descriptors are read in reverse order while the NIC
+ * completes them in forward order)
+ */
+ burst = valid0 + valid1 + valid2 + valid3;
+ received += burst;
+ if (burst != IDPF_VPMD_DESCS_PER_LOOP)
+ break;
+ }
+
+ queue->rx_tail += received;
+ queue->expected_gen_id ^= ((queue->rx_tail & queue->nb_rx_desc) != 0);
+ queue->rx_tail &= (queue->nb_rx_desc - 1);
+ if ((queue->rx_tail & 1) == 1 && received > 1) {
+ queue->rx_tail--;
+ received--;
+ }
+ queue->bufq2->rxrearm_nb += received;
+ return received;
+}
+
static inline void
idpf_singleq_vtx1(volatile struct ci_tx_desc *txdp,
struct rte_mbuf *pkt, uint64_t flags)
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c
index fe870617bc..eda5f929cf 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c
@@ -540,62 +540,6 @@ idpf_dp_singleq_recv_pkts_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
return _idpf_singleq_recv_raw_pkts_avx512(rx_queue, rx_pkts, nb_pkts);
}
-static __rte_always_inline void
-idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq)
-{
- struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
- volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
- uint16_t rx_id;
- int i;
-
- rxdp += rx_bufq->rxrearm_start;
-
- /* Pull 'n' more MBUFs into the software ring */
- if (rte_mbuf_raw_alloc_bulk(rx_bufq->mp,
- (void *)rxp,
- IDPF_RXQ_REARM_THRESH) < 0) {
- if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
- rx_bufq->nb_rx_desc) {
- __m128i dma_addr0;
-
- dma_addr0 = _mm_setzero_si128();
- for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
- rxp[i] = &rx_bufq->fake_mbuf;
- _mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp[i]),
- dma_addr0);
- }
- }
- rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
- IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
- return;
- }
-
- /* Initialize the mbufs in vector, process 8 mbufs in one loop */
- for (i = 0; i < IDPF_RXQ_REARM_THRESH;
- i += 8, rxp += 8, rxdp += 8) {
- rxdp[0].split_rd.pkt_addr = rxp[0]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[1].split_rd.pkt_addr = rxp[1]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[2].split_rd.pkt_addr = rxp[2]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[3].split_rd.pkt_addr = rxp[3]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[4].split_rd.pkt_addr = rxp[4]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[5].split_rd.pkt_addr = rxp[5]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[6].split_rd.pkt_addr = rxp[6]->buf_iova + RTE_PKTMBUF_HEADROOM;
- rxdp[7].split_rd.pkt_addr = rxp[7]->buf_iova + RTE_PKTMBUF_HEADROOM;
- }
-
- rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
- if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
- rx_bufq->rxrearm_start = 0;
-
- rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
-
- rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
- (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
-
- /* Update the tail pointer on the NIC */
- IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
-}
-
static __rte_always_inline void
idpf_splitq_rearm(struct idpf_rx_queue *rx_bufq)
{
--
2.34.1
^ permalink raw reply related [flat|nested] 42+ messages in thread
* [PATCH v14 2/3] net/idpf: enable AVX2 for split queue Tx
2026-02-26 9:42 ` [PATCH v14 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
2026-02-26 9:42 ` [PATCH v14 1/3] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
@ 2026-02-26 9:42 ` Shaiq Wani
2026-02-26 9:42 ` [PATCH v14 3/3] doc: add note on unsupported completion queue sharing Shaiq Wani
2026-02-26 11:31 ` [PATCH v14 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Bruce Richardson
3 siblings, 0 replies; 42+ messages in thread
From: Shaiq Wani @ 2026-02-26 9:42 UTC (permalink / raw)
To: dev, bruce.richardson, aman.deep.singh
Add AVX2 vectorized split queue Tx path with
completion queue scanning support.
Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
---
doc/guides/rel_notes/release_26_03.rst | 4 +
drivers/net/intel/idpf/idpf_common_device.h | 1 +
drivers/net/intel/idpf/idpf_common_rxtx.c | 8 +
drivers/net/intel/idpf/idpf_common_rxtx.h | 3 +
.../net/intel/idpf/idpf_common_rxtx_avx2.c | 173 ++++++++++++++++++
5 files changed, 189 insertions(+)
diff --git a/doc/guides/rel_notes/release_26_03.rst b/doc/guides/rel_notes/release_26_03.rst
index b4499ec066..2c78b62632 100644
--- a/doc/guides/rel_notes/release_26_03.rst
+++ b/doc/guides/rel_notes/release_26_03.rst
@@ -77,6 +77,10 @@ New Features
* Added support for pre and post VF reset callbacks.
+* **Updated Intel idpf driver.**
+
+ * Added AVX2 vectorized split queue Rx and Tx paths.
+
* **Updated Marvell cnxk net driver.**
* Added out-of-place support for CN20K SoC.
diff --git a/drivers/net/intel/idpf/idpf_common_device.h b/drivers/net/intel/idpf/idpf_common_device.h
index 1424046a16..6f3dfbc3fc 100644
--- a/drivers/net/intel/idpf/idpf_common_device.h
+++ b/drivers/net/intel/idpf/idpf_common_device.h
@@ -81,6 +81,7 @@ enum idpf_tx_func_type {
IDPF_TX_SINGLEQ,
IDPF_TX_SINGLEQ_SIMPLE,
IDPF_TX_SINGLEQ_AVX2,
+ IDPF_TX_AVX2,
IDPF_TX_AVX512,
IDPF_TX_SINGLEQ_AVX512,
/* Need a max value defined as array values in are defined
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.c b/drivers/net/intel/idpf/idpf_common_rxtx.c
index ead31fd0f8..8e964bef5c 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.c
@@ -1620,6 +1620,14 @@ const struct ci_tx_path_info idpf_tx_path_infos[] = {
.single_queue = true
}
},
+ [IDPF_TX_AVX2] = {
+ .pkt_burst = idpf_dp_splitq_xmit_pkts_avx2,
+ .info = "Split AVX2",
+ .features = {
+ .tx_offloads = IDPF_TX_VECTOR_OFFLOADS,
+ .simd_width = RTE_VECT_SIMD_256
+ }
+ },
#ifdef CC_AVX512_SUPPORT
[IDPF_TX_AVX512] = {
.pkt_burst = idpf_dp_splitq_xmit_pkts_avx512,
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.h b/drivers/net/intel/idpf/idpf_common_rxtx.h
index 256e9ff54c..2c4ebb21e4 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.h
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.h
@@ -257,6 +257,9 @@ __rte_internal
uint16_t idpf_dp_singleq_recv_scatter_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
uint16_t nb_pkts);
__rte_internal
+uint16_t idpf_dp_splitq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts);
+__rte_internal
uint16_t idpf_dp_singleq_recv_pkts_avx2(void *rx_queue,
struct rte_mbuf **rx_pkts,
uint16_t nb_pkts);
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
index 0122c82951..3ccd517c46 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
@@ -780,3 +780,176 @@ idpf_dp_singleq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
return nb_tx;
}
+
+static __rte_always_inline void
+idpf_splitq_scan_cq_ring(struct ci_tx_queue *cq)
+{
+ struct idpf_splitq_tx_compl_desc *compl_ring;
+ struct ci_tx_queue *txq;
+ uint16_t genid, txq_qid, cq_qid, i;
+ uint8_t ctype;
+
+ cq_qid = cq->tx_tail;
+
+ for (i = 0; i < IDPF_TXQ_SCAN_CQ_THRESH; i++) {
+ if (cq_qid == cq->nb_tx_desc) {
+ cq_qid = 0;
+ cq->expected_gen_id ^= 1; /* toggle generation bit */
+ }
+
+ compl_ring = &cq->compl_ring[cq_qid];
+
+ genid = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
+ IDPF_TXD_COMPLQ_GEN_M) >> IDPF_TXD_COMPLQ_GEN_S;
+
+ if (genid != cq->expected_gen_id)
+ break;
+
+ ctype = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
+ IDPF_TXD_COMPLQ_COMPL_TYPE_M) >> IDPF_TXD_COMPLQ_COMPL_TYPE_S;
+ txq_qid = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
+ IDPF_TXD_COMPLQ_QID_M) >> IDPF_TXD_COMPLQ_QID_S;
+
+ txq = cq->txqs[txq_qid - cq->tx_start_qid];
+ if (ctype == IDPF_TXD_COMPLT_RS)
+ txq->rs_compl_count++;
+
+ cq_qid++;
+ }
+
+ cq->tx_tail = cq_qid;
+}
+
+static __rte_always_inline void
+idpf_splitq_vtx1_avx2(struct idpf_flex_tx_sched_desc *txdp,
+ struct rte_mbuf *pkt, uint64_t flags)
+{
+ uint64_t high_qw =
+ IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE |
+ ((uint64_t)flags) |
+ ((uint64_t)pkt->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+
+ __m128i descriptor = _mm_set_epi64x(high_qw,
+ pkt->buf_iova + pkt->data_off);
+ _mm_storeu_si128((__m128i *)txdp, descriptor);
+}
+
+static inline void
+idpf_splitq_vtx_avx2(struct idpf_flex_tx_sched_desc *txdp,
+ struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags)
+{
+ const uint64_t hi_qw_tmpl = IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE | ((uint64_t)flags);
+
+ /* align if needed */
+ if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
+ idpf_splitq_vtx1_avx2(txdp, *pkt, flags);
+ txdp++, pkt++, nb_pkts--;
+ }
+
+ for (; nb_pkts >= IDPF_VPMD_DESCS_PER_LOOP; txdp += IDPF_VPMD_DESCS_PER_LOOP,
+ pkt += IDPF_VPMD_DESCS_PER_LOOP, nb_pkts -= IDPF_VPMD_DESCS_PER_LOOP) {
+ uint64_t hi_qw0 = hi_qw_tmpl |
+ ((uint64_t)pkt[0]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+ uint64_t hi_qw1 = hi_qw_tmpl |
+ ((uint64_t)pkt[1]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+ uint64_t hi_qw2 = hi_qw_tmpl |
+ ((uint64_t)pkt[2]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+ uint64_t hi_qw3 = hi_qw_tmpl |
+ ((uint64_t)pkt[3]->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S);
+
+ __m256i desc0_1 = _mm256_set_epi64x(hi_qw1,
+ pkt[1]->buf_iova + pkt[1]->data_off,
+ hi_qw0,
+ pkt[0]->buf_iova + pkt[0]->data_off);
+ __m256i desc2_3 = _mm256_set_epi64x(hi_qw3,
+ pkt[3]->buf_iova + pkt[3]->data_off,
+ hi_qw2,
+ pkt[2]->buf_iova + pkt[2]->data_off);
+
+ _mm256_storeu_si256((__m256i *)(txdp + 0), desc0_1);
+ _mm256_storeu_si256((__m256i *)(txdp + 2), desc2_3);
+ }
+
+ while (nb_pkts--) {
+ idpf_splitq_vtx1_avx2(txdp, *pkt, flags);
+ txdp++;
+ pkt++;
+ }
+}
+
+static inline uint16_t
+idpf_splitq_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+ struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
+ struct idpf_flex_tx_sched_desc *txdp;
+ struct ci_tx_entry_vec *txep;
+ uint16_t n, nb_commit;
+ uint64_t cmd_dtype = IDPF_TXD_FLEX_FLOW_CMD_EOP;
+ uint16_t tx_id = txq->tx_tail;
+
+ nb_commit = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
+ nb_pkts = nb_commit;
+ if (unlikely(nb_pkts == 0))
+ return 0;
+
+ txdp = (struct idpf_flex_tx_sched_desc *)&txq->desc_ring[tx_id];
+ txep = &txq->sw_ring_vec[tx_id];
+
+ txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
+
+ n = (uint16_t)(txq->nb_tx_desc - tx_id);
+ if (nb_commit >= n) {
+ ci_tx_backlog_entry_vec(txep, tx_pkts, n);
+
+ idpf_splitq_vtx_avx2(txdp, tx_pkts, n - 1, cmd_dtype);
+ tx_pkts += (n - 1);
+ txdp += (n - 1);
+
+ idpf_splitq_vtx1_avx2(txdp, *tx_pkts++, cmd_dtype);
+
+ nb_commit = (uint16_t)(nb_commit - n);
+ tx_id = 0;
+
+ txdp = &txq->desc_ring[tx_id];
+ txep = (void *)txq->sw_ring;
+ }
+
+ ci_tx_backlog_entry_vec(txep, tx_pkts, nb_commit);
+
+ idpf_splitq_vtx_avx2(txdp, tx_pkts, nb_commit, cmd_dtype);
+
+ tx_id = (uint16_t)(tx_id + nb_commit);
+ txq->tx_tail = tx_id;
+
+ IDPF_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+
+ return nb_pkts;
+}
+
+RTE_EXPORT_INTERNAL_SYMBOL(idpf_dp_splitq_xmit_pkts_avx2)
+uint16_t
+idpf_dp_splitq_xmit_pkts_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts)
+{
+ struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
+ uint16_t nb_tx = 0;
+
+ while (nb_pkts) {
+ uint16_t ret, num;
+ idpf_splitq_scan_cq_ring(txq->complq);
+
+ if (txq->rs_compl_count > txq->tx_free_thresh) {
+ ci_tx_free_bufs_vec(txq, idpf_tx_desc_done, false);
+ txq->rs_compl_count -= txq->tx_rs_thresh;
+ }
+
+ num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+ ret = idpf_splitq_xmit_fixed_burst_vec_avx2(tx_queue, &tx_pkts[nb_tx], num);
+ nb_tx += ret;
+ nb_pkts -= ret;
+ if (ret < num)
+ break;
+ }
+
+ return nb_tx;
+}
--
2.34.1
^ permalink raw reply related [flat|nested] 42+ messages in thread
* [PATCH v14 3/3] doc: add note on unsupported completion queue sharing
2026-02-26 9:42 ` [PATCH v14 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
2026-02-26 9:42 ` [PATCH v14 1/3] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
2026-02-26 9:42 ` [PATCH v14 2/3] net/idpf: enable AVX2 for split queue Tx Shaiq Wani
@ 2026-02-26 9:42 ` Shaiq Wani
2026-02-26 11:31 ` [PATCH v14 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Bruce Richardson
3 siblings, 0 replies; 42+ messages in thread
From: Shaiq Wani @ 2026-02-26 9:42 UTC (permalink / raw)
To: dev, bruce.richardson, aman.deep.singh
Added a note in the IDPF Poll Mode Driver documentation
to clarify that sharing a completion queue among multiple
TX queues serviced by different CPU cores is not supported
in split queue mode.
Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
---
doc/guides/nics/idpf.rst | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/doc/guides/nics/idpf.rst b/doc/guides/nics/idpf.rst
index b99cc18188..47e38fcb13 100644
--- a/doc/guides/nics/idpf.rst
+++ b/doc/guides/nics/idpf.rst
@@ -79,6 +79,11 @@ Runtime Configuration
Then the PMD will configure Tx queue with single queue mode.
Otherwise, split queue mode is chosen by default.
+.. note::
+
+ In split queue mode, sharing a completion queue among multiple TX queues that are
+ serviced by different CPU cores is not supported.
+
Driver compilation and testing
------------------------------
--
2.34.1
^ permalink raw reply related [flat|nested] 42+ messages in thread
* Re: [PATCH v14 0/3] net/idpf: enable AVX2 for split queue Rx/Tx
2026-02-26 9:42 ` [PATCH v14 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
` (2 preceding siblings ...)
2026-02-26 9:42 ` [PATCH v14 3/3] doc: add note on unsupported completion queue sharing Shaiq Wani
@ 2026-02-26 11:31 ` Bruce Richardson
3 siblings, 0 replies; 42+ messages in thread
From: Bruce Richardson @ 2026-02-26 11:31 UTC (permalink / raw)
To: Shaiq Wani; +Cc: dev, aman.deep.singh
On Thu, Feb 26, 2026 at 03:12:19PM +0530, Shaiq Wani wrote:
> In case some CPUs don't support AVX512, enable AVX2 for them to
> get better per-core performance.
>
> In the single queue model, the same descriptor queue is used by SW
> to post descriptors to the device and used by the device to report completed
> descriptors to SW. In contrast, the split queue model separates them into
> different queues for parallel processing and improved performance.
>
Series-acked-by: Bruce Richardson <bruce.richardson@intel.com>
Applied to dpdk-next-net-intel.
Thanks,
/Bruce
^ permalink raw reply [flat|nested] 42+ messages in thread
end of thread, other threads:[~2026-02-26 11:32 UTC | newest]
Thread overview: 42+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-09-17 5:26 [PATCH 0/2] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
2025-09-17 5:26 ` [PATCH 1/2] net/idpf: enable AVX2 for split queue Tx Shaiq Wani
2025-09-17 5:26 ` [PATCH 2/2] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
2025-09-17 9:51 ` [PATCH 0/2] net/idpf: enable AVX2 for split queue Rx/Tx Bruce Richardson
2025-10-17 10:34 ` [PATCH v7 0/3] " Shaiq Wani
2025-10-17 10:34 ` [PATCH v7 1/3] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
2025-10-17 10:34 ` [PATCH v7 2/3] net/idpf: enable AVX2 for split queue Tx Shaiq Wani
2025-10-17 10:34 ` [PATCH v7 3/3] doc: note on unsupported completion queue sharing Shaiq Wani
2025-10-27 8:07 ` [PATCH v7 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
2025-10-27 8:07 ` [PATCH v8 1/3] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
2025-10-27 8:07 ` [PATCH v8 2/3] net/idpf: enable AVX2 for split queue Tx Shaiq Wani
2025-10-27 8:07 ` [PATCH v8 3/3] doc: note on unsupported completion queue sharing Shaiq Wani
2025-10-28 5:29 ` [PATCH v9 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
2025-10-28 5:29 ` [PATCH v9 1/3] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
2025-10-28 5:29 ` [PATCH v9 2/3] net/idpf: enable AVX2 for split queue Tx Shaiq Wani
2025-10-28 5:29 ` [PATCH v9 3/3] doc: note on unsupported completion queue sharing Shaiq Wani
2025-10-28 14:21 ` [PATCH v9 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
2025-10-28 14:22 ` [PATCH v10 1/3] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
2025-12-11 12:53 ` Bruce Richardson
2025-10-28 14:22 ` [PATCH v10 2/3] net/idpf: enable AVX2 for split queue Tx Shaiq Wani
2025-10-28 14:22 ` [PATCH v10 3/3] doc: note on unsupported completion queue sharing Shaiq Wani
2025-12-11 13:00 ` Bruce Richardson
2026-01-05 6:22 ` [PATCH v11 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
2026-01-05 6:22 ` [PATCH v11 1/3] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
2026-01-05 6:22 ` [PATCH v11 2/3] net/idpf: enable AVX2 for split queue Tx Shaiq Wani
2026-01-05 6:22 ` [PATCH v11 3/3] doc: note on unsupported completion queue sharing Shaiq Wani
2026-01-13 18:13 ` [PATCH v11 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Stephen Hemminger
2026-01-26 17:14 ` Bruce Richardson
2026-02-24 7:47 ` [PATCH v12 " Shaiq Wani
2026-02-24 7:47 ` [PATCH v12 1/3] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
2026-02-24 14:27 ` Bruce Richardson
2026-02-24 7:47 ` [PATCH v12 2/3] net/idpf: enable AVX2 for split queue Tx Shaiq Wani
2026-02-24 7:47 ` [PATCH v12 3/3] doc: add note on unsupported completion queue sharing Shaiq Wani
2026-02-26 6:52 ` [PATCH v13 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
2026-02-26 6:52 ` [PATCH v13 1/3] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
2026-02-26 6:52 ` [PATCH v13 2/3] net/idpf: enable AVX2 for split queue Tx Shaiq Wani
2026-02-26 6:52 ` [PATCH v13 3/3] doc: add note on unsupported completion queue sharing Shaiq Wani
2026-02-26 9:42 ` [PATCH v14 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Shaiq Wani
2026-02-26 9:42 ` [PATCH v14 1/3] net/idpf: enable AVX2 for split queue Rx Shaiq Wani
2026-02-26 9:42 ` [PATCH v14 2/3] net/idpf: enable AVX2 for split queue Tx Shaiq Wani
2026-02-26 9:42 ` [PATCH v14 3/3] doc: add note on unsupported completion queue sharing Shaiq Wani
2026-02-26 11:31 ` [PATCH v14 0/3] net/idpf: enable AVX2 for split queue Rx/Tx Bruce Richardson
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox