All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v1] net/zxdh: optimize Rx/Tx path performance
@ 2026-03-26  2:28 Junlong Wang
  2026-03-26  3:27 ` Stephen Hemminger
                   ` (2 more replies)
  0 siblings, 3 replies; 35+ messages in thread
From: Junlong Wang @ 2026-03-26  2:28 UTC (permalink / raw)
  To: stephen; +Cc: dev, Junlong Wang


[-- Attachment #1.1.1: Type: text/plain, Size: 46849 bytes --]

This patch optimizes the ZXDH PMD's receive and transmit path for better
performance through several improvements:

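- Add simple Tx/Rx burst functions (zxdh_xmit_pkts_simple and
  zxdh_recv_single_pkts) for single-segment packets. The simple Tx path
  is selected when RTE_ETH_TX_OFFLOAD_MULTI_SEGS is not requested; the
  single-packet Rx path is selected when scattered Rx is not required.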
- Remove the Rx software ring (sw_ring) to reduce memory allocation and
  copying.
- Optimize descriptor management with prefetching and simplified
  cleanup.
- Reorganize structure fields for better cache locality.

These changes reduce CPU cycles and memory bandwidth consumption,
resulting in improved packet processing throughput.

Signed-off-by: Junlong Wang <wang.junlong1@zte.com.cn>
---
 drivers/net/zxdh/zxdh_ethdev.c     |  95 +++---
 drivers/net/zxdh/zxdh_ethdev_ops.c |  24 +-
 drivers/net/zxdh/zxdh_ethdev_ops.h |   4 +
 drivers/net/zxdh/zxdh_pci.c        |   2 +-
 drivers/net/zxdh/zxdh_queue.c      |  11 +-
 drivers/net/zxdh/zxdh_queue.h      | 120 ++++---
 drivers/net/zxdh/zxdh_rxtx.c       | 518 +++++++++++++++++++++--------
 drivers/net/zxdh/zxdh_rxtx.h       |  27 +-
 8 files changed, 534 insertions(+), 267 deletions(-)

diff --git a/drivers/net/zxdh/zxdh_ethdev.c b/drivers/net/zxdh/zxdh_ethdev.c
index aeb01f4652..a5238fc6f8 100644
--- a/drivers/net/zxdh/zxdh_ethdev.c
+++ b/drivers/net/zxdh/zxdh_ethdev.c
@@ -490,7 +490,7 @@ zxdh_dev_free_mbufs(struct rte_eth_dev *dev)
 		if (!vq)
 			continue;
 		while ((buf = zxdh_queue_detach_unused(vq)) != NULL)
-			rte_pktmbuf_free(buf);
+			rte_pktmbuf_free_seg(buf);
 		PMD_DRV_LOG(DEBUG, "freeing %s[%d] used and unused buf",
 		"rxq", i * 2);
 	}
@@ -499,7 +499,7 @@ zxdh_dev_free_mbufs(struct rte_eth_dev *dev)
 		if (!vq)
 			continue;
 		while ((buf = zxdh_queue_detach_unused(vq)) != NULL)
-			rte_pktmbuf_free(buf);
+			rte_pktmbuf_free_seg(buf);
 		PMD_DRV_LOG(DEBUG, "freeing %s[%d] used and unused buf",
 		"txq", i * 2 + 1);
 	}
@@ -644,7 +644,6 @@ zxdh_init_queue(struct rte_eth_dev *dev, uint16_t vtpci_logic_qidx)
 	struct zxdh_virtnet_tx *txvq = NULL;
 	struct zxdh_virtqueue *vq = NULL;
 	size_t sz_hdr_mz = 0;
-	void *sw_ring = NULL;
 	int32_t queue_type = zxdh_get_queue_type(vtpci_logic_qidx);
 	int32_t numa_node = dev->device->numa_node;
 	uint16_t vtpci_phy_qidx = 0;
@@ -692,11 +691,10 @@ zxdh_init_queue(struct rte_eth_dev *dev, uint16_t vtpci_logic_qidx)
 	vq->vq_queue_index = vtpci_phy_qidx;
 	vq->vq_nentries = vq_size;
 
-	vq->vq_packed.used_wrap_counter = 1;
-	vq->vq_packed.cached_flags = ZXDH_VRING_PACKED_DESC_F_AVAIL;
-	vq->vq_packed.event_flags_shadow = 0;
+	vq->used_wrap_counter = 1;
+	vq->cached_flags = ZXDH_VRING_PACKED_DESC_F_AVAIL;
 	if (queue_type == ZXDH_VTNET_RQ)
-		vq->vq_packed.cached_flags |= ZXDH_VRING_DESC_F_WRITE;
+		vq->cached_flags |= ZXDH_VRING_DESC_F_WRITE;
 
 	/*
 	 * Reserve a memzone for vring elements
@@ -741,46 +739,28 @@ zxdh_init_queue(struct rte_eth_dev *dev, uint16_t vtpci_logic_qidx)
 	}
 
 	if (queue_type == ZXDH_VTNET_RQ) {
-		size_t sz_sw = (ZXDH_MBUF_BURST_SZ + vq_size) * sizeof(vq->sw_ring[0]);
-
-		sw_ring = rte_zmalloc_socket("sw_ring", sz_sw, RTE_CACHE_LINE_SIZE, numa_node);
-		if (!sw_ring) {
-			PMD_DRV_LOG(ERR, "can not allocate RX soft ring");
-			ret = -ENOMEM;
-			goto fail_q_alloc;
-		}
-
-		vq->sw_ring = sw_ring;
 		rxvq = &vq->rxq;
 		rxvq->vq = vq;
 		rxvq->port_id = dev->data->port_id;
 		rxvq->mz = mz;
 	} else {             /* queue_type == VTNET_TQ */
+		if (hdr_mz == NULL) {
+			ret = -ENOMEM;
+			PMD_DRV_LOG(ERR, "TX net header memzone is missing: %d", ret);
+			goto fail_q_alloc;
+		}
 		txvq = &vq->txq;
 		txvq->vq = vq;
 		txvq->port_id = dev->data->port_id;
 		txvq->mz = mz;
 		txvq->zxdh_net_hdr_mz = hdr_mz;
-		txvq->zxdh_net_hdr_mem = hdr_mz->iova;
+		if (hdr_mz)
+			txvq->zxdh_net_hdr_mem = hdr_mz->iova;
 	}
 
-	vq->offset = offsetof(struct rte_mbuf, buf_iova);
-	if (queue_type == ZXDH_VTNET_TQ) {
+	if (queue_type == ZXDH_VTNET_TQ && hdr_mz) {
 		struct zxdh_tx_region *txr = hdr_mz->addr;
-		uint32_t i;
-
 		memset(txr, 0, vq_size * sizeof(*txr));
-		for (i = 0; i < vq_size; i++) {
-			/* first indirect descriptor is always the tx header */
-			struct zxdh_vring_packed_desc *start_dp = txr[i].tx_packed_indir;
-
-			zxdh_vring_desc_init_indirect_packed(start_dp,
-					RTE_DIM(txr[i].tx_packed_indir));
-			start_dp->addr = txvq->zxdh_net_hdr_mem + i * sizeof(*txr) +
-					offsetof(struct zxdh_tx_region, tx_hdr);
-			/* length will be updated to actual pi hdr size when xmit pkt */
-			start_dp->len = 0;
-		}
 	}
 	if (ZXDH_VTPCI_OPS(hw)->setup_queue(hw, vq) < 0) {
 		PMD_DRV_LOG(ERR, "setup_queue failed");
@@ -788,8 +768,8 @@ zxdh_init_queue(struct rte_eth_dev *dev, uint16_t vtpci_logic_qidx)
 	}
 	return 0;
 fail_q_alloc:
-	rte_free(sw_ring);
-	rte_memzone_free(hdr_mz);
+	if (hdr_mz)
+		rte_memzone_free(hdr_mz);
 	rte_memzone_free(mz);
 	rte_free(vq);
 	return ret;
@@ -1290,18 +1270,49 @@ zxdh_dev_close(struct rte_eth_dev *dev)
 	return ret;
 }
 
+/*
+ * Return 1 if scattered Rx is required (LRO/SCATTER offload requested, or
+ * MTU + L2 overhead exceeds one Rx buffer), else 0. Sets data->lro for LRO.
+ */
+static int
+zxdh_scattered_rx(struct rte_eth_dev *eth_dev)
+{
+	uint32_t buf_size;
+
+	if (eth_dev->data->dev_conf.rxmode.offloads & RTE_ETH_RX_OFFLOAD_TCP_LRO) {
+		eth_dev->data->lro = 1;
+		return 1;
+	}
+
+	if (eth_dev->data->dev_conf.rxmode.offloads & RTE_ETH_RX_OFFLOAD_SCATTER)
+		return 1;
+
+	PMD_DRV_LOG(DEBUG, "port %d min_rx_buf_size %u",
+		eth_dev->data->port_id, eth_dev->data->min_rx_buf_size);
+	buf_size = eth_dev->data->min_rx_buf_size - RTE_PKTMBUF_HEADROOM;
+	if ((uint32_t)ZXDH_MTU_TO_PKTLEN(eth_dev->data->mtu) > buf_size)
+		return 1;
+
+	return 0;
+}
+
 static int32_t
 zxdh_set_rxtx_funcs(struct rte_eth_dev *eth_dev)
 {
-	struct zxdh_hw *hw = eth_dev->data->dev_private;
+	uint64_t tx_offloads = eth_dev->data->dev_conf.txmode.offloads;
 
-	if (!zxdh_pci_with_feature(hw, ZXDH_NET_F_MRG_RXBUF)) {
-		PMD_DRV_LOG(ERR, "port %u not support rx mergeable", eth_dev->data->port_id);
-		return -1;
-	}
 	eth_dev->tx_pkt_prepare = zxdh_xmit_pkts_prepare;
-	eth_dev->tx_pkt_burst = &zxdh_xmit_pkts_packed;
-	eth_dev->rx_pkt_burst = &zxdh_recv_pkts_packed;
+	eth_dev->data->scattered_rx = zxdh_scattered_rx(eth_dev);
+
+	if (!(tx_offloads & RTE_ETH_TX_OFFLOAD_MULTI_SEGS))
+		eth_dev->tx_pkt_burst = &zxdh_xmit_pkts_simple;
+	else
+		eth_dev->tx_pkt_burst = &zxdh_xmit_pkts_packed;
+
+	if (eth_dev->data->scattered_rx)
+		eth_dev->rx_pkt_burst = &zxdh_recv_pkts_packed;
+	else
+		eth_dev->rx_pkt_burst = &zxdh_recv_single_pkts;
 
 	return 0;
 }
diff --git a/drivers/net/zxdh/zxdh_ethdev_ops.c b/drivers/net/zxdh/zxdh_ethdev_ops.c
index 50247116d9..e2c2885add 100644
--- a/drivers/net/zxdh/zxdh_ethdev_ops.c
+++ b/drivers/net/zxdh/zxdh_ethdev_ops.c
@@ -95,10 +95,6 @@ static const struct rte_zxdh_xstats_name_off zxdh_rxq_stat_strings[] = {
 	{"good_bytes",             offsetof(struct zxdh_virtnet_rx, stats.bytes)},
 	{"errors",                 offsetof(struct zxdh_virtnet_rx, stats.errors)},
 	{"idle",                   offsetof(struct zxdh_virtnet_rx, stats.idle)},
-	{"full",                   offsetof(struct zxdh_virtnet_rx, stats.full)},
-	{"norefill",               offsetof(struct zxdh_virtnet_rx, stats.norefill)},
-	{"multicast_packets",      offsetof(struct zxdh_virtnet_rx, stats.multicast)},
-	{"broadcast_packets",      offsetof(struct zxdh_virtnet_rx, stats.broadcast)},
 	{"truncated_err",          offsetof(struct zxdh_virtnet_rx, stats.truncated_err)},
 	{"offload_cfg_err",        offsetof(struct zxdh_virtnet_rx, stats.offload_cfg_err)},
 	{"invalid_hdr_len_err",    offsetof(struct zxdh_virtnet_rx, stats.invalid_hdr_len_err)},
@@ -117,14 +113,12 @@ static const struct rte_zxdh_xstats_name_off zxdh_txq_stat_strings[] = {
 	{"good_packets",           offsetof(struct zxdh_virtnet_tx, stats.packets)},
 	{"good_bytes",             offsetof(struct zxdh_virtnet_tx, stats.bytes)},
 	{"errors",                 offsetof(struct zxdh_virtnet_tx, stats.errors)},
-	{"idle",                   offsetof(struct zxdh_virtnet_tx, stats.idle)},
-	{"norefill",               offsetof(struct zxdh_virtnet_tx, stats.norefill)},
-	{"multicast_packets",      offsetof(struct zxdh_virtnet_tx, stats.multicast)},
-	{"broadcast_packets",      offsetof(struct zxdh_virtnet_tx, stats.broadcast)},
+	{"idle",                 offsetof(struct zxdh_virtnet_tx, stats.idle)},
 	{"truncated_err",          offsetof(struct zxdh_virtnet_tx, stats.truncated_err)},
 	{"offload_cfg_err",        offsetof(struct zxdh_virtnet_tx, stats.offload_cfg_err)},
 	{"invalid_hdr_len_err",    offsetof(struct zxdh_virtnet_tx, stats.invalid_hdr_len_err)},
 	{"no_segs_err",            offsetof(struct zxdh_virtnet_tx, stats.no_segs_err)},
+	{"no_free_tx_desc_err",    offsetof(struct zxdh_virtnet_tx, stats.no_free_tx_desc_err)},
 	{"undersize_packets",      offsetof(struct zxdh_virtnet_tx, stats.size_bins[0])},
 	{"size_64_packets",        offsetof(struct zxdh_virtnet_tx, stats.size_bins[1])},
 	{"size_65_127_packets",    offsetof(struct zxdh_virtnet_tx, stats.size_bins[2])},
@@ -2026,6 +2020,20 @@ int zxdh_dev_mtu_set(struct rte_eth_dev *dev, uint16_t new_mtu)
 	uint16_t vfid = zxdh_vport_to_vfid(hw->vport);
 	int ret;
 
+	/* If device is started, refuse mtu that requires the support of
+	 * scattered packets when this feature has not been enabled before.
+	 */
+	if (dev->data->dev_started &&
+		((!dev->data->scattered_rx &&
+		 ((uint32_t)ZXDH_MTU_TO_PKTLEN(new_mtu) >
+		 (dev->data->min_rx_buf_size - RTE_PKTMBUF_HEADROOM))) ||
+		 (dev->data->scattered_rx &&
+		 ((uint32_t)ZXDH_MTU_TO_PKTLEN(new_mtu) <=
+		 (dev->data->min_rx_buf_size - RTE_PKTMBUF_HEADROOM))))) {
+		PMD_DRV_LOG(ERR, "Stop port first.");
+		return -EINVAL;
+	}
+
 	if (hw->is_pf) {
 		ret = zxdh_get_panel_attr(dev, &panel);
 		if (ret != 0) {
diff --git a/drivers/net/zxdh/zxdh_ethdev_ops.h b/drivers/net/zxdh/zxdh_ethdev_ops.h
index 6dfe4be473..c49d79c232 100644
--- a/drivers/net/zxdh/zxdh_ethdev_ops.h
+++ b/drivers/net/zxdh/zxdh_ethdev_ops.h
@@ -40,6 +40,10 @@
 #define ZXDH_SPM_SPEED_4X_100G         RTE_BIT32(10)
 #define ZXDH_SPM_SPEED_4X_200G         RTE_BIT32(11)
 
+#define ZXDH_VLAN_TAG_LEN   4
+#define ZXDH_ETH_OVERHEAD  (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN + ZXDH_VLAN_TAG_LEN * 2)
+#define ZXDH_MTU_TO_PKTLEN(mtu) ((mtu) + ZXDH_ETH_OVERHEAD)
+
 struct zxdh_np_stats_data {
 	uint64_t n_pkts_dropped;
 	uint64_t n_bytes_dropped;
diff --git a/drivers/net/zxdh/zxdh_pci.c b/drivers/net/zxdh/zxdh_pci.c
index 4ba31905fc..0bc27ed111 100644
--- a/drivers/net/zxdh/zxdh_pci.c
+++ b/drivers/net/zxdh/zxdh_pci.c
@@ -231,7 +231,7 @@ zxdh_notify_queue(struct zxdh_hw *hw, struct zxdh_virtqueue *vq)
 
 	notify_data = ((uint32_t)vq->vq_avail_idx << 16) | vq->vq_queue_index;
 	if (zxdh_pci_with_feature(hw, ZXDH_F_RING_PACKED) &&
-			(vq->vq_packed.cached_flags & ZXDH_VRING_PACKED_DESC_F_AVAIL))
+			(vq->cached_flags & ZXDH_VRING_PACKED_DESC_F_AVAIL))
 		notify_data |= RTE_BIT32(31);
 
 	PMD_DRV_LOG(DEBUG, "queue:%d notify_data 0x%x notify_addr 0x%p",
diff --git a/drivers/net/zxdh/zxdh_queue.c b/drivers/net/zxdh/zxdh_queue.c
index 7162593b16..4668cb5d13 100644
--- a/drivers/net/zxdh/zxdh_queue.c
+++ b/drivers/net/zxdh/zxdh_queue.c
@@ -407,7 +407,7 @@ int32_t zxdh_enqueue_recv_refill_packed(struct zxdh_virtqueue *vq,
 {
 	struct zxdh_vring_packed_desc *start_dp = vq->vq_packed.ring.desc;
 	struct zxdh_vq_desc_extra *dxp;
-	uint16_t flags = vq->vq_packed.cached_flags;
+	uint16_t flags = vq->cached_flags;
 	int32_t i;
 	uint16_t idx;
 
@@ -415,7 +415,6 @@ int32_t zxdh_enqueue_recv_refill_packed(struct zxdh_virtqueue *vq,
 		idx = vq->vq_avail_idx;
 		dxp = &vq->vq_descx[idx];
 		dxp->cookie = (void *)cookie[i];
-		dxp->ndescs = 1;
 		/* rx pkt fill in data_off */
 		start_dp[idx].addr = rte_mbuf_iova_get(cookie[i]) + RTE_PKTMBUF_HEADROOM;
 		start_dp[idx].len = cookie[i]->buf_len - RTE_PKTMBUF_HEADROOM;
@@ -423,8 +422,8 @@ int32_t zxdh_enqueue_recv_refill_packed(struct zxdh_virtqueue *vq,
 		zxdh_queue_store_flags_packed(&start_dp[idx], flags);
 		if (++vq->vq_avail_idx >= vq->vq_nentries) {
 			vq->vq_avail_idx -= vq->vq_nentries;
-			vq->vq_packed.cached_flags ^= ZXDH_VRING_PACKED_DESC_F_AVAIL_USED;
-			flags = vq->vq_packed.cached_flags;
+			vq->cached_flags ^= ZXDH_VRING_PACKED_DESC_F_AVAIL_USED;
+			flags = vq->cached_flags;
 		}
 	}
 	vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt - num);
@@ -467,7 +466,7 @@ void zxdh_queue_rxvq_flush(struct zxdh_virtqueue *vq)
 	int32_t cnt = 0;
 
 	i = vq->vq_used_cons_idx;
-	while (zxdh_desc_used(&descs[i], vq) && cnt++ < vq->vq_nentries) {
+	while (desc_is_used(&descs[i], vq) && cnt++ < vq->vq_nentries) {
 		dxp = &vq->vq_descx[descs[i].id];
 		if (dxp->cookie != NULL) {
 			rte_pktmbuf_free(dxp->cookie);
@@ -477,7 +476,7 @@ void zxdh_queue_rxvq_flush(struct zxdh_virtqueue *vq)
 		vq->vq_used_cons_idx++;
 		if (vq->vq_used_cons_idx >= vq->vq_nentries) {
 			vq->vq_used_cons_idx -= vq->vq_nentries;
-			vq->vq_packed.used_wrap_counter ^= 1;
+			vq->used_wrap_counter ^= 1;
 		}
 		i = vq->vq_used_cons_idx;
 	}
diff --git a/drivers/net/zxdh/zxdh_queue.h b/drivers/net/zxdh/zxdh_queue.h
index 1a0c8a0d90..94101c8269 100644
--- a/drivers/net/zxdh/zxdh_queue.h
+++ b/drivers/net/zxdh/zxdh_queue.h
@@ -9,6 +9,7 @@
 
 #include <rte_common.h>
 #include <rte_atomic.h>
+#include <rte_io.h>
 
 #include "zxdh_ethdev.h"
 #include "zxdh_rxtx.h"
@@ -117,7 +118,6 @@ struct zxdh_vring_packed_desc_event {
 };
 
 struct zxdh_vring_packed {
-	uint32_t num;
 	struct zxdh_vring_packed_desc *desc;
 	struct zxdh_vring_packed_desc_event *driver;
 	struct zxdh_vring_packed_desc_event *device;
@@ -129,50 +129,59 @@ struct zxdh_vq_desc_extra {
 	uint16_t next;
 };
 
+struct zxdh_vring {
+	uint32_t num;
+	struct zxdh_vring_desc  *desc;
+	struct zxdh_vring_avail *avail;
+	struct zxdh_vring_used  *used;
+};
+
 struct zxdh_virtqueue {
+	union {
+		struct {
+			struct zxdh_vring ring; /**< vring keeping desc, used and avail */
+		} vq_split;
+		struct __rte_packed_begin {
+			struct zxdh_vring_packed ring;
+		} __rte_packed_end vq_packed;
+	};
 	struct zxdh_hw  *hw; /* < zxdh_hw structure pointer. */
 
-	struct {
-		/* vring keeping descs and events */
-		struct zxdh_vring_packed ring;
-		uint8_t used_wrap_counter;
-		uint8_t rsv;
-		uint16_t cached_flags; /* < cached flags for descs */
-		uint16_t event_flags_shadow;
-		uint16_t rsv1;
-	} vq_packed;
-
-	uint16_t vq_used_cons_idx; /* < last consumed descriptor */
-	uint16_t vq_nentries;  /* < vring desc numbers */
-	uint16_t vq_free_cnt;  /* < num of desc available */
-	uint16_t vq_avail_idx; /* < sync until needed */
-	uint16_t vq_free_thresh; /* < free threshold */
-	uint16_t rsv2;
-
-	void *vq_ring_virt_mem;  /* < linear address of vring */
-	uint32_t vq_ring_size;
+	uint16_t vq_used_cons_idx; /**< last consumed descriptor */
+	uint16_t vq_avail_idx; /**< sync until needed */
+	uint16_t vq_nentries;  /**< vring desc numbers */
+	uint16_t vq_free_cnt;  /**< num of desc available */
+
+	uint16_t cached_flags; /**< cached flags for descs */
+	uint8_t used_wrap_counter;
+	uint8_t rsv;
+	uint16_t vq_free_thresh; /**< free threshold */
+	uint16_t next_qidx;
+
+	void *notify_addr;
 
 	union {
 		struct zxdh_virtnet_rx rxq;
 		struct zxdh_virtnet_tx txq;
 	};
 
-	/*
-	 * physical address of vring, or virtual address
-	 */
-	rte_iova_t vq_ring_mem;
+	uint16_t vq_queue_index; /* PACKED: phy_idx, SPLIT: logic_idx */
+	uint16_t event_flags_shadow;
+	uint32_t vq_ring_size;
 
-	/*
+	/**
 	 * Head of the free chain in the descriptor table. If
 	 * there are no free descriptors, this will be set to
 	 * VQ_RING_DESC_CHAIN_END.
-	 */
+	 **/
 	uint16_t  vq_desc_head_idx;
 	uint16_t  vq_desc_tail_idx;
-	uint16_t  vq_queue_index;   /* < PCI queue index */
-	uint16_t  offset; /* < relative offset to obtain addr in mbuf */
-	uint16_t *notify_addr;
-	struct rte_mbuf **sw_ring;  /* < RX software ring. */
+	uint32_t rsv_8B;
+
+	void *vq_ring_virt_mem;  /**< linear address of vring*/
+	/* physical address of vring, or virtual address for virtio_user. */
+	rte_iova_t vq_ring_mem;
+
 	struct zxdh_vq_desc_extra vq_descx[];
 };
 
@@ -296,10 +305,9 @@ static inline void
 zxdh_vring_init_packed(struct zxdh_vring_packed *vr, uint8_t *p,
 		unsigned long align, uint32_t num)
 {
-	vr->num    = num;
 	vr->desc   = (struct zxdh_vring_packed_desc *)p;
 	vr->driver = (struct zxdh_vring_packed_desc_event *)(p +
-				 vr->num * sizeof(struct zxdh_vring_packed_desc));
+				 num * sizeof(struct zxdh_vring_packed_desc));
 	vr->device = (struct zxdh_vring_packed_desc_event *)RTE_ALIGN_CEIL(((uintptr_t)vr->driver +
 				 sizeof(struct zxdh_vring_packed_desc_event)), align);
 }
@@ -331,30 +339,21 @@ zxdh_vring_desc_init_indirect_packed(struct zxdh_vring_packed_desc *dp, int32_t
 static inline void
 zxdh_queue_disable_intr(struct zxdh_virtqueue *vq)
 {
-	if (vq->vq_packed.event_flags_shadow != ZXDH_RING_EVENT_FLAGS_DISABLE) {
-		vq->vq_packed.event_flags_shadow = ZXDH_RING_EVENT_FLAGS_DISABLE;
-		vq->vq_packed.ring.driver->desc_event_flags = vq->vq_packed.event_flags_shadow;
+	if (vq->event_flags_shadow != ZXDH_RING_EVENT_FLAGS_DISABLE) {
+		vq->event_flags_shadow = ZXDH_RING_EVENT_FLAGS_DISABLE;
+		vq->vq_packed.ring.driver->desc_event_flags = vq->event_flags_shadow;
 	}
 }
 
 static inline void
 zxdh_queue_enable_intr(struct zxdh_virtqueue *vq)
 {
-	if (vq->vq_packed.event_flags_shadow == ZXDH_RING_EVENT_FLAGS_DISABLE) {
-		vq->vq_packed.event_flags_shadow = ZXDH_RING_EVENT_FLAGS_DISABLE;
-		vq->vq_packed.ring.driver->desc_event_flags = vq->vq_packed.event_flags_shadow;
+	if (vq->event_flags_shadow == ZXDH_RING_EVENT_FLAGS_DISABLE) {
+		vq->event_flags_shadow = ZXDH_RING_EVENT_FLAGS_ENABLE; /* re-enable events */
+		vq->vq_packed.ring.driver->desc_event_flags = vq->event_flags_shadow;
 	}
 }
 
-static inline void
-zxdh_mb(uint8_t weak_barriers)
-{
-	if (weak_barriers)
-		rte_atomic_thread_fence(rte_memory_order_seq_cst);
-	else
-		rte_mb();
-}
-
 static inline
 int32_t desc_is_used(struct zxdh_vring_packed_desc *desc, struct zxdh_virtqueue *vq)
 {
@@ -365,7 +364,7 @@ int32_t desc_is_used(struct zxdh_vring_packed_desc *desc, struct zxdh_virtqueue
 	rte_io_rmb();
 	used = !!(flags & ZXDH_VRING_PACKED_DESC_F_USED);
 	avail = !!(flags & ZXDH_VRING_PACKED_DESC_F_AVAIL);
-	return avail == used && used == vq->vq_packed.used_wrap_counter;
+	return avail == used && used == vq->used_wrap_counter;
 }
 
 static inline int32_t
@@ -381,22 +380,17 @@ zxdh_queue_store_flags_packed(struct zxdh_vring_packed_desc *dp, uint16_t flags)
 	dp->flags = flags;
 }
 
-static inline int32_t
-zxdh_desc_used(struct zxdh_vring_packed_desc *desc, struct zxdh_virtqueue *vq)
-{
-	uint16_t flags;
-	uint16_t used, avail;
-
-	flags = desc->flags;
-	rte_io_rmb();
-	used = !!(flags & ZXDH_VRING_PACKED_DESC_F_USED);
-	avail = !!(flags & ZXDH_VRING_PACKED_DESC_F_AVAIL);
-	return avail == used && used == vq->vq_packed.used_wrap_counter;
-}
-
 static inline void zxdh_queue_notify(struct zxdh_virtqueue *vq)
 {
-	ZXDH_VTPCI_OPS(vq->hw)->notify_queue(vq->hw, vq);
+	/* Bit[0:15]: vq queue index
+	 * Bit[16:30]: avail index
+	 * Bit[31]: avail wrap counter
+	 */
+	uint32_t notify_data = ((uint32_t)(!!(vq->cached_flags &
+		ZXDH_VRING_PACKED_DESC_F_AVAIL)) << 31) |
+		((uint32_t)vq->vq_avail_idx << 16) |
+		vq->vq_queue_index;
+	rte_write32(notify_data, vq->notify_addr);
 }
 
 static inline int32_t
@@ -404,7 +398,7 @@ zxdh_queue_kick_prepare_packed(struct zxdh_virtqueue *vq)
 {
 	uint16_t flags = 0;
 
-	zxdh_mb(1);
+	rte_mb();
 	flags = vq->vq_packed.ring.device->desc_event_flags;
 
 	return (flags != ZXDH_RING_EVENT_FLAGS_DISABLE);
diff --git a/drivers/net/zxdh/zxdh_rxtx.c b/drivers/net/zxdh/zxdh_rxtx.c
index db86922aea..111cf54b0d 100644
--- a/drivers/net/zxdh/zxdh_rxtx.c
+++ b/drivers/net/zxdh/zxdh_rxtx.c
@@ -114,6 +114,22 @@
 		RTE_MBUF_F_TX_SEC_OFFLOAD |     \
 		RTE_MBUF_F_TX_UDP_SEG)
 
+#if RTE_CACHE_LINE_SIZE == 128
+#define NEXT_CACHELINE_OFF_16B   8
+#define NEXT_CACHELINE_OFF_8B   16
+#elif RTE_CACHE_LINE_SIZE == 64
+#define NEXT_CACHELINE_OFF_16B   4
+#define NEXT_CACHELINE_OFF_8B    8
+#else
+#define NEXT_CACHELINE_OFF_16B  (RTE_CACHE_LINE_SIZE / 16)
+#define NEXT_CACHELINE_OFF_8B   (RTE_CACHE_LINE_SIZE / 8)
+#endif
+#define N_PER_LOOP  NEXT_CACHELINE_OFF_8B
+#define N_PER_LOOP_MASK (N_PER_LOOP - 1)
+
+#define rxq_get_vq(q) ((q)->vq)
+#define txq_get_vq(q) ((q)->vq)
+
 uint32_t zxdh_outer_l2_type[16] = {
 	0,
 	RTE_PTYPE_L2_ETHER,
@@ -201,43 +217,6 @@ uint32_t zxdh_inner_l4_type[16] = {
 	0,
 };
 
-static void
-zxdh_xmit_cleanup_inorder_packed(struct zxdh_virtqueue *vq, int32_t num)
-{
-	uint16_t used_idx = 0;
-	uint16_t id       = 0;
-	uint16_t curr_id  = 0;
-	uint16_t free_cnt = 0;
-	uint16_t size     = vq->vq_nentries;
-	struct zxdh_vring_packed_desc *desc = vq->vq_packed.ring.desc;
-	struct zxdh_vq_desc_extra     *dxp  = NULL;
-
-	used_idx = vq->vq_used_cons_idx;
-	/* desc_is_used has a load-acquire or rte_io_rmb inside
-	 * and wait for used desc in virtqueue.
-	 */
-	while (num > 0 && zxdh_desc_used(&desc[used_idx], vq)) {
-		id = desc[used_idx].id;
-		do {
-			curr_id = used_idx;
-			dxp = &vq->vq_descx[used_idx];
-			used_idx += dxp->ndescs;
-			free_cnt += dxp->ndescs;
-			num -= dxp->ndescs;
-			if (used_idx >= size) {
-				used_idx -= size;
-				vq->vq_packed.used_wrap_counter ^= 1;
-			}
-			if (dxp->cookie != NULL) {
-				rte_pktmbuf_free(dxp->cookie);
-				dxp->cookie = NULL;
-			}
-		} while (curr_id != id);
-	}
-	vq->vq_used_cons_idx = used_idx;
-	vq->vq_free_cnt += free_cnt;
-}
-
 static inline uint16_t
 zxdh_get_mtu(struct zxdh_virtqueue *vq)
 {
@@ -334,18 +313,17 @@ zxdh_xmit_fill_net_hdr(struct zxdh_virtqueue *vq, struct rte_mbuf *cookie,
 }
 
 static inline void
-zxdh_enqueue_xmit_packed_fast(struct zxdh_virtnet_tx *txvq,
+zxdh_xmit_enqueue_push(struct zxdh_virtnet_tx *txvq,
 						struct rte_mbuf *cookie)
 {
 	struct zxdh_virtqueue *vq = txvq->vq;
 	uint16_t id = vq->vq_avail_idx;
 	struct zxdh_vq_desc_extra *dxp = &vq->vq_descx[id];
-	uint16_t flags = vq->vq_packed.cached_flags;
+	uint16_t flags = vq->cached_flags;
 	struct zxdh_net_hdr_dl *hdr = NULL;
 	uint8_t hdr_len = vq->hw->dl_net_hdr_len;
 	struct zxdh_vring_packed_desc *dp = &vq->vq_packed.ring.desc[id];
 
-	dxp->ndescs = 1;
 	dxp->cookie = cookie;
 	hdr = rte_pktmbuf_mtod_offset(cookie, struct zxdh_net_hdr_dl *, -hdr_len);
 	zxdh_xmit_fill_net_hdr(vq, cookie, hdr);
@@ -355,69 +333,65 @@ zxdh_enqueue_xmit_packed_fast(struct zxdh_virtnet_tx *txvq,
 	dp->id   = id;
 	if (++vq->vq_avail_idx >= vq->vq_nentries) {
 		vq->vq_avail_idx -= vq->vq_nentries;
-		vq->vq_packed.cached_flags ^= ZXDH_VRING_PACKED_DESC_F_AVAIL_USED;
+		vq->cached_flags ^= ZXDH_VRING_PACKED_DESC_F_AVAIL_USED;
 	}
 	vq->vq_free_cnt--;
 	zxdh_queue_store_flags_packed(dp, flags);
 }
 
 static inline void
-zxdh_enqueue_xmit_packed(struct zxdh_virtnet_tx *txvq,
+zxdh_xmit_enqueue_append(struct zxdh_virtnet_tx *txvq,
 						struct rte_mbuf *cookie,
 						uint16_t needed)
 {
 	struct zxdh_tx_region *txr = txvq->zxdh_net_hdr_mz->addr;
 	struct zxdh_virtqueue *vq = txvq->vq;
-	uint16_t id = vq->vq_avail_idx;
-	struct zxdh_vq_desc_extra *dxp = &vq->vq_descx[id];
+	struct zxdh_vq_desc_extra *dep = &vq->vq_descx[0];
 	uint16_t head_idx = vq->vq_avail_idx;
 	uint16_t idx = head_idx;
 	struct zxdh_vring_packed_desc *start_dp = vq->vq_packed.ring.desc;
 	struct zxdh_vring_packed_desc *head_dp = &vq->vq_packed.ring.desc[idx];
 	struct zxdh_net_hdr_dl *hdr = NULL;
-
-	uint16_t head_flags = cookie->next ? ZXDH_VRING_DESC_F_NEXT : 0;
+	uint16_t id = vq->vq_avail_idx;
+	struct zxdh_vq_desc_extra *dxp = &vq->vq_descx[id];
 	uint8_t hdr_len = vq->hw->dl_net_hdr_len;
+	uint16_t head_flags = 0;
 
-	dxp->ndescs = needed;
-	dxp->cookie = cookie;
-	head_flags |= vq->vq_packed.cached_flags;
+	dxp->cookie = NULL;
 
+	/* setup first tx ring slot to point to header stored in reserved region. */
 	start_dp[idx].addr = txvq->zxdh_net_hdr_mem + RTE_PTR_DIFF(&txr[idx].tx_hdr, txr);
 	start_dp[idx].len  = hdr_len;
-	head_flags |= ZXDH_VRING_DESC_F_NEXT;
+	start_dp[idx].id = idx;
+	head_flags |= vq->cached_flags | ZXDH_VRING_DESC_F_NEXT;
 	hdr = (void *)&txr[idx].tx_hdr;
 
-	rte_prefetch1(hdr);
+	zxdh_xmit_fill_net_hdr(vq, cookie, hdr);
+
 	idx++;
 	if (idx >= vq->vq_nentries) {
 		idx -= vq->vq_nentries;
-		vq->vq_packed.cached_flags ^= ZXDH_VRING_PACKED_DESC_F_AVAIL_USED;
+		vq->cached_flags ^= ZXDH_VRING_PACKED_DESC_F_AVAIL_USED;
 	}
 
-	zxdh_xmit_fill_net_hdr(vq, cookie, hdr);
-
 	do {
 		start_dp[idx].addr = rte_pktmbuf_iova(cookie);
 		start_dp[idx].len  = cookie->data_len;
-		start_dp[idx].id = id;
-		if (likely(idx != head_idx)) {
-			uint16_t flags = cookie->next ? ZXDH_VRING_DESC_F_NEXT : 0;
-
-			flags |= vq->vq_packed.cached_flags;
-			start_dp[idx].flags = flags;
-		}
+		start_dp[idx].id = idx;
 
+		dep[idx].cookie = cookie;
+		uint16_t flags = cookie->next ? ZXDH_VRING_DESC_F_NEXT : 0;
+		flags |= vq->cached_flags;
+		start_dp[idx].flags = flags;
 		idx++;
 		if (idx >= vq->vq_nentries) {
 			idx -= vq->vq_nentries;
-			vq->vq_packed.cached_flags ^= ZXDH_VRING_PACKED_DESC_F_AVAIL_USED;
+			vq->cached_flags ^= ZXDH_VRING_PACKED_DESC_F_AVAIL_USED;
 		}
 	} while ((cookie = cookie->next) != NULL);
 
 	vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt - needed);
 	vq->vq_avail_idx = idx;
-
 	zxdh_queue_store_flags_packed(head_dp, head_flags);
 }
 
@@ -456,7 +430,7 @@ zxdh_update_packet_stats(struct zxdh_virtnet_stats *stats, struct rte_mbuf *mbuf
 }
 
 static void
-zxdh_xmit_flush(struct zxdh_virtqueue *vq)
+zxdh_xmit_fast_flush(struct zxdh_virtqueue *vq)
 {
 	uint16_t id       = 0;
 	uint16_t curr_id  = 0;
@@ -472,20 +446,22 @@ zxdh_xmit_flush(struct zxdh_virtqueue *vq)
 	 * for a used descriptor in the virtqueue.
 	 */
 	while (desc_is_used(&desc[used_idx], vq)) {
+		rte_prefetch0(&desc[used_idx + NEXT_CACHELINE_OFF_16B]);
 		id = desc[used_idx].id;
 		do {
+			desc[used_idx].id = used_idx;
 			curr_id = used_idx;
 			dxp = &vq->vq_descx[used_idx];
-			used_idx += dxp->ndescs;
-			free_cnt += dxp->ndescs;
-			if (used_idx >= size) {
-				used_idx -= size;
-				vq->vq_packed.used_wrap_counter ^= 1;
-			}
 			if (dxp->cookie != NULL) {
-				rte_pktmbuf_free(dxp->cookie);
+				rte_pktmbuf_free_seg(dxp->cookie);
 				dxp->cookie = NULL;
 			}
+			used_idx += 1;
+			free_cnt += 1;
+			if (unlikely(used_idx == size)) {
+				used_idx = 0;
+				vq->used_wrap_counter ^= 1;
+			}
 		} while (curr_id != id);
 	}
 	vq->vq_used_cons_idx = used_idx;
@@ -499,13 +475,12 @@ zxdh_xmit_pkts_packed(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkt
 	struct zxdh_virtqueue  *vq   = txvq->vq;
 	uint16_t nb_tx = 0;
 
-	zxdh_xmit_flush(vq);
+	zxdh_xmit_fast_flush(vq);
 
 	for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
 		struct rte_mbuf *txm = tx_pkts[nb_tx];
 		int32_t can_push     = 0;
 		int32_t slots        = 0;
-		int32_t need         = 0;
 
 		rte_prefetch0(txm);
 		/* optimize ring usage */
@@ -522,26 +497,15 @@ zxdh_xmit_pkts_packed(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkt
 		 * default    => number of segments + 1
 		 **/
 		slots = txm->nb_segs + !can_push;
-		need = slots - vq->vq_free_cnt;
 		/* Positive value indicates it need free vring descriptors */
-		if (unlikely(need > 0)) {
-			zxdh_xmit_cleanup_inorder_packed(vq, need);
-			need = slots - vq->vq_free_cnt;
-			if (unlikely(need > 0)) {
-				PMD_TX_LOG(ERR,
-						" No enough %d free tx descriptors to transmit."
-						"freecnt %d",
-						need,
-						vq->vq_free_cnt);
-				break;
-			}
-		}
+		if (unlikely(slots >  vq->vq_free_cnt))
+			break;
 
 		/* Enqueue Packet buffers */
 		if (can_push)
-			zxdh_enqueue_xmit_packed_fast(txvq, txm);
+			zxdh_xmit_enqueue_push(txvq, txm);
 		else
-			zxdh_enqueue_xmit_packed(txvq, txm, slots);
+			zxdh_xmit_enqueue_append(txvq, txm, slots);
 		zxdh_update_packet_stats(&txvq->stats, txm);
 	}
 	txvq->stats.packets += nb_tx;
@@ -579,11 +543,6 @@ uint16_t zxdh_xmit_pkts_prepare(void *tx_queue, struct rte_mbuf **tx_pkts,
 		}
 #endif
 
-		error = rte_net_intel_cksum_prepare(m);
-		if (unlikely(error)) {
-			rte_errno = -error;
-			break;
-		}
 		if (m->nb_segs > ZXDH_TX_MAX_SEGS) {
 			PMD_TX_LOG(ERR, "%d segs dropped", m->nb_segs);
 			txvq->stats.truncated_err += nb_pkts - nb_tx;
@@ -613,13 +572,15 @@ zxdh_dequeue_burst_rx_packed(struct zxdh_virtqueue *vq,
 	uint16_t i, used_idx;
 	uint16_t id;
 
+	used_idx = vq->vq_used_cons_idx;
+	rte_prefetch0(&desc[used_idx]);
+
 	for (i = 0; i < num; i++) {
 		used_idx = vq->vq_used_cons_idx;
-		/**
-		 * desc_is_used has a load-acquire or rte_io_rmb inside
+		/* desc_is_used has a load-acquire or rte_io_rmb inside
 		 * and wait for used desc in virtqueue.
 		 */
-		if (!zxdh_desc_used(&desc[used_idx], vq))
+		if (!desc_is_used(&desc[used_idx], vq))
 			return i;
 		len[i] = desc[used_idx].len;
 		id = desc[used_idx].id;
@@ -637,7 +598,7 @@ zxdh_dequeue_burst_rx_packed(struct zxdh_virtqueue *vq,
 		vq->vq_used_cons_idx++;
 		if (vq->vq_used_cons_idx >= vq->vq_nentries) {
 			vq->vq_used_cons_idx -= vq->vq_nentries;
-			vq->vq_packed.used_wrap_counter ^= 1;
+			vq->used_wrap_counter ^= 1;
 		}
 	}
 	return i;
@@ -823,17 +784,52 @@ zxdh_rx_update_mbuf(struct zxdh_hw *hw, struct rte_mbuf *m, struct zxdh_net_hdr_
 	}
 }
 
-static void zxdh_discard_rxbuf(struct zxdh_virtqueue *vq, struct rte_mbuf *m)
+static void refill_desc_unwrap(struct zxdh_virtqueue *vq,
+		struct rte_mbuf **cookie, uint16_t nb_pkts)
 {
-	int32_t error = 0;
-	/*
-	 * Requeue the discarded mbuf. This should always be
-	 * successful since it was just dequeued.
-	 */
-	error = zxdh_enqueue_recv_refill_packed(vq, &m, 1);
-	if (unlikely(error)) {
-		PMD_RX_LOG(ERR, "cannot enqueue discarded mbuf");
-		rte_pktmbuf_free(m);
+	struct zxdh_vring_packed_desc *start_dp = vq->vq_packed.ring.desc;
+	struct zxdh_vq_desc_extra *dxp;
+	uint16_t flags = vq->cached_flags;
+	int32_t i;
+	uint16_t idx;
+
+	idx = vq->vq_avail_idx;
+	for (i = 0; i < nb_pkts; i++) {
+		dxp = &vq->vq_descx[idx];
+		dxp->cookie = (void *)cookie[i];
+		start_dp[idx].addr = rte_mbuf_iova_get(cookie[i]) + RTE_PKTMBUF_HEADROOM;
+		start_dp[idx].len = cookie[i]->buf_len - RTE_PKTMBUF_HEADROOM;
+		zxdh_queue_store_flags_packed(&start_dp[idx], flags);
+		idx++;
+	}
+	vq->vq_avail_idx += nb_pkts;
+	vq->vq_free_cnt = vq->vq_free_cnt - nb_pkts;
+}
+
+static void refill_que_descs(struct zxdh_virtqueue *vq, struct rte_eth_dev *dev)
+{
+	/* free_cnt may include mrg descs */
+	struct rte_mbuf *new_pkts[ZXDH_MBUF_BURST_SZ];
+	uint16_t free_cnt = RTE_MIN(ZXDH_MBUF_BURST_SZ, vq->vq_free_cnt);
+	struct zxdh_virtnet_rx *rxvq = &vq->rxq;
+	uint16_t unwrap_cnt, left_cnt;
+
+	if (!rte_pktmbuf_alloc_bulk(rxvq->mpool, new_pkts, free_cnt)) {
+		left_cnt = free_cnt;
+		unwrap_cnt = 0;
+		if ((vq->vq_avail_idx + free_cnt) >= vq->vq_nentries) {
+			unwrap_cnt = vq->vq_nentries - vq->vq_avail_idx;
+			left_cnt = free_cnt - unwrap_cnt;
+			refill_desc_unwrap(vq, new_pkts, unwrap_cnt);
+			vq->vq_avail_idx = 0;
+			vq->cached_flags ^= ZXDH_VRING_PACKED_DESC_F_AVAIL_USED;
+		}
+		if (left_cnt)
+			refill_desc_unwrap(vq, new_pkts + unwrap_cnt, left_cnt);
+
+		rte_io_wmb();
+	} else {
+		dev->data->rx_mbuf_alloc_failed += free_cnt;
 	}
 }
 
@@ -842,7 +838,7 @@ zxdh_recv_pkts_packed(void *rx_queue, struct rte_mbuf **rx_pkts,
 				uint16_t nb_pkts)
 {
 	struct zxdh_virtnet_rx *rxvq = rx_queue;
-	struct zxdh_virtqueue *vq = rxvq->vq;
+	struct zxdh_virtqueue *vq = rxq_get_vq(rxvq);
 	struct zxdh_hw *hw = vq->hw;
 	struct rte_mbuf *rxm = NULL;
 	struct rte_mbuf *prev = NULL;
@@ -852,7 +848,6 @@ zxdh_recv_pkts_packed(void *rx_queue, struct rte_mbuf **rx_pkts,
 	uint16_t len = 0;
 	uint32_t seg_num = 0;
 	uint32_t seg_res = 0;
-	uint32_t error = 0;
 	uint16_t hdr_size = 0;
 	uint16_t nb_rx = 0;
 	uint16_t i;
@@ -873,7 +868,8 @@ zxdh_recv_pkts_packed(void *rx_queue, struct rte_mbuf **rx_pkts,
 		rx_pkts[nb_rx] = rxm;
 		prev = rxm;
 		len = lens[i];
-		header = rte_pktmbuf_mtod(rxm, struct zxdh_net_hdr_ul *);
+		header = (struct zxdh_net_hdr_ul *)((char *)
+					rxm->buf_addr + RTE_PKTMBUF_HEADROOM);
 
 		seg_num  = header->type_hdr.num_buffers;
 
@@ -886,7 +882,7 @@ zxdh_recv_pkts_packed(void *rx_queue, struct rte_mbuf **rx_pkts,
 			rxvq->stats.invalid_hdr_len_err++;
 			continue;
 		}
-		rxm->data_off += hdr_size;
+		rxm->data_off = RTE_PKTMBUF_HEADROOM + hdr_size;
 		rxm->nb_segs = seg_num;
 		rxm->ol_flags = 0;
 		rcvd_pkt_len = len - hdr_size;
@@ -902,18 +898,19 @@ zxdh_recv_pkts_packed(void *rx_queue, struct rte_mbuf **rx_pkts,
 			len = lens[i];
 			rxm = rcv_pkts[i];
 			rxm->data_len = len;
+			rxm->data_off = RTE_PKTMBUF_HEADROOM;
 			rcvd_pkt_len += len;
 			prev->next = rxm;
 			prev = rxm;
 			rxm->next = NULL;
-			seg_res -= 1;
+			seg_res--;
 		}
 
 		if (!seg_res) {
 			if (rcvd_pkt_len != rx_pkts[nb_rx]->pkt_len) {
 				PMD_RX_LOG(ERR, "dropped rcvd_pkt_len %d pktlen %d",
 					rcvd_pkt_len, rx_pkts[nb_rx]->pkt_len);
-				zxdh_discard_rxbuf(vq, rx_pkts[nb_rx]);
+				rte_pktmbuf_free(rx_pkts[nb_rx]);
 				rxvq->stats.errors++;
 				rxvq->stats.truncated_err++;
 				continue;
@@ -942,14 +939,14 @@ zxdh_recv_pkts_packed(void *rx_queue, struct rte_mbuf **rx_pkts,
 			prev->next = rxm;
 			prev = rxm;
 			rxm->next = NULL;
-			extra_idx += 1;
+			extra_idx++;
 		}
 		seg_res -= rcv_cnt;
 		if (!seg_res) {
 			if (unlikely(rcvd_pkt_len != rx_pkts[nb_rx]->pkt_len)) {
 				PMD_RX_LOG(ERR, "dropped rcvd_pkt_len %d pktlen %d",
 					rcvd_pkt_len, rx_pkts[nb_rx]->pkt_len);
-				zxdh_discard_rxbuf(vq, rx_pkts[nb_rx]);
+				rte_pktmbuf_free(rx_pkts[nb_rx]);
 				rxvq->stats.errors++;
 				rxvq->stats.truncated_err++;
 				continue;
@@ -961,26 +958,285 @@ zxdh_recv_pkts_packed(void *rx_queue, struct rte_mbuf **rx_pkts,
 	rxvq->stats.packets += nb_rx;
 
 refill:
-	/* Allocate new mbuf for the used descriptor */
-	if (likely(!zxdh_queue_full(vq))) {
-		struct rte_mbuf *new_pkts[ZXDH_MBUF_BURST_SZ];
-		/* free_cnt may include mrg descs */
-		uint16_t free_cnt = RTE_MIN(vq->vq_free_cnt, ZXDH_MBUF_BURST_SZ);
-
-		if (!rte_pktmbuf_alloc_bulk(rxvq->mpool, new_pkts, free_cnt)) {
-			error = zxdh_enqueue_recv_refill_packed(vq, new_pkts, free_cnt);
-			if (unlikely(error)) {
-				for (i = 0; i < free_cnt; i++)
-					rte_pktmbuf_free(new_pkts[i]);
-			}
+	if (vq->vq_free_cnt > 0) {
+		struct rte_eth_dev *dev = hw->eth_dev;
+		refill_que_descs(vq, dev);
+		zxdh_queue_notify(vq);
+	}
+
+	return nb_rx;
+}
+
+static inline int pkt_padding(struct rte_mbuf *cookie, struct zxdh_hw *hw)
+{
+	uint16_t mtu_or_mss = 0;
+	uint16_t pkt_flag_lw16 = ZXDH_NO_IPID_UPDATE;
+	uint16_t l3_offset;
+	uint8_t pcode = ZXDH_PCODE_NO_IP_PKT_TYPE;
+	uint8_t l3_ptype = ZXDH_PI_L3TYPE_NOIP;
+	struct zxdh_pi_hdr *pi_hdr;
+	struct zxdh_pd_hdr_dl *pd_hdr;
+	struct zxdh_net_hdr_dl *net_hdr_dl = hw->net_hdr_dl;
+	uint8_t hdr_len = hw->dl_net_hdr_len;
+	uint16_t ol_flag = 0;
+	struct zxdh_net_hdr_dl *hdr = NULL;
+	hdr = (struct zxdh_net_hdr_dl *)rte_pktmbuf_prepend(cookie, hdr_len);
+	if (unlikely(hdr == NULL))
+		return -1;
+
+	rte_memcpy(hdr, net_hdr_dl, hdr_len);
+
+	if (hw->has_tx_offload) {
+		pi_hdr = &hdr->pipd_hdr_dl.pi_hdr;
+		pd_hdr = &hdr->pipd_hdr_dl.pd_hdr;
+
+		pcode = ZXDH_PCODE_IP_PKT_TYPE;
+		if (cookie->ol_flags & RTE_MBUF_F_TX_IPV6)
+			l3_ptype = ZXDH_PI_L3TYPE_IPV6;
+		else if (cookie->ol_flags & RTE_MBUF_F_TX_IPV4)
+			l3_ptype = ZXDH_PI_L3TYPE_IP;
+		else
+			pcode = ZXDH_PCODE_NO_IP_PKT_TYPE;
 
-			if (unlikely(zxdh_queue_kick_prepare_packed(vq)))
-				zxdh_queue_notify(vq);
+		if (cookie->ol_flags & RTE_MBUF_F_TX_TCP_SEG) {
+			mtu_or_mss = (cookie->tso_segsz >= ZXDH_MIN_MSS) ?
+				cookie->tso_segsz : ZXDH_MIN_MSS;
+			pi_hdr->pkt_flag_hi8  |= ZXDH_TX_TCPUDP_CKSUM_CAL;
+			pkt_flag_lw16 |= ZXDH_NO_IP_FRAGMENT | ZXDH_TX_IP_CKSUM_CAL;
+			pcode = ZXDH_PCODE_TCP_PKT_TYPE;
+		} else if (cookie->ol_flags & RTE_MBUF_F_TX_UDP_SEG) {
+			mtu_or_mss = hw->eth_dev->data->mtu;
+			mtu_or_mss = (mtu_or_mss >= ZXDH_MIN_MSS) ? mtu_or_mss : ZXDH_MIN_MSS;
+			pkt_flag_lw16 |= ZXDH_TX_IP_CKSUM_CAL;
+			pi_hdr->pkt_flag_hi8 |= ZXDH_NO_TCP_FRAGMENT | ZXDH_TX_TCPUDP_CKSUM_CAL;
+			pcode = ZXDH_PCODE_UDP_PKT_TYPE;
 		} else {
-			struct rte_eth_dev *dev = hw->eth_dev;
+			pkt_flag_lw16 |= ZXDH_NO_IP_FRAGMENT;
+			pi_hdr->pkt_flag_hi8 |= ZXDH_NO_TCP_FRAGMENT;
+		}
+
+		if (cookie->ol_flags & RTE_MBUF_F_TX_IP_CKSUM)
+			pkt_flag_lw16 |= ZXDH_TX_IP_CKSUM_CAL;
+
+		if ((cookie->ol_flags & RTE_MBUF_F_TX_UDP_CKSUM) == RTE_MBUF_F_TX_UDP_CKSUM) {
+			pcode = ZXDH_PCODE_UDP_PKT_TYPE;
+			pi_hdr->pkt_flag_hi8 |= ZXDH_TX_TCPUDP_CKSUM_CAL;
+		} else if ((cookie->ol_flags & RTE_MBUF_F_TX_TCP_CKSUM) ==
+			RTE_MBUF_F_TX_TCP_CKSUM) {
+			pcode = ZXDH_PCODE_TCP_PKT_TYPE;
+			pi_hdr->pkt_flag_hi8 |= ZXDH_TX_TCPUDP_CKSUM_CAL;
+		}
+		pkt_flag_lw16 |= (mtu_or_mss >> ZXDH_MTU_MSS_UNIT_SHIFTBIT) & ZXDH_MTU_MSS_MASK;
+		pi_hdr->pkt_flag_lw16 = rte_be_to_cpu_16(pkt_flag_lw16);
+		pi_hdr->pkt_type = l3_ptype | ZXDH_PKT_FORM_CPU | pcode;
+
+		l3_offset = hdr_len + cookie->l2_len;
+		l3_offset += (cookie->ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK) ?
+					cookie->outer_l2_len + cookie->outer_l3_len : 0;
+		pi_hdr->l3_offset = rte_be_to_cpu_16(l3_offset);
+		pi_hdr->l4_offset = rte_be_to_cpu_16(l3_offset + cookie->l3_len);
+		if (cookie->ol_flags & RTE_MBUF_F_TX_OUTER_IP_CKSUM)
+			ol_flag |= ZXDH_PD_OFFLOAD_OUTER_IPCSUM;
+	} else {
+		pd_hdr = &hdr->pd_hdr;
+	}
+
+	pd_hdr->dst_vfid = rte_be_to_cpu_16(cookie->port);
+
+	if (cookie->ol_flags & (RTE_MBUF_F_TX_VLAN | RTE_MBUF_F_TX_QINQ)) {
+		ol_flag |= ZXDH_PD_OFFLOAD_CVLAN_INSERT;
+		pd_hdr->cvlan_insert = rte_be_to_cpu_16(cookie->vlan_tci);
+		if (cookie->ol_flags & RTE_MBUF_F_TX_QINQ) {
+			ol_flag |= ZXDH_PD_OFFLOAD_SVLAN_INSERT;
+			pd_hdr->svlan_insert = rte_be_to_cpu_16(cookie->vlan_tci_outer);
+		}
+	}
+
+	pd_hdr->ol_flag = rte_be_to_cpu_16(ol_flag);
+	return 0;
+}
+
+/* Populate 4 descriptors with data from 4 mbufs */
+static inline void
+tx_bunch(struct zxdh_virtqueue *vq, volatile struct zxdh_vring_packed_desc *txdp,
+		struct rte_mbuf **pkts)
+{
+	uint16_t flags = vq->cached_flags;
+	int i;
+	for (i = 0; i < N_PER_LOOP; ++i, ++txdp, ++pkts) {
+		/* write data to descriptor */
+		txdp->addr = rte_mbuf_data_iova(*pkts);
+		txdp->len = (*pkts)->data_len;
+		txdp->flags = flags;
+	}
+}
+
+/* Populate 1 descriptor with data from 1 mbuf */
+static inline void
+tx1(struct zxdh_virtqueue *vq, volatile struct zxdh_vring_packed_desc *txdp,
+		struct rte_mbuf *pkts)
+{
+	uint16_t flags = vq->cached_flags;
+	txdp->addr = rte_mbuf_data_iova(pkts);
+	txdp->len = pkts->data_len;
+	txdp->flags = flags;
+}
+
+static void submit_to_backend_simple(struct zxdh_virtqueue  *vq,
+			struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+	struct zxdh_hw *hw = vq->hw;
+	struct rte_mbuf *m = NULL;
+	uint16_t id =  vq->vq_avail_idx;
+	struct zxdh_vring_packed_desc *txdp = &vq->vq_packed.ring.desc[id];
+	struct zxdh_vq_desc_extra *dxp = &vq->vq_descx[id];
+	int mainpart, leftover;
+	int i, j;
+
+	/*
+	 * Process most of the packets in chunks of N pkts.  Any
+	 * leftover packets will get processed one at a time.
+	 */
+	mainpart = (nb_pkts & ((uint32_t)~N_PER_LOOP_MASK));
+	leftover = (nb_pkts & ((uint32_t)N_PER_LOOP_MASK));
+
+	for (i = 0; i < mainpart; i += N_PER_LOOP) {
+		rte_prefetch0(dxp + i);
+		rte_prefetch0(tx_pkts + i);
+		for (j = 0; j < N_PER_LOOP; ++j) {
+			m  = *(tx_pkts + i + j);
+			pkt_padding(m, hw);
+			(dxp + i + j)->cookie = (void *)m;
+		}
+		/* write data to descriptor */
+		tx_bunch(vq, txdp + i, tx_pkts + i);
+	}
 
-			dev->data->rx_mbuf_alloc_failed += free_cnt;
+	if (leftover > 0) {
+		rte_prefetch0(dxp + mainpart);
+		rte_prefetch0(tx_pkts + mainpart);
+
+		for (i = 0; i < leftover; ++i) {
+			m =  *(tx_pkts + mainpart + i);
+			pkt_padding(m, hw);
+			(dxp + mainpart + i)->cookie = m;
+			tx1(vq, txdp + mainpart + i, *(tx_pkts + mainpart + i));
 		}
 	}
+}
+
+uint16_t zxdh_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+	struct zxdh_virtnet_tx *txvq = tx_queue;
+	struct zxdh_virtqueue  *vq   = txq_get_vq(txvq);
+	uint16_t nb_tx = 0, nb_tx_left;
+
+	zxdh_xmit_fast_flush(vq);
+
+	nb_pkts = (uint16_t)RTE_MIN(nb_pkts, vq->vq_free_cnt);
+	if (unlikely(nb_pkts == 0)) {
+		txvq->stats.idle++;
+		return 0;
+	}
+
+	nb_tx_left = nb_pkts;
+	if ((vq->vq_avail_idx + nb_pkts) >= vq->vq_nentries) {
+		nb_tx = vq->vq_nentries - vq->vq_avail_idx;
+		nb_tx_left = nb_pkts - nb_tx;
+		submit_to_backend_simple(vq, tx_pkts, nb_tx);
+		vq->vq_avail_idx = 0;
+		vq->cached_flags ^= ZXDH_VRING_PACKED_DESC_F_AVAIL_USED;
+
+		vq->vq_free_cnt  -= nb_tx;
+		tx_pkts += nb_tx;
+	}
+	if (nb_tx_left) {
+		submit_to_backend_simple(vq, tx_pkts, nb_tx_left);
+		vq->vq_avail_idx  += nb_tx_left;
+		vq->vq_free_cnt  -= nb_tx_left;
+	}
+
+	zxdh_queue_notify(vq);
+	txvq->stats.packets += nb_pkts;
+
+	return nb_pkts;
+}
+
+static inline int zxdh_init_mbuf(struct rte_mbuf *rxm, uint16_t len,
+		struct zxdh_hw *hw, struct zxdh_virtnet_rx *rxvq)
+{
+	uint16_t hdr_size = 0;
+	struct zxdh_net_hdr_ul *header;
+
+	header = (struct zxdh_net_hdr_ul *)((char *)
+					rxm->buf_addr + RTE_PKTMBUF_HEADROOM);
+	rxm->ol_flags = 0;
+	rxm->vlan_tci = 0;
+	rxm->vlan_tci_outer = 0;
+
+	hdr_size = header->type_hdr.pd_len << 1;
+	if (unlikely(header->type_hdr.num_buffers != 1)) {
+		PMD_RX_LOG(DEBUG, "hdr_size:%u nb_segs %d is invalid",
+			hdr_size, header->type_hdr.num_buffers);
+		rte_pktmbuf_free(rxm);
+		rxvq->stats.invalid_hdr_len_err++;
+		return -1;
+	}
+	zxdh_rx_update_mbuf(hw, rxm, header);
+
+	rxm->nb_segs = 1;
+	rxm->data_off = RTE_PKTMBUF_HEADROOM + hdr_size;
+	rxm->data_len = len - hdr_size;
+	rxm->port = hw->port_id;
+
+	if (rxm->data_len != rxm->pkt_len) {
+		PMD_RX_LOG(ERR, "dropped rcvd_pkt_len %d pktlen %d  bufaddr %p.",
+					rxm->data_len, rxm->pkt_len, rxm->buf_addr);
+		rte_pktmbuf_dump(stdout, rxm, 40);
+		rte_pktmbuf_free(rxm);
+		rxvq->stats.truncated_err++;
+		rxvq->stats.errors++;
+		return -1;
+	}
+	return 0;
+}
+
+uint16_t zxdh_recv_single_pkts(void *rx_queue, struct rte_mbuf **rcv_pkts, uint16_t nb_pkts)
+{
+	struct zxdh_virtnet_rx *rxvq = rx_queue;
+	struct zxdh_virtqueue *vq = rxq_get_vq(rxvq);
+	struct zxdh_hw *hw = vq->hw;
+	struct rte_mbuf *rxm;
+	uint32_t lens[ZXDH_MBUF_BURST_SZ];
+	uint16_t len = 0;
+	uint16_t nb_rx = 0;
+	uint16_t num;
+	uint16_t i = 0;
+
+	num = nb_pkts;
+	if (unlikely(num > ZXDH_MBUF_BURST_SZ))
+		num = ZXDH_MBUF_BURST_SZ;
+	num = zxdh_dequeue_burst_rx_packed(vq, rcv_pkts, lens, num);
+	if (num == 0) {
+		rxvq->stats.idle++;
+		goto refill;
+	}
+
+	for (i = 0; i < num; i++) {
+		rxm = rcv_pkts[i];
+		len = lens[i];
+		if (unlikely(zxdh_init_mbuf(rxm, len, hw, &vq->rxq) < 0))
+			break;
+
+		nb_rx++;
+	}
+	rxvq->stats.packets += nb_rx;
+
+refill:
+	if (vq->vq_free_cnt > 0) {
+		struct rte_eth_dev *dev = hw->eth_dev;
+		refill_que_descs(vq, dev);
+		zxdh_queue_notify(vq);
+	}
 	return nb_rx;
 }
diff --git a/drivers/net/zxdh/zxdh_rxtx.h b/drivers/net/zxdh/zxdh_rxtx.h
index 424048607e..6fce04b803 100644
--- a/drivers/net/zxdh/zxdh_rxtx.h
+++ b/drivers/net/zxdh/zxdh_rxtx.h
@@ -36,44 +36,39 @@ struct zxdh_virtnet_stats {
 	uint64_t bytes;
 	uint64_t errors;
 	uint64_t idle;
-	uint64_t full;
-	uint64_t norefill;
-	uint64_t multicast;
-	uint64_t broadcast;
 	uint64_t truncated_err;
 	uint64_t offload_cfg_err;
 	uint64_t invalid_hdr_len_err;
 	uint64_t no_segs_err;
+	uint64_t no_free_tx_desc_err;
 	uint64_t size_bins[8];
 };
 
 struct __rte_cache_aligned zxdh_virtnet_rx {
 	struct zxdh_virtqueue         *vq;
-
-	uint64_t                  mbuf_initializer; /* value to init mbufs. */
 	struct rte_mempool       *mpool;            /* mempool for mbuf allocation */
-	uint16_t                  queue_id;         /* DPDK queue index. */
-	uint16_t                  port_id;          /* Device port identifier. */
 	struct zxdh_virtnet_stats      stats;
 	const struct rte_memzone *mz;               /* mem zone to populate RX ring. */
-
-	/* dummy mbuf, for wraparound when processing RX ring. */
-	struct rte_mbuf           fake_mbuf;
+	uint64_t offloads;
+	uint16_t                  queue_id;         /* DPDK queue index. */
+	uint16_t                  port_id;          /* Device port identifier. */
 };
 
 struct __rte_cache_aligned zxdh_virtnet_tx {
 	struct zxdh_virtqueue         *vq;
-
-	rte_iova_t                zxdh_net_hdr_mem; /* hdr for each xmit packet */
-	uint16_t                  queue_id;           /* DPDK queue index. */
-	uint16_t                  port_id;            /* Device port identifier. */
+	const struct rte_memzone *zxdh_net_hdr_mz;  /* memzone to populate hdr. */
+	rte_iova_t               zxdh_net_hdr_mem; /* hdr for each xmit packet */
 	struct zxdh_virtnet_stats      stats;
 	const struct rte_memzone *mz;                 /* mem zone to populate TX ring. */
-	const struct rte_memzone *zxdh_net_hdr_mz;  /* memzone to populate hdr. */
+	uint64_t offloads;
+	uint16_t                  queue_id;           /* DPDK queue index. */
+	uint16_t                  port_id;            /* Device port identifier. */
 };
 
 uint16_t zxdh_xmit_pkts_packed(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts);
 uint16_t zxdh_xmit_pkts_prepare(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts);
 uint16_t zxdh_recv_pkts_packed(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts);
+uint16_t zxdh_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts);
+uint16_t zxdh_recv_single_pkts(void *rx_queue, struct rte_mbuf **rcv_pkts, uint16_t nb_pkts);
 
 #endif  /* ZXDH_RXTX_H */
-- 
2.27.0

[-- Attachment #1.1.2: Type: text/html , Size: 115946 bytes --]

^ permalink raw reply related	[flat|nested] 35+ messages in thread

end of thread, other threads:[~2026-06-17 15:21 UTC | newest]

Thread overview: 35+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2026-03-26  2:28 [PATCH v1] net/zxdh: optimize Rx/Tx path performance Junlong Wang
2026-03-26  3:27 ` Stephen Hemminger
2026-04-06  4:26 ` Stephen Hemminger
2026-04-23  1:18 ` [PATCH v2 0/3] " Junlong Wang
2026-04-23  1:18   ` [PATCH v2 1/3] net/zxdh: optimize queue structure to improve performance Junlong Wang
2026-04-23 18:57     ` Stephen Hemminger
2026-04-23  1:18   ` [PATCH v2 2/3] net/zxdh: optimize Rx recv pkts performance Junlong Wang
2026-04-23 18:54     ` Stephen Hemminger
2026-04-23 23:39     ` Stephen Hemminger
2026-04-23  1:18   ` [PATCH v2 3/3] net/zxdh: optimize Tx xmit " Junlong Wang
2026-04-23 19:23   ` [PATCH v2 0/3] net/zxdh: optimize Rx/Tx path performance Stephen Hemminger
2026-05-09  6:29   ` [PATCH v3 " Junlong Wang
2026-05-09  6:29     ` [PATCH v3 1/3] net/zxdh: optimize queue structure to improve performance Junlong Wang
2026-05-18  2:20       ` Stephen Hemminger
2026-05-09  6:29     ` [PATCH v3 2/3] net/zxdh: optimize Rx recv pkts performance Junlong Wang
2026-05-09  6:29     ` [PATCH v3 3/3] net/zxdh: optimize Tx xmit " Junlong Wang
2026-05-18  2:22       ` Stephen Hemminger
2026-06-06  6:32     ` [PATCH v4 0/4] net/zxdh: optimize Rx/Tx path performance Junlong Wang
2026-06-06  6:32       ` [PATCH v4 1/4] net/zxdh: optimize queue structure to improve performance Junlong Wang
2026-06-06  6:32       ` [PATCH v4 2/4] net/zxdh: optimize Rx recv pkts performance Junlong Wang
2026-06-06  6:32       ` [PATCH v4 3/4] net/zxdh: optimize Tx xmit " Junlong Wang
2026-06-06  6:32       ` [PATCH v4 4/4] net/zxdh: fix queue enable intr issues Junlong Wang
2026-06-07 18:00       ` [PATCH v4 0/4] net/zxdh: optimize Rx/Tx path performance Stephen Hemminger
2026-06-15  1:19       ` [PATCH v5 " Junlong Wang
2026-06-15  1:19         ` [PATCH v5 1/4] net/zxdh: fix queue enable intr issues Junlong Wang
2026-06-15  1:19         ` [PATCH v5 2/4] net/zxdh: optimize queue structure to improve performance Junlong Wang
2026-06-15  1:19         ` [PATCH v5 3/4] net/zxdh: optimize Rx recv pkts performance Junlong Wang
2026-06-15  1:19         ` [PATCH v5 4/4] net/zxdh: optimize Tx xmit " Junlong Wang
2026-06-15 18:38           ` Stephen Hemminger
2026-06-17  8:28       ` [PATCH v6 0/4] net/zxdh: optimize Rx/Tx path performance Junlong Wang
2026-06-17  8:28         ` [PATCH v6 1/4] net/zxdh: fix queue enable intr issues Junlong Wang
2026-06-17  8:28         ` [PATCH v6 2/4] net/zxdh: optimize queue structure to improve performance Junlong Wang
2026-06-17  8:28         ` [PATCH v6 3/4] net/zxdh: optimize Rx recv pkts performance Junlong Wang
2026-06-17  8:28         ` [PATCH v6 4/4] net/zxdh: optimize Tx xmit " Junlong Wang
2026-06-17 15:21         ` [PATCH v6 0/4] net/zxdh: optimize Rx/Tx path performance Stephen Hemminger

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.