DPDK-dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 06/20] net/sxe2: support TM hierarchy and shaping
From: liujie5 @ 2026-06-14  9:23 UTC (permalink / raw)
  To: stephen; +Cc: dev, Jie Liu
In-Reply-To: <20260614092328.201826-1-liujie5@linkdatatechnology.com>

From: Jie Liu <liujie5@linkdatatechnology.com>

This patch implements the traffic management ops for example PMD.
It supports a 4-level hierarchy: port, vsi, queue group and queue.

- Support node add/delete and hierarchy commit.
- Support private shaper and rate limiting on each node.

The hardware requires all nodes to be configured before the hierarchy
is committed to the global registers.

Signed-off-by: Jie Liu <liujie5@linkdatatechnology.com>
---
 drivers/net/sxe2/meson.build     |    1 +
 drivers/net/sxe2/sxe2_cmd_chnl.c |  163 +++++
 drivers/net/sxe2/sxe2_cmd_chnl.h |    6 +
 drivers/net/sxe2/sxe2_drv_cmd.h  |   26 +
 drivers/net/sxe2/sxe2_ethdev.c   |   82 +++
 drivers/net/sxe2/sxe2_ethdev.h   |    5 +
 drivers/net/sxe2/sxe2_tm.c       | 1151 ++++++++++++++++++++++++++++++
 drivers/net/sxe2/sxe2_tm.h       |   76 ++
 drivers/net/sxe2/sxe2_tx.c       |    1 -
 9 files changed, 1510 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/sxe2/sxe2_tm.c
 create mode 100644 drivers/net/sxe2/sxe2_tm.h

diff --git a/drivers/net/sxe2/meson.build b/drivers/net/sxe2/meson.build
index da7a690063..f03ea15356 100644
--- a/drivers/net/sxe2/meson.build
+++ b/drivers/net/sxe2/meson.build
@@ -63,4 +63,5 @@ sources += files(
         'sxe2_mac.c',
         'sxe2_filter.c',
         'sxe2_rss.c',
+        'sxe2_tm.c',
 )
diff --git a/drivers/net/sxe2/sxe2_cmd_chnl.c b/drivers/net/sxe2/sxe2_cmd_chnl.c
index b997e7b044..19323ffcc4 100644
--- a/drivers/net/sxe2/sxe2_cmd_chnl.c
+++ b/drivers/net/sxe2/sxe2_cmd_chnl.c
@@ -230,6 +230,7 @@ static void sxe2_txq_ctxt_cfg_fill(struct sxe2_tx_queue *txq,
 				   struct sxe2_drv_txq_cfg_req *req,
 				   uint16_t txq_cnt)
 {
+	struct sxe2_adapter *adapter = txq->vsi->adapter;
 	struct sxe2_drv_txq_ctxt *ctxt = req->cfg;
 	uint16_t q_idx = 0;
 
@@ -241,6 +242,8 @@ static void sxe2_txq_ctxt_cfg_fill(struct sxe2_tx_queue *txq,
 		ctxt->depth = txq[q_idx].ring_depth;
 		ctxt->dma_addr = txq[q_idx].base_addr;
 		ctxt->queue_id = txq[q_idx].queue_id;
+
+		ctxt->sched_mode = sxe2_sched_mode_get(adapter);
 	}
 }
 
@@ -310,6 +313,7 @@ int32_t sxe2_drv_txq_switch(struct sxe2_adapter *adapter, struct sxe2_tx_queue *
 	req.q_idx = txq->queue_id;
 
 	req.is_enable  = (uint8_t)enable;
+	req.sched_mode = sxe2_sched_mode_get(adapter);
 	sxe2_drv_cmd_params_fill(adapter, &param, SXE2_DRV_CMD_TXQ_DISABLE,
 			&req, sizeof(req), NULL, 0);
 
@@ -714,3 +718,162 @@ int32_t sxe2_drv_ptp_gettime(struct sxe2_adapter *adapter, struct sxe2_rx_queue
 	(void)rxq;
 	return 0;
 }
+
+int32_t sxe2_drv_root_tree_alloc(struct rte_eth_dev *dev)
+{
+	struct sxe2_adapter *adapter = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
+	struct sxe2_tm_context *tm_ctxt = &adapter->tm_ctxt;
+	struct sxe2_common_device *cdev = adapter->cdev;
+	struct sxe2_drv_cmd_params param = {0};
+	struct sxe2_tm_res tm_resp;
+	int32_t ret;
+
+	sxe2_drv_cmd_params_fill(adapter, &param, SXE2_DRV_CMD_SCHED_ROOT_TREE_ALLOC,
+				 NULL, 0,
+				 &tm_resp, sizeof(tm_resp));
+	ret = sxe2_drv_cmd_exec(cdev, &param);
+	if (ret) {
+		PMD_LOG_ERR(DRV, "add sched root failed, ret:%d", ret);
+		goto l_end;
+	}
+
+	tm_ctxt->root_teid = tm_resp.teid;
+
+l_end:
+	return ret;
+}
+
+int32_t sxe2_drv_root_tree_release(struct rte_eth_dev *dev)
+{
+	struct sxe2_adapter *adapter = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
+	struct sxe2_common_device *cdev = adapter->cdev;
+	struct sxe2_drv_cmd_params param = {0};
+	struct sxe2_tm_res tm_res = {0};
+	int32_t ret;
+
+	tm_res.teid = adapter->tm_ctxt.root_teid;
+
+	sxe2_drv_cmd_params_fill(adapter, &param, SXE2_DRV_CMD_SCHED_ROOT_TREE_RELEASE,
+				 &tm_res, sizeof(tm_res),
+				 NULL, 0);
+	ret = sxe2_drv_cmd_exec(cdev, &param);
+	if (ret) {
+		PMD_LOG_ERR(DRV, "release sched root failed, ret:%d", ret);
+		goto l_end;
+	}
+
+l_end:
+	return ret;
+}
+
+static void sxe2_drv_tm_node_to_info(struct sxe2_adapter *adapter,
+		struct sxe2_tm_node *node, struct sxe2_tm_info *info)
+{
+	uint32_t rate = 0;
+
+	if (node->shaper_profile->profile.committed.rate == UINT64_MAX)
+		rate = UINT32_MAX;
+	else
+		rate = (uint32_t)(node->shaper_profile->profile.committed.rate * 8 / 1000);
+
+	info->committed = rte_cpu_to_le_32(rate);
+
+	if (node->shaper_profile->profile.peak.rate == UINT64_MAX)
+		rate = UINT32_MAX;
+	else
+		rate = (uint32_t)(node->shaper_profile->profile.peak.rate * 8 / 1000);
+
+	info->peak = rte_cpu_to_le_32(rate);
+
+	info->priority = (adapter->tm_ctxt.prio_max - 1 - node->priority);
+
+	info->weight = rte_cpu_to_le_16(node->hw_weight);
+}
+
+static int32_t sxe2_drv_tm_commit_node(struct sxe2_adapter *adapter,
+		struct sxe2_tm_node *tm_node)
+{
+	struct sxe2_drv_cmd_params cmd = { 0 };
+	struct sxe2_tm_add_mid_msg msg_mid = {0};
+	struct sxe2_tm_add_queue_msg msg_queue = {0};
+	struct sxe2_tm_res res = {0};
+	struct sxe2_common_device *cdev = adapter->cdev;
+	int32_t ret;
+	uint32_t i;
+
+	if (tm_node->type == SXE2_TM_NODE_TYPE_VSIG) {
+		goto l_add;
+	} else if (tm_node->type == SXE2_TM_NODE_TYPE_MID) {
+		sxe2_drv_tm_node_to_info(adapter, tm_node, &msg_mid.info);
+		msg_mid.parent_teid = rte_cpu_to_le_16(tm_node->parent->teid);
+		msg_mid.adj_lvl = adapter->sched_ctxt.adj_lvl;
+
+		sxe2_drv_cmd_params_fill(adapter, &cmd,
+					 SXE2_DRV_CMD_SCHED_TM_ADD_MID_NODE,
+					 &msg_mid, sizeof(msg_mid),
+					 &res, sizeof(res));
+		ret = sxe2_drv_cmd_exec(cdev, &cmd);
+		if (ret) {
+			PMD_LOG_ERR(DRV, "add tm mid node failed, ret:%d", ret);
+			goto l_end;
+		}
+	} else if (tm_node->type == SXE2_TM_NODE_TYPE_QUEUE) {
+		sxe2_drv_tm_node_to_info(adapter, tm_node, &msg_queue.info);
+		msg_queue.parent_teid = rte_cpu_to_le_16(tm_node->parent->teid);
+		msg_queue.queue_id = rte_cpu_to_le_16(tm_node->id);
+		msg_queue.adj_lvl = adapter->sched_ctxt.adj_lvl;
+
+		sxe2_drv_cmd_params_fill(adapter, &cmd,
+					 SXE2_DRV_CMD_SCHED_TM_ADD_QUEUE_NODE,
+					 &msg_queue, sizeof(msg_queue),
+					 &res, sizeof(res));
+		ret = sxe2_drv_cmd_exec(cdev, &cmd);
+		if (ret) {
+			PMD_LOG_ERR(DRV, "add tm queue failed, ret:%d", ret);
+			goto l_end;
+		}
+	} else {
+		PMD_LOG_ERR(DRV, "commit tm node failed, type:%d", tm_node->type);
+		ret = -EINVAL;
+		goto l_end;
+	}
+
+	tm_node->teid = rte_le_to_cpu_16(res.teid);
+
+l_add:
+	for (i = 0; i < tm_node->child_cnt; i++) {
+		ret = sxe2_drv_tm_commit_node(adapter, tm_node->children[i]);
+		if (ret)
+			goto l_end;
+	}
+l_end:
+	return ret;
+}
+
+int32_t sxe2_drv_tm_commit(struct sxe2_adapter *adapter)
+{
+	struct sxe2_drv_cmd_params cmd = { 0 };
+	struct sxe2_common_device *cdev = adapter->cdev;
+	struct sxe2_tm_res tm_res = {0};
+	int32_t ret;
+
+	tm_res.teid = adapter->tm_ctxt.root_teid;
+	sxe2_drv_cmd_params_fill(adapter, &cmd, SXE2_DRV_CMD_SCHED_ROOT_CHILDREN_DELETE,
+				 &tm_res, sizeof(tm_res),
+				 NULL, 0);
+	ret = sxe2_drv_cmd_exec(cdev, &cmd);
+	if (ret) {
+		PMD_LOG_ERR(DRV, "del tm root failed, ret:%d", ret);
+		goto l_end;
+	}
+
+	ret = sxe2_drv_tm_commit_node(adapter, adapter->tm_ctxt.root);
+	if (ret) {
+		PMD_LOG_ERR(DRV, "commit tm node failed, ret:%d", ret);
+		goto l_end;
+	}
+
+	PMD_LOG_DEBUG(DRV, "commit tm success");
+l_end:
+	return ret;
+}
diff --git a/drivers/net/sxe2/sxe2_cmd_chnl.h b/drivers/net/sxe2/sxe2_cmd_chnl.h
index 2546c65a6c..77e689abcd 100644
--- a/drivers/net/sxe2/sxe2_cmd_chnl.h
+++ b/drivers/net/sxe2/sxe2_cmd_chnl.h
@@ -38,6 +38,12 @@ int32_t sxe2_drv_mac_link_status_get(struct sxe2_adapter *adapter);
 
 int32_t sxe2_drv_promisc_config(struct sxe2_adapter *adapter, bool set);
 
+int32_t sxe2_drv_root_tree_release(struct rte_eth_dev *dev);
+
+int32_t sxe2_drv_root_tree_alloc(struct rte_eth_dev *dev);
+
+int32_t sxe2_drv_tm_commit(struct sxe2_adapter *adapter);
+
 int32_t sxe2_drv_allmulti_config(struct sxe2_adapter *adapter, bool set);
 
 int32_t sxe2_drv_uc_config(struct sxe2_adapter *adapter, struct rte_ether_addr *addr, bool add);
diff --git a/drivers/net/sxe2/sxe2_drv_cmd.h b/drivers/net/sxe2/sxe2_drv_cmd.h
index 9998f241f0..67c6885cae 100644
--- a/drivers/net/sxe2/sxe2_drv_cmd.h
+++ b/drivers/net/sxe2/sxe2_drv_cmd.h
@@ -349,6 +349,32 @@ struct __rte_aligned(4) __rte_packed_begin sxe2_rss_hf_req {
 	uint8_t rsv1[3];
 } __rte_packed_end;
 
+struct __rte_aligned(4) __rte_packed_begin sxe2_tm_res {
+	uint16_t teid;
+	uint8_t rsv[2];
+} __rte_packed_end;
+
+struct __rte_aligned(4) __rte_packed_begin sxe2_tm_info {
+	uint32_t committed;
+	uint32_t peak;
+	uint8_t priority;
+	uint8_t reserve;
+	uint16_t weight;
+} __rte_packed_end;
+
+struct __rte_aligned(4) __rte_packed_begin sxe2_tm_add_mid_msg {
+	uint16_t parent_teid;
+	uint8_t adj_lvl;
+	struct sxe2_tm_info info;
+} __rte_packed_end;
+
+struct __rte_aligned(4) __rte_packed_begin sxe2_tm_add_queue_msg {
+	uint16_t parent_teid;
+	uint16_t queue_id;
+	uint8_t adj_lvl;
+	struct sxe2_tm_info info;
+} __rte_packed_end;
+
 enum sxe2_drv_cmd_module {
 	SXE2_DRV_CMD_MODULE_HANDSHAKE = 0,
 	SXE2_DRV_CMD_MODULE_DEV = 1,
diff --git a/drivers/net/sxe2/sxe2_ethdev.c b/drivers/net/sxe2/sxe2_ethdev.c
index 77a2b5b1dc..f98cf367f1 100644
--- a/drivers/net/sxe2/sxe2_ethdev.c
+++ b/drivers/net/sxe2/sxe2_ethdev.c
@@ -130,6 +130,8 @@ static const struct eth_dev_ops sxe2_eth_dev_ops = {
 	.reta_query                 = sxe2_dev_rss_reta_query,
 	.rss_hash_update            = sxe2_dev_rss_hash_update,
 	.rss_hash_conf_get          = sxe2_dev_rss_hash_conf_get,
+
+	.tm_ops_get                 = sxe2_tm_ops_get,
 };
 
 static int32_t sxe2_dev_configure(struct rte_eth_dev *dev)
@@ -575,6 +577,14 @@ static void sxe2_drv_dev_caps_set(struct sxe2_adapter *adapter,
 		adapter->cap_flags |= SXE2_DEV_CAPS_OFFLOAD_FC_STATE;
 }
 
+static void sxe2_sw_sched_hw_cap_set(struct sxe2_adapter *adapter,
+				     struct sxe2_txsch_caps *txsch_caps)
+{
+	adapter->sched_ctxt.tm_layers = txsch_caps->layer_cap;
+	adapter->sched_ctxt.root_max_children = txsch_caps->tm_mid_node_num;
+	adapter->sched_ctxt.prio_max = txsch_caps->prio_num;
+}
+
 static int32_t sxe2_func_caps_get(struct sxe2_adapter *adapter)
 {
 	int32_t ret = -1;
@@ -594,6 +604,8 @@ static int32_t sxe2_func_caps_get(struct sxe2_adapter *adapter)
 
 	sxe2_sw_vsi_ctx_hw_cap_set(adapter, &dev_caps.vsi_caps);
 
+	sxe2_sw_sched_hw_cap_set(adapter, &dev_caps.txsch_caps);
+
 l_end:
 	return ret;
 }
@@ -930,6 +942,68 @@ void sxe2_dev_pci_map_uinit(struct rte_eth_dev *dev)
 	adapter->dev_info.dev_data = NULL;
 }
 
+uint32_t sxe2_sched_mode_get(struct sxe2_adapter *adapter)
+{
+	uint32_t ret_mode = SXE2_SCHED_MODE_INVALID;
+	struct sxe2_tm_context *tm_ctxt = &adapter->tm_ctxt;
+
+	if (adapter->devargs.high_performance_mode)
+		ret_mode = SXE2_SCHED_MODE_HIGH_PERFORMANCE;
+	else if (tm_ctxt->committed)
+		ret_mode = SXE2_SCHED_MODE_TM;
+	else
+		ret_mode = SXE2_SCHED_MODE_DEFAULT;
+
+	return ret_mode;
+}
+
+static int32_t sxe2_sched_init(struct rte_eth_dev *dev)
+{
+	struct sxe2_adapter *adapter = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
+	int32_t ret = 0;
+
+	adapter->sched_ctxt.adj_lvl = adapter->devargs.sched_layer_mode;
+
+	if (adapter->devargs.high_performance_mode) {
+		PMD_LOG_DEBUG(DRV, "TM feature will be disabled in high-performance mode.");
+		adapter->cap_flags &= ~(SXE2_DEV_CAPS_OFFLOAD_TM);
+	} else {
+		ret = sxe2_tm_init(dev);
+		if (ret)
+			goto l_end;
+
+		ret = sxe2_drv_root_tree_alloc(dev);
+		if (ret)
+			goto l_end;
+	}
+
+l_end:
+	return ret;
+}
+
+static int32_t sxe2_sched_uinit(struct rte_eth_dev *dev)
+{
+	struct sxe2_adapter *adapter = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
+	int32_t ret = 0;
+
+	if (adapter->devargs.high_performance_mode == 0) {
+		ret = sxe2_tm_uninit(dev);
+		if (ret) {
+			PMD_LOG_ERR(INIT, "Failed to uninit tm, ret=%d", ret);
+			goto l_end;
+		}
+
+		ret = sxe2_drv_root_tree_release(dev);
+		if (ret) {
+			PMD_LOG_ERR(INIT, "Failed to release root tree, ret=%d", ret);
+			goto l_end;
+		}
+	}
+
+l_end:
+	return ret;
+}
+
 static int32_t sxe2_dev_init(struct rte_eth_dev *dev,
 			     struct sxe2_dev_kvargs_info *kvargs __rte_unused)
 {
@@ -985,8 +1059,15 @@ static int32_t sxe2_dev_init(struct rte_eth_dev *dev,
 		goto init_rss_err;
 	}
 
+	ret = sxe2_sched_init(dev);
+	if (ret) {
+		PMD_LOG_ERR(INIT, "Failed to init sched, ret=%d", ret);
+		goto init_sched_err;
+	}
+
 	goto l_end;
 
+init_sched_err:
 init_rss_err:
 init_eth_err:
 init_dev_info_err:
@@ -1002,6 +1083,7 @@ static int32_t sxe2_dev_close(struct rte_eth_dev *dev)
 	(void)sxe2_dev_stop(dev);
 	(void)sxe2_queues_release(dev);
 	(void)sxe2_rss_disable(dev);
+	(void)sxe2_sched_uinit(dev);
 	sxe2_vsi_uninit(dev);
 	sxe2_dev_pci_map_uinit(dev);
 	sxe2_eth_uinit(dev);
diff --git a/drivers/net/sxe2/sxe2_ethdev.h b/drivers/net/sxe2/sxe2_ethdev.h
index 3aa3799b79..95594fcbde 100644
--- a/drivers/net/sxe2/sxe2_ethdev.h
+++ b/drivers/net/sxe2/sxe2_ethdev.h
@@ -20,6 +20,7 @@
 #include "sxe2_queue.h"
 #include "sxe2_mac.h"
 #include "sxe2_osal.h"
+#include "sxe2_tm.h"
 #include "sxe2_filter.h"
 
 struct sxe2_link_msg {
@@ -309,6 +310,8 @@ struct sxe2_adapter {
 	struct sxe2_rss_context       rss_ctxt;
 	struct sxe2_link_context      link_ctxt;
 	struct sxe2_ptp_context       ptp_ctxt;
+	struct sxe2_sched_hw_cap      sched_ctxt;
+	struct sxe2_tm_context        tm_ctxt;
 	struct sxe2_devargs           devargs;
 	struct sxe2_switchdev_info    switchdev_info;
 	bool                          rule_started;
@@ -332,6 +335,8 @@ void *sxe2_pci_map_addr_get(struct sxe2_adapter *adapter,
 			    enum sxe2_pci_map_resource res_type,
 			    uint16_t idx_in_func);
 
+uint32_t sxe2_sched_mode_get(struct sxe2_adapter *adapter);
+
 struct sxe2_pci_map_bar_info *sxe2_dev_get_bar_info(struct sxe2_adapter *adapter,
 						    enum sxe2_pci_map_resource res_type);
 
diff --git a/drivers/net/sxe2/sxe2_tm.c b/drivers/net/sxe2/sxe2_tm.c
new file mode 100644
index 0000000000..4c4f793cd5
--- /dev/null
+++ b/drivers/net/sxe2/sxe2_tm.c
@@ -0,0 +1,1151 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (C), 2025, Wuxi Stars Micro System Technologies Co., Ltd.
+ */
+
+#include <rte_ethdev.h>
+#include <rte_tm_driver.h>
+
+#include "sxe2_tm.h"
+#include "sxe2_ethdev.h"
+#include "sxe2_common_log.h"
+#include "sxe2_cmd_chnl.h"
+
+static uint16_t sxe2_tm_level_node_num_get(uint8_t level)
+{
+	uint16_t node_num = 0;
+
+	switch (level) {
+	case 0:
+		node_num = SXE2_TM_1L_NODE_NUM_MAX;
+		break;
+	case 1:
+		node_num = SXE2_TM_2L_NODE_NUM_MAX;
+		break;
+	case 2:
+		node_num = SXE2_TM_3L_NODE_NUM_MAX;
+	break;
+	case 3:
+		node_num = SXE2_TM_4L_NODE_NUM_MAX;
+		break;
+	}
+	return node_num;
+}
+
+static int32_t sxe2_tm_capabilities_get(struct rte_eth_dev *dev,
+			  struct rte_tm_capabilities *cap,
+			  struct rte_tm_error *error)
+{
+	struct sxe2_adapter *adapter = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
+	int32_t ret = 0;
+	uint32_t i;
+
+	if (!cap || !error) {
+		PMD_LOG_ERR(DRV, "sxe2 get tm cap failed, cap or error is NULL.");
+		ret = -EINVAL;
+		goto l_end;
+	}
+
+	if ((adapter->cap_flags & SXE2_DEV_CAPS_OFFLOAD_TM) == 0) {
+		PMD_LOG_ERR(DRV, "The TM capability is not supported.");
+		error->type = RTE_TM_ERROR_TYPE_UNSPECIFIED;
+		error->message = "The TM capability is not supported.";
+		ret = ENOTSUP;
+		goto l_end;
+	}
+
+	error->type = RTE_TM_ERROR_TYPE_NONE;
+	memset(cap, 0, sizeof(struct rte_tm_capabilities));
+
+	for (i = 0; i < adapter->tm_ctxt.tm_layers; i++)
+		cap->n_nodes_max += sxe2_tm_level_node_num_get(i);
+
+	cap->n_levels_max = adapter->tm_ctxt.tm_layers;
+
+	cap->non_leaf_nodes_identical = 1;
+
+	cap->leaf_nodes_identical = 1;
+
+	cap->shaper_n_max = cap->n_nodes_max;
+
+	cap->shaper_private_n_max = cap->n_nodes_max;
+
+	cap->shaper_private_dual_rate_n_max = cap->n_nodes_max;
+
+	cap->shaper_private_rate_min = 0;
+
+	cap->shaper_private_rate_max = 12500000000ull;
+
+	cap->shaper_private_packet_mode_supported = 0;
+	cap->shaper_private_byte_mode_supported = 1;
+
+	cap->shaper_pkt_length_adjust_min = RTE_TM_ETH_FRAMING_OVERHEAD;
+
+	cap->shaper_pkt_length_adjust_max = RTE_TM_ETH_FRAMING_OVERHEAD_FCS;
+
+	cap->shaper_shared_n_max = 0;
+	cap->shaper_shared_n_nodes_per_shaper_max = 0;
+	cap->shaper_shared_n_shapers_per_node_max = 0;
+	cap->shaper_shared_dual_rate_n_max = 0;
+	cap->shaper_shared_rate_min = 0;
+	cap->shaper_shared_rate_max = 0;
+	cap->shaper_shared_packet_mode_supported = 0;
+	cap->shaper_shared_byte_mode_supported = 0;
+
+	cap->sched_n_children_max = dev->data->nb_tx_queues;
+
+	cap->sched_sp_n_priorities_max = 7;
+
+	cap->sched_wfq_n_children_per_group_max = 1;
+	cap->sched_wfq_n_groups_max = 0;
+	cap->sched_wfq_weight_max = 0;
+	cap->sched_wfq_packet_mode_supported = 0;
+	cap->sched_wfq_byte_mode_supported = 0;
+
+	cap->cman_wred_packet_mode_supported = 0;
+	cap->cman_wred_byte_mode_supported = 0;
+	cap->cman_head_drop_supported = 0;
+	cap->cman_wred_context_n_max = 0;
+	cap->cman_wred_context_private_n_max = 0;
+	cap->cman_wred_context_shared_n_max = 0;
+	cap->cman_wred_context_shared_n_nodes_per_context_max = 0;
+	cap->cman_wred_context_shared_n_contexts_per_node_max = 0;
+
+	cap->dynamic_update_mask = 0;
+
+	cap->stats_mask = 0;
+l_end:
+	return ret;
+}
+
+static int32_t sxe2_level_capabilities_get(struct rte_eth_dev *dev,
+		uint32_t level_id, struct rte_tm_level_capabilities *cap,
+		struct rte_tm_error *error)
+{
+	int32_t ret = 0;
+	struct sxe2_adapter *adapter = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
+
+	if (!cap || !error) {
+		PMD_LOG_ERR(DRV, "sxe2 get tm cap failed, cap or error is NULL.");
+		ret = -EINVAL;
+		goto l_end;
+	}
+	if ((adapter->cap_flags & SXE2_DEV_CAPS_OFFLOAD_TM) == 0) {
+		PMD_LOG_ERR(DRV, "The TM capability is not supported.");
+		error->type = RTE_TM_ERROR_TYPE_UNSPECIFIED;
+		error->message = "The TM capability is not supported.";
+		ret = ENOTSUP;
+		goto l_end;
+	}
+
+	if (level_id >= adapter->tm_ctxt.tm_layers) {
+		ret = -EINVAL;
+		error->type = RTE_TM_ERROR_TYPE_LEVEL_ID;
+		error->message = "too deep level";
+		goto l_end;
+	}
+
+	cap->n_nodes_max = sxe2_tm_level_node_num_get(level_id);
+
+	cap->non_leaf_nodes_identical = true;
+
+	cap->leaf_nodes_identical = true;
+
+	if (level_id != adapter->tm_ctxt.tm_layers - 1) {
+		cap->n_nodes_nonleaf_max = cap->n_nodes_max;
+		cap->n_nodes_leaf_max = 0;
+
+		cap->nonleaf.shaper_private_supported = true;
+		cap->nonleaf.shaper_private_dual_rate_supported = true;
+		cap->nonleaf.shaper_private_rate_min = 0;
+
+		cap->nonleaf.shaper_private_rate_max = 12500000000ull;
+		cap->nonleaf.shaper_private_packet_mode_supported = 0;
+		cap->nonleaf.shaper_private_byte_mode_supported = 1;
+
+		cap->nonleaf.shaper_shared_n_max = 0;
+		cap->nonleaf.shaper_shared_packet_mode_supported = 0;
+		cap->nonleaf.shaper_shared_byte_mode_supported = 0;
+
+		cap->nonleaf.sched_n_children_max = SXE2_TM_MAX_CHILDREN_COUNT;
+		cap->nonleaf.sched_sp_n_priorities_max = 7;
+
+		cap->nonleaf.sched_wfq_n_children_per_group_max = 0;
+		cap->nonleaf.sched_wfq_n_groups_max = 0;
+		cap->nonleaf.sched_wfq_weight_max = 0;
+		cap->nonleaf.sched_wfq_packet_mode_supported = 0;
+		cap->nonleaf.sched_wfq_byte_mode_supported = 0;
+
+		cap->nonleaf.stats_mask = 0;
+	} else {
+		cap->n_nodes_nonleaf_max = 0;
+		cap->n_nodes_leaf_max = cap->n_nodes_max;
+
+		cap->leaf.shaper_private_supported = true;
+		cap->leaf.shaper_private_dual_rate_supported = true;
+		cap->leaf.shaper_private_rate_min = 0;
+		cap->leaf.shaper_private_rate_max = 12500000000ull;
+		cap->leaf.shaper_private_packet_mode_supported = 0;
+		cap->leaf.shaper_private_byte_mode_supported = 1;
+
+		cap->leaf.shaper_shared_n_max = 0;
+		cap->leaf.shaper_shared_packet_mode_supported = 0;
+		cap->leaf.shaper_shared_byte_mode_supported = 0;
+
+		cap->leaf.cman_head_drop_supported = false;
+		cap->leaf.cman_wred_context_private_supported = false;
+		cap->leaf.cman_wred_context_shared_n_max = 0;
+
+		cap->leaf.stats_mask = 0;
+	}
+
+l_end:
+	return ret;
+}
+
+static struct sxe2_tm_node *sxe2_tm_find_node(struct sxe2_tm_node *parent, uint32_t id)
+{
+	struct sxe2_tm_node *node = NULL;
+	uint32_t i;
+
+	if (parent == NULL || parent->id == id) {
+		node = parent;
+		goto l_end;
+	}
+
+	for (i = 0; i < parent->child_cnt; i++) {
+		node = sxe2_tm_find_node(parent->children[i], id);
+		if (node)
+			goto l_end;
+	}
+
+l_end:
+	return node;
+}
+
+static int32_t sxe2_node_capabilities_get(struct rte_eth_dev *dev, uint32_t node_id,
+				      struct rte_tm_node_capabilities *cap,
+				      struct rte_tm_error *error)
+{
+	int32_t ret = -EINVAL;
+	struct sxe2_tm_node *tm_node;
+	struct sxe2_adapter *adapter = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
+
+	if (!cap || !error) {
+		PMD_LOG_ERR(DRV, "sxe2 get tm cap failed, cap or error is NULL.");
+		ret = -EINVAL;
+		goto l_end;
+	}
+
+	if ((adapter->cap_flags & SXE2_DEV_CAPS_OFFLOAD_TM) == 0) {
+		PMD_LOG_ERR(DRV, "The TM capability is not supported.");
+		error->type = RTE_TM_ERROR_TYPE_UNSPECIFIED;
+		error->message = "The TM capability is not supported.";
+		ret = ENOTSUP;
+		goto l_end;
+	}
+
+	if (node_id == RTE_TM_NODE_ID_NULL) {
+		PMD_LOG_ERR(DRV, "invalid node id");
+		error->type = RTE_TM_ERROR_TYPE_NODE_ID;
+		error->message = "invalid node id";
+		goto l_end;
+	}
+
+	tm_node = sxe2_tm_find_node(adapter->tm_ctxt.root, node_id);
+	if (!tm_node) {
+		PMD_LOG_ERR(DRV, "no such node");
+		error->type = RTE_TM_ERROR_TYPE_NODE_ID;
+		error->message = "no such node";
+		goto l_end;
+	}
+
+	cap->shaper_private_supported = true;
+	cap->shaper_private_dual_rate_supported = true;
+	cap->shaper_private_rate_min = 0;
+	cap->shaper_private_rate_max = 12500000000ull;
+	cap->shaper_private_packet_mode_supported = 0;
+	cap->shaper_private_byte_mode_supported = 1;
+
+	cap->shaper_shared_n_max = 0;
+	cap->shaper_shared_packet_mode_supported = 0;
+	cap->shaper_shared_byte_mode_supported = 0;
+
+	if (tm_node->level == adapter->tm_ctxt.tm_layers - 1) {
+		cap->leaf.cman_head_drop_supported = false;
+		cap->leaf.cman_wred_context_private_supported = false;
+		cap->leaf.cman_wred_context_shared_n_max = 0;
+	} else {
+		cap->nonleaf.sched_n_children_max = SXE2_TM_MAX_CHILDREN_COUNT;
+		cap->nonleaf.sched_sp_n_priorities_max = 7;
+		cap->nonleaf.sched_wfq_n_children_per_group_max = 0;
+		cap->nonleaf.sched_wfq_n_groups_max = 0;
+		cap->nonleaf.sched_wfq_weight_max = 0;
+		cap->nonleaf.sched_wfq_packet_mode_supported = 0;
+		cap->nonleaf.sched_wfq_byte_mode_supported = 0;
+	}
+	cap->stats_mask = 0;
+
+	ret = 0;
+l_end:
+	return ret;
+}
+
+static int32_t sxe2_tm_shaper_profile_param_check(const struct rte_tm_shaper_params *profile,
+					      struct rte_tm_error *error)
+{
+	int32_t ret = 0;
+
+	if (profile->committed.size) {
+		PMD_LOG_ERR(DRV, "committed bucket size not supported.");
+		error->type = RTE_TM_ERROR_TYPE_SHAPER_PROFILE_COMMITTED_SIZE;
+		error->message = "committed bucket size not supported";
+		ret = -EINVAL;
+		goto l_end;
+	}
+
+	if (profile->peak.size) {
+		PMD_LOG_ERR(DRV, "peak bucket size not supported.");
+		error->type = RTE_TM_ERROR_TYPE_SHAPER_PROFILE_PEAK_SIZE;
+		error->message = "peak bucket size not supported";
+		ret = -EINVAL;
+		goto l_end;
+	}
+
+	if (profile->pkt_length_adjust) {
+		PMD_LOG_ERR(DRV, "packet length adjustment not supported.");
+		error->type = RTE_TM_ERROR_TYPE_SHAPER_PROFILE_PKT_ADJUST_LEN;
+		error->message = "packet length adjustment not supported";
+		ret = -EINVAL;
+		goto l_end;
+	}
+
+	if (profile->committed.rate > SXE2_HW_RATE_MAX ||
+		profile->committed.rate < SXE2_HW_RATE_MIN) {
+		PMD_LOG_ERR(DRV, "The committed rate limit value is required to be in "
+			"the range [%" PRIu64 ", %" PRIu64 "].",
+			(uint64_t)SXE2_HW_RATE_MIN, (uint64_t)SXE2_HW_RATE_MAX);
+		error->type = RTE_TM_ERROR_TYPE_SHAPER_PROFILE_COMMITTED_RATE;
+		error->message = "invalid rate limit: value out of range.";
+		ret = -EINVAL;
+		goto l_end;
+	}
+
+	if (profile->peak.rate > SXE2_HW_RATE_MAX ||
+		profile->peak.rate < SXE2_HW_RATE_MIN) {
+		PMD_LOG_ERR(DRV, "The peak rate limit value is required to be in "
+			"the range [%" PRIu64 ", %" PRIu64 "].",
+			(uint64_t)SXE2_HW_RATE_MIN, (uint64_t)SXE2_HW_RATE_MAX);
+		error->type = RTE_TM_ERROR_TYPE_SHAPER_PROFILE_PEAK_RATE;
+		error->message = "invalid rate limit: value out of range.";
+		ret = -EINVAL;
+		goto l_end;
+	}
+
+	if (profile->committed.rate > profile->peak.rate) {
+		PMD_LOG_ERR(DRV, "committed rate can't be greater than peak rate.");
+		error->type = RTE_TM_ERROR_TYPE_SHAPER_PROFILE_COMMITTED_RATE;
+		error->message = "committed rate can't be greater than peak rate.";
+		ret = -EINVAL;
+		goto l_end;
+	}
+l_end:
+	return ret;
+}
+
+static inline struct sxe2_tm_shaper_profile *
+sxe2_tm_shaper_profile_search(struct rte_eth_dev *dev, uint32_t id)
+{
+	struct sxe2_adapter *adapter = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
+	struct sxe2_shaper_profile_list *shaper_profile_list =
+		&adapter->tm_ctxt.profile_list;
+	struct sxe2_tm_shaper_profile *shaper_profile = NULL;
+
+	TAILQ_FOREACH(shaper_profile, shaper_profile_list, node) {
+		if (id == shaper_profile->id)
+			goto l_end;
+	}
+
+	shaper_profile = NULL;
+
+l_end:
+	return shaper_profile;
+}
+
+static int32_t sxe2_tm_shaper_profile_add(struct rte_eth_dev *dev, uint32_t shaper_profile_id,
+				      const struct rte_tm_shaper_params *profile,
+				      struct rte_tm_error *error)
+{
+	int32_t ret = -EINVAL;
+	struct sxe2_tm_shaper_profile *shaper_profile = NULL;
+	struct sxe2_adapter *adapter = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
+
+	if (!profile || !error) {
+		PMD_LOG_ERR(DRV, "Invalid input: profile:0x%p or error:0x%p is null.",
+			profile, error);
+		if (error) {
+			error->type = RTE_TM_ERROR_TYPE_UNSPECIFIED;
+			error->message = "Invalid input: profile or error is null.";
+		}
+		ret = -EINVAL;
+		goto l_end;
+	}
+
+	if ((adapter->cap_flags & SXE2_DEV_CAPS_OFFLOAD_TM) == 0) {
+		PMD_LOG_ERR(DRV, "The TM capability is not supported.");
+		error->type = RTE_TM_ERROR_TYPE_UNSPECIFIED;
+		error->message = "The TM capability is not supported.";
+		ret = ENOTSUP;
+		goto l_end;
+	}
+
+	ret = sxe2_tm_shaper_profile_param_check(profile, error);
+	if (ret)
+		goto l_end;
+
+	shaper_profile = sxe2_tm_shaper_profile_search(dev, shaper_profile_id);
+	if (shaper_profile) {
+		PMD_LOG_ERR(DRV, "profile ID exist.");
+		error->type = RTE_TM_ERROR_TYPE_SHAPER_PROFILE_ID;
+		error->message = "profile ID exist";
+		ret = -EINVAL;
+		goto l_end;
+	}
+
+	shaper_profile = rte_zmalloc("sxe2_tm_shaper_profile",
+							sizeof(struct sxe2_tm_shaper_profile), 0);
+	if (!shaper_profile) {
+		PMD_LOG_ERR(DRV, "Alloc shaper_profile memory failed.");
+		error->type = RTE_TM_ERROR_TYPE_UNSPECIFIED;
+		error->message = "Alloc shaper_profile memory failed";
+		ret = -ENOMEM;
+		goto l_end;
+	}
+
+	rte_memcpy(&shaper_profile->profile, profile,
+					sizeof(struct rte_tm_shaper_params));
+	shaper_profile->id = shaper_profile_id;
+
+	TAILQ_INSERT_TAIL(&adapter->tm_ctxt.profile_list, shaper_profile, node);
+l_end:
+	return ret;
+}
+
+static int32_t sxe2_tm_shaper_profile_del(struct rte_eth_dev *dev,
+		uint32_t id, struct rte_tm_error *error)
+{
+	int32_t ret = -EINVAL;
+	struct sxe2_tm_shaper_profile *shaper_profile = NULL;
+	struct sxe2_adapter *adapter = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
+
+	if (!error) {
+		PMD_LOG_ERR(DRV, "Error param is null.");
+		goto l_end;
+	}
+
+	if ((adapter->cap_flags & SXE2_DEV_CAPS_OFFLOAD_TM) == 0) {
+		PMD_LOG_ERR(DRV, "The TM capability is not supported.");
+		error->type = RTE_TM_ERROR_TYPE_UNSPECIFIED;
+		error->message = "The TM capability is not supported.";
+		ret = ENOTSUP;
+		goto l_end;
+	}
+
+	shaper_profile = sxe2_tm_shaper_profile_search(dev, id);
+	if (!shaper_profile) {
+		PMD_LOG_ERR(DRV, "profile ID not exist.");
+		error->type = RTE_TM_ERROR_TYPE_SHAPER_PROFILE_ID;
+		error->message = "profile ID not exist";
+		ret = -EINVAL;
+		goto l_end;
+	}
+
+	if (shaper_profile->ref_cnt) {
+		PMD_LOG_ERR(DRV, "profile in use.");
+		error->type = RTE_TM_ERROR_TYPE_SHAPER_PROFILE;
+		error->message = "profile in use";
+		ret = -EINVAL;
+		goto l_end;
+	}
+
+	TAILQ_REMOVE(&adapter->tm_ctxt.profile_list, shaper_profile, node);
+	rte_free(shaper_profile);
+
+	ret = 0;
+l_end:
+	return ret;
+}
+
+static int32_t sxe2_tm_node_param_check(struct rte_eth_dev *dev,
+			uint32_t parent_node_type,
+			uint32_t node_id, uint32_t priority, uint32_t weight,
+			const struct rte_tm_node_params *params,
+			bool is_leaf, struct rte_tm_error *error)
+{
+	struct sxe2_adapter *adapter = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
+	struct sxe2_tm_context *tm_ctxt = &adapter->tm_ctxt;
+	int32_t ret = -EINVAL;
+
+	if (node_id == RTE_TM_NODE_ID_NULL) {
+		PMD_LOG_ERR(DRV, "Invalid node id.");
+		error->type = RTE_TM_ERROR_TYPE_NODE_ID;
+		error->message = "invalid node id";
+		goto l_end;
+	}
+
+	if (parent_node_type == SXE2_TM_NODE_TYPE_VSIG &&
+			priority >= tm_ctxt->prio_max) {
+		PMD_LOG_ERR(DRV, "Priority should be less than %u.", tm_ctxt->prio_max);
+		error->type = RTE_TM_ERROR_TYPE_NODE_PRIORITY;
+		error->message = "The priority is too high.";
+		goto l_end;
+	}
+
+	if (priority > SXE2_TM_PRIO_MAX) {
+		PMD_LOG_ERR(DRV, "Priority should be less than %u.", SXE2_TM_PRIO_MAX);
+		error->type = RTE_TM_ERROR_TYPE_NODE_PRIORITY;
+		error->message = "The priority is too high.";
+		goto l_end;
+	}
+
+	if (weight > SXE2_TM_WEIGHT_MAX || weight < SXE2_TM_WEIGHT_MIN) {
+		PMD_LOG_ERR(DRV, "Weight must be between 1 and 200.");
+		error->type = RTE_TM_ERROR_TYPE_NODE_WEIGHT;
+		error->message = "weight must be between 1 and 200";
+		goto l_end;
+	}
+
+	if (params->shared_shaper_id) {
+		PMD_LOG_ERR(DRV, "Shared shaper not supported.");
+		error->type = RTE_TM_ERROR_TYPE_NODE_PARAMS_SHARED_SHAPER_ID;
+		error->message = "shared shaper not supported";
+		goto l_end;
+	}
+	if (params->n_shared_shapers) {
+		PMD_LOG_ERR(DRV, "Shared shaper not supported..");
+		error->type = RTE_TM_ERROR_TYPE_NODE_PARAMS_N_SHARED_SHAPERS;
+		error->message = "shared shaper not supported";
+		goto l_end;
+	}
+
+	if (!is_leaf) {
+		if (node_id <= dev->data->nb_tx_queues) {
+			PMD_LOG_ERR(DRV, "no leaf node id must bigger than queue id.");
+			error->type = RTE_TM_ERROR_TYPE_NODE_ID;
+			error->message = "no leaf node id must bigger than queue id.";
+			goto l_end;
+		}
+
+		if (params->nonleaf.wfq_weight_mode) {
+			PMD_LOG_ERR(DRV, "WFQ not supported.");
+			error->type = RTE_TM_ERROR_TYPE_NODE_PARAMS_WFQ_WEIGHT_MODE;
+			error->message = "WFQ not supported";
+			goto l_end;
+		}
+
+		if (params->nonleaf.n_sp_priorities != 1) {
+			PMD_LOG_ERR(DRV, "SP priority not supported.");
+			error->type = RTE_TM_ERROR_TYPE_NODE_PARAMS_N_SP_PRIORITIES;
+			error->message = "SP priority not supported";
+			goto l_end;
+		}
+	} else {
+		if (node_id >= dev->data->nb_tx_queues) {
+			PMD_LOG_ERR(DRV, "leaf node id must be queue id.");
+			error->type = RTE_TM_ERROR_TYPE_NODE_ID;
+			error->message = "leaf node id must be queue id.";
+			goto l_end;
+		}
+
+		if (params->leaf.cman) {
+			PMD_LOG_ERR(DRV, "Congestion management not supported.");
+			error->type = RTE_TM_ERROR_TYPE_NODE_PARAMS_CMAN;
+			error->message = "Congestion management not supported";
+			goto l_end;
+		}
+		if (params->leaf.wred.wred_profile_id != RTE_TM_WRED_PROFILE_ID_NONE) {
+			PMD_LOG_ERR(DRV, "WRED not supported.");
+			error->type = RTE_TM_ERROR_TYPE_NODE_PARAMS_WRED_PROFILE_ID;
+			error->message = "WRED not supported";
+			goto l_end;
+		}
+		if (params->leaf.wred.shared_wred_context_id) {
+			PMD_LOG_ERR(DRV, "WRED not supported.");
+			error->type = RTE_TM_ERROR_TYPE_NODE_PARAMS_SHARED_WRED_CONTEXT_ID;
+			error->message = "WRED not supported";
+			goto l_end;
+		}
+		if (params->leaf.wred.n_shared_wred_contexts) {
+			PMD_LOG_ERR(DRV, "WRED not supported.");
+			error->type = RTE_TM_ERROR_TYPE_NODE_PARAMS_N_SHARED_WRED_CONTEXTS;
+			error->message = "WRED not supported";
+			goto l_end;
+		}
+	}
+	ret = 0;
+l_end:
+	return ret;
+}
+
+static int32_t sxe2_tm_add_child(struct sxe2_tm_node *parent,
+		struct sxe2_tm_node *child)
+{
+	int32_t ret = -1;
+	uint32_t i;
+	for (i = 0; i < SXE2_TM_MAX_CHILDREN_COUNT; i++) {
+		if (parent->children[i] == NULL) {
+			parent->children[i] = child;
+			child->index_in_parent = i;
+			parent->child_cnt++;
+			ret = 0;
+			break;
+		}
+	}
+	return ret;
+}
+
+static int32_t sxe2_tm_node_add(struct rte_eth_dev *dev, uint32_t node_id, uint32_t parent_node_id,
+			    uint32_t priority, uint32_t weight, uint32_t level_id,
+			    const struct rte_tm_node_params *params,
+			    struct rte_tm_error *error)
+{
+	struct sxe2_adapter *adapter = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
+	struct sxe2_tm_shaper_profile *shaper_profile = NULL;
+	struct sxe2_tm_context *tm_ctxt = &adapter->tm_ctxt;
+	struct sxe2_tm_node *tm_node = NULL;
+	struct sxe2_tm_node *parent_node = NULL;
+	int32_t ret = -EINVAL;
+	bool is_leaf;
+
+	if (!params || !error) {
+		PMD_LOG_ERR(DRV, "Invalid input: params:0x%p or error:0x%p is null.",
+			params, error);
+		if (error) {
+			error->type = RTE_TM_ERROR_TYPE_UNSPECIFIED;
+			error->message = "Invalid input: params or error is null.";
+		}
+		ret = -EINVAL;
+		goto l_end;
+	}
+
+	if ((adapter->cap_flags & SXE2_DEV_CAPS_OFFLOAD_TM) == 0) {
+		PMD_LOG_ERR(DRV, "The TM capability is not supported.");
+		error->type = RTE_TM_ERROR_TYPE_UNSPECIFIED;
+		error->message = "The TM capability is not supported.";
+		ret = ENOTSUP;
+		goto l_end;
+	}
+
+	shaper_profile = sxe2_tm_shaper_profile_search(dev, params->shaper_profile_id);
+	if (!shaper_profile) {
+		PMD_LOG_ERR(DRV, "Shaper profile does not exist.");
+		error->type = RTE_TM_ERROR_TYPE_NODE_PARAMS_SHAPER_PROFILE_ID;
+		error->message = "shaper profile does not exist";
+		ret = -EINVAL;
+		goto l_end;
+	}
+
+	if (parent_node_id == RTE_TM_NODE_ID_NULL) {
+		if (level_id != 0) {
+			PMD_LOG_ERR(DRV, "Wrong level, root node (NULL parent) must be at level 0.");
+			error->type = RTE_TM_ERROR_TYPE_NODE_PARAMS;
+			error->message = "Wrong level, root node (NULL parent) must be at level 0";
+			ret = -EINVAL;
+			goto l_end;
+		}
+
+		if (tm_ctxt->root) {
+			PMD_LOG_ERR(DRV, "Already have a root.");
+			error->type = RTE_TM_ERROR_TYPE_NODE_PARENT_NODE_ID;
+			error->message = "already have a root";
+			ret = -EINVAL;
+			goto l_end;
+		}
+
+		ret = sxe2_tm_node_param_check(dev, SXE2_TM_NODE_TYPE_INVALID, node_id, priority,
+					       weight, params, false, error);
+		if (ret)
+			goto l_end;
+
+		tm_node = rte_zmalloc("tm_node_root", sizeof(struct sxe2_tm_node), 0);
+		if (!tm_node) {
+			PMD_LOG_ERR(DRV, "Alloc tm_node memory failed.");
+			error->type = RTE_TM_ERROR_TYPE_UNSPECIFIED;
+			error->message = "Alloc tm_node memory failed";
+			ret = -ENOMEM;
+			goto l_end;
+		}
+
+		tm_node->id = node_id;
+		tm_node->level = 0;
+		tm_node->parent = NULL;
+		tm_node->child_cnt = 0;
+		tm_node->weight = weight;
+		tm_node->hw_weight = SXE2_TM_WEIGHT_SUM;
+		tm_node->type = SXE2_TM_NODE_TYPE_VSIG;
+		tm_node->priority = priority;
+		tm_node->shaper_profile = shaper_profile;
+
+		tm_node->teid = tm_ctxt->root_teid;
+
+		shaper_profile->ref_cnt++;
+		tm_ctxt->root = tm_node;
+		ret = 0;
+		goto l_end;
+	}
+
+	parent_node = sxe2_tm_find_node(tm_ctxt->root, parent_node_id);
+	if (!parent_node) {
+		PMD_LOG_ERR(DRV, "Parent not exist.");
+		error->type = RTE_TM_ERROR_TYPE_NODE_PARENT_NODE_ID;
+		error->message = "parent not exist";
+		ret = -EINVAL;
+		goto l_end;
+	}
+
+	if (parent_node->child_cnt >= SXE2_TM_MAX_CHILDREN_COUNT ||
+		(parent_node->type == SXE2_TM_NODE_TYPE_VSIG &&
+		 parent_node->child_cnt >= tm_ctxt->root_max_children)) {
+		PMD_LOG_ERR(DRV, "Parent node is full.");
+		error->type = RTE_TM_ERROR_TYPE_NODE_PARAMS;
+		error->message = "parent node is full";
+		ret = -EINVAL;
+		goto l_end;
+	}
+
+	if (level_id == RTE_TM_NODE_LEVEL_ID_ANY) {
+		level_id = parent_node->level + 1;
+	} else if (level_id != parent_node->level + 1) {
+		PMD_LOG_ERR(DRV, "Wrong level.");
+		error->type = RTE_TM_ERROR_TYPE_NODE_PARAMS;
+		error->message = "Wrong level";
+		ret = -EINVAL;
+		goto l_end;
+	}
+
+	if (level_id >= tm_ctxt->tm_layers) {
+		PMD_LOG_ERR(DRV, "The maximum number of TM configuration levels is %d",
+				tm_ctxt->tm_layers);
+		error->type = RTE_TM_ERROR_TYPE_NODE_PARAMS;
+		error->message = "TM level exceeds supported hardware limit";
+		ret = -EINVAL;
+		goto l_end;
+	}
+
+	if (level_id + 1 == tm_ctxt->tm_layers)
+		is_leaf = true;
+	else
+		is_leaf = false;
+
+	ret = sxe2_tm_node_param_check(dev, parent_node->type, node_id, priority, weight,
+				       params, is_leaf, error);
+	if (ret)
+		goto l_end;
+
+	if (sxe2_tm_find_node(tm_ctxt->root, node_id)) {
+		PMD_LOG_ERR(DRV, "Node id already used.");
+		error->type = RTE_TM_ERROR_TYPE_NODE_ID;
+		error->message = "node id already used";
+		ret = -EINVAL;
+		goto l_end;
+	}
+
+	tm_node = rte_zmalloc("tm_node_no_root", sizeof(struct sxe2_tm_node), 0);
+	if (!tm_node) {
+		PMD_LOG_ERR(DRV, "Alloc tm_node memory failed.");
+		error->type = RTE_TM_ERROR_TYPE_UNSPECIFIED;
+		error->message = "Alloc tm_node memory failed";
+		ret = -ENOMEM;
+		goto l_end;
+	}
+
+	if (level_id + 1 != tm_ctxt->tm_layers)
+		tm_node->type = SXE2_TM_NODE_TYPE_MID;
+	else
+		tm_node->type = SXE2_TM_NODE_TYPE_QUEUE;
+	tm_node->id = node_id;
+	tm_node->level = level_id;
+	tm_node->parent = parent_node;
+	tm_node->child_cnt = 0;
+	tm_node->weight = weight;
+	tm_node->priority = priority;
+	tm_node->shaper_profile = shaper_profile;
+	shaper_profile->ref_cnt++;
+
+	ret = sxe2_tm_add_child(parent_node, tm_node);
+	if (ret) {
+		shaper_profile->ref_cnt--;
+		rte_free(tm_node);
+	}
+l_end:
+	return ret;
+}
+
+static int32_t sxe2_tm_tree_delete(struct sxe2_tm_node *tm_node)
+{
+	int32_t ret = 0;
+	uint32_t i, j;
+	struct sxe2_tm_node *parent = NULL;
+
+	if (!tm_node)
+		goto l_end;
+
+	parent = tm_node->parent;
+
+	if (tm_node->child_cnt != 0) {
+		for (i = SXE2_TM_MAX_CHILDREN_COUNT; i > 0; i--) {
+			if (tm_node->children[i - 1])
+				ret = sxe2_tm_tree_delete(tm_node->children[i - 1]);
+		}
+	}
+
+	if (tm_node->shaper_profile)
+		tm_node->shaper_profile->ref_cnt--;
+
+	if (tm_node->type != SXE2_TM_NODE_TYPE_VSIG && parent) {
+		for (i = 0; i < parent->child_cnt; i++) {
+			if (parent->children[i] == tm_node)
+				break;
+		}
+		for (j = i; j < parent->child_cnt - 1; j++)
+			parent->children[j] = parent->children[j + 1];
+
+		parent->children[parent->child_cnt - 1] = NULL;
+		parent->child_cnt--;
+	}
+	rte_free(tm_node);
+	tm_node = NULL;
+l_end:
+	return ret;
+}
+
+static int32_t sxe2_tm_node_delete(struct rte_eth_dev *dev, uint32_t node_id,
+		struct rte_tm_error *error)
+{
+	struct sxe2_adapter *adapter = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
+	struct sxe2_tm_context *tm_ctxt = &adapter->tm_ctxt;
+	struct sxe2_tm_node *tm_node = NULL;
+	int32_t ret = -EINVAL;
+
+	if (!error) {
+		PMD_LOG_ERR(DRV, "Invalid input: error is null.");
+		ret = -EINVAL;
+		goto l_end;
+	}
+
+	if ((adapter->cap_flags & SXE2_DEV_CAPS_OFFLOAD_TM) == 0) {
+		PMD_LOG_ERR(DRV, "The TM capability is not supported.");
+		error->type = RTE_TM_ERROR_TYPE_UNSPECIFIED;
+		error->message = "The TM capability is not supported.";
+		ret = ENOTSUP;
+		goto l_end;
+	}
+
+	if (node_id == RTE_TM_NODE_ID_NULL) {
+		PMD_LOG_ERR(DRV, "Invalid node id. node_id = %u", node_id);
+		error->type = RTE_TM_ERROR_TYPE_NODE_ID;
+		error->message = "invalid node id";
+		goto l_end;
+	}
+
+	tm_node = sxe2_tm_find_node(tm_ctxt->root, node_id);
+	if (!tm_node) {
+		PMD_LOG_ERR(DRV, "No such node.");
+		error->type = RTE_TM_ERROR_TYPE_NODE_ID;
+		error->message = "no such node";
+		ret = -EINVAL;
+		goto l_end;
+	}
+
+	ret = sxe2_tm_tree_delete(tm_node);
+	if (ret) {
+		PMD_LOG_ERR(DRV, "Delete node failed.");
+		error->type = RTE_TM_ERROR_TYPE_NODE_ID;
+		error->message = "Delete node failed";
+		goto l_end;
+	}
+
+	if (tm_node == tm_ctxt->root)
+		tm_ctxt->root = NULL;
+
+l_end:
+	return ret;
+}
+
+static int32_t sxe2_tm_node_type_get(struct rte_eth_dev *dev, uint32_t node_id,
+		int32_t *is_leaf, struct rte_tm_error *error)
+{
+	struct sxe2_adapter *adapter = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
+	struct sxe2_tm_context *tm_ctxt = &adapter->tm_ctxt;
+	struct sxe2_tm_node *tm_node = NULL;
+	int32_t ret = -EINVAL;
+
+	if (!is_leaf || !error) {
+		PMD_LOG_ERR(DRV, "Invalid input: is_leaf:0x%p or error:0x%p is null.",
+			is_leaf, error);
+		if (error) {
+			error->type = RTE_TM_ERROR_TYPE_UNSPECIFIED;
+			error->message = "Invalid input: is_leaf or error is null";
+		}
+		goto l_end;
+	}
+
+	if ((adapter->cap_flags & SXE2_DEV_CAPS_OFFLOAD_TM) == 0) {
+		PMD_LOG_ERR(DRV, "The TM capability is not supported.");
+		error->type = RTE_TM_ERROR_TYPE_UNSPECIFIED;
+		error->message = "The TM capability is not supported.";
+		ret = ENOTSUP;
+		goto l_end;
+	}
+
+	if (node_id == RTE_TM_NODE_ID_NULL) {
+		PMD_LOG_ERR(DRV, "Invalid node id.");
+		error->type = RTE_TM_ERROR_TYPE_NODE_ID;
+		error->message = "invalid node id";
+		goto l_end;
+	}
+
+	tm_node = sxe2_tm_find_node(tm_ctxt->root, node_id);
+	if (!tm_node) {
+		PMD_LOG_ERR(DRV, "No such node.");
+		error->type = RTE_TM_ERROR_TYPE_NODE_ID;
+		error->message = "no such node";
+		goto l_end;
+	}
+
+	if (tm_node->level + 1 == tm_ctxt->tm_layers)
+		*is_leaf = true;
+	else
+		*is_leaf = false;
+	ret = 0;
+l_end:
+	return ret;
+}
+
+int32_t sxe2_tm_uninit(struct rte_eth_dev *dev)
+{
+	struct sxe2_adapter *adapter = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
+	struct sxe2_tm_context *tm_ctxt = &adapter->tm_ctxt;
+	struct sxe2_tm_shaper_profile *shaper_profile = NULL;
+	int32_t ret = 0;
+
+	if ((adapter->cap_flags & SXE2_DEV_CAPS_OFFLOAD_TM) == 0) {
+		ret = 0;
+		goto l_end;
+	}
+
+	tm_ctxt->tm_layers = 0;
+	tm_ctxt->root_max_children = 0;
+	tm_ctxt->committed = false;
+
+	(void)sxe2_tm_tree_delete(tm_ctxt->root);
+
+	while ((shaper_profile = TAILQ_FIRST(&tm_ctxt->profile_list))) {
+		TAILQ_REMOVE(&tm_ctxt->profile_list, shaper_profile, node);
+		rte_free(shaper_profile);
+	}
+l_end:
+	return ret;
+}
+
+int32_t sxe2_tm_init(struct rte_eth_dev *dev)
+{
+	struct sxe2_adapter *adapter = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
+	struct sxe2_sched_hw_cap *sched_ctxt = &adapter->sched_ctxt;
+	struct sxe2_tm_context *tm_ctxt = &adapter->tm_ctxt;
+	int32_t ret = 0;
+	struct sxe2_tm_shaper_profile *shaper_profile = NULL;
+
+	if ((adapter->cap_flags & SXE2_DEV_CAPS_OFFLOAD_TM) == 0)
+		goto l_end;
+
+	tm_ctxt->tm_layers = sched_ctxt->tm_layers;
+	tm_ctxt->root_max_children = sched_ctxt->root_max_children;
+	tm_ctxt->prio_max = sched_ctxt->prio_max;
+	tm_ctxt->committed = false;
+	TAILQ_INIT(&tm_ctxt->profile_list);
+	tm_ctxt->root = NULL;
+
+	shaper_profile = rte_zmalloc("sxe2_tm_shaper_profile",
+		sizeof(struct sxe2_tm_shaper_profile), 0);
+	if (!shaper_profile) {
+		PMD_LOG_ERR(DRV, "Alloc shaper_profile memory failed.");
+		ret = -ENOMEM;
+		goto l_end;
+	}
+	shaper_profile->id = RTE_TM_SHAPER_PROFILE_ID_NONE;
+	shaper_profile->ref_cnt = 1;
+	shaper_profile->profile.committed.rate = UINT64_MAX;
+	shaper_profile->profile.peak.rate = UINT64_MAX;
+	TAILQ_INSERT_TAIL(&tm_ctxt->profile_list, shaper_profile, node);
+
+l_end:
+	return ret;
+}
+
+static int32_t sxe2_tm_chk_all_leaf(struct rte_eth_dev *dev)
+{
+	int32_t ret = 0;
+	uint32_t i = 0;
+	struct sxe2_adapter *adapter = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
+	for (i = 0; i < dev->data->nb_tx_queues; i++) {
+		if (!sxe2_tm_find_node(adapter->tm_ctxt.root, i)) {
+			ret = -1;
+			break;
+		}
+	}
+	return ret;
+}
+
+static int32_t sxe2_tm_weight_calc(struct sxe2_tm_node *tm_node)
+{
+	int32_t ret = 0;
+	int32_t total_weight = 0;
+	int32_t total_weight2 = 0;
+	uint32_t i = 0;
+	uint32_t j = 0;
+	uint32_t k = 0;
+	uint32_t maxindex = 0;
+	uint32_t maxweight = 0;
+	struct sxe2_tm_node *cacl_node[SXE2_TM_MAX_CHILDREN_COUNT] = {NULL};
+
+	if (!tm_node) {
+		PMD_LOG_ERR(DRV, "Invalid input: tm_node is null.");
+		ret = -EINVAL;
+		goto l_end;
+	}
+
+	if (tm_node->child_cnt == 0)
+		goto l_end;
+
+	for (j = SXE2_TM_PRIO_MIN; j <= SXE2_TM_PRIO_MAX; j++) {
+		k = 0;
+		total_weight = 0;
+		total_weight2 = 0;
+		maxindex = 0;
+		maxweight = 0;
+
+		for (i = 0; i < tm_node->child_cnt; i++) {
+			if (tm_node->children[i]->priority == j)
+				cacl_node[k++] = tm_node->children[i];
+		}
+		if (k == 0)
+			continue;
+
+		for (i = 0; i < k; i++)
+			total_weight += cacl_node[i]->weight;
+
+		for (i = 0; i < k; i++) {
+			cacl_node[i]->hw_weight = cacl_node[i]->weight *
+				SXE2_TM_WEIGHT_SUM / total_weight;
+			total_weight2 += cacl_node[i]->hw_weight;
+			if (cacl_node[i]->hw_weight > maxweight) {
+				maxweight = cacl_node[i]->hw_weight;
+				maxindex = i;
+			}
+		}
+
+		cacl_node[maxindex]->hw_weight += SXE2_TM_WEIGHT_SUM - total_weight2;
+	}
+
+	for (i = 0; i < tm_node->child_cnt; i++) {
+		ret = sxe2_tm_weight_calc(tm_node->children[i]);
+		if (ret)
+			goto l_end;
+	}
+
+l_end:
+	return ret;
+}
+
+static int32_t sxe2_tm_hierarchy_commit(struct rte_eth_dev *dev,
+				     int32_t clear_on_fail, struct rte_tm_error *error)
+{
+	struct sxe2_adapter *adapter = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
+	int32_t ret = -EINVAL;
+
+	if (!error) {
+		PMD_LOG_ERR(DRV, "Invalid input: error is null.");
+		ret = -EINVAL;
+		goto l_clear_on_fail;
+	}
+
+	if ((adapter->cap_flags & SXE2_DEV_CAPS_OFFLOAD_TM) == 0) {
+		PMD_LOG_ERR(DRV, "The TM capability is not supported.");
+		error->type = RTE_TM_ERROR_TYPE_UNSPECIFIED;
+		error->message = "The TM capability is not supported.";
+		ret = ENOTSUP;
+		goto l_clear_on_fail;
+	}
+
+	if (dev->data->dev_started) {
+		PMD_LOG_ERR(DRV, "Device failed to Stop.");
+		error->message = "Device failed to Stop";
+		error->type = RTE_TM_ERROR_TYPE_UNSPECIFIED;
+		ret = -EPERM;
+		goto l_clear_on_fail;
+	}
+
+	ret = sxe2_tm_chk_all_leaf(dev);
+	if (ret) {
+		PMD_LOG_ERR(DRV, "All tx queues need config.");
+		error->type = RTE_TM_ERROR_TYPE_UNSPECIFIED;
+		error->message = "All tx queues need config.";
+		goto l_clear_on_fail;
+	}
+
+	ret = sxe2_tm_weight_calc(adapter->tm_ctxt.root);
+	if (ret) {
+		PMD_LOG_ERR(DRV, "The weight in tree is wrong.");
+		error->type = RTE_TM_ERROR_TYPE_UNSPECIFIED;
+		error->message = "The weight in tree is wrong.";
+		goto l_clear_on_fail;
+	}
+
+	ret = sxe2_drv_tm_commit(adapter);
+	if (ret) {
+		PMD_LOG_ERR(DRV, "Commit tree to fw failed.");
+		error->type = RTE_TM_ERROR_TYPE_UNSPECIFIED;
+		error->message = "Commit tree to fw failed.";
+		goto l_clear_on_fail;
+	}
+
+	adapter->tm_ctxt.committed = true;
+	ret = 0;
+	goto l_end;
+
+l_clear_on_fail:
+	if (clear_on_fail) {
+		(void)sxe2_tm_uninit(dev);
+		(void)sxe2_tm_init(dev);
+	}
+
+l_end:
+	return ret;
+}
+
+static const struct rte_tm_ops sxe2_tm_ops = {
+	.capabilities_get = sxe2_tm_capabilities_get,
+	.level_capabilities_get = sxe2_level_capabilities_get,
+	.node_capabilities_get = sxe2_node_capabilities_get,
+	.shaper_profile_add = sxe2_tm_shaper_profile_add,
+	.shaper_profile_delete = sxe2_tm_shaper_profile_del,
+	.node_add = sxe2_tm_node_add,
+	.node_delete = sxe2_tm_node_delete,
+	.node_type_get = sxe2_tm_node_type_get,
+
+	.hierarchy_commit = sxe2_tm_hierarchy_commit,
+};
+
+int32_t sxe2_tm_ops_get(struct rte_eth_dev *dev __rte_unused, void *arg)
+{
+	int32_t ret = 0;
+
+	if (!arg) {
+		ret = -EINVAL;
+		PMD_LOG_ERR(DRV, "%s failed because arg is NULL", __func__);
+		goto l_end;
+	}
+	*(const void **)arg = &sxe2_tm_ops;
+l_end:
+	return ret;
+}
diff --git a/drivers/net/sxe2/sxe2_tm.h b/drivers/net/sxe2/sxe2_tm.h
new file mode 100644
index 0000000000..c4f8da6a8e
--- /dev/null
+++ b/drivers/net/sxe2/sxe2_tm.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (C), 2025, Wuxi Stars Micro System Technologies Co., Ltd.
+ */
+
+#ifndef __SXE2_TM_H__
+#define __SXE2_TM_H__
+#include <ethdev_driver.h>
+#include <rte_flow_driver.h>
+#include "sxe2_osal.h"
+
+#define SXE2_TM_MAX_LEVEL 7
+#define SXE2_TM_1L_NODE_NUM_MAX 1
+#define SXE2_TM_2L_NODE_NUM_MAX 8
+#define SXE2_TM_3L_NODE_NUM_MAX 64
+#define SXE2_TM_4L_NODE_NUM_MAX 256
+
+#define SXE2_TM_MAX_CHILDREN_COUNT 8
+
+#define SXE2_TM_WEIGHT_MAX  (200)
+#define SXE2_TM_WEIGHT_MIN  (1)
+#define SXE2_TM_WEIGHT_SUM  (32768)
+
+#define SXE2_HW_RATE_MIN 62500ull
+#define SXE2_HW_RATE_MAX 12500000000ull
+
+#define SXE2_TM_PRIO_MAX (7)
+#define SXE2_TM_PRIO_MIN (0)
+
+enum sxe2_tm_node_type {
+	SXE2_TM_NODE_TYPE_VSIG = 0,
+	SXE2_TM_NODE_TYPE_MID,
+	SXE2_TM_NODE_TYPE_QUEUE,
+	SXE2_TM_NODE_TYPE_INVALID,
+};
+
+struct sxe2_tm_shaper_profile {
+	TAILQ_ENTRY(sxe2_tm_shaper_profile) node;
+	uint32_t id;
+	uint32_t ref_cnt;
+	struct rte_tm_shaper_params profile;
+};
+
+TAILQ_HEAD(sxe2_shaper_profile_list, sxe2_tm_shaper_profile);
+
+struct sxe2_tm_node {
+	uint16_t id;
+	uint16_t teid;
+	uint32_t level;
+	uint32_t child_cnt;
+	uint32_t type;
+	uint16_t hw_weight;
+	uint16_t weight;
+	uint8_t priority;
+	struct sxe2_tm_node *parent;
+	uint8_t index_in_parent;
+	struct sxe2_tm_node *children[SXE2_TM_MAX_CHILDREN_COUNT];
+	struct sxe2_tm_shaper_profile *shaper_profile;
+};
+
+struct sxe2_tm_context {
+	uint32_t tm_layers;
+	uint16_t root_teid;
+	uint8_t root_max_children;
+	uint8_t prio_max;
+	bool committed;
+	struct sxe2_tm_node *root;
+	struct sxe2_shaper_profile_list profile_list;
+};
+
+int32_t sxe2_tm_ops_get(struct rte_eth_dev *dev __rte_unused, void *arg);
+
+int32_t sxe2_tm_init(struct rte_eth_dev *dev);
+
+int32_t sxe2_tm_uninit(struct rte_eth_dev *dev);
+
+#endif /* __SXE2_TM_H__ */
diff --git a/drivers/net/sxe2/sxe2_tx.c b/drivers/net/sxe2/sxe2_tx.c
index a05beb8c7a..a280edc9c5 100644
--- a/drivers/net/sxe2/sxe2_tx.c
+++ b/drivers/net/sxe2/sxe2_tx.c
@@ -304,7 +304,6 @@ int32_t __rte_cold sxe2_tx_queue_setup(struct rte_eth_dev *dev,
 	}
 
 	offloads = tx_conf->offloads | dev->data->dev_conf.txmode.offloads;
-
 	txq = sxe2_tx_queue_alloc(dev, queue_idx, nb_desc, socket_id);
 	if (txq == NULL) {
 		PMD_LOG_ERR(TX, "failed to alloc sxe2vf tx queue:%u resource", queue_idx);
-- 
2.52.0


^ permalink raw reply related

* [PATCH v2 02/20] net/sxe2: add AVX2 vector data path for Rx and Tx
From: liujie5 @ 2026-06-14  9:23 UTC (permalink / raw)
  To: stephen; +Cc: dev, Jie Liu
In-Reply-To: <20260614092328.201826-1-liujie5@linkdatatechnology.com>

From: Jie Liu <liujie5@linkdatatechnology.com>

Added AVX256 vectorized versions of Rx and Tx data path functions to
improve packet processing performance.

The vector path uses AVX2 SIMD instructions to process multiple
descriptors per loop, significantly reducing the per-packet overhead.

Signed-off-by: Jie Liu <liujie5@linkdatatechnology.com>

net/sxe2: support AVX512 vectorized path for Rx and Tx
---
 drivers/net/sxe2/meson.build          |   9 +
 drivers/net/sxe2/sxe2_txrx.c          |  38 +-
 drivers/net/sxe2/sxe2_txrx_vec.h      |  12 +-
 drivers/net/sxe2/sxe2_txrx_vec_avx2.c | 776 ++++++++++++++++++++++++++
 4 files changed, 830 insertions(+), 5 deletions(-)
 create mode 100644 drivers/net/sxe2/sxe2_txrx_vec_avx2.c

diff --git a/drivers/net/sxe2/meson.build b/drivers/net/sxe2/meson.build
index 7bd0d8120c..c225dd7cd8 100644
--- a/drivers/net/sxe2/meson.build
+++ b/drivers/net/sxe2/meson.build
@@ -39,6 +39,15 @@ if arch_subdir == 'x86'
                         c_args: avx512_args)
                 objs += sxe2_avx512_lib.extract_objects('sxe2_txrx_vec_avx512.c')
         endif
+        sxe2_avx2_lib = static_library('sxe2_avx2_lib',
+                'sxe2_txrx_vec_avx2.c',
+                dependencies: [static_rte_ethdev,
+                        static_rte_kvargs, static_rte_hash,
+                        static_rte_security, static_rte_cryptodev,
+                        static_rte_bus_pci],
+                include_directories: includes,
+                c_args: [cflags, '-mavx2'])
+        objs += sxe2_avx2_lib.extract_objects('sxe2_txrx_vec_avx2.c')
 endif
 
 sources += files(
diff --git a/drivers/net/sxe2/sxe2_txrx.c b/drivers/net/sxe2/sxe2_txrx.c
index aa1c474088..eaf95259a5 100644
--- a/drivers/net/sxe2/sxe2_txrx.c
+++ b/drivers/net/sxe2/sxe2_txrx.c
@@ -167,8 +167,14 @@ void sxe2_tx_mode_func_set(struct rte_eth_dev *dev)
 				PMD_LOG_INFO(TX, "AVX512 is not supported in build env.");
 #endif
 			}
-			if ((tx_mode_flags & SXE2_TX_MODE_VEC_SET_MASK) == 0)
-				tx_mode_flags |= SXE2_TX_MODE_VEC_SSE;
+			if (((tx_mode_flags & SXE2_TX_MODE_VEC_SET_MASK) == 0) &&
+			    ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1) ||
+			    (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1)) &&
+			    (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256))
+				tx_mode_flags |= SXE2_TX_MODE_VEC_AVX2;
+
+			if ((0 == (tx_mode_flags & SXE2_TX_MODE_VEC_SET_MASK)))
+				tx_mode_flags |=  SXE2_TX_MODE_VEC_SSE;
 #endif
 			if (tx_mode_flags & SXE2_TX_MODE_VEC_SET_MASK) {
 				ret = sxe2_tx_queues_vec_prepare(dev);
@@ -197,6 +203,13 @@ void sxe2_tx_mode_func_set(struct rte_eth_dev *dev)
 				dev->tx_pkt_burst = sxe2_tx_pkts_vec_avx512_simple;
 			}
 #endif
+		} else if (tx_mode_flags & SXE2_TX_MODE_VEC_AVX2) {
+			if (tx_mode_flags & SXE2_TX_MODE_VEC_OFFLOAD) {
+				dev->tx_pkt_prepare = sxe2_tx_pkts_prepare;
+				dev->tx_pkt_burst = sxe2_tx_pkts_vec_avx2;
+			} else {
+				dev->tx_pkt_burst = sxe2_tx_pkts_vec_avx2_simple;
+			}
 		} else {
 			if (tx_mode_flags & SXE2_TX_MODE_VEC_OFFLOAD) {
 				dev->tx_pkt_prepare = sxe2_tx_pkts_prepare;
@@ -231,6 +244,10 @@ static const struct {
 	{ sxe2_tx_pkts_vec_avx512_simple,
 	      "Vector AVX512 Simple" },
 #endif
+	{ sxe2_tx_pkts_vec_avx2,
+	      "Vector AVX2" },
+	{ sxe2_tx_pkts_vec_avx2_simple,
+	      "Vector AVX2 Simple" },
 	{ sxe2_tx_pkts_vec_sse,
 	      "Vector SSE" },
 	{ sxe2_tx_pkts_vec_sse_simple,
@@ -330,7 +347,13 @@ void sxe2_rx_mode_func_set(struct rte_eth_dev *dev)
 				PMD_LOG_INFO(RX, "AVX512 support detected but not enabled");
 #endif
 			}
-			if ((rx_mode_flags & SXE2_RX_MODE_VEC_SET_MASK) == 0 &&
+			if (((rx_mode_flags & SXE2_RX_MODE_VEC_SET_MASK) == 0) &&
+				((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1) ||
+				(rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1)) &&
+				(rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256))
+				rx_mode_flags |= SXE2_RX_MODE_VEC_AVX2;
+
+			if (((rx_mode_flags & SXE2_RX_MODE_VEC_SET_MASK) == 0) &&
 				rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128)
 				rx_mode_flags |= SXE2_RX_MODE_VEC_SSE;
 #endif
@@ -354,6 +377,11 @@ void sxe2_rx_mode_func_set(struct rte_eth_dev *dev)
 			else
 				dev->rx_pkt_burst = sxe2_rx_pkts_scattered_vec_avx512;
 #endif
+		} else if (rx_mode_flags & SXE2_RX_MODE_VEC_AVX2) {
+			if (rx_mode_flags & SXE2_RX_MODE_VEC_OFFLOAD)
+				dev->rx_pkt_burst = sxe2_rx_pkts_scattered_vec_avx2_offload;
+			else
+				dev->rx_pkt_burst = sxe2_rx_pkts_scattered_vec_avx2;
 		} else {
 			dev->rx_pkt_burst = sxe2_rx_pkts_scattered_vec_sse_offload;
 		}
@@ -381,6 +409,10 @@ static const struct {
 	{ sxe2_rx_pkts_scattered_vec_avx512_offload,
 	      "Offload Vector AVX512 Scattered" },
 #endif
+	{ sxe2_rx_pkts_scattered_vec_avx2,
+	      "Vector AVX2 Scattered" },
+	{ sxe2_rx_pkts_scattered_vec_avx2_offload,
+	      "Offload Vector AVX2 Scattered" },
 	{ sxe2_rx_pkts_scattered_vec_sse_offload,
 	      "Vector SSE Scattered" },
 #endif
diff --git a/drivers/net/sxe2/sxe2_txrx_vec.h b/drivers/net/sxe2/sxe2_txrx_vec.h
index af7c8d12b2..369777606f 100644
--- a/drivers/net/sxe2/sxe2_txrx_vec.h
+++ b/drivers/net/sxe2/sxe2_txrx_vec.h
@@ -11,19 +11,21 @@
 #define SXE2_RX_MODE_VEC_SIMPLE    RTE_BIT32(0)
 #define SXE2_RX_MODE_VEC_OFFLOAD   RTE_BIT32(1)
 #define SXE2_RX_MODE_VEC_SSE       RTE_BIT32(2)
+#define SXE2_RX_MODE_VEC_AVX2      RTE_BIT32(3)
 #define SXE2_RX_MODE_VEC_AVX512    RTE_BIT32(4)
 #define SXE2_RX_MODE_BATCH_ALLOC   RTE_BIT32(10)
 #define SXE2_RX_MODE_VEC_SET_MASK	(SXE2_RX_MODE_VEC_SIMPLE | \
 			SXE2_RX_MODE_VEC_OFFLOAD | SXE2_RX_MODE_VEC_SSE | \
-			SXE2_RX_MODE_VEC_AVX512)
+			SXE2_RX_MODE_VEC_AVX2 | SXE2_RX_MODE_VEC_AVX512)
 #define SXE2_TX_MODE_VEC_SIMPLE   RTE_BIT32(0)
 #define SXE2_TX_MODE_VEC_OFFLOAD  RTE_BIT32(1)
 #define SXE2_TX_MODE_VEC_SSE      RTE_BIT32(2)
+#define SXE2_TX_MODE_VEC_AVX2     RTE_BIT32(3)
 #define SXE2_TX_MODE_VEC_AVX512   RTE_BIT32(4)
 #define SXE2_TX_MODE_SIMPLE_BATCH RTE_BIT32(10)
 #define SXE2_TX_MODE_VEC_SET_MASK	(SXE2_TX_MODE_VEC_SIMPLE | \
 			SXE2_TX_MODE_VEC_OFFLOAD | SXE2_TX_MODE_VEC_SSE | \
-			SXE2_TX_MODE_VEC_AVX512)
+			SXE2_TX_MODE_VEC_AVX2 | SXE2_TX_MODE_VEC_AVX512)
 #define SXE2_TX_VEC_NO_SUPPORT_OFFLOAD (		  \
 			RTE_ETH_TX_OFFLOAD_MULTI_SEGS |		  \
 			RTE_ETH_TX_OFFLOAD_QINQ_INSERT |	  \
@@ -68,6 +70,12 @@ uint16_t sxe2_rx_pkts_scattered_vec_avx512(void *rx_queue,
 		struct rte_mbuf **rx_pkts, uint16_t nb_pkts);
 uint16_t sxe2_rx_pkts_scattered_vec_avx512_offload(void *rx_queue,
 		struct rte_mbuf **rx_pkts, uint16_t nb_pkts);
+uint16_t sxe2_tx_pkts_vec_avx2_simple(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts);
+uint16_t sxe2_tx_pkts_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts);
+uint16_t sxe2_rx_pkts_scattered_vec_avx2(void *rx_queue,
+		struct rte_mbuf **rx_pkts, uint16_t nb_pkts);
+uint16_t sxe2_rx_pkts_scattered_vec_avx2_offload(void *rx_queue,
+		struct rte_mbuf **rx_pkts, uint16_t nb_pkts);
 #endif
 int32_t __rte_cold sxe2_tx_vec_support_check(struct rte_eth_dev *dev, uint32_t *vec_flags);
 int32_t __rte_cold sxe2_tx_queues_vec_prepare(struct rte_eth_dev *dev);
diff --git a/drivers/net/sxe2/sxe2_txrx_vec_avx2.c b/drivers/net/sxe2/sxe2_txrx_vec_avx2.c
new file mode 100644
index 0000000000..72b09850b6
--- /dev/null
+++ b/drivers/net/sxe2/sxe2_txrx_vec_avx2.c
@@ -0,0 +1,776 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (C), 2025, Wuxi Stars Micro System Technologies Co., Ltd.
+ */
+
+#include <rte_vect.h>
+
+#include "sxe2_ethdev.h"
+#include "sxe2_common_log.h"
+#include "sxe2_queue.h"
+#include "sxe2_txrx_vec.h"
+#include "sxe2_txrx_vec_common.h"
+#include "sxe2_vsi.h"
+
+static inline void
+sxe2_tx_desc_fill_one_avx2(volatile union sxe2_tx_data_desc *desc, struct rte_mbuf *pkt,
+			   uint64_t desc_cmd, bool with_offloads)
+{
+	__m128i data_desc;
+	uint64_t desc_qw1;
+	uint32_t desc_offset;
+
+	desc_qw1 = (SXE2_TX_DESC_DTYPE_DATA |
+		   ((uint64_t)desc_cmd) << SXE2_TX_DATA_DESC_CMD_SHIFT |
+		   ((uint64_t)pkt->data_len) << SXE2_TX_DATA_DESC_BUF_SZ_SHIFT);
+
+	desc_offset = SXE2_TX_DATA_DESC_MACLEN_VAL(pkt->l2_len);
+	desc_qw1 |= ((uint64_t)desc_offset) << SXE2_TX_DATA_DESC_OFFSET_SHIFT;
+	if (with_offloads)
+		sxe2_tx_desc_fill_offloads(pkt, &desc_qw1);
+
+	data_desc = _mm_set_epi64x(desc_qw1, rte_pktmbuf_iova(pkt));
+	_mm_store_si128(RTE_CAST_PTR(__m128i *, desc), data_desc);
+}
+
+static __rte_always_inline void
+sxe2_tx_desc_fill_avx2(volatile union sxe2_tx_data_desc *desc, struct rte_mbuf **pkts,
+		       uint16_t pkts_num, uint64_t desc_cmd, bool with_offloads)
+{
+	__m256i desc_group0;
+	__m256i desc_group1;
+	uint64_t desc0_qw1;
+	uint64_t desc1_qw1;
+	uint64_t desc2_qw1;
+	uint64_t desc3_qw1;
+
+	const uint64_t desc_qw1_com = (SXE2_TX_DESC_DTYPE_DATA |
+					((uint64_t)desc_cmd) << SXE2_TX_DATA_DESC_CMD_SHIFT);
+	uint32_t desc_offset[4] = {0};
+
+	if (((uint64_t)desc & 0x1F) != 0 && pkts_num != 0) {
+		sxe2_tx_desc_fill_one_avx2(desc, *pkts, desc_cmd, with_offloads);
+		pkts_num--;
+		desc++;
+		pkts++;
+	}
+
+	while (pkts_num > 3) {
+		desc3_qw1 = (desc_qw1_com |
+					((uint64_t)pkts[3]->data_len)
+					<< SXE2_TX_DATA_DESC_BUF_SZ_SHIFT);
+
+		desc_offset[3] = SXE2_TX_DATA_DESC_MACLEN_VAL(pkts[3]->l2_len);
+		desc3_qw1 |= ((uint64_t)desc_offset[3]) << SXE2_TX_DATA_DESC_OFFSET_SHIFT;
+		if (with_offloads)
+			sxe2_tx_desc_fill_offloads(pkts[3], &desc3_qw1);
+
+		desc2_qw1 = (desc_qw1_com |
+					((uint64_t)pkts[2]->data_len)
+					<< SXE2_TX_DATA_DESC_BUF_SZ_SHIFT);
+		desc_offset[2] = SXE2_TX_DATA_DESC_MACLEN_VAL(pkts[2]->l2_len);
+		desc2_qw1 |= ((uint64_t)desc_offset[2]) << SXE2_TX_DATA_DESC_OFFSET_SHIFT;
+		if (with_offloads)
+			sxe2_tx_desc_fill_offloads(pkts[2], &desc2_qw1);
+
+		desc1_qw1 = (desc_qw1_com |
+					((uint64_t)pkts[1]->data_len)
+					<< SXE2_TX_DATA_DESC_BUF_SZ_SHIFT);
+		desc_offset[1] = SXE2_TX_DATA_DESC_MACLEN_VAL(pkts[1]->l2_len);
+		desc1_qw1 |= ((uint64_t)desc_offset[1]) << SXE2_TX_DATA_DESC_OFFSET_SHIFT;
+		if (with_offloads)
+			sxe2_tx_desc_fill_offloads(pkts[1], &desc1_qw1);
+
+		desc0_qw1 = (desc_qw1_com |
+					((uint64_t)pkts[0]->data_len)
+					<< SXE2_TX_DATA_DESC_BUF_SZ_SHIFT);
+		desc_offset[0] = SXE2_TX_DATA_DESC_MACLEN_VAL(pkts[0]->l2_len);
+		desc0_qw1 |= ((uint64_t)desc_offset[0]) << SXE2_TX_DATA_DESC_OFFSET_SHIFT;
+		if (with_offloads)
+			sxe2_tx_desc_fill_offloads(pkts[0], &desc0_qw1);
+
+		desc_group1 = _mm256_set_epi64x(desc3_qw1, rte_pktmbuf_iova(pkts[3]),
+					desc2_qw1, rte_pktmbuf_iova(pkts[2]));
+
+		desc_group0 = _mm256_set_epi64x(desc1_qw1, rte_pktmbuf_iova(pkts[1]),
+					desc0_qw1, rte_pktmbuf_iova(pkts[0]));
+
+		_mm256_store_si256(RTE_CAST_PTR(__m256i *, desc + 2), desc_group1);
+		_mm256_store_si256(RTE_CAST_PTR(__m256i *, desc), desc_group0);
+
+		pkts_num -= 4;
+		desc     += 4;
+		pkts     += 4;
+	}
+
+	while (pkts_num) {
+		sxe2_tx_desc_fill_one_avx2(desc, *pkts, desc_cmd, with_offloads);
+		pkts_num--;
+		desc++;
+		pkts++;
+	}
+}
+
+static __rte_always_inline uint16_t
+sxe2_tx_pkts_vec_avx2_batch(struct sxe2_tx_queue *txq, struct rte_mbuf **tx_pkts,
+			    uint16_t nb_pkts, bool with_offloads)
+{
+	volatile union sxe2_tx_data_desc *desc;
+	struct sxe2_tx_buffer *buffer;
+	uint16_t next_use;
+	uint16_t res_num;
+	uint16_t tx_num;
+
+	if (txq->desc_free_num < txq->free_thresh)
+		(void)sxe2_tx_bufs_free_vec(txq);
+
+	nb_pkts = RTE_MIN(txq->desc_free_num, nb_pkts);
+	if (unlikely(nb_pkts == 0)) {
+		PMD_LOG_DEBUG(TX, "Tx pkts avx2 batch: may not enough free desc, "
+				"free_desc=%u, need_tx_pkts=%u",
+				txq->desc_free_num, nb_pkts);
+		goto l_end;
+	}
+	tx_num = nb_pkts;
+
+	next_use = txq->next_use;
+	desc     = &txq->desc_ring[next_use];
+	buffer   = &txq->buffer_ring[next_use];
+
+	txq->desc_free_num -= nb_pkts;
+
+	res_num = txq->ring_depth - txq->next_use;
+
+	if (tx_num >= res_num) {
+		sxe2_tx_pkts_mbuf_fill(buffer, tx_pkts, res_num);
+
+		sxe2_tx_desc_fill_avx2(desc, tx_pkts, res_num,
+				SXE2_TX_DATA_DESC_CMD_EOP, with_offloads);
+		tx_pkts += (res_num - 1);
+		desc    += (res_num - 1);
+
+		sxe2_tx_desc_fill_one_avx2(desc, *tx_pkts++,
+				(SXE2_TX_DATA_DESC_CMD_EOP | SXE2_TX_DATA_DESC_CMD_RS),
+				with_offloads);
+
+		tx_num -= res_num;
+
+		next_use     = 0;
+		txq->next_rs = txq->rs_thresh - 1;
+		desc         = &txq->desc_ring[next_use];
+		buffer       = &txq->buffer_ring[next_use];
+	}
+
+	sxe2_tx_pkts_mbuf_fill(buffer, tx_pkts, tx_num);
+
+	sxe2_tx_desc_fill_avx2(desc, tx_pkts, tx_num,
+			SXE2_TX_DATA_DESC_CMD_EOP, with_offloads);
+
+	next_use += tx_num;
+	if (next_use > txq->next_rs) {
+		txq->desc_ring[txq->next_rs].read.type_cmd_off_bsz_l2t |=
+			rte_cpu_to_le_64(SXE2_TX_DATA_DESC_CMD_RS_MASK);
+
+		txq->next_rs += txq->rs_thresh;
+	}
+	txq->next_use = next_use;
+	SXE2_PCI_REG_WRITE_WC(txq->tdt_reg_addr, next_use);
+	PMD_LOG_DEBUG(TX, "port_id=%u queue_id=%u next_use=%u send_pkts=%u",
+			txq->port_id, txq->queue_id, next_use, nb_pkts);
+l_end:
+	return nb_pkts;
+}
+
+static __rte_always_inline uint16_t
+sxe2_tx_pkts_vec_avx2_common(struct sxe2_tx_queue *txq, struct rte_mbuf **tx_pkts,
+			     uint16_t nb_pkts, bool with_offloads)
+{
+	uint16_t tx_done_num = 0;
+	uint16_t tx_once_num;
+	uint16_t tx_need_num;
+
+	while (nb_pkts) {
+		tx_need_num = RTE_MIN(nb_pkts, txq->rs_thresh);
+		tx_once_num = sxe2_tx_pkts_vec_avx2_batch(txq,
+					tx_pkts + tx_done_num, tx_need_num, with_offloads);
+
+		nb_pkts     -= tx_once_num;
+		tx_done_num += tx_once_num;
+
+		if (tx_once_num < tx_need_num)
+			break;
+	}
+	return tx_done_num;
+}
+
+uint16_t sxe2_tx_pkts_vec_avx2_simple(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+	return sxe2_tx_pkts_vec_avx2_common(tx_queue, tx_pkts, nb_pkts, false);
+}
+
+uint16_t sxe2_tx_pkts_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+	return sxe2_tx_pkts_vec_avx2_common(tx_queue, tx_pkts, nb_pkts, true);
+}
+
+static inline void sxe2_rx_queue_rearm_avx2(struct sxe2_rx_queue *rxq)
+{
+	volatile union sxe2_rx_desc *desc;
+	struct rte_mbuf **buffer;
+	struct rte_mbuf *mbuf0, *mbuf1;
+	__m128i dma_addr0, dma_addr1;
+	__m128i virt_addr0, virt_addr1;
+	__m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM, RTE_PKTMBUF_HEADROOM);
+	int32_t ret;
+	uint16_t i;
+	uint16_t new_tail;
+
+	buffer = &rxq->buffer_ring[rxq->realloc_start];
+	desc   = &rxq->desc_ring[rxq->realloc_start];
+
+	ret = rte_mempool_get_bulk(rxq->mb_pool, (void *)buffer, SXE2_RX_REARM_THRESH_VEC);
+	if (ret != 0) {
+		if ((rxq->realloc_num + SXE2_RX_REARM_THRESH_VEC) >= rxq->ring_depth) {
+			dma_addr0 = _mm_setzero_si128();
+			for (i = 0; i < SXE2_RX_NUM_PER_LOOP_AVX; ++i) {
+				buffer[i] = &rxq->fake_mbuf;
+				_mm_store_si128(RTE_CAST_PTR(__m128i *, &desc[i].read), dma_addr0);
+			}
+		}
+
+		rxq->vsi->adapter->dev_info.dev_data->rx_mbuf_alloc_failed +=
+				SXE2_RX_REARM_THRESH_VEC;
+		return;
+	}
+
+	for (i = 0; i < SXE2_RX_REARM_THRESH_VEC; i += 2, buffer += 2) {
+		mbuf0 = buffer[0];
+		mbuf1 = buffer[1];
+#if RTE_IOVA_IN_MBUF
+
+		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+				 offsetof(struct rte_mbuf, buf_addr) + 8);
+#endif
+		virt_addr0 = _mm_loadu_si128((__m128i *)&mbuf0->buf_addr);
+		virt_addr1 = _mm_loadu_si128((__m128i *)&mbuf1->buf_addr);
+
+#if RTE_IOVA_IN_MBUF
+
+		dma_addr0 = _mm_unpackhi_epi64(virt_addr0, virt_addr0);
+		dma_addr1 = _mm_unpackhi_epi64(virt_addr1, virt_addr1);
+#else
+
+		dma_addr0 = _mm_unpacklo_epi64(virt_addr0, virt_addr0);
+		dma_addr1 = _mm_unpacklo_epi64(virt_addr1, virt_addr1);
+#endif
+
+		dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
+		dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
+
+		_mm_store_si128(RTE_CAST_PTR(__m128i *, &desc++->read), dma_addr0);
+		_mm_store_si128(RTE_CAST_PTR(__m128i *, &desc++->read), dma_addr1);
+	}
+
+	rxq->realloc_start += SXE2_RX_REARM_THRESH_VEC;
+	if (rxq->realloc_start >= rxq->ring_depth)
+		rxq->realloc_start = 0;
+	rxq->realloc_num -= SXE2_RX_REARM_THRESH_VEC;
+
+	new_tail = (rxq->realloc_start == 0) ?
+		(rxq->ring_depth - 1) : (rxq->realloc_start - 1);
+	SXE2_PCI_REG_WRITE_WC(rxq->rdt_reg_addr, new_tail);
+}
+
+static __rte_always_inline uint16_t
+sxe2_rx_pkts_common_vec_avx2(struct sxe2_rx_queue *rxq,
+			     struct rte_mbuf **rx_pkts, uint16_t nb_pkts, uint8_t *split_rxe_flags,
+			     uint8_t *umbcast_flags, bool do_offload)
+{
+	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
+	const __m256i mbuf_init   = _mm256_set_epi64x(0, 0, 0, rxq->mbuf_init_value);
+	struct rte_mbuf **buffer;
+	volatile union sxe2_rx_desc *desc;
+	__m256i mbufs6_7, mbufs4_5, mbufs2_3, mbufs0_1;
+	uint32_t bit_num;
+	uint16_t done_num;
+	uint16_t i = 0;
+	uint16_t j = 0;
+
+	buffer   = &rxq->buffer_ring[rxq->processing_idx];
+	desc     = &rxq->desc_ring[rxq->processing_idx];
+	done_num = 0;
+
+	rte_prefetch0(desc);
+
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, SXE2_RX_NUM_PER_LOOP_AVX);
+
+	if (rxq->realloc_num > SXE2_RX_REARM_THRESH_VEC)
+		sxe2_rx_queue_rearm_avx2(rxq);
+
+	if (0 == (rte_le_to_cpu_64(desc->wb.status_err_ptype_len) &
+				SXE2_RX_DESC_STATUS_DD_MASK))
+		goto l_end;
+
+	const __m256i crc_adjust =
+		_mm256_set_epi16(0, 0, 0, -rxq->crc_len,
+				 0, -rxq->crc_len, 0,
+				 0, 0, 0, 0,
+				 -rxq->crc_len, 0, -rxq->crc_len, 0, 0);
+
+	const __m256i dd_mask = _mm256_set1_epi32(1);
+	const __m256i rvp_shuf_mask =
+		_mm256_set_epi8(7, 6, 5, 4,
+				3, 2, 13, 12,
+				0xFF, 0xFF, 13, 12,
+				0xFF, 0xFF, 0xFF, 0xFF,
+				7, 6, 5, 4,
+				3, 2, 13, 12,
+				0xFF, 0xFF, 13, 12,
+				0xFF, 0xFF, 0xFF, 0xFF);
+
+	const __m128i eop_shuf_mask =
+		_mm_set_epi8(0xFF, 0xFF, 0xFF, 0xFF,
+			     0xFF, 0xFF, 0xFF, 0xFF,
+			     8, 0, 10, 2,
+			     12, 4, 14, 6);
+
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
+
+	for (i = 0; i < nb_pkts; i += SXE2_RX_NUM_PER_LOOP_AVX,
+				desc += SXE2_RX_NUM_PER_LOOP_AVX) {
+		_mm256_storeu_si256((void *)&rx_pkts[i],
+					_mm256_loadu_si256((void *)&buffer[i]));
+#ifdef RTE_ARCH_X86_64
+		_mm256_storeu_si256((void *)&rx_pkts[i + 4],
+					_mm256_loadu_si256((void *)&buffer[i + 4]));
+#endif
+
+		const __m128i desc7 = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, desc + 7));
+		rte_compiler_barrier();
+		const __m128i desc6 = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, desc + 6));
+		rte_compiler_barrier();
+		const __m128i desc5 = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, desc + 5));
+		rte_compiler_barrier();
+		const __m128i desc4 = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, desc + 4));
+		rte_compiler_barrier();
+		const __m128i desc3 = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, desc + 3));
+		rte_compiler_barrier();
+		const __m128i desc2 = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, desc + 2));
+		rte_compiler_barrier();
+		const __m128i desc1 = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, desc + 1));
+		rte_compiler_barrier();
+		const __m128i desc0 = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, desc + 0));
+
+		const __m256i descs6_7 =
+			_mm256_inserti128_si256(_mm256_castsi128_si256(desc6), desc7, 1);
+		const __m256i descs4_5 =
+			_mm256_inserti128_si256(_mm256_castsi128_si256(desc4), desc5, 1);
+		const __m256i descs2_3 =
+			_mm256_inserti128_si256(_mm256_castsi128_si256(desc2), desc3, 1);
+		const __m256i descs0_1 =
+			_mm256_inserti128_si256(_mm256_castsi128_si256(desc0), desc1, 1);
+
+		if (split_rxe_flags) {
+			for (j = 0; j < SXE2_RX_NUM_PER_LOOP_AVX; j++)
+				rte_mbuf_prefetch_part2(rx_pkts[i + j]);
+		}
+
+		mbufs6_7 = _mm256_shuffle_epi8(descs6_7, rvp_shuf_mask);
+		mbufs4_5 = _mm256_shuffle_epi8(descs4_5, rvp_shuf_mask);
+
+		mbufs6_7 = _mm256_add_epi16(mbufs6_7, crc_adjust);
+		mbufs4_5 = _mm256_add_epi16(mbufs4_5, crc_adjust);
+
+		const __m256i ptype_mask = _mm256_set1_epi32(SXE2_RX_DESC_PTYPE_MASK);
+
+		const __m256i staterrs4_7 = _mm256_unpackhi_epi32(descs6_7, descs4_5);
+
+		__m256i ptypes4_7 = _mm256_and_si256(staterrs4_7, ptype_mask);
+
+		const uint16_t ptype7 = _mm256_extract_epi16(ptypes4_7, 9);
+		const uint16_t ptype6 = _mm256_extract_epi16(ptypes4_7, 1);
+		const uint16_t ptype5 = _mm256_extract_epi16(ptypes4_7, 11);
+		const uint16_t ptype4 = _mm256_extract_epi16(ptypes4_7, 3);
+
+		mbufs6_7 = _mm256_insert_epi32(mbufs6_7, ptype_tbl[ptype7], 4);
+		mbufs6_7 = _mm256_insert_epi32(mbufs6_7, ptype_tbl[ptype6], 0);
+		mbufs4_5 = _mm256_insert_epi32(mbufs4_5, ptype_tbl[ptype5], 4);
+		mbufs4_5 = _mm256_insert_epi32(mbufs4_5, ptype_tbl[ptype4], 0);
+
+		mbufs2_3 = _mm256_shuffle_epi8(descs2_3, rvp_shuf_mask);
+		mbufs0_1 = _mm256_shuffle_epi8(descs0_1, rvp_shuf_mask);
+
+		mbufs2_3 = _mm256_add_epi16(mbufs2_3, crc_adjust);
+		mbufs0_1 = _mm256_add_epi16(mbufs0_1, crc_adjust);
+
+		const __m256i staterrs0_3 = _mm256_unpackhi_epi32(descs2_3, descs0_1);
+
+		__m256i ptypes0_3 = _mm256_and_si256(staterrs0_3, ptype_mask);
+
+		const uint16_t ptype3 = _mm256_extract_epi16(ptypes0_3, 9);
+		const uint16_t ptype2 = _mm256_extract_epi16(ptypes0_3, 1);
+		const uint16_t ptype1 = _mm256_extract_epi16(ptypes0_3, 11);
+		const uint16_t ptype0 = _mm256_extract_epi16(ptypes0_3, 3);
+
+		mbufs2_3 = _mm256_insert_epi32(mbufs2_3, ptype_tbl[ptype3], 4);
+		mbufs2_3 = _mm256_insert_epi32(mbufs2_3, ptype_tbl[ptype2], 0);
+		mbufs0_1 = _mm256_insert_epi32(mbufs0_1, ptype_tbl[ptype1], 4);
+		mbufs0_1 = _mm256_insert_epi32(mbufs0_1, ptype_tbl[ptype0], 0);
+
+		__m256i staterrs0_7 = _mm256_unpacklo_epi64(staterrs4_7, staterrs0_3);
+
+		__m256i stu_len0_7 = _mm256_unpackhi_epi64(staterrs4_7, staterrs0_3);
+		__m256i mbuf_flags = _mm256_setzero_si256();
+
+		if (do_offload) {
+			const __m256i desc_flags_mask = _mm256_set1_epi32(0x00001C04);
+			const __m256i desc_flags_rss_mask = _mm256_set1_epi32(0x20000000);
+			const __m256i vlan_flags =
+				_mm256_set_epi8
+				(0, 0, 0, 0, 0, 0, 0, 0,
+				 0, 0, 0,
+				 RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
+				 0, 0, 0, 0,
+				 0, 0, 0, 0, 0, 0, 0, 0,
+				 0, 0, 0,
+				 RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
+				 0, 0, 0, 0);
+
+			const __m256i rss_flags =
+				_mm256_set_epi8
+				(0, 0, 0, 0, 0, 0, 0, 0,
+				 0, 0, 0, RTE_MBUF_F_RX_RSS_HASH, 0, 0, 0, 0,
+				 0, 0, 0, 0, 0, 0, 0, 0,
+				 0, 0, 0, RTE_MBUF_F_RX_RSS_HASH, 0, 0, 0, 0);
+
+			const __m256i cksum_flags =
+				_mm256_set_epi8
+				(0, 0, 0, 0, 0, 0, 0, 0,
+				 ((RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+					RTE_MBUF_F_RX_L4_CKSUM_BAD |
+					RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1),
+				 ((RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+					RTE_MBUF_F_RX_L4_CKSUM_BAD |
+					RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1),
+				 ((RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+					RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+					RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1),
+				 ((RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+					RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+					RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1),
+				 ((RTE_MBUF_F_RX_L4_CKSUM_BAD |
+					RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1),
+				 ((RTE_MBUF_F_RX_L4_CKSUM_BAD |
+					RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1),
+				 ((RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+					RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1),
+				 ((RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+					RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1),
+
+				 0, 0, 0, 0, 0, 0, 0, 0,
+				 ((RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+					RTE_MBUF_F_RX_L4_CKSUM_BAD |
+					RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1),
+				 ((RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+					RTE_MBUF_F_RX_L4_CKSUM_BAD |
+					RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1),
+				 ((RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+					RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+					RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1),
+				 ((RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+					RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+					RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1),
+				 ((RTE_MBUF_F_RX_L4_CKSUM_BAD |
+					RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1),
+				 ((RTE_MBUF_F_RX_L4_CKSUM_BAD |
+					RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1),
+				 ((RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+					RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1),
+				 ((RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+					RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1));
+
+			const __m256i cksum_mask =
+				_mm256_set1_epi32
+				(RTE_MBUF_F_RX_IP_CKSUM_MASK |
+				RTE_MBUF_F_RX_L4_CKSUM_MASK |
+				RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK |
+				RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD);
+			const __m256i vlan_mask =
+				_mm256_set1_epi32
+				(RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED);
+
+			__m256i tmp_flags;
+			__m256i descs_flags = _mm256_and_si256(staterrs0_7, desc_flags_mask);
+			stu_len0_7 = _mm256_and_si256(stu_len0_7, desc_flags_rss_mask);
+
+			tmp_flags = _mm256_shuffle_epi8(vlan_flags, descs_flags);
+			mbuf_flags = _mm256_and_si256(tmp_flags, vlan_mask);
+
+			descs_flags = _mm256_srli_epi32(descs_flags, 10);
+			tmp_flags = _mm256_shuffle_epi8(cksum_flags, descs_flags);
+			tmp_flags = _mm256_slli_epi32(tmp_flags, 1);
+			tmp_flags = _mm256_and_si256(tmp_flags, cksum_mask);
+			mbuf_flags = _mm256_or_si256(mbuf_flags, tmp_flags);
+
+			descs_flags = _mm256_srli_epi32(stu_len0_7, 27);
+			tmp_flags = _mm256_shuffle_epi8(rss_flags, descs_flags);
+			mbuf_flags = _mm256_or_si256(mbuf_flags, tmp_flags);
+
+#ifndef RTE_LIBRTE_SXE2_16BYTE_RX_DESC
+
+			if (rxq->fnav_enable) {
+				__m256i fnav_vld0_3, fnav_vld4_7;
+				__m256i fnav_vld0_7;
+				__m256i v_zeros, v_ffff, v_u32_one;
+				const __m256i fdir_flags =
+					_mm256_set1_epi32
+					(RTE_MBUF_F_RX_FDIR | RTE_MBUF_F_RX_FDIR_ID);
+				fnav_vld0_3 = _mm256_unpacklo_epi32(descs2_3, descs0_1);
+				fnav_vld4_7 = _mm256_unpacklo_epi32(descs6_7, descs4_5);
+
+				fnav_vld0_7 = _mm256_unpacklo_epi64(fnav_vld4_7, fnav_vld0_3);
+
+				fnav_vld0_7 = _mm256_slli_epi32(fnav_vld0_7, 26);
+				fnav_vld0_7 = _mm256_srli_epi32(fnav_vld0_7, 31);
+
+				v_zeros = _mm256_setzero_si256();
+				v_ffff = _mm256_cmpeq_epi32(v_zeros, v_zeros);
+				v_u32_one = _mm256_srli_epi32(v_ffff, 31);
+
+				tmp_flags = _mm256_cmpeq_epi32(fnav_vld0_7, v_u32_one);
+
+				tmp_flags = _mm256_and_si256(tmp_flags, fdir_flags);
+
+				mbuf_flags = _mm256_or_si256(mbuf_flags, tmp_flags);
+
+				rx_pkts[i + 0]->hash.fdir.hi = desc[0].wb.fd_filter_id;
+				rx_pkts[i + 1]->hash.fdir.hi = desc[1].wb.fd_filter_id;
+				rx_pkts[i + 2]->hash.fdir.hi = desc[2].wb.fd_filter_id;
+				rx_pkts[i + 3]->hash.fdir.hi = desc[3].wb.fd_filter_id;
+				rx_pkts[i + 4]->hash.fdir.hi = desc[4].wb.fd_filter_id;
+				rx_pkts[i + 5]->hash.fdir.hi = desc[5].wb.fd_filter_id;
+				rx_pkts[i + 6]->hash.fdir.hi = desc[6].wb.fd_filter_id;
+				rx_pkts[i + 7]->hash.fdir.hi = desc[7].wb.fd_filter_id;
+			}
+#endif
+		}
+
+		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
+				 offsetof(struct rte_mbuf, rearm_data) + 8);
+		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rx_descriptor_fields1) !=
+				 offsetof(struct rte_mbuf, rearm_data) + 16);
+		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
+				 RTE_ALIGN(offsetof(struct rte_mbuf, rearm_data), 16));
+
+		__m256i rearm_arr[8];
+
+		rearm_arr[6] = _mm256_blend_epi32(mbuf_init, _mm256_slli_si256(mbuf_flags, 8), 4);
+		rearm_arr[4] = _mm256_blend_epi32(mbuf_init, _mm256_slli_si256(mbuf_flags, 4), 4);
+		rearm_arr[2] = _mm256_blend_epi32(mbuf_init, mbuf_flags, 4);
+		rearm_arr[0] = _mm256_blend_epi32(mbuf_init, _mm256_srli_si256(mbuf_flags, 4), 4);
+
+		rearm_arr[6] = _mm256_permute2f128_si256(rearm_arr[6], mbufs6_7, 0x20);
+		rearm_arr[4] = _mm256_permute2f128_si256(rearm_arr[4], mbufs4_5, 0x20);
+		rearm_arr[2] = _mm256_permute2f128_si256(rearm_arr[2], mbufs2_3, 0x20);
+		rearm_arr[0] = _mm256_permute2f128_si256(rearm_arr[0], mbufs0_1, 0x20);
+
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data, rearm_arr[6]);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data, rearm_arr[4]);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data, rearm_arr[2]);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data, rearm_arr[0]);
+
+		const __m256i tmp_mbuf_flags =
+			_mm256_castsi128_si256(_mm256_extracti128_si256(mbuf_flags, 1));
+
+		rearm_arr[7] =
+			_mm256_blend_epi32(mbuf_init, _mm256_slli_si256(tmp_mbuf_flags, 8), 4);
+		rearm_arr[5] =
+			_mm256_blend_epi32(mbuf_init, _mm256_slli_si256(tmp_mbuf_flags, 4), 4);
+		rearm_arr[3] =
+			_mm256_blend_epi32(mbuf_init, tmp_mbuf_flags, 4);
+		rearm_arr[1] =
+			_mm256_blend_epi32(mbuf_init, _mm256_srli_si256(tmp_mbuf_flags, 4), 4);
+
+		rearm_arr[7] = _mm256_blend_epi32(rearm_arr[7], mbufs6_7, 0XF0);
+		rearm_arr[5] = _mm256_blend_epi32(rearm_arr[5], mbufs4_5, 0XF0);
+		rearm_arr[3] = _mm256_blend_epi32(rearm_arr[3], mbufs2_3, 0XF0);
+		rearm_arr[1] = _mm256_blend_epi32(rearm_arr[1], mbufs0_1, 0XF0);
+
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data, rearm_arr[7]);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data, rearm_arr[5]);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data, rearm_arr[3]);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data, rearm_arr[1]);
+
+		if (umbcast_flags != NULL) {
+			const __m256i umbcast_mask =
+					_mm256_set1_epi32(SXE2_RX_DESC_STATUS_UMBCAST_MASK);
+			__m256i umbcast_bits_256 = _mm256_and_si256(staterrs0_7,
+								    umbcast_mask);
+
+			umbcast_bits_256 = _mm256_srli_epi32(umbcast_bits_256, 24);
+
+			__m128i umbcast_bits_128 = _mm_packs_epi32
+							(_mm256_castsi256_si128(umbcast_bits_256),
+							 _mm256_extractf128_si256
+								(umbcast_bits_256, 1));
+
+			umbcast_bits_128 = _mm_shuffle_epi8(umbcast_bits_128, eop_shuf_mask);
+
+			*(uint64_t *)umbcast_flags = _mm_cvtsi128_si64(umbcast_bits_128);
+			umbcast_flags += SXE2_RX_NUM_PER_LOOP_AVX;
+		}
+
+		if (split_rxe_flags != NULL) {
+			const __m256i eop_rxe_mask = _mm256_set1_epi32
+							(SXE2_RX_DESC_STATUS_EOP_MASK |
+							 SXE2_RX_DESC_ERROR_RXE_MASK |
+							 SXE2_RX_DESC_ERROR_OVERSIZE_MASK);
+
+			const __m128i eop_mask_128 = _mm_set1_epi16(SXE2_RX_DESC_STATUS_EOP_MASK);
+			const __m128i rxe_mask_128 = _mm_set1_epi16(SXE2_RX_DESC_ERROR_RXE_MASK |
+					SXE2_RX_DESC_ERROR_OVERSIZE_MASK);
+
+			const __m256i tmp_stats = _mm256_and_si256(staterrs0_7, eop_rxe_mask);
+
+			const __m128i eop_rxe_bits = _mm_packs_epi32
+							(_mm256_castsi256_si128(tmp_stats),
+							 _mm256_extractf128_si256(tmp_stats, 1));
+
+			__m128i not_eop_bits = _mm_andnot_si128(eop_rxe_bits, eop_mask_128);
+
+			not_eop_bits = _mm_or_si128
+					(not_eop_bits,
+					 _mm_srli_epi16
+					(_mm_and_si128(eop_rxe_bits, rxe_mask_128),
+					7));
+
+			not_eop_bits = _mm_shuffle_epi8(not_eop_bits, eop_shuf_mask);
+
+			*(uint64_t *)split_rxe_flags = _mm_cvtsi128_si64(not_eop_bits);
+			split_rxe_flags += SXE2_RX_NUM_PER_LOOP_AVX;
+		}
+
+		staterrs0_7 = _mm256_and_si256(staterrs0_7, dd_mask);
+
+		staterrs0_7 = _mm256_packs_epi32(staterrs0_7, _mm256_setzero_si256());
+		bit_num = rte_popcount64
+				(_mm_cvtsi128_si64(_mm256_extracti128_si256(staterrs0_7, 1)));
+		bit_num += rte_popcount64
+				(_mm_cvtsi128_si64(_mm256_castsi256_si128(staterrs0_7)));
+
+		done_num += bit_num;
+
+		if (bit_num != SXE2_RX_NUM_PER_LOOP_AVX)
+			break;
+	}
+
+	rxq->processing_idx += done_num;
+	rxq->processing_idx &= (rxq->ring_depth - 1);
+	if ((1 == (rxq->processing_idx & 1)) && done_num > 1) {
+		rxq->processing_idx--;
+		done_num--;
+	}
+	rxq->realloc_num     += done_num;
+
+l_end:
+	PMD_LOG_DEBUG(RX, "port_id=%u queue_id=%u last_id=%u recv_pkts=%d",
+		rxq->port_id, rxq->queue_id, rxq->processing_idx, done_num);
+	return done_num;
+}
+
+static __rte_always_inline uint16_t
+sxe2_rx_pkts_scattered_batch_vec_avx2(struct sxe2_rx_queue *rxq, struct rte_mbuf **rx_pkts,
+		uint16_t nb_pkts, bool do_offload)
+{
+	const uint64_t *split_rxe_flags64;
+	uint8_t split_rxe_flags[SXE2_RX_PKTS_BURST_BATCH_NUM_VEC] = {0};
+	uint8_t umbcast_flags[SXE2_RX_PKTS_BURST_BATCH_NUM_VEC] = {0};
+	uint16_t rx_done_num;
+	uint16_t rx_pkt_done_num;
+
+	rx_pkt_done_num = 0;
+
+	if (rxq->vsi->adapter->devargs.sw_stats_en) {
+		rx_done_num = sxe2_rx_pkts_common_vec_avx2(rxq, rx_pkts, nb_pkts,
+				split_rxe_flags, umbcast_flags, do_offload);
+	} else {
+		rx_done_num = sxe2_rx_pkts_common_vec_avx2(rxq, rx_pkts, nb_pkts,
+				split_rxe_flags, NULL, do_offload);
+	}
+	if (rx_done_num == 0)
+		goto l_end;
+
+	if (!rxq->vsi->adapter->devargs.sw_stats_en) {
+		split_rxe_flags64 = (uint64_t *)split_rxe_flags;
+
+		if (rxq->pkt_first_seg == NULL &&
+				split_rxe_flags64[0] == 0 && split_rxe_flags64[1] == 0 &&
+				split_rxe_flags64[2] == 0 && split_rxe_flags64[3] == 0) {
+			rx_pkt_done_num = rx_done_num;
+			goto l_end;
+		}
+
+		if (rxq->pkt_first_seg == NULL) {
+			while (rx_pkt_done_num < rx_done_num &&
+					split_rxe_flags[rx_pkt_done_num] == 0)
+				rx_pkt_done_num++;
+
+			if (rx_pkt_done_num == rx_done_num)
+				goto l_end;
+
+			rxq->pkt_first_seg = rx_pkts[rx_pkt_done_num];
+		}
+	}
+
+	rx_pkt_done_num += sxe2_rx_pkts_refactor(rxq, &rx_pkts[rx_pkt_done_num],
+			rx_done_num - rx_pkt_done_num, &split_rxe_flags[rx_pkt_done_num],
+			&umbcast_flags[rx_pkt_done_num]);
+
+l_end:
+	return rx_pkt_done_num;
+}
+
+static __rte_always_inline uint16_t
+sxe2_rx_pkts_scattered_common_vec_avx2(struct sxe2_rx_queue *rxq, struct rte_mbuf **rx_pkts,
+		uint16_t nb_pkts, bool do_offload)
+{
+	uint16_t done_num = 0;
+	uint16_t once_num;
+
+	while (nb_pkts > SXE2_RX_PKTS_BURST_BATCH_NUM) {
+		once_num =
+		sxe2_rx_pkts_scattered_batch_vec_avx2(rxq,
+						      rx_pkts + done_num,
+						      SXE2_RX_PKTS_BURST_BATCH_NUM,
+						      do_offload);
+		done_num += once_num;
+		nb_pkts -= once_num;
+		if (once_num < SXE2_RX_PKTS_BURST_BATCH_NUM)
+			goto l_end;
+	}
+
+	done_num += sxe2_rx_pkts_scattered_batch_vec_avx2(rxq,
+			rx_pkts + done_num, nb_pkts, do_offload);
+l_end:
+	return done_num;
+}
+
+uint16_t sxe2_rx_pkts_scattered_vec_avx2(void *rx_queue,
+		struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	return sxe2_rx_pkts_scattered_common_vec_avx2(rx_queue,
+			rx_pkts, nb_pkts, false);
+}
+
+uint16_t sxe2_rx_pkts_scattered_vec_avx2_offload(void *rx_queue,
+		struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	return sxe2_rx_pkts_scattered_common_vec_avx2(rx_queue,
+			rx_pkts, nb_pkts, true);
+}
-- 
2.52.0


^ permalink raw reply related

* [PATCH v2 04/20] net/sxe2: support L2 filtering and MAC config
From: liujie5 @ 2026-06-14  9:23 UTC (permalink / raw)
  To: stephen; +Cc: dev, Jie Liu
In-Reply-To: <20260614092328.201826-1-liujie5@linkdatatechnology.com>

From: Jie Liu <liujie5@linkdatatechnology.com>

- Support primary/secondary MAC address setup.
- Enable L2 broadcast/multicast filter bits.
- Add multicast address update logic.

Signed-off-by: Jie Liu <liujie5@linkdatatechnology.com>
---
 drivers/net/sxe2/meson.build      |   1 +
 drivers/net/sxe2/sxe2_cmd_chnl.c  | 198 ++++++++
 drivers/net/sxe2/sxe2_cmd_chnl.h  |  17 +
 drivers/net/sxe2/sxe2_drv_cmd.h   |  87 ++++
 drivers/net/sxe2/sxe2_ethdev.c    |  70 ++-
 drivers/net/sxe2/sxe2_ethdev.h    |  43 +-
 drivers/net/sxe2/sxe2_filter.c    | 782 ++++++++++++++++++++++++++++++
 drivers/net/sxe2/sxe2_filter.h    |  98 ++++
 drivers/net/sxe2/sxe2_mac.c       | 432 +++++++++++++++++
 drivers/net/sxe2/sxe2_mac.h       |  34 ++
 drivers/net/sxe2/sxe2_txrx_poll.c |  49 ++
 11 files changed, 1805 insertions(+), 6 deletions(-)
 create mode 100644 drivers/net/sxe2/sxe2_filter.c
 create mode 100644 drivers/net/sxe2/sxe2_filter.h

diff --git a/drivers/net/sxe2/meson.build b/drivers/net/sxe2/meson.build
index b14b5120c1..b661e3cbf4 100644
--- a/drivers/net/sxe2/meson.build
+++ b/drivers/net/sxe2/meson.build
@@ -61,4 +61,5 @@ sources += files(
         'sxe2_txrx.c',
         'sxe2_txrx_vec.c',
         'sxe2_mac.c',
+        'sxe2_filter.c',
 )
diff --git a/drivers/net/sxe2/sxe2_cmd_chnl.c b/drivers/net/sxe2/sxe2_cmd_chnl.c
index 07eeb7f38c..1fa9ad718e 100644
--- a/drivers/net/sxe2/sxe2_cmd_chnl.c
+++ b/drivers/net/sxe2/sxe2_cmd_chnl.c
@@ -343,3 +343,201 @@ int32_t sxe2_drv_mac_link_status_get(struct sxe2_adapter *adapter)
 l_end:
 	return ret;
 }
+
+int32_t sxe2_drv_promisc_config(struct sxe2_adapter *adapter, bool set)
+{
+	int32_t ret = 0;
+	struct sxe2_common_device *cdev = adapter->cdev;
+	struct sxe2_drv_cmd_params param = {0};
+	struct sxe2_promisc_filter_cfg_req promisc_filter_cfg_req = {0};
+
+	promisc_filter_cfg_req.vsi_id = adapter->vsi_ctxt.dpdk_vsi_id;
+	promisc_filter_cfg_req.is_add = set;
+	promisc_filter_cfg_req.type = SXE2_PROMISC_FILTER_TYPE_PROMISC;
+
+	sxe2_drv_cmd_params_fill(adapter, &param, SXE2_DRV_CMD_PROMISC_CFG,
+				 &promisc_filter_cfg_req,
+				 sizeof(promisc_filter_cfg_req),
+				 NULL, 0);
+
+	ret = sxe2_drv_cmd_exec(cdev, &param);
+	if (ret)
+		PMD_DEV_LOG_WARN(adapter, DRV, "promic config failed, ret=%d", ret);
+
+	return ret;
+}
+
+int32_t sxe2_drv_allmulti_config(struct sxe2_adapter *adapter, bool set)
+{
+	int32_t ret = 0;
+	struct sxe2_common_device *cdev = adapter->cdev;
+	struct sxe2_drv_cmd_params param = {0};
+	struct sxe2_promisc_filter_cfg_req promisc_filter_cfg_req = {0};
+
+	promisc_filter_cfg_req.vsi_id = adapter->vsi_ctxt.dpdk_vsi_id;
+	promisc_filter_cfg_req.is_add = set;
+	promisc_filter_cfg_req.type = SXE2_PROMISC_FILTER_TYPE_ALLMULTI;
+
+	sxe2_drv_cmd_params_fill(adapter, &param, SXE2_DRV_CMD_ALLMULTI_CFG,
+				 &promisc_filter_cfg_req,
+				 sizeof(promisc_filter_cfg_req),
+				 NULL, 0);
+
+	ret = sxe2_drv_cmd_exec(cdev, &param);
+	if (ret)
+		PMD_DEV_LOG_WARN(adapter, DRV, "allmulti config failed, ret=%d", ret);
+
+	return ret;
+}
+
+int32_t sxe2_drv_uc_config(struct sxe2_adapter *adapter, struct rte_ether_addr *addr, bool add)
+{
+	int32_t ret = 0;
+	int32_t i;
+	struct sxe2_common_device *cdev = adapter->cdev;
+	struct sxe2_drv_cmd_params param = {0};
+	struct sxe2_mac_filter_cfg_req mac_filter_cfg_req = {0};
+
+	mac_filter_cfg_req.vsi_id = adapter->vsi_ctxt.dpdk_vsi_id;
+	for (i = 0; i < SXE2_ETH_ALEN; i++)
+		mac_filter_cfg_req.addr[i] = addr->addr_bytes[i];
+	mac_filter_cfg_req.is_add = add;
+	mac_filter_cfg_req.type = SXE2_MAC_FILTER_TYPE_UC;
+
+	sxe2_drv_cmd_params_fill(adapter, &param, SXE2_DRV_CMD_MAC_ADDR_UC,
+		 &mac_filter_cfg_req, sizeof(mac_filter_cfg_req),
+		 NULL, 0);
+
+	ret = sxe2_drv_cmd_exec(cdev, &param);
+	if (ret)
+		PMD_DEV_LOG_WARN(adapter, DRV, "uc config query failed, ret=%d", ret);
+
+	return ret;
+}
+
+int32_t sxe2_drv_mc_config(struct sxe2_adapter *adapter, struct rte_ether_addr *addr, bool add)
+{
+	int32_t ret = 0;
+	int32_t i;
+	struct sxe2_common_device *cdev = adapter->cdev;
+	struct sxe2_drv_cmd_params param = {0};
+	struct sxe2_mac_filter_cfg_req mac_filter_cfg_req = {0};
+
+	mac_filter_cfg_req.vsi_id = adapter->vsi_ctxt.dpdk_vsi_id;
+	for (i = 0; i < SXE2_ETH_ALEN; i++)
+		mac_filter_cfg_req.addr[i] = addr->addr_bytes[i];
+
+	mac_filter_cfg_req.is_add = add;
+	mac_filter_cfg_req.type = SXE2_MAC_FILTER_TYPE_MC;
+
+	sxe2_drv_cmd_params_fill(adapter, &param, SXE2_DRV_CMD_MAC_ADDR_MC,
+		 &mac_filter_cfg_req, sizeof(mac_filter_cfg_req),
+		 NULL, 0);
+
+	ret = sxe2_drv_cmd_exec(cdev, &param);
+	if (ret)
+		PMD_DEV_LOG_WARN(adapter, DRV, "mac config query failed, ret=%d", ret);
+
+	return ret;
+}
+
+int32_t sxe2_drv_vlan_config_query(struct sxe2_adapter *adapter)
+{
+	int32_t ret = 0;
+	struct sxe2_common_device *cdev = adapter->cdev;
+	struct sxe2_drv_cmd_params param = {0};
+	struct sxe2_drv_vlan_cfg_query_resp vlan_cfg_query_resp = {0};
+
+	sxe2_drv_cmd_params_fill(adapter, &param, SXE2_DRV_CMD_VLAN_CFG_QUERY,
+				 NULL, 0,
+				 &vlan_cfg_query_resp,
+	 sizeof(vlan_cfg_query_resp));
+
+	ret = sxe2_drv_cmd_exec(cdev, &param);
+	if (ret)
+		PMD_DEV_LOG_WARN(adapter, DRV, "vlan config query failed, ret=%d", ret);
+
+	adapter->filter_ctxt.vlan_info.port_vlan_exist = vlan_cfg_query_resp.port_vlan_exist;
+	adapter->filter_ctxt.vlan_info.is_switchdev = vlan_cfg_query_resp.is_switchdev;
+
+
+	adapter->filter_ctxt.vlan_info.tpid = vlan_cfg_query_resp.tpid;
+	adapter->filter_ctxt.vlan_info.vid = vlan_cfg_query_resp.vid;
+
+	adapter->filter_ctxt.vlan_info.outer_insert = vlan_cfg_query_resp.outer_insert;
+	adapter->filter_ctxt.vlan_info.outer_strip = vlan_cfg_query_resp.outer_strip;
+	adapter->filter_ctxt.vlan_info.inner_insert = vlan_cfg_query_resp.inner_insert;
+	adapter->filter_ctxt.vlan_info.inner_strip = vlan_cfg_query_resp.inner_strip;
+
+	return ret;
+}
+
+int32_t sxe2_drv_vlan_filter_id_config(struct sxe2_adapter *adapter,
+				       struct sxe2_vlan *vlan, bool on)
+{
+	int32_t ret = 0;
+	struct sxe2_common_device *cdev = adapter->cdev;
+	struct sxe2_drv_cmd_params param = {0};
+	struct sxe2_vlan_filter_cfg_req vlan_filter_cfg_req = {0};
+
+	vlan_filter_cfg_req.vsi_id = adapter->vsi_ctxt.dpdk_vsi_id;
+	vlan_filter_cfg_req.tpid_id = vlan->tpid;
+	vlan_filter_cfg_req.vlan_id = vlan->vid;
+	vlan_filter_cfg_req.prio = vlan->prio;
+	vlan_filter_cfg_req.is_add = on;
+
+	sxe2_drv_cmd_params_fill(adapter, &param, SXE2_DRV_CMD_VLAN_FILTER_ADD_DEL,
+				 &vlan_filter_cfg_req, sizeof(vlan_filter_cfg_req),
+				 NULL, 0);
+	ret = sxe2_drv_cmd_exec(cdev, &param);
+	if (ret)
+		PMD_DEV_LOG_WARN(adapter, DRV, "vlan config failed, ret=%d", ret);
+
+	return ret;
+}
+
+int32_t sxe2_drv_vlan_insert_strip_cfg(struct sxe2_adapter *adapter)
+{
+	int32_t ret = 0;
+	struct sxe2_common_device *cdev = adapter->cdev;
+	struct sxe2_drv_cmd_params param = {0};
+	struct sxe2_drv_vlan_offload_cfg_req vlan_offload_cfg_req = {0};
+
+	vlan_offload_cfg_req.vsi_id = adapter->vsi_ctxt.dpdk_vsi_id;
+	vlan_offload_cfg_req.tpid = adapter->filter_ctxt.vlan_info.tpid;
+	vlan_offload_cfg_req.outer_insert = adapter->filter_ctxt.vlan_info.outer_insert;
+	vlan_offload_cfg_req.outer_strip = adapter->filter_ctxt.vlan_info.outer_strip;
+	vlan_offload_cfg_req.inner_insert = adapter->filter_ctxt.vlan_info.inner_insert;
+	vlan_offload_cfg_req.inner_strip = adapter->filter_ctxt.vlan_info.inner_strip;
+
+	sxe2_drv_cmd_params_fill(adapter, &param, SXE2_DRV_CMD_VLAN_OFFLOAD_CFG,
+				 &vlan_offload_cfg_req,
+				 sizeof(vlan_offload_cfg_req),
+				 NULL, 0);
+	ret = sxe2_drv_cmd_exec(cdev, &param);
+	if (ret)
+		PMD_DEV_LOG_WARN(adapter, DRV, "vlan config query failed, ret=%d", ret);
+
+	return ret;
+}
+
+int32_t sxe2_drv_vlan_filter_switch(struct sxe2_adapter *adapter, bool on)
+{
+	int32_t ret = 0;
+	struct sxe2_common_device *cdev = adapter->cdev;
+	struct sxe2_drv_cmd_params param = {0};
+	struct sxe2_vlan_filter_switch_req vlan_filter_switch_req = {0};
+
+	vlan_filter_switch_req.vsi_id = adapter->vsi_ctxt.dpdk_vsi_id;
+	vlan_filter_switch_req.is_oper_enable = on;
+
+	sxe2_drv_cmd_params_fill(adapter, &param, SXE2_DRV_CMD_VLAN_FILTER_SWITCH,
+				 &vlan_filter_switch_req,
+				 sizeof(vlan_filter_switch_req),
+				 NULL, 0);
+	ret = sxe2_drv_cmd_exec(cdev, &param);
+	if (ret)
+		PMD_DEV_LOG_WARN(adapter, DRV, "vlan config filter failed, ret=%d", ret);
+
+	return ret;
+}
diff --git a/drivers/net/sxe2/sxe2_cmd_chnl.h b/drivers/net/sxe2/sxe2_cmd_chnl.h
index cda676ed97..c93bc2b0c9 100644
--- a/drivers/net/sxe2/sxe2_cmd_chnl.h
+++ b/drivers/net/sxe2/sxe2_cmd_chnl.h
@@ -36,4 +36,21 @@ int32_t sxe2_drv_txq_ctxt_cfg(struct sxe2_adapter *adapter,
 
 int32_t sxe2_drv_mac_link_status_get(struct sxe2_adapter *adapter);
 
+int32_t sxe2_drv_promisc_config(struct sxe2_adapter *adapter, bool set);
+
+int32_t sxe2_drv_allmulti_config(struct sxe2_adapter *adapter, bool set);
+
+int32_t sxe2_drv_uc_config(struct sxe2_adapter *adapter, struct rte_ether_addr *addr, bool add);
+
+int32_t sxe2_drv_mc_config(struct sxe2_adapter *adapter, struct rte_ether_addr *addr, bool add);
+
+int32_t sxe2_drv_vlan_config_query(struct sxe2_adapter *adapter);
+
+int32_t sxe2_drv_vlan_filter_id_config(struct sxe2_adapter *adapter,
+				       struct sxe2_vlan *vlan, bool on);
+
+int32_t sxe2_drv_vlan_insert_strip_cfg(struct sxe2_adapter *adapter);
+
+int32_t sxe2_drv_vlan_filter_switch(struct sxe2_adapter *adapter, bool on);
+
 #endif /* SXE2_CMD_CHNL_H */
diff --git a/drivers/net/sxe2/sxe2_drv_cmd.h b/drivers/net/sxe2/sxe2_drv_cmd.h
index a0f08b5184..d69d650148 100644
--- a/drivers/net/sxe2/sxe2_drv_cmd.h
+++ b/drivers/net/sxe2/sxe2_drv_cmd.h
@@ -233,6 +233,93 @@ struct __rte_aligned(4) __rte_packed_begin sxe2_drv_link_info_resp {
 	uint8_t rsv[3];
 } __rte_packed_end;
 
+struct __rte_aligned(4) __rte_packed_begin sxe2_switchdev_info {
+	uint8_t is_switchdev;
+	uint8_t primary;
+	uint8_t representor;
+	uint8_t port_name_type;
+	uint32_t ctrl_num;
+	uint32_t pf_num;
+	uint32_t vf_num;
+	uint32_t mpesw_owner;
+} __rte_packed_end;
+
+struct __rte_aligned(4) __rte_packed_begin sxe2_drv_vlan_cfg_query_resp {
+	uint16_t vsi_id;
+	uint8_t port_vlan_exist;
+	uint8_t is_switchdev;
+	uint16_t tpid;
+	uint16_t vid;
+	uint8_t outer_insert;
+	uint8_t outer_strip;
+	uint8_t inner_insert;
+	uint8_t inner_strip;
+} __rte_packed_end;
+
+struct __rte_aligned(4) __rte_packed_begin sxe2_drv_vlan_offload_cfg_req {
+	uint16_t vsi_id;
+	uint16_t tpid;
+	uint8_t outer_insert;
+	uint8_t outer_strip;
+	uint8_t inner_insert;
+	uint8_t inner_strip;
+} __rte_packed_end;
+
+struct __rte_aligned(4) __rte_packed_begin sxe2_drv_port_vlan_cfg_req {
+	uint16_t vsi_id;
+	uint16_t tpid;
+	uint16_t vid;
+	uint8_t prio;
+	uint8_t rsv;
+} __rte_packed_end;
+
+struct __rte_aligned(4) __rte_packed_begin sxe2_mac_filter_cfg_req {
+	uint16_t vsi_id;
+	uint8_t addr[SXE2_ETH_ALEN];
+	uint8_t type;
+	uint8_t is_add;
+	uint8_t rsv[2];
+} __rte_packed_end;
+
+enum sxe2_promisc_filter_type {
+	SXE2_PROMISC_FILTER_TYPE_PROMISC = 0,
+	SXE2_PROMISC_FILTER_TYPE_ALLMULTI,
+	SXE2_PROMISC_FILTER_TYPE_MAX,
+};
+
+enum sxe2_mac_filter_type {
+	SXE2_MAC_FILTER_TYPE_UC = 0,
+	SXE2_MAC_FILTER_TYPE_MC,
+	SXE2_MAC_FILTER_TYPE_MAX,
+};
+
+struct __rte_aligned(4) __rte_packed_begin sxe2_promisc_filter_cfg_req {
+	uint16_t vsi_id;
+	uint8_t type;
+	uint8_t is_add;
+} __rte_packed_end;
+
+struct __rte_aligned(4) __rte_packed_begin sxe2_srcvsi_ext_cfg_req {
+	uint16_t vsi_id;
+	uint16_t srcvsi_list[SXE2_SRCVSI_PRUNE_MAX_NUM];
+	uint8_t srcvsi_cnt;
+	uint8_t is_add;
+} __rte_packed_end;
+
+struct __rte_aligned(4) __rte_packed_begin sxe2_vlan_filter_cfg_req {
+	uint16_t vsi_id;
+	uint16_t vlan_id;
+	uint16_t tpid_id;
+	uint8_t prio;
+	uint8_t is_add;
+} __rte_packed_end;
+
+struct __rte_aligned(4) __rte_packed_begin sxe2_vlan_filter_switch_req {
+	uint16_t vsi_id;
+	uint8_t is_oper_enable;
+	uint8_t rsv;
+} __rte_packed_end;
+
 enum sxe2_drv_cmd_module {
 	SXE2_DRV_CMD_MODULE_HANDSHAKE = 0,
 	SXE2_DRV_CMD_MODULE_DEV = 1,
diff --git a/drivers/net/sxe2/sxe2_ethdev.c b/drivers/net/sxe2/sxe2_ethdev.c
index 80881595d6..4ffe32d353 100644
--- a/drivers/net/sxe2/sxe2_ethdev.c
+++ b/drivers/net/sxe2/sxe2_ethdev.c
@@ -111,8 +111,20 @@ static const struct eth_dev_ops sxe2_eth_dev_ops = {
 	.tx_burst_mode_get          = sxe2_tx_burst_mode_get,
 	.tx_done_cleanup            = sxe2_tx_done_cleanup,
 
+	.promiscuous_enable         = sxe2_promisc_enable,
+	.promiscuous_disable        = sxe2_promisc_disable,
+	.allmulticast_enable        = sxe2_allmulti_enable,
+	.allmulticast_disable       = sxe2_allmulti_disable,
+
+	.mac_addr_add               = sxe2_mac_addr_add,
+	.mac_addr_remove            = sxe2_mac_addr_del,
+	.mac_addr_set               = sxe2_mac_addr_set,
+	.set_mc_addr_list           = sxe2_set_mc_addr_list,
 	.mtu_set                    = sxe2_mtu_set,
 	.buffer_split_supported_hdr_ptypes_get = sxe2_buffer_split_supported_hdr_ptypes_get,
+
+	.vlan_filter_set            = sxe2_dev_vlan_filter_set,
+	.vlan_offload_set           = sxe2_dev_vlan_offload_set,
 };
 
 static int32_t sxe2_dev_configure(struct rte_eth_dev *dev)
@@ -123,6 +135,13 @@ static int32_t sxe2_dev_configure(struct rte_eth_dev *dev)
 	if (dev->data->dev_conf.rxmode.mq_mode  & RTE_ETH_MQ_RX_RSS_FLAG)
 		dev->data->dev_conf.rxmode.offloads |= RTE_ETH_RX_OFFLOAD_RSS_HASH;
 
+	ret = sxe2_vlan_default_cfg(dev);
+	if (ret) {
+		PMD_LOG_ERR(INIT, "Failed to init vlan, ret=%d", ret);
+		goto end;
+	}
+
+end:
 	return ret;
 }
 
@@ -138,6 +157,8 @@ static int32_t sxe2_dev_stop(struct rte_eth_dev *dev)
 	sxe2_txqs_all_stop(dev);
 	sxe2_rxqs_all_stop(dev);
 
+	(void)sxe2_filter_rule_stop(dev);
+
 	dev->data->dev_started = 0;
 	adapter->started = 0;
 l_end:
@@ -165,16 +186,23 @@ static int32_t sxe2_dev_start(struct rte_eth_dev *dev)
 		goto l_end;
 	}
 
+	ret = sxe2_filter_rule_start(dev);
+	if (ret) {
+		PMD_LOG_ERR(INIT, "Failed to add all mc addr to fw.");
+		goto l_end;
+	}
+
 	ret = sxe2_queues_start(dev);
 	if (ret) {
 		PMD_LOG_ERR(INIT, "enable queues failed");
-		goto l_end;
+		goto l_start_queues_err;
 	}
 
 	dev->data->dev_started = 1;
 	adapter->started = 1;
 	goto l_end;
-
+l_start_queues_err:
+	(void)sxe2_filter_rule_stop(dev);
 l_end:
 	return ret;
 }
@@ -194,6 +222,7 @@ static int32_t sxe2_dev_infos_get(struct rte_eth_dev *dev,
 	dev_info->min_mtu = RTE_ETHER_MIN_MTU;
 
 	dev_info->rx_offload_capa =
+		RTE_ETH_RX_OFFLOAD_VLAN_STRIP |
 		RTE_ETH_RX_OFFLOAD_KEEP_CRC |
 		RTE_ETH_RX_OFFLOAD_SCATTER |
 		RTE_ETH_RX_OFFLOAD_IPV4_CKSUM |
@@ -202,9 +231,15 @@ static int32_t sxe2_dev_infos_get(struct rte_eth_dev *dev,
 		RTE_ETH_RX_OFFLOAD_SCTP_CKSUM |
 		RTE_ETH_RX_OFFLOAD_OUTER_IPV4_CKSUM |
 		RTE_ETH_RX_OFFLOAD_BUFFER_SPLIT |
+#ifndef RTE_LIBRTE_SXE2_16BYTE_RX_DESC
+		RTE_ETH_RX_OFFLOAD_QINQ_STRIP |
+#endif
+		RTE_ETH_RX_OFFLOAD_VLAN_EXTEND |
 		RTE_ETH_RX_OFFLOAD_TCP_LRO;
 
 	dev_info->tx_offload_capa =
+		RTE_ETH_TX_OFFLOAD_VLAN_INSERT |
+		RTE_ETH_TX_OFFLOAD_QINQ_INSERT |
 		RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
 		RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE |
 		RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
@@ -428,6 +463,12 @@ static int32_t sxe2_eth_init(struct rte_eth_dev *dev)
 {
 	int32_t ret = 0;
 
+	ret = sxe2_filter_init(dev);
+	if (ret) {
+		PMD_LOG_ERR(INIT, "Failed to initialize l2 filter, ret:%d", ret);
+		goto l_end;
+	}
+
 	ret = sxe2_link_update_init(dev);
 	if (ret) {
 		PMD_LOG_ERR(INIT, "Failed to initialize link update, ret:%d", ret);
@@ -439,12 +480,37 @@ static int32_t sxe2_eth_init(struct rte_eth_dev *dev)
 		PMD_LOG_ERR(INIT, "Failed to set mtu, ret=%d", ret);
 		goto l_end;
 	}
+
+	ret = sxe2_mac_addr_init(dev);
+	if (ret != 0) {
+		PMD_LOG_ERR(INIT, "Failed to initialize mac address, ret:%d", ret);
+		goto l_end;
+	}
+
+	ret = sxe2_mac_default_cfg(dev);
+	if (ret != 0) {
+		PMD_LOG_ERR(INIT, "Failed to configure default mac address, ret:%d", ret);
+		goto l_err;
+	}
+
+	ret = sxe2_vlan_cfg_init(dev);
+	if (ret) {
+		PMD_LOG_ERR(INIT, "Failed to initialize vlan config, ret:%d", ret);
+		goto l_err;
+	}
+	goto l_end;
+
+l_err:
+	sxe2_mac_addr_uinit(dev);
+	(void)sxe2_filter_uinit(dev);
 l_end:
 	return ret;
 }
 
 static void sxe2_eth_uinit(struct rte_eth_dev *dev __rte_unused)
 {
+	sxe2_mac_addr_uinit(dev);
+	(void)sxe2_filter_uinit(dev);
 }
 
 static void sxe2_drv_dev_caps_set(struct sxe2_adapter *adapter,
diff --git a/drivers/net/sxe2/sxe2_ethdev.h b/drivers/net/sxe2/sxe2_ethdev.h
index 9de47d8338..fa3c69d360 100644
--- a/drivers/net/sxe2/sxe2_ethdev.h
+++ b/drivers/net/sxe2/sxe2_ethdev.h
@@ -15,9 +15,11 @@
 
 #include "sxe2_common.h"
 #include "sxe2_vsi.h"
-#include "sxe2_queue.h"
 #include "sxe2_irq.h"
+#include "sxe2_queue.h"
+#include "sxe2_mac.h"
 #include "sxe2_osal.h"
+#include "sxe2_filter.h"
 
 struct sxe2_link_msg {
 	uint32_t speed;
@@ -35,7 +37,7 @@ enum sxe2_fnav_tunnel_flag_type {
 #define SXE2_FRAME_SIZE_MAX    9832
 #define SXE2_VLAN_TAG_SIZE     4
 #define SXE2_ETH_OVERHEAD \
-	(RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN + SXE2_VLAN_TAG_SIZE)
+	(RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN + 2 * SXE2_VLAN_TAG_SIZE)
 #define SXE2_ETH_MAX_LEN (RTE_ETHER_MTU + SXE2_ETH_OVERHEAD)
 
 #ifdef SXE2_TEST
@@ -267,6 +269,27 @@ struct sxe2_link_context {
 	uint32_t  speed;
 };
 
+struct sxe2_filter_context {
+	rte_spinlock_t filter_lock;
+	struct sxe2_vlan_info               vlan_info;
+	struct sxe2_uc_filter_list_head    uc_list;
+	struct sxe2_mc_filter_list_head    mc_list;
+	struct sxe2_vlan_filter_list_head  vlan_list;
+	uint8_t                                 uc_num;
+	uint8_t                                 mc_num;
+	uint8_t                                 vlan_num;
+	uint8_t                                 rsv;
+	uint32_t hw_promisc_flags;
+	uint32_t cur_promisc_flags;
+
+	bool hw_uplink_config;
+	bool cur_uplink_config;
+	bool hw_repr_config;
+	bool cur_repr_config;
+	bool hw_l2_config;
+	bool cur_l2_config;
+};
+
 struct sxe2_adapter {
 	struct sxe2_common_device      *cdev;
 	struct sxe2_dev_info            dev_info;
@@ -276,10 +299,14 @@ struct sxe2_adapter {
 	struct sxe2_irq_context       irq_ctxt;
 	struct sxe2_queue_context     q_ctxt;
 	struct sxe2_vsi_context       vsi_ctxt;
+	struct sxe2_filter_context    filter_ctxt;
 	struct sxe2_link_context      link_ctxt;
 	struct sxe2_devargs           devargs;
-	uint16_t                      dev_port_id;
-	uint64_t                      cap_flags;
+	struct sxe2_switchdev_info    switchdev_info;
+	bool                          rule_started;
+	bool                          flow_isolated;
+	uint16_t                           dev_port_id;
+	uint64_t                           cap_flags;
 	enum sxe2_dev_type            dev_type;
 	uint32_t    ptype_tbl[SXE2_MAX_PTYPE_NUM];
 	struct rte_ether_addr           mac_addr;
@@ -315,4 +342,12 @@ int32_t sxe2_dev_pci_map_init(struct rte_eth_dev *dev);
 
 void sxe2_dev_pci_map_uinit(struct rte_eth_dev *dev);
 
+static inline bool
+sxe2_dev_port_vlan_check(struct rte_eth_dev *dev)
+{
+	struct sxe2_adapter *ad = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
+
+	return ad->filter_ctxt.vlan_info.port_vlan_exist;
+}
+
 #endif /* SXE2_ETHDEV_H */
diff --git a/drivers/net/sxe2/sxe2_filter.c b/drivers/net/sxe2/sxe2_filter.c
new file mode 100644
index 0000000000..b2a726f77e
--- /dev/null
+++ b/drivers/net/sxe2/sxe2_filter.c
@@ -0,0 +1,782 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (C), 2025, Wuxi Stars Micro System Technologies Co., Ltd.
+ */
+
+#include <rte_os.h>
+#include <rte_tailq.h>
+#include "sxe2_osal.h"
+#include "sxe2_mac.h"
+#include "sxe2_common_log.h"
+#include "sxe2_ethdev.h"
+#include "sxe2_cmd_chnl.h"
+
+static struct sxe2_mac_filter *sxe2_uc_filter_find(struct sxe2_adapter *adapter,
+			struct rte_ether_addr *macaddr)
+{
+	struct sxe2_mac_filter *filter      = NULL;
+	struct sxe2_mac_filter *entry       = NULL;
+	struct sxe2_mac_filter *next_entry  = NULL;
+
+	rte_spinlock_lock(&adapter->filter_ctxt.filter_lock);
+	RTE_TAILQ_FOREACH_SAFE(entry, &adapter->filter_ctxt.uc_list, next, next_entry) {
+		if (rte_is_same_ether_addr(macaddr, &entry->mac_addr)) {
+			filter = entry;
+			break;
+		}
+	}
+	rte_spinlock_unlock(&adapter->filter_ctxt.filter_lock);
+
+	return filter;
+}
+
+int32_t sxe2_uc_filter_add(struct sxe2_adapter *adapter,
+			struct rte_ether_addr *mac_addr, bool default_config)
+{
+	struct sxe2_mac_filter *filter = NULL;
+	bool hw_config = false;
+	int32_t ret = 0;
+
+	filter = sxe2_uc_filter_find(adapter, mac_addr);
+	if (filter) {
+		if (default_config && !filter->default_config)
+			filter->default_config = true;
+		PMD_DEV_LOG_INFO(adapter, DRV, "This MAC filter already exists.");
+		goto l_end;
+	}
+
+	if (!adapter->rule_started) {
+		PMD_DEV_LOG_DEBUG(adapter, DRV, "cannot add hw uc addr in port stop status");
+	} else if (adapter->flow_isolated) {
+		PMD_DEV_LOG_WARN(adapter, DRV, "cannot add hw uc addr in flow isolation mode");
+	} else if (adapter->switchdev_info.is_switchdev) {
+		PMD_DEV_LOG_WARN(adapter, DRV, "cannot add hw uc addr in switchdev mode");
+	} else {
+		ret = sxe2_drv_uc_config(adapter, mac_addr, true);
+		if (ret && ret != -EEXIST) {
+			PMD_DEV_LOG_ERR(adapter, DRV, "Failed to add uc rule");
+			ret = -EINVAL;
+			goto l_end;
+		}
+		hw_config = true;
+	}
+
+	filter = rte_zmalloc("sxe2_uc_filter",
+			     sizeof(struct sxe2_mac_filter), 0);
+	if (!filter) {
+		PMD_DEV_LOG_ERR(adapter, DRV, "Failed to allocate memory");
+		ret = -ENOMEM;
+		goto l_end;
+	}
+	filter->hw_config = hw_config;
+	filter->default_config = default_config;
+	rte_ether_addr_copy(mac_addr, &filter->mac_addr);
+	rte_spinlock_lock(&adapter->filter_ctxt.filter_lock);
+	TAILQ_INSERT_TAIL(&adapter->filter_ctxt.uc_list, filter, next);
+	adapter->filter_ctxt.uc_num++;
+	rte_spinlock_unlock(&adapter->filter_ctxt.filter_lock);
+
+	PMD_DEV_LOG_INFO(adapter, DRV, "add mac rule, mac num %u.", adapter->filter_ctxt.uc_num);
+	ret = 0;
+
+l_end:
+	return ret;
+}
+
+int32_t sxe2_uc_filter_del(struct sxe2_adapter *adapter,
+			struct rte_ether_addr *mac_addr)
+{
+	struct sxe2_mac_filter *filter = NULL;
+	int32_t ret                         = -1;
+
+	filter = sxe2_uc_filter_find(adapter, mac_addr);
+	if (!filter) {
+		PMD_DEV_LOG_INFO(adapter, DRV, "This MAC filter not exists.");
+		ret = 0;
+		goto l_end;
+	}
+	if (filter->hw_config) {
+		ret = sxe2_drv_uc_config(adapter, mac_addr, false);
+		if (ret) {
+			PMD_DEV_LOG_ERR(adapter, DRV, "Failed to delete mac rule");
+			if (ret == -EPERM)
+				goto l_free;
+			ret = -EINVAL;
+			goto l_end;
+		}
+	}
+	PMD_DEV_LOG_INFO(adapter, DRV, "remove mac rule, uc num %u.", adapter->filter_ctxt.uc_num);
+	ret = 0;
+
+l_free:
+
+	rte_spinlock_lock(&adapter->filter_ctxt.filter_lock);
+	TAILQ_REMOVE(&adapter->filter_ctxt.uc_list, filter, next);
+	adapter->filter_ctxt.uc_num--;
+	rte_spinlock_unlock(&adapter->filter_ctxt.filter_lock);
+	rte_free(filter);
+	filter = NULL;
+l_end:
+	return ret;
+}
+
+void sxe2_uc_filter_clear(struct sxe2_adapter *adapter, bool default_config)
+{
+	struct sxe2_mac_filter *entry;
+	struct sxe2_mac_filter *next_entry;
+
+	RTE_TAILQ_FOREACH_SAFE(entry, &adapter->filter_ctxt.uc_list, next, next_entry) {
+		if (entry->default_config && !default_config)
+			continue;
+
+		if (sxe2_uc_filter_del(adapter, &entry->mac_addr))
+			PMD_DEV_LOG_ERR(adapter, DRV, "This MAC filter delete fail.");
+	}
+}
+
+static struct sxe2_mac_filter *sxe2_mc_filter_find(struct sxe2_adapter *adapter,
+			struct rte_ether_addr *macaddr)
+{
+	struct sxe2_mac_filter *filter      = NULL;
+	struct sxe2_mac_filter *entry       = NULL;
+	struct sxe2_mac_filter *next_entry  = NULL;
+
+	rte_spinlock_lock(&adapter->filter_ctxt.filter_lock);
+	RTE_TAILQ_FOREACH_SAFE(entry, &adapter->filter_ctxt.mc_list, next, next_entry) {
+		if (rte_is_same_ether_addr(macaddr, &entry->mac_addr)) {
+			filter = entry;
+			break;
+		}
+	}
+	rte_spinlock_unlock(&adapter->filter_ctxt.filter_lock);
+
+	return filter;
+}
+
+int32_t sxe2_mc_filter_add(struct sxe2_adapter *adapter,
+			struct rte_ether_addr *mac_addr, bool default_config)
+{
+	struct sxe2_mac_filter *filter = NULL;
+	bool hw_config = false;
+	int32_t ret = 0;
+
+	filter = sxe2_mc_filter_find(adapter, mac_addr);
+	if (filter) {
+		if (default_config && !filter->default_config)
+			filter->default_config = true;
+		PMD_DEV_LOG_INFO(adapter, DRV, "This MAC filter already exists.");
+		goto l_end;
+	}
+
+	if (!adapter->rule_started) {
+		PMD_DEV_LOG_DEBUG(adapter, DRV, "cannot add hw mc addr in port stop status");
+	} else if (adapter->flow_isolated) {
+		PMD_DEV_LOG_WARN(adapter, DRV, "cannot add hw mc addr in flow isolation mode");
+	} else if (adapter->switchdev_info.is_switchdev) {
+		PMD_DEV_LOG_WARN(adapter, DRV, "cannot add hw mc addr in switchdev mode");
+	} else {
+		ret = sxe2_drv_mc_config(adapter, mac_addr, true);
+		if (ret && ret != -EEXIST) {
+			PMD_DEV_LOG_ERR(adapter, DRV, "Failed to add mac rule");
+			ret = -EINVAL;
+			goto l_end;
+		}
+		hw_config = true;
+	}
+
+	filter = rte_zmalloc("sxe2_mc_filter",
+			     sizeof(struct sxe2_mac_filter), 0);
+	if (!filter) {
+		PMD_DEV_LOG_ERR(adapter, DRV, "Failed to allocate memory");
+		ret = -ENOMEM;
+		goto l_end;
+	}
+	filter->hw_config = hw_config;
+	filter->default_config = default_config;
+	rte_ether_addr_copy(mac_addr, &filter->mac_addr);
+	rte_spinlock_lock(&adapter->filter_ctxt.filter_lock);
+	TAILQ_INSERT_TAIL(&adapter->filter_ctxt.mc_list, filter, next);
+	adapter->filter_ctxt.mc_num++;
+	rte_spinlock_unlock(&adapter->filter_ctxt.filter_lock);
+
+	PMD_DEV_LOG_INFO(adapter, DRV, "add mc rule, mc num %u.", adapter->filter_ctxt.mc_num);
+	ret = 0;
+
+l_end:
+	return ret;
+}
+
+int32_t sxe2_mc_filter_del(struct sxe2_adapter *adapter,
+			struct rte_ether_addr *mac_addr)
+{
+	struct sxe2_mac_filter *filter = NULL;
+	int32_t ret                         = -1;
+
+	filter = sxe2_mc_filter_find(adapter, mac_addr);
+	if (!filter) {
+		PMD_DEV_LOG_INFO(adapter, DRV, "This MAC filter not exists.");
+		ret = 0;
+		goto l_end;
+	}
+
+	if (filter->hw_config) {
+		ret = sxe2_drv_mc_config(adapter, mac_addr, false);
+		if (ret) {
+			PMD_DEV_LOG_ERR(adapter, DRV, "Failed to delete mc rule");
+			if (ret == -EPERM)
+				goto l_free;
+			ret = -EINVAL;
+			goto l_end;
+		}
+	}
+	PMD_DEV_LOG_INFO(adapter, DRV, "remove mc rule, mc num %u.", adapter->filter_ctxt.mc_num);
+	ret = 0;
+
+l_free:
+
+	rte_spinlock_lock(&adapter->filter_ctxt.filter_lock);
+	TAILQ_REMOVE(&adapter->filter_ctxt.mc_list, filter, next);
+	adapter->filter_ctxt.mc_num--;
+	rte_spinlock_unlock(&adapter->filter_ctxt.filter_lock);
+	rte_free(filter);
+	filter = NULL;
+l_end:
+	return ret;
+}
+
+void sxe2_mc_filter_clear(struct sxe2_adapter *adapter, bool default_config)
+{
+	struct sxe2_mac_filter *entry;
+	struct sxe2_mac_filter *next_entry;
+
+	RTE_TAILQ_FOREACH_SAFE(entry, &adapter->filter_ctxt.mc_list, next, next_entry) {
+		if (entry->default_config && !default_config)
+			continue;
+		if (sxe2_mc_filter_del(adapter, &entry->mac_addr))
+			PMD_DEV_LOG_ERR(adapter, DRV, "This MAC filter delete fail.");
+	}
+}
+
+static struct sxe2_vlan_filter *sxe2_vlan_filter_find(struct sxe2_adapter *adapter,
+			struct sxe2_vlan *vlan)
+{
+	struct sxe2_vlan_filter *f;
+	struct sxe2_vlan_filter *save_f = NULL;
+
+	rte_spinlock_lock(&adapter->filter_ctxt.filter_lock);
+	TAILQ_FOREACH(f, &adapter->filter_ctxt.vlan_list, next)
+	{
+		if (vlan->tpid == f->vlan_info.tpid &&
+			vlan->vid == f->vlan_info.vid) {
+			save_f = f;
+			break;
+		}
+	}
+	rte_spinlock_unlock(&adapter->filter_ctxt.filter_lock);
+
+	return save_f;
+}
+
+int32_t sxe2_vlan_filter_add(struct sxe2_adapter *adapter,
+			     struct sxe2_vlan *vlan, bool default_config)
+{
+	struct sxe2_vlan_filter *filter = NULL;
+	bool hw_config                 = false;
+	int32_t ret                    = 0;
+
+	if (!vlan || vlan->vid > RTE_ETHER_MAX_VLAN_ID) {
+		PMD_DEV_LOG_ERR(adapter, DRV, "This vlan filter is invalid.");
+		ret = -EINVAL;
+		goto l_end;
+	}
+
+	filter = sxe2_vlan_filter_find(adapter, vlan);
+	if (filter) {
+		PMD_DEV_LOG_INFO(adapter, DRV, "This vlan filter already exists.");
+		ret = 0;
+		goto l_end;
+	}
+	if (!adapter->rule_started) {
+		PMD_DEV_LOG_DEBUG(adapter, DRV, "cannot add vlan in port stop status");
+	} else if (adapter->flow_isolated) {
+		PMD_DEV_LOG_WARN(adapter, DRV, "cannot add vlan in flow isolation mode");
+	} else if (adapter->switchdev_info.is_switchdev) {
+		PMD_DEV_LOG_WARN(adapter, DRV, "cannot add vlan in switchdev mode");
+	} else {
+		ret = sxe2_drv_vlan_filter_id_config(adapter, vlan, true);
+		if (ret && ret != -EEXIST) {
+			PMD_DEV_LOG_ERR(adapter, DRV, "Failed to add vlan rule");
+			ret = -EINVAL;
+			goto l_end;
+		}
+		hw_config = true;
+	}
+
+	filter = rte_zmalloc("sxe2_vlan_filter", sizeof(*filter), 0);
+	if (!filter) {
+		PMD_DEV_LOG_ERR(adapter, DRV, "Failed to allocate memory");
+		ret = -ENOMEM;
+		goto l_end;
+	}
+
+	filter->hw_config = hw_config;
+	filter->default_config = default_config;
+
+	filter->vlan_info.tpid = vlan->tpid;
+	filter->vlan_info.vid = vlan->vid;
+	filter->vlan_info.prio = vlan->prio;
+
+	rte_spinlock_lock(&adapter->filter_ctxt.filter_lock);
+	TAILQ_INSERT_TAIL(&adapter->filter_ctxt.vlan_list, filter, next);
+	adapter->filter_ctxt.vlan_num++;
+	rte_spinlock_unlock(&adapter->filter_ctxt.filter_lock);
+
+	ret = 0;
+
+l_end:
+	return ret;
+}
+
+int32_t sxe2_vlan_filter_del(struct sxe2_adapter *adapter, struct sxe2_vlan *vlan)
+{
+	struct sxe2_vlan_filter *filter = NULL;
+	int32_t ret                         = -1;
+
+	if (!vlan || vlan->vid > RTE_ETHER_MAX_VLAN_ID) {
+		PMD_DEV_LOG_INFO(adapter, DRV, "This vlan filter is invalid.");
+		ret = -EINVAL;
+		goto l_end;
+	}
+
+	filter = sxe2_vlan_filter_find(adapter, vlan);
+	if (!filter) {
+		PMD_DEV_LOG_INFO(adapter, DRV, "This vlan filter not exists.");
+		ret = 0;
+		goto l_end;
+	}
+
+	if (filter->hw_config) {
+		ret = sxe2_drv_vlan_filter_id_config(adapter, vlan, false);
+		if (ret) {
+			PMD_DEV_LOG_ERR(adapter, DRV, "Failed to delete vlan rule");
+			if (ret == -EPERM)
+				goto l_free;
+			ret = -EINVAL;
+			goto l_end;
+		}
+	}
+	ret = 0;
+
+l_free:
+
+	rte_spinlock_lock(&adapter->filter_ctxt.filter_lock);
+	TAILQ_REMOVE(&adapter->filter_ctxt.vlan_list, filter, next);
+	adapter->filter_ctxt.vlan_num--;
+	rte_spinlock_unlock(&adapter->filter_ctxt.filter_lock);
+	rte_free(filter);
+	filter = NULL;
+l_end:
+	return ret;
+}
+
+void sxe2_vlan_filters_clear(struct sxe2_adapter *adapter, bool default_config)
+{
+	int32_t ret = 0;
+	struct sxe2_vlan_filter *v_f;
+	void *temp;
+
+	if (adapter->filter_ctxt.vlan_num == 0)
+		return;
+
+	RTE_TAILQ_FOREACH_SAFE(v_f, &adapter->filter_ctxt.vlan_list, next, temp)
+	{
+		if (v_f->default_config && !default_config)
+			continue;
+		ret = sxe2_vlan_filter_del(adapter, &v_f->vlan_info);
+		if (ret)
+			PMD_DEV_LOG_ERR(adapter, DRV, "This vlan filter delete fail.");
+	}
+}
+
+int32_t sxe2_vlan_filter_ctrl(struct sxe2_adapter *adapter, bool flag)
+{
+	struct sxe2_vlan_info *vlan_info = &adapter->filter_ctxt.vlan_info;
+	int32_t ret = 0;
+
+	if (vlan_info->filter_on == flag)
+		goto l_end;
+	if (!adapter->rule_started) {
+		PMD_DEV_LOG_DEBUG(adapter, DRV, "cannot add vlan filter ctrl in port stop status");
+	} else if (adapter->flow_isolated) {
+		PMD_DEV_LOG_WARN(adapter, DRV, "cannot add vlan filter ctrl in flow isolation mode");
+	} else if (adapter->switchdev_info.is_switchdev) {
+		PMD_DEV_LOG_WARN(adapter, DRV, "cannot add vlan filter ctrl in switchdev mode");
+	} else {
+		ret = sxe2_drv_vlan_filter_switch(adapter, flag);
+		if (ret) {
+			PMD_DEV_LOG_ERR(adapter, DRV, "Failed to add vlan filter ctrl");
+			goto l_end;
+		}
+		vlan_info->hw_filter_on = flag;
+	}
+	vlan_info->filter_on = flag;
+
+l_end:
+	return ret;
+}
+
+int32_t sxe2_promisc_add(struct sxe2_adapter *adapter)
+{
+	int32_t ret = 0;
+
+	if (!adapter->rule_started) {
+		PMD_DEV_LOG_DEBUG(adapter, DRV, "cannot enable promiscuous in port stop status");
+	} else if (adapter->flow_isolated) {
+		PMD_DEV_LOG_WARN(adapter, DRV, "cannot enable promiscuous in flow isolation mode");
+	} else if (adapter->switchdev_info.is_switchdev) {
+		PMD_DEV_LOG_WARN(adapter, DRV, "cannot enable promiscuous in switchdev mode");
+	} else if (!(adapter->filter_ctxt.hw_promisc_flags & SXE2_PROMISC)) {
+		ret = sxe2_drv_promisc_config(adapter, true);
+		if (ret && ret != -EEXIST) {
+			PMD_DEV_LOG_ERR(adapter, DRV, "failed to cfg promiscuous, ret:%d", ret);
+			goto l_end;
+		}
+		adapter->filter_ctxt.hw_promisc_flags |= SXE2_PROMISC;
+	}
+	adapter->filter_ctxt.cur_promisc_flags |= SXE2_PROMISC;
+
+l_end:
+	return ret;
+}
+
+int32_t sxe2_promisc_del(struct sxe2_adapter *adapter)
+{
+	int32_t ret = 0;
+
+	if (!adapter->flow_isolated &&
+	    (adapter->filter_ctxt.hw_promisc_flags & SXE2_PROMISC)) {
+		ret = sxe2_drv_promisc_config(adapter, false);
+		if (ret) {
+			PMD_DEV_LOG_ERR(adapter, DRV, "failed to cfg promiscuous, ret:%d", ret);
+			goto l_end;
+		}
+		adapter->filter_ctxt.hw_promisc_flags &= ~SXE2_PROMISC;
+	}
+
+	adapter->filter_ctxt.cur_promisc_flags &= ~SXE2_PROMISC;
+
+l_end:
+	return ret;
+}
+
+int32_t sxe2_allmulti_add(struct sxe2_adapter *adapter)
+{
+	int32_t ret = 0;
+
+	if (!adapter->rule_started) {
+		PMD_DEV_LOG_DEBUG(adapter, DRV, "cannot enable allmulticast in port stop status");
+	} else if (adapter->flow_isolated) {
+		PMD_DEV_LOG_WARN(adapter, DRV, "cannot enable allmulticast in flow isolation mode");
+	} else if (adapter->switchdev_info.is_switchdev) {
+		PMD_DEV_LOG_WARN(adapter, DRV, "cannot enable allmulticast in switchdev mode");
+	} else if (!(adapter->filter_ctxt.hw_promisc_flags & SXE2_PROMISC_MULTICAST)) {
+		ret = sxe2_drv_allmulti_config(adapter, true);
+		if (ret && ret != -EEXIST) {
+			PMD_DEV_LOG_ERR(adapter, DRV, "failed to cfg allmulticast, ret:%d", ret);
+			goto l_end;
+		}
+		adapter->filter_ctxt.hw_promisc_flags |= SXE2_PROMISC_MULTICAST;
+	}
+	adapter->filter_ctxt.cur_promisc_flags |= SXE2_PROMISC_MULTICAST;
+
+l_end:
+	return ret;
+}
+
+int32_t sxe2_allmulti_del(struct sxe2_adapter *adapter)
+{
+	int32_t ret = 0;
+
+	if (!adapter->flow_isolated &&
+	    (adapter->filter_ctxt.hw_promisc_flags & SXE2_PROMISC_MULTICAST)) {
+		ret = sxe2_drv_allmulti_config(adapter, false);
+		if (ret) {
+			PMD_DEV_LOG_ERR(adapter, DRV, "failed to cfg allmulticast, ret:%d", ret);
+			goto l_end;
+		}
+		adapter->filter_ctxt.hw_promisc_flags &= ~SXE2_PROMISC_MULTICAST;
+	}
+
+	adapter->filter_ctxt.cur_promisc_flags &= ~SXE2_PROMISC_MULTICAST;
+l_end:
+	return ret;
+}
+
+static int32_t sxe2_all_filter_hw_clear(struct sxe2_adapter *adapter)
+{
+	int32_t ret = 0;
+	struct sxe2_mac_filter *mac_entry;
+	struct sxe2_mac_filter *next_mac_entry;
+	struct sxe2_vlan_filter *vlan_entry;
+	struct sxe2_vlan_filter *next_vlan_entry;
+
+	if (adapter->filter_ctxt.uc_num > 0) {
+		RTE_TAILQ_FOREACH_SAFE(mac_entry, &adapter->filter_ctxt.uc_list, next,
+			    next_mac_entry) {
+			if (mac_entry->hw_config) {
+				ret = sxe2_drv_uc_config(adapter, &mac_entry->mac_addr, false);
+				if (ret) {
+					PMD_DEV_LOG_ERR(adapter, DRV, "Failed to delete mac rule");
+					ret = -EINVAL;
+					goto l_end;
+				}
+				mac_entry->hw_config = false;
+			}
+		}
+	}
+
+	if (adapter->filter_ctxt.mc_num > 0) {
+		RTE_TAILQ_FOREACH_SAFE(mac_entry, &adapter->filter_ctxt.mc_list, next,
+			    next_mac_entry) {
+			if (mac_entry->hw_config) {
+				ret = sxe2_drv_mc_config(adapter, &mac_entry->mac_addr, false);
+				if (ret) {
+					PMD_DEV_LOG_ERR(adapter, DRV, "Failed to delete mc rule");
+					ret = -EINVAL;
+					goto l_end;
+				}
+				mac_entry->hw_config = false;
+			}
+		}
+	}
+
+	if (adapter->filter_ctxt.vlan_num > 0) {
+		RTE_TAILQ_FOREACH_SAFE(vlan_entry, &adapter->filter_ctxt.vlan_list, next,
+			    next_vlan_entry) {
+			if (vlan_entry->hw_config) {
+				ret = sxe2_drv_vlan_filter_id_config(adapter,
+				    &vlan_entry->vlan_info, false);
+				if (ret) {
+					PMD_DEV_LOG_ERR(adapter, DRV, "Failed to delete vlan rule");
+					ret = -EINVAL;
+					goto l_end;
+				}
+				vlan_entry->hw_config = false;
+			}
+		}
+	}
+
+	if (adapter->filter_ctxt.vlan_info.hw_filter_on) {
+		ret = sxe2_drv_vlan_filter_switch(adapter, false);
+		if (ret) {
+			PMD_DEV_LOG_ERR(adapter, DRV, "Failed to delete vlan rule");
+			ret = -EINVAL;
+			goto l_end;
+		}
+		adapter->filter_ctxt.vlan_info.hw_filter_on = false;
+	}
+
+	if (adapter->filter_ctxt.hw_promisc_flags & SXE2_PROMISC) {
+		ret = sxe2_drv_promisc_config(adapter, false);
+		if (ret) {
+			PMD_DEV_LOG_ERR(adapter, DRV, "failed to cfg promiscuous, ret:%d", ret);
+			goto l_end;
+		}
+		adapter->filter_ctxt.hw_promisc_flags &= ~SXE2_PROMISC;
+	}
+
+	if (adapter->filter_ctxt.hw_promisc_flags & SXE2_PROMISC_MULTICAST) {
+		ret = sxe2_drv_allmulti_config(adapter, false);
+		if (ret) {
+			PMD_DEV_LOG_ERR(adapter, DRV, "failed to cfg allmulticast, ret:%d", ret);
+			goto l_end;
+		}
+		adapter->filter_ctxt.hw_promisc_flags &= ~SXE2_PROMISC_MULTICAST;
+	}
+l_end:
+	return ret;
+}
+
+static int32_t sxe2_all_filter_hw_set(struct sxe2_adapter *adapter)
+{
+	int32_t ret = 0;
+	struct sxe2_mac_filter *mac_entry;
+	struct sxe2_mac_filter *next_mac_entry;
+	struct sxe2_vlan_filter *vlan_entry;
+	struct sxe2_vlan_filter *next_vlan_entry;
+
+	if (adapter->filter_ctxt.uc_num > 0) {
+		RTE_TAILQ_FOREACH_SAFE(mac_entry, &adapter->filter_ctxt.uc_list, next,
+				       next_mac_entry) {
+			if (!mac_entry->hw_config) {
+				ret = sxe2_drv_uc_config(adapter, &mac_entry->mac_addr,
+							 true);
+				if (ret && ret != -EEXIST) {
+					PMD_DEV_LOG_ERR(adapter, DRV,
+							"Failed to add uc rule, ret:%d", ret);
+					ret = -EINVAL;
+					goto l_end;
+				}
+				mac_entry->hw_config = true;
+				ret = 0;
+			}
+		}
+	}
+
+	if (adapter->filter_ctxt.mc_num > 0) {
+		RTE_TAILQ_FOREACH_SAFE(mac_entry, &adapter->filter_ctxt.mc_list, next,
+				       next_mac_entry) {
+			if (!mac_entry->hw_config) {
+				ret = sxe2_drv_mc_config(adapter, &mac_entry->mac_addr, true);
+				if (ret && ret != -EEXIST) {
+					PMD_DEV_LOG_ERR(adapter, DRV,
+							"Failed to add mc rule, ret:%d", ret);
+					ret = -EINVAL;
+					goto l_end;
+				}
+				mac_entry->hw_config = true;
+				ret = 0;
+			}
+		}
+	}
+
+	if (adapter->filter_ctxt.vlan_num > 0) {
+		RTE_TAILQ_FOREACH_SAFE(vlan_entry, &adapter->filter_ctxt.vlan_list, next,
+				       next_vlan_entry) {
+			if (!vlan_entry->hw_config) {
+				ret = sxe2_drv_vlan_filter_id_config(adapter,
+				    &vlan_entry->vlan_info, true);
+				if (ret && ret != -EEXIST) {
+					PMD_DEV_LOG_ERR(adapter, DRV,
+							"Failed to add vlan rule, ret:%d", ret);
+					ret = -EINVAL;
+					goto l_end;
+				}
+				vlan_entry->hw_config = true;
+				ret = 0;
+			}
+		}
+	}
+
+	if (adapter->filter_ctxt.vlan_info.filter_on) {
+		if (!(adapter->filter_ctxt.vlan_info.hw_filter_on)) {
+			ret = sxe2_drv_vlan_filter_switch(adapter, true);
+			if (ret && ret != -EEXIST) {
+				PMD_DEV_LOG_ERR(adapter, DRV,
+						"Failed to add vlan ctrl, ret:%d", ret);
+				ret = -EINVAL;
+				goto l_end;
+			}
+			adapter->filter_ctxt.vlan_info.hw_filter_on = true;
+			ret = 0;
+		}
+	}
+
+	if ((adapter->filter_ctxt.cur_promisc_flags & SXE2_PROMISC) &&
+	    (!(adapter->filter_ctxt.hw_promisc_flags & SXE2_PROMISC))) {
+		ret = sxe2_drv_promisc_config(adapter, true);
+		if (ret && ret != -EEXIST) {
+			PMD_DEV_LOG_ERR(adapter, DRV,
+					"Failed to set promisc, ret:%d", ret);
+			goto l_end;
+		}
+		adapter->filter_ctxt.hw_promisc_flags |= SXE2_PROMISC;
+		ret = 0;
+	}
+
+	if ((adapter->filter_ctxt.cur_promisc_flags & SXE2_PROMISC_MULTICAST) &&
+	    (!(adapter->filter_ctxt.hw_promisc_flags & SXE2_PROMISC_MULTICAST))) {
+		ret = sxe2_drv_allmulti_config(adapter, true);
+		if (ret && ret != -EEXIST) {
+			PMD_DEV_LOG_ERR(adapter, DRV,
+					"Failed to set allmulti, ret:%d", ret);
+			goto l_end;
+		}
+		adapter->filter_ctxt.hw_promisc_flags |= SXE2_PROMISC_MULTICAST;
+		ret = 0;
+	}
+l_end:
+	return ret;
+}
+
+int32_t sxe2_l2_rule_update(struct sxe2_adapter *adapter)
+{
+	int32_t ret = 0;
+
+	if (!adapter->flow_isolated && !adapter->switchdev_info.is_switchdev &&
+	    adapter->rule_started) {
+		adapter->filter_ctxt.cur_l2_config = true;
+	} else {
+		adapter->filter_ctxt.cur_l2_config = false;
+	}
+
+	if (adapter->filter_ctxt.cur_l2_config !=
+	    adapter->filter_ctxt.hw_l2_config) {
+		if (adapter->filter_ctxt.cur_l2_config) {
+			ret = sxe2_all_filter_hw_set(adapter);
+			if (!ret)
+				adapter->filter_ctxt.hw_l2_config = true;
+		} else {
+			ret = sxe2_all_filter_hw_clear(adapter);
+			if (!ret)
+				adapter->filter_ctxt.hw_l2_config = false;
+		}
+	}
+	return ret;
+}
+
+int32_t sxe2_filter_rule_stop(struct rte_eth_dev *dev)
+{
+	struct sxe2_adapter *adapter = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
+	int32_t ret = 0;
+	adapter->rule_started = 0;
+
+	ret = sxe2_l2_rule_update(adapter);
+	if (ret != 0)
+		PMD_DEV_LOG_ERR(adapter, DRV, "Failed to update l2 rule");
+
+	return ret;
+}
+
+int32_t sxe2_filter_rule_start(struct rte_eth_dev *dev)
+{
+	struct sxe2_adapter *adapter = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
+	int32_t ret = 0;
+	adapter->rule_started = 1;
+
+	ret = sxe2_l2_rule_update(adapter);
+	if (ret != 0)
+		PMD_DEV_LOG_ERR(adapter, DRV, "Failed to update l2 rule");
+
+	return ret;
+}
+
+int32_t sxe2_filter_init(struct rte_eth_dev *dev)
+{
+	struct sxe2_adapter *adapter = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
+
+	rte_spinlock_init(&adapter->filter_ctxt.filter_lock);
+
+	TAILQ_INIT(&adapter->filter_ctxt.uc_list);
+	adapter->filter_ctxt.uc_num = 0;
+
+	TAILQ_INIT(&adapter->filter_ctxt.mc_list);
+	adapter->filter_ctxt.mc_num = 0;
+
+	TAILQ_INIT(&adapter->filter_ctxt.vlan_list);
+	adapter->filter_ctxt.vlan_num = 0;
+	return 0;
+}
+
+int32_t sxe2_filter_uinit(struct rte_eth_dev *dev)
+{
+	struct sxe2_adapter *adapter = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
+	sxe2_uc_filter_clear(adapter, true);
+	adapter->filter_ctxt.uc_num = 0;
+
+	sxe2_mc_filter_clear(adapter, true);
+	adapter->filter_ctxt.mc_num = 0;
+
+	sxe2_vlan_filters_clear(adapter, true);
+	adapter->filter_ctxt.vlan_num = 0;
+	return 0;
+}
diff --git a/drivers/net/sxe2/sxe2_filter.h b/drivers/net/sxe2/sxe2_filter.h
new file mode 100644
index 0000000000..6262e8c845
--- /dev/null
+++ b/drivers/net/sxe2/sxe2_filter.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (C), 2025, Wuxi Stars Micro System Technologies Co., Ltd.
+ */
+
+#ifndef __SXE2_FILTER_H__
+#define __SXE2_FILTER_H__
+#include <ethdev_driver.h>
+
+#define SXE2_PROMISC  (1UL << 0UL)
+#define SXE2_PROMISC_MULTICAST  (1UL << 1UL)
+
+struct sxe2_vlan_info {
+	uint8_t port_vlan_exist;
+	uint8_t is_switchdev;
+	uint16_t max_cnt;
+	uint16_t cnt;
+
+	bool filter_on;
+	bool hw_filter_on;
+
+	uint16_t tpid;
+	uint16_t vid;
+
+	uint8_t outer_insert;
+	uint8_t outer_strip;
+	uint8_t inner_insert;
+	uint8_t inner_strip;
+};
+
+struct sxe2_vlan {
+	uint16_t tpid;
+	uint16_t vid;
+	uint8_t prio;
+};
+
+struct sxe2_vlan_filter {
+	TAILQ_ENTRY(sxe2_vlan_filter) next;
+	bool hw_config;
+	bool default_config;
+	struct sxe2_vlan vlan_info;
+};
+
+TAILQ_HEAD(sxe2_vlan_filter_list_head, sxe2_vlan_filter);
+
+struct sxe2_mac_filter {
+	TAILQ_ENTRY(sxe2_mac_filter) next;
+	bool hw_config;
+	bool default_config;
+	struct rte_ether_addr mac_addr;
+};
+
+TAILQ_HEAD(sxe2_uc_filter_list_head, sxe2_mac_filter);
+TAILQ_HEAD(sxe2_mc_filter_list_head, sxe2_mac_filter);
+
+int32_t sxe2_uc_filter_add(struct sxe2_adapter *adapter,
+			struct rte_ether_addr *mac_addr, bool default_config);
+
+int32_t sxe2_uc_filter_del(struct sxe2_adapter *adapter,
+			struct rte_ether_addr *mac_addr);
+
+void sxe2_uc_filter_clear(struct sxe2_adapter *adapter, bool default_config);
+
+int32_t sxe2_mc_filter_add(struct sxe2_adapter *adapter,
+			struct rte_ether_addr *mac_addr, bool default_config);
+
+int32_t sxe2_mc_filter_del(struct sxe2_adapter *adapter,
+			struct rte_ether_addr *mac_addr);
+
+void sxe2_mc_filter_clear(struct sxe2_adapter *adapter, bool default_config);
+
+int32_t sxe2_vlan_filter_add(struct sxe2_adapter *adapter,
+	struct sxe2_vlan *vlan, bool default_config);
+
+int32_t sxe2_vlan_filter_del(struct sxe2_adapter *adapter, struct sxe2_vlan *vlan);
+
+void sxe2_vlan_filters_clear(struct sxe2_adapter *adapter, bool default_config);
+
+int32_t sxe2_vlan_filter_ctrl(struct sxe2_adapter *adapter, bool flag);
+
+int32_t sxe2_promisc_add(struct sxe2_adapter *adapter);
+
+int32_t sxe2_promisc_del(struct sxe2_adapter *adapter);
+
+int32_t sxe2_allmulti_add(struct sxe2_adapter *adapter);
+
+int32_t sxe2_allmulti_del(struct sxe2_adapter *adapter);
+
+int32_t sxe2_l2_rule_update(struct sxe2_adapter *adapter);
+
+int32_t sxe2_filter_rule_stop(struct rte_eth_dev *dev);
+
+int32_t sxe2_filter_rule_start(struct rte_eth_dev *dev);
+
+int32_t sxe2_filter_init(struct rte_eth_dev *dev);
+
+int32_t sxe2_filter_uinit(struct rte_eth_dev *dev);
+
+#endif /* __SXE2_FILTER_H__ */
diff --git a/drivers/net/sxe2/sxe2_mac.c b/drivers/net/sxe2/sxe2_mac.c
index 3c2f909002..d94936a742 100644
--- a/drivers/net/sxe2/sxe2_mac.c
+++ b/drivers/net/sxe2/sxe2_mac.c
@@ -10,6 +10,438 @@
 #include "sxe2_cmd_chnl.h"
 #include "sxe2_host_regs.h"
 
+int32_t sxe2_mac_default_cfg(struct rte_eth_dev *dev)
+{
+	struct sxe2_adapter *adapter = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
+	int32_t                     ret;
+	struct rte_ether_addr broadcast = {
+		.addr_bytes = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff} };
+	struct rte_ether_addr mac_addr;
+
+	rte_ether_addr_copy((struct rte_ether_addr *)
+		adapter->dev_info.mac.perm_addr, &mac_addr);
+	ret = sxe2_uc_filter_add(adapter, &mac_addr, true);
+	if (ret != 0) {
+		PMD_DEV_LOG_ERR(adapter, DRV, "Failed to add default MAC filter");
+		goto l_end;
+	}
+
+	rte_ether_addr_copy(&broadcast, &mac_addr);
+	ret = sxe2_mc_filter_add(adapter, &mac_addr, true);
+	if (ret != 0) {
+		PMD_DEV_LOG_ERR(adapter, DRV, "Failed to add broadcast MAC filter");
+		goto l_end;
+	}
+
+	ret = 0;
+l_end:
+	return ret;
+}
+
+int32_t sxe2_mac_addr_init(struct rte_eth_dev *dev)
+{
+	struct sxe2_adapter *adapter = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
+	int32_t ret                         = -1;
+	PMD_INIT_FUNC_TRACE();
+
+	if (!rte_is_unicast_ether_addr
+		((struct rte_ether_addr *)adapter->dev_info.mac.perm_addr)) {
+		PMD_DEV_LOG_ERR(adapter, DRV, "Invalid MAC address");
+		ret = -EINVAL;
+		goto l_end;
+	}
+
+	dev->data->mac_addrs = rte_zmalloc("sxe2_mac_adds",
+					sizeof(struct rte_ether_addr) * SXE2_NUM_MACADDR_MAX, 0);
+	if (!dev->data->mac_addrs) {
+		PMD_LOG_ERR(DRV, "Failed to allocate memory to store mac address");
+		ret = -ENOMEM;
+		goto l_end;
+	}
+
+	rte_ether_addr_copy((struct rte_ether_addr *)adapter->dev_info.mac.perm_addr,
+		&dev->data->mac_addrs[0]);
+
+	ret = 0;
+
+l_end:
+	return ret;
+}
+
+void sxe2_mac_addr_uinit(struct rte_eth_dev *dev)
+{
+	PMD_INIT_FUNC_TRACE();
+	if (dev != NULL && dev->data->mac_addrs != NULL) {
+		rte_free(dev->data->mac_addrs);
+		dev->data->mac_addrs = NULL;
+	}
+}
+
+int32_t sxe2_mac_addr_add(struct rte_eth_dev *dev, struct rte_ether_addr *mac_addr,
+		      __rte_unused uint32_t index, __rte_unused uint32_t pool)
+{
+	struct sxe2_adapter *adapter = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
+	int32_t ret = -1;
+
+	if (rte_is_zero_ether_addr(mac_addr)) {
+		PMD_DEV_LOG_ERR(adapter, DRV, "Invalid MAC Address");
+		ret = -EINVAL;
+		goto l_end;
+	}
+
+	if (rte_is_multicast_ether_addr(mac_addr))
+		ret = sxe2_mc_filter_add(adapter, mac_addr, true);
+	else
+		ret = sxe2_uc_filter_add(adapter, mac_addr, false);
+
+	if (ret)
+		PMD_DEV_LOG_ERR(adapter, DRV, "Failed to add MAC filter");
+
+l_end:
+	return ret;
+}
+
+void sxe2_mac_addr_del(struct rte_eth_dev *dev,  uint32_t index)
+{
+	struct sxe2_adapter *adapter = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
+	struct rte_ether_addr *mac_addr = &dev->data->mac_addrs[index];
+	int32_t ret = -1;
+
+	if (rte_is_multicast_ether_addr(mac_addr))
+		ret = sxe2_mc_filter_del(adapter, mac_addr);
+	else
+		ret = sxe2_uc_filter_del(adapter, mac_addr);
+
+	if (ret)
+		PMD_DEV_LOG_ERR(adapter, DRV, "Failed to remove MAC filter");
+}
+
+int32_t sxe2_mac_addr_set(struct rte_eth_dev *dev, struct rte_ether_addr *mac_addr)
+{
+	struct sxe2_adapter *adapter = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
+	int32_t ret = 0;
+	struct rte_ether_addr *old_addr = (struct rte_ether_addr *)&adapter->dev_info.mac.perm_addr;
+	struct rte_ether_addr temp_addr;
+
+	if (rte_is_same_ether_addr(old_addr, mac_addr))
+		goto l_end;
+
+	if (rte_is_multicast_ether_addr(mac_addr)) {
+		PMD_DEV_LOG_ERR(adapter, DRV, "Failed to set multicast addr");
+		ret = -EINVAL;
+		goto l_end;
+	}
+
+	ret = sxe2_uc_filter_del(adapter, old_addr);
+	if (ret) {
+		PMD_DEV_LOG_ERR(adapter, DRV, "Failed to remove MAC filter");
+		goto l_end;
+	}
+
+	rte_ether_addr_copy(old_addr, &temp_addr);
+
+	rte_ether_addr_copy(mac_addr, old_addr);
+
+	ret = sxe2_uc_filter_add(adapter, mac_addr, true);
+	if (ret) {
+		PMD_DEV_LOG_ERR(adapter, DRV, "Failed to add MAC filter");
+		rte_ether_addr_copy(&temp_addr, old_addr);
+		(void)sxe2_uc_filter_add(adapter, old_addr, true);
+		goto l_end;
+	}
+l_end:
+	return ret;
+}
+
+int32_t sxe2_set_mc_addr_list(struct rte_eth_dev *dev,
+			struct rte_ether_addr *mc_addrs,
+			uint32_t mc_addrs_num)
+{
+	struct sxe2_adapter *adapter = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
+	int32_t ret = 0;
+	uint32_t i;
+	const uint8_t *mac;
+
+	if (mc_addrs_num > SXE2_NUM_MACADDR_MAX) {
+		PMD_DEV_LOG_ERR(adapter, DRV, "Too many multicast MAC addresses, ");
+		ret =  -1;
+		goto l_end;
+	}
+
+	sxe2_mc_filter_clear(adapter, false);
+
+	for (i = 0; i < mc_addrs_num; i++) {
+		if (!rte_is_multicast_ether_addr(&mc_addrs[i])) {
+			mac = mc_addrs[i].addr_bytes;
+			PMD_DEV_LOG_ERR(adapter, DRV,
+					"Invalid mac: %02x:%02x:%02x:%02x:%02x:%02x",
+					mac[0], mac[1], mac[2], mac[3], mac[4],
+					mac[5]);
+			ret = -EINVAL;
+			goto add_err;
+		}
+
+		ret = sxe2_mc_filter_add(adapter, &mc_addrs[i], false);
+		if (ret) {
+			PMD_DEV_LOG_ERR(adapter, DRV,
+			    "Failed to remove old multicast MAC filter list");
+			goto add_err;
+		}
+	}
+	goto l_end;
+add_err:
+	sxe2_mc_filter_clear(adapter, false);
+l_end:
+	return ret;
+}
+
+int32_t sxe2_dev_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int32_t on)
+{
+	struct sxe2_adapter *adapter = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
+	struct sxe2_vlan vlan = {
+		.tpid = RTE_ETHER_TYPE_VLAN,
+		.vid = vlan_id,
+		.prio = 0
+	};
+	int32_t ret = 0;
+
+	if (sxe2_dev_port_vlan_check(dev)) {
+		PMD_DEV_LOG_ERR(adapter, DRV, "Filter not supported with Port VLAN");
+		ret = -ENOTSUP;
+		goto l_end;
+	}
+
+	if (vlan_id == 0)
+		goto l_end;
+
+	if (on) {
+		ret = sxe2_vlan_filter_add(adapter, &vlan, false);
+		if (ret < 0) {
+			PMD_DEV_LOG_ERR(adapter, DRV, "Failed to add vlan filter");
+			goto l_end;
+		}
+	} else {
+		ret = sxe2_vlan_filter_del(adapter, &vlan);
+		if (ret < 0) {
+			PMD_DEV_LOG_ERR(adapter, DRV, "Failed to remove vlan filter");
+			goto l_end;
+		}
+	}
+
+l_end:
+	return ret;
+}
+
+int32_t sxe2_dev_vlan_offload_set(struct rte_eth_dev *dev, int32_t mask)
+{
+	struct sxe2_adapter *adapter = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
+	int32_t ret = 0;
+	struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;
+	struct rte_eth_txmode *txmode = &dev->data->dev_conf.txmode;
+	struct sxe2_vlan_info new_info = adapter->filter_ctxt.vlan_info;
+	bool port_vlan = new_info.port_vlan_exist;
+
+	uint8_t out_strip_mask = SXE2_DPDK_OFFLOAD_OUTER_STRIP_8021Q |
+			    SXE2_DPDK_OFFLOAD_OUTER_STRIP_8021AD |
+			    SXE2_DPDK_OFFLOAD_OUTER_STRIP_QINQ1;
+
+	if (txmode->offloads & RTE_ETH_TX_OFFLOAD_QINQ_INSERT) {
+		if (!(txmode->offloads & RTE_ETH_TX_OFFLOAD_VLAN_INSERT)) {
+			PMD_DEV_LOG_ERR(adapter, DRV,
+			    "VLAN INSERT must be enabled when QinQ INSERT is enabled");
+			return -EINVAL;
+		}
+		if (port_vlan) {
+			PMD_DEV_LOG_ERR(adapter, DRV,
+					"QINQ INSERT not supported with Port VLAN");
+			return -EINVAL;
+		}
+	}
+
+	if (mask & RTE_ETH_QINQ_STRIP_MASK) {
+		if (rxmode->offloads & RTE_ETH_RX_OFFLOAD_QINQ_STRIP) {
+			if (port_vlan) {
+				PMD_DEV_LOG_ERR(adapter, DRV,
+						"QinQ strip not supported with Port VLAN");
+				return -EINVAL;
+			}
+			new_info.inner_strip = SXE2_VSI_TSR_ID_VLAN;
+		} else {
+			new_info.inner_strip = 0;
+		}
+	}
+
+	if (mask & RTE_ETH_VLAN_STRIP_MASK) {
+		if (rxmode->offloads & RTE_ETH_RX_OFFLOAD_VLAN_STRIP) {
+			new_info.outer_strip =
+				port_vlan ? 0 : out_strip_mask;
+			new_info.inner_strip =
+				port_vlan ? new_info.inner_strip : new_info.inner_strip;
+		} else {
+			if (new_info.inner_strip != 0) {
+				PMD_DEV_LOG_ERR(adapter, DRV,
+					"Must disable QinQ strip before disabling VLAN strip");
+				return -EINVAL;
+			}
+			new_info.outer_strip = 0;
+		}
+	}
+
+	if (mask & (RTE_ETH_VLAN_STRIP_MASK | RTE_ETH_QINQ_STRIP_MASK)) {
+		struct sxe2_vlan_info old_info = adapter->filter_ctxt.vlan_info;
+		adapter->filter_ctxt.vlan_info = new_info;
+
+		ret = sxe2_drv_vlan_insert_strip_cfg(adapter);
+		if (ret) {
+			adapter->filter_ctxt.vlan_info = old_info;
+			return ret;
+		}
+	}
+	if (mask & RTE_ETH_VLAN_FILTER_MASK) {
+		if (adapter->filter_ctxt.vlan_info.port_vlan_exist) {
+			ret = 0;
+			PMD_DEV_LOG_INFO(adapter, INIT, "vlan filter is not support when port vlan is enabled");
+			goto l_end;
+		}
+
+		ret = sxe2_vlan_filter_ctrl(adapter,
+			    !!(rxmode->offloads & RTE_ETH_RX_OFFLOAD_VLAN_FILTER));
+		if (ret) {
+			PMD_DEV_LOG_ERR(adapter, DRV,
+			    "sxe2_drv_vlan_filter_switch failed ret:%d", ret);
+			goto l_end;
+		}
+	}
+
+	PMD_DEV_LOG_DEBUG(adapter, DRV,
+	    "mask:0x%x rx mode offload:0x%" PRIx64 " vlan offload set done",
+	    mask, rxmode->offloads);
+l_end:
+	return ret;
+}
+
+static int32_t sxe2_vlan_filter_zero(struct sxe2_adapter *adapter)
+{
+	struct sxe2_vlan vlan;
+	int32_t ret;
+	uint16_t tpids[] = {RTE_ETHER_TYPE_VLAN, RTE_ETHER_TYPE_QINQ, RTE_ETHER_TYPE_QINQ1};
+	uint8_t i;
+
+	vlan = (struct sxe2_vlan){0, 0, 0};
+	ret = sxe2_vlan_filter_add(adapter, &vlan, true);
+	if (ret) {
+		PMD_DEV_LOG_ERR(adapter, DRV, "Failed to add VLAN ID 0");
+		goto l_end;
+	}
+
+	for (i = 0; i < RTE_DIM(tpids); i++) {
+		vlan = (struct sxe2_vlan){tpids[i], 0, 0};
+		ret = sxe2_vlan_filter_add(adapter, &vlan, true);
+		if (ret) {
+			PMD_DEV_LOG_ERR(adapter, DRV, "Failed to add VLAN ID 0 when tpid:0x%x",
+					tpids[i]);
+			goto l_end;
+		}
+	}
+
+l_end:
+	return ret;
+}
+
+int32_t sxe2_vlan_cfg_init(struct rte_eth_dev *dev)
+{
+	struct sxe2_adapter *adapter = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
+	int32_t ret = 0;
+
+	ret = sxe2_drv_vlan_config_query(adapter);
+	if (ret) {
+		PMD_DEV_LOG_ERR(adapter, DRV, "Failed to query vlan config, ret=%d", ret);
+		goto l_end;
+	}
+
+	if (!sxe2_dev_port_vlan_check(dev))
+		adapter->filter_ctxt.vlan_info.outer_insert =
+			SXE2_DPDK_OFFLOAD_OUTER_INSERT_8021Q |
+			SXE2_DPDK_OFFLOAD_INSERT_ENABLE;
+	else
+		adapter->filter_ctxt.vlan_info.outer_insert = 0;
+
+	adapter->filter_ctxt.vlan_info.inner_insert =
+			SXE2_DPDK_OFFLOAD_INNER_INSERT_QINQ1 | SXE2_DPDK_OFFLOAD_INSERT_ENABLE;
+
+	if (!sxe2_dev_port_vlan_check(dev)) {
+		ret = sxe2_vlan_filter_zero(adapter);
+		if (ret != 0)
+			PMD_DEV_LOG_ERR(adapter, DRV, "Failed to add vlan filter switch:0 "
+					"for port:%d", adapter->port_idx);
+	}
+
+l_end:
+	return ret;
+}
+
+int32_t sxe2_vlan_default_cfg(struct rte_eth_dev *dev)
+{
+	int32_t ret = 0;
+	struct sxe2_adapter *adapter = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
+
+	ret = sxe2_dev_vlan_offload_set(dev, RTE_ETH_VLAN_STRIP_MASK |
+					RTE_ETH_QINQ_STRIP_MASK |
+					RTE_ETH_VLAN_FILTER_MASK);
+	if (ret)
+		PMD_DEV_LOG_ERR(adapter, DRV, "Failed to cfg vlan offload, ret:%d", ret);
+
+	return ret;
+}
+
+int32_t sxe2_promisc_enable(struct rte_eth_dev *dev)
+{
+	struct sxe2_adapter *adapter = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
+	int32_t ret = 0;
+
+	ret = sxe2_promisc_add(adapter);
+	if (ret)
+		PMD_DEV_LOG_ERR(adapter, DRV, "Failed to enable promisc, ret:%d", ret);
+
+	return ret;
+}
+
+int32_t sxe2_promisc_disable(struct rte_eth_dev *dev)
+{
+	struct sxe2_adapter *adapter = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
+	int32_t ret = 0;
+
+	ret = sxe2_promisc_del(adapter);
+	if (ret)
+		PMD_DEV_LOG_ERR(adapter, DRV, "Failed to disable promisc, ret:%d", ret);
+
+	return ret;
+}
+
+int32_t sxe2_allmulti_enable(struct rte_eth_dev *dev)
+{
+	struct sxe2_adapter *adapter = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
+	int32_t ret = 0;
+
+	ret = sxe2_allmulti_add(adapter);
+	if (ret)
+		PMD_DEV_LOG_ERR(adapter, DRV, "Failed to enable allmulti, ret:%d", ret);
+
+	return ret;
+}
+
+int32_t sxe2_allmulti_disable(struct rte_eth_dev *dev)
+{
+	struct sxe2_adapter *adapter = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
+	int32_t ret = 0;
+
+	ret = sxe2_allmulti_del(adapter);
+	if (ret)
+		PMD_DEV_LOG_ERR(adapter, DRV, "Failed to disable allmulti, ret:%d", ret);
+
+	return ret;
+}
+
 int32_t sxe2_link_update_init(struct rte_eth_dev *dev)
 {
 	struct sxe2_adapter *adapter = SXE2_DEV_PRIVATE_TO_ADAPTER(dev);
diff --git a/drivers/net/sxe2/sxe2_mac.h b/drivers/net/sxe2/sxe2_mac.h
index f2f3edaeff..55fd1829a0 100644
--- a/drivers/net/sxe2/sxe2_mac.h
+++ b/drivers/net/sxe2/sxe2_mac.h
@@ -43,6 +43,40 @@ struct sxe2_mac_mc_list {
 
 int32_t sxe2_link_update_init(struct rte_eth_dev *dev);
 
+int32_t sxe2_mac_default_cfg(struct rte_eth_dev *dev);
+
+int32_t sxe2_vlan_cfg_init(struct rte_eth_dev *dev);
+
+int32_t sxe2_mac_addr_init(struct rte_eth_dev *dev);
+
+void sxe2_mac_addr_uinit(struct rte_eth_dev *dev);
+
+int32_t sxe2_mac_addr_add(struct rte_eth_dev *dev,
+			struct rte_ether_addr *mac_addr,
+			__rte_unused uint32_t index, __rte_unused uint32_t pool);
+
+void sxe2_mac_addr_del(struct rte_eth_dev *dev,  uint32_t index);
+
+int32_t sxe2_mac_addr_set(struct rte_eth_dev *dev, struct rte_ether_addr *mac_addr);
+
+int32_t sxe2_set_mc_addr_list(struct rte_eth_dev *dev,
+	struct rte_ether_addr *mc_addrs,
+	uint32_t mc_addrs_num);
+
+int32_t sxe2_promisc_enable(struct rte_eth_dev *dev);
+
+int32_t sxe2_promisc_disable(struct rte_eth_dev *dev);
+
+int32_t sxe2_allmulti_enable(struct rte_eth_dev *dev);
+
+int32_t sxe2_allmulti_disable(struct rte_eth_dev *dev);
+
+int32_t sxe2_dev_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int32_t on);
+
+int32_t sxe2_dev_vlan_offload_set(struct rte_eth_dev *dev, int32_t mask);
+
+int32_t sxe2_vlan_default_cfg(struct rte_eth_dev *dev);
+
 int32_t sxe2_link_update(struct rte_eth_dev *dev, __rte_unused int32_t wait_to_complete);
 
 int32_t sxe2_mtu_set(struct rte_eth_dev *dev, uint16_t mtu);
diff --git a/drivers/net/sxe2/sxe2_txrx_poll.c b/drivers/net/sxe2/sxe2_txrx_poll.c
index b9d34afb31..21d5c38725 100644
--- a/drivers/net/sxe2/sxe2_txrx_poll.c
+++ b/drivers/net/sxe2/sxe2_txrx_poll.c
@@ -660,6 +660,53 @@ sxe2_rx_desc_error_para(__rte_unused struct sxe2_rx_queue *rxq,
 	return flags;
 }
 
+static inline void sxe2_rx_desc_vlan_para_fill(struct rte_mbuf *mbuf,
+			union sxe2_rx_desc *desc)
+{
+	if (0 == (rte_le_to_cpu_64(desc->wb.status_err_ptype_len) &
+		  SXE2_RX_DESC_STATUS_L2TAG1_P_MASK)) {
+		mbuf->vlan_tci = 0;
+	} else {
+		mbuf->ol_flags |= (RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED);
+		mbuf->vlan_tci = rte_le_to_cpu_16(desc->wb.l2tag1);
+		PMD_LOG_DEBUG(RX, "Rx desc mbuf vlan, vlan_tci:%u",
+			mbuf->vlan_tci);
+	}
+#ifndef RTE_LIBRTE_SXE2_16BYTE_RX_DESC
+	if (0 == (rte_le_to_cpu_32(desc->wb.status_lrocnt_fdpf_id) &
+				SXE2_RX_DESC_EXT_STATUS_L2TAG2P_MASK)) {
+		mbuf->vlan_tci_outer = 0;
+	} else {
+		mbuf->ol_flags |= RTE_MBUF_F_RX_QINQ_STRIPPED | RTE_MBUF_F_RX_QINQ |
+				RTE_MBUF_F_RX_VLAN_STRIPPED | RTE_MBUF_F_RX_VLAN;
+		mbuf->vlan_tci_outer = mbuf->vlan_tci;
+		mbuf->vlan_tci = rte_le_to_cpu_16(desc->wb.l2tag2_2nd);
+		PMD_LOG_DEBUG(RX, "Rx desc out vlan, l2tag2_1st:%u l2tag2_2nd:%u.",
+				rte_le_to_cpu_16(desc->wb.l2tag2_1st),
+				rte_le_to_cpu_16(desc->wb.l2tag2_2nd));
+	}
+#endif
+}
+
+static inline void
+sxe2_rx_desc_filter_para_fill(struct sxe2_rx_queue *rxq __rte_unused,
+		struct rte_mbuf *mbuf, union sxe2_rx_desc *desc)
+{
+	if (SXE2_RX_DESC_STATUS_RSS_VLD_MASK &
+				rte_le_to_cpu_64(desc->wb.status_err_ptype_len)) {
+		mbuf->ol_flags |= RTE_MBUF_F_RX_RSS_HASH;
+		mbuf->hash.rss = rte_le_to_cpu_32(desc->wb.filter_status);
+		PMD_LOG_DEBUG(RX, "rss id:%u", mbuf->hash.rss);
+	}
+#ifndef RTE_LIBRTE_SXE2_16BYTE_RX_DESC
+	if (SXE2_RX_DESC_FD_VLD_MASK & desc->wb.rxdid_src) {
+		mbuf->ol_flags |= (RTE_MBUF_F_RX_FDIR | RTE_MBUF_F_RX_FDIR_ID);
+		mbuf->hash.fdir.hi = rte_le_to_cpu_32(desc->wb.fd_filter_id);
+		PMD_LOG_DEBUG(RX, "fdir id:%u", mbuf->hash.fdir.hi);
+	}
+#endif
+}
+
 static __rte_always_inline void
 sxe2_rx_mbuf_common_fields_fill(struct sxe2_rx_queue *rxq, struct rte_mbuf *mbuf,
 		union sxe2_rx_desc *rxd)
@@ -673,6 +720,8 @@ sxe2_rx_mbuf_common_fields_fill(struct sxe2_rx_queue *rxq, struct rte_mbuf *mbuf
 	mbuf->packet_type = ptype_tbl[SXE2_RX_DESC_PTYPE_VAL_GET(qword1)];
 
 	pkt_flags = sxe2_rx_desc_error_para(rxq, rxd);
+	sxe2_rx_desc_vlan_para_fill(mbuf, rxd);
+	sxe2_rx_desc_filter_para_fill(rxq, mbuf, rxd);
 
 	mbuf->ol_flags |= pkt_flags;
 }
-- 
2.52.0


^ permalink raw reply related

* [PATCH v2 01/20] net/sxe2: support AVX512 vectorized path for Rx and Tx
From: liujie5 @ 2026-06-14  9:23 UTC (permalink / raw)
  To: stephen; +Cc: dev, Jie Liu
In-Reply-To: <20260614092328.201826-1-liujie5@linkdatatechnology.com>

From: Jie Liu <liujie5@linkdatatechnology.com>

Add AVX512 vector data path for Rx and Tx burst functions.
The decision to use AVX512 is based on:
1. CPU hardware flags (AVX512F, AVX512BW).
2. Compiler support (CC_AVX512_SUPPORT).
3. Max SIMD bitwidth configuration.

Performance shows approximately X% improvement in small packet
forwarding scenarios.

Signed-off-by: Jie Liu <liujie5@linkdatatechnology.com>
---
 drivers/net/sxe2/meson.build            |  24 +
 drivers/net/sxe2/sxe2_drv_cmd.h         |  80 +--
 drivers/net/sxe2/sxe2_ethdev.c          |   2 +-
 drivers/net/sxe2/sxe2_txrx.c            |  92 ++-
 drivers/net/sxe2/sxe2_txrx_vec.c        |  46 +-
 drivers/net/sxe2/sxe2_txrx_vec.h        |  18 +-
 drivers/net/sxe2/sxe2_txrx_vec_avx512.c | 897 ++++++++++++++++++++++++
 7 files changed, 1099 insertions(+), 60 deletions(-)
 create mode 100644 drivers/net/sxe2/sxe2_txrx_vec_avx512.c

diff --git a/drivers/net/sxe2/meson.build b/drivers/net/sxe2/meson.build
index 6b2eb75b0e..7bd0d8120c 100644
--- a/drivers/net/sxe2/meson.build
+++ b/drivers/net/sxe2/meson.build
@@ -15,6 +15,30 @@ includes += include_directories('../../common/sxe2')
 
 if arch_subdir == 'x86'
         sources += files('sxe2_txrx_vec_sse.c')
+
+        sxe2_avx512_cpu_support =(
+                cc.get_define('__AVX512F__', args: machine_args) != '' and
+                cc.get_define('__AVX512BW__', args: machine_args) != '')
+
+        sxe2_avx512_cc_support = (
+                not machine_args.contains('-mno-avx512f') and
+                cc.has_argument('-mavx512f') and
+                cc.has_argument('-mavx512bw'))
+
+        if sxe2_avx512_cpu_support == true or sxe2_avx512_cc_support == true
+                cflags += ['-DCC_AVX512_SUPPORT']
+                avx512_args = [cflags, '-mavx512f', '-mavx512bw']
+                if cc.has_argument('-march=skylake-avx512')
+                        avx512_args += '-march=skylake-avx512'
+                endif
+                sxe2_avx512_lib = static_library('sxe2_avx512_lib', 'sxe2_txrx_vec_avx512.c',
+                        dependencies: [static_rte_ethdev,
+                        static_rte_kvargs, static_rte_hash,
+                        static_rte_security, static_rte_cryptodev, static_rte_bus_pci],
+                        include_directories: includes,
+                        c_args: avx512_args)
+                objs += sxe2_avx512_lib.extract_objects('sxe2_txrx_vec_avx512.c')
+        endif
 endif
 
 sources += files(
diff --git a/drivers/net/sxe2/sxe2_drv_cmd.h b/drivers/net/sxe2/sxe2_drv_cmd.h
index bba6476c2e..ccc9c20ef4 100644
--- a/drivers/net/sxe2/sxe2_drv_cmd.h
+++ b/drivers/net/sxe2/sxe2_drv_cmd.h
@@ -67,20 +67,20 @@ enum sxe2_dev_type {
 	SXE2_DEV_T_MAX,
 };
 
-struct sxe2_drv_queue_caps {
+struct __rte_aligned(4) __rte_packed_begin sxe2_drv_queue_caps {
 	uint16_t queues_cnt;
 	uint16_t base_idx_in_pf;
-};
+} __rte_packed_end;
 
-struct sxe2_drv_msix_caps {
+struct __rte_aligned(4) __rte_packed_begin sxe2_drv_msix_caps {
 	uint16_t msix_vectors_cnt;
 	uint16_t base_idx_in_func;
-};
+} __rte_packed_end;
 
-struct sxe2_drv_rss_hash_caps {
+struct __rte_aligned(4) __rte_packed_begin sxe2_drv_rss_hash_caps {
 	uint16_t hash_key_size;
 	uint16_t lut_key_size;
-};
+} __rte_packed_end;
 
 enum sxe2_vf_vsi_valid {
 	SXE2_VF_VSI_BOTH = 0,
@@ -89,18 +89,18 @@ enum sxe2_vf_vsi_valid {
 	SXE2_VF_VSI_MAX,
 };
 
-struct sxe2_drv_vsi_caps {
+struct __rte_aligned(4) __rte_packed_begin sxe2_drv_vsi_caps {
 	uint16_t func_id;
 	uint16_t dpdk_vsi_id;
 	uint16_t kernel_vsi_id;
 	uint16_t vsi_type;
-};
+} __rte_packed_end;
 
-struct sxe2_drv_representor_caps {
+struct __rte_aligned(4) __rte_packed_begin sxe2_drv_representor_caps {
 	uint16_t cnt_repr_vf;
 	uint8_t rsv[2];
 	struct sxe2_drv_vsi_caps repr_vf_id[256];
-};
+} __rte_packed_end;
 
 enum sxe2_phys_port_name_type {
 	SXE2_PHYS_PORT_NAME_TYPE_NOTSET = 0,
@@ -111,25 +111,25 @@ enum sxe2_phys_port_name_type {
 	SXE2_PHYS_PORT_NAME_TYPE_UNKNOWN,
 };
 
-struct sxe2_switchdev_mode_info {
+struct __rte_aligned(4) __rte_packed_begin sxe2_switchdev_mode_info {
 	uint8_t pf_id;
 	uint8_t is_switchdev;
 	uint8_t rsv[2];
-};
+} __rte_packed_end;
 
-struct sxe2_switchdev_cpvsi_info {
+struct __rte_aligned(4) __rte_packed_begin sxe2_switchdev_cpvsi_info {
 	uint16_t cp_vsi_id;
 	uint8_t rsv[2];
-};
+} __rte_packed_end;
 
-struct sxe2_txsch_caps {
+struct __rte_aligned(4) __rte_packed_begin sxe2_txsch_caps {
 	uint8_t layer_cap;
 	uint8_t tm_mid_node_num;
 	uint8_t prio_num;
 	uint8_t rev;
-};
+} __rte_packed_end;
 
-struct sxe2_drv_dev_caps_resp {
+struct __rte_aligned(4) __rte_packed_begin sxe2_drv_dev_caps_resp {
 	struct sxe2_drv_queue_caps queue_caps;
 	struct sxe2_drv_msix_caps msix_caps;
 	struct sxe2_drv_rss_hash_caps rss_hash_caps;
@@ -141,24 +141,24 @@ struct sxe2_drv_dev_caps_resp {
 	uint8_t dev_type;
 	uint8_t rev;
 	uint32_t cap_flags;
-};
+} __rte_packed_end;
 
-struct sxe2_drv_dev_info_resp {
+struct __rte_aligned(4) __rte_packed_begin sxe2_drv_dev_info_resp {
 	uint64_t dsn;
 	uint16_t vsi_id;
 	uint8_t rsv[2];
 	uint8_t mac_addr[SXE2_ETH_ALEN];
 	uint8_t rsv2[2];
-};
+} __rte_packed_end;
 
-struct sxe2_drv_dev_fw_info_resp {
+struct __rte_aligned(4) __rte_packed_begin sxe2_drv_dev_fw_info_resp {
 	uint8_t main_version_id;
 	uint8_t sub_version_id;
 	uint8_t fix_version_id;
 	uint8_t build_id;
-};
+} __rte_packed_end;
 
-struct sxe2_drv_rxq_ctxt {
+struct __rte_aligned(4) __rte_packed_begin sxe2_drv_rxq_ctxt {
 	uint64_t dma_addr;
 	uint32_t max_lro_size;
 	uint32_t split_type_mask;
@@ -170,62 +170,62 @@ struct sxe2_drv_rxq_ctxt {
 	uint8_t keep_crc_en;
 	uint8_t split_en;
 	uint8_t desc_size;
-};
+} __rte_packed_end;
 
-struct sxe2_drv_rxq_cfg_req {
+struct __rte_aligned(4) __rte_packed_begin sxe2_drv_rxq_cfg_req {
 	uint16_t q_cnt;
 	uint16_t vsi_id;
 	uint16_t max_frame_size;
 	uint8_t rsv[2];
 	struct sxe2_drv_rxq_ctxt cfg[];
-};
+} __rte_packed_end;
 
-struct sxe2_drv_txq_ctxt {
+struct __rte_aligned(4) __rte_packed_begin sxe2_drv_txq_ctxt {
 	uint64_t dma_addr;
 	uint32_t sched_mode;
 	uint16_t queue_id;
 	uint16_t depth;
 	uint16_t vsi_id;
 	uint8_t rsv[2];
-};
+} __rte_packed_end;
 
-struct sxe2_drv_txq_cfg_req {
+struct __rte_aligned(4) __rte_packed_begin sxe2_drv_txq_cfg_req {
 	uint16_t q_cnt;
 	uint16_t vsi_id;
 	struct sxe2_drv_txq_ctxt cfg[];
-};
+} __rte_packed_end;
 
-struct sxe2_drv_q_switch_req {
+struct __rte_aligned(4) __rte_packed_begin sxe2_drv_q_switch_req {
 	uint16_t q_idx;
 	uint16_t vsi_id;
 	uint8_t is_enable;
 	uint8_t sched_mode;
 	uint8_t rsv[2];
-};
+} __rte_packed_end;
 
-struct sxe2_drv_vsi_create_req_resp {
+struct __rte_aligned(4) __rte_packed_begin sxe2_drv_vsi_create_req_resp {
 	uint16_t vsi_id;
 	uint16_t vsi_type;
 	struct sxe2_drv_queue_caps used_queues;
 	struct sxe2_drv_msix_caps used_msix;
-};
+} __rte_packed_end;
 
-struct sxe2_drv_vsi_free_req {
+struct __rte_aligned(4) __rte_packed_begin sxe2_drv_vsi_free_req {
 	uint16_t vsi_id;
 	uint8_t rsv[2];
-};
+} __rte_packed_end;
 
-struct sxe2_drv_vsi_info_get_req {
+struct __rte_aligned(4) __rte_packed_begin sxe2_drv_vsi_info_get_req {
 	uint16_t vsi_id;
 	uint8_t rsv[2];
-};
+} __rte_packed_end;
 
-struct sxe2_drv_vsi_info_get_resp {
+struct __rte_aligned(4) __rte_packed_begin sxe2_drv_vsi_info_get_resp {
 	uint16_t vsi_id;
 	uint16_t vsi_type;
 	struct sxe2_drv_queue_caps used_queues;
 	struct sxe2_drv_msix_caps used_msix;
-};
+} __rte_packed_end;
 
 enum sxe2_drv_cmd_module {
 	SXE2_DRV_CMD_MODULE_HANDSHAKE = 0,
diff --git a/drivers/net/sxe2/sxe2_ethdev.c b/drivers/net/sxe2/sxe2_ethdev.c
index b6cc8703a7..066e1faf7e 100644
--- a/drivers/net/sxe2/sxe2_ethdev.c
+++ b/drivers/net/sxe2/sxe2_ethdev.c
@@ -891,7 +891,7 @@ static int32_t sxe2_eth_pmd_probe_pf(struct sxe2_common_device *cdev,
 static int32_t sxe2_parse_eth_devargs(struct rte_device *dev,
 			  struct rte_eth_devargs *eth_da)
 {
-	int ret = 0;
+	int32_t ret = 0;
 
 	if (dev->devargs == NULL)
 		return 0;
diff --git a/drivers/net/sxe2/sxe2_txrx.c b/drivers/net/sxe2/sxe2_txrx.c
index 8d17535301..aa1c474088 100644
--- a/drivers/net/sxe2/sxe2_txrx.c
+++ b/drivers/net/sxe2/sxe2_txrx.c
@@ -157,6 +157,19 @@ void sxe2_tx_mode_func_set(struct rte_eth_dev *dev)
 		if (ret == 0 &&
 		    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
 			tx_mode_flags = vec_flags;
+#ifdef RTE_ARCH_X86
+			if ((rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512) &&
+			    (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+			    (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1)) {
+#ifdef CC_AVX512_SUPPORT
+				tx_mode_flags |= SXE2_TX_MODE_VEC_AVX512;
+#else
+				PMD_LOG_INFO(TX, "AVX512 is not supported in build env.");
+#endif
+			}
+			if ((tx_mode_flags & SXE2_TX_MODE_VEC_SET_MASK) == 0)
+				tx_mode_flags |= SXE2_TX_MODE_VEC_SSE;
+#endif
 			if (tx_mode_flags & SXE2_TX_MODE_VEC_SET_MASK) {
 				ret = sxe2_tx_queues_vec_prepare(dev);
 				if (ret != 0)
@@ -172,14 +185,25 @@ void sxe2_tx_mode_func_set(struct rte_eth_dev *dev)
 		tx_mode_flags = adapter->q_ctxt.tx_mode_flags;
 	}
 
-#ifdef RTE_ARCH_X86
 	if (tx_mode_flags & SXE2_TX_MODE_VEC_SET_MASK) {
-		if (tx_mode_flags & SXE2_TX_MODE_VEC_OFFLOAD) {
-			dev->tx_pkt_prepare = sxe2_tx_pkts_prepare;
-			dev->tx_pkt_burst = sxe2_tx_pkts_vec_sse;
+		dev->tx_pkt_prepare = NULL;
+#ifdef RTE_ARCH_X86
+		if (tx_mode_flags & SXE2_TX_MODE_VEC_AVX512) {
+#ifdef CC_AVX512_SUPPORT
+			if (tx_mode_flags & SXE2_TX_MODE_VEC_OFFLOAD) {
+				dev->tx_pkt_prepare = sxe2_tx_pkts_prepare;
+				dev->tx_pkt_burst = sxe2_tx_pkts_vec_avx512;
+			} else {
+				dev->tx_pkt_burst = sxe2_tx_pkts_vec_avx512_simple;
+			}
+#endif
 		} else {
-			dev->tx_pkt_prepare = NULL;
-			dev->tx_pkt_burst = sxe2_tx_pkts_vec_sse_simple;
+			if (tx_mode_flags & SXE2_TX_MODE_VEC_OFFLOAD) {
+				dev->tx_pkt_prepare = sxe2_tx_pkts_prepare;
+				dev->tx_pkt_burst = sxe2_tx_pkts_vec_sse;
+			} else {
+				dev->tx_pkt_burst = sxe2_tx_pkts_vec_sse_simple;
+			}
 		}
 	} else {
 #endif
@@ -201,8 +225,16 @@ static const struct {
 } sxe2_tx_burst_infos[] = {
 	{ sxe2_tx_pkts,   "Scalar" },
 #ifdef RTE_ARCH_X86
-	{ sxe2_tx_pkts_vec_sse,        "Vector SSE" },
-	{ sxe2_tx_pkts_vec_sse_simple, "Vector SSE Simple" },
+#ifdef CC_AVX512_SUPPORT
+	{ sxe2_tx_pkts_vec_avx512,
+	      "Vector AVX512" },
+	{ sxe2_tx_pkts_vec_avx512_simple,
+	      "Vector AVX512 Simple" },
+#endif
+	{ sxe2_tx_pkts_vec_sse,
+	      "Vector SSE" },
+	{ sxe2_tx_pkts_vec_sse_simple,
+	      "Vector SSE Simple" },
 #endif
 };
 
@@ -288,6 +320,20 @@ void sxe2_rx_mode_func_set(struct rte_eth_dev *dev)
 		if (ret == 0 &&
 		    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
 			rx_mode_flags = vec_flags;
+#ifdef RTE_ARCH_X86
+			if ((rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512) &&
+				(rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+				(rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1)) {
+#ifdef CC_AVX512_SUPPORT
+				rx_mode_flags |= SXE2_RX_MODE_VEC_AVX512;
+#else
+				PMD_LOG_INFO(RX, "AVX512 support detected but not enabled");
+#endif
+			}
+			if ((rx_mode_flags & SXE2_RX_MODE_VEC_SET_MASK) == 0 &&
+				rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128)
+				rx_mode_flags |= SXE2_RX_MODE_VEC_SSE;
+#endif
 			if ((rx_mode_flags & SXE2_RX_MODE_VEC_SET_MASK) != 0) {
 				ret = sxe2_rx_queues_vec_prepare(dev);
 				if (ret != 0)
@@ -301,7 +347,16 @@ void sxe2_rx_mode_func_set(struct rte_eth_dev *dev)
 
 #ifdef RTE_ARCH_X86
 	if (rx_mode_flags & SXE2_RX_MODE_VEC_SET_MASK) {
-		dev->rx_pkt_burst = sxe2_rx_pkts_scattered_vec_sse_offload;
+		if (rx_mode_flags & SXE2_RX_MODE_VEC_AVX512) {
+#ifdef CC_AVX512_SUPPORT
+			if (rx_mode_flags & SXE2_RX_MODE_VEC_OFFLOAD)
+				dev->rx_pkt_burst = sxe2_rx_pkts_scattered_vec_avx512_offload;
+			else
+				dev->rx_pkt_burst = sxe2_rx_pkts_scattered_vec_avx512;
+#endif
+		} else {
+			dev->rx_pkt_burst = sxe2_rx_pkts_scattered_vec_sse_offload;
+		}
 		return;
 	}
 #endif
@@ -315,19 +370,30 @@ static const struct {
 	eth_rx_burst_t rx_burst;
 	const char *info;
 } sxe2_rx_burst_infos[] = {
-	{ sxe2_rx_pkts_scattered,          "Scalar Scattered" },
-	{ sxe2_rx_pkts_scattered_split,          "Scalar Scattered split" },
+	{ sxe2_rx_pkts_scattered,
+	      "Scalar Scattered" },
+	{ sxe2_rx_pkts_scattered_split,
+	      "Scalar Scattered split" },
 #ifdef RTE_ARCH_X86
-	{ sxe2_rx_pkts_scattered_vec_sse_offload,      "Vector SSE Scattered" },
+#ifdef CC_AVX512_SUPPORT
+	{ sxe2_rx_pkts_scattered_vec_avx512,
+	      "Vector AVX512 Scattered" },
+	{ sxe2_rx_pkts_scattered_vec_avx512_offload,
+	      "Offload Vector AVX512 Scattered" },
+#endif
+	{ sxe2_rx_pkts_scattered_vec_sse_offload,
+	      "Vector SSE Scattered" },
 #endif
 };
 
 int32_t sxe2_rx_burst_mode_get(struct rte_eth_dev *dev,
-			__rte_unused uint16_t queue_id, struct rte_eth_burst_mode *mode)
+			       __rte_unused uint16_t queue_id,
+			       struct rte_eth_burst_mode *mode)
 {
 	eth_rx_burst_t pkt_burst = dev->rx_pkt_burst;
 	int32_t ret = -EINVAL;
 	uint32_t i, size;
+
 	size = RTE_DIM(sxe2_rx_burst_infos);
 	for (i = 0; i < size; ++i) {
 		if (pkt_burst == sxe2_rx_burst_infos[i].rx_burst) {
diff --git a/drivers/net/sxe2/sxe2_txrx_vec.c b/drivers/net/sxe2/sxe2_txrx_vec.c
index 8df4954d86..cf004f5eb2 100644
--- a/drivers/net/sxe2/sxe2_txrx_vec.c
+++ b/drivers/net/sxe2/sxe2_txrx_vec.c
@@ -165,16 +165,54 @@ static void sxe2_tx_queue_mbufs_release_vec(struct sxe2_tx_queue *txq)
 		return;
 	}
 	i = txq->next_dd - (txq->rs_thresh - 1);
-	buffer = txq->buffer_ring;
-	if (txq->next_use < i) {
-		for ( ; i < txq->ring_depth; ++i) {
+#ifdef CC_AVX512_SUPPORT
+	struct rte_eth_dev *dev;
+	struct sxe2_tx_buffer_vec *buffer_vec;
+
+	dev = &rte_eth_devices[txq->port_id];
+
+	if (dev->tx_pkt_burst == sxe2_tx_pkts_vec_avx512 ||
+		dev->tx_pkt_burst == sxe2_tx_pkts_vec_avx512_simple) {
+		buffer_vec = (struct sxe2_tx_buffer_vec *)txq->buffer_ring;
+
+		if (txq->next_use < i) {
+			for ( ; i < txq->ring_depth; ++i) {
+				if (buffer_vec[i].mbuf != NULL) {
+					rte_pktmbuf_free_seg(buffer_vec[i].mbuf);
+					buffer_vec[i].mbuf = NULL;
+				}
+			}
+			i = 0;
+		}
+		for ( ; i < txq->next_use; ++i) {
+			if (buffer_vec[i].mbuf != NULL) {
+				rte_pktmbuf_free_seg(buffer_vec[i].mbuf);
+				buffer_vec[i].mbuf = NULL;
+			}
+		}
+	} else {
+#endif
+		buffer = txq->buffer_ring;
+		buffer = txq->buffer_ring;
+		if (txq->next_use < i) {
+			for ( ; i < txq->ring_depth; ++i) {
+				if (buffer[i].mbuf != NULL) {
+					rte_pktmbuf_free_seg(buffer[i].mbuf);
+					buffer[i].mbuf = NULL;
+				}
+			}
+			i = 0;
+		}
+		for (; i < txq->next_use; ++i) {
 			if (buffer[i].mbuf != NULL) {
 				rte_pktmbuf_free_seg(buffer[i].mbuf);
 				buffer[i].mbuf = NULL;
 			}
 		}
-		i = 0;
+#ifdef CC_AVX512_SUPPORT
 	}
+#endif
+
 	for (; i < txq->next_use; ++i) {
 		if (buffer[i].mbuf != NULL) {
 			rte_pktmbuf_free_seg(buffer[i].mbuf);
diff --git a/drivers/net/sxe2/sxe2_txrx_vec.h b/drivers/net/sxe2/sxe2_txrx_vec.h
index 04ff4d96a5..af7c8d12b2 100644
--- a/drivers/net/sxe2/sxe2_txrx_vec.h
+++ b/drivers/net/sxe2/sxe2_txrx_vec.h
@@ -11,15 +11,19 @@
 #define SXE2_RX_MODE_VEC_SIMPLE    RTE_BIT32(0)
 #define SXE2_RX_MODE_VEC_OFFLOAD   RTE_BIT32(1)
 #define SXE2_RX_MODE_VEC_SSE       RTE_BIT32(2)
+#define SXE2_RX_MODE_VEC_AVX512    RTE_BIT32(4)
 #define SXE2_RX_MODE_BATCH_ALLOC   RTE_BIT32(10)
 #define SXE2_RX_MODE_VEC_SET_MASK	(SXE2_RX_MODE_VEC_SIMPLE | \
-			SXE2_RX_MODE_VEC_OFFLOAD | SXE2_RX_MODE_VEC_SSE)
+			SXE2_RX_MODE_VEC_OFFLOAD | SXE2_RX_MODE_VEC_SSE | \
+			SXE2_RX_MODE_VEC_AVX512)
 #define SXE2_TX_MODE_VEC_SIMPLE   RTE_BIT32(0)
 #define SXE2_TX_MODE_VEC_OFFLOAD  RTE_BIT32(1)
 #define SXE2_TX_MODE_VEC_SSE      RTE_BIT32(2)
+#define SXE2_TX_MODE_VEC_AVX512   RTE_BIT32(4)
 #define SXE2_TX_MODE_SIMPLE_BATCH RTE_BIT32(10)
 #define SXE2_TX_MODE_VEC_SET_MASK	(SXE2_TX_MODE_VEC_SIMPLE | \
-			SXE2_TX_MODE_VEC_OFFLOAD | SXE2_TX_MODE_VEC_SSE)
+			SXE2_TX_MODE_VEC_OFFLOAD | SXE2_TX_MODE_VEC_SSE | \
+			SXE2_TX_MODE_VEC_AVX512)
 #define SXE2_TX_VEC_NO_SUPPORT_OFFLOAD (		  \
 			RTE_ETH_TX_OFFLOAD_MULTI_SEGS |		  \
 			RTE_ETH_TX_OFFLOAD_QINQ_INSERT |	  \
@@ -54,6 +58,16 @@ uint16_t sxe2_tx_pkts_vec_sse(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_
 uint16_t sxe2_tx_pkts_vec_sse_simple(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts);
 uint16_t sxe2_rx_pkts_scattered_vec_sse_offload(void *rx_queue,
 		struct rte_mbuf **rx_pkts, uint16_t nb_pkts);
+uint16_t sxe2_tx_pkts_vec_avx512_simple(void *tx_queue,
+		struct rte_mbuf **tx_pkts, uint16_t nb_pkts);
+uint16_t sxe2_tx_pkts_vec_avx512(void *tx_queue,
+		struct rte_mbuf **tx_pkts, uint16_t nb_pkts);
+uint16_t sxe2_tx_pkts_vec_avx512_ctx_offload(void *tx_queue,
+		struct rte_mbuf **tx_pkts, uint16_t nb_pkts);
+uint16_t sxe2_rx_pkts_scattered_vec_avx512(void *rx_queue,
+		struct rte_mbuf **rx_pkts, uint16_t nb_pkts);
+uint16_t sxe2_rx_pkts_scattered_vec_avx512_offload(void *rx_queue,
+		struct rte_mbuf **rx_pkts, uint16_t nb_pkts);
 #endif
 int32_t __rte_cold sxe2_tx_vec_support_check(struct rte_eth_dev *dev, uint32_t *vec_flags);
 int32_t __rte_cold sxe2_tx_queues_vec_prepare(struct rte_eth_dev *dev);
diff --git a/drivers/net/sxe2/sxe2_txrx_vec_avx512.c b/drivers/net/sxe2/sxe2_txrx_vec_avx512.c
new file mode 100644
index 0000000000..2aec8037dd
--- /dev/null
+++ b/drivers/net/sxe2/sxe2_txrx_vec_avx512.c
@@ -0,0 +1,897 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (C), 2025, Wuxi Stars Micro System Technologies Co., Ltd.
+ */
+
+#ifndef SXE2_TEST
+#include <rte_vect.h>
+
+#include "sxe2_ethdev.h"
+#include "sxe2_common_log.h"
+#include "sxe2_queue.h"
+#include "sxe2_txrx_vec.h"
+#include "sxe2_txrx_vec_common.h"
+#include "sxe2_vsi.h"
+
+static __rte_always_inline int32_t sxe2_tx_bufs_free_vec_avx512(struct sxe2_tx_queue *txq)
+{
+	struct sxe2_tx_buffer_vec *buffer;
+	struct rte_mbuf *mbuf;
+	struct rte_mbuf *mbuf_free_arr[SXE2_TX_FREE_BUFFER_SIZE_MAX_VEC];
+	struct rte_mempool *mp;
+	struct rte_mempool_cache *cache;
+	void **cache_objs;
+	uint32_t copied;
+	uint32_t i;
+	int32_t ret;
+	uint16_t rs_thresh;
+	uint16_t free_num;
+
+	if (rte_cpu_to_le_64(SXE2_TX_DESC_DTYPE_DESC_DONE) !=
+		(txq->desc_ring[txq->next_dd].wb.dd &
+			rte_cpu_to_le_64(SXE2_TX_DESC_DTYPE_MASK))) {
+		ret = 0;
+		goto l_end;
+	}
+
+	rs_thresh = txq->rs_thresh;
+
+	buffer = (struct sxe2_tx_buffer_vec *)txq->buffer_ring;
+	buffer += txq->next_dd - (rs_thresh - 1);
+
+	if ((txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) &&
+			(rs_thresh & 31) == 0) {
+		mp = buffer[0].mbuf->pool;
+		cache = rte_mempool_default_cache(mp, rte_lcore_id());
+
+		if (cache == NULL || cache->len)
+			goto normal;
+
+		if (rs_thresh > RTE_MEMPOOL_CACHE_MAX_SIZE) {
+			(void)rte_mempool_ops_enqueue_bulk(mp, (void *)buffer, rs_thresh);
+			goto done;
+		}
+		cache_objs = &cache->objs[cache->len];
+
+		copied = 0;
+		while (copied < rs_thresh) {
+			const __m512i objs0 = _mm512_loadu_si512(&buffer[copied]);
+			const __m512i objs1 = _mm512_loadu_si512(&buffer[copied + 8]);
+			const __m512i objs2 = _mm512_loadu_si512(&buffer[copied + 16]);
+			const __m512i objs3 = _mm512_loadu_si512(&buffer[copied + 24]);
+
+			_mm512_storeu_si512(&cache_objs[copied], objs0);
+			_mm512_storeu_si512(&cache_objs[copied + 8], objs1);
+			_mm512_storeu_si512(&cache_objs[copied + 16], objs2);
+			_mm512_storeu_si512(&cache_objs[copied + 24], objs3);
+			copied += 32;
+		}
+		cache->len += rs_thresh;
+
+		if (cache->len >= cache->flushthresh) {
+			(void)rte_mempool_ops_enqueue_bulk(mp,
+					&cache->objs[cache->size], cache->len - cache->size);
+			cache->len = cache->size;
+		}
+		goto done;
+	}
+
+normal:
+	mbuf = rte_pktmbuf_prefree_seg(buffer[0].mbuf);
+
+	if (likely(mbuf)) {
+		mbuf_free_arr[0] = mbuf;
+		free_num = 1;
+
+		for (i = 1; i < rs_thresh; ++i) {
+			mbuf = rte_pktmbuf_prefree_seg(buffer[i].mbuf);
+
+			if (likely(mbuf)) {
+				if (likely(mbuf->pool == mbuf_free_arr[0]->pool)) {
+					mbuf_free_arr[free_num] = mbuf;
+					free_num++;
+				} else {
+					rte_mempool_put_bulk(mbuf_free_arr[0]->pool,
+						(void *)mbuf_free_arr, free_num);
+
+				mbuf_free_arr[0] = mbuf;
+				free_num = 1;
+			}
+			}
+		}
+
+		rte_mempool_put_bulk(mbuf_free_arr[0]->pool,
+						(void *)mbuf_free_arr, free_num);
+	} else {
+		for (i = 1; i < rs_thresh; ++i) {
+			mbuf = rte_pktmbuf_prefree_seg(buffer[i].mbuf);
+			if (mbuf != NULL)
+				rte_mempool_put(mbuf->pool, mbuf);
+		}
+	}
+
+done:
+	txq->desc_free_num += txq->rs_thresh;
+	txq->next_dd       += txq->rs_thresh;
+	if (txq->next_dd >= txq->ring_depth)
+		txq->next_dd = txq->rs_thresh - 1;
+	ret = rs_thresh;
+
+l_end:
+	return ret;
+}
+
+static __rte_always_inline void
+sxe2_tx_desc_fill_one_avx512(volatile union sxe2_tx_data_desc *desc, struct rte_mbuf *pkt,
+	uint64_t desc_cmd, bool with_offloads)
+{
+	__m128i data_desc;
+	uint64_t desc_qw1;
+	uint32_t desc_offset;
+
+	desc_qw1 = (SXE2_TX_DESC_DTYPE_DATA |
+				((uint64_t)desc_cmd) << SXE2_TX_DATA_DESC_CMD_SHIFT |
+				((uint64_t)pkt->data_len) << SXE2_TX_DATA_DESC_BUF_SZ_SHIFT);
+	desc_offset = SXE2_TX_DATA_DESC_MACLEN_VAL(pkt->l2_len);
+	desc_qw1 |= ((uint64_t)desc_offset) << SXE2_TX_DATA_DESC_OFFSET_SHIFT;
+	if (with_offloads)
+		sxe2_tx_desc_fill_offloads(pkt, &desc_qw1);
+
+	data_desc = _mm_set_epi64x(desc_qw1, rte_pktmbuf_iova(pkt));
+
+	_mm_store_si128(RTE_CAST_PTR(__m128i *, desc), data_desc);
+}
+
+static __rte_always_inline
+void sxe2_tx_desc_fill_avx512(volatile union sxe2_tx_data_desc *desc, struct rte_mbuf **pkts,
+	uint16_t pkts_num, uint64_t desc_cmd, bool with_offloads)
+{
+	__m512i desc_group;
+	uint64_t desc0_qw1;
+	uint64_t desc1_qw1;
+	uint64_t desc2_qw1;
+	uint64_t desc3_qw1;
+
+	const uint64_t desc_qw1_com = (SXE2_TX_DESC_DTYPE_DATA |
+					((uint64_t)desc_cmd) << SXE2_TX_DATA_DESC_CMD_SHIFT);
+	uint32_t desc_offset[4] = {0};
+
+	while (pkts_num > 3) {
+		desc3_qw1 = desc_qw1_com |
+				((uint64_t)pkts[3]->data_len) << SXE2_TX_DATA_DESC_BUF_SZ_SHIFT;
+
+		desc_offset[3] = SXE2_TX_DATA_DESC_MACLEN_VAL(pkts[3]->l2_len);
+		desc3_qw1 |= ((uint64_t)desc_offset[3]) << SXE2_TX_DATA_DESC_OFFSET_SHIFT;
+		if (with_offloads)
+			sxe2_tx_desc_fill_offloads(pkts[3], &desc3_qw1);
+
+		desc2_qw1 = desc_qw1_com |
+				((uint64_t)pkts[2]->data_len) << SXE2_TX_DATA_DESC_BUF_SZ_SHIFT;
+		desc_offset[2] = SXE2_TX_DATA_DESC_MACLEN_VAL(pkts[2]->l2_len);
+		desc2_qw1 |= ((uint64_t)desc_offset[2]) << SXE2_TX_DATA_DESC_OFFSET_SHIFT;
+		if (with_offloads)
+			sxe2_tx_desc_fill_offloads(pkts[2], &desc2_qw1);
+
+		desc1_qw1 = (desc_qw1_com |
+				((uint64_t)pkts[1]->data_len) << SXE2_TX_DATA_DESC_BUF_SZ_SHIFT);
+		desc_offset[1] = SXE2_TX_DATA_DESC_MACLEN_VAL(pkts[1]->l2_len);
+		desc1_qw1 |= ((uint64_t)desc_offset[1]) << SXE2_TX_DATA_DESC_OFFSET_SHIFT;
+		if (with_offloads)
+			sxe2_tx_desc_fill_offloads(pkts[1], &desc1_qw1);
+
+		desc0_qw1 = (desc_qw1_com |
+				((uint64_t)pkts[0]->data_len) << SXE2_TX_DATA_DESC_BUF_SZ_SHIFT);
+		desc_offset[0] = SXE2_TX_DATA_DESC_MACLEN_VAL(pkts[0]->l2_len);
+		desc0_qw1 |= ((uint64_t)desc_offset[0]) << SXE2_TX_DATA_DESC_OFFSET_SHIFT;
+		if (with_offloads)
+			sxe2_tx_desc_fill_offloads(pkts[0], &desc0_qw1);
+
+		desc_group =
+			_mm512_set_epi64(desc3_qw1, rte_pktmbuf_iova(pkts[3]),
+					 desc2_qw1, rte_pktmbuf_iova(pkts[2]),
+					 desc1_qw1, rte_pktmbuf_iova(pkts[1]),
+					 desc0_qw1, rte_pktmbuf_iova(pkts[0]));
+
+		_mm512_storeu_si512(RTE_CAST_PTR(void *, desc), desc_group);
+
+		pkts_num -= 4;
+		desc     += 4;
+		pkts     += 4;
+	}
+
+	while (pkts_num) {
+		sxe2_tx_desc_fill_one_avx512(desc, *pkts, desc_cmd, with_offloads);
+
+		pkts_num--;
+		desc++;
+		pkts++;
+	}
+}
+
+static __rte_always_inline void
+sxe2_tx_pkts_mbuf_fill_avx512(struct sxe2_tx_buffer_vec *buffer,
+	struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+	uint16_t i;
+
+	for (i = 0; i < nb_pkts; ++i)
+		buffer[i].mbuf = tx_pkts[i];
+}
+
+static __rte_always_inline uint16_t
+sxe2_tx_pkts_vec_avx512_batch(struct sxe2_tx_queue *txq, struct rte_mbuf **tx_pkts,
+	uint16_t nb_pkts, bool with_offloads)
+{
+	volatile union sxe2_tx_data_desc *desc;
+	struct sxe2_tx_buffer_vec *buffer;
+	uint16_t next_use;
+	uint16_t res_num;
+	uint16_t tx_num;
+
+	if (txq->desc_free_num < txq->free_thresh)
+		(void)sxe2_tx_bufs_free_vec_avx512(txq);
+
+	nb_pkts = RTE_MIN(txq->desc_free_num, nb_pkts);
+	if (unlikely(nb_pkts == 0)) {
+		PMD_LOG_DEBUG(TX, "Tx pkts avx512 batch: may not enough free desc, "
+				"free_desc=%u, need_tx_pkts=%u",
+				txq->desc_free_num, nb_pkts);
+		goto l_end;
+	}
+	tx_num = nb_pkts;
+
+	next_use = txq->next_use;
+	desc     = &txq->desc_ring[next_use];
+	buffer   = (struct sxe2_tx_buffer_vec *)txq->buffer_ring;
+	buffer  += next_use;
+
+	txq->desc_free_num -= nb_pkts;
+
+	res_num = txq->ring_depth - txq->next_use;
+
+	if (tx_num >= res_num) {
+		sxe2_tx_pkts_mbuf_fill_avx512(buffer, tx_pkts, res_num);
+
+		sxe2_tx_desc_fill_avx512(desc, tx_pkts, res_num,
+					SXE2_TX_DATA_DESC_CMD_EOP, with_offloads);
+		tx_pkts += (res_num - 1);
+		desc    += (res_num - 1);
+
+		sxe2_tx_desc_fill_one_avx512(desc, *tx_pkts++,
+					(SXE2_TX_DATA_DESC_CMD_EOP | SXE2_TX_DATA_DESC_CMD_RS),
+					with_offloads);
+
+		tx_num -= res_num;
+
+		next_use     = 0;
+		txq->next_rs = txq->rs_thresh - 1;
+		desc         = txq->desc_ring;
+		buffer       = (struct sxe2_tx_buffer_vec *)txq->buffer_ring;
+	}
+
+	sxe2_tx_pkts_mbuf_fill_avx512(buffer, tx_pkts, tx_num);
+
+	sxe2_tx_desc_fill_avx512(desc, tx_pkts, tx_num,
+			SXE2_TX_DATA_DESC_CMD_EOP, with_offloads);
+
+	next_use += tx_num;
+	if (next_use > txq->next_rs) {
+		txq->desc_ring[txq->next_rs].read.type_cmd_off_bsz_l2t |=
+			rte_cpu_to_le_64(SXE2_TX_DATA_DESC_CMD_RS_MASK);
+
+		txq->next_rs += txq->rs_thresh;
+	}
+	txq->next_use = next_use;
+
+	SXE2_PCI_REG_WRITE_WC(txq->tdt_reg_addr, next_use);
+	PMD_LOG_DEBUG(TX, "port_id=%u queue_id=%u next_use=%u send_pkts=%u",
+			txq->port_id, txq->queue_id, next_use, nb_pkts);
+l_end:
+	return nb_pkts;
+}
+
+static __rte_always_inline uint16_t
+sxe2_tx_pkts_vec_avx512_common(struct sxe2_tx_queue *txq, struct rte_mbuf **tx_pkts,
+	uint16_t nb_pkts, bool with_offloads)
+{
+	uint16_t tx_done_num = 0;
+	uint16_t tx_once_num;
+	uint16_t tx_need_num;
+
+	while (nb_pkts) {
+		tx_need_num = RTE_MIN(nb_pkts, txq->rs_thresh);
+		tx_once_num =
+			sxe2_tx_pkts_vec_avx512_batch(txq, tx_pkts + tx_done_num,
+					     tx_need_num, with_offloads);
+		nb_pkts     -= tx_once_num;
+		tx_done_num += tx_once_num;
+		if (tx_once_num < tx_need_num)
+			break;
+	}
+
+	return tx_done_num;
+}
+
+uint16_t sxe2_tx_pkts_vec_avx512_simple(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+	return sxe2_tx_pkts_vec_avx512_common((struct sxe2_tx_queue *)tx_queue,
+					      tx_pkts, nb_pkts, false);
+}
+
+uint16_t sxe2_tx_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+	return sxe2_tx_pkts_vec_avx512_common((struct sxe2_tx_queue *)tx_queue,
+					      tx_pkts, nb_pkts, true);
+}
+
+static inline void sxe2_rx_queue_rearm_avx512(struct sxe2_rx_queue *rxq)
+{
+	volatile union sxe2_rx_desc *desc;
+	struct rte_mbuf **buffer;
+	struct rte_mbuf *mbuf0, *mbuf1;
+	__m128i dma_addr0, dma_addr1;
+	__m128i virt_addr0, virt_addr1;
+	__m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM, RTE_PKTMBUF_HEADROOM);
+	int32_t ret;
+	uint16_t i;
+	uint16_t new_tail;
+
+	buffer = &rxq->buffer_ring[rxq->realloc_start];
+	desc   = &rxq->desc_ring[rxq->realloc_start];
+
+	ret = rte_mempool_get_bulk(rxq->mb_pool, (void *)buffer, SXE2_RX_REARM_THRESH_VEC);
+	if (ret != 0) {
+		if ((rxq->realloc_num + SXE2_RX_REARM_THRESH_VEC) >= rxq->ring_depth) {
+			dma_addr0 = _mm_setzero_si128();
+			for (i = 0; i < SXE2_RX_NUM_PER_LOOP_AVX; ++i) {
+				buffer[i] = &rxq->fake_mbuf;
+				_mm_store_si128(RTE_CAST_PTR(__m128i *, &desc[i].read),
+						dma_addr0);
+			}
+		}
+
+		rxq->vsi->adapter->dev_info.dev_data->rx_mbuf_alloc_failed +=
+			SXE2_RX_REARM_THRESH_VEC;
+		goto l_end;
+	}
+
+	for (i = 0; i < SXE2_RX_REARM_THRESH_VEC; i += 2, buffer += 2) {
+		mbuf0 = buffer[0];
+		mbuf1 = buffer[1];
+
+#if RTE_IOVA_IN_MBUF
+
+		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+				 offsetof(struct rte_mbuf, buf_addr) + 8);
+#endif
+		virt_addr0 = _mm_loadu_si128((__m128i *)&mbuf0->buf_addr);
+		virt_addr1 = _mm_loadu_si128((__m128i *)&mbuf1->buf_addr);
+
+#if RTE_IOVA_IN_MBUF
+
+		dma_addr0 = _mm_unpackhi_epi64(virt_addr0, virt_addr0);
+		dma_addr1 = _mm_unpackhi_epi64(virt_addr1, virt_addr1);
+#else
+
+		dma_addr0 = _mm_unpacklo_epi64(virt_addr0, virt_addr0);
+		dma_addr1 = _mm_unpacklo_epi64(virt_addr1, virt_addr1);
+#endif
+
+		dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
+		dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
+
+		_mm_store_si128(RTE_CAST_PTR(__m128i *, &desc++->read), dma_addr0);
+		_mm_store_si128(RTE_CAST_PTR(__m128i *, &desc++->read), dma_addr1);
+	}
+
+	rxq->realloc_start += SXE2_RX_REARM_THRESH_VEC;
+	if (rxq->realloc_start >= rxq->ring_depth)
+		rxq->realloc_start = 0;
+	rxq->realloc_num -= SXE2_RX_REARM_THRESH_VEC;
+
+	new_tail = (rxq->realloc_start == 0) ? (rxq->ring_depth - 1) :
+		(rxq->realloc_start - 1);
+
+	SXE2_PCI_REG_WRITE_WC(rxq->rdt_reg_addr, new_tail);
+
+l_end:
+	return;
+}
+
+static __rte_always_inline uint16_t
+sxe2_rx_pkts_common_vec_avx512(struct sxe2_rx_queue *rxq, struct rte_mbuf **rx_pkts,
+	uint16_t nb_pkts, uint8_t *split_rxe_flags,
+	uint8_t *umbcast_flags, bool do_offload)
+{
+	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
+	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0, rxq->mbuf_init_value);
+	struct rte_mbuf **buffer;
+	volatile union sxe2_rx_desc *desc;
+	__m512i mbufs4_7;
+	__m512i mbufs0_3;
+	__m256i mbufs6_7;
+	__m256i mbufs4_5;
+	__m256i mbufs2_3;
+	__m256i mbufs0_1;
+	uint32_t bit_num  = 0;
+	uint16_t done_num = 0;
+	uint16_t i = 0;
+	uint16_t j = 0;
+
+	buffer   = &rxq->buffer_ring[rxq->processing_idx];
+	desc     = &rxq->desc_ring[rxq->processing_idx];
+
+	rte_prefetch0(desc);
+
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, SXE2_RX_NUM_PER_LOOP_AVX);
+
+	if (rxq->realloc_num > SXE2_RX_REARM_THRESH_VEC)
+		sxe2_rx_queue_rearm_avx512(rxq);
+
+	if (0 == (rte_le_to_cpu_64(desc->wb.status_err_ptype_len) & SXE2_RX_DESC_STATUS_DD_MASK))
+		goto l_end;
+
+	const __m512i crc_adjust =
+			_mm512_set4_epi32(0, -rxq->crc_len, -rxq->crc_len, 0);
+
+	const __m256i dd_mask = _mm256_set1_epi32(1);
+
+	const __m512i rvp_shuf_mask =
+			_mm512_set4_epi32((7 << 24) | (6 << 16) | (5 << 8) | 4,
+					  (3 << 24) | (2 << 16) | (13 << 8) | 12,
+					  (0xFFU << 24) | (0xFF << 16) | (13 << 8) | 12,
+					  0xFFFFFFFF);
+
+	const __m128i eop_shuf_mask =
+		_mm_set_epi8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+			     0xFF, 0xFF, 8, 0, 10, 2, 12, 4, 14, 6);
+
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
+
+	for (i = 0; i < nb_pkts; i += SXE2_RX_NUM_PER_LOOP_AVX,
+				desc += SXE2_RX_NUM_PER_LOOP_AVX) {
+		_mm256_storeu_si256((void *)&rx_pkts[i],
+			_mm256_loadu_si256((void *)&buffer[i]));
+#ifdef RTE_ARCH_X86_64
+		_mm256_storeu_si256((void *)&rx_pkts[i + 4],
+			_mm256_loadu_si256((void *)&buffer[i + 4]));
+#endif
+
+		const __m128i desc7 = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, desc + 7));
+		rte_compiler_barrier();
+		const __m128i desc6 = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, desc + 6));
+		rte_compiler_barrier();
+		const __m128i desc5 = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, desc + 5));
+		rte_compiler_barrier();
+		const __m128i desc4 = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, desc + 4));
+		rte_compiler_barrier();
+		const __m128i desc3 = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, desc + 3));
+		rte_compiler_barrier();
+		const __m128i desc2 = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, desc + 2));
+		rte_compiler_barrier();
+		const __m128i desc1 = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, desc + 1));
+		rte_compiler_barrier();
+		const __m128i desc0 = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, desc + 0));
+
+		const __m256i descs6_7 =
+			_mm256_inserti128_si256(_mm256_castsi128_si256(desc6), desc7, 1);
+		const __m256i descs4_5 =
+			_mm256_inserti128_si256(_mm256_castsi128_si256(desc4), desc5, 1);
+		const __m256i descs2_3 =
+			_mm256_inserti128_si256(_mm256_castsi128_si256(desc2), desc3, 1);
+		const __m256i descs0_1 =
+			_mm256_inserti128_si256(_mm256_castsi128_si256(desc0), desc1, 1);
+
+		const __m512i descs4_7 =
+			_mm512_inserti64x4(_mm512_castsi256_si512(descs4_5), descs6_7, 1);
+		const __m512i descs0_3 =
+			_mm512_inserti64x4(_mm512_castsi256_si512(descs0_1), descs2_3, 1);
+
+		if (split_rxe_flags != NULL) {
+			for (j = 0; j < SXE2_RX_NUM_PER_LOOP_AVX; j++)
+				rte_mbuf_prefetch_part2(rx_pkts[i + j]);
+		}
+
+		mbufs4_7 = _mm512_shuffle_epi8(descs4_7, rvp_shuf_mask);
+		mbufs0_3 = _mm512_shuffle_epi8(descs0_3, rvp_shuf_mask);
+
+		mbufs4_7 = _mm512_add_epi32(mbufs4_7, crc_adjust);
+		mbufs0_3 = _mm512_add_epi32(mbufs0_3, crc_adjust);
+
+		const __m512i ptype_mask = _mm512_set1_epi64(SXE2_RX_FLEX_DESC_PTYPE_M <<
+					SXE2_RX_FLEX_DESC_PTYPE_S);
+
+		__m512i ptypes4_7 = _mm512_and_si512(descs4_7, ptype_mask);
+		__m512i ptypes0_3 = _mm512_and_si512(descs0_3, ptype_mask);
+
+		const __m256i ptypes6_7 = _mm512_extracti64x4_epi64(ptypes4_7, 1);
+		const __m256i ptypes4_5 = _mm512_extracti64x4_epi64(ptypes4_7, 0);
+		const __m256i ptypes2_3 = _mm512_extracti64x4_epi64(ptypes0_3, 1);
+		const __m256i ptypes0_1 = _mm512_extracti64x4_epi64(ptypes0_3, 0);
+
+		const uint16_t ptype7 = _mm256_extract_epi16(ptypes6_7, 13);
+		const uint16_t ptype6 = _mm256_extract_epi16(ptypes6_7, 5);
+		const uint16_t ptype5 = _mm256_extract_epi16(ptypes4_5, 13);
+		const uint16_t ptype4 = _mm256_extract_epi16(ptypes4_5, 5);
+		const uint16_t ptype3 = _mm256_extract_epi16(ptypes2_3, 13);
+		const uint16_t ptype2 = _mm256_extract_epi16(ptypes2_3, 5);
+		const uint16_t ptype1 = _mm256_extract_epi16(ptypes0_1, 13);
+		const uint16_t ptype0 = _mm256_extract_epi16(ptypes0_1, 5);
+
+		const __m512i ptype_mask4_7 =
+				_mm512_set_epi32(0, 0, 0, ptype_tbl[ptype7],
+						 0, 0, 0, ptype_tbl[ptype6],
+						 0, 0, 0, ptype_tbl[ptype5],
+						 0, 0, 0, ptype_tbl[ptype4]);
+		const __m512i ptype_mask0_3 =
+				_mm512_set_epi32(0, 0, 0, ptype_tbl[ptype3],
+						 0, 0, 0, ptype_tbl[ptype2],
+						 0, 0, 0, ptype_tbl[ptype1],
+						 0, 0, 0, ptype_tbl[ptype0]);
+
+		mbufs4_7 = _mm512_or_si512(mbufs4_7, ptype_mask4_7);
+		mbufs0_3 = _mm512_or_si512(mbufs0_3, ptype_mask0_3);
+
+		mbufs6_7 = _mm512_extracti64x4_epi64(mbufs4_7, 1);
+		mbufs4_5 = _mm512_extracti64x4_epi64(mbufs4_7, 0);
+		mbufs2_3 = _mm512_extracti64x4_epi64(mbufs0_3, 1);
+		mbufs0_1 = _mm512_extracti64x4_epi64(mbufs0_3, 0);
+
+		const __m512i staterr_per_mask =
+			_mm512_set_epi32(0x17, 0x1F, 0x07, 0x0F,
+					 0x13, 0x1B, 0x03, 0x0B,
+					 0x16, 0x1E, 0x06, 0x0E,
+					 0x12, 0x1A, 0x02, 0x0A);
+		__m512i qw1_0_7 = _mm512_permutex2var_epi32(descs4_7,
+							    staterr_per_mask,
+							    descs0_3);
+
+		__m256i staterrs0_7 = _mm512_extracti64x4_epi64(qw1_0_7, 0);
+
+		__m256i stu_len0_7 = _mm512_extracti64x4_epi64(qw1_0_7, 1);
+		__m256i mbuf_flags = _mm256_setzero_si256();
+
+		if (do_offload) {
+			const __m256i desc_flags_mask = _mm256_set1_epi32(0xC0001C04);
+			const __m256i desc_flags_rss_mask = _mm256_set1_epi32(0x20000000);
+			const __m256i vlan_flags =
+				_mm256_set_epi8(0, 0, 0, 0,
+					0, 0, 0, 0,
+					0, 0, 0, RTE_MBUF_F_RX_VLAN |
+						RTE_MBUF_F_RX_VLAN_STRIPPED,
+					0, 0, 0, 0,
+					0, 0, 0, 0,
+					0, 0, 0, 0,
+					0, 0, 0, RTE_MBUF_F_RX_VLAN |
+						RTE_MBUF_F_RX_VLAN_STRIPPED,
+					0, 0, 0, 0);
+
+			const __m256i rss_flags =
+				_mm256_set_epi8(0, 0, 0, 0,
+					0, 0, 0, 0,
+					0, 0, 0, RTE_MBUF_F_RX_RSS_HASH,
+					0, 0, 0, 0,
+					0, 0, 0, 0,
+					0, 0, 0, 0,
+					0, 0, 0, RTE_MBUF_F_RX_RSS_HASH,
+					0, 0, 0, 0);
+
+			const __m256i cksum_flags =
+			_mm256_set_epi8(0, 0, 0, 0, 0, 0, 0,
+			0,
+			((RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
+				RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1),
+			((RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
+				RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1),
+			((RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+				RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1),
+			((RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+				RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1),
+			((RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1),
+			((RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1),
+			((RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1),
+			((RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1),
+			0, 0, 0, 0, 0, 0, 0, 0,
+			((RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
+				RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1),
+			((RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
+				RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1),
+			((RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+				RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1),
+			((RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+				RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1),
+			((RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1),
+			((RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1),
+			((RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1),
+			((RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1));
+
+			const __m256i cksum_mask =
+				_mm256_set1_epi32(RTE_MBUF_F_RX_IP_CKSUM_MASK |
+						  RTE_MBUF_F_RX_L4_CKSUM_MASK |
+						  RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK |
+						  RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD);
+
+			const __m256i vlan_mask =
+				_mm256_set1_epi32(RTE_MBUF_F_RX_VLAN |
+						  RTE_MBUF_F_RX_VLAN_STRIPPED);
+
+			__m256i tmp_flags;
+			__m256i descs_flags = _mm256_and_si256(staterrs0_7, desc_flags_mask);
+			stu_len0_7 = _mm256_and_si256(stu_len0_7, desc_flags_rss_mask);
+
+			tmp_flags = _mm256_shuffle_epi8(vlan_flags, descs_flags);
+			mbuf_flags = _mm256_and_si256(tmp_flags, vlan_mask);
+
+			descs_flags = _mm256_srli_epi32(descs_flags, 10);
+			tmp_flags   = _mm256_shuffle_epi8(cksum_flags, descs_flags);
+			tmp_flags   = _mm256_slli_epi32(tmp_flags, 1);
+			tmp_flags   = _mm256_and_si256(tmp_flags, cksum_mask);
+			mbuf_flags  = _mm256_or_si256(mbuf_flags, tmp_flags);
+
+			descs_flags = _mm256_srli_epi32(stu_len0_7, 27);
+			tmp_flags   = _mm256_shuffle_epi8(rss_flags, descs_flags);
+			mbuf_flags  = _mm256_or_si256(mbuf_flags, tmp_flags);
+
+#ifndef RTE_LIBRTE_SXE2_16BYTE_RX_DESC
+			if (rxq->fnav_enable) {
+				__m256i fnav_vld0_3, fnav_vld4_7;
+				__m256i fnav_vld0_7;
+				__m256i v_zeros, v_ffff, v_u32_one;
+				const __m256i fdir_flags =
+					_mm256_set1_epi32(RTE_MBUF_F_RX_FDIR |
+							  RTE_MBUF_F_RX_FDIR_ID);
+				fnav_vld0_3 = _mm256_unpacklo_epi32(descs2_3, descs0_1);
+				fnav_vld4_7 = _mm256_unpacklo_epi32(descs6_7, descs4_5);
+
+				fnav_vld0_7 = _mm256_unpacklo_epi64(fnav_vld4_7, fnav_vld0_3);
+
+				fnav_vld0_7 = _mm256_slli_epi32(fnav_vld0_7, 26);
+				fnav_vld0_7 = _mm256_srli_epi32(fnav_vld0_7, 31);
+
+				v_zeros = _mm256_setzero_si256();
+				v_ffff = _mm256_cmpeq_epi32(v_zeros, v_zeros);
+				v_u32_one = _mm256_srli_epi32(v_ffff, 31);
+
+				tmp_flags = _mm256_cmpeq_epi32(fnav_vld0_7, v_u32_one);
+
+				tmp_flags = _mm256_and_si256(tmp_flags, fdir_flags);
+
+				mbuf_flags = _mm256_or_si256(mbuf_flags, tmp_flags);
+
+				rx_pkts[i + 0]->hash.fdir.hi = desc[0].wb.fd_filter_id;
+				rx_pkts[i + 1]->hash.fdir.hi = desc[1].wb.fd_filter_id;
+				rx_pkts[i + 2]->hash.fdir.hi = desc[2].wb.fd_filter_id;
+				rx_pkts[i + 3]->hash.fdir.hi = desc[3].wb.fd_filter_id;
+				rx_pkts[i + 4]->hash.fdir.hi = desc[4].wb.fd_filter_id;
+				rx_pkts[i + 5]->hash.fdir.hi = desc[5].wb.fd_filter_id;
+				rx_pkts[i + 6]->hash.fdir.hi = desc[6].wb.fd_filter_id;
+				rx_pkts[i + 7]->hash.fdir.hi = desc[7].wb.fd_filter_id;
+			}
+#endif
+		}
+
+		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
+				offsetof(struct rte_mbuf, rearm_data) + 8);
+		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rx_descriptor_fields1) !=
+				offsetof(struct rte_mbuf, rearm_data) + 16);
+		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
+				RTE_ALIGN(offsetof(struct rte_mbuf, rearm_data), 16));
+
+		__m256i rearm_arr[8];
+
+		rearm_arr[6] = _mm256_blend_epi32(mbuf_init,
+					_mm256_slli_si256(mbuf_flags, 8), 0x04);
+		rearm_arr[4] = _mm256_blend_epi32(mbuf_init,
+					_mm256_slli_si256(mbuf_flags, 4), 0x04);
+		rearm_arr[2] = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
+		rearm_arr[0] = _mm256_blend_epi32(mbuf_init,
+					_mm256_srli_si256(mbuf_flags, 4), 0x04);
+
+		rearm_arr[6] = _mm256_permute2f128_si256(rearm_arr[6], mbufs6_7, 0x20);
+		rearm_arr[4] = _mm256_permute2f128_si256(rearm_arr[4], mbufs4_5, 0x20);
+		rearm_arr[2] = _mm256_permute2f128_si256(rearm_arr[2], mbufs2_3, 0x20);
+		rearm_arr[0] = _mm256_permute2f128_si256(rearm_arr[0], mbufs0_1, 0x20);
+
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data, rearm_arr[6]);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data, rearm_arr[4]);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data, rearm_arr[2]);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data, rearm_arr[0]);
+
+		const __m256i tmp_mbuf_flags =
+				_mm256_castsi128_si256(_mm256_extracti128_si256(mbuf_flags, 1));
+
+		rearm_arr[7] = _mm256_blend_epi32(mbuf_init,
+					_mm256_slli_si256(tmp_mbuf_flags, 8), 4);
+		rearm_arr[5] = _mm256_blend_epi32(mbuf_init,
+					_mm256_slli_si256(tmp_mbuf_flags, 4), 4);
+		rearm_arr[3] = _mm256_blend_epi32(mbuf_init, tmp_mbuf_flags, 4);
+		rearm_arr[1] = _mm256_blend_epi32(mbuf_init,
+					_mm256_srli_si256(tmp_mbuf_flags, 4), 4);
+
+		rearm_arr[7] = _mm256_blend_epi32(rearm_arr[7], mbufs6_7, 0XF0);
+		rearm_arr[5] = _mm256_blend_epi32(rearm_arr[5], mbufs4_5, 0XF0);
+		rearm_arr[3] = _mm256_blend_epi32(rearm_arr[3], mbufs2_3, 0XF0);
+		rearm_arr[1] = _mm256_blend_epi32(rearm_arr[1], mbufs0_1, 0XF0);
+
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data, rearm_arr[7]);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data, rearm_arr[5]);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data, rearm_arr[3]);
+		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data, rearm_arr[1]);
+
+		if (umbcast_flags) {
+			const __m256i umbcast_mask =
+				_mm256_set1_epi32(SXE2_RX_DESC_STATUS_UMBCAST_MASK);
+			__m256i umbcast_bits_256 =
+				_mm256_and_si256(staterrs0_7, umbcast_mask);
+
+			umbcast_bits_256 = _mm256_srli_epi32(umbcast_bits_256, 24);
+			__m128i umbcast_bits_128 =
+				_mm_packs_epi32(_mm256_castsi256_si128(umbcast_bits_256),
+						_mm256_extractf128_si256(umbcast_bits_256, 1));
+
+			umbcast_bits_128 = _mm_shuffle_epi8(umbcast_bits_128, eop_shuf_mask);
+
+			*(uint64_t *)umbcast_flags = _mm_cvtsi128_si64(umbcast_bits_128);
+			umbcast_flags += SXE2_RX_NUM_PER_LOOP_AVX;
+		}
+
+		if (split_rxe_flags) {
+			const __m256i eop_rxe_mask =
+					_mm256_set1_epi32(SXE2_RX_DESC_STATUS_EOP_MASK |
+								SXE2_RX_DESC_ERROR_RXE_MASK |
+								SXE2_RX_DESC_ERROR_OVERSIZE_MASK);
+			const __m128i eop_mask_128 =
+					_mm_set1_epi16(SXE2_RX_DESC_STATUS_EOP_MASK);
+			const __m128i rxe_mask_128 =
+					_mm_set1_epi16(SXE2_RX_DESC_ERROR_RXE_MASK |
+							SXE2_RX_DESC_ERROR_OVERSIZE_MASK);
+
+			const __m256i tmp_stats = _mm256_and_si256(staterrs0_7, eop_rxe_mask);
+
+			const __m128i eop_rxe_bits = _mm_packs_epi32
+							(_mm256_castsi256_si128(tmp_stats),
+							 _mm256_extractf128_si256(tmp_stats, 1));
+
+			__m128i not_eop_bits = _mm_andnot_si128(eop_rxe_bits, eop_mask_128);
+
+			not_eop_bits =
+				_mm_or_si128(not_eop_bits,
+					     _mm_srli_epi16(_mm_and_si128(eop_rxe_bits,
+									       rxe_mask_128),
+							      7));
+
+			not_eop_bits = _mm_shuffle_epi8(not_eop_bits, eop_shuf_mask);
+
+			*(uint64_t *)split_rxe_flags = _mm_cvtsi128_si64(not_eop_bits);
+			split_rxe_flags += SXE2_RX_NUM_PER_LOOP_AVX;
+		}
+
+		staterrs0_7 = _mm256_and_si256(staterrs0_7, dd_mask);
+
+		staterrs0_7 = _mm256_packs_epi32(staterrs0_7, _mm256_setzero_si256());
+
+		bit_num = rte_popcount64
+				(_mm_cvtsi128_si64(_mm256_extracti128_si256(staterrs0_7, 1)));
+		bit_num += rte_popcount64
+				(_mm_cvtsi128_si64(_mm256_castsi256_si128(staterrs0_7)));
+		done_num += bit_num;
+
+		if (bit_num != SXE2_RX_NUM_PER_LOOP_AVX)
+			break;
+	}
+
+	rxq->processing_idx += done_num;
+	rxq->processing_idx &= (rxq->ring_depth - 1);
+	if ((rxq->processing_idx & 1) == 1 && done_num > 1) {
+		rxq->processing_idx--;
+		done_num--;
+	}
+	rxq->realloc_num     += done_num;
+
+l_end:
+	PMD_LOG_DEBUG(RX, "port_id=%u queue_id=%u last_id=%u recv_pkts=%d",
+			rxq->port_id, rxq->queue_id, rxq->processing_idx, done_num);
+	return done_num;
+}
+
+static __rte_always_inline uint16_t
+sxe2_rx_pkts_scattered_batch_vec_avx512(struct sxe2_rx_queue *rxq, struct rte_mbuf **rx_pkts,
+	uint16_t nb_pkts, bool do_offload)
+{
+	const uint64_t *split_rxe_flags64;
+	uint8_t split_rxe_flags[SXE2_RX_PKTS_BURST_BATCH_NUM_VEC] = {0};
+	uint8_t umbcast_flags[SXE2_RX_PKTS_BURST_BATCH_NUM_VEC] = {0};
+	uint16_t rx_done_num;
+	uint16_t rx_pkt_done_num;
+
+	rx_pkt_done_num = 0;
+
+	if (rxq->vsi->adapter->devargs.sw_stats_en) {
+		rx_done_num = sxe2_rx_pkts_common_vec_avx512(rxq, rx_pkts,
+				nb_pkts, split_rxe_flags,
+				umbcast_flags, do_offload);
+	} else {
+		rx_done_num = sxe2_rx_pkts_common_vec_avx512(rxq, rx_pkts,
+				nb_pkts, split_rxe_flags,
+			    NULL, do_offload);
+	}
+	if (rx_done_num == 0)
+		goto l_end;
+
+	if (!rxq->vsi->adapter->devargs.sw_stats_en) {
+		split_rxe_flags64 = (uint64_t *)split_rxe_flags;
+
+		if (rxq->pkt_first_seg == NULL &&
+				!split_rxe_flags64[0] && !split_rxe_flags64[1] &&
+				!split_rxe_flags64[2] && !split_rxe_flags64[3]) {
+			rx_pkt_done_num = rx_done_num;
+			goto l_end;
+		}
+
+		if (rxq->pkt_first_seg == NULL) {
+			while (rx_pkt_done_num < rx_done_num &&
+			       split_rxe_flags[rx_pkt_done_num] == 0)
+				rx_pkt_done_num++;
+
+			if (rx_pkt_done_num == rx_done_num)
+				goto l_end;
+
+			rxq->pkt_first_seg = rx_pkts[rx_pkt_done_num];
+		}
+	}
+
+	rx_pkt_done_num += sxe2_rx_pkts_refactor(rxq, &rx_pkts[rx_pkt_done_num],
+			rx_done_num - rx_pkt_done_num, &split_rxe_flags[rx_pkt_done_num],
+			&umbcast_flags[rx_pkt_done_num]);
+
+l_end:
+
+	return rx_pkt_done_num;
+}
+
+static __rte_always_inline uint16_t
+sxe2_rx_pkts_scattered_common_vec_avx512(void *rx_queue,
+	struct rte_mbuf **rx_pkts, uint16_t nb_pkts, bool offload)
+{
+	uint16_t done_num = 0;
+	uint16_t once_num  = 0;
+
+	while (nb_pkts > SXE2_RX_PKTS_BURST_BATCH_NUM) {
+		once_num = sxe2_rx_pkts_scattered_batch_vec_avx512(rx_queue, rx_pkts + done_num,
+			SXE2_RX_PKTS_BURST_BATCH_NUM, offload);
+
+		done_num  += once_num;
+		nb_pkts -= once_num;
+
+		if (once_num < SXE2_RX_PKTS_BURST_BATCH_NUM)
+			goto end;
+	}
+
+	done_num += sxe2_rx_pkts_scattered_batch_vec_avx512(rx_queue,
+		rx_pkts + done_num, nb_pkts, offload);
+
+end:
+	return done_num;
+}
+
+uint16_t sxe2_rx_pkts_scattered_vec_avx512(void *rx_queue,
+		struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	return sxe2_rx_pkts_scattered_common_vec_avx512(rx_queue,
+			rx_pkts, nb_pkts, false);
+}
+
+uint16_t sxe2_rx_pkts_scattered_vec_avx512_offload(void *rx_queue,
+		struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	return sxe2_rx_pkts_scattered_common_vec_avx512(rx_queue,
+			rx_pkts, nb_pkts, true);
+}
+
+#endif
-- 
2.52.0


^ permalink raw reply related

* [PATCH v2 00/20] net/sxe2: added Linkdata sxe2 ethernet driver
From: liujie5 @ 2026-06-14  9:23 UTC (permalink / raw)
  To: stephen; +Cc: dev, Jie Liu
In-Reply-To: <20260614092328.201826-1-liujie5@linkdatatechnology.com>

From: Jie Liu <liujie5@linkdatatechnology.com>

This patch set implements core functionality for the SXE2 PMD,
including basic driver framework, data path setup, and advanced
offload features (VLAN, RSS,TM, PTP etc.).

V2:
Defensive ULONG_MAX checks after strtoul() in devarg parsers
Move testpmd commands out of driver library; remove 14 RTE_EXPORT_EXPERIMENTAL_SYMBOL
Implement rte_flow_dev_dump op; remove "sxe2 flow rule dump" command
Remove "sxe2 udp_tunnel_port add|rm" command (redundant)
Complete xstats coverage; remove "sxe2 show stats" command
Remove ipsec SA management testpmd commands
Retain 7 devargs with full documentation and rationale in sxe2.rst
Add RTE_PMD_REGISTER_PARAM_STRING for all devargs

Jie Liu (20):
  net/sxe2: support AVX512 vectorized path for Rx and Tx
  net/sxe2: add AVX2 vector data path for Rx and Tx
  drivers: add supported packet types get callback
  net/sxe2: support L2 filtering and MAC config
  drivers: support RSS feature
  net/sxe2: support TM hierarchy and shaping
  net/sxe2: support IPsec inline protocol offload
  net/sxe2: support statistics and multi-process
  drivers: interrupt handling
  net/sxe2: add NEON vec Rx/Tx burst functions
  drivers: add support for VF representors
  net/sxe2: add support for custom UDP tunnel ports
  net/sxe2: support firmware version reading
  net/sxe2: implement get monitor address
  common/sxe2: add shared SFP module definitions
  net/sxe2: support SFP module info and EEPROM access
  net/sxe2: implement private dump info
  net/sxe2: add mbuf validation in Tx debug mode
  drivers: add parameters parsed using rte_kvargs
  net/sxe2: update sxe2 feature matrix docs

 doc/guides/nics/features/sxe2.ini          |   56 +
 doc/guides/nics/sxe2.rst                   |  132 ++
 drivers/common/sxe2/sxe2_common.c          |  156 ++
 drivers/common/sxe2/sxe2_common.h          |    4 +
 drivers/common/sxe2/sxe2_flow_public.h     |  633 +++++++
 drivers/common/sxe2/sxe2_ioctl_chnl.c      |  178 +-
 drivers/common/sxe2/sxe2_ioctl_chnl_func.h |   18 +
 drivers/common/sxe2/sxe2_msg.h             |  118 ++
 drivers/common/sxe2/sxe2_ptype.h           | 1793 ++++++++++++++++++
 drivers/net/sxe2/meson.build               |   54 +-
 drivers/net/sxe2/sxe2_cmd_chnl.c           | 1587 +++++++++++++++-
 drivers/net/sxe2/sxe2_cmd_chnl.h           |  139 ++
 drivers/net/sxe2/sxe2_drv_cmd.h            |  521 +++++-
 drivers/net/sxe2/sxe2_dump.c               |  304 +++
 drivers/net/sxe2/sxe2_dump.h               |   12 +
 drivers/net/sxe2/sxe2_ethdev.c             | 1532 +++++++++++++++-
 drivers/net/sxe2/sxe2_ethdev.h             |  107 +-
 drivers/net/sxe2/sxe2_ethdev_repr.c        |  610 ++++++
 drivers/net/sxe2/sxe2_ethdev_repr.h        |   32 +
 drivers/net/sxe2/sxe2_filter.c             |  895 +++++++++
 drivers/net/sxe2/sxe2_filter.h             |  100 +
 drivers/net/sxe2/sxe2_flow.c               | 1394 ++++++++++++++
 drivers/net/sxe2/sxe2_flow.h               |   30 +
 drivers/net/sxe2/sxe2_flow_define.h        |  144 ++
 drivers/net/sxe2/sxe2_flow_parse_action.c  | 1182 ++++++++++++
 drivers/net/sxe2/sxe2_flow_parse_action.h  |   23 +
 drivers/net/sxe2/sxe2_flow_parse_engine.c  |  106 ++
 drivers/net/sxe2/sxe2_flow_parse_engine.h  |   13 +
 drivers/net/sxe2/sxe2_flow_parse_pattern.c | 1935 ++++++++++++++++++++
 drivers/net/sxe2/sxe2_flow_parse_pattern.h |   46 +
 drivers/net/sxe2/sxe2_ipsec.c              | 1565 ++++++++++++++++
 drivers/net/sxe2/sxe2_ipsec.h              |  254 +++
 drivers/net/sxe2/sxe2_irq.c                | 1026 +++++++++++
 drivers/net/sxe2/sxe2_irq.h                |   25 +
 drivers/net/sxe2/sxe2_mac.c                |  535 ++++++
 drivers/net/sxe2/sxe2_mac.h                |   84 +
 drivers/net/sxe2/sxe2_mp.c                 |  414 +++++
 drivers/net/sxe2/sxe2_mp.h                 |   67 +
 drivers/net/sxe2/sxe2_queue.c              |   17 +-
 drivers/net/sxe2/sxe2_rss.c                |  584 ++++++
 drivers/net/sxe2/sxe2_rss.h                |   81 +
 drivers/net/sxe2/sxe2_rx.c                 |   38 +
 drivers/net/sxe2/sxe2_rx.h                 |    2 +
 drivers/net/sxe2/sxe2_security.c           |  335 ++++
 drivers/net/sxe2/sxe2_security.h           |   77 +
 drivers/net/sxe2/sxe2_stats.c              |  591 ++++++
 drivers/net/sxe2/sxe2_stats.h              |   39 +
 drivers/net/sxe2/sxe2_switchdev.c          |  332 ++++
 drivers/net/sxe2/sxe2_switchdev.h          |   33 +
 drivers/net/sxe2/sxe2_tm.c                 | 1169 ++++++++++++
 drivers/net/sxe2/sxe2_tm.h                 |   78 +
 drivers/net/sxe2/sxe2_tx.c                 |    7 +
 drivers/net/sxe2/sxe2_txrx.c               |  176 +-
 drivers/net/sxe2/sxe2_txrx.h               |    4 +
 drivers/net/sxe2/sxe2_txrx_check_mbuf.c    |  595 ++++++
 drivers/net/sxe2/sxe2_txrx_check_mbuf.h    |   38 +
 drivers/net/sxe2/sxe2_txrx_poll.c          |  243 ++-
 drivers/net/sxe2/sxe2_txrx_vec.c           |   46 +-
 drivers/net/sxe2/sxe2_txrx_vec.h           |   38 +-
 drivers/net/sxe2/sxe2_txrx_vec_avx2.c      |  776 ++++++++
 drivers/net/sxe2/sxe2_txrx_vec_avx512.c    |  897 +++++++++
 drivers/net/sxe2/sxe2_txrx_vec_common.h    |    1 +
 drivers/net/sxe2/sxe2_txrx_vec_neon.c      |  721 ++++++++
 drivers/net/sxe2/sxe2_vsi.c                |  146 ++
 drivers/net/sxe2/sxe2_vsi.h                |   12 +-
 drivers/net/sxe2/sxe2vf_regs.h             |   85 +
 66 files changed, 24852 insertions(+), 133 deletions(-)
 create mode 100644 drivers/common/sxe2/sxe2_flow_public.h
 create mode 100644 drivers/common/sxe2/sxe2_msg.h
 create mode 100644 drivers/common/sxe2/sxe2_ptype.h
 create mode 100644 drivers/net/sxe2/sxe2_dump.c
 create mode 100644 drivers/net/sxe2/sxe2_dump.h
 create mode 100644 drivers/net/sxe2/sxe2_ethdev_repr.c
 create mode 100644 drivers/net/sxe2/sxe2_ethdev_repr.h
 create mode 100644 drivers/net/sxe2/sxe2_filter.c
 create mode 100644 drivers/net/sxe2/sxe2_filter.h
 create mode 100644 drivers/net/sxe2/sxe2_flow.c
 create mode 100644 drivers/net/sxe2/sxe2_flow.h
 create mode 100644 drivers/net/sxe2/sxe2_flow_define.h
 create mode 100644 drivers/net/sxe2/sxe2_flow_parse_action.c
 create mode 100644 drivers/net/sxe2/sxe2_flow_parse_action.h
 create mode 100644 drivers/net/sxe2/sxe2_flow_parse_engine.c
 create mode 100644 drivers/net/sxe2/sxe2_flow_parse_engine.h
 create mode 100644 drivers/net/sxe2/sxe2_flow_parse_pattern.c
 create mode 100644 drivers/net/sxe2/sxe2_flow_parse_pattern.h
 create mode 100644 drivers/net/sxe2/sxe2_ipsec.c
 create mode 100644 drivers/net/sxe2/sxe2_ipsec.h
 create mode 100644 drivers/net/sxe2/sxe2_irq.c
 create mode 100644 drivers/net/sxe2/sxe2_mac.c
 create mode 100644 drivers/net/sxe2/sxe2_mac.h
 create mode 100644 drivers/net/sxe2/sxe2_mp.c
 create mode 100644 drivers/net/sxe2/sxe2_mp.h
 create mode 100644 drivers/net/sxe2/sxe2_rss.c
 create mode 100644 drivers/net/sxe2/sxe2_rss.h
 create mode 100644 drivers/net/sxe2/sxe2_security.c
 create mode 100644 drivers/net/sxe2/sxe2_security.h
 create mode 100644 drivers/net/sxe2/sxe2_stats.c
 create mode 100644 drivers/net/sxe2/sxe2_stats.h
 create mode 100644 drivers/net/sxe2/sxe2_switchdev.c
 create mode 100644 drivers/net/sxe2/sxe2_switchdev.h
 create mode 100644 drivers/net/sxe2/sxe2_tm.c
 create mode 100644 drivers/net/sxe2/sxe2_tm.h
 create mode 100644 drivers/net/sxe2/sxe2_txrx_check_mbuf.c
 create mode 100644 drivers/net/sxe2/sxe2_txrx_check_mbuf.h
 create mode 100644 drivers/net/sxe2/sxe2_txrx_vec_avx2.c
 create mode 100644 drivers/net/sxe2/sxe2_txrx_vec_avx512.c
 create mode 100644 drivers/net/sxe2/sxe2_txrx_vec_neon.c
 create mode 100644 drivers/net/sxe2/sxe2vf_regs.h

-- 
2.52.0


^ permalink raw reply

* Re: [PATCH 19/20] drivers: add testpmd commands for private features
From: liujie5 @ 2026-06-14  9:23 UTC (permalink / raw)
  To: stephen; +Cc: dev
In-Reply-To: <20260614092328.201826-1-liujie5@linkdatatechnology.com>

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset=a, Size: 5452 bytes --]

您好,

感谢详细的审查意见。以下是我们对每条反馈的逐条回复以及 v2 计划变更。

---

### 1. strtoul() 值越界处理

重新检查了 [`sxe2_ethdev.c`](drivers/net/sxe2/sxe2_ethdev.c) 中的 devargs 解析代码后,我们确认所有解析器都已处理越界值:

- [`sxe2_parse_u8()`](drivers/net/sxe2/sxe2_ethdev.c:1074) — 检查 `val > UINT8_MAX`(第 1092 行),返回 `-ERANGE`
- [`sxe2_parse_bool()`](drivers/net/sxe2/sxe2_ethdev.c:1103) — 检查 `bool_val != 0 && bool_val != 1`(第 1121 行)
- [`sxe2_parse_fnav_stat_type()`](drivers/net/sxe2/sxe2_ethdev.c:989) — 检查是否超过 `SXE2_FNAV_STAT_ENA_ALL`(第 1007 行)
- [`sxe2_parse_sched_layer_mode()`](drivers/net/sxe2/sxe2_ethdev.c:1018) — 检查是否超过 `SXE2_TXSCH_NODE_ADJ_LVL_MAX`(第 1036 行)
- [`sxe2_parse_high_performance_mode()`](drivers/net/sxe2/sxe2_ethdev.c:1046) — 检查 `high_performance_mode != 1`(第 1064 行),覆盖零和越界值

每个函数在 `strtoul()` 前设置 `errno = 0`,之后检查 `errno != 0`,从而捕获 `ERANGE`(溢出)。之后再执行语义范围检查。v2 中我们将在每个 `strtoul()` 调用后增加显式的 `ULONG_MAX` 比较以增加防御性。

---

### 2. 从驱动库导出 testpmd 符号

**同意。** 没有上游驱动为其 testpmd 命令导出符号。v2 中将:

- 将 `sxe2_testpmd_lib.c` 的实现移至 `sxe2_testpmd.c`,通过 meson.build 中的 `testpmd_sources` 编译,与 i40e、ixgbe、mlx5、bnxt 等驱动的模式一致。
- **删除所有 14 个 `RTE_EXPORT_EXPERIMENTAL_SYMBOL` 条目。** 这些从未作为公共 API。
- 将应用状态(`g_tx_session[][]`、`g_rx_session[][]`、`g_esp_header_offset[]`、`g_sess_pool`)移至 testpmd 侧代码。
- 驱动内部函数继续通过 `drivers/net/sxe2/` 下的内部头文件访问。

---

### 3. 与标准 testpmd 功能重复的命令

**三个命令都同意删除。**

- **"sxe2 flow rule dump"** → 我们将实现 `rte_flow_dev_dump` op。之后标准命令 `flow dump <port_id> all` 即可正常工作。私有命令将被删除。
- **"sxe2 udp_tunnel_port add|rm"** → 该命令调用 `rte_eth_dev_udp_tunnel_port_add/rm`,已由 `port config <port_id> udp_tunnel_port add|rm` 暴露。私有命令是冗余的,将被删除。
- **"sxe2 show stats"** → 我们将审查 xstats 并补充缺失的计数器;用户改用 `show port xstats <port_id>`。私有命令将被删除。

---

### 4. IPsec SA 管理套件

**同意。** 我们将:

- 从 testpmd 扩展中删除 9 个子命令的 ipsec SA 管理命令。
- 文档中说明 `examples/ipsec-secgw` 是使用此驱动验证 inline crypto 的支持方式,与其他 inline-crypto PMD 一致。
- 如果将来需要 testpmd 中的交互式 SA 管理,我们将提议作为通用的 testpmd 命令基于 `rte_security` 实现,使所有驱动受益。

---

### 5. 私有 devargs

所有 7 个 devargs 的文档已添加到 [`doc/guides/nics/sxe2.rst`](doc/guides/nics/sxe2.rst)(Runtime Configuration 章节),同时添加了 `RTE_PMD_REGISTER_PARAM_STRING` 以保证可发现性。以下是保留每个 devargs 的理由:

**flow-duplicate-pattern** — 控制硬件 switch engine 是否接受重复的 flow rule。这不会改变 `rte_flow` 语义;它配置了一个硬件特定的能力。switch engine 支持两种模式:(a) 拒绝重复规则用于错误检测,(b) 通过 per-rule 元数据标志允许重复。默认值(1 = 允许)确保标准行为。目前没有标准 API 用于控制 switch 级别的重复规则行为。

**fnav-stat-type** — 在初始化时选择硬件计数器资源配置(仅包计数 / 仅字节计数 / 两者)。这是一个静态硬件配置决策,而非运行时开关。目前没有标准 API 用于选择计数器资源分配模式。

**drv-sw-stats** — 某些包类型上硬件计数器可能存在不准确性。此参数启用 Rx 数据路径中的软件累计路径作为 workaround。per-packet 软件分类的开销使得通过 devarg 控制而非始终开启是合理的。计数器通过 xstats 暴露(`rx_sw_unicast_packets` 等)。

**sched-layer-mode** — 不同硬件 SKU 支持不同的最大调度层级数(0-3)。该参数在初始化时限制层级深度。`rte_tm` API 目前没有在配置前查询最大调度深度的接口。

**high-performance-mode** — Tx 绕过硬件调度器以获得更低延迟。不适合作为默认值,因为 TM 功能对许多用户很重要。现已记录完整的性能权衡说明。

**rx-low-latency** — Rx 中断调节调优。类似 devarg 在其他驱动中也存在(例如 mlx5 的 `rxqs_min_bth`)。

**function-flow-direct** — 控制 flow rule 的 PF/VF 重定向范围。`rte_flow` 没有针对此功能的 per-rule 标志。

---

### v2 计划变更

1. devarg 解析器中 `strtoul()` 后增加防御性 `ULONG_MAX` 检查
2. testpmd 命令移出驱动库;删除 14 个 `RTE_EXPORT_EXPERIMENTAL_SYMBOL`
3. 实现 `rte_flow_dev_dump` op;删除 "sxe2 flow rule dump" 命令
4. 删除 "sxe2 udp_tunnel_port add|rm" 命令(冗余)
5. 完善 xstats 覆盖;删除 "sxe2 show stats" 命令
6. 删除 ipsec SA 管理 testpmd 命令
7. 保留 7 个 devargs,在 sxe2.rst 中提供完整文档和理由
8. 为所有 devargs 添加 `RTE_PMD_REGISTER_PARAM_STRING`

我们将尽快发送 v2。如有其他问题请告知。

谢谢,
[署名]


^ permalink raw reply

* [PATCH v2 00/20] sxe2: address review comments - testpmd restructuring, devargs documentation, and code cleanup
From: liujie5 @ 2026-06-14  9:23 UTC (permalink / raw)
  To: stephen; +Cc: dev
In-Reply-To: <20260610013936.3634968-21-liujie5@linkdatatechnology.com>

Hi all,

Thank you for the detailed review of the sxe2 PMD patch series. We have
prepared a v2 addressing the feedback. Below is our point-by-point response.

---

### 1. strtoul() out-of-range handling

> This memory stuff looks problematic and needs more review. At a minimum
> I see a pattern of not handling values from strtoul() that are out of range.

After re-examining the devargs parsing code in sxe2_ethdev.c, we confirmed
that the existing parsers handle out-of-range values:

- sxe2_parse_u8(): checks val > UINT8_MAX, returns -ERANGE
- sxe2_parse_bool(): checks bool_val != 0 && bool_val != 1
- sxe2_parse_fnav_stat_type(): checks val > the maximum valid enum value
- sxe2_parse_sched_layer_mode(): checks val > SXE2_TXSCH_NODE_ADJ_LVL_MAX
- sxe2_parse_high_performance_mode(): checks val != 1

Each function sets errno = 0 before strtoul() and checks errno != 0
afterward, which catches ERANGE (overflow). This is followed by a semantic
range check specific to each parameter.

Nevertheless, to make the overflow path more explicit and defensive, we will
add explicit ULONG_MAX comparisons after each strtoul() call in the v2.

---

### 2. testpmd command symbols exported from driver library

> Error: the command logic is placed in sxe2_testpmd_lib.c, compiled into
> the driver library, and exposed through 14 new RTE_EXPORT_EXPERIMENTAL_SYMBOL
> entries. No upstream driver exports symbols for its testpmd commands; all six
> existing drivers with testpmd integration compile their *_testpmd.c into
> testpmd via testpmd_sources and use internal access. These exports are
> vendor public API that any application can link against.

Agreed. This is a valid concern. We will restructure as follows:

- Move the testpmd command implementation from sxe2_testpmd_lib.c into
  sxe2_testpmd.c, compiled via testpmd_sources in the meson build,
  matching the pattern used by i40e, ixgbe, mlx5, bnxt, and others.
- Remove all 14 RTE_EXPORT_EXPERIMENTAL_SYMBOL entries. These were never
  intended as public API.
- Keep driver-internal functions accessible through internal headers only
  (they are already declared in driver-local headers under drivers/net/sxe2/).
- Move application state variables (g_tx_session[][], g_rx_session[][],
  g_esp_header_offset[], g_sess_pool) into the testpmd-side code.

---

### 3. Commands duplicating standard testpmd functionality

> Error: three commands duplicate standard testpmd functionality the driver
> already supports.
> "sxe2 flow rule dump" -> implement the rte_flow dev_dump op
> "sxe2 udp_tunnel_port add|rm" -> port config udp_tunnel_port add|rm
> "sxe2 show stats" -> show port xstats

Agreed on all three:

- **sxe2 flow rule dump**: We will implement the rte_flow_dev_dump() op in
  the driver. After that, the standard "flow dump <port_id> all" command
  works for every application. The private command will be removed.

- **sxe2 udp_tunnel_port add|rm**: This calls rte_eth_dev_udp_tunnel_port_add/
  rm, which is already exposed by "port config <port_id> udp_tunnel_port
  add|rm". The private command is redundant and will be removed.

- **sxe2 show stats**: We will audit the xstats implementation and ensure
  all statistics currently shown by the private formatter are available
  through rte_eth_xstats_get(). Any missing counters will be added as
  xstats entries. The private command will be removed; users will use
  "show port xstats <port_id>" instead.

---

### 4. IPsec SA management suite in testpmd commands

> Warning: the 9-subcommand ipsec suite (egress/ingress add/rm/show,
> session-id and esp-hdr-offset set/get, flush, stats) is an SA management
> application embedded in the driver. Inline crypto is exercised with
> examples/ipsec-secgw, as done for other inline-crypto PMDs.

Understood. We will:

- Remove the ipsec SA management commands from the testpmd extension.
- Document that examples/ipsec-secgw is the supported method for
  exercising inline crypto on this driver, consistent with other
  inline-crypto PMDs (e.g., mlx5, ice).
- If interactive SA management in testpmd is needed in the future,
  we will propose it as generic testpmd commands over rte_security,
  benefiting all drivers equally.

---

### 5. Private devargs

> Warning: seven private devargs are added with no documentation.
> Each surviving devarg needs documentation and a rationale for why
> no standard API covers it.

Documentation has been added to doc/guides/nics/sxe2.rst (Runtime
Configuration section) for all seven devargs, along with
RTE_PMD_REGISTER_PARAM_STRING for discoverability.

Below is the rationale for each devarg that we believe should be retained:

**flow-duplicate-pattern**
This controls whether the hardware switch engine accepts flow rules with
identical match patterns. It does NOT change rte_flow semantics; it
configures a hardware-specific behavior. The switch engine supports two
modes: (a) reject duplicate rules (useful for catching configuration
errors), and (b) allow duplicates by setting a per-rule metadata flag.
This is a hardware capability toggle, analogous to how some NICs support
configurable TCAM overlap behavior. Default value (1 = allow duplicates)
ensures standard rte_flow behavior out of the box. The standard rte_flow
API does not provide a mechanism to control switch-level duplicate rule
behavior.

**fnav-stat-type**
This selects how the Fnav flow engine partitions its hardware counter
resources (packets only, bytes only, or both). This is set at device
initialization and cannot be changed dynamically; it determines the
hardware resource allocation. When only packet counts are needed, the
hardware can allocate the full counter width to packet counting. The
resulting counters are queried via the standard rte_flow_query() API.
No standard API currently exists to select hardware counter resource
allocation mode at initialization time; it is a hardware-specific
configuration decision.

**drv-sw-stats**
Hardware packet statistics counters may be inaccurate for certain packet
types due to hardware design limitations. This devarg enables a software
accumulation path in the Rx data path as a workaround. When enabled, the
driver classifies each received packet (unicast/multicast/broadcast/drop)
and accumulates counts in software. The counters are exposed as xstats
(rx_sw_unicast_packets, etc.). The devarg controls whether the per-packet
software accumulation overhead is incurred; it is an optimization choice
for users who do not require this level of accuracy. The rationale for
keeping this as a devarg rather than always-on xstats is that the
software accumulation adds overhead to the Rx fast path; users who do
not need per-packet classification accuracy should not pay this cost.

**sched-layer-mode**
Different hardware SKUs support different numbers of Tx scheduling levels
(0-3). This parameter caps the maximum scheduling depth at initialization.
This is NOT a TM topology configuration -- the application still builds
its rte_tm hierarchy as usual. Rather, it constrains the hierarchy depth
to match hardware capability across SKU variants. The rte_tm API does not
currently provide a mechanism to query or constrain the maximum scheduling
depth before configuring the hierarchy, so a devarg is the practical
approach.

**high-performance-mode**
When enabled, the Tx path bypasses the hardware scheduling module and
sends packets directly to the port. This trades TM scheduling capability
for lower latency and higher throughput. It is not appropriate to make
this the default because TM functionality is important for many users.
The devarg (now documented) makes the performance trade-off explicit.

**rx-low-latency**
Controls whether Rx interrupt moderation is tuned for minimum latency vs.
batching efficiency. This is a hardware-specific tuning parameter with
no equivalent in the standard Ethdev API. Many drivers have analogous
low-latency devargs (e.g., mlx5 has rxqs_min_bth for similar purposes).

**function-flow-direct**
Controls whether flow rules can directly target a specific PF/VF function.
This is a hardware switch engine feature that affects the scope of flow
redirection. The standard rte_flow API does not have a per-rule flag for
this; the devarg controls the default redirect behavior.

---

### Summary of changes for v2

1. Add explicit ULONG_MAX checks after strtoul() in all devarg parsers
2. Move testpmd command implementation out of driver library into
   testpmd_sources; remove all 14 RTE_EXPORT_EXPERIMENTAL_SYMBOL
3. Implement rte_flow_dev_dump op; remove "sxe2 flow rule dump" command
4. Remove "sxe2 udp_tunnel_port add|rm" command (redundant)
5. Add missing counters to xstats; remove "sxe2 show stats" command
6. Remove ipsec SA management testpmd commands
7. Retain 7 devargs with documentation and rationale in sxe2.rst
8. Add RTE_PMD_REGISTER_PARAM_STRING for all devargs

We will send v2 shortly. Please let us know if there are any further
concerns.

Thanks,
[sender name]


^ permalink raw reply

* [PATCH v4 6/6] net/gve: reconstruct HW timestamps from DQO
From: Mark Blasko @ 2026-06-13  4:22 UTC (permalink / raw)
  To: stephen; +Cc: dev, joshwash, jtranoleary, blasko
In-Reply-To: <20260613042300.3760470-1-blasko@google.com>

A full 64-bit NIC timestamp is periodically synced via an AdminQ
command and cached in the driver. In the RX datapath, this cached
value is used as a base to expand the 32-bit hardware timestamp into
a full 64-bit value, which is then stored in the mbuf's dynamic
timestamp field.

Signed-off-by: Mark Blasko <blasko@google.com>
Reviewed-by: Joshua Washington <joshwash@google.com>
Reviewed-by: Jasper Tran O'Leary <jtranoleary@google.com>
---
v4:
    - Clarify in GVE documentation that both read_clock ethdev op and
      per-packet RX timestamp offloading require the DQO queue format.

v2:
    - Scoped timestamp offload capability advertisement strictly
      to DQO queues.
    - Predicated capability advertisement directly on memzone
      allocation.
    - Initialized mbuf_timestamp_offset to -1.
    - Added blank line separating release notes.
---
 doc/guides/nics/features/gve.ini       |  1 +
 doc/guides/nics/gve.rst                | 20 ++++++++++++++++++++
 doc/guides/rel_notes/release_26_07.rst |  4 ++++
 drivers/net/gve/base/gve_desc_dqo.h    |  8 ++++++--
 drivers/net/gve/gve_ethdev.c           | 15 ++++++++++++++-
 drivers/net/gve/gve_ethdev.h           | 25 +++++++++++++++++++++++++
 drivers/net/gve/gve_rx_dqo.c           | 26 ++++++++++++++++++++++++++
 7 files changed, 96 insertions(+), 3 deletions(-)

diff --git a/doc/guides/nics/features/gve.ini b/doc/guides/nics/features/gve.ini
index 89c97fd27a..117ad4fc65 100644
--- a/doc/guides/nics/features/gve.ini
+++ b/doc/guides/nics/features/gve.ini
@@ -13,6 +13,7 @@ RSS hash             = Y
 RSS key update       = Y
 RSS reta update      = Y
 L4 checksum offload  = Y
+Timestamp offload    = Y
 Basic stats          = Y
 FreeBSD              = Y
 Linux                = Y
diff --git a/doc/guides/nics/gve.rst b/doc/guides/nics/gve.rst
index be855b645d..b73375c5c9 100644
--- a/doc/guides/nics/gve.rst
+++ b/doc/guides/nics/gve.rst
@@ -72,6 +72,7 @@ Supported features of the GVE PMD are:
 - Tx UDP/TCP/SCTP Checksum
 - RSS hash configuration
 - RSS redirection table query and update
+- Timestamp offload
 
 Currently, only GQI_QPL and GQI_RDA queue format are supported in PMD.
 Jumbo Frame is not supported in PMD for now.
@@ -132,6 +133,25 @@ Security Protocols
 - Flow priorities are not supported (must be 0).
 - Masking is limited to full matches i.e. ``0x00...0`` or ``0xFF...F``.
 
+Timestamp Offload
+^^^^^^^^^^^^^^^^^
+
+The driver supports hardware-based packet timestamping on supported
+devices via the standard ``RTE_ETH_RX_OFFLOAD_TIMESTAMP`` offload capability.
+Both the ethdev ``.read_clock`` operation and per-packet RX timestamp offloading
+require the DQO queue format.
+
+**Limitations**
+
+- If the driver fails to fetch the NIC hardware clock for 7 consecutive periods,
+  the cached timestamp is marked as stale,
+  and the reconstructed timestamps are no longer propagated to the mbuf.
+- The timestamp reconstruction is only accurate
+  if the time between a packet's reception
+  and the last hardware clock sync is less than approximately 2 seconds.
+  The driver's internal clock sync period is set to respect this limitation.
+
+
 Device Reset
 ^^^^^^^^^^^^
 
diff --git a/doc/guides/rel_notes/release_26_07.rst b/doc/guides/rel_notes/release_26_07.rst
index 2e449d3ee8..73c9a36007 100644
--- a/doc/guides/rel_notes/release_26_07.rst
+++ b/doc/guides/rel_notes/release_26_07.rst
@@ -91,6 +91,10 @@ New Features
 
   Added network driver for the LinkData network adapters.
 
+* **Updated Google GVE net driver.**
+
+  * Added hardware timestamping support on DQO queues.
+
 * **Updated Intel iavf driver.**
 
   * Added support for transmitting LLDP packets based on mbuf packet type.
diff --git a/drivers/net/gve/base/gve_desc_dqo.h b/drivers/net/gve/base/gve_desc_dqo.h
index 71d9d60bb9..c1534959c2 100644
--- a/drivers/net/gve/base/gve_desc_dqo.h
+++ b/drivers/net/gve/base/gve_desc_dqo.h
@@ -226,7 +226,8 @@ struct gve_rx_compl_desc_dqo {
 
 	u8 status_error1;
 
-	__le16 reserved5;
+	u8 reserved5;
+	u8 ts_sub_nsecs_low;
 	__le16 buf_id; /* Buffer ID which was sent on the buffer queue. */
 
 	union {
@@ -237,9 +238,12 @@ struct gve_rx_compl_desc_dqo {
 	};
 	__le32 hash;
 	__le32 reserved6;
-	__le64 reserved7;
+	__le32 reserved7;
+	__le32 ts; /* timestamp in nanosecs */
 } __packed;
 
+#define GVE_DQO_RX_HWTSTAMP_VALID 0x1
+
 GVE_CHECK_STRUCT_LEN(32, gve_rx_compl_desc_dqo);
 
 /* Ringing the doorbell too often can hurt performance.
diff --git a/drivers/net/gve/gve_ethdev.c b/drivers/net/gve/gve_ethdev.c
index 111f66efa8..0b02dcb3ad 100644
--- a/drivers/net/gve/gve_ethdev.c
+++ b/drivers/net/gve/gve_ethdev.c
@@ -214,6 +214,7 @@ static int
 gve_dev_configure(struct rte_eth_dev *dev)
 {
 	struct gve_priv *priv = dev->data->dev_private;
+	int err;
 
 	if (dev->data->dev_conf.rxmode.mq_mode & RTE_ETH_MQ_RX_RSS_FLAG) {
 		dev->data->dev_conf.rxmode.offloads |= RTE_ETH_RX_OFFLOAD_RSS_HASH;
@@ -223,13 +224,22 @@ gve_dev_configure(struct rte_eth_dev *dev)
 	if (dev->data->dev_conf.rxmode.offloads & RTE_ETH_RX_OFFLOAD_TCP_LRO)
 		priv->enable_rsc = 1;
 
+	if (dev->data->dev_conf.rxmode.offloads & RTE_ETH_RX_OFFLOAD_TIMESTAMP) {
+		err = rte_mbuf_dyn_rx_timestamp_register(&priv->mbuf_timestamp_offset,
+							 &priv->mbuf_timestamp_mask);
+		if (err < 0) {
+			PMD_DRV_LOG(ERR, "Failed to register dynamic timestamp field");
+			return err;
+		}
+	}
+
 	/* Reset RSS RETA in case number of queues changed. */
 	if (priv->rss_config.indir) {
 		struct gve_rss_config update_reta_config;
 		gve_init_rss_config_from_priv(priv, &update_reta_config);
 		gve_generate_rss_reta(dev, &update_reta_config);
 
-		int err = gve_adminq_configure_rss(priv, &update_reta_config);
+		err = gve_adminq_configure_rss(priv, &update_reta_config);
 		if (err)
 			PMD_DRV_LOG(ERR,
 				"Could not reconfigure RSS redirection table.");
@@ -828,6 +838,8 @@ gve_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 	dev_info->min_mtu = RTE_ETHER_MIN_MTU;
 
 	dev_info->rx_offload_capa = RTE_ETH_RX_OFFLOAD_RSS_HASH;
+	if (!gve_is_gqi(priv) && priv->nic_ts_report_mz)
+		dev_info->rx_offload_capa |= RTE_ETH_RX_OFFLOAD_TIMESTAMP;
 	dev_info->tx_offload_capa =
 		RTE_ETH_TX_OFFLOAD_MULTI_SEGS	|
 		RTE_ETH_TX_OFFLOAD_UDP_CKSUM	|
@@ -1683,6 +1695,7 @@ gve_dev_init(struct rte_eth_dev *eth_dev)
 	pthread_mutex_init(&priv->nic_ts_lock, &mutexattr);
 	pthread_mutexattr_destroy(&mutexattr);
 
+	priv->mbuf_timestamp_offset = -1;
 	err = gve_init_priv(priv, false);
 	if (err) {
 		pthread_mutex_destroy(&priv->flow_rule_lock);
diff --git a/drivers/net/gve/gve_ethdev.h b/drivers/net/gve/gve_ethdev.h
index 1e80f3a906..4a7e5ecdf3 100644
--- a/drivers/net/gve/gve_ethdev.h
+++ b/drivers/net/gve/gve_ethdev.h
@@ -261,6 +261,7 @@ struct gve_rx_queue {
 	struct rte_mbuf **refill_bufs;
 
 	uint8_t is_gqi_qpl;
+	bool timestamp_enabled;
 };
 
 struct gve_flow {
@@ -372,8 +373,32 @@ struct gve_priv {
 	RTE_ATOMIC(uint8_t) nic_ts_stale;
 	rte_thread_t nic_ts_thread_id;
 	RTE_ATOMIC(uint8_t) nic_ts_thread_should_stop;
+
+	int mbuf_timestamp_offset;
+	uint64_t mbuf_timestamp_mask;
 };
 
+/* Expand the hardware timestamp to the full 64 bits of width.
+ *
+ * This algorithm works by using the passed hardware timestamp to generate a
+ * diff relative to the last read of the nic clock. This diff can be positive or
+ * negative, as it is possible that we have read the clock more recently than
+ * the hardware has received this packet. To detect this, we use the high bit of
+ * the diff, and assume that the read is more recent if the high bit is set. In
+ * this case we invert the process.
+ *
+ * Note that this means if the time delta between packet reception and the last
+ * clock read is greater than ~2 seconds, this will provide invalid results.
+ */
+static inline uint64_t
+gve_reconstruct_ts(uint64_t last_sync, uint32_t ts)
+{
+	uint32_t low = (uint32_t)last_sync;
+	int32_t diff = (int32_t)(ts - low);
+
+	return last_sync + diff;
+}
+
 static inline bool
 gve_is_gqi(struct gve_priv *priv)
 {
diff --git a/drivers/net/gve/gve_rx_dqo.c b/drivers/net/gve/gve_rx_dqo.c
index 8035aee572..cc343f3fd8 100644
--- a/drivers/net/gve/gve_rx_dqo.c
+++ b/drivers/net/gve/gve_rx_dqo.c
@@ -160,6 +160,8 @@ gve_rx_burst_dqo(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 {
 	volatile struct gve_rx_compl_desc_dqo *rx_desc;
 	struct gve_rx_queue *rxq;
+	uint64_t last_sync = 0;
+	struct gve_priv *priv;
 	struct rte_mbuf *rxm;
 	uint16_t rx_buf_id;
 	uint16_t pkt_len;
@@ -171,6 +173,15 @@ gve_rx_burst_dqo(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	nb_rx = 0;
 	rxq = rx_queue;
 	rx_id = rxq->rx_tail;
+	priv = rxq->hw;
+
+	if (rxq->timestamp_enabled &&
+	    !rte_atomic_load_explicit(&priv->nic_ts_stale,
+				      rte_memory_order_acquire)) {
+		last_sync =
+			rte_atomic_load_explicit(&priv->last_read_nic_timestamp,
+						 rte_memory_order_relaxed);
+	}
 
 	while (nb_rx < nb_pkts) {
 		rx_desc = &rxq->compl_ring[rx_id];
@@ -208,6 +219,16 @@ gve_rx_burst_dqo(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 		gve_parse_csum_ol_flags(rxm, rx_desc);
 		rxm->hash.rss = rte_le_to_cpu_32(rx_desc->hash);
 
+		if (last_sync != 0 &&
+		    (rx_desc->ts_sub_nsecs_low & GVE_DQO_RX_HWTSTAMP_VALID) &&
+		    priv->mbuf_timestamp_offset >= 0) {
+			uint32_t ts = rte_le_to_cpu_32(rx_desc->ts);
+			uint64_t full_ts = gve_reconstruct_ts(last_sync, ts);
+
+			*RTE_MBUF_DYNFIELD(rxm, priv->mbuf_timestamp_offset, uint64_t *) = full_ts;
+			rxm->ol_flags |= priv->mbuf_timestamp_mask;
+		}
+
 		rx_pkts[nb_rx++] = rxm;
 		bytes += pkt_len;
 	}
@@ -320,6 +341,11 @@ gve_rx_queue_setup_dqo(struct rte_eth_dev *dev, uint16_t queue_id,
 		return -ENOMEM;
 	}
 
+	/* Setup hardware timestamping if enabled */
+	if ((conf->offloads & RTE_ETH_RX_OFFLOAD_TIMESTAMP) ||
+	    (dev->data->dev_conf.rxmode.offloads & RTE_ETH_RX_OFFLOAD_TIMESTAMP))
+		rxq->timestamp_enabled = true;
+
 	/* check free_thresh here */
 	free_thresh = conf->rx_free_thresh ?
 			conf->rx_free_thresh : GVE_DEFAULT_RX_FREE_THRESH;
-- 
2.54.0.1136.gdb2ca164c4-goog


^ permalink raw reply related

* [PATCH v4 5/6] net/gve: support read clock ethdev op
From: Mark Blasko @ 2026-06-13  4:22 UTC (permalink / raw)
  To: stephen; +Cc: dev, joshwash, jtranoleary, blasko
In-Reply-To: <20260613042300.3760470-1-blasko@google.com>

Implement the read_clock operation in eth_dev_ops. The function calls
the AdminQ command to fetch the current NIC timestamp synchronously,
updates the cached timestamp used for reconstruction, and returns the
full 64-bit value.

Signed-off-by: Mark Blasko <blasko@google.com>
Reviewed-by: Joshua Washington <joshwash@google.com>
Reviewed-by: Jasper Tran O'Leary <jtranoleary@google.com>
---
v4:
    - Fix mutex initialization order: initialize nic_ts_lock before
      calling gve_init_priv() (which invokes early setup-time read).
    - Fix mutex teardown order: destroy nic_ts_lock only after the
      background sync thread has been joined and stopped.

v3:
    - Add mutex lock to protect shared NIC timestamp memzone access.
    - Fix missing read_clock assignment to DQO queue ops table
      (accidental omission in v2).

v2:
    - Scoped read_clock ethdev operation strictly to DQO queues.
---
 drivers/net/gve/gve_ethdev.c | 57 +++++++++++++++++++++++++++++++-----
 drivers/net/gve/gve_ethdev.h |  1 +
 2 files changed, 50 insertions(+), 8 deletions(-)

diff --git a/drivers/net/gve/gve_ethdev.c b/drivers/net/gve/gve_ethdev.c
index fa91575b3e..111f66efa8 100644
--- a/drivers/net/gve/gve_ethdev.c
+++ b/drivers/net/gve/gve_ethdev.c
@@ -463,11 +463,13 @@ gve_read_nic_clock(void *arg)
 	if (!priv || !priv->nic_ts_report_mz)
 		return;
 
+	pthread_mutex_lock(&priv->nic_ts_lock);
 	memset(priv->nic_ts_report, 0, sizeof(struct gve_nic_ts_report));
 
 	err = gve_adminq_report_nic_timestamp(priv, priv->nic_ts_report_mz->iova);
 	if (err == 0) {
 		ts = be64_to_cpu(priv->nic_ts_report->nic_timestamp);
+		pthread_mutex_unlock(&priv->nic_ts_lock);
 		rte_atomic_store_explicit(&priv->last_read_nic_timestamp, ts,
 					  rte_memory_order_relaxed);
 		PMD_DRV_LOG(DEBUG, "Fetched NIC Timestamp: %" PRIu64, ts);
@@ -476,6 +478,7 @@ gve_read_nic_clock(void *arg)
 		rte_atomic_store_explicit(&priv->nic_ts_stale, 0,
 					  rte_memory_order_release);
 	} else {
+		pthread_mutex_unlock(&priv->nic_ts_lock);
 		PMD_DRV_LOG(ERR, "Failed to read NIC clock, AQ err: %d", err);
 		fails = rte_atomic_fetch_add_explicit(&priv->nic_ts_read_fails, 1,
 						      rte_memory_order_relaxed) + 1;
@@ -705,12 +708,13 @@ gve_dev_close(struct rte_eth_dev *dev)
 	if (gve_get_flow_subsystem_ok(priv))
 		gve_teardown_flow_subsystem(priv);
 
-	pthread_mutex_destroy(&priv->flow_rule_lock);
-
 	gve_free_queues(dev);
 	gve_teardown_device_resources(priv);
 	gve_adminq_free(priv);
 
+	pthread_mutex_destroy(&priv->flow_rule_lock);
+	pthread_mutex_destroy(&priv->nic_ts_lock);
+
 	dev->data->mac_addrs = NULL;
 
 	return err;
@@ -1278,6 +1282,38 @@ gve_flow_ops_get(struct rte_eth_dev *dev, const struct rte_flow_ops **ops)
 	return 0;
 }
 
+static int
+gve_read_clock(struct rte_eth_dev *dev, uint64_t *clock)
+{
+	struct gve_priv *priv = dev->data->dev_private;
+	uint64_t ts;
+	int err;
+
+	if (!priv->nic_timestamp_supported)
+		return -EOPNOTSUPP;
+
+	if (!priv->nic_ts_report_mz)
+		return -EIO;
+
+	pthread_mutex_lock(&priv->nic_ts_lock);
+	err = gve_adminq_report_nic_timestamp(priv, priv->nic_ts_report_mz->iova);
+	if (err != 0) {
+		pthread_mutex_unlock(&priv->nic_ts_lock);
+		return err;
+	}
+
+	ts = be64_to_cpu(priv->nic_ts_report->nic_timestamp);
+	pthread_mutex_unlock(&priv->nic_ts_lock);
+	*clock = ts;
+
+	/* Update the cached value */
+	rte_atomic_store_explicit(&priv->last_read_nic_timestamp, ts, rte_memory_order_relaxed);
+	rte_atomic_store_explicit(&priv->nic_ts_read_fails, 0, rte_memory_order_relaxed);
+	rte_atomic_store_explicit(&priv->nic_ts_stale, 0, rte_memory_order_release);
+
+	return 0;
+}
+
 static const struct eth_dev_ops gve_eth_dev_ops = {
 	.dev_configure        = gve_dev_configure,
 	.dev_start            = gve_dev_start,
@@ -1332,6 +1368,7 @@ static const struct eth_dev_ops gve_eth_dev_ops_dqo = {
 	.rss_hash_conf_get    = gve_rss_hash_conf_get,
 	.reta_update          = gve_rss_reta_update,
 	.reta_query           = gve_rss_reta_query,
+	.read_clock           = gve_read_clock,
 };
 
 static int
@@ -1640,9 +1677,18 @@ gve_dev_init(struct rte_eth_dev *eth_dev)
 	priv->max_nb_txq = max_tx_queues;
 	priv->max_nb_rxq = max_rx_queues;
 
+	pthread_mutexattr_init(&mutexattr);
+	pthread_mutexattr_setpshared(&mutexattr, PTHREAD_PROCESS_SHARED);
+	pthread_mutex_init(&priv->flow_rule_lock, &mutexattr);
+	pthread_mutex_init(&priv->nic_ts_lock, &mutexattr);
+	pthread_mutexattr_destroy(&mutexattr);
+
 	err = gve_init_priv(priv, false);
-	if (err)
+	if (err) {
+		pthread_mutex_destroy(&priv->flow_rule_lock);
+		pthread_mutex_destroy(&priv->nic_ts_lock);
 		return err;
+	}
 
 	if (gve_is_gqi(priv)) {
 		eth_dev->dev_ops = &gve_eth_dev_ops;
@@ -1656,11 +1702,6 @@ gve_dev_init(struct rte_eth_dev *eth_dev)
 
 	eth_dev->data->mac_addrs = &priv->dev_addr;
 
-	pthread_mutexattr_init(&mutexattr);
-	pthread_mutexattr_setpshared(&mutexattr, PTHREAD_PROCESS_SHARED);
-	pthread_mutex_init(&priv->flow_rule_lock, &mutexattr);
-	pthread_mutexattr_destroy(&mutexattr);
-
 	return 0;
 }
 
diff --git a/drivers/net/gve/gve_ethdev.h b/drivers/net/gve/gve_ethdev.h
index 4dcbaa9971..1e80f3a906 100644
--- a/drivers/net/gve/gve_ethdev.h
+++ b/drivers/net/gve/gve_ethdev.h
@@ -366,6 +366,7 @@ struct gve_priv {
 	bool nic_timestamp_supported;
 	const struct rte_memzone *nic_ts_report_mz;
 	struct gve_nic_ts_report *nic_ts_report;
+	pthread_mutex_t nic_ts_lock;
 	RTE_ATOMIC(uint64_t) last_read_nic_timestamp;
 	RTE_ATOMIC(uint32_t) nic_ts_read_fails;
 	RTE_ATOMIC(uint8_t) nic_ts_stale;
-- 
2.54.0.1136.gdb2ca164c4-goog


^ permalink raw reply related

* [PATCH v4 4/6] net/gve: add periodic NIC clock synchronization
From: Mark Blasko @ 2026-06-13  4:22 UTC (permalink / raw)
  To: stephen; +Cc: dev, joshwash, jtranoleary, blasko
In-Reply-To: <20260613042300.3760470-1-blasko@google.com>

Introduce a mechanism to periodically fetch the NIC hardware timestamp
using the GVE_ADMINQ_REPORT_NIC_TIMESTAMP AdminQ command. The
synchronization runs every 250ms using rte_alarm. If the read fails,
the alarm is still rescheduled. After 7 consecutive failures, the
timestamp is marked as stale, indicating to the RX path that
reconstructed timestamps may be unreliable.

Atomics exist because of the potential for async callers (introduced
here) and async callers (introduced later in the RX datapath) accessing
the cached state.

Signed-off-by: Mark Blasko <blasko@google.com>
Reviewed-by: Joshua Washington <joshwash@google.com>
Reviewed-by: Jasper Tran O'Leary <jtranoleary@google.com>
---
v4:
    - Replace EAL alarm thread with a dedicated control thread to
      prevent blocking shared EAL interrupt thread for up to 2s
      on GVE AdminQ timeouts.
    - Implement incremental sleep loop to support fast termination.

v2:
    - Removed redundant void* casts.
    - Handled alarm reschedule failures by marking timestamp stale.
    - Added transient error logging on memzone allocation failure.
---
 drivers/net/gve/gve_ethdev.c | 122 +++++++++++++++++++++++++++++++++++
 drivers/net/gve/gve_ethdev.h |  12 ++++
 2 files changed, 134 insertions(+)

diff --git a/drivers/net/gve/gve_ethdev.c b/drivers/net/gve/gve_ethdev.c
index 476b2c311f..fa91575b3e 100644
--- a/drivers/net/gve/gve_ethdev.c
+++ b/drivers/net/gve/gve_ethdev.c
@@ -452,6 +452,94 @@ gve_dev_start(struct rte_eth_dev *dev)
 	return 0;
 }
 
+static void
+gve_read_nic_clock(void *arg)
+{
+	struct gve_priv *priv = arg;
+	uint32_t fails;
+	uint64_t ts;
+	int err;
+
+	if (!priv || !priv->nic_ts_report_mz)
+		return;
+
+	memset(priv->nic_ts_report, 0, sizeof(struct gve_nic_ts_report));
+
+	err = gve_adminq_report_nic_timestamp(priv, priv->nic_ts_report_mz->iova);
+	if (err == 0) {
+		ts = be64_to_cpu(priv->nic_ts_report->nic_timestamp);
+		rte_atomic_store_explicit(&priv->last_read_nic_timestamp, ts,
+					  rte_memory_order_relaxed);
+		PMD_DRV_LOG(DEBUG, "Fetched NIC Timestamp: %" PRIu64, ts);
+		rte_atomic_store_explicit(&priv->nic_ts_read_fails, 0,
+					  rte_memory_order_relaxed);
+		rte_atomic_store_explicit(&priv->nic_ts_stale, 0,
+					  rte_memory_order_release);
+	} else {
+		PMD_DRV_LOG(ERR, "Failed to read NIC clock, AQ err: %d", err);
+		fails = rte_atomic_fetch_add_explicit(&priv->nic_ts_read_fails, 1,
+						      rte_memory_order_relaxed) + 1;
+		if (fails >= GVE_NIC_CLOCK_READ_MAX_FAILS) {
+			if (!rte_atomic_load_explicit(&priv->nic_ts_stale,
+						      rte_memory_order_relaxed))
+				PMD_DRV_LOG(ERR,
+					"NIC timestamping marked as stale after %u consecutive failures",
+					GVE_NIC_CLOCK_READ_MAX_FAILS);
+			rte_atomic_store_explicit(&priv->nic_ts_stale, 1,
+						  rte_memory_order_release);
+		}
+	}
+}
+
+static uint32_t
+gve_nic_ts_thread(void *arg)
+{
+	struct gve_priv *priv = arg;
+	unsigned int sleep_cnt;
+
+	while (!rte_atomic_load_explicit(&priv->nic_ts_thread_should_stop,
+					 rte_memory_order_relaxed)) {
+		gve_read_nic_clock(priv);
+		for (sleep_cnt = 0; sleep_cnt < GVE_NIC_CLOCK_READ_PERIOD_MS / 10; sleep_cnt++) {
+			if (rte_atomic_load_explicit(&priv->nic_ts_thread_should_stop,
+						     rte_memory_order_relaxed))
+				break;
+			rte_delay_us_sleep(10000);
+		}
+	}
+	return 0;
+}
+
+static int
+gve_alloc_nic_ts_report(struct gve_priv *priv)
+{
+	char z_name[RTE_MEMZONE_NAMESIZE];
+
+	snprintf(z_name, sizeof(z_name), "gve_%s_nic_ts_report",
+		 priv->pci_dev->device.name);
+	priv->nic_ts_report_mz = rte_memzone_reserve_aligned(z_name,
+			sizeof(struct gve_nic_ts_report), rte_socket_id(),
+			RTE_MEMZONE_IOVA_CONTIG, PAGE_SIZE);
+
+	if (!priv->nic_ts_report_mz) {
+		PMD_DRV_LOG(ERR, "Failed to allocate memzone for NIC TS report");
+		return -ENOMEM;
+	}
+	priv->nic_ts_report = priv->nic_ts_report_mz->addr;
+	rte_atomic_store_explicit(&priv->nic_ts_read_fails, 0, rte_memory_order_relaxed);
+	return 0;
+}
+
+static void
+gve_free_nic_ts_report(struct gve_priv *priv)
+{
+	if (priv->nic_ts_report_mz) {
+		rte_memzone_free(priv->nic_ts_report_mz);
+		priv->nic_ts_report_mz = NULL;
+		priv->nic_ts_report = NULL;
+	}
+}
+
 static int
 gve_dev_stop(struct rte_eth_dev *dev)
 {
@@ -586,6 +674,13 @@ gve_teardown_device_resources(struct gve_priv *priv)
 				err);
 	}
 
+	if (priv->nic_ts_report_mz) {
+		rte_atomic_store_explicit(&priv->nic_ts_thread_should_stop, 1,
+					  rte_memory_order_relaxed);
+		rte_thread_join(priv->nic_ts_thread_id, NULL);
+		gve_free_nic_ts_report(priv);
+	}
+
 	gve_free_ptype_lut_dqo(priv);
 	gve_free_counter_array(priv);
 	gve_free_irq_db(priv);
@@ -1252,6 +1347,32 @@ pci_dev_msix_vec_count(struct rte_pci_device *pdev)
 	return 0;
 }
 
+static void
+gve_setup_nic_timestamp(struct gve_priv *priv)
+{
+	int err;
+
+	if (!priv->nic_timestamp_supported)
+		return;
+
+	rte_atomic_store_explicit(&priv->nic_ts_read_fails, 0, rte_memory_order_relaxed);
+	rte_atomic_store_explicit(&priv->nic_ts_stale, 1, rte_memory_order_relaxed);
+	err = gve_alloc_nic_ts_report(priv);
+	if (err == 0) {
+		gve_read_nic_clock(priv);
+		rte_atomic_store_explicit(&priv->nic_ts_thread_should_stop, 0,
+					  rte_memory_order_relaxed);
+		err = rte_thread_create_internal_control(&priv->nic_ts_thread_id, "gve-ts",
+							 gve_nic_ts_thread, priv);
+		if (err != 0) {
+			PMD_DRV_LOG(ERR, "Failed to create NIC clock sync thread, err=%d", err);
+			gve_free_nic_ts_report(priv);
+		}
+	} else {
+		PMD_DRV_LOG(ERR, "Failed to allocate memory for NIC timestamping subsystem. Please reset device to retry.");
+	}
+}
+
 static int
 gve_setup_device_resources(struct gve_priv *priv)
 {
@@ -1307,6 +1428,7 @@ gve_setup_device_resources(struct gve_priv *priv)
 			goto free_ptype_lut;
 		}
 	}
+	gve_setup_nic_timestamp(priv);
 
 	gve_set_device_resources_ok(priv);
 
diff --git a/drivers/net/gve/gve_ethdev.h b/drivers/net/gve/gve_ethdev.h
index b67f82c263..4dcbaa9971 100644
--- a/drivers/net/gve/gve_ethdev.h
+++ b/drivers/net/gve/gve_ethdev.h
@@ -12,6 +12,8 @@
 #include <rte_pci.h>
 #include <pthread.h>
 #include <rte_bitmap.h>
+#include <rte_memzone.h>
+#include <rte_thread.h>
 
 #include "base/gve.h"
 
@@ -39,6 +41,9 @@
 #define GVE_RSS_HASH_KEY_SIZE 40
 #define GVE_RSS_INDIR_SIZE 128
 
+#define GVE_NIC_CLOCK_READ_PERIOD_MS 250
+#define GVE_NIC_CLOCK_READ_MAX_FAILS 7
+
 #define GVE_TX_CKSUM_OFFLOAD_MASK (		\
 		RTE_MBUF_F_TX_L4_MASK  |	\
 		RTE_MBUF_F_TX_TCP_SEG)
@@ -359,6 +364,13 @@ struct gve_priv {
 
 	/* HW Timestamping Fields */
 	bool nic_timestamp_supported;
+	const struct rte_memzone *nic_ts_report_mz;
+	struct gve_nic_ts_report *nic_ts_report;
+	RTE_ATOMIC(uint64_t) last_read_nic_timestamp;
+	RTE_ATOMIC(uint32_t) nic_ts_read_fails;
+	RTE_ATOMIC(uint8_t) nic_ts_stale;
+	rte_thread_t nic_ts_thread_id;
+	RTE_ATOMIC(uint8_t) nic_ts_thread_should_stop;
 };
 
 static inline bool
-- 
2.54.0.1136.gdb2ca164c4-goog


^ permalink raw reply related

* [PATCH v4 3/6] net/gve: add AdminQ command for NIC timestamps
From: Mark Blasko @ 2026-06-13  4:22 UTC (permalink / raw)
  To: stephen; +Cc: dev, joshwash, jtranoleary, blasko
In-Reply-To: <20260613042300.3760470-1-blasko@google.com>

Introduce the necessary definitions and functions for the
GVE_ADMINQ_REPORT_NIC_TIMESTAMP AdminQ command.

Signed-off-by: Mark Blasko <blasko@google.com>
Reviewed-by: Joshua Washington <joshwash@google.com>
Reviewed-by: Jasper Tran O'Leary <jtranoleary@google.com>
---
v2:
    - Added adminq timestamp counter reset to gve_adminq_alloc.
---
 drivers/net/gve/base/gve_adminq.c | 20 ++++++++++++++++++++
 drivers/net/gve/base/gve_adminq.h | 20 ++++++++++++++++++++
 drivers/net/gve/gve_ethdev.h      |  1 +
 3 files changed, 41 insertions(+)

diff --git a/drivers/net/gve/base/gve_adminq.c b/drivers/net/gve/base/gve_adminq.c
index 1ced1e442e..2b25c7f390 100644
--- a/drivers/net/gve/base/gve_adminq.c
+++ b/drivers/net/gve/base/gve_adminq.c
@@ -263,6 +263,8 @@ int gve_adminq_alloc(struct gve_priv *priv)
 	priv->adminq_get_ptype_map_cnt = 0;
 	priv->adminq_cfg_flow_rule_cnt = 0;
 
+	priv->adminq_report_nic_timestamp_cnt = 0;
+
 	pthread_mutexattr_init(&mutexattr);
 	pthread_mutexattr_setpshared(&mutexattr, PTHREAD_PROCESS_SHARED);
 	pthread_mutex_init(&priv->adminq_lock, &mutexattr);
@@ -522,6 +524,10 @@ static int gve_adminq_issue_cmd(struct gve_priv *priv,
 	case GVE_ADMINQ_CONFIGURE_FLOW_RULE:
 		priv->adminq_cfg_flow_rule_cnt++;
 		break;
+	case GVE_ADMINQ_REPORT_NIC_TIMESTAMP:
+		priv->adminq_report_nic_timestamp_cnt++;
+		break;
+
 	default:
 		PMD_DRV_LOG(ERR, "unknown AQ command opcode %d", opcode);
 	}
@@ -636,6 +642,20 @@ int gve_adminq_reset_flow_rules(struct gve_priv *priv)
 	return gve_adminq_configure_flow_rule(priv, &flow_rule_cmd);
 }
 
+int gve_adminq_report_nic_timestamp(struct gve_priv *priv, dma_addr_t nic_ts_report_addr)
+{
+	union gve_adminq_command cmd;
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.opcode = cpu_to_be32(GVE_ADMINQ_REPORT_NIC_TIMESTAMP);
+	cmd.report_nic_timestamp = (struct gve_adminq_report_nic_timestamp) {
+		.nic_ts_report_len = cpu_to_be64(sizeof(struct gve_nic_ts_report)),
+		.nic_timestamp_addr = cpu_to_be64(nic_ts_report_addr),
+	};
+
+	return gve_adminq_execute_cmd(priv, &cmd);
+}
+
 /* The device specifies that the management vector can either be the first irq
  * or the last irq. ntfy_blk_msix_base_idx indicates the first irq assigned to
  * the ntfy blks. It if is 0 then the management vector is last, if it is 1 then
diff --git a/drivers/net/gve/base/gve_adminq.h b/drivers/net/gve/base/gve_adminq.h
index eaee5649f2..954be39fbf 100644
--- a/drivers/net/gve/base/gve_adminq.h
+++ b/drivers/net/gve/base/gve_adminq.h
@@ -26,6 +26,7 @@ enum gve_adminq_opcodes {
 	GVE_ADMINQ_REPORT_LINK_SPEED		= 0xD,
 	GVE_ADMINQ_GET_PTYPE_MAP		= 0xE,
 	GVE_ADMINQ_VERIFY_DRIVER_COMPATIBILITY	= 0xF,
+	GVE_ADMINQ_REPORT_NIC_TIMESTAMP		= 0x11,
 	/* For commands that are larger than 56 bytes */
 	GVE_ADMINQ_EXTENDED_COMMAND		= 0xFF,
 };
@@ -373,6 +374,23 @@ struct gve_stats_report {
 
 GVE_CHECK_STRUCT_LEN(8, gve_stats_report);
 
+struct gve_adminq_report_nic_timestamp {
+	__be64 nic_ts_report_len;
+	__be64 nic_timestamp_addr;
+};
+
+GVE_CHECK_STRUCT_LEN(16, gve_adminq_report_nic_timestamp);
+
+struct gve_nic_ts_report {
+	__be64 nic_timestamp; /* NIC clock in nanoseconds */
+	__be64 pre_cycles; /* System cycle counter before NIC clock read */
+	__be64 post_cycles; /* System cycle counter after NIC clock read */
+	__be64 reserved3;
+	__be64 reserved4;
+};
+
+GVE_CHECK_STRUCT_LEN(40, gve_nic_ts_report);
+
 /* Numbers of gve tx/rx stats in stats report. */
 #define GVE_TX_STATS_REPORT_NUM        6
 #define GVE_RX_STATS_REPORT_NUM        2
@@ -490,6 +508,7 @@ union gve_adminq_command {
 			struct gve_adminq_verify_driver_compatibility
 				verify_driver_compatibility;
 			struct gve_adminq_extended_command extended_command;
+			struct gve_adminq_report_nic_timestamp report_nic_timestamp;
 		};
 	};
 	u8 reserved[64];
@@ -537,5 +556,6 @@ int gve_adminq_add_flow_rule(struct gve_priv *priv,
 			     struct gve_flow_rule_params *rule, u32 loc);
 int gve_adminq_del_flow_rule(struct gve_priv *priv, u32 loc);
 int gve_adminq_reset_flow_rules(struct gve_priv *priv);
+int gve_adminq_report_nic_timestamp(struct gve_priv *priv, dma_addr_t nic_ts_report_addr);
 
 #endif /* _GVE_ADMINQ_H */
diff --git a/drivers/net/gve/gve_ethdev.h b/drivers/net/gve/gve_ethdev.h
index b9b4688367..b67f82c263 100644
--- a/drivers/net/gve/gve_ethdev.h
+++ b/drivers/net/gve/gve_ethdev.h
@@ -328,6 +328,7 @@ struct gve_priv {
 	uint32_t adminq_get_ptype_map_cnt;
 	uint32_t adminq_verify_driver_compatibility_cnt;
 	uint32_t adminq_cfg_flow_rule_cnt;
+	uint32_t adminq_report_nic_timestamp_cnt;
 	volatile uint32_t state_flags;
 
 	/* Gvnic device link speed from hypervisor. */
-- 
2.54.0.1136.gdb2ca164c4-goog


^ permalink raw reply related

* [PATCH v4 2/6] net/gve: add device option support for HW timestamps
From: Mark Blasko @ 2026-06-13  4:22 UTC (permalink / raw)
  To: stephen; +Cc: dev, joshwash, jtranoleary, blasko
In-Reply-To: <20260613042300.3760470-1-blasko@google.com>

Introduce the necessary definitions and functions for the device
option flag (GVE_DEV_OPT_ID_NIC_TIMESTAMP) to detect hardware
timestamping support in the gvnic device.

Signed-off-by: Mark Blasko <blasko@google.com>
Reviewed-by: Joshua Washington <joshwash@google.com>
Reviewed-by: Jasper Tran O'Leary <jtranoleary@google.com>
---
 drivers/net/gve/base/gve_adminq.c | 41 ++++++++++++++++++++++++++-----
 drivers/net/gve/base/gve_adminq.h |  9 +++++++
 drivers/net/gve/gve_ethdev.h      |  3 +++
 3 files changed, 47 insertions(+), 6 deletions(-)

diff --git a/drivers/net/gve/base/gve_adminq.c b/drivers/net/gve/base/gve_adminq.c
index 743ab8e7ae..1ced1e442e 100644
--- a/drivers/net/gve/base/gve_adminq.c
+++ b/drivers/net/gve/base/gve_adminq.c
@@ -38,7 +38,8 @@ void gve_parse_device_option(struct gve_priv *priv,
 			     struct gve_device_option_dqo_rda **dev_op_dqo_rda,
 			     struct gve_device_option_flow_steering **dev_op_flow_steering,
 			     struct gve_device_option_modify_ring **dev_op_modify_ring,
-			     struct gve_device_option_jumbo_frames **dev_op_jumbo_frames)
+			     struct gve_device_option_jumbo_frames **dev_op_jumbo_frames,
+			     struct gve_device_option_nic_timestamp **dev_op_nic_timestamp)
 {
 	u32 req_feat_mask = be32_to_cpu(option->required_features_mask);
 	u16 option_length = be16_to_cpu(option->option_length);
@@ -168,6 +169,24 @@ void gve_parse_device_option(struct gve_priv *priv,
 		}
 		*dev_op_jumbo_frames = RTE_PTR_ADD(option, sizeof(*option));
 		break;
+	case GVE_DEV_OPT_ID_NIC_TIMESTAMP:
+		if (option_length < sizeof(**dev_op_nic_timestamp) ||
+		    req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_NIC_TIMESTAMP) {
+			PMD_DRV_LOG(WARNING, GVE_DEVICE_OPTION_ERROR_FMT,
+				    "Nic Timestamp",
+				    (int)sizeof(**dev_op_nic_timestamp),
+				    GVE_DEV_OPT_REQ_FEAT_MASK_NIC_TIMESTAMP,
+				    option_length, req_feat_mask);
+			break;
+		}
+
+		if (option_length > sizeof(**dev_op_nic_timestamp)) {
+			PMD_DRV_LOG(WARNING,
+				    GVE_DEVICE_OPTION_TOO_BIG_FMT,
+				    "Nic Timestamp");
+		}
+		*dev_op_nic_timestamp = RTE_PTR_ADD(option, sizeof(*option));
+		break;
 	default:
 		/* If we don't recognize the option just continue
 		 * without doing anything.
@@ -186,7 +205,8 @@ gve_process_device_options(struct gve_priv *priv,
 			   struct gve_device_option_dqo_rda **dev_op_dqo_rda,
 			   struct gve_device_option_flow_steering **dev_op_flow_steering,
 			   struct gve_device_option_modify_ring **dev_op_modify_ring,
-			   struct gve_device_option_jumbo_frames **dev_op_jumbo_frames)
+			   struct gve_device_option_jumbo_frames **dev_op_jumbo_frames,
+			   struct gve_device_option_nic_timestamp **dev_op_nic_timestamp)
 {
 	const int num_options = be16_to_cpu(descriptor->num_device_options);
 	struct gve_device_option *dev_opt;
@@ -207,7 +227,8 @@ gve_process_device_options(struct gve_priv *priv,
 		gve_parse_device_option(priv, dev_opt,
 					dev_op_gqi_rda, dev_op_gqi_qpl,
 					dev_op_dqo_rda, dev_op_flow_steering,
-					dev_op_modify_ring, dev_op_jumbo_frames);
+					dev_op_modify_ring, dev_op_jumbo_frames,
+					dev_op_nic_timestamp);
 		dev_opt = next_opt;
 	}
 
@@ -920,7 +941,8 @@ static void gve_enable_supported_features(struct gve_priv *priv,
 	u32 supported_features_mask,
 	const struct gve_device_option_flow_steering *dev_op_flow_steering,
 	const struct gve_device_option_modify_ring *dev_op_modify_ring,
-	const struct gve_device_option_jumbo_frames *dev_op_jumbo_frames)
+	const struct gve_device_option_jumbo_frames *dev_op_jumbo_frames,
+	const struct gve_device_option_nic_timestamp *dev_op_nic_timestamp)
 {
 	if (dev_op_flow_steering &&
 	    (supported_features_mask & GVE_SUP_FLOW_STEERING_MASK) &&
@@ -947,6 +969,11 @@ static void gve_enable_supported_features(struct gve_priv *priv,
 		PMD_DRV_LOG(INFO, "JUMBO FRAMES device option enabled.");
 		priv->max_mtu = be16_to_cpu(dev_op_jumbo_frames->max_mtu);
 	}
+	if (dev_op_nic_timestamp &&
+	    (supported_features_mask & GVE_SUP_NIC_TIMESTAMP_MASK)) {
+		PMD_DRV_LOG(INFO, "NIC TIMESTAMP device option enabled.");
+		priv->nic_timestamp_supported = true;
+	}
 }
 
 int gve_adminq_describe_device(struct gve_priv *priv)
@@ -954,6 +981,7 @@ int gve_adminq_describe_device(struct gve_priv *priv)
 	struct gve_device_option_jumbo_frames *dev_op_jumbo_frames = NULL;
 	struct gve_device_option_modify_ring *dev_op_modify_ring = NULL;
 	struct gve_device_option_flow_steering *dev_op_flow_steering = NULL;
+	struct gve_device_option_nic_timestamp *dev_op_nic_timestamp = NULL;
 	struct gve_device_option_gqi_rda *dev_op_gqi_rda = NULL;
 	struct gve_device_option_gqi_qpl *dev_op_gqi_qpl = NULL;
 	struct gve_device_option_dqo_rda *dev_op_dqo_rda = NULL;
@@ -983,7 +1011,8 @@ int gve_adminq_describe_device(struct gve_priv *priv)
 					 &dev_op_gqi_qpl, &dev_op_dqo_rda,
 					 &dev_op_flow_steering,
 					 &dev_op_modify_ring,
-					 &dev_op_jumbo_frames);
+					 &dev_op_jumbo_frames,
+					 &dev_op_nic_timestamp);
 	if (err)
 		goto free_device_descriptor;
 
@@ -1038,7 +1067,7 @@ int gve_adminq_describe_device(struct gve_priv *priv)
 
 	gve_enable_supported_features(priv, supported_features_mask,
 				      dev_op_flow_steering, dev_op_modify_ring,
-				      dev_op_jumbo_frames);
+				      dev_op_jumbo_frames, dev_op_nic_timestamp);
 
 free_device_descriptor:
 	gve_free_dma_mem(&descriptor_dma_mem);
diff --git a/drivers/net/gve/base/gve_adminq.h b/drivers/net/gve/base/gve_adminq.h
index d8e5e6a352..eaee5649f2 100644
--- a/drivers/net/gve/base/gve_adminq.h
+++ b/drivers/net/gve/base/gve_adminq.h
@@ -153,6 +153,12 @@ struct gve_device_option_jumbo_frames {
 
 GVE_CHECK_STRUCT_LEN(8, gve_device_option_jumbo_frames);
 
+struct gve_device_option_nic_timestamp {
+	__be32 supported_features_mask;
+};
+
+GVE_CHECK_STRUCT_LEN(4, gve_device_option_nic_timestamp);
+
 /* Terminology:
  *
  * RDA - Raw DMA Addressing - Buffers associated with SKBs are directly DMA
@@ -169,6 +175,7 @@ enum gve_dev_opt_id {
 	GVE_DEV_OPT_ID_MODIFY_RING = 0x6,
 	GVE_DEV_OPT_ID_JUMBO_FRAMES = 0x8,
 	GVE_DEV_OPT_ID_FLOW_STEERING = 0xb,
+	GVE_DEV_OPT_ID_NIC_TIMESTAMP = 0xd,
 };
 
 enum gve_dev_opt_req_feat_mask {
@@ -179,12 +186,14 @@ enum gve_dev_opt_req_feat_mask {
 	GVE_DEV_OPT_REQ_FEAT_MASK_FLOW_STEERING = 0x0,
 	GVE_DEV_OPT_REQ_FEAT_MASK_MODIFY_RING = 0x0,
 	GVE_DEV_OPT_REQ_FEAT_MASK_JUMBO_FRAMES = 0x0,
+	GVE_DEV_OPT_REQ_FEAT_MASK_NIC_TIMESTAMP = 0x0,
 };
 
 enum gve_sup_feature_mask {
 	GVE_SUP_MODIFY_RING_MASK = 1 << 0,
 	GVE_SUP_JUMBO_FRAMES_MASK = 1 << 2,
 	GVE_SUP_FLOW_STEERING_MASK = 1 << 5,
+	GVE_SUP_NIC_TIMESTAMP_MASK = 1 << 8,
 };
 
 #define GVE_DEV_OPT_LEN_GQI_RAW_ADDRESSING 0x0
diff --git a/drivers/net/gve/gve_ethdev.h b/drivers/net/gve/gve_ethdev.h
index 524e48e723..b9b4688367 100644
--- a/drivers/net/gve/gve_ethdev.h
+++ b/drivers/net/gve/gve_ethdev.h
@@ -355,6 +355,9 @@ struct gve_priv {
 	void *avail_flow_rule_bmp_mem; /* Backing memory for the bitmap */
 	pthread_mutex_t flow_rule_lock; /* Lock for bitmap and tailq access */
 	TAILQ_HEAD(, gve_flow) active_flows;
+
+	/* HW Timestamping Fields */
+	bool nic_timestamp_supported;
 };
 
 static inline bool
-- 
2.54.0.1136.gdb2ca164c4-goog


^ permalink raw reply related

* [PATCH v4 1/6] net/gve: add thread safety to admin queue
From: Mark Blasko @ 2026-06-13  4:22 UTC (permalink / raw)
  To: stephen; +Cc: dev, joshwash, jtranoleary, blasko
In-Reply-To: <20260613042300.3760470-1-blasko@google.com>

Introduce a pthread_mutex to protect the admin queue operations.
Locking was added around gve_adminq_execute_cmd and the batch
queue creation/destruction functions.

Signed-off-by: Mark Blasko <blasko@google.com>
Reviewed-by: Joshua Washington <joshwash@google.com>
Reviewed-by: Jasper Tran O'Leary <jtranoleary@google.com>
---
v2:
    - Dropped ROBUST mutex attribute.
---
 .mailmap                          |  1 +
 drivers/net/gve/base/gve_adminq.c | 67 +++++++++++++++++++++++++------
 drivers/net/gve/gve_ethdev.h      |  1 +
 3 files changed, 56 insertions(+), 13 deletions(-)

diff --git a/.mailmap b/.mailmap
index e052b85213..1b10cfca35 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1016,6 +1016,7 @@ Mario Carrillo <mario.alfredo.c.arevalo@intel.com>
 Mário Kuka <kuka@cesnet.cz>
 Mariusz Drost <mariuszx.drost@intel.com>
 Mark Asselstine <mark.asselstine@windriver.com>
+Mark Blasko <blasko@google.com>
 Mark Bloch <mbloch@nvidia.com> <markb@mellanox.com>
 Mark Gillott <mgillott@vyatta.att-mail.com>
 Mark Kavanagh <mark.b.kavanagh@intel.com>
diff --git a/drivers/net/gve/base/gve_adminq.c b/drivers/net/gve/base/gve_adminq.c
index 9c5316fb00..743ab8e7ae 100644
--- a/drivers/net/gve/base/gve_adminq.c
+++ b/drivers/net/gve/base/gve_adminq.c
@@ -216,6 +216,7 @@ gve_process_device_options(struct gve_priv *priv,
 
 int gve_adminq_alloc(struct gve_priv *priv)
 {
+	pthread_mutexattr_t mutexattr;
 	uint8_t pci_rev_id;
 
 	priv->adminq = gve_alloc_dma_mem(&priv->adminq_dma_mem, PAGE_SIZE);
@@ -241,6 +242,11 @@ int gve_adminq_alloc(struct gve_priv *priv)
 	priv->adminq_get_ptype_map_cnt = 0;
 	priv->adminq_cfg_flow_rule_cnt = 0;
 
+	pthread_mutexattr_init(&mutexattr);
+	pthread_mutexattr_setpshared(&mutexattr, PTHREAD_PROCESS_SHARED);
+	pthread_mutex_init(&priv->adminq_lock, &mutexattr);
+	pthread_mutexattr_destroy(&mutexattr);
+
 	/* Setup Admin queue with the device */
 	rte_pci_read_config(priv->pci_dev, &pci_rev_id, sizeof(pci_rev_id),
 			    RTE_PCI_REVISION_ID);
@@ -304,6 +310,7 @@ void gve_adminq_free(struct gve_priv *priv)
 		return;
 	gve_adminq_release(priv);
 	gve_free_dma_mem(&priv->adminq_dma_mem);
+	pthread_mutex_destroy(&priv->adminq_lock);
 	gve_clear_admin_queue_ok(priv);
 }
 
@@ -418,7 +425,10 @@ static int gve_adminq_issue_cmd(struct gve_priv *priv,
 	    (tail & priv->adminq_mask)) {
 		int err;
 
-		/* Flush existing commands to make room. */
+		/* Flush existing commands to make room.
+		 * Note: This kicks the doorbell for all staged commands.
+		 * Any failure here means we failed after attempting to kick.
+		 */
 		err = gve_adminq_kick_and_wait(priv);
 		if (err)
 			return err;
@@ -509,17 +519,24 @@ static int gve_adminq_execute_cmd(struct gve_priv *priv,
 	u32 tail, head;
 	int err;
 
+	pthread_mutex_lock(&priv->adminq_lock);
 	tail = ioread32be(&priv->reg_bar0->adminq_event_counter);
 	head = priv->adminq_prod_cnt;
-	if (tail != head)
+	if (tail != head) {
 		/* This is not a valid path */
-		return -EINVAL;
+		err = -EINVAL;
+		goto unlock_and_return;
+	}
 
 	err = gve_adminq_issue_cmd(priv, cmd_orig);
 	if (err)
-		return err;
+		goto unlock_and_return;
 
-	return gve_adminq_kick_and_wait(priv);
+	err = gve_adminq_kick_and_wait(priv);
+
+unlock_and_return:
+	pthread_mutex_unlock(&priv->adminq_lock);
+	return err;
 }
 
 static int gve_adminq_execute_extended_cmd(struct gve_priv *priv, u32 opcode,
@@ -693,13 +710,19 @@ int gve_adminq_create_tx_queues(struct gve_priv *priv, u32 num_queues)
 	int err;
 	u32 i;
 
+	pthread_mutex_lock(&priv->adminq_lock);
+
 	for (i = 0; i < num_queues; i++) {
 		err = gve_adminq_create_tx_queue(priv, i);
 		if (err)
-			return err;
+			goto unlock_and_return;
 	}
 
-	return gve_adminq_kick_and_wait(priv);
+	err = gve_adminq_kick_and_wait(priv);
+
+unlock_and_return:
+	pthread_mutex_unlock(&priv->adminq_lock);
+	return err;
 }
 
 static int gve_adminq_create_rx_queue(struct gve_priv *priv, u32 queue_index)
@@ -747,13 +770,19 @@ int gve_adminq_create_rx_queues(struct gve_priv *priv, u32 num_queues)
 	int err;
 	u32 i;
 
+	pthread_mutex_lock(&priv->adminq_lock);
+
 	for (i = 0; i < num_queues; i++) {
 		err = gve_adminq_create_rx_queue(priv, i);
 		if (err)
-			return err;
+			goto unlock_and_return;
 	}
 
-	return gve_adminq_kick_and_wait(priv);
+	err = gve_adminq_kick_and_wait(priv);
+
+unlock_and_return:
+	pthread_mutex_unlock(&priv->adminq_lock);
+	return err;
 }
 
 static int gve_adminq_destroy_tx_queue(struct gve_priv *priv, u32 queue_index)
@@ -779,13 +808,19 @@ int gve_adminq_destroy_tx_queues(struct gve_priv *priv, u32 num_queues)
 	int err;
 	u32 i;
 
+	pthread_mutex_lock(&priv->adminq_lock);
+
 	for (i = 0; i < num_queues; i++) {
 		err = gve_adminq_destroy_tx_queue(priv, i);
 		if (err)
-			return err;
+			goto unlock_and_return;
 	}
 
-	return gve_adminq_kick_and_wait(priv);
+	err = gve_adminq_kick_and_wait(priv);
+
+unlock_and_return:
+	pthread_mutex_unlock(&priv->adminq_lock);
+	return err;
 }
 
 static int gve_adminq_destroy_rx_queue(struct gve_priv *priv, u32 queue_index)
@@ -811,13 +846,19 @@ int gve_adminq_destroy_rx_queues(struct gve_priv *priv, u32 num_queues)
 	int err;
 	u32 i;
 
+	pthread_mutex_lock(&priv->adminq_lock);
+
 	for (i = 0; i < num_queues; i++) {
 		err = gve_adminq_destroy_rx_queue(priv, i);
 		if (err)
-			return err;
+			goto unlock_and_return;
 	}
 
-	return gve_adminq_kick_and_wait(priv);
+	err = gve_adminq_kick_and_wait(priv);
+
+unlock_and_return:
+	pthread_mutex_unlock(&priv->adminq_lock);
+	return err;
 }
 
 static int gve_set_desc_cnt(struct gve_priv *priv,
diff --git a/drivers/net/gve/gve_ethdev.h b/drivers/net/gve/gve_ethdev.h
index 0577f03974..524e48e723 100644
--- a/drivers/net/gve/gve_ethdev.h
+++ b/drivers/net/gve/gve_ethdev.h
@@ -339,6 +339,7 @@ struct gve_priv {
 	struct gve_tx_queue **txqs;
 	struct gve_rx_queue **rxqs;
 
+	pthread_mutex_t adminq_lock; /* Protects AdminQ command execution */
 	uint32_t stats_report_len;
 	const struct rte_memzone *stats_report_mem;
 	uint16_t stats_start_idx; /* start index of array of stats written by NIC */
-- 
2.54.0.1136.gdb2ca164c4-goog


^ permalink raw reply related

* [PATCH v4 0/6] net/gve: add hardware timestamping support
From: Mark Blasko @ 2026-06-13  4:22 UTC (permalink / raw)
  To: stephen; +Cc: dev, joshwash, jtranoleary, blasko
In-Reply-To: <20260605213022.2770893-1-blasko@google.com>

This patch series introduces support for GVE hardware timestamping
on DQO queues. To support concurrent access, a mutex lock is introduced
to protect admin queue operations. A mechanism is then added to
periodically synchronize the NIC clock via a dedicated control thread,
and support is introduced for the read_clock ethdev operation.
Finally, the RX datapath is updated to reconstruct full 64-bit
timestamps from the 32-bit values in DQO descriptors.

---
v4:
- Patch 4: Offload periodic NIC clock reads to a dedicated control thread.
- Patch 5: Correct mutex initialization and teardown ordering.
- Patch 6: Update GVE documentation to clarify queue format requirements.

v3:
- Patch 5:
  - Add mutex lock to protect shared NIC timestamp memzone access.
  - Fix missing read_clock assignment to DQO queue ops table
    (accidental omission in v2).

v2:
- Patch 1: Dropped ROBUST mutex attribute.
- Patch 3: Added adminq timestamp counter reset to gve_adminq_alloc.
- Patch 4:
  - Removed redundant void* casts.
  - Handled alarm reschedule failures by marking timestamp stale.
  - Added transient error logging on memzone allocation failure.
- Patch 5: Scoped read_clock ethdev operation strictly to DQO queues.
- Patch 6:
  - Scoped timestamp offload capability advertisement strictly to
    DQO queues.
  - Predicated capability advertisement directly on memzone
    allocation.
  - Initialized mbuf_timestamp_offset to -1.
  - Added blank line separating release notes.
---

Mark Blasko (6):
  net/gve: add thread safety to admin queue
  net/gve: add device option support for HW timestamps
  net/gve: add AdminQ command for NIC timestamps
  net/gve: add periodic NIC clock synchronization
  net/gve: support read clock ethdev op
  net/gve: reconstruct HW timestamps from DQO

 .mailmap                               |   1 +
 doc/guides/nics/features/gve.ini       |   1 +
 doc/guides/nics/gve.rst                |  20 +++
 doc/guides/rel_notes/release_26_07.rst |   4 +
 drivers/net/gve/base/gve_adminq.c      | 128 +++++++++++++---
 drivers/net/gve/base/gve_adminq.h      |  29 ++++
 drivers/net/gve/base/gve_desc_dqo.h    |   8 +-
 drivers/net/gve/gve_ethdev.c           | 194 +++++++++++++++++++++++--
 drivers/net/gve/gve_ethdev.h           |  43 ++++++
 drivers/net/gve/gve_rx_dqo.c           |  26 ++++
 10 files changed, 424 insertions(+), 30 deletions(-)

-- 
2.54.0.1136.gdb2ca164c4-goog


^ permalink raw reply

* [PATCH v2] tools: AI review handle empty Error sections
From: Matthew Gee @ 2026-06-12 19:08 UTC (permalink / raw)
  To: dev; +Cc: stephen, aconole, lylavoie, Matthew Gee
In-Reply-To: <20260612190225.1016275-1-mgee@iol.unh.edu>

Previous review-patch.py would detect and report and error or warning
based off of the occurrence of the headers of the error and warning
sections. This led to consistent false positives as often AI reviewers
will create the header but put "none" or similar filler text within
the following body.

This patch updates the code in order to check if the AI review has a
body with error or warnings to fix and not just filler text. This is
done by querying multiple lines at once and adjusting the regex to
filter out headers followed only by filler text or formatting in the
review.

These changes were tested against 10+ AI review outputs with several
variations in formatting and filler text in order to catch a good
variety of cases to make sure code reviews with actual errors or
warnings are caught and not missed.

Signed-off-by: Matthew Gee <mgee@iol.unh.edu>
---
 devtools/ai/review-patch.py | 35 +++++++++++++++++++++++++++--------
 1 file changed, 27 insertions(+), 8 deletions(-)

diff --git a/devtools/ai/review-patch.py b/devtools/ai/review-patch.py
index 52601ac156..eb608566d2 100755
--- a/devtools/ai/review-patch.py
+++ b/devtools/ai/review-patch.py
@@ -19,6 +19,7 @@
 from email.message import EmailMessage
 from pathlib import Path
 from typing import Any, Iterator
+from itertools import tee, islice, chain
 
 from _common import (
     PROVIDERS,
@@ -147,19 +148,37 @@ def classify_review(review_text: str, output_format: str) -> int:
             pass  # Fall through to text scanning
 
     if not has_errors and not has_warnings:
-        # Scan review text for severity indicators.
-        # Match section headers and inline markers across text/markdown/html.
-        for line in review_text.splitlines():
-            stripped = line.strip().lower()
+        # Matches against error or warning section headers
+        rgx_should_match: str = r"#+\s(\*+)?{err_or_warn}"
+        # Matches against headers followed only by filler text, formatting, or no additional text
+        rgx_should_not_match: str = r"#+\s(\*+)?{err_or_warn}(s?)(\*+)?(none(.)?$| \(must fix\)$|$)"
+
+        curr_lines: iter[str]
+        next_lines: iter[str | None]
+        next_next_lines: iter[str | None]
+        curr_lines, next_lines, next_next_lines = tee(review_text.splitlines(), 3)
+        next_lines = chain(islice(next_lines, 1, None), [None])
+        next_next_lines = chain(islice(next_next_lines, 2, None), [None, None])
+
+        curr_lines: str
+        next_lines: str | None
+        next_next_lines: str | None
+        for curr_line, next_line, next_next_line in zip(curr_lines, next_lines, next_next_lines):
+            stripped: str = curr_line.strip().lower() + str(
+                next_line or '').strip().lower() + str(next_next_line or '').strip().lower()
             # Skip quoted patch context lines
             if stripped.startswith(">") or stripped.startswith("diff --git"):
                 continue
-            if re.match(r"^(#{1,3}\s+)?(\*{0,2})error", stripped) or re.match(
-                r"^<h[1-3]>\s*error", stripped
+
+            elif re.match(rgx_should_match.format(err_or_warn='error'),
+                          stripped) and not re.match(
+                rgx_should_not_match.format(err_or_warn='error'), stripped
             ):
                 has_errors = True
-            elif re.match(r"^(#{1,3}\s+)?(\*{0,2})warning", stripped) or re.match(
-                r"^<h[1-3]>\s*warning", stripped
+
+            elif re.match(rgx_should_match.format(err_or_warn='warning'),
+                          stripped) and not re.match(
+                rgx_should_not_match.format(err_or_warn='warning'), stripped
             ):
                 has_warnings = True
 
-- 
2.54.0


^ permalink raw reply related

* [PATCH v1] tools: AI review handle empty Error sections
From: Matthew Gee @ 2026-06-12 19:02 UTC (permalink / raw)
  To: dev; +Cc: stephen, aconole, lylavoie, Matthew Gee

Previous review-patch.py would detect and report and error or warning
based off of the occurrence of the headers of the error and warning
sections. This led to consistent false positives as often AI reviewers
will create the header but put "none" or similar filler text within
the following body.

This patch updates the code in order to check if the AI review has a
body with error or warnings to fix and not just filler text. This is
done by querying multiple lines at once and adjusting the regex to
filter out headers followed only by filler text or formatting in the
review.

These changes were tested against 10+ AI review outputs with several
variations in formatting and filler text in order to catch a good
variety of cases to make sure code reviews with actual errors or
warnings are caught and not missed.

Signed-off-by: Matthew Gee <mgee@iol.unh.edu>
---
 devtools/ai/review-patch.py | 35 +++++++++++++++++++++++++++--------
 1 file changed, 27 insertions(+), 8 deletions(-)

diff --git a/devtools/ai/review-patch.py b/devtools/ai/review-patch.py
index 52601ac156..b009368638 100755
--- a/devtools/ai/review-patch.py
+++ b/devtools/ai/review-patch.py
@@ -19,6 +19,7 @@
 from email.message import EmailMessage
 from pathlib import Path
 from typing import Any, Iterator
+from itertools import tee, islice, chain
 
 from _common import (
     PROVIDERS,
@@ -147,19 +148,37 @@ def classify_review(review_text: str, output_format: str) -> int:
             pass  # Fall through to text scanning
 
     if not has_errors and not has_warnings:
-        # Scan review text for severity indicators.
-        # Match section headers and inline markers across text/markdown/html.
-        for line in review_text.splitlines():
-            stripped = line.strip().lower()
+        # Matches against error or warning section headers
+        rgx_should_match: str = r"#+\s(\*+)?{err_or_warn}"
+        # Matches against headers followed only by filler text, formatting, or no additional text
+        rgx_should_not_match: str = r"#+\s(\*+)?{err_or_warn}(s?)(\*+)?(none(.)?$| \(must fix\)$|$)"
+        
+        curr_lines: iter[str]
+        next_lines: iter[str | None]
+        next_next_lines: iter[str | None]
+        curr_lines, next_lines, next_next_lines = tee(review_text.splitlines(), 3)
+        next_lines = chain(islice(next_lines, 1, None), [None])
+        next_next_lines = chain(islice(next_next_lines, 2, None), [None, None])
+        
+        curr_lines: str
+        next_lines: str | None
+        next_next_lines: str | None
+        for curr_line, next_line, next_next_line in zip(curr_lines, next_lines, next_next_lines):
+            stripped: str = curr_line.strip().lower() + str(
+                next_line or '').strip().lower() + str(next_next_line or '').strip().lower()
             # Skip quoted patch context lines
             if stripped.startswith(">") or stripped.startswith("diff --git"):
                 continue
-            if re.match(r"^(#{1,3}\s+)?(\*{0,2})error", stripped) or re.match(
-                r"^<h[1-3]>\s*error", stripped
+
+            elif re.match(rgx_should_match.format(err_or_warn='error'),
+                          stripped) and not re.match(
+                rgx_should_not_match.format(err_or_warn='error'), stripped
             ):
                 has_errors = True
-            elif re.match(r"^(#{1,3}\s+)?(\*{0,2})warning", stripped) or re.match(
-                r"^<h[1-3]>\s*warning", stripped
+
+            elif re.match(rgx_should_match.format(err_or_warn='warning'),
+                          stripped) and not re.match(
+                rgx_should_not_match.format(err_or_warn='warning'), stripped
             ):
                 has_warnings = True
 
-- 
2.54.0


^ permalink raw reply related

* Re: [PATCH v2] app/testpmd: add padding mode to txonly engine
From: Stephen Hemminger @ 2026-06-12 16:13 UTC (permalink / raw)
  To: Xingui Yang
  Cc: dev, david.marchand, aman.deep.singh, fengchengwen, yangshuaisong,
	lihuisong, liuyonglong, kangfenglong
In-Reply-To: <20260612091217.2899755-1-yangxingui@huawei.com>

On Fri, 12 Jun 2026 17:12:17 +0800
Xingui Yang <yangxingui@huawei.com> wrote:

> Add a new padding mode to the txonly forwarding engine, which allows
> sending packets with configurable small sizes without standard L2/L3
> headers. This is useful for testing NIC padding logic.
> 
> When padding mode is enabled via --tx-pkt-pad-mode flag:
> - l2_len and l3_len are set to 0 instead of standard header lengths
> - Packet data is filled with a static pattern instead of
>   Ethernet/IP/UDP headers
> - Minimum packet length validation is bypassed to allow small
>   packet sizes (e.g., set txpkts 14)
> 
> Signed-off-by: Xingui Yang <yangxingui@huawei.com>
> Signed-off-by: Huisong Li <lihuisong@huawei.com>
> ---
> v2: Fix compilation exception of unterminated-string-initialization
> ---

What about something like this (*not tested*) patch.

From 84ff35849f9881a93eed65ccd43a5cd1197cecb8 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Fri, 12 Jun 2026 09:10:17 -0700
Subject: [PATCH] testpnd: allow configuring runt frames

Allow setting transmit size to be a small value which has
ethernet header but no IP or UDP header.
---
 app/test-pmd/config.c                       | 12 +++----
 app/test-pmd/txonly.c                       | 35 +++++++++++++++++++--
 doc/guides/testpmd_app_ug/testpmd_funcs.rst | 13 ++++++++
 3 files changed, 51 insertions(+), 9 deletions(-)

diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 9d457ca88e..46ff678b9f 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -6327,9 +6327,6 @@ set_tx_pkt_segments(unsigned int *seg_lengths, unsigned int nb_segs)
 	/*
 	 * Check that each segment length is greater or equal than
 	 * the mbuf data size.
-	 * Check also that the total packet length is greater or equal than the
-	 * size of an empty UDP/IP packet (sizeof(struct rte_ether_hdr) +
-	 * 20 + 8).
 	 */
 	tx_pkt_len = 0;
 	for (i = 0; i < nb_segs; i++) {
@@ -6341,10 +6338,11 @@ set_tx_pkt_segments(unsigned int *seg_lengths, unsigned int nb_segs)
 		}
 		tx_pkt_len = (uint16_t)(tx_pkt_len + seg_lengths[i]);
 	}
-	if (tx_pkt_len < (sizeof(struct rte_ether_hdr) + 20 + 8)) {
-		fprintf(stderr, "total packet length=%u < %d - give up\n",
-				(unsigned) tx_pkt_len,
-				(int)(sizeof(struct rte_ether_hdr) + 20 + 8));
+
+	/* Allow runt packets this is a test tool. */
+	if (tx_pkt_len < (sizeof(struct rte_ether_hdr))) {
+		fprintf(stderr, "total packet length=%u < %zu - give up\n",
+			tx_pkt_len, sizeof(struct rte_ether_hdr));
 		return;
 	}
 
diff --git a/app/test-pmd/txonly.c b/app/test-pmd/txonly.c
index 64893fa205..2e0abe361e 100644
--- a/app/test-pmd/txonly.c
+++ b/app/test-pmd/txonly.c
@@ -76,6 +76,12 @@ copy_buf_to_pkt_segs(void* buf, unsigned len, struct rte_mbuf *pkt,
 	while (offset >= seg->data_len) {
 		offset -= seg->data_len;
 		seg = seg->next;
+		/*
+		 * The packet may be shorter than the header stack when
+		 * generating runt frames; stop once it runs out of segments.
+		 */
+		if (seg == NULL)
+			return;
 	}
 	copy_len = seg->data_len - offset;
 	seg_buf = rte_pktmbuf_mtod_offset(seg, char *, offset);
@@ -84,6 +90,8 @@ copy_buf_to_pkt_segs(void* buf, unsigned len, struct rte_mbuf *pkt,
 		len -= copy_len;
 		buf = ((char*) buf + copy_len);
 		seg = seg->next;
+		if (seg == NULL)
+			return;
 		seg_buf = rte_pktmbuf_mtod(seg, char *);
 		copy_len = seg->data_len;
 	}
@@ -193,7 +201,6 @@ pkt_burst_prepare(struct rte_mbuf *pkt, struct rte_mempool *mbp,
 	pkt->vlan_tci = vlan_tci;
 	pkt->vlan_tci_outer = vlan_tci_outer;
 	pkt->l2_len = sizeof(struct rte_ether_hdr);
-	pkt->l3_len = sizeof(struct rte_ipv4_hdr);
 
 	pkt_len = pkt->data_len;
 	pkt_seg = pkt;
@@ -204,6 +211,24 @@ pkt_burst_prepare(struct rte_mbuf *pkt, struct rte_mempool *mbp,
 		pkt_len += pkt_seg->data_len;
 	}
 	pkt_seg->next = NULL; /* Last segment of packet. */
+
+	/*
+	 * A runt frame may be too short to carry a full IPv4/UDP header.
+	 * Clamp l3_len and drop any checksum offload whose header is not
+	 * fully present, so the PMD is never asked to checksum bytes that
+	 * are not in the frame. pkt_len is at least sizeof(rte_ether_hdr),
+	 * so the subtraction below cannot underflow.
+	 */
+	pkt->l3_len = RTE_MIN(sizeof(struct rte_ipv4_hdr),
+			      pkt_len - sizeof(struct rte_ether_hdr));
+	if (pkt_len < sizeof(struct rte_ether_hdr) +
+			sizeof(struct rte_ipv4_hdr))
+		pkt->ol_flags &= ~(RTE_MBUF_F_TX_IP_CKSUM |
+				   RTE_MBUF_F_TX_L4_MASK);
+	else if (pkt_len < sizeof(struct rte_ether_hdr) +
+			sizeof(struct rte_ipv4_hdr) +
+			sizeof(struct rte_udp_hdr))
+		pkt->ol_flags &= ~RTE_MBUF_F_TX_L4_MASK;
 	/*
 	 * Copy headers in first packet segment(s).
 	 */
@@ -405,7 +430,13 @@ tx_only_begin(portid_t pi)
 	pkt_hdr_len = (uint16_t)(sizeof(struct rte_ether_hdr) +
 				 sizeof(struct rte_ipv4_hdr) +
 				 sizeof(struct rte_udp_hdr));
-	pkt_data_len = tx_pkt_length - pkt_hdr_len;
+	/*
+	 * tx_pkt_length may be smaller than the full header stack when
+	 * generating runt frames; clamp the payload length to zero in that
+	 * case so the IP/UDP length fields stay sane.
+	 */
+	pkt_data_len = tx_pkt_length > pkt_hdr_len ?
+			tx_pkt_length - pkt_hdr_len : 0;
 
 	if ((tx_pkt_split == TX_PKT_SPLIT_RND || txonly_multi_flow) &&
 	    tx_pkt_seg_lengths[0] < pkt_hdr_len) {
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index f0f2b0758b..d2e5b63586 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -876,6 +876,19 @@ Set the length of each segment of the TX-ONLY packets or length of packet for FL
 
 Where x[,y]* represents a CSV list of values, without white space.
 
+The total packet length may be set as small as the Ethernet header
+(``sizeof(struct rte_ether_hdr)``), which is below the size of an empty
+UDP/IPv4 packet. This generates runt frames with a truncated (or absent)
+IPv4/UDP header, which is useful for testing how a driver handles
+undersized frames. Any header bytes that do not fit in the requested
+length are simply not emitted. Checksum offloads are automatically
+dropped for a frame too short to contain the corresponding header.
+
+Note that random split (``set txsplit rand``) and multi-flow
+(``set txonly-flows``) still require the first segment to hold the full
+Ethernet/IPv4/UDP header stack, so they cannot be combined with runt
+lengths.
+
 set txtimes
 ~~~~~~~~~~~
 
-- 
2.53.0


^ permalink raw reply related

* Re: [PATCH v5 00/11] bpf: introduce extensible load API
From: Stephen Hemminger @ 2026-06-12 16:06 UTC (permalink / raw)
  To: Marat Khalili; +Cc: dev
In-Reply-To: <20260612084219.38399-1-marat.khalili@huawei.com>

On Fri, 12 Jun 2026 09:42:07 +0100
Marat Khalili <marat.khalili@huawei.com> wrote:

> This patchset introduces an extensible load API for the BPF library in
> DPDK, addressing current limitations regarding ABI stability and feature
> constraints.
> 
> Currently, `rte_bpf_load` relies on a fixed `struct rte_bpf_prm`, which
> makes it difficult to add new loading options or parameters without
> breaking the ABI.
> 
> To resolve these issues, this series introduces `rte_bpf_load_ex` taking
> `struct rte_bpf_prm_ex`. The new parameter structure includes a `sz`
> field for backward compatibility, allowing future extensions.
> 
> Taking advantage of the new extensible API, this patchset also adds
> several new features:
> * Support for loading and executing BPF programs with up to 5 arguments.
> * Support for loading classic BPF (cBPF) directly.
> * Support for loading ELF files directly from memory buffers.
> * New API functions (`rte_bpf_eth_rx_install` and `rte_bpf_eth_tx_install`)
>   to install an already loaded BPF program as a port callback, decoupling
>   the loading phase from the installation phase.
> 
> 
> v5:
> * Fixed compilation between commits broken in v4 while addressing AI
>   comments.
> * Rebased on fresh main, addressing conflicts in release notes.
> 
> v4:
> * Restored missing NULL checks in wrapper functions `rte_bpf_load` and
>   `rte_bpf_elf_load`.
> * Fixed the burst execution functions (`rte_bpf_exec_burst*`) to return
>   `0` and set `rte_errno = EINVAL` on failure, preventing `-EINVAL`
>   being reinterpreted as a large `uint32_t` value. Initialized `rc`
>   properly in scalar execution wrappers for this case.
> * Swapped the Doxygen comments in `rte_bpf_ethdev.h` for RX and TX functions.
> * Added diagnostic dump on failure path in `test_bpf_filter`.
> * Fixed memory leak of the BPF handle in `bpf_rx_test` upon install failure.
> * Added tests for NULL parameter rejection, mismatched execution arguments,
>   unsupported execution flags, and the libpcap-less `rte_bpf_convert` stub.
> 
> v3:
> * Appended Acked-by tags to all individual commits to align with
>   patchwork requirements.
> 
> v2:
> * Fixed a potential segmentation fault in `exec_vm_burst_ex` by deferring
>   the dereference of `ctx[i].arg` until it is confirmed that `nb_prog_arg > 0`.
> * Clarified documentation and code comments for `RTE_BPF_EXEC_FLAG_JIT`
>   requirements and fast-path expectations.
> 
> 
> Marat Khalili (11):
>   bpf: make logging prefixes more consistent
>   bpf: introduce extensible load API
>   bpf: support up to 5 arguments
>   bpf: add cBPF origin to rte_bpf_load_ex
>   bpf: support rte_bpf_prm_ex with port callbacks
>   bpf: support loading ELF files from memory
>   test/bpf: test loading cBPF directly
>   test/bpf: test loading ELF file from memory
>   doc: add release notes for new extensible BPF API
>   doc: add load API to BPF programmer's guide
>   test/bpf: add tests for error handling contracts
> 
>  app/test/test_bpf.c                    | 455 +++++++++++++++++--------
>  doc/guides/prog_guide/bpf_lib.rst      |  75 +++-
>  doc/guides/rel_notes/release_26_07.rst |  20 ++
>  lib/bpf/bpf.c                          |  32 +-
>  lib/bpf/bpf_convert.c                  |  99 +++++-
>  lib/bpf/bpf_exec.c                     | 134 +++++++-
>  lib/bpf/bpf_impl.h                     |  53 ++-
>  lib/bpf/bpf_jit_arm64.c                |  18 +-
>  lib/bpf/bpf_jit_x86.c                  |  10 +-
>  lib/bpf/bpf_load.c                     | 213 ++++++++++--
>  lib/bpf/bpf_load_elf.c                 | 189 ++++++----
>  lib/bpf/bpf_pkt.c                      |  65 +++-
>  lib/bpf/bpf_stub.c                     |  46 ---
>  lib/bpf/bpf_validate.c                 |  94 +++--
>  lib/bpf/meson.build                    |  15 +-
>  lib/bpf/rte_bpf.h                      | 199 ++++++++++-
>  lib/bpf/rte_bpf_ethdev.h               |  54 +++
>  17 files changed, 1400 insertions(+), 371 deletions(-)
>  delete mode 100644 lib/bpf/bpf_stub.c
> 

Detailed AI review found some minor things:

02/11: comment grammar nits — "Any features that not known to
the application" -> "that are not known" (twice); "the size of
same struct" -> "the size of the same struct".

05/11: Doxygen for rte_bpf_eth_rx_install and
rte_bpf_eth_tx_install references rte_bpf_eth_unload, which does
not exist. Use rte_bpf_eth_rx_unload and rte_bpf_eth_tx_unload
respectively.

07/11: stray space before comma in the test_bpf_filter log
string ("for \"%s\" ,").

^ permalink raw reply

* Re: [PATCH] net/iavf: fix scalar Rx path zero-length segment
From: Bruce Richardson @ 2026-06-12 15:42 UTC (permalink / raw)
  To: Ciara Loftus; +Cc: dev, stable, Declan Doherty
In-Reply-To: <20260612143531.2265914-1-ciara.loftus@intel.com>

On Fri, Jun 12, 2026 at 02:35:31PM +0000, Ciara Loftus wrote:
> When hardware CRC stripping is active, a frame whose on-wire size is an
> exact multiple of the Rx buffer size can cause the NIC to fill the final
> data descriptor and place the four CRC bytes into a separate trailing
> descriptor. After hardware stripping, that descriptor carries zero bytes
> of payload.
> 
> The existing CRC cleanup code only handles a zero-length trailing segment
> when software CRC stripping is enabled. When hardware stripping is
> active, the zero-length mbuf is silently chained to the reassembled
> packet. Forwarding such a packet causes a zero-length Tx descriptor,
> triggering a Malicious Driver Detection event on the PF and resetting
> the VF.
> 
> Fix by adding logic to detect a zero-length final segment when hardware
> CRC stripping is active, and freeing it.
> 
> Fixes: a2b29a7733ef ("net/avf: enable basic Rx Tx")
> Fixes: b8b4c54ef9b0 ("net/iavf: support flexible Rx descriptor in normal path")
> Cc: stable@dpdk.org
> 
> Signed-off-by: Declan Doherty <declan.doherty@intel.com>
> Signed-off-by: Ciara Loftus <ciara.loftus@intel.com>
> ---
>  drivers/net/intel/iavf/iavf_rxtx.c | 16 ++++++++++++++++
>  1 file changed, 16 insertions(+)
> 
> diff --git a/drivers/net/intel/iavf/iavf_rxtx.c b/drivers/net/intel/iavf/iavf_rxtx.c
> index a57af7faed..86ebb2618d 100644
> --- a/drivers/net/intel/iavf/iavf_rxtx.c
> +++ b/drivers/net/intel/iavf/iavf_rxtx.c
> @@ -1716,6 +1716,14 @@ iavf_recv_scattered_pkts_flex_rxd(void *rx_queue, struct rte_mbuf **rx_pkts,
>  				rxm->data_len = (uint16_t)(rx_packet_len -
>  							RTE_ETHER_CRC_LEN);
>  			}
> +		} else if (unlikely(rx_packet_len == 0)) {
> +			/*
> +			 * NIC split CRC bytes into a trailing segment which is
> +			 * now empty after hardware CRC stripping. Free it.
> +			 */
> +			rte_pktmbuf_free_seg(rxm);
> +			first_seg->nb_segs--;
> +			last_seg->next = NULL;
>  		}
>  

The vector paths also handle scattered packets (via reassembly). Do they
need a fix for this? What about the other drivers that work on the PF, such
as ice/i40e?

/Bruce

>  		first_seg->port = rxq->port_id;
> @@ -1884,6 +1892,14 @@ iavf_recv_scattered_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
>  			} else
>  				rxm->data_len = (uint16_t)(rx_packet_len -
>  							RTE_ETHER_CRC_LEN);
> +		} else if (unlikely(rx_packet_len == 0)) {
> +			/*
> +			 * NIC split CRC bytes into a trailing segment which is
> +			 * now empty after hardware CRC stripping. Free it.
> +			 */
> +			rte_pktmbuf_free_seg(rxm);
> +			first_seg->nb_segs--;
> +			last_seg->next = NULL;
>  		}
>  
>  		first_seg->port = rxq->port_id;
> -- 
> 2.43.0
> 

^ permalink raw reply

* Re: [PATCH] app/testpmd: add padding mode to txonly engine
From: Stephen Hemminger @ 2026-06-12 15:20 UTC (permalink / raw)
  To: Xingui Yang
  Cc: dev, david.marchand, aman.deep.singh, fengchengwen, yangshuaisong,
	lihuisong, liuyonglong, kangfenglong
In-Reply-To: <20260612073715.2739007-1-yangxingui@huawei.com>

On Fri, 12 Jun 2026 15:37:15 +0800
Xingui Yang <yangxingui@huawei.com> wrote:

> Add a new padding mode to the txonly forwarding engine, which allows
> sending packets with configurable small sizes without standard L2/L3
> headers. This is useful for testing NIC padding logic.
> 
> When padding mode is enabled via --tx-pkt-pad-mode flag:
> - l2_len and l3_len are set to 0 instead of standard header lengths
> - Packet data is filled with a static pattern instead of
>   Ethernet/IP/UDP headers
> - Minimum packet length validation is bypassed to allow small
>   packet sizes (e.g., set txpkts 14)
> 
> Signed-off-by: Xingui Yang <yangxingui@huawei.com>
> Signed-off-by: Huisong Li <lihuisong@huawei.com>
> ---

Why add yet another setting to already bloated testpmd command?
Instead I would suggest allowing user to specify any length from
14 up to UINT32_MAX.  The code to format packet would need to
handle it there.

^ permalink raw reply

* Re: [PATCH 8/9] ethdev: keep fast-path ops valid after port stop
From: Thomas Monjalon @ 2026-06-12 15:00 UTC (permalink / raw)
  To: Maxime Leroy
  Cc: Morten Brørup, Hemant Agrawal, Sachin Saxena, dev, stable,
	Andrew Rybchenko, Sunil Kumar Kori
In-Reply-To: <CAHHRULUGdnTmpnAX2aONEMj3bJmN11RG2cHGJ7h3-qhjoJLsFw@mail.gmail.com>

11/06/2026 20:39, Maxime Leroy:
> Le jeu. 11 juin 2026, 18:01, Morten Brørup <mb@smartsharesystems.com> a
> écrit :
> 
> > > From: Maxime Leroy [mailto:maxime.leroys@gmail.com] On Behalf Of Maxime
> > > Leroy
> >
> > Good catch.
> > Acked-by: Morten Brørup <mb@smartsharesystems.com>
> >
> > Not related to the series, consider sending as separate patch.
> >
> Thanks for the review and Ack.
> 
> Agreed, this is a generic ethdev fix. I kept it in this series because the
> NAPI user depends on it.
> 
> The current Grout NAPI loop arms RX queue interrupts and then re-checks
> rte_eth_rx_queue_count() before blocking, to avoid sleeping when a packet
> arrived between the last empty poll and epoll_wait.
> 
> With the current ethdev reset path, rx_burst is replaced by a dummy
> callback on stop/release, but rx_queue_count becomes NULL. So if the port
> is stopped concurrently, the NAPI worker dereferences a NULL function
> pointer and
> segfaults on that recheck.
> 
> I can split it out if maintainers prefer, but then the dpaa2 NAPI series
> has a real dependency on the standalone ethdev fix.

You can keep this patch in the series AND
send it separately for a quick merge.



^ permalink raw reply

* Re: 回复:[PATCH] gpu/metax: add new driver for Metax GPU
From: Thomas Monjalon @ 2026-06-12 14:49 UTC (permalink / raw)
  To: 许玲燕; +Cc: dev, eagostini, 王冬冬
In-Reply-To: <2a306d48-351c-473e-b049-43e10e53617b.lingyan.xu@metax-tech.com>

Would it be possible to have a direct download link
not requiring any registration? At least for the library?
Do you have an english version of the website?


12/06/2026 09:19, 许玲燕:
> Hi,
> Thank you for the follow-up. Please find the clarifications below regarding the kernel dependency and availability:
> 1. Kernel Dependency
> While it is not upstreamed into the mainline Linux kernel, it is actively maintained by Metax to interface with our hardware.
> 2. Availability and Download Link
> Yes, both the proprietary user-space libraries and the corresponding kernel modules are freely available for download. You can access them via the Metax Software Download Center:
> Download Link: https://sw-download.metax-tech.com/ <https://sw-download.metax-tech.com/ >
> How to obtain the files:
> 
>  * 
> Register and log in to the portal.
> 
>  * 
> Navigate to "SDK Development Kit".
> 
>  * 
> Select your specific GPU Type.
> 
>  * 
> Choose your target OS. We currently support Linux (aarch64 & x86_64) across mainstream distributions, including but not limited to:
>  * 
> Ubuntu (18.04 - 24.04)
> 
>  * 
> RHEL / CentOS / RockyOS (8.x / 9.x)
> 
>  * 
> Domestic distros: KylinV10/V11, OpenCloudOS, TencentOS, etc.
> Best regards,
> ------------------------------------------------------------------
> 发件人:Thomas Monjalon <thomas@monjalon.net>
> 发送时间:2026年6月11日(周四) 17:17
> 收件人:"许玲燕"<lingyan.xu@metax-tech.com>
> 抄 送:dev<dev@dpdk.org>; eagostini<eagostini@nvidia.com>; "王冬冬"<dongdong.wang@metax-tech.com>
> 主 题:Re: [PATCH] gpu/metax: add new driver for Metax GPU
> 11/06/2026 09:10:
> > Both libmcruntime.so and the corresponding gdrapi libraries
> > are proprietary user-space libraries provided by Metax.
> > They are not upstreamed to the DPDK mainline repository.
> > However, please rest assured that the current patch interacts
> > with them via standard dlopen (dynamic loading) at runtime.
> > We do not link directly against their source code
> > or require them as hard build-time dependencies.
> > Therefore, this approach will not introduce any additional
> > compilation dependencies or licensing issues to the DPDK main tree.
> What about the kernel dependency?
> Are libraries and kernel module freely available for download?
> Can you provide a link?
> 






^ permalink raw reply

* [PATCH] net/iavf: fix scalar Rx path zero-length segment
From: Ciara Loftus @ 2026-06-12 14:35 UTC (permalink / raw)
  To: dev; +Cc: Ciara Loftus, stable, Declan Doherty

When hardware CRC stripping is active, a frame whose on-wire size is an
exact multiple of the Rx buffer size can cause the NIC to fill the final
data descriptor and place the four CRC bytes into a separate trailing
descriptor. After hardware stripping, that descriptor carries zero bytes
of payload.

The existing CRC cleanup code only handles a zero-length trailing segment
when software CRC stripping is enabled. When hardware stripping is
active, the zero-length mbuf is silently chained to the reassembled
packet. Forwarding such a packet causes a zero-length Tx descriptor,
triggering a Malicious Driver Detection event on the PF and resetting
the VF.

Fix by adding logic to detect a zero-length final segment when hardware
CRC stripping is active, and freeing it.

Fixes: a2b29a7733ef ("net/avf: enable basic Rx Tx")
Fixes: b8b4c54ef9b0 ("net/iavf: support flexible Rx descriptor in normal path")
Cc: stable@dpdk.org

Signed-off-by: Declan Doherty <declan.doherty@intel.com>
Signed-off-by: Ciara Loftus <ciara.loftus@intel.com>
---
 drivers/net/intel/iavf/iavf_rxtx.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/drivers/net/intel/iavf/iavf_rxtx.c b/drivers/net/intel/iavf/iavf_rxtx.c
index a57af7faed..86ebb2618d 100644
--- a/drivers/net/intel/iavf/iavf_rxtx.c
+++ b/drivers/net/intel/iavf/iavf_rxtx.c
@@ -1716,6 +1716,14 @@ iavf_recv_scattered_pkts_flex_rxd(void *rx_queue, struct rte_mbuf **rx_pkts,
 				rxm->data_len = (uint16_t)(rx_packet_len -
 							RTE_ETHER_CRC_LEN);
 			}
+		} else if (unlikely(rx_packet_len == 0)) {
+			/*
+			 * NIC split CRC bytes into a trailing segment which is
+			 * now empty after hardware CRC stripping. Free it.
+			 */
+			rte_pktmbuf_free_seg(rxm);
+			first_seg->nb_segs--;
+			last_seg->next = NULL;
 		}
 
 		first_seg->port = rxq->port_id;
@@ -1884,6 +1892,14 @@ iavf_recv_scattered_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 			} else
 				rxm->data_len = (uint16_t)(rx_packet_len -
 							RTE_ETHER_CRC_LEN);
+		} else if (unlikely(rx_packet_len == 0)) {
+			/*
+			 * NIC split CRC bytes into a trailing segment which is
+			 * now empty after hardware CRC stripping. Free it.
+			 */
+			rte_pktmbuf_free_seg(rxm);
+			first_seg->nb_segs--;
+			last_seg->next = NULL;
 		}
 
 		first_seg->port = rxq->port_id;
-- 
2.43.0


^ permalink raw reply related

* RE: [PATCH v4 02/11] bpf: introduce extensible load API
From: Marat Khalili @ 2026-06-12 10:48 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: Konstantin Ananyev, Wathsala Vithanage, dev@dpdk.org
In-Reply-To: <n9qJkNlCQDazx_P9i-5XTQ@monjalon.net>

> Unfortunately, compilation is failing on this patch:
> 
> lib/bpf/bpf_load.c:274:40: error: 'struct rte_bpf_jit' has no member named 'raw'

Sent v5 fixing the issue, apologies for sending without double-checking 
compilability of individual commits on my side.

^ permalink raw reply

* [PATCH v3 25/25] doc: add BPF validate debug to programmer's guide
From: Marat Khalili @ 2026-06-12 10:47 UTC (permalink / raw)
  To: Konstantin Ananyev; +Cc: dev
In-Reply-To: <20260612104743.6465-1-marat.khalili@huawei.com>

Document the new gdb-like validation debugger API, outlining how it can
be used to set breakpoints and inspect register states during
validation.

Highlight its primary use case: writing robust tests for the eBPF
verifier using the harness in app/test/test_bpf_validate.c.

Signed-off-by: Marat Khalili <marat.khalili@huawei.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
---
Depends-on: series-38434 ("bpf: introduce extensible load API")
 doc/guides/prog_guide/bpf_lib.rst | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/doc/guides/prog_guide/bpf_lib.rst b/doc/guides/prog_guide/bpf_lib.rst
index df3782508829..08ea1876875a 100644
--- a/doc/guides/prog_guide/bpf_lib.rst
+++ b/doc/guides/prog_guide/bpf_lib.rst
@@ -118,6 +118,37 @@ For example, ``(BPF_IND | BPF_W | BPF_LD)`` means:
 and ``R1-R5`` were scratched.
 
 
+Validation Debugger
+-------------------
+
+The DPDK BPF library includes a validation debugger API designed primarily for
+writing comprehensive unit tests for the eBPF verifier. It allows developers
+to introspect the abstract interpretation process step-by-step to guarantee
+that the verifier correctly models the semantics of eBPF instructions.
+
+The debugger operates using a gdb-like approach:
+
+1.  **Initialization:** Create a debug session using
+    ``rte_bpf_validate_debug_create()`` and pass it to the loader via the
+    ``debug`` field in ``struct rte_bpf_prm_ex``.
+2.  **Breakpoints and Catchpoints:** Before loading, use
+    ``rte_bpf_validate_debug_break()`` or ``rte_bpf_validate_debug_catch()``
+    to register callback functions that trigger at specific instruction indices
+    (program counters) or upon specific validation events.
+3.  **State Introspection:** Within the callbacks, the API provides functions
+    like ``rte_bpf_validate_debug_can_access()``,
+    ``rte_bpf_validate_debug_may_jump()``, and various formatting functions
+    to safely inspect the verifier's internal belief about register bounds
+    and memory states at that specific execution point.
+
+When adding a test for a new eBPF instruction or fixing a validator bug,
+developers should utilize the harness provided in
+``app/test/test_bpf_validate.c``. This harness encapsulates the debugger API,
+allowing you to define the expected abstract domains (signed and unsigned
+intervals) for registers before and after a tested instruction, generating
+the necessary eBPF bytecode and breakpoints automatically.
+
+
 Not currently supported eBPF features
 -------------------------------------
 
-- 
2.43.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox