DPDK-dev Archive on lore.kernel.org

DPDK-dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH] crypto/cnxk: fix out of place AES GCM
From: Daphne Priscilla @ 2026-06-12  6:20 UTC (permalink / raw)
  To: dev; +Cc: stable, gakhil, ktejasree, anoobj, Daphne Priscilla

For AES-GCM out of place, when AAD is present in inbuf before the data,
it is treated as passthrough data. This results in AAD being
present in outbuf header, but test expects outbuf header to remain
zero. Passthrough data is now diverted to metabuf so outbuf
header remains zero.

Fixes: 7c19abdd0cf1 ("common/cnxk: support 103XX CPT")
Cc: stable@dpdk.org

Signed-off-by: Daphne Priscilla <df@marvell.com>
---
 .mailmap                                 |  1 +
 drivers/common/cnxk/roc_se.h             |  2 +-
 drivers/crypto/cnxk/cnxk_cryptodev_ops.c |  3 +
 drivers/crypto/cnxk/cnxk_se.h            | 96 ++++++++++++++++++++++--
 4 files changed, 93 insertions(+), 9 deletions(-)

diff --git a/.mailmap b/.mailmap
index 118dfa0ff9..1191afbf0b 100644
--- a/.mailmap
+++ b/.mailmap
@@ -334,6 +334,7 @@ Danny Patel <dannyp@marvell.com>
 Danny Zhou <danny.zhou@intel.com>
 Danylo Vodopianov <dvo-plv@napatech.com>
 Dapeng Yu <dapengx.yu@intel.com>
+Daphne Priscilla <df@marvell.com>
 Darek Stojaczyk <dariusz.stojaczyk@intel.com>
 Daria Kolistratova <daria.kolistratova@intel.com>
 Dariusz Chaberski <dariuszx.chaberski@intel.com>
diff --git a/drivers/common/cnxk/roc_se.h b/drivers/common/cnxk/roc_se.h
index 499e71ce85..d3ad61ca04 100644
--- a/drivers/common/cnxk/roc_se.h
+++ b/drivers/common/cnxk/roc_se.h
@@ -26,7 +26,7 @@
 #define ROC_SE_MISC_MINOR_OP_DUMMY	 0x04ULL
 #define ROC_SE_MISC_MINOR_OP_HW_SUPPORT	 0x08ULL
 
-#define ROC_SE_MAX_AAD_SIZE 64
+#define ROC_SE_MAX_AAD_SIZE 1024
 #define ROC_SE_MAX_MAC_LEN  64
 
 #define ROC_SE_OFF_CTRL_LEN 8
diff --git a/drivers/crypto/cnxk/cnxk_cryptodev_ops.c b/drivers/crypto/cnxk/cnxk_cryptodev_ops.c
index 2f9eb322dc..5e59f1d7bd 100644
--- a/drivers/crypto/cnxk/cnxk_cryptodev_ops.c
+++ b/drivers/crypto/cnxk/cnxk_cryptodev_ops.c
@@ -82,6 +82,9 @@ cnxk_cpt_get_mlen(void)
 			       (RTE_ALIGN_CEIL(ROC_MAX_SG_IN_OUT_CNT, 4) >> 2) * ROC_SG_ENTRY_SIZE),
 			      8);
 
+	/* Space for discarding AAD bytes from output stream in GCM OOP */
+	len += ROC_SE_MAX_AAD_SIZE;
+
 	return len;
 }
 
diff --git a/drivers/crypto/cnxk/cnxk_se.h b/drivers/crypto/cnxk/cnxk_se.h
index 8dbf3e73c7..09d9d1e0e3 100644
--- a/drivers/crypto/cnxk/cnxk_se.h
+++ b/drivers/crypto/cnxk/cnxk_se.h
@@ -407,8 +407,28 @@ sg_inst_prep(struct roc_se_fc_params *params, struct cpt_inst_s *inst, uint64_t
 			if (unlikely(req_flags & ROC_SE_SINGLE_BUF_INPLACE)) {
 				i = fill_sg_comp_from_buf_min(scatter_comp, i, params->bufs, &size);
 			} else {
-				i = fill_sg_comp_from_iov(scatter_comp, i, params->dst_iov, 0,
-							  &size, aad_buf, aad_offset);
+				uint32_t dst_offset = 0;
+
+				if (passthrough_len) {
+					if (unlikely(passthrough_len > ROC_SE_MAX_AAD_SIZE)) {
+						plt_dp_err(
+							"Passthrough length %u exceeds reserved space %u",
+							passthrough_len, ROC_SE_MAX_AAD_SIZE);
+						return -1;
+					}
+					uint64_t meta_passthrough =
+						(uint64_t)params->meta_buf.vaddr +
+						params->meta_buf.size - ROC_SE_MAX_AAD_SIZE;
+					i = fill_sg_comp(scatter_comp, i, meta_passthrough,
+							 passthrough_len);
+					size -= passthrough_len;
+					dst_offset = passthrough_len;
+					aad_offset = 0;
+				}
+				if (size)
+					i = fill_sg_comp_from_iov(scatter_comp, i, params->dst_iov,
+								  dst_offset, &size, aad_buf,
+								  aad_offset);
 			}
 			if (unlikely(size)) {
 				plt_dp_err("Insufficient buffer space,"
@@ -430,8 +450,28 @@ sg_inst_prep(struct roc_se_fc_params *params, struct cpt_inst_s *inst, uint64_t
 			if (unlikely(req_flags & ROC_SE_SINGLE_BUF_INPLACE)) {
 				i = fill_sg_comp_from_buf_min(scatter_comp, i, params->bufs, &size);
 			} else {
-				i = fill_sg_comp_from_iov(scatter_comp, i, params->dst_iov, 0,
-							  &size, aad_buf, aad_offset);
+				uint32_t dst_offset = 0;
+
+				if (passthrough_len) {
+					if (unlikely(passthrough_len > ROC_SE_MAX_AAD_SIZE)) {
+						plt_dp_err(
+							"Passthrough length %u exceeds reserved space %u",
+							passthrough_len, ROC_SE_MAX_AAD_SIZE);
+						return -1;
+					}
+					uint64_t meta_passthrough =
+						(uint64_t)params->meta_buf.vaddr +
+						params->meta_buf.size - ROC_SE_MAX_AAD_SIZE;
+					i = fill_sg_comp(scatter_comp, i, meta_passthrough,
+							 passthrough_len);
+					size -= passthrough_len;
+					dst_offset = passthrough_len;
+					aad_offset = 0;
+				}
+				if (size)
+					i = fill_sg_comp_from_iov(scatter_comp, i, params->dst_iov,
+								  dst_offset, &size, aad_buf,
+								  aad_offset);
 			}
 
 			if (unlikely(size)) {
@@ -606,8 +646,28 @@ sg2_inst_prep(struct roc_se_fc_params *params, struct cpt_inst_s *inst, uint64_t
 				i = fill_sg2_comp_from_buf_min(scatter_comp, i, params->bufs,
 							       &size);
 			} else {
-				i = fill_sg2_comp_from_iov(scatter_comp, i, params->dst_iov, 0,
-							   &size, aad_buf, aad_offset);
+				uint32_t dst_offset = 0;
+
+				if (passthrough_len) {
+					if (unlikely(passthrough_len > ROC_SE_MAX_AAD_SIZE)) {
+						plt_dp_err(
+							"Passthrough length %u exceeds reserved space %u",
+							passthrough_len, ROC_SE_MAX_AAD_SIZE);
+						return -1;
+					}
+					uint64_t meta_passthrough =
+						(uint64_t)params->meta_buf.vaddr +
+						params->meta_buf.size - ROC_SE_MAX_AAD_SIZE;
+					i = fill_sg2_comp(scatter_comp, i, meta_passthrough,
+							  passthrough_len);
+					size -= passthrough_len;
+					dst_offset = passthrough_len;
+					aad_offset = 0;
+				}
+				if (size)
+					i = fill_sg2_comp_from_iov(scatter_comp, i, params->dst_iov,
+								   dst_offset, &size, aad_buf,
+								   aad_offset);
 			}
 			if (unlikely(size)) {
 				plt_dp_err("Insufficient buffer space,"
@@ -632,8 +692,28 @@ sg2_inst_prep(struct roc_se_fc_params *params, struct cpt_inst_s *inst, uint64_t
 				i = fill_sg2_comp_from_buf_min(scatter_comp, i, params->bufs,
 							       &size);
 			} else {
-				i = fill_sg2_comp_from_iov(scatter_comp, i, params->dst_iov, 0,
-							   &size, aad_buf, aad_offset);
+				uint32_t dst_offset = 0;
+
+				if (passthrough_len) {
+					if (unlikely(passthrough_len > ROC_SE_MAX_AAD_SIZE)) {
+						plt_dp_err(
+							"Passthrough length %u exceeds reserved space %u",
+							passthrough_len, ROC_SE_MAX_AAD_SIZE);
+						return -1;
+					}
+					uint64_t meta_passthrough =
+						(uint64_t)params->meta_buf.vaddr +
+						params->meta_buf.size - ROC_SE_MAX_AAD_SIZE;
+					i = fill_sg2_comp(scatter_comp, i, meta_passthrough,
+							  passthrough_len);
+					size -= passthrough_len;
+					dst_offset = passthrough_len;
+					aad_offset = 0;
+				}
+				if (size)
+					i = fill_sg2_comp_from_iov(scatter_comp, i, params->dst_iov,
+								   dst_offset, &size, aad_buf,
+								   aad_offset);
 			}
 
 			if (unlikely(size)) {
-- 
2.43.0


^ permalink raw reply related

* [PATCH] common/mlx5: fix high SMMU TLB miss with mempool alignment
From: Xingui Yang @ 2026-06-12  7:14 UTC (permalink / raw)
  To: dev
  Cc: stephen, david.marchand, thomas, dsosnowski, viacheslavo, bingz,
	orika, suanmingm, matan, dmitry.kozliuk, fengchengwen,
	yangshuaisong, lihuisong, liuyonglong, kangfenglong

From: Shuaisong Yang <yangshuaisong@h-partners.com>

On Kunpeng SoC with mlx CX7, dpdk-l3fwd with intra-NUMA core pinning
under SMMU nonstrict/strict mode shows about 30% performance degradation
compared to cross-NUMA pinning. With SMMU disabled or passthrough mode,
intra-NUMA performs as expected (slightly better than cross-NUMA).

CX7 in NUMA1
NUMA node0 CPU(s):    0-39
NUMA node1 CPU(s):    40-79

intra-NUMA:
dpdk-l3fwd -l 40-55 -n 4 -a 0000:17:00.1,mprq_en=1 -- -p 0x1 -P \
  --config='(0,0,40),(0,1,41),(0,2,42),(0,3,43),(0,4,44),\
            (0,5,45),(0,6,46),(0,7,47),(0,8,48),(0,9,49),\
            (0,10,50),(0,11,51),(0,12,52),(0,13,53),\
            (0,14,54),(0,15,55)' \
  --rx-queue-size=4096 --tx-queue-size=4096 --rx-burst=64

cross-NUMA:
dpdk-l3fwd -l 11-26 -n 4 -a 0000:17:00.1,mprq_en=1 -- -p 0x1 -P \
  --config='(0,0,11),(0,1,12),(0,2,13),(0,3,14),(0,4,15),\
            (0,5,16),(0,6,17),(0,7,18),(0,8,19),(0,9,20),\
            (0,10,21),(0,11,22),(0,12,23),(0,13,24),\
            (0,14,25),(0,15,26)' \
  --rx-queue-size=4096 --tx-queue-size=4096 --rx-burst=64

The root cause is that under SMMU enabled mode, the mempool allocated
for intra-NUMA pinning is aligned to system page size instead of
hugepage size, while cross-NUMA pinning correctly uses hugepage size
alignment. This causes high TLB miss rates under SMMU.

Align all memory ranges to hugepage boundaries during mempool
registration to ensure hugepage_sz alignment, thereby reducing TLB
misses and fixing the intra-NUMA performance degradation.

Fixes: 690b2a88c2f7 ("common/mlx5: add mempool registration facilities")
Cc: stable@dpdk.org

Signed-off-by: Shuaisong Yang <yangshuaisong@h-partners.com>
Signed-off-by: Xingui Yang <yangxingui@huawei.com>
---
 .mailmap                             |  1 +
 drivers/common/mlx5/mlx5_common_mr.c | 53 +++++++++++++++++++---------
 2 files changed, 37 insertions(+), 17 deletions(-)

diff --git a/.mailmap b/.mailmap
index 4001e5fb0e..e13e88db1b 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1979,3 +1979,4 @@ Zongyu Wu <wuzongyu1@huawei.com>
 Zorik Machulsky <zorik@amazon.com>
 Zyta Szpak <zyta@marvell.com> <zr@semihalf.com>
 Zyta Szpak <zyta@marvell.com> <zyta.szpak@semihalf.com>
+Shuaisong Yang <yangshuaisong@h-partners.com>
diff --git a/drivers/common/mlx5/mlx5_common_mr.c b/drivers/common/mlx5/mlx5_common_mr.c
index aa2d5e88a4..aee037abb4 100644
--- a/drivers/common/mlx5/mlx5_common_mr.c
+++ b/drivers/common/mlx5/mlx5_common_mr.c
@@ -1524,7 +1524,9 @@ mlx5_get_mempool_ranges(struct rte_mempool *mp, bool is_extmem,
  * @param[in] is_extmem
  *   Whether the pool is contains only external pinned buffers.
  * @param[out] out
- *   Receives memory ranges to register, aligned to the system page size.
+ *   Receives memory ranges to register. Aligned to the hugepage size
+ *   if all ranges reside on hugepages of the same size,
+ *   otherwise aligned to the system page size.
  *   The caller must release them with free().
  * @param[out] out_n
  *   Receives the number of @p out items.
@@ -1541,7 +1543,9 @@ mlx5_mempool_reg_analyze(struct rte_mempool *mp, bool is_extmem,
 {
 	struct mlx5_range *ranges = NULL;
 	unsigned int i, ranges_n = 0;
+	bool same_hugepage_sz = true;
 	struct rte_memseg_list *msl;
+	uint64_t hugepage_sz = 0;
 
 	if (mlx5_get_mempool_ranges(mp, is_extmem, &ranges, &ranges_n) < 0) {
 		DRV_LOG(ERR, "Cannot get address ranges for mempool %s",
@@ -1552,28 +1556,43 @@ mlx5_mempool_reg_analyze(struct rte_mempool *mp, bool is_extmem,
 	*share_hugepage = false;
 	msl = rte_mem_virt2memseg_list((void *)ranges[0].start);
 	if (msl != NULL) {
-		uint64_t hugepage_sz = 0;
+		hugepage_sz = msl->page_sz;
 
 		/* Check that all ranges are on pages of the same size. */
 		for (i = 0; i < ranges_n; i++) {
-			if (hugepage_sz != 0 && hugepage_sz != msl->page_sz)
+			struct rte_memseg_list *range_msl;
+			range_msl = rte_mem_virt2memseg_list(
+					(void *)ranges[i].start);
+			if (range_msl == NULL ||
+			    range_msl->page_sz != hugepage_sz) {
+				same_hugepage_sz = false;
 				break;
-			hugepage_sz = msl->page_sz;
+			}
 		}
-		if (i == ranges_n) {
-			/*
-			 * If the entire pool is within one hugepage,
-			 * combine all ranges into one of the hugepage size.
-			 */
-			uintptr_t reg_start = ranges[0].start;
-			uintptr_t reg_end = ranges[ranges_n - 1].end;
-			uintptr_t hugepage_start =
-				RTE_ALIGN_FLOOR(reg_start, hugepage_sz);
-			uintptr_t hugepage_end = hugepage_start + hugepage_sz;
-			if (reg_end < hugepage_end) {
-				ranges[0].start = hugepage_start;
+	}
+	if (same_hugepage_sz && hugepage_sz > 0) {
+		unsigned int orig_ranges_n = ranges_n;
+
+		for (i = 0; i < ranges_n; i++) {
+			ranges[i].start = RTE_ALIGN_FLOOR(ranges[i].start,
+							  hugepage_sz);
+			ranges[i].end = RTE_ALIGN_CEIL(ranges[i].end,
+							hugepage_sz);
+		}
+		ranges_n = 1;
+		for (i = 1; i < orig_ranges_n; i++) {
+			if (ranges[ranges_n - 1].end >= ranges[i].start)
+				ranges[ranges_n - 1].end =
+					RTE_MAX(ranges[ranges_n - 1].end,
+						ranges[i].end);
+			else
+				ranges[ranges_n++] = ranges[i];
+		}
+		if (ranges_n == 1) {
+			uintptr_t hugepage_end = ranges[0].start + hugepage_sz;
+
+			if (ranges[0].end <= hugepage_end) {
 				ranges[0].end = hugepage_end;
-				ranges_n = 1;
 				*share_hugepage = true;
 			}
 		}
-- 
2.43.0


^ permalink raw reply related

* 回复：[PATCH] gpu/metax: add new driver for Metax GPU
From: 许玲燕 @ 2026-06-12  7:19 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: dev, eagostini, 王冬冬
In-Reply-To: <CNjXKdTGQx6A6BPVH44M4w@monjalon.net>

[-- Attachment #1: Type: text/plain, Size: 2058 bytes --]

Hi,
Thank you for the follow-up. Please find the clarifications below regarding the kernel dependency and availability:
1. Kernel Dependency
While it is not upstreamed into the mainline Linux kernel, it is actively maintained by Metax to interface with our hardware.
2. Availability and Download Link
Yes, both the proprietary user-space libraries and the corresponding kernel modules are freely available for download. You can access them via the Metax Software Download Center:
Download Link: https://sw-download.metax-tech.com/ <https://sw-download.metax-tech.com/ >
How to obtain the files:

 * 
Register and log in to the portal.

 * 
Navigate to "SDK Development Kit".

 * 
Select your specific GPU Type.

 * 
Choose your target OS. We currently support Linux (aarch64 & x86_64) across mainstream distributions, including but not limited to:
 * 
Ubuntu (18.04 - 24.04)

 * 
RHEL / CentOS / RockyOS (8.x / 9.x)

 * 
Domestic distros: KylinV10/V11, OpenCloudOS, TencentOS, etc.
Best regards,
------------------------------------------------------------------
发件人：Thomas Monjalon <thomas@monjalon.net>
发送时间：2026年6月11日(周四) 17:17
收件人："许玲燕"<lingyan.xu@metax-tech.com>
抄　送：dev<dev@dpdk.org>; eagostini<eagostini@nvidia.com>; "王冬冬"<dongdong.wang@metax-tech.com>
主　题：Re: [PATCH] gpu/metax: add new driver for Metax GPU
11/06/2026 09:10:
> Both libmcruntime.so and the corresponding gdrapi libraries
> are proprietary user-space libraries provided by Metax.
> They are not upstreamed to the DPDK mainline repository.
> However, please rest assured that the current patch interacts
> with them via standard dlopen (dynamic loading) at runtime.
> We do not link directly against their source code
> or require them as hard build-time dependencies.
> Therefore, this approach will not introduce any additional
> compilation dependencies or licensing issues to the DPDK main tree.
What about the kernel dependency?
Are libraries and kernel module freely available for download?
Can you provide a link?

[-- Attachment #2: Type: text/html, Size: 71291 bytes --]

^ permalink raw reply

* [PATCH] app/testpmd: add padding mode to txonly engine
From: Xingui Yang @ 2026-06-12  7:37 UTC (permalink / raw)
  To: dev
  Cc: stephen, david.marchand, aman.deep.singh, fengchengwen,
	yangshuaisong, lihuisong, liuyonglong, kangfenglong

Add a new padding mode to the txonly forwarding engine, which allows
sending packets with configurable small sizes without standard L2/L3
headers. This is useful for testing NIC padding logic.

When padding mode is enabled via --tx-pkt-pad-mode flag:
- l2_len and l3_len are set to 0 instead of standard header lengths
- Packet data is filled with a static pattern instead of
  Ethernet/IP/UDP headers
- Minimum packet length validation is bypassed to allow small
  packet sizes (e.g., set txpkts 14)

Signed-off-by: Xingui Yang <yangxingui@huawei.com>
Signed-off-by: Huisong Li <lihuisong@huawei.com>
---
 app/test-pmd/config.c     |  2 +-
 app/test-pmd/parameters.c |  7 +++++++
 app/test-pmd/testpmd.c    |  3 +++
 app/test-pmd/testpmd.h    |  1 +
 app/test-pmd/txonly.c     | 18 ++++++++++++++++--
 5 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 9d457ca88e..36b9b023e2 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -6341,7 +6341,7 @@ set_tx_pkt_segments(unsigned int *seg_lengths, unsigned int nb_segs)
 		}
 		tx_pkt_len = (uint16_t)(tx_pkt_len + seg_lengths[i]);
 	}
-	if (tx_pkt_len < (sizeof(struct rte_ether_hdr) + 20 + 8)) {
+	if (tx_pkt_len < (sizeof(struct rte_ether_hdr) + 20 + 8) && !tx_pkt_pad_mode) {
 		fprintf(stderr, "total packet length=%u < %d - give up\n",
 				(unsigned) tx_pkt_len,
 				(int)(sizeof(struct rte_ether_hdr) + 20 + 8));
diff --git a/app/test-pmd/parameters.c b/app/test-pmd/parameters.c
index 337d8fc8ac..8c3b1244e7 100644
--- a/app/test-pmd/parameters.c
+++ b/app/test-pmd/parameters.c
@@ -195,6 +195,8 @@ enum {
 	TESTPMD_OPT_TXONLY_MULTI_FLOW_NUM,
 #define TESTPMD_OPT_TXONLY_FLOWS "txonly-flows"
 	TESTPMD_OPT_TXONLY_FLOWS_NUM,
+#define TESTPMD_OPT_TX_PKT_PAD_MODE "tx-pkt-pad-mode"
+	TESTPMD_OPT_TX_PKT_PAD_MODE_NUM,
 #define TESTPMD_OPT_RXQ_SHARE "rxq-share"
 	TESTPMD_OPT_RXQ_SHARE_NUM,
 #define TESTPMD_OPT_ETH_LINK_SPEED "eth-link-speed"
@@ -351,6 +353,7 @@ static const struct option long_options[] = {
 	NO_ARG(TESTPMD_OPT_MULTI_RX_MEMPOOL),
 	NO_ARG(TESTPMD_OPT_TXONLY_MULTI_FLOW),
 	REQUIRED_ARG(TESTPMD_OPT_TXONLY_FLOWS),
+	NO_ARG(TESTPMD_OPT_TX_PKT_PAD_MODE),
 	NO_ARG(TESTPMD_OPT_RXQ_SHARE),
 	REQUIRED_ARG(TESTPMD_OPT_ETH_LINK_SPEED),
 	NO_ARG(TESTPMD_OPT_DISABLE_LINK_CHECK),
@@ -504,6 +507,7 @@ usage(char* progname)
 	printf("  --txonly-multi-flow: generate multiple flows in txonly mode\n");
 	printf("  --txonly-nb-flows=N: number of flows per lcore in txonly"
 	       " multi-flow mode (1-64, default 64)\n");
+	printf("  --tx-pkt-pad-mode: enable padding mode in txonly mode\n");
 	printf("  --tx-ip=src,dst: IP addresses in Tx-only mode\n");
 	printf("  --tx-udp=src[,dst]: UDP ports in Tx-only mode\n");
 	printf("  --eth-link-speed: force link speed.\n");
@@ -1577,6 +1581,9 @@ launch_args_parse(int argc, char** argv)
 			else
 				rte_exit(EXIT_FAILURE, "txonly-flows must be >= 1 and <= 64\n");
 			break;
+		case TESTPMD_OPT_TX_PKT_PAD_MODE_NUM:
+			tx_pkt_pad_mode = 1;
+			break;
 		case TESTPMD_OPT_RXQ_SHARE_NUM:
 			rxq_share = 1;
 			break;
diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index fcd8a90967..457bb6d3fe 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -296,6 +296,9 @@ uint32_t tx_pkt_times_inter;
 uint32_t tx_pkt_times_intra;
 /**< Timings for send scheduling in TXONLY mode, time between packets. */
 
+uint8_t tx_pkt_pad_mode;
+/**< Whether packet padding mode is enabled. */
+
 uint16_t nb_pkt_per_burst = DEF_PKT_BURST; /**< Number of packets per burst. */
 uint16_t nb_pkt_flowgen_clones; /**< Number of Tx packet clones to send in flowgen mode. */
 int nb_flows_flowgen = 1024; /**< Number of flows in flowgen mode. */
diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
index 3d4b36d668..04fdc2db42 100644
--- a/app/test-pmd/testpmd.h
+++ b/app/test-pmd/testpmd.h
@@ -663,6 +663,7 @@ extern uint16_t tx_pkt_seg_lengths[RTE_MAX_SEGS_PER_PKT]; /**< Seg. lengths */
 extern uint8_t  tx_pkt_nb_segs; /**< Number of segments in TX packets */
 extern uint32_t tx_pkt_times_intra;
 extern uint32_t tx_pkt_times_inter;
+extern uint8_t  tx_pkt_pad_mode;
 
 enum tx_pkt_split {
 	TX_PKT_SPLIT_OFF,
diff --git a/app/test-pmd/txonly.c b/app/test-pmd/txonly.c
index 64893fa205..d8dbc2d46e 100644
--- a/app/test-pmd/txonly.c
+++ b/app/test-pmd/txonly.c
@@ -192,8 +192,8 @@ pkt_burst_prepare(struct rte_mbuf *pkt, struct rte_mempool *mbp,
 	pkt->ol_flags |= ol_flags;
 	pkt->vlan_tci = vlan_tci;
 	pkt->vlan_tci_outer = vlan_tci_outer;
-	pkt->l2_len = sizeof(struct rte_ether_hdr);
-	pkt->l3_len = sizeof(struct rte_ipv4_hdr);
+	pkt->l2_len = tx_pkt_pad_mode ? 0 : sizeof(struct rte_ether_hdr);
+	pkt->l3_len = tx_pkt_pad_mode ? 0 : sizeof(struct rte_ipv4_hdr);
 
 	pkt_len = pkt->data_len;
 	pkt_seg = pkt;
@@ -204,6 +204,19 @@ pkt_burst_prepare(struct rte_mbuf *pkt, struct rte_mempool *mbp,
 		pkt_len += pkt_seg->data_len;
 	}
 	pkt_seg->next = NULL; /* Last segment of packet. */
+
+	if (tx_pkt_pad_mode) {
+		static const char pad_pattern[16] = "0123456789abcdef";
+		uint32_t j;
+		char *pad;
+
+		pad = rte_pktmbuf_mtod(pkt, char *);
+		for (j = 0; j < pkt->data_len; j++)
+			pad[j] = pad_pattern[j % 16];
+
+		goto out;
+	}
+
 	/*
 	 * Copy headers in first packet segment(s).
 	 */
@@ -295,6 +308,7 @@ pkt_burst_prepare(struct rte_mbuf *pkt, struct rte_mempool *mbp,
 			sizeof(struct rte_ipv4_hdr) +
 			sizeof(pkt_udp_hdr));
 	}
+out:
 	/*
 	 * Complete first mbuf of packet and append it to the
 	 * burst of packets to be transmitted.
-- 
2.43.0


^ permalink raw reply related

* [PATCH] app/testpmd: add VLAN priority insert support
From: Xingui Yang @ 2026-06-12  8:14 UTC (permalink / raw)
  To: dev
  Cc: stephen, david.marchand, aman.deep.singh, fengchengwen,
	yangshuaisong, lihuisong, liuyonglong, kangfenglong

The tx_vlan set command currently only accepts a VLAN ID in range
[0, 4095].  This patch adds support for an extended format that includes
802.1p priority and CFI bits, allowing users to set the VLAN priority
tag when inserting VLAN headers in TX packets.

The extended format is:
  bit 0-11:  VLAN ID (0-4095)
  bit 12:    CFI (Canonical Format Indicator)
  bit 13-15: Priority (0-7, 802.1p CoS)

This is consistent with the VLAN tag structure used by
rte_eth_dev_set_vlan_pvid() where the PVID field encodes VLAN ID, CFI
and priority in the same format.

A new command line option --enable-vlan-priority is added to enable this
feature. By default, the feature is disabled to maintain backward
compatibility with existing users. When enabled, the
vlan_id_is_invalid() function allows any 16-bit value to pass, while the
full 16-bit value (including CFI and priority bits) is passed to the
driver for hardware VLAN insertion.

Signed-off-by: Xingui Yang <yangxingui@huawei.com>
---
 app/test-pmd/config.c     | 24 +++++++++++++++---------
 app/test-pmd/parameters.c |  6 ++++++
 app/test-pmd/testpmd.c    |  5 +++++
 app/test-pmd/testpmd.h    |  2 ++
 4 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 36b9b023e2..80cde109e6 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1241,12 +1241,18 @@ void print_valid_ports(void)
 }
 
 static int
-vlan_id_is_invalid(uint16_t vlan_id)
+vlan_id_is_invalid(uint16_t vlan_id, int vlan_priority_ena)
 {
-	if (vlan_id < 4096)
-		return 0;
-	fprintf(stderr, "Invalid vlan_id %d (must be < 4096)\n", vlan_id);
-	return 1;
+	if (!vlan_priority_ena && vlan_id >= 4096) {
+		fprintf(stderr, "Invalid vlan_id %d (must be < 4096)\n", vlan_id);
+		return 1;
+	}
+
+	/*
+	 * When vlan_priority_ena is enabled, allow any 16-bit value
+	 * to pass priority and CFI bits to the driver.
+	 */
+	return 0;
 }
 
 static uint32_t
@@ -6876,7 +6882,7 @@ rx_vft_set(portid_t port_id, uint16_t vlan_id, int on)
 
 	if (port_id_is_invalid(port_id, ENABLED_WARN))
 		return 1;
-	if (vlan_id_is_invalid(vlan_id))
+	if (vlan_id_is_invalid(vlan_id, vlan_priority_insert_ena))
 		return 1;
 	diag = rte_eth_dev_vlan_filter(port_id, vlan_id, on);
 	if (diag == 0)
@@ -6923,7 +6929,7 @@ tx_vlan_set(portid_t port_id, uint16_t vlan_id)
 	struct rte_eth_dev_info dev_info;
 	int ret;
 
-	if (vlan_id_is_invalid(vlan_id))
+	if (vlan_id_is_invalid(vlan_id, vlan_priority_insert_ena))
 		return;
 
 	if (ports[port_id].dev_conf.txmode.offloads &
@@ -6954,9 +6960,9 @@ tx_qinq_set(portid_t port_id, uint16_t vlan_id, uint16_t vlan_id_outer)
 	struct rte_eth_dev_info dev_info;
 	int ret;
 
-	if (vlan_id_is_invalid(vlan_id))
+	if (vlan_id_is_invalid(vlan_id, vlan_priority_insert_ena))
 		return;
-	if (vlan_id_is_invalid(vlan_id_outer))
+	if (vlan_id_is_invalid(vlan_id_outer, vlan_priority_insert_ena))
 		return;
 
 	ret = eth_dev_info_get_print_err(port_id, &dev_info);
diff --git a/app/test-pmd/parameters.c b/app/test-pmd/parameters.c
index 8c3b1244e7..3f37498d3b 100644
--- a/app/test-pmd/parameters.c
+++ b/app/test-pmd/parameters.c
@@ -117,6 +117,8 @@ enum {
 	TESTPMD_OPT_ENABLE_HW_VLAN_EXTEND_NUM,
 #define TESTPMD_OPT_ENABLE_HW_QINQ_STRIP "enable-hw-qinq-strip"
 	TESTPMD_OPT_ENABLE_HW_QINQ_STRIP_NUM,
+#define TESTPMD_OPT_ENABLE_VLAN_PRIORITY "enable-vlan-priority"
+	TESTPMD_OPT_ENABLE_VLAN_PRIORITY_NUM,
 #define TESTPMD_OPT_ENABLE_DROP_EN "enable-drop-en"
 	TESTPMD_OPT_ENABLE_DROP_EN_NUM,
 #define TESTPMD_OPT_DISABLE_RSS "disable-rss"
@@ -461,6 +463,7 @@ usage(char* progname)
 	printf("  --enable-hw-vlan-strip: enable hardware vlan strip.\n");
 	printf("  --enable-hw-vlan-extend: enable hardware vlan extend.\n");
 	printf("  --enable-hw-qinq-strip: enable hardware qinq strip.\n");
+	printf("  --enable-vlan-priority: enable vlan priority insert.\n");
 	printf("  --enable-drop-en: enable per queue packet drop.\n");
 	printf("  --disable-rss: disable rss.\n");
 	printf("  --enable-rss: Force rss even for single-queue operation.\n");
@@ -1259,6 +1262,9 @@ launch_args_parse(int argc, char** argv)
 		case TESTPMD_OPT_ENABLE_HW_QINQ_STRIP_NUM:
 			rx_offloads |= RTE_ETH_RX_OFFLOAD_QINQ_STRIP;
 			break;
+		case TESTPMD_OPT_ENABLE_VLAN_PRIORITY_NUM:
+			vlan_priority_insert_ena = 1;
+			break;
 		case TESTPMD_OPT_ENABLE_DROP_EN_NUM:
 			rx_drop_en = 1;
 			break;
diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index 457bb6d3fe..0239ec59de 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -307,6 +307,11 @@ uint16_t mb_mempool_cache = DEF_MBUF_CACHE; /**< Size of mbuf mempool cache. */
 /* current configuration is in DCB or not,0 means it is not in DCB mode */
 uint8_t dcb_config = 0;
 
+/*
+ * Configurable value of vlan priority insert enable.
+ */
+uint8_t vlan_priority_insert_ena;
+
 /*
  * Configurable number of RX/TX queues.
  */
diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
index 04fdc2db42..104a6e73be 100644
--- a/app/test-pmd/testpmd.h
+++ b/app/test-pmd/testpmd.h
@@ -618,6 +618,8 @@ extern uint64_t noisy_lkup_num_reads_writes;
 
 extern uint8_t dcb_config;
 
+extern uint8_t vlan_priority_insert_ena;
+
 extern uint32_t mbuf_data_size_n;
 extern uint16_t mbuf_data_size[MAX_SEGS_BUFFER_SPLIT];
 /**< Mbuf data space size. */
-- 
2.43.0


^ permalink raw reply related

* [PATCH v9 0/1] net/mana: add device reset support
From: Wei Hu @ 2026-06-12  8:17 UTC (permalink / raw)
  To: dev, stephen; +Cc: longli, weh

From: Wei Hu <weh@microsoft.com>

Add support for handling hardware service reset events in the
MANA driver. When the MANA kernel driver receives a hardware
service event, it initiates a device reset and notifies userspace
via IBV_EVENT_DEVICE_FATAL. The MANA PMD handles this by
performing an automatic teardown and recovery sequence.

The driver uses ethdev recovery events (ERR_RECOVERING,
RECOVERY_SUCCESS, RECOVERY_FAILED) to notify upper layers of
the reset lifecycle, and a PCI device removal event callback
to distinguish hot-remove from service reset.

Changes since v8:
- Fixed reset thread resource leak: previously reset_thread_active
  was cleared before emitting recovery callbacks, so no join site
  would reap the thread. Now the flag stays true throughout the
  thread lifetime. mana_join_reset_thread detects the self-join
  case (callback calling dev_stop/dev_close from the reset thread)
  using rte_thread_equal and calls rte_thread_detach instead of
  join, so thread resources are freed on exit. External callers
  continue to join normally.
- Fixed lost condvar signal: added a predicate loop around
  pthread_cond_timedwait that checks dev_state under
  reset_cond_mutex. If mana_pci_remove_event_cb signals before
  the reset thread enters the wait, the wakeup is no longer lost.
  The PCI remove callback sets dev_state to RESET_FAILED under
  the same mutex before signaling.
- Added a lock/unlock barrier on reset_ops_lock in
  mana_pci_remove_event_cb to ensure teardown has completed
  before emitting the INTR_RMV event.
- Fixed mana_reset_exit_delay return type from uint32_t to int
  to match the negative error codes it stores.
- Removed unnecessary else-after-goto in mana_probe_port.

Changes since v7:
- Moved heavy teardown (dev_stop, IPC to secondaries, dev_close,
  MR btree free) from mana_reset_enter (EAL interrupt thread)
  to mana_reset_thread (control thread). The interrupt handler
  now only sets state, drains in-flight bursts, and spawns the
  thread. Teardown runs immediately in the control thread before
  the recovery timer wait, avoiding blocking the interrupt thread
  on multi-second IPC timeouts and ibverbs calls. Each function
  now owns its own lock scope with no lock hand-off between
  threads.
- Simplified burst_state from encoding device state in bits 1+
  to a single blocked flag (bit 1). Only one value was ever
  stored, so the multi-state encoding was misleading. Added
  MANA_BURST_BLOCKED constant.
- Updated mana.rst to reflect that teardown runs on the control
  thread, not the interrupt handler.

Changes since v6:
- Rebased onto latest upstream for-main
- Replaced removed RTE_ETH_DEV_TO_PCI macro with
  RTE_CLASS_TO_BUS_DEVICE (upstream commit 4757b8df04
  removed the old bus-specific ethdev convenience macros)

Changes since v5:
- Replaced RCU QSBR with per-queue atomic burst_state using a
  single-variable CAS design: bit 0 is the in-burst flag, bit 1
  is the blocked flag. The data path uses CAS(0→1) to enter
  burst and fetch_and(~1) to exit. The reset path uses fetch_or
  to set the blocked bit and polls bit 0 to drain in-flight
  bursts. This eliminates the two-variable Dekker pattern and the
  need for sequential consistency (seq_cst) ordering.
- Removed librte_rcu dependency
- Removed __rte_no_thread_safety_analysis annotations (no longer
  needed after mutex conversion)
- Moved ERR_RECOVERING event emission before acquiring
  reset_ops_lock and before mana_reset_enter, so upper layers
  (e.g. netvsc) can switch data path before mana stops queues.
  Emitting outside the lock avoids deadlock if the callback
  calls dev_stop or dev_close.
- Replaced MANA_OPS_*_LOCK macros with mana_reset_trylock()
  helper function and explicit per-operation wrappers
- Removed unused rte_alarm.h and rte_lock_annotations.h includes
- Added RECOVERY_FAILED event when mana_reset_enter fails
  internally, so the application always receives a terminal event
- Added mana_clear_burst_state() helper to clear per-queue
  burst_state on failure paths (reset_failed, dev_stop_lock,
  dev_close_lock) preventing permanent silent packet drop after
  a failed reset

Changes since v4:
- Fixed stale rte_spinlock_unlock call in mana_intr_handler that
  was missed during the spinlock-to-mutex conversion, causing a
  -Wincompatible-pointer-types warning

Changes since v3:
- Converted reset_ops_lock from rte_spinlock_t to pthread_mutex_t
  with PTHREAD_PROCESS_SHARED, since the lock is held across
  blocking IB verbs calls and IPC with 5s timeout
- Removed rte_dev_event_callback_unregister retry loop to avoid
  deadlock when interrupt thread and reset thread contend

Changes since v2:
- Added per-queue burst_state atomic variable with Dekker-like
  synchronization to block data path during reset without RCU
- Replaced rte_alarm with condvar + control thread for reset exit
- Made reset_thread_active atomic with CAS — flag is set by
  creator and only cleared by the joiner, not the thread itself
- Fixed second reset crash: removed reset thread join logic from
  mana_dev_close (inner function) to avoid corrupting dev_state
  when called from mana_reset_enter
- Made reset_thread_active RTE_ATOMIC(bool) with explicit ordering
- Added retry loop for rte_dev_event_callback_unregister on -EAGAIN
- Initialized condvar/mutex with PTHREAD_PROCESS_SHARED since priv
  is in hugepage shared memory
- Added re-check of dev_state after lock acquisition in
  mana_intr_handler to prevent racing with pci_remove_event_cb
- Replaced (void *)0 with NULL in mp.c
- Added lock ownership comment block at mana_reset_enter
- Documented rte_dev_event_monitor_start() requirement
- Added mana.rst documentation and release note

Changes since v1:
- Removed net/netvsc patch from this series
- Simplified reset exit: mana_reset_exit calls
  mana_reset_exit_delay directly instead of spawning a thread
- Added __rte_no_thread_safety_analysis annotations for clang
- Switched to rte_thread_create_internal_control
- Fixed declaration-after-statement style issues
- Removed unnecessary blank lines and stale comments

Wei Hu (1):
  net/mana: add device reset support

 doc/guides/nics/mana.rst               |   40 +
 doc/guides/rel_notes/release_26_07.rst |    8 +
 drivers/net/mana/mana.c                | 1088 ++++++++++++++++++++++--
 drivers/net/mana/mana.h                |   52 +-
 drivers/net/mana/mp.c                  |   89 +-
 drivers/net/mana/mr.c                  |    6 +-
 drivers/net/mana/rx.c                  |   23 +-
 drivers/net/mana/tx.c                  |   44 +-
 8 files changed, 1242 insertions(+), 108 deletions(-)

-- 
2.34.1


^ permalink raw reply

* [PATCH v9 1/1] net/mana: add device reset support
From: Wei Hu @ 2026-06-12  8:17 UTC (permalink / raw)
  To: dev, stephen; +Cc: longli, weh
In-Reply-To: <20260612081723.27699-1-weh@linux.microsoft.com>

From: Wei Hu <weh@microsoft.com>

Add support for handling hardware reset events in the MANA driver.
When the MANA kernel driver receives a hardware service event, it
initiates a device reset and notifies userspace via
IBV_EVENT_DEVICE_FATAL. The DPDK driver handles this by performing
an automatic teardown and recovery sequence.

The interrupt handler sets the device state, blocks new data path
bursts, waits for in-flight bursts to drain using per-queue atomic
flags, and spawns a control thread. The control thread performs
teardown immediately (dev_stop, secondary IPC, dev_close, MR cache
free) before waiting for the hardware recovery timer to fire. This
avoids blocking the EAL interrupt thread on multi-second IPC
timeouts and ibverbs calls. After the recovery delay, the thread
unregisters the interrupt handler, re-probes the PCI device,
reinitializes MR caches, and restarts queues. Each function owns
its own lock scope with no lock hand-off between threads.

Each queue has an atomic burst_state variable where bit 0 is the
in-burst flag and bit 1 is a blocked flag. The data path uses a
single compare-and-swap (0 to 1) to enter a burst, which fails
immediately if the blocked bit is set. The reset path sets the
blocked bit via atomic fetch-or and polls bit 0 to wait for
in-flight bursts to drain. This single-variable design avoids the
need for sequential consistency ordering.

A per-device mutex serializes the reset path with ethdev
operations. The mutex uses PTHREAD_PROCESS_SHARED for multi-process
support and is held across blocking IB verbs calls. A trylock
helper encapsulates the lock acquisition and device state check
for all ethdev operation wrappers. Operations that cannot wait
(configure, queue setup) return -EBUSY during reset, while
dev_stop and dev_close join the reset thread before acquiring
the lock to ensure proper sequencing.

The reset thread keeps reset_thread_active true throughout its
lifetime. mana_join_reset_thread uses rte_thread_equal to detect
the self-join case (when a recovery callback calls dev_stop or
dev_close from the reset thread itself) and calls
rte_thread_detach instead of join, so thread resources are freed
on exit. External callers join normally.

The condvar wait in the reset thread uses a predicate loop that
checks dev_state under reset_cond_mutex, so a PCI remove signal
that arrives before the thread enters the wait is not lost. The
PCI remove callback sets dev_state to RESET_FAILED under the
same mutex before signaling. A lock/unlock barrier on
reset_ops_lock in the PCI remove path ensures teardown has
completed before emitting the INTR_RMV event.

Multi-process support is included: secondary processes unmap and
remap doorbell pages via IPC during the reset enter and exit
phases. Data path functions in both primary and secondary
processes check the device state atomically and return early when
the device is not active.

The driver emits RTE_ETH_EVENT_ERR_RECOVERING before entering the
reset path so that upper layers (e.g. netvsc) can switch their
data path before queues are stopped. The event is emitted outside
the reset lock to avoid deadlock if the callback calls dev_stop or
dev_close. On completion, the driver emits RECOVERY_SUCCESS or
RECOVERY_FAILED after releasing the lock. If a recovery callback
triggers dev_stop or dev_close, the self-join detection in
mana_join_reset_thread detaches the thread to avoid deadlock. If
the enter phase fails internally, RECOVERY_FAILED is sent
immediately so the application receives a terminal event. A PCI
device removal event callback distinguishes hot-remove from
service reset.

Documentation for the device reset feature is added in the MANA
NIC guide and the 26.07 release notes.

Signed-off-by: Wei Hu <weh@microsoft.com>
---
 doc/guides/nics/mana.rst               |   40 +
 doc/guides/rel_notes/release_26_07.rst |    8 +
 drivers/net/mana/mana.c                | 1088 ++++++++++++++++++++++--
 drivers/net/mana/mana.h                |   52 +-
 drivers/net/mana/mp.c                  |   89 +-
 drivers/net/mana/mr.c                  |    6 +-
 drivers/net/mana/rx.c                  |   23 +-
 drivers/net/mana/tx.c                  |   44 +-
 8 files changed, 1242 insertions(+), 108 deletions(-)

diff --git a/doc/guides/nics/mana.rst b/doc/guides/nics/mana.rst
index 0fcab6e2f6..08e345ea61 100644
--- a/doc/guides/nics/mana.rst
+++ b/doc/guides/nics/mana.rst
@@ -71,3 +71,43 @@ The user can specify below argument in devargs.
     The default value is not set,
     meaning all the NICs will be probed and loaded.
     User can specify multiple mac=xx:xx:xx:xx:xx:xx arguments for up to 8 NICs.
+
+Device Reset Support
+--------------------
+
+The MANA PMD supports automatic recovery from hardware service reset events.
+When the MANA kernel driver receives a hardware service event,
+it initiates a device reset and notifies userspace
+via ``IBV_EVENT_DEVICE_FATAL``.
+
+The driver handles this transparently through a two-phase reset flow:
+
+* **Enter phase**: The interrupt handler blocks new data path bursts
+  and waits for all in-flight burst calls to drain
+  using per-queue atomic flags,
+  then spawns a control thread for the remaining work.
+
+* **Teardown and exit phase**: The control thread tears down
+  IB resources and queues, unmaps secondary process doorbell pages,
+  and closes the device. After a delay for hardware recovery,
+  it re-probes the PCI device,
+  reinstalls the interrupt handler,
+  reinitializes resources, and restarts queues.
+
+The driver emits the following ethdev recovery events
+to notify upper layers (e.g. netvsc) of the reset lifecycle:
+
+``RTE_ETH_EVENT_ERR_RECOVERING``
+   Reset has started.
+
+``RTE_ETH_EVENT_RECOVERY_SUCCESS``
+   Device has recovered successfully.
+
+``RTE_ETH_EVENT_RECOVERY_FAILED``
+   Recovery failed.
+
+To distinguish a PCI hot-remove from a service reset,
+the driver registers for PCI device removal events.
+This requires the application to call ``rte_dev_event_monitor_start()``
+for removal events to be delivered
+(e.g. testpmd ``--hot-plug-handling`` option).
diff --git a/doc/guides/rel_notes/release_26_07.rst b/doc/guides/rel_notes/release_26_07.rst
index bd0cec2709..58e8c2422e 100644
--- a/doc/guides/rel_notes/release_26_07.rst
+++ b/doc/guides/rel_notes/release_26_07.rst
@@ -122,6 +122,14 @@ New Features
   Added AGENTS.md file for AI review
   and supporting scripts to review patches and documentation.
 
+* **Added device reset support to the MANA PMD.**
+
+  Added automatic recovery from hardware service reset events
+  in the MANA poll mode driver. The driver uses ethdev recovery events
+  (``RTE_ETH_EVENT_ERR_RECOVERING``, ``RTE_ETH_EVENT_RECOVERY_SUCCESS``,
+  ``RTE_ETH_EVENT_RECOVERY_FAILED``) to notify upper layers of the
+  reset lifecycle.
+
 
 Removed Items
 -------------
diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index 67396cda1f..0b72f711a1 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -103,6 +103,8 @@ mana_dev_configure(struct rte_eth_dev *dev)
 			      RTE_ETH_RX_OFFLOAD_VLAN_STRIP);
 
 	priv->num_queues = dev->data->nb_rx_queues;
+	DRV_LOG(DEBUG, "priv %p, port %u, dev port %u, num_queues: %u",
+		priv, priv->port_id, priv->dev_port, priv->num_queues);
 
 	manadv_set_context_attr(priv->ib_ctx, MANADV_CTX_ATTR_BUF_ALLOCATORS,
 				(void *)((uintptr_t)&(struct manadv_ctx_allocators){
@@ -214,8 +216,8 @@ mana_dev_start(struct rte_eth_dev *dev)
 
 	DRV_LOG(INFO, "TX/RX queues have started");
 
-	/* Enable datapath for secondary processes */
-	mana_mp_req_on_rxtx(dev, MANA_MP_REQ_START_RXTX);
+	/* Intentionally ignore errors — secondary may not be running */
+	(void)mana_mp_req_on_rxtx(dev, MANA_MP_REQ_START_RXTX);
 
 	ret = rxq_intr_enable(priv);
 	if (ret) {
@@ -242,26 +244,33 @@ mana_dev_stop(struct rte_eth_dev *dev)
 {
 	int ret;
 	struct mana_priv *priv = dev->data->dev_private;
-
-	rxq_intr_disable(priv);
+	enum mana_device_state state;
+
+	state = rte_atomic_load_explicit(&priv->dev_state,
+					 rte_memory_order_acquire);
+	if (state == MANA_DEV_ACTIVE ||
+	    state == MANA_DEV_RESET_FAILED) {
+		rxq_intr_disable(priv);
+		DRV_LOG(DEBUG, "rxq_intr_disable called");
+	}
 
 	dev->tx_pkt_burst = mana_tx_burst_removed;
 	dev->rx_pkt_burst = mana_rx_burst_removed;
 
-	/* Stop datapath on secondary processes */
-	mana_mp_req_on_rxtx(dev, MANA_MP_REQ_STOP_RXTX);
+	/* Intentionally ignore errors — secondary may not be running */
+	(void)mana_mp_req_on_rxtx(dev, MANA_MP_REQ_STOP_RXTX);
 
 	rte_wmb();
 
 	ret = mana_stop_tx_queues(dev);
 	if (ret) {
-		DRV_LOG(ERR, "failed to stop tx queues");
+		DRV_LOG(ERR, "failed to stop tx queues, ret %d", ret);
 		return ret;
 	}
 
 	ret = mana_stop_rx_queues(dev);
 	if (ret) {
-		DRV_LOG(ERR, "failed to stop tx queues");
+		DRV_LOG(ERR, "failed to stop rx queues, ret %d", ret);
 		return ret;
 	}
 
@@ -275,36 +284,66 @@ mana_dev_close(struct rte_eth_dev *dev)
 {
 	struct mana_priv *priv = dev->data->dev_private;
 	int ret;
+	enum mana_device_state state;
 
+	DRV_LOG(DEBUG, "Free MR for priv %p", priv);
 	mana_remove_all_mr(priv);
 
-	ret = mana_intr_uninstall(priv);
-	if (ret)
-		return ret;
+	state = rte_atomic_load_explicit(&priv->dev_state,
+					 rte_memory_order_acquire);
+	if (state == MANA_DEV_ACTIVE ||
+	    state == MANA_DEV_RESET_FAILED) {
+		ret = mana_intr_uninstall(priv);
+		if (ret)
+			return ret;
+	}
 
 	if (priv->ib_parent_pd) {
-		int err = ibv_dealloc_pd(priv->ib_parent_pd);
-		if (err)
-			DRV_LOG(ERR, "Failed to deallocate parent PD: %d", err);
+		ret = ibv_dealloc_pd(priv->ib_parent_pd);
+		if (ret)
+			DRV_LOG(ERR,
+				"Failed to deallocate parent PD: %d", ret);
 		priv->ib_parent_pd = NULL;
 	}
 
 	if (priv->ib_pd) {
-		int err = ibv_dealloc_pd(priv->ib_pd);
-		if (err)
-			DRV_LOG(ERR, "Failed to deallocate PD: %d", err);
+		ret = ibv_dealloc_pd(priv->ib_pd);
+		if (ret)
+			DRV_LOG(ERR, "Failed to deallocate PD: %d", ret);
 		priv->ib_pd = NULL;
 	}
 
-	ret = ibv_close_device(priv->ib_ctx);
-	if (ret) {
-		ret = errno;
-		return ret;
+	state = rte_atomic_load_explicit(&priv->dev_state,
+					 rte_memory_order_acquire);
+	if (state == MANA_DEV_ACTIVE ||
+	    state == MANA_DEV_RESET_FAILED) {
+		if (priv->ib_ctx) {
+			ret = ibv_close_device(priv->ib_ctx);
+			if (ret) {
+				ret = errno;
+				return ret;
+			}
+			priv->ib_ctx = NULL;
+		}
 	}
 
 	return 0;
 }
 
+/*
+ * Called from mana_pci_remove to free resources allocated
+ * during probe that are not freed by dev_close.
+ */
+static void
+mana_dev_free_resources(struct rte_eth_dev *dev)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+
+	pthread_mutex_destroy(&priv->reset_ops_lock);
+	pthread_mutex_destroy(&priv->reset_cond_mutex);
+	pthread_cond_destroy(&priv->reset_cond);
+}
+
 static int
 mana_dev_info_get(struct rte_eth_dev *dev,
 		  struct rte_eth_dev_info *dev_info)
@@ -391,6 +430,39 @@ mana_dev_info_get(struct rte_eth_dev *dev,
 	return 0;
 }
 
+/*
+ * Try to acquire the reset lock and verify the device is active.
+ * Returns 0 with lock held on success, or -EBUSY if the lock
+ * could not be acquired or the device is not in ACTIVE state.
+ */
+static int
+mana_reset_trylock(struct mana_priv *priv)
+{
+	if (pthread_mutex_trylock(&priv->reset_ops_lock))
+		return -EBUSY;
+
+	if (rte_atomic_load_explicit(&priv->dev_state,
+	    rte_memory_order_acquire) != MANA_DEV_ACTIVE) {
+		pthread_mutex_unlock(&priv->reset_ops_lock);
+		return -EBUSY;
+	}
+	return 0;
+}
+
+static int
+mana_dev_info_get_lock(struct rte_eth_dev *dev,
+		       struct rte_eth_dev_info *dev_info)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+	int ret;
+
+	if (mana_reset_trylock(priv))
+		return -EBUSY;
+	ret = mana_dev_info_get(dev, dev_info);
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+	return ret;
+}
+
 static void
 mana_dev_tx_queue_info(struct rte_eth_dev *dev, uint16_t queue_id,
 		       struct rte_eth_txq_info *qinfo)
@@ -552,6 +624,22 @@ mana_dev_tx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx,
 	return ret;
 }
 
+static int
+mana_dev_tx_queue_setup_lock(struct rte_eth_dev *dev, uint16_t queue_idx,
+			     uint16_t nb_desc, unsigned int socket_id,
+			     const struct rte_eth_txconf *tx_conf)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+	int ret;
+
+	if (mana_reset_trylock(priv))
+		return -EBUSY;
+	ret = mana_dev_tx_queue_setup(dev, queue_idx,
+				      nb_desc, socket_id, tx_conf);
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+	return ret;
+}
+
 static void
 mana_dev_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
 {
@@ -629,6 +717,23 @@ mana_dev_rx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx,
 	return ret;
 }
 
+static int
+mana_dev_rx_queue_setup_lock(struct rte_eth_dev *dev, uint16_t queue_idx,
+			     uint16_t nb_desc, unsigned int socket_id,
+			     const struct rte_eth_rxconf *rx_conf __rte_unused,
+			     struct rte_mempool *mp)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+	int ret;
+
+	if (mana_reset_trylock(priv))
+		return -EBUSY;
+	ret = mana_dev_rx_queue_setup(dev, queue_idx, nb_desc,
+				      socket_id, rx_conf, mp);
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+	return ret;
+}
+
 static void
 mana_dev_rx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
 {
@@ -820,33 +925,267 @@ mana_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
 	return mana_ifreq(priv, SIOCSIFMTU, &request);
 }
 
+static int
+mana_dev_configure_lock(struct rte_eth_dev *dev)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+	int ret;
+
+	if (mana_reset_trylock(priv))
+		return -EBUSY;
+	ret = mana_dev_configure(dev);
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+	return ret;
+}
+
+static int
+mana_dev_start_lock(struct rte_eth_dev *dev)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+	int ret;
+
+	if (mana_reset_trylock(priv))
+		return -EBUSY;
+	ret = mana_dev_start(dev);
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+	return ret;
+}
+
+/*
+ * Join the reset thread if it is active. Uses CAS on
+ * reset_thread_active to ensure only one caller joins.
+ * If called from the reset thread itself (e.g. via a recovery
+ * event callback that calls dev_stop/dev_close), detach instead
+ * of joining to avoid deadlock and let the thread self-free.
+ */
+static void
+mana_join_reset_thread(struct mana_priv *priv)
+{
+	bool expected = true;
+
+	if (rte_atomic_compare_exchange_strong_explicit(
+			&priv->reset_thread_active, &expected, false,
+			rte_memory_order_acq_rel,
+			rte_memory_order_acquire)) {
+		if (rte_thread_equal(rte_thread_self(),
+				     priv->reset_thread)) {
+			/* Self case: detach so resources are freed on
+			 * thread exit. Don't modify dev_state — the
+			 * caller (dev_stop_lock/dev_close_lock) handles
+			 * state transitions.
+			 */
+			rte_thread_detach(priv->reset_thread);
+			return;
+		}
+
+		pthread_mutex_lock(&priv->reset_cond_mutex);
+		rte_atomic_store_explicit(&priv->dev_state,
+			MANA_DEV_ACTIVE, rte_memory_order_release);
+		pthread_cond_signal(&priv->reset_cond);
+		pthread_mutex_unlock(&priv->reset_cond_mutex);
+		rte_thread_join(priv->reset_thread, NULL);
+	}
+}
+
+/*
+ * Clear per-queue burst_state so the data path CAS can succeed again.
+ * Must be called under reset_ops_lock when transitioning back to ACTIVE
+ * after a failed or aborted reset.
+ */
+static void
+mana_clear_burst_state(struct rte_eth_dev *dev)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+	int i;
+
+	for (i = 0; i < priv->num_queues; i++) {
+		struct mana_rxq *rxq = dev->data->rx_queues[i];
+		struct mana_txq *txq = dev->data->tx_queues[i];
+
+		if (rxq)
+			rte_atomic_store_explicit(&rxq->burst_state, 0,
+						  rte_memory_order_release);
+		if (txq)
+			rte_atomic_store_explicit(&txq->burst_state, 0,
+						  rte_memory_order_release);
+	}
+}
+
+/*
+ * Custom lock wrappers for dev_stop and dev_close.
+ * These join any active reset thread and use a blocking lock (not
+ * trylock) so they wait for any in-progress reset processing to
+ * finish, rather than returning -EBUSY. When the device is not in
+ * MANA_DEV_ACTIVE state, they transition state to MANA_DEV_ACTIVE.
+ */
+static int
+mana_dev_stop_lock(struct rte_eth_dev *dev)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+	int ret;
+
+	mana_join_reset_thread(priv);
+
+	pthread_mutex_lock(&priv->reset_ops_lock);
+
+	if (rte_atomic_load_explicit(&priv->dev_state,
+	    rte_memory_order_acquire) != MANA_DEV_ACTIVE) {
+		mana_clear_burst_state(dev);
+		rte_atomic_store_explicit(&priv->dev_state,
+			MANA_DEV_ACTIVE, rte_memory_order_release);
+		pthread_mutex_unlock(&priv->reset_ops_lock);
+		return 0;
+	}
+
+	ret = mana_dev_stop(dev);
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+	return ret;
+}
+
+static int
+mana_dev_close_lock(struct rte_eth_dev *dev)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+	int ret;
+
+	mana_join_reset_thread(priv);
+
+	pthread_mutex_lock(&priv->reset_ops_lock);
+
+	if (rte_atomic_load_explicit(&priv->dev_state,
+	    rte_memory_order_acquire) != MANA_DEV_ACTIVE) {
+		mana_clear_burst_state(dev);
+		rte_atomic_store_explicit(&priv->dev_state,
+			MANA_DEV_ACTIVE, rte_memory_order_release);
+	}
+
+	ret = mana_dev_close(dev);
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+	return ret;
+}
+
+static int
+mana_rss_hash_update_lock(struct rte_eth_dev *dev,
+			  struct rte_eth_rss_conf *rss_conf)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+	int ret;
+
+	if (mana_reset_trylock(priv))
+		return -EBUSY;
+	ret = mana_rss_hash_update(dev, rss_conf);
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+	return ret;
+}
+
+static int
+mana_rss_hash_conf_get_lock(struct rte_eth_dev *dev,
+			    struct rte_eth_rss_conf *rss_conf)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+	int ret;
+
+	if (mana_reset_trylock(priv))
+		return -EBUSY;
+	ret = mana_rss_hash_conf_get(dev, rss_conf);
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+	return ret;
+}
+
+static void
+mana_dev_tx_queue_release_lock(struct rte_eth_dev *dev, uint16_t qid)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+
+	if (mana_reset_trylock(priv)) {
+		DRV_LOG(ERR, "Device reset in progress, "
+			"mana_dev_tx_queue_release not called");
+		return;
+	}
+	mana_dev_tx_queue_release(dev, qid);
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+}
+
+static void
+mana_dev_rx_queue_release_lock(struct rte_eth_dev *dev, uint16_t qid)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+
+	if (mana_reset_trylock(priv)) {
+		DRV_LOG(ERR, "Device reset in progress, "
+			"mana_dev_rx_queue_release not called");
+		return;
+	}
+	mana_dev_rx_queue_release(dev, qid);
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+}
+
+static int
+mana_rx_intr_enable_lock(struct rte_eth_dev *dev, uint16_t rx_queue_id)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+	int ret;
+
+	if (mana_reset_trylock(priv))
+		return -EBUSY;
+	ret = mana_rx_intr_enable(dev, rx_queue_id);
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+	return ret;
+}
+
+static int
+mana_rx_intr_disable_lock(struct rte_eth_dev *dev, uint16_t rx_queue_id)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+	int ret;
+
+	if (mana_reset_trylock(priv))
+		return -EBUSY;
+	ret = mana_rx_intr_disable(dev, rx_queue_id);
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+	return ret;
+}
+
+static int
+mana_mtu_set_lock(struct rte_eth_dev *dev, uint16_t mtu)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+	int ret;
+
+	if (mana_reset_trylock(priv))
+		return -EBUSY;
+	ret = mana_mtu_set(dev, mtu);
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+	return ret;
+}
+
 static const struct eth_dev_ops mana_dev_ops = {
-	.dev_configure		= mana_dev_configure,
-	.dev_start		= mana_dev_start,
-	.dev_stop		= mana_dev_stop,
-	.dev_close		= mana_dev_close,
-	.dev_infos_get		= mana_dev_info_get,
+	.dev_configure		= mana_dev_configure_lock,
+	.dev_start		= mana_dev_start_lock,
+	.dev_stop		= mana_dev_stop_lock,
+	.dev_close		= mana_dev_close_lock,
+	.dev_infos_get		= mana_dev_info_get_lock,
 	.txq_info_get		= mana_dev_tx_queue_info,
 	.rxq_info_get		= mana_dev_rx_queue_info,
 	.dev_supported_ptypes_get = mana_supported_ptypes,
-	.rss_hash_update	= mana_rss_hash_update,
-	.rss_hash_conf_get	= mana_rss_hash_conf_get,
-	.tx_queue_setup		= mana_dev_tx_queue_setup,
-	.tx_queue_release	= mana_dev_tx_queue_release,
-	.rx_queue_setup		= mana_dev_rx_queue_setup,
-	.rx_queue_release	= mana_dev_rx_queue_release,
-	.rx_queue_intr_enable	= mana_rx_intr_enable,
-	.rx_queue_intr_disable	= mana_rx_intr_disable,
+	.rss_hash_update	= mana_rss_hash_update_lock,
+	.rss_hash_conf_get	= mana_rss_hash_conf_get_lock,
+	.tx_queue_setup		= mana_dev_tx_queue_setup_lock,
+	.tx_queue_release	= mana_dev_tx_queue_release_lock,
+	.rx_queue_setup		= mana_dev_rx_queue_setup_lock,
+	.rx_queue_release	= mana_dev_rx_queue_release_lock,
+	.rx_queue_intr_enable	= mana_rx_intr_enable_lock,
+	.rx_queue_intr_disable	= mana_rx_intr_disable_lock,
 	.link_update		= mana_dev_link_update,
 	.stats_get		= mana_dev_stats_get,
 	.stats_reset		= mana_dev_stats_reset,
-	.mtu_set		= mana_mtu_set,
+	.mtu_set		= mana_mtu_set_lock,
 };
 
 static const struct eth_dev_ops mana_dev_secondary_ops = {
 	.stats_get = mana_dev_stats_get,
 	.stats_reset = mana_dev_stats_reset,
-	.dev_infos_get = mana_dev_info_get,
+	.dev_infos_get = mana_dev_info_get_lock,
 };
 
 uint16_t
@@ -1031,28 +1370,516 @@ mana_ibv_device_to_pci_addr(const struct ibv_device *device,
 	return 0;
 }
 
+static int mana_pci_probe(struct rte_pci_driver *pci_drv,
+			  struct rte_pci_device *pci_dev);
+static void mana_intr_handler(void *arg);
+static void mana_reset_exit(struct mana_priv *priv);
+
+/* Delay before initiating reset exit after reset enter completes */
+#define MANA_RESET_TIMER_US (15 * 1000000ULL) /* 15 seconds */
+
 /*
- * Interrupt handler from IB layer to notify this device is being removed.
+ * Callback for PCI device removal events from EAL.
+ * If the device is in reset (RESET_EXIT state), this means the PCI
+ * device was hot-removed rather than a service reset. Wake the reset
+ * thread via condvar and notify netvsc via RTE_ETH_EVENT_INTR_RMV.
+ */
+static void
+mana_pci_remove_event_cb(const char *device_name,
+			 enum rte_dev_event_type event, void *cb_arg)
+{
+	struct mana_priv *priv = cb_arg;
+	struct rte_eth_dev *dev;
+
+	if (event != RTE_DEV_EVENT_REMOVE)
+		return;
+
+	DRV_LOG(INFO, "PCI device %s removed", device_name);
+
+	/* Wake the reset thread immediately */
+	pthread_mutex_lock(&priv->reset_cond_mutex);
+	rte_atomic_store_explicit(&priv->dev_state,
+		MANA_DEV_RESET_FAILED, rte_memory_order_release);
+	pthread_cond_signal(&priv->reset_cond);
+	pthread_mutex_unlock(&priv->reset_cond_mutex);
+
+	/* Wait for the reset thread to finish teardown and release
+	 * reset_ops_lock before emitting INTR_RMV to the application.
+	 */
+	pthread_mutex_lock(&priv->reset_ops_lock);
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+
+	dev = &rte_eth_devices[priv->port_id];
+	DRV_LOG(INFO, "Sending RTE_ETH_EVENT_INTR_RMV for port %u",
+		priv->port_id);
+	rte_eth_dev_callback_process(dev,
+		RTE_ETH_EVENT_INTR_RMV, NULL);
+}
+
+/*
+ * Reset thread: performs teardown immediately, waits for the
+ * recovery timer, then re-probes and restarts the device.
+ * Runs on a control thread so it can call blocking IPC, ibv
+ * teardown, and rte_intr_callback_unregister (which all must
+ * not run on the EAL interrupt thread).
+ */
+static uint32_t
+mana_reset_thread(void *arg)
+{
+	struct mana_priv *priv = (struct mana_priv *)arg;
+	struct rte_eth_dev *dev = &rte_eth_devices[priv->port_id];
+	struct timespec ts;
+	int ret;
+	int i;
+
+	DRV_LOG(INFO, "Reset thread started");
+
+	pthread_mutex_lock(&priv->reset_ops_lock);
+
+	/* Teardown: stop data path, unmap secondary doorbells, close device,
+	 * free MR caches. Must happen immediately — hardware may be gone.
+	 */
+	ret = mana_dev_stop(dev);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to stop mana dev ret %d", ret);
+		rte_atomic_store_explicit(&priv->dev_state,
+			MANA_DEV_RESET_FAILED, rte_memory_order_release);
+		goto reset_failed;
+	}
+
+	ret = mana_mp_req_on_rxtx(dev, MANA_MP_REQ_RESET_ENTER);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to reset secondary processes ret = %d",
+			ret);
+		rte_atomic_store_explicit(&priv->dev_state,
+			MANA_DEV_RESET_FAILED, rte_memory_order_release);
+		goto reset_failed;
+	}
+
+	ret = mana_dev_close(dev);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to close mana dev ret %d", ret);
+		rte_atomic_store_explicit(&priv->dev_state,
+			MANA_DEV_RESET_FAILED, rte_memory_order_release);
+		goto reset_failed;
+	}
+
+	for (i = 0; i < priv->num_queues; i++) {
+		struct mana_rxq *rxq = dev->data->rx_queues[i];
+		struct mana_txq *txq = dev->data->tx_queues[i];
+
+		DRV_LOG(DEBUG, "Free MR for priv = %p, rxq %u, txq %u",
+			priv, rxq->rxq_idx, txq->txq_idx);
+		mana_mr_btree_free(&rxq->mr_btree);
+		mana_mr_btree_free(&txq->mr_btree);
+	}
+
+	DRV_LOG(DEBUG, "Teardown complete");
+
+	rte_atomic_store_explicit(&priv->dev_state, MANA_DEV_RESET_EXIT,
+				     rte_memory_order_release);
+
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+
+	/* Wait for the recovery timer before re-probing.
+	 * Check dev_state under reset_cond_mutex before waiting:
+	 * if mana_pci_remove_event_cb already set RESET_FAILED
+	 * (under the same mutex), we skip the wait entirely.
+	 * This avoids losing a condvar signal that arrived before
+	 * we entered the wait.
+	 */
+	DRV_LOG(INFO, "Waiting %us for hardware recovery",
+		(unsigned int)(MANA_RESET_TIMER_US / 1000000));
+
+	clock_gettime(CLOCK_REALTIME, &ts);
+	ts.tv_sec += MANA_RESET_TIMER_US / 1000000;
+
+	pthread_mutex_lock(&priv->reset_cond_mutex);
+	while (rte_atomic_load_explicit(&priv->dev_state,
+	       rte_memory_order_acquire) == MANA_DEV_RESET_EXIT) {
+		if (pthread_cond_timedwait(&priv->reset_cond,
+		    &priv->reset_cond_mutex, &ts))
+			break; /* timeout */
+	}
+	pthread_mutex_unlock(&priv->reset_cond_mutex);
+
+	pthread_mutex_lock(&priv->reset_ops_lock);
+
+	if (rte_atomic_load_explicit(&priv->dev_state,
+	    rte_memory_order_acquire) != MANA_DEV_RESET_EXIT) {
+		DRV_LOG(INFO, "Reset thread: dev_state=%d, skipping exit",
+			(int)rte_atomic_load_explicit(&priv->dev_state,
+			rte_memory_order_acquire));
+		pthread_mutex_unlock(&priv->reset_ops_lock);
+		return 0;
+	}
+
+	DRV_LOG(INFO, "Reset thread: initiating reset exit");
+	mana_reset_exit(priv);
+	/* Lock is released by mana_reset_exit_delay.
+	 * reset_thread_active remains true — the joiner
+	 * (mana_join_reset_thread) will either join or detach
+	 * (if called from this thread's own callback).
+	 */
+	return 0;
+
+reset_failed:
+	mana_clear_burst_state(dev);
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+
+	DRV_LOG(INFO, "Sending RTE_ETH_EVENT_RECOVERY_FAILED for port %u",
+		priv->port_id);
+	rte_eth_dev_callback_process(dev,
+		RTE_ETH_EVENT_RECOVERY_FAILED, NULL);
+	return 0;
+}
+
+static void
+mana_reset_enter(struct mana_priv *priv)
+{
+	int ret;
+	int i;
+	struct rte_eth_dev *dev = &rte_eth_devices[priv->port_id];
+
+	/*
+	 * Lock ownership: mana_intr_handler acquires reset_ops_lock,
+	 * mana_reset_enter sets state/drains/spawns thread and releases it.
+	 * The reset thread independently acquires/releases the lock for
+	 * teardown and for the exit (re-probe) phase.
+	 */
+
+	rte_atomic_store_explicit(&priv->dev_state, MANA_DEV_RESET_ENTER,
+				     rte_memory_order_release);
+
+	DRV_LOG(DEBUG, "Entering into device reset state");
+	DRV_LOG(DEBUG, "Resetting dev = %p, priv = %p", dev, priv);
+
+	/* Set the blocked bit on each queue's burst_state so new bursts
+	 * are rejected, then wait for any in-flight burst (bit 0) to finish.
+	 */
+	for (i = 0; i < priv->num_queues; i++) {
+		struct mana_rxq *rxq = dev->data->rx_queues[i];
+		struct mana_txq *txq = dev->data->tx_queues[i];
+
+		if (rxq)
+			rte_atomic_fetch_or_explicit(&rxq->burst_state,
+				MANA_BURST_BLOCKED,
+				rte_memory_order_release);
+		if (txq)
+			rte_atomic_fetch_or_explicit(&txq->burst_state,
+				MANA_BURST_BLOCKED,
+				rte_memory_order_release);
+	}
+
+	/* Wait for all in-flight burst calls to finish (bit 0 to clear) */
+	for (i = 0; i < priv->num_queues; i++) {
+		struct mana_rxq *rxq = dev->data->rx_queues[i];
+		struct mana_txq *txq = dev->data->tx_queues[i];
+
+		if (rxq)
+			while (rte_atomic_load_explicit(&rxq->burst_state,
+				    rte_memory_order_acquire) & 1)
+				rte_pause();
+		if (txq)
+			while (rte_atomic_load_explicit(&txq->burst_state,
+				    rte_memory_order_acquire) & 1)
+				rte_pause();
+	}
+
+	DRV_LOG(DEBUG, "All data path threads drained");
+
+	/* Join previous reset thread if it completed but was not joined.
+	 * Use CAS to avoid double-join if another path joined first.
+	 * Don't use mana_join_reset_thread() here — we are already in
+	 * RESET_ENTER state and must not change dev_state to ACTIVE.
+	 */
+	{
+		bool expected = true;
+
+		if (rte_atomic_compare_exchange_strong_explicit(
+				&priv->reset_thread_active, &expected, false,
+				rte_memory_order_acq_rel,
+				rte_memory_order_acquire))
+			rte_thread_join(priv->reset_thread, NULL);
+	}
+
+	ret = rte_thread_create_internal_control(&priv->reset_thread,
+						 "mana-reset",
+						 mana_reset_thread, priv);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to create reset thread ret %d", ret);
+		rte_atomic_store_explicit(&priv->dev_state,
+					  MANA_DEV_RESET_FAILED,
+					  rte_memory_order_release);
+		goto reset_failed;
+	}
+	rte_atomic_store_explicit(&priv->reset_thread_active,
+		true, rte_memory_order_release);
+
+	DRV_LOG(DEBUG, "Reset thread started");
+
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+	return;
+
+reset_failed:
+	mana_clear_burst_state(dev);
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+}
+
+static int
+mana_reset_exit_delay(void *arg)
+{
+	struct mana_priv *priv = (struct mana_priv *)arg;
+	int ret = 0;
+	int i;
+	struct rte_eth_dev *dev;
+	struct rte_pci_device *pci_dev;
+
+	DRV_LOG(DEBUG, "Delayed mana device reset complete processing");
+
+	/* If the app called dev_stop/dev_close during the timer window,
+	 * state is no longer RESET_EXIT. Nothing to do.
+	 */
+	if (rte_atomic_load_explicit(&priv->dev_state,
+	    rte_memory_order_acquire) != MANA_DEV_RESET_EXIT) {
+		DRV_LOG(DEBUG, "State is not RESET_EXIT, skipping");
+		pthread_mutex_unlock(&priv->reset_ops_lock);
+		return ret;
+	}
+
+	dev = &rte_eth_devices[priv->port_id];
+	pci_dev = RTE_CLASS_TO_BUS_DEVICE(dev, *pci_dev);
+
+	DRV_LOG(DEBUG, "Resetting dev = %p, priv = %p", dev, priv);
+
+	ret = ibv_close_device(priv->ib_ctx);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to close ibv device %d", ret);
+		rte_atomic_store_explicit(&priv->dev_state, MANA_DEV_RESET_FAILED,
+				     rte_memory_order_release);
+		goto out;
+	}
+	priv->ib_ctx = NULL;
+
+	ret = mana_pci_probe(NULL, pci_dev);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to probe mana pci dev ret %d", ret);
+		rte_atomic_store_explicit(&priv->dev_state, MANA_DEV_RESET_FAILED,
+				     rte_memory_order_release);
+		goto out;
+	}
+
+	/*
+	 * Init the local MR caches.
+	 */
+	for (i = 0; i < priv->num_queues; i++) {
+		struct mana_rxq *rxq = dev->data->rx_queues[i];
+		struct mana_txq *txq = dev->data->tx_queues[i];
+
+		ret = mana_mr_btree_init(&rxq->mr_btree,
+					 MANA_MR_BTREE_PER_QUEUE_N,
+					 rxq->socket);
+		if (ret) {
+			DRV_LOG(ERR, "Failed to init RXQ %d MR btree "
+				"on socket %u, ret %d", i, rxq->socket, ret);
+			goto mr_init_failed_rxq;
+		}
+
+		ret = mana_mr_btree_init(&txq->mr_btree,
+					 MANA_MR_BTREE_PER_QUEUE_N,
+					 txq->socket);
+		if (ret) {
+			DRV_LOG(ERR, "Failed to init TXQ %d MR btree "
+				"on socket %u, ret %d", i, txq->socket, ret);
+			goto mr_init_failed_txq;
+		}
+	}
+	DRV_LOG(DEBUG, "priv %p, num_queues %u", priv, priv->num_queues);
+
+	/* Start secondaries */
+	ret = mana_mp_req_on_rxtx(dev, MANA_MP_REQ_RESET_EXIT);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to start secondary processes ret = %d",
+			ret);
+		goto mr_init_failed_all;
+	}
+
+	ret = mana_dev_start(dev);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to start mana dev ret %d", ret);
+		goto mr_init_failed_all;
+	}
+
+	/* Clear per-queue burst_state before marking device active so
+	 * data path CAS can succeed again.
+	 */
+	for (i = 0; i < priv->num_queues; i++) {
+		struct mana_rxq *rxq = dev->data->rx_queues[i];
+		struct mana_txq *txq = dev->data->tx_queues[i];
+
+		if (rxq)
+			rte_atomic_store_explicit(&rxq->burst_state, 0,
+						  rte_memory_order_release);
+		if (txq)
+			rte_atomic_store_explicit(&txq->burst_state, 0,
+						  rte_memory_order_release);
+	}
+
+	rte_atomic_store_explicit(&priv->dev_state, MANA_DEV_ACTIVE,
+				     rte_memory_order_release);
+
+	DRV_LOG(DEBUG, "Exiting the reset complete processing");
+	goto out;
+
+mr_init_failed_all:
+	i = priv->num_queues;
+	goto mr_init_failed_rxq;
+
+mr_init_failed_txq:
+	/* RXQ btree at index i was initialized, free it */
+	mana_mr_btree_free(&((struct mana_rxq *)
+			     dev->data->rx_queues[i])->mr_btree);
+
+mr_init_failed_rxq:
+	/* Free all fully initialized btrees for indices < i */
+	for (int j = 0; j < i; j++) {
+		struct mana_rxq *rxq = dev->data->rx_queues[j];
+		struct mana_txq *txq = dev->data->tx_queues[j];
+
+		mana_mr_btree_free(&rxq->mr_btree);
+		mana_mr_btree_free(&txq->mr_btree);
+	}
+	rte_atomic_store_explicit(&priv->dev_state, MANA_DEV_RESET_FAILED,
+				     rte_memory_order_release);
+
+out:
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+
+	if (!ret) {
+		DRV_LOG(INFO, "Sending RTE_ETH_EVENT_RECOVERY_SUCCESS for port %u",
+			priv->port_id);
+		rte_eth_dev_callback_process(dev,
+			RTE_ETH_EVENT_RECOVERY_SUCCESS, NULL);
+	} else {
+		DRV_LOG(INFO, "Sending RTE_ETH_EVENT_RECOVERY_FAILED for port %u",
+			priv->port_id);
+		rte_eth_dev_callback_process(dev,
+			RTE_ETH_EVENT_RECOVERY_FAILED, NULL);
+	}
+	return ret;
+}
+
+static void
+mana_reset_exit(struct mana_priv *priv)
+{
+	int ret;
+
+	if (!priv) {
+		DRV_LOG(ERR, "Private structure invalid");
+		return;
+	}
+	DRV_LOG(DEBUG, "Entering into device reset complete processing");
+
+	rxq_intr_disable(priv);
+
+	/* Unregister the interrupt handler. Since mana_reset_exit is always
+	 * called from mana_reset_thread (a non-interrupt thread), the
+	 * interrupt source is inactive and rte_intr_callback_unregister
+	 * succeeds directly.
+	 */
+	if (priv->intr_handle) {
+		ret = rte_intr_callback_unregister(priv->intr_handle,
+						   mana_intr_handler, priv);
+		if (ret < 0)
+			DRV_LOG(ERR, "Failed to unregister intr callback ret %d",
+				ret);
+		else
+			DRV_LOG(DEBUG, "%d intr callback(s) removed", ret);
+
+		rte_intr_instance_free(priv->intr_handle);
+		priv->intr_handle = NULL;
+	}
+
+	/* Proceed directly to reset exit delay (re-probe and restart).
+	 * No need for a separate thread - we are already on
+	 * mana_reset_thread which is a non-interrupt control thread.
+	 */
+	mana_reset_exit_delay(priv);
+}
+
+/*
+ * Interrupt handler from IB layer to notify this device is
+ * being removed or reset.
  */
 static void
 mana_intr_handler(void *arg)
 {
 	struct mana_priv *priv = arg;
 	struct ibv_context *ctx = priv->ib_ctx;
-	struct ibv_async_event event;
+	struct ibv_async_event event = { 0 };
+	struct rte_eth_dev *dev;
 
 	/* Read and ack all messages from IB device */
 	while (true) {
 		if (ibv_get_async_event(ctx, &event))
 			break;
 
-		if (event.event_type == IBV_EVENT_DEVICE_FATAL) {
-			struct rte_eth_dev *dev;
-
-			dev = &rte_eth_devices[priv->port_id];
-			if (dev->data->dev_conf.intr_conf.rmv)
+		switch (event.event_type) {
+		case IBV_EVENT_DEVICE_FATAL:
+			DRV_LOG(INFO, "IBV_EVENT_DEVICE_FATAL received, dev_state=%d",
+				(int)rte_atomic_load_explicit(&priv->dev_state,
+				rte_memory_order_acquire));
+			if (rte_atomic_load_explicit(&priv->dev_state,
+			    rte_memory_order_acquire) == MANA_DEV_ACTIVE) {
+				/* Notify upper layers (e.g. netvsc) before
+				 * acquiring the lock so they can switch data
+				 * path before mana stops queues. Emitting
+				 * outside the lock avoids deadlock if the
+				 * callback calls dev_stop/dev_close.
+				 */
+				dev = &rte_eth_devices[priv->port_id];
+				DRV_LOG(INFO,
+					"Sending RTE_ETH_EVENT_ERR_RECOVERING for port %u",
+					priv->port_id);
 				rte_eth_dev_callback_process(dev,
-					RTE_ETH_EVENT_INTR_RMV, NULL);
+					RTE_ETH_EVENT_ERR_RECOVERING,
+					NULL);
+
+				pthread_mutex_lock(&priv->reset_ops_lock);
+
+				/* Re-check after lock to avoid racing with
+				 * mana_pci_remove_event_cb which may have
+				 * set RESET_FAILED while we waited.
+				 */
+				if (rte_atomic_load_explicit(&priv->dev_state,
+				    rte_memory_order_acquire) !=
+				    MANA_DEV_ACTIVE) {
+					pthread_mutex_unlock(
+						&priv->reset_ops_lock);
+					break;
+				}
+
+				mana_reset_enter(priv);
+
+				if (rte_atomic_load_explicit(&priv->dev_state,
+				    rte_memory_order_acquire) ==
+				    MANA_DEV_RESET_FAILED) {
+					DRV_LOG(INFO,
+						"Sending RTE_ETH_EVENT_RECOVERY_FAILED for port %u",
+						priv->port_id);
+					rte_eth_dev_callback_process(dev,
+						RTE_ETH_EVENT_RECOVERY_FAILED,
+						NULL);
+				}
+			} else {
+				DRV_LOG(ERR, "Already in reset handling, dev_state=%d",
+					(int)rte_atomic_load_explicit(&priv->dev_state,
+					rte_memory_order_acquire));
+			}
+			break;
+
+		default:
+			break;
 		}
 
 		ibv_ack_async_event(&event);
@@ -1063,6 +1890,23 @@ static int
 mana_intr_uninstall(struct mana_priv *priv)
 {
 	int ret;
+	struct rte_eth_dev *dev;
+
+	if (!priv->intr_handle)
+		return 0;
+
+	/* Unregister PCI device removal event callback.
+	 * Do not retry on -EAGAIN to avoid deadlock: the callback
+	 * may be blocked waiting for reset_ops_lock which we hold.
+	 */
+	dev = &rte_eth_devices[priv->port_id];
+	if (dev->device) {
+		ret = rte_dev_event_callback_unregister(dev->device->name,
+			mana_pci_remove_event_cb, priv);
+		if (ret < 0 && ret != -ENOENT)
+			DRV_LOG(WARNING, "Failed to unregister PCI remove cb ret %d",
+				ret);
+	}
 
 	ret = rte_intr_callback_unregister(priv->intr_handle,
 					   mana_intr_handler, priv);
@@ -1072,6 +1916,7 @@ mana_intr_uninstall(struct mana_priv *priv)
 	}
 
 	rte_intr_instance_free(priv->intr_handle);
+	priv->intr_handle = NULL;
 
 	return 0;
 }
@@ -1127,6 +1972,16 @@ mana_intr_install(struct rte_eth_dev *eth_dev, struct mana_priv *priv)
 		goto free_intr;
 	}
 
+	/* Register for PCI device removal events to distinguish
+	 * PCI hot-remove from service reset. This requires the
+	 * application to call rte_dev_event_monitor_start() for
+	 * events to be delivered (e.g. testpmd --hot-plug-handling).
+	 */
+	ret = rte_dev_event_callback_register(eth_dev->device->name,
+					      mana_pci_remove_event_cb, priv);
+	if (ret)
+		DRV_LOG(WARNING, "Failed to register PCI remove event callback");
+
 	eth_dev->intr_handle = priv->intr_handle;
 	return 0;
 
@@ -1156,7 +2011,7 @@ mana_proc_priv_init(struct rte_eth_dev *dev)
 /*
  * Map the doorbell page for the secondary process through IB device handle.
  */
-static int
+int
 mana_map_doorbell_secondary(struct rte_eth_dev *eth_dev, int fd)
 {
 	struct mana_process_priv *priv = eth_dev->process_private;
@@ -1294,17 +2149,29 @@ mana_probe_port(struct ibv_device *ibdev, struct ibv_device_attr_ex *dev_attr,
 	char name[RTE_ETH_NAME_MAX_LEN];
 	int ret;
 	struct ibv_context *ctx = NULL;
+	bool is_reset = false;
+	pthread_mutexattr_t mattr;
+	pthread_condattr_t cattr;
 
 	rte_ether_format_addr(address, sizeof(address), addr);
-	DRV_LOG(INFO, "device located port %u address %s", port, address);
 
-	priv = rte_zmalloc_socket(NULL, sizeof(*priv), RTE_CACHE_LINE_SIZE,
-				  SOCKET_ID_ANY);
-	if (!priv)
-		return -ENOMEM;
+	DRV_LOG(DEBUG, "device located port %u address %s", port, address);
 
 	snprintf(name, sizeof(name), "%s_port%d", pci_dev->device.name, port);
 
+	eth_dev = rte_eth_dev_allocated(name);
+	if (eth_dev) {
+		is_reset = true;
+		priv = eth_dev->data->dev_private;
+		DRV_LOG(DEBUG, "Device reset for eth_dev %p priv %p",
+			eth_dev, priv);
+	} else {
+		priv = rte_zmalloc_socket(NULL, sizeof(*priv), RTE_CACHE_LINE_SIZE,
+					  SOCKET_ID_ANY);
+		if (!priv)
+			return -ENOMEM;
+	}
+
 	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
 		int fd;
 
@@ -1317,6 +2184,7 @@ mana_probe_port(struct ibv_device *ibdev, struct ibv_device_attr_ex *dev_attr,
 
 		eth_dev->device = &pci_dev->device;
 		eth_dev->dev_ops = &mana_dev_secondary_ops;
+
 		ret = mana_proc_priv_init(eth_dev);
 		if (ret)
 			goto failed;
@@ -1336,7 +2204,7 @@ mana_probe_port(struct ibv_device *ibdev, struct ibv_device_attr_ex *dev_attr,
 			goto failed;
 		}
 
-		/* fd is no not used after mapping doorbell */
+		/* fd is not used after mapping doorbell */
 		close(fd);
 
 		eth_dev->tx_pkt_burst = mana_tx_burst;
@@ -1355,22 +2223,6 @@ mana_probe_port(struct ibv_device *ibdev, struct ibv_device_attr_ex *dev_attr,
 		goto failed;
 	}
 
-	eth_dev = rte_eth_dev_allocate(name);
-	if (!eth_dev) {
-		ret = -ENOMEM;
-		goto failed;
-	}
-
-	eth_dev->data->mac_addrs =
-		rte_calloc("mana_mac", 1,
-			   sizeof(struct rte_ether_addr), 0);
-	if (!eth_dev->data->mac_addrs) {
-		ret = -ENOMEM;
-		goto failed;
-	}
-
-	rte_ether_addr_copy(addr, eth_dev->data->mac_addrs);
-
 	priv->ib_pd = ibv_alloc_pd(ctx);
 	if (!priv->ib_pd) {
 		DRV_LOG(ERR, "ibv_alloc_pd failed port %d", port);
@@ -1390,10 +2242,6 @@ mana_probe_port(struct ibv_device *ibdev, struct ibv_device_attr_ex *dev_attr,
 	}
 
 	priv->ib_ctx = ctx;
-	priv->port_id = eth_dev->data->port_id;
-	priv->dev_port = port;
-	eth_dev->data->dev_private = priv;
-	priv->dev_data = eth_dev->data;
 
 	priv->max_rx_queues = dev_attr->orig_attr.max_qp;
 	priv->max_tx_queues = dev_attr->orig_attr.max_qp;
@@ -1415,23 +2263,72 @@ mana_probe_port(struct ibv_device *ibdev, struct ibv_device_attr_ex *dev_attr,
 		name, priv->max_rx_queues, priv->max_rx_desc,
 		priv->max_send_sge, priv->max_mr_size);
 
-	rte_eth_copy_pci_info(eth_dev, pci_dev);
+	if (!is_reset) {
+		eth_dev = rte_eth_dev_allocate(name);
+		if (!eth_dev) {
+			ret = -ENOMEM;
+			goto failed;
+		}
 
-	/* Create async interrupt handler */
-	ret = mana_intr_install(eth_dev, priv);
-	if (ret) {
-		DRV_LOG(ERR, "Failed to install intr handler");
-		goto failed;
+		eth_dev->data->mac_addrs =
+			rte_calloc("mana_mac", 1,
+				   sizeof(struct rte_ether_addr), 0);
+		if (!eth_dev->data->mac_addrs) {
+			ret = -ENOMEM;
+			goto failed;
+		}
+
+		rte_ether_addr_copy(addr, eth_dev->data->mac_addrs);
+	} else {
+		/*
+		 * Reset path.
+		 */
+		rte_ether_format_addr(address, RTE_ETHER_ADDR_FMT_SIZE,
+				      eth_dev->data->mac_addrs);
+		DRV_LOG(DEBUG, "Found existing eth_dev %p with mac addr %s",
+			eth_dev, address);
+		DRV_LOG(DEBUG, "ib_ctx = %p", priv->ib_ctx);
+		goto out;
 	}
 
-	eth_dev->device = &pci_dev->device;
+	priv->port_id = eth_dev->data->port_id;
+	priv->dev_port = port;
+	eth_dev->data->dev_private = priv;
+	priv->dev_data = eth_dev->data;
+	rte_atomic_store_explicit(&priv->dev_state, MANA_DEV_ACTIVE,
+				     rte_memory_order_release);
+
+	rte_eth_copy_pci_info(eth_dev, pci_dev);
+
+	pthread_mutexattr_init(&mattr);
+	pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED);
+	pthread_mutex_init(&priv->reset_ops_lock, &mattr);
+	pthread_mutex_init(&priv->reset_cond_mutex, &mattr);
+	pthread_mutexattr_destroy(&mattr);
+
+	pthread_condattr_init(&cattr);
+	pthread_condattr_setpshared(&cattr, PTHREAD_PROCESS_SHARED);
+	pthread_cond_init(&priv->reset_cond, &cattr);
+	pthread_condattr_destroy(&cattr);
 
-	DRV_LOG(INFO, "device %s at port %u", name, eth_dev->data->port_id);
+	eth_dev->device = &pci_dev->device;
 
 	eth_dev->rx_pkt_burst = mana_rx_burst_removed;
 	eth_dev->tx_pkt_burst = mana_tx_burst_removed;
 	eth_dev->dev_ops = &mana_dev_ops;
 
+out:
+	/* Create async interrupt handler */
+	ret = mana_intr_install(eth_dev, priv);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to install intr handler, ret %d", ret);
+		goto failed;
+	}
+	DRV_LOG(INFO, "mana_intr_install succeeded");
+
+	DRV_LOG(INFO, "device %s priv %p dev port %d at port %u",
+		name, priv, priv->dev_port, eth_dev->data->port_id);
+
 	rte_eth_dev_probing_finish(eth_dev);
 
 	return 0;
@@ -1439,20 +2336,29 @@ mana_probe_port(struct ibv_device *ibdev, struct ibv_device_attr_ex *dev_attr,
 failed:
 	/* Free the resource for the port failed */
 	if (priv) {
-		if (priv->ib_parent_pd)
+		if (priv->ib_parent_pd) {
 			ibv_dealloc_pd(priv->ib_parent_pd);
+			priv->ib_parent_pd = NULL;
+		}
 
-		if (priv->ib_pd)
+		if (priv->ib_pd) {
 			ibv_dealloc_pd(priv->ib_pd);
+			priv->ib_pd = NULL;
+		}
 	}
 
-	if (eth_dev)
-		rte_eth_dev_release_port(eth_dev);
+	if (!is_reset) {
+		if (eth_dev)
+			rte_eth_dev_release_port(eth_dev);
 
-	rte_free(priv);
+		rte_free(priv);
+	}
 
-	if (ctx)
+	if (ctx) {
 		ibv_close_device(ctx);
+		if (is_reset && priv)
+			priv->ib_ctx = NULL;
+	}
 
 	return ret;
 }
@@ -1617,7 +2523,17 @@ mana_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 static int
 mana_dev_uninit(struct rte_eth_dev *dev)
 {
-	return mana_dev_close(dev);
+	struct mana_priv *priv = dev->data->dev_private;
+	int ret;
+
+	/* Join reset thread before teardown to ensure it has exited
+	 * before we destroy the condvar/mutex in free_resources.
+	 */
+	mana_join_reset_thread(priv);
+
+	ret = mana_dev_close(dev);
+	mana_dev_free_resources(dev);
+	return ret;
 }
 
 /*
diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h
index 79cc47b6ab..a7b301484a 100644
--- a/drivers/net/mana/mana.h
+++ b/drivers/net/mana/mana.h
@@ -5,6 +5,8 @@
 #ifndef __MANA_H__
 #define __MANA_H__
 
+#include <pthread.h>
+
 #define	PCI_VENDOR_ID_MICROSOFT		0x1414
 #define PCI_DEVICE_ID_MICROSOFT_MANA_PF	0x00b9
 #define PCI_DEVICE_ID_MICROSOFT_MANA	0x00ba
@@ -337,6 +339,26 @@ struct mana_process_priv {
 	void *db_page;
 };
 
+enum mana_device_state {
+	/* Normal running */
+	MANA_DEV_ACTIVE		= 0,
+	/* In reset enter processing */
+	MANA_DEV_RESET_ENTER	= 1,
+	/*
+	 * Reset enter processing completed.
+	 * Waiting for reset exit or in reset exit processing.
+	 */
+	MANA_DEV_RESET_EXIT	= 2,
+	/* Reset failed */
+	MANA_DEV_RESET_FAILED	= 3,
+};
+
+/* burst_state bit layout:
+ *   Bit 0: in-burst (set by data path CAS 0→1, cleared on exit).
+ *   Bit 1: blocked  (set by reset path to reject new bursts).
+ */
+#define MANA_BURST_BLOCKED	2
+
 struct mana_priv {
 	struct rte_eth_dev_data *dev_data;
 	struct mana_process_priv *process_priv;
@@ -368,6 +390,15 @@ struct mana_priv {
 	uint64_t max_mr_size;
 	struct mana_mr_btree mr_btree;
 	rte_spinlock_t	mr_btree_lock;
+	RTE_ATOMIC(enum mana_device_state) dev_state;
+	/* mutex for synchronizing mana reset and some mana_dev_ops callbacks */
+	pthread_mutex_t reset_ops_lock;
+	/* Reset thread ID, valid when reset_thread_active is true */
+	rte_thread_t reset_thread;
+	RTE_ATOMIC(bool) reset_thread_active;
+	/* Condvar to wake reset thread early on PCI remove */
+	pthread_mutex_t reset_cond_mutex;
+	pthread_cond_t reset_cond;
 };
 
 struct mana_txq_desc {
@@ -427,6 +458,14 @@ struct mana_txq {
 	struct mana_mr_btree mr_btree;
 	struct mana_stats stats;
 	unsigned int socket;
+	unsigned int txq_idx;
+
+	/*
+	 * Bit 0: in-burst flag (set by data path, cleared on exit).
+	 * Bit 1: blocked flag (set by reset path via fetch_or).
+	 * Data path CAS 0→1 to enter; fails if blocked bit is set.
+	 */
+	RTE_ATOMIC(uint32_t) burst_state;
 };
 
 struct mana_rxq {
@@ -462,6 +501,14 @@ struct mana_rxq {
 	struct mana_mr_btree mr_btree;
 
 	unsigned int socket;
+	unsigned int rxq_idx;
+
+	/*
+	 * Bit 0: in-burst flag (set by data path, cleared on exit).
+	 * Bit 1: blocked flag (set by reset path via fetch_or).
+	 * Data path CAS 0→1 to enter; fails if blocked bit is set.
+	 */
+	RTE_ATOMIC(uint32_t) burst_state;
 };
 
 extern int mana_logtype_driver;
@@ -543,6 +590,8 @@ enum mana_mp_req_type {
 	MANA_MP_REQ_CREATE_MR,
 	MANA_MP_REQ_START_RXTX,
 	MANA_MP_REQ_STOP_RXTX,
+	MANA_MP_REQ_RESET_ENTER,
+	MANA_MP_REQ_RESET_EXIT,
 };
 
 /* Pameters for IPC. */
@@ -563,8 +612,9 @@ void mana_mp_uninit_primary(void);
 void mana_mp_uninit_secondary(void);
 int mana_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev);
 int mana_mp_req_mr_create(struct mana_priv *priv, uintptr_t addr, uint32_t len);
+int mana_map_doorbell_secondary(struct rte_eth_dev *eth_dev, int fd);
 
-void mana_mp_req_on_rxtx(struct rte_eth_dev *dev, enum mana_mp_req_type type);
+int mana_mp_req_on_rxtx(struct rte_eth_dev *dev, enum mana_mp_req_type type);
 
 void *mana_alloc_verbs_buf(size_t size, void *data);
 void mana_free_verbs_buf(void *ptr, void *data __rte_unused);
diff --git a/drivers/net/mana/mp.c b/drivers/net/mana/mp.c
index 72417fc0c7..1161ebd71c 100644
--- a/drivers/net/mana/mp.c
+++ b/drivers/net/mana/mp.c
@@ -2,10 +2,13 @@
  * Copyright 2022 Microsoft Corporation
  */
 
+#include <sys/mman.h>
 #include <rte_malloc.h>
 #include <ethdev_driver.h>
 #include <rte_log.h>
+#include <rte_eal_paging.h>
 #include <stdlib.h>
+#include <unistd.h>
 
 #include <infiniband/verbs.h>
 
@@ -119,6 +122,23 @@ mana_mp_primary_handle(const struct rte_mp_msg *mp_msg, const void *peer)
 	return ret;
 }
 
+static int
+mana_mp_reset_enter(struct rte_eth_dev *dev)
+{
+	struct mana_process_priv *proc_priv = dev->process_private;
+
+	void *addr = proc_priv->db_page;
+
+	/* Reset the db_page to NULL */
+	proc_priv->db_page = NULL;
+
+	if (addr)
+		(void)munmap(addr, rte_mem_page_size());
+
+	DRV_LOG(DEBUG, "Secondary doorbell pages unmapped");
+	return 0;
+}
+
 static int
 mana_mp_secondary_handle(const struct rte_mp_msg *mp_msg, const void *peer)
 {
@@ -171,6 +191,49 @@ mana_mp_secondary_handle(const struct rte_mp_msg *mp_msg, const void *peer)
 		ret = rte_mp_reply(&mp_res, peer);
 		break;
 
+	case MANA_MP_REQ_RESET_ENTER:
+		DRV_LOG(INFO, "Port %u reset enter", dev->data->port_id);
+		res->result = mana_mp_reset_enter(dev);
+
+		ret = rte_mp_reply(&mp_res, peer);
+		break;
+
+	case MANA_MP_REQ_RESET_EXIT:
+		DRV_LOG(INFO, "Port %u reset exit", dev->data->port_id);
+		{
+			struct mana_process_priv *proc_priv =
+				dev->process_private;
+
+			if (proc_priv->db_page != NULL) {
+				DRV_LOG(DEBUG,
+					"Secondary doorbell already "
+					"mapped to %p",
+					proc_priv->db_page);
+				res->result = 0;
+			} else if (mp_msg->num_fds < 1) {
+				DRV_LOG(ERR,
+					"No FD in RESET_EXIT message");
+				res->result = -EINVAL;
+			} else {
+				int fd = mp_msg->fds[0];
+
+				ret = mana_map_doorbell_secondary(dev,
+								  fd);
+				if (ret) {
+					DRV_LOG(ERR,
+						"Failed secondary "
+						"doorbell map %d",
+						fd);
+					res->result = -ENODEV;
+				} else {
+					res->result = 0;
+				}
+				close(fd);
+			}
+		}
+		ret = rte_mp_reply(&mp_res, peer);
+		break;
+
 	default:
 		DRV_LOG(ERR, "Port %u unknown secondary MP type %u",
 			param->port_id, param->type);
@@ -254,7 +317,7 @@ mana_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev)
 	}
 
 	ret = mp_res->fds[0];
-	DRV_LOG(ERR, "port %u command FD from primary is %d",
+	DRV_LOG(DEBUG, "port %u command FD from primary is %d",
 		dev->data->port_id, ret);
 exit:
 	free(mp_rep.msgs);
@@ -298,27 +361,36 @@ mana_mp_req_mr_create(struct mana_priv *priv, uintptr_t addr, uint32_t len)
 	return ret;
 }
 
-void
+int
 mana_mp_req_on_rxtx(struct rte_eth_dev *dev, enum mana_mp_req_type type)
 {
 	struct rte_mp_msg mp_req = { 0 };
 	struct rte_mp_msg *mp_res;
-	struct rte_mp_reply mp_rep;
+	struct rte_mp_reply mp_rep = { 0 };
 	struct mana_mp_param *res;
 	struct timespec ts = {.tv_sec = MANA_MP_REQ_TIMEOUT_SEC, .tv_nsec = 0};
-	int i, ret;
+	int i, ret = 0;
 
-	if (type != MANA_MP_REQ_START_RXTX && type != MANA_MP_REQ_STOP_RXTX) {
+	if (type != MANA_MP_REQ_START_RXTX && type != MANA_MP_REQ_STOP_RXTX &&
+	    type != MANA_MP_REQ_RESET_ENTER && type != MANA_MP_REQ_RESET_EXIT) {
 		DRV_LOG(ERR, "port %u unknown request (req_type %d)",
 			dev->data->port_id, type);
-		return;
+		return -EINVAL;
 	}
 
 	if (rte_atomic_load_explicit(&mana_shared_data->secondary_cnt, rte_memory_order_relaxed) == 0)
-		return;
+		return 0;
 
 	mp_init_msg(&mp_req, type, dev->data->port_id);
 
+	/* Include IB cmd FD for secondary doorbell remap */
+	if (type == MANA_MP_REQ_RESET_EXIT) {
+		struct mana_priv *priv = dev->data->dev_private;
+
+		mp_req.num_fds = 1;
+		mp_req.fds[0] = priv->ib_ctx->cmd_fd;
+	}
+
 	ret = rte_mp_request_sync(&mp_req, &mp_rep, &ts);
 	if (ret) {
 		if (rte_errno != ENOTSUP)
@@ -329,6 +401,7 @@ mana_mp_req_on_rxtx(struct rte_eth_dev *dev, enum mana_mp_req_type type)
 	if (mp_rep.nb_sent != mp_rep.nb_received) {
 		DRV_LOG(ERR, "port %u not all secondaries responded (%d)",
 			dev->data->port_id, type);
+		ret = -ETIMEDOUT;
 		goto exit;
 	}
 	for (i = 0; i < mp_rep.nb_received; i++) {
@@ -337,9 +410,11 @@ mana_mp_req_on_rxtx(struct rte_eth_dev *dev, enum mana_mp_req_type type)
 		if (res->result) {
 			DRV_LOG(ERR, "port %u request failed on secondary %d",
 				dev->data->port_id, i);
+			ret = res->result;
 			goto exit;
 		}
 	}
 exit:
 	free(mp_rep.msgs);
+	return ret;
 }
diff --git a/drivers/net/mana/mr.c b/drivers/net/mana/mr.c
index c4045141bc..8914f4cf04 100644
--- a/drivers/net/mana/mr.c
+++ b/drivers/net/mana/mr.c
@@ -314,8 +314,10 @@ mana_mr_btree_init(struct mana_mr_btree *bt, int n, int socket)
 void
 mana_mr_btree_free(struct mana_mr_btree *bt)
 {
-	rte_free(bt->table);
-	memset(bt, 0, sizeof(*bt));
+	if (bt && bt->table) {
+		rte_free(bt->table);
+		memset(bt, 0, sizeof(*bt));
+	}
 }
 
 int
diff --git a/drivers/net/mana/rx.c b/drivers/net/mana/rx.c
index 1b8ba1f3a9..aedb05d46f 100644
--- a/drivers/net/mana/rx.c
+++ b/drivers/net/mana/rx.c
@@ -36,6 +36,11 @@ mana_rq_ring_doorbell(struct mana_rxq *rxq)
 		db_page = process_priv->db_page;
 	}
 
+	if (!db_page) {
+		DP_LOG(ERR, "db_page is NULL, cannot ring RX doorbell");
+		return -EINVAL;
+	}
+
 	/* Hardware Spec specifies that software client should set 0 for
 	 * wqe_cnt for Receive Queues.
 	 */
@@ -172,7 +177,7 @@ mana_stop_rx_queues(struct rte_eth_dev *dev)
 
 	for (i = 0; i < priv->num_queues; i++)
 		if (dev->data->rx_queue_state[i] == RTE_ETH_QUEUE_STATE_STOPPED)
-			return -EINVAL;
+			return 0;
 
 	if (priv->rwq_qp) {
 		ret = ibv_destroy_qp(priv->rwq_qp);
@@ -256,6 +261,9 @@ mana_start_rx_queues(struct rte_eth_dev *dev)
 		struct mana_rxq *rxq = dev->data->rx_queues[i];
 		struct ibv_wq_init_attr wq_attr = {};
 
+		rxq->rxq_idx = i;
+		DRV_LOG(DEBUG, "assigning rxq_idx to %d", i);
+
 		manadv_set_context_attr(priv->ib_ctx,
 			MANADV_CTX_ATTR_BUF_ALLOCATORS,
 			(void *)((uintptr_t)&(struct manadv_ctx_allocators){
@@ -451,6 +459,16 @@ mana_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	uint32_t pkt_len;
 	uint32_t i;
 	int polled = 0;
+	uint32_t expected = 0;
+
+	/* Single atomic CAS: enter burst only if device is active (0→1).
+	 * Fails immediately if reset path has set the blocked bit.
+	 */
+	if (unlikely(!rte_atomic_compare_exchange_strong_explicit(
+			&rxq->burst_state, &expected, 1,
+			rte_memory_order_acquire,
+			rte_memory_order_relaxed)))
+		return 0;
 
 repoll:
 	/* Polling on new completions if we have no backlog */
@@ -592,6 +610,9 @@ mana_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 				wqe_consumed, ret);
 	}
 
+	rte_atomic_fetch_and_explicit(&rxq->burst_state, ~(uint32_t)1,
+				     rte_memory_order_release);
+
 	return pkt_received;
 }
 
diff --git a/drivers/net/mana/tx.c b/drivers/net/mana/tx.c
index 57dbbc3651..10f2212b5d 100644
--- a/drivers/net/mana/tx.c
+++ b/drivers/net/mana/tx.c
@@ -17,7 +17,7 @@ mana_stop_tx_queues(struct rte_eth_dev *dev)
 
 	for (i = 0; i < priv->num_queues; i++)
 		if (dev->data->tx_queue_state[i] == RTE_ETH_QUEUE_STATE_STOPPED)
-			return -EINVAL;
+			return 0;
 
 	for (i = 0; i < priv->num_queues; i++) {
 		struct mana_txq *txq = dev->data->tx_queues[i];
@@ -83,6 +83,9 @@ mana_start_tx_queues(struct rte_eth_dev *dev)
 
 		txq = dev->data->tx_queues[i];
 
+		txq->txq_idx = i;
+		DRV_LOG(DEBUG, "assigning txq_idx to %d", txq->txq_idx);
+
 		manadv_set_context_attr(priv->ib_ctx,
 			MANADV_CTX_ATTR_BUF_ALLOCATORS,
 			(void *)((uintptr_t)&(struct manadv_ctx_allocators){
@@ -190,10 +193,34 @@ mana_tx_burst(void *dpdk_txq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 	void *db_page;
 	uint16_t pkt_sent = 0;
 	uint32_t num_comp, i;
+	uint32_t expected = 0;
 #ifdef RTE_ARCH_32
 	uint32_t wqe_count = 0;
 #endif
 
+	db_page = priv->db_page;
+	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+		struct rte_eth_dev *dev =
+			&rte_eth_devices[priv->dev_data->port_id];
+		struct mana_process_priv *process_priv = dev->process_private;
+
+		db_page = process_priv->db_page;
+	}
+
+	/* Single atomic CAS: enter burst only if device is active (0→1).
+	 * Fails immediately if reset path has set the blocked bit.
+	 */
+	if (unlikely(!rte_atomic_compare_exchange_strong_explicit(
+			&txq->burst_state, &expected, 1,
+			rte_memory_order_acquire,
+			rte_memory_order_relaxed) || !db_page)) {
+		if (!expected) /* CAS succeeded but db_page NULL — undo */
+			rte_atomic_fetch_and_explicit(&txq->burst_state,
+						      ~(uint32_t)1,
+						      rte_memory_order_release);
+		return 0;
+	}
+
 	/* Process send completions from GDMA */
 	num_comp = gdma_poll_completion_queue(&txq->gdma_cq,
 			txq->gdma_comp_buf, txq->num_desc);
@@ -216,7 +243,8 @@ mana_tx_burst(void *dpdk_txq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 		}
 
 		if (!desc->pkt) {
-			DP_LOG(ERR, "mana_txq_desc has a NULL pkt");
+			DP_LOG(ERR, "mana_txq_desc has a NULL pkt, priv %p, "
+			       "txq = %d", priv, txq->txq_idx);
 		} else {
 			txq->stats.bytes += desc->pkt->pkt_len;
 			rte_pktmbuf_free(desc->pkt);
@@ -474,15 +502,6 @@ mana_tx_burst(void *dpdk_txq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 	}
 
 	/* Ring hardware door bell */
-	db_page = priv->db_page;
-	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
-		struct rte_eth_dev *dev =
-			&rte_eth_devices[priv->dev_data->port_id];
-		struct mana_process_priv *process_priv = dev->process_private;
-
-		db_page = process_priv->db_page;
-	}
-
 	if (pkt_sent) {
 #ifdef RTE_ARCH_32
 		ret = mana_ring_short_doorbell(db_page, GDMA_QUEUE_SEND,
@@ -501,5 +520,8 @@ mana_tx_burst(void *dpdk_txq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 			DP_LOG(ERR, "mana_ring_doorbell failed ret %d", ret);
 	}
 
+	rte_atomic_fetch_and_explicit(&txq->burst_state, ~(uint32_t)1,
+				     rte_memory_order_release);
+
 	return pkt_sent;
 }
-- 
2.34.1


^ permalink raw reply related

* [PATCH v5 00/11] bpf: introduce extensible load API
From: Marat Khalili @ 2026-06-12  8:42 UTC (permalink / raw)
  Cc: dev
In-Reply-To: <20260520124922.42445-1-marat.khalili@huawei.com>

This patchset introduces an extensible load API for the BPF library in
DPDK, addressing current limitations regarding ABI stability and feature
constraints.

Currently, `rte_bpf_load` relies on a fixed `struct rte_bpf_prm`, which
makes it difficult to add new loading options or parameters without
breaking the ABI.

To resolve these issues, this series introduces `rte_bpf_load_ex` taking
`struct rte_bpf_prm_ex`. The new parameter structure includes a `sz`
field for backward compatibility, allowing future extensions.

Taking advantage of the new extensible API, this patchset also adds
several new features:
* Support for loading and executing BPF programs with up to 5 arguments.
* Support for loading classic BPF (cBPF) directly.
* Support for loading ELF files directly from memory buffers.
* New API functions (`rte_bpf_eth_rx_install` and `rte_bpf_eth_tx_install`)
  to install an already loaded BPF program as a port callback, decoupling
  the loading phase from the installation phase.

v5:
* Fixed compilation between commits broken in v4 while addressing AI
  comments.
* Rebased on fresh main, addressing conflicts in release notes.

v4:
* Restored missing NULL checks in wrapper functions `rte_bpf_load` and
  `rte_bpf_elf_load`.
* Fixed the burst execution functions (`rte_bpf_exec_burst*`) to return
  `0` and set `rte_errno = EINVAL` on failure, preventing `-EINVAL`
  being reinterpreted as a large `uint32_t` value. Initialized `rc`
  properly in scalar execution wrappers for this case.
* Swapped the Doxygen comments in `rte_bpf_ethdev.h` for RX and TX functions.
* Added diagnostic dump on failure path in `test_bpf_filter`.
* Fixed memory leak of the BPF handle in `bpf_rx_test` upon install failure.
* Added tests for NULL parameter rejection, mismatched execution arguments,
  unsupported execution flags, and the libpcap-less `rte_bpf_convert` stub.

v3:
* Appended Acked-by tags to all individual commits to align with
  patchwork requirements.

v2:
* Fixed a potential segmentation fault in `exec_vm_burst_ex` by deferring
  the dereference of `ctx[i].arg` until it is confirmed that `nb_prog_arg > 0`.
* Clarified documentation and code comments for `RTE_BPF_EXEC_FLAG_JIT`
  requirements and fast-path expectations.

Marat Khalili (11):
  bpf: make logging prefixes more consistent
  bpf: introduce extensible load API
  bpf: support up to 5 arguments
  bpf: add cBPF origin to rte_bpf_load_ex
  bpf: support rte_bpf_prm_ex with port callbacks
  bpf: support loading ELF files from memory
  test/bpf: test loading cBPF directly
  test/bpf: test loading ELF file from memory
  doc: add release notes for new extensible BPF API
  doc: add load API to BPF programmer's guide
  test/bpf: add tests for error handling contracts

 app/test/test_bpf.c                    | 455 +++++++++++++++++--------
 doc/guides/prog_guide/bpf_lib.rst      |  75 +++-
 doc/guides/rel_notes/release_26_07.rst |  20 ++
 lib/bpf/bpf.c                          |  32 +-
 lib/bpf/bpf_convert.c                  |  99 +++++-
 lib/bpf/bpf_exec.c                     | 134 +++++++-
 lib/bpf/bpf_impl.h                     |  53 ++-
 lib/bpf/bpf_jit_arm64.c                |  18 +-
 lib/bpf/bpf_jit_x86.c                  |  10 +-
 lib/bpf/bpf_load.c                     | 213 ++++++++++--
 lib/bpf/bpf_load_elf.c                 | 189 ++++++----
 lib/bpf/bpf_pkt.c                      |  65 +++-
 lib/bpf/bpf_stub.c                     |  46 ---
 lib/bpf/bpf_validate.c                 |  94 +++--
 lib/bpf/meson.build                    |  15 +-
 lib/bpf/rte_bpf.h                      | 199 ++++++++++-
 lib/bpf/rte_bpf_ethdev.h               |  54 +++
 17 files changed, 1400 insertions(+), 371 deletions(-)
 delete mode 100644 lib/bpf/bpf_stub.c

-- 
2.43.0

^ permalink raw reply

* [PATCH v5 01/11] bpf: make logging prefixes more consistent
From: Marat Khalili @ 2026-06-12  8:42 UTC (permalink / raw)
  To: Konstantin Ananyev, Wathsala Vithanage; +Cc: dev
In-Reply-To: <20260612084219.38399-1-marat.khalili@huawei.com>

Logging in lib/bpf is inconsistent: some places use `%s()`, other just
`%s` for `__func__`.

Introduce new macro for logging prefixed with function name and use it
everywhere function name without arguments is prefixed to the log line.

Signed-off-by: Marat Khalili <marat.khalili@huawei.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
---
 lib/bpf/bpf_convert.c   | 18 +++++++++---------
 lib/bpf/bpf_impl.h      |  3 +++
 lib/bpf/bpf_jit_arm64.c |  4 ++--
 lib/bpf/bpf_load.c      |  2 +-
 lib/bpf/bpf_load_elf.c  |  2 +-
 lib/bpf/bpf_stub.c      |  6 ++----
 lib/bpf/bpf_validate.c  | 25 ++++++++++++-------------
 7 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/lib/bpf/bpf_convert.c b/lib/bpf/bpf_convert.c
index 86e703299d..953ca80670 100644
--- a/lib/bpf/bpf_convert.c
+++ b/lib/bpf/bpf_convert.c
@@ -247,8 +247,8 @@ static int bpf_convert_filter(const struct bpf_insn *prog, size_t len,
 	uint8_t bpf_src;
 
 	if (len > BPF_MAXINSNS) {
-		RTE_BPF_LOG_LINE(ERR, "%s: cBPF program too long (%zu insns)",
-			    __func__, len);
+		RTE_BPF_LOG_FUNC_LINE(ERR, "cBPF program too long (%zu insns)",
+			    len);
 		return -EINVAL;
 	}
 
@@ -483,8 +483,8 @@ static int bpf_convert_filter(const struct bpf_insn *prog, size_t len,
 
 			/* Unknown instruction. */
 		default:
-			RTE_BPF_LOG_LINE(ERR, "%s: Unknown instruction!: %#x",
-				    __func__, fp->code);
+			RTE_BPF_LOG_FUNC_LINE(ERR, "Unknown instruction!: %#x",
+				    fp->code);
 			goto err;
 		}
 
@@ -528,7 +528,7 @@ rte_bpf_convert(const struct bpf_program *prog)
 	int ret;
 
 	if (prog == NULL) {
-		RTE_BPF_LOG_LINE(ERR, "%s: NULL program", __func__);
+		RTE_BPF_LOG_FUNC_LINE(ERR, "NULL program");
 		rte_errno = EINVAL;
 		return NULL;
 	}
@@ -536,13 +536,13 @@ rte_bpf_convert(const struct bpf_program *prog)
 	/* 1st pass: calculate the eBPF program length */
 	ret = bpf_convert_filter(prog->bf_insns, prog->bf_len, NULL, &ebpf_len);
 	if (ret < 0) {
-		RTE_BPF_LOG_LINE(ERR, "%s: cannot get eBPF length", __func__);
+		RTE_BPF_LOG_FUNC_LINE(ERR, "cannot get eBPF length");
 		rte_errno = -ret;
 		return NULL;
 	}
 
-	RTE_BPF_LOG_LINE(DEBUG, "%s: prog len cBPF=%u -> eBPF=%u",
-		    __func__, prog->bf_len, ebpf_len);
+	RTE_BPF_LOG_FUNC_LINE(DEBUG, "prog len cBPF=%u -> eBPF=%u",
+		    prog->bf_len, ebpf_len);
 
 	prm = rte_zmalloc("bpf_filter",
 			  sizeof(*prm) + ebpf_len * sizeof(*ebpf), 0);
@@ -557,7 +557,7 @@ rte_bpf_convert(const struct bpf_program *prog)
 	/* 2nd pass: remap cBPF to eBPF instructions  */
 	ret = bpf_convert_filter(prog->bf_insns, prog->bf_len, ebpf, &ebpf_len);
 	if (ret < 0) {
-		RTE_BPF_LOG_LINE(ERR, "%s: cannot convert cBPF to eBPF", __func__);
+		RTE_BPF_LOG_FUNC_LINE(ERR, "cannot convert cBPF to eBPF");
 		rte_free(prm);
 		rte_errno = -ret;
 		return NULL;
diff --git a/lib/bpf/bpf_impl.h b/lib/bpf/bpf_impl.h
index f5fa220984..fb5ec3c4d6 100644
--- a/lib/bpf/bpf_impl.h
+++ b/lib/bpf/bpf_impl.h
@@ -32,6 +32,9 @@ extern int rte_bpf_logtype;
 #define RTE_BPF_LOG_LINE(lvl, ...) \
 	RTE_LOG_LINE(lvl, BPF, __VA_ARGS__)
 
+#define RTE_BPF_LOG_FUNC_LINE(lvl, fmt, ...) \
+	RTE_LOG_LINE(lvl, BPF, "%s(): " fmt, __func__, ##__VA_ARGS__)
+
 static inline size_t
 bpf_size(uint32_t bpf_op_sz)
 {
diff --git a/lib/bpf/bpf_jit_arm64.c b/lib/bpf/bpf_jit_arm64.c
index a04ef33a9c..4bbb97da1b 100644
--- a/lib/bpf/bpf_jit_arm64.c
+++ b/lib/bpf/bpf_jit_arm64.c
@@ -98,8 +98,8 @@ check_invalid_args(struct a64_jit_ctx *ctx, uint32_t limit)
 
 	for (idx = 0; idx < limit; idx++) {
 		if (rte_le_to_cpu_32(ctx->ins[idx]) == A64_INVALID_OP_CODE) {
-			RTE_BPF_LOG_LINE(ERR,
-				"%s: invalid opcode at %u;", __func__, idx);
+			RTE_BPF_LOG_FUNC_LINE(ERR,
+				"invalid opcode at %u;", idx);
 			return -EINVAL;
 		}
 	}
diff --git a/lib/bpf/bpf_load.c b/lib/bpf/bpf_load.c
index 6983c026af..b8a0426fe2 100644
--- a/lib/bpf/bpf_load.c
+++ b/lib/bpf/bpf_load.c
@@ -100,7 +100,7 @@ rte_bpf_load(const struct rte_bpf_prm *prm)
 
 	if (rc != 0) {
 		rte_errno = -rc;
-		RTE_BPF_LOG_LINE(ERR, "%s: %d-th xsym is invalid", __func__, i);
+		RTE_BPF_LOG_FUNC_LINE(ERR, "%d-th xsym is invalid", i);
 		return NULL;
 	}
 
diff --git a/lib/bpf/bpf_load_elf.c b/lib/bpf/bpf_load_elf.c
index 1d30ba17e2..2390823cbf 100644
--- a/lib/bpf/bpf_load_elf.c
+++ b/lib/bpf/bpf_load_elf.c
@@ -122,7 +122,7 @@ check_elf_header(const Elf64_Ehdr *eh)
 		err = "unexpected machine type";
 
 	if (err != NULL) {
-		RTE_BPF_LOG_LINE(ERR, "%s(): %s", __func__, err);
+		RTE_BPF_LOG_FUNC_LINE(ERR, "%s", err);
 		return -EINVAL;
 	}
 
diff --git a/lib/bpf/bpf_stub.c b/lib/bpf/bpf_stub.c
index dea0d703ca..e06e820d83 100644
--- a/lib/bpf/bpf_stub.c
+++ b/lib/bpf/bpf_stub.c
@@ -21,8 +21,7 @@ rte_bpf_elf_load(const struct rte_bpf_prm *prm, const char *fname,
 		return NULL;
 	}
 
-	RTE_BPF_LOG_LINE(ERR, "%s() is not supported, rebuild with libelf installed",
-		__func__);
+	RTE_BPF_LOG_FUNC_LINE(ERR, "not supported, rebuild with libelf installed");
 	rte_errno = ENOTSUP;
 	return NULL;
 }
@@ -38,8 +37,7 @@ rte_bpf_convert(const struct bpf_program *prog)
 		return NULL;
 	}
 
-	RTE_BPF_LOG_LINE(ERR, "%s() is not supported, rebuild with libpcap installed",
-		__func__);
+	RTE_BPF_LOG_FUNC_LINE(ERR, "not supported, rebuild with libpcap installed");
 	rte_errno = ENOTSUP;
 	return NULL;
 }
diff --git a/lib/bpf/bpf_validate.c b/lib/bpf/bpf_validate.c
index e8dbec2827..a7f4f576c9 100644
--- a/lib/bpf/bpf_validate.c
+++ b/lib/bpf/bpf_validate.c
@@ -1838,16 +1838,16 @@ add_edge(struct bpf_verifier *bvf, struct inst_node *node, uint32_t nidx)
 	uint32_t ne;
 
 	if (nidx >= bvf->prm->nb_ins) {
-		RTE_BPF_LOG_LINE(ERR,
-			"%s: program boundary violation at pc: %u, next pc: %u",
-			__func__, get_node_idx(bvf, node), nidx);
+		RTE_BPF_LOG_FUNC_LINE(ERR,
+			"program boundary violation at pc: %u, next pc: %u",
+			get_node_idx(bvf, node), nidx);
 		return -EINVAL;
 	}
 
 	ne = node->nb_edge;
 	if (ne >= RTE_DIM(node->edge_dest)) {
-		RTE_BPF_LOG_LINE(ERR, "%s: internal error at pc: %u",
-			__func__, get_node_idx(bvf, node));
+		RTE_BPF_LOG_FUNC_LINE(ERR, "internal error at pc: %u",
+			get_node_idx(bvf, node));
 		return -EINVAL;
 	}
 
@@ -2005,8 +2005,7 @@ validate(struct bpf_verifier *bvf)
 
 		err = check_syntax(ins);
 		if (err != 0) {
-			RTE_BPF_LOG_LINE(ERR, "%s: %s at pc: %u",
-				__func__, err, i);
+			RTE_BPF_LOG_FUNC_LINE(ERR, "%s at pc: %u", err, i);
 			rc |= -EINVAL;
 		}
 
@@ -2230,9 +2229,9 @@ save_cur_eval_state(struct bpf_verifier *bvf, struct inst_node *node)
 	/* get new eval_state for this node */
 	st = pull_eval_state(&bvf->evst_sr_pool);
 	if (st == NULL) {
-		RTE_BPF_LOG_LINE(ERR,
-			"%s: internal error (out of space) at pc: %u",
-			__func__, get_node_idx(bvf, node));
+		RTE_BPF_LOG_FUNC_LINE(ERR,
+			"internal error (out of space) at pc: %u",
+			get_node_idx(bvf, node));
 		return -ENOMEM;
 	}
 
@@ -2462,8 +2461,8 @@ evaluate(struct bpf_verifier *bvf)
 				err = ins_chk[op].eval(bvf, ins + idx);
 				stats.nb_eval++;
 				if (err != NULL) {
-					RTE_BPF_LOG_LINE(ERR, "%s: %s at pc: %u",
-						__func__, err, idx);
+					RTE_BPF_LOG_FUNC_LINE(ERR,
+						"%s at pc: %u", err, idx);
 					rc = -EINVAL;
 				}
 			}
@@ -2533,7 +2532,7 @@ __rte_bpf_validate(struct rte_bpf *bpf)
 			bpf->prm.prog_arg.type != RTE_BPF_ARG_PTR &&
 			(sizeof(uint64_t) != sizeof(uintptr_t) ||
 			bpf->prm.prog_arg.type != RTE_BPF_ARG_PTR_MBUF)) {
-		RTE_BPF_LOG_LINE(ERR, "%s: unsupported argument type", __func__);
+		RTE_BPF_LOG_FUNC_LINE(ERR, "unsupported argument type");
 		return -ENOTSUP;
 	}
 
-- 
2.43.0


^ permalink raw reply related

* [PATCH v5 03/11] bpf: support up to 5 arguments
From: Marat Khalili @ 2026-06-12  8:42 UTC (permalink / raw)
  To: Konstantin Ananyev, Wathsala Vithanage; +Cc: dev
In-Reply-To: <20260612084219.38399-1-marat.khalili@huawei.com>

When using rte_bpf_load_ex allow up to 5 arguments for a BPF program.
Particularly useful for call-backs and other internal functions.

Signed-off-by: Marat Khalili <marat.khalili@huawei.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
---
 lib/bpf/bpf.c           |  32 ++++++++++-
 lib/bpf/bpf_exec.c      | 124 +++++++++++++++++++++++++++++++++++++++-
 lib/bpf/bpf_impl.h      |   2 +-
 lib/bpf/bpf_jit_arm64.c |   2 +-
 lib/bpf/bpf_jit_x86.c   |   2 +-
 lib/bpf/bpf_load.c      |   8 ++-
 lib/bpf/bpf_validate.c  |  45 +++++++++++----
 lib/bpf/rte_bpf.h       | 121 +++++++++++++++++++++++++++++++++++++--
 8 files changed, 311 insertions(+), 25 deletions(-)

diff --git a/lib/bpf/bpf.c b/lib/bpf/bpf.c
index 5239b3e11e..67dededd9a 100644
--- a/lib/bpf/bpf.c
+++ b/lib/bpf/bpf.c
@@ -16,8 +16,8 @@ void
 rte_bpf_destroy(struct rte_bpf *bpf)
 {
 	if (bpf != NULL) {
-		if (bpf->jit.func != NULL)
-			munmap(bpf->jit.func, bpf->jit.sz);
+		if (bpf->jit.raw != NULL)
+			munmap(bpf->jit.raw, bpf->jit.sz);
 		munmap(bpf, bpf->sz);
 	}
 }
@@ -29,7 +29,33 @@ rte_bpf_get_jit(const struct rte_bpf *bpf, struct rte_bpf_jit *jit)
 	if (bpf == NULL || jit == NULL)
 		return -EINVAL;
 
-	jit[0] = bpf->jit;
+	if (bpf->prm.nb_prog_arg != 1) {
+		RTE_BPF_LOG_LINE(ERR,
+			"this program takes %d arguments, use rte_bpf_get_jit_ex",
+			bpf->prm.nb_prog_arg);
+		return -EINVAL;
+	}
+
+	*jit = (struct rte_bpf_jit) {
+		.func = bpf->jit.raw,
+		.sz = bpf->jit.sz,
+	};
+	return 0;
+}
+
+RTE_EXPORT_EXPERIMENTAL_SYMBOL(rte_bpf_get_jit_ex, 26.11)
+int
+rte_bpf_get_jit_ex(const struct rte_bpf *bpf, struct rte_bpf_jit_ex *jit)
+{
+	if (bpf == NULL || jit == NULL)
+		return -EINVAL;
+
+	if (bpf->jit.raw == NULL) {
+		RTE_BPF_LOG_LINE(ERR, "no JIT-compiled version");
+		return -ENOENT;
+	}
+
+	*jit = bpf->jit;
 	return 0;
 }
 
diff --git a/lib/bpf/bpf_exec.c b/lib/bpf/bpf_exec.c
index e4668ba10b..20f8d0fa29 100644
--- a/lib/bpf/bpf_exec.c
+++ b/lib/bpf/bpf_exec.c
@@ -10,6 +10,7 @@
 #include <rte_log.h>
 #include <rte_debug.h>
 #include <rte_byteorder.h>
+#include <rte_errno.h>
 
 #include "bpf_impl.h"
 
@@ -502,6 +503,12 @@ rte_bpf_exec_burst(const struct rte_bpf *bpf, void *ctx[], uint64_t rc[],
 	uint64_t reg[EBPF_REG_NUM];
 	uint64_t stack[MAX_BPF_STACK_SIZE / sizeof(uint64_t)];
 
+	if (bpf->prm.nb_prog_arg != 1) {
+		/* Use rte_bpf_exec_burst_ex with this program. */
+		rte_errno = EINVAL;
+		return 0;
+	}
+
 	for (i = 0; i != num; i++) {
 
 		reg[EBPF_REG_1] = (uintptr_t)ctx[i];
@@ -513,12 +520,127 @@ rte_bpf_exec_burst(const struct rte_bpf *bpf, void *ctx[], uint64_t rc[],
 	return i;
 }
 
+static uint32_t
+exec_vm_burst_ex(const struct rte_bpf *bpf, const struct rte_bpf_prog_ctx *ctx,
+	uint64_t rc[], uint32_t num)
+{
+	uint32_t i;
+	uint64_t reg[EBPF_REG_NUM];
+	uint64_t stack[MAX_BPF_STACK_SIZE / sizeof(uint64_t)];
+
+	for (i = 0; i != num; i++) {
+
+		switch (bpf->prm.nb_prog_arg) {
+		case 5:
+			reg[EBPF_REG_5] = ctx[i].arg[4].u64;
+			/* FALLTHROUGH */
+		case 4:
+			reg[EBPF_REG_4] = ctx[i].arg[3].u64;
+			/* FALLTHROUGH */
+		case 3:
+			reg[EBPF_REG_3] = ctx[i].arg[2].u64;
+			/* FALLTHROUGH */
+		case 2:
+			reg[EBPF_REG_2] = ctx[i].arg[1].u64;
+			/* FALLTHROUGH */
+		case 1:
+			reg[EBPF_REG_1] = ctx[i].arg[0].u64;
+			/* FALLTHROUGH */
+		case 0:
+			break;
+		}
+
+		reg[EBPF_REG_10] = (uintptr_t)(stack + RTE_DIM(stack));
+
+		rc[i] = bpf_exec(bpf, reg);
+	}
+
+	return i;
+}
+
+static uint32_t
+exec_jit_burst_ex(const struct rte_bpf *bpf, const struct rte_bpf_prog_ctx *ctx,
+	uint64_t rc[], uint32_t num)
+{
+	uint32_t i = 0;
+	const struct rte_bpf_jit_ex jit = bpf->jit;
+
+	/*
+	 * Fast path: assumes application pre-validated RTE_BPF_EXEC_FLAG_JIT
+	 * and successful JIT generation. No explicit NULL checks here.
+	 */
+	switch (bpf->prm.nb_prog_arg) {
+	case 0:
+		for (i = 0; i != num; i++)
+			rc[i] = jit.func0();
+		break;
+	case 1:
+		for (i = 0; i != num; i++) {
+			const union rte_bpf_func_arg *const arg = ctx[i].arg;
+			rc[i] = jit.func1(arg[0]);
+		}
+		break;
+	case 2:
+		for (i = 0; i != num; i++) {
+			const union rte_bpf_func_arg *const arg = ctx[i].arg;
+			rc[i] = jit.func2(arg[0], arg[1]);
+		}
+		break;
+	case 3:
+		for (i = 0; i != num; i++) {
+			const union rte_bpf_func_arg *const arg = ctx[i].arg;
+			rc[i] = jit.func3(arg[0], arg[1], arg[2]);
+		}
+		break;
+	case 4:
+		for (i = 0; i != num; i++) {
+			const union rte_bpf_func_arg *const arg = ctx[i].arg;
+			rc[i] = jit.func4(arg[0], arg[1], arg[2], arg[3]);
+		}
+		break;
+	case 5:
+		for (i = 0; i != num; i++) {
+			const union rte_bpf_func_arg *const arg = ctx[i].arg;
+			rc[i] = jit.func5(arg[0], arg[1], arg[2], arg[3], arg[4]);
+		}
+		break;
+	}
+
+	return i;
+}
+
+RTE_EXPORT_EXPERIMENTAL_SYMBOL(rte_bpf_exec_burst_ex, 26.11)
+uint32_t
+rte_bpf_exec_burst_ex(const struct rte_bpf *bpf, const struct rte_bpf_prog_ctx *ctx,
+	uint64_t rc[], uint32_t num, uint64_t flags)
+{
+	if ((flags & ~RTE_BPF_EXEC_FLAG_MASK) != 0) {
+		rte_errno = EINVAL;
+		return 0;
+	}
+
+	return (flags & RTE_BPF_EXEC_FLAG_JIT) != 0 ?
+		exec_jit_burst_ex(bpf, ctx, rc, num) :
+		exec_vm_burst_ex(bpf, ctx, rc, num);
+}
+
 RTE_EXPORT_SYMBOL(rte_bpf_exec)
 uint64_t
 rte_bpf_exec(const struct rte_bpf *bpf, void *ctx)
 {
-	uint64_t rc;
+	uint64_t rc = UINT64_MAX;
 
 	rte_bpf_exec_burst(bpf, &ctx, &rc, 1);
 	return rc;
 }
+
+RTE_EXPORT_EXPERIMENTAL_SYMBOL(rte_bpf_exec_ex, 26.11)
+uint64_t
+rte_bpf_exec_ex(const struct rte_bpf *bpf, const struct rte_bpf_prog_ctx *ctx,
+		uint64_t flags)
+{
+	uint64_t rc = UINT64_MAX;
+
+	rte_bpf_exec_burst_ex(bpf, ctx, &rc, 1, flags);
+	return rc;
+}
diff --git a/lib/bpf/bpf_impl.h b/lib/bpf/bpf_impl.h
index 1cee109bc9..4a98b33730 100644
--- a/lib/bpf/bpf_impl.h
+++ b/lib/bpf/bpf_impl.h
@@ -12,7 +12,7 @@
 
 struct rte_bpf {
 	struct rte_bpf_prm_ex prm;
-	struct rte_bpf_jit jit;
+	struct rte_bpf_jit_ex jit;
 	size_t sz;
 	uint32_t stack_sz;
 };
diff --git a/lib/bpf/bpf_jit_arm64.c b/lib/bpf/bpf_jit_arm64.c
index 9e5e142c13..ba7ae4d680 100644
--- a/lib/bpf/bpf_jit_arm64.c
+++ b/lib/bpf/bpf_jit_arm64.c
@@ -1471,7 +1471,7 @@ __rte_bpf_jit_arm64(struct rte_bpf *bpf)
 	/* Flush the icache */
 	__builtin___clear_cache((char *)ctx.ins, (char *)(ctx.ins + ctx.idx));
 
-	bpf->jit.func = (void *)ctx.ins;
+	bpf->jit.raw = ctx.ins;
 	bpf->jit.sz = size;
 
 	goto finish;
diff --git a/lib/bpf/bpf_jit_x86.c b/lib/bpf/bpf_jit_x86.c
index 6f4235d434..54eb279643 100644
--- a/lib/bpf/bpf_jit_x86.c
+++ b/lib/bpf/bpf_jit_x86.c
@@ -1568,7 +1568,7 @@ __rte_bpf_jit_x86(struct rte_bpf *bpf)
 	if (rc != 0)
 		munmap(st.ins, st.sz);
 	else {
-		bpf->jit.func = (void *)st.ins;
+		bpf->jit.raw = st.ins;
 		bpf->jit.sz = st.sz;
 	}
 
diff --git a/lib/bpf/bpf_load.c b/lib/bpf/bpf_load.c
index a6793b2c94..f63093b9bc 100644
--- a/lib/bpf/bpf_load.c
+++ b/lib/bpf/bpf_load.c
@@ -149,7 +149,8 @@ rte_bpf_load(const struct rte_bpf_prm *prm)
 			.raw.nb_ins = prm->nb_ins,
 			.xsym = prm->xsym,
 			.nb_xsym = prm->nb_xsym,
-			.prog_arg = prm->prog_arg,
+			.prog_arg[0] = prm->prog_arg,
+			.nb_prog_arg = 1,
 		});
 }
 
@@ -170,7 +171,8 @@ rte_bpf_elf_load(const struct rte_bpf_prm *prm, const char *fname,
 			.elf_file.section = sname,
 			.xsym = prm->xsym,
 			.nb_xsym = prm->nb_xsym,
-			.prog_arg = prm->prog_arg,
+			.prog_arg[0] = prm->prog_arg,
+			.nb_prog_arg = 1,
 		});
 }
 
@@ -271,6 +273,6 @@ rte_bpf_load_ex(const struct rte_bpf_prm_ex *prm)
 	}
 
 	RTE_BPF_LOG_FUNC_LINE(INFO, "successfully creates %p(jit={.func=%p,.sz=%zu});",
-		load.bpf, load.bpf->jit.func, load.bpf->jit.sz);
+		load.bpf, load.bpf->jit.raw, load.bpf->jit.sz);
 	return load.bpf;
 }
diff --git a/lib/bpf/bpf_validate.c b/lib/bpf/bpf_validate.c
index 5bfc59296d..bf8a4abb5a 100644
--- a/lib/bpf/bpf_validate.c
+++ b/lib/bpf/bpf_validate.c
@@ -2425,10 +2425,14 @@ evaluate(struct bpf_verifier *bvf)
 		.s = {.min = MAX_BPF_STACK_SIZE, .max = MAX_BPF_STACK_SIZE},
 	};
 
-	bvf->evst->rv[EBPF_REG_1].v = bvf->prm->prog_arg;
-	bvf->evst->rv[EBPF_REG_1].mask = UINT64_MAX;
-	if (bvf->prm->prog_arg.type == RTE_BPF_ARG_RAW)
-		eval_max_bound(bvf->evst->rv + EBPF_REG_1, UINT64_MAX);
+	for (uint32_t pai = 0; pai != bvf->prm->nb_prog_arg; ++pai) {
+		struct bpf_reg_val *reg = &bvf->evst->rv[EBPF_REG_1 + pai];
+
+		reg->v = bvf->prm->prog_arg[pai];
+		reg->mask = UINT64_MAX;
+		if (reg->v.type == RTE_BPF_ARG_RAW)
+			eval_max_bound(reg, UINT64_MAX);
+	}
 
 	bvf->evst->rv[EBPF_REG_10] = rvfp;
 
@@ -2521,21 +2525,42 @@ evaluate(struct bpf_verifier *bvf)
 	return rc;
 }
 
+static bool
+prog_arg_is_valid(const struct rte_bpf_arg *prog_arg)
+{
+	/* check input argument type, don't allow mbuf ptr on 32-bit */
+	if (prog_arg->type != RTE_BPF_ARG_RAW &&
+			prog_arg->type != RTE_BPF_ARG_PTR &&
+			(sizeof(uint64_t) != sizeof(uintptr_t) ||
+			prog_arg->type != RTE_BPF_ARG_PTR_MBUF)) {
+		RTE_BPF_LOG_FUNC_LINE(ERR, "unsupported argument type");
+		return false;
+	}
+
+	return true;
+}
+
 int
 __rte_bpf_validate(const struct rte_bpf_prm_ex *prm, uint32_t *stack_sz)
 {
 	int32_t rc;
 	struct bpf_verifier bvf;
 
-	/* check input argument type, don't allow mbuf ptr on 32-bit */
-	if (prm->prog_arg.type != RTE_BPF_ARG_RAW &&
-			prm->prog_arg.type != RTE_BPF_ARG_PTR &&
-			(sizeof(uint64_t) != sizeof(uintptr_t) ||
-			prm->prog_arg.type != RTE_BPF_ARG_PTR_MBUF)) {
-		RTE_BPF_LOG_FUNC_LINE(ERR, "unsupported argument type");
+	if (prm->nb_prog_arg > EBPF_FUNC_MAX_ARGS) {
+		RTE_BPF_LOG_FUNC_LINE(ERR,
+			"support up to %u arguments, found %u",
+			EBPF_FUNC_MAX_ARGS, prm->nb_prog_arg);
 		return -ENOTSUP;
 	}
 
+	for (uint32_t pai = 0; pai != prm->nb_prog_arg; ++pai)
+		if (!prog_arg_is_valid(&prm->prog_arg[pai])) {
+			RTE_BPF_LOG_FUNC_LINE(ERR,
+				"unsupported argument %d (r%d) type",
+				pai, EBPF_REG_1 + pai);
+			return -ENOTSUP;
+		}
+
 	memset(&bvf, 0, sizeof(bvf));
 	bvf.prm = prm;
 	bvf.in = calloc(prm->raw.nb_ins, sizeof(bvf.in[0]));
diff --git a/lib/bpf/rte_bpf.h b/lib/bpf/rte_bpf.h
index bf58a41819..0e7eaa3c18 100644
--- a/lib/bpf/rte_bpf.h
+++ b/lib/bpf/rte_bpf.h
@@ -25,6 +25,11 @@
 extern "C" {
 #endif
 
+#define RTE_BPF_EXEC_FLAG_JIT	RTE_BIT64(0)	/**< use JIT-compiled version */
+
+/** Mask with all supported `RTE_BPF_EXEC_FLAG_*` flags set. */
+#define RTE_BPF_EXEC_FLAG_MASK  RTE_BPF_EXEC_FLAG_JIT
+
 /**
  * Possible types for function/BPF program arguments.
  */
@@ -122,7 +127,8 @@ struct rte_bpf_prm_ex {
 	/**< array of external symbols that eBPF code is allowed to reference */
 	uint32_t nb_xsym;  /**< number of elements in xsym */
 
-	struct rte_bpf_arg prog_arg;  /**< input arg description */
+	struct rte_bpf_arg prog_arg[EBPF_FUNC_MAX_ARGS];  /**< program arguments */
+	uint32_t nb_prog_arg;  /**< program argument count */
 };
 
 /**
@@ -138,13 +144,49 @@ struct rte_bpf_prm {
 };
 
 /**
- * Information about compiled into native ISA eBPF code.
+ * Information about compiled into native ISA eBPF code accepting 1 argument.
  */
 struct rte_bpf_jit {
 	uint64_t (*func)(void *); /**< JIT-ed native code */
 	size_t sz;                /**< size of JIT-ed code */
 };
 
+union rte_bpf_func_arg {
+	uint64_t u64;
+	void *ptr;
+};
+
+typedef uint64_t (*rte_bpf_jit_func0_t)(void);
+typedef uint64_t (*rte_bpf_jit_func1_t)(union rte_bpf_func_arg);
+typedef uint64_t (*rte_bpf_jit_func2_t)(union rte_bpf_func_arg, union rte_bpf_func_arg);
+typedef uint64_t (*rte_bpf_jit_func3_t)(union rte_bpf_func_arg, union rte_bpf_func_arg,
+	union rte_bpf_func_arg);
+typedef uint64_t (*rte_bpf_jit_func4_t)(union rte_bpf_func_arg, union rte_bpf_func_arg,
+	union rte_bpf_func_arg, union rte_bpf_func_arg);
+typedef uint64_t (*rte_bpf_jit_func5_t)(union rte_bpf_func_arg, union rte_bpf_func_arg,
+	union rte_bpf_func_arg, union rte_bpf_func_arg, union rte_bpf_func_arg);
+
+/**
+ * JIT-ed native code, member depends on number of program arguments.
+ */
+struct rte_bpf_jit_ex {
+	union {
+		void *raw;
+		rte_bpf_jit_func0_t func0;  /* nullary function */
+		rte_bpf_jit_func1_t func1;  /* unary function */
+		rte_bpf_jit_func2_t func2;  /* binary function */
+		rte_bpf_jit_func3_t func3;  /* ternary function */
+		rte_bpf_jit_func4_t func4;  /* quaternary function */
+		rte_bpf_jit_func5_t func5;  /* quinary function */
+	};
+	size_t sz;
+};
+
+/* Tuple of eBPF program arguments. */
+struct rte_bpf_prog_ctx {
+	union rte_bpf_func_arg arg[EBPF_FUNC_MAX_ARGS];
+};
+
 struct rte_bpf;
 
 /**
@@ -224,7 +266,7 @@ rte_bpf_elf_load(const struct rte_bpf_prm *prm, const char *fname,
 	__rte_malloc __rte_dealloc(rte_bpf_destroy, 1);
 
 /**
- * Execute given BPF bytecode.
+ * Execute given BPF bytecode accepting 1 argument.
  *
  * @param bpf
  *   handle for the BPF code to execute.
@@ -237,7 +279,29 @@ uint64_t
 rte_bpf_exec(const struct rte_bpf *bpf, void *ctx);
 
 /**
- * Execute given BPF bytecode over a set of input contexts.
+ * @warning
+ * @b EXPERIMENTAL: This API may change, or be removed, without prior notice.
+ *
+ * Execute given BPF bytecode accepting any number of arguments.
+ *
+ * @param bpf
+ *   handle for the BPF code to execute.
+ * @param ctx
+ *   program arguments tuple.
+ * @param flags
+ *   bitwise OR of `RTE_BPF_EXEC_FLAG_*` values controlling execution.
+ *   Flag RTE_BPF_EXEC_FLAG_JIT requires presence of JIT version (can be checked
+ *   with rte_bpf_get_jit_ex).
+ * @return
+ *   BPF execution return value.
+ */
+__rte_experimental
+uint64_t
+rte_bpf_exec_ex(const struct rte_bpf *bpf, const struct rte_bpf_prog_ctx *ctx,
+		uint64_t flags);
+
+/**
+ * Execute given BPF bytecode accepting 1 argument over a set of input contexts.
  *
  * @param bpf
  *   handle for the BPF code to execute.
@@ -255,7 +319,35 @@ rte_bpf_exec_burst(const struct rte_bpf *bpf, void *ctx[], uint64_t rc[],
 		uint32_t num);
 
 /**
- * Provide information about natively compiled code for given BPF handle.
+ * @warning
+ * @b EXPERIMENTAL: This API may change, or be removed, without prior notice.
+ *
+ * Execute given BPF program accepting any number of arguments over a set of
+ * input contexts.
+ *
+ * @param bpf
+ *   handle for the BPF code to execute.
+ * @param ctx
+ *   pointer to array of program argument tuples, can be NULL for nullary programs.
+ * @param rc
+ *   array of return values (one per input).
+ * @param num
+ *   number executions, number of elements in arrays ctx and rc[].
+ * @param flags
+ *   bitwise OR of `RTE_BPF_EXEC_FLAG_*` values controlling execution.
+ *   Flag RTE_BPF_EXEC_FLAG_JIT requires presence of JIT version (can be checked
+ *   with rte_bpf_get_jit_ex).
+ * @return
+ *   number of successfully processed inputs.
+ */
+__rte_experimental
+uint32_t
+rte_bpf_exec_burst_ex(const struct rte_bpf *bpf, const struct rte_bpf_prog_ctx *ctx,
+		uint64_t rc[], uint32_t num, uint64_t flags);
+
+/**
+ * Provide information about natively compiled code for given BPF program
+ * accepting 1 argument.
  *
  * @param bpf
  *   handle for the BPF code.
@@ -268,6 +360,25 @@ rte_bpf_exec_burst(const struct rte_bpf *bpf, void *ctx[], uint64_t rc[],
 int
 rte_bpf_get_jit(const struct rte_bpf *bpf, struct rte_bpf_jit *jit);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: This API may change, or be removed, without prior notice.
+ *
+ * Get function JIT-compiled from the BPF program.
+ *
+ * @param bpf
+ *   handle for the BPF code.
+ * @param jit
+ *   pointer to the struct rte_bpf_jit_ex.
+ * @return
+ *   - -EINVAL if the parameters are invalid.
+ *   - -ENOENT if there is no JIT-compiled version.
+ *   - Zero if operation completed successfully.
+ */
+__rte_experimental
+int
+rte_bpf_get_jit_ex(const struct rte_bpf *bpf, struct rte_bpf_jit_ex *jit);
+
 /**
  * Dump epf instructions to a file.
  *
-- 
2.43.0


^ permalink raw reply related

* [PATCH v5 04/11] bpf: add cBPF origin to rte_bpf_load_ex
From: Marat Khalili @ 2026-06-12  8:42 UTC (permalink / raw)
  To: Konstantin Ananyev; +Cc: dev
In-Reply-To: <20260612084219.38399-1-marat.khalili@huawei.com>

Add cBPF origin to rte_bpf_load_ex to allow loading PCAP filters and
other cBPF code through the unified interface.

Note that for the no-libpcap stub of rte_bpf_convert, the behavior when
called with a NULL program has changed from setting rte_errno to EINVAL
to setting it to ENOTSUP. Since both cases return NULL, callers relying
on pointer checking are unaffected.

Signed-off-by: Marat Khalili <marat.khalili@huawei.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
---
 lib/bpf/bpf_convert.c | 81 +++++++++++++++++++++++++++++++++++++++++--
 lib/bpf/bpf_impl.h    | 11 ++++++
 lib/bpf/bpf_load.c    | 12 ++++++-
 lib/bpf/bpf_stub.c    | 27 ---------------
 lib/bpf/meson.build   | 11 +++---
 lib/bpf/rte_bpf.h     |  8 ++++-
 6 files changed, 113 insertions(+), 37 deletions(-)
 delete mode 100644 lib/bpf/bpf_stub.c

diff --git a/lib/bpf/bpf_convert.c b/lib/bpf/bpf_convert.c
index 953ca80670..e8074b13d0 100644
--- a/lib/bpf/bpf_convert.c
+++ b/lib/bpf/bpf_convert.c
@@ -9,6 +9,12 @@
  * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
  */
 
+#include "bpf_impl.h"
+#include <eal_export.h>
+#include <rte_errno.h>
+
+#ifdef RTE_HAS_LIBPCAP
+
 #include <assert.h>
 #include <errno.h>
 #include <stdbool.h>
@@ -17,17 +23,14 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include <eal_export.h>
 #include <rte_common.h>
 #include <rte_bpf.h>
 #include <rte_log.h>
 #include <rte_malloc.h>
-#include <rte_errno.h>
 
 #include <pcap/pcap.h>
 #include <pcap/bpf.h>
 
-#include "bpf_impl.h"
 #include "bpf_def.h"
 
 #ifndef BPF_MAXINSNS
@@ -572,3 +575,75 @@ rte_bpf_convert(const struct bpf_program *prog)
 
 	return prm;
 }
+
+void
+__rte_bpf_convert_cleanup(struct __rte_bpf_load *load)
+{
+	free(load->ins);
+}
+
+int
+__rte_bpf_convert(struct __rte_bpf_load *load)
+{
+	struct rte_bpf_prm_ex *const prm = &load->prm;
+	uint32_t nb_ins = 0;
+	int ret;
+
+	RTE_ASSERT(prm->origin == RTE_BPF_ORIGIN_CBPF);
+
+	if (prm->cbpf.ins == NULL || prm->cbpf.nb_ins == 0)
+		return -EINVAL;
+
+	/* 1st pass: calculate the eBPF program length */
+	ret = bpf_convert_filter(prm->cbpf.ins, prm->cbpf.nb_ins, NULL, &nb_ins);
+	if (ret < 0) {
+		RTE_BPF_LOG_FUNC_LINE(ERR, "cannot get eBPF length");
+		return ret;
+	}
+
+	RTE_ASSERT(load->ins == NULL);
+	load->ins = malloc(nb_ins * sizeof(load->ins[0]));
+	if (load->ins == NULL)
+		return -ENOMEM;
+
+	/* 2nd pass: remap cBPF to eBPF instructions  */
+	ret = bpf_convert_filter(prm->cbpf.ins, prm->cbpf.nb_ins, load->ins, &nb_ins);
+	if (ret < 0) {
+		RTE_BPF_LOG_FUNC_LINE(ERR, "cannot convert cBPF to eBPF");
+		return ret;
+	}
+
+	prm->origin = RTE_BPF_ORIGIN_RAW;
+	prm->raw.ins = load->ins;
+	prm->raw.nb_ins = nb_ins;
+
+	return 0;
+}
+
+#else /* RTE_HAS_LIBPCAP */
+
+RTE_EXPORT_SYMBOL(rte_bpf_convert)
+struct rte_bpf_prm *
+rte_bpf_convert(const struct bpf_program *prog)
+{
+	RTE_SET_USED(prog);
+	RTE_BPF_LOG_FUNC_LINE(ERR, "not supported, rebuild with libpcap installed");
+	rte_errno = ENOTSUP;
+	return NULL;
+}
+
+void
+__rte_bpf_convert_cleanup(struct __rte_bpf_load *load)
+{
+	RTE_ASSERT(load->ins == NULL);
+}
+
+int
+__rte_bpf_convert(struct __rte_bpf_load *load)
+{
+	RTE_SET_USED(load);
+	RTE_BPF_LOG_FUNC_LINE(ERR, "not supported, rebuild with libpcap installed");
+	return -ENOTSUP;
+}
+
+#endif /* RTE_HAS_LIBPCAP */
diff --git a/lib/bpf/bpf_impl.h b/lib/bpf/bpf_impl.h
index 4a98b33730..92d03583d9 100644
--- a/lib/bpf/bpf_impl.h
+++ b/lib/bpf/bpf_impl.h
@@ -21,6 +21,9 @@ struct rte_bpf {
 struct __rte_bpf_load {
 	struct rte_bpf_prm_ex prm;
 
+	/* Conversion from cBPF. */
+	struct ebpf_insn *ins;
+
 	/* Loading ELF and applying relocations. */
 	int elf_fd;  /* ELF fd, must be negative (not zero) by default. */
 	void *elf;  /* Using void to avoid dependency on libelf. */
@@ -34,6 +37,14 @@ struct __rte_bpf_load {
  * to avoid potential name conflict with other libraries.
  */
 
+/* Free temporary resources created by converting from cBPF to eBPF. */
+void
+__rte_bpf_convert_cleanup(struct __rte_bpf_load *load);
+
+/* Convert program from cBPF to eBPF. */
+int
+__rte_bpf_convert(struct __rte_bpf_load *load);
+
 /* Free temporary resources created by opening ELF. */
 void
 __rte_bpf_load_elf_cleanup(struct __rte_bpf_load *load);
diff --git a/lib/bpf/bpf_load.c b/lib/bpf/bpf_load.c
index f63093b9bc..e3265e97ff 100644
--- a/lib/bpf/bpf_load.c
+++ b/lib/bpf/bpf_load.c
@@ -240,6 +240,9 @@ load_try(struct __rte_bpf_load *load, const struct rte_bpf_prm_ex *app_prm)
 	switch (load->prm.origin) {
 	case RTE_BPF_ORIGIN_RAW:
 		break;
+	case RTE_BPF_ORIGIN_CBPF:
+		rc = rc < 0 ? rc : __rte_bpf_convert(load);
+		break;
 	case RTE_BPF_ORIGIN_ELF_FILE:
 		rc = rc < 0 ? rc : __rte_bpf_load_elf_file(load);
 		rc = rc < 0 ? rc : __rte_bpf_load_elf_code(load);
@@ -254,6 +257,13 @@ load_try(struct __rte_bpf_load *load, const struct rte_bpf_prm_ex *app_prm)
 	return rc;
 }
 
+static void
+load_cleanup(struct __rte_bpf_load *load)
+{
+	__rte_bpf_convert_cleanup(load);
+	__rte_bpf_load_elf_cleanup(load);
+}
+
 RTE_EXPORT_EXPERIMENTAL_SYMBOL(rte_bpf_load_ex, 26.11)
 struct rte_bpf *
 rte_bpf_load_ex(const struct rte_bpf_prm_ex *prm)
@@ -262,7 +272,7 @@ rte_bpf_load_ex(const struct rte_bpf_prm_ex *prm)
 
 	const int rc = load_try(&load, prm);
 
-	__rte_bpf_load_elf_cleanup(&load);
+	load_cleanup(&load);
 
 	RTE_ASSERT((rc < 0) == (load.bpf == NULL));
 
diff --git a/lib/bpf/bpf_stub.c b/lib/bpf/bpf_stub.c
deleted file mode 100644
index 4c329832c2..0000000000
--- a/lib/bpf/bpf_stub.c
+++ /dev/null
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2018-2021 Intel Corporation
- */
-
-#include "bpf_impl.h"
-#include <eal_export.h>
-#include <rte_errno.h>
-
-/**
- * Contains stubs for unimplemented public API functions
- */
-
-#ifndef RTE_HAS_LIBPCAP
-RTE_EXPORT_SYMBOL(rte_bpf_convert)
-struct rte_bpf_prm *
-rte_bpf_convert(const struct bpf_program *prog)
-{
-	if (prog == NULL) {
-		rte_errno = EINVAL;
-		return NULL;
-	}
-
-	RTE_BPF_LOG_FUNC_LINE(ERR, "not supported, rebuild with libpcap installed");
-	rte_errno = ENOTSUP;
-	return NULL;
-}
-#endif
diff --git a/lib/bpf/meson.build b/lib/bpf/meson.build
index 4901b6ee14..7e8a300e3f 100644
--- a/lib/bpf/meson.build
+++ b/lib/bpf/meson.build
@@ -15,14 +15,16 @@ if arch_subdir == 'x86' and dpdk_conf.get('RTE_ARCH_32')
     subdir_done()
 endif
 
-sources = files('bpf.c',
+sources = files(
+        'bpf.c',
+        'bpf_convert.c',
         'bpf_dump.c',
         'bpf_exec.c',
         'bpf_load.c',
         'bpf_load_elf.c',
         'bpf_pkt.c',
-        'bpf_stub.c',
-        'bpf_validate.c')
+        'bpf_validate.c',
+)
 
 if arch_subdir == 'x86' and dpdk_conf.get('RTE_ARCH_64')
     sources += files('bpf_jit_x86.c')
@@ -45,8 +47,7 @@ else
 endif
 
 if dpdk_conf.has('RTE_HAS_LIBPCAP')
-    sources += files('bpf_convert.c')
     ext_deps += pcap_dep
 else
-    warning('libpcap is missing, rte_bpf_convert API will be disabled')
+    warning('libpcap is missing, cBPF API will be disabled')
 endif
diff --git a/lib/bpf/rte_bpf.h b/lib/bpf/rte_bpf.h
index 0e7eaa3c18..da2bdea7e0 100644
--- a/lib/bpf/rte_bpf.h
+++ b/lib/bpf/rte_bpf.h
@@ -95,10 +95,12 @@ struct rte_bpf_xsym {
  */
 enum rte_bpf_origin {
 	RTE_BPF_ORIGIN_RAW,		/**< code loaded from raw array */
-	RTE_BPF_ORIGIN_RESERVED,	/**< reserved for cBPF */
+	RTE_BPF_ORIGIN_CBPF,		/**< code converted from cbpf */
 	RTE_BPF_ORIGIN_ELF_FILE,	/**< code loaded from elf_file */
 };
 
+struct bpf_insn;
+
 /**
  * Input parameters for loading eBPF code, extensible version.
  *
@@ -117,6 +119,10 @@ struct rte_bpf_prm_ex {
 			const struct ebpf_insn *ins;  /**< eBPF instructions */
 			uint32_t nb_ins;  /**< number of instructions in ins */
 		} raw;
+		struct {
+			const struct bpf_insn *ins;  /**< cBPF instructions */
+			uint32_t nb_ins;  /**< number of instructions in ins */
+		} cbpf;
 		struct {
 			const char *path;  /**< path to the ELF file */
 			const char *section;  /**< ELF section with the code */
-- 
2.43.0


^ permalink raw reply related

* [PATCH v5 06/11] bpf: support loading ELF files from memory
From: Marat Khalili @ 2026-06-12  8:42 UTC (permalink / raw)
  To: Konstantin Ananyev; +Cc: dev
In-Reply-To: <20260612084219.38399-1-marat.khalili@huawei.com>

Introduce new ELF origin RTE_BPF_ORIGIN_ELF_MEMORY allowing one to
specify data area containing ELF image.

Signed-off-by: Marat Khalili <marat.khalili@huawei.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
---
 lib/bpf/bpf_impl.h     |  5 +++++
 lib/bpf/bpf_load.c     |  4 ++++
 lib/bpf/bpf_load_elf.c | 40 +++++++++++++++++++++++++++++++++++++++-
 lib/bpf/rte_bpf.h      |  6 ++++++
 4 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/lib/bpf/bpf_impl.h b/lib/bpf/bpf_impl.h
index 92d03583d9..14ad772d4b 100644
--- a/lib/bpf/bpf_impl.h
+++ b/lib/bpf/bpf_impl.h
@@ -27,6 +27,7 @@ struct __rte_bpf_load {
 	/* Loading ELF and applying relocations. */
 	int elf_fd;  /* ELF fd, must be negative (not zero) by default. */
 	void *elf;  /* Using void to avoid dependency on libelf. */
+	const char *elf_section;
 
 	/* Value we are going to return, if any. */
 	struct rte_bpf *bpf;
@@ -53,6 +54,10 @@ __rte_bpf_load_elf_cleanup(struct __rte_bpf_load *load);
 int
 __rte_bpf_load_elf_file(struct __rte_bpf_load *load);
 
+/* Open the ELF memory image. */
+int
+__rte_bpf_load_elf_memory(struct __rte_bpf_load *load);
+
 /* Get code from ELF and apply relocations to it. */
 int
 __rte_bpf_load_elf_code(struct __rte_bpf_load *load);
diff --git a/lib/bpf/bpf_load.c b/lib/bpf/bpf_load.c
index e3265e97ff..e406211e0e 100644
--- a/lib/bpf/bpf_load.c
+++ b/lib/bpf/bpf_load.c
@@ -247,6 +247,10 @@ load_try(struct __rte_bpf_load *load, const struct rte_bpf_prm_ex *app_prm)
 		rc = rc < 0 ? rc : __rte_bpf_load_elf_file(load);
 		rc = rc < 0 ? rc : __rte_bpf_load_elf_code(load);
 		break;
+	case RTE_BPF_ORIGIN_ELF_MEMORY:
+		rc = rc < 0 ? rc : __rte_bpf_load_elf_memory(load);
+		rc = rc < 0 ? rc : __rte_bpf_load_elf_code(load);
+		break;
 	default:
 		rc = rc < 0 ? rc : -EINVAL;
 	}
diff --git a/lib/bpf/bpf_load_elf.c b/lib/bpf/bpf_load_elf.c
index 4ae7492351..80443cb63a 100644
--- a/lib/bpf/bpf_load_elf.c
+++ b/lib/bpf/bpf_load_elf.c
@@ -310,6 +310,36 @@ __rte_bpf_load_elf_file(struct __rte_bpf_load *load)
 		return -EINVAL;
 	}
 
+	load->elf_section = prm->elf_file.section;
+
+	return 0;
+}
+
+int
+__rte_bpf_load_elf_memory(struct __rte_bpf_load *load)
+{
+	const struct rte_bpf_prm_ex *const prm = &load->prm;
+
+	RTE_ASSERT(prm->origin == RTE_BPF_ORIGIN_ELF_MEMORY);
+
+	if (prm->elf_memory.data == NULL || prm->elf_memory.section == NULL)
+		return -EINVAL;
+
+	if (elf_version(EV_CURRENT) == EV_NONE)
+		return -ENOTSUP;
+
+	load->elf = elf_memory(
+		/* Cast away const, we are not going to modify the ELF image. */
+		(char *)(uintptr_t)prm->elf_memory.data, prm->elf_memory.size);
+	if (load->elf == NULL) {
+		const int rc = elf_errno();
+		RTE_BPF_LOG_FUNC_LINE(ERR, "error %d opening ELF image: %s",
+			rc, elf_errmsg(rc));
+		return -EINVAL;
+	}
+
+	load->elf_section = prm->elf_memory.section;
+
 	return 0;
 }
 
@@ -321,7 +351,7 @@ __rte_bpf_load_elf_code(struct __rte_bpf_load *load)
 	size_t sidx;
 	int rc;
 
-	rc = find_elf_code(load->elf, prm->elf_file.section, &sd, &sidx);
+	rc = find_elf_code(load->elf, load->elf_section, &sd, &sidx);
 	if (rc < 0)
 		return rc;
 
@@ -353,6 +383,14 @@ __rte_bpf_load_elf_file(struct __rte_bpf_load *load)
 	return -ENOTSUP;
 }
 
+int
+__rte_bpf_load_elf_memory(struct __rte_bpf_load *load)
+{
+	RTE_SET_USED(load);
+	RTE_BPF_LOG_FUNC_LINE(ERR, "not supported, rebuild with libelf installed");
+	return -ENOTSUP;
+}
+
 int
 __rte_bpf_load_elf_code(struct __rte_bpf_load *load)
 {
diff --git a/lib/bpf/rte_bpf.h b/lib/bpf/rte_bpf.h
index da2bdea7e0..413ccf0497 100644
--- a/lib/bpf/rte_bpf.h
+++ b/lib/bpf/rte_bpf.h
@@ -97,6 +97,7 @@ enum rte_bpf_origin {
 	RTE_BPF_ORIGIN_RAW,		/**< code loaded from raw array */
 	RTE_BPF_ORIGIN_CBPF,		/**< code converted from cbpf */
 	RTE_BPF_ORIGIN_ELF_FILE,	/**< code loaded from elf_file */
+	RTE_BPF_ORIGIN_ELF_MEMORY,	/**< code loaded from elf_memory */
 };
 
 struct bpf_insn;
@@ -127,6 +128,11 @@ struct rte_bpf_prm_ex {
 			const char *path;  /**< path to the ELF file */
 			const char *section;  /**< ELF section with the code */
 		} elf_file;
+		struct {
+			const void *data;  /**< pointer to the ELF image */
+			size_t size;  /**< size of the ELF image */
+			const char *section;  /**< ELF section with the code */
+		} elf_memory;
 	};
 
 	const struct rte_bpf_xsym *xsym;
-- 
2.43.0


^ permalink raw reply related

* [PATCH v5 08/11] test/bpf: test loading ELF file from memory
From: Marat Khalili @ 2026-06-12  8:42 UTC (permalink / raw)
  To: Konstantin Ananyev; +Cc: dev
In-Reply-To: <20260612084219.38399-1-marat.khalili@huawei.com>

Run each subtest in test_bpf_elf twice: the old way loading ELF images
via temporary file, and using the new rte_bpf_load_ex API to load them
directly from memory.

In tests loading port/queue filters use new rte_bpf_eth_(rx|tx)_install
API to install an already loaded (via one of the ways) BPF program.

Signed-off-by: Marat Khalili <marat.khalili@huawei.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
---
 app/test/test_bpf.c | 194 ++++++++++++++++++++++++++------------------
 1 file changed, 114 insertions(+), 80 deletions(-)

diff --git a/app/test/test_bpf.c b/app/test/test_bpf.c
index c43b872a4c..026ba18b75 100644
--- a/app/test/test_bpf.c
+++ b/app/test/test_bpf.c
@@ -3977,12 +3977,61 @@ create_temp_bpf_file(const uint8_t *data, size_t size, const char *name)
 
 #include "test_bpf_load.h"
 
+/* Function loading BPF program from ELF image in memory. */
+typedef struct rte_bpf *
+(*load_elf_image_t)(const void *data, size_t size, const char *section,
+	const struct rte_bpf_xsym *xsym, uint32_t nb_xsym, const struct rte_bpf_arg *prog_arg);
+
+/* Load BPF program by writing ELF image to temporary file and opening this file. */
+static struct rte_bpf *
+load_elf_image_temp_file(const void *data, size_t size, const char *section,
+	const struct rte_bpf_xsym *xsym, uint32_t nb_xsym, const struct rte_bpf_arg *prog_arg)
+{
+	/* Create temp file from embedded BPF object */
+	char *tmpfile = create_temp_bpf_file(data, size, "test");
+	if (tmpfile == NULL) {
+		rte_errno = EIO;
+		return NULL;
+	}
+
+	/* Try to load BPF program from temp file */
+	const struct rte_bpf_prm prm = {
+		.xsym = xsym,
+		.nb_xsym = nb_xsym,
+		.prog_arg = *prog_arg,
+	};
+
+	struct rte_bpf *bpf = rte_bpf_elf_load(&prm, tmpfile, section);
+	unlink(tmpfile);
+	free(tmpfile);
+
+	return bpf;
+}
+
+/* Load BPF program by calling rte_bpf_load_ex and specifying image as the origin. */
+static struct rte_bpf *
+load_elf_image_direct(const void *data, size_t size, const char *section,
+	const struct rte_bpf_xsym *xsym, uint32_t nb_xsym, const struct rte_bpf_arg *prog_arg)
+{
+	return rte_bpf_load_ex(&(struct rte_bpf_prm_ex){
+		.sz = sizeof(struct rte_bpf_prm_ex),
+		.origin = RTE_BPF_ORIGIN_ELF_MEMORY,
+		.elf_memory.data = data,
+		.elf_memory.size = size,
+		.elf_memory.section = section,
+		.xsym = xsym,
+		.nb_xsym = nb_xsym,
+		.prog_arg[0] = *prog_arg,
+		.nb_prog_arg = 1,
+	});
+}
+
 /*
  * Test loading BPF program from an object file.
  * This test uses same arguments as previous test_call1 example.
  */
 static int
-test_bpf_elf_load(void)
+test_bpf_elf_load(load_elf_image_t load_elf_image)
 {
 	static const char test_section[] = "call1";
 	uint8_t tbuf[sizeof(struct dummy_vect8)];
@@ -4010,28 +4059,15 @@ test_bpf_elf_load(void)
 			},
 		},
 	};
-	int ret;
-
-	/* Create temp file from embedded BPF object */
-	char *tmpfile = create_temp_bpf_file(app_test_bpf_load_o,
-					     app_test_bpf_load_o_len,
-					     "load");
-	if (tmpfile == NULL)
-		return -1;
-
-	/* Try to load BPF program from temp file */
-	const struct rte_bpf_prm prm = {
-		.xsym = xsym,
-		.nb_xsym = RTE_DIM(xsym),
-		.prog_arg = {
-			.type = RTE_BPF_ARG_PTR,
-			.size = sizeof(tbuf),
-		},
+	static const struct rte_bpf_arg prog_arg = {
+		.type = RTE_BPF_ARG_PTR,
+		.size = sizeof(tbuf),
 	};
+	struct rte_bpf *bpf;
+	int ret;
 
-	struct rte_bpf *bpf = rte_bpf_elf_load(&prm, tmpfile, test_section);
-	unlink(tmpfile);
-	free(tmpfile);
+	bpf = load_elf_image(app_test_bpf_load_o, app_test_bpf_load_o_len, test_section,
+		xsym, RTE_DIM(xsym), &prog_arg);
 
 	/* If libelf support is not available */
 	if (bpf == NULL && rte_errno == ENOTSUP)
@@ -4174,22 +4210,28 @@ setup_mbufs(struct rte_mbuf *burst[], unsigned int n)
 	return tcp_count;
 }
 
-static int bpf_tx_test(uint16_t port, const char *tmpfile, struct rte_mempool *pool,
-		       const char *section, uint32_t flags)
+static int bpf_tx_test(uint16_t port, struct rte_mempool *pool, load_elf_image_t load_elf_image,
+	const char *section, uint32_t flags)
 {
-	const struct rte_bpf_prm prm = {
-		.prog_arg = {
-			.type = RTE_BPF_ARG_PTR,
-			.size = sizeof(struct dummy_net),
-		},
+	static const struct rte_bpf_arg prog_arg = {
+		.type = RTE_BPF_ARG_PTR,
+		.size = sizeof(struct dummy_net),
 	};
+	struct rte_bpf *bpf;
 	int ret;
 
-	/* Try to load BPF TX program from temp file */
-	ret = rte_bpf_eth_tx_elf_load(port, 0, &prm, tmpfile, section, flags);
+	/* Try to load BPF program from image */
+	bpf = load_elf_image(app_test_bpf_filter_o, app_test_bpf_filter_o_len, section,
+		NULL, 0, &prog_arg);
+	TEST_ASSERT_NOT_NULL(bpf, "failed to load BPF filter from image, error=%d:(%s)\n",
+		       rte_errno, rte_strerror(rte_errno));
+
+	/* Try to install loaded BPF program */
+	ret = rte_bpf_eth_tx_install(port, 0, bpf, flags);
 	if (ret != 0) {
-		printf("%s@%d: failed to load BPF filter from file=%s error=%d:(%s)\n",
-		       __func__, __LINE__, tmpfile, rte_errno, rte_strerror(rte_errno));
+		printf("%s@%d: failed to install BPF filter, error=%d:(%s)\n",
+		       __func__, __LINE__, rte_errno, rte_strerror(rte_errno));
+		rte_bpf_destroy(bpf);
 		return ret;
 	}
 
@@ -4217,10 +4259,9 @@ static int bpf_tx_test(uint16_t port, const char *tmpfile, struct rte_mempool *p
 
 /* Test loading a transmit filter which only allows IPv4 packets */
 static int
-test_bpf_elf_tx_load(void)
+test_bpf_elf_tx_load(load_elf_image_t load_elf_image)
 {
 	static const char null_dev[] = "net_null_bpf0";
-	char *tmpfile = NULL;
 	struct rte_mempool *mb_pool = NULL;
 	uint16_t port = UINT16_MAX;
 	int ret;
@@ -4237,27 +4278,17 @@ test_bpf_elf_tx_load(void)
 	if (ret != 0)
 		goto fail;
 
-	/* Create temp file from embedded BPF object */
-	tmpfile = create_temp_bpf_file(app_test_bpf_filter_o, app_test_bpf_filter_o_len, "tx");
-	if (tmpfile == NULL)
-		goto fail;
-
 	/* Do test with VM */
-	ret = bpf_tx_test(port, tmpfile, mb_pool, "filter", 0);
+	ret = bpf_tx_test(port, mb_pool, load_elf_image, "filter", 0);
 	if (ret != 0)
 		goto fail;
 
 	/* Repeat with JIT */
-	ret = bpf_tx_test(port, tmpfile, mb_pool, "filter", RTE_BPF_ETH_F_JIT);
+	ret = bpf_tx_test(port, mb_pool, load_elf_image, "filter", RTE_BPF_ETH_F_JIT);
 	if (ret == 0)
 		printf("%s: TX ELF load test passed\n", __func__);
 
 fail:
-	if (tmpfile) {
-		unlink(tmpfile);
-		free(tmpfile);
-	}
-
 	if (port != UINT16_MAX)
 		rte_vdev_uninit(null_dev);
 
@@ -4272,23 +4303,29 @@ test_bpf_elf_tx_load(void)
 }
 
 /* Test loading a receive filter */
-static int bpf_rx_test(uint16_t port, const char *tmpfile, struct rte_mempool *pool,
-		       const char *section, uint32_t flags, uint16_t expected)
+static int bpf_rx_test(uint16_t port, struct rte_mempool *pool, load_elf_image_t load_elf_image,
+	const char *section, uint32_t flags, uint16_t expected)
 {
-	struct rte_mbuf *pkts[BPF_TEST_BURST];
-	const struct rte_bpf_prm prm = {
-		.prog_arg = {
-			.type = RTE_BPF_ARG_PTR,
-			.size = sizeof(struct dummy_net),
-		},
+	static const struct rte_bpf_arg prog_arg = {
+		.type = RTE_BPF_ARG_PTR,
+		.size = sizeof(struct dummy_net),
 	};
+	struct rte_mbuf *pkts[BPF_TEST_BURST];
+	struct rte_bpf *bpf;
 	int ret;
 
-	/* Load BPF program to drop all packets */
-	ret = rte_bpf_eth_rx_elf_load(port, 0, &prm, tmpfile, section, flags);
+	/* Try to load BPF program from image */
+	bpf = load_elf_image(app_test_bpf_filter_o, app_test_bpf_filter_o_len, section,
+		NULL, 0, &prog_arg);
+	TEST_ASSERT_NOT_NULL(bpf, "failed to load BPF filter from image, error=%d:(%s)\n",
+		       rte_errno, rte_strerror(rte_errno));
+
+	/* Try to install loaded BPF program */
+	ret = rte_bpf_eth_rx_install(port, 0, bpf, flags);
 	if (ret != 0) {
-		printf("%s@%d: failed to load BPF filter from file=%s error=%d:(%s)\n",
-		       __func__, __LINE__, tmpfile, rte_errno, rte_strerror(rte_errno));
+		printf("%s@%d: failed to install BPF filter, error=%d:(%s)\n",
+		       __func__, __LINE__, rte_errno, rte_strerror(rte_errno));
+		rte_bpf_destroy(bpf);
 		return ret;
 	}
 
@@ -4311,11 +4348,10 @@ static int bpf_rx_test(uint16_t port, const char *tmpfile, struct rte_mempool *p
 
 /* Test loading a receive filters, first with drop all and then with allow all packets */
 static int
-test_bpf_elf_rx_load(void)
+test_bpf_elf_rx_load(load_elf_image_t load_elf_image)
 {
 	static const char null_dev[] = "net_null_bpf0";
 	struct rte_mempool *pool = NULL;
-	char *tmpfile = NULL;
 	uint16_t port = UINT16_MAX;
 	int ret;
 
@@ -4331,28 +4367,23 @@ test_bpf_elf_rx_load(void)
 	if (ret != 0)
 		goto fail;
 
-	/* Create temp file from embedded BPF object */
-	tmpfile = create_temp_bpf_file(app_test_bpf_filter_o, app_test_bpf_filter_o_len, "rx");
-	if (tmpfile == NULL)
-		goto fail;
-
 	/* Do test with VM */
-	ret = bpf_rx_test(port, tmpfile, pool, "drop", 0, 0);
+	ret = bpf_rx_test(port, pool, load_elf_image, "drop", 0, 0);
 	if (ret != 0)
 		goto fail;
 
 	/* Repeat with JIT */
-	ret = bpf_rx_test(port, tmpfile, pool, "drop", RTE_BPF_ETH_F_JIT, 0);
+	ret = bpf_rx_test(port, pool, load_elf_image, "drop", RTE_BPF_ETH_F_JIT, 0);
 	if (ret != 0)
 		goto fail;
 
 	/* Repeat with allow all */
-	ret = bpf_rx_test(port, tmpfile, pool, "allow", 0, BPF_TEST_BURST);
+	ret = bpf_rx_test(port, pool, load_elf_image, "allow", 0, BPF_TEST_BURST);
 	if (ret != 0)
 		goto fail;
 
 	/* Repeat with JIT */
-	ret = bpf_rx_test(port, tmpfile, pool, "allow", RTE_BPF_ETH_F_JIT, BPF_TEST_BURST);
+	ret = bpf_rx_test(port, pool, load_elf_image, "allow", RTE_BPF_ETH_F_JIT, BPF_TEST_BURST);
 	if (ret != 0)
 		goto fail;
 
@@ -4364,11 +4395,6 @@ test_bpf_elf_rx_load(void)
 			  "Mempool available %u != %u leaks?", avail, BPF_TEST_POOLSIZE);
 
 fail:
-	if (tmpfile) {
-		unlink(tmpfile);
-		free(tmpfile);
-	}
-
 	if (port != UINT16_MAX)
 		rte_vdev_uninit(null_dev);
 
@@ -4381,13 +4407,21 @@ test_bpf_elf_rx_load(void)
 static int
 test_bpf_elf(void)
 {
-	int ret;
+	static const load_elf_image_t elf_image_loaders[] = {
+		load_elf_image_temp_file,
+		load_elf_image_direct,
+	};
 
-	ret = test_bpf_elf_load();
-	if (ret == TEST_SUCCESS)
-		ret = test_bpf_elf_tx_load();
-	if (ret == TEST_SUCCESS)
-		ret = test_bpf_elf_rx_load();
+	int ret = TEST_SUCCESS;
+
+	for (int li = 0; li != RTE_DIM(elf_image_loaders); ++li) {
+		if (ret == TEST_SUCCESS)
+			ret = test_bpf_elf_load(elf_image_loaders[li]);
+		if (ret == TEST_SUCCESS)
+			ret = test_bpf_elf_tx_load(elf_image_loaders[li]);
+		if (ret == TEST_SUCCESS)
+			ret = test_bpf_elf_rx_load(elf_image_loaders[li]);
+	}
 
 	return ret;
 }
-- 
2.43.0


^ permalink raw reply related

* [PATCH v5 09/11] doc: add release notes for new extensible BPF API
From: Marat Khalili @ 2026-06-12  8:42 UTC (permalink / raw)
  Cc: dev, Konstantin Ananyev
In-Reply-To: <20260612084219.38399-1-marat.khalili@huawei.com>

Document the following new eBPF features introduced in this release:
* Extensible BPF loading API (rte_bpf_load_ex, rte_bpf_prm_ex).
* Loading and executing eBPF programs with up to 5 arguments.
* Installing already loaded eBPF programs as port callbacks.

Signed-off-by: Marat Khalili <marat.khalili@huawei.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
---
 doc/guides/rel_notes/release_26_07.rst | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/doc/guides/rel_notes/release_26_07.rst b/doc/guides/rel_notes/release_26_07.rst
index 5d7aa8d1bf..3a151c8e83 100644
--- a/doc/guides/rel_notes/release_26_07.rst
+++ b/doc/guides/rel_notes/release_26_07.rst
@@ -155,6 +155,26 @@ New Features
   Added AGENTS.md file for AI review
   and supporting scripts to review patches and documentation.
 
+* **Added extensible BPF loading API.**
+
+  Added an extensible BPF loading API comprising the function
+  ``rte_bpf_load_ex`` and struct ``rte_bpf_prm_ex``. This enables new features
+  such as loading classic BPF (cBPF), loading ELF images directly from memory
+  buffers, and executing multi-argument programs, while avoiding future ABI
+  breakages.
+
+* **Added support for executing BPF programs with multiple arguments.**
+
+  Added support for loading and executing BPF programs with up to 5 arguments.
+  This introduces new API functions ``rte_bpf_exec_ex``,
+  ``rte_bpf_exec_burst_ex``, and ``rte_bpf_get_jit_ex``.
+
+* **Added BPF port callback installation API.**
+
+  Added new API functions ``rte_bpf_eth_rx_install`` and
+  ``rte_bpf_eth_tx_install`` for installing already loaded BPF programs as
+  port callbacks (as opposed to loading them directly from ELF files).
+
 
 Removed Items
 -------------
-- 
2.43.0


^ permalink raw reply related

* [PATCH v5 10/11] doc: add load API to BPF programmer's guide
From: Marat Khalili @ 2026-06-12  8:42 UTC (permalink / raw)
  To: Konstantin Ananyev; +Cc: dev
In-Reply-To: <20260612084219.38399-1-marat.khalili@huawei.com>

Rewrite the basic operations list to focus on a typical use. Provide an
end-to-end example demonstrating loading from an ELF file, executing via
JIT or the interpreter, and properly handling multiple custom arguments
using rte_bpf_prog_ctx.

Signed-off-by: Marat Khalili <marat.khalili@huawei.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
---
 doc/guides/prog_guide/bpf_lib.rst | 75 ++++++++++++++++++++++++++++---
 1 file changed, 68 insertions(+), 7 deletions(-)

diff --git a/doc/guides/prog_guide/bpf_lib.rst b/doc/guides/prog_guide/bpf_lib.rst
index 8c820328b9..df37825088 100644
--- a/doc/guides/prog_guide/bpf_lib.rst
+++ b/doc/guides/prog_guide/bpf_lib.rst
@@ -15,17 +15,79 @@ for more information.
 Also it introduces basic framework to load/unload BPF-based filters
 on eth devices (right now only via SW RX/TX callbacks).
 
-The library API provides the following basic operations:
+The library API provides the following basic operations for working with BPF
+programs:
 
-*  Create a new BPF execution context and load user provided eBPF code into it.
+*   **Loading:** The extensible API (``rte_bpf_load_ex``) is the recommended
+    way to load a BPF program. By utilizing ``struct rte_bpf_prm_ex``, you can
+    load an eBPF program from an ELF file on disk, or load eBPF/cBPF bytecode
+    directly from memory buffers.
 
-*   Destroy an BPF execution context and its runtime structures and free the associated memory.
+*   **Execution via Callbacks:** Once loaded, a BPF program can be attached to
+    a specific ethernet device port and queue to automatically process incoming
+    or outgoing packets using ``rte_bpf_eth_rx_install`` or
+    ``rte_bpf_eth_tx_install``.
 
-*   Execute eBPF bytecode associated with provided input parameter.
+*   **Direct Execution:** You can execute a BPF program directly from your
+    application code using ``rte_bpf_exec_ex`` (or the burst variant
+    ``rte_bpf_exec_burst_ex``). This API allows passing an execution context
+    (``struct rte_bpf_prog_ctx``) containing up to 5 custom arguments.
 
-*   Provide information about natively compiled code for given BPF context.
+*   **JIT Execution:** For maximum performance, you can retrieve the natively
+    compiled (JIT) function pointer for a loaded program using
+    ``rte_bpf_get_jit_ex`` and call it directly from your code with the same
+    arguments.
 
-*   Load BPF program from the ELF file and install callback to execute it on given ethdev port/queue.
+*   **Cleanup:** Destroy a BPF execution context and free the associated memory
+    using ``rte_bpf_destroy``.
+
+The following is a concise example of loading an eBPF program from an ELF file,
+and executing it directly, utilizing the JIT-compiled version if available:
+
+.. code-block:: c
+
+    struct rte_bpf_prm_ex prm = {
+        .sz = sizeof(struct rte_bpf_prm_ex),
+        .origin = RTE_BPF_ORIGIN_ELF_FILE,
+        .elf_file = {
+            .path = "ptype.o",
+            .section = ".text",
+        },
+        .nb_prog_arg = 2,
+        .prog_arg = {
+            [0] = {
+                .type = RTE_BPF_ARG_PTR_MBUF,
+                .size = sizeof(struct rte_mbuf),
+                .buf_size = RTE_MBUF_DEFAULT_BUF_SIZE,
+            },
+            [1] = {
+                .type = RTE_BPF_ARG_RAW,
+                .size = sizeof(uint64_t),
+            },
+        },
+    };
+    struct rte_bpf *bpf = rte_bpf_load_ex(&prm);
+    if (bpf == NULL) {
+        /* Handle load failure */
+    }
+
+    struct rte_bpf_prog_ctx ctx = {
+        .arg[0] = { .ptr = mbuf },
+        .arg[1] = { .u64 = RTE_PTYPE_L2_MASK | RTE_PTYPE_L3_MASK },
+    };
+
+    struct rte_bpf_jit_ex jit;
+    uint64_t ret;
+    if (rte_bpf_get_jit_ex(bpf, &jit) == 0 && jit.func2 != NULL) {
+        /* Call the JIT-compiled function directly for best performance */
+        ret = jit.func2(ctx.arg[0], ctx.arg[1]);
+    } else {
+        /* Fallback to interpreter */
+        uint64_t flags = 0;
+        ret = rte_bpf_exec_ex(bpf, &ctx, flags);
+    }
+
+    rte_bpf_destroy(bpf);
 
 Packet data load instructions
 -----------------------------
@@ -60,7 +122,6 @@ Not currently supported eBPF features
 -------------------------------------
 
  - JIT support only available for X86_64 and arm64 platforms
- - cBPF
  - tail-pointer call
  - eBPF MAP
  - external function calls for 32-bit platforms
-- 
2.43.0


^ permalink raw reply related

* [PATCH v5 07/11] test/bpf: test loading cBPF directly
From: Marat Khalili @ 2026-06-12  8:42 UTC (permalink / raw)
  To: Konstantin Ananyev; +Cc: dev
In-Reply-To: <20260612084219.38399-1-marat.khalili@huawei.com>

Run cBPF tests twice: via rte_bpf_convert, and using
RTE_BPF_FLAG_ORIGIN_CBPF origin of new rte_bpf_load_ex API.

Signed-off-by: Marat Khalili <marat.khalili@huawei.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
---
 app/test/test_bpf.c | 133 +++++++++++++++++++++++++++-----------------
 1 file changed, 81 insertions(+), 52 deletions(-)

diff --git a/app/test/test_bpf.c b/app/test/test_bpf.c
index dd24722450..c43b872a4c 100644
--- a/app/test/test_bpf.c
+++ b/app/test/test_bpf.c
@@ -4429,13 +4429,59 @@ test_bpf_dump(struct bpf_program *cbf, const struct rte_bpf_prm *prm)
 	}
 }
 
+/* Function loading BPF program from cBPF instructions array. */
+typedef struct rte_bpf *
+(*load_cbpf_program_t)(struct bpf_program *cbpf_program, const char *str);
+
+/* Load BPF program by converting cBPF array to rte_bpf_prm and then opening it. */
+static struct rte_bpf *
+load_cbpf_program_convert(struct bpf_program *cbpf_program, const char *str)
+{
+	struct rte_bpf_prm *prm = NULL;
+	struct rte_bpf *bpf;
+
+	prm = rte_bpf_convert(cbpf_program);
+	if (prm == NULL) {
+		printf("%s@%d: bpf_convert(\"%s\") failed\n",
+			__func__, __LINE__, str);
+		return NULL;
+	}
+
+	printf("bpf convert(\"%s\") produced:\n", str);
+	rte_bpf_dump(stdout, prm->ins, prm->nb_ins);
+
+	printf("%s \"%s\"\n", __func__, str);
+	test_bpf_dump(cbpf_program, prm);
+
+	bpf = rte_bpf_load(prm);
+	rte_free(prm);
+
+	return bpf;
+}
+
+/* Load BPF program by calling rte_bpf_load_ex and specifying cBPF array as the origin. */
+static struct rte_bpf *
+load_cbpf_program_direct(struct bpf_program *cbpf_program, const char *str __rte_unused)
+{
+	return rte_bpf_load_ex(&(struct rte_bpf_prm_ex){
+		.sz = sizeof(struct rte_bpf_prm_ex),
+		.origin = RTE_BPF_ORIGIN_CBPF,
+		.cbpf.ins = cbpf_program->bf_insns,
+		.cbpf.nb_ins = cbpf_program->bf_len,
+		.prog_arg[0] = {
+			.type = RTE_BPF_ARG_PTR_MBUF,
+			.size = sizeof(struct rte_mbuf),
+		},
+		.nb_prog_arg = 1,
+	});
+}
+
 static int
-test_bpf_match(pcap_t *pcap, const char *str,
-	       struct rte_mbuf *mb)
+test_bpf_match(pcap_t *pcap, const char *str, struct rte_mbuf *mb,
+	load_cbpf_program_t load_cbpf_program)
 {
 	struct bpf_program fcode;
-	struct rte_bpf_prm *prm = NULL;
-	struct rte_bpf *bpf = NULL;
+	struct rte_bpf *bpf;
 	int ret = -1;
 	uint64_t rc;
 
@@ -4445,17 +4491,10 @@ test_bpf_match(pcap_t *pcap, const char *str,
 		return -1;
 	}
 
-	prm = rte_bpf_convert(&fcode);
-	if (prm == NULL) {
-		printf("%s@%d: bpf_convert('%s') failed,, error=%d(%s);\n",
-		       __func__, __LINE__, str, rte_errno, strerror(rte_errno));
-		goto error;
-	}
-
-	bpf = rte_bpf_load(prm);
+	bpf = load_cbpf_program(&fcode, str);
 	if (bpf == NULL) {
-		printf("%s@%d: failed to load bpf code, error=%d(%s);\n",
-			__func__, __LINE__, rte_errno, strerror(rte_errno));
+		printf("%s@%d: failed to load cbpf program for \"%s\", error=%d(%s);\n",
+			__func__, __LINE__, str, rte_errno, strerror(rte_errno));
 		goto error;
 	}
 
@@ -4465,7 +4504,6 @@ test_bpf_match(pcap_t *pcap, const char *str,
 error:
 	if (bpf)
 		rte_bpf_destroy(bpf);
-	rte_free(prm);
 	pcap_freecode(&fcode);
 	return ret;
 }
@@ -4474,6 +4512,11 @@ test_bpf_match(pcap_t *pcap, const char *str,
 static int
 test_bpf_filter_sanity(pcap_t *pcap)
 {
+	static const load_cbpf_program_t cbpf_program_loaders[] = {
+		load_cbpf_program_convert,
+		load_cbpf_program_direct,
+	};
+
 	const uint32_t plen = 100;
 	struct rte_mbuf mb, *m;
 	uint8_t tbuf[RTE_MBUF_DEFAULT_BUF_SIZE];
@@ -4500,15 +4543,17 @@ test_bpf_filter_sanity(pcap_t *pcap)
 		.dst_addr = rte_cpu_to_be_32(RTE_IPV4_BROADCAST),
 	};
 
-	if (test_bpf_match(pcap, "ip", m) != 0) {
-		printf("%s@%d: filter \"ip\" doesn't match test data\n",
-		       __func__, __LINE__);
-		return -1;
-	}
-	if (test_bpf_match(pcap, "not ip", m) == 0) {
-		printf("%s@%d: filter \"not ip\" does match test data\n",
-		       __func__, __LINE__);
-		return -1;
+	for (int li = 0; li != RTE_DIM(cbpf_program_loaders); ++li) {
+		if (test_bpf_match(pcap, "ip", m, cbpf_program_loaders[li]) != 0) {
+			printf("%s@%d: filter \"ip\" doesn't match test data\n",
+			       __func__, __LINE__);
+			return -1;
+		}
+		if (test_bpf_match(pcap, "not ip", m, cbpf_program_loaders[li]) == 0) {
+			printf("%s@%d: filter \"not ip\" does match test data\n",
+			       __func__, __LINE__);
+			return -1;
+		}
 	}
 
 	return 0;
@@ -4556,44 +4601,26 @@ static const char * const sample_filters[] = {
 };
 
 static int
-test_bpf_filter(pcap_t *pcap, const char *s)
+test_bpf_filter(pcap_t *pcap, const char *s, load_cbpf_program_t load_cbpf_program)
 {
 	struct bpf_program fcode;
-	struct rte_bpf_prm *prm = NULL;
-	struct rte_bpf *bpf = NULL;
+	struct rte_bpf *bpf;
 
 	if (pcap_compile(pcap, &fcode, s, 1, PCAP_NETMASK_UNKNOWN)) {
-		printf("%s@%d: pcap_compile('%s') failed: %s;\n",
+		printf("%s@%d: pcap_compile(\"%s\") failed: %s;\n",
 		       __func__, __LINE__, s, pcap_geterr(pcap));
 		return -1;
 	}
 
-	prm = rte_bpf_convert(&fcode);
-	if (prm == NULL) {
-		printf("%s@%d: bpf_convert('%s') failed,, error=%d(%s);\n",
-		       __func__, __LINE__, s, rte_errno, strerror(rte_errno));
-		goto error;
-	}
-
-	printf("bpf convert for \"%s\" produced:\n", s);
-	rte_bpf_dump(stdout, prm->ins, prm->nb_ins);
-
-	bpf = rte_bpf_load(prm);
+	bpf = load_cbpf_program(&fcode, s);
 	if (bpf == NULL) {
-		printf("%s@%d: failed to load bpf code, error=%d(%s);\n",
-			__func__, __LINE__, rte_errno, strerror(rte_errno));
-		goto error;
+		printf("%s@%d: failed to load cbpf program for \"%s\" , error=%d(%s);\n",
+			__func__, __LINE__, s, rte_errno, strerror(rte_errno));
+		test_bpf_dump(&fcode, NULL);
 	}
 
-error:
-	if (bpf)
-		rte_bpf_destroy(bpf);
-	else {
-		printf("%s \"%s\"\n", __func__, s);
-		test_bpf_dump(&fcode, prm);
-	}
+	rte_bpf_destroy(bpf);
 
-	rte_free(prm);
 	pcap_freecode(&fcode);
 	return (bpf == NULL) ? -1 : 0;
 }
@@ -4612,8 +4639,10 @@ test_bpf_convert(void)
 	}
 
 	rc = test_bpf_filter_sanity(pcap);
-	for (i = 0; i < RTE_DIM(sample_filters); i++)
-		rc |= test_bpf_filter(pcap, sample_filters[i]);
+	for (i = 0; i < RTE_DIM(sample_filters); i++) {
+		rc |= test_bpf_filter(pcap, sample_filters[i], load_cbpf_program_convert);
+		rc |= test_bpf_filter(pcap, sample_filters[i], load_cbpf_program_direct);
+	}
 
 	pcap_close(pcap);
 	return rc;
-- 
2.43.0


^ permalink raw reply related

* [PATCH v5 02/11] bpf: introduce extensible load API
From: Marat Khalili @ 2026-06-12  8:42 UTC (permalink / raw)
  To: Konstantin Ananyev, Wathsala Vithanage; +Cc: dev
In-Reply-To: <20260612084219.38399-1-marat.khalili@huawei.com>

Introduce new BPF load parameters struct rte_bpf_prm_ex that can be
extended without breaking backward or forward compatibility. Introduce
new function rte_bpf_load_ex consolidating in one code path loading from
both ELF file and raw memory image, with possibility to add more options
in the future.

Some changes in code layout and sequence:
* Both old APIs now only forwarding calls to a new single entry point.
* There is now a centralized cleanup point for all temporary resources
  created during the load process.
* External symbols (xsyms) are now checked for validity just after the
  load started, not after they were already used for relocation.
* File bpf_load_elf.c now only handles opening ELF file and providing
  patched instruction array to the load process. These are left as two
  separate functions to support other ELF sources like memory image in
  the future.
* Function stubs for the case libelf is not available are moved to
  bpf_load_elf.c to make keeping track of them easier (forgetting to
  update stubs is a common problem).

Signed-off-by: Marat Khalili <marat.khalili@huawei.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
---
 lib/bpf/bpf_exec.c      |  10 +--
 lib/bpf/bpf_impl.h      |  32 ++++++-
 lib/bpf/bpf_jit_arm64.c |  12 +--
 lib/bpf/bpf_jit_x86.c   |   8 +-
 lib/bpf/bpf_load.c      | 195 +++++++++++++++++++++++++++++++++++-----
 lib/bpf/bpf_load_elf.c  | 151 ++++++++++++++++++-------------
 lib/bpf/bpf_stub.c      |  17 ----
 lib/bpf/bpf_validate.c  |  32 +++----
 lib/bpf/meson.build     |   4 +-
 lib/bpf/rte_bpf.h       |  68 +++++++++++++-
 10 files changed, 392 insertions(+), 137 deletions(-)

diff --git a/lib/bpf/bpf_exec.c b/lib/bpf/bpf_exec.c
index 18013753b1..e4668ba10b 100644
--- a/lib/bpf/bpf_exec.c
+++ b/lib/bpf/bpf_exec.c
@@ -47,7 +47,7 @@
 		RTE_BPF_LOG_LINE(ERR, \
 			"%s(%p): division by 0 at pc: %#zx;", \
 			__func__, bpf, \
-			(uintptr_t)(ins) - (uintptr_t)(bpf)->prm.ins); \
+			(uintptr_t)(ins) - (uintptr_t)(bpf)->prm.raw.ins); \
 		return 0; \
 	} \
 } while (0)
@@ -81,7 +81,7 @@
 		RTE_BPF_LOG_LINE(ERR, \
 			"%s(%p): unsupported atomic operation at pc: %#zx;", \
 			__func__, bpf, \
-			(uintptr_t)(ins) - (uintptr_t)(bpf)->prm.ins); \
+			(uintptr_t)(ins) - (uintptr_t)(bpf)->prm.raw.ins); \
 		return 0; \
 	} \
 } while (0)
@@ -157,7 +157,7 @@ bpf_ld_mbuf(const struct rte_bpf *bpf, uint64_t reg[EBPF_REG_NUM],
 		RTE_BPF_LOG_LINE(DEBUG, "%s(bpf=%p, mbuf=%p, ofs=%u, len=%u): "
 			"load beyond packet boundary at pc: %#zx;",
 			__func__, bpf, mb, off, len,
-			(uintptr_t)(ins) - (uintptr_t)(bpf)->prm.ins);
+			(uintptr_t)(ins) - (uintptr_t)(bpf)->prm.raw.ins);
 	return p;
 }
 
@@ -166,7 +166,7 @@ bpf_exec(const struct rte_bpf *bpf, uint64_t reg[EBPF_REG_NUM])
 {
 	const struct ebpf_insn *ins;
 
-	for (ins = bpf->prm.ins; ; ins++) {
+	for (ins = bpf->prm.raw.ins; ; ins++) {
 		switch (ins->code) {
 		/* 32 bit ALU IMM operations */
 		case (BPF_ALU | BPF_ADD | BPF_K):
@@ -483,7 +483,7 @@ bpf_exec(const struct rte_bpf *bpf, uint64_t reg[EBPF_REG_NUM])
 			RTE_BPF_LOG_LINE(ERR,
 				"%s(%p): invalid opcode %#x at pc: %#zx;",
 				__func__, bpf, ins->code,
-				(uintptr_t)ins - (uintptr_t)bpf->prm.ins);
+				(uintptr_t)ins - (uintptr_t)bpf->prm.raw.ins);
 			return 0;
 		}
 	}
diff --git a/lib/bpf/bpf_impl.h b/lib/bpf/bpf_impl.h
index fb5ec3c4d6..1cee109bc9 100644
--- a/lib/bpf/bpf_impl.h
+++ b/lib/bpf/bpf_impl.h
@@ -11,17 +11,45 @@
 #define MAX_BPF_STACK_SIZE	0x200
 
 struct rte_bpf {
-	struct rte_bpf_prm prm;
+	struct rte_bpf_prm_ex prm;
 	struct rte_bpf_jit jit;
 	size_t sz;
 	uint32_t stack_sz;
 };
 
+/* Temporary copies etc. used by the load process. */
+struct __rte_bpf_load {
+	struct rte_bpf_prm_ex prm;
+
+	/* Loading ELF and applying relocations. */
+	int elf_fd;  /* ELF fd, must be negative (not zero) by default. */
+	void *elf;  /* Using void to avoid dependency on libelf. */
+
+	/* Value we are going to return, if any. */
+	struct rte_bpf *bpf;
+};
+
 /*
  * Use '__rte' prefix for non-static internal functions
  * to avoid potential name conflict with other libraries.
  */
-int __rte_bpf_validate(struct rte_bpf *bpf);
+
+/* Free temporary resources created by opening ELF. */
+void
+__rte_bpf_load_elf_cleanup(struct __rte_bpf_load *load);
+
+/* Open the ELF file. */
+int
+__rte_bpf_load_elf_file(struct __rte_bpf_load *load);
+
+/* Get code from ELF and apply relocations to it. */
+int
+__rte_bpf_load_elf_code(struct __rte_bpf_load *load);
+
+/* Validate final BPF code and calculate stack size. */
+int
+__rte_bpf_validate(const struct rte_bpf_prm_ex *prm, uint32_t *stack_sz);
+
 int __rte_bpf_jit(struct rte_bpf *bpf);
 int __rte_bpf_jit_x86(struct rte_bpf *bpf);
 int __rte_bpf_jit_arm64(struct rte_bpf *bpf);
diff --git a/lib/bpf/bpf_jit_arm64.c b/lib/bpf/bpf_jit_arm64.c
index 4bbb97da1b..9e5e142c13 100644
--- a/lib/bpf/bpf_jit_arm64.c
+++ b/lib/bpf/bpf_jit_arm64.c
@@ -111,12 +111,12 @@ jump_offset_init(struct a64_jit_ctx *ctx, struct rte_bpf *bpf)
 {
 	uint32_t i;
 
-	ctx->map = malloc(bpf->prm.nb_ins * sizeof(ctx->map[0]));
+	ctx->map = malloc(bpf->prm.raw.nb_ins * sizeof(ctx->map[0]));
 	if (ctx->map == NULL)
 		return -ENOMEM;
 
 	/* Fill with fake offsets */
-	for (i = 0; i != bpf->prm.nb_ins; i++) {
+	for (i = 0; i != bpf->prm.raw.nb_ins; i++) {
 		ctx->map[i].off = INT32_MAX;
 		ctx->map[i].off_to_b = 0;
 	}
@@ -1130,8 +1130,8 @@ check_program_has_call(struct a64_jit_ctx *ctx, struct rte_bpf *bpf)
 	uint8_t op;
 	uint32_t i;
 
-	for (i = 0; i != bpf->prm.nb_ins; i++) {
-		ins = bpf->prm.ins + i;
+	for (i = 0; i != bpf->prm.raw.nb_ins; i++) {
+		ins = bpf->prm.raw.ins + i;
 		op = ins->code;
 
 		switch (op) {
@@ -1168,10 +1168,10 @@ emit(struct a64_jit_ctx *ctx, struct rte_bpf *bpf)
 
 	emit_prologue(ctx);
 
-	for (i = 0; i != bpf->prm.nb_ins; i++) {
+	for (i = 0; i != bpf->prm.raw.nb_ins; i++) {
 
 		jump_offset_update(ctx, i);
-		ins = bpf->prm.ins + i;
+		ins = bpf->prm.raw.ins + i;
 		op = ins->code;
 		off = ins->off;
 		imm = ins->imm;
diff --git a/lib/bpf/bpf_jit_x86.c b/lib/bpf/bpf_jit_x86.c
index 88b1b5aeab..6f4235d434 100644
--- a/lib/bpf/bpf_jit_x86.c
+++ b/lib/bpf/bpf_jit_x86.c
@@ -1324,12 +1324,12 @@ emit(struct bpf_jit_state *st, const struct rte_bpf *bpf)
 
 	emit_prolog(st, bpf->stack_sz);
 
-	for (i = 0; i != bpf->prm.nb_ins; i++) {
+	for (i = 0; i != bpf->prm.raw.nb_ins; i++) {
 
 		st->idx = i;
 		st->off[i] = st->sz;
 
-		ins = bpf->prm.ins + i;
+		ins = bpf->prm.raw.ins + i;
 
 		dr = ebpf2x86[ins->dst_reg];
 		sr = ebpf2x86[ins->src_reg];
@@ -1532,13 +1532,13 @@ __rte_bpf_jit_x86(struct rte_bpf *bpf)
 
 	/* init state */
 	memset(&st, 0, sizeof(st));
-	st.off = malloc(bpf->prm.nb_ins * sizeof(st.off[0]));
+	st.off = malloc(bpf->prm.raw.nb_ins * sizeof(st.off[0]));
 	if (st.off == NULL)
 		return -ENOMEM;
 
 	/* fill with fake offsets */
 	st.exit.off = INT32_MAX;
-	for (i = 0; i != bpf->prm.nb_ins; i++)
+	for (i = 0; i != bpf->prm.raw.nb_ins; i++)
 		st.off[i] = INT32_MAX;
 
 	/*
diff --git a/lib/bpf/bpf_load.c b/lib/bpf/bpf_load.c
index b8a0426fe2..a6793b2c94 100644
--- a/lib/bpf/bpf_load.c
+++ b/lib/bpf/bpf_load.c
@@ -14,14 +14,14 @@
 #include "bpf_impl.h"
 
 static struct rte_bpf *
-bpf_load(const struct rte_bpf_prm *prm)
+bpf_load(const struct rte_bpf_prm_ex *prm)
 {
 	uint8_t *buf;
 	struct rte_bpf *bpf;
 	size_t sz, bsz, insz, xsz;
 
 	xsz =  prm->nb_xsym * sizeof(prm->xsym[0]);
-	insz = prm->nb_ins * sizeof(prm->ins[0]);
+	insz = prm->raw.nb_ins * sizeof(prm->raw.ins[0]);
 	bsz = sizeof(bpf[0]);
 	sz = insz + xsz + bsz;
 
@@ -37,10 +37,10 @@ bpf_load(const struct rte_bpf_prm *prm)
 
 	if (xsz > 0)
 		memcpy(buf + bsz, prm->xsym, xsz);
-	memcpy(buf + bsz + xsz, prm->ins, insz);
+	memcpy(buf + bsz + xsz, prm->raw.ins, insz);
 
 	bpf->prm.xsym = (void *)(buf + bsz);
-	bpf->prm.ins = (void *)(buf + bsz + xsz);
+	bpf->prm.raw.ins = (void *)(buf + bsz + xsz);
 
 	return bpf;
 }
@@ -80,37 +80,44 @@ bpf_check_xsym(const struct rte_bpf_xsym *xsym)
 	return 0;
 }
 
-RTE_EXPORT_SYMBOL(rte_bpf_load)
-struct rte_bpf *
-rte_bpf_load(const struct rte_bpf_prm *prm)
+static int
+bpf_check_xsyms(const struct rte_bpf_xsym *xsym, uint32_t nb_xsym)
 {
-	struct rte_bpf *bpf;
 	int32_t rc;
 	uint32_t i;
 
-	if (prm == NULL || prm->ins == NULL || prm->nb_ins == 0 ||
-			(prm->nb_xsym != 0 && prm->xsym == NULL)) {
-		rte_errno = EINVAL;
-		return NULL;
-	}
+	if (nb_xsym != 0 && xsym == NULL)
+		return -EINVAL;
 
 	rc = 0;
-	for (i = 0; i != prm->nb_xsym && rc == 0; i++)
-		rc = bpf_check_xsym(prm->xsym + i);
+	for (i = 0; i != nb_xsym && rc == 0; i++)
+		rc = bpf_check_xsym(xsym + i);
 
 	if (rc != 0) {
-		rte_errno = -rc;
 		RTE_BPF_LOG_FUNC_LINE(ERR, "%d-th xsym is invalid", i);
-		return NULL;
+		return rc;
 	}
 
+	return 0;
+}
+
+static int
+bpf_load_raw(struct __rte_bpf_load *load)
+{
+	const struct rte_bpf_prm_ex *const prm = &load->prm;
+	struct rte_bpf *bpf;
+	int32_t rc;
+
+	RTE_ASSERT(prm->origin == RTE_BPF_ORIGIN_RAW);
+
+	if (prm->raw.ins == NULL || prm->raw.nb_ins == 0)
+		return -EINVAL;
+
 	bpf = bpf_load(prm);
-	if (bpf == NULL) {
-		rte_errno = ENOMEM;
-		return NULL;
-	}
+	if (bpf == NULL)
+		return -ENOMEM;
 
-	rc = __rte_bpf_validate(bpf);
+	rc = __rte_bpf_validate(&load->prm, &bpf->stack_sz);
 	if (rc == 0) {
 		__rte_bpf_jit(bpf);
 		if (mprotect(bpf, bpf->sz, PROT_READ) != 0)
@@ -119,9 +126,151 @@ rte_bpf_load(const struct rte_bpf_prm *prm)
 
 	if (rc != 0) {
 		rte_bpf_destroy(bpf);
+		return rc;
+	}
+
+	load->bpf = bpf;
+	return 0;
+}
+
+RTE_EXPORT_SYMBOL(rte_bpf_load)
+struct rte_bpf *
+rte_bpf_load(const struct rte_bpf_prm *prm)
+{
+	if (prm == NULL) {
+		rte_errno = EINVAL;
+		return NULL;
+	}
+
+	return rte_bpf_load_ex(&(struct rte_bpf_prm_ex){
+			.sz = sizeof(struct rte_bpf_prm_ex),
+			.origin = RTE_BPF_ORIGIN_RAW,
+			.raw.ins = prm->ins,
+			.raw.nb_ins = prm->nb_ins,
+			.xsym = prm->xsym,
+			.nb_xsym = prm->nb_xsym,
+			.prog_arg = prm->prog_arg,
+		});
+}
+
+RTE_EXPORT_SYMBOL(rte_bpf_elf_load)
+struct rte_bpf *
+rte_bpf_elf_load(const struct rte_bpf_prm *prm, const char *fname,
+	const char *sname)
+{
+	if (prm == NULL) {
+		rte_errno = EINVAL;
+		return NULL;
+	}
+
+	return rte_bpf_load_ex(&(struct rte_bpf_prm_ex){
+			.sz = sizeof(struct rte_bpf_prm_ex),
+			.origin = RTE_BPF_ORIGIN_ELF_FILE,
+			.elf_file.path = fname,
+			.elf_file.section = sname,
+			.xsym = prm->xsym,
+			.nb_xsym = prm->nb_xsym,
+			.prog_arg = prm->prog_arg,
+		});
+}
+
+/*
+ * Check extensible opts for invalid size or non-zero unsupported members.
+ *
+ * This code provides forward compatibility with applications compiled against
+ * newer version of this library. `opts_sz` is the size of struct `opts` in the
+ * version used for compiling the application, read from the member `sz`;
+ * `type_sz` is the size of same struct in the version used for compiling the
+ * library.
+ *
+ * If new fields were added to the struct in the application version, `opts_sz`
+ * will be greater than `type_sz`. In this case we are making sure all bytes we
+ * don't know how to interpret are zeroes, that is any new features that are
+ * there are not being used.
+ *
+ * This function can be used to check any struct following this convention.
+ */
+static bool
+opts_valid(const void *opts, size_t opts_sz, size_t type_sz)
+{
+	if (opts == NULL)
+		return true;
+
+	if (opts_sz < sizeof(opts_sz))
+		/* Size of the struct is too small even for sz member. */
+		return false;
+
+	/* Verify that all extra bytes are zeroed. */
+	for (size_t offset = type_sz; offset < opts_sz; ++offset)
+		if (((const char *)opts)[offset] != 0)
+			return false;
+
+	return true;
+}
+
+static int
+load_try(struct __rte_bpf_load *load, const struct rte_bpf_prm_ex *app_prm)
+{
+	int rc;
+
+	if (app_prm == NULL || !opts_valid(app_prm, app_prm->sz, sizeof(load->prm)))
+		return -EINVAL;
+
+	/*
+	 * Convert extensible prm of application size to the size known to us.
+	 *
+	 * This code provides compatibility with applications compiled against
+	 * different version of this library. `app_prm->sz` is the size of
+	 * struct `rte_bpf_prm_ex` in the version used for compiling the
+	 * application; `sizeof(load->prm)` is the size of the same struct in
+	 * the version used for compiling the library.
+	 *
+	 * We are copying only the fields known to the application and leave
+	 * the rest filled with zeroes. Any features that not known to the
+	 * application will have backward-compatible default behaviour.
+	 */
+	memcpy(&load->prm, app_prm, RTE_MIN(app_prm->sz, sizeof(load->prm)));
+	load->prm.sz = sizeof(load->prm);
+
+	rc = bpf_check_xsyms(load->prm.xsym, load->prm.nb_xsym);
+
+	/* Convert prm origin to raw unless it already is. */
+	switch (load->prm.origin) {
+	case RTE_BPF_ORIGIN_RAW:
+		break;
+	case RTE_BPF_ORIGIN_ELF_FILE:
+		rc = rc < 0 ? rc : __rte_bpf_load_elf_file(load);
+		rc = rc < 0 ? rc : __rte_bpf_load_elf_code(load);
+		break;
+	default:
+		rc = rc < 0 ? rc : -EINVAL;
+	}
+
+	/* Now that it is raw load it as such. */
+	rc = rc < 0 ? rc : bpf_load_raw(load);
+
+	return rc;
+}
+
+RTE_EXPORT_EXPERIMENTAL_SYMBOL(rte_bpf_load_ex, 26.11)
+struct rte_bpf *
+rte_bpf_load_ex(const struct rte_bpf_prm_ex *prm)
+{
+	struct __rte_bpf_load load = { .elf_fd = -1 };
+
+	const int rc = load_try(&load, prm);
+
+	__rte_bpf_load_elf_cleanup(&load);
+
+	RTE_ASSERT((rc < 0) == (load.bpf == NULL));
+
+	if (rc < 0) {
+		RTE_BPF_LOG_FUNC_LINE(ERR, "failed, error code: %d", -rc);
 		rte_errno = -rc;
 		return NULL;
 	}
 
-	return bpf;
+	RTE_BPF_LOG_FUNC_LINE(INFO, "successfully creates %p(jit={.func=%p,.sz=%zu});",
+		load.bpf, load.bpf->jit.func, load.bpf->jit.sz);
+	return load.bpf;
 }
diff --git a/lib/bpf/bpf_load_elf.c b/lib/bpf/bpf_load_elf.c
index 2390823cbf..4ae7492351 100644
--- a/lib/bpf/bpf_load_elf.c
+++ b/lib/bpf/bpf_load_elf.c
@@ -2,6 +2,13 @@
  * Copyright(c) 2018 Intel Corporation
  */
 
+#include "bpf_impl.h"
+
+#include <errno.h>
+
+#ifdef RTE_LIBRTE_BPF_ELF
+
+#include <inttypes.h>
 #include <stdarg.h>
 #include <stdio.h>
 #include <string.h>
@@ -26,8 +33,6 @@
 #include <rte_byteorder.h>
 #include <rte_errno.h>
 
-#include "bpf_impl.h"
-
 /* To overcome compatibility issue */
 #ifndef EM_BPF
 #define	EM_BPF	247
@@ -56,7 +61,7 @@ bpf_find_xsym(const char *sn, enum rte_bpf_xtype type,
  */
 static int
 resolve_xsym(const char *sn, size_t ofs, struct ebpf_insn *ins, size_t ins_sz,
-	const struct rte_bpf_prm *prm)
+	const struct rte_bpf_prm_ex *prm)
 {
 	uint32_t idx, fidx;
 	enum rte_bpf_xtype type;
@@ -183,7 +188,7 @@ find_elf_code(Elf *elf, const char *section, Elf_Data **psd, size_t *pidx)
  */
 static int
 process_reloc(Elf *elf, size_t sym_idx, Elf64_Rel *re, size_t re_sz,
-	struct ebpf_insn *ins, size_t ins_sz, const struct rte_bpf_prm *prm)
+	struct ebpf_insn *ins, size_t ins_sz, const struct rte_bpf_prm_ex *prm)
 {
 	int32_t rc;
 	uint32_t i, n;
@@ -232,8 +237,8 @@ process_reloc(Elf *elf, size_t sym_idx, Elf64_Rel *re, size_t re_sz,
  * and update bpf code.
  */
 static int
-elf_reloc_code(Elf *elf, Elf_Data *ed, size_t sidx,
-	const struct rte_bpf_prm *prm)
+elf_reloc_code(Elf *elf, struct ebpf_insn *ins, size_t ins_sz, size_t sidx,
+	const struct rte_bpf_prm_ex *prm)
 {
 	Elf64_Rel *re;
 	Elf_Scn *sc;
@@ -256,7 +261,7 @@ elf_reloc_code(Elf *elf, Elf_Data *ed, size_t sidx,
 					sd->d_size % sizeof(re[0]) != 0)
 				return -EINVAL;
 			rc = process_reloc(elf, sh->sh_link,
-				sd->d_buf, sd->d_size, ed->d_buf, ed->d_size,
+				sd->d_buf, sd->d_size, ins, ins_sz,
 				prm);
 		}
 	}
@@ -264,72 +269,96 @@ elf_reloc_code(Elf *elf, Elf_Data *ed, size_t sidx,
 	return rc;
 }
 
-static struct rte_bpf *
-bpf_load_elf(const struct rte_bpf_prm *prm, int32_t fd, const char *section)
+void
+__rte_bpf_load_elf_cleanup(struct __rte_bpf_load *load)
 {
-	Elf *elf;
-	Elf_Data *sd;
-	size_t sidx;
-	int32_t rc;
-	struct rte_bpf *bpf;
-	struct rte_bpf_prm np;
+	elf_end(load->elf);
 
-	elf_version(EV_CURRENT);
-	elf = elf_begin(fd, ELF_C_READ, NULL);
+	if (load->elf_fd >= 0 && close(load->elf_fd) < 0) {
+		const int close_errno = errno;
+		RTE_BPF_LOG_FUNC_LINE(ERR, "error %d closing: %s",
+			close_errno, strerror(close_errno));
+	}
+}
 
-	rc = find_elf_code(elf, section, &sd, &sidx);
-	if (rc == 0)
-		rc = elf_reloc_code(elf, sd, sidx, prm);
+int
+__rte_bpf_load_elf_file(struct __rte_bpf_load *load)
+{
+	const struct rte_bpf_prm_ex *const prm = &load->prm;
 
-	if (rc == 0) {
-		np = prm[0];
-		np.ins = sd->d_buf;
-		np.nb_ins = sd->d_size / sizeof(struct ebpf_insn);
-		bpf = rte_bpf_load(&np);
-	} else {
-		bpf = NULL;
-		rte_errno = -rc;
+	RTE_ASSERT(prm->origin == RTE_BPF_ORIGIN_ELF_FILE);
+
+	if (prm->elf_file.path == NULL || prm->elf_file.section == NULL)
+		return -EINVAL;
+
+	if (elf_version(EV_CURRENT) == EV_NONE)
+		return -ENOTSUP;
+
+	load->elf_fd = open(prm->elf_file.path, O_RDONLY);
+	if (load->elf_fd < 0) {
+		const int open_errno = errno;
+		RTE_BPF_LOG_FUNC_LINE(ERR, "error %d opening \"%s\": %s",
+			open_errno, prm->elf_file.path, strerror(open_errno));
+		return -open_errno;
+	}
+
+	load->elf = elf_begin(load->elf_fd, ELF_C_READ, NULL);
+	if (load->elf == NULL) {
+		const int rc = elf_errno();
+		RTE_BPF_LOG_FUNC_LINE(ERR, "error %d opening ELF \"%s\": %s",
+			rc, prm->elf_file.path, elf_errmsg(rc));
+		return -EINVAL;
 	}
 
-	elf_end(elf);
-	return bpf;
+	return 0;
 }
 
-RTE_EXPORT_SYMBOL(rte_bpf_elf_load)
-struct rte_bpf *
-rte_bpf_elf_load(const struct rte_bpf_prm *prm, const char *fname,
-	const char *sname)
+int
+__rte_bpf_load_elf_code(struct __rte_bpf_load *load)
 {
-	int32_t fd, rc;
-	struct rte_bpf *bpf;
+	struct rte_bpf_prm_ex *const prm = &load->prm;
+	Elf_Data *sd;
+	size_t sidx;
+	int rc;
 
-	if (prm == NULL || fname == NULL || sname == NULL) {
-		rte_errno = EINVAL;
-		return NULL;
-	}
+	rc = find_elf_code(load->elf, prm->elf_file.section, &sd, &sidx);
+	if (rc < 0)
+		return rc;
 
-	fd = open(fname, O_RDONLY);
-	if (fd < 0) {
-		rc = errno;
-		RTE_BPF_LOG_LINE(ERR, "%s(%s) error code: %d(%s)",
-			__func__, fname, rc, strerror(rc));
-		rte_errno = EINVAL;
-		return NULL;
-	}
+	prm->origin = RTE_BPF_ORIGIN_RAW;
+	prm->raw.ins = sd->d_buf;
+	prm->raw.nb_ins = sd->d_size / sizeof(struct ebpf_insn);
 
-	bpf = bpf_load_elf(prm, fd, sname);
-	close(fd);
+	rc = elf_reloc_code(load->elf, sd->d_buf, sd->d_size, sidx, prm);
+	if (rc < 0)
+		return -EINVAL;
 
-	if (bpf == NULL) {
-		RTE_BPF_LOG_LINE(ERR,
-			"%s(fname=\"%s\", sname=\"%s\") failed, "
-			"error code: %d",
-			__func__, fname, sname, rte_errno);
-		return NULL;
-	}
+	return 0;
+}
+
+#else /* RTE_LIBRTE_BPF_ELF */
+
+void
+__rte_bpf_load_elf_cleanup(struct __rte_bpf_load *load)
+{
+	RTE_ASSERT(load->elf == NULL);
+	RTE_ASSERT(load->elf_fd < 0);
+}
 
-	RTE_BPF_LOG_LINE(INFO, "%s(fname=\"%s\", sname=\"%s\") "
-		"successfully creates %p(jit={.func=%p,.sz=%zu});",
-		__func__, fname, sname, bpf, bpf->jit.func, bpf->jit.sz);
-	return bpf;
+int
+__rte_bpf_load_elf_file(struct __rte_bpf_load *load)
+{
+	RTE_SET_USED(load);
+	RTE_BPF_LOG_FUNC_LINE(ERR, "not supported, rebuild with libelf installed");
+	return -ENOTSUP;
 }
+
+int
+__rte_bpf_load_elf_code(struct __rte_bpf_load *load)
+{
+	RTE_SET_USED(load);
+	RTE_BPF_LOG_FUNC_LINE(ERR, "not supported, rebuild with libelf installed");
+	return -ENOTSUP;
+}
+
+#endif /* RTE_LIBRTE_BPF_ELF */
diff --git a/lib/bpf/bpf_stub.c b/lib/bpf/bpf_stub.c
index e06e820d83..4c329832c2 100644
--- a/lib/bpf/bpf_stub.c
+++ b/lib/bpf/bpf_stub.c
@@ -10,23 +10,6 @@
  * Contains stubs for unimplemented public API functions
  */
 
-#ifndef RTE_LIBRTE_BPF_ELF
-RTE_EXPORT_SYMBOL(rte_bpf_elf_load)
-struct rte_bpf *
-rte_bpf_elf_load(const struct rte_bpf_prm *prm, const char *fname,
-	const char *sname)
-{
-	if (prm == NULL || fname == NULL || sname == NULL) {
-		rte_errno = EINVAL;
-		return NULL;
-	}
-
-	RTE_BPF_LOG_FUNC_LINE(ERR, "not supported, rebuild with libelf installed");
-	rte_errno = ENOTSUP;
-	return NULL;
-}
-#endif
-
 #ifndef RTE_HAS_LIBPCAP
 RTE_EXPORT_SYMBOL(rte_bpf_convert)
 struct rte_bpf_prm *
diff --git a/lib/bpf/bpf_validate.c b/lib/bpf/bpf_validate.c
index a7f4f576c9..5bfc59296d 100644
--- a/lib/bpf/bpf_validate.c
+++ b/lib/bpf/bpf_validate.c
@@ -80,7 +80,7 @@ struct evst_pool {
 };
 
 struct bpf_verifier {
-	const struct rte_bpf_prm *prm;
+	const struct rte_bpf_prm_ex *prm;
 	struct inst_node *in;
 	uint64_t stack_sz;
 	uint32_t nb_nodes;
@@ -1837,7 +1837,7 @@ add_edge(struct bpf_verifier *bvf, struct inst_node *node, uint32_t nidx)
 {
 	uint32_t ne;
 
-	if (nidx >= bvf->prm->nb_ins) {
+	if (nidx >= bvf->prm->raw.nb_ins) {
 		RTE_BPF_LOG_FUNC_LINE(ERR,
 			"program boundary violation at pc: %u, next pc: %u",
 			get_node_idx(bvf, node), nidx);
@@ -1946,10 +1946,10 @@ log_unreachable(const struct bpf_verifier *bvf)
 	struct inst_node *node;
 	const struct ebpf_insn *ins;
 
-	for (i = 0; i != bvf->prm->nb_ins; i++) {
+	for (i = 0; i != bvf->prm->raw.nb_ins; i++) {
 
 		node = bvf->in + i;
-		ins = bvf->prm->ins + i;
+		ins = bvf->prm->raw.ins + i;
 
 		if (node->colour == WHITE &&
 				ins->code != (BPF_LD | BPF_IMM | EBPF_DW))
@@ -1966,7 +1966,7 @@ log_loop(const struct bpf_verifier *bvf)
 	uint32_t i, j;
 	struct inst_node *node;
 
-	for (i = 0; i != bvf->prm->nb_ins; i++) {
+	for (i = 0; i != bvf->prm->raw.nb_ins; i++) {
 
 		node = bvf->in + i;
 		if (node->colour != BLACK)
@@ -1998,9 +1998,9 @@ validate(struct bpf_verifier *bvf)
 	const char *err;
 
 	rc = 0;
-	for (i = 0; i < bvf->prm->nb_ins; i++) {
+	for (i = 0; i < bvf->prm->raw.nb_ins; i++) {
 
-		ins = bvf->prm->ins + i;
+		ins = bvf->prm->raw.ins + i;
 		node = bvf->in + i;
 
 		err = check_syntax(ins);
@@ -2432,7 +2432,7 @@ evaluate(struct bpf_verifier *bvf)
 
 	bvf->evst->rv[EBPF_REG_10] = rvfp;
 
-	ins = bvf->prm->ins;
+	ins = bvf->prm->raw.ins;
 	node = bvf->in;
 	next = node;
 	rc = 0;
@@ -2522,23 +2522,23 @@ evaluate(struct bpf_verifier *bvf)
 }
 
 int
-__rte_bpf_validate(struct rte_bpf *bpf)
+__rte_bpf_validate(const struct rte_bpf_prm_ex *prm, uint32_t *stack_sz)
 {
 	int32_t rc;
 	struct bpf_verifier bvf;
 
 	/* check input argument type, don't allow mbuf ptr on 32-bit */
-	if (bpf->prm.prog_arg.type != RTE_BPF_ARG_RAW &&
-			bpf->prm.prog_arg.type != RTE_BPF_ARG_PTR &&
+	if (prm->prog_arg.type != RTE_BPF_ARG_RAW &&
+			prm->prog_arg.type != RTE_BPF_ARG_PTR &&
 			(sizeof(uint64_t) != sizeof(uintptr_t) ||
-			bpf->prm.prog_arg.type != RTE_BPF_ARG_PTR_MBUF)) {
+			prm->prog_arg.type != RTE_BPF_ARG_PTR_MBUF)) {
 		RTE_BPF_LOG_FUNC_LINE(ERR, "unsupported argument type");
 		return -ENOTSUP;
 	}
 
 	memset(&bvf, 0, sizeof(bvf));
-	bvf.prm = &bpf->prm;
-	bvf.in = calloc(bpf->prm.nb_ins, sizeof(bvf.in[0]));
+	bvf.prm = prm;
+	bvf.in = calloc(prm->raw.nb_ins, sizeof(bvf.in[0]));
 	if (bvf.in == NULL)
 		return -ENOMEM;
 
@@ -2555,11 +2555,11 @@ __rte_bpf_validate(struct rte_bpf *bpf)
 
 	/* copy collected info */
 	if (rc == 0) {
-		bpf->stack_sz = bvf.stack_sz;
+		*stack_sz = bvf.stack_sz;
 
 		/* for LD_ABS/LD_IND, we'll need extra space on the stack */
 		if (bvf.nb_ldmb_nodes != 0)
-			bpf->stack_sz = RTE_ALIGN_CEIL(bpf->stack_sz +
+			*stack_sz = RTE_ALIGN_CEIL(*stack_sz +
 				sizeof(uint64_t), sizeof(uint64_t));
 	}
 
diff --git a/lib/bpf/meson.build b/lib/bpf/meson.build
index 28df7f469a..4901b6ee14 100644
--- a/lib/bpf/meson.build
+++ b/lib/bpf/meson.build
@@ -19,6 +19,7 @@ sources = files('bpf.c',
         'bpf_dump.c',
         'bpf_exec.c',
         'bpf_load.c',
+        'bpf_load_elf.c',
         'bpf_pkt.c',
         'bpf_stub.c',
         'bpf_validate.c')
@@ -38,10 +39,9 @@ deps += ['mbuf', 'net', 'ethdev']
 dep = dependency('libelf', required: false, method: 'pkg-config')
 if dep.found()
     dpdk_conf.set('RTE_LIBRTE_BPF_ELF', 1)
-    sources += files('bpf_load_elf.c')
     ext_deps += dep
 else
-    warning('libelf is missing, rte_bpf_elf_load API will be disabled')
+    warning('libelf is missing, ELF API will be disabled')
 endif
 
 if dpdk_conf.has('RTE_HAS_LIBPCAP')
diff --git a/lib/bpf/rte_bpf.h b/lib/bpf/rte_bpf.h
index 309d84bc51..bf58a41819 100644
--- a/lib/bpf/rte_bpf.h
+++ b/lib/bpf/rte_bpf.h
@@ -86,7 +86,47 @@ struct rte_bpf_xsym {
 };
 
 /**
- * Input parameters for loading eBPF code.
+ * Possible origins of eBPF program code.
+ */
+enum rte_bpf_origin {
+	RTE_BPF_ORIGIN_RAW,		/**< code loaded from raw array */
+	RTE_BPF_ORIGIN_RESERVED,	/**< reserved for cBPF */
+	RTE_BPF_ORIGIN_ELF_FILE,	/**< code loaded from elf_file */
+};
+
+/**
+ * Input parameters for loading eBPF code, extensible version.
+ *
+ * Follows libbpf conventions for extensible structs.
+ */
+struct rte_bpf_prm_ex {
+	size_t sz;  /**< size of this struct for backward compatibility */
+
+	uint32_t flags;  /**< flags controlling eBPF load and other options */
+
+	enum rte_bpf_origin origin;  /**< origin of eBPF program code */
+
+	/** program origin parameters, member in use depends on origin */
+	union {
+		struct {
+			const struct ebpf_insn *ins;  /**< eBPF instructions */
+			uint32_t nb_ins;  /**< number of instructions in ins */
+		} raw;
+		struct {
+			const char *path;  /**< path to the ELF file */
+			const char *section;  /**< ELF section with the code */
+		} elf_file;
+	};
+
+	const struct rte_bpf_xsym *xsym;
+	/**< array of external symbols that eBPF code is allowed to reference */
+	uint32_t nb_xsym;  /**< number of elements in xsym */
+
+	struct rte_bpf_arg prog_arg;  /**< input arg description */
+};
+
+/**
+ * Input parameters for loading eBPF code, legacy version.
  */
 struct rte_bpf_prm {
 	const struct ebpf_insn *ins; /**< array of eBPF instructions */
@@ -116,6 +156,32 @@ struct rte_bpf;
 void
 rte_bpf_destroy(struct rte_bpf *bpf);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: This API may change, or be removed, without prior notice.
+ *
+ * Create a new eBPF execution context, load code from specified origin into it.
+ *
+ * @param prm
+ *   Parameters used to create and initialise the BPF execution context.
+ *
+ *   Member sz must be set to the struct size as known to the application.
+ *   If it exceeds the size known to the library, and the extra part has
+ *   non-zero bytes, parameter is rejected. If it's smaller than the size known
+ *   to the library, defaults are used for the members that are not present.
+ * @return
+ *   BPF handle that is used in future BPF operations,
+ *   or NULL on error, with error code set in rte_errno.
+ *   Possible rte_errno errors include:
+ *   - EINVAL  - invalid parameter passed to function
+ *   - ENOMEM  - can't reserve enough memory
+ *   - ENOTSUP - requested feature is not supported (e.g. no libelf to load ELF)
+ */
+__rte_experimental
+struct rte_bpf *
+rte_bpf_load_ex(const struct rte_bpf_prm_ex *prm)
+	__rte_malloc __rte_dealloc(rte_bpf_destroy, 1);
+
 /**
  * Create a new eBPF execution context and load given BPF code into it.
  *
-- 
2.43.0


^ permalink raw reply related

* [PATCH v5 05/11] bpf: support rte_bpf_prm_ex with port callbacks
From: Marat Khalili @ 2026-06-12  8:42 UTC (permalink / raw)
  To: Konstantin Ananyev; +Cc: dev
In-Reply-To: <20260612084219.38399-1-marat.khalili@huawei.com>

Introduce new functions to install an already loaded BPF program into RX
or TX port/queue, since previous API was tied to rte_bpf_prm.

Signed-off-by: Marat Khalili <marat.khalili@huawei.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
---
 lib/bpf/bpf_pkt.c        | 65 ++++++++++++++++++++++++++++++----------
 lib/bpf/rte_bpf_ethdev.h | 54 +++++++++++++++++++++++++++++++++
 2 files changed, 104 insertions(+), 15 deletions(-)

diff --git a/lib/bpf/bpf_pkt.c b/lib/bpf/bpf_pkt.c
index 5007f6aef5..87065e939f 100644
--- a/lib/bpf/bpf_pkt.c
+++ b/lib/bpf/bpf_pkt.c
@@ -490,13 +490,11 @@ rte_bpf_eth_tx_unload(uint16_t port, uint16_t queue)
 }
 
 static int
-bpf_eth_elf_load(struct bpf_eth_cbh *cbh, uint16_t port, uint16_t queue,
-	const struct rte_bpf_prm *prm, const char *fname, const char *sname,
-	uint32_t flags)
+bpf_eth_elf_install(struct bpf_eth_cbh *cbh, uint16_t port, uint16_t queue,
+	struct rte_bpf *bpf, uint32_t flags)
 {
 	int32_t rc;
 	struct bpf_eth_cbi *bc;
-	struct rte_bpf *bpf;
 	rte_rx_callback_fn frx;
 	rte_tx_callback_fn ftx;
 	struct rte_bpf_jit jit;
@@ -504,14 +502,17 @@ bpf_eth_elf_load(struct bpf_eth_cbh *cbh, uint16_t port, uint16_t queue,
 	frx = NULL;
 	ftx = NULL;
 
-	if (prm == NULL || rte_eth_dev_is_valid_port(port) == 0 ||
+	if (bpf == NULL || rte_eth_dev_is_valid_port(port) == 0 ||
 			queue >= RTE_MAX_QUEUES_PER_PORT)
 		return -EINVAL;
 
+	if (bpf->prm.nb_prog_arg != 1)
+		return -EINVAL;
+
 	if (cbh->type == BPF_ETH_RX)
-		frx = select_rx_callback(prm->prog_arg.type, flags);
+		frx = select_rx_callback(bpf->prm.prog_arg[0].type, flags);
 	else
-		ftx = select_tx_callback(prm->prog_arg.type, flags);
+		ftx = select_tx_callback(bpf->prm.prog_arg[0].type, flags);
 
 	if (frx == NULL && ftx == NULL) {
 		RTE_BPF_LOG_LINE(ERR, "%s(%u, %u): no callback selected;",
@@ -519,16 +520,11 @@ bpf_eth_elf_load(struct bpf_eth_cbh *cbh, uint16_t port, uint16_t queue,
 		return -EINVAL;
 	}
 
-	bpf = rte_bpf_elf_load(prm, fname, sname);
-	if (bpf == NULL)
-		return -rte_errno;
-
 	rte_bpf_get_jit(bpf, &jit);
 
 	if ((flags & RTE_BPF_ETH_F_JIT) != 0 && jit.func == NULL) {
 		RTE_BPF_LOG_LINE(ERR, "%s(%u, %u): no JIT generated;",
 			__func__, port, queue);
-		rte_bpf_destroy(bpf);
 		return -ENOTSUP;
 	}
 
@@ -551,7 +547,6 @@ bpf_eth_elf_load(struct bpf_eth_cbh *cbh, uint16_t port, uint16_t queue,
 
 	if (bc->cb == NULL) {
 		rc = -rte_errno;
-		rte_bpf_destroy(bpf);
 		bpf_eth_cbi_cleanup(bc);
 	} else
 		rc = 0;
@@ -564,13 +559,33 @@ int
 rte_bpf_eth_rx_elf_load(uint16_t port, uint16_t queue,
 	const struct rte_bpf_prm *prm, const char *fname, const char *sname,
 	uint32_t flags)
+{
+	struct rte_bpf *bpf;
+	int32_t rc;
+
+	bpf = rte_bpf_elf_load(prm, fname, sname);
+	if (bpf == NULL)
+		return -rte_errno;
+
+	rc = rte_bpf_eth_rx_install(port, queue, bpf, flags);
+
+	if (rc < 0)
+		rte_bpf_destroy(bpf);
+
+	return rc;
+}
+
+RTE_EXPORT_EXPERIMENTAL_SYMBOL(rte_bpf_eth_rx_install, 26.11)
+int
+rte_bpf_eth_rx_install(uint16_t port, uint16_t queue, struct rte_bpf *bpf,
+	uint32_t flags)
 {
 	int32_t rc;
 	struct bpf_eth_cbh *cbh;
 
 	cbh = &rx_cbh;
 	rte_spinlock_lock(&cbh->lock);
-	rc = bpf_eth_elf_load(cbh, port, queue, prm, fname, sname, flags);
+	rc = bpf_eth_elf_install(cbh, port, queue, bpf, flags);
 	rte_spinlock_unlock(&cbh->lock);
 
 	return rc;
@@ -581,13 +596,33 @@ int
 rte_bpf_eth_tx_elf_load(uint16_t port, uint16_t queue,
 	const struct rte_bpf_prm *prm, const char *fname, const char *sname,
 	uint32_t flags)
+{
+	struct rte_bpf *bpf;
+	int32_t rc;
+
+	bpf = rte_bpf_elf_load(prm, fname, sname);
+	if (bpf == NULL)
+		return -rte_errno;
+
+	rc = rte_bpf_eth_tx_install(port, queue, bpf, flags);
+
+	if (rc < 0)
+		rte_bpf_destroy(bpf);
+
+	return rc;
+}
+
+RTE_EXPORT_EXPERIMENTAL_SYMBOL(rte_bpf_eth_tx_install, 26.11)
+int
+rte_bpf_eth_tx_install(uint16_t port, uint16_t queue, struct rte_bpf *bpf,
+	uint32_t flags)
 {
 	int32_t rc;
 	struct bpf_eth_cbh *cbh;
 
 	cbh = &tx_cbh;
 	rte_spinlock_lock(&cbh->lock);
-	rc = bpf_eth_elf_load(cbh, port, queue, prm, fname, sname, flags);
+	rc = bpf_eth_elf_install(cbh, port, queue, bpf, flags);
 	rte_spinlock_unlock(&cbh->lock);
 
 	return rc;
diff --git a/lib/bpf/rte_bpf_ethdev.h b/lib/bpf/rte_bpf_ethdev.h
index 8c6dc0825f..e82e0c5f8c 100644
--- a/lib/bpf/rte_bpf_ethdev.h
+++ b/lib/bpf/rte_bpf_ethdev.h
@@ -109,6 +109,60 @@ rte_bpf_eth_tx_elf_load(uint16_t port, uint16_t queue,
 	const struct rte_bpf_prm *prm, const char *fname, const char *sname,
 	uint32_t flags);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: This API may change, or be removed, without prior notice.
+ *
+ * Install callback to execute specified BPF program on given RX port/queue.
+ *
+ * On success the ownership of the program passes to the library,
+ * rte_bpf_eth_unload must be used to unload it, and rte_bpf_destroy must no
+ * longer be called.
+ *
+ * @param port
+ *   The identifier of the ethernet port
+ * @param queue
+ *   The identifier of the RX queue on the given port
+ * @param bpf
+ *   BPF program
+ * @param flags
+ *   Flags that define expected behavior of the loaded filter
+ *   (i.e. jited/non-jited version to use).
+ * @return
+ *   Zero on successful completion or negative error code otherwise.
+ */
+__rte_experimental
+int
+rte_bpf_eth_rx_install(uint16_t port, uint16_t queue, struct rte_bpf *bpf,
+	uint32_t flags);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: This API may change, or be removed, without prior notice.
+ *
+ * Install callback to execute specified BPF program on given TX port/queue.
+ *
+ * On success the ownership of the program passes to the library,
+ * rte_bpf_eth_unload must be used to unload it, and rte_bpf_destroy must no
+ * longer be called.
+ *
+ * @param port
+ *   The identifier of the ethernet port
+ * @param queue
+ *   The identifier of the TX queue on the given port
+ * @param bpf
+ *   BPF program
+ * @param flags
+ *   Flags that define expected behavior of the loaded filter
+ *   (i.e. jited/non-jited version to use).
+ * @return
+ *   Zero on successful completion or negative error code otherwise.
+ */
+__rte_experimental
+int
+rte_bpf_eth_tx_install(uint16_t port, uint16_t queue, struct rte_bpf *bpf,
+	uint32_t flags);
+
 #ifdef __cplusplus
 }
 #endif
-- 
2.43.0


^ permalink raw reply related

* [PATCH v5 11/11] test/bpf: add tests for error handling contracts
From: Marat Khalili @ 2026-06-12  8:42 UTC (permalink / raw)
  To: Konstantin Ananyev; +Cc: dev
In-Reply-To: <20260612084219.38399-1-marat.khalili@huawei.com>

Verify NULL parameter rejection in load APIs, graceful failure on
argument/flag mismatch in burst execution APIs, and safe return of the
libpcap stub.

Signed-off-by: Marat Khalili <marat.khalili@huawei.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
---
 app/test/test_bpf.c | 128 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 127 insertions(+), 1 deletion(-)

diff --git a/app/test/test_bpf.c b/app/test/test_bpf.c
index 026ba18b75..6422bae6fe 100644
--- a/app/test/test_bpf.c
+++ b/app/test/test_bpf.c
@@ -3514,6 +3514,121 @@ run_test(const struct bpf_test *tst)
 
 }
 
+/* Test all eBPF load APIs with prm set to NULL. */
+static int
+test_bpf_load_null(void)
+{
+	struct rte_bpf *bpf;
+	int saved_errno;
+
+	rte_errno = 0;
+	bpf = rte_bpf_load(NULL);
+	saved_errno = rte_errno;
+	rte_bpf_destroy(bpf);
+	RTE_TEST_ASSERT_NULL(bpf, "rte_bpf_load(NULL) did not return NULL\n");
+	RTE_TEST_ASSERT_EQUAL(saved_errno, EINVAL,
+		"rte_bpf_load(NULL) did not set rte_errno to EINVAL\n");
+
+	rte_errno = 0;
+	bpf = rte_bpf_elf_load(NULL, "a", "b");
+	saved_errno = rte_errno;
+	rte_bpf_destroy(bpf);
+	RTE_TEST_ASSERT_NULL(bpf, "rte_bpf_elf_load(NULL, \"a\", \"b\") did not return NULL\n");
+	RTE_TEST_ASSERT_EQUAL(saved_errno, EINVAL,
+		"rte_bpf_elf_load(NULL, \"a\", \"b\") did not set rte_errno to EINVAL\n");
+
+	rte_errno = 0;
+	bpf = rte_bpf_load_ex(NULL);
+	saved_errno = rte_errno;
+	rte_bpf_destroy(bpf);
+	RTE_TEST_ASSERT_NULL(bpf, "rte_bpf_load_ex(NULL) did not return NULL\n");
+	RTE_TEST_ASSERT_EQUAL(saved_errno, EINVAL,
+		"rte_bpf_load_ex(NULL) did not set rte_errno to EINVAL\n");
+
+	return 0;
+}
+REGISTER_FAST_TEST(bpf_load_null_autotest, NOHUGE_OK, ASAN_OK, test_bpf_load_null);
+
+/* Test calling wrong API for execution of a multi-argument eBPF program. */
+static int
+test_bpf_exec_wrong_nb_prog_arg(void)
+{
+	static const struct ebpf_insn ins[] = {
+		{ .code = (EBPF_ALU64 | EBPF_MOV | BPF_K), .dst_reg = EBPF_REG_0, .imm = 0 },
+		{ .code = (BPF_JMP | EBPF_EXIT), }
+	};
+	static const struct rte_bpf_prm_ex prm = {
+		.sz = sizeof(struct rte_bpf_prm_ex),
+		.origin = RTE_BPF_ORIGIN_RAW,
+		.raw.ins = ins,
+		.raw.nb_ins = RTE_DIM(ins),
+		.prog_arg = {
+			{ .type = RTE_BPF_ARG_RAW, .size = sizeof(uint64_t) },
+			{ .type = RTE_BPF_ARG_RAW, .size = sizeof(uint64_t) },
+		},
+		.nb_prog_arg = 2, /* Intentionally mismatched: expects 2, burst gives 1 */
+	};
+
+	struct rte_bpf *bpf;
+	uint64_t rc[1];
+	void *ctx[1] = {NULL};
+	uint32_t result;
+	int saved_errno;
+
+	bpf = rte_bpf_load_ex(&prm);
+	RTE_TEST_ASSERT_NOT_NULL(bpf, "rte_bpf_load_ex failed\n");
+
+	rte_errno = 0;
+	result = rte_bpf_exec_burst(bpf, ctx, rc, 1);
+	saved_errno = rte_errno;
+	rte_bpf_destroy(bpf);
+	RTE_TEST_ASSERT_EQUAL(result, 0, "rte_bpf_exec_burst did not return 0\n");
+	RTE_TEST_ASSERT_EQUAL(saved_errno, EINVAL,
+		"rte_bpf_exec_burst did not set rte_errno to EINVAL\n");
+
+	return 0;
+}
+REGISTER_FAST_TEST(bpf_exec_wrong_nb_prog_arg_autotest, NOHUGE_OK, ASAN_OK,
+		test_bpf_exec_wrong_nb_prog_arg);
+
+/* Test passing unsupported flags when executing an eBPF program. */
+static int
+test_bpf_exec_wrong_flags(void)
+{
+	static const struct ebpf_insn ins[] = {
+		{ .code = (EBPF_ALU64 | EBPF_MOV | BPF_K), .dst_reg = EBPF_REG_0, .imm = 0 },
+		{ .code = (BPF_JMP | EBPF_EXIT), }
+	};
+	static const struct rte_bpf_prm_ex prm = {
+		.sz = sizeof(struct rte_bpf_prm_ex),
+		.origin = RTE_BPF_ORIGIN_RAW,
+		.raw.ins = ins,
+		.raw.nb_ins = RTE_DIM(ins),
+		.prog_arg = { { .type = RTE_BPF_ARG_RAW, .size = sizeof(uint64_t) } },
+		.nb_prog_arg = 1,
+	};
+
+	struct rte_bpf *bpf;
+	uint64_t rc[1];
+	struct rte_bpf_prog_ctx ctx_ex[1] = {};
+	uint32_t result;
+	int saved_errno;
+
+	bpf = rte_bpf_load_ex(&prm);
+	RTE_TEST_ASSERT_NOT_NULL(bpf, "rte_bpf_load_ex failed\n");
+
+	rte_errno = 0;
+	result = rte_bpf_exec_burst_ex(bpf, ctx_ex, rc, 1, UINT64_MAX);
+	saved_errno = rte_errno;
+	rte_bpf_destroy(bpf);
+	RTE_TEST_ASSERT_EQUAL(result, 0, "rte_bpf_exec_burst_ex did not return 0\n");
+	RTE_TEST_ASSERT_EQUAL(saved_errno, EINVAL,
+		"rte_bpf_exec_burst_ex did not set rte_errno to EINVAL\n");
+
+	return 0;
+}
+REGISTER_FAST_TEST(bpf_exec_wrong_flags_autotest, NOHUGE_OK, ASAN_OK, test_bpf_exec_wrong_flags);
+
 static int
 test_bpf(void)
 {
@@ -4444,7 +4559,18 @@ REGISTER_FAST_TEST(bpf_elf_autotest, NOHUGE_OK, ASAN_OK, test_bpf_elf);
 static int
 test_bpf_convert(void)
 {
-	printf("BPF convert RTE_HAS_LIBPCAP is undefined, skipping test\n");
+	int dummy = 0;
+	struct rte_bpf_prm *prm;
+
+	prm = rte_bpf_convert(NULL);
+	rte_free(prm);
+	RTE_TEST_ASSERT_NULL(prm, "rte_bpf_convert(NULL) without libpcap did not return NULL\n");
+
+	prm = rte_bpf_convert((const struct bpf_program *)&dummy);
+	rte_free(prm);
+	RTE_TEST_ASSERT_NULL(prm, "rte_bpf_convert(&dummy) without libpcap did not return NULL\n");
+
+	printf("BPF convert RTE_HAS_LIBPCAP is undefined, skipping full test\n");
 	return TEST_SKIPPED;
 }
 
-- 
2.43.0


^ permalink raw reply related

* [PATCH v2] app/testpmd: add padding mode to txonly engine
From: Xingui Yang @ 2026-06-12  9:12 UTC (permalink / raw)
  To: dev
  Cc: stephen, david.marchand, aman.deep.singh, fengchengwen,
	yangshuaisong, lihuisong, liuyonglong, kangfenglong

Add a new padding mode to the txonly forwarding engine, which allows
sending packets with configurable small sizes without standard L2/L3
headers. This is useful for testing NIC padding logic.

When padding mode is enabled via --tx-pkt-pad-mode flag:
- l2_len and l3_len are set to 0 instead of standard header lengths
- Packet data is filled with a static pattern instead of
  Ethernet/IP/UDP headers
- Minimum packet length validation is bypassed to allow small
  packet sizes (e.g., set txpkts 14)

Signed-off-by: Xingui Yang <yangxingui@huawei.com>
Signed-off-by: Huisong Li <lihuisong@huawei.com>
---
v2: Fix compilation exception of unterminated-string-initialization
---
 app/test-pmd/config.c     |  2 +-
 app/test-pmd/parameters.c |  7 +++++++
 app/test-pmd/testpmd.c    |  3 +++
 app/test-pmd/testpmd.h    |  1 +
 app/test-pmd/txonly.c     | 18 ++++++++++++++++--
 5 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 9d457ca88e..36b9b023e2 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -6341,7 +6341,7 @@ set_tx_pkt_segments(unsigned int *seg_lengths, unsigned int nb_segs)
 		}
 		tx_pkt_len = (uint16_t)(tx_pkt_len + seg_lengths[i]);
 	}
-	if (tx_pkt_len < (sizeof(struct rte_ether_hdr) + 20 + 8)) {
+	if (tx_pkt_len < (sizeof(struct rte_ether_hdr) + 20 + 8) && !tx_pkt_pad_mode) {
 		fprintf(stderr, "total packet length=%u < %d - give up\n",
 				(unsigned) tx_pkt_len,
 				(int)(sizeof(struct rte_ether_hdr) + 20 + 8));
diff --git a/app/test-pmd/parameters.c b/app/test-pmd/parameters.c
index 337d8fc8ac..8c3b1244e7 100644
--- a/app/test-pmd/parameters.c
+++ b/app/test-pmd/parameters.c
@@ -195,6 +195,8 @@ enum {
 	TESTPMD_OPT_TXONLY_MULTI_FLOW_NUM,
 #define TESTPMD_OPT_TXONLY_FLOWS "txonly-flows"
 	TESTPMD_OPT_TXONLY_FLOWS_NUM,
+#define TESTPMD_OPT_TX_PKT_PAD_MODE "tx-pkt-pad-mode"
+	TESTPMD_OPT_TX_PKT_PAD_MODE_NUM,
 #define TESTPMD_OPT_RXQ_SHARE "rxq-share"
 	TESTPMD_OPT_RXQ_SHARE_NUM,
 #define TESTPMD_OPT_ETH_LINK_SPEED "eth-link-speed"
@@ -351,6 +353,7 @@ static const struct option long_options[] = {
 	NO_ARG(TESTPMD_OPT_MULTI_RX_MEMPOOL),
 	NO_ARG(TESTPMD_OPT_TXONLY_MULTI_FLOW),
 	REQUIRED_ARG(TESTPMD_OPT_TXONLY_FLOWS),
+	NO_ARG(TESTPMD_OPT_TX_PKT_PAD_MODE),
 	NO_ARG(TESTPMD_OPT_RXQ_SHARE),
 	REQUIRED_ARG(TESTPMD_OPT_ETH_LINK_SPEED),
 	NO_ARG(TESTPMD_OPT_DISABLE_LINK_CHECK),
@@ -504,6 +507,7 @@ usage(char* progname)
 	printf("  --txonly-multi-flow: generate multiple flows in txonly mode\n");
 	printf("  --txonly-nb-flows=N: number of flows per lcore in txonly"
 	       " multi-flow mode (1-64, default 64)\n");
+	printf("  --tx-pkt-pad-mode: enable padding mode in txonly mode\n");
 	printf("  --tx-ip=src,dst: IP addresses in Tx-only mode\n");
 	printf("  --tx-udp=src[,dst]: UDP ports in Tx-only mode\n");
 	printf("  --eth-link-speed: force link speed.\n");
@@ -1577,6 +1581,9 @@ launch_args_parse(int argc, char** argv)
 			else
 				rte_exit(EXIT_FAILURE, "txonly-flows must be >= 1 and <= 64\n");
 			break;
+		case TESTPMD_OPT_TX_PKT_PAD_MODE_NUM:
+			tx_pkt_pad_mode = 1;
+			break;
 		case TESTPMD_OPT_RXQ_SHARE_NUM:
 			rxq_share = 1;
 			break;
diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index fcd8a90967..457bb6d3fe 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -296,6 +296,9 @@ uint32_t tx_pkt_times_inter;
 uint32_t tx_pkt_times_intra;
 /**< Timings for send scheduling in TXONLY mode, time between packets. */
 
+uint8_t tx_pkt_pad_mode;
+/**< Whether packet padding mode is enabled. */
+
 uint16_t nb_pkt_per_burst = DEF_PKT_BURST; /**< Number of packets per burst. */
 uint16_t nb_pkt_flowgen_clones; /**< Number of Tx packet clones to send in flowgen mode. */
 int nb_flows_flowgen = 1024; /**< Number of flows in flowgen mode. */
diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
index 3d4b36d668..04fdc2db42 100644
--- a/app/test-pmd/testpmd.h
+++ b/app/test-pmd/testpmd.h
@@ -663,6 +663,7 @@ extern uint16_t tx_pkt_seg_lengths[RTE_MAX_SEGS_PER_PKT]; /**< Seg. lengths */
 extern uint8_t  tx_pkt_nb_segs; /**< Number of segments in TX packets */
 extern uint32_t tx_pkt_times_intra;
 extern uint32_t tx_pkt_times_inter;
+extern uint8_t  tx_pkt_pad_mode;
 
 enum tx_pkt_split {
 	TX_PKT_SPLIT_OFF,
diff --git a/app/test-pmd/txonly.c b/app/test-pmd/txonly.c
index 64893fa205..2ddc100f21 100644
--- a/app/test-pmd/txonly.c
+++ b/app/test-pmd/txonly.c
@@ -192,8 +192,8 @@ pkt_burst_prepare(struct rte_mbuf *pkt, struct rte_mempool *mbp,
 	pkt->ol_flags |= ol_flags;
 	pkt->vlan_tci = vlan_tci;
 	pkt->vlan_tci_outer = vlan_tci_outer;
-	pkt->l2_len = sizeof(struct rte_ether_hdr);
-	pkt->l3_len = sizeof(struct rte_ipv4_hdr);
+	pkt->l2_len = tx_pkt_pad_mode ? 0 : sizeof(struct rte_ether_hdr);
+	pkt->l3_len = tx_pkt_pad_mode ? 0 : sizeof(struct rte_ipv4_hdr);
 
 	pkt_len = pkt->data_len;
 	pkt_seg = pkt;
@@ -204,6 +204,19 @@ pkt_burst_prepare(struct rte_mbuf *pkt, struct rte_mempool *mbp,
 		pkt_len += pkt_seg->data_len;
 	}
 	pkt_seg->next = NULL; /* Last segment of packet. */
+
+	if (tx_pkt_pad_mode) {
+		static const char pad_pattern[] = "0123456789abcdef";
+		uint32_t j;
+		char *pad;
+
+		pad = rte_pktmbuf_mtod(pkt, char *);
+		for (j = 0; j < pkt->data_len; j++)
+			pad[j] = pad_pattern[j % 16];
+
+		goto out;
+	}
+
 	/*
 	 * Copy headers in first packet segment(s).
 	 */
@@ -295,6 +308,7 @@ pkt_burst_prepare(struct rte_mbuf *pkt, struct rte_mempool *mbp,
 			sizeof(struct rte_ipv4_hdr) +
 			sizeof(pkt_udp_hdr));
 	}
+out:
 	/*
 	 * Complete first mbuf of packet and append it to the
 	 * burst of packets to be transmitted.
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH] net/af_packet: fix parsing of numeric device args
From: Bruce Richardson @ 2026-06-12  9:35 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev, Denis Sergeev, stable, John W. Linville
In-Reply-To: <20260603181306.459234-1-stephen@networkplumber.org>

On Wed, Jun 03, 2026 at 11:13:06AM -0700, Stephen Hemminger wrote:
> This driver has several numeric arguments but it was using
> atoi() which allows garbage and negative values.
> Convert to a helper using strtoul() with upper bound.
> 
> First found by Linux Verification Center (linuxtesting.org) with SVACE.
> 
> Reported-by: Denis Sergeev <denserg.edu@gmail.com>
> Fixes: 364e08f2bbc0 ("af_packet: add PMD for AF_PACKET-based virtual devices")
> Cc: stable@dpdk.org
> 
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>

^ permalink raw reply

* Re: [PATCH 15/15] doc: correct grammar and punctuation consistency issues
From: Radu Nicolau @ 2026-06-12 10:06 UTC (permalink / raw)
  To: Stephen Hemminger, dev
  Cc: Nicolas Chautru, Gowrishankar Muthukrishnan, Bruce Richardson,
	Akhil Goyal, Anatoly Burakov, Jingjing Wu, Rajesh Kumar,
	Cristian Dumitrescu, John McNamara
In-Reply-To: <20260611212119.1026721-16-stephen@networkplumber.org>


On 11-Jun-26 10:18 PM, Stephen Hemminger wrote:
> Correct grammar and punctuation issues across sample application guides:
> - Added missing comma after "To compile the sample application" in 11 files
> - Added missing period in cmd_line.rst compilation instruction
> - Fixed capitalization of "Linux" (was lowercase "linux") in 4 files
> - Fixed capitalization of "Ethernet" (was lowercase "ethernet") in 2 files
> - Fixed "then" to "than" in comparison contexts (2 instances)
> - Fixed subject-verb agreement: "specify" to "specifies"
>
> These changes ensure consistency with the style used in other documentation
> files and improve overall readability.
>
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> ---
Acked-by: Radu Nicolau <radu.nicolau@intel.com>

^ permalink raw reply

* Re: [PATCH 09/15] doc: improve IP reassembly, IPsec, multicast, and keep-alive
From: Radu Nicolau @ 2026-06-12 10:06 UTC (permalink / raw)
  To: Stephen Hemminger, dev; +Cc: Konstantin Ananyev, Akhil Goyal
In-Reply-To: <20260611212119.1026721-10-stephen@networkplumber.org>


On 11-Jun-26 10:18 PM, Stephen Hemminger wrote:
> Updated multiple networking sample application guides:
>
> ip_reassembly.rst:
> - Enhanced fragment reassembly process descriptions
> - Improved command-line parameter documentation
> - Fixed formatting and terminology consistency
>
> ipsec_secgw.rst:
> - Restructured configuration file format sections
> - Improved security association descriptions
> - Enhanced clarity of IPsec gateway operations
>
> ipv4_multicast.rst:
> - Simplified multicast forwarding explanations
> - Fixed formatting in code examples
> - Improved readability of routing descriptions
>
> keep_alive.rst:
> - Enhanced keep-alive mechanism descriptions
> - Fixed minor formatting issues
>
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> ---
Acked-by: Radu Nicolau <radu.nicolau@intel.com>

^ permalink raw reply

* [PATCH v3 00/25] bpf: test and fix issues in verifier
From: Marat Khalili @ 2026-06-12 10:47 UTC (permalink / raw)
  Cc: dev
In-Reply-To: <20260519093131.52022-1-marat.khalili@huawei.com>

This patchset addresses numerous bugs in the BPF verifier's abstract
interpretation logic and introduces a new validation debugger API to
enable precise, robust testing of the verifier itself.

While the existing DPDK eBPF verifier is capable of checking basic
execution graph loops and dead code, the mathematical tracking of
register bounds (both signed and unsigned) contained flaws resulting in
false positives and false negatives, undefined behavior, and hardware
exceptions such as SIGFPE during validation.

To resolve these issues and ensure they do not regress, this patchset
first introduces the "Validation Debugger API"
(`rte_bpf_validate_debug_*`). This gdb-like interface allows setting
breakpoints and catchpoints during the validation process to inspect the
verifier's internal state.

Using this new API, a comprehensive test harness
(`app/test/test_bpf_validate.c`) was created to formally check the
abstract domains of instructions across all their valid branches. The
remainder of the patchset incrementally fixes the math and bounds logic
for individual eBPF instructions, using the new tests to prove the
correctness of the fixes.

This debugger API also lays the foundation for an interactive eBPF
validation debugger to be introduced in the future.

Series-Depends-on: series-38434 ("bpf: introduce extensible load API")

v3:
* Rebased on v5 of the prerequisite series and updated Depends-on tags.
* Replaced a hardcoded compiler attribute with __rte_format_printf.

v2:
* Addressed AI reviewer comments:
  * replaced `false` and `true` with 0 and 1 in some API descriptions
    and invocations that multiplex boolean and negative error code;
  * made some previously implicit casts explicit;
  * moved new enum value to the end of the definition.
* Added Acked-by and Depends-on tags to all individual commits to
  align with patchwork requirements.
* Added Reported-by tags to fixes of issues discovered by Claudia Cauli
  using a formal methods framework.

Marat Khalili (25):
  bpf: format and dump jlt, jle, jslt, and jsle
  bpf: add format instruction function
  bpf/validate: break on error in evaluate
  bpf/validate: expand comments in evaluate cycle
  bpf/validate: introduce debugging interface
  bpf/validate: fix BPF_ADD of pointer to a scalar
  bpf/validate: fix BPF_LDX | EBPF_DW signed range
  test/bpf_validate: add setup and basic tests
  test/bpf_validate: add harness for pointer tests
  bpf/validate: fix EBPF_JSLT | BPF_X evaluation
  bpf/validate: fix BPF_NEG of INT64_MIN and 0
  bpf/validate: fix BPF_DIV and BPF_MOD signed part
  bpf/validate: fix BPF_MUL ranges minimum typo
  bpf/validate: fix BPF_MUL signed overflow UB
  bpf/validate: fix BPF_JGT/EBPF_JSGT no-jump max
  bpf/validate: fix BPF_JMP source range calculation
  bpf/validate: fix BPF_JMP empty range handling
  bpf/validate: fix BPF_AND min calculations
  bpf/validate: fix BPF_LSH shift-out-of-bounds UB
  bpf/validate: fix BPF_OR min calculations
  bpf/validate: fix BPF_SUB signed max zero case
  bpf/validate: fix BPF_XOR signed min calculation
  bpf/validate: prevent overflow when building graph
  doc: add release notes for BPF validation fixes
  doc: add BPF validate debug to programmer's guide

 app/test/meson.build                   |    1 +
 app/test/test_bpf.c                    |   99 ++
 app/test/test_bpf_validate.c           | 2271 ++++++++++++++++++++++++
 doc/guides/prog_guide/bpf_lib.rst      |   31 +
 doc/guides/rel_notes/release_26_07.rst |   16 +
 lib/bpf/bpf_dump.c                     |  292 +--
 lib/bpf/bpf_validate.c                 |  730 +++++++-
 lib/bpf/bpf_validate.h                 |   60 +
 lib/bpf/bpf_validate_debug.c           |  663 +++++++
 lib/bpf/bpf_validate_debug.h           |   86 +
 lib/bpf/bpf_value_set.c                |  403 +++++
 lib/bpf/bpf_value_set.h                |  126 ++
 lib/bpf/meson.build                    |    9 +-
 lib/bpf/rte_bpf.h                      |   55 +
 lib/bpf/rte_bpf_validate_debug.h       |  377 ++++
 15 files changed, 5022 insertions(+), 197 deletions(-)
 create mode 100644 app/test/test_bpf_validate.c
 create mode 100644 lib/bpf/bpf_validate.h
 create mode 100644 lib/bpf/bpf_validate_debug.c
 create mode 100644 lib/bpf/bpf_validate_debug.h
 create mode 100644 lib/bpf/bpf_value_set.c
 create mode 100644 lib/bpf/bpf_value_set.h
 create mode 100644 lib/bpf/rte_bpf_validate_debug.h

-- 
2.43.0

^ permalink raw reply

* [PATCH v3 01/25] bpf: format and dump jlt, jle, jslt, and jsle
From: Marat Khalili @ 2026-06-12 10:47 UTC (permalink / raw)
  To: Konstantin Ananyev; +Cc: dev
In-Reply-To: <20260612104743.6465-1-marat.khalili@huawei.com>

Signed and unsigned less and less-then conditional jumps were not
supported by the eBPF format and dump functions, add these instructions.

Signed-off-by: Marat Khalili <marat.khalili@huawei.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
---
Depends-on: series-38434 ("bpf: introduce extensible load API")
 lib/bpf/bpf_dump.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/bpf/bpf_dump.c b/lib/bpf/bpf_dump.c
index 91bc7c0a7af1..0abaeef8ae98 100644
--- a/lib/bpf/bpf_dump.c
+++ b/lib/bpf/bpf_dump.c
@@ -42,6 +42,8 @@ static const char *const jump_tbl[16] = {
 	[BPF_JSET >> 4] = "jset",  [EBPF_JNE >> 4] = "jne",
 	[EBPF_JSGT >> 4] = "jsgt", [EBPF_JSGE >> 4] = "jsge",
 	[EBPF_CALL >> 4] = "call", [EBPF_EXIT >> 4] = "exit",
+	[EBPF_JLT >> 4] = "jlt",   [EBPF_JLE >> 4] = "jle",
+	[EBPF_JSLT >> 4] = "jslt", [EBPF_JSLE >> 4] = "jsle",
 };
 
 static inline const char *
-- 
2.43.0


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox