DPDK-dev Archive on lore.kernel.org

DPDK-dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v15 3/5] vhost: refactor memory helper functions
From: pravin.bathija @ 2026-06-04 23:57 UTC (permalink / raw)
  To: dev, fengchengwen, stephen, maxime.coquelin
  Cc: pravin.bathija, thomas, Stephen Hemminger
In-Reply-To: <20260604235723.1046607-1-pravin.bathija@dell.com>

From: Pravin M Bathija <pravin.bathija@dell.com>

- Extract reusable helper routines for vhost-user backend memory
operations.
- Split DMA map/unmap into per-region logic.
- Decouple and rework memory region free routines.
- Iterate over VHOST_MEMORY_MAX_NREGIONS uniformly
across related functions to simplify code reuse.

Signed-off-by: Pravin M Bathija <pravin.bathija@dell.com>
Reviewed-by: Stephen Hemminger <stephen@networkplumber.com>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
---
 lib/vhost/vhost_user.c | 172 ++++++++++++++++++++++++++---------------
 1 file changed, 110 insertions(+), 62 deletions(-)

diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c
index 4bfb13fb98..94fca8b589 100644
--- a/lib/vhost/vhost_user.c
+++ b/lib/vhost/vhost_user.c
@@ -171,20 +171,27 @@ get_blk_size(int fd)
 	return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
 }
 
-static void
-async_dma_map(struct virtio_net *dev, bool do_map)
+static int
+async_dma_map_region(struct virtio_net *dev, struct rte_vhost_mem_region *reg, bool do_map)
 {
-	int ret = 0;
 	uint32_t i;
-	struct guest_page *page;
+	int ret;
+	uint64_t reg_start = reg->host_user_addr;
+	uint64_t reg_end = reg_start + reg->size;
 
-	if (do_map) {
-		for (i = 0; i < dev->nr_guest_pages; i++) {
-			page = &dev->guest_pages[i];
+	for (i = 0; i < dev->nr_guest_pages; i++) {
+		struct guest_page *page = &dev->guest_pages[i];
+
+		/* Only process pages belonging to this region */
+		if (page->host_user_addr < reg_start ||
+		    page->host_user_addr >= reg_end)
+			continue;
+
+		if (do_map) {
 			ret = rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
-							 page->host_user_addr,
-							 page->host_iova,
-							 page->size);
+					page->host_user_addr,
+					page->host_iova,
+					page->size);
 			if (ret) {
 				/*
 				 * DMA device may bind with kernel driver, in this case,
@@ -199,33 +206,57 @@ async_dma_map(struct virtio_net *dev, bool do_map)
 				 * normal case in async path. This is a workaround.
 				 */
 				if (rte_errno == ENODEV)
-					return;
+					return 0;
 
 				/* DMA mapping errors won't stop VHOST_USER_SET_MEM_TABLE. */
 				VHOST_CONFIG_LOG(dev->ifname, ERR, "DMA engine map failed");
+				return -1;
 			}
-		}
-
-	} else {
-		for (i = 0; i < dev->nr_guest_pages; i++) {
-			page = &dev->guest_pages[i];
+		} else {
 			ret = rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
-							   page->host_user_addr,
-							   page->host_iova,
-							   page->size);
+					page->host_user_addr,
+					page->host_iova,
+					page->size);
 			if (ret) {
 				/* like DMA map, ignore the kernel driver case when unmap. */
 				if (rte_errno == EINVAL)
-					return;
+					return 0;
 
 				VHOST_CONFIG_LOG(dev->ifname, ERR, "DMA engine unmap failed");
+				return -1;
 			}
 		}
 	}
+
+	return 0;
+}
+
+static void
+async_dma_map(struct virtio_net *dev, bool do_map)
+{
+	uint32_t i;
+	struct rte_vhost_mem_region *reg;
+
+	for (i = 0; i < VHOST_MEMORY_MAX_NREGIONS; i++) {
+		reg = &dev->mem->regions[i];
+		if (reg->host_user_addr == 0)
+			continue;
+		async_dma_map_region(dev, reg, do_map);
+	}
 }
 
 static void
-free_mem_region(struct virtio_net *dev)
+free_mem_region(struct rte_vhost_mem_region *reg)
+{
+	if (reg != NULL && reg->mmap_addr) {
+		munmap(reg->mmap_addr, reg->mmap_size);
+		close(reg->fd);
+		memset(reg, 0, sizeof(struct rte_vhost_mem_region));
+	}
+}
+
+static void
+free_all_mem_regions(struct virtio_net *dev)
 {
 	uint32_t i;
 	struct rte_vhost_mem_region *reg;
@@ -236,12 +267,10 @@ free_mem_region(struct virtio_net *dev)
 	if (dev->async_copy && rte_vfio_is_enabled("vfio"))
 		async_dma_map(dev, false);
 
-	for (i = 0; i < dev->mem->nregions; i++) {
+	for (i = 0; i < VHOST_MEMORY_MAX_NREGIONS; i++) {
 		reg = &dev->mem->regions[i];
-		if (reg->host_user_addr) {
-			munmap(reg->mmap_addr, reg->mmap_size);
-			close(reg->fd);
-		}
+		if (reg->mmap_addr)
+			free_mem_region(reg);
 	}
 }
 
@@ -255,7 +284,7 @@ vhost_backend_cleanup(struct virtio_net *dev)
 		vdpa_dev->ops->dev_cleanup(dev->vid);
 
 	if (dev->mem) {
-		free_mem_region(dev);
+		free_all_mem_regions(dev);
 		rte_free(dev->mem);
 		dev->mem = NULL;
 	}
@@ -704,7 +733,7 @@ numa_realloc(struct virtio_net **pdev, struct vhost_virtqueue **pvq)
 	vhost_devices[dev->vid] = dev;
 
 	mem_size = sizeof(struct rte_vhost_memory) +
-		sizeof(struct rte_vhost_mem_region) * dev->mem->nregions;
+		sizeof(struct rte_vhost_mem_region) * VHOST_MEMORY_MAX_NREGIONS;
 	mem = rte_realloc_socket(dev->mem, mem_size, 0, node);
 	if (!mem) {
 		VHOST_CONFIG_LOG(dev->ifname, ERR,
@@ -808,8 +837,10 @@ hua_to_alignment(struct rte_vhost_memory *mem, void *ptr)
 	uint32_t i;
 	uintptr_t hua = (uintptr_t)ptr;
 
-	for (i = 0; i < mem->nregions; i++) {
+	for (i = 0; i < VHOST_MEMORY_MAX_NREGIONS; i++) {
 		r = &mem->regions[i];
+		if (r->host_user_addr == 0)
+			continue;
 		if (hua >= r->host_user_addr &&
 			hua < r->host_user_addr + r->size) {
 			return get_blk_size(r->fd);
@@ -1382,6 +1413,52 @@ vhost_user_mmap_region(struct virtio_net *dev,
 	return 0;
 }
 
+static int
+vhost_user_initialize_memory(struct virtio_net **pdev)
+{
+	struct virtio_net *dev = *pdev;
+	int numa_node = SOCKET_ID_ANY;
+
+	if (dev->mem != NULL) {
+		VHOST_CONFIG_LOG(dev->ifname, ERR,
+			"memory already initialized, free it first");
+		return -1;
+	}
+
+	/*
+	 * If VQ 0 has already been allocated, try to allocate on the same
+	 * NUMA node. It can be reallocated later in numa_realloc().
+	 */
+	if (dev->nr_vring > 0)
+		numa_node = dev->virtqueue[0]->numa_node;
+
+	dev->nr_guest_pages = 0;
+	if (dev->guest_pages == NULL) {
+		dev->max_guest_pages = 8;
+		dev->guest_pages = rte_zmalloc_socket(NULL,
+					dev->max_guest_pages *
+					sizeof(struct guest_page),
+					RTE_CACHE_LINE_SIZE,
+					numa_node);
+		if (dev->guest_pages == NULL) {
+			VHOST_CONFIG_LOG(dev->ifname, ERR,
+				"failed to allocate memory for dev->guest_pages");
+			return -1;
+		}
+	}
+
+	dev->mem = rte_zmalloc_socket("vhost-mem-table", sizeof(struct rte_vhost_memory) +
+		sizeof(struct rte_vhost_mem_region) * VHOST_MEMORY_MAX_NREGIONS, 0, numa_node);
+	if (dev->mem == NULL) {
+		VHOST_CONFIG_LOG(dev->ifname, ERR, "failed to allocate memory for dev->mem");
+		rte_free(dev->guest_pages);
+		dev->guest_pages = NULL;
+		return -1;
+	}
+
+	return 0;
+}
+
 static int
 vhost_user_set_mem_table(struct virtio_net **pdev,
 			struct vhu_msg_context *ctx,
@@ -1390,7 +1467,6 @@ vhost_user_set_mem_table(struct virtio_net **pdev,
 	struct virtio_net *dev = *pdev;
 	struct VhostUserMemory *memory = &ctx->msg.payload.memory;
 	struct rte_vhost_mem_region *reg;
-	int numa_node = SOCKET_ID_ANY;
 	uint64_t mmap_offset;
 	uint32_t i;
 	bool async_notify = false;
@@ -1435,39 +1511,13 @@ vhost_user_set_mem_table(struct virtio_net **pdev,
 		if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
 			vhost_user_iotlb_flush_all(dev);
 
-		free_mem_region(dev);
+		free_all_mem_regions(dev);
 		rte_free(dev->mem);
 		dev->mem = NULL;
 	}
 
-	/*
-	 * If VQ 0 has already been allocated, try to allocate on the same
-	 * NUMA node. It can be reallocated later in numa_realloc().
-	 */
-	if (dev->nr_vring > 0)
-		numa_node = dev->virtqueue[0]->numa_node;
-
-	dev->nr_guest_pages = 0;
-	if (dev->guest_pages == NULL) {
-		dev->max_guest_pages = 8;
-		dev->guest_pages = rte_zmalloc_socket(NULL,
-					dev->max_guest_pages *
-					sizeof(struct guest_page),
-					RTE_CACHE_LINE_SIZE,
-					numa_node);
-		if (dev->guest_pages == NULL) {
-			VHOST_CONFIG_LOG(dev->ifname, ERR,
-				"failed to allocate memory for dev->guest_pages");
-			goto close_msg_fds;
-		}
-	}
-
-	dev->mem = rte_zmalloc_socket("vhost-mem-table", sizeof(struct rte_vhost_memory) +
-		sizeof(struct rte_vhost_mem_region) * memory->nregions, 0, numa_node);
-	if (dev->mem == NULL) {
-		VHOST_CONFIG_LOG(dev->ifname, ERR, "failed to allocate memory for dev->mem");
-		goto free_guest_pages;
-	}
+	if (vhost_user_initialize_memory(pdev) < 0)
+		goto close_msg_fds;
 
 	for (i = 0; i < memory->nregions; i++) {
 		reg = &dev->mem->regions[i];
@@ -1531,11 +1581,9 @@ vhost_user_set_mem_table(struct virtio_net **pdev,
 	return RTE_VHOST_MSG_RESULT_OK;
 
 free_mem_table:
-	free_mem_region(dev);
+	free_all_mem_regions(dev);
 	rte_free(dev->mem);
 	dev->mem = NULL;
-
-free_guest_pages:
 	rte_free(dev->guest_pages);
 	dev->guest_pages = NULL;
 close_msg_fds:
-- 
2.43.0


^ permalink raw reply related

* [PATCH v15 2/5] vhost: header defines for add/rem mem region
From: pravin.bathija @ 2026-06-04 23:57 UTC (permalink / raw)
  To: dev, fengchengwen, stephen, maxime.coquelin
  Cc: pravin.bathija, thomas, Stephen Hemminger
In-Reply-To: <20260604235723.1046607-1-pravin.bathija@dell.com>

From: Pravin M Bathija <pravin.bathija@dell.com>

This patch adds the vhost-user request enum values needed to support
adding and removing memory regions. The vhost-user front-end sends the
get max memory slots, add memory region and remove memory region
messages; these changes define them on the vhost-user back-end side.
The patch also defines the data structure describing the memory region
to be added or removed, and extends VhostUserMsg to include it.

Signed-off-by: Pravin M Bathija <pravin.bathija@dell.com>
Reviewed-by: Fengchengwen <fengchengwen@huawei.com>
Reviewed-by: Stephen Hemminger <stephen@networkplumber.com>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
---
 lib/vhost/vhost_user.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/lib/vhost/vhost_user.h b/lib/vhost/vhost_user.h
index ef486545ba..6435816534 100644
--- a/lib/vhost/vhost_user.h
+++ b/lib/vhost/vhost_user.h
@@ -67,6 +67,9 @@ typedef enum VhostUserRequest {
 	VHOST_USER_POSTCOPY_END = 30,
 	VHOST_USER_GET_INFLIGHT_FD = 31,
 	VHOST_USER_SET_INFLIGHT_FD = 32,
+	VHOST_USER_GET_MAX_MEM_SLOTS = 36,
+	VHOST_USER_ADD_MEM_REG = 37,
+	VHOST_USER_REM_MEM_REG = 38,
 	VHOST_USER_SET_STATUS = 39,
 	VHOST_USER_GET_STATUS = 40,
 } VhostUserRequest;
@@ -91,6 +94,11 @@ typedef struct VhostUserMemory {
 	VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
 } VhostUserMemory;
 
+typedef struct VhostUserMemRegMsg {
+	uint64_t padding;
+	VhostUserMemoryRegion region;
+} VhostUserMemRegMsg;
+
 typedef struct VhostUserLog {
 	uint64_t mmap_size;
 	uint64_t mmap_offset;
@@ -186,6 +194,7 @@ typedef struct __rte_packed_begin VhostUserMsg {
 		struct vhost_vring_state state;
 		struct vhost_vring_addr addr;
 		VhostUserMemory memory;
+		VhostUserMemRegMsg memreg;
 		VhostUserLog    log;
 		struct vhost_iotlb_msg iotlb;
 		VhostUserCryptoSessionParam crypto_session;
-- 
2.43.0


^ permalink raw reply related

* [PATCH v15 5/5] vhost: enable configure memory slots
From: pravin.bathija @ 2026-06-04 23:57 UTC (permalink / raw)
  To: dev, fengchengwen, stephen, maxime.coquelin
  Cc: pravin.bathija, thomas, Stephen Hemminger
In-Reply-To: <20260604235723.1046607-1-pravin.bathija@dell.com>

From: Pravin M Bathija <pravin.bathija@dell.com>

This patch enables the configure memory slots protocol feature by adding
VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS to the
VHOST_USER_PROTOCOL_FEATURES define.

Signed-off-by: Pravin M Bathija <pravin.bathija@dell.com>
Reviewed-by: Stephen Hemminger <stephen@networkplumber.com>
Reviewed-by: Fengchengwen <fengchengwen@huawei.com>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
---
 lib/vhost/vhost_user.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/vhost/vhost_user.h b/lib/vhost/vhost_user.h
index 6435816534..732aa4dc02 100644
--- a/lib/vhost/vhost_user.h
+++ b/lib/vhost/vhost_user.h
@@ -32,6 +32,7 @@
 					 (1ULL << VHOST_USER_PROTOCOL_F_BACKEND_SEND_FD) | \
 					 (1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER) | \
 					 (1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT) | \
+					 (1ULL << VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS) | \
 					 (1ULL << VHOST_USER_PROTOCOL_F_STATUS))
 
 typedef enum VhostUserRequest {
-- 
2.43.0


^ permalink raw reply related

* [PATCH v15 4/5] vhost: add mem region add/remove handlers
From: pravin.bathija @ 2026-06-04 23:57 UTC (permalink / raw)
  To: dev, fengchengwen, stephen, maxime.coquelin; +Cc: pravin.bathija, thomas
In-Reply-To: <20260604235723.1046607-1-pravin.bathija@dell.com>

From: Pravin M Bathija <pravin.bathija@dell.com>

Add support for VHOST_USER_ADD_MEM_REG, VHOST_USER_REM_MEM_REG and
VHOST_USER_GET_MAX_MEM_SLOTS. Refactor memory initialization into a
common helper and add supporting functions for dynamic memory management.

Signed-off-by: Pravin M Bathija <pravin.bathija@dell.com>
---
 lib/vhost/vhost_user.c | 253 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 253 insertions(+)

diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c
index 94fca8b589..b13c23ccf8 100644
--- a/lib/vhost/vhost_user.c
+++ b/lib/vhost/vhost_user.c
@@ -71,6 +71,9 @@ VHOST_MESSAGE_HANDLER(VHOST_USER_SET_FEATURES, vhost_user_set_features, false, t
 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_OWNER, vhost_user_set_owner, false, true) \
 VHOST_MESSAGE_HANDLER(VHOST_USER_RESET_OWNER, vhost_user_reset_owner, false, false) \
 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_MEM_TABLE, vhost_user_set_mem_table, true, true) \
+VHOST_MESSAGE_HANDLER(VHOST_USER_GET_MAX_MEM_SLOTS, vhost_user_get_max_mem_slots, false, false) \
+VHOST_MESSAGE_HANDLER(VHOST_USER_ADD_MEM_REG, vhost_user_add_mem_reg, true, true) \
+VHOST_MESSAGE_HANDLER(VHOST_USER_REM_MEM_REG, vhost_user_rem_mem_reg, false, true) \
 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_LOG_BASE, vhost_user_set_log_base, true, true) \
 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_LOG_FD, vhost_user_set_log_fd, true, true) \
 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_VRING_NUM, vhost_user_set_vring_num, false, true) \
@@ -1167,6 +1170,24 @@ add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg,
 	return 0;
 }
 
+static void
+remove_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg)
+{
+	uint64_t reg_start = reg->host_user_addr;
+	uint64_t reg_end = reg_start + reg->size;
+	uint32_t i, j = 0;
+
+	for (i = 0; i < dev->nr_guest_pages; i++) {
+		if (dev->guest_pages[i].host_user_addr >= reg_start &&
+		    dev->guest_pages[i].host_user_addr < reg_end)
+			continue;
+		if (j != i)
+			dev->guest_pages[j] = dev->guest_pages[i];
+		j++;
+	}
+	dev->nr_guest_pages = j;
+}
+
 #ifdef RTE_LIBRTE_VHOST_DEBUG
 /* TODO: enable it only in debug mode? */
 static void
@@ -1591,6 +1612,238 @@ vhost_user_set_mem_table(struct virtio_net **pdev,
 	return RTE_VHOST_MSG_RESULT_ERR;
 }
 
+
+static int
+vhost_user_get_max_mem_slots(struct virtio_net **pdev __rte_unused,
+			struct vhu_msg_context *ctx,
+			int main_fd __rte_unused)
+{
+	uint32_t max_mem_slots = VHOST_MEMORY_MAX_NREGIONS;
+
+	ctx->msg.payload.u64 = max_mem_slots;
+	ctx->msg.size = sizeof(ctx->msg.payload.u64);
+	ctx->fd_num = 0;
+
+	return RTE_VHOST_MSG_RESULT_REPLY;
+}
+
+/*
+ * Invalidate and re-translate all vring addresses after the memory table
+ * has been modified (add/remove region).
+ *
+ * translate_ring_addresses() may call numa_realloc(), which can reallocate
+ * the device structure.  The updated pointer is written back through *pdev
+ * so callers must refresh their local "dev" afterwards: dev = *pdev.
+ */
+static void
+vhost_user_invalidate_vrings(struct virtio_net **pdev)
+{
+	struct virtio_net *dev = *pdev;
+	uint32_t i;
+
+	for (i = 0; i < dev->nr_vring; i++) {
+		struct vhost_virtqueue *vq = dev->virtqueue[i];
+
+		if (!vq)
+			continue;
+
+		if (vq->desc || vq->avail || vq->used) {
+			vq_assert_lock(dev, vq);
+
+			vring_invalidate(dev, vq);
+
+			translate_ring_addresses(&dev, &vq);
+		}
+	}
+
+	*pdev = dev;
+}
+
+/*
+ * Macro wrapper that performs the compile-time lock assertion with the
+ * correct message ID at the call site, then calls the implementation.
+ */
+#define dev_invalidate_vrings(pdev, id) do { \
+	static_assert(id ## _LOCK_ALL_QPS, \
+		#id " handler is not declared as locking all queue pairs"); \
+	vhost_user_invalidate_vrings(pdev); \
+} while (0)
+
+static int
+vhost_user_add_mem_reg(struct virtio_net **pdev,
+			struct vhu_msg_context *ctx,
+			int main_fd __rte_unused)
+{
+	struct VhostUserMemoryRegion *region = &ctx->msg.payload.memreg.region;
+	struct virtio_net *dev = *pdev;
+	uint32_t i;
+
+	/* convert first region add to normal memory table set */
+	if (dev->mem == NULL) {
+		if (vhost_user_initialize_memory(pdev) < 0)
+			goto close_msg_fds;
+	}
+
+	/* make sure new region will fit */
+	if (dev->mem->nregions >= VHOST_MEMORY_MAX_NREGIONS) {
+		VHOST_CONFIG_LOG(dev->ifname, ERR, "too many memory regions already (%u)",
+									dev->mem->nregions);
+		goto close_msg_fds;
+	}
+
+	/* make sure supplied memory fd present */
+	if (ctx->fd_num != 1) {
+		VHOST_CONFIG_LOG(dev->ifname, ERR, "fd count makes no sense (%u)", ctx->fd_num);
+		goto close_msg_fds;
+	}
+
+	/* Make sure no overlap in guest virtual address space */
+	for (i = 0; i < dev->mem->nregions; i++) {
+		struct rte_vhost_mem_region *cur = &dev->mem->regions[i];
+		uint64_t cur_start = cur->guest_user_addr;
+		uint64_t cur_end = cur_start + cur->size - 1;
+		uint64_t new_start = region->userspace_addr;
+		uint64_t new_end = new_start + region->memory_size - 1;
+
+		if (new_end >= cur_start && new_start <= cur_end) {
+			VHOST_CONFIG_LOG(dev->ifname, ERR,
+				"requested memory region overlaps with another region");
+			VHOST_CONFIG_LOG(dev->ifname, ERR,
+				"\tRequested region address:0x%" PRIx64,
+				region->userspace_addr);
+			VHOST_CONFIG_LOG(dev->ifname, ERR,
+				"\tRequested region size:0x%" PRIx64,
+				region->memory_size);
+			VHOST_CONFIG_LOG(dev->ifname, ERR,
+				"\tOverlapping region address:0x%" PRIx64,
+				cur->guest_user_addr);
+			VHOST_CONFIG_LOG(dev->ifname, ERR,
+				"\tOverlapping region size:0x%" PRIx64,
+				cur->size);
+			goto close_msg_fds;
+		}
+	}
+
+	/* New region goes at the end of the contiguous array */
+	struct rte_vhost_mem_region *reg = &dev->mem->regions[dev->mem->nregions];
+
+	reg->guest_phys_addr = region->guest_phys_addr;
+	reg->guest_user_addr = region->userspace_addr;
+	reg->size            = region->memory_size;
+	reg->fd              = ctx->fds[0];
+	ctx->fds[0]          = -1;
+
+	if (vhost_user_mmap_region(dev, reg, region->mmap_offset) < 0) {
+		VHOST_CONFIG_LOG(dev->ifname, ERR, "failed to mmap region");
+		if (reg->mmap_addr) {
+			/* mmap succeeded but a later step (e.g. add_guest_pages)
+			 * failed; undo the mapping and any guest-page entries.
+			 */
+			remove_guest_pages(dev, reg);
+			free_mem_region(reg);
+		} else {
+			close(reg->fd);
+			reg->fd = -1;
+		}
+		goto close_msg_fds;
+	}
+
+	dev->mem->nregions++;
+
+	if (dev->async_copy && rte_vfio_is_enabled("vfio")) {
+		if (async_dma_map_region(dev, reg, true) < 0)
+			goto free_new_region_no_dma;
+	}
+
+	if (dev->postcopy_listening) {
+		/*
+		 * Cannot use vhost_user_postcopy_register() here because it
+		 * reads ctx->msg.payload.memory (SET_MEM_TABLE layout), but
+		 * ADD_MEM_REG uses the memreg payload.  Register the
+		 * single new region directly instead.
+		 */
+		if (vhost_user_postcopy_region_register(dev, reg) < 0)
+			goto free_new_region;
+	}
+
+	dev_invalidate_vrings(pdev, VHOST_USER_ADD_MEM_REG);
+	dev = *pdev;
+	dump_guest_pages(dev);
+
+	/* Reply with the back-end's mapping address per vhost-user spec */
+	ctx->msg.payload.memreg.region.userspace_addr = reg->host_user_addr;
+	ctx->msg.size = sizeof(ctx->msg.payload.memreg);
+	ctx->fd_num = 0;
+
+	return RTE_VHOST_MSG_RESULT_REPLY;
+
+free_new_region:
+	if (dev->async_copy && rte_vfio_is_enabled("vfio"))
+		async_dma_map_region(dev, reg, false);
+free_new_region_no_dma:
+	remove_guest_pages(dev, reg);
+	free_mem_region(reg);
+	dev->mem->nregions--;
+close_msg_fds:
+	close_msg_fds(ctx);
+	return RTE_VHOST_MSG_RESULT_ERR;
+}
+
+static int
+vhost_user_rem_mem_reg(struct virtio_net **pdev,
+			struct vhu_msg_context *ctx,
+			int main_fd __rte_unused)
+{
+	struct VhostUserMemoryRegion *region = &ctx->msg.payload.memreg.region;
+	struct virtio_net *dev = *pdev;
+	uint32_t i;
+
+	if (dev->mem == NULL || dev->mem->nregions == 0) {
+		VHOST_CONFIG_LOG(dev->ifname, ERR, "no memory regions to remove");
+		return RTE_VHOST_MSG_RESULT_ERR;
+	}
+
+	for (i = 0; i < dev->mem->nregions; i++) {
+		struct rte_vhost_mem_region *current_region = &dev->mem->regions[i];
+
+		/*
+		 * According to the vhost-user specification:
+		 * The memory region to be removed is identified by its GPA,
+		 * user address and size. The mmap offset is ignored.
+		 */
+		if (region->userspace_addr == current_region->guest_user_addr
+			&& region->guest_phys_addr == current_region->guest_phys_addr
+			&& region->memory_size == current_region->size) {
+			if (dev->async_copy && rte_vfio_is_enabled("vfio"))
+				async_dma_map_region(dev, current_region, false);
+			if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
+				vhost_user_iotlb_cache_remove(dev,
+					current_region->guest_phys_addr,
+					current_region->size);
+			remove_guest_pages(dev, current_region);
+			free_mem_region(current_region);
+
+			/* Compact the regions array to keep it contiguous */
+			if (i < dev->mem->nregions - 1) {
+				memmove(&dev->mem->regions[i],
+					&dev->mem->regions[i + 1],
+					(dev->mem->nregions - 1 - i) *
+					sizeof(struct rte_vhost_mem_region));
+				memset(&dev->mem->regions[dev->mem->nregions - 1],
+					0, sizeof(struct rte_vhost_mem_region));
+			}
+
+			dev->mem->nregions--;
+			dev_invalidate_vrings(pdev, VHOST_USER_REM_MEM_REG);
+			dev = *pdev;
+			return RTE_VHOST_MSG_RESULT_OK;
+		}
+	}
+
+	VHOST_CONFIG_LOG(dev->ifname, ERR, "failed to find region");
+	return RTE_VHOST_MSG_RESULT_ERR;
+}
+
 static bool
 vq_is_ready(struct virtio_net *dev, struct vhost_virtqueue *vq)
 {
-- 
2.43.0


^ permalink raw reply related

* [PATCH v2 0/5] net/bnxt: interrupt handling, external mbuf and stability fixes
From: Mohammad Shuab Siddique @ 2026-06-05  0:50 UTC (permalink / raw)
  To: dev; +Cc: kishore.padmanabha, stable, Mohammad Shuab Siddique
In-Reply-To: <20260603211341.2112344-1-Mohammad-Shuab.Siddique@broadcom.com>

From: Mohammad Shuab Siddique <mohammad-shuab.siddique@broadcom.com>

This series addresses interrupt handling, external memory, and crash bugs:

 - Fix incorrect completion validation for NQEs and RX completions causing
   excess interrupts
 - Use buf_addr instead of IOVA for mbufs from external memory pools
 - Skip IOVA range check for external mbuf head nodes to avoid false failures
 - Add null checks to prevent segfaults when accessing uninitialized structures
 - Fix segfault on exit when bonded ports are present, by checking whether
   ethdev has already freed the RX/TX queue arrays

All patches carry Fixes: tags and Cc: stable@dpdk.org.

Note: this series depends on series "net/bnxt: ULP stats timer and PTP".

Changes in v2:
 - Patch 1/5: replace printf() with PMD_DRV_LOG_LINE() (DPDK logging standard)
 - Patch 2/5: replace custom bnxt_mbuf_buf_addr() with rte_pktmbuf_mtod_offset()

Ajit Khaparde (2):
  net/bnxt: use buf address for external mbuf
  net/bnxt: prevent a potential segfault

Keegan Freyhof (2):
  net/bnxt: fix NQ/CQ processing for interrupt handling
  net/bnxt: fix for segmentation fault that would occur on exit

Mohammad Shuab Siddique (1):
  net/bnxt: fix IOVA range check for external mbuf head node

 drivers/net/bnxt/bnxt.h        |  2 +
 drivers/net/bnxt/bnxt_cpr.c    | 100 ++++++++++++++++++++++++++++++++++
 drivers/net/bnxt/bnxt_cpr.h    |  34 +++++++++++-
 drivers/net/bnxt/bnxt_ethdev.c |   3 ++
 drivers/net/bnxt/bnxt_hwrm.c   |   3 ++
 drivers/net/bnxt/bnxt_ring.c   |  11 +++-
 drivers/net/bnxt/bnxt_rxq.c    |  47 +++++++++++++++-
 drivers/net/bnxt/bnxt_rxr.c    |   2 +-
 drivers/net/bnxt/bnxt_stats.c  |  17 +++---
 drivers/net/bnxt/bnxt_txr.c    |  19 +++++--
 10 files changed, 223 insertions(+), 15 deletions(-)

-- 
2.47.3


^ permalink raw reply

* [PATCH v2 1/5] net/bnxt: fix NQ/CQ processing for interrupt handling
From: Mohammad Shuab Siddique @ 2026-06-05  0:50 UTC (permalink / raw)
  To: dev; +Cc: kishore.padmanabha, stable, Keegan Freyhof,
	Mohammad Shuab Siddique
In-Reply-To: <20260605005016.2290160-1-Mohammad-Shuab.Siddique@broadcom.com>

From: Keegan Freyhof <keegan.freyhof@broadcom.com>

Restructure CQ and NQ arming in NQ processing to be more readable.
Fix the incorrect completion validation used for NQEs and RX
completions in NQ processing.

Fix the issue of excess interrupts by properly tracking the valid
bit for NQs and using the correct RX completion validation code.

Fixes: 683e5cf79249 ("net/bnxt: use common NQ ring")
Cc: stable@dpdk.org
Signed-off-by: Keegan Freyhof <keegan.freyhof@broadcom.com>
Signed-off-by: Mohammad Shuab Siddique <mohammad-shuab.siddique@broadcom.com>
---
 drivers/net/bnxt/bnxt.h     |  2 +
 drivers/net/bnxt/bnxt_cpr.c | 100 +++++++++++++++++++++++++++++++++++++
 drivers/net/bnxt/bnxt_cpr.h |  34 ++++++++++++-
 drivers/net/bnxt/bnxt_rxq.c |  47 +++++++++++++++++-
 4 files changed, 180 insertions(+), 3 deletions(-)

diff --git a/drivers/net/bnxt/bnxt.h b/drivers/net/bnxt/bnxt.h
index 03df28e64a..f21753e40c 100644
--- a/drivers/net/bnxt/bnxt.h
+++ b/drivers/net/bnxt/bnxt.h
@@ -433,8 +433,10 @@ struct bnxt_coal {
 #define DBR_TYPE_SQ				(0x0ULL << 60)
 #define DBR_TYPE_SRQ				(0x2ULL << 60)
 #define DBR_TYPE_CQ				(0x4ULL << 60)
+#define DBR_TYPE_CQ_ARMALL			(0x6ULL << 60)
 #define DBR_TYPE_NQ				(0xaULL << 60)
 #define DBR_TYPE_NQ_ARM				(0xbULL << 60)
+#define DBR_TYPE_NQ_MASK			(0xeULL << 60)
 
 #define DB_PF_OFFSET			0x10000
 #define DB_VF_OFFSET			0x4000
diff --git a/drivers/net/bnxt/bnxt_cpr.c b/drivers/net/bnxt/bnxt_cpr.c
index c6606e19a8..60d1ec0b4a 100644
--- a/drivers/net/bnxt/bnxt_cpr.c
+++ b/drivers/net/bnxt/bnxt_cpr.c
@@ -13,6 +13,106 @@
 #include "hsi_struct_def_dpdk.h"
 #include "tfc_vf2pf_msg.h"
 
+void bnxt_process_async_msg(struct bnxt *bp, struct tx_cmpl *cmpl)
+{
+	uint16_t type = cmpl->flags_type & TX_CMPL_TYPE_MASK;
+
+	switch (type) {
+	case HWRM_CMPL_TYPE_HWRM_DONE:
+		break;
+	case HWRM_ASYNC_EVENT_CMPL_TYPE_HWRM_ASYNC_EVENT:
+		bnxt_handle_async_event(bp, (struct cmpl_base *)cmpl);
+		break;
+	default:
+		PMD_DRV_LOG_LINE(ERR, "Port: %d Unhandled async message %x",
+				 bp->eth_dev->data->port_id, type);
+		break;
+	}
+}
+
+void bnxt_process_nq(struct bnxt *bp, struct bnxt_cp_ring_info *nqr,
+		     struct bnxt_cp_ring_info *rx_cpr)
+{
+	struct nqe_cn *nqcmps = (struct nqe_cn *)nqr->cp_desc_ring;
+	uint32_t ring_mask = nqr->cp_ring_struct->ring_mask;
+	uint32_t raw_cons = nqr->cp_raw_cons;
+	uint16_t nq_type, nqe_cnt = 0;
+	bool v_bit = nqr->valid;
+	uint32_t cons = RING_CMPL(ring_mask, raw_cons);
+
+	while (1) {
+		if (!CMPL_VALID(&nqcmps[cons], v_bit)) {
+			if (nqe_cnt) {
+				nqr->cp_raw_cons = raw_cons;
+				nqr->valid = v_bit;
+			}
+			return;
+		}
+
+		nq_type = NQ_CN_TYPE_MASK & nqcmps[cons].type;
+
+		if (CMP_TYPE((struct cmpl_base *)&nqcmps[cons]) != NQ_CN_TYPE_CQ_NOTIFICATION)
+			bnxt_process_async_msg(bp, (struct tx_cmpl *)&nqcmps[cons]);
+		else
+			rx_cpr->toggle = NQE_CN_TOGGLE((uint64_t)nqcmps[cons].type);
+
+		NEXT_CMPL(nqr, cons, v_bit, 1);
+		raw_cons++;
+		if (nq_type)
+			nqe_cnt++;
+	}
+}
+
+/* Arms/disarms the given NQ for Thor/Thor2, with P5+ representing the ASIC family */
+void bnxt_arm_nq_p5p(struct bnxt_cp_ring_info *nqr, bool enable_irq)
+{
+	uint32_t raw_cons = nqr->cp_raw_cons;
+	uint64_t db_msg = 0;
+	uint32_t toggle = 0;
+
+	if (enable_irq == 1)
+		toggle = nqr->toggle;
+
+	db_msg = nqr->cp_db.db_key64 | (raw_cons & nqr->cp_db.db_ring_mask)
+		| DB_EPOCH(&nqr->cp_db, raw_cons) | DB_TOGGLE(toggle);
+
+	if (enable_irq)
+		db_msg |= DBR_TYPE_NQ_ARM;
+	else
+		db_msg |= DBR_TYPE_NQ_MASK;
+
+	rte_compiler_barrier();
+	rte_write64(db_msg, nqr->cp_db.doorbell);
+	rte_compiler_barrier();
+}
+
+/* Arms/disarms the given CQ for Thor/Thor2, with P5+ representing the ASIC family */
+void bnxt_arm_rx_cq_p5p(struct bnxt_cp_ring_info *cpr, bool enable_irq)
+{
+	uint32_t raw_cons = cpr->cp_raw_cons;
+	uint64_t db_msg = 0;
+	uint32_t toggle = 0;
+
+	if (raw_cons == UINT32_MAX)
+		raw_cons = 0;
+
+	if (enable_irq == 1)
+		toggle = cpr->toggle;
+
+	db_msg = cpr->cp_db.db_key64 | (raw_cons & cpr->cp_db.db_ring_mask)
+		| DB_EPOCH(&cpr->cp_db, raw_cons) | DB_TOGGLE(toggle);
+
+	if (enable_irq)
+		db_msg |= DBR_TYPE_CQ_ARMALL;
+	else
+		db_msg |= DBR_TYPE_CQ;
+
+	rte_compiler_barrier();
+	rte_write64(db_msg, cpr->cp_db.doorbell);
+	rte_compiler_barrier();
+}
+
+
 void bnxt_wait_for_device_shutdown(struct bnxt *bp)
 {
 	uint32_t val, timeout;
diff --git a/drivers/net/bnxt/bnxt_cpr.h b/drivers/net/bnxt/bnxt_cpr.h
index 858ee15c47..e097f1034f 100644
--- a/drivers/net/bnxt/bnxt_cpr.h
+++ b/drivers/net/bnxt/bnxt_cpr.h
@@ -30,6 +30,14 @@ struct bnxt_db_info;
 #define RING_CMPL(ring_mask, idx)	((idx) & (ring_mask))
 #define NEXT_CMP(idx)		RING_CMP(ADV_RAW_CMP(idx, 1))
 
+#define DBR_TOGGLE_SFT			25
+#define DB_TOGGLE(tgl)			((tgl) << DBR_TOGGLE_SFT)
+
+#define NQ_CN_TOGGLE_MASK              0xc0UL
+#define NQ_CN_TOGGLE_SFT               6
+#define NQE_CN_TOGGLE(type)            (((type) & NQ_CN_TOGGLE_MASK) >>        \
+					NQ_CN_TOGGLE_SFT)
+
 #define DB_CP_REARM_FLAGS	(DB_KEY_CP | DB_IDX_VALID)
 #define DB_CP_FLAGS		(DB_KEY_CP | DB_IDX_VALID | DB_IRQ_DIS)
 
@@ -73,6 +81,21 @@ struct bnxt_db_info {
 #define DB_RING_IDX(db, idx)	(((idx) & (db)->db_ring_mask) |		\
 				 DB_EPOCH(db, idx))
 
+struct nqe_cn {
+	rte_le16_t      type;
+	#define NQ_CN_TYPE_MASK                 0x3fUL
+	#define NQ_CN_TYPE_SFT                  0
+	#define NQ_CN_TYPE_CQ_NOTIFICATION      0x30UL
+	#define NQ_CN_TYPE_LAST                 NQ_CN_TYPE_CQ_NOTIFICATION
+	#define NQ_CN_TOGGLE_MASK               0xc0UL
+	#define NQ_CN_TOGGLE_SFT                6
+	rte_le16_t      reserved16;
+	rte_le32_t      cq_handle_low;
+	rte_le32_t      v;
+	#define NQ_CN_V                         0x1UL
+	rte_le32_t      cq_handle_high;
+};
+
 struct bnxt_ring;
 struct bnxt_cp_ring_info {
 	uint32_t		cp_raw_cons;
@@ -89,7 +112,9 @@ struct bnxt_cp_ring_info {
 	struct bnxt_ring	*cp_ring_struct;
 	bool			valid;
 	uint32_t                epoch;
-	uint8_t			dpi;
+	uint32_t		toggle;
+	uint8_t			dpi;  /* Doorbell page index for multi-doorbell support */
+
 };
 
 #define RX_CMP_L2_ERRORS						\
@@ -101,6 +126,11 @@ void bnxt_handle_fwd_req(struct bnxt *bp, struct cmpl_base *cmp);
 int bnxt_event_hwrm_resp_handler(struct bnxt *bp, struct cmpl_base *cmp);
 void bnxt_dev_reset_and_resume(void *arg);
 void bnxt_wait_for_device_shutdown(struct bnxt *bp);
+void bnxt_arm_nq_p5p(struct bnxt_cp_ring_info *nqr, bool enable_irq);
+void bnxt_arm_rx_cq_p5p(struct bnxt_cp_ring_info *cpr, bool enable_irq);
+void bnxt_process_async_msg(struct bnxt *bp, struct tx_cmpl *cmpl);
+void bnxt_process_nq(struct bnxt *bp, struct bnxt_cp_ring_info *nqr,
+		     struct bnxt_cp_ring_info *rx_cpr);
 
 #define EVENT_DATA1_REASON_CODE_FW_EXCEPTION_FATAL     \
 	HWRM_ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_REASON_CODE_FW_EXCEPTION_FATAL
@@ -152,4 +182,6 @@ bnxt_cpr_cmp_valid(const void *cmpl, uint32_t raw_cons, uint32_t ring_size)
 	}
 	return false;
 }
+
 #endif
+
diff --git a/drivers/net/bnxt/bnxt_rxq.c b/drivers/net/bnxt/bnxt_rxq.c
index 91b3555df6..b93f5043de 100644
--- a/drivers/net/bnxt/bnxt_rxq.c
+++ b/drivers/net/bnxt/bnxt_rxq.c
@@ -461,7 +461,14 @@ bnxt_rx_queue_intr_enable_op(struct rte_eth_dev *eth_dev, uint16_t queue_id)
 			return -EINVAL;
 
 		cpr = rxq->cp_ring;
-		B_CP_DB_REARM(cpr, cpr->cp_raw_cons);
+		if (BNXT_CHIP_P5_P7(bp)) {
+			struct bnxt_cp_ring_info *nqr = bp->rxtx_nq_ring;
+
+			bnxt_arm_nq_p5p(nqr, 1);
+			bnxt_arm_rx_cq_p5p(rxq->cp_ring, 1);
+		} else {
+			B_CP_DB_ARM(cpr);
+		}
 	}
 	return rc;
 }
@@ -484,7 +491,43 @@ bnxt_rx_queue_intr_disable_op(struct rte_eth_dev *eth_dev, uint16_t queue_id)
 			return -EINVAL;
 
 		cpr = rxq->cp_ring;
-		B_CP_DB_DISARM(cpr);
+		if (BNXT_CHIP_P5_P7(bp)) {
+			struct bnxt_cp_ring_info *nqr = bp->rxtx_nq_ring;
+			struct bnxt_cp_ring_info *rx_cpr;
+
+			bnxt_arm_nq_p5p(nqr, 0);
+			/* Loops through all RXQs and finds the one that
+			 * received a packet if any
+			 */
+			uint32_t cons = RING_CMPL(nqr->cp_ring_struct->ring_mask,
+					nqr->cp_raw_cons);
+			struct cmpl_base *cmpl = &((nqr->cp_desc_ring)[cons]);
+
+			if (CMPL_VALID(cmpl, nqr->valid)) {
+				unsigned int rxid = 0;
+
+				rx_cpr = rxq->cp_ring;
+				for (; rxid < bp->rx_nr_rings; rxid++) {
+					rxq = bp->rx_queues[rxid];
+					rx_cpr = rxq->cp_ring;
+					cons = RING_CMPL(rx_cpr->cp_ring_struct->ring_mask,
+							 rx_cpr->cp_raw_cons);
+					cmpl = &((rx_cpr->cp_desc_ring)[cons]);
+					/* Because the valid bit in the rx
+					 * completion queue is not updated the
+					 * cmp valid using raw cons is used
+					 */
+					if (bnxt_cpr_cmp_valid(cmpl,
+							       rx_cpr->cp_raw_cons,
+							       rx_cpr->cp_ring_struct->ring_size)) {
+						break;
+					}
+				}
+				bnxt_process_nq(bp, nqr, rx_cpr);
+			}
+		} else {
+			B_CP_DB_DISARM(cpr);
+		}
 	}
 	return rc;
 }
-- 
2.47.3


^ permalink raw reply related

* [PATCH v2 2/5] net/bnxt: use buf address for external mbuf
From: Mohammad Shuab Siddique @ 2026-06-05  0:50 UTC (permalink / raw)
  To: dev; +Cc: kishore.padmanabha, stable, Ajit Khaparde,
	Mohammad Shuab Siddique
In-Reply-To: <20260605005016.2290160-1-Mohammad-Shuab.Siddique@broadcom.com>

From: Ajit Khaparde <ajit.khaparde@broadcom.com>

Use buf_addr for mbufs from external pool instead of using iova
addresses. Uses rte_pktmbuf_mtod_offset() to compute the buffer
address for both the first segment and subsequent segments when
the mbuf is marked as RTE_MBUF_F_EXTERNAL.

Fixes: 42b883535804 ("net/bnxt: use new API to get IOVA address")
Cc: stable@dpdk.org
Signed-off-by: Ajit Khaparde <ajit.khaparde@broadcom.com>
Signed-off-by: Mohammad Shuab Siddique <mohammad-shuab.siddique@broadcom.com>
---
 drivers/net/bnxt/bnxt_txr.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/net/bnxt/bnxt_txr.c b/drivers/net/bnxt/bnxt_txr.c
index fb5be224d9..51b886c1a6 100644
--- a/drivers/net/bnxt/bnxt_txr.c
+++ b/drivers/net/bnxt/bnxt_txr.c
@@ -325,7 +325,10 @@ static int bnxt_start_xmit(struct rte_mbuf *tx_pkt,
 		txbd->flags_type |= TX_BD_LONG_FLAGS_LHINT_GTE2K;
 	else
 		txbd->flags_type |= lhint_arr[tx_pkt->pkt_len >> 9];
-	txbd->address = rte_cpu_to_le_64(rte_mbuf_data_iova(tx_pkt));
+	if (tx_pkt->ol_flags & RTE_MBUF_F_EXTERNAL)
+		txbd->address = rte_cpu_to_le_64(rte_pktmbuf_mtod_offset(tx_pkt, uint64_t, 0));
+	else
+		txbd->address = rte_cpu_to_le_64(rte_mbuf_data_iova(tx_pkt));
 	*last_txbd = txbd;
 
 	if (long_bd) {
@@ -482,7 +485,11 @@ static int bnxt_start_xmit(struct rte_mbuf *tx_pkt,
 		*tx_buf = m_seg;
 
 		txbd = &txr->tx_desc_ring[prod];
-		txbd->address = rte_cpu_to_le_64(rte_mbuf_data_iova(m_seg));
+		if (m_seg->ol_flags & RTE_MBUF_F_EXTERNAL)
+			txbd->address = rte_cpu_to_le_64(
+				rte_pktmbuf_mtod_offset(m_seg, uint64_t, 0));
+		else
+			txbd->address = rte_cpu_to_le_64(rte_mbuf_data_iova(m_seg));
 		txbd->flags_type = TX_BD_SHORT_TYPE_TX_BD_SHORT;
 		txbd->len = m_seg->data_len;
 
-- 
2.47.3


^ permalink raw reply related

* [PATCH v2 3/5] net/bnxt: fix IOVA range check for external mbuf head node
From: Mohammad Shuab Siddique @ 2026-06-05  0:50 UTC (permalink / raw)
  To: dev
  Cc: kishore.padmanabha, stable, Mohammad Shuab Siddique,
	Damodharam Ammepalli
In-Reply-To: <20260605005016.2290160-1-Mohammad-Shuab.Siddique@broadcom.com>

From: Mohammad Shuab Siddique <mohammad-shuab.siddique@broadcom.com>

For extmem, the mbuf head node may be allocated from a pool
different from the data buffer pool. In that case the IOVA
address check using rte_mempool_virt2iova() will fail. To fix
this, skip the IOVA range check for external mbufs.
Also add unlikely() in checking invalid mbuf size.

Fixes: d01de33f98e2 ("net/bnxt: skip IOVA range check for external mbuf")
Cc: stable@dpdk.org
Signed-off-by: Damodharam Ammepalli <damodharam.ammepalli@broadcom.com>
Signed-off-by: Mohammad Shuab Siddique <mohammad-shuab.siddique@broadcom.com>
---
 drivers/net/bnxt/bnxt_txr.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/bnxt/bnxt_txr.c b/drivers/net/bnxt/bnxt_txr.c
index 51b886c1a6..83d18e220b 100644
--- a/drivers/net/bnxt/bnxt_txr.c
+++ b/drivers/net/bnxt/bnxt_txr.c
@@ -220,8 +220,10 @@ static int bnxt_invalid_mbuf(struct rte_mbuf *mbuf)
 	if (unlikely(rte_mbuf_check(mbuf, 1, &reason)))
 		return -EINVAL;
 
-	if (unlikely(!(mbuf->ol_flags & RTE_MBUF_F_EXTERNAL) &&
-		     (mbuf->buf_iova < mbuf_size ||
+	if (unlikely(mbuf->ol_flags & RTE_MBUF_F_EXTERNAL))
+		return 0;
+
+	if (unlikely((mbuf->buf_iova < mbuf_size ||
 		      (mbuf->buf_iova != rte_mempool_virt2iova(mbuf) + mbuf_size))))
 		return -EINVAL;
 
-- 
2.47.3


^ permalink raw reply related

* [PATCH v2 4/5] net/bnxt: prevent a potential segfault
From: Mohammad Shuab Siddique @ 2026-06-05  0:50 UTC (permalink / raw)
  To: dev; +Cc: kishore.padmanabha, stable, Ajit Khaparde,
	Mohammad Shuab Siddique
In-Reply-To: <20260605005016.2290160-1-Mohammad-Shuab.Siddique@broadcom.com>

From: Ajit Khaparde <ajit.khaparde@broadcom.com>

Add checks to prevent a segfault while accessing members of
structures which may not have been initialized yet.

Fixes: 1b7ceba3e375 ("net/bnxt: support Rx queue count")
Cc: stable@dpdk.org
Signed-off-by: Ajit Khaparde <ajit.khaparde@broadcom.com>
Signed-off-by: Mohammad Shuab Siddique <mohammad-shuab.siddique@broadcom.com>
---
 drivers/net/bnxt/bnxt_ethdev.c |  3 +++
 drivers/net/bnxt/bnxt_hwrm.c   |  3 +++
 drivers/net/bnxt/bnxt_ring.c   | 11 ++++++++++-
 3 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/drivers/net/bnxt/bnxt_ethdev.c b/drivers/net/bnxt/bnxt_ethdev.c
index 72a31dfbb5..c45afdb20a 100644
--- a/drivers/net/bnxt/bnxt_ethdev.c
+++ b/drivers/net/bnxt/bnxt_ethdev.c
@@ -1435,6 +1435,9 @@ static int bnxt_scattered_rx(struct rte_eth_dev *eth_dev)
 	for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
 		struct bnxt_rx_queue *rxq = eth_dev->data->rx_queues[i];
 
+		if (rxq == NULL)
+			continue;
+
 		buf_size = (uint16_t)(rte_pktmbuf_data_room_size(rxq->mb_pool) -
 				      RTE_PKTMBUF_HEADROOM);
 		if (eth_dev->data->mtu + overhead > buf_size)
diff --git a/drivers/net/bnxt/bnxt_hwrm.c b/drivers/net/bnxt/bnxt_hwrm.c
index e4ae27d3f4..02a5d00738 100644
--- a/drivers/net/bnxt/bnxt_hwrm.c
+++ b/drivers/net/bnxt/bnxt_hwrm.c
@@ -3328,6 +3328,9 @@ bnxt_free_all_hwrm_stat_ctxs(struct bnxt *bp)
 
 	for (i = 0; i < bp->rx_cp_nr_rings; i++) {
 
+		if (bp->rx_queues[i] == NULL)
+			continue;
+
 		cpr = bp->rx_queues[i]->cp_ring;
 		if (BNXT_HAS_RING_GRPS(bp))
 			bp->grp_info[i].fw_stats_ctx = -1;
diff --git a/drivers/net/bnxt/bnxt_ring.c b/drivers/net/bnxt/bnxt_ring.c
index 579b73d2ce..064520aa62 100644
--- a/drivers/net/bnxt/bnxt_ring.c
+++ b/drivers/net/bnxt/bnxt_ring.c
@@ -723,6 +723,9 @@ static void bnxt_init_all_rings(struct bnxt *bp)
 
 	for (i = 0; i < bp->rx_cp_nr_rings; i++) {
 		rxq = bp->rx_queues[i];
+
+		if (rxq == NULL)
+			continue;
 		/* Rx-compl */
 		cp_ring = rxq->cp_ring->cp_ring_struct;
 		cp_ring->fw_ring_id = INVALID_HW_RING_ID;
@@ -765,9 +768,15 @@ int bnxt_alloc_hwrm_rings(struct bnxt *bp)
 	for (i = 0; i < bp->rx_cp_nr_rings; i++) {
 		unsigned int soc_id = bp->eth_dev->device->numa_node;
 		struct bnxt_rx_queue *rxq  = bp->rx_queues[i];
-		struct bnxt_rx_ring_info *rxr = rxq->rx_ring;
+		struct bnxt_rx_ring_info *rxr;
 		struct bnxt_ring *ring;
 
+
+		if (rxq == NULL)
+			return -EINVAL;
+
+		rxr = rxq->rx_ring;
+
 		if (bnxt_need_agg_ring(bp->eth_dev)) {
 			ring = rxr->ag_ring_struct;
 			if (ring == NULL) {
-- 
2.47.3


^ permalink raw reply related

* [PATCH v2 5/5] net/bnxt: fix for segmentation fault that would occur on exit
From: Mohammad Shuab Siddique @ 2026-06-05  0:50 UTC (permalink / raw)
  To: dev; +Cc: kishore.padmanabha, stable, Keegan Freyhof,
	Mohammad Shuab Siddique
In-Reply-To: <20260605005016.2290160-1-Mohammad-Shuab.Siddique@broadcom.com>

From: Keegan Freyhof <keegan.freyhof@broadcom.com>

When exiting DPDK while still having members in a bonded
port, the program would experience a segmentation fault due to the
bonding driver relying on the ethdev driver to free the rx and tx
queues of the bond members. The member ports would then try and
close using the bnxt driver, which would then try and access memory
freed by the ethdev driver causing the seen issue. The bnxt struct's
rx and tx queue pointers would still point to the freed memory,
while the ethdev pointer would reflect the status changes to the rx
and tx queues array and be nulled.
- Changed net/bnxt/bnxt_stats.c and net/bnxt/bnxt_rxr.c to check
that the rx queues had not already been freed by the ethdev driver
and changed net/bnxt/bnxt_stats.c and net/bnxt/bnxt_txr.c to check
that the tx queues had not already been freed by the ethdev driver.

Fixes: 898248fc4287 ("net/bnxt: support statistics query when port is stopped")
Cc: stable@dpdk.org
Signed-off-by: Keegan Freyhof <keegan.freyhof@broadcom.com>
Signed-off-by: Mohammad Shuab Siddique <mohammad-shuab.siddique@broadcom.com>
---
 drivers/net/bnxt/bnxt_rxr.c   |  2 +-
 drivers/net/bnxt/bnxt_stats.c | 17 +++++++++++------
 drivers/net/bnxt/bnxt_txr.c   |  3 +++
 3 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/drivers/net/bnxt/bnxt_rxr.c b/drivers/net/bnxt/bnxt_rxr.c
index 293b5c03b6..ab2175d21a 100644
--- a/drivers/net/bnxt/bnxt_rxr.c
+++ b/drivers/net/bnxt/bnxt_rxr.c
@@ -1491,7 +1491,7 @@ void bnxt_free_rx_rings(struct bnxt *bp)
 	int i;
 	struct bnxt_rx_queue *rxq;
 
-	if (!bp->rx_queues)
+	if (!bp->rx_queues || !bp->eth_dev->data->rx_queues)
 		return;
 
 	for (i = 0; i < (int)bp->rx_nr_rings; i++) {
diff --git a/drivers/net/bnxt/bnxt_stats.c b/drivers/net/bnxt/bnxt_stats.c
index 7b96cf0df9..49367588e4 100644
--- a/drivers/net/bnxt/bnxt_stats.c
+++ b/drivers/net/bnxt/bnxt_stats.c
@@ -547,15 +547,20 @@ void bnxt_free_stats(struct bnxt *bp)
 {
 	int i;
 
-	for (i = 0; i < (int)bp->tx_cp_nr_rings; i++) {
-		struct bnxt_tx_queue *txq = bp->tx_queues[i];
+	if (bp->tx_queues && bp->eth_dev->data->tx_queues) {
+		for (i = 0; i < (int)bp->tx_cp_nr_rings; i++) {
+			struct bnxt_tx_queue *txq = bp->tx_queues[i];
 
-		bnxt_free_txq_stats(txq);
+			bnxt_free_txq_stats(txq);
+		}
 	}
-	for (i = 0; i < (int)bp->rx_cp_nr_rings; i++) {
-		struct bnxt_rx_queue *rxq = bp->rx_queues[i];
 
-		bnxt_free_rxq_stats(rxq);
+	if (bp->rx_queues && bp->eth_dev->data->rx_queues) {
+		for (i = 0; i < (int)bp->rx_cp_nr_rings; i++) {
+			struct bnxt_rx_queue *rxq = bp->rx_queues[i];
+
+			bnxt_free_rxq_stats(rxq);
+		}
 	}
 }
 
diff --git a/drivers/net/bnxt/bnxt_txr.c b/drivers/net/bnxt/bnxt_txr.c
index 83d18e220b..d37a38735c 100644
--- a/drivers/net/bnxt/bnxt_txr.c
+++ b/drivers/net/bnxt/bnxt_txr.c
@@ -24,6 +24,9 @@ void bnxt_free_tx_rings(struct bnxt *bp)
 {
 	int i;
 
+	if (!bp->tx_queues || !bp->eth_dev->data->tx_queues)
+		return;
+
 	for (i = 0; i < (int)bp->tx_nr_rings; i++) {
 		struct bnxt_tx_queue *txq = bp->tx_queues[i];
 
-- 
2.47.3


^ permalink raw reply related

* [PATCH] dma/cnxk: fix crash on secondary process cleanup
From: pbhagavatula @ 2026-06-05  8:16 UTC (permalink / raw)
  To: jerinj, Vamsi Attunuru, Anatoly Burakov, Radha Mohan Chintakuntla
  Cc: dev, Pavan Nikhilesh, stable

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

cnxk_dmadev_probe() ran in secondary processes too, overwriting the
shared rdpi->pci_dev with a process-local pointer and marking the
device ready. With buses now cleaned up on shutdown, the primary's
roc_dpi_dev_fini() dereferences that stale pointer and crashes.

Skip HW init in secondary processes: attach to the shared device data
and return, leaving rdpi and the device state untouched.

Fixes: 53f6d7328bf4 ("dma/cnxk: create and initialize device on PCI probing")
Cc: stable@dpdk.org

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/dma/cnxk/cnxk_dmadev.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/dma/cnxk/cnxk_dmadev.c b/drivers/dma/cnxk/cnxk_dmadev.c
index 0dcebc1b0b3b..6ae7fdca3b1b 100644
--- a/drivers/dma/cnxk/cnxk_dmadev.c
+++ b/drivers/dma/cnxk/cnxk_dmadev.c
@@ -617,6 +617,9 @@ cnxk_dmadev_probe(struct rte_pci_driver *pci_drv __rte_unused, struct rte_pci_de
 	dmadev->fp_obj->dev_private = dpivf;
 	dmadev->dev_ops = &cnxk_dmadev_ops;
 
+	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
+		return 0;
+
 	dpivf->is_cn10k = roc_model_is_cn10k();
 	dpivf->mcs_lock = NULL;
 	rdpi = &dpivf->rdpi;
-- 
2.50.1 (Apple Git-155)


^ permalink raw reply related

* Re: [PATCH v15 4/5] vhost: add mem region add/remove handlers
From: Maxime Coquelin @ 2026-06-05 11:32 UTC (permalink / raw)
  To: pravin.bathija; +Cc: dev, fengchengwen, stephen, thomas
In-Reply-To: <20260604235723.1046607-5-pravin.bathija@dell.com>

On Fri, Jun 5, 2026 at 1:58 AM <pravin.bathija@dell.com> wrote:
>
> From: Pravin M Bathija <pravin.bathija@dell.com>
>
> Add support for VHOST_USER_ADD_MEM_REG, VHOST_USER_REM_MEM_REG and
> VHOST_USER_GET_MAX_MEM_SLOTS. Refactor memory initialization into
> common helper and add supporting functions for dynamic memory management.
>
> Signed-off-by: Pravin M Bathija <pravin.bathija@dell.com>
> ---
>  lib/vhost/vhost_user.c | 253 +++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 253 insertions(+)
>

Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>

Thanks,
Maxime


^ permalink raw reply

* [PATCH v3] net/iavf: fix VF reset race and stale ARQ message handling
From: Talluri Chaitanyababu @ 2026-06-05 12:36 UTC (permalink / raw)
  To: dev, bruce.richardson, aman.deep.singh, ciara.loftus
  Cc: shaiq.wani, Talluri Chaitanyababu, stable
In-Reply-To: <20260518054227.110701-1-chaitanyababux.talluri@intel.com>

During VF reset, multiple issues can lead to initialization
instability.

The first issue is a race condition in the VF-initiated reset path,
where VFR state VFACTIVE is treated as both "reset not started" and
"reset completed" in iavf_check_vf_reset_done(). When a VF initiates
a reset, the PF may not have begun processing it by the time
iavf_check_vf_reset_done() is called. Since VFACTIVE satisfies the
completion check, the VF proceeds before the PF has acknowledged the
reset, resulting in inconsistent initialization and virtchnl command
failures (e.g., OP_VERSION timeout).

The second issue is the presence of stale messages in the Admin
Receive Queue (ARQ) after VF reset. After the admin queue is
re-initialized during reset recovery, the PF may post responses to
pre-reset commands or unsolicited events. These may include opcode 0
(VIRTCHNL_OP_UNKNOWN) or responses to commands issued before reset,
which can interfere with API negotiation and cause command mismatch
errors.

Additionally, opcode 0 messages generate excessive warning logs,
causing unnecessary noise during initialization.

The solution involves:

1. Introducing a wait-for-reset-start helper that polls RSTAT until
   it leaves VFACTIVE. This helper is used in VF-initiated reset paths
   to ensure that the PF has started processing the reset before VF
   reinitialization proceeds. It is invoked from iavf_handle_hw_reset()
   for event-driven resets and from iavf_queues_req_reset() for
   queue-change-triggered resets. It is intentionally not used in
   iavf_dev_reset() to avoid redundant wait and unnecessary delay
   when reset completion is already confirmed.

2. Draining stale ARQ messages after admin queue initialization
   during reset recovery only (vf->in_reset_recovery == true).
   During initial device probe, the admin queue is freshly allocated
   and does not contain stale entries.

3. Downgrading opcode 0 (VIRTCHNL_OP_UNKNOWN) message logging to
   DEBUG level while preserving mismatch detection for other
   opcodes, allowing polling to continue until a valid response
   is received.

4. Refactoring reset-start detection and ARQ drain logic into helper
   functions (iavf_wait_for_reset_start() and iavf_drain_arq()) to
   improve readability and maintainability.

5. Introducing a short delay after triggering VF reset in
   iavf_dev_reset() to mitigate timing issues between VF
   reinitialization and PF reset processing. This helps avoid
   virtchnl command failures when PF reset completion is not yet
   fully synchronized.

This fix primarily targets VF-initiated reset handling, while ARQ
drain and opcode handling improvements also benefit PF-initiated
reset recovery scenarios.

Fixes: 1428895ad417 ("net/iavf: fix disabling of promiscuous modes on close")
Cc: stable@dpdk.org

Signed-off-by: Talluri Chaitanyababu <chaitanyababux.talluri@intel.com>
---

v3:
- Rebased the patch.
- Changed the Fixes tag.

v2:
- Removed iavf_wait_for_reset_start() from iavf_dev_reset() to avoid
  redundant wait and 1-second delay after reset completion.
- Added iavf_wait_for_reset_start() to iavf_queues_req_reset() to
  properly handle queue-change-triggered resets.
- Retained usage in iavf_handle_hw_reset() for VF-initiated reset flow.
- Restricted iavf_drain_arq() to reset recovery paths only.
- Added delay in iavf_dev_reset() to mitigate reset timing issues.
---
 drivers/net/intel/iavf/iavf_ethdev.c | 64 ++++++++++++++++++++++++++++
 drivers/net/intel/iavf/iavf_vchnl.c  | 16 +++++--
 2 files changed, 77 insertions(+), 3 deletions(-)

diff --git a/drivers/net/intel/iavf/iavf_ethdev.c b/drivers/net/intel/iavf/iavf_ethdev.c
index a8031e23a5..e87f3acf5f 100644
--- a/drivers/net/intel/iavf/iavf_ethdev.c
+++ b/drivers/net/intel/iavf/iavf_ethdev.c
@@ -106,6 +106,7 @@ static int iavf_dev_start(struct rte_eth_dev *dev);
 static int iavf_dev_stop(struct rte_eth_dev *dev);
 static int iavf_dev_close(struct rte_eth_dev *dev);
 static int iavf_dev_reset(struct rte_eth_dev *dev);
+static int iavf_wait_for_reset_start(struct iavf_hw *hw);
 static int iavf_dev_info_get(struct rte_eth_dev *dev,
 			     struct rte_eth_dev_info *dev_info);
 static const uint32_t *iavf_dev_supported_ptypes_get(struct rte_eth_dev *dev,
@@ -605,6 +606,7 @@ iavf_queues_req_reset(struct rte_eth_dev *dev, uint16_t num)
 	struct iavf_adapter *ad =
 		IAVF_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
 	struct iavf_info *vf =  IAVF_DEV_PRIVATE_TO_VF(ad);
+	struct iavf_hw *hw = IAVF_DEV_PRIVATE_TO_HW(ad);
 	int ret;
 
 	ret = iavf_request_queues(dev, num);
@@ -616,6 +618,8 @@ iavf_queues_req_reset(struct rte_eth_dev *dev, uint16_t num)
 			vf->vsi_res->num_queue_pairs, num);
 
 	iavf_dev_watchdog_disable(ad);
+	/* Wait for PF to start processing reset triggered by queue change */
+	iavf_wait_for_reset_start(hw);
 	ret = iavf_dev_reset(dev);
 	if (ret) {
 		PMD_DRV_LOG(ERR, "vf reset failed");
@@ -2086,6 +2090,30 @@ iavf_dev_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id)
 	return 0;
 }
 
+/* Wait until PF acknowledges VF reset (RSTAT leaves VFACTIVE) */
+static int
+iavf_wait_for_reset_start(struct iavf_hw *hw)
+{
+	int i;
+	uint32_t rstat;
+
+	for (i = 0; i < 100; i++) {
+		rte_delay_ms(10);
+
+		rstat = IAVF_READ_REG(hw, IAVF_VFGEN_RSTAT);
+		rstat &= IAVF_VFGEN_RSTAT_VFR_STATE_MASK;
+		rstat >>= IAVF_VFGEN_RSTAT_VFR_STATE_SHIFT;
+
+		if (rstat != VIRTCHNL_VFR_VFACTIVE)
+			return 0;
+	}
+
+	PMD_DRV_LOG(DEBUG, "VF reset did not start within timeout");
+	return -1;
+}
+
+static void iavf_drain_arq(struct iavf_hw *hw, struct iavf_info *vf);
+
 static int
 iavf_check_vf_reset_done(struct iavf_hw *hw)
 {
@@ -2618,6 +2646,30 @@ iavf_init_proto_xtr(struct rte_eth_dev *dev)
 	}
 }
 
+/* Drain stale Admin Receive Queue messages after reset */
+static void
+iavf_drain_arq(struct iavf_hw *hw, struct iavf_info *vf)
+{
+	struct iavf_arq_event_info event;
+	int drain_count = 0;
+
+	memset(&event, 0, sizeof(event));
+	event.msg_buf = vf->aq_resp;
+
+	while (drain_count < IAVF_AQ_LEN) {
+		event.buf_len = IAVF_AQ_BUF_SZ;
+
+		if (iavf_clean_arq_element(hw, &event, NULL) != IAVF_SUCCESS)
+			break;
+
+		drain_count++;
+	}
+
+	if (drain_count > 0)
+		PMD_INIT_LOG(DEBUG, "Drained %d stale ARQ messages",
+				drain_count);
+}
+
 static int
 iavf_init_vf(struct rte_eth_dev *dev)
 {
@@ -2654,6 +2706,10 @@ iavf_init_vf(struct rte_eth_dev *dev)
 		goto err;
 	}
 
+	/* Drain stale ARQ messages only during reset recovery */
+	if (vf->in_reset_recovery)
+		iavf_drain_arq(hw, vf);
+
 	if (iavf_check_api_version(adapter) != 0) {
 		PMD_INIT_LOG(ERR, "check_api version failed");
 		goto err_api;
@@ -3289,6 +3345,8 @@ iavf_dev_reset(struct rte_eth_dev *dev)
 	ret = iavf_dev_uninit(dev);
 	if (ret)
 		return ret;
+	/* Add delay before re-initialization */
+	rte_delay_ms(50);
 
 	return iavf_dev_init(dev);
 }
@@ -3352,6 +3410,7 @@ iavf_handle_hw_reset(struct rte_eth_dev *dev, bool vf_initiated_reset)
 {
 	struct iavf_info *vf = IAVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);
 	struct iavf_adapter *adapter = dev->data->dev_private;
+	struct iavf_hw *hw = IAVF_DEV_PRIVATE_TO_HW(dev->data->dev_private);
 	int ret;
 	bool restart_device = false;
 
@@ -3370,6 +3429,11 @@ iavf_handle_hw_reset(struct rte_eth_dev *dev, bool vf_initiated_reset)
 	vf->in_reset_recovery = true;
 	iavf_set_no_poll(adapter, false);
 
+	/* For VF-initiated reset, wait for PF to start processing it */
+	if (vf_initiated_reset)
+		if (iavf_wait_for_reset_start(hw) != 0)
+			PMD_DRV_LOG(WARNING, "PF did not acknowledge VF reset");
+
 	/* Call the pre reset callback */
 	if (vf->pre_reset_cb != NULL)
 		vf->pre_reset_cb(dev->data->port_id, vf->pre_reset_cb_arg);
diff --git a/drivers/net/intel/iavf/iavf_vchnl.c b/drivers/net/intel/iavf/iavf_vchnl.c
index 94ccfb5d6e..0a536a15e0 100644
--- a/drivers/net/intel/iavf/iavf_vchnl.c
+++ b/drivers/net/intel/iavf/iavf_vchnl.c
@@ -296,11 +296,21 @@ iavf_read_msg_from_pf(struct iavf_adapter *adapter, uint16_t buf_len,
 					__func__, vpe->event);
 		}
 	}  else {
-		/* async reply msg on command issued by vf previously */
+		/* Async reply for previously issued VF command.
+		 * Stale messages from before reset are ignored, and polling
+		 * continues until the expected response is received.
+		 */
 		result = IAVF_MSG_CMD;
 		if (opcode != vf->pend_cmd) {
-			PMD_DRV_LOG(WARNING, "command mismatch, expect %u, get %u",
-					vf->pend_cmd, opcode);
+			if (opcode == VIRTCHNL_OP_UNKNOWN)
+				PMD_DRV_LOG(DEBUG,
+					    "Ignoring stale msg (opcode 0), pending cmd %u",
+					    vf->pend_cmd);
+			else
+				PMD_DRV_LOG(WARNING,
+					    "command mismatch, expect %u, get %u",
+					    vf->pend_cmd, opcode);
+
 			result = IAVF_MSG_ERR;
 		}
 	}
-- 
2.43.0


^ permalink raw reply related

* [PATCH 1/3] fib6: fix tbl8 reservation drift in trie
From: Vladimir Medvedkin @ 2026-06-05 13:03 UTC (permalink / raw)
  To: dev; +Cc: maxime, stable
In-Reply-To: <20260522145855.1748406-2-maxime@leroys.fr>

From: Maxime Leroy <maxime@leroys.fr>

trie_modify() maintained rsvd_tbl8s by computing a depth_diff from
the current RIB topology at both ADD and DEL. The two values diverge
when the RIB changes between an ADD and its later DEL (a covering
parent added or removed), and rsvd_tbl8s eventually wraps to
UINT32_MAX, rejecting all subsequent /25+ ADDs with -ENOSPC.

A helper count_empty_levels() was added to fix the issue.

Fixes: c3e12e0f0354 ("fib: add dataplane algorithm for IPv6")
Cc: stable@dpdk.org

Signed-off-by: Maxime Leroy <maxime@leroys.fr>
Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
---
 lib/fib/trie.c          | 81 +++++++++++++++++++++--------------------
 lib/rib/rib6_internal.h | 24 ++++++++++++
 lib/rib/rte_rib6.c      | 12 +-----
 3 files changed, 66 insertions(+), 51 deletions(-)
 create mode 100644 lib/rib/rib6_internal.h

diff --git a/lib/fib/trie.c b/lib/fib/trie.c
index fa5d9ec6b0..e9f1141cef 100644
--- a/lib/fib/trie.c
+++ b/lib/fib/trie.c
@@ -15,6 +15,7 @@
 #include <rte_fib6.h>
 #include "fib_log.h"
 #include "trie.h"
+#include <rib6_internal.h>
 
 #ifdef CC_AVX512_SUPPORT
 
@@ -534,19 +535,45 @@ modify_dp(struct rte_trie_tbl *dp, struct rte_rib6 *rib,
 	return 0;
 }
 
+/*
+ * Count number of TBL8s that can be freed after deleting a prefix or allocated
+ * after adding a prefix.
+ */
+static uint8_t
+count_empty_levels(struct rte_rib6 *rib, const struct rte_ipv6_addr *ip, uint8_t depth)
+{
+	struct rte_rib6_node *cur = rte_rib6_lookup_exact(rib, ip, depth);
+	/* expect prefix exists */
+	if (cur == NULL)
+		return 0;
+
+	/* more specifics present */
+	if (cur->left != NULL || cur->right != NULL)
+		return 0;
+
+	struct rte_rib6_node *parent = cur->parent;
+	/* we know parent->depth is less than the target cur->depth
+	 * also, there exists tbl8 path up to RTE_ALIGN_CEIL(parent->depth, 8)
+	 */
+	depth = RTE_MAX(depth, 24);
+	uint8_t parent_depth = (parent) ? RTE_MAX(parent->depth, 24) : 24;
+	uint8_t depth_diff = (RTE_ALIGN_CEIL(depth, 8) - RTE_ALIGN_CEIL(parent_depth, 8)) >> 3;
+
+	return depth_diff;
+}
+
 int
 trie_modify(struct rte_fib6 *fib, const struct rte_ipv6_addr *ip,
 	uint8_t depth, uint64_t next_hop, int op)
 {
 	struct rte_trie_tbl *dp;
 	struct rte_rib6 *rib;
-	struct rte_rib6_node *tmp = NULL;
 	struct rte_rib6_node *node;
 	struct rte_rib6_node *parent;
-	struct rte_ipv6_addr ip_masked, tmp_ip;
+	struct rte_ipv6_addr ip_masked;
 	int ret = 0;
 	uint64_t par_nh, node_nh;
-	uint8_t tmp_depth, depth_diff = 0, parent_depth = 24;
+	uint8_t new_levels;
 
 	if ((fib == NULL) || (ip == NULL) || (depth > RTE_IPV6_MAX_DEPTH))
 		return -EINVAL;
@@ -559,37 +586,6 @@ trie_modify(struct rte_fib6 *fib, const struct rte_ipv6_addr *ip,
 	ip_masked = *ip;
 	rte_ipv6_addr_mask(&ip_masked, depth);
 
-	if (depth > 24) {
-		tmp = rte_rib6_get_nxt(rib, &ip_masked,
-			RTE_ALIGN_FLOOR(depth, 8), NULL,
-			RTE_RIB6_GET_NXT_ALL);
-		if (tmp && op == RTE_FIB6_DEL) {
-			/* in case of delete operation, skip the prefix we are going to delete */
-			rte_rib6_get_ip(tmp, &tmp_ip);
-			rte_rib6_get_depth(tmp, &tmp_depth);
-			if (rte_ipv6_addr_eq(&ip_masked, &tmp_ip) && depth == tmp_depth)
-				tmp = rte_rib6_get_nxt(rib, &ip_masked,
-					RTE_ALIGN_FLOOR(depth, 8), tmp, RTE_RIB6_GET_NXT_ALL);
-		}
-
-		if (tmp == NULL) {
-			tmp = rte_rib6_lookup(rib, ip);
-			/**
-			 * in case of delete operation, lookup returns the prefix
-			 * we are going to delete. Find the parent.
-			 */
-			if (tmp && op == RTE_FIB6_DEL)
-				tmp = rte_rib6_lookup_parent(tmp);
-
-			if (tmp != NULL) {
-				rte_rib6_get_depth(tmp, &tmp_depth);
-				parent_depth = RTE_MAX(tmp_depth, 24);
-			}
-			depth_diff = RTE_ALIGN_CEIL(depth, 8) -
-				RTE_ALIGN_CEIL(parent_depth, 8);
-			depth_diff = depth_diff >> 3;
-		}
-	}
 	node = rte_rib6_lookup_exact(rib, &ip_masked, depth);
 	switch (op) {
 	case RTE_FIB6_ADD:
@@ -603,12 +599,16 @@ trie_modify(struct rte_fib6 *fib, const struct rte_ipv6_addr *ip,
 			return 0;
 		}
 
-		if ((depth > 24) && (dp->rsvd_tbl8s + depth_diff > dp->number_tbl8s))
-			return -ENOSPC;
-
 		node = rte_rib6_insert(rib, &ip_masked, depth);
 		if (node == NULL)
 			return -rte_errno;
+
+		new_levels = count_empty_levels(rib, &ip_masked, depth);
+		if (dp->rsvd_tbl8s + new_levels > dp->number_tbl8s) {
+			rte_rib6_remove(rib, &ip_masked, depth);
+			return -ENOSPC;
+		}
+
 		rte_rib6_set_nh(node, next_hop);
 		parent = rte_rib6_lookup_parent(node);
 		if (parent != NULL) {
@@ -622,7 +622,7 @@ trie_modify(struct rte_fib6 *fib, const struct rte_ipv6_addr *ip,
 			return ret;
 		}
 successfully_added:
-		dp->rsvd_tbl8s += depth_diff;
+		dp->rsvd_tbl8s += new_levels;
 		return 0;
 	case RTE_FIB6_DEL:
 		if (node == NULL)
@@ -640,9 +640,10 @@ trie_modify(struct rte_fib6 *fib, const struct rte_ipv6_addr *ip,
 
 		if (ret != 0)
 			return ret;
-		rte_rib6_remove(rib, ip, depth);
 
-		dp->rsvd_tbl8s -= depth_diff;
+		dp->rsvd_tbl8s -= count_empty_levels(rib, &ip_masked, depth);
+		rte_rib6_remove(rib, &ip_masked, depth);
+
 		return 0;
 	default:
 		break;
diff --git a/lib/rib/rib6_internal.h b/lib/rib/rib6_internal.h
new file mode 100644
index 0000000000..674befc152
--- /dev/null
+++ b/lib/rib/rib6_internal.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Vladimir Medvedkin <medvedkinv@gmail.com>
+ * Copyright(c) 2019 Intel Corporation
+ */
+
+#ifndef _RIB6_INTERNAL_H_
+#define _RIB6_INTERNAL_H_
+
+#include <stdint.h>
+
+#include <rte_ip6.h>
+
+struct rte_rib6_node {
+	struct rte_rib6_node	*left;
+	struct rte_rib6_node	*right;
+	struct rte_rib6_node	*parent;
+	uint64_t		nh;
+	struct rte_ipv6_addr	ip;
+	uint8_t			depth;
+	uint8_t			flag;
+	uint64_t ext[];
+};
+
+#endif /* _RIB6_INTERNAL_H_ */
diff --git a/lib/rib/rte_rib6.c b/lib/rib/rte_rib6.c
index ec8ff68e87..f9023fca59 100644
--- a/lib/rib/rte_rib6.c
+++ b/lib/rib/rte_rib6.c
@@ -19,6 +19,7 @@
 #include <rte_rib6.h>
 
 #include "rib_log.h"
+#include "rib6_internal.h"
 
 #define RTE_RIB_VALID_NODE	1
 /* Maximum length of a RIB6 name. */
@@ -30,17 +31,6 @@ static struct rte_tailq_elem rte_rib6_tailq = {
 };
 EAL_REGISTER_TAILQ(rte_rib6_tailq)
 
-struct rte_rib6_node {
-	struct rte_rib6_node	*left;
-	struct rte_rib6_node	*right;
-	struct rte_rib6_node	*parent;
-	uint64_t		nh;
-	struct rte_ipv6_addr	ip;
-	uint8_t			depth;
-	uint8_t			flag;
-	uint64_t ext[];
-};
-
 struct rte_rib6 {
 	char		name[RTE_RIB6_NAMESIZE];
 	struct rte_rib6_node	*tree;
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH v1 0/5] fib6: fix tbl8 reservation drift
From: Medvedkin, Vladimir @ 2026-06-05 13:04 UTC (permalink / raw)
  To: Maxime Leroy; +Cc: dev, stable
In-Reply-To: <20260522145855.1748406-1-maxime@leroys.fr>

Hi Maxime,

Thanks for the patches.

For patches 2 and 3 introducing unit tests
Acked-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>

For patch 1, 4 and 5,
I think both implementations are over-complicated.
I suggest a more optimal implementation of count_empty_levels() (from
patch 1/5). Here is a draft based on your 1/5:
https://patches.dpdk.org/project/dpdk/patch/20260605130317.896413-1-vladimir.medvedkin@intel.com/

This implementation can be backported. The function logically belongs in
trie.c, since it reflects trie-specific details, and not in the RIB
library, which is more generic.
You may take this patch and integrate it into v2 replacing your 1/5.

On 5/22/2026 3:58 PM, Maxime Leroy wrote:
> This v1 supersedes the earlier RFC. The RFC dropped rsvd_tbl8s and
> used tbl8_pool_pos in the pre-check, which loses the worst-case
> envelope: a compressed /48 under a /28 allocates zero tbl8s but must
> reserve the boundaries the /48 would need if the /28 is later
> removed (DEL forces mid-flight decompression in modify_dp() with no
> rollback).
>
> This v1 keeps rsvd_tbl8s and computes it the way dir24_8 already
> does for IPv4. dir24_8 counts /24 supernets that contain at least
> one /25..32 prefix: that count is invariant under unrelated RIB
> changes, so the counter cannot drift. trie6 has the same need at
> 13 levels instead of 1 (byte boundaries 24, 32, ..., 120), so v1
> counts, for each L in that set, the /L supernets containing at
> least one prefix with depth > L. ADD/DEL pairs are symmetric by
> construction.
>
> Patch 1 is the minimal self-contained fix (Fixes: + Cc: stable).
> Patches 2-3 add the reproducer and extended regression tests.
> Patches 4-5 are an optimization (not for stable): valid_descendants
> in rte_rib6 + single-descent helper, so trie_modify() walks once
> instead of up to 13 times per ADD/DEL.
>
> Validated on a live BGP router (grout + FRR, 127 IPv6 prefixes):
> RSVD_TBL8 returned to its pre-cycle value after a zebra-kill /
> reconverge cycle.
>
> Maxime Leroy (5):
>    fib6: fix tbl8 reservation drift in trie
>    test/fib6: add reproducer for tbl8 reservation drift
>    test/fib6: extended drift test cases
>    rib: track valid descendant count per node
>    fib6: speed up tbl8 reservation accounting
>
>   app/test/test_fib6.c    | 335 ++++++++++++++++++++++++++++++++++++++++
>   app/test/test_rib6.c    |  92 +++++++++++
>   lib/fib/trie.c          |  47 +-----
>   lib/rib/rib6_internal.h |  37 +++++
>   lib/rib/rte_rib6.c      |  80 ++++++++++
>   5 files changed, 552 insertions(+), 39 deletions(-)
>   create mode 100644 lib/rib/rib6_internal.h
>
> ---
> v1:
> * Keep rsvd_tbl8s; recompute it via topology-stable empty-supernet
>    count (dir24_8 pattern at 13 levels) instead of RIB-derived
>    depth_diff.
> * Drop RFC patch 3/3 (no longer needed).
> * Add extended regression tests.
> * Add patches 4-5: RIB valid_descendants + single-descent helper
>    (optional perf optimization; not for stable).
> * Production-validated on a live BGP router.
>
> --
> 2.43.0

-- 
Regards,
Vladimir


^ permalink raw reply

* Re: [PATCH] fib: fix name of main TRIE instance memory region
From: Medvedkin, Vladimir @ 2026-06-05 13:07 UTC (permalink / raw)
  To: Andrew Rybchenko, dev
In-Reply-To: <20260316130531.1671904-1-andrew.rybchenko@oktetlabs.ru>

Acked-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>

On 3/16/2026 1:05 PM, Andrew Rybchenko wrote:
> mem_name was built, but not used when memory is allocated.
>
> Fixes: c3e12e0f0354 ("fib: add dataplane algorithm for IPv6")
> Signed-off-by: Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru>
> ---
>   lib/fib/trie.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/lib/fib/trie.c b/lib/fib/trie.c
> index fa5d9ec6b0..cd61446dd3 100644
> --- a/lib/fib/trie.c
> +++ b/lib/fib/trie.c
> @@ -678,7 +678,7 @@ trie_create(const char *name, int socket_id,
>   	num_tbl8 = conf->trie.num_tbl8;
>   
>   	snprintf(mem_name, sizeof(mem_name), "DP_%s", name);
> -	dp = rte_zmalloc_socket(name, sizeof(struct rte_trie_tbl) +
> +	dp = rte_zmalloc_socket(mem_name, sizeof(struct rte_trie_tbl) +
>   		TRIE_TBL24_NUM_ENT * (1 << nh_sz) + sizeof(uint32_t),
>   		RTE_CACHE_LINE_SIZE, socket_id);
>   	if (dp == NULL) {

-- 
Regards,
Vladimir


^ permalink raw reply

* Re: [PATCH] examples/vdpa: support show protocol features
From: Maxime Coquelin @ 2026-06-05 13:14 UTC (permalink / raw)
  To: Chengwen Feng; +Cc: thomas, stephen, dev
In-Reply-To: <CAO55cszyJmM-iGYPXOh7UC7Gh831kQA9QCzHi1niyyKmB+=Pwg@mail.gmail.com>

On Tue, Apr 28, 2026 at 11:23 AM Maxime Coquelin
<maxime.coquelin@redhat.com> wrote:
>
>
>
> On Thu, Oct 30, 2025 at 7:57 AM Chengwen Feng <fengchengwen@huawei.com> wrote:
>>
>> This commit shows the device's protocol features in the list command.
>>
>> Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
>> ---
>>  doc/guides/sample_app_ug/vdpa.rst |  8 ++++----
>>  examples/vdpa/main.c              | 13 ++++++++++---
>>  2 files changed, 14 insertions(+), 7 deletions(-)
>>
>>
>
> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
>
> Thanks,
> Maxime

Applied to next-virtio/for-next-net.

Thanks,
Maxime


^ permalink raw reply

* Re: [PATCH v15 0/5] Support add/remove memory region and get-max-slots
From: Maxime Coquelin @ 2026-06-05 13:14 UTC (permalink / raw)
  To: pravin.bathija; +Cc: dev, fengchengwen, stephen, thomas
In-Reply-To: <20260604235723.1046607-1-pravin.bathija@dell.com>

On Fri, Jun 5, 2026 at 1:57 AM <pravin.bathija@dell.com> wrote:
>
> From: Pravin M Bathija <pravin.bathija@dell.com>
>
> This is version v15 of the patchset and it incorporates the
> recommendations made by Maxime Coquelin.
>
> Patch 4/5
> - Changed VHOST_USER_REM_MEM_REG handler declaration from
>   accepts_fd=true to accepts_fd=false, as the remove request does not
>   expect FDs in ancillary data.
> - Removed all close_msg_fds(ctx) calls from vhost_user_rem_mem_reg(), no
>   longer needed since the handler is declared as not accepting FDs.
> - Removed validate_msg_fds(dev, ctx, 0) check from
>   vhost_user_rem_mem_reg(), as FD validation is now handled generically
>   by the framework.
> - Added targeted IOTLB cache invalidation in vhost_user_rem_mem_reg()
>   using vhost_user_iotlb_cache_remove() for the removed region's GPA
>   range, instead of the nuclear iotlb_flush_all() used by set_mem_table.
>
> This implementation has been extensively tested by doing Read/Write I/O
> from multiple instances of fio + libblkio (front-end) talking to
> spdk/dpdk (back-end) based drives. Tested with qemu front-end talking to
> dpdk testpmd (back-end) performing add/removal of memory regions. Also
> tested post-copy live migration after doing add_memory_region.
>
> Version Log:
> Version v15 (Current version): Incorporate code review suggestions from
> Maxime Coquelin as described above.
>
> Version v14: Incorporate code review suggestions from Stephen Hemminger
> and Fengcheng Wen.
> Changes from Fengcheng Wen review:
> Patch 3/5
> - Moved free_all_mem_regions() call sites in vhost_user_set_mem_table()
>   from patch 4/5 to patch 3/5 so each commit compiles independently
> Patch 4/5
> - Renamed _dev_invalidate_vrings() to vhost_user_invalidate_vrings() to
>   follow vhost naming convention
> -  Added comment explaining *pdev propagation through
>    translate_ring_addresses / numa_realloc()
> - Reordered local variables in vhost_user_add_mem_reg() and
>   vhost_user_rem_mem_reg() by descending line length
> - Shortened overlap check variable names (current_region_guest_start/end
>   --> cur_start/end, proposed_region_guest_start/end -> new_start/end)
> - Fixed DMA error path in vhost_user_add_mem_reg(): added
>   free_new_region_no_dma label so async_dma_map_region(false) is not
>   called when the map itself failed.
> Changes from Stephen Hemminger review:
> Patch 4/5
> - vhost_user_add_mem_reg() now constructs a reply with the back-end's
>   host mapping address in userspace_addr and returns
>   RTE_VHOST_MSG_RESULT_REPLY per the vhost-user spec
> - Added validate_msg_fds(dev, ctx, 0) in vhost_user_rem_mem_reg() to
>   reject malformed messages with unexpected file descriptors
> - Dropped unnecessary (uint64_t) cast in vhost_user_get_max_mem_slots()
>
> Version v13: Incorporate code review suggestions from Fengcheng Wen
> Patch 2/5
> Renamed VhostUserSingleMemReg to VhostUserMemRegMsg and memory_single
> to memreg
> Patches 3/5 and 4/5
> Relocated function remove_guest_pages from patch 3/5 to 4/5
>
> Version v12: Incorporate code review suggestions from Maxime Coquelin
> and ai-code-review.
> Patch 3/5
> Refactored async_dma_map() to delegate to async_dma_map_region(),
> eliminating code duplication between the two functions.
> Restored original comments in async_dma_map_region() explaining why
> ENODEV and EINVAL errors are ignored (these were stripped in v10)
> Reverted unnecessary changes to vhost_user_postcopy_register() --
> removed the host_user_addr == 0 checks and reg_msg_index indirection
> that were added in  v10, since this function is only called from
> vhost_user_set_mem_table() where regions are always contiguous.
>
> Version v11: Incorporate code review suggestions from Stephen Hemminger.
> Patch 4/5
> Fix incomplete cleanup in vhost_user_add_mem_reg() when
> vhost_user_mmap_region() fails after the mmap succeeds (e.g.
> add_guest_pages() realloc failure). The error path now
> calls remove_guest_pages() and free_mem_region() to undo the mapping
> and stale guest-page entries, preventing a leaked mmap and slot reuse
> corruption. The plain close(fd) path is kept for pre-mmap failures.
>
> Version v10: Incorporate code review suggestions from Stephen Hemminger.
> Patch 4/5
> Moved dev_invalidate_vrings after free_mem_region, array compaction, and
> nregions decrement. This ensures translate_ring_addresses only sees
> surviving memory regions, preventing vring pointers from resolving into
> a region that is about to be unmapped.
>
> Version v9: Incorporate code review suggestions from Stephen Hemminger.
> Patch 3/5
> Restored max_guest_pages initial value to hardcoded 8 instead of
> VHOST_MEMORY_MAX_NREGIONS, matching upstream semantics.
> Patch 4/5
> Added close(reg->fd) and reg->fd = -1 before goto close_msg_fds in the
> mmap failure path to fix fd leak after fd was moved from ctx->fds[0].
> Converted dev_invalidate_vrings from a plain function to a macro +
> implementation function pair, accepting message ID as a parameter so
> the static_assert reports the correct handler at each call site.
> Updated dev_invalidate_vrings call in add_mem_reg to pass
> VHOST_USER_ADD_MEM_REG as message ID.
> Updated dev_invalidate_vrings call in rem_mem_reg to pass
> VHOST_USER_REM_MEM_REG as message ID.
>
> Version v8:  Incorporate code review suggestions from Stephen Hemminger.
> rewrite async_dma_map_region function to iterate guest pages by host
> address range matching
> change function dev_invalidate_vrings to accept a double pointer to
> propagate pointer updates
> new function remove_guest_pages was added
> add_mem_reg error path was narrowed to only clean up the single failed
> region instead of destroying all existing regions
>
> Version v7: Incorporate code review suggestions from Maxime Coquelin.
> Add debug messages to vhost_postcopy_register function.
>
> Version v6: Added the enablement of this feature as a final patch in
> this patch-set and other code optimizations as suggested by Maxime
> Coquelin.
>
> Version v5: removed the patch that increased the number of memory regions
> from 8 to 128. This will be submitted as a separate feature at a later
> point after incorporating additional optimizations. Also includes code
> optimizations as suggested by Feng Cheng Wen.
>
> Version v4: code optimizations as suggested by Feng Cheng Wen.
>
> Version v3: code optimizations as suggested by Maxime Coquelin
> and Thomas Monjalon.
>
> Version v2: code optimizations as suggested by Maxime Coquelin.
>
> Version v1: Initial patch set.
>
> Pravin M Bathija (5):
>   vhost: add user to mailmap and define to vhost hdr
>   vhost: header defines for add/rem mem region
>   vhost: refactor memory helper functions
>   vhost: add mem region add/remove handlers
>   vhost: enable configure memory slots
>
>  .mailmap               |   1 +
>  lib/vhost/rte_vhost.h  |   4 +
>  lib/vhost/vhost_user.c | 425 +++++++++++++++++++++++++++++++++++------
>  lib/vhost/vhost_user.h |  10 +
>  4 files changed, 378 insertions(+), 62 deletions(-)
>
> --
> 2.43.0
>

Applied to next-virtio/for-next-net.

Thanks,
Maxime


^ permalink raw reply

* RE: [PATCH v3] net/iavf: fix VF reset race and stale ARQ message handling
From: Loftus, Ciara @ 2026-06-05 13:54 UTC (permalink / raw)
  To: Talluri, ChaitanyababuX, dev@dpdk.org, Richardson, Bruce,
	Singh, Aman Deep
  Cc: Wani, Shaiq, stable@dpdk.org
In-Reply-To: <20260605123646.1328492-1-chaitanyababux.talluri@intel.com>

> Subject: [PATCH v3] net/iavf: fix VF reset race and stale ARQ message handling

Some comments below. I agree there is an issue here but I'm not sure about this
approach to fixing it. I've explained why inline.

One alternative solution could be to call the existing iavf_is_reset_detected helper
immediately after iavf_vf_reset() in iavf_dev_close() (on the reset path only). That
would ensure that the reset has started before proceeding with tearing down
and reinitialising the adminq etc.

[snip]

> +
>  static int
>  iavf_init_vf(struct rte_eth_dev *dev)
>  {
> @@ -2654,6 +2706,10 @@ iavf_init_vf(struct rte_eth_dev *dev)
>  		goto err;
>  	}
> 
> +	/* Drain stale ARQ messages only during reset recovery */
> +	if (vf->in_reset_recovery)
> +		iavf_drain_arq(hw, vf);

Even with the guard, I still think this draining is unnecessary.
The adminq is initialised in the code block immediately before this.
I think it will be empty and there will be nothing to drain here.

> +
>  	if (iavf_check_api_version(adapter) != 0) {
>  		PMD_INIT_LOG(ERR, "check_api version failed");
>  		goto err_api;
> @@ -3289,6 +3345,8 @@ iavf_dev_reset(struct rte_eth_dev *dev)
>  	ret = iavf_dev_uninit(dev);
>  	if (ret)
>  		return ret;
> +	/* Add delay before re-initialization */
> +	rte_delay_ms(50);

Although it's a short delay it feels a bit heavy handed to unconditionally
delay here for every reset.

> 
>  	return iavf_dev_init(dev);
>  }
> @@ -3352,6 +3410,7 @@ iavf_handle_hw_reset(struct rte_eth_dev *dev,
> bool vf_initiated_reset)
>  {
>  	struct iavf_info *vf = IAVF_DEV_PRIVATE_TO_VF(dev->data-
> >dev_private);
>  	struct iavf_adapter *adapter = dev->data->dev_private;
> +	struct iavf_hw *hw = IAVF_DEV_PRIVATE_TO_HW(dev->data-
> >dev_private);
>  	int ret;
>  	bool restart_device = false;
> 
> @@ -3370,6 +3429,11 @@ iavf_handle_hw_reset(struct rte_eth_dev *dev,
> bool vf_initiated_reset)
>  	vf->in_reset_recovery = true;
>  	iavf_set_no_poll(adapter, false);
> 
> +	/* For VF-initiated reset, wait for PF to start processing it */
> +	if (vf_initiated_reset)
> +		if (iavf_wait_for_reset_start(hw) != 0)

At this point of the VF initiated reset we haven't indicated to
hardware yet that we want to reset. So this wait will always time out here.
The reset is kicked off @ iavf_vf_reset where the VIRTCHNL_OP_RESET_VF
op is sent. Only after that does it make sense to check the reset status
reported by the PF.

> +			PMD_DRV_LOG(WARNING, "PF did not acknowledge
> VF reset");
> +
>  	/* Call the pre reset callback */
>  	if (vf->pre_reset_cb != NULL)
>  		vf->pre_reset_cb(dev->data->port_id, vf->pre_reset_cb_arg);
> diff --git a/drivers/net/intel/iavf/iavf_vchnl.c
> b/drivers/net/intel/iavf/iavf_vchnl.c
> index 94ccfb5d6e..0a536a15e0 100644
> --- a/drivers/net/intel/iavf/iavf_vchnl.c
> +++ b/drivers/net/intel/iavf/iavf_vchnl.c
> @@ -296,11 +296,21 @@ iavf_read_msg_from_pf(struct iavf_adapter
> *adapter, uint16_t buf_len,
>  					__func__, vpe->event);
>  		}
>  	}  else {
> -		/* async reply msg on command issued by vf previously */
> +		/* Async reply for previously issued VF command.
> +		 * Stale messages from before reset are ignored, and polling
> +		 * continues until the expected response is received.
> +		 */
>  		result = IAVF_MSG_CMD;
>  		if (opcode != vf->pend_cmd) {
> -			PMD_DRV_LOG(WARNING, "command mismatch,
> expect %u, get %u",
> -					vf->pend_cmd, opcode);
> +			if (opcode == VIRTCHNL_OP_UNKNOWN)
> +				PMD_DRV_LOG(DEBUG,
> +					    "Ignoring stale msg (opcode 0),
> pending cmd %u",
> +					    vf->pend_cmd);
> +			else
> +				PMD_DRV_LOG(WARNING,
> +					    "command mismatch, expect %u,
> get %u",
> +					    vf->pend_cmd, opcode);
> +
>  			result = IAVF_MSG_ERR;
>  		}
>  	}
> --
> 2.43.0


^ permalink raw reply

* [PATCH v4 1/5] eal: fix wrong log message in async IPC request
From: Anatoly Burakov @ 2026-06-05 14:29 UTC (permalink / raw)
  To: dev, Jianfeng Tan
In-Reply-To: <740b39c5098b4d40cafb9881ad70865a3c889012.1773936429.git.anatoly.burakov@intel.com>

The allocation failure log message in mp_request_async() says "sync
request" but the function handles asynchronous requests.

Fix the log to say "async request".

Fixes: f05e26051c15 ("eal: add IPC asynchronous request")
Cc: stable@dpdk.org

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
 lib/eal/common/eal_common_proc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/eal/common/eal_common_proc.c b/lib/eal/common/eal_common_proc.c
index 06f151818c..799c6e81b0 100644
--- a/lib/eal/common/eal_common_proc.c
+++ b/lib/eal/common/eal_common_proc.c
@@ -883,7 +883,7 @@ mp_request_async(const char *dst, struct rte_mp_msg *req,
 	pending_req = calloc(1, sizeof(*pending_req));
 	reply_msg = calloc(1, sizeof(*reply_msg));
 	if (pending_req == NULL || reply_msg == NULL) {
-		EAL_LOG(ERR, "Could not allocate space for sync request");
+		EAL_LOG(ERR, "Could not allocate space for async request");
 		rte_errno = ENOMEM;
 		ret = -1;
 		goto fail;
-- 
2.47.3


^ permalink raw reply related

* [PATCH v4 2/5] eal: fix async IPC callback not fired when no peers
From: Anatoly Burakov @ 2026-06-05 14:29 UTC (permalink / raw)
  To: dev, Jianfeng Tan
In-Reply-To: <2bc77b94493d94b53a28ea535ed96d92a157a7c7.1780669755.git.anatoly.burakov@intel.com>

Currently, when rte_mp_request_async() is called and no peer processes
are connected (nb_sent == 0), the user callback is never invoked.

The original implementation used a dedicated background thread and
pthread_cond_signal() to wake it after queuing the dummy request. When
that thread was replaced with per-message alarms, no alarm was set for
the dummy request, silently breaking the nb_sent == 0 path.

This was not noticed because async requests are used while handling
secondary process requests, where peers are typically already present.

Fix it by setting a 1us alarm on the dummy request, so the callback path
immediately triggers and processes it.

Fixes: daf9bfca717e ("ipc: remove thread for async requests")
Cc: stable@dpdk.org

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
 lib/eal/common/eal_common_proc.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/lib/eal/common/eal_common_proc.c b/lib/eal/common/eal_common_proc.c
index 799c6e81b0..2f4a939c68 100644
--- a/lib/eal/common/eal_common_proc.c
+++ b/lib/eal/common/eal_common_proc.c
@@ -1187,11 +1187,22 @@ rte_mp_request_async(struct rte_mp_msg *req, const struct timespec *ts,
 	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
 		ret = mp_request_async(eal_mp_socket_path(), copy, param, ts);
 
-		/* if we didn't send anything, put dummy request on the queue */
+		/* if we didn't send anything, put dummy request on the queue
+		 * and set a minimum-delay alarm so the callback fires immediately.
+		 */
 		if (ret == 0 && reply->nb_sent == 0) {
 			TAILQ_INSERT_TAIL(&pending_requests.requests, dummy,
 					next);
 			dummy_used = true;
+
+			if (rte_eal_alarm_set(1, async_reply_handle, dummy) < 0) {
+				EAL_LOG(ERR, "Fail to set alarm for dummy request");
+				/* roll back the changes */
+				TAILQ_REMOVE(&pending_requests.requests, dummy, next);
+				dummy_used = false;
+				ret = -1;
+				goto fail;
+			}
 		}
 
 		pthread_mutex_unlock(&pending_requests.lock);
@@ -1232,10 +1243,14 @@ rte_mp_request_async(struct rte_mp_msg *req, const struct timespec *ts,
 		} else if (mp_request_async(path, copy, param, ts))
 			ret = -1;
 	}
-	/* if we didn't send anything, put dummy request on the queue */
+	/* if we didn't send anything, put dummy request on the queue
+	 * and set a minimum-delay alarm so the callback fires immediately.
+	 */
 	if (ret == 0 && reply->nb_sent == 0) {
 		TAILQ_INSERT_HEAD(&pending_requests.requests, dummy, next);
 		dummy_used = true;
+		if (rte_eal_alarm_set(1, async_reply_handle, dummy) < 0)
+			EAL_LOG(ERR, "Fail to set alarm for dummy request");
 	}
 
 	/* finally, unlock the queue */
-- 
2.47.3


^ permalink raw reply related

* [PATCH v4 3/5] eal: fix memory leak in async IPC secondary path
From: Anatoly Burakov @ 2026-06-05 14:29 UTC (permalink / raw)
  To: dev, Jianfeng Tan
In-Reply-To: <2bc77b94493d94b53a28ea535ed96d92a157a7c7.1780669755.git.anatoly.burakov@intel.com>

When rte_mp_request_async() succeeds on the secondary process path, the
dummy request is freed only if it was inserted into the queue. However,
when the actual request was sent successfully (nb_sent > 0), the dummy is
not used and the function returns without freeing it.

Free dummy before returning on the success path when it was not inserted
into the queue.

Fixes: f05e26051c15 ("eal: add IPC asynchronous request")
Cc: stable@dpdk.org

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
 lib/eal/common/eal_common_proc.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/eal/common/eal_common_proc.c b/lib/eal/common/eal_common_proc.c
index 2f4a939c68..c8e59967d9 100644
--- a/lib/eal/common/eal_common_proc.c
+++ b/lib/eal/common/eal_common_proc.c
@@ -1210,6 +1210,8 @@ rte_mp_request_async(struct rte_mp_msg *req, const struct timespec *ts,
 		/* if we couldn't send anything, clean up */
 		if (ret != 0)
 			goto fail;
+		if (!dummy_used)
+			free(dummy);
 		return 0;
 	}
 
-- 
2.47.3


^ permalink raw reply related

* [PATCH v4 4/5] eal: fix async IPC resource leaks on partial failure
From: Anatoly Burakov @ 2026-06-05 14:29 UTC (permalink / raw)
  To: dev, Jianfeng Tan
In-Reply-To: <2bc77b94493d94b53a28ea535ed96d92a157a7c7.1780669755.git.anatoly.burakov@intel.com>

When rte_mp_request_async() fails to send requests to all peers,
copy and param can lose ownership and leak.

On partial failure, some requests may already be queued and still
reference copy and param, so freeing them directly on the error
path can cause use-after-free when those requests are later handled.

Fix this by rolling back queued requests from the current batch,
resetting nb_sent to 0, and freeing copy/param only after rollback.
Use a numeric request ID for alarm callback lookup so stale callbacks
from rolled-back requests become harmless no-ops.

Coverity issue: 501503
Fixes: f05e26051c15 ("eal: add IPC asynchronous request")
Cc: stable@dpdk.org

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
 lib/eal/common/eal_common_proc.c | 112 +++++++++++++++++++++++--------
 1 file changed, 84 insertions(+), 28 deletions(-)

diff --git a/lib/eal/common/eal_common_proc.c b/lib/eal/common/eal_common_proc.c
index c8e59967d9..64812bcfd7 100644
--- a/lib/eal/common/eal_common_proc.c
+++ b/lib/eal/common/eal_common_proc.c
@@ -74,6 +74,7 @@ struct async_request_param {
 
 struct pending_request {
 	TAILQ_ENTRY(pending_request) next;
+	unsigned long id;
 	enum {
 		REQUEST_TYPE_SYNC,
 		REQUEST_TYPE_ASYNC
@@ -92,6 +93,8 @@ struct pending_request {
 	};
 };
 
+static unsigned long next_request_id;
+
 TAILQ_HEAD(pending_request_list, pending_request);
 
 static struct {
@@ -111,9 +114,9 @@ mp_send(struct rte_mp_msg *msg, const char *peer, int type);
 static void
 async_reply_handle(void *arg);
 
-/* for use with process_msg */
+/* for use with alarm callback and process_msg */
 static struct pending_request *
-async_reply_handle_thread_unsafe(void *arg);
+async_reply_handle_thread_unsafe(struct pending_request *req);
 
 static void
 trigger_async_action(struct pending_request *req);
@@ -132,6 +135,19 @@ find_pending_request(const char *dst, const char *act_name)
 	return r;
 }
 
+static struct pending_request *
+find_async_request_by_id(unsigned long id)
+{
+	struct pending_request *r;
+
+	TAILQ_FOREACH(r, &pending_requests.requests, next) {
+		if (r->id == id && r->type == REQUEST_TYPE_ASYNC)
+			return r;
+	}
+
+	return NULL;
+}
+
 /*
  * Combine prefix and name(optional) to return unix domain socket path
  * return the number of characters that would have been put into buffer.
@@ -519,9 +535,8 @@ trigger_async_action(struct pending_request *sr)
 }
 
 static struct pending_request *
-async_reply_handle_thread_unsafe(void *arg)
+async_reply_handle_thread_unsafe(struct pending_request *req)
 {
-	struct pending_request *req = (struct pending_request *)arg;
 	enum async_action action;
 	struct timespec ts_now;
 
@@ -534,7 +549,8 @@ async_reply_handle_thread_unsafe(void *arg)
 
 	TAILQ_REMOVE(&pending_requests.requests, req, next);
 
-	if (rte_eal_alarm_cancel(async_reply_handle, req) < 0) {
+	if (rte_eal_alarm_cancel(async_reply_handle,
+			(void *)(uintptr_t)req->id) < 0) {
 		/* if we failed to cancel the alarm because it's already in
 		 * progress, don't proceed because otherwise we will end up
 		 * handling the same message twice.
@@ -557,9 +573,13 @@ static void
 async_reply_handle(void *arg)
 {
 	struct pending_request *req;
+	/* alarm arg carries the request ID packed into a void * via uintptr_t */
+	unsigned long id = (uintptr_t)arg;
 
 	pthread_mutex_lock(&pending_requests.lock);
-	req = async_reply_handle_thread_unsafe(arg);
+	req = find_async_request_by_id(id);
+	if (req != NULL)
+		req = async_reply_handle_thread_unsafe(req);
 	pthread_mutex_unlock(&pending_requests.lock);
 
 	if (req != NULL)
@@ -878,7 +898,29 @@ mp_request_async(const char *dst, struct rte_mp_msg *req,
 {
 	struct rte_mp_msg *reply_msg;
 	struct pending_request *pending_req, *exist;
-	int ret = -1;
+	unsigned long id;
+	int ret;
+
+	/* queue already locked by caller */
+
+	exist = find_pending_request(dst, req->name);
+	if (exist) {
+		EAL_LOG(ERR, "A pending request %s:%s", dst, req->name);
+		rte_errno = EEXIST;
+		return -1;
+	}
+
+	/* Set alarm before allocating or sending so request timeout tracking
+	 * is active as soon as this request ID is reserved.
+	 */
+	id = ++next_request_id;
+	if (rte_eal_alarm_set(ts->tv_sec * 1000000 + ts->tv_nsec / 1000,
+			async_reply_handle,
+			(void *)(uintptr_t)id) < 0) {
+		EAL_LOG(ERR, "Fail to set alarm for request %s:%s",
+			dst, req->name);
+		return -1;
+	}
 
 	pending_req = calloc(1, sizeof(*pending_req));
 	reply_msg = calloc(1, sizeof(*reply_msg));
@@ -890,21 +932,12 @@ mp_request_async(const char *dst, struct rte_mp_msg *req,
 	}
 
 	pending_req->type = REQUEST_TYPE_ASYNC;
+	pending_req->id = id;
 	strlcpy(pending_req->dst, dst, sizeof(pending_req->dst));
 	pending_req->request = req;
 	pending_req->reply = reply_msg;
 	pending_req->async.param = param;
 
-	/* queue already locked by caller */
-
-	exist = find_pending_request(dst, req->name);
-	if (exist) {
-		EAL_LOG(ERR, "A pending request %s:%s", dst, req->name);
-		rte_errno = EEXIST;
-		ret = -1;
-		goto fail;
-	}
-
 	ret = send_msg(dst, req, MP_REQ);
 	if (ret < 0) {
 		EAL_LOG(ERR, "Fail to send request %s:%s",
@@ -917,14 +950,6 @@ mp_request_async(const char *dst, struct rte_mp_msg *req,
 	}
 	param->user_reply.nb_sent++;
 
-	/* if alarm set fails, we simply ignore the reply */
-	if (rte_eal_alarm_set(ts->tv_sec * 1000000 + ts->tv_nsec / 1000,
-			      async_reply_handle, pending_req) < 0) {
-		EAL_LOG(ERR, "Fail to set alarm for request %s:%s",
-			dst, req->name);
-		ret = -1;
-		goto fail;
-	}
 	TAILQ_INSERT_TAIL(&pending_requests.requests, pending_req, next);
 
 	return 0;
@@ -1178,6 +1203,7 @@ rte_mp_request_async(struct rte_mp_msg *req, const struct timespec *ts,
 	 * it, and put it on the queue if we don't send any requests.
 	 */
 	dummy->type = REQUEST_TYPE_ASYNC;
+	dummy->id = ++next_request_id;
 	dummy->request = copy;
 	dummy->reply = NULL;
 	dummy->async.param = param;
@@ -1194,8 +1220,8 @@ rte_mp_request_async(struct rte_mp_msg *req, const struct timespec *ts,
 			TAILQ_INSERT_TAIL(&pending_requests.requests, dummy,
 					next);
 			dummy_used = true;
-
-			if (rte_eal_alarm_set(1, async_reply_handle, dummy) < 0) {
+			if (rte_eal_alarm_set(1, async_reply_handle,
+					(void *)(uintptr_t)dummy->id) < 0) {
 				EAL_LOG(ERR, "Fail to set alarm for dummy request");
 				/* roll back the changes */
 				TAILQ_REMOVE(&pending_requests.requests, dummy, next);
@@ -1245,13 +1271,38 @@ rte_mp_request_async(struct rte_mp_msg *req, const struct timespec *ts,
 		} else if (mp_request_async(path, copy, param, ts))
 			ret = -1;
 	}
+
+	/*
+	 * On partial failure, roll back all queued requests in this batch while
+	 * holding pending_requests.lock. Any alarm callback that runs later for
+	 * these removed IDs will not find a pending request and will return.
+	 */
+	if (ret != 0 && reply->nb_sent > 0) {
+		struct pending_request *r, *next;
+
+		for (r = TAILQ_FIRST(&pending_requests.requests);
+				r != NULL; r = next) {
+			next = TAILQ_NEXT(r, next);
+			if (r->type == REQUEST_TYPE_ASYNC &&
+					r->async.param == param) {
+				TAILQ_REMOVE(&pending_requests.requests,
+						r, next);
+				free(r->reply);
+				/* r->request == copy, freed below after the loop */
+				free(r);
+			}
+		}
+		reply->nb_sent = 0;
+	}
+
 	/* if we didn't send anything, put dummy request on the queue
 	 * and set a minimum-delay alarm so the callback fires immediately.
 	 */
 	if (ret == 0 && reply->nb_sent == 0) {
 		TAILQ_INSERT_HEAD(&pending_requests.requests, dummy, next);
 		dummy_used = true;
-		if (rte_eal_alarm_set(1, async_reply_handle, dummy) < 0)
+		if (rte_eal_alarm_set(1, async_reply_handle,
+				(void *)(uintptr_t)dummy->id) < 0)
 			EAL_LOG(ERR, "Fail to set alarm for dummy request");
 	}
 
@@ -1267,6 +1318,11 @@ rte_mp_request_async(struct rte_mp_msg *req, const struct timespec *ts,
 	/* if dummy was unused, free it */
 	if (!dummy_used)
 		free(dummy);
+	/* if nothing was sent, nobody owns copy/param */
+	if (ret != 0) {
+		free(param);
+		free(copy);
+	}
 
 	return ret;
 closedir_fail:
-- 
2.47.3


^ permalink raw reply related

* [PATCH v4 5/5] eal: avoid deadlock in async IPC alarm callback
From: Anatoly Burakov @ 2026-06-05 14:29 UTC (permalink / raw)
  To: dev, Jianfeng Tan
In-Reply-To: <2bc77b94493d94b53a28ea535ed96d92a157a7c7.1780669755.git.anatoly.burakov@intel.com>

async_reply_handle_thread_unsafe() can run while holding
pending_requests.lock and currently calls rte_eal_alarm_cancel().

rte_eal_alarm_cancel() may spin-wait for an executing callback, which can
deadlock if that callback is blocked on the same lock.

Remove callback-side alarm cancellation. It is safe to do so, because any
callback triggered without a pending request becomes a noop.

Fixes: daf9bfca717e ("ipc: remove thread for async requests")
Cc: stable@dpdk.org

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
 lib/eal/common/eal_common_proc.c | 28 ++++++++++------------------
 1 file changed, 10 insertions(+), 18 deletions(-)

diff --git a/lib/eal/common/eal_common_proc.c b/lib/eal/common/eal_common_proc.c
index 64812bcfd7..e459109fec 100644
--- a/lib/eal/common/eal_common_proc.c
+++ b/lib/eal/common/eal_common_proc.c
@@ -549,19 +549,6 @@ async_reply_handle_thread_unsafe(struct pending_request *req)
 
 	TAILQ_REMOVE(&pending_requests.requests, req, next);
 
-	if (rte_eal_alarm_cancel(async_reply_handle,
-			(void *)(uintptr_t)req->id) < 0) {
-		/* if we failed to cancel the alarm because it's already in
-		 * progress, don't proceed because otherwise we will end up
-		 * handling the same message twice.
-		 */
-		if (rte_errno == EINPROGRESS) {
-			EAL_LOG(DEBUG, "Request handling is already in progress");
-			goto no_trigger;
-		}
-		EAL_LOG(ERR, "Failed to cancel alarm");
-	}
-
 	if (action == ACTION_TRIGGER)
 		return req;
 no_trigger:
@@ -910,8 +897,12 @@ mp_request_async(const char *dst, struct rte_mp_msg *req,
 		return -1;
 	}
 
-	/* Set alarm before allocating or sending so request timeout tracking
-	 * is active as soon as this request ID is reserved.
+	/* Set alarm before allocating or sending. The alarm is never cancelled:
+	 * rte_eal_alarm_cancel spin-waits for an executing callback to finish,
+	 * which deadlocks if we hold pending_requests.lock while the callback
+	 * is blocked on it. Instead, let stale alarms fire; with ID-based
+	 * lookup the callback will simply not find the request and return
+	 * harmlessly.
 	 */
 	id = ++next_request_id;
 	if (rte_eal_alarm_set(ts->tv_sec * 1000000 + ts->tv_nsec / 1000,
@@ -1273,9 +1264,10 @@ rte_mp_request_async(struct rte_mp_msg *req, const struct timespec *ts,
 	}
 
 	/*
-	 * On partial failure, roll back all queued requests in this batch while
-	 * holding pending_requests.lock. Any alarm callback that runs later for
-	 * these removed IDs will not find a pending request and will return.
+	 * On partial failure, roll back all queued requests. We hold the lock
+	 * so no one else touches the queue. All requests in this batch share
+	 * the same param pointer. Stale alarms will fire and harmlessly find
+	 * nothing via ID-based lookup.
 	 */
 	if (ret != 0 && reply->nb_sent > 0) {
 		struct pending_request *r, *next;
-- 
2.47.3


^ permalink raw reply related

* Re: [PATCH 1/7] net/ice: use granular PTYPEs for L2TPv2 PPP
From: Burakov, Anatoly @ 2026-06-05 14:44 UTC (permalink / raw)
  To: Shaiq Wani, dev, bruce.richardson, aman.deep.singh
In-Reply-To: <20260427023115.1225843-2-shaiq.wani@intel.com>

On 4/27/2026 4:31 AM, Shaiq Wani wrote:
> All L2TPv2 PPP variants map to ICE_MAC_IPV4_L2TPV2 (398), so inner
> protocol type is not differentiated, allowing cross-protocol flow
> matches (e.g. a PPP/IPv4 rule hitting a PPP/IPv6 packet).
> 
> Fix ice_ptype_map[] to use the 30 granular L2TPv2 PTYPEs (396-425)
> defined by the DDP package. Also add PPP inner protocol flow types to
> ice_fdir_gen_l2tpv2_pkt() so training packets get dynamically built
> L2TPv2 headers instead of static templates.
> 
> Fixes: 733640dae75e ("net/ice: support L2TPv2 flow pattern matching")
> Fixes: bf662653976e ("net/ice/base: support L2TPv2 flow rule")
> Signed-off-by: Shaiq Wani <shaiq.wani@intel.com>
> ---

Acked-by: Anatoly Burakov <anatoly.burakov@intel.com>

-- 
Thanks,
Anatoly

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox