DPDK-dev Archive on lore.kernel.org

DPDK-dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* Re: [PATCH v16 4/5] vhost: add mem region add/remove handlers
From: Maxime Coquelin @ 2026-06-08 16:16 UTC (permalink / raw)
  To: pravin.bathija; +Cc: dev, stephen, fengchengwen, thomas
In-Reply-To: <20260606025211.1082615-5-pravin.bathija@dell.com>

On Sat, Jun 6, 2026 at 4:52 AM <pravin.bathija@dell.com> wrote:
>
> From: Pravin M Bathija <pravin.bathija@dell.com>
>
> Add support for VHOST_USER_ADD_MEM_REG, VHOST_USER_REM_MEM_REG and
> VHOST_USER_GET_MAX_MEM_SLOTS. Refactor memory initialization into
> common helper and add supporting functions for dynamic memory management.
>
> Signed-off-by: Pravin M Bathija <pravin.bathija@dell.com>
> ---
>  lib/vhost/vhost_user.c | 266 +++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 266 insertions(+)
>
> diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c
> index 94fca8b589..020c993b29 100644
> --- a/lib/vhost/vhost_user.c
> +++ b/lib/vhost/vhost_user.c
> @@ -71,6 +71,9 @@ VHOST_MESSAGE_HANDLER(VHOST_USER_SET_FEATURES, vhost_user_set_features, false, t
>  VHOST_MESSAGE_HANDLER(VHOST_USER_SET_OWNER, vhost_user_set_owner, false, true) \
>  VHOST_MESSAGE_HANDLER(VHOST_USER_RESET_OWNER, vhost_user_reset_owner, false, false) \
>  VHOST_MESSAGE_HANDLER(VHOST_USER_SET_MEM_TABLE, vhost_user_set_mem_table, true, true) \
> +VHOST_MESSAGE_HANDLER(VHOST_USER_GET_MAX_MEM_SLOTS, vhost_user_get_max_mem_slots, false, false) \
> +VHOST_MESSAGE_HANDLER(VHOST_USER_ADD_MEM_REG, vhost_user_add_mem_reg, true, true) \
> +VHOST_MESSAGE_HANDLER(VHOST_USER_REM_MEM_REG, vhost_user_rem_mem_reg, false, true) \
>  VHOST_MESSAGE_HANDLER(VHOST_USER_SET_LOG_BASE, vhost_user_set_log_base, true, true) \
>  VHOST_MESSAGE_HANDLER(VHOST_USER_SET_LOG_FD, vhost_user_set_log_fd, true, true) \
>  VHOST_MESSAGE_HANDLER(VHOST_USER_SET_VRING_NUM, vhost_user_set_vring_num, false, true) \
> @@ -1167,6 +1170,24 @@ add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg,
>         return 0;
>  }
>
> +static void
> +remove_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg)
> +{
> +       uint64_t reg_start = reg->host_user_addr;
> +       uint64_t reg_end = reg_start + reg->size;
> +       uint32_t i, j = 0;
> +
> +       for (i = 0; i < dev->nr_guest_pages; i++) {
> +               if (dev->guest_pages[i].host_user_addr >= reg_start &&
> +                   dev->guest_pages[i].host_user_addr < reg_end)
> +                       continue;
> +               if (j != i)
> +                       dev->guest_pages[j] = dev->guest_pages[i];
> +               j++;
> +       }
> +       dev->nr_guest_pages = j;
> +}
> +
>  #ifdef RTE_LIBRTE_VHOST_DEBUG
>  /* TODO: enable it only in debug mode? */
>  static void
> @@ -1591,6 +1612,251 @@ vhost_user_set_mem_table(struct virtio_net **pdev,
>         return RTE_VHOST_MSG_RESULT_ERR;
>  }
>
> +
> +static int
> +vhost_user_get_max_mem_slots(struct virtio_net **pdev __rte_unused,
> +                       struct vhu_msg_context *ctx,
> +                       int main_fd __rte_unused)
> +{
> +       uint32_t max_mem_slots = VHOST_MEMORY_MAX_NREGIONS;
> +
> +       ctx->msg.payload.u64 = max_mem_slots;
> +       ctx->msg.size = sizeof(ctx->msg.payload.u64);
> +       ctx->fd_num = 0;
> +
> +       return RTE_VHOST_MSG_RESULT_REPLY;
> +}
> +
> +/*
> + * Invalidate and re-translate all vring addresses after the memory table
> + * has been modified (add/remove region).
> + *
> + * translate_ring_addresses() may call numa_realloc(), which can reallocate
> + * the device structure.  The updated pointer is written back through *pdev
> + * so callers must refresh their local "dev" afterwards: dev = *pdev.
> + */
> +static void
> +vhost_user_invalidate_vrings(struct virtio_net **pdev)
> +{
> +       struct virtio_net *dev = *pdev;
> +       uint32_t i;
> +
> +       for (i = 0; i < dev->nr_vring; i++) {
> +               struct vhost_virtqueue *vq = dev->virtqueue[i];
> +
> +               if (!vq)
> +                       continue;
> +
> +               if (vq->desc || vq->avail || vq->used) {
> +                       vq_assert_lock(dev, vq);
> +
> +                       vring_invalidate(dev, vq);
> +
> +                       translate_ring_addresses(&dev, &vq);
> +               }
> +       }
> +
> +       *pdev = dev;
> +}
> +
> +/*
> + * Macro wrapper that performs the compile-time lock assertion with the
> + * correct message ID at the call site, then calls the implementation.
> + */
> +#define dev_invalidate_vrings(pdev, id) do { \
> +       static_assert(id ## _LOCK_ALL_QPS, \
> +               #id " handler is not declared as locking all queue pairs"); \
> +       vhost_user_invalidate_vrings(pdev); \
> +} while (0)
> +
> +static int
> +vhost_user_add_mem_reg(struct virtio_net **pdev,
> +                       struct vhu_msg_context *ctx,
> +                       int main_fd __rte_unused)
> +{
> +       struct VhostUserMemoryRegion *region = &ctx->msg.payload.memreg.region;
> +       struct virtio_net *dev = *pdev;
> +       uint32_t i;
> +
> +       /* convert first region add to normal memory table set */
> +       if (dev->mem == NULL) {
> +               if (vhost_user_initialize_memory(pdev) < 0)
> +                       goto close_msg_fds;
> +       }
> +
> +       /* make sure new region will fit */
> +       if (dev->mem->nregions >= VHOST_MEMORY_MAX_NREGIONS) {
> +               VHOST_CONFIG_LOG(dev->ifname, ERR, "too many memory regions already (%u)",
> +                                                                       dev->mem->nregions);
> +               goto close_msg_fds;
> +       }
> +
> +       /* make sure supplied memory fd present */
> +       if (ctx->fd_num != 1) {
> +               VHOST_CONFIG_LOG(dev->ifname, ERR, "fd count makes no sense (%u)", ctx->fd_num);
> +               goto close_msg_fds;
> +       }
> +
> +       /* Make sure no overlap in guest virtual address space */
> +       for (i = 0; i < dev->mem->nregions; i++) {
> +               struct rte_vhost_mem_region *cur = &dev->mem->regions[i];
> +               uint64_t cur_start = cur->guest_user_addr;
> +               uint64_t cur_end = cur_start + cur->size - 1;
> +               uint64_t new_start = region->userspace_addr;
> +               uint64_t new_end = new_start + region->memory_size - 1;
> +
> +               if (new_end >= cur_start && new_start <= cur_end) {
> +                       VHOST_CONFIG_LOG(dev->ifname, ERR,
> +                               "requested memory region overlaps with another region");
> +                       VHOST_CONFIG_LOG(dev->ifname, ERR,
> +                               "\tRequested region address:0x%" PRIx64,
> +                               region->userspace_addr);
> +                       VHOST_CONFIG_LOG(dev->ifname, ERR,
> +                               "\tRequested region size:0x%" PRIx64,
> +                               region->memory_size);
> +                       VHOST_CONFIG_LOG(dev->ifname, ERR,
> +                               "\tOverlapping region address:0x%" PRIx64,
> +                               cur->guest_user_addr);
> +                       VHOST_CONFIG_LOG(dev->ifname, ERR,
> +                               "\tOverlapping region size:0x%" PRIx64,
> +                               cur->size);
> +                       goto close_msg_fds;
> +               }
> +       }
> +
> +       /* New region goes at the end of the contiguous array */
> +       struct rte_vhost_mem_region *reg = &dev->mem->regions[dev->mem->nregions];
> +
> +       reg->guest_phys_addr = region->guest_phys_addr;
> +       reg->guest_user_addr = region->userspace_addr;
> +       reg->size            = region->memory_size;
> +       reg->fd              = ctx->fds[0];
> +       ctx->fds[0]          = -1;
> +
> +       if (vhost_user_mmap_region(dev, reg, region->mmap_offset) < 0) {
> +               VHOST_CONFIG_LOG(dev->ifname, ERR, "failed to mmap region");
> +               if (reg->mmap_addr) {
> +                       /* mmap succeeded but a later step (e.g. add_guest_pages)
> +                        * failed; undo the mapping and any guest-page entries.
> +                        */
> +                       remove_guest_pages(dev, reg);
> +                       free_mem_region(reg);
> +               } else {
> +                       close(reg->fd);
> +                       reg->fd = -1;
> +               }
> +               goto close_msg_fds;
> +       }
> +
> +       dev->mem->nregions++;
> +
> +       if (dev->async_copy && rte_vfio_is_enabled("vfio")) {
> +               if (async_dma_map_region(dev, reg, true) < 0)
> +                       goto free_new_region_no_dma;
> +       }
> +
> +       if (dev->postcopy_listening) {
> +               /*
> +                * Cannot use vhost_user_postcopy_register() here because it
> +                * reads ctx->msg.payload.memory (SET_MEM_TABLE layout), but
> +                * ADD_MEM_REG uses the memreg payload.  Register the
> +                * single new region directly instead.
> +                */
> +               if (vhost_user_postcopy_region_register(dev, reg) < 0)
> +                       goto free_new_region;
> +       }
> +
> +       dev_invalidate_vrings(pdev, VHOST_USER_ADD_MEM_REG);
> +       dev = *pdev;
> +       dump_guest_pages(dev);
> +
> +       /*
> +        * In postcopy mode the front-end expects the back-end to reply with
> +        * the base of the mapped region (see VHOST_USER_SET_MEM_TABLE, which
> +        * applies here accordingly).  No reply is expected otherwise.
> +        *
> +        * translate_ring_addresses() above may have reallocated dev->mem via
> +        * numa_realloc(), so re-derive the region pointer from the refreshed
> +        * dev rather than using the now-stale reg.  The new region is the last
> +        * entry in the contiguous array.
> +        */
> +       if (dev->postcopy_listening) {
> +               reg = &dev->mem->regions[dev->mem->nregions - 1];
> +               ctx->msg.payload.memreg.region.userspace_addr = reg->host_user_addr;
> +               ctx->msg.size = sizeof(ctx->msg.payload.memreg);
> +               ctx->fd_num = 0;
> +               return RTE_VHOST_MSG_RESULT_REPLY;
> +       }

Thanks Stephen, good catch by the AI.
I did some digging into the QEMU code, which the AI later confirmed:
if the series was tested, the test passed "by accident" when postcopy
was not enabled, because QEMU would read the padding field of the
payload and treat its value as RTE_VHOST_MSG_RESULT_OK, since it is
zero-initialized...


With this fix:
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>

> +
> +       return RTE_VHOST_MSG_RESULT_OK;
> +
> +free_new_region:
> +       if (dev->async_copy && rte_vfio_is_enabled("vfio"))
> +               async_dma_map_region(dev, reg, false);
> +free_new_region_no_dma:
> +       remove_guest_pages(dev, reg);
> +       free_mem_region(reg);
> +       dev->mem->nregions--;
> +close_msg_fds:
> +       close_msg_fds(ctx);
> +       return RTE_VHOST_MSG_RESULT_ERR;
> +}
> +
> +static int
> +vhost_user_rem_mem_reg(struct virtio_net **pdev,
> +                       struct vhu_msg_context *ctx,
> +                       int main_fd __rte_unused)
> +{
> +       struct VhostUserMemoryRegion *region = &ctx->msg.payload.memreg.region;
> +       struct virtio_net *dev = *pdev;
> +       uint32_t i;
> +
> +       if (dev->mem == NULL || dev->mem->nregions == 0) {
> +               VHOST_CONFIG_LOG(dev->ifname, ERR, "no memory regions to remove");
> +               return RTE_VHOST_MSG_RESULT_ERR;
> +       }
> +
> +       for (i = 0; i < dev->mem->nregions; i++) {
> +               struct rte_vhost_mem_region *current_region = &dev->mem->regions[i];
> +
> +               /*
> +                * According to the vhost-user specification:
> +                * The memory region to be removed is identified by its GPA,
> +                * user address and size. The mmap offset is ignored.
> +                */
> +               if (region->userspace_addr == current_region->guest_user_addr
> +                       && region->guest_phys_addr == current_region->guest_phys_addr
> +                       && region->memory_size == current_region->size) {
> +                       if (dev->async_copy && rte_vfio_is_enabled("vfio"))
> +                               async_dma_map_region(dev, current_region, false);
> +                       if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
> +                               vhost_user_iotlb_cache_remove(dev,
> +                                       current_region->guest_phys_addr,
> +                                       current_region->size);
> +                       remove_guest_pages(dev, current_region);
> +                       free_mem_region(current_region);
> +
> +                       /* Compact the regions array to keep it contiguous */
> +                       if (i < dev->mem->nregions - 1) {
> +                               memmove(&dev->mem->regions[i],
> +                                       &dev->mem->regions[i + 1],
> +                                       (dev->mem->nregions - 1 - i) *
> +                                       sizeof(struct rte_vhost_mem_region));
> +                               memset(&dev->mem->regions[dev->mem->nregions - 1],
> +                                       0, sizeof(struct rte_vhost_mem_region));
> +                       }
> +
> +                       dev->mem->nregions--;
> +                       dev_invalidate_vrings(pdev, VHOST_USER_REM_MEM_REG);
> +                       dev = *pdev;
> +                       return RTE_VHOST_MSG_RESULT_OK;
> +               }
> +       }
> +
> +       VHOST_CONFIG_LOG(dev->ifname, ERR, "failed to find region");
> +       return RTE_VHOST_MSG_RESULT_ERR;
> +}
> +
>  static bool
>  vq_is_ready(struct virtio_net *dev, struct vhost_virtqueue *vq)
>  {
> --
> 2.43.0
>


^ permalink raw reply

* Re: [PATCH v1 2/2] dma/odm: avoid zero length DMA transfers
From: Jerin Jacob @ 2026-06-08 16:26 UTC (permalink / raw)
  To: Shijith Thotton; +Cc: Gowrishankar Muthukrishnan, Vidya Sagar Velumuri, dev
In-Reply-To: <20260601101559.1925302-3-sthotton@marvell.com>

On Mon, Jun 1, 2026 at 3:46 PM Shijith Thotton <sthotton@marvell.com> wrote:
>
> Add validation to reject zero-length DMA operations early
> with -EINVAL, preventing queue disable.
>
> Signed-off-by: Shijith Thotton <sthotton@marvell.com>


Added Fixes to 2/2 patch.

Series applied to dpdk-next-net-mrvl/for-main. Thanks



> ---
>  drivers/dma/odm/odm_dmadev.c | 18 +++++++++++++++---
>  1 file changed, 15 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/dma/odm/odm_dmadev.c b/drivers/dma/odm/odm_dmadev.c
> index 0211133bd4..7488b960fd 100644
> --- a/drivers/dma/odm/odm_dmadev.c
> +++ b/drivers/dma/odm/odm_dmadev.c
> @@ -110,6 +110,9 @@ odm_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t ds
>         vq = &odm->vq[vchan];
>         hdr.s.xtype = vq->xtype;
>
> +       if (unlikely(!length))
> +               return -EINVAL;
> +
>         h = length;
>         h |= ((uint64_t)length << 32);
>
> @@ -262,14 +265,20 @@ odm_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *
>         pending_submit_len = vq->pending_submit_len;
>         pending_submit_cnt = vq->pending_submit_cnt;
>
> -       if (unlikely(nb_src > 4 || nb_dst > 4))
> +       if (unlikely(!nb_src || nb_src > 4 || !nb_dst || nb_dst > 4))
>                 return -EINVAL;
>
> -       for (i = 0; i < nb_src; i++)
> +       for (i = 0; i < nb_src; i++) {
> +               if (unlikely(!src[i].length))
> +                       return -EINVAL;
>                 s_sz += src[i].length;
> +       }
>
> -       for (i = 0; i < nb_dst; i++)
> +       for (i = 0; i < nb_dst; i++) {
> +               if (unlikely(!dst[i].length))
> +                       return -EINVAL;
>                 d_sz += dst[i].length;
> +       }
>
>         if (s_sz != d_sz)
>                 return -EINVAL;
> @@ -342,6 +351,9 @@ odm_dmadev_fill(void *dev_private, uint16_t vchan, uint64_t pattern, rte_iova_t
>                 .s.nlst = 1,
>         };
>
> +       if (unlikely(!length))
> +               return -EINVAL;
> +
>         h = (uint64_t)length;
>
>         switch (pattern) {
> --
> 2.25.1
>

^ permalink raw reply

* Re: [PATCH] lib/net: Add ICMP support to rte_net_get_ptype()
From: Stephen Hemminger @ 2026-06-08 16:28 UTC (permalink / raw)
  To: Eimear Morrissey; +Cc: dev@dpdk.org
In-Reply-To: <03abaccc37f14f4c8955580784f30cbe@huawei.com>

On Fri, 31 Oct 2025 12:32:18 +0000
Eimear Morrissey <eimear.morrissey@huawei.com> wrote:

> > -----Original Message-----
> > From: Stephen Hemminger <stephen@networkplumber.org>
> > Sent: Wednesday 15 October 2025 18:30
> > To: Eimear Morrissey <eimear.morrissey@huawei.com>
> > Cc: dev@dpdk.org
> > Subject: Re: [PATCH] lib/net: Add ICMP support to rte_net_get_ptype()
> > 
> > On Thu, 9 Oct 2025 16:27:40 +0100
> > Eimear Morrissey <eimear.morrissey@huawei.com> wrote:
> >   
> > > Set RTE_PTYPE_L4_ICMP for ICMP packets.
> > >
> > > Signed-off-by: Eimear Morrissey <eimear.morrissey@huawei.com>  
> > 
> > Would be good to do ICMP6 as well.  
> 
> 
> Should an ICMPv6 packet be RTE_PTYPE_L4_ICMP/RTE_PTYPE_INNER_L4_ICMP as well? 
> 
> The comments in rte_mbuf_ptype.h are inconsistent, the top level comment with examples suggests that 'version'=6, 'next header'=0x3A
>  should be RTE_PTYPE_INNER_L4_ICMP but the comment on RTE_PTYPE_INNER_L4_ICMP itself says 'version'=6, 'next header'=1?
> 
> -Eimear

	static const uint32_t ptype_l4_proto[256] = {
		[IPPROTO_ICMP] = RTE_PTYPE_L4_ICMP,
		[IPPROTO_ICMPV6] = RTE_PTYPE_L4_ICMP,
		[IPPROTO_UDP] = RTE_PTYPE_L4_UDP,
		...

^ permalink raw reply

* [PATCH v4] net/iavf: fix duplicate VF reset during PF reset recovery
From: Anurag Mandal @ 2026-06-08 16:29 UTC (permalink / raw)
  To: dev
  Cc: bruce.richardson, vladimir.medvedkin, ciara.loftus, Anurag Mandal,
	stable
In-Reply-To: <20260605202911.314359-1-anurag.mandal@intel.com>

During PF-initiated reset recovery, iavf_dev_close() sends
an extra VIRTCHNL_OP_RESET_VF while recovery is already in progress.
That second reset can leave PF/VF virtchnl state inconsistent and
cause VIRTCHNL_OP_CONFIG_VSI_QUEUES to fail with ERR_PARAM after a
ToR link flap/power-cycle, leaving the VF unable to recover.
This results in connection loss.

This patch skips the close-time VF reset and related close-time
virtchnl operations when a PF-triggered reset recovery is in progress.
This avoids a duplicate VF reset while keeping the normal
behavior for an application-driven close.

Fixes: 675a104e2e94 ("net/iavf: fix abnormal disable HW interrupt")
Fixes: b34fe66ea893 ("net/iavf: delay VF reset command")
Fixes: 5e03e316c753 ("net/iavf: handle virtchnl event message without interrupt")
Cc: stable@dpdk.org

Signed-off-by: Anurag Mandal <anurag.mandal@intel.com>
---
V4: Addressed Ciara Loftus comments
  - split VF reset from other code changes 
V3: Addressed latest ai-code-review comments
V2: Addressed ai-code-review comments

 doc/guides/rel_notes/release_26_07.rst |  3 +++
 drivers/net/intel/iavf/iavf_ethdev.c   | 37 +++++++++++++++-----------
 drivers/net/intel/iavf/iavf_vchnl.c    | 18 ++++++++++---
 3 files changed, 39 insertions(+), 19 deletions(-)

diff --git a/doc/guides/rel_notes/release_26_07.rst b/doc/guides/rel_notes/release_26_07.rst
index d2563ac503..f6899a78c3 100644
--- a/doc/guides/rel_notes/release_26_07.rst
+++ b/doc/guides/rel_notes/release_26_07.rst
@@ -95,6 +95,9 @@ New Features
 
   * Added support for transmitting LLDP packets based on mbuf packet type.
   * Implemented AVX2 context descriptor transmit paths.
+  * Prevented duplicate 'VIRTCHNL_OP_RESET_VF' during a PF-initiated
+    reset recovery, which earlier caused virtchnl state corruption
+    and connection loss after a top-of-rack (ToR) link flap/power-cycle.
 
 * **Updated PCAP ethernet driver.**
 
diff --git a/drivers/net/intel/iavf/iavf_ethdev.c b/drivers/net/intel/iavf/iavf_ethdev.c
index a8031e23a5..99457ae510 100644
--- a/drivers/net/intel/iavf/iavf_ethdev.c
+++ b/drivers/net/intel/iavf/iavf_ethdev.c
@@ -3166,24 +3166,27 @@ iavf_dev_close(struct rte_eth_dev *dev)
 
 	ret = iavf_dev_stop(dev);
 
-	/*
-	 * Release redundant queue resource when close the dev
-	 * so that other vfs can re-use the queues.
-	 */
-	if (vf->lv_enabled) {
-		ret = iavf_request_queues(dev, IAVF_MAX_NUM_QUEUES_DFLT);
-		if (ret)
-			PMD_DRV_LOG(ERR, "Reset the num of queues failed");
+	/* Skip RESET_VF on a PF-initiated reset */
+	if (!adapter->closed && !vf->in_reset_recovery) {
+		/*
+		 * Release redundant queue resource when close the dev
+		 * so that other vfs can re-use the queues.
+		 */
+		if (vf->lv_enabled) {
+			ret = iavf_request_queues(dev, IAVF_MAX_NUM_QUEUES_DFLT);
+			if (ret)
+				PMD_DRV_LOG(ERR, "Reset the num of queues failed");
+			vf->max_rss_qregion = IAVF_MAX_NUM_QUEUES_DFLT;
+		}
 
-		vf->max_rss_qregion = IAVF_MAX_NUM_QUEUES_DFLT;
+		/*
+		 * Disable promiscuous mode before resetting the VF. This is to avoid
+		 * potential issues when the PF is bound to the kernel driver.
+		 */
+		if (vf->promisc_unicast_enabled || vf->promisc_multicast_enabled)
+			iavf_config_promisc(adapter, false, false);
 	}
 
-	/* Disable promiscuous mode before resetting the VF. This is to avoid
-	 * potential issues when the PF is bound to the kernel driver.
-	 */
-	if (vf->promisc_unicast_enabled || vf->promisc_multicast_enabled)
-		iavf_config_promisc(adapter, false, false);
-
 	adapter->closed = true;
 
 	/* free iAVF security device context all related resources */
@@ -3195,7 +3198,9 @@ iavf_dev_close(struct rte_eth_dev *dev)
 	iavf_flow_flush(dev, NULL);
 	iavf_flow_uninit(adapter);
 
-	iavf_vf_reset(hw);
+	/* Skip RESET_VF on a PF-initiated reset */
+	if (!vf->in_reset_recovery)
+		iavf_vf_reset(hw);
 	vf->aq_intr_enabled = false;
 	iavf_shutdown_adminq(hw);
 	if (vf->vf_res->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_WB_ON_ITR) {
diff --git a/drivers/net/intel/iavf/iavf_vchnl.c b/drivers/net/intel/iavf/iavf_vchnl.c
index 94ccfb5d6e..cf3513ef94 100644
--- a/drivers/net/intel/iavf/iavf_vchnl.c
+++ b/drivers/net/intel/iavf/iavf_vchnl.c
@@ -283,9 +283,21 @@ iavf_read_msg_from_pf(struct iavf_adapter *adapter, uint16_t buf_len,
 					vf->link_up ? "up" : "down");
 			break;
 		case VIRTCHNL_EVENT_RESET_IMPENDING:
-			vf->vf_reset = true;
-			iavf_set_no_poll(adapter, false);
-			PMD_DRV_LOG(INFO, "VF is resetting");
+			/*
+			 * Force link down on impending reset to drop
+			 * the cached link-up state; a fresh LSC up
+			 * event will be re-issued by the PF once the
+			 * VF is reinitialised.
+			 */
+			vf->link_up = false;
+			if (!vf->vf_reset) {
+				vf->vf_reset = true;
+				iavf_set_no_poll(adapter, false);
+				iavf_dev_event_post(vf->eth_dev,
+					RTE_ETH_EVENT_INTR_RESET,
+					NULL, 0);
+			}
+			PMD_DRV_LOG(DEBUG, "VF is resetting");
 			break;
 		case VIRTCHNL_EVENT_PF_DRIVER_CLOSE:
 			vf->dev_closed = true;
-- 
2.34.1


^ permalink raw reply related

* RE: [PATCH v3] net/iavf: fix duplicate VF reset during PF reset recovery
From: Mandal, Anurag @ 2026-06-08 16:32 UTC (permalink / raw)
  To: Loftus, Ciara, dev@dpdk.org
  Cc: Richardson, Bruce, Medvedkin, Vladimir, stable@dpdk.org
In-Reply-To: <IA4PR11MB9278AB660100B34D0001DC478E1C2@IA4PR11MB9278.namprd11.prod.outlook.com>

> -----Original Message-----
> From: Loftus, Ciara <ciara.loftus@intel.com>
> Sent: 08 June 2026 15:29
> To: Mandal, Anurag <anurag.mandal@intel.com>; dev@dpdk.org
> Cc: Richardson, Bruce <bruce.richardson@intel.com>; Medvedkin, Vladimir
> <vladimir.medvedkin@intel.com>; Mandal, Anurag
> <anurag.mandal@intel.com>; stable@dpdk.org
> Subject: RE: [PATCH v3] net/iavf: fix duplicate VF reset during PF reset recovery
> 
> > Subject: [PATCH v3] net/iavf: fix duplicate VF reset during PF reset
> > recovery
> >
> > During PF initiated reset recovery, iavf_dev_close() sending an extra
> > VIRTCHNL_OP_RESET_VF while recovery is already in progress.
> > That second reset can leave PF/VF virtchnl state inconsistent and
> > cause VIRTCHNL_OP_CONFIG_VSI_QUEUES to fail with ERR_PARAM after ToR
> > link flap/power-cycle, leaving the VF unable to recover.
> > This results in connection loss.
> >
> > Skipped close-time VF reset and related close-time virtchnl operations
> > when PF triggered reset recovery is set. This is done to avoid a
> > duplicate VF reset, and keep normal behavior for application-driven
> > close.
> > Handled link-change events through a common static function that reads
> > the correct advanced & legacy link fields properly and updates
> > no-poll/watchdog/LSC state consistently.
> > Also added IAVF_ERR_ADMIN_QUEUE_NO_WORK in virtchnl message drain as a
> > normal empty-queue condition and avoided logging it as a misleading AQ
> > failure.
> >
> > Fixes: 675a104e2e94 ("net/iavf: fix abnormal disable HW interrupt")
> > Fixes: b34fe66ea893 ("net/iavf: delay VF reset command")
> > Fixes: 5e03e316c753 ("net/iavf: handle virtchnl event message without
> > interrupt")
> > Fixes: 5c8ca9f13c78 ("net/iavf: fix no polling mode switching")
> > Fixes: 48de41ca11f0 ("net/avf: enable link status update")
> > Fixes: 02d212ca3125 ("net/iavf: rename remaining avf strings")
> > Cc: stable@dpdk.org
> 
> Hi Anurag,
> 
> Thanks for the patch. There seems to be multiple logical fixes/changes in here
> and I think it would be good to split them into individual patches, each with
> their own Fixes tag where relevant. Having multiple fixes in one patch with
> multiple Fixes tags makes backporting tricky.
> I think at least logic which prevents the RESET_VF during a PF initiated reset
> should be split out from the link-change logic.
> 
> Thanks,
> Ciara
> 

Hi Ciara,

Thank you for your review comment. I have split out the VF reset changes and sent patch v4 for them. I will send the remaining changes as a separate patch.

Kindly review.

Thanks,
Anurag

> >
> > Signed-off-by: Anurag Mandal <anurag.mandal@intel.com>
> > ---
> > V3: Addressed latest ai-code-review comments
> > V2: Addressed ai-code-review comments
> >
> >  doc/guides/rel_notes/release_26_07.rst |   3 +
> >  drivers/net/intel/iavf/iavf_ethdev.c   |  37 +++---
> >  drivers/net/intel/iavf/iavf_vchnl.c    | 155 ++++++++++++++++---------
> >  3 files changed, 123 insertions(+), 72 deletions(-)
> >
> > diff --git a/doc/guides/rel_notes/release_26_07.rst
> > b/doc/guides/rel_notes/release_26_07.rst
> > index b8a3e2ced9..e7ac730369 100644
> > --- a/doc/guides/rel_notes/release_26_07.rst
> > +++ b/doc/guides/rel_notes/release_26_07.rst
> > @@ -89,6 +89,9 @@ New Features
> >
> >    * Added support for transmitting LLDP packets based on mbuf packet type.
> >    * Implemented AVX2 context descriptor transmit paths.
> > +  * Prevented duplicate 'VIRTCHNL_OP_RESET_VF' during a PF-initiated
> > +    reset recovery, which earlier caused virtchnl state corruption
> > +    and connection loss after a top-of-rack (ToR) link flap/power-cycle.
> >
> >  * **Updated PCAP ethernet driver.**
> >
> > diff --git a/drivers/net/intel/iavf/iavf_ethdev.c
> > b/drivers/net/intel/iavf/iavf_ethdev.c
> > index bdf650b822..fb6f287d3c 100644
> > --- a/drivers/net/intel/iavf/iavf_ethdev.c
> > +++ b/drivers/net/intel/iavf/iavf_ethdev.c
> > @@ -3166,24 +3166,27 @@ iavf_dev_close(struct rte_eth_dev *dev)
> >
> >  	ret = iavf_dev_stop(dev);
> >
> > -	/*
> > -	 * Release redundant queue resource when close the dev
> > -	 * so that other vfs can re-use the queues.
> > -	 */
> > -	if (vf->lv_enabled) {
> > -		ret = iavf_request_queues(dev,
> > IAVF_MAX_NUM_QUEUES_DFLT);
> > -		if (ret)
> > -			PMD_DRV_LOG(ERR, "Reset the num of queues
> > failed");
> > +	/* Skip RESET_VF on a PF-initiated reset */
> > +	if (!adapter->closed && !vf->in_reset_recovery) {
> 
> adapter->closed will always be false here so the check is redundant.
> 
> > +		/*
> > +		 * Release redundant queue resource when close the dev
> > +		 * so that other vfs can re-use the queues.
> > +		 */
> > +		if (vf->lv_enabled) {
> > +			ret = iavf_request_queues(dev,
> > IAVF_MAX_NUM_QUEUES_DFLT);
> > +			if (ret)
> > +				PMD_DRV_LOG(ERR, "Reset the num of
> > queues failed");
> > +			vf->max_rss_qregion =
> > IAVF_MAX_NUM_QUEUES_DFLT;
> > +		}
> >
> 


^ permalink raw reply

* Re: [PATCH] common/cnxk: fix VFIO MSI-X interrupt setup
From: Jerin Jacob @ 2026-06-08 16:35 UTC (permalink / raw)
  To: pbhagavatula
  Cc: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Harman Kalra, Stephen Hemminger, Tejasree Kondoj, dev,
	stable
In-Reply-To: <20260521082647.71442-1-pbhagavatula@marvell.com>

On Thu, May 21, 2026 at 2:25 PM <pbhagavatula@marvell.com> wrote:
>
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> Use heap allocation sized by the configured maximum interrupt count for the
> VFIO irq_set buffer, correct handling when irq.count is zero, and use a
> minimal stack buffer for per-vector configuration.
>
> Fixes: 1fb9f4ab14b3 ("common/cnxk: remove VLA in interrupt configuration")
> Cc: stable@dpdk.org
>
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>



Series applied to dpdk-next-net-mrvl/for-main. Thanks

^ permalink raw reply

* Re: [PATCH] eal: fix core_index for non-EAL registered threads
From: David Marchand @ 2026-06-08 16:35 UTC (permalink / raw)
  To: Maxime Peim; +Cc: dev
In-Reply-To: <CAJFAV8wn26hRUTzCG4ai+apT8+bWL7+cEE1N-gV1tJt8B9a4hQ@mail.gmail.com>

On Mon, 8 Jun 2026 at 18:10, David Marchand <david.marchand@redhat.com> wrote:
>
> On Wed, 22 Apr 2026 at 09:54, Maxime Peim <maxime.peim@gmail.com> wrote:
> >
> > Threads registered via rte_thread_register() are assigned a valid
> > lcore_id by eal_lcore_non_eal_allocate(), but their core_index in
> > lcore_config is left at -1. This value was set during rte_eal_cpu_init()
> > for lcores with ROLE_OFF (undetected CPUs) and is never updated when the
> > lcore is later allocated to a non-EAL thread.
> >
> > As a result, rte_lcore_index() returns -1 for registered non-EAL
> > threads. Libraries that use rte_lcore_index() to select per-lcore
> > caches fall back to a shared global path when it returns -1, causing
> > severe contention under concurrent access from multiple registered
> > threads.
> >
> > A concrete example is the mlx5 indexed memory pool (mlx5_ipool), which
> > uses rte_lcore_index() in mlx5_ipool_malloc_cache() to select a per-core
> > cache slot. When core_index is -1, all registered threads are funneled
> > into a single shared slot protected by a spinlock. In testing with VPP
> > (which registers worker threads via rte_thread_register()), this caused
> > async flow rule insertion throughput to drop from ~6.4M rules/sec to
> > ~1.2M rules/sec with 4 workers -- a 5x regression attributable entirely
> > to spinlock contention in the ipool allocator.
> >
> > Fix by setting core_index to the next sequential index (cfg->lcore_count)
> > in eal_lcore_non_eal_allocate() before incrementing the count. Also reset
> > core_index back to -1 on the error rollback path and in
> > eal_lcore_non_eal_release() for correctness.
> >
> > Fixes: 5c307ba2a5b1 ("eal: register non-EAL threads as lcores")
> Cc: stable@dpdk.org
>
> > Signed-off-by: Maxime Peim <maxime.peim@gmail.com>
> Acked-by: David Marchand <david.marchand@redhat.com>
>

Hum, I did not push the change.
Re-reading this code, we have an issue if some external thread
unregisters in the middle.

What do you think of the additional hunk:

$ git diff
diff --git a/lib/eal/common/eal_common_lcore.c
b/lib/eal/common/eal_common_lcore.c
index ae085d73e4..6f53f20d90 100644
--- a/lib/eal/common/eal_common_lcore.c
+++ b/lib/eal/common/eal_common_lcore.c
@@ -372,13 +372,16 @@ eal_lcore_non_eal_allocate(void)
        struct rte_config *cfg = rte_eal_get_configuration();
        struct lcore_callback *callback;
        struct lcore_callback *prev;
+       unsigned int index = 0;
        unsigned int lcore_id;

        rte_rwlock_write_lock(&lcore_lock);
        for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
-               if (cfg->lcore_role[lcore_id] != ROLE_OFF)
+               if (cfg->lcore_role[lcore_id] != ROLE_OFF) {
+                       index++;
                        continue;
-               lcore_config[lcore_id].core_index = cfg->lcore_count;
+               }
+               lcore_config[lcore_id].core_index = index;
                cfg->lcore_role[lcore_id] = ROLE_NON_EAL;
                cfg->lcore_count++;
                break;


-- 
David Marchand


^ permalink raw reply related

* Re: [PATCH 6/7] pcapng: add user-supplied timestamp support
From: Stephen Hemminger @ 2026-06-08 16:38 UTC (permalink / raw)
  To: Dawid Wesierski
  Cc: dev, bruce.richardson, anatoly.burakov, vladimir.medvedkin,
	reshma.pattan, thomas, andrew.rybchenko, marek.kasiewicz
In-Reply-To: <20260429073111.3712950-7-dawid.wesierski@intel.com>

On Wed, 29 Apr 2026 03:31:10 -0400
Dawid Wesierski <dawid.wesierski@intel.com> wrote:

> @@ -737,16 +736,6 @@ rte_pcapng_write_packets(rte_pcapng_t *self,
>  			return -1;
>  		}
>  
> -		/*
> -		 * When data is captured by pcapng_copy the current TSC is stored.
> -		 * Adjust the value recorded in file to PCAP epoch units.
> -		 */
> -		cycles = (uint64_t)epb->timestamp_hi << 32;
> -		cycles += epb->timestamp_lo;
> -		timestamp = tsc_to_ns_epoch(&self->clock, cycles);
> -		epb->timestamp_hi = timestamp >> 32;
> -		epb->timestamp_lo = (uint32_t)timestamp;
> -
>  		/*
>  		 * Handle case of highly fragmented and large burst size
>  		 * Note: this assumes that max segments per mbuf < IOV_MAX
> diff --git a/lib/pcapng/rte_pcapng.h b/lib/pcapng/rte_pcapng.h

NAK

You need to keep the correct timestamp correction.
PCAPNG specifies times as nanoseconds since 1/1/1970.


Any new API needs a test as well.

^ permalink raw reply

* Re: [PATCH 0/7] intel network and pcapng updates
From: Thomas Monjalon @ 2026-06-08 16:59 UTC (permalink / raw)
  To: Dawid Wesierski
  Cc: dev, david.marchand, vladimir.medvedkin, bruce.richardson,
	anatoly.burakov, reshma.pattan, stephen, Wesierski, Dawid
In-Reply-To: <20260608164059.65420-1-dawid.wesierski@intel.com>

08/06/2026 18:40, Dawid Wesierski:
> From: "Wesierski, Dawid" <dawid.wesierski@intel.com>
> 
> These patches provide various updates for Intel iavf/ice drivers and pcapng.

Please would you mind sending the pcapng changes in a separate series?
Thank you



^ permalink raw reply

* Re: [v3] net/cksum: compute raw cksum for several segments
From: Stephen Hemminger @ 2026-06-08 17:02 UTC (permalink / raw)
  To: Su Sai; +Cc: dev
In-Reply-To: <20250804035430.4058391-1-spiderdetective.ss@gmail.com>

On Mon,  4 Aug 2025 11:54:30 +0800
Su Sai <spiderdetective.ss@gmail.com> wrote:

> The rte_raw_cksum_mbuf function is used to compute
> the raw checksum of a packet.
> If the packet payload is stored in multiple mbufs, the function
> takes the hard case. In the hard case,
> the variable 'tmp' is of type uint32_t,
> so rte_bswap16 will drop the high 16 bits.
> Meanwhile, the variable 'sum' is of type uint32_t,
> so 'sum += tmp' will drop the carry on overflow.
> Both drops make the checksum incorrect.
> This commit fixes the above bugs.
> 
> Signed-off-by: Su Sai <spiderdetective.ss@gmail.com>
> ---
>  .mailmap              |   1 +
>  app/test/test_cksum.c | 106 ++++++++++++++++++++++++++++++++++++++++++
>  lib/net/rte_cksum.h   |  27 +++++++++--
>  3 files changed, 130 insertions(+), 4 deletions(-)
> 
> diff --git a/.mailmap b/.mailmap
> index 34a99f93a1..1da1d9f8e1 100644
> --- a/.mailmap
> +++ b/.mailmap
> @@ -1552,6 +1552,7 @@ Sunil Kumar Kori <skori@marvell.com> <sunil.kori@nxp.com>
>  Sunil Pai G <sunil.pai.g@intel.com>
>  Sunil Uttarwar <sunilprakashrao.uttarwar@amd.com>
>  Sun Jiajia <sunx.jiajia@intel.com>
> +Su Sai <spiderdetective.ss@gmail.com> <susai.ss@bytedance.com>
>  Sunyang Wu <sunyang.wu@jaguarmicro.com>
>  Surabhi Boob <surabhi.boob@intel.com>
>  Suyang Ju <sju@paloaltonetworks.com>
> diff --git a/app/test/test_cksum.c b/app/test/test_cksum.c
> index f2ab5af5a7..fb2e3cf9e6 100644
> --- a/app/test/test_cksum.c
> +++ b/app/test/test_cksum.c
> @@ -85,6 +85,42 @@ static const char test_cksum_ipv4_opts_udp[] = {
>  	0x00, 0x35, 0x00, 0x09, 0x89, 0x6f, 0x78,
>  };
>  
> +/*
> + * generated in scapy with
> + * Ether()/IP()/TCP(options=[NOP,NOP,Timestamps])/os.urandom(113))
> + */
> +static const char test_cksum_ipv4_tcp_multi_segs[] = {
> +	0x00, 0x16, 0x3e, 0x0b, 0x6b, 0xd2, 0xee, 0xff,
> +	0xff, 0xff, 0xff, 0xff, 0x08, 0x00, 0x45, 0x00,
> +	0x00, 0xa5, 0x46, 0x10, 0x40, 0x00, 0x40, 0x06,
> +	0x80, 0xb5, 0xc0, 0xa8, 0xf9, 0x1d, 0xc0, 0xa8,
> +	0xf9, 0x1e, 0xdc, 0xa2, 0x14, 0x51, 0xbb, 0x8f,
> +	0xa0, 0x00, 0xe4, 0x7c, 0xe4, 0xb8, 0x80, 0x10,
> +	0x02, 0x00, 0x4b, 0xc1, 0x00, 0x00, 0x01, 0x01,
> +	0x08, 0x0a, 0x90, 0x60, 0xf4, 0xff, 0x03, 0xc5,
> +	0xb4, 0x19, 0x77, 0x34, 0xd4, 0xdc, 0x84, 0x86,
> +	0xff, 0x44, 0x09, 0x63, 0x36, 0x2e, 0x26, 0x9b,
> +	0x90, 0x70, 0xf2, 0xed, 0xc8, 0x5b, 0x87, 0xaa,
> +	0xb4, 0x67, 0x6b, 0x32, 0x3d, 0xc4, 0xbf, 0x15,
> +	0xa9, 0x16, 0x6c, 0x2a, 0x9d, 0xb2, 0xb7, 0x6b,
> +	0x58, 0x44, 0x58, 0x12, 0x4b, 0x8f, 0xe5, 0x12,
> +	0x11, 0x90, 0x94, 0x68, 0x37, 0xad, 0x0a, 0x9b,
> +	0xd6, 0x79, 0xf2, 0xb7, 0x31, 0xcf, 0x44, 0x22,
> +	0xc8, 0x99, 0x3f, 0xe5, 0xe7, 0xac, 0xc7, 0x0b,
> +	0x86, 0xdf, 0xda, 0xed, 0x0a, 0x0f, 0x86, 0xd7,
> +	0x48, 0xe2, 0xf1, 0xc2, 0x43, 0xed, 0x47, 0x3a,
> +	0xea, 0x25, 0x2d, 0xd6, 0x60, 0x38, 0x30, 0x07,
> +	0x28, 0xdd, 0x1f, 0x0c, 0xdd, 0x7b, 0x7c, 0xd9,
> +	0x35, 0x9d, 0x14, 0xaa, 0xc6, 0x35, 0xd1, 0x03,
> +	0x38, 0xb1, 0xf5,
> +};
> +
> +static const uint8_t test_cksum_ipv4_tcp_multi_segs_len[] = {
> +	66,  /* the first seg contains all headers, including L2 to L4 */
> +	61,  /* the second seg length is odd, test byte order independent */
> +	52,  /* three segs are sufficient to test the most complex scenarios */
> +};
> +
>  /* test l3/l4 checksum api */
>  static int
>  test_l4_cksum(struct rte_mempool *pktmbuf_pool, const char *pktdata, size_t len)
> @@ -223,6 +259,70 @@ test_l4_cksum(struct rte_mempool *pktmbuf_pool, const char *pktdata, size_t len)
>  	return -1;
>  }
>  
> +/* test l4 checksum api for a packet with multiple mbufs */
> +static int
> +test_l4_cksum_multi_mbufs(struct rte_mempool *pktmbuf_pool, const char *pktdata, size_t len,
> +			     const uint8_t *segs, size_t segs_len)
> +{
> +	struct rte_mbuf *m[NB_MBUF] = {0};
> +	struct rte_mbuf *m_hdr = NULL;
> +	struct rte_net_hdr_lens hdr_lens;
> +	size_t i, off = 0;
> +	uint32_t packet_type, l3;
> +	void *l3_hdr;
> +	char *data;
> +
> +	for (i = 0; i < segs_len; i++) {
> +		m[i] = rte_pktmbuf_alloc(pktmbuf_pool);
> +		if (m[i] == NULL)
> +			GOTO_FAIL("Cannot allocate mbuf");
> +
> +		data = rte_pktmbuf_append(m[i], segs[i]);
> +		if (data == NULL)
> +			GOTO_FAIL("Cannot append data");
> +
> +		rte_memcpy(data, pktdata + off, segs[i]);

Tests (except rte_memcpy test) should not use rte_memcpy, instead use
regular memcpy which has better coverage from analyzers.

> +		off += segs[i];
> +
> +		if (m_hdr) {
> +			if (rte_pktmbuf_chain(m_hdr, m[i]))
> +				GOTO_FAIL("Cannot chain mbuf");
> +		} else {
> +			m_hdr = m[i];
> +		}
> +	}
> +
> +	if (off != len)
> +		GOTO_FAIL("Invalid segs");
> +
> +	packet_type = rte_net_get_ptype(m_hdr, &hdr_lens, RTE_PTYPE_ALL_MASK);
> +	l3 = packet_type & RTE_PTYPE_L3_MASK;
> +
> +	l3_hdr = rte_pktmbuf_mtod_offset(m_hdr, void *, hdr_lens.l2_len);
> +	off = hdr_lens.l2_len + hdr_lens.l3_len;
> +
> +	if (l3 == RTE_PTYPE_L3_IPV4 || l3 == RTE_PTYPE_L3_IPV4_EXT) {
> +		if (rte_ipv4_udptcp_cksum_mbuf_verify(m_hdr, l3_hdr, off) != 0)
> +			GOTO_FAIL("Invalid L4 checksum verification for multiple mbufs");
> +	} else if (l3 == RTE_PTYPE_L3_IPV6 || l3 == RTE_PTYPE_L3_IPV6_EXT) {
> +		if (rte_ipv6_udptcp_cksum_mbuf_verify(m_hdr, l3_hdr, off) != 0)
> +			GOTO_FAIL("Invalid L4 checksum verification for multiple mbufs");
> +	}
> +
> +	for (i = 0; i < segs_len; i++)
> +		rte_pktmbuf_free(m[i]);

Can avoid the loop here and elsewhere by using rte_pktmbuf_free_bulk()

> +	return 0;
> +
> +fail:
> +	for (i = 0; i < segs_len; i++) {
> +		if (m[i])
> +			rte_pktmbuf_free(m[i]);
> +	}

Freebulk will work here

> +	return -1;
> +}
> +
>  static int
>  test_cksum(void)
>  {
> @@ -256,6 +356,12 @@ test_cksum(void)
>  			  sizeof(test_cksum_ipv4_opts_udp)) < 0)
>  		GOTO_FAIL("checksum error on ipv4_opts_udp");
>  
> +	if (test_l4_cksum_multi_mbufs(pktmbuf_pool, test_cksum_ipv4_tcp_multi_segs,
> +			  sizeof(test_cksum_ipv4_tcp_multi_segs),
> +			  test_cksum_ipv4_tcp_multi_segs_len,
> +			  sizeof(test_cksum_ipv4_tcp_multi_segs_len)) < 0)
> +		GOTO_FAIL("checksum error on multi mbufs check");
> +
>  	rte_mempool_free(pktmbuf_pool);
>  
>  	return 0;
> diff --git a/lib/net/rte_cksum.h b/lib/net/rte_cksum.h
> index a8e8927952..679ba82eb6 100644
> --- a/lib/net/rte_cksum.h
> +++ b/lib/net/rte_cksum.h
> @@ -80,6 +80,25 @@ __rte_raw_cksum_reduce(uint32_t sum)
>  	return (uint16_t)sum;
>  }
>  
> +/**
> + * @internal Reduce a sum to the non-complemented checksum.
> + * Helper routine for the rte_raw_cksum_mbuf().
> + *
> + * @param sum
> + *   Value of the sum.
> + * @return
> + *   The non-complemented checksum.
> + */
> +static inline uint16_t
> +__rte_raw_cksum_reduce_u64(uint64_t sum)
> +{
> +	uint32_t tmp;
> +
> +	tmp = __rte_raw_cksum_reduce((uint32_t)sum);
> +	tmp += __rte_raw_cksum_reduce((uint32_t)(sum >> 32));
> +	return __rte_raw_cksum_reduce(tmp);
> +}
> +
>  /**
>   * Process the non-complemented checksum of a buffer.
>   *
> @@ -119,8 +138,8 @@ rte_raw_cksum_mbuf(const struct rte_mbuf *m, uint32_t off, uint32_t len,
>  {
>  	const struct rte_mbuf *seg;
>  	const char *buf;
> -	uint32_t sum, tmp;
> -	uint32_t seglen, done;
> +	uint32_t seglen, done, tmp;
> +	uint64_t sum;
>  
>  	/* easy case: all data in the first segment */
>  	if (off + len <= rte_pktmbuf_data_len(m)) {
> @@ -157,7 +176,7 @@ rte_raw_cksum_mbuf(const struct rte_mbuf *m, uint32_t off, uint32_t len,
>  	for (;;) {
>  		tmp = __rte_raw_cksum(buf, seglen, 0);
>  		if (done & 1)
> -			tmp = rte_bswap16((uint16_t)tmp);
> +			tmp = rte_bswap32(tmp);
>  		sum += tmp;
>  		done += seglen;
>  		if (done == len)
> @@ -169,7 +188,7 @@ rte_raw_cksum_mbuf(const struct rte_mbuf *m, uint32_t off, uint32_t len,
>  			seglen = len - done;
>  	}
>  
> -	*cksum = __rte_raw_cksum_reduce(sum);
> +	*cksum = __rte_raw_cksum_reduce_u64(sum);
>  	return 0;
>  }
>  


^ permalink raw reply

* Re: [PATCH] test/event_eth_rx_intr_adapter: support NICs with fewer int vectors
From: Jerin Jacob @ 2026-06-08 17:08 UTC (permalink / raw)
  To: Loftus, Ciara; +Cc: Richardson, Bruce, dev@dpdk.org
In-Reply-To: <DM3PPF7D18F34A115BCA37E02527DC8C08B8E2D2@DM3PPF7D18F34A1.namprd11.prod.outlook.com>

On Wed, Apr 22, 2026 at 4:06 PM Loftus, Ciara <ciara.loftus@intel.com> wrote:
>
> > Subject: [PATCH] test/event_eth_rx_intr_adapter: support NICs with fewer int
> > vectors
> >
> > Some NICs may not be able to support interrupts on all queues that are
> > advertised, which will cause the test to fail if the queues supporting
> > interrupts are fewer than 64. We can work around this by retrying the
> > NIC configuration multiple times with fewer queues in case of failure.
> > This allows the test to pass with NICs using ixgbe driver, for example.
> >
> > Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
>
> LGTM.
> Acked-by: Ciara Loftus <ciara.loftus@intel.com>


Applied to dpdk-next-eventdev/for-main. Thanks

>
> > ---
> >  app/test/test_event_eth_rx_adapter.c | 52 +++++++++++++++++-----------
> >  1 file changed, 32 insertions(+), 20 deletions(-)
> >
> > diff --git a/app/test/test_event_eth_rx_adapter.c
> > b/app/test/test_event_eth_rx_adapter.c
> > index ae428b3333..7b38935bec 100644
> > --- a/app/test/test_event_eth_rx_adapter.c
> > +++ b/app/test/test_event_eth_rx_adapter.c
> > @@ -60,6 +60,7 @@ port_init_common(uint16_t port, const struct
> > rte_eth_conf *port_conf,
> >  {
> >       const uint16_t rx_ring_size = 512, tx_ring_size = 512;
> >       int retval;
> > +     bool started = false;
> >       uint16_t q;
> >       struct rte_eth_dev_info dev_info;
> >
> > @@ -76,32 +77,43 @@ port_init_common(uint16_t port, const struct
> > rte_eth_conf *port_conf,
> >                                       MAX_NUM_RX_QUEUE);
> >       default_params.tx_rings = 1;
> >
> > -     /* Configure the Ethernet device. */
> > -     retval = rte_eth_dev_configure(port, default_params.rx_rings,
> > +     while (!started) {
> > +             /* Configure the Ethernet device. */
> > +             retval = rte_eth_dev_configure(port, default_params.rx_rings,
> >                               default_params.tx_rings, port_conf);
> > -     if (retval != 0)
> > -             return retval;
> > -
> > -     for (q = 0; q < default_params.rx_rings; q++) {
> > -             retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
> > -                             rte_eth_dev_socket_id(port), NULL, mp);
> > -             if (retval < 0)
> > +             if (retval != 0)
> >                       return retval;
> > -     }
> >
> > -     /* Allocate and set up 1 TX queue per Ethernet port. */
> > -     for (q = 0; q < default_params.tx_rings; q++) {
> > -             retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
> > -                             rte_eth_dev_socket_id(port), NULL);
> > -             if (retval < 0)
> > +             for (q = 0; q < default_params.rx_rings; q++) {
> > +                     retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
> > +                                     rte_eth_dev_socket_id(port), NULL,
> > mp);
> > +                     if (retval < 0)
> > +                             return retval;
> > +             }
> > +
> > +             /* Allocate and set up 1 TX queue per Ethernet port. */
> > +             for (q = 0; q < default_params.tx_rings; q++) {
> > +                     retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
> > +                                     rte_eth_dev_socket_id(port), NULL);
> > +                     if (retval < 0)
> > +                             return retval;
> > +             }
> > +
> > +             /* Start the Ethernet port. */
> > +             retval = rte_eth_dev_start(port);
> > +             if (retval < 0) {
> > +                     /* Some NICs may not support interrupts on all
> > reported queues.
> > +                      * Therefore try to reconfigure and start with fewer
> > queues
> > +                      */
> > +                     if (default_params.rx_rings > 2) {
> > +                             default_params.rx_rings /= 2;
> > +                             continue;
> > +                     }
> >                       return retval;
> > +             }
> > +             started = true;
> >       }
> >
> > -     /* Start the Ethernet port. */
> > -     retval = rte_eth_dev_start(port);
> > -     if (retval < 0)
> > -             return retval;
> > -
> >       /* Display the port MAC address. */
> >       struct rte_ether_addr addr;
> >       retval = rte_eth_macaddr_get(port, &addr);
> > --
> > 2.51.0
>

^ permalink raw reply

* Re: [PATCH 6/7] pcapng: add user-supplied timestamp support
From: Stephen Hemminger @ 2026-06-08 17:09 UTC (permalink / raw)
  To: Dawid Wesierski
  Cc: dev, thomas, david.marchand, vladimir.medvedkin, bruce.richardson,
	anatoly.burakov, reshma.pattan, Marek Kasiewicz
In-Reply-To: <20260608164059.65420-7-dawid.wesierski@intel.com>

On Mon,  8 Jun 2026 12:40:58 -0400
Dawid Wesierski <dawid.wesierski@intel.com> wrote:

> @@ -737,16 +736,6 @@ rte_pcapng_write_packets(rte_pcapng_t *self,
>  			return -1;
>  		}
>  
> -		/*
> -		 * When data is captured by pcapng_copy the current TSC is stored.
> -		 * Adjust the value recorded in file to PCAP epoch units.
> -		 */
> -		cycles = (uint64_t)epb->timestamp_hi << 32;
> -		cycles += epb->timestamp_lo;
> -		timestamp = tsc_to_ns_epoch(&self->clock, cycles);
> -		epb->timestamp_hi = timestamp >> 32;
> -		epb->timestamp_lo = (uint32_t)timestamp;
> -

You can't generate valid pcapng timestamps without this.

^ permalink raw reply

* [PATCH] net/iavf: fix misleading AQ failure logging
From: Anurag Mandal @ 2026-06-08 17:15 UTC (permalink / raw)
  To: dev
  Cc: bruce.richardson, vladimir.medvedkin, ciara.loftus, Anurag Mandal,
	stable

iavf_handle_virtchnl_msg() drains the admin receive queue in a loop
until iavf_clean_arq_element() reports that no descriptors are
pending. When the queue becomes empty, the base driver returns
IAVF_ERR_ADMIN_QUEUE_NO_WORK (-57), which is the documented
terminator for the drain loop, and is not an error.

The current loop treats every non-IAVF_SUCCESS return as a failure
and logs it as follows:

"Failed to read msg from AdminQ, ret: -57"

This message floods the logs on every interrupt cycle and misleads
triage during VF reset: someone chasing a real virtchnl problem sees
these spurious -57 AQ failure lines in the logs and assumes the admin
queue is broken, when in fact it has just been drained.

This patch fixes the aforesaid issue by treating
IAVF_ERR_ADMIN_QUEUE_NO_WORK in the virtchnl message drain as a normal
empty-queue loop exit condition and no longer logging it as a misleading
AQ failure.

Fixes: 02d212ca3125 ("net/iavf: rename remaining avf strings")
Cc: stable@dpdk.org

Signed-off-by: Anurag Mandal <anurag.mandal@intel.com>
---
 drivers/net/intel/iavf/iavf_vchnl.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/net/intel/iavf/iavf_vchnl.c b/drivers/net/intel/iavf/iavf_vchnl.c
index 94ccfb5d6e..870d5c1820 100644
--- a/drivers/net/intel/iavf/iavf_vchnl.c
+++ b/drivers/net/intel/iavf/iavf_vchnl.c
@@ -570,7 +570,15 @@ iavf_handle_virtchnl_msg(struct rte_eth_dev *dev)
 	while (pending) {
 		ret = iavf_clean_arq_element(hw, &info, &pending);

-		if (ret != IAVF_SUCCESS) {
+		/*
+		 * IAVF_ERR_ADMIN_QUEUE_NO_WORK (-57) means AQ is empty
+		 * and is a normal way to terminate the drain loop.
+		 * Log error only for genuine other failure codes.
+		 * Incorrect logging like this during VF resets might
+		 * mislead into chasing a non-existent AQ failure.
+		 */
+		if (ret != IAVF_SUCCESS &&
+		    ret != IAVF_ERR_ADMIN_QUEUE_NO_WORK) {
 			PMD_DRV_LOG(INFO, "Failed to read msg from AdminQ,"
 				    "ret: %d", ret);
 			break;
-- 
2.34.1

^ permalink raw reply related

* Re: [PATCH 2/2] net/cnxk: add PMD API to support custom profile setup
From: Jerin Jacob @ 2026-06-08 17:27 UTC (permalink / raw)
  To: rkudurumalla
  Cc: Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori, Satha Rao,
	Harman Kalra, dev, jerinj
In-Reply-To: <20260505085037.3961519-2-rkudurumalla@marvell.com>

On Tue, May 5, 2026 at 2:36 PM rkudurumalla <rkudurumalla@marvell.com> wrote:
>
> From: Rakesh Kudurumalla <rkudurumalla@marvell.com>
>
> Added new PMD APIs to create a custom profile and to update the
> SA table created during profile setup, based on the profile ID.
>
> Signed-off-by: Rakesh Kudurumalla <rkudurumalla@marvell.com>

Series applied to dpdk-next-net-mrvl/for-main. Thanks

^ permalink raw reply

* Re: [PATCH v2 2/3] event/cnxk: add pause to spinloops
From: Stephen Hemminger @ 2026-06-08 17:44 UTC (permalink / raw)
  To: Jerin Jacob; +Cc: dev, Pavan Nikhilesh, Shijith Thotton
In-Reply-To: <CALBAE1PzEtyBzKiKaU83q0OtwRwaCTmqDETsc5NNODDQ9hzauw@mail.gmail.com>

On Mon, 8 Jun 2026 21:19:20 +0530
Jerin Jacob <jerinjacobk@gmail.com> wrote:

> On Mon, Apr 13, 2026 at 10:36 PM Stephen Hemminger
> <stephen@networkplumber.org> wrote:
> >
> > On SMT systems when a spinloop is done without a pause
> > it may cause excessive latency. This problem was found
> > by the fix_empty_spinloops coccinelle script.
> >
> > Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>  
> 
> rte_pause() translates to the YIELD instruction. Since cnxk is an
> integrated SoC with single-threaded cores, it won't help with
> anything other than adding one instruction and a bit more latency.
> In general the 3/3 devtool patch is good. Please send it as a separate
> version so that the 3/3 patch can be merged through the main tree.


It matters if your SOC has SMT where two cores are sharing
and one core is waiting for its partner.

^ permalink raw reply

* Re: [PATCH] net/crc: reduce usage of static arrays in net_crc_sse.c
From: Stephen Hemminger @ 2026-06-08 17:16 UTC (permalink / raw)
  To: Shreesh Adiga; +Cc: Bruce Richardson, Konstantin Ananyev, Jasvinder Singh, dev
In-Reply-To: <20250716103439.831760-1-16567adigashreesh@gmail.com>

On Wed, 16 Jul 2025 16:04:39 +0530
Shreesh Adiga <16567adigashreesh@gmail.com> wrote:

> Replace the clearing of lower 32 bits of XMM register with blend of
> zero register.
> Replace the clearing of upper 64 bits of XMM register with _mm_move_epi64.
> Clang is able to optimize away the AND + memory operand with the
> above sequence, however GCC is still emitting the code for AND with
> memory operands which is being explicitly eliminated here.
> 
> Additionally replace the 48 byte crc_xmm_shift_tab with the contents of
> shf_table which is 32 bytes, achieving the same functionality.
> 
> Signed-off-by: Shreesh Adiga <16567adigashreesh@gmail.com>
> ---

Applied to net-next

^ permalink raw reply

* Re: [PATCH] net/crc: reduce usage of static arrays in net_crc_sse.c
From: Stephen Hemminger @ 2026-06-08 17:16 UTC (permalink / raw)
  To: Shreesh Adiga; +Cc: Bruce Richardson, Konstantin Ananyev, Jasvinder Singh, dev
In-Reply-To: <20251011113202.937991-1-16567adigashreesh@gmail.com>

On Sat, 11 Oct 2025 16:59:34 +0530
Shreesh Adiga <16567adigashreesh@gmail.com> wrote:

> Replace the clearing of lower 32 bits of XMM register with blend of
> zero register.
> Remove the clearing of upper 64 bits of tmp1 as it is redundant.
> tmp1 after clearing upper bits was being xor with tmp2 before the
> bits 96:65 from tmp2 were returned. The xor operation of bits 96:65
> remains unchanged due to tmp1 having bits 96:64 cleared to 0.
> After removing the xor operation, the clearing of upper 64 bits of tmp1
> becomes redundant and hence can be removed.
> Clang is able to optimize away the AND + memory operand with the
> above sequence, however GCC is still emitting the code for AND with
> memory operands which is being explicitly eliminated here.
> 
> Additionally replace the 48 byte crc_xmm_shift_tab with the contents of
> shf_table which is 32 bytes, achieving the same functionality.
> 

Applied to net-next

^ permalink raw reply

* Re: [PATCH v7 1/1] net/mana: add device reset support
From: Stephen Hemminger @ 2026-06-08 19:11 UTC (permalink / raw)
  To: Wei Hu; +Cc: dev, longli, weh
In-Reply-To: <20260608120824.287050-2-weh@linux.microsoft.com>

On Mon,  8 Jun 2026 05:08:24 -0700
Wei Hu <weh@linux.microsoft.com> wrote:

> From: Wei Hu <weh@microsoft.com>
> 
> Add support for handling hardware reset events in the MANA driver.
> When the MANA kernel driver receives a hardware service event, it
> initiates a device reset and notifies userspace via
> IBV_EVENT_DEVICE_FATAL. The DPDK driver handles this by performing
> an automatic teardown and recovery sequence.
> 
> The reset flow has two phases. In the enter phase, running on the
> EAL interrupt thread, the driver transitions the device state,
> waits for data path threads to drain using per-queue atomic flags,
> stops queues, tears down IB resources, and frees per-queue MR
> caches. A control thread is then spawned to handle the exit phase:
> it waits for the hardware to recover, unregisters the interrupt
> handler, re-probes the PCI device, reinitializes MR caches, and
> restarts queues.
> 
> Each queue has an atomic burst_state variable where bit 0 is the
> in-burst flag and bits 1+ encode device state. The data path uses
> a single compare-and-swap (0 to 1) to enter a burst, which fails
> immediately if the reset path has set any state bits. The reset
> path sets state bits via atomic fetch-or and polls bit 0 to wait
> for in-flight bursts to drain. This single-variable design avoids
> the need for sequential consistency ordering.
> 
> A per-device mutex serializes the reset path with ethdev
> operations. The mutex uses PTHREAD_PROCESS_SHARED for multi-process
> support and is held across blocking IB verbs calls. A trylock
> helper encapsulates the lock acquisition and device state check
> for all ethdev operation wrappers. Operations that cannot wait
> (configure, queue setup) return -EBUSY during reset, while
> dev_stop and dev_close join the reset thread before acquiring
> the lock to ensure proper sequencing. A CAS-based helper prevents
> double-join of the reset thread.
> 
> Multi-process support is included: secondary processes unmap and
> remap doorbell pages via IPC during the reset enter and exit
> phases. Data path functions in both primary and secondary
> processes check the device state atomically and return early when
> the device is not active.
> 
> The driver emits RTE_ETH_EVENT_ERR_RECOVERING before entering the
> reset path so that upper layers (e.g. netvsc) can switch their
> data path before queues are stopped. The event is emitted outside
> the reset lock to avoid deadlock if the callback calls dev_stop or
> dev_close. On completion, the driver emits RECOVERY_SUCCESS or
> RECOVERY_FAILED. If the enter phase fails internally,
> RECOVERY_FAILED is sent immediately so the application receives a
> terminal event. A PCI device removal event callback distinguishes
> hot-remove from service reset.
> 
> Documentation for the device reset feature is added in the MANA
> NIC guide and the 26.07 release notes.
> 
> Signed-off-by: Wei Hu <weh@microsoft.com>
> ---

This is a rather complex set of state transitions, so I admit to relying
on AI as backup for tracking this. It still sees some errors here.

Worth asking the question, "what does mlx5 do?" and "should DPDK
EAL be doing this at the PCI layer instead?"

---

Much better - this addresses the RCU, the macros, the thread-safety-analysis
suppressions, and the callback-under-lock deadlock. The single-variable
burst_state CAS is a clean way to do the drain and the acquire/release
reasoning checks out.

One structural thing remains. The enter phase still runs the heavy teardown
on the EAL interrupt thread under reset_ops_lock: dev_stop, then
mana_mp_req_on_rxtx(RESET_ENTER) which is a blocking rte_mp_request_sync with
a multi-second timeout, then dev_close with its ibv calls. You already moved
the exit phase to a control thread because intr_callback_unregister cannot run
on the interrupt thread; the same argument applies to blocking IPC and verbs
teardown. A slow or absent secondary will stall the interrupt thread for the
MP timeout, and this is the blocking-under-a-sleeping-mutex pattern. Please
have the interrupt handler just set state and drain, then hand the rest of the
teardown to the control thread. That also removes the last lock hand-off
between functions/threads, so each function can own its lock.

Smaller points:
- RECOVERY_SUCCESS/FAILED are emitted from the reset thread. If the callback
  calls dev_stop/dev_close, mana_join_reset_thread() joins the current thread
  (EDEADLK, leaked handle). INTR_RMV is fine since it runs on the dev-event
  thread.
- The burst_state comment says bits 1+ encode device state, but only
  RESET_ENTER<<1 is ever stored - it is effectively a single "blocked" flag.

^ permalink raw reply

* RE: [PATCH v15 0/5] Support add/remove memory region and get-max-slots
From: Bathija, Pravin @ 2026-06-08 20:13 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: dev@dpdk.org, fengchengwen@huawei.com, maxime.coquelin@redhat.com,
	thomas@monjalon.net
In-Reply-To: <20260605094533.26cd079c@phoenix.local>

Hi Stephen, Answers inline. I have also sent out patch-set v16 incorporating the fixes from your AI review.


Internal Use - Confidential
> -----Original Message-----
> From: Stephen Hemminger <stephen@networkplumber.org>
> Sent: Friday, June 5, 2026 9:46 AM
> To: Bathija, Pravin <Pravin.Bathija@dell.com>
> Cc: dev@dpdk.org; fengchengwen@huawei.com;
> maxime.coquelin@redhat.com; thomas@monjalon.net
> Subject: Re: [PATCH v15 0/5] Support add/remove memory region and get-max-
> slots
>
>
> [EXTERNAL EMAIL]
>
> On Thu, 4 Jun 2026 23:57:18 +0000
> <pravin.bathija@dell.com> wrote:
>
> > From: Pravin M Bathija <pravin.bathija@dell.com>
> >
> > This is version v15 of the patchset and it incorporates the
> > recommendations made by Maxime Coquelin.
> >
> > Patch 4/5
> > - Changed VHOST_USER_REM_MEM_REG handler declaration from
> >   accepts_fd=true to accepts_fd=false, as the remove request does not
> >   expect FDs in ancillary data.
> > - Removed all close_msg_fds(ctx) calls from vhost_user_rem_mem_reg(), no
> >   longer needed since the handler is declared as not accepting FDs.
> > - Removed validate_msg_fds(dev, ctx, 0) check from
> >   vhost_user_rem_mem_reg(), as FD validation is now handled generically
> >   by the framework.
> > - Added targeted IOTLB cache invalidation in vhost_user_rem_mem_reg()
> >   using vhost_user_iotlb_cache_remove() for the removed region's GPA
> >   range, instead of the nuclear iotlb_flush_all() used by set_mem_table.
> >
> > This implementation has been extensively tested by doing Read/Write
> > I/O from multiple instances of fio + libblkio (front-end) talking to
> > spdk/dpdk (back-end) based drives. Tested with qemu front-end talking
> > to dpdk testpmd (back-end) performing add/removal of memory regions.
> > Also tested post-copy live migration after doing add_memory_region.
> >
> > Version Log:
> > Version v15 (Current version): Incorporate code review suggestions
> > from Maxime Coquelin as described above.
> >
> > Version v14: Incorporate code review suggestions from Stephen
> > Hemminger and Fengcheng Wen.
> > Changes from Fengcheng Wen review:
> > Patch 3/5
> > - Moved free_all_mem_regions() call sites in vhost_user_set_mem_table()
> >   from patch 4/5 to patch 3/5 so each commit compiles independently
> > Patch 4/5
> > - Renamed _dev_invalidate_vrings() to vhost_user_invalidate_vrings() to
> >   follow vhost naming convention
> > -  Added comment explaining *pdev propagation through
> >    translate_ring_addresses / numa_realloc()
> > - Reordered local variables in vhost_user_add_mem_reg() and
> >   vhost_user_rem_mem_reg() by descending line length
> > - Shortened overlap check variable names (current_region_guest_start/end
> >   --> cur_start/end, proposed_region_guest_start/end -> new_start/end)
> > - Fixed DMA error path in vhost_user_add_mem_reg(): added
> >   free_new_region_no_dma label so async_dma_map_region(false) is not
> >   called when the map itself failed.
> > Changes from Stephen Hemminger review:
> > Patch 4/5
> > - vhost_user_add_mem_reg() now constructs a reply with the back-end's
> >   host mapping address in userspace_addr and returns
> >   RTE_VHOST_MSG_RESULT_REPLY per the vhost-user spec
> > - Added validate_msg_fds(dev, ctx, 0) in vhost_user_rem_mem_reg() to
> >   reject malformed messages with unexpected file descriptors
> > - Dropped unnecessary (uint64_t) cast in
> > vhost_user_get_max_mem_slots()
> >
> > Version v13: Incorporate code review suggestions from Fengcheng Wen
> > Patch 2/5 Renamed VhostUserSingleMemReg to VhostUserMemRegMsg and
> > memory_single to memreg Patches 3/5 and 4/5 Relocated function
> > remove_guest_pages from patch 3/5 to 4/5
> >
> > Version v12: Incorporate code review suggestions from Maxime Coquelin
> > and ai-code-review.
> > Patch 3/5
> > Refactored async_dma_map() to delegate to async_dma_map_region(),
> > eliminating code duplication between the two functions.
> > Restored original comments in async_dma_map_region() explaining why
> > ENODEV and EINVAL errors are ignored (these were stripped in v10)
> > Reverted unnecessary changes to vhost_user_postcopy_register() --
> > removed the host_user_addr == 0 checks and reg_msg_index indirection
> > that were added in v10, since this function is only called from
> > vhost_user_set_mem_table() where regions are always contiguous.
> >
> > Version v11: Incorporate code review suggestions from Stephen Hemminger.
> > Patch 4/5
> > Fix incomplete cleanup in vhost_user_add_mem_reg() when
> > vhost_user_mmap_region() fails after the mmap succeeds (e.g.
> > add_guest_pages() realloc failure). The error path
> > now calls remove_guest_pages() and free_mem_region() to undo the
> > mapping and stale guest-page entries, preventing a leaked mmap and
> > slot reuse corruption. The plain close(fd) path is kept for pre-mmap failures.
> >
> > Version v10: Incorporate code review suggestions from Stephen Hemminger.
> > Patch 4/5
> > Moved dev_invalidate_vrings after free_mem_region, array compaction,
> > and nregions decrement. This ensures translate_ring_addresses only
> > sees surviving memory regions, preventing vring pointers from
> > resolving into a region that is about to be unmapped.
> >
> > Version v9: Incorporate code review suggestions from Stephen Hemminger.
> > Patch 3/5
> > Restored max_guest_pages initial value to hardcoded 8 instead of
> > VHOST_MEMORY_MAX_NREGIONS, matching upstream semantics.
> > Patch 4/5
> > Added close(reg->fd) and reg->fd = -1 before goto close_msg_fds in the
> > mmap failure path to fix fd leak after fd was moved from ctx->fds[0].
> > Converted dev_invalidate_vrings from a plain function to a macro +
> > implementation function pair, accepting message ID as a parameter so
> > the static_assert reports the correct handler at each call site.
> > Updated dev_invalidate_vrings call in add_mem_reg to pass
> > VHOST_USER_ADD_MEM_REG as message ID.
> > Updated dev_invalidate_vrings call in rem_mem_reg to pass
> > VHOST_USER_REM_MEM_REG as message ID.
> >
> > Version v8:  Incorporate code review suggestions from Stephen Hemminger.
> > rewrite async_dma_map_region function to iterate guest pages by host
> > address range matching change function dev_invalidate_vrings to accept
> > a double pointer to propagate pointer updates new function
> > remove_guest_pages was added add_mem_reg error path was narrowed to
> > only clean up the single failed region instead of destroying all
> > existing regions
> >
> > Version v7: Incorporate code review suggestions from Maxime Coquelin.
> > Add debug messages to vhost_postcopy_register function.
> >
> > Version v6: Added the enablement of this feature as a final patch in
> > this patch-set and other code optimizations as suggested by Maxime
> > Coquelin.
> >
> > Version v5: removed the patch that increased the number of memory
> > regions from 8 to 128. This will be submitted as a separate feature at
> > a later point after incorporating additional optimizations. Also
> > includes code optimizations as suggested by Feng Cheng Wen.
> >
> > Version v4: code optimizations as suggested by Feng Cheng Wen.
> >
> > Version v3: code optimizations as suggested by Maxime Coquelin and
> > Thomas Monjalon.
> >
> > Version v2: code optimizations as suggested by Maxime Coquelin.
> >
> > Version v1: Initial patch set.
> >
> > Pravin M Bathija (5):
> >   vhost: add user to mailmap and define to vhost hdr
> >   vhost: header defines for add/rem mem region
> >   vhost: refactor memory helper functions
> >   vhost: add mem region add/remove handlers
> >   vhost: enable configure memory slots
> >
> >  .mailmap               |   1 +
> >  lib/vhost/rte_vhost.h  |   4 +
> >  lib/vhost/vhost_user.c | 425
> > +++++++++++++++++++++++++++++++++++------
> >  lib/vhost/vhost_user.h |  10 +
> >  4 files changed, 378 insertions(+), 62 deletions(-)
> >
>
> I don't think this is ready to merge based on AI review.
> Did AI review with Opus 4.8 on a chat which has past context.
>
> Summary of v15 findings
>
>
> New in v15 (both patch 4/5, both errors):
>
>     Use-after-free on the reply path: reg points into dev->mem->regions[], but
> dev_invalidate_vrings() -> translate_ring_addresses() -> numa_realloc() can
> relocate dev->mem. dev is refreshed via *pdev, reg is not, then
> reg->host_user_addr is read for the reply. Re-derive reg (or capture
> host_user_addr) after dev = *pdev.
Confirmed. reg points into dev->mem->regions[], and dev_invalidate_vrings() -> translate_ring_addresses() -> numa_realloc()
can reallocate dev->mem, leaving reg dangling before reg->host_user_addr is read. Fixed by re-deriving the region from the
refreshed dev (reg = &dev->mem->regions[dev->mem->nregions - 1]) after dev = *pdev, before building the reply.

>     ADD_MEM_REG reply sent unconditionally: handler always returns
> RESULT_REPLY, but the spec makes the mapping-address reply postcopy-only.
> In non-postcopy mode this desyncs the channel (no REPLY_ACK: the front-end
> never reads it; with REPLY_ACK: it expects a u64 ack, not a memreg). Gate the
> reply on dev->postcopy_listening, else return RESULT_OK -- same as
> SET_MEM_TABLE.
Confirmed against the vhost-user spec, which states the mapping-address reply is postcopy-only ("In postcopy mode... the
back-end replies with the bases of the memory mapped region"). The reply is now gated on dev->postcopy_listening: return
RTE_VHOST_MSG_RESULT_REPLY in postcopy mode, and RTE_VHOST_MSG_RESULT_OK otherwise, matching
VHOST_USER_SET_MEM_TABLE. This avoids desyncing the channel in non-postcopy mode.

>
> Carried over from v13 (now in a different form):
>
>     The v13 Warning (missing postcopy mapping-address reply) is addressed but
> mis-gated; correct fix is the conditional reply above. Until then postcopy
> correctness still isn't right.

Same fix as above.


^ permalink raw reply

* [PATCH 0/4] bpf/arm64: add BPF_ABS/BPF_IND packet load support
From: Stephen Hemminger @ 2026-06-08 20:28 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger

Discovered this while exploring packet filtering.
The arm64 BPF JIT did not implement BPF_LD | BPF_ABS or BPF_LD | BPF_IND,
so cBPF filters converted by rte_bpf_convert() could not be JIT compiled
and silently fell back to the interpreter on arm64.

The first patch fixes a latent bug in emit_return_zero_if_src_zero():
the offset of the branch to the epilogue was held in an unsigned.
A backward branch wrapped around. Existing JIT tests were never
being run on ARM.

The next two patches make the bpf tests assert that,
on an architecture with a JIT backend, code is actually generated,
so a missing or failed JIT is reported rather than skipped.

The final patch adds the ABS/IND opcodes, mirroring the x86 JIT
with a fast path for data in the first
mbuf segment and a __rte_pktmbuf_read() slow path for the rest.

Stephen Hemminger (4):
  bpf/arm64: fix zero-return branch in multi-exit programs
  test: bpf check that JIT was generated
  test: bpf check that bpf_convert can be JIT'd
  bpf/arm64: add BPF_ABS/BPF_IND packet load support

 app/test/test_bpf.c     |  23 ++++++-
 lib/bpf/bpf_jit_arm64.c | 149 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 169 insertions(+), 3 deletions(-)

-- 
2.53.0

^ permalink raw reply

* [PATCH 1/4] bpf/arm64: fix zero-return branch in multi-exit programs
From: Stephen Hemminger @ 2026-06-08 20:28 UTC (permalink / raw)
  To: dev
  Cc: Stephen Hemminger, stable, Wathsala Vithanage, Konstantin Ananyev,
	Marat Khalili, Jerin Jacob
In-Reply-To: <20260608203322.1116296-1-stephen@networkplumber.org>

If a JIT'd BPF program has more than one exit,
the branch to the epilogue can be backwards.

The current code assumed it is always forward:
emit_return_zero_if_src_zero() held the offset in an unsigned uint16_t,
so a backward (negative) offset wrapped to a large positive value and
branched off the end of the program, faulting at run time.

This was masked until now: the only test with this shape, test_ld_mbuf,
needs BPF_ABS/BPF_IND which the arm64 JIT did not implement, so it never
ran under the JIT.  The x86 JIT is unaffected because emit_epilog() keeps a
single exit (st->exit.off) reached from later exits and the divide-by-zero
check via a signed absolute jump (emit_abs_jcc), so direction does not
matter.

Use a signed offset; emit_b() already sign-extends imm26 correctly.

Fixes: 111e2a747a4f ("bpf/arm: add basic arithmetic operations")
Cc: stable@dpdk.org

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 lib/bpf/bpf_jit_arm64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/bpf/bpf_jit_arm64.c b/lib/bpf/bpf_jit_arm64.c
index a04ef33a9c..099822e9f1 100644
--- a/lib/bpf/bpf_jit_arm64.c
+++ b/lib/bpf/bpf_jit_arm64.c
@@ -957,7 +957,7 @@ static void
 emit_return_zero_if_src_zero(struct a64_jit_ctx *ctx, bool is64, uint8_t src)
 {
 	uint8_t r0 = ebpf_to_a64_reg(ctx, EBPF_REG_0);
-	uint16_t jump_to_epilogue;
+	int32_t jump_to_epilogue;

 	emit_cbnz(ctx, is64, src, 3);
 	emit_mov_imm(ctx, is64, r0, 0);
-- 
2.53.0

^ permalink raw reply related

* [PATCH 2/4] test: bpf check that JIT was generated
From: Stephen Hemminger @ 2026-06-08 20:28 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger, Konstantin Ananyev, Marat Khalili
In-Reply-To: <20260608203322.1116296-1-stephen@networkplumber.org>

Avoid silently ignoring JIT failures. The test cases should
all succeed JIT compilation; if not it is a bug in the JIT
implementation and should be reported.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 app/test/test_bpf.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/app/test/test_bpf.c b/app/test/test_bpf.c
index dd24722450..79d547dc82 100644
--- a/app/test/test_bpf.c
+++ b/app/test/test_bpf.c
@@ -3508,6 +3508,14 @@ run_test(const struct bpf_test *tst)
 				rv, strerror(rv));
 		}
 	}
+#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
+	else {
+		/* a JIT backend exists for this arch, so it must compile */
+		printf("%s@%d: %s: no JIT code generated;\n",
+			__func__, __LINE__, tst->name);
+		ret = -1;
+	}
+#endif
 
 	rte_bpf_destroy(bpf);
 	return ret;
-- 
2.53.0


^ permalink raw reply related

* [PATCH 3/4] test: bpf check that bpf_convert can be JIT'd
From: Stephen Hemminger @ 2026-06-08 20:28 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger, Konstantin Ananyev, Marat Khalili
In-Reply-To: <20260608203322.1116296-1-stephen@networkplumber.org>

Add followup in bpf conversion tests to make sure resulting
code was also run through JIT.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 app/test/test_bpf.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/app/test/test_bpf.c b/app/test/test_bpf.c
index 79d547dc82..f5ab447ff6 100644
--- a/app/test/test_bpf.c
+++ b/app/test/test_bpf.c
@@ -4569,6 +4569,7 @@ test_bpf_filter(pcap_t *pcap, const char *s)
 	struct bpf_program fcode;
 	struct rte_bpf_prm *prm = NULL;
 	struct rte_bpf *bpf = NULL;
+	int ret = -1;
 
 	if (pcap_compile(pcap, &fcode, s, 1, PCAP_NETMASK_UNKNOWN)) {
 		printf("%s@%d: pcap_compile('%s') failed: %s;\n",
@@ -4592,6 +4593,18 @@ test_bpf_filter(pcap_t *pcap, const char *s)
 			__func__, __LINE__, rte_errno, strerror(rte_errno));
 		goto error;
 	}
+#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
+	{
+		struct rte_bpf_jit jit;
+
+		rte_bpf_get_jit(bpf, &jit);
+		if (jit.func == NULL) {
+			printf("%s@%d: no JIT generated\n", __func__, __LINE__);
+			goto error;
+		}
+	}
+#endif
+	ret = 0;
 
 error:
 	if (bpf)
@@ -4603,7 +4616,7 @@ test_bpf_filter(pcap_t *pcap, const char *s)
 
 	rte_free(prm);
 	pcap_freecode(&fcode);
-	return (bpf == NULL) ? -1 : 0;
+	return ret;
 }
 
 static int
-- 
2.53.0


^ permalink raw reply related

* [PATCH 4/4] bpf/arm64: add BPF_ABS/BPF_IND packet load support
From: Stephen Hemminger @ 2026-06-08 20:28 UTC (permalink / raw)
  To: dev
  Cc: Stephen Hemminger, Wathsala Vithanage, Konstantin Ananyev,
	Marat Khalili
In-Reply-To: <20260608203322.1116296-1-stephen@networkplumber.org>

The arm64 JIT rejected BPF_LD | BPF_ABS and BPF_LD | BPF_IND with
"invalid opcode", so cBPF programs converted by rte_bpf_convert() could
not be JITed. Add these opcodes, mirroring the x86 JIT: a fast path for
data held in the first mbuf segment and a __rte_pktmbuf_read() slow path
for everything else. Programs using these opcodes now use the call
register layout, since the slow path makes a function call.

Bugzilla ID: 1427

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 lib/bpf/bpf_jit_arm64.c | 147 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 146 insertions(+), 1 deletion(-)

diff --git a/lib/bpf/bpf_jit_arm64.c b/lib/bpf/bpf_jit_arm64.c
index 099822e9f1..6952c61806 100644
--- a/lib/bpf/bpf_jit_arm64.c
+++ b/lib/bpf/bpf_jit_arm64.c
@@ -1123,6 +1123,133 @@ emit_branch(struct a64_jit_ctx *ctx, uint8_t op, uint32_t i, int16_t off)
 	emit_b_cond(ctx, ebpf_to_a64_cond(op), jump_offset_get(ctx, i, off));
 }
 
+/* LD_ABS/LD_IND code block offsets (in arm64 instructions) */
+enum {
+	LDMB_FAST_OFS, /* fast path */
+	LDMB_SLOW_OFS, /* slow path */
+	LDMB_FIN_OFS,  /* common tail */
+	LDMB_OFS_NUM
+};
+
+/*
+ * Helper for emit_ld_mbuf(): fast path.
+ * Compute the packet offset; if it lies inside the first segment leave the
+ * data pointer in R0, otherwise branch to the slow path.
+ */
+static void
+emit_ldmb_fast_path(struct a64_jit_ctx *ctx, uint8_t src, uint8_t mode,
+		    uint32_t sz, int32_t imm, const uint32_t ofs[LDMB_OFS_NUM])
+{
+	uint8_t r0 = ebpf_to_a64_reg(ctx, EBPF_REG_0);
+	uint8_t r6 = ebpf_to_a64_reg(ctx, EBPF_REG_6);
+	uint8_t tmp1 = ebpf_to_a64_reg(ctx, TMP_REG_1);
+	uint8_t tmp2 = ebpf_to_a64_reg(ctx, TMP_REG_2);
+	uint8_t tmp3 = ebpf_to_a64_reg(ctx, TMP_REG_3);
+
+	/* off = imm (+ src for BPF_IND) */
+	emit_mov_imm(ctx, 1, tmp1, imm);
+	if (mode == BPF_IND)
+		emit_add(ctx, 1, tmp1, src);
+
+	/* if ((int64_t)(mbuf->data_len - off) < sz) goto slow_path */
+	emit_mov_imm(ctx, 1, tmp2, offsetof(struct rte_mbuf, data_len));
+	emit_ldr(ctx, BPF_H, tmp2, r6, tmp2);
+	emit_sub(ctx, 1, tmp2, tmp1);
+	emit_mov_imm(ctx, 1, tmp3, sz);
+	emit_cmp(ctx, 1, tmp2, tmp3);
+	emit_b_cond(ctx, A64_LT, (int32_t)(ofs[LDMB_SLOW_OFS] - ctx->idx));
+
+	/* R0 = mbuf->buf_addr + mbuf->data_off + off */
+	emit_mov_imm(ctx, 1, tmp2, offsetof(struct rte_mbuf, data_off));
+	emit_ldr(ctx, BPF_H, tmp2, r6, tmp2);
+	emit_mov_imm(ctx, 1, r0, offsetof(struct rte_mbuf, buf_addr));
+	emit_ldr(ctx, EBPF_DW, r0, r6, r0);
+	emit_add(ctx, 1, r0, tmp2);
+	emit_add(ctx, 1, r0, tmp1);
+
+	emit_b(ctx, (int32_t)(ofs[LDMB_FIN_OFS] - ctx->idx));
+}
+
+/*
+ * Helper for emit_ld_mbuf(): slow path.
+ * R0 = __rte_pktmbuf_read(mbuf, off, sz, buf); return 0 if NULL.
+ * The scratch buffer is the space reserved by __rte_bpf_validate() at the
+ * bottom of the eBPF stack frame, i.e. (frame_pointer - stack_ofs).
+ */
+static void
+emit_ldmb_slow_path(struct a64_jit_ctx *ctx, uint32_t sz, uint32_t stack_ofs)
+{
+	uint8_t r0 = ebpf_to_a64_reg(ctx, EBPF_REG_0);
+	uint8_t r6 = ebpf_to_a64_reg(ctx, EBPF_REG_6);
+	uint8_t fp = ebpf_to_a64_reg(ctx, EBPF_FP);
+	uint8_t tmp1 = ebpf_to_a64_reg(ctx, TMP_REG_1);
+
+	/* arguments of __rte_pktmbuf_read(mbuf, off, len, buf) */
+	emit_mov_64(ctx, A64_R(1), tmp1);		/* off (held in tmp1) */
+	emit_mov_64(ctx, A64_R(0), r6);			/* mbuf */
+	emit_mov_imm(ctx, 0, A64_R(2), sz);		/* len */
+	emit_sub_imm_64(ctx, A64_R(3), fp, stack_ofs);	/* buf */
+
+	emit_call(ctx, tmp1, (void *)(uintptr_t)__rte_pktmbuf_read);
+	emit_return_zero_if_src_zero(ctx, 1, r0);
+}
+
+/*
+ * Helper for emit_ld_mbuf(): common tail.
+ * Load the value pointed to by R0 and convert from network byte order.
+ */
+static void
+emit_ldmb_fin(struct a64_jit_ctx *ctx, uint8_t opsz, uint32_t sz)
+{
+	uint8_t r0 = ebpf_to_a64_reg(ctx, EBPF_REG_0);
+
+	emit_ldr(ctx, opsz, r0, r0, A64_ZR);
+	if (opsz != BPF_B)
+		emit_be(ctx, r0, sz * 8);
+}
+
+/*
+ * Emit code for BPF_LD | BPF_ABS and BPF_LD | BPF_IND packet loads:
+ *
+ *	off = imm (+ src for BPF_IND)
+ *	if (mbuf->data_len - off >= sz)			    -- fast path
+ *		ptr = mbuf->buf_addr + mbuf->data_off + off;
+ *	else						    -- slow path
+ *		ptr = __rte_pktmbuf_read(mbuf, off, sz, buf);
+ *		if (ptr == NULL)
+ *			return 0;
+ *	R0 = ntoh(*(size *)ptr);			    -- common tail
+ *
+ * The three blocks are sized in a dry run so the forward branches can be
+ * resolved, then emitted for real (arm64 instructions are fixed width, so
+ * the dry run reproduces the real instruction count exactly).
+ */
+static void
+emit_ld_mbuf(struct a64_jit_ctx *ctx, uint8_t op, uint8_t src, int32_t imm,
+	     uint32_t stack_ofs)
+{
+	uint8_t mode = BPF_MODE(op);
+	uint8_t opsz = BPF_SIZE(op);
+	uint32_t sz = bpf_size(opsz);
+	uint32_t ofs[LDMB_OFS_NUM];
+
+	/* seed offsets so the dry-run branches stay in range */
+	ofs[LDMB_FAST_OFS] = ofs[LDMB_SLOW_OFS] = ofs[LDMB_FIN_OFS] = ctx->idx;
+
+	/* dry run to record block offsets */
+	emit_ldmb_fast_path(ctx, src, mode, sz, imm, ofs);
+	ofs[LDMB_SLOW_OFS] = ctx->idx;
+	emit_ldmb_slow_path(ctx, sz, stack_ofs);
+	ofs[LDMB_FIN_OFS] = ctx->idx;
+	emit_ldmb_fin(ctx, opsz, sz);
+
+	/* rewind and emit for real with resolved offsets */
+	ctx->idx = ofs[LDMB_FAST_OFS];
+	emit_ldmb_fast_path(ctx, src, mode, sz, imm, ofs);
+	emit_ldmb_slow_path(ctx, sz, stack_ofs);
+	emit_ldmb_fin(ctx, opsz, sz);
+}
+
 static void
 check_program_has_call(struct a64_jit_ctx *ctx, struct rte_bpf *bpf)
 {
@@ -1135,8 +1262,17 @@ check_program_has_call(struct a64_jit_ctx *ctx, struct rte_bpf *bpf)
 		op = ins->code;
 
 		switch (op) {
-		/* Call imm */
+		/*
+		 * BPF_ABS/BPF_IND can fall through to __rte_pktmbuf_read(),
+		 * so they need the call-clobbered register layout as well.
+		 */
 		case (BPF_JMP | EBPF_CALL):
+		case (BPF_LD | BPF_ABS | BPF_B):
+		case (BPF_LD | BPF_ABS | BPF_H):
+		case (BPF_LD | BPF_ABS | BPF_W):
+		case (BPF_LD | BPF_IND | BPF_B):
+		case (BPF_LD | BPF_IND | BPF_H):
+		case (BPF_LD | BPF_IND | BPF_W):
 			ctx->foundcall = 1;
 			return;
 		}
@@ -1338,6 +1474,15 @@ emit(struct a64_jit_ctx *ctx, struct rte_bpf *bpf)
 			emit_mov_imm(ctx, 1, dst, u64);
 			i++;
 			break;
+		/* R0 = ntoh(*(size *)(mbuf data + (src) + imm)) */
+		case (BPF_LD | BPF_ABS | BPF_B):
+		case (BPF_LD | BPF_ABS | BPF_H):
+		case (BPF_LD | BPF_ABS | BPF_W):
+		case (BPF_LD | BPF_IND | BPF_B):
+		case (BPF_LD | BPF_IND | BPF_H):
+		case (BPF_LD | BPF_IND | BPF_W):
+			emit_ld_mbuf(ctx, op, src, imm, bpf->stack_sz);
+			break;
 		/* *(size *)(dst + off) = src */
 		case (BPF_STX | BPF_MEM | BPF_B):
 		case (BPF_STX | BPF_MEM | BPF_H):
-- 
2.53.0


^ permalink raw reply related

* Re: [PATCH v13 01/20] net/sxe2: support AVX512 vectorized path for Rx and Tx
From: Stephen Hemminger @ 2026-06-08 20:56 UTC (permalink / raw)
  To: liujie5; +Cc: dev
In-Reply-To: <20260608074257.3043531-2-liujie5@linkdatatechnology.com>

On Mon,  8 Jun 2026 15:42:38 +0800
liujie5@linkdatatechnology.com wrote:

>  
> -struct sxe2_drv_queue_caps {
> +struct __rte_aligned(4) __rte_packed_begin sxe2_drv_queue_caps {
>  	uint16_t queues_cnt;
>  	uint16_t base_idx_in_pf;
> -};
> +} __rte_packed_end;

I don't see the point of packed and aligned. This structure is already embedded
in other struct. The alignment will be right.
Packed should be reserved for where you are avoiding padding.

Bottom line: drivers should not use packed except for hardware registers,
network headers.  You probably could just use existing structures here
and elsewhere; with static_assert() added in the vector code to make
sure future changes don't break assumptions.

> diff --git a/drivers/net/sxe2/sxe2_ethdev.c b/drivers/net/sxe2/sxe2_ethdev.c
> index 8d66e5d8c5..e0f7002138 100644
> --- a/drivers/net/sxe2/sxe2_ethdev.c
> +++ b/drivers/net/sxe2/sxe2_ethdev.c
> @@ -891,7 +891,7 @@ static int32_t sxe2_eth_pmd_probe_pf(struct sxe2_common_device *cdev,
>  static int32_t sxe2_parse_eth_devargs(struct rte_device *dev,
>  			  struct rte_eth_devargs *eth_da)
>  {
> -	int ret = 0;
> +	int32_t ret = 0;
>  
>  	if (dev->devargs == NULL)
>  		return 0;

Hmm. This raises the question why did sxe2 clone the model from other drivers using vdpa
but change the return to int32_t.

I think you are making things unnecessarily complex here:

$ git grep probe_t | grep int
drivers/bus/auxiliary/bus_auxiliary_driver.h:typedef int (rte_auxiliary_probe_t)(struct rte_auxiliary_driver *drv,
drivers/bus/cdx/bus_cdx_driver.h:typedef int (rte_cdx_probe_t)(struct rte_cdx_driver *, struct rte_cdx_device *);
drivers/bus/dpaa/bus_dpaa_driver.h:typedef int (*rte_dpaa_probe_t)(struct rte_dpaa_driver *dpaa_drv,
drivers/bus/fslmc/bus_fslmc_driver.h:typedef int (*rte_dpaa2_probe_t)(struct rte_dpaa2_driver *dpaa2_drv,
drivers/bus/ifpga/bus_ifpga_driver.h:typedef int (afu_probe_t)(struct rte_afu_device *);
drivers/bus/pci/bus_pci_driver.h:typedef int (rte_pci_probe_t)(struct rte_pci_driver *, struct rte_pci_device *);
drivers/bus/platform/bus_platform_driver.h:typedef int (rte_platform_probe_t)(struct rte_platform_device *pdev);
drivers/bus/uacce/bus_uacce_driver.h:typedef int (rte_uacce_probe_t)(struct rte_uacce_driver *, struct rte_uacce_device *);
drivers/bus/vdev/bus_vdev_driver.h:typedef int (rte_vdev_probe_t)(struct rte_vdev_device *dev);
drivers/bus/vmbus/bus_vmbus_driver.h:typedef int (vmbus_probe_t)(struct rte_vmbus_driver *,
drivers/common/mlx5/mlx5_common.h:typedef int (mlx5_class_driver_probe_t)(struct mlx5_common_device *cdev,
drivers/common/nfp/nfp_common_pci.h:typedef int (nfp_class_driver_probe_t)(struct rte_pci_device *dev);
drivers/common/sxe2/sxe2_common.h:typedef int32_t (sxe2_class_driver_probe_t)(struct sxe2_common_device *scdev,
lib/eal/include/bus_driver.h:typedef int (*rte_bus_probe_t)(struct rte_bus *bus);

No other driver uses int32...

> @@ -315,19 +370,30 @@ static const struct {
>  	eth_rx_burst_t rx_burst;
>  	const char *info;
>  } sxe2_rx_burst_infos[] = {
> -	{ sxe2_rx_pkts_scattered,          "Scalar Scattered" },
> -	{ sxe2_rx_pkts_scattered_split,          "Scalar Scattered split" },
> +	{ sxe2_rx_pkts_scattered,
> +	      "Scalar Scattered" },
> +	{ sxe2_rx_pkts_scattered_split,
> +	      "Scalar Scattered split" },
>  #ifdef RTE_ARCH_X86
> -	{ sxe2_rx_pkts_scattered_vec_sse_offload,      "Vector SSE Scattered" },
> +#ifdef CC_AVX512_SUPPORT
> +	{ sxe2_rx_pkts_scattered_vec_avx512,
> +	      "Vector AVX512 Scattered" },
> +	{ sxe2_rx_pkts_scattered_vec_avx512_offload,
> +	      "Offload Vector AVX512 Scattered" },
> +#endif
> +	{ sxe2_rx_pkts_scattered_vec_sse_offload,
> +	      "Vector SSE Scattered" },
>  #endif
>  };

The table looked better before with longer lines.
The DPDK coding style allows lines up to 100 characters; why not use it?

>  int32_t sxe2_rx_burst_mode_get(struct rte_eth_dev *dev,
> -			__rte_unused uint16_t queue_id, struct rte_eth_burst_mode *mode)
> +			       __rte_unused uint16_t queue_id,
> +			       struct rte_eth_burst_mode *mode)
>  {
>  	eth_rx_burst_t pkt_burst = dev->rx_pkt_burst;
>  	int32_t ret = -EINVAL;
>  	uint32_t i, size;
> +
>  	size = RTE_DIM(sxe2_rx_burst_infos);
>  	for (i = 0; i < size; ++i) {
>  		if (pkt_burst == sxe2_rx_burst_infos[i].rx_burst) {

The old code was fine, no need to change this. Either indentation style is OK.
Prefer not to have non-related changes.


> +static __rte_always_inline void
> +sxe2_tx_pkts_mbuf_fill_avx512(struct sxe2_tx_buffer_vec *buffer,
> +	struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
> +{

Please only use always_inline where it is absolutely necessary.
Don't fight with the compiler.

> +static __rte_always_inline int32_t sxe2_tx_bufs_free_vec_avx512(struct sxe2_tx_queue *txq)
> +{
> +	struct sxe2_tx_buffer_vec *buffer;
> +	struct rte_mbuf *mbuf;
> +	struct rte_mbuf *mbuf_free_arr[SXE2_TX_FREE_BUFFER_SIZE_MAX_VEC];
> +	struct rte_mempool *mp;
> +	struct rte_mempool_cache *cache;
> +	void **cache_objs;
> +	uint32_t copied;
> +	uint32_t i;
> +	int32_t ret;
> +	uint16_t rs_thresh;
> +	uint16_t free_num;
> +
> +	if (rte_cpu_to_le_64(SXE2_TX_DESC_DTYPE_DESC_DONE) !=
> +		(txq->desc_ring[txq->next_dd].wb.dd &
> +			rte_cpu_to_le_64(SXE2_TX_DESC_DTYPE_MASK))) {
> +		ret = 0;
> +		goto l_end;
> +	}
> +
> +	rs_thresh = txq->rs_thresh;
> +
> +	buffer = (struct sxe2_tx_buffer_vec *)txq->buffer_ring;
> +	buffer += txq->next_dd - (rs_thresh - 1);
> +
> +	if ((txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) &&
> +			(rs_thresh & 31) == 0) {
> +		mp = buffer[0].mbuf->pool;
> +		cache = rte_mempool_default_cache(mp, rte_lcore_id());
> +
> +		if (cache == NULL || cache->len)
> +			goto normal;
> +
> +		if (rs_thresh > RTE_MEMPOOL_CACHE_MAX_SIZE) {
> +			(void)rte_mempool_ops_enqueue_bulk(mp, (void *)buffer, rs_thresh);
> +			goto done;
> +		}
> +		cache_objs = &cache->objs[cache->len];

Directly using cache is going to be brittle and likely get broken by other
coming changes to mempool cache.


^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox