* [PATCH v3 09/11] bus: implement cleanup in EAL
From: David Marchand @ 2026-06-23 10:54 UTC (permalink / raw)
To: dev
Cc: thomas, stephen, bruce.richardson, fengchengwen, longli,
hemant.agrawal, Parav Pandit, Xueming Li, Sachin Saxena, Rosen Xu,
Chenbo Xia, Nipun Gupta, Tomasz Duszynski, Wei Hu
In-Reply-To: <20260623105439.2144694-1-david.marchand@redhat.com>
Introduce a generic cleanup helper, rte_bus_generic_cleanup(), that
eliminates code duplication across bus cleanup implementations.
For each device on the bus, the helper unplugs the device if it is probed,
removes its devargs, removes it from the bus device list, and frees the
device structure.
Add .free_device operation to struct rte_bus to allow buses to specify
how to free their device structures.
Update all buses for the new .cleanup prototype, which now takes a pointer
to the bus being cleaned up.
Convert the buses that have both a .cleanup and an .unplug_device
operation to rte_bus_generic_cleanup(): this requires implementing
.free_device for them.
Buses not converted to rte_bus_generic_cleanup() are:
- dma/idxd, which has no unplug support,
- bus/cdx, which has unplug support but no cleanup implemented so far,
- NXP buses:
  - bus/dpaa and bus/fslmc have many issues with interrupt
    allocation/setup/freeing or VFIO setup/release,
  - bus/fslmc's cleanup callback is actually implemented in its internal
    VFIO layer and requires too much refactoring.
Signed-off-by: David Marchand <david.marchand@redhat.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
Changes since v1:
- dropped the hack on using free() and the check in RTE_REGISTER_BUS.
---
drivers/bus/auxiliary/auxiliary_common.c | 28 ++++---------------
drivers/bus/dpaa/dpaa_bus.c | 4 +--
drivers/bus/fslmc/fslmc_bus.c | 2 +-
drivers/bus/ifpga/ifpga_bus.c | 32 ++++------------------
drivers/bus/pci/pci_common.c | 29 +++++---------------
drivers/bus/platform/platform.c | 20 ++++----------
drivers/bus/uacce/uacce.c | 28 ++++---------------
drivers/bus/vdev/vdev.c | 26 +++++++-----------
drivers/bus/vmbus/vmbus_common.c | 6 ++---
lib/eal/common/eal_common_bus.c | 33 ++++++++++++++++++++++-
lib/eal/include/bus_driver.h | 34 +++++++++++++++++++++++-
11 files changed, 107 insertions(+), 135 deletions(-)
diff --git a/drivers/bus/auxiliary/auxiliary_common.c b/drivers/bus/auxiliary/auxiliary_common.c
index 10f466e57a..80b90a4961 100644
--- a/drivers/bus/auxiliary/auxiliary_common.c
+++ b/drivers/bus/auxiliary/auxiliary_common.c
@@ -179,29 +179,10 @@ rte_auxiliary_unregister(struct rte_auxiliary_driver *driver)
rte_bus_remove_driver(&auxiliary_bus, &driver->driver);
}
-static int
-auxiliary_cleanup(void)
+static void
+auxiliary_free_device(struct rte_device *dev)
{
- struct rte_auxiliary_device *dev;
- int error = 0;
-
- RTE_BUS_FOREACH_DEV(dev, &auxiliary_bus) {
- int ret;
-
- if (rte_dev_is_probed(&dev->device)) {
- ret = auxiliary_unplug_device(&dev->device);
- if (ret < 0) {
- rte_errno = errno;
- error = -1;
- }
- }
-
- rte_devargs_remove(dev->device.devargs);
- rte_bus_remove_device(&auxiliary_bus, &dev->device);
- free(dev);
- }
-
- return error;
+ free(RTE_BUS_DEVICE(dev, struct rte_auxiliary_device));
}
static int
@@ -247,7 +228,8 @@ auxiliary_get_iommu_class(void)
struct rte_bus auxiliary_bus = {
.scan = auxiliary_scan,
.probe = rte_bus_generic_probe,
- .cleanup = auxiliary_cleanup,
+ .free_device = auxiliary_free_device,
+ .cleanup = rte_bus_generic_cleanup,
.find_device = rte_bus_generic_find_device,
.match = auxiliary_bus_match,
.probe_device = auxiliary_probe_device,
diff --git a/drivers/bus/dpaa/dpaa_bus.c b/drivers/bus/dpaa/dpaa_bus.c
index ee467b94d5..54779f82f7 100644
--- a/drivers/bus/dpaa/dpaa_bus.c
+++ b/drivers/bus/dpaa/dpaa_bus.c
@@ -807,12 +807,12 @@ dpaa_bus_probe_device(struct rte_driver *drv, struct rte_device *dev)
}
static int
-dpaa_bus_cleanup(void)
+dpaa_bus_cleanup(struct rte_bus *bus)
{
struct rte_dpaa_device *dev;
BUS_INIT_FUNC_TRACE();
- RTE_BUS_FOREACH_DEV(dev, &rte_dpaa_bus) {
+ RTE_BUS_FOREACH_DEV(dev, bus) {
const struct rte_dpaa_driver *drv;
int ret = 0;
diff --git a/drivers/bus/fslmc/fslmc_bus.c b/drivers/bus/fslmc/fslmc_bus.c
index dca4c5b182..1a0eca30b4 100644
--- a/drivers/bus/fslmc/fslmc_bus.c
+++ b/drivers/bus/fslmc/fslmc_bus.c
@@ -436,7 +436,7 @@ fslmc_bus_match(const struct rte_driver *drv, const struct rte_device *dev)
}
static int
-rte_fslmc_close(void)
+rte_fslmc_close(struct rte_bus *bus __rte_unused)
{
int ret = 0;
diff --git a/drivers/bus/ifpga/ifpga_bus.c b/drivers/bus/ifpga/ifpga_bus.c
index 7e2e2efce0..79d1c3778f 100644
--- a/drivers/bus/ifpga/ifpga_bus.c
+++ b/drivers/bus/ifpga/ifpga_bus.c
@@ -298,33 +298,10 @@ ifpga_unplug_device(struct rte_device *dev)
return 0;
}
-/*
- * Cleanup the content of the Intel FPGA bus, and call the remove() function
- * for all registered devices.
- */
-static int
-ifpga_cleanup(void)
+static void
+ifpga_free_device(struct rte_device *dev)
{
- struct rte_afu_device *afu_dev;
- int error = 0;
-
- RTE_BUS_FOREACH_DEV(afu_dev, &rte_ifpga_bus) {
- int ret = 0;
-
- if (rte_dev_is_probed(&afu_dev->device)) {
- ret = ifpga_unplug_device(&afu_dev->device);
- if (ret < 0) {
- rte_errno = errno;
- error = -1;
- }
- }
-
- rte_devargs_remove(afu_dev->device.devargs);
- rte_bus_remove_device(&rte_ifpga_bus, &afu_dev->device);
- free(afu_dev);
- }
-
- return error;
+ free(RTE_BUS_DEVICE(dev, struct rte_afu_device));
}
static int
@@ -374,7 +351,8 @@ ifpga_parse(const char *name, void *addr)
static struct rte_bus rte_ifpga_bus = {
.scan = ifpga_scan,
.probe = rte_bus_generic_probe,
- .cleanup = ifpga_cleanup,
+ .free_device = ifpga_free_device,
+ .cleanup = rte_bus_generic_cleanup,
.find_device = rte_bus_generic_find_device,
.match = ifpga_bus_match,
.probe_device = ifpga_probe_device,
diff --git a/drivers/bus/pci/pci_common.c b/drivers/bus/pci/pci_common.c
index bf4822f7ec..0f635e1537 100644
--- a/drivers/bus/pci/pci_common.c
+++ b/drivers/bus/pci/pci_common.c
@@ -317,29 +317,11 @@ pci_unplug_device(struct rte_device *rte_dev)
return 0;
}
-static int
-pci_cleanup(void)
+static void
+pci_free_device(struct rte_device *dev)
{
- struct rte_pci_device *dev;
- int error = 0;
-
- RTE_BUS_FOREACH_DEV(dev, &rte_pci_bus) {
- int ret = 0;
-
- if (rte_dev_is_probed(&dev->device)) {
- ret = pci_unplug_device(&dev->device);
- if (ret < 0) {
- rte_errno = errno;
- error = -1;
- }
- }
-
- rte_devargs_remove(dev->device.devargs);
- rte_bus_remove_device(&rte_pci_bus, &dev->device);
- pci_free(RTE_PCI_DEVICE_INTERNAL(dev));
- }
-
- return error;
+ struct rte_pci_device *pdev = RTE_BUS_DEVICE(dev, *pdev);
+ pci_free(RTE_PCI_DEVICE_INTERNAL(pdev));
}
/* dump one device */
@@ -743,7 +725,8 @@ struct rte_bus rte_pci_bus = {
.allow_multi_probe = true,
.scan = rte_pci_scan,
.probe = rte_bus_generic_probe,
- .cleanup = pci_cleanup,
+ .free_device = pci_free_device,
+ .cleanup = rte_bus_generic_cleanup,
.find_device = rte_bus_generic_find_device,
.match = pci_bus_match,
.probe_device = pci_probe_device,
diff --git a/drivers/bus/platform/platform.c b/drivers/bus/platform/platform.c
index 5b3c78a505..90d865a8df 100644
--- a/drivers/bus/platform/platform.c
+++ b/drivers/bus/platform/platform.c
@@ -491,26 +491,17 @@ platform_bus_get_iommu_class(void)
return RTE_IOVA_DC;
}
-static int
-platform_bus_cleanup(void)
+static void
+platform_free_device(struct rte_device *dev)
{
- struct rte_platform_device *pdev;
-
- RTE_BUS_FOREACH_DEV(pdev, &platform_bus) {
- if (rte_dev_is_probed(&pdev->device))
- platform_bus_unplug_device(&pdev->device);
-
- rte_devargs_remove(pdev->device.devargs);
- rte_bus_remove_device(&platform_bus, &pdev->device);
- free(pdev);
- }
-
- return 0;
+ free(RTE_BUS_DEVICE(dev, struct rte_platform_device));
}
static struct rte_bus platform_bus = {
.scan = platform_bus_scan,
.probe = rte_bus_generic_probe,
+ .free_device = platform_free_device,
+ .cleanup = rte_bus_generic_cleanup,
.find_device = rte_bus_generic_find_device,
.match = platform_bus_match,
.probe_device = platform_bus_probe_device,
@@ -520,7 +511,6 @@ static struct rte_bus platform_bus = {
.dma_unmap = platform_bus_dma_unmap,
.get_iommu_class = platform_bus_get_iommu_class,
.dev_iterate = rte_bus_generic_dev_iterate,
- .cleanup = platform_bus_cleanup,
};
RTE_REGISTER_BUS(platform, platform_bus);
diff --git a/drivers/bus/uacce/uacce.c b/drivers/bus/uacce/uacce.c
index bfe1f26557..99a6fb314d 100644
--- a/drivers/bus/uacce/uacce.c
+++ b/drivers/bus/uacce/uacce.c
@@ -402,29 +402,10 @@ uacce_unplug_device(struct rte_device *rte_dev)
return 0;
}
-static int
-uacce_cleanup(void)
+static void
+uacce_free_device(struct rte_device *dev)
{
- struct rte_uacce_device *dev;
- int error = 0;
-
- RTE_BUS_FOREACH_DEV(dev, &uacce_bus) {
- int ret = 0;
-
- if (rte_dev_is_probed(&dev->device)) {
- ret = uacce_unplug_device(&dev->device);
- if (ret < 0) {
- rte_errno = errno;
- error = -1;
- }
- }
-
- rte_devargs_remove(dev->device.devargs);
- rte_bus_remove_device(&uacce_bus, &dev->device);
- free(dev);
- }
-
- return error;
+ free(RTE_BUS_DEVICE(dev, struct rte_uacce_device));
}
static int
@@ -551,7 +532,8 @@ rte_uacce_unregister(struct rte_uacce_driver *driver)
static struct rte_bus uacce_bus = {
.scan = uacce_scan,
.probe = rte_bus_generic_probe,
- .cleanup = uacce_cleanup,
+ .free_device = uacce_free_device,
+ .cleanup = rte_bus_generic_cleanup,
.match = uacce_bus_match,
.probe_device = uacce_probe_device,
.unplug_device = uacce_unplug_device,
diff --git a/drivers/bus/vdev/vdev.c b/drivers/bus/vdev/vdev.c
index 7e94f86e28..02d719a44d 100644
--- a/drivers/bus/vdev/vdev.c
+++ b/drivers/bus/vdev/vdev.c
@@ -548,26 +548,19 @@ vdev_scan(void)
return 0;
}
+static void
+vdev_free_device(struct rte_device *dev)
+{
+ free(RTE_BUS_DEVICE(dev, struct rte_vdev_device));
+}
+
static int
-vdev_cleanup(void)
+vdev_cleanup(struct rte_bus *bus)
{
- struct rte_vdev_device *dev;
- int error = 0;
+ int error;
rte_spinlock_recursive_lock(&vdev_device_list_lock);
- RTE_BUS_FOREACH_DEV(dev, &rte_vdev_bus) {
- int ret;
-
- if (rte_dev_is_probed(&dev->device)) {
- ret = vdev_unplug_device(&dev->device);
- if (ret < 0)
- error = -1;
- }
-
- rte_devargs_remove(dev->device.devargs);
- rte_bus_remove_device(&rte_vdev_bus, &dev->device);
- free(dev);
- }
+ error = rte_bus_generic_cleanup(bus);
rte_spinlock_recursive_unlock(&vdev_device_list_lock);
return error;
@@ -608,6 +601,7 @@ vdev_get_iommu_class(void)
static struct rte_bus rte_vdev_bus = {
.scan = vdev_scan,
.probe = rte_bus_generic_probe,
+ .free_device = vdev_free_device,
.cleanup = vdev_cleanup,
.find_device = vdev_find_device,
.match = vdev_bus_match,
diff --git a/drivers/bus/vmbus/vmbus_common.c b/drivers/bus/vmbus/vmbus_common.c
index bfb45e963c..a6e3a24a7c 100644
--- a/drivers/bus/vmbus/vmbus_common.c
+++ b/drivers/bus/vmbus/vmbus_common.c
@@ -144,12 +144,12 @@ rte_vmbus_probe(void)
}
static int
-rte_vmbus_cleanup(void)
+rte_vmbus_cleanup(struct rte_bus *bus)
{
struct rte_vmbus_device *dev;
int error = 0;
- RTE_BUS_FOREACH_DEV(dev, &rte_vmbus_bus) {
+ RTE_BUS_FOREACH_DEV(dev, bus) {
const struct rte_vmbus_driver *drv;
int ret;
@@ -167,7 +167,7 @@ rte_vmbus_cleanup(void)
rte_intr_instance_free(dev->intr_handle);
dev->device.driver = NULL;
- rte_bus_remove_device(&rte_vmbus_bus, &dev->device);
+ rte_bus_remove_device(bus, &dev->device);
free(dev);
}
diff --git a/lib/eal/common/eal_common_bus.c b/lib/eal/common/eal_common_bus.c
index ca13ccce5b..9ba23516ee 100644
--- a/lib/eal/common/eal_common_bus.c
+++ b/lib/eal/common/eal_common_bus.c
@@ -124,6 +124,37 @@ rte_bus_generic_probe(struct rte_bus *bus)
return (probed && probed == failed) ? -1 : 0;
}
+/*
+ * Generic cleanup function for buses.
+ * Iterates through all devices on the bus, unplugs probed devices,
+ * removes devargs, removes devices from the bus list, and frees device structures.
+ */
+RTE_EXPORT_INTERNAL_SYMBOL(rte_bus_generic_cleanup)
+int
+rte_bus_generic_cleanup(struct rte_bus *bus)
+{
+ struct rte_device *dev;
+ int error = 0;
+
+ RTE_VERIFY(bus->free_device);
+ RTE_VERIFY(bus->unplug_device);
+
+ while ((dev = TAILQ_FIRST(&bus->device_list)) != NULL) {
+ if (rte_dev_is_probed(dev)) {
+ if (bus->unplug_device && bus->unplug_device(dev) < 0) {
+ rte_errno = errno;
+ error = -1;
+ }
+ }
+
+ rte_devargs_remove(dev->devargs);
+ rte_bus_remove_device(bus, dev);
+ bus->free_device(dev);
+ }
+
+ return error;
+}
+
/* Probe all devices of all buses */
RTE_EXPORT_SYMBOL(rte_bus_probe)
int
@@ -164,7 +195,7 @@ eal_bus_cleanup(void)
TAILQ_FOREACH(bus, &rte_bus_list, next) {
if (bus->cleanup == NULL)
continue;
- if (bus->cleanup() != 0)
+ if (bus->cleanup(bus) != 0)
ret = -1;
}
diff --git a/lib/eal/include/bus_driver.h b/lib/eal/include/bus_driver.h
index fde55ff06d..4f6521c87f 100644
--- a/lib/eal/include/bus_driver.h
+++ b/lib/eal/include/bus_driver.h
@@ -226,17 +226,31 @@ typedef int (*rte_bus_hot_unplug_handler_t)(struct rte_device *dev);
*/
typedef int (*rte_bus_sigbus_handler_t)(const void *failure_addr);
+/**
+ * Free a bus-specific device structure.
+ *
+ * @param dev
+ * Device pointer.
+ */
+typedef void (*rte_bus_free_device_t)(struct rte_device *dev);
+
/**
* Implementation specific cleanup function which is responsible for cleaning up
* devices on that bus with applicable drivers.
*
+ * The cleanup operation is the counterpart to scan, removing all devices added
+ * during scan.
+ *
* This is called while iterating over each registered bus.
*
+ * @param bus
+ * Pointer to the bus to clean up.
+ *
* @return
* 0 for successful cleanup
* !0 for any error during cleanup
*/
-typedef int (*rte_bus_cleanup_t)(void);
+typedef int (*rte_bus_cleanup_t)(struct rte_bus *bus);
/**
* Check if a driver matches a device.
@@ -336,6 +350,7 @@ struct rte_bus {
/**< handle hot-unplug failure on the bus */
rte_bus_sigbus_handler_t sigbus_handler;
/**< handle sigbus error on the bus */
+ rte_bus_free_device_t free_device; /**< Free bus-specific device */
rte_bus_cleanup_t cleanup; /**< Cleanup devices on bus */
RTE_TAILQ_HEAD(, rte_device) device_list; /**< List of devices on the bus */
RTE_TAILQ_HEAD(, rte_driver) driver_list; /**< List of drivers on the bus */
@@ -624,6 +639,23 @@ struct rte_driver *rte_bus_find_driver(const struct rte_bus *bus, const struct r
__rte_internal
int rte_bus_generic_probe(struct rte_bus *bus);
+/**
+ * Generic cleanup function for buses.
+ *
+ * Iterates through all devices on the bus, unplugs probed devices,
+ * removes devargs, removes devices from the bus list, and frees device structures.
+ *
+ * This function can be used by buses that don't require special cleanup
+ * logic and just need the standard device cleanup sequence.
+ *
+ * @param bus
+ * Pointer to the bus to clean up.
+ * @return
+ * 0 on success, -1 if any errors occurred during cleanup.
+ */
+__rte_internal
+int rte_bus_generic_cleanup(struct rte_bus *bus);
+
#ifdef __cplusplus
}
#endif
--
2.54.0
^ permalink raw reply related
* [PATCH v3 08/11] bus: align unplug with device probe
From: David Marchand @ 2026-06-23 10:54 UTC (permalink / raw)
To: dev
Cc: thomas, stephen, bruce.richardson, fengchengwen, longli,
hemant.agrawal, Parav Pandit, Xueming Li, Nipun Gupta,
Nikhil Agarwal, Sachin Saxena, Rosen Xu, Chenbo Xia,
Tomasz Duszynski
In-Reply-To: <20260623105439.2144694-1-david.marchand@redhat.com>
Refactor bus unplug operations to be the counterpart of probe_device.
The unplug operation (renamed from .unplug to .unplug_device) now only
handles:
- Driver removal (calling the driver's remove callback)
- Freeing probe-allocated resources (interrupts, mappings)
Device deletion (devargs removal, bus removal, freeing device
structure) is now handled only during bus cleanup, not in unplug.
Additionally, move driver pointer clearing from individual bus unplug
operations to EAL's local_dev_remove() where the unplug operation is
invoked. This centralizes driver lifecycle management and eliminates
code duplication across bus drivers.
For vdev, add a check in rte_vdev_uninit() since this public API can
be called on devices without a driver attached.
Signed-off-by: David Marchand <david.marchand@redhat.com>
---
doc/guides/prog_guide/device_hotplug.rst | 18 ++++---
drivers/bus/auxiliary/auxiliary_common.c | 46 ++++++----------
drivers/bus/cdx/cdx.c | 29 ++--------
drivers/bus/fslmc/fslmc_bus.c | 7 +--
drivers/bus/ifpga/ifpga_bus.c | 63 ++++++++++------------
drivers/bus/pci/pci_common.c | 57 ++++----------------
drivers/bus/platform/platform.c | 16 +++---
drivers/bus/uacce/uacce.c | 67 ++++++++----------------
drivers/bus/vdev/vdev.c | 53 ++++++++-----------
lib/eal/common/eal_common_dev.c | 8 +--
lib/eal/include/bus_driver.h | 4 +-
11 files changed, 129 insertions(+), 239 deletions(-)
diff --git a/doc/guides/prog_guide/device_hotplug.rst b/doc/guides/prog_guide/device_hotplug.rst
index 7eb7fbcc2b..d21ba0c244 100644
--- a/doc/guides/prog_guide/device_hotplug.rst
+++ b/doc/guides/prog_guide/device_hotplug.rst
@@ -165,7 +165,7 @@ using ``rte_dev_event_callback_register()`` function.
on the device in question.
When ``RTE_DEV_EVENT_REMOVE`` event is delivered,
it indicates that the kernel has removed the device;
- the application should call ``rte_dev_remove()`` to clean up EAL resources.
+ the application should call ``rte_dev_remove()`` to unplug the device driver.
Event Notification Usage
@@ -256,13 +256,17 @@ When ``rte_dev_remove()`` is called, the following sequence occurs:
See `Multi-process Synchronization`_ for details.
#. **Device Unplug**:
- The bus's ``unplug()`` method is called (``dev->bus->unplug()``),
- which triggers the driver's remove function.
- This typically stops device operations, releases device resources,
- unmaps memory regions, and unregisters from subsystems.
+ The bus's ``unplug_device()`` method is called (``dev->bus->unplug_device()``),
+ which triggers the driver's remove function
+ and releases resources allocated during probe
+ (such as interrupt handles and device memory mappings).
-#. **Devargs Cleanup**:
- The devargs associated with the device are removed from the global list.
+.. note::
+
+ The device structure, its devargs, and its entry in the bus device list
+ are NOT freed during ``rte_dev_remove()``.
+ They remain in memory until ``rte_eal_cleanup()`` is called,
+ at which point the bus's ``cleanup()`` method handles complete device deletion.
Multi-process Synchronization
diff --git a/drivers/bus/auxiliary/auxiliary_common.c b/drivers/bus/auxiliary/auxiliary_common.c
index 048aacf254..10f466e57a 100644
--- a/drivers/bus/auxiliary/auxiliary_common.c
+++ b/drivers/bus/auxiliary/auxiliary_common.c
@@ -122,13 +122,11 @@ auxiliary_probe_device(struct rte_driver *drv, struct rte_device *dev)
return ret;
}
-/*
- * Call the remove() function of the driver.
- */
static int
-rte_auxiliary_driver_remove_dev(struct rte_auxiliary_device *dev)
+auxiliary_unplug_device(struct rte_device *rte_dev)
{
- const struct rte_auxiliary_driver *drv = RTE_BUS_DRIVER(dev->device.driver, *drv);
+ const struct rte_auxiliary_driver *drv = RTE_BUS_DRIVER(rte_dev->driver, *drv);
+ struct rte_auxiliary_device *dev = RTE_BUS_DEVICE(rte_dev, *dev);
int ret = 0;
AUXILIARY_LOG(DEBUG, "Driver %s remove auxiliary device %s on NUMA node %i",
@@ -140,8 +138,8 @@ rte_auxiliary_driver_remove_dev(struct rte_auxiliary_device *dev)
return ret;
}
- /* clear driver structure */
- dev->device.driver = NULL;
+ rte_intr_instance_free(dev->intr_handle);
+ dev->intr_handle = NULL;
return 0;
}
@@ -181,22 +179,6 @@ rte_auxiliary_unregister(struct rte_auxiliary_driver *driver)
rte_bus_remove_driver(&auxiliary_bus, &driver->driver);
}
-static int
-auxiliary_unplug(struct rte_device *dev)
-{
- struct rte_auxiliary_device *adev = RTE_BUS_DEVICE(dev, *adev);
- int ret;
-
- ret = rte_auxiliary_driver_remove_dev(adev);
- if (ret == 0) {
- rte_bus_remove_device(&auxiliary_bus, &adev->device);
- rte_devargs_remove(dev->devargs);
- rte_intr_instance_free(adev->intr_handle);
- free(adev);
- }
- return ret;
-}
-
static int
auxiliary_cleanup(void)
{
@@ -206,13 +188,17 @@ auxiliary_cleanup(void)
RTE_BUS_FOREACH_DEV(dev, &auxiliary_bus) {
int ret;
- if (!rte_dev_is_probed(&dev->device))
- continue;
- ret = auxiliary_unplug(&dev->device);
- if (ret < 0) {
- rte_errno = errno;
- error = -1;
+ if (rte_dev_is_probed(&dev->device)) {
+ ret = auxiliary_unplug_device(&dev->device);
+ if (ret < 0) {
+ rte_errno = errno;
+ error = -1;
+ }
}
+
+ rte_devargs_remove(dev->device.devargs);
+ rte_bus_remove_device(&auxiliary_bus, &dev->device);
+ free(dev);
}
return error;
@@ -265,7 +251,7 @@ struct rte_bus auxiliary_bus = {
.find_device = rte_bus_generic_find_device,
.match = auxiliary_bus_match,
.probe_device = auxiliary_probe_device,
- .unplug = auxiliary_unplug,
+ .unplug_device = auxiliary_unplug_device,
.parse = auxiliary_parse,
.dma_map = auxiliary_dma_map,
.dma_unmap = auxiliary_dma_unmap,
diff --git a/drivers/bus/cdx/cdx.c b/drivers/bus/cdx/cdx.c
index 2443161e1a..c0b46a41ad 100644
--- a/drivers/bus/cdx/cdx.c
+++ b/drivers/bus/cdx/cdx.c
@@ -374,14 +374,11 @@ rte_cdx_unregister(struct rte_cdx_driver *driver)
rte_bus_remove_driver(&rte_cdx_bus, &driver->driver);
}
-/*
- * If vendor/device ID match, call the remove() function of the
- * driver.
- */
static int
-cdx_detach_dev(struct rte_cdx_device *dev)
+cdx_unplug_device(struct rte_device *rte_dev)
{
- const struct rte_cdx_driver *dr = RTE_BUS_DRIVER(dev->device.driver, *dr);
+ const struct rte_cdx_driver *dr = RTE_BUS_DRIVER(rte_dev->driver, *dr);
+ struct rte_cdx_device *dev = RTE_BUS_DEVICE(rte_dev, *dev);
int ret = 0;
CDX_BUS_DEBUG("detach device %s using driver: %s",
@@ -393,9 +390,6 @@ cdx_detach_dev(struct rte_cdx_device *dev)
return ret;
}
- /* clear driver structure */
- dev->device.driver = NULL;
-
rte_cdx_unmap_device(dev);
rte_intr_instance_free(dev->intr_handle);
@@ -404,21 +398,6 @@ cdx_detach_dev(struct rte_cdx_device *dev)
return 0;
}
-static int
-cdx_unplug(struct rte_device *dev)
-{
- struct rte_cdx_device *cdx_dev = RTE_BUS_DEVICE(dev, *cdx_dev);
- int ret;
-
- ret = cdx_detach_dev(cdx_dev);
- if (ret == 0) {
- rte_bus_remove_device(&rte_cdx_bus, &cdx_dev->device);
- rte_devargs_remove(dev->devargs);
- free(cdx_dev);
- }
- return ret;
-}
-
static int
cdx_dma_map(struct rte_device *dev, void *addr, uint64_t iova, size_t len)
{
@@ -452,7 +431,7 @@ static struct rte_bus rte_cdx_bus = {
.find_device = rte_bus_generic_find_device,
.match = cdx_bus_match,
.probe_device = cdx_probe_device,
- .unplug = cdx_unplug,
+ .unplug_device = cdx_unplug_device,
.parse = cdx_parse,
.dma_map = cdx_dma_map,
.dma_unmap = cdx_dma_unmap,
diff --git a/drivers/bus/fslmc/fslmc_bus.c b/drivers/bus/fslmc/fslmc_bus.c
index c7549a361a..dca4c5b182 100644
--- a/drivers/bus/fslmc/fslmc_bus.c
+++ b/drivers/bus/fslmc/fslmc_bus.c
@@ -520,6 +520,7 @@ fslmc_bus_probe_device(struct rte_driver *driver, struct rte_device *rte_dev)
return 0;
}
+ /* FIXME: probe_device should allocate intr_handle */
ret = drv->probe(drv, dev);
if (ret != 0) {
DPAA2_BUS_ERR("Unable to probe");
@@ -531,7 +532,7 @@ fslmc_bus_probe_device(struct rte_driver *driver, struct rte_device *rte_dev)
}
static int
-fslmc_bus_unplug(struct rte_device *rte_dev)
+fslmc_bus_unplug_device(struct rte_device *rte_dev)
{
struct rte_dpaa2_device *dev = RTE_BUS_DEVICE(rte_dev, *dev);
const struct rte_dpaa2_driver *drv = RTE_BUS_DRIVER(rte_dev->driver, *drv);
@@ -540,7 +541,7 @@ fslmc_bus_unplug(struct rte_device *rte_dev)
int ret = drv->remove(dev);
if (ret != 0)
return ret;
- dev->device.driver = NULL;
+ /* FIXME: unplug_device should free intr_handle */
DPAA2_BUS_INFO("%s Un-Plugged", dev->device.name);
return 0;
}
@@ -558,7 +559,7 @@ struct rte_bus rte_fslmc_bus = {
.get_iommu_class = rte_dpaa2_get_iommu_class,
.match = fslmc_bus_match,
.probe_device = fslmc_bus_probe_device,
- .unplug = fslmc_bus_unplug,
+ .unplug_device = fslmc_bus_unplug_device,
.dev_iterate = rte_bus_generic_dev_iterate,
};
diff --git a/drivers/bus/ifpga/ifpga_bus.c b/drivers/bus/ifpga/ifpga_bus.c
index af77d69ef6..7e2e2efce0 100644
--- a/drivers/bus/ifpga/ifpga_bus.c
+++ b/drivers/bus/ifpga/ifpga_bus.c
@@ -279,6 +279,25 @@ ifpga_probe_device(struct rte_driver *drv, struct rte_device *dev)
return ret;
}
+static int
+ifpga_unplug_device(struct rte_device *dev)
+{
+ const struct rte_afu_driver *afu_drv = RTE_BUS_DRIVER(dev->driver, *afu_drv);
+ struct rte_afu_device *afu_dev = RTE_BUS_DEVICE(dev, *afu_dev);
+ int ret = 0;
+
+ if (afu_drv->remove) {
+ ret = afu_drv->remove(afu_dev);
+ if (ret)
+ return ret;
+ }
+
+ rte_intr_instance_free(afu_dev->intr_handle);
+ afu_dev->intr_handle = NULL;
+
+ return 0;
+}
+
/*
* Cleanup the content of the Intel FPGA bus, and call the remove() function
* for all registered devices.
@@ -290,52 +309,24 @@ ifpga_cleanup(void)
int error = 0;
RTE_BUS_FOREACH_DEV(afu_dev, &rte_ifpga_bus) {
- const struct rte_afu_driver *drv;
int ret = 0;
- if (!rte_dev_is_probed(&afu_dev->device))
- goto free;
- drv = RTE_BUS_DRIVER(afu_dev->device.driver, *drv);
- if (drv->remove == NULL)
- goto free;
-
- ret = drv->remove(afu_dev);
- if (ret < 0) {
- rte_errno = errno;
- error = -1;
+ if (rte_dev_is_probed(&afu_dev->device)) {
+ ret = ifpga_unplug_device(&afu_dev->device);
+ if (ret < 0) {
+ rte_errno = errno;
+ error = -1;
+ }
}
- afu_dev->device.driver = NULL;
-free:
- rte_bus_remove_device(&rte_ifpga_bus, &afu_dev->device);
rte_devargs_remove(afu_dev->device.devargs);
- rte_intr_instance_free(afu_dev->intr_handle);
+ rte_bus_remove_device(&rte_ifpga_bus, &afu_dev->device);
free(afu_dev);
}
return error;
}
-static int
-ifpga_unplug(struct rte_device *dev)
-{
- struct rte_afu_device *afu_dev = RTE_BUS_DEVICE(dev, *afu_dev);
- const struct rte_afu_driver *afu_drv = RTE_BUS_DRIVER(dev->driver, *afu_drv);
- int ret;
-
- ret = afu_drv->remove(afu_dev);
- if (ret)
- return ret;
-
- rte_bus_remove_device(&rte_ifpga_bus, &afu_dev->device);
-
- rte_devargs_remove(dev->devargs);
- rte_intr_instance_free(afu_dev->intr_handle);
- free(afu_dev);
- return 0;
-
-}
-
static int
ifpga_parse(const char *name, void *addr)
{
@@ -387,7 +378,7 @@ static struct rte_bus rte_ifpga_bus = {
.find_device = rte_bus_generic_find_device,
.match = ifpga_bus_match,
.probe_device = ifpga_probe_device,
- .unplug = ifpga_unplug,
+ .unplug_device = ifpga_unplug_device,
.parse = ifpga_parse,
};
diff --git a/drivers/bus/pci/pci_common.c b/drivers/bus/pci/pci_common.c
index 791e9a7b49..bf4822f7ec 100644
--- a/drivers/bus/pci/pci_common.c
+++ b/drivers/bus/pci/pci_common.c
@@ -282,13 +282,10 @@ pci_probe_device(struct rte_driver *drv, struct rte_device *dev)
return ret;
}
-/*
- * If vendor/device ID match, call the remove() function of the
- * driver.
- */
static int
-rte_pci_detach_dev(struct rte_pci_device *dev)
+pci_unplug_device(struct rte_device *rte_dev)
{
+ struct rte_pci_device *dev = RTE_BUS_DEVICE(rte_dev, *dev);
struct rte_pci_addr *loc;
const struct rte_pci_driver *dr = RTE_BUS_DRIVER(dev->device.driver, *dr);
int ret = 0;
@@ -308,9 +305,6 @@ rte_pci_detach_dev(struct rte_pci_device *dev)
return ret;
}
- /* clear driver structure */
- dev->device.driver = NULL;
-
if (dr->drv_flags & RTE_PCI_DRV_NEED_MAPPING)
/* unmap resources for devices that use igb_uio */
rte_pci_unmap_device(dev);
@@ -330,33 +324,17 @@ pci_cleanup(void)
int error = 0;
RTE_BUS_FOREACH_DEV(dev, &rte_pci_bus) {
- const struct rte_pci_driver *drv;
int ret = 0;
- if (!rte_dev_is_probed(&dev->device))
- goto free;
- drv = RTE_BUS_DRIVER(dev->device.driver, *drv);
- if (drv->remove == NULL)
- goto free;
-
- ret = drv->remove(dev);
- if (ret < 0) {
- rte_errno = errno;
- error = -1;
+ if (rte_dev_is_probed(&dev->device)) {
+ ret = pci_unplug_device(&dev->device);
+ if (ret < 0) {
+ rte_errno = errno;
+ error = -1;
+ }
}
- if (drv->drv_flags & RTE_PCI_DRV_NEED_MAPPING)
- rte_pci_unmap_device(dev);
-
- dev->device.driver = NULL;
-
-free:
- /* free interrupt handles */
- rte_intr_instance_free(dev->intr_handle);
- dev->intr_handle = NULL;
- rte_intr_instance_free(dev->vfio_req_intr_handle);
- dev->vfio_req_intr_handle = NULL;
-
+ rte_devargs_remove(dev->device.devargs);
rte_bus_remove_device(&rte_pci_bus, &dev->device);
pci_free(RTE_PCI_DEVICE_INTERNAL(dev));
}
@@ -521,21 +499,6 @@ pci_sigbus_handler(const void *failure_addr)
return ret;
}
-static int
-pci_unplug(struct rte_device *dev)
-{
- struct rte_pci_device *pdev = RTE_BUS_DEVICE(dev, *pdev);
- int ret;
-
- ret = rte_pci_detach_dev(pdev);
- if (ret == 0) {
- rte_bus_remove_device(&rte_pci_bus, &pdev->device);
- rte_devargs_remove(dev->devargs);
- pci_free(RTE_PCI_DEVICE_INTERNAL(pdev));
- }
- return ret;
-}
-
static int
pci_dma_map(struct rte_device *dev, void *addr, uint64_t iova, size_t len)
{
@@ -784,7 +747,7 @@ struct rte_bus rte_pci_bus = {
.find_device = rte_bus_generic_find_device,
.match = pci_bus_match,
.probe_device = pci_probe_device,
- .unplug = pci_unplug,
+ .unplug_device = pci_unplug_device,
.parse = pci_parse,
.dev_compare = pci_dev_compare,
.devargs_parse = rte_pci_devargs_parse,
diff --git a/drivers/bus/platform/platform.c b/drivers/bus/platform/platform.c
index 170a2e03d0..5b3c78a505 100644
--- a/drivers/bus/platform/platform.c
+++ b/drivers/bus/platform/platform.c
@@ -416,19 +416,15 @@ device_release_driver(struct rte_platform_device *pdev)
if (ret)
PLATFORM_LOG_LINE(WARNING, "failed to remove %s", pdev->name);
}
-
- pdev->device.driver = NULL;
}
static int
-platform_bus_unplug(struct rte_device *dev)
+platform_bus_unplug_device(struct rte_device *dev)
{
struct rte_platform_device *pdev = RTE_BUS_DEVICE(dev, *pdev);
device_release_driver(pdev);
device_cleanup(pdev);
- rte_devargs_remove(pdev->device.devargs);
- free(pdev);
return 0;
}
@@ -501,10 +497,12 @@ platform_bus_cleanup(void)
struct rte_platform_device *pdev;
RTE_BUS_FOREACH_DEV(pdev, &platform_bus) {
+ if (rte_dev_is_probed(&pdev->device))
+ platform_bus_unplug_device(&pdev->device);
+
+ rte_devargs_remove(pdev->device.devargs);
rte_bus_remove_device(&platform_bus, &pdev->device);
- if (!rte_dev_is_probed(&pdev->device))
- continue;
- platform_bus_unplug(&pdev->device);
+ free(pdev);
}
return 0;
@@ -516,7 +514,7 @@ static struct rte_bus platform_bus = {
.find_device = rte_bus_generic_find_device,
.match = platform_bus_match,
.probe_device = platform_bus_probe_device,
- .unplug = platform_bus_unplug,
+ .unplug_device = platform_bus_unplug_device,
.parse = platform_bus_parse,
.dma_map = platform_bus_dma_map,
.dma_unmap = platform_bus_dma_unmap,
diff --git a/drivers/bus/uacce/uacce.c b/drivers/bus/uacce/uacce.c
index 8a3c55b248..bfe1f26557 100644
--- a/drivers/bus/uacce/uacce.c
+++ b/drivers/bus/uacce/uacce.c
@@ -385,40 +385,10 @@ uacce_probe_device(struct rte_driver *drv, struct rte_device *dev)
}
static int
-uacce_cleanup(void)
+uacce_unplug_device(struct rte_device *rte_dev)
{
- struct rte_uacce_device *dev;
- int error = 0;
-
- RTE_BUS_FOREACH_DEV(dev, &uacce_bus) {
- const struct rte_uacce_driver *dr;
- int ret = 0;
-
- if (!rte_dev_is_probed(&dev->device))
- goto free;
- dr = RTE_BUS_DRIVER(dev->device.driver, *dr);
- if (dr->remove == NULL)
- goto free;
-
- ret = dr->remove(dev);
- if (ret < 0) {
- rte_errno = errno;
- error = -1;
- }
- dev->device.driver = NULL;
-
-free:
- rte_bus_remove_device(&uacce_bus, &dev->device);
- free(dev);
- }
-
- return error;
-}
-
-static int
-uacce_detach_dev(struct rte_uacce_device *dev)
-{
- const struct rte_uacce_driver *dr = RTE_BUS_DRIVER(dev->device.driver, *dr);
+ const struct rte_uacce_driver *dr = RTE_BUS_DRIVER(rte_dev->driver, *dr);
+ struct rte_uacce_device *dev = RTE_BUS_DEVICE(rte_dev, *dev);
int ret = 0;
UACCE_BUS_DEBUG("detach device %s using driver: %s", dev->device.name, dr->driver.name);
@@ -429,25 +399,32 @@ uacce_detach_dev(struct rte_uacce_device *dev)
return ret;
}
- dev->device.driver = NULL;
-
return 0;
}
static int
-uacce_unplug(struct rte_device *dev)
+uacce_cleanup(void)
{
- struct rte_uacce_device *uacce_dev = RTE_BUS_DEVICE(dev, *uacce_dev);
- int ret;
+ struct rte_uacce_device *dev;
+ int error = 0;
- ret = uacce_detach_dev(uacce_dev);
- if (ret == 0) {
- rte_bus_remove_device(&uacce_bus, &uacce_dev->device);
- rte_devargs_remove(dev->devargs);
- free(uacce_dev);
+ RTE_BUS_FOREACH_DEV(dev, &uacce_bus) {
+ int ret = 0;
+
+ if (rte_dev_is_probed(&dev->device)) {
+ ret = uacce_unplug_device(&dev->device);
+ if (ret < 0) {
+ rte_errno = errno;
+ error = -1;
+ }
+ }
+
+ rte_devargs_remove(dev->device.devargs);
+ rte_bus_remove_device(&uacce_bus, &dev->device);
+ free(dev);
}
- return ret;
+ return error;
}
static int
@@ -577,7 +554,7 @@ static struct rte_bus uacce_bus = {
.cleanup = uacce_cleanup,
.match = uacce_bus_match,
.probe_device = uacce_probe_device,
- .unplug = uacce_unplug,
+ .unplug_device = uacce_unplug_device,
.find_device = rte_bus_generic_find_device,
.parse = uacce_parse,
.dev_iterate = rte_bus_generic_dev_iterate,
diff --git a/drivers/bus/vdev/vdev.c b/drivers/bus/vdev/vdev.c
index 09221ccdea..7e94f86e28 100644
--- a/drivers/bus/vdev/vdev.c
+++ b/drivers/bus/vdev/vdev.c
@@ -343,19 +343,15 @@ rte_vdev_init(const char *name, const char *args)
}
static int
-vdev_remove_driver(struct rte_vdev_device *dev)
+vdev_unplug_device(struct rte_device *rte_dev)
{
- const char *name = rte_vdev_device_name(dev);
- const struct rte_vdev_driver *driver;
+ const struct rte_vdev_driver *driver = RTE_BUS_DRIVER(rte_dev->driver, *driver);
+ struct rte_vdev_device *dev = RTE_BUS_DEVICE(rte_dev, *dev);
- if (!dev->device.driver) {
- VDEV_LOG(DEBUG, "no driver attach to device %s", name);
- return 1;
- }
+ if (driver->remove)
+ return driver->remove(dev);
- driver = RTE_BUS_DRIVER(dev->device.driver, *driver);
-
- return driver->remove(dev);
+ return 0;
}
RTE_EXPORT_SYMBOL(rte_vdev_uninit)
@@ -376,7 +372,12 @@ rte_vdev_uninit(const char *name)
goto unlock;
}
- ret = vdev_remove_driver(dev);
+ if (rte_dev_is_probed(&dev->device)) {
+ ret = vdev_unplug_device(&dev->device);
+ } else {
+ VDEV_LOG(DEBUG, "no driver attach to device %s", name);
+ ret = 1;
+ }
if (ret)
goto unlock;
@@ -553,27 +554,21 @@ vdev_cleanup(void)
struct rte_vdev_device *dev;
int error = 0;
+ rte_spinlock_recursive_lock(&vdev_device_list_lock);
RTE_BUS_FOREACH_DEV(dev, &rte_vdev_bus) {
- const struct rte_vdev_driver *drv;
int ret;
- if (!rte_dev_is_probed(&dev->device))
- goto free;
-
- drv = RTE_BUS_DRIVER(dev->device.driver, *drv);
-
- if (drv->remove == NULL)
- goto free;
-
- ret = drv->remove(dev);
- if (ret < 0)
- error = -1;
+ if (rte_dev_is_probed(&dev->device)) {
+ ret = vdev_unplug_device(&dev->device);
+ if (ret < 0)
+ error = -1;
+ }
- dev->device.driver = NULL;
-free:
+ rte_devargs_remove(dev->device.devargs);
rte_bus_remove_device(&rte_vdev_bus, &dev->device);
free(dev);
}
+ rte_spinlock_recursive_unlock(&vdev_device_list_lock);
return error;
}
@@ -591,12 +586,6 @@ vdev_find_device(const struct rte_bus *bus, const struct rte_device *start,
return dev;
}
-static int
-vdev_unplug(struct rte_device *dev)
-{
- return rte_vdev_uninit(dev->name);
-}
-
static enum rte_iova_mode
vdev_get_iommu_class(void)
{
@@ -623,7 +612,7 @@ static struct rte_bus rte_vdev_bus = {
.find_device = vdev_find_device,
.match = vdev_bus_match,
.probe_device = vdev_probe_device,
- .unplug = vdev_unplug,
+ .unplug_device = vdev_unplug_device,
.parse = vdev_parse,
.dma_map = vdev_dma_map,
.dma_unmap = vdev_dma_unmap,
diff --git a/lib/eal/common/eal_common_dev.c b/lib/eal/common/eal_common_dev.c
index 2a2103ec57..762ed09e21 100644
--- a/lib/eal/common/eal_common_dev.c
+++ b/lib/eal/common/eal_common_dev.c
@@ -385,19 +385,21 @@ local_dev_remove(struct rte_device *dev)
{
int ret;
- if (dev->bus->unplug == NULL) {
- EAL_LOG(ERR, "Function unplug not supported by bus (%s)",
+ if (dev->bus->unplug_device == NULL) {
+ EAL_LOG(ERR, "Function unplug_device not supported by bus (%s)",
dev->bus->name);
return -ENOTSUP;
}
- ret = dev->bus->unplug(dev);
+ ret = dev->bus->unplug_device(dev);
if (ret) {
EAL_LOG(ERR, "Driver cannot detach the device (%s)",
dev->name);
return (ret < 0) ? ret : -ENOENT;
}
+ dev->driver = NULL;
+
return 0;
}
diff --git a/lib/eal/include/bus_driver.h b/lib/eal/include/bus_driver.h
index 9711e6712b..fde55ff06d 100644
--- a/lib/eal/include/bus_driver.h
+++ b/lib/eal/include/bus_driver.h
@@ -101,7 +101,7 @@ typedef int (*rte_bus_probe_device_t)(struct rte_driver *drv, struct rte_device
* 0 on success.
* !0 on error.
*/
-typedef int (*rte_bus_unplug_t)(struct rte_device *dev);
+typedef int (*rte_bus_unplug_device_t)(struct rte_device *dev);
/**
* Bus specific parsing function.
@@ -323,7 +323,7 @@ struct rte_bus {
rte_bus_find_device_t find_device; /**< Find a device on the bus */
rte_bus_match_t match; /**< Check if driver matches device */
rte_bus_probe_device_t probe_device; /**< Probe single device with driver */
- rte_bus_unplug_t unplug; /**< Remove single device from driver */
+ rte_bus_unplug_device_t unplug_device; /**< Remove single device from driver */
rte_bus_parse_t parse; /**< Parse a device name */
rte_bus_dev_compare_t dev_compare; /**< Compare two device names */
rte_bus_devargs_parse_t devargs_parse; /**< Parse bus devargs */
--
2.54.0
^ permalink raw reply related
* [PATCH v3 07/11] bus/ifpga: allocate interrupt during probing
From: David Marchand @ 2026-06-23 10:54 UTC (permalink / raw)
To: dev
Cc: thomas, stephen, bruce.richardson, fengchengwen, longli,
hemant.agrawal, Rosen Xu
In-Reply-To: <20260623105439.2144694-1-david.marchand@redhat.com>
Allocating the interrupt handle is a waste of memory if no device is
probed later (for example, if an allowlist is passed).
Instead, allocate this handle at the time probe_device is called.
Signed-off-by: David Marchand <david.marchand@redhat.com>
---
drivers/bus/ifpga/ifpga_bus.c | 29 ++++++++++++++++-------------
1 file changed, 16 insertions(+), 13 deletions(-)
diff --git a/drivers/bus/ifpga/ifpga_bus.c b/drivers/bus/ifpga/ifpga_bus.c
index 2c22329f65..af77d69ef6 100644
--- a/drivers/bus/ifpga/ifpga_bus.c
+++ b/drivers/bus/ifpga/ifpga_bus.c
@@ -144,14 +144,6 @@ ifpga_scan_one(struct rte_rawdev *rawdev,
afu_dev->id.uuid.uuid_high = 0;
afu_dev->id.port = afu_pr_conf.afu_id.port;
- /* Allocate interrupt instance */
- afu_dev->intr_handle =
- rte_intr_instance_alloc(RTE_INTR_INSTANCE_F_PRIVATE);
- if (afu_dev->intr_handle == NULL) {
- IFPGA_BUS_ERR("Failed to allocate intr handle");
- goto end;
- }
-
if (rawdev->dev_ops && rawdev->dev_ops->dev_info_get)
rawdev->dev_ops->dev_info_get(rawdev, afu_dev, sizeof(*afu_dev));
@@ -177,10 +169,7 @@ ifpga_scan_one(struct rte_rawdev *rawdev,
end:
rte_kvargs_free(kvlist);
free(path);
- if (afu_dev) {
- rte_intr_instance_free(afu_dev->intr_handle);
- free(afu_dev);
- }
+ free(afu_dev);
return NULL;
}
@@ -272,8 +261,22 @@ ifpga_probe_device(struct rte_driver *drv, struct rte_device *dev)
{
struct rte_afu_device *afu_dev = RTE_BUS_DEVICE(dev, *afu_dev);
struct rte_afu_driver *afu_drv = RTE_BUS_DRIVER(drv, *afu_drv);
+ int ret;
+
+ afu_dev->intr_handle =
+ rte_intr_instance_alloc(RTE_INTR_INSTANCE_F_PRIVATE);
+ if (afu_dev->intr_handle == NULL) {
+ IFPGA_BUS_ERR("Failed to allocate intr handle");
+ return -ENOMEM;
+ }
+
+ ret = afu_drv->probe(afu_dev);
+ if (ret != 0) {
+ rte_intr_instance_free(afu_dev->intr_handle);
+ afu_dev->intr_handle = NULL;
+ }
- return afu_drv->probe(afu_dev);
+ return ret;
}
/*
--
2.54.0
^ permalink raw reply related
* [PATCH v3 06/11] bus/vmbus: allocate interrupt during probing
From: David Marchand @ 2026-06-23 10:54 UTC (permalink / raw)
To: dev
Cc: thomas, stephen, bruce.richardson, fengchengwen, longli,
hemant.agrawal, Wei Hu
In-Reply-To: <20260623105439.2144694-1-david.marchand@redhat.com>
Allocating the interrupt handle is a waste of memory if no device is
probed later (for example, if an allowlist is passed).
Instead, allocate this handle at the time probe_device is called.
Signed-off-by: David Marchand <david.marchand@redhat.com>
Reviewed-by: Long Li <longli@microsoft.com>
---
Changes since v1:
- fixed/reordered interrupt handle allocation,
---
drivers/bus/vmbus/linux/vmbus_bus.c | 6 ------
drivers/bus/vmbus/vmbus_common.c | 18 ++++++++++++++++--
2 files changed, 16 insertions(+), 8 deletions(-)
diff --git a/drivers/bus/vmbus/linux/vmbus_bus.c b/drivers/bus/vmbus/linux/vmbus_bus.c
index 0af10f6a69..77d904ad6d 100644
--- a/drivers/bus/vmbus/linux/vmbus_bus.c
+++ b/drivers/bus/vmbus/linux/vmbus_bus.c
@@ -345,12 +345,6 @@ vmbus_scan_one(const char *name)
}
}
- /* Allocate interrupt handle instance */
- dev->intr_handle =
- rte_intr_instance_alloc(RTE_INTR_INSTANCE_F_PRIVATE);
- if (dev->intr_handle == NULL)
- goto error;
-
/* device is valid, add in list (sorted) */
VMBUS_LOG(DEBUG, "Adding vmbus device %s", name);
diff --git a/drivers/bus/vmbus/vmbus_common.c b/drivers/bus/vmbus/vmbus_common.c
index 74c1ddff69..bfb45e963c 100644
--- a/drivers/bus/vmbus/vmbus_common.c
+++ b/drivers/bus/vmbus/vmbus_common.c
@@ -100,10 +100,16 @@ vmbus_probe_device(struct rte_driver *drv, struct rte_device *dev)
return 1;
}
+ /* allocate interrupt handle instance */
+ vmbus_dev->intr_handle =
+ rte_intr_instance_alloc(RTE_INTR_INSTANCE_F_PRIVATE);
+ if (vmbus_dev->intr_handle == NULL)
+ return -ENOMEM;
+
/* map resources for device */
ret = rte_vmbus_map_device(vmbus_dev);
if (ret != 0)
- return ret;
+ goto free_intr;
if (vmbus_dev->device.numa_node < 0 && rte_socket_count() > 1)
VMBUS_LOG(INFO, "Device %s is not NUMA-aware", guid);
@@ -112,7 +118,15 @@ vmbus_probe_device(struct rte_driver *drv, struct rte_device *dev)
VMBUS_LOG(INFO, " probe driver: %s", vmbus_drv->driver.name);
ret = vmbus_drv->probe(vmbus_drv, vmbus_dev);
if (ret != 0)
- rte_vmbus_unmap_device(vmbus_dev);
+ goto unmap;
+
+ return 0;
+
+unmap:
+ rte_vmbus_unmap_device(vmbus_dev);
+free_intr:
+ rte_intr_instance_free(vmbus_dev->intr_handle);
+ vmbus_dev->intr_handle = NULL;
return ret;
}
--
2.54.0
^ permalink raw reply related
* [PATCH v3 05/11] bus/vmbus: fix interrupt leak in cleanup
From: David Marchand @ 2026-06-23 10:54 UTC (permalink / raw)
To: dev
Cc: thomas, stephen, bruce.richardson, fengchengwen, longli,
hemant.agrawal, stable, Wei Hu
In-Reply-To: <20260623105439.2144694-1-david.marchand@redhat.com>
When calling this bus cleanup, interrupt handle was not released.
Fixes: 65780eada9d9 ("bus/vmbus: support cleanup")
Cc: stable@dpdk.org
Signed-off-by: David Marchand <david.marchand@redhat.com>
Reviewed-by: Long Li <longli@microsoft.com>
---
drivers/bus/vmbus/vmbus_common.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/bus/vmbus/vmbus_common.c b/drivers/bus/vmbus/vmbus_common.c
index 01573927ce..74c1ddff69 100644
--- a/drivers/bus/vmbus/vmbus_common.c
+++ b/drivers/bus/vmbus/vmbus_common.c
@@ -150,6 +150,7 @@ rte_vmbus_cleanup(void)
error = -1;
rte_vmbus_unmap_device(dev);
+ rte_intr_instance_free(dev->intr_handle);
dev->device.driver = NULL;
rte_bus_remove_device(&rte_vmbus_bus, &dev->device);
--
2.54.0
^ permalink raw reply related
* [PATCH v3 04/11] bus/pci: fix mapping leak in bus cleanup
From: David Marchand @ 2026-06-23 10:54 UTC (permalink / raw)
To: dev
Cc: thomas, stephen, bruce.richardson, fengchengwen, longli,
hemant.agrawal, stable, Chenbo Xia, Nipun Gupta,
Morten Brørup, Kevin Laatz
In-Reply-To: <20260623105439.2144694-1-david.marchand@redhat.com>
When calling this bus cleanup, PCI resources were not unmapped.
Fixes: 1cab1a40ea9b ("bus: cleanup devices on shutdown")
Cc: stable@dpdk.org
Signed-off-by: David Marchand <david.marchand@redhat.com>
---
drivers/bus/pci/pci_common.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/drivers/bus/pci/pci_common.c b/drivers/bus/pci/pci_common.c
index fd18b8772b..791e9a7b49 100644
--- a/drivers/bus/pci/pci_common.c
+++ b/drivers/bus/pci/pci_common.c
@@ -344,6 +344,10 @@ pci_cleanup(void)
rte_errno = errno;
error = -1;
}
+
+ if (drv->drv_flags & RTE_PCI_DRV_NEED_MAPPING)
+ rte_pci_unmap_device(dev);
+
dev->device.driver = NULL;
free:
--
2.54.0
^ permalink raw reply related
* [PATCH v3 03/11] bus/vdev: remove driver setting in probe
From: David Marchand @ 2026-06-23 10:54 UTC (permalink / raw)
To: dev; +Cc: thomas, stephen, bruce.richardson, fengchengwen, longli,
hemant.agrawal
In-Reply-To: <20260623105439.2144694-1-david.marchand@redhat.com>
Setting the device driver field is not the responsibility of the
probe_device callback anymore, but that of EAL (see local_dev_probe).
Yet, because of the VDEV API, rte_vdev_init() must be updated to mark
the device as probed.
Fixes: f282771a04ef ("bus: factorize driver reference")
Signed-off-by: David Marchand <david.marchand@redhat.com>
---
Changes since v1:
- implement the same way as EAL,
---
drivers/bus/vdev/vdev.c | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/drivers/bus/vdev/vdev.c b/drivers/bus/vdev/vdev.c
index 3bddf8938c..09221ccdea 100644
--- a/drivers/bus/vdev/vdev.c
+++ b/drivers/bus/vdev/vdev.c
@@ -188,7 +188,6 @@ vdev_probe_device(struct rte_driver *drv, struct rte_device *dev)
struct rte_vdev_driver *vdev_drv = RTE_BUS_DRIVER(drv, *vdev_drv);
const char *name;
enum rte_iova_mode iova_mode;
- int ret;
name = rte_vdev_device_name(vdev_dev);
VDEV_LOG(DEBUG, "Search driver to probe device %s", name);
@@ -200,10 +199,7 @@ vdev_probe_device(struct rte_driver *drv, struct rte_device *dev)
return -1;
}
- ret = vdev_drv->probe(vdev_dev);
- if (ret == 0)
- vdev_dev->device.driver = &vdev_drv->driver;
- return ret;
+ return vdev_drv->probe(vdev_dev);
}
/* The caller shall be responsible for thread-safe */
@@ -328,7 +324,10 @@ rte_vdev_init(const char *name, const char *args)
} else if (rte_dev_is_probed(&dev->device)) {
ret = -EEXIST;
} else {
+ dev->device.driver = drv;
ret = rte_vdev_bus.probe_device(drv, &dev->device);
+ if (ret != 0)
+ dev->device.driver = NULL;
}
if (ret < 0) {
/* If fails, remove it from vdev list */
--
2.54.0
^ permalink raw reply related
* [PATCH v3 02/11] dma/idxd: remove next pointer in bus specific device
From: David Marchand @ 2026-06-23 10:54 UTC (permalink / raw)
To: dev
Cc: thomas, stephen, bruce.richardson, fengchengwen, longli,
hemant.agrawal, Kevin Laatz
In-Reply-To: <20260623105439.2144694-1-david.marchand@redhat.com>
The dma/idxd devices are now stored in a list of generic rte_device
objects.
Fixes: b4f0974a995b ("bus: factorize device list")
Signed-off-by: David Marchand <david.marchand@redhat.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
drivers/dma/idxd/idxd_bus.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/drivers/dma/idxd/idxd_bus.c b/drivers/dma/idxd/idxd_bus.c
index 4810d52f2a..2ec526ec09 100644
--- a/drivers/dma/idxd/idxd_bus.c
+++ b/drivers/dma/idxd/idxd_bus.c
@@ -34,7 +34,6 @@ struct dsa_wq_addr {
/** a DSA device instance */
struct rte_dsa_device {
struct rte_device device; /**< Inherit core device */
- TAILQ_ENTRY(rte_dsa_device) next; /**< next dev in list */
char wq_name[32]; /**< the workqueue name/number e.g. wq0.1 */
struct dsa_wq_addr addr; /**< Identifies the specific WQ */
--
2.54.0
^ permalink raw reply related
* [PATCH v3 01/11] bus: fix reference to plug callback
From: David Marchand @ 2026-06-23 10:54 UTC (permalink / raw)
To: dev; +Cc: thomas, stephen, bruce.richardson, fengchengwen, longli,
hemant.agrawal
In-Reply-To: <20260623105439.2144694-1-david.marchand@redhat.com>
Remove now unused typedef, update documentation
and some log following the callback rename.
Fixes: 76622feba9e6 ("bus: refactor device probe")
Signed-off-by: David Marchand <david.marchand@redhat.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
Changes since v1:
- remove missed rte_bus_plug_t typedef,
---
doc/guides/prog_guide/device_hotplug.rst | 2 +-
lib/eal/common/eal_common_dev.c | 2 +-
lib/eal/include/bus_driver.h | 13 -------------
3 files changed, 2 insertions(+), 15 deletions(-)
diff --git a/doc/guides/prog_guide/device_hotplug.rst b/doc/guides/prog_guide/device_hotplug.rst
index 9896a097f3..7eb7fbcc2b 100644
--- a/doc/guides/prog_guide/device_hotplug.rst
+++ b/doc/guides/prog_guide/device_hotplug.rst
@@ -234,7 +234,7 @@ When ``rte_dev_probe()`` is called, the following sequence occurs:
and the attach operation fails if the device is not found.
#. **Device Probe**:
- The bus's ``plug()`` method is called, which triggers the device driver's probe function.
+ The bus's ``probe_device()`` method is called, which triggers the device driver's probe function.
The probe function typically allocates device-specific resources,
maps device memory regions, initializes device hardware,
and registers the device with the appropriate subsystem (e.g., ethdev for network devices).
diff --git a/lib/eal/common/eal_common_dev.c b/lib/eal/common/eal_common_dev.c
index 48b631532a..2a2103ec57 100644
--- a/lib/eal/common/eal_common_dev.c
+++ b/lib/eal/common/eal_common_dev.c
@@ -193,7 +193,7 @@ local_dev_probe(const char *devargs, struct rte_device **new_dev)
goto err_devarg;
if (da->bus->probe_device == NULL) {
- EAL_LOG(ERR, "Function plug not supported by bus (%s)",
+ EAL_LOG(ERR, "Function probe_device not supported by bus (%s)",
da->bus->name);
ret = -ENOTSUP;
goto err_devarg;
diff --git a/lib/eal/include/bus_driver.h b/lib/eal/include/bus_driver.h
index 0a7e23d98d..9711e6712b 100644
--- a/lib/eal/include/bus_driver.h
+++ b/lib/eal/include/bus_driver.h
@@ -75,19 +75,6 @@ typedef struct rte_device *
(*rte_bus_find_device_t)(const struct rte_bus *bus, const struct rte_device *start,
rte_dev_cmp_t cmp, const void *data);
-/**
- * Implementation specific probe function which is responsible for linking
- * devices on that bus with applicable drivers.
- *
- * @param dev
- * Device pointer that was returned by a previous call to find_device.
- *
- * @return
- * 0 on success.
- * !0 on error.
- */
-typedef int (*rte_bus_plug_t)(struct rte_device *dev);
-
/**
* Implementation specific probe function which is responsible for linking
* devices on that bus with applicable drivers.
--
2.54.0
^ permalink raw reply related
* [PATCH v3 00/11] Bus cleanup infrastructure and fixes
From: David Marchand @ 2026-06-23 10:54 UTC (permalink / raw)
To: dev; +Cc: thomas, stephen, bruce.richardson, fengchengwen, longli,
hemant.agrawal
In-Reply-To: <20260611094551.1514962-1-david.marchand@redhat.com>
This is a followup of the previous bus refactoring.
See https://inbox.dpdk.org/dev/CAJFAV8zvFpLwz8SY8DUUezyJyM43eRZ17Yj30ex808eHC4ZE=g@mail.gmail.com/.
This series refactors the bus cleanup infrastructure to reduce code
duplication and fix resource leaks in several bus drivers.
It should address the leak Thomas pointed at.
The first part of the series (patches 1-7) addresses several bugs and
inconsistencies:
- Documentation and log message inconsistencies from earlier bus
refactoring
- Device list management issues in dma/idxd and bus/vdev
- Resource leaks in PCI and VMBUS bus cleanup (mappings and interrupts)
- Deferred interrupt allocation to probe time (VMBUS, IFPGA)
The core infrastructure changes (patches 8-9) introduce the generic
cleanup framework:
- Refactors unplug operations to be the counterpart of probe_device
- Implements rte_bus_generic_cleanup() to centralize cleanup logic
- Adds .free_device operation to struct rte_bus
The final patches (10-11) convert the VMBUS bus to use the generic
cleanup helper.
After this series, most buses use the generic cleanup helper, eliminating
duplicated code and ensuring consistent cleanup behavior across the
codebase.
NXP bus drivers require more (leak) fixes and refactoring and
are left untouched.
--
David Marchand
Changes since v2:
- moved ifpga interrupt allocation,
Changes since v1:
- dropped all changes on DPAA and FSLMC bus,
- added one more cleanup on the first patch,
- changed coding style in rte_vdev_init,
- implemented explicit .free_device instead of hack for calling free(),
- reordered interrupt handle allocation in VMBUS bus,
David Marchand (11):
bus: fix reference to plug callback
dma/idxd: remove next pointer in bus specific device
bus/vdev: remove driver setting in probe
bus/pci: fix mapping leak in bus cleanup
bus/vmbus: fix interrupt leak in cleanup
bus/vmbus: allocate interrupt during probing
bus/ifpga: allocate interrupt during probing
bus: align unplug with device probe
bus: implement cleanup in EAL
bus/vmbus: store name in bus specific device
bus/vmbus: support unplug
doc/guides/prog_guide/device_hotplug.rst | 20 +++---
doc/guides/rel_notes/release_26_07.rst | 4 ++
drivers/bus/auxiliary/auxiliary_common.c | 54 +++------------
drivers/bus/cdx/cdx.c | 29 ++------
drivers/bus/dpaa/dpaa_bus.c | 4 +-
drivers/bus/fslmc/fslmc_bus.c | 9 +--
drivers/bus/ifpga/ifpga_bus.c | 88 ++++++++----------------
drivers/bus/pci/pci_common.c | 68 +++---------------
drivers/bus/platform/platform.c | 26 ++-----
drivers/bus/uacce/uacce.c | 59 +++-------------
drivers/bus/vdev/vdev.c | 76 ++++++++------------
drivers/bus/vmbus/bus_vmbus_driver.h | 1 +
drivers/bus/vmbus/linux/vmbus_bus.c | 16 +----
drivers/bus/vmbus/vmbus_common.c | 58 +++++++++-------
drivers/dma/idxd/idxd_bus.c | 1 -
lib/eal/common/eal_common_bus.c | 33 ++++++++-
lib/eal/common/eal_common_dev.c | 10 +--
lib/eal/include/bus_driver.h | 51 +++++++++-----
18 files changed, 234 insertions(+), 373 deletions(-)
--
2.54.0
^ permalink raw reply
* RE: [PATCH v3 05/25] bpf/validate: introduce debugging interface
From: Marat Khalili @ 2026-06-23 10:29 UTC (permalink / raw)
To: Thomas Monjalon; +Cc: Konstantin Ananyev, dev@dpdk.org
In-Reply-To: <3BbIFKkFSyOqZrm3k570og@monjalon.net>
> -----Original Message-----
> From: Thomas Monjalon <thomas@monjalon.net>
> Sent: Tuesday 23 June 2026 11:19
> To: Marat Khalili <marat.khalili@huawei.com>
> Cc: Konstantin Ananyev <konstantin.ananyev@huawei.com>; dev@dpdk.org
> Subject: Re: [PATCH v3 05/25] bpf/validate: introduce debugging interface
>
> 12/06/2026 12:47, Marat Khalili:
> > +#ifndef LIST_FOREACH_SAFE
> > +/* We need this macro which neither Linux nor EAL for Linux include yet. */
> > +#define LIST_FOREACH_SAFE(var, head, field, tvar) \
> > + for ((var) = LIST_FIRST((head)); \
> > + (var) && ((tvar) = LIST_NEXT((var), field), 1); \
> > + (var) = (tvar))
> > +#else
> > +#ifdef RTE_EXEC_ENV_LINUX
> > +#error "Don't need LIST_FOREACH_SAFE in this version of DPDK anymore, remove it."
> > +#endif
> > +#endif
>
> It fails on Alpine Linux.
> Why adding this #error?
>
This is interesting. My mental model was that Linux is never going to have
LIST_FOREACH_SAFE, but DPDK will eventually gain its own polyfill. I was
actually expecting it to happen before my patch is published, so this was a
reminder to remove my own definition since it clearly belongs to some common
library. Turns out I was wrong on both counts: there are Linuxes that define
LIST_FOREACH_SAFE, and I managed to submit faster. Apart from these
organizational issues the whole else branch can be safely removed. Do you want
me to submit an updated version?
^ permalink raw reply
* Re: [PATCH v3 05/25] bpf/validate: introduce debugging interface
From: Thomas Monjalon @ 2026-06-23 10:18 UTC (permalink / raw)
To: Marat Khalili; +Cc: Konstantin Ananyev, dev
In-Reply-To: <20260612104743.6465-6-marat.khalili@huawei.com>
12/06/2026 12:47, Marat Khalili:
> +#ifndef LIST_FOREACH_SAFE
> +/* We need this macro which neither Linux nor EAL for Linux include yet. */
> +#define LIST_FOREACH_SAFE(var, head, field, tvar) \
> + for ((var) = LIST_FIRST((head)); \
> + (var) && ((tvar) = LIST_NEXT((var), field), 1); \
> + (var) = (tvar))
> +#else
> +#ifdef RTE_EXEC_ENV_LINUX
> +#error "Don't need LIST_FOREACH_SAFE in this version of DPDK anymore, remove it."
> +#endif
> +#endif
It fails on Alpine Linux.
Why adding this #error?
^ permalink raw reply
* RE: [PATCH v3 2/6] test/bpf: add JSET test with small immediate
From: Marat Khalili @ 2026-06-23 10:16 UTC (permalink / raw)
To: Stephen Hemminger, dev@dpdk.org; +Cc: Konstantin Ananyev
In-Reply-To: <20260621162524.82690-3-stephen@networkplumber.org>
This instruction has an interesting behavior for negative values of the immediate;
to be thorough I would test them as well. Fixed x86 should pass but who knows.
I would also welcome a test reproducing the problem with shifts.
For the narrow scope of this test,
Acked-by: Marat Khalili <marat.khalili@huawei.com>
> -----Original Message-----
> From: Stephen Hemminger <stephen@networkplumber.org>
> Sent: Sunday 21 June 2026 17:24
> To: dev@dpdk.org
> Cc: Stephen Hemminger <stephen@networkplumber.org>; Konstantin Ananyev <konstantin.ananyev@huawei.com>;
> Marat Khalili <marat.khalili@huawei.com>
> Subject: [PATCH v3 2/6] test/bpf: add JSET test with small immediate
>
> The existing jump test only used a 32-bit JSET mask,
> so the broken imm8 encoding of TEST in the x86 JIT was never exercised.
> Add a case with a byte-sized mask;
> run_test() runs it through the interpreter and the JIT.
>
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> ---
> app/test/test_bpf.c | 82 +++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 82 insertions(+)
>
> diff --git a/app/test/test_bpf.c b/app/test/test_bpf.c
> index dd24722450..e70dea736f 100644
> --- a/app/test/test_bpf.c
> +++ b/app/test/test_bpf.c
> @@ -3158,7 +3158,89 @@ static const struct ebpf_insn test_ld_mbuf3_prog[] = {
> };
>
> /* all bpf test cases */
> +/*
> + * JSET with a byte-sized mask: exercises the imm8 path of the TEST
> + * encoding in the x86 JIT (a 32-bit mask takes a different path).
> + */
> +static const struct ebpf_insn test_jset1_prog[] = {
> + {
> + .code = (BPF_ALU | EBPF_MOV | BPF_K),
> + .dst_reg = EBPF_REG_0,
> + .imm = 0,
> + },
> + {
> + .code = (BPF_LDX | BPF_MEM | BPF_B),
> + .dst_reg = EBPF_REG_2,
> + .src_reg = EBPF_REG_1,
> + .off = offsetof(struct dummy_offset, u8),
> + },
> + /* bit 0 is set in the input: branch is taken */
> + {
> + .code = (BPF_JMP | BPF_JSET | BPF_K),
> + .dst_reg = EBPF_REG_2,
> + .imm = 0x1,
> + .off = 1,
> + },
> + {
> + .code = (BPF_JMP | BPF_JA),
> + .off = 1,
> + },
> + {
> + .code = (EBPF_ALU64 | BPF_OR | BPF_K),
> + .dst_reg = EBPF_REG_0,
> + .imm = 0x1,
> + },
> + /* bit 1 is clear in the input: branch is not taken */
> + {
> + .code = (BPF_JMP | BPF_JSET | BPF_K),
> + .dst_reg = EBPF_REG_2,
> + .imm = 0x2,
> + .off = 1,
> + },
> + {
> + .code = (BPF_JMP | BPF_JA),
> + .off = 1,
> + },
> + {
> + .code = (EBPF_ALU64 | BPF_OR | BPF_K),
> + .dst_reg = EBPF_REG_0,
> + .imm = 0x2,
> + },
> + {
> + .code = (BPF_JMP | EBPF_EXIT),
> + },
> +};
> +
> +static void
> +test_jset1_prepare(void *arg)
> +{
> + struct dummy_offset *df = arg;
> +
> + memset(df, 0, sizeof(*df));
> + df->u8 = 0x1; /* bit 0 set, bit 1 clear */
> +}
> +
> +static int
> +test_jset1_check(uint64_t rc, const void *arg)
> +{
> + return cmp_res(__func__, 0x1, rc, arg, arg, 0);
> +}
> +
> static const struct bpf_test tests[] = {
> + {
> + .name = "test_jset1",
> + .arg_sz = sizeof(struct dummy_offset),
> + .prm = {
> + .ins = test_jset1_prog,
> + .nb_ins = RTE_DIM(test_jset1_prog),
> + .prog_arg = {
> + .type = RTE_BPF_ARG_PTR,
> + .size = sizeof(struct dummy_offset),
> + },
> + },
> + .prepare = test_jset1_prepare,
> + .check_result = test_jset1_check,
> + },
> {
> .name = "test_store1",
> .arg_sz = sizeof(struct dummy_offset),
> --
> 2.53.0
^ permalink raw reply
* RE: [PATCH v3 1/6] bpf/x86: fix JIT encoding of BPF_JSET with immediate
From: Marat Khalili @ 2026-06-23 10:11 UTC (permalink / raw)
To: Stephen Hemminger, dev@dpdk.org
Cc: stable@dpdk.org, Konstantin Ananyev, Ferruh Yigit
In-Reply-To: <20260621162524.82690-2-stephen@networkplumber.org>
With the condition that the commit message is proofread,
Acked-by: Marat Khalili <marat.khalili@huawei.com>
> -----Original Message-----
> From: Stephen Hemminger <stephen@networkplumber.org>
> Sent: Sunday 21 June 2026 17:24
> To: dev@dpdk.org
> Cc: Stephen Hemminger <stephen@networkplumber.org>; stable@dpdk.org; Konstantin Ananyev
> <konstantin.ananyev@huawei.com>; Marat Khalili <marat.khalili@huawei.com>; Ferruh Yigit
> <ferruh.yigit@amd.com>
> Subject: [PATCH v3 1/6] bpf/x86: fix JIT encoding of BPF_JSET with immediate
>
> Several places in the x86 JIT code assume that for small immediate
> values the instruction size is one byte; but it is not.
>
> The immediate form of the instruction takes a 32-bit value.
> The broken version of emit_tst_imm() emits TEST (0xF7 /0)
> but sized the immediate with imm_size(), which can return 1 byte.
>
> A small mask like BPF_JSET | BPF_K #0x1 then produced a
> 4-byte instruction that the CPU decodes as a 7-byte one,
> swallowing the following Jcc and crashing.
>
> Always emit a 32-bit immediate for TEST, and an 8-bit immediate for ROR and SHIFT.
The commit message needs to be LLMed for typos and factual mistakes.
>
> Bugzilla ID: 1959
> Fixes: cc752e43e079 ("bpf: add JIT compilation for x86_64 ISA")
> Cc: stable@dpdk.org
>
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> ---
> lib/bpf/bpf_jit_x86.c | 6 +++---
> 1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/lib/bpf/bpf_jit_x86.c b/lib/bpf/bpf_jit_x86.c
> index 88b1b5aeab..b14a574703 100644
> --- a/lib/bpf/bpf_jit_x86.c
> +++ b/lib/bpf/bpf_jit_x86.c
> @@ -300,7 +300,7 @@ emit_ror_imm(struct bpf_jit_state *st, uint32_t dreg, uint32_t imm)
> emit_rex(st, BPF_ALU, 0, dreg);
> emit_bytes(st, &ops, sizeof(ops));
> emit_modregrm(st, MOD_DIRECT, mods, dreg);
> - emit_imm(st, imm, imm_size(imm));
> + emit_imm(st, imm, sizeof(uint8_t));
The fix appears to be correct, although this function was only ever called with
imm == 8, so the problem was not reproducible.
> }
>
> /*
> @@ -441,7 +441,7 @@ emit_shift_imm(struct bpf_jit_state *st, uint32_t op, uint32_t dreg,
> uint32_t imm)
> {
> emit_shift(st, op, dreg);
> - emit_imm(st, imm, imm_size(imm));
> + emit_imm(st, imm, sizeof(uint8_t));
The fix appears to be correct, I would welcome a test reproducing the problem.
> }
>
> /*
> @@ -921,7 +921,7 @@ emit_tst_imm(struct bpf_jit_state *st, uint32_t op, uint32_t dreg, uint32_t imm)
> emit_rex(st, op, 0, dreg);
> emit_bytes(st, &ops, sizeof(ops));
> emit_modregrm(st, MOD_DIRECT, mods, dreg);
> - emit_imm(st, imm, imm_size(imm));
> + emit_imm(st, imm, sizeof(int32_t));
The fix appears to be correct.
> }
>
> static void
> --
> 2.53.0
^ permalink raw reply
* Re: [PATCH v6 00/11] bpf: introduce extensible load API
From: Thomas Monjalon @ 2026-06-23 9:19 UTC (permalink / raw)
To: Marat Khalili; +Cc: dev, Konstantin Ananyev, Stephen Hemminger, david.marchand
In-Reply-To: <20260617194425.12690-1-marat.khalili@huawei.com>
> Marat Khalili (11):
> bpf: make logging prefixes more consistent
> bpf: introduce extensible load API
> bpf: support up to 5 arguments
> bpf: add cBPF origin to rte_bpf_load_ex
> bpf: support rte_bpf_prm_ex with port callbacks
> bpf: support loading ELF files from memory
> test/bpf: test loading cBPF directly
> test/bpf: test loading ELF file from memory
> doc: add release notes for new extensible BPF API
> doc: add load API to BPF programmer's guide
> test/bpf: add tests for error handling contracts
RTE_EXPORT_EXPERIMENTAL_SYMBOL(, 26.11)
changed to
RTE_EXPORT_EXPERIMENTAL_SYMBOL(, 26.07)
Doc and tests split and squashed in relevant commits
to have atomic changes.
Applied
^ permalink raw reply
* Re: [PATCH v2 1/1] common/cnxk: add bulk Rx queue enable/disable
From: Jerin Jacob @ 2026-06-23 9:13 UTC (permalink / raw)
To: rkudurumalla
Cc: Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori, Satha Rao,
Harman Kalra, dev, jerinj
In-Reply-To: <20260623054424.1943810-1-rkudurumalla@marvell.com>
On Tue, Jun 23, 2026 at 11:21 AM rkudurumalla <rkudurumalla@marvell.com> wrote:
>
> From: Rakesh Kudurumalla <rkudurumalla@marvell.com>
>
> Add roc_nix_rq_multi_ena_dis() to batch RQ enable/disable mailbox
> AQ operations and process them in a single mbox_process() call.
>
> Skip mbox_process() when no messages were queued (e.g. all qid are
> UINT16_MAX) by checking num_msgs and msg_size via mbox_nonempty_nolock().
>
> Signed-off-by: Rakesh Kudurumalla <rkudurumalla@marvell.com>
Always add a changelog describing the diff from the previous version. Also Cc: the reviewer who commented on v1.
Cc: @Stephen Hemminger
Applied to dpdk-next-net-mrvl/for-main. Thanks
> ---
> drivers/common/cnxk/roc_mbox_priv.h | 6 +-
> drivers/common/cnxk/roc_nix.h | 2 +
> drivers/common/cnxk/roc_nix_queue.c | 102 ++++++++++++++++++
> .../common/cnxk/roc_platform_base_symbols.c | 1 +
> 4 files changed, 107 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/common/cnxk/roc_mbox_priv.h b/drivers/common/cnxk/roc_mbox_priv.h
> index 354c8fa52a..e9da1959b6 100644
> --- a/drivers/common/cnxk/roc_mbox_priv.h
> +++ b/drivers/common/cnxk/roc_mbox_priv.h
> @@ -113,14 +113,12 @@ mbox_rsp_init(uint16_t mbox_id, void *msghdr)
> }
>
> static inline bool
> -mbox_nonempty(struct mbox *mbox, int devid)
> +mbox_nonempty_nolock(struct mbox *mbox, int devid)
> {
> struct mbox_dev *mdev = &mbox->dev[devid];
> bool ret;
>
> - plt_spinlock_lock(&mdev->mbox_lock);
> - ret = mdev->num_msgs != 0;
> - plt_spinlock_unlock(&mdev->mbox_lock);
> + ret = mdev->num_msgs != 0 && mdev->msg_size != 0;
>
> return ret;
> }
> diff --git a/drivers/common/cnxk/roc_nix.h b/drivers/common/cnxk/roc_nix.h
> index 8ba8b3e0b6..f495e2a5ad 100644
> --- a/drivers/common/cnxk/roc_nix.h
> +++ b/drivers/common/cnxk/roc_nix.h
> @@ -1094,6 +1094,8 @@ int __roc_api roc_nix_rq_modify(struct roc_nix *roc_nix, struct roc_nix_rq *rq,
> bool ena);
> int __roc_api roc_nix_rq_cman_config(struct roc_nix *roc_nix, struct roc_nix_rq *rq);
> int __roc_api roc_nix_rq_ena_dis(struct roc_nix_rq *rq, bool enable);
> +int __roc_api roc_nix_rq_multi_ena_dis(struct roc_nix *roc_nix, struct roc_nix_rq *rqs,
> + int nb_rx_queues, bool enable);
> int __roc_api roc_nix_rq_is_sso_enable(struct roc_nix *roc_nix, uint32_t qid);
> int __roc_api roc_nix_rq_fini(struct roc_nix_rq *rq);
> int __roc_api roc_nix_cq_init(struct roc_nix *roc_nix, struct roc_nix_cq *cq);
> diff --git a/drivers/common/cnxk/roc_nix_queue.c b/drivers/common/cnxk/roc_nix_queue.c
> index ef9b651022..075c9c1591 100644
> --- a/drivers/common/cnxk/roc_nix_queue.c
> +++ b/drivers/common/cnxk/roc_nix_queue.c
> @@ -10,6 +10,49 @@
> /* Default SQB slack per SQ */
> #define ROC_NIX_SQB_SLACK_DFLT 24
>
> +typedef void *(*nix_aq_enq_alloc_t)(struct mbox *mbox);
> +
> +static inline void
> +nix_aq_enq_write_rq_ena(void *msg, uint16_t qid, bool enable)
> +{
> + struct nix_aq_enq_req *aq = msg;
> +
> + aq->qidx = qid;
> + aq->ctype = NIX_AQ_CTYPE_RQ;
> + aq->op = NIX_AQ_INSTOP_WRITE;
> + aq->rq.ena = enable;
> + aq->rq_mask.ena = ~(aq->rq_mask.ena);
> +}
> +
> +static inline int
> +nix_rq_bulk_ena_dis_fill(struct mbox *mbox, struct roc_nix_rq *rqs, int nb_rx_queues,
> + bool enable, nix_aq_enq_alloc_t alloc)
> +{
> + int i;
> +
> + for (i = 0; i < nb_rx_queues; i++) {
> + void *aq;
> + int rc;
> +
> + if (rqs[i].qid == UINT16_MAX)
> + continue;
> +
> + aq = alloc(mbox);
> + if (!aq) {
> + rc = mbox_process(mbox);
> + if (rc)
> + return rc;
> + aq = alloc(mbox);
> + if (!aq)
> + return -ENOSPC;
> + }
> +
> + nix_aq_enq_write_rq_ena(aq, rqs[i].qid, enable);
> + }
> +
> + return 0;
> +}
> +
> static inline uint32_t
> nix_qsize_to_val(enum nix_q_size qsize)
> {
> @@ -47,6 +90,28 @@ nix_rq_vwqe_flush(struct roc_nix_rq *rq, uint16_t vwqe_interval)
> }
> }
>
> +static int
> +nix_rq_bulk_ena_dis(struct nix *nix, struct roc_nix_rq *rqs, int nb_rx_queues, bool enable)
> +{
> + struct mbox *mbox = mbox_get((&nix->dev)->mbox);
> + nix_aq_enq_alloc_t alloc;
> + int rc;
> +
> + if (roc_model_is_cn9k())
> + alloc = (nix_aq_enq_alloc_t)mbox_alloc_msg_nix_aq_enq;
> + else if (roc_model_is_cn10k())
> + alloc = (nix_aq_enq_alloc_t)mbox_alloc_msg_nix_cn10k_aq_enq;
> + else /* CN20K */
> + alloc = (nix_aq_enq_alloc_t)mbox_alloc_msg_nix_cn20k_aq_enq;
> +
> + rc = nix_rq_bulk_ena_dis_fill(mbox, rqs, nb_rx_queues, enable, alloc);
> + if (!rc && mbox_nonempty_nolock(mbox, 0))
> + rc = mbox_process(mbox);
> +
> + mbox_put(mbox);
> + return rc;
> +}
> +
> int
> nix_rq_ena_dis(struct dev *dev, struct roc_nix_rq *rq, bool enable)
> {
> @@ -126,6 +191,43 @@ roc_nix_sq_ena_dis(struct roc_nix_sq *sq, bool enable)
> return rc;
> }
>
> +int
> +roc_nix_rq_multi_ena_dis(struct roc_nix *roc_nix, struct roc_nix_rq *rqs, int nb_rx_queues,
> + bool enable)
> +{
> + struct nix *nix = roc_nix_to_nix_priv(roc_nix);
> + struct roc_nix_rq *rq;
> + int rc, i;
> +
> + rc = nix_rq_bulk_ena_dis(nix, rqs, nb_rx_queues, enable);
> + if (rc) {
> + plt_err("Failed to %s Rx queues rc=%d pf=%d vf=%d nb_rx_queues=%d",
> + enable ? "enable" : "disable", rc, nix->dev.pf, nix->dev.vf,
> + nb_rx_queues);
> + return rc;
> + }
> +
> + for (i = 0; i < nb_rx_queues; i++) {
> + rq = &rqs[i];
> +
> + if (rq->qid == UINT16_MAX)
> + continue;
> +
> + nix_rq_vwqe_flush(rq, nix->vwqe_interval);
> +
> + /* Check for meta aura if RQ is enabled */
> + if (enable && nix->need_meta_aura) {
> + rc = roc_nix_inl_meta_aura_check(rq->roc_nix, rq);
> + if (rc) {
> + plt_err("Failed meta aura check for rq=%u rc=%d pf=%d vf=%d",
> + rq->qid, rc, nix->dev.pf, nix->dev.vf);
> + return rc;
> + }
> + }
> + }
> + return 0;
> +}
> +
> int
> roc_nix_rq_ena_dis(struct roc_nix_rq *rq, bool enable)
> {
> diff --git a/drivers/common/cnxk/roc_platform_base_symbols.c b/drivers/common/cnxk/roc_platform_base_symbols.c
> index ed34d4b05b..08b2f4c6f8 100644
> --- a/drivers/common/cnxk/roc_platform_base_symbols.c
> +++ b/drivers/common/cnxk/roc_platform_base_symbols.c
> @@ -353,6 +353,7 @@ RTE_EXPORT_INTERNAL_SYMBOL(roc_nix_rq_ena_dis)
> RTE_EXPORT_INTERNAL_SYMBOL(roc_nix_rq_is_sso_enable)
> RTE_EXPORT_INTERNAL_SYMBOL(roc_nix_rq_init)
> RTE_EXPORT_INTERNAL_SYMBOL(roc_nix_rq_modify)
> +RTE_EXPORT_INTERNAL_SYMBOL(roc_nix_rq_multi_ena_dis)
> RTE_EXPORT_INTERNAL_SYMBOL(roc_nix_rq_cman_config)
> RTE_EXPORT_INTERNAL_SYMBOL(roc_nix_rq_fini)
> RTE_EXPORT_INTERNAL_SYMBOL(roc_nix_cq_init)
> --
> 2.25.1
>
^ permalink raw reply
* Re: [PATCH v5] graph: add optional profiling stats
From: Jerin Jacob @ 2026-06-23 9:08 UTC (permalink / raw)
To: Morten Brørup
Cc: thomas, david.marchand, dev, Jerin Jacob, Kiran Kumar K,
Nithin Dabilpuram, Zhirun Yan
In-Reply-To: <98CBD80474FA8B44BF855DF32C47DC35F65938@smartserver.smartshare.dk>
On Tue, Jun 23, 2026 at 12:40 PM Morten Brørup <mb@smartsharesystems.com> wrote:
>
> +Thomas Monjalon & +David Marchand, as intended by Jerin
>
> > From: Jerin Jacob [mailto:jerinjacobk@gmail.com]
> > Sent: Tuesday, 23 June 2026 08.57
> >
> > On Tue, Jun 23, 2026 at 12:15 PM Morten Brørup
> > <mb@smartsharesystems.com> wrote:
> > >
> > > > From: Jerin Jacob [mailto:jerinjacobk@gmail.com]
> > > > Sent: Tuesday, 23 June 2026 07.13
> > > >
> > > > On Mon, Jun 22, 2026 at 12:11 AM Morten Brørup
> > > > <mb@smartsharesystems.com> wrote:
> > > > >
> > > > > Added graph node profiling stats, build time configurable by
> > enabling
> > > > > RTE_GRAPH_PROFILE in rte_config.h.
> > > > >
> > > > > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> > > >
> > > > Please update app/test/test_graph.c to validate this featue.
> > >
> > > Ack.
> > >
> > > > > @@ -92,7 +92,62 @@ rte_graph_obj_dump(FILE *f, struct rte_graph
> > *g,
> > > > bool all)
> > > > > fprintf(f, " total_sched_fail=%"
> > PRId64
> > > > "\n",
> > > > > n->dispatch.total_sched_fail);
> > > > > }
> > > > > - fprintf(f, " total_calls=%" PRId64 "\n", n-
> > > > >total_calls);
> > > > > + fprintf(f, " total_calls=%" PRIu64 "\n", n-
> > > > >total_calls);
> > > > > + if (rte_graph_has_stats_feature()) {
> > > > > + fprintf(f, " total_cycles=%" PRIu64
> > ",
> > > > avg cycles/call=%.1f\n",
> > > > > + n->total_cycles,
> > > > > + n->total_calls == 0 ?
> > > > (double)0 :
> > > > > + (double)n->total_cycles /
> > > > (double)n->total_calls);
> > > > > + }
> > > > > +#ifdef RTE_GRAPH_PROFILE
> > > >
> > > >
> > > > Please introduce rte_graph_has_profile_featue() similar to
> > > > rte_graph_has_stats_feature() to reduce if def clutter as possible.
> > >
> > > Disagree, see below.
> > >
> > > >
> > > > > + uint64_t calls = n->usage_stats[0].calls;
> > > > > + fprintf(f, " objs[0]\n");
> > > > > + fprintf(f, " calls=%" PRIu64 ", cycles=%"
> > > > PRIu64 ", avg cycles/call=%.1f\n",
> > > > > + calls,
> > > >
> > > > >
> > > > > diff --git a/lib/graph/rte_graph_worker_common.h
> > > > b/lib/graph/rte_graph_worker_common.h
> > > > > index 4ab53a533e..0d8039575d 100644
> > > > > --- a/lib/graph/rte_graph_worker_common.h
> > > > > +++ b/lib/graph/rte_graph_worker_common.h
> > > > > @@ -144,12 +144,26 @@ struct __rte_cache_aligned rte_node {
> > > > > rte_node_process_t process; /**< Process
> > > > function. */
> > > > > uint64_t process_u64;
> > > > > };
> > > > > + /** Fast path area cache line 3. */
> > > > > +#ifdef RTE_GRAPH_PROFILE
> > > > > + struct {
> > > > > + uint64_t calls; /**< Calls processing
> > > > resp. 0 or 1 objects. */
> > > > > + uint64_t cycles; /**< Cycles spent
> > > > processing resp. 0 or 1 objects. */
> > > > > + } usage_stats[2]; /**< Usage when this node
> > > > processed 0 or 1 objects. */
> > > > > + uint64_t full_burst_calls; /**< Calls processing
> > a
> > > > full burst of objects. */
> > > > > + uint64_t full_burst_cycles; /**< Cycles spent
> > > > processing a full burst of objects. */
> > > > > + uint64_t half_burst_calls; /**< Calls processing
> > a
> > > > half burst of objects. */
> > > > > + uint64_t half_burst_cycles; /**< Cycles spent
> > > > processing a half burst of objects. */
> > > > > + /** Fast path area cache line 4. */
> > > > > +#endif
> > > >
> > > > Is it an ABI breakage?
> > >
> > > No. The modifications are enclosed in #ifdef, and disabled by
> > default.
> > > It is generally required that when rte_config.h options are modified,
> > both the application and DPDK itself are built together; and then
> > API/ABI breakage becomes irrelevant.
> >
> >
> > Yes. I don't know the current policy for this. Adding @Thomas Monjalon
> > @David Marchand
> >
> >
> > >
> > > IMO, we should keep our structures lean in release builds. This means
> > that fields used for detailed profiling, advanced debugging, cookie
> > validation, etc. should use the #ifdef pattern rather than the
> > rte_lib_has_some_feature() pattern; especially if they affect the size
> > of a structure. And when those fields are not present, any code
> > accessing them cannot use the rte_lib_has_some_feature() pattern.
> > > The mbuf and mempool libraries also use #ifdef pattern for similar
> > features.
> >
> > Yes for the structure inclusion we can use #ifdef. But inside the
> > function we can use rte_lib_has_some_feature() scheme. Reasons are :
> > 1)It will remove the ifdef cultter
> > 2)Detect the compilation issue even if the feature is disabled. This
> > will make sure reduce the build options to enable build sanity
> > 3) Compiler is smart enough to understand to disable the block if the
> > feature is not enabled.(Just like #ifdef)
>
> I agree with these advantages.
> But a function using rte_lib_has_some_feature() cannot access non-existing fields:
> https://godbolt.org/z/s3nKx45Ms
I missed that.
You can add these new struct fields in the slow-path area of rte_node,
above offsetof(struct rte_node, ctx).
Use RTE_NEXT_ABI to get around the ABI breakage issue.
>
> So sometimes #ifdef is required in the code too.
>
> >
> >
> >
> > >
> > > >
> > > > > alignas(RTE_CACHE_LINE_MIN_SIZE) struct rte_node
> > > > *nodes[]; /**< Next nodes. */
> > > > > };
> > > > > };
> > > > >
^ permalink raw reply
* [PATCH] common/cnxk: fix inline dev null dereference
From: Aarnav JP @ 2026-06-23 8:54 UTC (permalink / raw)
To: dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
Satha Rao, Harman Kalra, Rakesh Kudurumalla
Cc: jerinj, rbhansali, Aarnav JP, stable
inl_dev is initialized to NULL and only assigned within the
if (idev && idev->nix_inl_dev) block.
Move inl_dev->res_addr_offset and inl_dev->cpt_cq_ena
accesses inside this null-guarded block in
nix_inl_inb_ipsec_sa_tbl_setup() and nix_inl_reass_inb_sa_tbl_setup()
to avoid dereferencing a null pointer.
Fixes: 3fdf3e53f3c4 ("common/cnxk: enable CPT CQ for inline IPsec inbound")
Cc: stable@dpdk.org
Signed-off-by: Aarnav JP <ajp@marvell.com>
---
drivers/common/cnxk/roc_nix_inl.c | 11 +++++------
1 file changed, 5 insertions(+), 6 deletions(-)
diff --git a/drivers/common/cnxk/roc_nix_inl.c b/drivers/common/cnxk/roc_nix_inl.c
index db101e71a5..935dd37778 100644
--- a/drivers/common/cnxk/roc_nix_inl.c
+++ b/drivers/common/cnxk/roc_nix_inl.c
@@ -409,7 +409,7 @@ nix_inl_inb_ipsec_sa_tbl_setup(struct roc_nix *roc_nix)
struct nix_inl_dev *inl_dev = NULL;
uint64_t max_sa, i, sa_pow2_sz;
uint64_t sa_idx_w, lenm1_max;
- uint64_t res_addr_offset;
+ uint64_t res_addr_offset = 0;
uint8_t profile_id = 0;
struct mbox *mbox;
size_t inb_sa_sz;
@@ -503,13 +503,12 @@ nix_inl_inb_ipsec_sa_tbl_setup(struct roc_nix *roc_nix)
def_cptq = 0;
else
def_cptq = inl_dev->nix_inb_qids[inl_dev->inb_cpt_lf_id];
+ res_addr_offset = (uint64_t)(inl_dev->res_addr_offset & 0xFF) << 48;
+ if (res_addr_offset)
+ res_addr_offset |= (1UL << 56);
+ cpt_cq_ena = (uint64_t)inl_dev->cpt_cq_ena << 63;
}
- res_addr_offset = (uint64_t)(inl_dev->res_addr_offset & 0xFF) << 48;
- if (res_addr_offset)
- res_addr_offset |= (1UL << 56);
-
- cpt_cq_ena = (uint64_t)inl_dev->cpt_cq_ena << 63;
lf_cfg->enable = 1;
lf_cfg->profile_id = profile_id; /* IPsec profile is 0th one */
lf_cfg->rx_inline_sa_base = (uintptr_t)nix->inb_sa_base[profile_id] | cpt_cq_ena;
--
2.43.0
^ permalink raw reply related
* Re: [PATCH v5] graph: add optional profiling stats
From: saeed bishara @ 2026-06-23 8:33 UTC (permalink / raw)
To: Morten Brørup
Cc: Jerin Jacob, dev, Jerin Jacob, Kiran Kumar K, Nithin Dabilpuram,
Zhirun Yan
In-Reply-To: <CALBAE1OZDXfH1FoVLJ=vXVqr=9hYPgqeCzN8h278wN8FbU+XaQ@mail.gmail.com>
> > > > + /** Fast path area cache line 3. */
> > > > +#ifdef RTE_GRAPH_PROFILE
> > > > + struct {
> > > > + uint64_t calls; /**< Calls processing
> > > resp. 0 or 1 objects. */
> > > > + uint64_t cycles; /**< Cycles spent
> > > processing resp. 0 or 1 objects. */
> > > > + } usage_stats[2]; /**< Usage when this node
> > > processed 0 or 1 objects. */
> > > > + uint64_t full_burst_calls; /**< Calls processing a
> > > full burst of objects. */
> > > > + uint64_t full_burst_cycles; /**< Cycles spent
> > > processing a full burst of objects. */
> > > > + uint64_t half_burst_calls; /**< Calls processing a
> > > half burst of objects. */
> > > > + uint64_t half_burst_cycles; /**< Cycles spent
> > > processing a half burst of objects. */
> > > > + /** Fast path area cache line 4. */
> > > > +#endif
> > >
> > > Is it an ABI breakage?
Can you consider one array for all cases?
Also, instead of adding a cache line for this profiling data, can we
share it with cache line 1, which is used solely for xstats?
^ permalink raw reply
* Re: [PATCH] app/dma_perf: skip case if worker maps to main lcore
From: Bruce Richardson @ 2026-06-23 8:07 UTC (permalink / raw)
To: Rupesh Chiluka; +Cc: Cheng Jiang, Chengwen Feng, dev, gakhil, anoobj, ktejasree
In-Reply-To: <20260623045841.2602104-1-rchiluka@marvell.com>
On Tue, Jun 23, 2026 at 10:28:41AM +0530, Rupesh Chiluka wrote:
> Refuse to run DMA/CPU mem-copy cases when any worker is bound to the
> EAL main lcore.
>
Can you explain a bit more why?
> Signed-off-by: Rupesh Chiluka <rchiluka@marvell.com>
> ---
> app/test-dma-perf/main.c | 9 +++++++++
> 1 file changed, 9 insertions(+)
>
> diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
> index 4249dcfd3d..b6aa5b8401 100644
> --- a/app/test-dma-perf/main.c
> +++ b/app/test-dma-perf/main.c
> @@ -109,6 +109,7 @@ run_test_case(struct test_configure *case_cfg)
> static void
> run_test(uint32_t case_id, struct test_configure *case_cfg)
> {
> + uint32_t main_lcore = rte_get_main_lcore();
> uint32_t nb_lcores = rte_lcore_count();
> struct test_configure_entry *mem_size = &case_cfg->mem_size;
> struct test_configure_entry *buf_size = &case_cfg->buf_size;
> @@ -122,6 +123,14 @@ run_test(uint32_t case_id, struct test_configure *case_cfg)
> return;
> }
>
> + for (uint32_t i = 0; i < case_cfg->num_worker; i++) {
> + if (case_cfg->dma_config[i].lcore_dma_map.lcore == main_lcore) {
> + printf("Case %u: worker %u cannot run on the EAL main lcore (%u).\n",
> + case_id, i, main_lcore);
> + return;
> + }
> + }
> +
> printf("Number of used lcores: %u.\n", nb_lcores);
>
> if (mem_size->incr != 0)
> --
> 2.48.1
>
^ permalink raw reply
* Re: [PATCH] common/mlx5: fix high SMMU TLB miss with mempool alignment
From: yangxingui @ 2026-06-23 7:11 UTC (permalink / raw)
To: dev
Cc: stephen, david.marchand, thomas, dsosnowski, viacheslavo, bingz,
orika, suanmingm, matan, dmitry.kozliuk, fengchengwen,
yangshuaisong, lihuisong, liuyonglong, kangfenglong
In-Reply-To: <20260612071434.2722918-1-yangxingui@huawei.com>
Friendly ping...
On 2026/6/12 15:14, Xingui Yang wrote:
> From: Shuaisong Yang <yangshuaisong@h-partners.com>
>
> On Kunpeng SoC with mlx CX7, dpdk-l3fwd with intra-NUMA core pinning
> under SMMU nonstrict/strict mode shows about 30% performance degradation
> compared to cross-NUMA pinning. With SMMU disabled or passthrough mode,
> intra-NUMA performs as expected (slightly better than cross-NUMA).
>
> CX7 in NUMA1
> NUMA node0 CPU(s): 0-39
> NUMA node1 CPU(s): 40-79
>
> intra-NUMA:
> dpdk-l3fwd -l 40-55 -n 4 -a 0000:17:00.1,mprq_en=1 -- -p 0x1 -P \
> --config='(0,0,40),(0,1,41),(0,2,42),(0,3,43),(0,4,44),\
> (0,5,45),(0,6,46),(0,7,47),(0,8,48),(0,9,49),\
> (0,10,50),(0,11,51),(0,12,52),(0,13,53),\
> (0,14,54),(0,15,55)' \
> --rx-queue-size=4096 --tx-queue-size=4096 --rx-burst=64
>
> cross-NUMA:
> dpdk-l3fwd -l 11-26 -n 4 -a 0000:17:00.1,mprq_en=1 -- -p 0x1 -P \
> --config='(0,0,11),(0,1,12),(0,2,13),(0,3,14),(0,4,15),\
> (0,5,16),(0,6,17),(0,7,18),(0,8,19),(0,9,20),\
> (0,10,21),(0,11,22),(0,12,23),(0,13,24),\
> (0,14,25),(0,15,26)' \
> --rx-queue-size=4096 --tx-queue-size=4096 --rx-burst=64
>
> The root cause is that under SMMU enabled mode, the mempool allocated
> for intra-NUMA pinning is aligned to system page size instead of
> hugepage size, while cross-NUMA pinning correctly uses hugepage size
> alignment. This causes high TLB miss rates under SMMU.
>
> Align all memory ranges to hugepage boundaries during mempool
> registration to ensure hugepage_sz alignment, thereby reducing TLB
> misses and fixing the intra-NUMA performance degradation.
>
> Fixes: 690b2a88c2f7 ("common/mlx5: add mempool registration facilities")
> Cc: stable@dpdk.org
>
> Signed-off-by: Shuaisong Yang <yangshuaisong@h-partners.com>
> Signed-off-by: Xingui Yang <yangxingui@huawei.com>
> ---
> .mailmap | 1 +
> drivers/common/mlx5/mlx5_common_mr.c | 53 +++++++++++++++++++---------
> 2 files changed, 37 insertions(+), 17 deletions(-)
>
> diff --git a/.mailmap b/.mailmap
> index 4001e5fb0e..e13e88db1b 100644
> --- a/.mailmap
> +++ b/.mailmap
> @@ -1979,3 +1979,4 @@ Zongyu Wu <wuzongyu1@huawei.com>
> Zorik Machulsky <zorik@amazon.com>
> Zyta Szpak <zyta@marvell.com> <zr@semihalf.com>
> Zyta Szpak <zyta@marvell.com> <zyta.szpak@semihalf.com>
> +Shuaisong Yang <yangshuaisong@h-partners.com>
> diff --git a/drivers/common/mlx5/mlx5_common_mr.c b/drivers/common/mlx5/mlx5_common_mr.c
> index aa2d5e88a4..aee037abb4 100644
> --- a/drivers/common/mlx5/mlx5_common_mr.c
> +++ b/drivers/common/mlx5/mlx5_common_mr.c
> @@ -1524,7 +1524,9 @@ mlx5_get_mempool_ranges(struct rte_mempool *mp, bool is_extmem,
> * @param[in] is_extmem
> * Whether the pool is contains only external pinned buffers.
> * @param[out] out
> - * Receives memory ranges to register, aligned to the system page size.
> + * Receives memory ranges to register. Aligned to the hugepage size
> + * if all ranges reside on hugepages of the same size,
> + * otherwise aligned to the system page size.
> * The caller must release them with free().
> * @param[out] out_n
> * Receives the number of @p out items.
> @@ -1541,7 +1543,9 @@ mlx5_mempool_reg_analyze(struct rte_mempool *mp, bool is_extmem,
> {
> struct mlx5_range *ranges = NULL;
> unsigned int i, ranges_n = 0;
> + bool same_hugepage_sz = true;
> struct rte_memseg_list *msl;
> + uint64_t hugepage_sz = 0;
>
> if (mlx5_get_mempool_ranges(mp, is_extmem, &ranges, &ranges_n) < 0) {
> DRV_LOG(ERR, "Cannot get address ranges for mempool %s",
> @@ -1552,28 +1556,43 @@ mlx5_mempool_reg_analyze(struct rte_mempool *mp, bool is_extmem,
> *share_hugepage = false;
> msl = rte_mem_virt2memseg_list((void *)ranges[0].start);
> if (msl != NULL) {
> - uint64_t hugepage_sz = 0;
> + hugepage_sz = msl->page_sz;
>
> /* Check that all ranges are on pages of the same size. */
> for (i = 0; i < ranges_n; i++) {
> - if (hugepage_sz != 0 && hugepage_sz != msl->page_sz)
> + struct rte_memseg_list *range_msl;
> + range_msl = rte_mem_virt2memseg_list(
> + (void *)ranges[i].start);
> + if (range_msl == NULL ||
> + range_msl->page_sz != hugepage_sz) {
> + same_hugepage_sz = false;
> break;
> - hugepage_sz = msl->page_sz;
> + }
> }
> - if (i == ranges_n) {
> - /*
> - * If the entire pool is within one hugepage,
> - * combine all ranges into one of the hugepage size.
> - */
> - uintptr_t reg_start = ranges[0].start;
> - uintptr_t reg_end = ranges[ranges_n - 1].end;
> - uintptr_t hugepage_start =
> - RTE_ALIGN_FLOOR(reg_start, hugepage_sz);
> - uintptr_t hugepage_end = hugepage_start + hugepage_sz;
> - if (reg_end < hugepage_end) {
> - ranges[0].start = hugepage_start;
> + }
> + if (same_hugepage_sz && hugepage_sz > 0) {
> + unsigned int orig_ranges_n = ranges_n;
> +
> + for (i = 0; i < ranges_n; i++) {
> + ranges[i].start = RTE_ALIGN_FLOOR(ranges[i].start,
> + hugepage_sz);
> + ranges[i].end = RTE_ALIGN_CEIL(ranges[i].end,
> + hugepage_sz);
> + }
> + ranges_n = 1;
> + for (i = 1; i < orig_ranges_n; i++) {
> + if (ranges[ranges_n - 1].end >= ranges[i].start)
> + ranges[ranges_n - 1].end =
> + RTE_MAX(ranges[ranges_n - 1].end,
> + ranges[i].end);
> + else
> + ranges[ranges_n++] = ranges[i];
> + }
> + if (ranges_n == 1) {
> + uintptr_t hugepage_end = ranges[0].start + hugepage_sz;
> +
> + if (ranges[0].end <= hugepage_end) {
> ranges[0].end = hugepage_end;
> - ranges_n = 1;
> *share_hugepage = true;
> }
> }
>
^ permalink raw reply
* RE: [PATCH v5] graph: add optional profiling stats
From: Morten Brørup @ 2026-06-23 7:10 UTC (permalink / raw)
To: Jerin Jacob, thomas, david.marchand
Cc: dev, Jerin Jacob, Kiran Kumar K, Nithin Dabilpuram, Zhirun Yan
In-Reply-To: <CALBAE1OZDXfH1FoVLJ=vXVqr=9hYPgqeCzN8h278wN8FbU+XaQ@mail.gmail.com>
+Thomas Monjalon & +David Marchand, as intended by Jerin
> From: Jerin Jacob [mailto:jerinjacobk@gmail.com]
> Sent: Tuesday, 23 June 2026 08.57
>
> On Tue, Jun 23, 2026 at 12:15 PM Morten Brørup
> <mb@smartsharesystems.com> wrote:
> >
> > > From: Jerin Jacob [mailto:jerinjacobk@gmail.com]
> > > Sent: Tuesday, 23 June 2026 07.13
> > >
> > > On Mon, Jun 22, 2026 at 12:11 AM Morten Brørup
> > > <mb@smartsharesystems.com> wrote:
> > > >
> > > > Added graph node profiling stats, build time configurable by
> enabling
> > > > RTE_GRAPH_PROFILE in rte_config.h.
> > > >
> > > > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> > >
> > > Please update app/test/test_graph.c to validate this featue.
> >
> > Ack.
> >
> > > > @@ -92,7 +92,62 @@ rte_graph_obj_dump(FILE *f, struct rte_graph
> *g,
> > > bool all)
> > > > fprintf(f, " total_sched_fail=%"
> PRId64
> > > "\n",
> > > > n->dispatch.total_sched_fail);
> > > > }
> > > > - fprintf(f, " total_calls=%" PRId64 "\n", n-
> > > >total_calls);
> > > > + fprintf(f, " total_calls=%" PRIu64 "\n", n-
> > > >total_calls);
> > > > + if (rte_graph_has_stats_feature()) {
> > > > + fprintf(f, " total_cycles=%" PRIu64
> ",
> > > avg cycles/call=%.1f\n",
> > > > + n->total_cycles,
> > > > + n->total_calls == 0 ?
> > > (double)0 :
> > > > + (double)n->total_cycles /
> > > (double)n->total_calls);
> > > > + }
> > > > +#ifdef RTE_GRAPH_PROFILE
> > >
> > >
> > > Please introduce rte_graph_has_profile_featue() similar to
> > > rte_graph_has_stats_feature() to reduce if def clutter as possible.
> >
> > Disagree, see below.
> >
> > >
> > > > + uint64_t calls = n->usage_stats[0].calls;
> > > > + fprintf(f, " objs[0]\n");
> > > > + fprintf(f, " calls=%" PRIu64 ", cycles=%"
> > > PRIu64 ", avg cycles/call=%.1f\n",
> > > > + calls,
> > >
> > > >
> > > > diff --git a/lib/graph/rte_graph_worker_common.h
> > > b/lib/graph/rte_graph_worker_common.h
> > > > index 4ab53a533e..0d8039575d 100644
> > > > --- a/lib/graph/rte_graph_worker_common.h
> > > > +++ b/lib/graph/rte_graph_worker_common.h
> > > > @@ -144,12 +144,26 @@ struct __rte_cache_aligned rte_node {
> > > > rte_node_process_t process; /**< Process
> > > function. */
> > > > uint64_t process_u64;
> > > > };
> > > > + /** Fast path area cache line 3. */
> > > > +#ifdef RTE_GRAPH_PROFILE
> > > > + struct {
> > > > + uint64_t calls; /**< Calls processing
> > > resp. 0 or 1 objects. */
> > > > + uint64_t cycles; /**< Cycles spent
> > > processing resp. 0 or 1 objects. */
> > > > + } usage_stats[2]; /**< Usage when this node
> > > processed 0 or 1 objects. */
> > > > + uint64_t full_burst_calls; /**< Calls processing
> a
> > > full burst of objects. */
> > > > + uint64_t full_burst_cycles; /**< Cycles spent
> > > processing a full burst of objects. */
> > > > + uint64_t half_burst_calls; /**< Calls processing
> a
> > > half burst of objects. */
> > > > + uint64_t half_burst_cycles; /**< Cycles spent
> > > processing a half burst of objects. */
> > > > + /** Fast path area cache line 4. */
> > > > +#endif
> > >
> > > Is it an ABI breakage?
> >
> > No. The modifications are enclosed in #ifdef, and disabled by
> default.
> > It is generally required that when rte_config.h options are modified,
> both the application and DPDK itself are built together; and then
> API/ABI breakage becomes irrelevant.
>
>
> Yes. I don't know the current policy for this. Adding @Thomas Monjalon
> @David Marchand
>
>
> >
> > IMO, we should keep our structures lean in release builds. This means
> that fields used for detailed profiling, advanced debugging, cookie
> validation, etc. should use the #ifdef pattern rather than the
> rte_lib_has_some_feature() pattern; especially if they affect the size
> of a structure. And when those fields are not present, any code
> accessing them cannot use the rte_lib_has_some_feature() pattern.
> > The mbuf and mempool libraries also use #ifdef pattern for similar
> features.
>
> Yes for the structure inclusion we can use #ifdef. But inside the
> function we can use rte_lib_has_some_feature() scheme. Reasons are :
> 1)It will remove the ifdef cultter
> 2)Detect the compilation issue even if the feature is disabled. This
> will make sure reduce the build options to enable build sanity
> 3) Compiler is smart enough to understand to disable the block if the
> feature is not enabled.(Just like #ifdef)
I agree with these advantages.
But a function using rte_lib_has_some_feature() cannot access non-existing fields:
https://godbolt.org/z/s3nKx45Ms
So sometimes #ifdef is required in the code too.
>
>
>
> >
> > >
> > > > alignas(RTE_CACHE_LINE_MIN_SIZE) struct rte_node
> > > *nodes[]; /**< Next nodes. */
> > > > };
> > > > };
> > > >
^ permalink raw reply
* Re: [PATCH v5] graph: add optional profiling stats
From: Jerin Jacob @ 2026-06-23 6:56 UTC (permalink / raw)
To: Morten Brørup
Cc: dev, Jerin Jacob, Kiran Kumar K, Nithin Dabilpuram, Zhirun Yan
In-Reply-To: <98CBD80474FA8B44BF855DF32C47DC35F65937@smartserver.smartshare.dk>
On Tue, Jun 23, 2026 at 12:15 PM Morten Brørup <mb@smartsharesystems.com> wrote:
>
> > From: Jerin Jacob [mailto:jerinjacobk@gmail.com]
> > Sent: Tuesday, 23 June 2026 07.13
> >
> > On Mon, Jun 22, 2026 at 12:11 AM Morten Brørup
> > <mb@smartsharesystems.com> wrote:
> > >
> > > Added graph node profiling stats, build time configurable by enabling
> > > RTE_GRAPH_PROFILE in rte_config.h.
> > >
> > > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> >
> > Please update app/test/test_graph.c to validate this featue.
>
> Ack.
>
> > > @@ -92,7 +92,62 @@ rte_graph_obj_dump(FILE *f, struct rte_graph *g,
> > bool all)
> > > fprintf(f, " total_sched_fail=%" PRId64
> > "\n",
> > > n->dispatch.total_sched_fail);
> > > }
> > > - fprintf(f, " total_calls=%" PRId64 "\n", n-
> > >total_calls);
> > > + fprintf(f, " total_calls=%" PRIu64 "\n", n-
> > >total_calls);
> > > + if (rte_graph_has_stats_feature()) {
> > > + fprintf(f, " total_cycles=%" PRIu64 ",
> > avg cycles/call=%.1f\n",
> > > + n->total_cycles,
> > > + n->total_calls == 0 ?
> > (double)0 :
> > > + (double)n->total_cycles /
> > (double)n->total_calls);
> > > + }
> > > +#ifdef RTE_GRAPH_PROFILE
> >
> >
> > Please introduce rte_graph_has_profile_featue() similar to
> > rte_graph_has_stats_feature() to reduce if def clutter as possible.
>
> Disagree, see below.
>
> >
> > > + uint64_t calls = n->usage_stats[0].calls;
> > > + fprintf(f, " objs[0]\n");
> > > + fprintf(f, " calls=%" PRIu64 ", cycles=%"
> > PRIu64 ", avg cycles/call=%.1f\n",
> > > + calls,
> >
> > >
> > > diff --git a/lib/graph/rte_graph_worker_common.h
> > b/lib/graph/rte_graph_worker_common.h
> > > index 4ab53a533e..0d8039575d 100644
> > > --- a/lib/graph/rte_graph_worker_common.h
> > > +++ b/lib/graph/rte_graph_worker_common.h
> > > @@ -144,12 +144,26 @@ struct __rte_cache_aligned rte_node {
> > > rte_node_process_t process; /**< Process
> > function. */
> > > uint64_t process_u64;
> > > };
> > > + /** Fast path area cache line 3. */
> > > +#ifdef RTE_GRAPH_PROFILE
> > > + struct {
> > > + uint64_t calls; /**< Calls processing
> > resp. 0 or 1 objects. */
> > > + uint64_t cycles; /**< Cycles spent
> > processing resp. 0 or 1 objects. */
> > > + } usage_stats[2]; /**< Usage when this node
> > processed 0 or 1 objects. */
> > > + uint64_t full_burst_calls; /**< Calls processing a
> > full burst of objects. */
> > > + uint64_t full_burst_cycles; /**< Cycles spent
> > processing a full burst of objects. */
> > > + uint64_t half_burst_calls; /**< Calls processing a
> > half burst of objects. */
> > > + uint64_t half_burst_cycles; /**< Cycles spent
> > processing a half burst of objects. */
> > > + /** Fast path area cache line 4. */
> > > +#endif
> >
> > Is it an ABI breakage?
>
> No. The modifications are enclosed in #ifdef, and disabled by default.
> It is generally required that when rte_config.h options are modified, both the application and DPDK itself are built together; and then API/ABI breakage becomes irrelevant.
Yes. I don't know the current policy for this. Adding @Thomas Monjalon
@David Marchand
>
> IMO, we should keep our structures lean in release builds. This means that fields used for detailed profiling, advanced debugging, cookie validation, etc. should use the #ifdef pattern rather than the rte_lib_has_some_feature() pattern; especially if they affect the size of a structure. And when those fields are not present, any code accessing them cannot use the rte_lib_has_some_feature() pattern.
> The mbuf and mempool libraries also use #ifdef pattern for similar features.
Yes, for the structure fields we can use #ifdef. But inside the
function we can use the rte_lib_has_some_feature() scheme. Reasons are:
1) It removes the #ifdef clutter.
2) It detects compilation issues even if the feature is disabled. This
reduces the number of build options needed to ensure build sanity.
3) The compiler is smart enough to drop the block if the
feature is not enabled (just like #ifdef).
>
> >
> > > alignas(RTE_CACHE_LINE_MIN_SIZE) struct rte_node
> > *nodes[]; /**< Next nodes. */
> > > };
> > > };
> > >
^ permalink raw reply
* RE: [PATCH v5] graph: add optional profiling stats
From: Morten Brørup @ 2026-06-23 6:45 UTC (permalink / raw)
To: Jerin Jacob
Cc: dev, Jerin Jacob, Kiran Kumar K, Nithin Dabilpuram, Zhirun Yan
In-Reply-To: <CALBAE1OCbZ9GB1oGYgR5hRtzjRmev2ZFOaSNndf02AZptihOcw@mail.gmail.com>
> From: Jerin Jacob [mailto:jerinjacobk@gmail.com]
> Sent: Tuesday, 23 June 2026 07.13
>
> On Mon, Jun 22, 2026 at 12:11 AM Morten Brørup
> <mb@smartsharesystems.com> wrote:
> >
> > Added graph node profiling stats, build time configurable by enabling
> > RTE_GRAPH_PROFILE in rte_config.h.
> >
> > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
>
> Please update app/test/test_graph.c to validate this feature.
Ack.
> > @@ -92,7 +92,62 @@ rte_graph_obj_dump(FILE *f, struct rte_graph *g,
> bool all)
> > fprintf(f, " total_sched_fail=%" PRId64
> "\n",
> > n->dispatch.total_sched_fail);
> > }
> > - fprintf(f, " total_calls=%" PRId64 "\n", n-
> >total_calls);
> > + fprintf(f, " total_calls=%" PRIu64 "\n", n-
> >total_calls);
> > + if (rte_graph_has_stats_feature()) {
> > + fprintf(f, " total_cycles=%" PRIu64 ",
> avg cycles/call=%.1f\n",
> > + n->total_cycles,
> > + n->total_calls == 0 ?
> (double)0 :
> > + (double)n->total_cycles /
> (double)n->total_calls);
> > + }
> > +#ifdef RTE_GRAPH_PROFILE
>
>
> Please introduce rte_graph_has_profile_feature() similar to
> rte_graph_has_stats_feature() to reduce #ifdef clutter as much as possible.
Disagree, see below.
>
> > + uint64_t calls = n->usage_stats[0].calls;
> > + fprintf(f, " objs[0]\n");
> > + fprintf(f, " calls=%" PRIu64 ", cycles=%"
> PRIu64 ", avg cycles/call=%.1f\n",
> > + calls,
>
> >
> > diff --git a/lib/graph/rte_graph_worker_common.h
> b/lib/graph/rte_graph_worker_common.h
> > index 4ab53a533e..0d8039575d 100644
> > --- a/lib/graph/rte_graph_worker_common.h
> > +++ b/lib/graph/rte_graph_worker_common.h
> > @@ -144,12 +144,26 @@ struct __rte_cache_aligned rte_node {
> > rte_node_process_t process; /**< Process
> function. */
> > uint64_t process_u64;
> > };
> > + /** Fast path area cache line 3. */
> > +#ifdef RTE_GRAPH_PROFILE
> > + struct {
> > + uint64_t calls; /**< Calls processing
> resp. 0 or 1 objects. */
> > + uint64_t cycles; /**< Cycles spent
> processing resp. 0 or 1 objects. */
> > + } usage_stats[2]; /**< Usage when this node
> processed 0 or 1 objects. */
> > + uint64_t full_burst_calls; /**< Calls processing a
> full burst of objects. */
> > + uint64_t full_burst_cycles; /**< Cycles spent
> processing a full burst of objects. */
> > + uint64_t half_burst_calls; /**< Calls processing a
> half burst of objects. */
> > + uint64_t half_burst_cycles; /**< Cycles spent
> processing a half burst of objects. */
> > + /** Fast path area cache line 4. */
> > +#endif
>
> Is it an ABI breakage?
No. The modifications are enclosed in #ifdef, and disabled by default.
It is generally required that when rte_config.h options are modified, both the application and DPDK itself are built together; and then API/ABI breakage becomes irrelevant.
IMO, we should keep our structures lean in release builds. This means that fields used for detailed profiling, advanced debugging, cookie validation, etc. should use the #ifdef pattern rather than the rte_lib_has_some_feature() pattern; especially if they affect the size of a structure. And when those fields are not present, any code accessing them cannot use the rte_lib_has_some_feature() pattern.
The mbuf and mempool libraries also use #ifdef pattern for similar features.
>
> > alignas(RTE_CACHE_LINE_MIN_SIZE) struct rte_node
> *nodes[]; /**< Next nodes. */
> > };
> > };
> >
^ permalink raw reply
* [PATCH v7 4/4] net/zxdh: optimize Tx xmit pkts performance
From: Junlong Wang @ 2026-06-23 6:09 UTC (permalink / raw)
To: stephen; +Cc: dev, Junlong Wang
In-Reply-To: <20260623060909.97023-1-wang.junlong1@zte.com.cn>
[-- Attachment #1.1.1: Type: text/plain, Size: 21243 bytes --]
Add simple Tx xmit functions (zxdh_xmit_pkts_simple)
for single-segment packet xmit.
Signed-off-by: Junlong Wang <wang.junlong1@zte.com.cn>
---
drivers/net/zxdh/zxdh_ethdev.c | 15 +-
drivers/net/zxdh/zxdh_queue.h | 2 +-
drivers/net/zxdh/zxdh_rxtx.c | 389 ++++++++++++++++++++++++++-------
drivers/net/zxdh/zxdh_rxtx.h | 13 +-
4 files changed, 323 insertions(+), 96 deletions(-)
diff --git a/drivers/net/zxdh/zxdh_ethdev.c b/drivers/net/zxdh/zxdh_ethdev.c
index fe76139f3d..cf0395aee8 100644
--- a/drivers/net/zxdh/zxdh_ethdev.c
+++ b/drivers/net/zxdh/zxdh_ethdev.c
@@ -490,7 +490,7 @@ zxdh_dev_free_mbufs(struct rte_eth_dev *dev)
if (!vq)
continue;
while ((buf = zxdh_queue_detach_unused(vq)) != NULL)
- rte_pktmbuf_free(buf);
+ rte_pktmbuf_free_seg(buf);
PMD_DRV_LOG(DEBUG, "freeing %s[%d] used and unused buf",
"rxq", i * 2);
}
@@ -499,7 +499,7 @@ zxdh_dev_free_mbufs(struct rte_eth_dev *dev)
if (!vq)
continue;
while ((buf = zxdh_queue_detach_unused(vq)) != NULL)
- rte_pktmbuf_free(buf);
+ rte_pktmbuf_free_seg(buf);
PMD_DRV_LOG(DEBUG, "freeing %s[%d] used and unused buf",
"txq", i * 2 + 1);
}
@@ -1291,10 +1291,17 @@ static int zxdh_scattered_rx(struct rte_eth_dev *eth_dev)
static int32_t
zxdh_set_rxtx_funcs(struct rte_eth_dev *eth_dev)
{
- eth_dev->tx_pkt_prepare = zxdh_xmit_pkts_prepare;
+ uint64_t tx_offloads = eth_dev->data->dev_conf.txmode.offloads;
+
eth_dev->data->scattered_rx = zxdh_scattered_rx(eth_dev);
- eth_dev->tx_pkt_burst = &zxdh_xmit_pkts_packed;
+ if (!(tx_offloads & RTE_ETH_TX_OFFLOAD_MULTI_SEGS)) {
+ eth_dev->tx_pkt_prepare = zxdh_xmit_pkts_simple_prepare;
+ eth_dev->tx_pkt_burst = &zxdh_xmit_pkts_simple;
+ } else {
+ eth_dev->tx_pkt_prepare = zxdh_xmit_pkts_prepare;
+ eth_dev->tx_pkt_burst = &zxdh_xmit_pkts_packed;
+ }
if (eth_dev->data->scattered_rx)
eth_dev->rx_pkt_burst = &zxdh_recv_pkts_packed;
diff --git a/drivers/net/zxdh/zxdh_queue.h b/drivers/net/zxdh/zxdh_queue.h
index b079272162..091d1f25db 100644
--- a/drivers/net/zxdh/zxdh_queue.h
+++ b/drivers/net/zxdh/zxdh_queue.h
@@ -374,7 +374,7 @@ zxdh_queue_full(const struct zxdh_virtqueue *vq)
}
static inline void
-zxdh_queue_store_flags_packed(struct zxdh_vring_packed_desc *dp, uint16_t flags)
+zxdh_queue_store_flags_packed(volatile struct zxdh_vring_packed_desc *dp, uint16_t flags)
{
rte_io_wmb();
dp->flags = flags;
diff --git a/drivers/net/zxdh/zxdh_rxtx.c b/drivers/net/zxdh/zxdh_rxtx.c
index ab0510a753..ffacb5d94f 100644
--- a/drivers/net/zxdh/zxdh_rxtx.c
+++ b/drivers/net/zxdh/zxdh_rxtx.c
@@ -114,6 +114,22 @@
RTE_MBUF_F_TX_SEC_OFFLOAD | \
RTE_MBUF_F_TX_UDP_SEG)
+#if RTE_CACHE_LINE_SIZE == 128
+#define NEXT_CACHELINE_OFF_16B 8
+#define NEXT_CACHELINE_OFF_8B 16
+#elif RTE_CACHE_LINE_SIZE == 64
+#define NEXT_CACHELINE_OFF_16B 4
+#define NEXT_CACHELINE_OFF_8B 8
+#else
+#define NEXT_CACHELINE_OFF_16B (RTE_CACHE_LINE_SIZE / 16)
+#define NEXT_CACHELINE_OFF_8B (RTE_CACHE_LINE_SIZE / 8)
+#endif
+#define N_PER_LOOP NEXT_CACHELINE_OFF_8B
+#define N_PER_LOOP_MASK (N_PER_LOOP - 1)
+
+#define rxq_get_vq(q) ((q)->vq)
+#define txq_get_vq(q) ((q)->vq)
+
uint32_t zxdh_outer_l2_type[16] = {
0,
RTE_PTYPE_L2_ETHER,
@@ -201,43 +217,6 @@ uint32_t zxdh_inner_l4_type[16] = {
0,
};
-static void
-zxdh_xmit_cleanup_inorder_packed(struct zxdh_virtqueue *vq, int32_t num)
-{
- uint16_t used_idx = 0;
- uint16_t id = 0;
- uint16_t curr_id = 0;
- uint16_t free_cnt = 0;
- uint16_t size = vq->vq_nentries;
- struct zxdh_vring_packed_desc *desc = vq->vq_packed.ring.desc;
- struct zxdh_vq_desc_extra *dxp = NULL;
-
- used_idx = vq->vq_used_cons_idx;
- /* desc_is_used has a load-acquire or rte_io_rmb inside
- * and wait for used desc in virtqueue.
- */
- while (num > 0 && desc_is_used(&desc[used_idx], vq)) {
- id = desc[used_idx].id;
- do {
- curr_id = used_idx;
- dxp = &vq->vq_descx[used_idx];
- used_idx += dxp->ndescs;
- free_cnt += dxp->ndescs;
- num -= dxp->ndescs;
- if (used_idx >= size) {
- used_idx -= size;
- vq->used_wrap_counter ^= 1;
- }
- if (dxp->cookie != NULL) {
- rte_pktmbuf_free(dxp->cookie);
- dxp->cookie = NULL;
- }
- } while (curr_id != id);
- }
- vq->vq_used_cons_idx = used_idx;
- vq->vq_free_cnt += free_cnt;
-}
-
static inline uint16_t
zxdh_get_mtu(struct zxdh_virtqueue *vq)
{
@@ -334,7 +313,7 @@ zxdh_xmit_fill_net_hdr(struct zxdh_virtqueue *vq, struct rte_mbuf *cookie,
}
static inline void
-zxdh_enqueue_xmit_packed_fast(struct zxdh_virtnet_tx *txvq,
+zxdh_xmit_enqueue_push(struct zxdh_virtnet_tx *txvq,
struct rte_mbuf *cookie)
{
struct zxdh_virtqueue *vq = txvq->vq;
@@ -345,7 +324,6 @@ zxdh_enqueue_xmit_packed_fast(struct zxdh_virtnet_tx *txvq,
uint8_t hdr_len = vq->hw->dl_net_hdr_len;
struct zxdh_vring_packed_desc *dp = &vq->vq_packed.ring.desc[id];
- dxp->ndescs = 1;
dxp->cookie = cookie;
hdr = rte_pktmbuf_mtod_offset(cookie, struct zxdh_net_hdr_dl *, -hdr_len);
zxdh_xmit_fill_net_hdr(vq, cookie, hdr);
@@ -362,52 +340,57 @@ zxdh_enqueue_xmit_packed_fast(struct zxdh_virtnet_tx *txvq,
}
static inline void
-zxdh_enqueue_xmit_packed(struct zxdh_virtnet_tx *txvq,
+zxdh_xmit_enqueue_append(struct zxdh_virtnet_tx *txvq,
struct rte_mbuf *cookie,
uint16_t needed)
{
struct zxdh_tx_region *txr = txvq->zxdh_net_hdr_mz->addr;
struct zxdh_virtqueue *vq = txvq->vq;
- uint16_t id = vq->vq_avail_idx;
- struct zxdh_vq_desc_extra *dxp = &vq->vq_descx[id];
+ struct zxdh_vq_desc_extra *dep = &vq->vq_descx[0];
uint16_t head_idx = vq->vq_avail_idx;
uint16_t idx = head_idx;
struct zxdh_vring_packed_desc *start_dp = vq->vq_packed.ring.desc;
struct zxdh_vring_packed_desc *head_dp = &vq->vq_packed.ring.desc[idx];
struct zxdh_net_hdr_dl *hdr = NULL;
-
- uint16_t head_flags = cookie->next ? ZXDH_VRING_DESC_F_NEXT : 0;
+ uint16_t id = vq->vq_avail_idx;
+ struct zxdh_vq_desc_extra *dxp = &vq->vq_descx[id];
uint8_t hdr_len = vq->hw->dl_net_hdr_len;
+ uint16_t head_flags = 0;
- dxp->ndescs = needed;
- dxp->cookie = cookie;
- head_flags |= vq->cached_flags;
+ /*
+ * IMPORTANT: For multi-seg packets, we set the head descriptor's cookie to NULL
+ * and store each segment's mbuf in its corresponding vq_descx[idx].cookie.
+ * This is required for the per-descriptor mbuf free in zxdh_xmit_fast_flush()
+ * which uses rte_pktmbuf_free_seg() to free individual segments.
+ * Any code path that attempts to read vq_descx[head_id].cookie will see NULL
+ * and must handle this case appropriately.
+ */
+ dxp->cookie = NULL;
+ /* setup first tx ring slot to point to header stored in reserved region. */
start_dp[idx].addr = txvq->zxdh_net_hdr_mem + RTE_PTR_DIFF(&txr[idx].tx_hdr, txr);
start_dp[idx].len = hdr_len;
- head_flags |= ZXDH_VRING_DESC_F_NEXT;
+ start_dp[idx].id = idx;
+ head_flags |= vq->cached_flags | ZXDH_VRING_DESC_F_NEXT;
hdr = (void *)&txr[idx].tx_hdr;
- rte_prefetch1(hdr);
+ zxdh_xmit_fill_net_hdr(vq, cookie, hdr);
+
idx++;
if (idx >= vq->vq_nentries) {
idx -= vq->vq_nentries;
vq->cached_flags ^= ZXDH_VRING_PACKED_DESC_F_AVAIL_USED;
}
- zxdh_xmit_fill_net_hdr(vq, cookie, hdr);
-
do {
start_dp[idx].addr = rte_pktmbuf_iova(cookie);
start_dp[idx].len = cookie->data_len;
- start_dp[idx].id = id;
- if (likely(idx != head_idx)) {
- uint16_t flags = cookie->next ? ZXDH_VRING_DESC_F_NEXT : 0;
-
- flags |= vq->cached_flags;
- start_dp[idx].flags = flags;
- }
+ start_dp[idx].id = idx;
+ dep[idx].cookie = cookie;
+ uint16_t flags = cookie->next ? ZXDH_VRING_DESC_F_NEXT : 0;
+ flags |= vq->cached_flags;
+ start_dp[idx].flags = flags;
idx++;
if (idx >= vq->vq_nentries) {
idx -= vq->vq_nentries;
@@ -417,7 +400,6 @@ zxdh_enqueue_xmit_packed(struct zxdh_virtnet_tx *txvq,
vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt - needed);
vq->vq_avail_idx = idx;
-
zxdh_queue_store_flags_packed(head_dp, head_flags);
}
@@ -456,7 +438,7 @@ zxdh_update_packet_stats(struct zxdh_virtnet_stats *stats, struct rte_mbuf *mbuf
}
static void
-zxdh_xmit_flush(struct zxdh_virtqueue *vq)
+zxdh_xmit_fast_flush(struct zxdh_virtqueue *vq)
{
uint16_t id = 0;
uint16_t curr_id = 0;
@@ -472,20 +454,22 @@ zxdh_xmit_flush(struct zxdh_virtqueue *vq)
* for a used descriptor in the virtqueue.
*/
while (desc_is_used(&desc[used_idx], vq)) {
+ rte_prefetch0(&desc[used_idx + NEXT_CACHELINE_OFF_16B]);
id = desc[used_idx].id;
do {
+ desc[used_idx].id = used_idx;
curr_id = used_idx;
dxp = &vq->vq_descx[used_idx];
- used_idx += dxp->ndescs;
- free_cnt += dxp->ndescs;
- if (used_idx >= size) {
- used_idx -= size;
- vq->used_wrap_counter ^= 1;
- }
if (dxp->cookie != NULL) {
- rte_pktmbuf_free(dxp->cookie);
+ rte_pktmbuf_free_seg(dxp->cookie);
dxp->cookie = NULL;
}
+ used_idx += 1;
+ free_cnt += 1;
+ if (unlikely(used_idx == size)) {
+ used_idx = 0;
+ vq->used_wrap_counter ^= 1;
+ }
} while (curr_id != id);
}
vq->vq_used_cons_idx = used_idx;
@@ -499,13 +483,12 @@ zxdh_xmit_pkts_packed(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkt
struct zxdh_virtqueue *vq = txvq->vq;
uint16_t nb_tx = 0;
- zxdh_xmit_flush(vq);
+ zxdh_xmit_fast_flush(vq);
for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
struct rte_mbuf *txm = tx_pkts[nb_tx];
int32_t can_push = 0;
int32_t slots = 0;
- int32_t need = 0;
rte_prefetch0(txm);
/* optimize ring usage */
@@ -522,26 +505,15 @@ zxdh_xmit_pkts_packed(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkt
* default => number of segments + 1
**/
slots = txm->nb_segs + !can_push;
- need = slots - vq->vq_free_cnt;
/* Positive value indicates it need free vring descriptors */
- if (unlikely(need > 0)) {
- zxdh_xmit_cleanup_inorder_packed(vq, need);
- need = slots - vq->vq_free_cnt;
- if (unlikely(need > 0)) {
- PMD_TX_LOG(ERR,
- " No enough %d free tx descriptors to transmit."
- "freecnt %d",
- need,
- vq->vq_free_cnt);
- break;
- }
- }
+ if (unlikely(slots > vq->vq_free_cnt))
+ break;
/* Enqueue Packet buffers */
if (can_push)
- zxdh_enqueue_xmit_packed_fast(txvq, txm);
+ zxdh_xmit_enqueue_push(txvq, txm);
else
- zxdh_enqueue_xmit_packed(txvq, txm, slots);
+ zxdh_xmit_enqueue_append(txvq, txm, slots);
zxdh_update_packet_stats(&txvq->stats, txm);
}
txvq->stats.packets += nb_tx;
@@ -602,6 +574,48 @@ uint16_t zxdh_xmit_pkts_prepare(void *tx_queue, struct rte_mbuf **tx_pkts,
return nb_tx;
}
+uint16_t zxdh_xmit_pkts_simple_prepare(void *tx_queue, struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts)
+{
+ struct zxdh_virtnet_tx *txvq = tx_queue;
+ struct zxdh_hw *hw = txvq->vq->hw;
+ uint16_t nb_tx;
+
+ for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
+ struct rte_mbuf *m = tx_pkts[nb_tx];
+ int32_t error;
+
+#ifdef RTE_LIBRTE_ETHDEV_DEBUG
+ error = rte_validate_tx_offload(m);
+ if (unlikely(error)) {
+ rte_errno = -error;
+ break;
+ }
+#endif
+
+ error = rte_net_intel_cksum_prepare(m);
+ if (unlikely(error)) {
+ rte_errno = -error;
+ break;
+ }
+ if (m->data_off < ZXDH_DL_NET_HDR_SIZE) {
+ PMD_TX_LOG(ERR, "HEADROOM too small for zxdh Tx downlink header");
+ txvq->stats.invalid_hdr_len_err += nb_pkts - nb_tx;
+ rte_errno = ENOMEM;
+ break;
+ }
+
+ error = dl_net_hdr_check(m, hw);
+ if (unlikely(error)) {
+ rte_errno = ENOTSUP;
+ txvq->stats.errors += nb_pkts - nb_tx;
+ txvq->stats.offload_cfg_err += nb_pkts - nb_tx;
+ break;
+ }
+ }
+ return nb_tx;
+}
+
static uint16_t
zxdh_dequeue_burst_rx_packed(struct zxdh_virtqueue *vq,
struct rte_mbuf **rx_pkts,
@@ -1070,7 +1084,6 @@ uint16_t zxdh_recv_single_pkts(void *rx_queue, struct rte_mbuf **rcv_pkts, uint1
if (unlikely(zxdh_init_mbuf(rxm, len, hw, &vq->rxq) < 0))
continue;
- rcv_pkts[nb_rx] = rxm;
zxdh_update_packet_stats(&rxvq->stats, rxm);
nb_rx++;
}
@@ -1084,3 +1097,209 @@ uint16_t zxdh_recv_single_pkts(void *rx_queue, struct rte_mbuf **rcv_pkts, uint1
}
return nb_rx;
}
+
+static inline void pkt_padding(struct rte_mbuf *cookie, struct zxdh_hw *hw)
+{
+ uint16_t mtu_or_mss = 0;
+ uint16_t pkt_flag_lw16 = ZXDH_NO_IPID_UPDATE;
+ uint16_t l3_offset;
+ uint8_t pcode = ZXDH_PCODE_NO_IP_PKT_TYPE;
+ uint8_t l3_ptype = ZXDH_PI_L3TYPE_NOIP;
+ struct zxdh_pi_hdr *pi_hdr;
+ struct zxdh_pd_hdr_dl *pd_hdr;
+ struct zxdh_net_hdr_dl *net_hdr_dl = hw->net_hdr_dl;
+ uint8_t hdr_len = hw->dl_net_hdr_len;
+ uint16_t ol_flag = 0;
+ struct zxdh_net_hdr_dl *hdr;
+
+ hdr = rte_pktmbuf_mtod_offset(cookie, struct zxdh_net_hdr_dl *, -hdr_len);
+ rte_memcpy(hdr, net_hdr_dl, hdr_len);
+
+ /* Update mbuf to reflect the prepended header */
+ cookie->data_off -= hdr_len;
+ cookie->data_len += hdr_len;
+ cookie->pkt_len += hdr_len;
+
+ if (hw->has_tx_offload) {
+ pi_hdr = &hdr->pipd_hdr_dl.pi_hdr;
+ pd_hdr = &hdr->pipd_hdr_dl.pd_hdr;
+
+ pcode = ZXDH_PCODE_IP_PKT_TYPE;
+ if (cookie->ol_flags & RTE_MBUF_F_TX_IPV6)
+ l3_ptype = ZXDH_PI_L3TYPE_IPV6;
+ else if (cookie->ol_flags & RTE_MBUF_F_TX_IPV4)
+ l3_ptype = ZXDH_PI_L3TYPE_IP;
+ else
+ pcode = ZXDH_PCODE_NO_IP_PKT_TYPE;
+
+ if (cookie->ol_flags & RTE_MBUF_F_TX_TCP_SEG) {
+ mtu_or_mss = (cookie->tso_segsz >= ZXDH_MIN_MSS) ?
+ cookie->tso_segsz : ZXDH_MIN_MSS;
+ pi_hdr->pkt_flag_hi8 |= ZXDH_TX_TCPUDP_CKSUM_CAL;
+ pkt_flag_lw16 |= ZXDH_NO_IP_FRAGMENT | ZXDH_TX_IP_CKSUM_CAL;
+ pcode = ZXDH_PCODE_TCP_PKT_TYPE;
+ } else if (cookie->ol_flags & RTE_MBUF_F_TX_UDP_SEG) {
+ mtu_or_mss = hw->eth_dev->data->mtu;
+ mtu_or_mss = (mtu_or_mss >= ZXDH_MIN_MSS) ? mtu_or_mss : ZXDH_MIN_MSS;
+ pkt_flag_lw16 |= ZXDH_TX_IP_CKSUM_CAL;
+ pi_hdr->pkt_flag_hi8 |= ZXDH_NO_TCP_FRAGMENT | ZXDH_TX_TCPUDP_CKSUM_CAL;
+ pcode = ZXDH_PCODE_UDP_PKT_TYPE;
+ } else {
+ pkt_flag_lw16 |= ZXDH_NO_IP_FRAGMENT;
+ pi_hdr->pkt_flag_hi8 |= ZXDH_NO_TCP_FRAGMENT;
+ }
+
+ if (cookie->ol_flags & RTE_MBUF_F_TX_IP_CKSUM)
+ pkt_flag_lw16 |= ZXDH_TX_IP_CKSUM_CAL;
+
+ if ((cookie->ol_flags & RTE_MBUF_F_TX_UDP_CKSUM) == RTE_MBUF_F_TX_UDP_CKSUM) {
+ pcode = ZXDH_PCODE_UDP_PKT_TYPE;
+ pi_hdr->pkt_flag_hi8 |= ZXDH_TX_TCPUDP_CKSUM_CAL;
+ } else if ((cookie->ol_flags & RTE_MBUF_F_TX_TCP_CKSUM) ==
+ RTE_MBUF_F_TX_TCP_CKSUM) {
+ pcode = ZXDH_PCODE_TCP_PKT_TYPE;
+ pi_hdr->pkt_flag_hi8 |= ZXDH_TX_TCPUDP_CKSUM_CAL;
+ }
+ pkt_flag_lw16 |= (mtu_or_mss >> ZXDH_MTU_MSS_UNIT_SHIFTBIT) & ZXDH_MTU_MSS_MASK;
+ pi_hdr->pkt_flag_lw16 = rte_be_to_cpu_16(pkt_flag_lw16);
+ pi_hdr->pkt_type = l3_ptype | ZXDH_PKT_FORM_CPU | pcode;
+
+ l3_offset = hdr_len + cookie->l2_len;
+ l3_offset += (cookie->ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK) ?
+ cookie->outer_l2_len + cookie->outer_l3_len : 0;
+ pi_hdr->l3_offset = rte_be_to_cpu_16(l3_offset);
+ pi_hdr->l4_offset = rte_be_to_cpu_16(l3_offset + cookie->l3_len);
+ if (cookie->ol_flags & RTE_MBUF_F_TX_OUTER_IP_CKSUM)
+ ol_flag |= ZXDH_PD_OFFLOAD_OUTER_IPCSUM;
+ } else {
+ pd_hdr = &hdr->pd_hdr;
+ }
+
+ pd_hdr->dst_vfid = rte_be_to_cpu_16(cookie->port);
+
+ if (cookie->ol_flags & (RTE_MBUF_F_TX_VLAN | RTE_MBUF_F_TX_QINQ)) {
+ ol_flag |= ZXDH_PD_OFFLOAD_CVLAN_INSERT;
+ pd_hdr->cvlan_insert = rte_be_to_cpu_16(cookie->vlan_tci);
+ if (cookie->ol_flags & RTE_MBUF_F_TX_QINQ) {
+ ol_flag |= ZXDH_PD_OFFLOAD_SVLAN_INSERT;
+ pd_hdr->svlan_insert = rte_be_to_cpu_16(cookie->vlan_tci_outer);
+ }
+ }
+
+ pd_hdr->ol_flag = rte_be_to_cpu_16(ol_flag);
+}
+
+/*
+ * Populate N_PER_LOOP descriptors with data from N_PER_LOOP single-segment mbufs.
+ * Note: The simple transmit path (zxdh_xmit_pkts_simple) is selected only when
+ * RTE_ETH_TX_OFFLOAD_MULTI_SEGS is disabled, so all packets handled here are
+ * guaranteed to be single-segment.
+ */
+static inline void
+tx_bunch(struct zxdh_virtqueue *vq, volatile struct zxdh_vring_packed_desc *txdp,
+ struct rte_mbuf **pkts, uint16_t start_id)
+{
+ uint16_t flags = vq->cached_flags;
+ int i;
+ for (i = 0; i < N_PER_LOOP; ++i, ++txdp, ++pkts) {
+ /* write data to descriptor */
+ txdp->addr = rte_mbuf_data_iova(*pkts);
+ txdp->len = (*pkts)->data_len;
+ txdp->id = start_id + i;
+ txdp->flags = flags;
+ }
+}
+
+/* Populate 1 descriptor with data from 1 single-segment mbuf */
+static inline void
+tx1(struct zxdh_virtqueue *vq, volatile struct zxdh_vring_packed_desc *txdp,
+ struct rte_mbuf *pkts, uint16_t id)
+{
+ uint16_t flags = vq->cached_flags;
+ txdp->addr = rte_mbuf_data_iova(pkts);
+ txdp->len = pkts->data_len;
+ txdp->id = id;
+ zxdh_queue_store_flags_packed(txdp, flags);
+}
+
+static void submit_to_backend_simple(struct zxdh_virtqueue *vq,
+ struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+ struct zxdh_hw *hw = vq->hw;
+ struct rte_mbuf *m = NULL;
+ uint16_t id = vq->vq_avail_idx;
+ struct zxdh_vring_packed_desc *txdp = &vq->vq_packed.ring.desc[id];
+ struct zxdh_vq_desc_extra *dxp = &vq->vq_descx[id];
+ int mainpart, leftover;
+ int i, j;
+
+ /*
+ * Process most of the packets in chunks of N pkts. Any
+ * leftover packets will get processed one at a time.
+ */
+ mainpart = (nb_pkts & ~N_PER_LOOP_MASK);
+ leftover = (nb_pkts & N_PER_LOOP_MASK);
+
+ for (i = 0; i < mainpart; i += N_PER_LOOP) {
+ rte_prefetch0(dxp + i);
+ rte_prefetch0(tx_pkts + i);
+ for (j = 0; j < N_PER_LOOP; ++j) {
+ m = *(tx_pkts + i + j);
+ pkt_padding(m, hw);
+ (dxp + i + j)->cookie = (void *)m;
+ zxdh_update_packet_stats(&vq->txq.stats, m);
+ }
+ /* write data to descriptor */
+ tx_bunch(vq, txdp + i, tx_pkts + i, id + i);
+ }
+
+ if (leftover > 0) {
+ rte_prefetch0(dxp + mainpart);
+ rte_prefetch0(tx_pkts + mainpart);
+
+ for (i = 0; i < leftover; ++i) {
+ m = *(tx_pkts + mainpart + i);
+ pkt_padding(m, hw);
+ (dxp + mainpart + i)->cookie = m;
+ tx1(vq, txdp + mainpart + i, *(tx_pkts + mainpart + i), id + mainpart + i);
+ zxdh_update_packet_stats(&vq->txq.stats, m);
+ }
+ }
+}
+
+uint16_t zxdh_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+ struct zxdh_virtnet_tx *txvq = tx_queue;
+ struct zxdh_virtqueue *vq = txq_get_vq(txvq);
+ uint16_t nb_tx = 0, nb_tx_left;
+
+ zxdh_xmit_fast_flush(vq);
+
+ nb_pkts = (uint16_t)RTE_MIN(nb_pkts, vq->vq_free_cnt);
+ if (unlikely(nb_pkts == 0)) {
+ txvq->stats.idle++;
+ return 0;
+ }
+
+ nb_tx_left = nb_pkts;
+ if ((vq->vq_avail_idx + nb_pkts) >= vq->vq_nentries) {
+ nb_tx = vq->vq_nentries - vq->vq_avail_idx;
+ nb_tx_left = nb_pkts - nb_tx;
+ submit_to_backend_simple(vq, tx_pkts, nb_tx);
+ vq->vq_avail_idx = 0;
+ vq->cached_flags ^= ZXDH_VRING_PACKED_DESC_F_AVAIL_USED;
+
+ vq->vq_free_cnt -= nb_tx;
+ tx_pkts += nb_tx;
+ }
+ if (nb_tx_left) {
+ submit_to_backend_simple(vq, tx_pkts, nb_tx_left);
+ vq->vq_avail_idx += nb_tx_left;
+ vq->vq_free_cnt -= nb_tx_left;
+ }
+
+ zxdh_queue_notify(vq);
+ txvq->stats.packets += nb_pkts;
+
+ return nb_pkts;
+}
diff --git a/drivers/net/zxdh/zxdh_rxtx.h b/drivers/net/zxdh/zxdh_rxtx.h
index dba9567414..627e8b05c3 100644
--- a/drivers/net/zxdh/zxdh_rxtx.h
+++ b/drivers/net/zxdh/zxdh_rxtx.h
@@ -56,18 +56,19 @@ struct __rte_cache_aligned zxdh_virtnet_rx {
struct __rte_cache_aligned zxdh_virtnet_tx {
struct zxdh_virtqueue *vq;
-
- rte_iova_t zxdh_net_hdr_mem; /* hdr for each xmit packet */
- uint16_t queue_id; /* DPDK queue index. */
- uint16_t port_id; /* Device port identifier. */
+ const struct rte_memzone *zxdh_net_hdr_mz; /* memzone to populate hdr. */
+ rte_iova_t zxdh_net_hdr_mem; /* hdr for each xmit packet */
struct zxdh_virtnet_stats stats;
const struct rte_memzone *mz; /* mem zone to populate TX ring. */
- const struct rte_memzone *zxdh_net_hdr_mz; /* memzone to populate hdr. */
+ uint64_t offloads;
+ uint16_t queue_id; /* DPDK queue index. */
+ uint16_t port_id; /* Device port identifier. */
};
uint16_t zxdh_xmit_pkts_packed(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts);
uint16_t zxdh_xmit_pkts_prepare(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts);
uint16_t zxdh_recv_pkts_packed(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts);
uint16_t zxdh_recv_single_pkts(void *rx_queue, struct rte_mbuf **rcv_pkts, uint16_t nb_pkts);
-
+uint16_t zxdh_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts);
+uint16_t zxdh_xmit_pkts_simple_prepare(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts);
#endif /* ZXDH_RXTX_H */
--
2.27.0
[-- Attachment #1.1.2: Type: text/html , Size: 53255 bytes --]
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox