* [PATCH 6/9] bus/fslmc/dpio: tune DQRI interrupt coalescing holdoff
From: Maxime Leroy @ 2026-06-11 15:49 UTC (permalink / raw)
To: hemant.agrawal, sachin.saxena; +Cc: dev, Maxime Leroy
In-Reply-To: <20260611154926.392670-1-maxime@leroys.fr>
The portal DQRI interrupt used a fixed threshold of 3 and a raw 0xFF
timeout. Parameterize dpaa2_dpio_intr_init() with (threshold, timeout) so
each mode supplies its own: the event driver keeps the legacy 3 / 0xFF
and its DPAA2_PORTAL_INTR_THRESHOLD / DPAA2_PORTAL_INTR_TIMEOUT env-var
overrides, while rx-queue interrupts default the threshold to the HW DQRR
ring depth (ring-1, =7 on QBMan >= 4.1) and use a coalescing holdoff in
microseconds, converted to ITP units from the MC-reported QBMan clock
(itp = holdoff_us * clk_MHz / 256, capped at the 12-bit field). The setup
is portal-wide and idempotent, so the first mode to arm a given portal
wins; a portal is normally driven by a single mode.
The net/dpaa2 PMD exposes both rx-queue-interrupt knobs as per-port
devargs: drv_rx_intr_holdoff_us (default 100us) and drv_rx_intr_threshold
(default 0 = ring-1, clamped to [1, ring-1]). Also expose
dpaa2_dpio_intr_deinit() (no longer event-only), and on the intr_init
error paths close the epoll fd and disable the interrupt.
Add qbman_swp_dqrr_size() to expose the ring depth.
Signed-off-by: Maxime Leroy <maxime@leroys.fr>
---
doc/guides/nics/dpaa2.rst | 10 +++
drivers/bus/fslmc/portal/dpaa2_hw_dpio.c | 72 +++++++++++++------
drivers/bus/fslmc/portal/dpaa2_hw_dpio.h | 12 +++-
.../fslmc/qbman/include/fsl_qbman_portal.h | 9 +++
drivers/bus/fslmc/qbman/qbman_portal.c | 6 ++
drivers/net/dpaa2/dpaa2_ethdev.c | 60 +++++++++++++++-
drivers/net/dpaa2/dpaa2_ethdev.h | 7 ++
7 files changed, 151 insertions(+), 25 deletions(-)
diff --git a/doc/guides/nics/dpaa2.rst b/doc/guides/nics/dpaa2.rst
index 2d70bd0ab9..47a52c9287 100644
--- a/doc/guides/nics/dpaa2.rst
+++ b/doc/guides/nics/dpaa2.rst
@@ -492,6 +492,16 @@ for details.
packets, so that user can check what is wrong with those packets.
e.g. ``fslmc:dpni.1,drv_error_queue=1``
+* Use dev arg option ``drv_rx_intr_holdoff_us=<uint32>`` to set the Rx queue
+ interrupt coalescing holdoff in microseconds (default 100). Only applies in
+ Rx queue interrupt mode.
+ e.g. ``fslmc:dpni.1,drv_rx_intr_holdoff_us=50``
+
+* Use dev arg option ``drv_rx_intr_threshold=<uint32>`` to set the Rx queue
+ interrupt coalescing frame threshold; 0 (default) means the DQRR ring depth
+ minus one.
+ e.g. ``fslmc:dpni.1,drv_rx_intr_threshold=4``
+
Enabling logs
-------------
diff --git a/drivers/bus/fslmc/portal/dpaa2_hw_dpio.c b/drivers/bus/fslmc/portal/dpaa2_hw_dpio.c
index e6b4e74b3b..c5525a94fa 100644
--- a/drivers/bus/fslmc/portal/dpaa2_hw_dpio.c
+++ b/drivers/bus/fslmc/portal/dpaa2_hw_dpio.c
@@ -206,12 +206,35 @@ dpaa2_affine_dpio_intr_to_respective_core(int32_t dpio_id, int cpu_id)
}
#endif /* RTE_EVENT_DPAA2 */
+/* holdoff (us) -> QBMan ITP units (256 cycles each), capped at the 12-bit field */
+RTE_EXPORT_INTERNAL_SYMBOL(dpaa2_dpio_holdoff_to_itp)
+int dpaa2_dpio_holdoff_to_itp(struct dpaa2_dpio_dev *dpio_dev, uint32_t holdoff_us)
+{
+ uint32_t qman_mhz = 0;
+ struct dpio_attr attr;
+ uint64_t itp;
+
+ if (dpio_get_attributes(dpio_dev->dpio, CMD_PRI_LOW, dpio_dev->token, &attr) == 0)
+ qman_mhz = attr.clk / 1000000;
+ itp = qman_mhz ? ((uint64_t)holdoff_us * qman_mhz) / 256 : 0xFF;
+ if (itp > 0xfff) /* 12-bit ITP field */
+ itp = 0xfff;
+
+ return (int)itp;
+}
+
+/* threshold: DQRR fill raising DQRI (< ring depth); timeout: holdoff in ITP units.
+ * Per-mode values from the caller (eventdev vs rx-queue intr); no env override.
+ * The DQRI config is portal-wide and this is idempotent: the first caller to
+ * arm a portal wins, a later caller's values are ignored (a portal normally
+ * serves a single mode).
+ */
RTE_EXPORT_INTERNAL_SYMBOL(dpaa2_dpio_intr_init)
-int dpaa2_dpio_intr_init(struct dpaa2_dpio_dev *dpio_dev, bool build_epoll)
+int dpaa2_dpio_intr_init(struct dpaa2_dpio_dev *dpio_dev, int threshold,
+ int timeout, bool build_epoll)
{
- struct epoll_event epoll_ev;
int eventfd, dpio_epoll_fd, ret;
- int threshold = 0x3, timeout = 0xFF;
+ struct epoll_event epoll_ev;
if (dpio_dev->intr_enabled)
return 0;
@@ -222,12 +245,6 @@ int dpaa2_dpio_intr_init(struct dpaa2_dpio_dev *dpio_dev, bool build_epoll)
return -1;
}
- if (getenv("DPAA2_PORTAL_INTR_THRESHOLD"))
- threshold = atoi(getenv("DPAA2_PORTAL_INTR_THRESHOLD"));
-
- if (getenv("DPAA2_PORTAL_INTR_TIMEOUT"))
- sscanf(getenv("DPAA2_PORTAL_INTR_TIMEOUT"), "%x", &timeout);
-
qbman_swp_interrupt_set_trigger(dpio_dev->sw_portal,
QBMAN_SWP_INTERRUPT_DQRI);
qbman_swp_interrupt_clear_status(dpio_dev->sw_portal, 0xffffffff);
@@ -238,9 +255,9 @@ int dpaa2_dpio_intr_init(struct dpaa2_dpio_dev *dpio_dev, bool build_epoll)
dpio_dev->epoll_fd = -1;
/* The event PMD dequeues by sleeping on a private epoll instance owned
- * by the portal, so build it here. A caller that waits on another
- * epoll (the net rx-queue-interrupt path uses the application's) skips
- * this.
+ * by the portal, so build it here. The net rx-queue-interrupt path
+ * exposes the raw eventfd through the generic ethdev API and waits on
+ * the application's own epoll instead, so it skips this.
*/
if (build_epoll) {
dpio_epoll_fd = epoll_create(1);
@@ -269,11 +286,14 @@ int dpaa2_dpio_intr_init(struct dpaa2_dpio_dev *dpio_dev, bool build_epoll)
return 0;
}
-#ifdef RTE_EVENT_DPAA2
-static void dpaa2_dpio_intr_deinit(struct dpaa2_dpio_dev *dpio_dev)
+RTE_EXPORT_INTERNAL_SYMBOL(dpaa2_dpio_intr_deinit)
+void dpaa2_dpio_intr_deinit(struct dpaa2_dpio_dev *dpio_dev)
{
int ret;
+ if (!dpio_dev->intr_enabled)
+ return;
+
ret = rte_dpaa2_intr_disable(dpio_dev->intr_handle, 0);
if (ret)
DPAA2_BUS_ERR("DPIO interrupt disable failed");
@@ -284,7 +304,6 @@ static void dpaa2_dpio_intr_deinit(struct dpaa2_dpio_dev *dpio_dev)
}
dpio_dev->intr_enabled = 0;
}
-#endif
static int
dpaa2_configure_stashing(struct dpaa2_dpio_dev *dpio_dev, int cpu_id)
@@ -306,9 +325,18 @@ dpaa2_configure_stashing(struct dpaa2_dpio_dev *dpio_dev, int cpu_id)
}
#ifdef RTE_EVENT_DPAA2
- if (dpaa2_dpio_intr_init(dpio_dev, true)) {
- DPAA2_BUS_ERR("Interrupt registration failed for dpio");
- return -1;
+ {
+ int threshold = 3, timeout = 0xFF;
+
+ if (getenv("DPAA2_PORTAL_INTR_THRESHOLD"))
+ threshold = atoi(getenv("DPAA2_PORTAL_INTR_THRESHOLD"));
+ if (getenv("DPAA2_PORTAL_INTR_TIMEOUT"))
+ sscanf(getenv("DPAA2_PORTAL_INTR_TIMEOUT"), "%x", &timeout);
+
+ if (dpaa2_dpio_intr_init(dpio_dev, threshold, timeout, true)) {
+ DPAA2_BUS_ERR("Interrupt registration failed for dpio");
+ return -1;
+ }
}
dpaa2_affine_dpio_intr_to_respective_core(dpio_dev->hw_id, cpu_id);
#endif
@@ -319,9 +347,11 @@ dpaa2_configure_stashing(struct dpaa2_dpio_dev *dpio_dev, int cpu_id)
static void dpaa2_put_qbman_swp(struct dpaa2_dpio_dev *dpio_dev)
{
if (dpio_dev) {
-#ifdef RTE_EVENT_DPAA2
+ /* rx-queue interrupts (net PMD) can arm a portal without the
+ * event driver; tear it down unconditionally. Safe when never
+ * armed: intr_deinit returns early if intr is not enabled.
+ */
dpaa2_dpio_intr_deinit(dpio_dev);
-#endif
rte_atomic16_clear(&dpio_dev->ref_count);
}
}
@@ -512,6 +542,8 @@ dpaa2_create_dpio_device(int vdev_fd,
goto err;
}
+ DPAA2_BUS_DEBUG("QBMAN clk = %u Hz (%u MHz)", attr.clk, attr.clk / 1000000);
+
/* find the SoC type for the first time */
if (!dpaa2_svr_family) {
struct mc_soc_version mc_plat_info = {0};
diff --git a/drivers/bus/fslmc/portal/dpaa2_hw_dpio.h b/drivers/bus/fslmc/portal/dpaa2_hw_dpio.h
index 10dd968e5f..090fa14410 100644
--- a/drivers/bus/fslmc/portal/dpaa2_hw_dpio.h
+++ b/drivers/bus/fslmc/portal/dpaa2_hw_dpio.h
@@ -50,9 +50,17 @@ int dpaa2_affine_qbman_swp(void);
__rte_internal
int dpaa2_affine_qbman_ethrx_swp(void);
-/* set up a DPIO portal's DQRI interrupt (rx-queue interrupt mode) */
+/* set up / tear down a DPIO portal's DQRI interrupt (rx-queue interrupt mode) */
__rte_internal
-int dpaa2_dpio_intr_init(struct dpaa2_dpio_dev *dpio_dev, bool build_epoll);
+int dpaa2_dpio_intr_init(struct dpaa2_dpio_dev *dpio_dev, int threshold,
+ int timeout, bool build_epoll);
+
+__rte_internal
+void dpaa2_dpio_intr_deinit(struct dpaa2_dpio_dev *dpio_dev);
+
+/* convert a coalescing holdoff (microseconds) to QBMan ITP units */
+__rte_internal
+int dpaa2_dpio_holdoff_to_itp(struct dpaa2_dpio_dev *dpio_dev, uint32_t holdoff_us);
/* allocate memory for FQ - dq storage */
__rte_internal
diff --git a/drivers/bus/fslmc/qbman/include/fsl_qbman_portal.h b/drivers/bus/fslmc/qbman/include/fsl_qbman_portal.h
index 5375ea386d..842ef6f067 100644
--- a/drivers/bus/fslmc/qbman/include/fsl_qbman_portal.h
+++ b/drivers/bus/fslmc/qbman/include/fsl_qbman_portal.h
@@ -157,6 +157,15 @@ uint32_t qbman_swp_intr_timeout_read_status(struct qbman_swp *p);
*/
void qbman_swp_intr_timeout_write(struct qbman_swp *p, uint32_t mask);
+/**
+ * qbman_swp_dqrr_size() - Get the HW DQRR ring depth of a software portal.
+ * @p: the given software portal object.
+ *
+ * Returns the number of DQRR entries (4 on QBMan < 4.1, 8 on >= 4.1). Useful
+ * as the upper bound for the DQRR interrupt coalescing threshold.
+ */
+uint8_t qbman_swp_dqrr_size(struct qbman_swp *p);
+
/**
* qbman_swp_interrupt_get_trigger() - Get the data in software portal
* interrupt enable register.
diff --git a/drivers/bus/fslmc/qbman/qbman_portal.c b/drivers/bus/fslmc/qbman/qbman_portal.c
index 947415363a..81c2d87e0a 100644
--- a/drivers/bus/fslmc/qbman/qbman_portal.c
+++ b/drivers/bus/fslmc/qbman/qbman_portal.c
@@ -433,6 +433,12 @@ void qbman_swp_intr_timeout_write(struct qbman_swp *p, uint32_t mask)
qbman_cinh_write(&p->sys, QBMAN_CINH_SWP_ITPR, mask);
}
+RTE_EXPORT_INTERNAL_SYMBOL(qbman_swp_dqrr_size)
+uint8_t qbman_swp_dqrr_size(struct qbman_swp *p)
+{
+ return p->dqrr.dqrr_size;
+}
+
uint32_t qbman_swp_interrupt_get_trigger(struct qbman_swp *p)
{
return qbman_cinh_read(&p->sys, QBMAN_CINH_SWP_IER);
diff --git a/drivers/net/dpaa2/dpaa2_ethdev.c b/drivers/net/dpaa2/dpaa2_ethdev.c
index 6407c24755..7ca454eaae 100644
--- a/drivers/net/dpaa2/dpaa2_ethdev.c
+++ b/drivers/net/dpaa2/dpaa2_ethdev.c
@@ -36,6 +36,9 @@
#define DRIVER_ERROR_QUEUE "drv_err_queue"
#define DRIVER_NO_TAILDROP "drv_no_taildrop"
#define DRIVER_NO_DATA_STASHING "drv_no_data_stashing"
+#define DRIVER_RX_INTR_HOLDOFF_US "drv_rx_intr_holdoff_us"
+#define DPAA2_RX_INTR_HOLDOFF_US_DEF 100
+#define DRIVER_RX_INTR_THRESHOLD "drv_rx_intr_threshold"
#define CHECK_INTERVAL 100 /* 100ms */
#define MAX_REPEAT_TIME 90 /* 9s (90 * 100ms) in total */
@@ -3078,7 +3081,7 @@ dpaa2_dev_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t queue_id)
struct dpaa2_dev_priv *priv = dev->data->dev_private;
struct dpaa2_queue *dpaa2_q = priv->rx_vq[queue_id];
struct dpaa2_dpio_dev *dpio, *old;
- int ret;
+ int ret, threshold, timeout, dqrr_max;
if (!dpaa2_q->napi_dpcon)
return -ENOTSUP; /* no channel -> caller keeps polling */
@@ -3087,10 +3090,22 @@ dpaa2_dev_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t queue_id)
return -EIO;
dpio = DPAA2_PER_LCORE_ETHRX_DPIO;
+ /* threshold from drv_rx_intr_threshold (0 = ring-1), holdoff from
+ * drv_rx_intr_holdoff_us. idempotent: no-op if the dpio is already
+ * armed (e.g. event driver)
+ */
+ dqrr_max = qbman_swp_dqrr_size(dpio->sw_portal) - 1;
+ threshold = priv->rx_intr_threshold ? (int)priv->rx_intr_threshold : dqrr_max;
+ if (threshold < 1 || threshold > dqrr_max) {
+ DPAA2_PMD_WARN("drv_rx_intr_threshold %d out of [1, %d], clamping",
+ threshold, dqrr_max);
+ threshold = threshold < 1 ? 1 : dqrr_max;
+ }
+ timeout = dpaa2_dpio_holdoff_to_itp(dpio, priv->rx_intr_holdoff_us);
/* build_epoll=false: the generic ethdev rx-intr API waits on the
* application epoll, not the portal's private one (event PMD only).
*/
- ret = dpaa2_dpio_intr_init(dpio, false); /* VFIO eventfd, no MC */
+ ret = dpaa2_dpio_intr_init(dpio, threshold, timeout, false);
if (ret)
return ret;
@@ -3346,6 +3361,35 @@ dpaa2_get_devargs(struct rte_devargs *devargs, const char *key)
return 1;
}
+static int
+u32_devarg_handler(__rte_unused const char *key, const char *value, void *opaque)
+{
+ char *end;
+ unsigned long v = strtoul(value, &end, 0);
+
+ if (*value == '\0' || *end != '\0' || v > UINT32_MAX)
+ return -1;
+ *(uint32_t *)opaque = (uint32_t)v;
+
+ return 0;
+}
+
+/* Read a u32-valued devarg into *out, leaving *out untouched if absent. */
+static void
+dpaa2_get_devargs_u32(struct rte_devargs *devargs, const char *key, uint32_t *out)
+{
+ struct rte_kvargs *kvlist;
+
+ if (!devargs)
+ return;
+ kvlist = rte_kvargs_parse(devargs->args, NULL);
+ if (!kvlist)
+ return;
+ if (rte_kvargs_count(kvlist, key))
+ rte_kvargs_process(kvlist, key, u32_devarg_handler, out);
+ rte_kvargs_free(kvlist);
+}
+
static int
dpaa2_dev_init(struct rte_eth_dev *eth_dev)
{
@@ -3373,6 +3417,14 @@ dpaa2_dev_init(struct rte_eth_dev *eth_dev)
DPAA2_PMD_INFO("No RX prefetch mode");
}
+ priv->rx_intr_holdoff_us = DPAA2_RX_INTR_HOLDOFF_US_DEF;
+ dpaa2_get_devargs_u32(dev->devargs, DRIVER_RX_INTR_HOLDOFF_US,
+ &priv->rx_intr_holdoff_us);
+
+ priv->rx_intr_threshold = 0;
+ dpaa2_get_devargs_u32(dev->devargs, DRIVER_RX_INTR_THRESHOLD,
+ &priv->rx_intr_threshold);
+
if (dpaa2_get_devargs(dev->devargs, DRIVER_LOOPBACK_MODE)) {
priv->flags |= DPAA2_RX_LOOPBACK_MODE;
DPAA2_PMD_INFO("Rx loopback mode");
@@ -3888,5 +3940,7 @@ RTE_PMD_REGISTER_PARAM_STRING(NET_DPAA2_PMD_DRIVER_NAME,
DRIVER_RX_PARSE_ERR_DROP "=<int>"
DRIVER_ERROR_QUEUE "=<int>"
DRIVER_NO_TAILDROP "=<int>"
- DRIVER_NO_DATA_STASHING "=<int>");
+ DRIVER_NO_DATA_STASHING "=<int> "
+ DRIVER_RX_INTR_HOLDOFF_US "=<uint32> "
+ DRIVER_RX_INTR_THRESHOLD "=<uint32>");
RTE_LOG_REGISTER_DEFAULT(dpaa2_logtype_pmd, NOTICE);
diff --git a/drivers/net/dpaa2/dpaa2_ethdev.h b/drivers/net/dpaa2/dpaa2_ethdev.h
index 65fb48bd27..d8be1f8bce 100644
--- a/drivers/net/dpaa2/dpaa2_ethdev.h
+++ b/drivers/net/dpaa2/dpaa2_ethdev.h
@@ -412,6 +412,13 @@ struct dpaa2_dev_priv {
uint8_t max_cgs;
uint8_t cgid_in_use[MAX_RX_QUEUES];
+ /* DQRI holdoff (us) for rx-queue interrupts (drv_rx_intr_holdoff_us) */
+ uint32_t rx_intr_holdoff_us;
+ /* DQRI threshold for rx-queue interrupts (drv_rx_intr_threshold);
+ * 0 = auto (DQRR ring depth - 1)
+ */
+ uint32_t rx_intr_threshold;
+
/* Current hash distribution size per RX TC, written by
* dpaa2_setup_flow_dist_size() and read by reta_query / reta_update.
* Zero means "use default" (= nb_rx_queues clamped to dist_queues).
--
2.43.0
^ permalink raw reply related
* [PATCH 5/9] net/dpaa2: support Rx queue interrupts
From: Maxime Leroy @ 2026-06-11 15:49 UTC (permalink / raw)
To: hemant.agrawal, sachin.saxena; +Cc: dev, Maxime Leroy
In-Reply-To: <20260611154926.392670-1-maxime@leroys.fr>
Implement .rx_queue_intr_enable / .rx_queue_intr_disable so a worker
can sleep on a queue's data-availability notification instead of
busy-polling, through the generic rte_eth_dev_rx_intr_* API.
A worker wakes on its software portal's DQRI, which fires when the
portal's DQRR holds frames, so the Rx FQ must be scheduled to a channel
that portal dequeues. The natural dpni_set_queue with a notification
destination holds the global MC lock long enough to wedge the firmware
and must target a disabled dpni. But the polling portal is only known
once a worker affines, after dev_start, so the destination cannot be
the worker's portal.
Bind each Rx FQ to its own DPCON channel instead. The default Rx burst
pulls frames from the FQ with a volatile dequeue and cannot be
interrupt-driven; to wake on the DQRI the FQ must be pushed to the
portal's DQRR. dev_start issues the DEST_DPCON set_queue statically on
the still-disabled dpni with no knowledge of the polling lcore; a worker
later subscribes its own ethrx portal to the channel and arms the DQRI
in rx_queue_intr_enable (a one-shot per-portal MC op plus QBMan, never
the wedging set_queue).
This pushed/DQRR consumption is how the event PMD works, but the DPCON
use differs. The event PMD uses one DPCON per worker, concentrates N
FQs onto it, and lets the QBMan scheduler load-balance events across
cores. Here affinity is static and there is no scheduling, so each FQ
gets its own DPCON (one per FQ, more channels, drawn from the shared
pool that the DPCON move to the fslmc bus now feeds), bound once at
dev_start before the lcore is known. Frames are delivered by
rte_eth_rx_burst (dpaa2_dev_rx_dqrr), not as events via
rte_event_dequeue.
rte_eth_dev_rx_intr_enable(q) subscribes the lcore portal to q's DPCON
and arms the DQRI. rte_eth_dev_rx_intr_ctl_q(q) adds q's eventfd (the
portal DQRI fd) to the thread epoll.
wire
|
[ DPMAC ]
|
[ DPNI ] (1)
|
TC0: FQ0 FQ1 FQ2 FQ3 (2)
| | | | (3)
[DPCON][DPCON][DPCON][DPCON]
\ | | / (4)
[ DPIO A ] [ DPIO B ] (5)
| |
DQRR DQRR (6)
| |
DQRI DQRI (7)
| |
eventfd eventfd (8)
| |
rte_epoll_wait rte_epoll_wait (9)
| |
dpaa2_dev_rx_dqrr (10)
(1) WRIOP picks a TC (QoS), then RSS-hashes within the TC to an FQ
(2) FQ0..FQ3 are the rte_eth Rx queues
(3) dpni_set_queue(DEST_DPCON): one DPCON per FQ
(4) the lcore portal subscribes to its DPCONs (push_set)
(5) one QBMan software portal per lcore
(6) QMan pushes the FDs into the portal DQRR
(7) DQRI is raised when the DQRR is non-empty
(8) a portal's queues share one fd (its DQRI eventfd)
(9) worker sleeps here when all its queues are idle
(10) dpaa2_dev_rx_dqrr drains the DQRR, demuxes FDs to FQs by fqd_ctx
The DQRI and eventfd are portal-wide: a queue's eventfd is its portal's
DQRI fd, and the inhibit bit is refcounted by armed queues so disabling
one queue never masks a sibling. The static per-queue bind also lets a
queue be re-homed to another lcore at runtime, the new worker
reclaiming the channel, with no set_queue and no port stop.
On single-core 64-byte forwarding this interrupt path runs at ~5.0 Mpps
versus ~5.86 Mpps polling: per-frame DQRR demux and consume cost about
15 percent over the polling batch dequeue.
Signed-off-by: Maxime Leroy <maxime@leroys.fr>
---
doc/guides/nics/features/dpaa2.ini | 1 +
doc/guides/rel_notes/release_26_07.rst | 1 +
drivers/bus/fslmc/portal/dpaa2_hw_dpio.c | 11 +-
drivers/bus/fslmc/portal/dpaa2_hw_dpio.h | 4 +
drivers/bus/fslmc/portal/dpaa2_hw_pvt.h | 27 ++-
drivers/bus/fslmc/qbman/qbman_portal.c | 1 +
drivers/net/dpaa2/dpaa2_ethdev.c | 293 ++++++++++++++++++++++-
drivers/net/dpaa2/dpaa2_ethdev.h | 3 +
drivers/net/dpaa2/dpaa2_rxtx.c | 122 ++++++++++
9 files changed, 457 insertions(+), 6 deletions(-)
diff --git a/doc/guides/nics/features/dpaa2.ini b/doc/guides/nics/features/dpaa2.ini
index 5def653d1d..b53353eb77 100644
--- a/doc/guides/nics/features/dpaa2.ini
+++ b/doc/guides/nics/features/dpaa2.ini
@@ -7,6 +7,7 @@
Speed capabilities = Y
Link status = Y
Link status event = Y
+Rx interrupt = Y
Burst mode info = Y
Queue start/stop = Y
Scattered Rx = Y
diff --git a/doc/guides/rel_notes/release_26_07.rst b/doc/guides/rel_notes/release_26_07.rst
index 103c4034ca..87c7c57bcc 100644
--- a/doc/guides/rel_notes/release_26_07.rst
+++ b/doc/guides/rel_notes/release_26_07.rst
@@ -129,6 +129,7 @@ New Features
* **Updated NXP dpaa2 driver.**
* Added RSS RETA query and update support.
+ * Added Rx queue interrupt support.
* **Updated PCAP ethernet driver.**
diff --git a/drivers/bus/fslmc/portal/dpaa2_hw_dpio.c b/drivers/bus/fslmc/portal/dpaa2_hw_dpio.c
index 3a5abb2e6d..e6b4e74b3b 100644
--- a/drivers/bus/fslmc/portal/dpaa2_hw_dpio.c
+++ b/drivers/bus/fslmc/portal/dpaa2_hw_dpio.c
@@ -204,13 +204,18 @@ dpaa2_affine_dpio_intr_to_respective_core(int32_t dpio_id, int cpu_id)
fclose(file);
}
+#endif /* RTE_EVENT_DPAA2 */
-static int dpaa2_dpio_intr_init(struct dpaa2_dpio_dev *dpio_dev, bool build_epoll)
+RTE_EXPORT_INTERNAL_SYMBOL(dpaa2_dpio_intr_init)
+int dpaa2_dpio_intr_init(struct dpaa2_dpio_dev *dpio_dev, bool build_epoll)
{
struct epoll_event epoll_ev;
int eventfd, dpio_epoll_fd, ret;
int threshold = 0x3, timeout = 0xFF;
+ if (dpio_dev->intr_enabled)
+ return 0;
+
ret = rte_dpaa2_intr_enable(dpio_dev->intr_handle, 0);
if (ret) {
DPAA2_BUS_ERR("Interrupt registration failed");
@@ -259,9 +264,12 @@ static int dpaa2_dpio_intr_init(struct dpaa2_dpio_dev *dpio_dev, bool build_epol
dpio_dev->epoll_fd = dpio_epoll_fd;
}
+ dpio_dev->intr_enabled = 1;
+
return 0;
}
+#ifdef RTE_EVENT_DPAA2
static void dpaa2_dpio_intr_deinit(struct dpaa2_dpio_dev *dpio_dev)
{
int ret;
@@ -274,6 +282,7 @@ static void dpaa2_dpio_intr_deinit(struct dpaa2_dpio_dev *dpio_dev)
close(dpio_dev->epoll_fd);
dpio_dev->epoll_fd = -1;
}
+ dpio_dev->intr_enabled = 0;
}
#endif
diff --git a/drivers/bus/fslmc/portal/dpaa2_hw_dpio.h b/drivers/bus/fslmc/portal/dpaa2_hw_dpio.h
index 328e1e788a..10dd968e5f 100644
--- a/drivers/bus/fslmc/portal/dpaa2_hw_dpio.h
+++ b/drivers/bus/fslmc/portal/dpaa2_hw_dpio.h
@@ -50,6 +50,10 @@ int dpaa2_affine_qbman_swp(void);
__rte_internal
int dpaa2_affine_qbman_ethrx_swp(void);
+/* set up a DPIO portal's DQRI interrupt (rx-queue interrupt mode) */
+__rte_internal
+int dpaa2_dpio_intr_init(struct dpaa2_dpio_dev *dpio_dev, bool build_epoll);
+
/* allocate memory for FQ - dq storage */
__rte_internal
int
diff --git a/drivers/bus/fslmc/portal/dpaa2_hw_pvt.h b/drivers/bus/fslmc/portal/dpaa2_hw_pvt.h
index 79a2ec41e3..af75e96b27 100644
--- a/drivers/bus/fslmc/portal/dpaa2_hw_pvt.h
+++ b/drivers/bus/fslmc/portal/dpaa2_hw_pvt.h
@@ -133,6 +133,8 @@ struct dpaa2_dpio_dev {
struct rte_intr_handle *intr_handle; /* Interrupt related info */
int32_t epoll_fd; /**< File descriptor created for interrupt polling */
int32_t hw_id; /**< An unique ID of this DPIO device instance */
+ uint8_t intr_enabled; /**< DQRI portal interrupt already set up */
+ uint16_t ethrx_intr_refcnt; /**< rx queues currently armed on this portal */
struct dpaa2_portal_dqrr dpaa2_held_bufs;
};
@@ -164,6 +166,20 @@ typedef void (dpaa2_queue_cb_dqrr_t)(struct qbman_swp *swp,
typedef void (dpaa2_queue_cb_eqresp_free_t)(uint16_t eqresp_ci,
struct dpaa2_queue *dpaa2_q);
+#define DPAA2_NAPI_FD_STASH_SIZE 64 /*!< power of 2; >= 2x rx burst so the
+ * peer port's frames fit before HW
+ * backpressure (2 ports/worker)
+ */
+
+/* Lcore-local FIFO of raw FDs demuxed to this queue by another queue's burst
+ * on the same portal (see dpaa2_queue::napi_stash).
+ */
+struct dpaa2_napi_stash {
+ uint16_t head; /*!< pop index (drain) */
+ uint16_t tail; /*!< push index (park) */
+ struct qbman_fd fd[DPAA2_NAPI_FD_STASH_SIZE];
+};
+
struct __rte_cache_aligned dpaa2_queue {
struct rte_mempool *mb_pool; /**< mbuf pool to populate RX ring. */
union {
@@ -176,7 +192,7 @@ struct __rte_cache_aligned dpaa2_queue {
uint8_t cgid; /*! < Congestion Group id for this queue */
uint64_t rx_pkts;
uint64_t tx_pkts;
- uint64_t err_pkts;
+ uint64_t err_pkts; /*!< also counts NAPI stash-full drops (imissed) */
union {
/**Ingress*/
struct queue_storage_info_t *q_storage[RTE_MAX_LCORE];
@@ -195,6 +211,15 @@ struct __rte_cache_aligned dpaa2_queue {
uint64_t offloads;
uint64_t lpbk_cntx;
uint8_t data_stashing_off;
+ /* NAPI rx-interrupt: per-queue DPCON bound to this FQ at dev_start
+ * (DEST_DPCON, static); the polling worker subscribes its ethrx portal
+ * to the channel and arms the DQRI, rx_dqrr drains+demuxes by fqd_ctx.
+ */
+ struct dpaa2_dpcon_dev *napi_dpcon; /*!< notif channel, NULL = napi off */
+ RTE_ATOMIC(struct dpaa2_dpio_dev *) napi_sub_dpio; /*!< subscribed portal or NULL */
+ uint8_t napi_channel_index; /*!< portal-local static-dequeue idx */
+ uint8_t napi_armed; /*!< this queue requests DQRI wakeups */
+ struct dpaa2_napi_stash napi_stash; /*!< NAPI/DQRR demux FDs (~2 KB) */
};
struct swp_active_dqs {
diff --git a/drivers/bus/fslmc/qbman/qbman_portal.c b/drivers/bus/fslmc/qbman/qbman_portal.c
index 84853924e7..947415363a 100644
--- a/drivers/bus/fslmc/qbman/qbman_portal.c
+++ b/drivers/bus/fslmc/qbman/qbman_portal.c
@@ -448,6 +448,7 @@ int qbman_swp_interrupt_get_inhibit(struct qbman_swp *p)
return qbman_cinh_read(&p->sys, QBMAN_CINH_SWP_IIR);
}
+RTE_EXPORT_INTERNAL_SYMBOL(qbman_swp_interrupt_set_inhibit)
void qbman_swp_interrupt_set_inhibit(struct qbman_swp *p, int inhibit)
{
qbman_cinh_write(&p->sys, QBMAN_CINH_SWP_IIR,
diff --git a/drivers/net/dpaa2/dpaa2_ethdev.c b/drivers/net/dpaa2/dpaa2_ethdev.c
index 8589398324..6407c24755 100644
--- a/drivers/net/dpaa2/dpaa2_ethdev.c
+++ b/drivers/net/dpaa2/dpaa2_ethdev.c
@@ -658,6 +658,8 @@ dpaa2_clear_queue_active_dps(struct dpaa2_queue *q, int num_lcores)
}
}
+static void dpaa2_dev_rx_queue_intr_unbind(struct dpaa2_queue *dpaa2_q);
+
static void
dpaa2_free_rx_tx_queues(struct rte_eth_dev *dev)
{
@@ -675,6 +677,12 @@ dpaa2_free_rx_tx_queues(struct rte_eth_dev *dev)
/* cleaning up queue storage */
for (i = 0; i < priv->nb_rx_queues; i++) {
dpaa2_q = priv->rx_vq[i];
+ if (dpaa2_q->napi_dpcon) { /* release the rx-intr channel */
+ dpaa2_dev_rx_queue_intr_unbind(dpaa2_q);
+ rte_dpaa2_free_dpcon_dev(dpaa2_q->napi_dpcon);
+ dpaa2_q->napi_dpcon = NULL;
+ dpaa2_q->napi_sub_dpio = NULL;
+ }
dpaa2_clear_queue_active_dps(dpaa2_q,
RTE_MAX_LCORE);
dpaa2_queue_storage_free(dpaa2_q,
@@ -880,6 +888,21 @@ dpaa2_eth_dev_configure(struct rte_eth_dev *dev)
}
}
+ if (dev->data->dev_conf.intr_conf.rxq) {
+ if (!dev->intr_handle)
+ dev->intr_handle = rte_intr_instance_alloc(
+ RTE_INTR_INSTANCE_F_PRIVATE);
+ if (!dev->intr_handle ||
+ rte_intr_vec_list_alloc(dev->intr_handle, "rxq_intr",
+ dev->data->nb_rx_queues) ||
+ rte_intr_nb_efd_set(dev->intr_handle,
+ dev->data->nb_rx_queues) ||
+ rte_intr_type_set(dev->intr_handle, RTE_INTR_HANDLE_EXT)) {
+ DPAA2_PMD_ERR("Failed to set up rx-queue interrupts");
+ return -rte_errno;
+ }
+ }
+
dpaa2_tm_init(dev);
return 0;
@@ -898,6 +921,7 @@ dpaa2_dev_rx_queue_setup(struct rte_eth_dev *dev,
{
struct dpaa2_dev_priv *priv = dev->data->dev_private;
struct fsl_mc_io *dpni = dev->process_private;
+ bool dpcon_allocated = false;
struct dpaa2_queue *dpaa2_q;
struct dpni_queue cfg;
uint8_t options = 0;
@@ -938,6 +962,21 @@ dpaa2_dev_rx_queue_setup(struct rte_eth_dev *dev,
dpaa2_q->bp_array = rte_dpaa2_bpid_info;
dpaa2_q->offloads = rx_conf->offloads;
+ /* NAPI: grab a DPCON channel so dev_start can bind this FQ statically.
+ * The DQRR burst replaces the poll path for every queue at once, so a
+ * missing channel is fatal rather than a silent per-queue fallback.
+ */
+ dpaa2_q->napi_sub_dpio = NULL;
+ if (dev->data->dev_conf.intr_conf.rxq && !dpaa2_q->napi_dpcon) {
+ dpaa2_q->napi_dpcon = rte_dpaa2_alloc_dpcon_dev();
+ if (!dpaa2_q->napi_dpcon) {
+ DPAA2_PMD_ERR("rxq %d: no DPCON for rx-queue interrupts",
+ rx_queue_id);
+ return -ENODEV;
+ }
+ dpcon_allocated = true;
+ }
+
/*Get the flow id from given VQ id*/
flow_id = dpaa2_q->flow_id;
memset(&cfg, 0, sizeof(struct dpni_queue));
@@ -945,6 +984,10 @@ dpaa2_dev_rx_queue_setup(struct rte_eth_dev *dev,
options = options | DPNI_QUEUE_OPT_USER_CTX;
cfg.user_context = (size_t)(dpaa2_q);
+ /* clear any stale DPIO dest left scheduled by a prior rx-intr run */
+ options |= DPNI_QUEUE_OPT_DEST;
+ cfg.destination.type = DPNI_DEST_NONE;
+
/* check if a private cgr available. */
for (i = 0; i < priv->max_cgs; i++) {
if (!priv->cgid_in_use[i]) {
@@ -985,7 +1028,7 @@ dpaa2_dev_rx_queue_setup(struct rte_eth_dev *dev,
dpaa2_q->tc_index, flow_id, options, &cfg);
if (ret) {
DPAA2_PMD_ERR("Error in setting the rx flow: = %d", ret);
- return ret;
+ goto err_free_dpcon;
}
dpaa2_q->nb_desc = nb_rx_desc;
@@ -1026,7 +1069,7 @@ dpaa2_dev_rx_queue_setup(struct rte_eth_dev *dev,
if (ret) {
DPAA2_PMD_ERR("Error in setting taildrop. err=(%d)",
ret);
- return ret;
+ goto err_free_dpcon;
}
} else { /* Disable tail Drop */
struct dpni_taildrop taildrop = {0};
@@ -1046,12 +1089,22 @@ dpaa2_dev_rx_queue_setup(struct rte_eth_dev *dev,
if (ret) {
DPAA2_PMD_ERR("Error in setting taildrop. err=(%d)",
ret);
- return ret;
+ goto err_free_dpcon;
}
}
dev->data->rx_queues[rx_queue_id] = dpaa2_q;
return 0;
+
+err_free_dpcon:
+ /* free only the DPCON this call allocated; a pre-existing one belongs to
+ * an earlier setup and is released at dev_close
+ */
+ if (dpcon_allocated) {
+ rte_dpaa2_free_dpcon_dev(dpaa2_q->napi_dpcon);
+ dpaa2_q->napi_dpcon = NULL;
+ }
+ return ret;
}
static int
@@ -1210,6 +1263,62 @@ dpaa2_dev_tx_queue_setup(struct rte_eth_dev *dev,
return 0;
}
+/* Fully release a queue's rx-interrupt state: detach the FQ from its DPCON,
+ * unbind the static dequeue channel from the portal and free any stashed FDs.
+ * Teardown only: the port is stopped and the portal quiesced; not a runtime
+ * rx_queue_intr_disable() replacement. Call before freeing the DPCON.
+ */
+static void
+dpaa2_dev_rx_queue_intr_unbind(struct dpaa2_queue *dpaa2_q)
+{
+ struct dpaa2_dev_priv *priv;
+ struct dpaa2_dpio_dev *dpio;
+ struct fsl_mc_io *dpni;
+ struct dpni_queue cfg;
+ int ret;
+
+ if (!dpaa2_q || !dpaa2_q->napi_dpcon)
+ return;
+
+ /* detach the FQ from its DPCON so it no longer points at a channel
+ * about to be returned to the pool (dpni is disabled at teardown)
+ */
+ priv = dpaa2_q->eth_data->dev_private;
+ dpni = priv->eth_dev->process_private;
+ memset(&cfg, 0, sizeof(cfg));
+ cfg.destination.type = DPNI_DEST_NONE;
+ ret = dpni_set_queue(dpni, CMD_PRI_LOW, priv->token, DPNI_QUEUE_RX,
+ dpaa2_q->tc_index, dpaa2_q->flow_id,
+ DPNI_QUEUE_OPT_DEST, &cfg);
+ if (ret)
+ DPAA2_PMD_ERR("napi: DEST_NONE rxq flow %u: %d",
+ dpaa2_q->flow_id, ret);
+
+ /* unbind the static dequeue channel from the portal it was armed on */
+ dpio = rte_atomic_load_explicit(&dpaa2_q->napi_sub_dpio,
+ rte_memory_order_acquire);
+ if (dpio) {
+ qbman_swp_push_set(dpio->sw_portal,
+ dpaa2_q->napi_channel_index, 0);
+ if (dpaa2_q->napi_armed) {
+ dpaa2_q->napi_armed = 0;
+ if (dpio->ethrx_intr_refcnt > 0 &&
+ --dpio->ethrx_intr_refcnt == 0)
+ qbman_swp_interrupt_set_inhibit(dpio->sw_portal, 1);
+ }
+ ret = dpio_remove_static_dequeue_channel(dpio->dpio, CMD_PRI_LOW,
+ dpio->token, dpaa2_q->napi_dpcon->dpcon_id);
+ if (ret)
+ DPAA2_PMD_ERR("napi: remove DPCON %d static dequeue channel: %d",
+ dpaa2_q->napi_dpcon->dpcon_id, ret);
+ rte_atomic_store_explicit(&dpaa2_q->napi_sub_dpio, NULL,
+ rte_memory_order_release);
+ }
+
+ /* free FDs parked for this queue but never drained by a burst */
+ dpaa2_dev_rx_queue_napi_stash_drain(dpaa2_q);
+}
+
static void
dpaa2_dev_rx_queue_release(struct rte_eth_dev *dev, uint16_t rx_queue_id)
{
@@ -1239,6 +1348,12 @@ dpaa2_dev_rx_queue_release(struct rte_eth_dev *dev, uint16_t rx_queue_id)
priv->cgid_in_use[dpaa2_q->cgid] = 0;
dpaa2_q->cgid = DPAA2_INVALID_CGID;
}
+
+ if (dpaa2_q->napi_dpcon) {
+ dpaa2_dev_rx_queue_intr_unbind(dpaa2_q);
+ rte_dpaa2_free_dpcon_dev(dpaa2_q->napi_dpcon);
+ dpaa2_q->napi_dpcon = NULL;
+ }
}
static int
@@ -1389,6 +1504,36 @@ dpaa2_dev_start(struct rte_eth_dev *dev)
intr_handle = dpaa2_dev->intr_handle;
PMD_INIT_FUNC_TRACE();
+
+ /* NAPI: bind each rx FQ to its own DPCON channel while the dpni is still
+ * disabled (a DEST set_queue on an enabled dpni wedges the shared MC).
+ * Static, affinity-free; the polling worker subscribes its portal later.
+ */
+ if (dev->data->dev_conf.intr_conf.rxq) {
+ for (i = 0; i < data->nb_rx_queues; i++) {
+ dpaa2_q = data->rx_queues[i];
+ if (!dpaa2_q->napi_dpcon)
+ continue;
+ memset(&cfg, 0, sizeof(cfg));
+ cfg.destination.type = DPNI_DEST_DPCON;
+ cfg.destination.id = dpaa2_q->napi_dpcon->dpcon_id;
+ cfg.user_context = (size_t)dpaa2_q;
+ ret = dpni_set_queue(dpni, CMD_PRI_LOW, priv->token,
+ DPNI_QUEUE_RX, dpaa2_q->tc_index,
+ dpaa2_q->flow_id,
+ DPNI_QUEUE_OPT_DEST | DPNI_QUEUE_OPT_USER_CTX,
+ &cfg);
+ if (ret) {
+ DPAA2_PMD_ERR("napi: DPCON bind rxq %d: %d", i, ret);
+ return ret;
+ }
+ }
+ /* DQRR burst for all queues; a queue only yields frames once
+ * rx_queue_intr_enable() has subscribed its portal
+ */
+ dev->rx_pkt_burst = dpaa2_dev_rx_dqrr;
+ }
+
ret = dpni_enable(dpni, CMD_PRI_LOW, priv->token);
if (ret) {
DPAA2_PMD_ERR("Failure in enabling dpni %d device: err=%d",
@@ -1859,6 +2004,13 @@ dpaa2_dev_stats_get(struct rte_eth_dev *dev,
stats->oerrors = value.page_2.egress_discarded_frames;
stats->imissed = value.page_2.ingress_nobuffer_discards;
+ /* software Rx drops (full napi stash) are not in the HW counters */
+ for (i = 0; i < priv->nb_rx_queues; i++) {
+ dpaa2_rxq = priv->rx_vq[i];
+ if (dpaa2_rxq != NULL)
+ stats->imissed += dpaa2_rxq->err_pkts;
+ }
+
/* Fill in per queue stats */
if (qstats != NULL) {
for (i = 0; (i < RTE_ETHDEV_QUEUE_STAT_CNTRS) &&
@@ -2172,8 +2324,10 @@ dpaa2_dev_stats_reset(struct rte_eth_dev *dev)
/* Reset the per queue stats in dpaa2_queue structure */
for (i = 0; i < priv->nb_rx_queues; i++) {
dpaa2_q = priv->rx_vq[i];
- if (dpaa2_q)
+ if (dpaa2_q) {
dpaa2_q->rx_pkts = 0;
+ dpaa2_q->err_pkts = 0;
+ }
}
for (i = 0; i < priv->nb_tx_queues; i++) {
@@ -2901,6 +3055,135 @@ rte_pmd_dpaa2_thread_init(void)
}
}
+/* Arm rx-queue interrupts on the worker lcore: subscribe its ethrx portal to
+ * the queue's DPCON channel (one-shot per-portal MC) and unmask the portal DQRI
+ * (pure QBMan).
+ *
+ * Affinity is static queue-to-lcore; a lcore may own several rx queues. The
+ * DQRI and the eventfd are portal-wide, so frames are demuxed by fqd_ctx in the
+ * burst and the portal's inhibit bit is reference-counted by the number of its
+ * queues currently armed (ethrx_intr_refcnt) -- disabling one queue must not
+ * mask wakeups still wanted by its siblings. napi_armed and ethrx_intr_refcnt
+ * are plain (not atomic): these ops run on the queue's owner lcore against its
+ * own portal (one portal per lcore), so per-portal isolation keeps them from
+ * racing, not control-plane serialization.
+ *
+ * A re-home reclaims the channel by poking the old portal, so the caller must
+ * have quiesced the previous owner and disabled the queue there; napi_armed is
+ * then 0 and only the new portal is counted.
+ */
+static int
+dpaa2_dev_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+ struct dpaa2_dev_priv *priv = dev->data->dev_private;
+ struct dpaa2_queue *dpaa2_q = priv->rx_vq[queue_id];
+ struct dpaa2_dpio_dev *dpio, *old;
+ int ret;
+
+ if (!dpaa2_q->napi_dpcon)
+ return -ENOTSUP; /* no channel -> caller keeps polling */
+
+ if (dpaa2_affine_qbman_ethrx_swp())
+ return -EIO;
+ dpio = DPAA2_PER_LCORE_ETHRX_DPIO;
+
+ /* build_epoll=false: the generic ethdev rx-intr API waits on the
+ * application epoll, not the portal's private one (event PMD only).
+ */
+ ret = dpaa2_dpio_intr_init(dpio, false); /* VFIO eventfd, no MC */
+ if (ret)
+ return ret;
+
+ old = rte_atomic_load_explicit(&dpaa2_q->napi_sub_dpio, rte_memory_order_acquire);
+ if (old && old != dpio && dpaa2_q->napi_armed) {
+ DPAA2_PMD_ERR("rxq %d still armed on another portal; disable it first",
+ queue_id);
+ return -EBUSY;
+ }
+ if (old != dpio) {
+ if (old) { /* reclaim from old portal (quiesced; QBMan MMIO unsynced) */
+ qbman_swp_push_set(old->sw_portal,
+ dpaa2_q->napi_channel_index, 0);
+ ret = dpio_remove_static_dequeue_channel(old->dpio,
+ CMD_PRI_LOW, old->token,
+ dpaa2_q->napi_dpcon->dpcon_id);
+ /* push_set(0) above already stops the old portal from
+ * dequeuing; a failed unbind only leaks a static-channel
+ * slot on the old DPIO, so warn and proceed
+ */
+ if (ret)
+ DPAA2_PMD_WARN("napi: reclaim rxq %d: %d",
+ queue_id, ret);
+ /* on no portal until the add below succeeds */
+ rte_atomic_store_explicit(&dpaa2_q->napi_sub_dpio, NULL,
+ rte_memory_order_release);
+ }
+ ret = dpio_add_static_dequeue_channel(dpio->dpio, CMD_PRI_LOW,
+ dpio->token, dpaa2_q->napi_dpcon->dpcon_id,
+ &dpaa2_q->napi_channel_index);
+ if (ret) {
+ DPAA2_PMD_ERR("napi: subscribe rxq %d: %d", queue_id, ret);
+ return ret;
+ }
+ qbman_swp_push_set(dpio->sw_portal,
+ dpaa2_q->napi_channel_index, 1);
+ /* point this queue's eventfd at the portal's DQRI fd so the
+ * generic rte_eth_dev_rx_intr_ctl_q epoll wakes on it
+ */
+ if (rte_intr_vec_list_index_set(dev->intr_handle, queue_id, queue_id) ||
+ rte_intr_efds_index_set(dev->intr_handle, queue_id,
+ rte_intr_fd_get(dpio->intr_handle))) {
+ DPAA2_PMD_ERR("napi: efd wiring rxq %d", queue_id);
+ /* unwind the half-done subscription so HW and driver
+ * state stay consistent
+ */
+ qbman_swp_push_set(dpio->sw_portal,
+ dpaa2_q->napi_channel_index, 0);
+ dpio_remove_static_dequeue_channel(dpio->dpio,
+ CMD_PRI_LOW, dpio->token,
+ dpaa2_q->napi_dpcon->dpcon_id);
+ return -EIO;
+ }
+ rte_atomic_store_explicit(&dpaa2_q->napi_sub_dpio, dpio, rte_memory_order_release);
+ }
+
+ /* arm this queue; the portal DQRI is unmasked only on the 0 -> 1 edge
+ * of its armed-queue count
+ */
+ if (!dpaa2_q->napi_armed) {
+ dpaa2_q->napi_armed = 1;
+ if (dpio->ethrx_intr_refcnt++ == 0) {
+ qbman_swp_interrupt_clear_status(dpio->sw_portal,
+ 0xffffffff);
+ qbman_swp_interrupt_set_inhibit(dpio->sw_portal, 0);
+ }
+ }
+
+ return 0;
+}
+
+/* Disarm rx-queue interrupts for this queue. The portal DQRI is masked only
+ * once the last of its queues disarms; act on the portal the queue is actually
+ * subscribed to, not the caller's current portal.
+ */
+static int
+dpaa2_dev_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+ struct dpaa2_dev_priv *priv = dev->data->dev_private;
+ struct dpaa2_queue *dpaa2_q = priv->rx_vq[queue_id];
+ struct dpaa2_dpio_dev *dpio;
+
+ dpio = rte_atomic_load_explicit(&dpaa2_q->napi_sub_dpio, rte_memory_order_acquire);
+ if (dpio && dpaa2_q->napi_armed) {
+ dpaa2_q->napi_armed = 0;
+ if (dpio->ethrx_intr_refcnt > 0 &&
+ --dpio->ethrx_intr_refcnt == 0)
+ qbman_swp_interrupt_set_inhibit(dpio->sw_portal, 1);
+ }
+
+ return 0;
+}
+
static struct eth_dev_ops dpaa2_ethdev_ops = {
.dev_configure = dpaa2_eth_dev_configure,
.dev_start = dpaa2_dev_start,
@@ -2929,6 +3212,8 @@ static struct eth_dev_ops dpaa2_ethdev_ops = {
.vlan_tpid_set = dpaa2_vlan_tpid_set,
.rx_queue_setup = dpaa2_dev_rx_queue_setup,
.rx_queue_release = dpaa2_dev_rx_queue_release,
+ .rx_queue_intr_enable = dpaa2_dev_rx_queue_intr_enable,
+ .rx_queue_intr_disable = dpaa2_dev_rx_queue_intr_disable,
.tx_queue_setup = dpaa2_dev_tx_queue_setup,
.rx_burst_mode_get = dpaa2_dev_rx_burst_mode_get,
.tx_burst_mode_get = dpaa2_dev_tx_burst_mode_get,
diff --git a/drivers/net/dpaa2/dpaa2_ethdev.h b/drivers/net/dpaa2/dpaa2_ethdev.h
index 3f224c654e..65fb48bd27 100644
--- a/drivers/net/dpaa2/dpaa2_ethdev.h
+++ b/drivers/net/dpaa2/dpaa2_ethdev.h
@@ -500,6 +500,9 @@ uint16_t dpaa2_dev_loopback_rx(void *queue, struct rte_mbuf **bufs,
uint16_t dpaa2_dev_prefetch_rx(void *queue, struct rte_mbuf **bufs,
uint16_t nb_pkts);
+uint16_t dpaa2_dev_rx_dqrr(void *queue, struct rte_mbuf **bufs,
+ uint16_t nb_pkts);
+void dpaa2_dev_rx_queue_napi_stash_drain(struct dpaa2_queue *dpaa2_q);
void dpaa2_dev_process_parallel_event(struct qbman_swp *swp,
const struct qbman_fd *fd,
const struct qbman_result *dq,
diff --git a/drivers/net/dpaa2/dpaa2_rxtx.c b/drivers/net/dpaa2/dpaa2_rxtx.c
index b316e23e87..189accc1de 100644
--- a/drivers/net/dpaa2/dpaa2_rxtx.c
+++ b/drivers/net/dpaa2/dpaa2_rxtx.c
@@ -922,6 +922,128 @@ dpaa2_dev_prefetch_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
return num_rx;
}
+/* Convert a DQRR'd FD (single or scatter-gather) to an mbuf and apply software
+ * VLAN strip, like the poll path.
+ */
+static inline struct rte_mbuf *
+dpaa2_dqrr_fd_to_mbuf(const struct qbman_fd *fd,
+ struct rte_eth_dev_data *eth_data)
+{
+ struct rte_mbuf *m;
+
+ if (unlikely(DPAA2_FD_GET_FORMAT(fd) == qbman_fd_sg))
+ m = eth_sg_fd_to_mbuf(fd, eth_data->port_id);
+ else
+ m = eth_fd_to_mbuf(fd, eth_data->port_id);
+ if (eth_data->dev_conf.rxmode.offloads & RTE_ETH_RX_OFFLOAD_VLAN_STRIP)
+ rte_vlan_strip(m);
+ return m;
+}
+
+/* prefetch a DQRR'd FD's HW annotation (parse area) ahead of conversion */
+static inline void
+dpaa2_dqrr_prefetch_annot(const struct qbman_fd *fd)
+{
+ rte_prefetch0((void *)((size_t)DPAA2_IOVA_TO_VADDR(DPAA2_GET_FD_ADDR(fd))
+ + DPAA2_FD_PTA_SIZE));
+}
+
+/* Free FDs a sibling burst parked in this queue's stash but that were never
+ * drained (queue released/freed while the lcore still held its frames).
+ */
+void
+dpaa2_dev_rx_queue_napi_stash_drain(struct dpaa2_queue *dpaa2_q)
+{
+ struct dpaa2_napi_stash *stash = &dpaa2_q->napi_stash;
+ const struct qbman_fd *fd;
+
+ while (stash->head != stash->tail) {
+ fd = &stash->fd[stash->head & (DPAA2_NAPI_FD_STASH_SIZE - 1)];
+ rte_pktmbuf_free(dpaa2_dqrr_fd_to_mbuf(fd, dpaa2_q->eth_data));
+ stash->head++;
+ }
+ stash->head = 0;
+ stash->tail = 0;
+}
+
+/* rx interrupt/DQRR path: the FQ is scheduled to a channel the lcore's ethrx
+ * portal statically dequeues -- a VDQ on a scheduled FQ never completes, so DQRR
+ * is the only model compatible with interrupt sleep. One portal serves every
+ * queue the lcore owns, so the burst demuxes by fqd_ctx: own frames are
+ * returned, foreign ones have their raw FD parked in the target queue's stash.
+ *
+ * The application must therefore poll all queues assigned to the lcore after a
+ * wakeup -- the same scheduling contract as plain DPDK polling. When a foreign
+ * queue's stash is full the FD is dropped (freed) rather than left on the shared
+ * DQRR ring, which would head-of-line block every other queue on the portal.
+ */
+uint16_t __rte_hot
+dpaa2_dev_rx_dqrr(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+{
+ struct dpaa2_queue *dpaa2_q = queue;
+ struct rte_eth_dev_data *eth_data = dpaa2_q->eth_data;
+ struct dpaa2_napi_stash *stash = &dpaa2_q->napi_stash;
+ const struct qbman_result *dq;
+ const struct qbman_fd *fd;
+ struct dpaa2_queue *rxq;
+ struct qbman_swp *swp;
+ uint16_t num_rx = 0;
+
+ if (unlikely(!DPAA2_PER_LCORE_ETHRX_DPIO)) {
+ if (dpaa2_affine_qbman_ethrx_swp()) {
+ DPAA2_PMD_ERR("Failure in affining portal");
+ return 0;
+ }
+ }
+ swp = DPAA2_PER_LCORE_ETHRX_PORTAL;
+
+ /* our frames parked by another queue's burst -- convert now (hot) */
+ while (num_rx < nb_pkts && stash->head != stash->tail) {
+ fd = &stash->fd[stash->head & (DPAA2_NAPI_FD_STASH_SIZE - 1)];
+ if (dpaa2_svr_family != SVR_LX2160A &&
+ (uint16_t)(stash->head + 1) != stash->tail)
+ dpaa2_dqrr_prefetch_annot(&stash->fd[(stash->head + 1) &
+ (DPAA2_NAPI_FD_STASH_SIZE - 1)]);
+ bufs[num_rx++] = dpaa2_dqrr_fd_to_mbuf(fd, eth_data);
+ stash->head++;
+ }
+
+ while (num_rx < nb_pkts) {
+ dq = qbman_swp_dqrr_next(swp);
+ if (!dq)
+ break; /* ring momentarily empty */
+ qbman_swp_prefetch_dqrr_next(swp);
+ fd = qbman_result_DQ_fd(dq);
+ /* parse summary is in the FRC on LX2160A; annotation is HW-stashed */
+ if (dpaa2_svr_family != SVR_LX2160A)
+ dpaa2_dqrr_prefetch_annot(fd);
+ rxq = (struct dpaa2_queue *)(size_t)qbman_result_DQ_fqd_ctx(dq);
+ if (unlikely(!rxq))
+ rxq = dpaa2_q;
+ if (rxq == dpaa2_q) {
+ bufs[num_rx++] = dpaa2_dqrr_fd_to_mbuf(fd, eth_data);
+ } else {
+ struct dpaa2_napi_stash *fs = &rxq->napi_stash;
+
+ if (unlikely((uint16_t)(fs->tail - fs->head) >=
+ DPAA2_NAPI_FD_STASH_SIZE)) {
+ /* stash full: drop rather than leave it on the ring
+ * and head-of-line block the shared portal
+ */
+ rte_pktmbuf_free(dpaa2_dqrr_fd_to_mbuf(fd, rxq->eth_data));
+ rxq->err_pkts++;
+ } else {
+ fs->fd[fs->tail & (DPAA2_NAPI_FD_STASH_SIZE - 1)] = *fd;
+ fs->tail++;
+ }
+ }
+ qbman_swp_dqrr_consume(swp, dq);
+ }
+
+ dpaa2_q->rx_pkts += num_rx;
+ return num_rx;
+}
+
void __rte_hot
dpaa2_dev_process_parallel_event(struct qbman_swp *swp,
const struct qbman_fd *fd,
--
2.43.0
^ permalink raw reply related
* [PATCH 4/9] bus/fslmc/dpio: make the portal DQRI epoll optional
From: Maxime Leroy @ 2026-06-11 15:49 UTC (permalink / raw)
To: hemant.agrawal, sachin.saxena; +Cc: dev, Maxime Leroy
In-Reply-To: <20260611154926.392670-1-maxime@leroys.fr>
dpaa2_dpio_intr_init() builds a private epoll instance the event PMD
sleeps on. The upcoming net rx-queue-interrupt path waits on the
application's own epoll instead, so that instance would be built but
never used.
Add a build_epoll parameter: pass true to build it (event PMD), false
to skip the epoll_create/epoll_ctl. epoll_fd is set to -1 when none is
built and closed in intr_deinit only when valid. The sole caller passes
true: no functional change.
Signed-off-by: Maxime Leroy <maxime@leroys.fr>
---
drivers/bus/fslmc/portal/dpaa2_hw_dpio.c | 44 +++++++++++++++++-------
1 file changed, 32 insertions(+), 12 deletions(-)
diff --git a/drivers/bus/fslmc/portal/dpaa2_hw_dpio.c b/drivers/bus/fslmc/portal/dpaa2_hw_dpio.c
index 2a9e519668..3a5abb2e6d 100644
--- a/drivers/bus/fslmc/portal/dpaa2_hw_dpio.c
+++ b/drivers/bus/fslmc/portal/dpaa2_hw_dpio.c
@@ -205,13 +205,12 @@ dpaa2_affine_dpio_intr_to_respective_core(int32_t dpio_id, int cpu_id)
fclose(file);
}
-static int dpaa2_dpio_intr_init(struct dpaa2_dpio_dev *dpio_dev)
+static int dpaa2_dpio_intr_init(struct dpaa2_dpio_dev *dpio_dev, bool build_epoll)
{
struct epoll_event epoll_ev;
int eventfd, dpio_epoll_fd, ret;
int threshold = 0x3, timeout = 0xFF;
- dpio_epoll_fd = epoll_create(1);
ret = rte_dpaa2_intr_enable(dpio_dev->intr_handle, 0);
if (ret) {
DPAA2_BUS_ERR("Interrupt registration failed");
@@ -231,16 +230,34 @@ static int dpaa2_dpio_intr_init(struct dpaa2_dpio_dev *dpio_dev)
qbman_swp_dqrr_thrshld_write(dpio_dev->sw_portal, threshold);
qbman_swp_intr_timeout_write(dpio_dev->sw_portal, timeout);
- eventfd = rte_intr_fd_get(dpio_dev->intr_handle);
- epoll_ev.events = EPOLLIN | EPOLLPRI | EPOLLET;
- epoll_ev.data.fd = eventfd;
+ dpio_dev->epoll_fd = -1;
- ret = epoll_ctl(dpio_epoll_fd, EPOLL_CTL_ADD, eventfd, &epoll_ev);
- if (ret < 0) {
- DPAA2_BUS_ERR("epoll_ctl failed");
- return -1;
+ /* The event PMD dequeues by sleeping on a private epoll instance owned
+ * by the portal, so build it here. A caller that waits on another
+ * epoll (the net rx-queue-interrupt path uses the application's) skips
+ * this.
+ */
+ if (build_epoll) {
+ dpio_epoll_fd = epoll_create(1);
+ if (dpio_epoll_fd < 0) {
+ DPAA2_BUS_ERR("epoll_create failed");
+ rte_dpaa2_intr_disable(dpio_dev->intr_handle, 0);
+ return -1;
+ }
+
+ eventfd = rte_intr_fd_get(dpio_dev->intr_handle);
+ epoll_ev.events = EPOLLIN | EPOLLPRI | EPOLLET;
+ epoll_ev.data.fd = eventfd;
+
+ ret = epoll_ctl(dpio_epoll_fd, EPOLL_CTL_ADD, eventfd, &epoll_ev);
+ if (ret < 0) {
+ DPAA2_BUS_ERR("epoll_ctl failed");
+ rte_dpaa2_intr_disable(dpio_dev->intr_handle, 0);
+ close(dpio_epoll_fd);
+ return -1;
+ }
+ dpio_dev->epoll_fd = dpio_epoll_fd;
}
- dpio_dev->epoll_fd = dpio_epoll_fd;
return 0;
}
@@ -253,7 +270,10 @@ static void dpaa2_dpio_intr_deinit(struct dpaa2_dpio_dev *dpio_dev)
if (ret)
DPAA2_BUS_ERR("DPIO interrupt disable failed");
- close(dpio_dev->epoll_fd);
+ if (dpio_dev->epoll_fd >= 0) {
+ close(dpio_dev->epoll_fd);
+ dpio_dev->epoll_fd = -1;
+ }
}
#endif
@@ -277,7 +297,7 @@ dpaa2_configure_stashing(struct dpaa2_dpio_dev *dpio_dev, int cpu_id)
}
#ifdef RTE_EVENT_DPAA2
- if (dpaa2_dpio_intr_init(dpio_dev)) {
+ if (dpaa2_dpio_intr_init(dpio_dev, true)) {
DPAA2_BUS_ERR("Interrupt registration failed for dpio");
return -1;
}
--
2.43.0
^ permalink raw reply related
* [PATCH 3/9] bus/fslmc: move DPCON management from event driver to bus
From: Maxime Leroy @ 2026-06-11 15:49 UTC (permalink / raw)
To: hemant.agrawal, sachin.saxena; +Cc: dev, Maxime Leroy
In-Reply-To: <20260611154926.392670-1-maxime@leroys.fr>
The DPCON allocation helpers (rte_dpaa2_alloc_dpcon_dev /
rte_dpaa2_free_dpcon_dev) lived in the event driver, but a notification
channel is a generic QBMan resource. Move dpaa2_hw_dpcon.c to the fslmc
bus and export the helpers as internal symbols so both the event PMD and
the net driver's rx-queue interrupt path can draw channels from the same
pool. No functional change.
Signed-off-by: Maxime Leroy <maxime@leroys.fr>
---
drivers/bus/fslmc/meson.build | 1 +
.../dpaa2 => bus/fslmc/portal}/dpaa2_hw_dpcon.c | 16 +++++++---------
drivers/bus/fslmc/portal/dpaa2_hw_pvt.h | 8 ++++++++
drivers/event/dpaa2/dpaa2_eventdev.h | 5 +++--
drivers/event/dpaa2/meson.build | 1 -
5 files changed, 19 insertions(+), 12 deletions(-)
rename drivers/{event/dpaa2 => bus/fslmc/portal}/dpaa2_hw_dpcon.c (90%)
diff --git a/drivers/bus/fslmc/meson.build b/drivers/bus/fslmc/meson.build
index ceae1c6c11..50d9e91a37 100644
--- a/drivers/bus/fslmc/meson.build
+++ b/drivers/bus/fslmc/meson.build
@@ -22,6 +22,7 @@ sources = files(
'mc/mc_sys.c',
'portal/dpaa2_hw_dpbp.c',
'portal/dpaa2_hw_dpci.c',
+ 'portal/dpaa2_hw_dpcon.c',
'portal/dpaa2_hw_dpio.c',
'portal/dpaa2_hw_dprc.c',
'qbman/qbman_portal.c',
diff --git a/drivers/event/dpaa2/dpaa2_hw_dpcon.c b/drivers/bus/fslmc/portal/dpaa2_hw_dpcon.c
similarity index 90%
rename from drivers/event/dpaa2/dpaa2_hw_dpcon.c
rename to drivers/bus/fslmc/portal/dpaa2_hw_dpcon.c
index ea5b0d4b85..6fd96ec0b9 100644
--- a/drivers/event/dpaa2/dpaa2_hw_dpcon.c
+++ b/drivers/bus/fslmc/portal/dpaa2_hw_dpcon.c
@@ -18,13 +18,12 @@
#include <rte_cycles.h>
#include <rte_kvargs.h>
#include <dev_driver.h>
-#include <ethdev_driver.h>
+#include <eal_export.h>
#include <bus_fslmc_driver.h>
#include <mc/fsl_dpcon.h>
#include <portal/dpaa2_hw_pvt.h>
-#include "dpaa2_eventdev.h"
-#include "dpaa2_eventdev_logs.h"
+#include <fslmc_logs.h>
TAILQ_HEAD(dpcon_dev_list, dpaa2_dpcon_dev);
static struct dpcon_dev_list dpcon_dev_list
@@ -55,8 +54,7 @@ rte_dpaa2_create_dpcon_device(int dev_fd __rte_unused,
/* Allocate DPAA2 dpcon handle */
dpcon_node = rte_malloc(NULL, sizeof(struct dpaa2_dpcon_dev), 0);
if (!dpcon_node) {
- DPAA2_EVENTDEV_ERR(
- "Memory allocation failed for dpcon device");
+ DPAA2_BUS_ERR("Memory allocation failed for dpcon device");
return -1;
}
@@ -65,8 +63,7 @@ rte_dpaa2_create_dpcon_device(int dev_fd __rte_unused,
ret = dpcon_open(&dpcon_node->dpcon,
CMD_PRI_LOW, dpcon_id, &dpcon_node->token);
if (ret) {
- DPAA2_EVENTDEV_ERR("Unable to open dpcon device: err(%d)",
- ret);
+ DPAA2_BUS_ERR("Unable to open dpcon device: err(%d)", ret);
rte_free(dpcon_node);
return -1;
}
@@ -75,8 +72,7 @@ rte_dpaa2_create_dpcon_device(int dev_fd __rte_unused,
ret = dpcon_get_attributes(&dpcon_node->dpcon,
CMD_PRI_LOW, dpcon_node->token, &attr);
if (ret != 0) {
- DPAA2_EVENTDEV_ERR("dpcon attribute fetch failed: err(%d)",
- ret);
+ DPAA2_BUS_ERR("dpcon attribute fetch failed: err(%d)", ret);
rte_free(dpcon_node);
return -1;
}
@@ -92,6 +88,7 @@ rte_dpaa2_create_dpcon_device(int dev_fd __rte_unused,
return 0;
}
+RTE_EXPORT_INTERNAL_SYMBOL(rte_dpaa2_alloc_dpcon_dev)
struct dpaa2_dpcon_dev *rte_dpaa2_alloc_dpcon_dev(void)
{
struct dpaa2_dpcon_dev *dpcon_dev = NULL;
@@ -105,6 +102,7 @@ struct dpaa2_dpcon_dev *rte_dpaa2_alloc_dpcon_dev(void)
return dpcon_dev;
}
+RTE_EXPORT_INTERNAL_SYMBOL(rte_dpaa2_free_dpcon_dev)
void rte_dpaa2_free_dpcon_dev(struct dpaa2_dpcon_dev *dpcon)
{
struct dpaa2_dpcon_dev *dpcon_dev = NULL;
diff --git a/drivers/bus/fslmc/portal/dpaa2_hw_pvt.h b/drivers/bus/fslmc/portal/dpaa2_hw_pvt.h
index e625a5c035..79a2ec41e3 100644
--- a/drivers/bus/fslmc/portal/dpaa2_hw_pvt.h
+++ b/drivers/bus/fslmc/portal/dpaa2_hw_pvt.h
@@ -274,6 +274,14 @@ struct dpaa2_dpcon_dev {
uint8_t channel_index;
};
+/* DPCON channel allocation -- managed by the fslmc bus so both the net
+ * NAPI/DQRR rx path and the event PMD can grab channels.
+ */
+__rte_internal
+struct dpaa2_dpcon_dev *rte_dpaa2_alloc_dpcon_dev(void);
+__rte_internal
+void rte_dpaa2_free_dpcon_dev(struct dpaa2_dpcon_dev *dpcon);
+
/* Refer to Table 7-3 in SEC BG */
#define QBMAN_FLE_WORD4_FMT_SBF 0x0 /* Single buffer frame */
#define QBMAN_FLE_WORD4_FMT_SGE 0x2 /* Scatter gather frame */
diff --git a/drivers/event/dpaa2/dpaa2_eventdev.h b/drivers/event/dpaa2/dpaa2_eventdev.h
index bb87bdbab2..f53efce61c 100644
--- a/drivers/event/dpaa2/dpaa2_eventdev.h
+++ b/drivers/event/dpaa2/dpaa2_eventdev.h
@@ -85,8 +85,9 @@ struct dpaa2_eventdev {
uint32_t event_dev_cfg;
};
-struct dpaa2_dpcon_dev *rte_dpaa2_alloc_dpcon_dev(void);
-void rte_dpaa2_free_dpcon_dev(struct dpaa2_dpcon_dev *dpcon);
+/* rte_dpaa2_alloc_dpcon_dev()/rte_dpaa2_free_dpcon_dev() now live in the fslmc
+ * bus (portal/dpaa2_hw_pvt.h), which this header's includers already pull in.
+ */
int test_eventdev_dpaa2(void);
diff --git a/drivers/event/dpaa2/meson.build b/drivers/event/dpaa2/meson.build
index dd5063af43..62b8507652 100644
--- a/drivers/event/dpaa2/meson.build
+++ b/drivers/event/dpaa2/meson.build
@@ -7,7 +7,6 @@ if not is_linux
endif
deps += ['bus_vdev', 'net_dpaa2', 'crypto_dpaa2_sec']
sources = files(
- 'dpaa2_hw_dpcon.c',
'dpaa2_eventdev.c',
'dpaa2_eventdev_selftest.c',
)
--
2.43.0
^ permalink raw reply related
* [PATCH 2/9] eal/interrupts: keep real errno on epoll error
From: Maxime Leroy @ 2026-06-11 15:49 UTC (permalink / raw)
To: hemant.agrawal, sachin.saxena
Cc: dev, Maxime Leroy, stable, Harman Kalra, Cunming Liang
In-Reply-To: <20260611154926.392670-1-maxime@leroys.fr>
Some interrupt users have several vectors backed by the same eventfd
(e.g. several Rx queues behind one DPAA2 portal eventfd). Adding the
second vector to the same epoll instance then fails with EEXIST.
Upper layers such as ethdev and bbdev already treat -EEXIST as a
non-fatal duplicate registration (if (ret && ret != -EEXIST)), but
rte_intr_rx_ctl() lost that information: rte_epoll_ctl() returned -1 and
rte_intr_rx_ctl() flattened every failure to -EPERM.
Return the negative errno from rte_epoll_ctl() (its documented contract
is already "a negative value") and stop rte_intr_rx_ctl() from
flattening errors to -EPERM, so EEXIST reaches the upper layers that
already handle it; other failures carry their real errno.
Fixes: 9efe9c6cdcac ("eal/linux: add epoll wrappers")
Fixes: c9f3ec1a0f3f ("eal/linux: add Rx interrupt control function")
Cc: stable@dpdk.org
Signed-off-by: Maxime Leroy <maxime@leroys.fr>
---
lib/eal/include/rte_epoll.h | 3 ++-
lib/eal/linux/eal_interrupts.c | 18 +++++++++++-------
2 files changed, 13 insertions(+), 8 deletions(-)
diff --git a/lib/eal/include/rte_epoll.h b/lib/eal/include/rte_epoll.h
index ae0cf20853..0c7b510563 100644
--- a/lib/eal/include/rte_epoll.h
+++ b/lib/eal/include/rte_epoll.h
@@ -104,7 +104,8 @@ rte_epoll_wait_interruptible(int epfd, struct rte_epoll_event *events,
* Note: The caller must take care the object deletion after CTL_DEL.
* @return
* - On success, zero.
- * - On failure, a negative value.
+ * - On failure, a negative errno value, e.g. -EEXIST if the fd is already
+ * registered on the epoll instance (a fd shared between vectors).
*/
int
rte_epoll_ctl(int epfd, int op, int fd,
diff --git a/lib/eal/linux/eal_interrupts.c b/lib/eal/linux/eal_interrupts.c
index 5d0607effe..4cfaeba7fe 100644
--- a/lib/eal/linux/eal_interrupts.c
+++ b/lib/eal/linux/eal_interrupts.c
@@ -1443,7 +1443,7 @@ rte_epoll_ctl(int epfd, int op, int fd,
if (!event) {
EAL_LOG(ERR, "rte_epoll_event can't be NULL");
- return -1;
+ return -EINVAL;
}
/* using per thread epoll fd */
@@ -1460,13 +1460,21 @@ rte_epoll_ctl(int epfd, int op, int fd,
ev.events = event->epdata.event;
if (epoll_ctl(epfd, op, fd, &ev) < 0) {
+ int err = errno;
+
+ /* the fd is already in the set (e.g. shared across vectors):
+ * keep the event valid and report -EEXIST, not a hard error.
+ */
+ if (op == EPOLL_CTL_ADD && err == EEXIST)
+ return -EEXIST;
+
EAL_LOG(ERR, "Error op %d fd %d epoll_ctl, %s",
- op, fd, strerror(errno));
+ op, fd, strerror(err));
if (op == EPOLL_CTL_ADD)
/* rollback status when CTL_ADD fail */
rte_atomic_store_explicit(&event->status, RTE_EPOLL_INVALID,
rte_memory_order_relaxed);
- return -1;
+ return -err;
}
if (op == EPOLL_CTL_DEL && rte_atomic_load_explicit(&event->status,
@@ -1518,8 +1526,6 @@ rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd,
EAL_LOG(DEBUG,
"efd %d associated with vec %d added on epfd %d",
rev->fd, vec, epfd);
- else
- rc = -EPERM;
break;
case RTE_INTR_EVENT_DEL:
epfd_op = EPOLL_CTL_DEL;
@@ -1531,8 +1537,6 @@ rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd,
}
rc = rte_epoll_ctl(rev->epfd, epfd_op, rev->fd, rev);
- if (rc)
- rc = -EPERM;
break;
default:
EAL_LOG(ERR, "event op type mismatch");
--
2.43.0
^ permalink raw reply related
* [PATCH 1/9] net/dpaa2: implement RSS RETA query and update
From: Maxime Leroy @ 2026-06-11 15:49 UTC (permalink / raw)
To: hemant.agrawal, sachin.saxena; +Cc: dev, Maxime Leroy
In-Reply-To: <20260611154926.392670-1-maxime@leroys.fr>
DPAA2 dispatches RX frames to FQs using 'queue_id = hash % dist_size',
where dist_size is set per-TC via the dpni_set_rx_hash_dist MC command.
There is no software-visible indirection table, so the standard DPDK
RETA API has never been exposed by this PMD.
Implement reta_update / reta_query as an emulation on top of
dpni_set_rx_hash_dist. The emulation accepts only the uniform pattern
'reta[i] = i % N' for some N in the HW-allowed set (1, 2, 3, 4, 6, 7,
8, 12, 14, 16, 24, ...). Non-uniform or weighted patterns are rejected
with -ENOTSUP, as the HW has no arbitrary indirection table.
Changing N sets the size of the contiguous queue subset that RSS
spreads traffic over; the queues above N are left out of the hash
distribution. This covers the patterns that matter here, e.g. growing
or shrinking the active subset to scale CPU cores with load, or
reserving the upper queues for specific traffic that rte_flow steers
there for dedicated polling or QoS handling on its own core.
Refactor the existing dpaa2_setup_flow_dist() to delegate to a new
helper dpaa2_setup_flow_dist_size() that takes the dist_size explicitly
and caches it in priv->dist_size_cur[tc] so reta_query() can report it.
reta_query() returns reta[i] = i % N: this is representative, not
bit-exact, as the HW maps the hash to a queue through its distribution
size encoding rather than a plain modulo. reta_update() takes the RSS
hash set from dev_conf (rx_adv_conf.rss_conf.rss_hf); a prior
rss_hash_update() with a different hf is not re-read.
The advertised reta_size is 64 (one rte_eth_rss_reta_entry64 group), the
smallest legal value and enough for all HW-permitted N values up to 64.
Signed-off-by: Maxime Leroy <maxime@leroys.fr>
---
doc/guides/nics/features/dpaa2.ini | 1 +
doc/guides/rel_notes/release_26_07.rst | 4 +
drivers/net/dpaa2/base/dpaa2_hw_dpni.c | 34 ++--
drivers/net/dpaa2/dpaa2_ethdev.c | 205 +++++++++++++++++++++++++
drivers/net/dpaa2/dpaa2_ethdev.h | 9 ++
5 files changed, 244 insertions(+), 9 deletions(-)
diff --git a/doc/guides/nics/features/dpaa2.ini b/doc/guides/nics/features/dpaa2.ini
index 5f9c587847..5def653d1d 100644
--- a/doc/guides/nics/features/dpaa2.ini
+++ b/doc/guides/nics/features/dpaa2.ini
@@ -15,6 +15,7 @@ Promiscuous mode = Y
Allmulticast mode = Y
Unicast MAC filter = Y
RSS hash = Y
+RSS reta update = Y
VLAN filter = Y
Flow control = Y
Traffic manager = Y
diff --git a/doc/guides/rel_notes/release_26_07.rst b/doc/guides/rel_notes/release_26_07.rst
index b5285af5fe..103c4034ca 100644
--- a/doc/guides/rel_notes/release_26_07.rst
+++ b/doc/guides/rel_notes/release_26_07.rst
@@ -126,6 +126,10 @@ New Features
* Added support for selective Rx in scalar SPRQ Rx path.
+* **Updated NXP dpaa2 driver.**
+
+ * Added RSS RETA query and update support.
+
* **Updated PCAP ethernet driver.**
* Added support for VLAN insertion and stripping.
diff --git a/drivers/net/dpaa2/base/dpaa2_hw_dpni.c b/drivers/net/dpaa2/base/dpaa2_hw_dpni.c
index 13825046d8..4cbc890cee 100644
--- a/drivers/net/dpaa2/base/dpaa2_hw_dpni.c
+++ b/drivers/net/dpaa2/base/dpaa2_hw_dpni.c
@@ -103,15 +103,10 @@ dpaa2_setup_flow_dist(struct rte_eth_dev *eth_dev,
uint64_t req_dist_set, int tc_index)
{
struct dpaa2_dev_priv *priv = eth_dev->data->dev_private;
- struct fsl_mc_io *dpni = eth_dev->process_private;
- struct dpni_rx_dist_cfg tc_cfg;
- struct dpkg_profile_cfg kg_cfg;
- void *p_params;
- int ret, tc_dist_queues;
+ int tc_dist_queues;
- /*TC distribution size is set with dist_queues or
- * nb_rx_queues % dist_queues in order of TC priority index.
- * Calculating dist size for this tc_index:-
+ /* TC distribution size is set with dist_queues or
+ * (nb_rx_queues - tc_index*dist_queues) in order of TC priority index.
*/
tc_dist_queues = eth_dev->data->nb_rx_queues -
tc_index * priv->dist_queues;
@@ -123,6 +118,24 @@ dpaa2_setup_flow_dist(struct rte_eth_dev *eth_dev,
if (tc_dist_queues > priv->dist_queues)
tc_dist_queues = priv->dist_queues;
+ return dpaa2_setup_flow_dist_size(eth_dev, req_dist_set,
+ tc_index, tc_dist_queues);
+}
+
+int
+dpaa2_setup_flow_dist_size(struct rte_eth_dev *eth_dev,
+ uint64_t req_dist_set, int tc_index, uint16_t dist_size)
+{
+ struct dpaa2_dev_priv *priv = eth_dev->data->dev_private;
+ struct fsl_mc_io *dpni = eth_dev->process_private;
+ struct dpni_rx_dist_cfg tc_cfg;
+ struct dpkg_profile_cfg kg_cfg;
+ void *p_params;
+ int ret;
+
+ if (dist_size == 0)
+ return 0;
+
p_params = rte_malloc(NULL,
DIST_PARAM_IOVA_SIZE, RTE_CACHE_LINE_SIZE);
if (!p_params) {
@@ -150,7 +163,7 @@ dpaa2_setup_flow_dist(struct rte_eth_dev *eth_dev,
return -ENOBUFS;
}
- tc_cfg.dist_size = tc_dist_queues;
+ tc_cfg.dist_size = dist_size;
tc_cfg.enable = true;
tc_cfg.tc = tc_index;
@@ -168,6 +181,9 @@ dpaa2_setup_flow_dist(struct rte_eth_dev *eth_dev,
return ret;
}
+ if (tc_index < MAX_TCS)
+ priv->dist_size_cur[tc_index] = dist_size;
+
return 0;
}
diff --git a/drivers/net/dpaa2/dpaa2_ethdev.c b/drivers/net/dpaa2/dpaa2_ethdev.c
index 803a8321e0..8589398324 100644
--- a/drivers/net/dpaa2/dpaa2_ethdev.c
+++ b/drivers/net/dpaa2/dpaa2_ethdev.c
@@ -80,6 +80,33 @@ bool dpaa2_print_parser_result;
#define MAX_NB_RX_DESC_IN_PEB 11264
static int total_nb_rx_desc;
+/* Size of the RETA (Redirection Table) we expose to the standard DPDK API.
+ * Must be a multiple of RTE_ETH_RETA_GROUP_SIZE (64). DPAA2 has no actual
+ * indirection table in HW; this is the granularity at which uniform RSS
+ * patterns are inspected by dpaa2_dev_rss_reta_update().
+ */
+#define DPAA2_RETA_SIZE 64
+
+/* Values of dist_size accepted by the DPNI 'dpni_set_rx_hash_dist' MC command.
+ * Source: fsl_dpni.h, "struct dpni_rx_dist_cfg::dist_size" documentation.
+ * Used by dpaa2_dev_rss_reta_update() to validate user-requested patterns.
+ */
+static const uint16_t dpaa2_dist_size_allowed[] = {
+ 1, 2, 3, 4, 6, 7, 8, 12, 14, 16, 24, 28, 32, 48, 56, 64,
+ 96, 112, 128, 192, 224, 256, 384, 448, 512, 768, 896, 1024,
+};
+
+static bool
+dpaa2_dist_size_is_supported(uint16_t n)
+{
+ size_t i;
+ for (i = 0; i < RTE_DIM(dpaa2_dist_size_allowed); i++) {
+ if (dpaa2_dist_size_allowed[i] == n)
+ return true;
+ }
+ return false;
+}
+
int dpaa2_valid_dev;
struct rte_mempool *dpaa2_tx_sg_pool;
@@ -425,6 +452,14 @@ dpaa2_dev_info_get(struct rte_eth_dev *dev,
dev_info->max_vfs = 0;
dev_info->max_vmdq_pools = RTE_ETH_16_POOLS;
dev_info->flow_type_rss_offloads = DPAA2_RSS_OFFLOAD_ALL;
+ /* DPAA2 has no software-visible indirection table: incoming packets are
+ * dispatched to FQs via 'queue_id = hash % dist_size'. We expose the
+ * standard RETA API as an emulation that only accepts uniform patterns
+ * 'reta[i] = i % N' and translates them into a dpni_set_rx_hash_dist
+ * command with dist_size=N. See dpaa2_dev_rss_reta_update().
+ */
+ dev_info->reta_size = DPAA2_RETA_SIZE;
+ dev_info->hash_key_size = 0;
dev_info->default_rxportconf.burst_size = dpaa2_dqrr_size;
/* same is rx size for best perf */
@@ -2508,6 +2543,174 @@ dpaa2_dev_rss_hash_conf_get(struct rte_eth_dev *dev,
return 0;
}
+/* Emulation of the standard DPDK RETA API on top of DPAA2's
+ * dpni_set_rx_hash_dist MC command.
+ *
+ * DPAA2 hardware dispatches incoming frames using 'queue_id = hash % dist_size'
+ * (no software-visible indirection table). To expose the standard
+ * rte_eth_dev_rss_reta_update() interface, we accept ONLY uniform patterns of
+ * the form 'reta[i] = i % N' where N is in the HW-allowed dist_size list. Any
+ * other pattern (weighted RSS, non-contiguous queue IDs, gaps) is rejected
+ * with -ENOTSUP. This is enough to support dynamic RSS scale-up/down across
+ * a contiguous queue subset, which is the main use case for adaptive
+ * dataplane CPU usage.
+ *
+ * Applies the new dist_size on every configured RX TC, mirroring the
+ * behavior of dpaa2_dev_rss_hash_update().
+ */
+static int
+dpaa2_dev_rss_reta_update(struct rte_eth_dev *dev,
+ struct rte_eth_rss_reta_entry64 *reta_conf,
+ uint16_t reta_size)
+{
+ struct dpaa2_dev_priv *priv = dev->data->dev_private;
+ struct rte_eth_conf *eth_conf = &dev->data->dev_conf;
+ uint16_t i, max_q = 0, n;
+ int tc_index, ret;
+ bool any_set = false;
+
+ PMD_INIT_FUNC_TRACE();
+
+ if (reta_size != DPAA2_RETA_SIZE) {
+ DPAA2_PMD_ERR("Invalid reta_size %u (expected %u)",
+ reta_size, DPAA2_RETA_SIZE);
+ return -EINVAL;
+ }
+
+ /* dpaa2 cannot merge a partial RETA into the live table, so only a
+ * full update (every entry of every group) is accepted.
+ */
+ for (i = 0; i < reta_size / RTE_ETH_RETA_GROUP_SIZE; i++) {
+ if (reta_conf[i].mask != UINT64_MAX) {
+ DPAA2_PMD_ERR("partial RETA update not supported; set all %u entries",
+ DPAA2_RETA_SIZE);
+ return -ENOTSUP;
+ }
+ }
+
+ /* First pass: validate queue IDs, find max, and require at least
+ * one slot to be selected via the per-group mask.
+ */
+ for (i = 0; i < reta_size; i++) {
+ uint16_t grp = i / RTE_ETH_RETA_GROUP_SIZE;
+ uint16_t pos = i % RTE_ETH_RETA_GROUP_SIZE;
+ uint16_t q;
+
+ if (!(reta_conf[grp].mask & (1ULL << pos)))
+ continue;
+ any_set = true;
+
+ q = reta_conf[grp].reta[pos];
+ if (q >= dev->data->nb_rx_queues) {
+ DPAA2_PMD_ERR(
+ "reta[%u] = %u out of range (max %u)",
+ i, q, dev->data->nb_rx_queues - 1);
+ return -EINVAL;
+ }
+ if (q > max_q)
+ max_q = q;
+ }
+
+ if (!any_set) {
+ DPAA2_PMD_WARN("reta_update called with empty mask, no-op");
+ return 0;
+ }
+
+ n = max_q + 1;
+
+ /* Second pass: enforce the uniform pattern reta[i] = i % n on every
+ * slot the user has selected. dpaa2 HW cannot honor any other layout.
+ */
+ for (i = 0; i < reta_size; i++) {
+ uint16_t grp = i / RTE_ETH_RETA_GROUP_SIZE;
+ uint16_t pos = i % RTE_ETH_RETA_GROUP_SIZE;
+ uint16_t expected = i % n;
+ uint16_t q;
+
+ if (!(reta_conf[grp].mask & (1ULL << pos)))
+ continue;
+
+ q = reta_conf[grp].reta[pos];
+ if (q != expected) {
+ DPAA2_PMD_ERR(
+ "Non-uniform RETA pattern at slot %u "
+ "(got queue %u, expected %u). dpaa2 HW "
+ "only supports queue_id = hash mod N with "
+ "contiguous queues 0..N-1.",
+ i, q, expected);
+ return -ENOTSUP;
+ }
+ }
+
+ if (!dpaa2_dist_size_is_supported(n)) {
+ DPAA2_PMD_ERR(
+ "dist_size %u not supported by HW. Allowed: "
+ "1,2,3,4,6,7,8,12,14,16,24,28,32,48,56,64,...",
+ n);
+ return -ENOTSUP;
+ }
+
+ /* Apply on every configured RX TC, matching rss_hash_update behavior. */
+ for (tc_index = 0; tc_index < priv->num_rx_tc; tc_index++) {
+ ret = dpaa2_setup_flow_dist_size(dev,
+ eth_conf->rx_adv_conf.rss_conf.rss_hf,
+ tc_index, n);
+ if (ret) {
+ DPAA2_PMD_ERR(
+ "Failed to apply dist_size=%u on tc%d (err=%d)",
+ n, tc_index, ret);
+ return ret;
+ }
+ }
+
+ DPAA2_PMD_DEBUG("RETA updated: dist_size now %u on %u TC(s)",
+ n, priv->num_rx_tc);
+ return 0;
+}
+
+/* Synthesizes a RETA snapshot from the currently-active dist_size on TC 0.
+ * Since DPAA2 always uses uniform 'hash mod N' distribution, the returned
+ * RETA is reta[i] = i % dist_size_cur[0].
+ */
+static int
+dpaa2_dev_rss_reta_query(struct rte_eth_dev *dev,
+ struct rte_eth_rss_reta_entry64 *reta_conf,
+ uint16_t reta_size)
+{
+ struct dpaa2_dev_priv *priv = dev->data->dev_private;
+ uint16_t i, n;
+
+ PMD_INIT_FUNC_TRACE();
+
+ if (reta_size != DPAA2_RETA_SIZE) {
+ DPAA2_PMD_ERR("Invalid reta_size %u (expected %u)",
+ reta_size, DPAA2_RETA_SIZE);
+ return -EINVAL;
+ }
+
+ /* Use the cached dist_size on TC 0 (representative). Fall back to the
+ * default (nb_rx_queues clamped to dist_queues) when never programmed.
+ */
+ n = priv->dist_size_cur[0];
+ if (n == 0) {
+ n = priv->dist_queues;
+ if (n > dev->data->nb_rx_queues)
+ n = dev->data->nb_rx_queues;
+ }
+ if (n == 0)
+ return -EINVAL;
+
+ for (i = 0; i < reta_size; i++) {
+ uint16_t grp = i / RTE_ETH_RETA_GROUP_SIZE;
+ uint16_t pos = i % RTE_ETH_RETA_GROUP_SIZE;
+
+ if (reta_conf[grp].mask & (1ULL << pos))
+ reta_conf[grp].reta[pos] = i % n;
+ }
+
+ return 0;
+}
+
RTE_EXPORT_INTERNAL_SYMBOL(dpaa2_eth_eventq_attach)
int dpaa2_eth_eventq_attach(const struct rte_eth_dev *dev,
int eth_rx_queue_id,
@@ -2736,6 +2939,8 @@ static struct eth_dev_ops dpaa2_ethdev_ops = {
.mac_addr_set = dpaa2_dev_set_mac_addr,
.rss_hash_update = dpaa2_dev_rss_hash_update,
.rss_hash_conf_get = dpaa2_dev_rss_hash_conf_get,
+ .reta_update = dpaa2_dev_rss_reta_update,
+ .reta_query = dpaa2_dev_rss_reta_query,
.flow_ops_get = dpaa2_dev_flow_ops_get,
.rxq_info_get = dpaa2_rxq_info_get,
.txq_info_get = dpaa2_txq_info_get,
diff --git a/drivers/net/dpaa2/dpaa2_ethdev.h b/drivers/net/dpaa2/dpaa2_ethdev.h
index 4da47a543a..3f224c654e 100644
--- a/drivers/net/dpaa2/dpaa2_ethdev.h
+++ b/drivers/net/dpaa2/dpaa2_ethdev.h
@@ -412,6 +412,12 @@ struct dpaa2_dev_priv {
uint8_t max_cgs;
uint8_t cgid_in_use[MAX_RX_QUEUES];
+ /* Current hash distribution size per RX TC, written by
+ * dpaa2_setup_flow_dist_size() and read by reta_query / reta_update.
+ * Zero means "use default" (= nb_rx_queues clamped to dist_queues).
+ */
+ uint16_t dist_size_cur[MAX_TCS];
+
uint16_t dpni_ver_major;
uint16_t dpni_ver_minor;
uint32_t speed_capa;
@@ -468,6 +474,9 @@ int dpaa2_distset_to_dpkg_profile_cfg(uint64_t req_dist_set,
int dpaa2_setup_flow_dist(struct rte_eth_dev *eth_dev,
uint64_t req_dist_set, int tc_index);
+int dpaa2_setup_flow_dist_size(struct rte_eth_dev *eth_dev,
+ uint64_t req_dist_set, int tc_index, uint16_t dist_size);
+
int dpaa2_remove_flow_dist(struct rte_eth_dev *eth_dev,
uint8_t tc_index);
--
2.43.0
^ permalink raw reply related
* [PATCH 0/9] net/dpaa2: NAPI-style Rx queue interrupts
From: Maxime Leroy @ 2026-06-11 15:49 UTC (permalink / raw)
To: hemant.agrawal, sachin.saxena; +Cc: dev, Maxime Leroy
This series lets a dpaa2 worker sleep on a queue's data-availability
notification instead of busy-polling, exposed through the generic
rte_eth_dev_rx_intr_* API (NAPI-style: poll while frames keep coming,
arm the interrupt and sleep when the queue runs dry).
Why it is not a trivial .rx_queue_intr_enable
----------------------------------------------
A worker wakes on its software portal's DQRI, which fires when the
portal's DQRR holds frames. The default dpaa2 Rx burst pulls frames
from the FQ with a volatile dequeue and cannot be interrupt-driven; to
wake on the DQRI the FQ must instead be pushed to the portal's DQRR.
The natural dpni_set_queue with a notification destination would have to
target the worker's portal, but that portal is only known once a worker
affines, after dev_start, and that MC command holds the global MC lock
long enough to wedge the firmware while traffic runs. So the bind cannot
be done late, against the polling lcore.
Design
------
Each Rx FQ is bound to its own DPCON channel, statically, at dev_start
while the dpni is still disabled (no knowledge of the polling lcore). A
worker later subscribes its own ethrx portal to the channel and arms the
DQRI in rx_queue_intr_enable, a one-shot per-portal op, never the wedging
set_queue. One portal serves every queue a worker owns, so the DQRR
burst demuxes frames to their FQ by fqd_ctx; foreign frames are parked in
the target queue's stash, so the application polls all its queues after a
wakeup, the same scheduling contract as plain DPDK polling. A queue can
be re-homed to another lcore at runtime with no set_queue and no port
stop.
This reuses the event PMD's pushed/DQRR model but with one DPCON per FQ
and static affinity (no QBMan scheduling), so the DPCON allocator is
moved from the event driver to the fslmc bus and shared.
Patches 3 to 6 build the interrupt support proper, on top of three bug
fixes the path depends on and which it uncovered: patch 2 (eal, the
shared portal eventfd must not fail with -EEXIST), patch 7 (rx_queue_count
NULL on the primary process) and patch 8 (fast-path ops NULL after port
stop). They are real fixes, tagged for stable and backportable on their
own. Patches 1 (RSS RETA) and 9 (drop the software VLAN strip) are
independent net/dpaa2 changes the interrupt path does not require.
Tested on LX2160A (lx2160acex7).
Maxime Leroy (9):
net/dpaa2: implement RSS RETA query and update
eal/interrupts: keep real errno on epoll error
bus/fslmc: move DPCON management from event driver to bus
bus/fslmc/dpio: make the portal DQRI epoll optional
net/dpaa2: support Rx queue interrupts
bus/fslmc/dpio: tune DQRI interrupt coalescing holdoff
net/dpaa2: fix Rx queue count for primary process
ethdev: keep fast-path ops valid after port stop
net/dpaa2: drop the fake software VLAN strip offload
doc/guides/nics/dpaa2.rst | 10 +
doc/guides/nics/features/dpaa2.ini | 2 +
doc/guides/rel_notes/release_26_07.rst | 8 +
drivers/bus/fslmc/meson.build | 1 +
.../fslmc/portal}/dpaa2_hw_dpcon.c | 16 +-
drivers/bus/fslmc/portal/dpaa2_hw_dpio.c | 113 +++-
drivers/bus/fslmc/portal/dpaa2_hw_dpio.h | 12 +
drivers/bus/fslmc/portal/dpaa2_hw_pvt.h | 35 +-
.../fslmc/qbman/include/fsl_qbman_portal.h | 9 +
drivers/bus/fslmc/qbman/qbman_portal.c | 7 +
drivers/event/dpaa2/dpaa2_eventdev.h | 5 +-
drivers/event/dpaa2/meson.build | 1 -
drivers/net/dpaa2/base/dpaa2_hw_dpni.c | 34 +-
drivers/net/dpaa2/dpaa2_ethdev.c | 556 +++++++++++++++++-
drivers/net/dpaa2/dpaa2_ethdev.h | 19 +
drivers/net/dpaa2/dpaa2_rxtx.c | 123 +++-
lib/eal/include/rte_epoll.h | 3 +-
lib/eal/linux/eal_interrupts.c | 18 +-
lib/ethdev/ethdev_private.c | 7 +
19 files changed, 908 insertions(+), 71 deletions(-)
rename drivers/{event/dpaa2 => bus/fslmc/portal}/dpaa2_hw_dpcon.c (90%)
--
2.43.0
^ permalink raw reply
* Re: [PATCH v2 01/22] net/cnxk: update mbuf next field for multi segment
From: Stephen Hemminger @ 2026-06-11 15:26 UTC (permalink / raw)
To: Rahul Bhansali
Cc: dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
Satha Rao, Harman Kalra, jerinj
In-Reply-To: <20260611142029.3351415-1-rbhansali@marvell.com>
On Thu, 11 Jun 2026 19:50:08 +0530
Rahul Bhansali <rbhansali@marvell.com> wrote:
> As per the requirement of rte_mbuf_raw_reset_bulk(), the mbuf's
> 'next' and 'nb_segs' fields are required to be reset.
> This reset these field for multi-segment mbufs on cn9k platform.
>
> Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
Please put a cover letter on large multi-patch series in future.
The CI AI review doesn't look at original source and uses cost optimized
model (ie. is not that smart). Did a UI based review and it saw:
Reviewed the v2 series. Three issues, rest look good.
[06/22] net/cnxk: reserve memory for lookup mem at probe
Error: error path returns success. At the new check rc is still 0 from
the prior successful roc_nix_dev_init(). When
cnxk_nix_fastpath_lookup_mem_get() returns NULL, "goto dev_fini" falls
through to "return rc" with rc == 0, so dev init reports success and the
rc=%d message prints 0. Set rc before the jump:
if (!cnxk_nix_fastpath_lookup_mem_get()) {
plt_err("Failed to reserve lookup memory");
rc = -ENOMEM;
goto dev_fini;
}
[07/22] drivers: add support for devargs skip size
Warning: shared memzone freed while other ports still use it.
SKIP_SIZE_PKIND_MEMZONE is a single global memzone created once (lookup-
guarded) in roc_npc_init(), but roc_npc_fini() frees it unconditionally.
On a multi-port device the first port closed tears down the table the
other ports still read in roc_npc_skip_size_pkind_get() during inbound SA
creation; the lookup then returns NULL and skip-size pkind selection
silently stops working for the surviving ports. Refcount the memzone or
tie its lifetime to the common/inline layer instead of per-NPC fini.
[19/22] net/cnxk: add FEC get set and capability ops
Warning: feature not reflected in the features matrix. features.rst maps
the FEC feature to fec_get_capability/fec_get/fec_set, which this patch
implements, but doc/guides/nics/features/cnxk.ini is not updated with
"FEC = Y". The ops return NOTSUP on VF/SDP, so cnxk_vf.ini is correct as-
is. Add the matrix entry.
Note on [16/22]: the changes are good. Moving cpt_cq_ena inside the
"if (idev && idev->nix_inl_dev)" block fixes a NULL deref of inl_dev, and
cpt_cq_ena is initialized to 0 so the fall-through default is correct.
The roc_dev.c / roc_ree.c error-path rework fixes real leaks and wrong-
success returns.
Other patches reviewed with no issues.
^ permalink raw reply
* [PATCH v8 18/18] vfio: introduce cdev mode
From: Anatoly Burakov @ 2026-06-11 15:09 UTC (permalink / raw)
To: dev, Bruce Richardson
In-Reply-To: <cover.1781190151.git.anatoly.burakov@intel.com>
Add support for VFIO cdev (also known as IOMMUFD) API. The group API is now
considered legacy in the kernel, and all further development is expected to
happen in IOMMUFD infrastructure.
To assist any future use of VFIO cdev mode for custom behavior, also
introduce "get device number" API, which is kind-of-but-not-really similar
to the concept of IOMMU group.
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
lib/eal/freebsd/eal.c | 10 +
lib/eal/include/rte_vfio.h | 31 +++
lib/eal/linux/eal_vfio.c | 210 +++++++++++++++++
lib/eal/linux/eal_vfio.h | 29 ++-
lib/eal/linux/eal_vfio_cdev.c | 390 +++++++++++++++++++++++++++++++
lib/eal/linux/eal_vfio_mp_sync.c | 42 ++++
lib/eal/linux/meson.build | 1 +
7 files changed, 711 insertions(+), 2 deletions(-)
create mode 100644 lib/eal/linux/eal_vfio_cdev.c
diff --git a/lib/eal/freebsd/eal.c b/lib/eal/freebsd/eal.c
index bb05a969a9..a84280a66c 100644
--- a/lib/eal/freebsd/eal.c
+++ b/lib/eal/freebsd/eal.c
@@ -931,3 +931,13 @@ rte_vfio_get_mode(void)
{
return RTE_VFIO_MODE_NONE;
}
+
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_get_device_num)
+int
+rte_vfio_get_device_num(__rte_unused const char *sysfs_base,
+ __rte_unused const char *dev_addr,
+ __rte_unused int *vfio_device_num)
+{
+ rte_errno = ENOTSUP;
+ return -1;
+}
diff --git a/lib/eal/include/rte_vfio.h b/lib/eal/include/rte_vfio.h
index c4ba0e5cda..502a68c948 100644
--- a/lib/eal/include/rte_vfio.h
+++ b/lib/eal/include/rte_vfio.h
@@ -28,6 +28,8 @@ extern "C" {
#define RTE_VFIO_DIR "/dev/vfio"
#define RTE_VFIO_CONTAINER_PATH "/dev/vfio/vfio"
+#define RTE_VFIO_IOMMUFD_PATH "/dev/iommu"
+#define RTE_VFIO_CDEV_DEVICES_PATH "/dev/vfio/devices"
#define RTE_VFIO_GROUP_FMT "/dev/vfio/%u"
#define RTE_VFIO_NOIOMMU_GROUP_FMT "/dev/vfio/noiommu-%u"
#define RTE_VFIO_NOIOMMU_MODE "/sys/module/vfio/parameters/enable_unsafe_noiommu_mode"
@@ -48,11 +50,13 @@ struct vfio_device_info;
* - RTE_VFIO_MODE_NONE: VFIO is not enabled.
* - RTE_VFIO_MODE_GROUP: Legacy group mode.
* - RTE_VFIO_MODE_NOIOMMU: Unsafe no-IOMMU mode.
+ * - RTE_VFIO_MODE_CDEV: Character device mode.
*/
enum rte_vfio_mode {
RTE_VFIO_MODE_NONE = 0, /**< VFIO not enabled */
RTE_VFIO_MODE_GROUP, /**< Group mode */
RTE_VFIO_MODE_NOIOMMU, /**< Group mode with no IOMMU protection */
+ RTE_VFIO_MODE_CDEV, /**< Device mode */
};
/**
@@ -197,6 +201,33 @@ __rte_internal
int
rte_vfio_get_group_num(const char *sysfs_base, const char *dev_addr, int *iommu_group_num);
+/**
+ * @internal
+ * Parse VFIO cdev device number for a device.
+ *
+ * This function is only relevant on Linux in cdev mode.
+ *
+ * @param sysfs_base
+ * Sysfs path prefix.
+ * @param dev_addr
+ * Device identifier.
+ * @param vfio_device_num
+ * Pointer to where VFIO cdev device number will be stored.
+ *
+ * @return
+ * 0 on success.
+ * <0 on failure, rte_errno is set.
+ *
+ * Possible rte_errno values include:
+ * - ENODEV - Device not managed by VFIO.
+ * - EINVAL - Invalid parameters.
+ * - ENXIO - VFIO support not initialized.
+ * - ENOTSUP - Unsupported VFIO mode.
+ */
+__rte_internal
+int
+rte_vfio_get_device_num(const char *sysfs_base, const char *dev_addr, int *vfio_device_num);
+
/**
* @internal
* Get device information.
diff --git a/lib/eal/linux/eal_vfio.c b/lib/eal/linux/eal_vfio.c
index c104008a43..004ee48cf5 100644
--- a/lib/eal/linux/eal_vfio.c
+++ b/lib/eal/linux/eal_vfio.c
@@ -348,6 +348,20 @@ vfio_container_get_by_group_num(int group_num)
return NULL;
}
+static struct container *
+vfio_container_get_by_dev_num(int dev_num)
+{
+ struct container *cfg;
+ struct vfio_device *dev;
+
+ CONTAINER_FOREACH_ACTIVE(cfg) {
+ DEVICE_FOREACH_ACTIVE(cfg, dev)
+ if (dev->dev_num == dev_num)
+ return cfg;
+ }
+ return NULL;
+}
+
static struct container *
vfio_container_create(void)
{
@@ -517,6 +531,55 @@ vfio_setup_dma_mem(struct container *cfg)
return 0;
}
+static enum vfio_result
+vfio_cdev_assign_device(struct container *cfg, const char *sysfs_base,
+ const char *dev_addr, struct vfio_device **out_dev)
+{
+ struct vfio_device *dev, *found_dev;
+ enum vfio_result res;
+ int dev_num, ret;
+
+ /* get the cdev device number from sysfs */
+ ret = vfio_cdev_get_device_num(sysfs_base, dev_addr, &dev_num);
+ if (ret < 0) {
+ EAL_LOG(ERR, "Failed to get cdev device number for %s", dev_addr);
+ return VFIO_ERROR;
+ } else if (ret == 0) {
+ EAL_LOG(ERR, "Device %s not bound to vfio-pci cdev", dev_addr);
+ return VFIO_NOT_MANAGED;
+ }
+
+ /* do we already have this device? */
+ found_dev = vfio_cdev_get_dev_by_num(cfg, dev_num);
+ if (found_dev != NULL) {
+ EAL_LOG(ERR, "Device %s already assigned to this container", dev_addr);
+ *out_dev = found_dev;
+ return VFIO_EXISTS;
+ }
+ /* create new device structure */
+ dev = vfio_device_create(cfg);
+ if (dev == NULL) {
+ EAL_LOG(ERR, "No space to track new VFIO cdev device");
+ return VFIO_NO_SPACE;
+ }
+ /* store device number */
+ dev->dev_num = dev_num;
+
+ /* set up our device now and store it in config */
+ ret = vfio_cdev_setup_device(cfg, dev);
+ if (ret < 0) {
+ EAL_LOG(ERR, "Cannot setup cdev device %s", dev_addr);
+ res = VFIO_ERROR;
+ goto err;
+ }
+ *out_dev = dev;
+ return VFIO_SUCCESS;
+
+err:
+ vfio_device_erase(cfg, dev);
+ return res;
+}
+
static enum vfio_result
vfio_group_assign_device(struct container *cfg, const char *sysfs_base,
const char *dev_addr, struct vfio_device **out_dev)
@@ -663,6 +726,49 @@ rte_vfio_container_assign_device(int container_fd, const char *sysfs_base, const
return -1;
}
+ /*
+ * The device-to-container assignment is a complex problem to solve, for the following
+ * reasons:
+ *
+ * 1. PCI infrastructure is decoupled from VFIO, so PCI does not know anything about VFIO
+ *
+ * This means that while 99% of VFIO usage is PCI-related, we cannot communicate to PCI that
+ * we want to map a particular device using a particular container. Previously, this was
+ * achieved using back-channel communication via IOMMU group binding, so that whenever PCI
+ * map actually happens, VFIO knows which container to use, so this is roughly the model we
+ * are going with.
+ *
+ * 2. VFIO cannot depend on PCI because VFIO is in EAL
+ *
+ * We cannot "assign" a PCI device to container using rte_pci_device pointer because VFIO
+ * cannot depend on PCI definitions, nor can't we even assume that our device is in fact a
+ * PCI device, even though in practice this is true (at the time of this writing, FSLMC is
+ * the only bus doing non-PCI VFIO mappings, but FSLMC manages all VFIO infrastructure by
+ * itself, so in practice even counting FSLMC bus, we're always dealing with PCI devices).
+ *
+ * 3. The "assignment" means different things for group and cdev mode
+ *
+ * In group mode, to "bind" a device to a specific container, it is enough to bind its
+ * IOMMU group, so that when rte_vfio_setup_device() is called, we simply retrieve already
+ * existing group, and through that we figure out which container to use.
+ *
+ * For cdev mode, there are no "groups", so "assignment" either means we store some kind of
+ * uniquely identifying token (such as device number, or an opaque pointer), or we simply
+ * open the device straight away, and when rte_vfio_setup_device() comes we simply return
+ * the fd that was already opened at assign.
+ *
+ * Doing it the latter way (opening the device at assign for both group and cdev modes)
+ * actually solves all of these problems, so that's what we're going to do - the device
+ * setup API call will actually just assign the device to default container, while release
+ * will automatically cleanup and unassign anything that needs unassigned. There will be no
+ * "unassign" call, as it is not necessary.
+ *
+ * There is one downside for group mode when adding duplicate devices: to get to device fd,
+ * we need to go through the entire codepath before we arrive at fd only to realize it was
+ * already opened earlier, but this is acceptable compromise for unifying the API around
+ * device assignment.
+ */
+
if (vfio_cfg.mode == RTE_VFIO_MODE_NONE) {
EAL_LOG(ERR, "VFIO support not initialized");
rte_errno = ENXIO;
@@ -683,6 +789,9 @@ rte_vfio_container_assign_device(int container_fd, const char *sysfs_base, const
case RTE_VFIO_MODE_NOIOMMU:
res = vfio_group_assign_device(cfg, sysfs_base, dev_addr, &dev);
break;
+ case RTE_VFIO_MODE_CDEV:
+ res = vfio_cdev_assign_device(cfg, sysfs_base, dev_addr, &dev);
+ break;
default:
EAL_LOG(ERR, "Unsupported VFIO mode");
res = VFIO_NOT_SUPPORTED;
@@ -755,6 +864,24 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
res = vfio_group_assign_device(cfg, sysfs_base, dev_addr, &dev);
break;
}
+ case RTE_VFIO_MODE_CDEV:
+ {
+ int dev_num;
+
+ /* find device number */
+ ret = vfio_cdev_get_device_num(sysfs_base, dev_addr, &dev_num);
+ if (ret < 0)
+ goto assign_fail;
+ else if (ret == 0)
+ goto not_managed;
+
+ cfg = vfio_container_get_by_dev_num(dev_num);
+ if (cfg == NULL)
+ cfg = vfio_cfg.default_cfg;
+
+ res = vfio_cdev_assign_device(cfg, sysfs_base, dev_addr, &dev);
+ break;
+ }
default:
EAL_LOG(ERR, "Unsupported VFIO mode");
rte_errno = ENOTSUP;
@@ -874,6 +1001,12 @@ rte_vfio_release_device(const char *sysfs_base __rte_unused,
}
break;
}
+ case RTE_VFIO_MODE_CDEV:
+ {
+ /* for cdev, just erase the device and we're done */
+ vfio_device_erase(cfg, dev);
+ break;
+ }
default:
EAL_LOG(ERR, "Unsupported VFIO mode");
rte_errno = ENOTSUP;
@@ -939,6 +1072,9 @@ vfio_select_mode(void)
if (vfio_sync_mode(cfg, &mode) < 0)
goto err;
+ /* if primary is in cdev mode, we need to sync ioas as well */
+ if (mode == RTE_VFIO_MODE_CDEV && vfio_cdev_sync_ioas(cfg) < 0)
+ goto err;
/* primary handles DMA setup for default containers */
group_cfg->dma_setup_done = true;
@@ -958,6 +1094,19 @@ vfio_select_mode(void)
return RTE_VFIO_MODE_NOIOMMU;
return RTE_VFIO_MODE_GROUP;
}
+ EAL_LOG(DEBUG, "VFIO group mode not available, trying cdev mode...");
+ /* try cdev mode */
+ if (vfio_cdev_enable(cfg) == 0) {
+ if (vfio_cdev_setup_ioas(cfg) < 0)
+ goto err_mpsync;
+ if (vfio_setup_dma_mem(cfg) < 0)
+ goto err_mpsync;
+ if (vfio_register_mem_event_callback() < 0)
+ goto err_mpsync;
+
+ return RTE_VFIO_MODE_CDEV;
+ }
+ EAL_LOG(DEBUG, "VFIO cdev mode not available");
err_mpsync:
vfio_mp_sync_cleanup();
err:
@@ -972,6 +1121,7 @@ vfio_mode_to_str(enum rte_vfio_mode mode)
switch (mode) {
case RTE_VFIO_MODE_GROUP: return "group";
case RTE_VFIO_MODE_NOIOMMU: return "noiommu";
+ case RTE_VFIO_MODE_CDEV: return "cdev";
default: return "not initialized";
}
}
@@ -1111,6 +1261,40 @@ rte_vfio_get_group_num(const char *sysfs_base, const char *dev_addr, int *iommu_
return 0;
}
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_get_device_num)
+int
+rte_vfio_get_device_num(const char *sysfs_base, const char *dev_addr, int *device_num)
+{
+ int ret;
+
+ if (sysfs_base == NULL || dev_addr == NULL || device_num == NULL) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ if (vfio_cfg.mode == RTE_VFIO_MODE_NONE) {
+ EAL_LOG(ERR, "VFIO support not initialized");
+ rte_errno = ENXIO;
+ return -1;
+ }
+
+ if (vfio_cfg.mode != RTE_VFIO_MODE_CDEV) {
+ EAL_LOG(ERR, "VFIO not initialized in cdev mode");
+ rte_errno = ENOTSUP;
+ return -1;
+ }
+
+ ret = vfio_cdev_get_device_num(sysfs_base, dev_addr, device_num);
+ if (ret < 0) {
+ rte_errno = EINVAL;
+ return -1;
+ } else if (ret == 0) {
+ rte_errno = ENODEV;
+ return -1;
+ }
+ return 0;
+}
+
static int
vfio_dma_mem_map(struct container *cfg, uint64_t vaddr, uint64_t iova,
uint64_t len, int do_map)
@@ -1310,6 +1494,25 @@ rte_vfio_container_create(void)
cfg->container_fd = container_fd;
break;
}
+ case RTE_VFIO_MODE_CDEV:
+ {
+ /* Open new iommufd for custom container */
+ container_fd = vfio_cdev_get_iommufd();
+ if (container_fd < 0) {
+ EAL_LOG(ERR, "Cannot open iommufd for cdev container");
+ rte_errno = EIO;
+ goto err;
+ }
+ cfg->container_fd = container_fd;
+
+ /* Set up IOAS for this container */
+ if (vfio_cdev_setup_ioas(cfg) < 0) {
+ EAL_LOG(ERR, "Cannot setup IOAS for cdev container");
+ rte_errno = EIO;
+ goto err;
+ }
+ break;
+ }
default:
EAL_LOG(NOTICE, "Unsupported VFIO mode");
rte_errno = ENOTSUP;
@@ -1368,6 +1571,13 @@ rte_vfio_container_destroy(int container_fd)
vfio_group_erase(cfg, grp);
}
break;
+ case RTE_VFIO_MODE_CDEV:
+ /* erase all devices */
+ DEVICE_FOREACH_ACTIVE(cfg, dev) {
+ EAL_LOG(DEBUG, "Device vfio%d still open, closing", dev->dev_num);
+ vfio_device_erase(cfg, dev);
+ }
+ break;
default:
EAL_LOG(ERR, "Unsupported VFIO mode");
rte_errno = ENOTSUP;
diff --git a/lib/eal/linux/eal_vfio.h b/lib/eal/linux/eal_vfio.h
index 68d3a3ec6e..52cb3a0e08 100644
--- a/lib/eal/linux/eal_vfio.h
+++ b/lib/eal/linux/eal_vfio.h
@@ -48,7 +48,10 @@ struct vfio_group {
/* device tracking (common for group and cdev modes) */
struct vfio_device {
bool active;
- int group; /**< back-reference to group list (group mode) */
+ union {
+ int group; /**< back-reference to group list (group mode) */
+ int dev_num; /**< device number, e.g., X in /dev/vfio/devices/vfioX (cdev mode) */
+ };
int fd;
};
@@ -61,12 +64,20 @@ struct vfio_group_config {
struct vfio_group groups[RTE_MAX_VFIO_GROUPS];
};
+/* cdev mode specific configuration */
+struct vfio_cdev_config {
+ uint32_t ioas_id;
+};
+
/* per-container configuration */
struct container {
bool active;
int container_fd;
struct user_mem_maps mem_maps;
- struct vfio_group_config group_cfg;
+ union {
+ struct vfio_group_config group_cfg;
+ struct vfio_cdev_config cdev_cfg;
+ };
int n_devices;
struct vfio_device devices[RTE_MAX_VFIO_DEVICES];
};
@@ -160,12 +171,24 @@ int vfio_group_setup_iommu(struct container *cfg);
int vfio_group_setup_device_fd(const char *dev_addr,
struct vfio_group *grp, struct vfio_device *dev);
+/* cdev mode functions */
+int vfio_cdev_enable(struct container *cfg);
+int vfio_cdev_setup_ioas(struct container *cfg);
+int vfio_cdev_sync_ioas(struct container *cfg);
+int vfio_cdev_get_iommufd(void);
+int vfio_cdev_get_device_num(const char *sysfs_base, const char *dev_addr,
+ int *cdev_dev_num);
+struct vfio_device *vfio_cdev_get_dev_by_num(struct container *cfg, int cdev_dev_num);
+int vfio_cdev_setup_device(struct container *cfg, struct vfio_device *dev);
+
#define VFIO_MEM_EVENT_CLB_NAME "vfio_mem_event_clb"
#define EAL_VFIO_MP "eal_vfio_mp_sync"
#define SOCKET_REQ_CONTAINER 0x100
#define SOCKET_REQ_GROUP 0x200
#define SOCKET_REQ_IOMMU_TYPE 0x400
+#define SOCKET_REQ_CDEV 0x800
+#define SOCKET_REQ_IOAS_ID 0x1000
#define SOCKET_OK 0x0
#define SOCKET_NO_FD 0x1
#define SOCKET_ERR 0xFF
@@ -176,6 +199,8 @@ struct vfio_mp_param {
union {
int group_num;
int iommu_type_id;
+ int cdev_dev_num;
+ int ioas_id;
enum rte_vfio_mode mode;
};
};
diff --git a/lib/eal/linux/eal_vfio_cdev.c b/lib/eal/linux/eal_vfio_cdev.c
new file mode 100644
index 0000000000..ce61a97853
--- /dev/null
+++ b/lib/eal/linux/eal_vfio_cdev.c
@@ -0,0 +1,390 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2025 Intel Corporation
+ */
+
+#include <dirent.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+
+#include <uapi/linux/iommufd.h>
+#include <uapi/linux/vfio.h>
+
+#include <rte_log.h>
+#include <rte_errno.h>
+#include <rte_memory.h>
+#include <rte_string_fns.h>
+
+#include "eal_vfio.h"
+#include "eal_private.h"
+#include "eal_internal_cfg.h"
+
+static int vfio_cdev_dma_map(struct container *cfg);
+static int vfio_cdev_dma_mem_map(struct container *cfg, uint64_t vaddr,
+ uint64_t iova, uint64_t len, int do_map);
+
+/* IOMMUFD cdev mode IOMMU operations */
+static const struct vfio_iommu_ops iommufd_ops = {
+ .type_id = 0, /* cdev mode doesn't use type_id */
+ .name = "IOMMUFD",
+ .partial_unmap = false,
+ .dma_map_func = &vfio_cdev_dma_map,
+ .dma_user_map_func = &vfio_cdev_dma_mem_map
+};
+
+static int
+vfio_cdev_dma_mem_map(struct container *cfg, uint64_t vaddr, uint64_t iova,
+ uint64_t len, int do_map)
+{
+ struct iommu_ioas_map ioas_map;
+ struct iommu_ioas_unmap ioas_unmap;
+ int ret;
+
+ if (do_map != 0) {
+ memset(&ioas_map, 0, sizeof(ioas_map));
+ ioas_map.size = sizeof(struct iommu_ioas_map);
+ ioas_map.flags = IOMMU_IOAS_MAP_FIXED_IOVA |
+ IOMMU_IOAS_MAP_READABLE |
+ IOMMU_IOAS_MAP_WRITEABLE;
+ ioas_map.ioas_id = cfg->cdev_cfg.ioas_id;
+ ioas_map.user_va = vaddr;
+ ioas_map.length = len;
+ ioas_map.iova = iova;
+
+ ret = ioctl(cfg->container_fd, IOMMU_IOAS_MAP, &ioas_map);
+ if (ret) {
+ /**
+ * In case the mapping was already done EEXIST will be
+ * returned from kernel.
+ */
+ if (errno == EEXIST) {
+ EAL_LOG(DEBUG,
+ "Memory segment is already mapped, skipping");
+ } else {
+ EAL_LOG(ERR,
+ "Cannot set up DMA remapping, error "
+ "%i (%s)", errno, strerror(errno));
+ return -1;
+ }
+ }
+ } else {
+ memset(&ioas_unmap, 0, sizeof(ioas_unmap));
+ ioas_unmap.size = sizeof(struct iommu_ioas_unmap);
+ ioas_unmap.ioas_id = cfg->cdev_cfg.ioas_id;
+ ioas_unmap.length = len;
+ ioas_unmap.iova = iova;
+
+ ret = ioctl(cfg->container_fd, IOMMU_IOAS_UNMAP, &ioas_unmap);
+ if (ret) {
+ EAL_LOG(ERR, "Cannot clear DMA remapping, error "
+ "%i (%s)", errno, strerror(errno));
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+static int
+cdev_map(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
+ void *arg)
+{
+ struct container *cfg = arg;
+
+ /* skip external memory that isn't a heap */
+ if (msl->external && !msl->heap)
+ return 0;
+
+ /* skip any segments with invalid IOVA addresses */
+ if (ms->iova == RTE_BAD_IOVA)
+ return 0;
+
+ return vfio_cdev_dma_mem_map(cfg, ms->addr_64, ms->iova, ms->len, 1);
+}
+
+static int
+vfio_cdev_dma_map(struct container *cfg)
+{
+ return rte_memseg_walk(cdev_map, cfg);
+}
+
+int
+vfio_cdev_sync_ioas(struct container *cfg)
+{
+ struct rte_mp_msg mp_req, *mp_rep;
+ struct rte_mp_reply mp_reply = {0};
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
+
+ p->req = SOCKET_REQ_IOAS_ID;
+ rte_strscpy(mp_req.name, EAL_VFIO_MP, sizeof(mp_req.name));
+ mp_req.len_param = sizeof(*p);
+ mp_req.num_fds = 0;
+
+ if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 && mp_reply.nb_received == 1) {
+ mp_rep = &mp_reply.msgs[0];
+ p = (struct vfio_mp_param *)mp_rep->param;
+ if (p->result == SOCKET_OK && mp_rep->num_fds == 0) {
+ cfg->cdev_cfg.ioas_id = p->ioas_id;
+ free(mp_reply.msgs);
+ return 0;
+ }
+ }
+
+ free(mp_reply.msgs);
+ EAL_LOG(ERR, "Cannot request ioas_id");
+ return -1;
+}
+
+int
+vfio_cdev_setup_ioas(struct container *cfg)
+{
+ struct iommu_ioas_alloc ioas_alloc;
+ int ret;
+
+ /* Allocate an IOAS */
+ memset(&ioas_alloc, 0, sizeof(ioas_alloc));
+ ioas_alloc.size = sizeof(struct iommu_ioas_alloc);
+ ioas_alloc.flags = 0;
+
+ ret = ioctl(cfg->container_fd, IOMMU_IOAS_ALLOC, &ioas_alloc);
+ if (ret) {
+ EAL_LOG(ERR, "Cannot allocate IOAS, error %i (%s)",
+ errno, strerror(errno));
+ return -1;
+ }
+ cfg->cdev_cfg.ioas_id = ioas_alloc.out_ioas_id;
+
+ EAL_LOG(DEBUG, "Allocated IOAS with ID %u", cfg->cdev_cfg.ioas_id);
+ return 0;
+}
+
+int
+vfio_cdev_get_iommufd(void)
+{
+ int iommufd;
+
+ /* if not requesting via mp, open iommufd locally */
+ iommufd = open(RTE_VFIO_IOMMUFD_PATH, O_RDWR);
+ if (iommufd < 0) {
+ EAL_LOG(ERR, "Cannot open %s: %s",
+ RTE_VFIO_IOMMUFD_PATH, strerror(errno));
+ return -1;
+ }
+
+ return iommufd;
+}
+
+int
+vfio_cdev_enable(struct container *cfg)
+{
+ int iommufd;
+
+ /* Check if iommufd device exists */
+ if (access(RTE_VFIO_IOMMUFD_PATH, F_OK) != 0) {
+ EAL_LOG(DEBUG,
+ "IOMMUFD device does not exist, skipping VFIO cdev support...");
+ return 1;
+ }
+
+ /* open iommufd */
+ iommufd = vfio_cdev_get_iommufd();
+ if (iommufd < 0)
+ return -1;
+
+ /* cdev mode does not have different IOMMU ops */
+ vfio_cfg.ops = &iommufd_ops;
+
+ cfg->container_fd = iommufd;
+ return 0;
+}
+
+int
+vfio_cdev_get_device_num(const char *sysfs_base, const char *dev_addr, int *cdev_dev_num)
+{
+ char linkname[PATH_MAX];
+ char filename[PATH_MAX];
+ char *dev_tok, *end;
+ int dev_num;
+ DIR *dir;
+ struct dirent *entry;
+
+ memset(linkname, 0, sizeof(linkname));
+ memset(filename, 0, sizeof(filename));
+
+ /* check if vfio-dev directory exists for this device */
+ snprintf(linkname, sizeof(linkname),
+ "%s/%s/vfio-dev", sysfs_base, dev_addr);
+
+ dir = opendir(linkname);
+ if (dir == NULL) {
+ /* device doesn't have vfio-dev, not bound to vfio-pci cdev */
+ return 0;
+ }
+
+ /* find vfioX entry in vfio-dev directory */
+ while ((entry = readdir(dir)) != NULL) {
+ if (strncmp(entry->d_name, "vfio", 4) == 0) {
+ /* parse device number from vfioX */
+ errno = 0;
+ dev_tok = entry->d_name + 4; /* skip "vfio" prefix */
+ end = dev_tok;
+ dev_num = strtol(dev_tok, &end, 10);
+ if (end == dev_tok || *end != '\0' || errno != 0) {
+ EAL_LOG(ERR, "%s error parsing VFIO cdev device number!",
+ dev_addr);
+ closedir(dir);
+ return -1;
+ }
+ *cdev_dev_num = dev_num;
+ closedir(dir);
+ return 1;
+ }
+ }
+
+ closedir(dir);
+ /* no vfio device found */
+ return 0;
+}
+
+struct vfio_device *
+vfio_cdev_get_dev_by_num(struct container *cfg, int cdev_dev_num)
+{
+ struct vfio_device *dev;
+ /* find device handle */
+ DEVICE_FOREACH_ACTIVE(cfg, dev) {
+ if (dev->dev_num != cdev_dev_num)
+ continue;
+ return dev;
+ }
+ return NULL;
+}
+
+static int
+cdev_open_device_fd(int cdev_dev_num)
+{
+ char devname[PATH_MAX] = {0};
+ int dev_fd;
+
+ snprintf(devname, sizeof(devname), "%s/vfio%d",
+ RTE_VFIO_CDEV_DEVICES_PATH, cdev_dev_num);
+
+ dev_fd = open(devname, O_RDWR);
+ if (dev_fd < 0) {
+ EAL_LOG(ERR, "Cannot open %s: %s", devname, strerror(errno));
+ return -1;
+ }
+
+ return dev_fd;
+}
+
+static int
+cdev_attach_device_to_iommufd(struct container *cfg, struct vfio_device *dev)
+{
+ struct vfio_device_bind_iommufd bind = {0};
+ struct vfio_device_attach_iommufd_pt attach = {0};
+ rte_uuid_t vf_token;
+
+ rte_eal_vfio_get_vf_token(vf_token);
+
+ /* try with token first */
+ if (!rte_uuid_is_null(vf_token)) {
+ bind.flags = VFIO_DEVICE_BIND_FLAG_TOKEN;
+ bind.token_uuid_ptr = (uintptr_t)&vf_token;
+ bind.argsz = sizeof(bind);
+ bind.iommufd = cfg->container_fd;
+
+ /* this may fail because the kernel is too old */
+ if (ioctl(dev->fd, VFIO_DEVICE_BIND_IOMMUFD, &bind) < 0) {
+ EAL_LOG(DEBUG, "Failed to bind device %d with VF token", dev->dev_num);
+ EAL_LOG(NOTICE, "Unable to use VF tokens with current kernel version.");
+ EAL_LOG(NOTICE, "Please use kernel >=6.17 or use group mode.");
+ /* erase the bind structure */
+ bind = (struct vfio_device_bind_iommufd){0};
+ } else {
+ goto attach;
+ }
+ }
+ bind.flags = 0;
+ bind.argsz = sizeof(bind);
+ bind.iommufd = cfg->container_fd;
+
+ if (ioctl(dev->fd, VFIO_DEVICE_BIND_IOMMUFD, &bind) < 0) {
+ EAL_LOG(ERR, "Cannot bind device to IOMMUFD, error %i (%s)",
+ errno, strerror(errno));
+ return -1;
+ }
+
+attach:
+ /* attach device to IOAS */
+ attach.argsz = sizeof(attach);
+ attach.flags = 0;
+ attach.pt_id = cfg->cdev_cfg.ioas_id;
+
+ if (ioctl(dev->fd, VFIO_DEVICE_ATTACH_IOMMUFD_PT, &attach) < 0) {
+ EAL_LOG(ERR, "Cannot attach device to IOAS, error %i (%s)",
+ errno, strerror(errno));
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+vfio_cdev_request_dev_fd(struct vfio_device *dev)
+{
+ struct rte_mp_msg mp_req, *mp_rep;
+ struct rte_mp_reply mp_reply = {0};
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
+ int device_fd = -1;
+
+ /* secondary process requests device fd from primary */
+ p->req = SOCKET_REQ_CDEV;
+ p->cdev_dev_num = dev->dev_num;
+ rte_strscpy(mp_req.name, EAL_VFIO_MP, sizeof(mp_req.name));
+ mp_req.len_param = sizeof(*p);
+ mp_req.num_fds = 0;
+
+ if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
+ mp_reply.nb_received == 1) {
+ mp_rep = &mp_reply.msgs[0];
+ p = (struct vfio_mp_param *)mp_rep->param;
+ if (p->result == SOCKET_OK && mp_rep->num_fds == 1)
+ device_fd = mp_rep->fds[0];
+ }
+
+ free(mp_reply.msgs);
+
+ if (device_fd < 0) {
+ EAL_LOG(ERR, "Cannot request device fd for vfio%d", dev->dev_num);
+ return -1;
+ }
+ dev->fd = device_fd;
+
+ return 0;
+}
+
+int
+vfio_cdev_setup_device(struct container *cfg, struct vfio_device *dev)
+{
+ int device_fd;
+
+ /* get device fd - primary or custom container opens it, secondary requests from primary */
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY || !vfio_container_is_default(cfg)) {
+ device_fd = cdev_open_device_fd(dev->dev_num);
+ if (device_fd < 0)
+ return -1;
+ dev->fd = device_fd;
+
+ /* attach device to iommufd - only in primary */
+ if (cdev_attach_device_to_iommufd(cfg, dev) < 0)
+ return -1;
+ } else if (vfio_cdev_request_dev_fd(dev) < 0) {
+ return -1;
+ }
+ return 0;
+}
diff --git a/lib/eal/linux/eal_vfio_mp_sync.c b/lib/eal/linux/eal_vfio_mp_sync.c
index 9a07d35023..6d94f44af8 100644
--- a/lib/eal/linux/eal_vfio_mp_sync.c
+++ b/lib/eal/linux/eal_vfio_mp_sync.c
@@ -93,6 +93,48 @@ vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer)
}
break;
}
+ case SOCKET_REQ_CDEV:
+ {
+ struct container *cfg;
+ struct vfio_device *dev;
+
+ if (vfio_cfg.mode != RTE_VFIO_MODE_CDEV) {
+ EAL_LOG(ERR, "VFIO not initialized in cdev mode");
+ r->result = SOCKET_ERR;
+ break;
+ }
+
+ r->req = SOCKET_REQ_CDEV;
+ r->cdev_dev_num = m->cdev_dev_num;
+
+ cfg = vfio_cfg.default_cfg;
+ dev = vfio_cdev_get_dev_by_num(cfg, m->cdev_dev_num);
+ if (dev == NULL) {
+ r->result = SOCKET_NO_FD;
+ } else {
+ r->result = SOCKET_OK;
+ reply.num_fds = 1;
+ reply.fds[0] = dev->fd;
+ }
+ break;
+ }
+ case SOCKET_REQ_IOAS_ID:
+ {
+ struct container *cfg;
+
+ if (vfio_cfg.mode != RTE_VFIO_MODE_CDEV) {
+ EAL_LOG(ERR, "VFIO not initialized in cdev mode");
+ r->result = SOCKET_ERR;
+ break;
+ }
+
+ r->req = SOCKET_REQ_IOAS_ID;
+ cfg = vfio_cfg.default_cfg;
+ r->ioas_id = cfg->cdev_cfg.ioas_id;
+
+ r->result = SOCKET_OK;
+ break;
+ }
default:
EAL_LOG(ERR, "vfio received invalid message!");
return -1;
diff --git a/lib/eal/linux/meson.build b/lib/eal/linux/meson.build
index 5ec8eddaa2..c164a30b49 100644
--- a/lib/eal/linux/meson.build
+++ b/lib/eal/linux/meson.build
@@ -16,6 +16,7 @@ sources += files(
'eal_thread.c',
'eal_timer.c',
'eal_vfio.c',
+ 'eal_vfio_cdev.c',
'eal_vfio_group.c',
'eal_vfio_mp_sync.c',
)
--
2.47.3
^ permalink raw reply related
* [PATCH v8 17/18] vfio: remove no-IOMMU check API
From: Anatoly Burakov @ 2026-06-11 15:09 UTC (permalink / raw)
To: dev, Bruce Richardson
In-Reply-To: <cover.1781190151.git.anatoly.burakov@intel.com>
The `rte_vfio_noiommu_is_enabled()` check has now been replaced by the new
mode API, so remove it.
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
lib/eal/freebsd/eal.c | 6 ------
lib/eal/include/rte_vfio.h | 14 --------------
lib/eal/linux/eal_vfio.c | 7 -------
3 files changed, 27 deletions(-)
diff --git a/lib/eal/freebsd/eal.c b/lib/eal/freebsd/eal.c
index 6c1d1e3751..bb05a969a9 100644
--- a/lib/eal/freebsd/eal.c
+++ b/lib/eal/freebsd/eal.c
@@ -850,12 +850,6 @@ int rte_vfio_is_enabled(__rte_unused const char *modname)
return 0;
}
-RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_noiommu_is_enabled)
-int rte_vfio_noiommu_is_enabled(void)
-{
- return 0;
-}
-
RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_get_group_num)
int
rte_vfio_get_group_num(__rte_unused const char *sysfs_base,
diff --git a/lib/eal/include/rte_vfio.h b/lib/eal/include/rte_vfio.h
index 0af41c3610..c4ba0e5cda 100644
--- a/lib/eal/include/rte_vfio.h
+++ b/lib/eal/include/rte_vfio.h
@@ -170,20 +170,6 @@ __rte_internal
enum rte_vfio_mode
rte_vfio_get_mode(void);
-/**
- * @internal
- * Check if VFIO NOIOMMU mode is enabled.
- *
- * This function is only relevant on Linux in group mode.
- *
- * @return
- * 1 if enabled.
- * 0 if not enabled or not supported.
- */
-__rte_internal
-int
-rte_vfio_noiommu_is_enabled(void);
-
/**
* @internal
* Parse IOMMU group number for a device.
diff --git a/lib/eal/linux/eal_vfio.c b/lib/eal/linux/eal_vfio.c
index 708d14ad51..c104008a43 100644
--- a/lib/eal/linux/eal_vfio.c
+++ b/lib/eal/linux/eal_vfio.c
@@ -1278,13 +1278,6 @@ container_dma_unmap(struct container *cfg, uint64_t vaddr, uint64_t iova,
return ret;
}
-RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_noiommu_is_enabled)
-int
-rte_vfio_noiommu_is_enabled(void)
-{
- return vfio_cfg.mode == RTE_VFIO_MODE_NOIOMMU;
-}
-
RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_create)
int
rte_vfio_container_create(void)
--
2.47.3
^ permalink raw reply related
* [PATCH v8 16/18] net/ntnic: use the new VFIO mode API
From: Anatoly Burakov @ 2026-06-11 15:09 UTC (permalink / raw)
To: dev, Christian Koue Muf, Serhii Iliushyk
In-Reply-To: <cover.1781190151.git.anatoly.burakov@intel.com>
Use new VFIO mode API to query no-IOMMU status.
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
drivers/net/ntnic/ntnic_ethdev.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/ntnic/ntnic_ethdev.c b/drivers/net/ntnic/ntnic_ethdev.c
index 7cc90a7a5b..8b6bca974c 100644
--- a/drivers/net/ntnic/ntnic_ethdev.c
+++ b/drivers/net/ntnic/ntnic_ethdev.c
@@ -2690,7 +2690,7 @@ nthw_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
(pci_dev->device.devargs->data ? pci_dev->device.devargs->data : "NULL"));
}
- const int n_rte_vfio_no_io_mmu_enabled = rte_vfio_noiommu_is_enabled();
+ const int n_rte_vfio_no_io_mmu_enabled = rte_vfio_get_mode() == RTE_VFIO_MODE_NOIOMMU;
NT_LOG(DBG, NTNIC, "vfio_no_iommu_enabled=%d", n_rte_vfio_no_io_mmu_enabled);
if (n_rte_vfio_no_io_mmu_enabled) {
--
2.47.3
^ permalink raw reply related
* [PATCH v8 15/18] net/hinic3: use the new VFIO mode API
From: Anatoly Burakov @ 2026-06-11 15:09 UTC (permalink / raw)
To: dev, Feifei Wang
In-Reply-To: <cover.1781190151.git.anatoly.burakov@intel.com>
Use new VFIO mode API to query no-IOMMU status.
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
drivers/net/hinic3/base/hinic3_hwdev.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/net/hinic3/base/hinic3_hwdev.c b/drivers/net/hinic3/base/hinic3_hwdev.c
index d09a8f7e7d..224be6f81c 100644
--- a/drivers/net/hinic3/base/hinic3_hwdev.c
+++ b/drivers/net/hinic3/base/hinic3_hwdev.c
@@ -78,7 +78,8 @@ hinic3_is_vfio_iommu_enable(const struct rte_eth_dev *eth_dev)
{
struct rte_pci_device *pci_dev = RTE_CLASS_TO_BUS_DEVICE(eth_dev, *pci_dev);
- return pci_dev->kdrv == RTE_PCI_KDRV_VFIO && rte_vfio_noiommu_is_enabled() != 1;
+ return pci_dev->kdrv == RTE_PCI_KDRV_VFIO &&
+ rte_vfio_get_mode() != RTE_VFIO_MODE_NOIOMMU;
}
int
--
2.47.3
^ permalink raw reply related
* [PATCH v8 14/18] bus/fslmc: use the new VFIO mode API
From: Anatoly Burakov @ 2026-06-11 15:09 UTC (permalink / raw)
To: dev, Hemant Agrawal, Sachin Saxena
In-Reply-To: <cover.1781190151.git.anatoly.burakov@intel.com>
FSLMC bus only supports operating in group mode, and relies on no-IOMMU
mode checks. Use the new VFIO API's to query no-IOMMU status, as well as
protect the bus from initializing in non-group mode.
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
Acked-by: Hemant Agrawal <hemant.agrawal@nxp.com>
---
drivers/bus/fslmc/fslmc_bus.c | 10 +++++++++-
drivers/bus/fslmc/fslmc_vfio.c | 2 +-
2 files changed, 10 insertions(+), 2 deletions(-)
diff --git a/drivers/bus/fslmc/fslmc_bus.c b/drivers/bus/fslmc/fslmc_bus.c
index c7549a361a..a225c88b86 100644
--- a/drivers/bus/fslmc/fslmc_bus.c
+++ b/drivers/bus/fslmc/fslmc_bus.c
@@ -332,6 +332,13 @@ rte_fslmc_scan(void)
goto scan_fail;
}
+ /* for container groups to work, VFIO must be in group mode */
+ if (rte_vfio_get_mode() != RTE_VFIO_MODE_GROUP &&
+ rte_vfio_get_mode() != RTE_VFIO_MODE_NOIOMMU) {
+ ret = -EINVAL;
+ goto scan_fail;
+ }
+
ret = fslmc_get_container_group(group_name, &groupid);
if (ret != 0)
goto scan_fail;
@@ -500,7 +507,8 @@ rte_dpaa2_get_iommu_class(void)
return RTE_IOVA_DC;
/* check if all devices on the bus support Virtual addressing or not */
- if (fslmc_all_device_support_iova() != 0 && rte_vfio_noiommu_is_enabled() == 0)
+ if (fslmc_all_device_support_iova() != 0 &&
+ rte_vfio_get_mode() != RTE_VFIO_MODE_NOIOMMU)
return RTE_IOVA_VA;
return RTE_IOVA_PA;
diff --git a/drivers/bus/fslmc/fslmc_vfio.c b/drivers/bus/fslmc/fslmc_vfio.c
index 3ca68ccf24..15273fcd57 100644
--- a/drivers/bus/fslmc/fslmc_vfio.c
+++ b/drivers/bus/fslmc/fslmc_vfio.c
@@ -192,7 +192,7 @@ fslmc_vfio_add_group(int vfio_group_fd,
group->fd = vfio_group_fd;
group->groupid = iommu_group_num;
rte_strscpy(group->group_name, group_name, sizeof(group->group_name));
- if (rte_vfio_noiommu_is_enabled() > 0)
+ if (rte_vfio_get_mode() == RTE_VFIO_MODE_NOIOMMU)
group->iommu_type = VFIO_NOIOMMU_IOMMU;
else
group->iommu_type = VFIO_TYPE1_IOMMU;
--
2.47.3
^ permalink raw reply related
* [PATCH v8 12/18] vfio: cleanup and refactor
From: Anatoly Burakov @ 2026-06-11 15:09 UTC (permalink / raw)
To: dev, Wathsala Vithanage, Bruce Richardson, Nipun Gupta,
Nikhil Agarwal, Hemant Agrawal, Sachin Saxena, Chenbo Xia,
Ajit Khaparde, Vikas Gupta, Dimon Zhao, Leon Yu, Sam Chen
In-Reply-To: <cover.1781190151.git.anatoly.burakov@intel.com>
Currently, VFIO code is a bit of an incoherent mess internally, with API's
bleeding into each other, inconsistent returns, and a certain amount of
spaghetti stemming from organic growth.
Refactor VFIO code to achieve the following goals:
- Make all error handling consistent, and provide/document rte_errno values
returned from API's to indicate various conditions.
- Introduce new "VFIO mode" concept. This new API will tell caller if
VFIO is enabled, and whether it is using group API, and whether it is
running in no-IOMMU mode.
- Decouple rte_vfio_setup_device semantics from PCI bus return convention.
Currently, when device is not managed by VFIO, rte_vfio_setup_device
will return 1, which is bus speak for "skip this device", however VFIO
has nothing to do with PCI bus and should not follow its API conventions.
- Perform device setup in device assign, and make device setup use shared
code path with device assign and explicitly assuming default container.
This is technically not necessary for group mode as device set up is a
two-step process in that mode, but coming cdev mode will have a
single-step device setup, and it would be easier if the worked the same
way under the hood.
- Make VFIO internals more readable. Introduce a lot of infrastructure and
more explicit validation, rather than over-reliance on sentinel values
and implicit assumptions. This will also make it easier to integrate cdev
mode down the line, as it will rely on most of this infrastructure.
This will change behavior of the following functions:
- `rte_vfio_setup_device` - when the device is not managed by VFIO, the
function will now return -1 with `rte_errno` set to ENODEV
- `rte_vfio_get_group_num` - when the device is not managed by VFIO, the
function will now return -1 with `rte_errno` set to ENODEV
- `rte_vfio_container_destroy` - the function will now release and close
all group and device resources associated with the container being
destroyed by this call
All users of `rte_vfio_setup_device` and `rte_vfio_get_group_num` have been
adjusted to account for API change.
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
Acked-by: Hemant Agrawal <hemant.agrawal@nxp.com>
---
config/arm/meson.build | 1 +
config/meson.build | 1 +
drivers/bus/cdx/cdx_vfio.c | 17 +-
drivers/bus/fslmc/fslmc_vfio.c | 4 +-
drivers/bus/pci/linux/pci_vfio.c | 19 +-
drivers/crypto/bcmfs/bcmfs_vfio.c | 6 +-
drivers/net/nbl/nbl_common/nbl_userdev.c | 2 +-
lib/eal/freebsd/eal.c | 16 +
lib/eal/include/rte_vfio.h | 250 ++-
lib/eal/linux/eal_vfio.c | 2265 +++++++---------------
lib/eal/linux/eal_vfio.h | 142 +-
lib/eal/linux/eal_vfio_group.c | 984 ++++++++++
lib/eal/linux/eal_vfio_mp_sync.c | 38 +-
lib/eal/linux/meson.build | 1 +
14 files changed, 2104 insertions(+), 1642 deletions(-)
create mode 100644 lib/eal/linux/eal_vfio_group.c
diff --git a/config/arm/meson.build b/config/arm/meson.build
index 27b549a052..2b73cbef7e 100644
--- a/config/arm/meson.build
+++ b/config/arm/meson.build
@@ -147,6 +147,7 @@ implementer_cavium = {
'description': 'Cavium',
'flags': [
['RTE_MAX_VFIO_GROUPS', 128],
+ ['RTE_MAX_VFIO_DEVICES', 256],
['RTE_MAX_LCORE', 96],
['RTE_MAX_NUMA_NODES', 2]
],
diff --git a/config/meson.build b/config/meson.build
index d7f5e55c18..0e6e478fc8 100644
--- a/config/meson.build
+++ b/config/meson.build
@@ -387,6 +387,7 @@ dpdk_conf.set('RTE_ENABLE_TRACE_FP', get_option('enable_trace_fp'))
dpdk_conf.set('RTE_PKTMBUF_HEADROOM', get_option('pkt_mbuf_headroom'))
# values which have defaults which may be overridden
dpdk_conf.set('RTE_MAX_VFIO_GROUPS', 64)
+dpdk_conf.set('RTE_MAX_VFIO_DEVICES', 256)
dpdk_conf.set('RTE_DRIVER_MEMPOOL_BUCKET_SIZE_KB', 64)
dpdk_conf.set('RTE_LIBRTE_DPAA2_USE_PHYS_IOVA', true)
if get_option('mbuf_refcnt_atomic')
diff --git a/drivers/bus/cdx/cdx_vfio.c b/drivers/bus/cdx/cdx_vfio.c
index 9bae264409..873f0f3460 100644
--- a/drivers/bus/cdx/cdx_vfio.c
+++ b/drivers/bus/cdx/cdx_vfio.c
@@ -22,6 +22,7 @@
#include <eal_export.h>
#include <rte_eal_paging.h>
+#include <rte_errno.h>
#include <rte_malloc.h>
#include <rte_vfio.h>
@@ -402,8 +403,12 @@ cdx_vfio_map_resource_primary(struct rte_cdx_device *dev)
ret = rte_vfio_setup_device(RTE_CDX_BUS_DEVICES_PATH, dev_name,
&vfio_dev_fd);
- if (ret)
+ if (ret < 0) {
+ /* Device not managed by VFIO - skip */
+ if (rte_errno == ENODEV)
+ ret = 1;
return ret;
+ }
ret = rte_vfio_get_device_info(vfio_dev_fd, &device_info);
if (ret)
@@ -513,11 +518,13 @@ cdx_vfio_map_resource_secondary(struct rte_cdx_device *dev)
return -1;
}
- ret = rte_vfio_setup_device(RTE_CDX_BUS_DEVICES_PATH, dev_name,
- &vfio_dev_fd);
- if (ret)
+ ret = rte_vfio_setup_device(RTE_CDX_BUS_DEVICES_PATH, dev_name, &vfio_dev_fd);
+ if (ret < 0) {
+ /* Device not managed by VFIO - skip */
+ if (rte_errno == ENODEV)
+ ret = 1;
return ret;
-
+ }
ret = rte_vfio_get_device_info(vfio_dev_fd, &device_info);
if (ret)
goto err_vfio_dev_fd;
diff --git a/drivers/bus/fslmc/fslmc_vfio.c b/drivers/bus/fslmc/fslmc_vfio.c
index 412b70e5ae..3ca68ccf24 100644
--- a/drivers/bus/fslmc/fslmc_vfio.c
+++ b/drivers/bus/fslmc/fslmc_vfio.c
@@ -364,9 +364,9 @@ fslmc_get_group_id(const char *group_name,
/* get group number */
ret = rte_vfio_get_group_num(SYSFS_FSL_MC_DEVICES,
group_name, groupid);
- if (ret <= 0) {
+ if (ret < 0) {
DPAA2_BUS_ERR("Find %s IOMMU group", group_name);
- if (ret < 0)
+ if (rte_errno != ENODEV)
return ret;
return -EIO;
diff --git a/drivers/bus/pci/linux/pci_vfio.c b/drivers/bus/pci/linux/pci_vfio.c
index 54e9506058..0d30b1cdf1 100644
--- a/drivers/bus/pci/linux/pci_vfio.c
+++ b/drivers/bus/pci/linux/pci_vfio.c
@@ -20,6 +20,7 @@
#include <rte_malloc.h>
#include <rte_vfio.h>
#include <rte_eal.h>
+#include <rte_errno.h>
#include <bus_driver.h>
#include <rte_spinlock.h>
#include <rte_tailq.h>
@@ -752,10 +753,13 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev)
snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
loc->domain, loc->bus, loc->devid, loc->function);
- ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr,
- &vfio_dev_fd);
- if (ret)
+ ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr, &vfio_dev_fd);
+ if (ret < 0) {
+ /* Device not managed by VFIO - skip */
+ if (rte_errno == ENODEV)
+ ret = 1;
return ret;
+ }
ret = rte_vfio_get_device_info(vfio_dev_fd, &device_info);
if (ret)
@@ -965,10 +969,13 @@ pci_vfio_map_resource_secondary(struct rte_pci_device *dev)
return -1;
}
- ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr,
- &vfio_dev_fd);
- if (ret)
+ ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr, &vfio_dev_fd);
+ if (ret < 0) {
+ /* Device not managed by VFIO - skip */
+ if (rte_errno == ENODEV)
+ ret = 1;
return ret;
+ }
ret = rte_vfio_get_device_info(vfio_dev_fd, &device_info);
if (ret)
diff --git a/drivers/crypto/bcmfs/bcmfs_vfio.c b/drivers/crypto/bcmfs/bcmfs_vfio.c
index d00aaf1bb7..92d8de4443 100644
--- a/drivers/crypto/bcmfs/bcmfs_vfio.c
+++ b/drivers/crypto/bcmfs/bcmfs_vfio.c
@@ -9,6 +9,7 @@
#include <sys/mman.h>
#include <sys/ioctl.h>
+#include <rte_errno.h>
#include <rte_vfio.h>
#include "bcmfs_device.h"
@@ -26,7 +27,10 @@ vfio_map_dev_obj(const char *path, const char *dev_obj,
struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
ret = rte_vfio_setup_device(path, dev_obj, dev_fd);
- if (ret) {
+ if (ret < 0) {
+ /* Device not managed by VFIO - skip */
+ if (rte_errno == ENODEV)
+ ret = 1;
BCMFS_LOG(ERR, "VFIO Setting for device failed");
return ret;
}
diff --git a/drivers/net/nbl/nbl_common/nbl_userdev.c b/drivers/net/nbl/nbl_common/nbl_userdev.c
index fb256e543f..9aacf7438c 100644
--- a/drivers/net/nbl/nbl_common/nbl_userdev.c
+++ b/drivers/net/nbl/nbl_common/nbl_userdev.c
@@ -413,7 +413,7 @@ static int nbl_mdev_map_device(struct nbl_adapter *adapter)
"%s/%s/", rte_pci_get_sysfs_path(), dev_name);
ret = rte_vfio_get_group_num(pathname, dev_name, &common->iommu_group_num);
- if (ret <= 0) {
+ if (ret < 0) {
NBL_LOG(ERR, "nbl vfio group number failed");
return -1;
}
diff --git a/lib/eal/freebsd/eal.c b/lib/eal/freebsd/eal.c
index cda72dfd1d..6c1d1e3751 100644
--- a/lib/eal/freebsd/eal.c
+++ b/lib/eal/freebsd/eal.c
@@ -921,3 +921,19 @@ rte_vfio_container_assign_device(__rte_unused int vfio_container_fd,
rte_errno = ENOTSUP;
return -1;
}
+
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_get_device_info)
+int
+rte_vfio_get_device_info(__rte_unused int vfio_dev_fd,
+ __rte_unused struct vfio_device_info *device_info)
+{
+ rte_errno = ENOTSUP;
+ return -1;
+}
+
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_get_mode)
+enum rte_vfio_mode
+rte_vfio_get_mode(void)
+{
+ return RTE_VFIO_MODE_NONE;
+}
diff --git a/lib/eal/include/rte_vfio.h b/lib/eal/include/rte_vfio.h
index 941b7d0541..0af41c3610 100644
--- a/lib/eal/include/rte_vfio.h
+++ b/lib/eal/include/rte_vfio.h
@@ -18,6 +18,7 @@
#include <stdint.h>
#include <rte_compat.h>
+#include <rte_common.h>
#ifdef __cplusplus
extern "C" {
@@ -29,8 +30,7 @@ extern "C" {
#define RTE_VFIO_CONTAINER_PATH "/dev/vfio/vfio"
#define RTE_VFIO_GROUP_FMT "/dev/vfio/%u"
#define RTE_VFIO_NOIOMMU_GROUP_FMT "/dev/vfio/noiommu-%u"
-#define RTE_VFIO_NOIOMMU_MODE \
- "/sys/module/vfio/parameters/enable_unsafe_noiommu_mode"
+#define RTE_VFIO_NOIOMMU_MODE "/sys/module/vfio/parameters/enable_unsafe_noiommu_mode"
#endif /* RTE_EXEC_ENV_LINUX */
@@ -39,28 +39,49 @@ struct vfio_device_info;
#define RTE_VFIO_DEFAULT_CONTAINER_FD (-1)
+/**
+ * @enum rte_vfio_mode
+ * Enumeration of VFIO operational modes.
+ *
+ * These modes define how VFIO devices are accessed and managed:
+ *
+ * - RTE_VFIO_MODE_NONE: VFIO is not enabled.
+ * - RTE_VFIO_MODE_GROUP: Legacy group mode.
+ * - RTE_VFIO_MODE_NOIOMMU: Unsafe no-IOMMU mode.
+ */
+enum rte_vfio_mode {
+ RTE_VFIO_MODE_NONE = 0, /**< VFIO not enabled */
+ RTE_VFIO_MODE_GROUP, /**< Group mode */
+ RTE_VFIO_MODE_NOIOMMU, /**< Group mode with no IOMMU protection */
+};
+
/**
* @internal
- * Setup vfio_cfg for the device identified by its address.
- * It discovers the configured I/O MMU groups or sets a new one for the device.
- * If a new groups is assigned, the DMA mapping is performed.
+ * Set up a device managed by VFIO driver.
*
- * This function is only relevant to linux and will return
- * an error on BSD.
+ * If the device was not previously assigned to a container using
+ * `rte_vfio_container_assign_device()`, default container will be used.
+ *
+ * This function is only relevant on Linux.
*
* @param sysfs_base
- * sysfs path prefix.
- *
+ * Sysfs path prefix.
* @param dev_addr
- * device location.
- *
+ * Device identifier.
* @param vfio_dev_fd
- * Pointer to VFIO fd, will be set to the opened device fd on success.
+ * Pointer to where VFIO device file descriptor will be stored.
*
* @return
* 0 on success.
- * <0 on failure.
- * >1 if the device cannot be managed this way.
+ * <0 on failure, rte_errno is set.
+ *
+ * Possible rte_errno values include:
+ * - ENODEV - Device not managed by VFIO.
+ * - ENOSPC - No space in VFIO container to track the device.
+ * - EINVAL - Invalid parameters.
+ * - EIO - Error during underlying VFIO operations.
+ * - ENXIO - VFIO support not initialized.
+ * - ENOTSUP - Unsupported VFIO mode.
*/
__rte_internal
int rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
@@ -68,99 +89,127 @@ int rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
/**
* @internal
- * Release a device mapped to a VFIO-managed I/O MMU group.
+ * Release a device managed by VFIO driver.
*
- * This function is only relevant to linux and will return
- * an error on BSD.
+ * This function is only relevant on Linux.
+ *
+ * @note As a result of this function, all internal resources used by the device will be released,
+ * so if the device was using a non-default container, it will need to be reassigned to the
+ * container before it can be used again.
*
* @param sysfs_base
- * sysfs path prefix.
- *
+ * Sysfs path prefix.
* @param dev_addr
- * device location.
- *
+ * Device identifier.
* @param fd
- * VFIO fd.
+ * A previously set up VFIO file descriptor.
*
* @return
* 0 on success.
- * <0 on failure.
+ * <0 on failure, rte_errno is set.
+ *
+ * Possible rte_errno values include:
+ * - ENOENT - Device not found in any container.
+ * - EINVAL - Invalid parameters.
+ * - EIO - Error during underlying VFIO operations.
+ * - ENXIO - VFIO support not initialized.
+ * - ENOTSUP - Unsupported VFIO mode.
*/
__rte_internal
int rte_vfio_release_device(const char *sysfs_base, const char *dev_addr, int fd);
/**
* @internal
- * Enable a VFIO-related kmod.
+ * Enable VFIO subsystem and check if specified kernel module is loaded.
*
- * This function is only relevant to linux and will return
- * an error on BSD.
+ * In case of success, `rte_vfio_get_mode()` can be used to retrieve the VFIO mode in use.
+ *
+ * This function is only relevant on Linux.
*
* @param modname
- * kernel module name.
+ * Kernel module name.
*
* @return
* 0 on success.
- * <0 on failure.
+ * <0 on failure, rte_errno is set.
+ *
+ * Possible rte_errno values include:
+ * - EINVAL - Invalid parameters.
+ * - ENXIO - VFIO support not initialized.
+ * - ENOTSUP - Operation not supported.
*/
__rte_internal
int rte_vfio_enable(const char *modname);
/**
* @internal
- * Check whether a VFIO-related kmod is enabled.
+ * Check if VFIO subsystem is initialized and a specified kernel module is loaded.
*
- * This function is only relevant to Linux.
+ * This function is only relevant on Linux.
*
* @param modname
- * kernel module name.
+ * Kernel module name.
*
* @return
- * 1 if true.
- * 0 otherwise.
+ * 1 if enabled.
+ * 0 if not enabled or not supported.
*/
__rte_internal
int rte_vfio_is_enabled(const char *modname);
/**
* @internal
- * Whether VFIO NOIOMMU mode is enabled.
+ * Get current VFIO mode.
*
- * This function is only relevant to Linux.
+ * This function is only relevant on Linux.
*
* @return
- * 1 if true.
- * 0 if false.
- * <0 for errors.
+ * VFIO mode currently in use.
*/
__rte_internal
-int rte_vfio_noiommu_is_enabled(void);
+enum rte_vfio_mode
+rte_vfio_get_mode(void);
+
+/**
+ * @internal
+ * Check if VFIO NOIOMMU mode is enabled.
+ *
+ * This function is only relevant on Linux in group mode.
+ *
+ * @return
+ * 1 if enabled.
+ * 0 if not enabled or not supported.
+ */
+__rte_internal
+int
+rte_vfio_noiommu_is_enabled(void);
/**
* @internal
* Parse IOMMU group number for a device.
*
- * This function is only relevant to linux and will return
- * an error on BSD.
+ * This function is only relevant on Linux in group mode.
*
* @param sysfs_base
- * sysfs path prefix.
- *
+ * Sysfs path prefix.
* @param dev_addr
- * device location.
- *
+ * Device identifier.
* @param iommu_group_num
- * iommu group number
+ * Pointer to where IOMMU group number will be stored.
*
* @return
- * >0 on success
- * 0 for non-existent group or VFIO
- * <0 for errors
+ * 0 on success.
+ * <0 on failure, rte_errno is set.
+ *
+ * Possible rte_errno values include:
+ * - ENODEV - Device not managed by VFIO.
+ * - EINVAL - Invalid parameters.
+ * - ENXIO - VFIO support not initialized.
+ * - ENOTSUP - Unsupported VFIO mode.
*/
__rte_internal
int
-rte_vfio_get_group_num(const char *sysfs_base,
- const char *dev_addr, int *iommu_group_num);
+rte_vfio_get_group_num(const char *sysfs_base, const char *dev_addr, int *iommu_group_num);
/**
* @internal
@@ -179,7 +228,12 @@ rte_vfio_get_group_num(const char *sysfs_base,
*
* @return
* 0 on success.
- * <0 on failure.
+ * <0 on failure, rte_errno is set.
+ *
+ * Possible rte_errno values include:
+ * - EINVAL - Invalid parameters.
+ * - ENXIO - VFIO support not initialized.
+ * - ENOTSUP - Unsupported VFIO mode.
*/
__rte_internal
int
@@ -187,14 +241,17 @@ rte_vfio_get_device_info(int vfio_dev_fd, struct vfio_device_info *device_info);
/**
* @internal
- * Get the default VFIO container fd
+ * Get the default VFIO container file descriptor.
*
- * This function is only relevant to linux and will return
- * an error on BSD.
+ * This function is only relevant on Linux.
*
* @return
- * > 0 default container fd
- * < 0 if VFIO is not enabled or not supported
+ * Non-negative container file descriptor on success.
+ * <0 on failure, rte_errno is set.
+ *
+ * Possible rte_errno values include:
+ * - ENXIO - VFIO support not initialized.
+ * - ENOTSUP - Unsupported VFIO mode.
*/
__rte_internal
int
@@ -202,7 +259,9 @@ rte_vfio_get_container_fd(void);
/**
* @internal
- * Create a new container for device binding.
+ * Create a new VFIO container for device assignment and DMA mapping.
+ *
+ * This function is only relevant on Linux.
*
* @note Any newly allocated DPDK memory will not be mapped into these
* containers by default, user needs to manage DMA mappings for
@@ -213,8 +272,14 @@ rte_vfio_get_container_fd(void);
* devices between multiple processes is not supported.
*
* @return
- * the container fd if successful
- * <0 if failed
+ * Non-negative container file descriptor on success.
+ * <0 on failure, rte_errno is set.
+ *
+ * Possible rte_errno values include:
+ * - ENOSPC - Maximum number of containers reached.
+ * - EIO - Underlying VFIO operation failed.
+ * - ENXIO - VFIO support not initialized.
+ * - ENOTSUP - Unsupported VFIO mode.
*/
__rte_internal
int
@@ -222,14 +287,22 @@ rte_vfio_container_create(void);
/**
* @internal
- * Destroy the container, unbind all vfio groups within it.
+ * Destroy a VFIO container and unmap all devices assigned to it.
+ *
+ * This function is only relevant on Linux.
*
* @param container_fd
- * the container fd to destroy
+ * File descriptor of container to destroy.
*
* @return
- * 0 if successful
- * <0 if failed
+ * 0 on success.
+ * <0 on failure, rte_errno is set.
+ *
+ * Possible rte_errno values include:
+ * - ENODEV - Container not managed by VFIO.
+ * - EINVAL - Invalid container file descriptor.
+ * - ENXIO - VFIO support not initialized.
+ * - ENOTSUP - Unsupported VFIO mode.
*/
__rte_internal
int
@@ -255,32 +328,45 @@ rte_vfio_container_destroy(int container_fd);
* @return
* 0 on success.
* <0 on failure, rte_errno is set.
+ *
+ * Possible rte_errno values include:
+ * - ENODEV - Device not managed by VFIO.
+ * - EEXIST - Device already assigned to the container.
+ * - ENOSPC - No space in VFIO container to assign device.
+ * - EINVAL - Invalid container file descriptor.
+ * - EIO - Error during underlying VFIO operations.
+ * - ENXIO - VFIO support not initialized.
+ * - ENOTSUP - Unsupported VFIO mode.
*/
__rte_internal
int
-rte_vfio_container_assign_device(int vfio_container_fd, const char *sysfs_base,
- const char *dev_addr);
+rte_vfio_container_assign_device(int vfio_container_fd,
+ const char *sysfs_base, const char *dev_addr);
/**
* @internal
* Perform DMA mapping for devices in a container.
*
- * @param container_fd
- * the specified container fd. Use RTE_VFIO_DEFAULT_CONTAINER_FD to
- * use the default container.
+ * This function is only relevant on Linux.
*
+ * @param container_fd
+ * Container file descriptor. Use RTE_VFIO_DEFAULT_CONTAINER_FD to use the default container.
* @param vaddr
* Starting virtual address of memory to be mapped.
- *
* @param iova
* Starting IOVA address of memory to be mapped.
- *
* @param len
* Length of memory segment being mapped.
*
* @return
- * 0 if successful
- * <0 if failed
+ * 0 on success.
+ * <0 on failure, rte_errno is set.
+ *
+ * Possible rte_errno values include:
+ * - EIO - DMA mapping operation failed.
+ * - EINVAL - Invalid parameters.
+ * - ENXIO - VFIO support not initialized.
+ * - ENOTSUP - Unsupported VFIO mode.
*/
__rte_internal
int
@@ -291,22 +377,26 @@ rte_vfio_container_dma_map(int container_fd, uint64_t vaddr,
* @internal
* Perform DMA unmapping for devices in a container.
*
- * @param container_fd
- * the specified container fd. Use RTE_VFIO_DEFAULT_CONTAINER_FD to
- * use the default container.
+ * This function is only relevant on Linux.
*
+ * @param container_fd
+ * Container file descriptor. Use RTE_VFIO_DEFAULT_CONTAINER_FD to use the default container.
* @param vaddr
* Starting virtual address of memory to be unmapped.
- *
* @param iova
* Starting IOVA address of memory to be unmapped.
- *
* @param len
* Length of memory segment being unmapped.
*
* @return
- * 0 if successful
- * <0 if failed
+ * 0 on success.
+ * <0 on failure, rte_errno is set.
+ *
+ * Possible rte_errno values include:
+ * - EIO - DMA unmapping operation failed.
+ * - EINVAL - Invalid parameters.
+ * - ENXIO - VFIO support not initialized.
+ * - ENOTSUP - Unsupported VFIO mode.
*/
__rte_internal
int
diff --git a/lib/eal/linux/eal_vfio.c b/lib/eal/linux/eal_vfio.c
index 7893d334eb..708d14ad51 100644
--- a/lib/eal/linux/eal_vfio.c
+++ b/lib/eal/linux/eal_vfio.c
@@ -9,6 +9,7 @@
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
+#include <sys/stat.h>
#include <dirent.h>
#include <rte_errno.h>
@@ -24,80 +25,39 @@
#include "eal_private.h"
#include "eal_internal_cfg.h"
-#define VFIO_MEM_EVENT_CLB_NAME "vfio_mem_event_clb"
-
-/* hot plug/unplug of VFIO groups may cause all DMA maps to be dropped. we can
- * recreate the mappings for DPDK segments, but we cannot do so for memory that
- * was registered by the user themselves, so we need to store the user mappings
- * somewhere, to recreate them later.
+/*
+ * rte_errno convention:
+ *
+ * - EINVAL: invalid parameters
+ * - ENOTSUP: current mode does not support this operation
+ * - ENOXIO: VFIO not initialized
+ * - ENODEV: device not managed by VFIO
+ * - ENOSPC: no space in config
+ * - EEXIST: device already assigned
+ * - ENOENT: group or device not found
+ * - EIO: underlying VFIO operation failed
*/
-#define EAL_VFIO_MAX_USER_MEM_MAPS 256
-struct user_mem_map {
- uint64_t addr; /**< start VA */
- uint64_t iova; /**< start IOVA */
- uint64_t len; /**< total length of the mapping */
- uint64_t chunk; /**< this mapping can be split in chunks of this size */
-};
-struct user_mem_maps {
- rte_spinlock_recursive_t lock;
- int n_maps;
- struct user_mem_map maps[EAL_VFIO_MAX_USER_MEM_MAPS];
+/* functions can fail for multiple reasons, and errno is tedious */
+enum vfio_result {
+ VFIO_SUCCESS,
+ VFIO_ERROR,
+ VFIO_EXISTS,
+ VFIO_NOT_SUPPORTED,
+ VFIO_NOT_MANAGED,
+ VFIO_NOT_FOUND,
+ VFIO_NO_SPACE,
};
-struct vfio_config {
- int vfio_enabled;
- int vfio_container_fd;
- int vfio_active_groups;
- const struct vfio_iommu_type *vfio_iommu_type;
- struct vfio_group vfio_groups[RTE_MAX_VFIO_GROUPS];
- struct user_mem_maps mem_maps;
+struct container containers[RTE_MAX_VFIO_CONTAINERS] = {0};
+struct vfio_config vfio_cfg = {
+ .mode = RTE_VFIO_MODE_NONE,
+ .default_cfg = &containers[0]
};
-/* per-process VFIO config */
-static struct vfio_config vfio_cfgs[RTE_MAX_VFIO_CONTAINERS];
-static struct vfio_config *default_vfio_cfg = &vfio_cfgs[0];
-
-static int vfio_type1_dma_map(int);
-static int vfio_type1_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
-static int vfio_spapr_dma_map(int);
-static int vfio_spapr_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
-static int vfio_noiommu_dma_map(int);
-static int vfio_noiommu_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
-static int vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr,
+static int vfio_dma_mem_map(struct container *cfg, uint64_t vaddr,
uint64_t iova, uint64_t len, int do_map);
-static int vfio_container_group_bind(int container_fd, int iommu_group_num);
-static int vfio_container_group_unbind(int container_fd, int iommu_group_num);
-
-/* IOMMU types we support */
-static const struct vfio_iommu_type iommu_types[] = {
- /* x86 IOMMU, otherwise known as type 1 */
- {
- .type_id = VFIO_TYPE1_IOMMU,
- .name = "Type 1",
- .partial_unmap = false,
- .dma_map_func = &vfio_type1_dma_map,
- .dma_user_map_func = &vfio_type1_dma_mem_map
- },
- /* ppc64 IOMMU, otherwise known as spapr */
- {
- .type_id = VFIO_SPAPR_TCE_v2_IOMMU,
- .name = "sPAPR",
- .partial_unmap = true,
- .dma_map_func = &vfio_spapr_dma_map,
- .dma_user_map_func = &vfio_spapr_dma_mem_map
- },
- /* IOMMU-less mode */
- {
- .type_id = VFIO_NOIOMMU_IOMMU,
- .name = "No-IOMMU",
- .partial_unmap = true,
- .dma_map_func = &vfio_noiommu_dma_map,
- .dma_user_map_func = &vfio_noiommu_dma_mem_map
- },
-};
-
static int
is_null_map(const struct user_mem_map *map)
{
@@ -353,279 +313,106 @@ compact_user_maps(struct user_mem_maps *user_mem_maps)
sizeof(user_mem_maps->maps[0]), user_mem_map_cmp);
}
-static int
-vfio_open_group_fd(int iommu_group_num, bool mp_request)
+bool
+vfio_container_is_default(struct container *cfg)
{
- int vfio_group_fd;
- char filename[PATH_MAX];
- struct rte_mp_msg mp_req, *mp_rep;
- struct rte_mp_reply mp_reply = {0};
- struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
- struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
-
- /* if not requesting via mp, open the group locally */
- if (!mp_request) {
- /* try regular group format */
- snprintf(filename, sizeof(filename), RTE_VFIO_GROUP_FMT, iommu_group_num);
- vfio_group_fd = open(filename, O_RDWR);
- if (vfio_group_fd < 0) {
- /* if file not found, it's not an error */
- if (errno != ENOENT) {
- EAL_LOG(ERR, "Cannot open %s: %s",
- filename, strerror(errno));
- return -1;
- }
-
- /* special case: try no-IOMMU path as well */
- snprintf(filename, sizeof(filename), RTE_VFIO_NOIOMMU_GROUP_FMT,
- iommu_group_num);
- vfio_group_fd = open(filename, O_RDWR);
- if (vfio_group_fd < 0) {
- if (errno != ENOENT) {
- EAL_LOG(ERR,
- "Cannot open %s: %s",
- filename, strerror(errno));
- return -1;
- }
- return -ENOENT;
- }
- /* noiommu group found */
- }
-
- return vfio_group_fd;
- }
- /* if we're in a secondary process, request group fd from the primary
- * process via mp channel.
- */
- p->req = SOCKET_REQ_GROUP;
- p->group_num = iommu_group_num;
- strcpy(mp_req.name, EAL_VFIO_MP);
- mp_req.len_param = sizeof(*p);
- mp_req.num_fds = 0;
-
- vfio_group_fd = -1;
- if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
- mp_reply.nb_received == 1) {
- mp_rep = &mp_reply.msgs[0];
- p = (struct vfio_mp_param *)mp_rep->param;
- if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
- vfio_group_fd = mp_rep->fds[0];
- } else if (p->result == SOCKET_NO_FD) {
- EAL_LOG(ERR, "Bad VFIO group fd");
- vfio_group_fd = -ENOENT;
- }
- }
-
- free(mp_reply.msgs);
- if (vfio_group_fd < 0 && vfio_group_fd != -ENOENT)
- EAL_LOG(ERR, "Cannot request VFIO group fd");
- return vfio_group_fd;
-}
-
-static struct vfio_config *
-get_vfio_cfg_by_group_num(int iommu_group_num)
-{
- struct vfio_config *vfio_cfg;
- unsigned int i, j;
-
- for (i = 0; i < RTE_DIM(vfio_cfgs); i++) {
- vfio_cfg = &vfio_cfgs[i];
- for (j = 0; j < RTE_DIM(vfio_cfg->vfio_groups); j++) {
- if (vfio_cfg->vfio_groups[j].group_num ==
- iommu_group_num)
- return vfio_cfg;
- }
- }
-
- return NULL;
-}
-
-static int
-vfio_get_group_fd(struct vfio_config *vfio_cfg,
- int iommu_group_num)
-{
- struct vfio_group *cur_grp = NULL;
- int vfio_group_fd;
- unsigned int i;
-
- /* check if we already have the group descriptor open */
- for (i = 0; i < RTE_DIM(vfio_cfg->vfio_groups); i++)
- if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num)
- return vfio_cfg->vfio_groups[i].fd;
-
- /* Lets see first if there is room for a new group */
- if (vfio_cfg->vfio_active_groups == RTE_DIM(vfio_cfg->vfio_groups)) {
- EAL_LOG(ERR, "Maximum number of VFIO groups reached!");
- return -1;
- }
-
- /* Now lets get an index for the new group */
- for (i = 0; i < RTE_DIM(vfio_cfg->vfio_groups); i++)
- if (vfio_cfg->vfio_groups[i].group_num == -1) {
- cur_grp = &vfio_cfg->vfio_groups[i];
- break;
- }
-
- /* This should not happen */
- if (cur_grp == NULL) {
- EAL_LOG(ERR, "No VFIO group free slot found");
- return -1;
- }
-
- /*
- * When opening a group fd, we need to decide whether to open it locally
- * or request it from the primary process via mp_sync.
- *
- * For the default container, secondary processes use mp_sync so that
- * the primary process tracks the group fd and maintains VFIO state
- * across all processes.
- *
- * For custom containers, we open the group fd locally in each process
- * since custom containers are process-local and the primary has no
- * knowledge of them. Requesting a group fd from the primary for a
- * container it doesn't know about would be incorrect.
- */
- const struct internal_config *internal_conf = eal_get_internal_configuration();
- bool mp_request = (internal_conf->process_type == RTE_PROC_SECONDARY) &&
- (vfio_cfg == default_vfio_cfg);
-
- vfio_group_fd = vfio_open_group_fd(iommu_group_num, mp_request);
- if (vfio_group_fd < 0) {
- EAL_LOG(ERR, "Failed to open VFIO group %d",
- iommu_group_num);
- return vfio_group_fd;
- }
-
- cur_grp->group_num = iommu_group_num;
- cur_grp->fd = vfio_group_fd;
- vfio_cfg->vfio_active_groups++;
-
- return vfio_group_fd;
-}
-
-static struct vfio_config *
-get_vfio_cfg_by_group_fd(int vfio_group_fd)
-{
- struct vfio_config *vfio_cfg;
- unsigned int i, j;
-
- for (i = 0; i < RTE_DIM(vfio_cfgs); i++) {
- vfio_cfg = &vfio_cfgs[i];
- for (j = 0; j < RTE_DIM(vfio_cfg->vfio_groups); j++)
- if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
- return vfio_cfg;
- }
-
- return NULL;
+ return cfg == vfio_cfg.default_cfg;
}
-static struct vfio_config *
-get_vfio_cfg_by_container_fd(int container_fd)
+static struct container *
+vfio_container_get_by_fd(int container_fd)
{
- unsigned int i;
+ struct container *cfg;
if (container_fd == RTE_VFIO_DEFAULT_CONTAINER_FD)
- return default_vfio_cfg;
+ return vfio_cfg.default_cfg;
- for (i = 0; i < RTE_DIM(vfio_cfgs); i++) {
- if (vfio_cfgs[i].vfio_container_fd == container_fd)
- return &vfio_cfgs[i];
+ CONTAINER_FOREACH_ACTIVE(cfg) {
+ if (cfg->container_fd == container_fd)
+ return cfg;
}
-
return NULL;
}
-int
-vfio_get_group_fd_by_num(int iommu_group_num)
+static struct container *
+vfio_container_get_by_group_num(int group_num)
{
- struct vfio_config *vfio_cfg;
-
- /* get the vfio_config it belongs to */
- vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
- vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
+ struct container *cfg;
+ struct vfio_group *grp;
- return vfio_get_group_fd(vfio_cfg, iommu_group_num);
+ CONTAINER_FOREACH_ACTIVE(cfg) {
+ GROUP_FOREACH_ACTIVE(cfg, grp)
+ if (grp->group_num == group_num)
+ return cfg;
+ }
+ return NULL;
}
-static int
-get_vfio_group_idx(int vfio_group_fd)
+static struct container *
+vfio_container_create(void)
{
- struct vfio_config *vfio_cfg;
- unsigned int i, j;
-
- for (i = 0; i < RTE_DIM(vfio_cfgs); i++) {
- vfio_cfg = &vfio_cfgs[i];
- for (j = 0; j < RTE_DIM(vfio_cfg->vfio_groups); j++)
- if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
- return j;
+ struct container *cfg;
+
+ /* find an unused container config */
+ CONTAINER_FOREACH(cfg) {
+ if (!cfg->active) {
+ *cfg = CONTAINER_INITIALIZER;
+ cfg->active = true;
+ return cfg;
+ }
}
-
- return -1;
+ /* no space */
+ return NULL;
}
static void
-vfio_group_device_get(int vfio_group_fd)
+vfio_container_erase(struct container *cfg)
{
- struct vfio_config *vfio_cfg;
- int i;
+ if (cfg->container_fd >= 0 && close(cfg->container_fd))
+ EAL_LOG(ERR, "Error when closing container, %d (%s)", errno, strerror(errno));
- vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
- if (vfio_cfg == NULL) {
- EAL_LOG(ERR, "Invalid VFIO group fd!");
- return;
- }
-
- i = get_vfio_group_idx(vfio_group_fd);
- if (i < 0)
- EAL_LOG(ERR, "Wrong VFIO group index (%d)", i);
- else
- vfio_cfg->vfio_groups[i].devices++;
+ *cfg = (struct container){0};
}
-static void
-vfio_group_device_put(int vfio_group_fd)
+static struct vfio_device *
+vfio_device_create(struct container *cfg)
{
- struct vfio_config *vfio_cfg;
- int i;
+ struct vfio_device *dev;
- vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
- if (vfio_cfg == NULL) {
- EAL_LOG(ERR, "Invalid VFIO group fd!");
- return;
- }
+ /* is there space? */
+ if (cfg->n_devices == RTE_DIM(cfg->devices))
+ return NULL;
- i = get_vfio_group_idx(vfio_group_fd);
- if (i < 0)
- EAL_LOG(ERR, "Wrong VFIO group index (%d)", i);
- else
- vfio_cfg->vfio_groups[i].devices--;
-}
+ DEVICE_FOREACH(cfg, dev) {
+ if (dev->active)
+ continue;
+ dev->active = true;
+ /* set to invalid fd */
+ dev->fd = -1;
-static int
-vfio_group_device_count(int vfio_group_fd)
-{
- struct vfio_config *vfio_cfg;
- int i;
-
- vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
- if (vfio_cfg == NULL) {
- EAL_LOG(ERR, "Invalid VFIO group fd!");
- return -1;
+ cfg->n_devices++;
+ return dev;
}
+ /* should not happen */
+ EAL_LOG(WARNING, "Could not find space in device list for container");
+ return NULL;
+}
- i = get_vfio_group_idx(vfio_group_fd);
- if (i < 0) {
- EAL_LOG(ERR, "Wrong VFIO group index (%d)", i);
- return -1;
- }
+static void
+vfio_device_erase(struct container *cfg, struct vfio_device *dev)
+{
+ if (dev->fd >= 0 && close(dev->fd))
+ EAL_LOG(ERR, "Error when closing device, %d (%s)", errno, strerror(errno));
- return vfio_cfg->vfio_groups[i].devices;
+ *dev = (struct vfio_device){0};
+ cfg->n_devices--;
}
static void
vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len,
void *arg __rte_unused)
{
+ struct container *cfg = vfio_cfg.default_cfg;
struct rte_memseg_list *msl;
struct rte_memseg *ms;
size_t cur_len = 0;
@@ -640,11 +427,9 @@ vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len,
/* Maintain granularity of DMA map/unmap to memseg size */
for (; cur_len < len; cur_len += page_sz) {
if (type == RTE_MEM_EVENT_ALLOC)
- vfio_dma_mem_map(default_vfio_cfg, vfio_va,
- vfio_va, page_sz, 1);
+ vfio_dma_mem_map(cfg, vfio_va, vfio_va, page_sz, 1);
else
- vfio_dma_mem_map(default_vfio_cfg, vfio_va,
- vfio_va, page_sz, 0);
+ vfio_dma_mem_map(cfg, vfio_va, vfio_va, page_sz, 0);
vfio_va += page_sz;
}
@@ -662,11 +447,9 @@ vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len,
goto next;
}
if (type == RTE_MEM_EVENT_ALLOC)
- vfio_dma_mem_map(default_vfio_cfg, ms->addr_64,
- ms->iova, ms->len, 1);
+ vfio_dma_mem_map(cfg, ms->addr_64, ms->iova, ms->len, 1);
else
- vfio_dma_mem_map(default_vfio_cfg, ms->addr_64,
- ms->iova, ms->len, 0);
+ vfio_dma_mem_map(cfg, ms->addr_64, ms->iova, ms->len, 0);
next:
cur_len += ms->len;
++ms;
@@ -674,445 +457,535 @@ vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len,
}
static int
-vfio_sync_default_container(void)
+vfio_register_mem_event_callback(void)
{
- struct rte_mp_msg mp_req, *mp_rep;
- struct rte_mp_reply mp_reply = {0};
- struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
- struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
- int iommu_type_id;
- unsigned int i;
+ int ret;
- /* cannot be called from primary */
- if (rte_eal_process_type() != RTE_PROC_SECONDARY)
- return -1;
+ ret = rte_mem_event_callback_register(VFIO_MEM_EVENT_CLB_NAME,
+ vfio_mem_event_callback, NULL);
- /* default container fd should have been opened in rte_vfio_enable() */
- if (!default_vfio_cfg->vfio_enabled ||
- default_vfio_cfg->vfio_container_fd < 0) {
- EAL_LOG(ERR, "VFIO support is not initialized");
+ if (ret && rte_errno != ENOTSUP) {
+ EAL_LOG(ERR, "Could not install memory event callback for VFIO");
return -1;
}
+ if (ret)
+ EAL_LOG(DEBUG, "Memory event callbacks not supported");
+ else
+ EAL_LOG(DEBUG, "Installed memory event callback for VFIO");
- /* find default container's IOMMU type */
- p->req = SOCKET_REQ_IOMMU_TYPE;
- strcpy(mp_req.name, EAL_VFIO_MP);
- mp_req.len_param = sizeof(*p);
- mp_req.num_fds = 0;
-
- iommu_type_id = -1;
- if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
- mp_reply.nb_received == 1) {
- mp_rep = &mp_reply.msgs[0];
- p = (struct vfio_mp_param *)mp_rep->param;
- if (p->result == SOCKET_OK)
- iommu_type_id = p->iommu_type_id;
- }
- free(mp_reply.msgs);
- if (iommu_type_id < 0) {
- EAL_LOG(ERR,
- "Could not get IOMMU type for default container");
- return -1;
- }
-
- /* we now have an fd for default container, as well as its IOMMU type.
- * now, set up default VFIO container config to match.
- */
- for (i = 0; i < RTE_DIM(iommu_types); i++) {
- const struct vfio_iommu_type *t = &iommu_types[i];
- if (t->type_id != iommu_type_id)
- continue;
-
- /* we found our IOMMU type */
- default_vfio_cfg->vfio_iommu_type = t;
-
- return 0;
- }
- EAL_LOG(ERR, "Could not find IOMMU type id (%i)",
- iommu_type_id);
- return -1;
+ return 0;
}
static int
-vfio_clear_group(int vfio_group_fd)
+vfio_setup_dma_mem(struct container *cfg)
{
- int i;
- struct vfio_config *vfio_cfg;
+ struct user_mem_maps *user_mem_maps = &cfg->mem_maps;
+ int i, ret;
- vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
- if (vfio_cfg == NULL) {
- EAL_LOG(ERR, "Invalid VFIO group fd!");
+ /* do we need to map DPDK-managed memory? */
+ if (vfio_container_is_default(cfg) && rte_eal_process_type() == RTE_PROC_PRIMARY)
+ ret = vfio_cfg.ops->dma_map_func(cfg);
+ else
+ ret = 0;
+ if (ret) {
+ EAL_LOG(ERR, "DMA remapping failed, error %i (%s)",
+ errno, strerror(errno));
return -1;
}
- i = get_vfio_group_idx(vfio_group_fd);
- if (i < 0)
- return -1;
- vfio_cfg->vfio_groups[i].group_num = -1;
- vfio_cfg->vfio_groups[i].fd = -1;
- vfio_cfg->vfio_groups[i].devices = 0;
- vfio_cfg->vfio_active_groups--;
+ /*
+ * not all IOMMU types support DMA mapping, but if we have mappings in the list - that
+ * means we have previously mapped something successfully, so we can be sure that DMA
+ * mapping is supported.
+ */
+ for (i = 0; i < user_mem_maps->n_maps; i++) {
+ struct user_mem_map *map;
+ map = &user_mem_maps->maps[i];
+
+ ret = vfio_cfg.ops->dma_user_map_func(cfg, map->addr, map->iova, map->len, 1);
+ if (ret) {
+ EAL_LOG(ERR, "Couldn't map user memory for DMA: "
+ "va: 0x%" PRIx64 " "
+ "iova: 0x%" PRIx64 " "
+ "len: 0x%" PRIu64,
+ map->addr, map->iova,
+ map->len);
+ return -1;
+ }
+ }
return 0;
}
+static enum vfio_result
+vfio_group_assign_device(struct container *cfg, const char *sysfs_base,
+ const char *dev_addr, struct vfio_device **out_dev)
+{
+ struct vfio_group_config *group_cfg = &cfg->group_cfg;
+ struct vfio_group *grp;
+ struct vfio_device *dev;
+ int iommu_group_num;
+ enum vfio_result res;
+ int ret;
+
+ /* allocate new device in config */
+ dev = vfio_device_create(cfg);
+ if (dev == NULL) {
+ EAL_LOG(ERR, "No space to track new VFIO device");
+ return VFIO_NO_SPACE;
+ }
+
+ /* remember to register mem event callback for default container in primary */
+ bool need_clb = vfio_container_is_default(cfg) &&
+ rte_eal_process_type() == RTE_PROC_PRIMARY;
+
+ /* get group number for this device */
+ ret = vfio_group_get_num(sysfs_base, dev_addr, &iommu_group_num);
+ if (ret < 0) {
+ EAL_LOG(ERR, "Cannot get IOMMU group for %s", dev_addr);
+ res = VFIO_ERROR;
+ goto device_erase;
+ } else if (ret == 0) {
+ res = VFIO_NOT_MANAGED;
+ goto device_erase;
+ }
+
+ /* group may already exist as multiple devices may share group */
+ grp = vfio_group_get_by_num(cfg, iommu_group_num);
+ if (grp == NULL) {
+ /* no device currently uses this group, create it */
+ grp = vfio_group_create(cfg, iommu_group_num);
+ if (grp == NULL) {
+ EAL_LOG(ERR, "Cannot allocate group for device %s", dev_addr);
+ res = VFIO_NO_SPACE;
+ goto device_erase;
+ }
+
+ /* open group fd */
+ ret = vfio_group_open_fd(cfg, grp);
+ if (ret == -ENOENT) {
+ EAL_LOG(DEBUG, "Device %s (IOMMU group %d) not managed by VFIO",
+ dev_addr, iommu_group_num);
+ res = VFIO_NOT_MANAGED;
+ goto group_erase;
+ } else if (ret < 0) {
+ EAL_LOG(ERR, "Cannot open VFIO group %d for device %s",
+ iommu_group_num, dev_addr);
+ res = VFIO_ERROR;
+ goto group_erase;
+ }
+
+ /* prepare group (viability + container attach) */
+ ret = vfio_group_prepare(cfg, grp);
+ if (ret < 0) {
+ res = VFIO_ERROR;
+ goto group_erase;
+ }
+
+ /* set up IOMMU type once per container */
+ if (!group_cfg->iommu_type_set) {
+ ret = vfio_group_setup_iommu(cfg);
+ if (ret < 0) {
+ res = VFIO_ERROR;
+ goto group_erase;
+ }
+ group_cfg->iommu_type_set = true;
+ }
+
+ /* set up DMA memory once per container */
+ if (!group_cfg->dma_setup_done) {
+ rte_spinlock_recursive_lock(&cfg->mem_maps.lock);
+ ret = vfio_setup_dma_mem(cfg);
+ rte_spinlock_recursive_unlock(&cfg->mem_maps.lock);
+ if (ret < 0) {
+ EAL_LOG(ERR, "DMA remapping for %s failed", dev_addr);
+ res = VFIO_ERROR;
+ goto group_erase;
+ }
+ group_cfg->dma_setup_done = true;
+ }
+
+ /* set up mem event callback if needed */
+ if (need_clb && !group_cfg->mem_event_clb_set) {
+ ret = vfio_register_mem_event_callback();
+ if (ret < 0) {
+ res = VFIO_ERROR;
+ goto group_erase;
+ }
+ group_cfg->mem_event_clb_set = true;
+ }
+ }
+
+ /* open dev fd */
+ ret = vfio_group_setup_device_fd(dev_addr, grp, dev);
+ if (ret < 0) {
+ EAL_LOG(ERR, "Cannot open VFIO device %s, error %i (%s)",
+ dev_addr, errno, strerror(errno));
+ res = VFIO_ERROR;
+ goto group_erase;
+ }
+
+ /*
+ * we would've liked to prevent user from assigning devices twice to
+ * prevent resource leaks, but for group mode this is not possible, as
+ * there is no way to know which fd belongs to which group/device.
+ *
+ * we also do not need to look in other configs as if we were to attempt
+ * to use a different container, the kernel wouldn't have allowed us to
+ * bind the group to the container in the first place.
+ */
+ *out_dev = dev;
+ return VFIO_SUCCESS;
+group_erase:
+ /* this may be a pre-existing group so only erase it if it has no devices */
+ if (grp->n_devices == 0)
+ vfio_group_erase(cfg, grp);
+ /* if we registered callback, unregister it */
+ if (group_cfg->n_groups == 0 && group_cfg->mem_event_clb_set) {
+ rte_mem_event_callback_unregister(VFIO_MEM_EVENT_CLB_NAME, NULL);
+ group_cfg->mem_event_clb_set = false;
+ }
+device_erase:
+ vfio_device_erase(cfg, dev);
+ return res;
+}
+
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_assign_device)
+int
+rte_vfio_container_assign_device(int container_fd, const char *sysfs_base, const char *dev_addr)
+{
+ struct container *cfg;
+ enum vfio_result res;
+ struct vfio_device *dev;
+
+ if (sysfs_base == NULL || dev_addr == NULL) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ if (vfio_cfg.mode == RTE_VFIO_MODE_NONE) {
+ EAL_LOG(ERR, "VFIO support not initialized");
+ rte_errno = ENXIO;
+ return -1;
+ }
+
+ cfg = vfio_container_get_by_fd(container_fd);
+ if (cfg == NULL) {
+ EAL_LOG(ERR, "Invalid VFIO container fd");
+ rte_errno = EINVAL;
+ return -1;
+ }
+ /* protect memory configuration while setting up IOMMU/DMA */
+ rte_mcfg_mem_read_lock();
+
+ switch (vfio_cfg.mode) {
+ case RTE_VFIO_MODE_GROUP:
+ case RTE_VFIO_MODE_NOIOMMU:
+ res = vfio_group_assign_device(cfg, sysfs_base, dev_addr, &dev);
+ break;
+ default:
+ EAL_LOG(ERR, "Unsupported VFIO mode");
+ res = VFIO_NOT_SUPPORTED;
+ break;
+ }
+ rte_mcfg_mem_read_unlock();
+
+ switch (res) {
+ case VFIO_SUCCESS:
+ return 0;
+ case VFIO_EXISTS:
+ rte_errno = EEXIST;
+ return -1;
+ case VFIO_NOT_MANAGED:
+ EAL_LOG(DEBUG, "Device %s not managed by VFIO", dev_addr);
+ rte_errno = ENODEV;
+ return -1;
+ case VFIO_NO_SPACE:
+ EAL_LOG(ERR, "No space in VFIO container to assign device %s", dev_addr);
+ rte_errno = ENOSPC;
+ return -1;
+ default:
+ EAL_LOG(ERR, "Error assigning device %s to container", dev_addr);
+ rte_errno = EIO;
+ return -1;
+ }
+}
+
RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_setup_device)
int
rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
int *vfio_dev_fd)
{
- struct vfio_group_status group_status = {
- .argsz = sizeof(group_status)
- };
- struct vfio_config *vfio_cfg;
- struct user_mem_maps *user_mem_maps;
- int vfio_container_fd;
- int vfio_group_fd;
- int iommu_group_num;
- rte_uuid_t vf_token;
- int i, ret;
- const struct internal_config *internal_conf =
- eal_get_internal_configuration();
-
- /* get group number */
- ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num);
- if (ret == 0) {
- EAL_LOG(NOTICE,
- "%s not managed by VFIO driver, skipping",
- dev_addr);
- return 1;
- }
-
- /* if negative, something failed */
- if (ret < 0)
- return -1;
-
- /* get the actual group fd */
- vfio_group_fd = vfio_get_group_fd_by_num(iommu_group_num);
- if (vfio_group_fd < 0 && vfio_group_fd != -ENOENT)
- return -1;
-
- /*
- * if vfio_group_fd == -ENOENT, that means the device
- * isn't managed by VFIO
- */
- if (vfio_group_fd == -ENOENT) {
- EAL_LOG(NOTICE,
- "%s not managed by VFIO driver, skipping",
- dev_addr);
- return 1;
- }
-
- /*
- * at this point, we know that this group is viable (meaning, all devices
- * are either bound to VFIO or not bound to anything)
- */
-
- /* check if the group is viable */
- ret = ioctl(vfio_group_fd, VFIO_GROUP_GET_STATUS, &group_status);
- if (ret) {
- EAL_LOG(ERR, "%s cannot get VFIO group status, "
- "error %i (%s)", dev_addr, errno, strerror(errno));
- close(vfio_group_fd);
- vfio_clear_group(vfio_group_fd);
- return -1;
- } else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
- EAL_LOG(ERR, "%s VFIO group is not viable! "
- "Not all devices in IOMMU group bound to VFIO or unbound",
- dev_addr);
- close(vfio_group_fd);
- vfio_clear_group(vfio_group_fd);
- return -1;
- }
-
- /* get the vfio_config it belongs to */
- vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
- vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
- vfio_container_fd = vfio_cfg->vfio_container_fd;
- user_mem_maps = &vfio_cfg->mem_maps;
-
- /* check if group does not have a container yet */
- if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) {
-
- /* add group to a container */
- ret = ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER,
- &vfio_container_fd);
- if (ret) {
- EAL_LOG(ERR,
- "%s cannot add VFIO group to container, error "
- "%i (%s)", dev_addr, errno, strerror(errno));
- close(vfio_group_fd);
- vfio_clear_group(vfio_group_fd);
- return -1;
- }
-
- /*
- * pick an IOMMU type and set up DMA mappings for container
- *
- * needs to be done only once, only when first group is
- * assigned to a container and only in primary process.
- * Note this can happen several times with the hotplug
- * functionality.
- */
- if (internal_conf->process_type == RTE_PROC_PRIMARY &&
- vfio_cfg->vfio_active_groups == 1 &&
- vfio_group_device_count(vfio_group_fd) == 0) {
- const struct vfio_iommu_type *t;
-
- /* select an IOMMU type which we will be using */
- t = vfio_set_iommu_type(vfio_container_fd);
- if (!t) {
- EAL_LOG(ERR,
- "%s failed to select IOMMU type",
- dev_addr);
- close(vfio_group_fd);
- vfio_clear_group(vfio_group_fd);
- return -1;
- }
- /* lock memory hotplug before mapping and release it
- * after registering callback, to prevent races
- */
- rte_mcfg_mem_read_lock();
- if (vfio_cfg == default_vfio_cfg)
- ret = t->dma_map_func(vfio_container_fd);
- else
- ret = 0;
- if (ret) {
- EAL_LOG(ERR,
- "%s DMA remapping failed, error "
- "%i (%s)",
- dev_addr, errno, strerror(errno));
- close(vfio_group_fd);
- vfio_clear_group(vfio_group_fd);
- rte_mcfg_mem_read_unlock();
- return -1;
- }
-
- vfio_cfg->vfio_iommu_type = t;
-
- /* re-map all user-mapped segments */
- rte_spinlock_recursive_lock(&user_mem_maps->lock);
-
- /* this IOMMU type may not support DMA mapping, but
- * if we have mappings in the list - that means we have
- * previously mapped something successfully, so we can
- * be sure that DMA mapping is supported.
- */
- for (i = 0; i < user_mem_maps->n_maps; i++) {
- struct user_mem_map *map;
- map = &user_mem_maps->maps[i];
-
- ret = t->dma_user_map_func(
- vfio_container_fd,
- map->addr, map->iova, map->len,
- 1);
- if (ret) {
- EAL_LOG(ERR, "Couldn't map user memory for DMA: "
- "va: 0x%" PRIx64 " "
- "iova: 0x%" PRIx64 " "
- "len: 0x%" PRIu64,
- map->addr, map->iova,
- map->len);
- rte_spinlock_recursive_unlock(
- &user_mem_maps->lock);
- rte_mcfg_mem_read_unlock();
- return -1;
- }
- }
- rte_spinlock_recursive_unlock(&user_mem_maps->lock);
-
- /* register callback for mem events */
- if (vfio_cfg == default_vfio_cfg)
- ret = rte_mem_event_callback_register(
- VFIO_MEM_EVENT_CLB_NAME,
- vfio_mem_event_callback, NULL);
- else
- ret = 0;
- /* unlock memory hotplug */
- rte_mcfg_mem_read_unlock();
-
- if (ret && rte_errno != ENOTSUP) {
- EAL_LOG(ERR, "Could not install memory event callback for VFIO");
- return -1;
- }
- if (ret)
- EAL_LOG(DEBUG, "Memory event callbacks not supported");
- else
- EAL_LOG(DEBUG, "Installed memory event callback for VFIO");
- }
- } else if (rte_eal_process_type() != RTE_PROC_PRIMARY &&
- vfio_cfg == default_vfio_cfg &&
- vfio_cfg->vfio_iommu_type == NULL) {
- /* if we're not a primary process, we do not set up the VFIO
- * container because it's already been set up by the primary
- * process. instead, we simply ask the primary about VFIO type
- * we are using, and set the VFIO config up appropriately.
- */
- ret = vfio_sync_default_container();
- if (ret < 0) {
- EAL_LOG(ERR, "Could not sync default VFIO container");
- close(vfio_group_fd);
- vfio_clear_group(vfio_group_fd);
- return -1;
- }
- /* we have successfully initialized VFIO, notify user */
- const struct vfio_iommu_type *t =
- default_vfio_cfg->vfio_iommu_type;
- EAL_LOG(INFO, "Using IOMMU type %d (%s)",
- t->type_id, t->name);
- }
-
- rte_eal_vfio_get_vf_token(vf_token);
-
- /* get a file descriptor for the device with VF token firstly */
- if (!rte_uuid_is_null(vf_token)) {
- char vf_token_str[RTE_UUID_STRLEN];
- char dev[PATH_MAX];
-
- rte_uuid_unparse(vf_token, vf_token_str, sizeof(vf_token_str));
- snprintf(dev, sizeof(dev),
- "%s vf_token=%s", dev_addr, vf_token_str);
-
- *vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD,
- dev);
- if (*vfio_dev_fd >= 0)
- goto out;
- }
-
- /* get a file descriptor for the device */
- *vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, dev_addr);
- if (*vfio_dev_fd < 0) {
- /* if we cannot get a device fd, this implies a problem with
- * the VFIO group or the container not having IOMMU configured.
- */
-
- EAL_LOG(WARNING, "Getting a vfio_dev_fd for %s failed",
- dev_addr);
- close(vfio_group_fd);
- vfio_clear_group(vfio_group_fd);
- return -1;
- }
-
- /* device is now set up */
-out:
- vfio_group_device_get(vfio_group_fd);
-
- return 0;
-}
-
-RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_release_device)
-int
-rte_vfio_release_device(const char *sysfs_base, const char *dev_addr,
- int vfio_dev_fd)
-{
- struct vfio_config *vfio_cfg;
- int vfio_group_fd;
- int iommu_group_num;
+ struct container *cfg;
+ struct vfio_device *dev;
+ enum vfio_result res;
int ret;
- /* we don't want any DMA mapping messages to come while we're detaching
- * VFIO device, because this might be the last device and we might need
- * to unregister the callback.
- */
+ if (sysfs_base == NULL || dev_addr == NULL || vfio_dev_fd == NULL) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ if (vfio_cfg.mode == RTE_VFIO_MODE_NONE) {
+ EAL_LOG(ERR, "VFIO support not initialized");
+ rte_errno = ENXIO;
+ return -1;
+ }
+
rte_mcfg_mem_read_lock();
- /* get group number */
- ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num);
- if (ret <= 0) {
- EAL_LOG(WARNING, "%s not managed by VFIO driver",
- dev_addr);
- /* This is an error at this point. */
- ret = -1;
- goto out;
- }
-
- /* get the actual group fd */
- vfio_group_fd = vfio_get_group_fd_by_num(iommu_group_num);
- if (vfio_group_fd < 0) {
- EAL_LOG(INFO, "vfio_get_group_fd_by_num failed for %s",
- dev_addr);
- ret = vfio_group_fd;
- goto out;
+ switch (vfio_cfg.mode) {
+ case RTE_VFIO_MODE_GROUP:
+ case RTE_VFIO_MODE_NOIOMMU:
+ {
+ int iommu_group_num;
+
+ /* find group number */
+ ret = vfio_group_get_num(sysfs_base, dev_addr, &iommu_group_num);
+ if (ret < 0)
+ goto assign_fail;
+ else if (ret == 0)
+ goto not_managed;
+
+ /* find config by group */
+ cfg = vfio_container_get_by_group_num(iommu_group_num);
+ if (cfg == NULL)
+ cfg = vfio_cfg.default_cfg;
+
+ res = vfio_group_assign_device(cfg, sysfs_base, dev_addr, &dev);
+ break;
}
-
- /* get the vfio_config it belongs to */
- vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
- vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
-
- /* At this point we got an active group. Closing it will make the
- * container detachment. If this is the last active group, VFIO kernel
- * code will unset the container and the IOMMU mappings.
- */
-
- /* Closing a device */
- if (close(vfio_dev_fd) < 0) {
- EAL_LOG(INFO, "Error when closing vfio_dev_fd for %s",
- dev_addr);
+ default:
+ EAL_LOG(ERR, "Unsupported VFIO mode");
+ rte_errno = ENOTSUP;
ret = -1;
- goto out;
+ goto unlock;
}
- /* An VFIO group can have several devices attached. Just when there is
- * no devices remaining should the group be closed.
- */
- vfio_group_device_put(vfio_group_fd);
- if (!vfio_group_device_count(vfio_group_fd)) {
-
- if (close(vfio_group_fd) < 0) {
- EAL_LOG(INFO, "Error when closing vfio_group_fd for %s",
- dev_addr);
- ret = -1;
- goto out;
- }
-
- if (vfio_clear_group(vfio_group_fd) < 0) {
- EAL_LOG(INFO, "Error when clearing group for %s",
- dev_addr);
- ret = -1;
- goto out;
- }
+ switch (res) {
+ case VFIO_NOT_MANAGED:
+not_managed:
+ EAL_LOG(DEBUG, "Device %s not managed by VFIO", dev_addr);
+ rte_errno = ENODEV;
+ ret = -1;
+ goto unlock;
+ case VFIO_SUCCESS:
+ case VFIO_EXISTS:
+ break;
+ case VFIO_NO_SPACE:
+ EAL_LOG(ERR, "No space in VFIO container to assign device %s", dev_addr);
+ rte_errno = ENOSPC;
+ ret = -1;
+ goto unlock;
+ default:
+assign_fail:
+ EAL_LOG(ERR, "Error assigning device %s to container", dev_addr);
+ rte_errno = EIO;
+ ret = -1;
+ goto unlock;
}
-
- /* if there are no active device groups, unregister the callback to
- * avoid spurious attempts to map/unmap memory from VFIO.
- */
- if (vfio_cfg == default_vfio_cfg && vfio_cfg->vfio_active_groups == 0 &&
- rte_eal_process_type() != RTE_PROC_SECONDARY)
- rte_mem_event_callback_unregister(VFIO_MEM_EVENT_CLB_NAME,
- NULL);
+ *vfio_dev_fd = dev->fd;
/* success */
ret = 0;
-out:
+unlock:
rte_mcfg_mem_read_unlock();
+
return ret;
}
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_release_device)
+int
+rte_vfio_release_device(const char *sysfs_base __rte_unused,
+ const char *dev_addr, int vfio_dev_fd)
+{
+ struct container *cfg = NULL, *icfg;
+ struct vfio_device *dev = NULL, *idev;
+ int ret;
+
+ if (sysfs_base == NULL || dev_addr == NULL) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ if (vfio_cfg.mode == RTE_VFIO_MODE_NONE) {
+ EAL_LOG(ERR, "VFIO support not initialized");
+ rte_errno = ENXIO;
+ return -1;
+ }
+
+ rte_mcfg_mem_read_lock();
+
+ /* we need to find both config and device */
+ CONTAINER_FOREACH_ACTIVE(icfg) {
+ DEVICE_FOREACH_ACTIVE(icfg, idev) {
+ if (idev->fd != vfio_dev_fd)
+ continue;
+ cfg = icfg;
+ dev = idev;
+ goto found;
+ }
+ }
+found:
+ if (dev == NULL) {
+ EAL_LOG(ERR, "Device %s not managed by any container", dev_addr);
+ rte_errno = ENOENT;
+ ret = -1;
+ goto unlock;
+ }
+
+ switch (vfio_cfg.mode) {
+ case RTE_VFIO_MODE_GROUP:
+ case RTE_VFIO_MODE_NOIOMMU:
+ {
+ int iommu_group_num = dev->group;
+ struct vfio_group_config *group_cfg = &cfg->group_cfg;
+ struct vfio_group *grp;
+
+ bool need_clb = vfio_container_is_default(cfg) &&
+ rte_eal_process_type() == RTE_PROC_PRIMARY;
+
+ /* find the group */
+ grp = vfio_group_get_by_num(cfg, iommu_group_num);
+ if (grp == NULL) {
+ /* shouldn't happen because we already know the device is valid */
+ EAL_LOG(ERR, "IOMMU group %d not found in container",
+ iommu_group_num);
+ rte_errno = EIO;
+ ret = -1;
+ goto unlock;
+ }
+
+ /* close device handle */
+ vfio_device_erase(cfg, dev);
+
+ /* remove device from group */
+ grp->n_devices--;
+
+ /* was this the last device? */
+ if (grp->n_devices == 0)
+ vfio_group_erase(cfg, grp);
+
+ /* if no more groups left, remove callback */
+ if (need_clb && group_cfg->n_groups == 0 && group_cfg->mem_event_clb_set) {
+ rte_mem_event_callback_unregister(VFIO_MEM_EVENT_CLB_NAME, NULL);
+ group_cfg->mem_event_clb_set = false;
+ }
+ break;
+ }
+ default:
+ EAL_LOG(ERR, "Unsupported VFIO mode");
+ rte_errno = ENOTSUP;
+ ret = -1;
+ goto unlock;
+ }
+ ret = 0;
+unlock:
+ rte_mcfg_mem_read_unlock();
+
+ return ret;
+}
+
+static int
+vfio_sync_mode(struct container *cfg, enum rte_vfio_mode *mode)
+{
+ struct vfio_mp_param *p;
+ struct rte_mp_msg mp_req = {0};
+ struct rte_mp_reply mp_reply = {0};
+ struct timespec ts = {5, 0};
+
+ /* request iommufd from primary via mp_sync */
+ rte_strscpy(mp_req.name, EAL_VFIO_MP, sizeof(mp_req.name));
+ mp_req.len_param = sizeof(*p);
+ mp_req.num_fds = 0;
+ p = (struct vfio_mp_param *)mp_req.param;
+ p->req = SOCKET_REQ_CONTAINER;
+
+ if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
+ mp_reply.nb_received == 1) {
+ struct rte_mp_msg *mp_rep;
+ mp_rep = &mp_reply.msgs[0];
+ p = (struct vfio_mp_param *)mp_rep->param;
+ if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+ cfg->container_fd = mp_rep->fds[0];
+ *mode = p->mode;
+ free(mp_reply.msgs);
+ return 0;
+ }
+ }
+
+ free(mp_reply.msgs);
+ EAL_LOG(ERR, "Cannot request container_fd");
+ return -1;
+}
+
+static enum rte_vfio_mode
+vfio_select_mode(void)
+{
+ struct container *cfg;
+ enum rte_vfio_mode mode = RTE_VFIO_MODE_NONE;
+
+ cfg = vfio_container_create();
+ /* cannot happen */
+ if (cfg == NULL || cfg != vfio_cfg.default_cfg) {
+ EAL_LOG(ERR, "Unexpected VFIO config structure");
+ return RTE_VFIO_MODE_NONE;
+ }
+
+ /* for secondary, just ask the primary for the container and mode */
+ if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
+ struct vfio_group_config *group_cfg = &cfg->group_cfg;
+
+ if (vfio_sync_mode(cfg, &mode) < 0)
+ goto err;
+
+ /* primary handles DMA setup for default containers */
+ group_cfg->dma_setup_done = true;
+ return mode;
+ }
+ /* if we failed mp sync setup, we cannot initialize VFIO */
+ if (vfio_mp_sync_setup() < 0)
+ return RTE_VFIO_MODE_NONE;
+
+ /* try group mode first */
+ if (vfio_group_enable(cfg) == 0) {
+ /* check for noiommu */
+ int ret = vfio_group_noiommu_is_enabled();
+ if (ret < 0)
+ goto err_mpsync;
+ else if (ret == 1)
+ return RTE_VFIO_MODE_NOIOMMU;
+ return RTE_VFIO_MODE_GROUP;
+ }
+err_mpsync:
+ vfio_mp_sync_cleanup();
+err:
+ vfio_container_erase(cfg);
+
+ return RTE_VFIO_MODE_NONE;
+}
+
+static const char *
+vfio_mode_to_str(enum rte_vfio_mode mode)
+{
+ switch (mode) {
+ case RTE_VFIO_MODE_GROUP: return "group";
+ case RTE_VFIO_MODE_NOIOMMU: return "noiommu";
+ default: return "not initialized";
+ }
+}
+
RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_enable)
int
rte_vfio_enable(const char *modname)
{
- /* initialize group list */
- unsigned int i, j;
int vfio_available;
- DIR *dir;
- const struct internal_config *internal_conf =
- eal_get_internal_configuration();
+ enum rte_vfio_mode mode = RTE_VFIO_MODE_NONE;
- rte_spinlock_recursive_t lock = RTE_SPINLOCK_RECURSIVE_INITIALIZER;
-
- for (i = 0; i < RTE_DIM(vfio_cfgs); i++) {
- vfio_cfgs[i].vfio_container_fd = -1;
- vfio_cfgs[i].vfio_active_groups = 0;
- vfio_cfgs[i].vfio_iommu_type = NULL;
- vfio_cfgs[i].mem_maps.lock = lock;
-
- for (j = 0; j < RTE_DIM(vfio_cfgs[i].vfio_groups); j++) {
- vfio_cfgs[i].vfio_groups[j].fd = -1;
- vfio_cfgs[i].vfio_groups[j].group_num = -1;
- vfio_cfgs[i].vfio_groups[j].devices = 0;
- }
+ if (modname == NULL) {
+ rte_errno = EINVAL;
+ return -1;
}
EAL_LOG(DEBUG, "Probing VFIO support...");
@@ -1132,36 +1005,16 @@ rte_vfio_enable(const char *modname)
"VFIO modules not loaded, skipping VFIO support...");
return 0;
}
+ EAL_LOG(DEBUG, "VFIO module '%s' loaded, attempting to initialize VFIO...", modname);
+ mode = vfio_select_mode();
- /* VFIO directory might not exist (e.g., unprivileged containers) */
- dir = opendir(RTE_VFIO_DIR);
- if (dir == NULL) {
- EAL_LOG(DEBUG,
- "VFIO directory does not exist, skipping VFIO support...");
- return 0;
- }
- closedir(dir);
-
- if (internal_conf->process_type == RTE_PROC_PRIMARY) {
- if (vfio_mp_sync_setup() == -1) {
- default_vfio_cfg->vfio_container_fd = -1;
- } else {
- /* open a default container */
- default_vfio_cfg->vfio_container_fd = vfio_open_container_fd(false);
- }
- } else {
- /* get the default container from the primary process */
- default_vfio_cfg->vfio_container_fd =
- vfio_open_container_fd(true);
- }
-
- /* check if we have VFIO driver enabled */
- if (default_vfio_cfg->vfio_container_fd != -1) {
- EAL_LOG(INFO, "VFIO support initialized");
- default_vfio_cfg->vfio_enabled = 1;
- } else {
+ /* have we initialized anything? */
+ if (mode == RTE_VFIO_MODE_NONE)
EAL_LOG(NOTICE, "VFIO support could not be initialized");
- }
+ else
+ EAL_LOG(NOTICE, "VFIO support initialized: %s mode", vfio_mode_to_str(mode));
+
+ vfio_cfg.mode = mode;
return 0;
}
@@ -1170,40 +1023,17 @@ RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_is_enabled)
int
rte_vfio_is_enabled(const char *modname)
{
- const int mod_available = rte_eal_check_module(modname) > 0;
- return default_vfio_cfg->vfio_enabled && mod_available;
+ const int mod_available = modname ? rte_eal_check_module(modname) > 0 : 0;
+ return vfio_cfg.default_cfg->active && mod_available;
}
int
vfio_get_iommu_type(void)
{
- if (default_vfio_cfg->vfio_iommu_type == NULL)
+ if (vfio_cfg.ops == NULL)
return -1;
- return default_vfio_cfg->vfio_iommu_type->type_id;
-}
-
-const struct vfio_iommu_type *
-vfio_set_iommu_type(int vfio_container_fd)
-{
- unsigned idx;
- for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
- const struct vfio_iommu_type *t = &iommu_types[idx];
-
- int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU,
- t->type_id);
- if (!ret) {
- EAL_LOG(INFO, "Using IOMMU type %d (%s)",
- t->type_id, t->name);
- return t;
- }
- /* not an error, there may be more supported IOMMU types */
- EAL_LOG(DEBUG, "Set IOMMU type %d (%s) failed, error "
- "%i (%s)", t->type_id, t->name, errno,
- strerror(errno));
- }
- /* if we didn't find a suitable IOMMU type, fail */
- return NULL;
+ return vfio_cfg.ops->type_id;
}
RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_get_device_info)
@@ -1212,126 +1042,27 @@ rte_vfio_get_device_info(int vfio_dev_fd, struct vfio_device_info *device_info)
{
int ret;
- if (device_info == NULL || vfio_dev_fd < 0)
+ if (device_info == NULL) {
+ rte_errno = EINVAL;
return -1;
+ }
+
+ if (vfio_cfg.mode == RTE_VFIO_MODE_NONE) {
+ EAL_LOG(ERR, "VFIO support not initialized");
+ rte_errno = ENXIO;
+ return -1;
+ }
ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info);
if (ret) {
- EAL_LOG(ERR, "Cannot get device info, error %i (%s)",
- errno, strerror(errno));
+ EAL_LOG(ERR, "Cannot get device info, error %d (%s)", errno, strerror(errno));
+ rte_errno = errno;
return -1;
}
return 0;
}
-int
-vfio_has_supported_extensions(int vfio_container_fd)
-{
- int ret;
- unsigned idx, n_extensions = 0;
- for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
- const struct vfio_iommu_type *t = &iommu_types[idx];
-
- ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION,
- t->type_id);
- if (ret < 0) {
- EAL_LOG(ERR, "Could not get IOMMU type, error "
- "%i (%s)", errno, strerror(errno));
- close(vfio_container_fd);
- return -1;
- } else if (ret == 1) {
- /* we found a supported extension */
- n_extensions++;
- }
- EAL_LOG(DEBUG, "IOMMU type %d (%s) is %s",
- t->type_id, t->name,
- ret ? "supported" : "not supported");
- }
-
- /* if we didn't find any supported IOMMU types, fail */
- if (!n_extensions) {
- close(vfio_container_fd);
- return -1;
- }
-
- return 0;
-}
-
-/*
- * Open a new VFIO container fd.
- *
- * If mp_request is true, requests a new container fd from the primary process
- * via mp channel (for secondary processes that need to open the default container).
- *
- * Otherwise, opens a new container fd locally by opening /dev/vfio/vfio.
- */
-int
-vfio_open_container_fd(bool mp_request)
-{
- int ret, vfio_container_fd;
- struct rte_mp_msg mp_req, *mp_rep;
- struct rte_mp_reply mp_reply = {0};
- struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
- struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
-
- /* if not requesting via mp, open a new container locally */
- if (!mp_request) {
- vfio_container_fd = open(RTE_VFIO_CONTAINER_PATH, O_RDWR);
- if (vfio_container_fd < 0) {
- EAL_LOG(ERR, "Cannot open VFIO container %s, error %i (%s)",
- RTE_VFIO_CONTAINER_PATH, errno, strerror(errno));
- return -1;
- }
-
- /* check VFIO API version */
- ret = ioctl(vfio_container_fd, VFIO_GET_API_VERSION);
- if (ret != VFIO_API_VERSION) {
- if (ret < 0)
- EAL_LOG(ERR,
- "Could not get VFIO API version, error "
- "%i (%s)", errno, strerror(errno));
- else
- EAL_LOG(ERR, "Unsupported VFIO API version!");
- close(vfio_container_fd);
- return -1;
- }
-
- ret = vfio_has_supported_extensions(vfio_container_fd);
- if (ret) {
- EAL_LOG(ERR,
- "No supported IOMMU extensions found!");
- return -1;
- }
-
- return vfio_container_fd;
- }
- /*
- * if we're in a secondary process, request container fd from the
- * primary process via mp channel
- */
- p->req = SOCKET_REQ_CONTAINER;
- strcpy(mp_req.name, EAL_VFIO_MP);
- mp_req.len_param = sizeof(*p);
- mp_req.num_fds = 0;
-
- vfio_container_fd = -1;
- if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
- mp_reply.nb_received == 1) {
- mp_rep = &mp_reply.msgs[0];
- p = (struct vfio_mp_param *)mp_rep->param;
- if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
- vfio_container_fd = mp_rep->fds[0];
- free(mp_reply.msgs);
- return vfio_container_fd;
- }
- }
-
- free(mp_reply.msgs);
- EAL_LOG(ERR, "Cannot request VFIO container fd");
- return -1;
-}
-
RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_get_container_fd)
int
rte_vfio_get_container_fd(void)
@@ -1340,511 +1071,54 @@ rte_vfio_get_container_fd(void)
* The default container is set up during rte_vfio_enable().
* This function does not create a new container.
*/
- if (!default_vfio_cfg->vfio_enabled)
- return -1;
+ if (vfio_cfg.mode != RTE_VFIO_MODE_NONE)
+ return vfio_cfg.default_cfg->container_fd;
- return default_vfio_cfg->vfio_container_fd;
+ EAL_LOG(ERR, "VFIO support not initialized");
+ rte_errno = ENXIO;
+ return -1;
}
RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_get_group_num)
int
-rte_vfio_get_group_num(const char *sysfs_base,
- const char *dev_addr, int *iommu_group_num)
+rte_vfio_get_group_num(const char *sysfs_base, const char *dev_addr, int *iommu_group_num)
{
- char linkname[PATH_MAX];
- char filename[PATH_MAX];
- char *tok[16], *group_tok, *end;
int ret;
- memset(linkname, 0, sizeof(linkname));
- memset(filename, 0, sizeof(filename));
-
- /* try to find out IOMMU group for this device */
- snprintf(linkname, sizeof(linkname),
- "%s/%s/iommu_group", sysfs_base, dev_addr);
-
- ret = readlink(linkname, filename, sizeof(filename));
-
- /* if the link doesn't exist, no VFIO for us */
- if (ret < 0)
- return 0;
-
- ret = rte_strsplit(filename, sizeof(filename),
- tok, RTE_DIM(tok), '/');
-
- if (ret <= 0) {
- EAL_LOG(ERR, "%s cannot get IOMMU group", dev_addr);
- return -1;
- }
-
- /* IOMMU group is always the last token */
- errno = 0;
- group_tok = tok[ret - 1];
- end = group_tok;
- *iommu_group_num = strtol(group_tok, &end, 10);
- if ((end != group_tok && *end != '\0') || errno != 0) {
- EAL_LOG(ERR, "%s error parsing IOMMU number!", dev_addr);
- return -1;
- }
-
- return 1;
-}
-
-static int
-type1_map(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
- void *arg)
-{
- int *vfio_container_fd = arg;
-
- /* skip external memory that isn't a heap */
- if (msl->external && !msl->heap)
- return 0;
-
- /* skip any segments with invalid IOVA addresses */
- if (ms->iova == RTE_BAD_IOVA)
- return 0;
-
- return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova,
- ms->len, 1);
-}
-
-static int
-vfio_type1_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
- uint64_t len, int do_map)
-{
- struct vfio_iommu_type1_dma_map dma_map;
- struct vfio_iommu_type1_dma_unmap dma_unmap;
- int ret;
-
- if (do_map != 0) {
- memset(&dma_map, 0, sizeof(dma_map));
- dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
- dma_map.vaddr = vaddr;
- dma_map.size = len;
- dma_map.iova = iova;
- dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
- VFIO_DMA_MAP_FLAG_WRITE;
-
- ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
- if (ret) {
- /**
- * In case the mapping was already done EEXIST will be
- * returned from kernel.
- */
- if (errno == EEXIST) {
- EAL_LOG(DEBUG,
- "Memory segment is already mapped, skipping");
- } else {
- EAL_LOG(ERR,
- "Cannot set up DMA remapping, error "
- "%i (%s)", errno, strerror(errno));
- return -1;
- }
- }
- } else {
- memset(&dma_unmap, 0, sizeof(dma_unmap));
- dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
- dma_unmap.size = len;
- dma_unmap.iova = iova;
-
- ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
- &dma_unmap);
- if (ret) {
- EAL_LOG(ERR, "Cannot clear DMA remapping, error "
- "%i (%s)", errno, strerror(errno));
- return -1;
- } else if (dma_unmap.size != len) {
- EAL_LOG(ERR, "Unexpected size %"PRIu64
- " of DMA remapping cleared instead of %"PRIu64,
- (uint64_t)dma_unmap.size, len);
- rte_errno = EIO;
- return -1;
- }
- }
-
- return 0;
-}
-
-static int
-vfio_type1_dma_map(int vfio_container_fd)
-{
- return rte_memseg_walk(type1_map, &vfio_container_fd);
-}
-
-/* Track the size of the statically allocated DMA window for SPAPR */
-uint64_t spapr_dma_win_len;
-uint64_t spapr_dma_win_page_sz;
-
-static int
-vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
- uint64_t len, int do_map)
-{
- struct vfio_iommu_spapr_register_memory reg = {
- .argsz = sizeof(reg),
- .vaddr = (uintptr_t) vaddr,
- .size = len,
- .flags = 0
- };
- int ret;
-
- if (do_map != 0) {
- struct vfio_iommu_type1_dma_map dma_map;
-
- if (iova + len > spapr_dma_win_len) {
- EAL_LOG(ERR, "DMA map attempt outside DMA window");
- return -1;
- }
-
- ret = ioctl(vfio_container_fd,
- VFIO_IOMMU_SPAPR_REGISTER_MEMORY, ®);
- if (ret) {
- EAL_LOG(ERR,
- "Cannot register vaddr for IOMMU, error "
- "%i (%s)", errno, strerror(errno));
- return -1;
- }
-
- memset(&dma_map, 0, sizeof(dma_map));
- dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
- dma_map.vaddr = vaddr;
- dma_map.size = len;
- dma_map.iova = iova;
- dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
- VFIO_DMA_MAP_FLAG_WRITE;
-
- ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
- if (ret) {
- EAL_LOG(ERR, "Cannot map vaddr for IOMMU, error "
- "%i (%s)", errno, strerror(errno));
- return -1;
- }
-
- } else {
- struct vfio_iommu_type1_dma_map dma_unmap;
-
- memset(&dma_unmap, 0, sizeof(dma_unmap));
- dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
- dma_unmap.size = len;
- dma_unmap.iova = iova;
-
- ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
- &dma_unmap);
- if (ret) {
- EAL_LOG(ERR, "Cannot unmap vaddr for IOMMU, error "
- "%i (%s)", errno, strerror(errno));
- return -1;
- }
-
- ret = ioctl(vfio_container_fd,
- VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, ®);
- if (ret) {
- EAL_LOG(ERR,
- "Cannot unregister vaddr for IOMMU, error "
- "%i (%s)", errno, strerror(errno));
- return -1;
- }
- }
-
- return ret;
-}
-
-static int
-vfio_spapr_map_walk(const struct rte_memseg_list *msl,
- const struct rte_memseg *ms, void *arg)
-{
- int *vfio_container_fd = arg;
-
- /* skip external memory that isn't a heap */
- if (msl->external && !msl->heap)
- return 0;
-
- /* skip any segments with invalid IOVA addresses */
- if (ms->iova == RTE_BAD_IOVA)
- return 0;
-
- return vfio_spapr_dma_do_map(*vfio_container_fd,
- ms->addr_64, ms->iova, ms->len, 1);
-}
-
-struct spapr_size_walk_param {
- uint64_t max_va;
- uint64_t page_sz;
- bool is_user_managed;
-};
-
-/*
- * In order to set the DMA window size required for the SPAPR IOMMU
- * we need to walk the existing virtual memory allocations as well as
- * find the hugepage size used.
- */
-static int
-vfio_spapr_size_walk(const struct rte_memseg_list *msl, void *arg)
-{
- struct spapr_size_walk_param *param = arg;
- uint64_t max = (uint64_t) msl->base_va + (uint64_t) msl->len;
-
- if (msl->external && !msl->heap) {
- /* ignore user managed external memory */
- param->is_user_managed = true;
- return 0;
- }
-
- if (max > param->max_va) {
- param->page_sz = msl->page_sz;
- param->max_va = max;
- }
-
- return 0;
-}
-
-/*
- * Find the highest memory address used in physical or virtual address
- * space and use that as the top of the DMA window.
- */
-static int
-find_highest_mem_addr(struct spapr_size_walk_param *param)
-{
- /* find the maximum IOVA address for setting the DMA window size */
- if (rte_eal_iova_mode() == RTE_IOVA_PA) {
- static const char proc_iomem[] = "/proc/iomem";
- static const char str_sysram[] = "System RAM";
- uint64_t start, end, max = 0;
- char *line = NULL;
- char *dash, *space;
- size_t line_len;
-
- /*
- * Example "System RAM" in /proc/iomem:
- * 00000000-1fffffffff : System RAM
- * 200000000000-201fffffffff : System RAM
- */
- FILE *fd = fopen(proc_iomem, "r");
- if (fd == NULL) {
- EAL_LOG(ERR, "Cannot open %s", proc_iomem);
- return -1;
- }
- /* Scan /proc/iomem for the highest PA in the system */
- while (getline(&line, &line_len, fd) != -1) {
- if (strstr(line, str_sysram) == NULL)
- continue;
-
- space = strstr(line, " ");
- dash = strstr(line, "-");
-
- /* Validate the format of the memory string */
- if (space == NULL || dash == NULL || space < dash) {
- EAL_LOG(ERR, "Can't parse line \"%s\" in file %s",
- line, proc_iomem);
- continue;
- }
-
- start = strtoull(line, NULL, 16);
- end = strtoull(dash + 1, NULL, 16);
- EAL_LOG(DEBUG, "Found system RAM from 0x%" PRIx64
- " to 0x%" PRIx64, start, end);
- if (end > max)
- max = end;
- }
- free(line);
- fclose(fd);
-
- if (max == 0) {
- EAL_LOG(ERR, "Failed to find valid \"System RAM\" "
- "entry in file %s", proc_iomem);
- return -1;
- }
-
- spapr_dma_win_len = rte_align64pow2(max + 1);
- return 0;
- } else if (rte_eal_iova_mode() == RTE_IOVA_VA) {
- EAL_LOG(DEBUG, "Highest VA address in memseg list is 0x%"
- PRIx64, param->max_va);
- spapr_dma_win_len = rte_align64pow2(param->max_va);
- return 0;
- }
-
- spapr_dma_win_len = 0;
- EAL_LOG(ERR, "Unsupported IOVA mode");
- return -1;
-}
-
-
-/*
- * The SPAPRv2 IOMMU supports 2 DMA windows with starting
- * address at 0 or 1<<59. By default, a DMA window is set
- * at address 0, 2GB long, with a 4KB page. For DPDK we
- * must remove the default window and setup a new DMA window
- * based on the hugepage size and memory requirements of
- * the application before we can map memory for DMA.
- */
-static int
-spapr_dma_win_size(void)
-{
- struct spapr_size_walk_param param;
-
- /* only create DMA window once */
- if (spapr_dma_win_len > 0)
- return 0;
-
- /* walk the memseg list to find the page size/max VA address */
- memset(¶m, 0, sizeof(param));
- if (rte_memseg_list_walk(vfio_spapr_size_walk, ¶m) < 0) {
- EAL_LOG(ERR, "Failed to walk memseg list for DMA window size");
+ if (sysfs_base == NULL || dev_addr == NULL || iommu_group_num == NULL) {
+ rte_errno = EINVAL;
return -1;
}
- /* we can't be sure if DMA window covers external memory */
- if (param.is_user_managed)
- EAL_LOG(WARNING, "Detected user managed external memory which may not be managed by the IOMMU");
-
- /* check physical/virtual memory size */
- if (find_highest_mem_addr(¶m) < 0)
+ if (vfio_cfg.mode == RTE_VFIO_MODE_NONE) {
+ EAL_LOG(ERR, "VFIO support not initialized");
+ rte_errno = ENXIO;
return -1;
- EAL_LOG(DEBUG, "Setting DMA window size to 0x%" PRIx64,
- spapr_dma_win_len);
- spapr_dma_win_page_sz = param.page_sz;
- rte_mem_set_dma_mask(rte_ctz64(spapr_dma_win_len));
- return 0;
-}
-
-static int
-vfio_spapr_create_dma_window(int vfio_container_fd)
-{
- struct vfio_iommu_spapr_tce_create create = {
- .argsz = sizeof(create), };
- struct vfio_iommu_spapr_tce_remove remove = {
- .argsz = sizeof(remove), };
- struct vfio_iommu_spapr_tce_info info = {
- .argsz = sizeof(info), };
- int ret;
-
- ret = spapr_dma_win_size();
- if (ret < 0)
- return ret;
-
- ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
- if (ret) {
- EAL_LOG(ERR, "Cannot get IOMMU info, error %i (%s)",
- errno, strerror(errno));
- return -1;
- }
-
- /*
- * sPAPR v1/v2 IOMMU always has a default 1G DMA window set. The window
- * can't be changed for v1 but it can be changed for v2. Since DPDK only
- * supports v2, remove the default DMA window so it can be resized.
- */
- remove.start_addr = info.dma32_window_start;
- ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
- if (ret)
- return -1;
-
- /* create a new DMA window (start address is not selectable) */
- create.window_size = spapr_dma_win_len;
- create.page_shift = rte_ctz64(spapr_dma_win_page_sz);
- create.levels = 1;
- ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
-#ifdef VFIO_IOMMU_SPAPR_INFO_DDW
- /*
- * The vfio_iommu_spapr_tce_info structure was modified in
- * Linux kernel 4.2.0 to add support for the
- * vfio_iommu_spapr_tce_ddw_info structure needed to try
- * multiple table levels. Skip the attempt if running with
- * an older kernel.
- */
- if (ret) {
- /* if at first we don't succeed, try more levels */
- uint32_t levels;
-
- for (levels = create.levels + 1;
- ret && levels <= info.ddw.levels; levels++) {
- create.levels = levels;
- ret = ioctl(vfio_container_fd,
- VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
- }
}
-#endif /* VFIO_IOMMU_SPAPR_INFO_DDW */
- if (ret) {
- EAL_LOG(ERR, "Cannot create new DMA window, error "
- "%i (%s)", errno, strerror(errno));
- EAL_LOG(ERR,
- "Consider using a larger hugepage size if supported by the system");
+ if (vfio_cfg.mode != RTE_VFIO_MODE_GROUP && vfio_cfg.mode != RTE_VFIO_MODE_NOIOMMU) {
+ EAL_LOG(ERR, "VFIO not initialized in group mode");
+ rte_errno = ENOTSUP;
return -1;
}
-
- /* verify the start address */
- if (create.start_addr != 0) {
- EAL_LOG(ERR, "Received unsupported start address 0x%"
- PRIx64, (uint64_t)create.start_addr);
+ ret = vfio_group_get_num(sysfs_base, dev_addr, iommu_group_num);
+ if (ret < 0) {
+ rte_errno = EINVAL;
return -1;
- }
- return ret;
-}
-
-static int
-vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr,
- uint64_t iova, uint64_t len, int do_map)
-{
- int ret = 0;
-
- if (do_map) {
- if (vfio_spapr_dma_do_map(vfio_container_fd,
- vaddr, iova, len, 1)) {
- EAL_LOG(ERR, "Failed to map DMA");
- ret = -1;
- }
- } else {
- if (vfio_spapr_dma_do_map(vfio_container_fd,
- vaddr, iova, len, 0)) {
- EAL_LOG(ERR, "Failed to unmap DMA");
- ret = -1;
- }
- }
-
- return ret;
-}
-
-static int
-vfio_spapr_dma_map(int vfio_container_fd)
-{
- if (vfio_spapr_create_dma_window(vfio_container_fd) < 0) {
- EAL_LOG(ERR, "Could not create new DMA window!");
+ } else if (ret == 0) {
+ rte_errno = ENODEV;
return -1;
}
-
- /* map all existing DPDK segments for DMA */
- if (rte_memseg_walk(vfio_spapr_map_walk, &vfio_container_fd) < 0)
- return -1;
-
- return 0;
-}
-
-static int
-vfio_noiommu_dma_map(int __rte_unused vfio_container_fd)
-{
- /* No-IOMMU mode does not need DMA mapping */
- return 0;
-}
-
-static int
-vfio_noiommu_dma_mem_map(int __rte_unused vfio_container_fd,
- uint64_t __rte_unused vaddr,
- uint64_t __rte_unused iova, uint64_t __rte_unused len,
- int __rte_unused do_map)
-{
- /* No-IOMMU mode does not need DMA mapping */
return 0;
}
static int
-vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
+vfio_dma_mem_map(struct container *cfg, uint64_t vaddr, uint64_t iova,
uint64_t len, int do_map)
{
- const struct vfio_iommu_type *t = vfio_cfg->vfio_iommu_type;
+ const struct vfio_iommu_ops *t = vfio_cfg.ops;
if (!t) {
EAL_LOG(ERR, "VFIO support not initialized");
- rte_errno = ENODEV;
return -1;
}
@@ -1852,16 +1126,14 @@ vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
EAL_LOG(ERR,
"VFIO custom DMA region mapping not supported by IOMMU %s",
t->name);
- rte_errno = ENOTSUP;
return -1;
}
- return t->dma_user_map_func(vfio_cfg->vfio_container_fd, vaddr, iova,
- len, do_map);
+ return t->dma_user_map_func(cfg, vaddr, iova, len, do_map);
}
static int
-container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
+container_dma_map(struct container *cfg, uint64_t vaddr, uint64_t iova,
uint64_t len)
{
struct user_mem_map *new_map;
@@ -1869,16 +1141,15 @@ container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
bool has_partial_unmap;
int ret = 0;
- user_mem_maps = &vfio_cfg->mem_maps;
+ user_mem_maps = &cfg->mem_maps;
rte_spinlock_recursive_lock(&user_mem_maps->lock);
if (user_mem_maps->n_maps == RTE_DIM(user_mem_maps->maps)) {
EAL_LOG(ERR, "No more space for user mem maps");
- rte_errno = ENOMEM;
ret = -1;
goto out;
}
/* map the entry */
- if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 1)) {
+ if (vfio_dma_mem_map(cfg, vaddr, iova, len, 1)) {
/* technically, this will fail if there are currently no devices
* plugged in, even if a device were added later, this mapping
* might have succeeded. however, since we cannot verify if this
@@ -1891,7 +1162,7 @@ container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
goto out;
}
/* do we have partial unmap support? */
- has_partial_unmap = vfio_cfg->vfio_iommu_type->partial_unmap;
+ has_partial_unmap = vfio_cfg.ops->partial_unmap;
/* create new user mem map entry */
new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
@@ -1908,17 +1179,17 @@ container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
}
static int
-container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
+container_dma_unmap(struct container *cfg, uint64_t vaddr, uint64_t iova,
uint64_t len)
{
- struct user_mem_map orig_maps[RTE_DIM(vfio_cfg->mem_maps.maps)];
+ struct user_mem_map orig_maps[RTE_DIM(cfg->mem_maps.maps)];
struct user_mem_map new_maps[2]; /* can be at most 2 */
struct user_mem_maps *user_mem_maps;
int n_orig, n_new, ret = 0;
bool has_partial_unmap;
unsigned int newlen;
- user_mem_maps = &vfio_cfg->mem_maps;
+ user_mem_maps = &cfg->mem_maps;
rte_spinlock_recursive_lock(&user_mem_maps->lock);
/*
@@ -1944,13 +1215,12 @@ container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
/* did we find anything? */
if (n_orig < 0) {
EAL_LOG(ERR, "Couldn't find previously mapped region");
- rte_errno = EINVAL;
ret = -1;
goto out;
}
/* do we have partial unmap capability? */
- has_partial_unmap = vfio_cfg->vfio_iommu_type->partial_unmap;
+ has_partial_unmap = vfio_cfg.ops->partial_unmap;
/*
* if we don't support partial unmap, we must check if start and end of
@@ -1966,7 +1236,6 @@ container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
if (!start_aligned || !end_aligned) {
EAL_LOG(DEBUG, "DMA partial unmap unsupported");
- rte_errno = ENOTSUP;
ret = -1;
goto out;
}
@@ -1984,28 +1253,20 @@ container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
newlen = (user_mem_maps->n_maps - n_orig) + n_new;
if (newlen >= RTE_DIM(user_mem_maps->maps)) {
EAL_LOG(ERR, "Not enough space to store partial mapping");
- rte_errno = ENOMEM;
ret = -1;
goto out;
}
/* unmap the entry */
- if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 0)) {
+ if (vfio_dma_mem_map(cfg, vaddr, iova, len, 0)) {
/* there may not be any devices plugged in, so unmapping will
- * fail with ENODEV/ENOTSUP rte_errno values, but that doesn't
- * stop us from removing the mapping, as the assumption is we
- * won't be needing this memory any more and thus will want to
- * prevent it from being remapped again on hotplug. so, only
- * fail if we indeed failed to unmap (e.g. if the mapping was
- * within our mapped range but had invalid alignment).
+ * fail, but that doesn't stop us from removing the mapping,
+ * as the assumption is we won't be needing this memory any
+ * more and thus will want to prevent it from being remapped
+ * again on hotplug. Ignore the error and proceed with
+ * removing the mapping from our records.
*/
- if (rte_errno != ENODEV && rte_errno != ENOTSUP) {
- EAL_LOG(ERR, "Couldn't unmap region for DMA");
- ret = -1;
- goto out;
- } else {
- EAL_LOG(DEBUG, "DMA unmapping failed, but removing mappings anyway");
- }
+ EAL_LOG(DEBUG, "DMA unmapping failed, but removing mappings anyway");
}
/* we have unmapped the region, so now update the maps */
@@ -2021,212 +1282,178 @@ RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_noiommu_is_enabled)
int
rte_vfio_noiommu_is_enabled(void)
{
- int fd;
- ssize_t cnt;
- char c;
-
- fd = open(RTE_VFIO_NOIOMMU_MODE, O_RDONLY);
- if (fd < 0) {
- if (errno != ENOENT) {
- EAL_LOG(ERR, "Cannot open VFIO noiommu file "
- "%i (%s)", errno, strerror(errno));
- return -1;
- }
- /*
- * else the file does not exists
- * i.e. noiommu is not enabled
- */
- return 0;
- }
-
- cnt = read(fd, &c, 1);
- close(fd);
- if (cnt != 1) {
- EAL_LOG(ERR, "Unable to read from VFIO noiommu file "
- "%i (%s)", errno, strerror(errno));
- return -1;
- }
-
- return c == 'Y';
+ return vfio_cfg.mode == RTE_VFIO_MODE_NOIOMMU;
}
RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_create)
int
rte_vfio_container_create(void)
{
- unsigned int i;
+ struct container *cfg;
+ int container_fd;
- /* Find an empty slot to store new vfio config */
- for (i = 1; i < RTE_DIM(vfio_cfgs); i++) {
- if (vfio_cfgs[i].vfio_container_fd == -1)
- break;
- }
-
- if (i == RTE_DIM(vfio_cfgs)) {
- EAL_LOG(ERR, "Exceed max VFIO container limit");
+ if (vfio_cfg.mode == RTE_VFIO_MODE_NONE) {
+ EAL_LOG(ERR, "VFIO not initialized");
+ rte_errno = ENXIO;
return -1;
}
-
- /* Create a new container fd */
- vfio_cfgs[i].vfio_container_fd = vfio_open_container_fd(false);
- if (vfio_cfgs[i].vfio_container_fd < 0) {
- EAL_LOG(NOTICE, "Fail to create a new VFIO container");
+ cfg = vfio_container_create();
+ if (cfg == NULL) {
+ EAL_LOG(ERR, "Reached VFIO container limit");
+ rte_errno = ENOSPC;
return -1;
}
- return vfio_cfgs[i].vfio_container_fd;
+ switch (vfio_cfg.mode) {
+ case RTE_VFIO_MODE_GROUP:
+ case RTE_VFIO_MODE_NOIOMMU:
+ {
+ container_fd = vfio_group_open_container_fd();
+ if (container_fd < 0) {
+ EAL_LOG(ERR, "Fail to create a new VFIO container");
+ rte_errno = EIO;
+ goto err;
+ }
+ cfg->container_fd = container_fd;
+ break;
+ }
+ default:
+ EAL_LOG(NOTICE, "Unsupported VFIO mode");
+ rte_errno = ENOTSUP;
+ goto err;
+ }
+ return container_fd;
+err:
+ vfio_container_erase(cfg);
+ return -1;
}
RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_destroy)
int
rte_vfio_container_destroy(int container_fd)
{
- struct vfio_config *vfio_cfg;
- unsigned int i;
+ struct container *cfg;
+ struct vfio_device *dev;
- vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
- if (vfio_cfg == NULL) {
- EAL_LOG(ERR, "Invalid VFIO container fd");
+ if (vfio_cfg.mode == RTE_VFIO_MODE_NONE) {
+ EAL_LOG(ERR, "VFIO not initialized");
+ rte_errno = ENXIO;
return -1;
}
- for (i = 0; i < RTE_DIM(vfio_cfg->vfio_groups); i++)
- if (vfio_cfg->vfio_groups[i].group_num != -1)
- vfio_container_group_unbind(container_fd,
- vfio_cfg->vfio_groups[i].group_num);
-
- close(container_fd);
- vfio_cfg->vfio_container_fd = -1;
- vfio_cfg->vfio_active_groups = 0;
- vfio_cfg->vfio_iommu_type = NULL;
-
- return 0;
-}
-
-RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_assign_device)
-int
-rte_vfio_container_assign_device(int vfio_container_fd, const char *sysfs_base,
- const char *dev_addr)
-{
- int iommu_group_num;
- int ret;
-
- ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num);
- if (ret < 0) {
- EAL_LOG(ERR, "Cannot get IOMMU group number for device %s",
- dev_addr);
- return -1;
- } else if (ret == 0) {
- EAL_LOG(ERR,
- "Device %s is not assigned to any IOMMU group",
- dev_addr);
- return -1;
- }
-
- ret = vfio_container_group_bind(vfio_container_fd,
- iommu_group_num);
- if (ret < 0) {
- EAL_LOG(ERR,
- "Cannot bind IOMMU group %d for device %s",
- iommu_group_num, dev_addr);
- return -1;
- }
-
- return 0;
-}
-
-static int
-vfio_container_group_bind(int container_fd, int iommu_group_num)
-{
- struct vfio_config *vfio_cfg;
-
- vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
- if (vfio_cfg == NULL) {
- EAL_LOG(ERR, "Invalid VFIO container fd");
+ cfg = vfio_container_get_by_fd(container_fd);
+ if (cfg == NULL) {
+ EAL_LOG(ERR, "VFIO container fd not managed by VFIO");
+ rte_errno = ENODEV;
return -1;
}
-
- return vfio_get_group_fd(vfio_cfg, iommu_group_num);
-}
-
-static int
-vfio_container_group_unbind(int container_fd, int iommu_group_num)
-{
- struct vfio_group *cur_grp = NULL;
- struct vfio_config *vfio_cfg;
- unsigned int i;
-
- vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
- if (vfio_cfg == NULL) {
- EAL_LOG(ERR, "Invalid VFIO container fd");
+ /* forbid destroying default container */
+ if (vfio_container_is_default(cfg)) {
+ EAL_LOG(ERR, "Cannot destroy default VFIO container");
+ rte_errno = EINVAL;
return -1;
}
- for (i = 0; i < RTE_DIM(vfio_cfg->vfio_groups); i++) {
- if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num) {
- cur_grp = &vfio_cfg->vfio_groups[i];
- break;
+ switch (vfio_cfg.mode) {
+ case RTE_VFIO_MODE_GROUP:
+ case RTE_VFIO_MODE_NOIOMMU:
+ /* erase all devices */
+ DEVICE_FOREACH_ACTIVE(cfg, dev) {
+ EAL_LOG(DEBUG, "Device in IOMMU group %d still open, closing", dev->group);
+ /*
+ * technically we could've done back-reference lookup and closed our groups
+ * following a device close, but since we're closing and erasing all groups
+ * anyway, we can afford to not bother.
+ */
+ vfio_device_erase(cfg, dev);
}
- }
- /* This should not happen */
- if (cur_grp == NULL) {
- EAL_LOG(ERR, "Specified VFIO group number not found");
+ /* erase all groups */
+ struct vfio_group *grp;
+ GROUP_FOREACH_ACTIVE(cfg, grp) {
+ EAL_LOG(DEBUG, "IOMMU group %d still open, closing", grp->group_num);
+ vfio_group_erase(cfg, grp);
+ }
+ break;
+ default:
+ EAL_LOG(ERR, "Unsupported VFIO mode");
+ rte_errno = ENOTSUP;
return -1;
}
- if (cur_grp->fd >= 0 && close(cur_grp->fd) < 0) {
- EAL_LOG(ERR,
- "Error when closing vfio_group_fd for iommu_group_num "
- "%d", iommu_group_num);
- return -1;
- }
- cur_grp->group_num = -1;
- cur_grp->fd = -1;
- cur_grp->devices = 0;
- vfio_cfg->vfio_active_groups--;
+ /* erase entire config */
+ vfio_container_erase(cfg);
return 0;
}
RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_dma_map)
int
-rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, uint64_t iova,
- uint64_t len)
+rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, uint64_t iova, uint64_t len)
{
- struct vfio_config *vfio_cfg;
+ struct container *cfg;
if (len == 0) {
rte_errno = EINVAL;
return -1;
}
- vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
- if (vfio_cfg == NULL) {
+ if (vfio_cfg.mode == RTE_VFIO_MODE_NONE) {
+ EAL_LOG(ERR, "VFIO support not initialized");
+ rte_errno = ENXIO;
+ return -1;
+ }
+
+ cfg = vfio_container_get_by_fd(container_fd);
+ if (cfg == NULL) {
EAL_LOG(ERR, "Invalid VFIO container fd");
+ rte_errno = EINVAL;
return -1;
}
- return container_dma_map(vfio_cfg, vaddr, iova, len);
+ if (container_dma_map(cfg, vaddr, iova, len) < 0) {
+ rte_errno = EIO;
+ return -1;
+ }
+
+ return 0;
}
RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_dma_unmap)
int
-rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr, uint64_t iova,
- uint64_t len)
+rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr, uint64_t iova, uint64_t len)
{
- struct vfio_config *vfio_cfg;
+ struct container *cfg;
if (len == 0) {
rte_errno = EINVAL;
return -1;
}
- vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
- if (vfio_cfg == NULL) {
+ if (vfio_cfg.mode == RTE_VFIO_MODE_NONE) {
+ EAL_LOG(ERR, "VFIO support not initialized");
+ rte_errno = ENXIO;
+ return -1;
+ }
+
+ cfg = vfio_container_get_by_fd(container_fd);
+ if (cfg == NULL) {
EAL_LOG(ERR, "Invalid VFIO container fd");
+ rte_errno = EINVAL;
return -1;
}
- return container_dma_unmap(vfio_cfg, vaddr, iova, len);
+ if (container_dma_unmap(cfg, vaddr, iova, len) < 0) {
+ rte_errno = EIO;
+ return -1;
+ }
+
+ return 0;
+}
+
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_get_mode)
+enum rte_vfio_mode
+rte_vfio_get_mode(void)
+{
+ return vfio_cfg.mode;
}
diff --git a/lib/eal/linux/eal_vfio.h b/lib/eal/linux/eal_vfio.h
index 30389fb274..68d3a3ec6e 100644
--- a/lib/eal/linux/eal_vfio.h
+++ b/lib/eal/linux/eal_vfio.h
@@ -6,60 +6,161 @@
#define EAL_VFIO_H_
#include <rte_common.h>
+#include <rte_spinlock.h>
#include <stdint.h>
+#include <rte_vfio.h>
+
+/* hot plug/unplug of VFIO groups may cause all DMA maps to be dropped. we can
+ * recreate the mappings for DPDK segments, but we cannot do so for memory that
+ * was registered by the user themselves, so we need to store the user mappings
+ * somewhere, to recreate them later.
+ */
+#define EAL_VFIO_MAX_USER_MEM_MAPS 256
+
+/* user memory map entry */
+struct user_mem_map {
+ uint64_t addr; /**< start VA */
+ uint64_t iova; /**< start IOVA */
+ uint64_t len; /**< total length of the mapping */
+ uint64_t chunk; /**< this mapping can be split in chunks of this size */
+};
+
+/* user memory maps container (common for all API modes) */
+struct user_mem_maps {
+ rte_spinlock_recursive_t lock;
+ int n_maps;
+ struct user_mem_map maps[EAL_VFIO_MAX_USER_MEM_MAPS];
+};
+
/*
* we don't need to store device fd's anywhere since they can be obtained from
* the group fd via an ioctl() call.
*/
struct vfio_group {
+ bool active;
int group_num;
int fd;
- int devices;
+ int n_devices;
+};
+
+/* device tracking (common for group and cdev modes) */
+struct vfio_device {
+ bool active;
+ int group; /**< back-reference to group list (group mode) */
+ int fd;
+};
+
+/* group mode specific configuration */
+struct vfio_group_config {
+ bool dma_setup_done;
+ bool iommu_type_set;
+ bool mem_event_clb_set;
+ size_t n_groups;
+ struct vfio_group groups[RTE_MAX_VFIO_GROUPS];
+};
+
+/* per-container configuration */
+struct container {
+ bool active;
+ int container_fd;
+ struct user_mem_maps mem_maps;
+ struct vfio_group_config group_cfg;
+ int n_devices;
+ struct vfio_device devices[RTE_MAX_VFIO_DEVICES];
};
/* DMA mapping function prototype.
- * Takes VFIO container fd as a parameter.
+ * Takes VFIO container config as a parameter.
* Returns 0 on success, -1 on error.
*/
-typedef int (*vfio_dma_func_t)(int);
+typedef int (*dma_func_t)(struct container *cfg);
/* Custom memory region DMA mapping function prototype.
- * Takes VFIO container fd, virtual address, physical address, length and
+ * Takes VFIO container config, virtual address, physical address, length and
* operation type (0 to unmap 1 for map) as a parameters.
* Returns 0 on success, -1 on error.
*/
-typedef int (*vfio_dma_user_func_t)(int fd, uint64_t vaddr, uint64_t iova,
- uint64_t len, int do_map);
+typedef int (*dma_user_func_t)(struct container *cfg, uint64_t vaddr,
+ uint64_t iova, uint64_t len, int do_map);
-struct vfio_iommu_type {
+/* mode-independent ops */
+struct vfio_iommu_ops {
int type_id;
const char *name;
bool partial_unmap;
- vfio_dma_user_func_t dma_user_map_func;
- vfio_dma_func_t dma_map_func;
+ dma_user_func_t dma_user_map_func;
+ dma_func_t dma_map_func;
};
-/* get the vfio container that devices are bound to by default */
-int vfio_open_container_fd(bool mp_request);
+/* global configuration */
+struct vfio_config {
+ struct container *default_cfg;
+ enum rte_vfio_mode mode;
+ const struct vfio_iommu_ops *ops;
+};
+
+/* per-process, per-container data */
+extern struct container containers[RTE_MAX_VFIO_CONTAINERS];
+
+/* current configuration */
+extern struct vfio_config vfio_cfg;
+
+#define CONTAINER_FOREACH(cfg) \
+ for ((cfg) = &containers[0]; \
+ (cfg) < &containers[RTE_DIM(containers)]; \
+ (cfg)++)
+
+#define CONTAINER_FOREACH_ACTIVE(cfg) \
+ CONTAINER_FOREACH((cfg)) \
+ if (((cfg)->active))
+
+#define GROUP_FOREACH(cfg, grp) \
+ for ((grp) = &((cfg)->group_cfg.groups[0]); \
+ (grp) < &((cfg)->group_cfg.groups[RTE_DIM((cfg)->group_cfg.groups)]); \
+ (grp)++)
-/* pick IOMMU type. returns a pointer to vfio_iommu_type or NULL for error */
-const struct vfio_iommu_type *
-vfio_set_iommu_type(int vfio_container_fd);
+#define GROUP_FOREACH_ACTIVE(cfg, grp) \
+ GROUP_FOREACH((cfg), (grp)) \
+ if ((grp)->active)
-int
-vfio_get_iommu_type(void);
+#define DEVICE_FOREACH(cfg, dev) \
+ for ((dev) = &((cfg)->devices[0]); \
+ (dev) < &((cfg)->devices[RTE_DIM((cfg)->devices)]); \
+ (dev)++)
-int vfio_get_group_fd_by_num(int iommu_group_num);
+#define DEVICE_FOREACH_ACTIVE(cfg, dev) \
+ DEVICE_FOREACH((cfg), (dev)) \
+ if ((dev)->active)
-/* check if we have any supported extensions */
-int
-vfio_has_supported_extensions(int vfio_container_fd);
+/* for containers, we only need to initialize the lock in mem maps */
+#define CONTAINER_INITIALIZER \
+ ((struct container){ \
+ .mem_maps = {.lock = RTE_SPINLOCK_RECURSIVE_INITIALIZER,}, \
+ })
+int vfio_get_iommu_type(void);
int vfio_mp_sync_setup(void);
void vfio_mp_sync_cleanup(void);
+bool vfio_container_is_default(struct container *cfg);
+/* group mode functions */
+int vfio_group_enable(struct container *cfg);
+int vfio_group_open_container_fd(void);
+int vfio_group_noiommu_is_enabled(void);
+int vfio_group_get_num(const char *sysfs_base, const char *dev_addr,
+ int *iommu_group_num);
+struct vfio_group *vfio_group_get_by_num(struct container *cfg, int iommu_group);
+struct vfio_group *vfio_group_create(struct container *cfg, int iommu_group);
+void vfio_group_erase(struct container *cfg, struct vfio_group *grp);
+int vfio_group_open_fd(struct container *cfg, struct vfio_group *grp);
+int vfio_group_prepare(struct container *cfg, struct vfio_group *grp);
+int vfio_group_setup_iommu(struct container *cfg);
+int vfio_group_setup_device_fd(const char *dev_addr,
+ struct vfio_group *grp, struct vfio_device *dev);
+
+#define VFIO_MEM_EVENT_CLB_NAME "vfio_mem_event_clb"
#define EAL_VFIO_MP "eal_vfio_mp_sync"
#define SOCKET_REQ_CONTAINER 0x100
@@ -75,6 +176,7 @@ struct vfio_mp_param {
union {
int group_num;
int iommu_type_id;
+ enum rte_vfio_mode mode;
};
};
diff --git a/lib/eal/linux/eal_vfio_group.c b/lib/eal/linux/eal_vfio_group.c
new file mode 100644
index 0000000000..520e61610c
--- /dev/null
+++ b/lib/eal/linux/eal_vfio_group.c
@@ -0,0 +1,984 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2025 Intel Corporation
+ */
+
+#include <dirent.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+
+#include <uapi/linux/vfio.h>
+
+#include <rte_log.h>
+#include <rte_errno.h>
+#include <rte_eal_memconfig.h>
+#include <rte_memory.h>
+#include <rte_string_fns.h>
+#include <rte_vfio.h>
+
+#include "eal_vfio.h"
+#include "eal_private.h"
+#include "eal_internal_cfg.h"
+
+static int vfio_type1_dma_map(struct container *);
+static int vfio_type1_dma_mem_map(struct container *, uint64_t, uint64_t, uint64_t, int);
+static int vfio_spapr_dma_map(struct container *);
+static int vfio_spapr_dma_mem_map(struct container *, uint64_t, uint64_t, uint64_t, int);
+static int vfio_noiommu_dma_map(struct container *);
+static int vfio_noiommu_dma_mem_map(struct container *, uint64_t, uint64_t, uint64_t, int);
+
+/* IOMMU types we support */
+static const struct vfio_iommu_ops iommu_types[] = {
+ /* x86 IOMMU, otherwise known as type 1 */
+ {
+ .type_id = VFIO_TYPE1_IOMMU,
+ .name = "Type 1",
+ .partial_unmap = false,
+ .dma_map_func = &vfio_type1_dma_map,
+ .dma_user_map_func = &vfio_type1_dma_mem_map
+ },
+ /* ppc64 IOMMU, otherwise known as spapr */
+ {
+ .type_id = VFIO_SPAPR_TCE_v2_IOMMU,
+ .name = "sPAPR",
+ .partial_unmap = true,
+ .dma_map_func = &vfio_spapr_dma_map,
+ .dma_user_map_func = &vfio_spapr_dma_mem_map
+ },
+ /* IOMMU-less mode */
+ {
+ .type_id = VFIO_NOIOMMU_IOMMU,
+ .name = "No-IOMMU",
+ .partial_unmap = true,
+ .dma_map_func = &vfio_noiommu_dma_map,
+ .dma_user_map_func = &vfio_noiommu_dma_mem_map
+ },
+};
+
+static const struct vfio_iommu_ops *
+vfio_group_set_iommu_type(int vfio_container_fd)
+{
+ unsigned int idx;
+ for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
+ const struct vfio_iommu_ops *t = &iommu_types[idx];
+
+ int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU, t->type_id);
+ if (ret == 0)
+ return t;
+ /* not an error, there may be more supported IOMMU types */
+ EAL_LOG(DEBUG, "Set IOMMU type %d (%s) failed, error "
+ "%i (%s)", t->type_id, t->name, errno,
+ strerror(errno));
+ }
+ /* if we didn't find a suitable IOMMU type, fail */
+ return NULL;
+}
+
+static int
+type1_map(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
+ void *arg)
+{
+ struct container *cfg = arg;
+
+ /* skip external memory that isn't a heap */
+ if (msl->external && !msl->heap)
+ return 0;
+
+ /* skip any segments with invalid IOVA addresses */
+ if (ms->iova == RTE_BAD_IOVA)
+ return 0;
+
+ return vfio_type1_dma_mem_map(cfg, ms->addr_64, ms->iova, ms->len, 1);
+}
+
+static int
+vfio_type1_dma_mem_map(struct container *cfg, uint64_t vaddr, uint64_t iova,
+ uint64_t len, int do_map)
+{
+ struct vfio_iommu_type1_dma_map dma_map;
+ struct vfio_iommu_type1_dma_unmap dma_unmap;
+ int ret;
+
+ if (do_map != 0) {
+ memset(&dma_map, 0, sizeof(dma_map));
+ dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+ dma_map.vaddr = vaddr;
+ dma_map.size = len;
+ dma_map.iova = iova;
+ dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
+ VFIO_DMA_MAP_FLAG_WRITE;
+
+ ret = ioctl(cfg->container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+ if (ret) {
+ /**
+ * In case the mapping was already done EEXIST will be
+ * returned from kernel.
+ */
+ if (errno == EEXIST) {
+ EAL_LOG(DEBUG,
+ "Memory segment is already mapped, skipping");
+ } else {
+ EAL_LOG(ERR,
+ "Cannot set up DMA remapping, error "
+ "%i (%s)", errno, strerror(errno));
+ return -1;
+ }
+ }
+ } else {
+ memset(&dma_unmap, 0, sizeof(dma_unmap));
+ dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
+ dma_unmap.size = len;
+ dma_unmap.iova = iova;
+
+ ret = ioctl(cfg->container_fd, VFIO_IOMMU_UNMAP_DMA,
+ &dma_unmap);
+ if (ret) {
+ EAL_LOG(ERR, "Cannot clear DMA remapping, error "
+ "%i (%s)", errno, strerror(errno));
+ return -1;
+ } else if (dma_unmap.size != len) {
+ EAL_LOG(ERR, "Unexpected size %"PRIu64
+ " of DMA remapping cleared instead of %"PRIu64,
+ (uint64_t)dma_unmap.size, len);
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+static int
+vfio_type1_dma_map(struct container *cfg)
+{
+ return rte_memseg_walk(type1_map, cfg);
+}
+
+/* Track the size of the statically allocated DMA window for SPAPR */
+uint64_t spapr_dma_win_len;
+uint64_t spapr_dma_win_page_sz;
+
+static int
+vfio_spapr_dma_do_map(struct container *cfg, uint64_t vaddr, uint64_t iova,
+ uint64_t len, int do_map)
+{
+ struct vfio_iommu_spapr_register_memory reg = {
+ .argsz = sizeof(reg),
+ .vaddr = (uintptr_t) vaddr,
+ .size = len,
+ .flags = 0
+ };
+ int ret;
+
+ if (do_map != 0) {
+ struct vfio_iommu_type1_dma_map dma_map;
+
+ if (iova + len > spapr_dma_win_len) {
+ EAL_LOG(ERR, "DMA map attempt outside DMA window");
+ return -1;
+ }
+
+ ret = ioctl(cfg->container_fd,
+ VFIO_IOMMU_SPAPR_REGISTER_MEMORY, ®);
+ if (ret) {
+ EAL_LOG(ERR,
+ "Cannot register vaddr for IOMMU, error "
+ "%i (%s)", errno, strerror(errno));
+ return -1;
+ }
+
+ memset(&dma_map, 0, sizeof(dma_map));
+ dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+ dma_map.vaddr = vaddr;
+ dma_map.size = len;
+ dma_map.iova = iova;
+ dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+
+ ret = ioctl(cfg->container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+ if (ret) {
+ EAL_LOG(ERR, "Cannot map vaddr for IOMMU, error "
+ "%i (%s)", errno, strerror(errno));
+ return -1;
+ }
+
+ } else {
+ struct vfio_iommu_type1_dma_map dma_unmap;
+
+ memset(&dma_unmap, 0, sizeof(dma_unmap));
+ dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
+ dma_unmap.size = len;
+ dma_unmap.iova = iova;
+
+ ret = ioctl(cfg->container_fd, VFIO_IOMMU_UNMAP_DMA,
+ &dma_unmap);
+ if (ret) {
+ EAL_LOG(ERR, "Cannot unmap vaddr for IOMMU, error "
+ "%i (%s)", errno, strerror(errno));
+ return -1;
+ }
+
+ ret = ioctl(cfg->container_fd,
+ VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, ®);
+ if (ret) {
+ EAL_LOG(ERR,
+ "Cannot unregister vaddr for IOMMU, error "
+ "%i (%s)", errno, strerror(errno));
+ return -1;
+ }
+ }
+
+ return ret;
+}
+
+static int
+vfio_spapr_map_walk(const struct rte_memseg_list *msl,
+ const struct rte_memseg *ms, void *arg)
+{
+ struct container *cfg = arg;
+
+ /* skip external memory that isn't a heap */
+ if (msl->external && !msl->heap)
+ return 0;
+
+ /* skip any segments with invalid IOVA addresses */
+ if (ms->iova == RTE_BAD_IOVA)
+ return 0;
+
+ return vfio_spapr_dma_do_map(cfg, ms->addr_64, ms->iova, ms->len, 1);
+}
+
+struct spapr_size_walk_param {
+ uint64_t max_va;
+ uint64_t page_sz;
+ bool is_user_managed;
+};
+
+/*
+ * In order to set the DMA window size required for the SPAPR IOMMU
+ * we need to walk the existing virtual memory allocations as well as
+ * find the hugepage size used.
+ */
+static int
+vfio_spapr_size_walk(const struct rte_memseg_list *msl, void *arg)
+{
+ struct spapr_size_walk_param *param = arg;
+ uint64_t max = (uint64_t) msl->base_va + (uint64_t) msl->len;
+
+ if (msl->external && !msl->heap) {
+ /* ignore user managed external memory */
+ param->is_user_managed = true;
+ return 0;
+ }
+
+ if (max > param->max_va) {
+ param->page_sz = msl->page_sz;
+ param->max_va = max;
+ }
+
+ return 0;
+}
+
+/*
+ * Find the highest memory address used in physical or virtual address
+ * space and use that as the top of the DMA window.
+ */
+static int
+find_highest_mem_addr(struct spapr_size_walk_param *param)
+{
+ /* find the maximum IOVA address for setting the DMA window size */
+ if (rte_eal_iova_mode() == RTE_IOVA_PA) {
+ static const char proc_iomem[] = "/proc/iomem";
+ static const char str_sysram[] = "System RAM";
+ uint64_t start, end, max = 0;
+ char *line = NULL;
+ char *dash, *space;
+ size_t line_len;
+
+ /*
+ * Example "System RAM" in /proc/iomem:
+ * 00000000-1fffffffff : System RAM
+ * 200000000000-201fffffffff : System RAM
+ */
+ FILE *fd = fopen(proc_iomem, "r");
+ if (fd == NULL) {
+ EAL_LOG(ERR, "Cannot open %s", proc_iomem);
+ return -1;
+ }
+ /* Scan /proc/iomem for the highest PA in the system */
+ while (getline(&line, &line_len, fd) != -1) {
+ if (strstr(line, str_sysram) == NULL)
+ continue;
+
+ space = strstr(line, " ");
+ dash = strstr(line, "-");
+
+ /* Validate the format of the memory string */
+ if (space == NULL || dash == NULL || space < dash) {
+ EAL_LOG(ERR, "Can't parse line \"%s\" in file %s",
+ line, proc_iomem);
+ continue;
+ }
+
+ start = strtoull(line, NULL, 16);
+ end = strtoull(dash + 1, NULL, 16);
+ EAL_LOG(DEBUG, "Found system RAM from 0x%" PRIx64
+ " to 0x%" PRIx64, start, end);
+ if (end > max)
+ max = end;
+ }
+ free(line);
+ fclose(fd);
+
+ if (max == 0) {
+ EAL_LOG(ERR, "Failed to find valid \"System RAM\" "
+ "entry in file %s", proc_iomem);
+ return -1;
+ }
+
+ spapr_dma_win_len = rte_align64pow2(max + 1);
+ return 0;
+ } else if (rte_eal_iova_mode() == RTE_IOVA_VA) {
+ EAL_LOG(DEBUG, "Highest VA address in memseg list is 0x%"
+ PRIx64, param->max_va);
+ spapr_dma_win_len = rte_align64pow2(param->max_va);
+ return 0;
+ }
+
+ spapr_dma_win_len = 0;
+ EAL_LOG(ERR, "Unsupported IOVA mode");
+ return -1;
+}
+
+
+/*
+ * The SPAPRv2 IOMMU supports 2 DMA windows with starting
+ * address at 0 or 1<<59. By default, a DMA window is set
+ * at address 0, 2GB long, with a 4KB page. For DPDK we
+ * must remove the default window and setup a new DMA window
+ * based on the hugepage size and memory requirements of
+ * the application before we can map memory for DMA.
+ */
+static int
+spapr_dma_win_size(void)
+{
+ struct spapr_size_walk_param param;
+
+ /* only create DMA window once */
+ if (spapr_dma_win_len > 0)
+ return 0;
+
+ /* walk the memseg list to find the page size/max VA address */
+ memset(¶m, 0, sizeof(param));
+ if (rte_memseg_list_walk(vfio_spapr_size_walk, ¶m) < 0) {
+ EAL_LOG(ERR, "Failed to walk memseg list for DMA window size");
+ return -1;
+ }
+
+ /* we can't be sure if DMA window covers external memory */
+ if (param.is_user_managed)
+ EAL_LOG(WARNING, "Detected user managed external memory which may not be managed by the IOMMU");
+
+ /* check physical/virtual memory size */
+ if (find_highest_mem_addr(¶m) < 0)
+ return -1;
+ EAL_LOG(DEBUG, "Setting DMA window size to 0x%" PRIx64,
+ spapr_dma_win_len);
+ spapr_dma_win_page_sz = param.page_sz;
+ rte_mem_set_dma_mask(rte_ctz64(spapr_dma_win_len));
+ return 0;
+}
+
+static int
+vfio_spapr_create_dma_window(struct container *cfg)
+{
+ struct vfio_iommu_spapr_tce_create create = {
+ .argsz = sizeof(create), };
+ struct vfio_iommu_spapr_tce_remove remove = {
+ .argsz = sizeof(remove), };
+ struct vfio_iommu_spapr_tce_info info = {
+ .argsz = sizeof(info), };
+ int ret;
+
+ ret = spapr_dma_win_size();
+ if (ret < 0)
+ return ret;
+
+ ret = ioctl(cfg->container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
+ if (ret) {
+ EAL_LOG(ERR, "Cannot get IOMMU info, error %i (%s)",
+ errno, strerror(errno));
+ return -1;
+ }
+
+ /*
+ * sPAPR v1/v2 IOMMU always has a default 1G DMA window set. The window
+ * can't be changed for v1 but it can be changed for v2. Since DPDK only
+ * supports v2, remove the default DMA window so it can be resized.
+ */
+ remove.start_addr = info.dma32_window_start;
+ ret = ioctl(cfg->container_fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
+ if (ret)
+ return -1;
+
+ /* create a new DMA window (start address is not selectable) */
+ create.window_size = spapr_dma_win_len;
+ create.page_shift = rte_ctz64(spapr_dma_win_page_sz);
+ create.levels = 1;
+ ret = ioctl(cfg->container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
+ /*
+ * The vfio_iommu_spapr_tce_info structure was modified in
+ * Linux kernel 4.2.0 to add support for the
+ * vfio_iommu_spapr_tce_ddw_info structure needed to try
+ * multiple table levels. Skip the attempt if running with
+ * an older kernel.
+ */
+ if (ret) {
+ /* if at first we don't succeed, try more levels */
+ uint32_t levels;
+
+ for (levels = create.levels + 1;
+ ret && levels <= info.ddw.levels; levels++) {
+ create.levels = levels;
+ ret = ioctl(cfg->container_fd,
+ VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
+ }
+ }
+ if (ret) {
+ EAL_LOG(ERR, "Cannot create new DMA window, error "
+ "%i (%s)", errno, strerror(errno));
+ EAL_LOG(ERR,
+ "Consider using a larger hugepage size if supported by the system");
+ return -1;
+ }
+
+ /* verify the start address */
+ if (create.start_addr != 0) {
+ EAL_LOG(ERR, "Received unsupported start address 0x%"
+ PRIx64, (uint64_t)create.start_addr);
+ return -1;
+ }
+ return ret;
+}
+
+static int
+vfio_spapr_dma_mem_map(struct container *cfg, uint64_t vaddr,
+ uint64_t iova, uint64_t len, int do_map)
+{
+ int ret = 0;
+
+ if (do_map) {
+ if (vfio_spapr_dma_do_map(cfg, vaddr, iova, len, 1)) {
+ EAL_LOG(ERR, "Failed to map DMA");
+ ret = -1;
+ }
+ } else {
+ if (vfio_spapr_dma_do_map(cfg, vaddr, iova, len, 0)) {
+ EAL_LOG(ERR, "Failed to unmap DMA");
+ ret = -1;
+ }
+ }
+
+ return ret;
+}
+
+static int
+vfio_spapr_dma_map(struct container *cfg)
+{
+ if (vfio_spapr_create_dma_window(cfg) < 0) {
+ EAL_LOG(ERR, "Could not create new DMA window!");
+ return -1;
+ }
+
+ /* map all existing DPDK segments for DMA */
+ if (rte_memseg_walk(vfio_spapr_map_walk, cfg) < 0)
+ return -1;
+
+ return 0;
+}
+
+static int
+vfio_noiommu_dma_map(struct container *cfg __rte_unused)
+{
+ /* No-IOMMU mode does not need DMA mapping */
+ return 0;
+}
+
+static int
+vfio_noiommu_dma_mem_map(struct container *cfg __rte_unused,
+ uint64_t vaddr __rte_unused,
+ uint64_t iova __rte_unused, uint64_t len __rte_unused,
+ int do_map __rte_unused)
+{
+ /* No-IOMMU mode does not need DMA mapping */
+ return 0;
+}
+
+struct vfio_group *
+vfio_group_create(struct container *cfg, int iommu_group)
+{
+ struct vfio_group *grp;
+
+ if (cfg->group_cfg.n_groups >= RTE_DIM(cfg->group_cfg.groups)) {
+ EAL_LOG(ERR, "Cannot add more VFIO groups to container");
+ return NULL;
+ }
+ GROUP_FOREACH(cfg, grp) {
+ if (grp->active)
+ continue;
+ cfg->group_cfg.n_groups++;
+ grp->active = true;
+ grp->group_num = iommu_group;
+ grp->fd = -1;
+ return grp;
+ }
+ /* should not happen */
+ return NULL;
+}
+
+void
+vfio_group_erase(struct container *cfg, struct vfio_group *grp)
+{
+ struct vfio_group_config *group_cfg = &cfg->group_cfg;
+
+ if (grp->fd >= 0 && close(grp->fd) < 0)
+ EAL_LOG(ERR, "Error when closing group fd %d", grp->fd);
+
+ *grp = (struct vfio_group){0};
+ group_cfg->n_groups--;
+
+ /* if this was the last group in config, erase IOMMU setup and unregister callback */
+ if (group_cfg->n_groups == 0) {
+ group_cfg->dma_setup_done = false;
+ group_cfg->iommu_type_set = false;
+ }
+}
+
+struct vfio_group *
+vfio_group_get_by_num(struct container *cfg, int iommu_group)
+{
+ struct vfio_group *grp;
+
+ GROUP_FOREACH_ACTIVE(cfg, grp) {
+ if (grp->group_num == iommu_group)
+ return grp;
+ }
+ return NULL;
+}
+
+static int
+vfio_open_group_sysfs(int iommu_group_num)
+{
+ char filename[PATH_MAX];
+ int fd;
+
+ if (vfio_cfg.mode == RTE_VFIO_MODE_GROUP)
+ snprintf(filename, sizeof(filename), RTE_VFIO_GROUP_FMT, iommu_group_num);
+ else if (vfio_cfg.mode == RTE_VFIO_MODE_NOIOMMU)
+ snprintf(filename, sizeof(filename), RTE_VFIO_NOIOMMU_GROUP_FMT, iommu_group_num);
+
+ /* reset errno before open to differentiate errors */
+ errno = 0;
+ fd = open(filename, O_RDWR);
+
+ /* we have to differentiate between failed open and non-existence */
+ if (errno == ENOENT)
+ return -ENOENT;
+ return fd;
+}
+
+static int
+vfio_group_request_fd(int iommu_group_num)
+{
+ struct rte_mp_msg mp_req, *mp_rep;
+ struct rte_mp_reply mp_reply = {0};
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
+ int vfio_group_fd = -1;
+
+ p->req = SOCKET_REQ_GROUP;
+ p->group_num = iommu_group_num;
+ rte_strscpy(mp_req.name, EAL_VFIO_MP, sizeof(mp_req.name));
+ mp_req.len_param = sizeof(*p);
+ mp_req.num_fds = 0;
+
+ if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 && mp_reply.nb_received == 1) {
+ mp_rep = &mp_reply.msgs[0];
+ p = (struct vfio_mp_param *)mp_rep->param;
+ if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+ vfio_group_fd = mp_rep->fds[0];
+ } else if (p->result == SOCKET_NO_FD) {
+ EAL_LOG(ERR, "Bad VFIO group fd");
+ vfio_group_fd = -ENOENT;
+ }
+ }
+
+ free(mp_reply.msgs);
+ return vfio_group_fd;
+}
+
+int
+vfio_group_open_fd(struct container *cfg, struct vfio_group *grp)
+{
+ int vfio_group_fd;
+
+ /* we make multiprocess request only in secondary processes for default config */
+ if ((rte_eal_process_type() != RTE_PROC_PRIMARY) && (vfio_container_is_default(cfg)))
+ vfio_group_fd = vfio_group_request_fd(grp->group_num);
+ else
+ vfio_group_fd = vfio_open_group_sysfs(grp->group_num);
+
+ /* pass the non-existence up the chain */
+ if (vfio_group_fd == -ENOENT)
+ return vfio_group_fd;
+ else if (vfio_group_fd < 0) {
+ EAL_LOG(ERR, "Failed to open VFIO group %d", grp->group_num);
+ return vfio_group_fd;
+ }
+ grp->fd = vfio_group_fd;
+ return 0;
+}
+
+static const struct vfio_iommu_ops *
+vfio_group_sync_iommu_ops(void)
+{
+ struct rte_mp_msg mp_req, *mp_rep;
+ struct rte_mp_reply mp_reply = {0};
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
+ int iommu_type_id;
+ unsigned int i;
+
+ /* find default container's IOMMU type */
+ p->req = SOCKET_REQ_IOMMU_TYPE;
+ rte_strscpy(mp_req.name, EAL_VFIO_MP, sizeof(mp_req.name));
+ mp_req.len_param = sizeof(*p);
+ mp_req.num_fds = 0;
+
+ iommu_type_id = -1;
+ if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
+ mp_reply.nb_received == 1) {
+ mp_rep = &mp_reply.msgs[0];
+ p = (struct vfio_mp_param *)mp_rep->param;
+ if (p->result == SOCKET_OK)
+ iommu_type_id = p->iommu_type_id;
+ }
+ free(mp_reply.msgs);
+ if (iommu_type_id < 0) {
+ EAL_LOG(ERR, "Could not get IOMMU type from primary process");
+ return NULL;
+ }
+
+ /* we now have an fd for default container, as well as its IOMMU type.
+ * now, set up default VFIO container config to match.
+ */
+ for (i = 0; i < RTE_DIM(iommu_types); i++) {
+ const struct vfio_iommu_ops *t = &iommu_types[i];
+ if (t->type_id != iommu_type_id)
+ continue;
+
+ return t;
+ }
+ EAL_LOG(ERR, "Could not find IOMMU type id (%i)", iommu_type_id);
+ return NULL;
+}
+
+int
+vfio_group_noiommu_is_enabled(void)
+{
+ int fd;
+ ssize_t cnt;
+ char c;
+
+ fd = open(RTE_VFIO_NOIOMMU_MODE, O_RDONLY);
+ if (fd < 0) {
+ if (errno != ENOENT) {
+ EAL_LOG(ERR, "Cannot open VFIO noiommu file "
+ "%i (%s)", errno, strerror(errno));
+ return -1;
+ }
+ /*
+ * else the file does not exists
+ * i.e. noiommu is not enabled
+ */
+ return 0;
+ }
+
+ cnt = read(fd, &c, 1);
+ close(fd);
+ if (cnt != 1) {
+ EAL_LOG(ERR, "Unable to read from VFIO noiommu file "
+ "%i (%s)", errno, strerror(errno));
+ return -1;
+ }
+
+ return c == 'Y';
+}
+
+static int
+vfio_has_supported_extensions(int vfio_container_fd)
+{
+ int ret;
+ unsigned int idx, n_extensions = 0;
+ for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
+ const struct vfio_iommu_ops *t = &iommu_types[idx];
+
+ ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION,
+ t->type_id);
+ if (ret < 0) {
+ EAL_LOG(ERR, "Could not get IOMMU type, error "
+ "%i (%s)", errno, strerror(errno));
+ close(vfio_container_fd);
+ return -1;
+ } else if (ret == 1) {
+ /* we found a supported extension */
+ n_extensions++;
+ }
+ EAL_LOG(DEBUG, "IOMMU type %d (%s) is %s",
+ t->type_id, t->name,
+ ret ? "supported" : "not supported");
+ }
+
+ /* if we didn't find any supported IOMMU types, fail */
+ if (n_extensions == 0)
+ return -1;
+
+ return 0;
+}
+
+int
+vfio_group_open_container_fd(void)
+{
+ int ret, vfio_container_fd;
+
+ vfio_container_fd = open(RTE_VFIO_CONTAINER_PATH, O_RDWR);
+ if (vfio_container_fd < 0) {
+ EAL_LOG(DEBUG, "Cannot open VFIO container %s, error %i (%s)",
+ RTE_VFIO_CONTAINER_PATH, errno, strerror(errno));
+ return -1;
+ }
+
+ /* check VFIO API version */
+ ret = ioctl(vfio_container_fd, VFIO_GET_API_VERSION);
+ if (ret != VFIO_API_VERSION) {
+ if (ret < 0)
+ EAL_LOG(DEBUG,
+ "Could not get VFIO API version, error "
+ "%i (%s)", errno, strerror(errno));
+ else
+ EAL_LOG(DEBUG, "Unsupported VFIO API version!");
+ close(vfio_container_fd);
+ return -1;
+ }
+
+ ret = vfio_has_supported_extensions(vfio_container_fd);
+ if (ret) {
+ EAL_LOG(DEBUG,
+ "No supported IOMMU extensions found!");
+ close(vfio_container_fd);
+ return -1;
+ }
+
+ return vfio_container_fd;
+}
+
+int
+vfio_group_enable(struct container *cfg)
+{
+ int container_fd;
+ DIR *dir;
+
+ /* VFIO directory might not exist (e.g., unprivileged containers) */
+ dir = opendir(RTE_VFIO_DIR);
+ if (dir == NULL) {
+ EAL_LOG(DEBUG,
+ "VFIO directory does not exist, skipping VFIO group support...");
+ return 1;
+ }
+ closedir(dir);
+
+ /* open a default container */
+ container_fd = vfio_group_open_container_fd();
+ if (container_fd < 0)
+ return -1;
+
+ cfg->container_fd = container_fd;
+ return 0;
+}
+
+int
+vfio_group_prepare(struct container *cfg, struct vfio_group *grp)
+{
+ struct vfio_group_status group_status = {
+ .argsz = sizeof(group_status)};
+ int ret;
+
+ /*
+ * We need to assign group to a container and check if it is viable, but there are cases
+ * where we don't need to do that.
+ *
+ * For default container, we need to set up the group only in primary process, as secondary
+ * process would have requested group fd over IPC, which implies it would have already been
+ * set up by the primary.
+ *
+ * For custom containers, every process sets up its own groups.
+ */
+ if (vfio_container_is_default(cfg) && rte_eal_process_type() != RTE_PROC_PRIMARY) {
+ EAL_LOG(DEBUG, "Skipping setup for VFIO group %d", grp->group_num);
+ return 0;
+ }
+
+ /* check if the group is viable */
+ ret = ioctl(grp->fd, VFIO_GROUP_GET_STATUS, &group_status);
+ if (ret) {
+ EAL_LOG(ERR, "Cannot get VFIO group status for group %d, error %i (%s)",
+ grp->group_num, errno, strerror(errno));
+ return -1;
+ }
+
+ if ((group_status.flags & VFIO_GROUP_FLAGS_VIABLE) == 0) {
+ EAL_LOG(ERR, "VFIO group %d is not viable! "
+ "Not all devices in IOMMU group bound to VFIO or unbound",
+ grp->group_num);
+ return -1;
+ }
+
+ /* set container for group if necessary */
+ if ((group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET) == 0) {
+ /* add group to a container */
+ ret = ioctl(grp->fd, VFIO_GROUP_SET_CONTAINER, &cfg->container_fd);
+ if (ret) {
+ EAL_LOG(ERR, "Cannot add VFIO group %d to container, error %i (%s)",
+ grp->group_num, errno, strerror(errno));
+ return -1;
+ }
+ } else {
+ /* group is already added to a container - this should not happen */
+ EAL_LOG(ERR, "VFIO group %d is already assigned to a container", grp->group_num);
+ return -1;
+ }
+ return 0;
+}
+
+int
+vfio_group_setup_iommu(struct container *cfg)
+{
+ const struct vfio_iommu_ops *ops;
+
+ /*
+ * Setting IOMMU type is a per-container operation (via ioctl on container fd), but the ops
+ * structure is global and shared across all containers.
+ *
+ * For secondary processes with default container, we sync ops from primary. For all other
+ * cases (primary, or secondary with custom containers), we set IOMMU type on the container
+ * which also discovers the ops.
+ */
+ if (vfio_container_is_default(cfg) && rte_eal_process_type() != RTE_PROC_PRIMARY) {
+ /* Secondary process: sync ops from primary for default container */
+ ops = vfio_group_sync_iommu_ops();
+ if (ops == NULL)
+ return -1;
+ } else {
+ /* Primary process OR custom container: set IOMMU type on container */
+ ops = vfio_group_set_iommu_type(cfg->container_fd);
+ if (ops == NULL)
+ return -1;
+ }
+
+ /* Set or verify global ops */
+ if (vfio_cfg.ops == NULL) {
+ vfio_cfg.ops = ops;
+ EAL_LOG(INFO, "IOMMU type set to %d (%s)", ops->type_id, ops->name);
+ } else if (vfio_cfg.ops != ops) {
+ /* This shouldn't happen on the same machine, but log it */
+ EAL_LOG(WARNING,
+ "Container has different IOMMU type (%d - %s) than previously set (%d - %s)",
+ ops->type_id, ops->name, vfio_cfg.ops->type_id, vfio_cfg.ops->name);
+ }
+
+ return 0;
+}
+
+int
+vfio_group_setup_device_fd(const char *dev_addr, struct vfio_group *grp, struct vfio_device *dev)
+{
+ rte_uuid_t vf_token;
+ int fd;
+
+ rte_eal_vfio_get_vf_token(vf_token);
+
+ if (!rte_uuid_is_null(vf_token)) {
+ char vf_token_str[RTE_UUID_STRLEN];
+ char devaddr[PATH_MAX];
+
+ rte_uuid_unparse(vf_token, vf_token_str, sizeof(vf_token_str));
+ snprintf(devaddr, sizeof(devaddr),
+ "%s vf_token=%s", dev_addr, vf_token_str);
+
+ fd = ioctl(grp->fd, VFIO_GROUP_GET_DEVICE_FD, devaddr);
+ if (fd >= 0)
+ goto out;
+ }
+ /* get a file descriptor for the device */
+ fd = ioctl(grp->fd, VFIO_GROUP_GET_DEVICE_FD, dev_addr);
+ if (fd < 0) {
+ /*
+ * if we cannot get a device fd, this implies a problem with the VFIO group or the
+ * container not having IOMMU configured.
+ */
+ EAL_LOG(WARNING, "Getting a vfio_dev_fd for %s failed", dev_addr);
+ return -1;
+ }
+out:
+ dev->fd = fd;
+ /* store backreference to group */
+ dev->group = grp->group_num;
+ /* increment number of devices in group */
+ grp->n_devices++;
+ return 0;
+}
+
+int
+vfio_group_get_num(const char *sysfs_base, const char *dev_addr, int *iommu_group_num)
+{
+ char linkname[PATH_MAX];
+ char filename[PATH_MAX];
+ char *tok[16], *group_tok, *end;
+ int ret, group_num;
+
+ memset(linkname, 0, sizeof(linkname));
+ memset(filename, 0, sizeof(filename));
+
+ /* try to find out IOMMU group for this device */
+ snprintf(linkname, sizeof(linkname),
+ "%s/%s/iommu_group", sysfs_base, dev_addr);
+
+ ret = readlink(linkname, filename, sizeof(filename));
+
+ /* if the link doesn't exist, no VFIO for us */
+ if (ret < 0)
+ return 0;
+
+ ret = rte_strsplit(filename, sizeof(filename),
+ tok, RTE_DIM(tok), '/');
+
+ if (ret <= 0) {
+ EAL_LOG(ERR, "%s cannot get IOMMU group", dev_addr);
+ return -1;
+ }
+
+ /* IOMMU group is always the last token */
+ errno = 0;
+ group_tok = tok[ret - 1];
+ end = group_tok;
+ group_num = strtol(group_tok, &end, 10);
+ if (end == group_tok || *end != '\0' || errno != 0) {
+ EAL_LOG(ERR, "%s error parsing IOMMU number!", dev_addr);
+ return -1;
+ }
+ *iommu_group_num = group_num;
+
+ return 1;
+}
diff --git a/lib/eal/linux/eal_vfio_mp_sync.c b/lib/eal/linux/eal_vfio_mp_sync.c
index 3eaeef2fc8..9a07d35023 100644
--- a/lib/eal/linux/eal_vfio_mp_sync.c
+++ b/lib/eal/linux/eal_vfio_mp_sync.c
@@ -32,21 +32,32 @@ vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer)
switch (m->req) {
case SOCKET_REQ_GROUP:
+ {
+ struct container *cfg = vfio_cfg.default_cfg;
+ struct vfio_group *grp;
+
+ if (vfio_cfg.mode != RTE_VFIO_MODE_GROUP &&
+ vfio_cfg.mode != RTE_VFIO_MODE_NOIOMMU) {
+ EAL_LOG(ERR, "VFIO not initialized in group mode");
+ r->result = SOCKET_ERR;
+ break;
+ }
+
r->req = SOCKET_REQ_GROUP;
r->group_num = m->group_num;
- fd = vfio_get_group_fd_by_num(m->group_num);
- if (fd < 0 && fd != -ENOENT)
- r->result = SOCKET_ERR;
- else if (fd == -ENOENT)
- /* if VFIO group exists but isn't bound to VFIO driver */
+ grp = vfio_group_get_by_num(cfg, m->group_num);
+ if (grp == NULL) {
+ /* group doesn't exist in primary */
r->result = SOCKET_NO_FD;
- else {
- /* if group exists and is bound to VFIO driver */
+ } else {
+ /* group exists and is bound to VFIO driver */
+ fd = grp->fd;
r->result = SOCKET_OK;
reply.num_fds = 1;
reply.fds[0] = fd;
}
break;
+ }
case SOCKET_REQ_CONTAINER:
r->req = SOCKET_REQ_CONTAINER;
fd = rte_vfio_get_container_fd();
@@ -54,6 +65,7 @@ vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer)
r->result = SOCKET_ERR;
else {
r->result = SOCKET_OK;
+ r->mode = vfio_cfg.mode;
reply.num_fds = 1;
reply.fds[0] = fd;
}
@@ -62,6 +74,13 @@ vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer)
{
int iommu_type_id;
+ if (vfio_cfg.mode != RTE_VFIO_MODE_GROUP &&
+ vfio_cfg.mode != RTE_VFIO_MODE_NOIOMMU) {
+ EAL_LOG(ERR, "VFIO not initialized in group mode");
+ r->result = SOCKET_ERR;
+ break;
+ }
+
r->req = SOCKET_REQ_IOMMU_TYPE;
iommu_type_id = vfio_get_iommu_type();
@@ -90,8 +109,11 @@ vfio_mp_sync_setup(void)
{
if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
int ret = rte_mp_action_register(EAL_VFIO_MP, vfio_mp_primary);
- if (ret && rte_errno != ENOTSUP)
+ if (ret && rte_errno != ENOTSUP) {
+ EAL_LOG(ERR, "Multiprocess sync setup failed: %d (%s)",
+ rte_errno, rte_strerror(rte_errno));
return -1;
+ }
}
return 0;
diff --git a/lib/eal/linux/meson.build b/lib/eal/linux/meson.build
index 29ba313218..5ec8eddaa2 100644
--- a/lib/eal/linux/meson.build
+++ b/lib/eal/linux/meson.build
@@ -16,6 +16,7 @@ sources += files(
'eal_thread.c',
'eal_timer.c',
'eal_vfio.c',
+ 'eal_vfio_group.c',
'eal_vfio_mp_sync.c',
)
--
2.47.3
^ permalink raw reply related
* [PATCH v8 13/18] bus/pci: use the new VFIO mode API
From: Anatoly Burakov @ 2026-06-11 15:09 UTC (permalink / raw)
To: dev, Chenbo Xia, Nipun Gupta
In-Reply-To: <cover.1781190151.git.anatoly.burakov@intel.com>
Use the new VFIO mode API to query no-IOMMU status.
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
drivers/bus/pci/linux/pci.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
index 9aae0a5d14..a1575b84e2 100644
--- a/drivers/bus/pci/linux/pci.c
+++ b/drivers/bus/pci/linux/pci.c
@@ -596,7 +596,7 @@ pci_device_iova_mode(const struct rte_pci_driver *pdrv,
static int is_vfio_noiommu_enabled = -1;
if (is_vfio_noiommu_enabled == -1) {
- if (rte_vfio_noiommu_is_enabled() == 1)
+ if (rte_vfio_get_mode() == RTE_VFIO_MODE_NOIOMMU)
is_vfio_noiommu_enabled = 1;
else
is_vfio_noiommu_enabled = 0;
--
2.47.3
^ permalink raw reply related
* [PATCH v8 11/18] vfio: remove group-based API
From: Anatoly Burakov @ 2026-06-11 15:09 UTC (permalink / raw)
To: dev, Bruce Richardson
In-Reply-To: <cover.1781190151.git.anatoly.burakov@intel.com>
All drivers have been adjusted to not use the VFIO group API directly and
instead rely on container device assignment model, so the group API is no
longer useful and can be removed.
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
lib/eal/freebsd/eal.c | 33 ---------------
lib/eal/include/rte_vfio.h | 72 --------------------------------
lib/eal/linux/eal_vfio.c | 47 ++++++++++-----------
lib/eal/linux/eal_vfio.h | 2 +
lib/eal/linux/eal_vfio_mp_sync.c | 2 +-
5 files changed, 26 insertions(+), 130 deletions(-)
diff --git a/lib/eal/freebsd/eal.c b/lib/eal/freebsd/eal.c
index 0c64a62c5a..cda72dfd1d 100644
--- a/lib/eal/freebsd/eal.c
+++ b/lib/eal/freebsd/eal.c
@@ -856,13 +856,6 @@ int rte_vfio_noiommu_is_enabled(void)
return 0;
}
-RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_clear_group)
-int rte_vfio_clear_group(__rte_unused int vfio_group_fd)
-{
- rte_errno = ENOTSUP;
- return -1;
-}
-
RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_get_group_num)
int
rte_vfio_get_group_num(__rte_unused const char *sysfs_base,
@@ -881,14 +874,6 @@ rte_vfio_get_container_fd(void)
return -1;
}
-RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_get_group_fd)
-int
-rte_vfio_get_group_fd(__rte_unused int iommu_group_num)
-{
- rte_errno = ENOTSUP;
- return -1;
-}
-
RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_create)
int
rte_vfio_container_create(void)
@@ -905,24 +890,6 @@ rte_vfio_container_destroy(__rte_unused int container_fd)
return -1;
}
-RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_group_bind)
-int
-rte_vfio_container_group_bind(__rte_unused int container_fd,
- __rte_unused int iommu_group_num)
-{
- rte_errno = ENOTSUP;
- return -1;
-}
-
-RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_group_unbind)
-int
-rte_vfio_container_group_unbind(__rte_unused int container_fd,
- __rte_unused int iommu_group_num)
-{
- rte_errno = ENOTSUP;
- return -1;
-}
-
RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_dma_map)
int
rte_vfio_container_dma_map(__rte_unused int container_fd,
diff --git a/lib/eal/include/rte_vfio.h b/lib/eal/include/rte_vfio.h
index e7e2ee950b..941b7d0541 100644
--- a/lib/eal/include/rte_vfio.h
+++ b/lib/eal/include/rte_vfio.h
@@ -136,24 +136,6 @@ int rte_vfio_is_enabled(const char *modname);
__rte_internal
int rte_vfio_noiommu_is_enabled(void);
-/**
- * @internal
- * Remove group fd from internal VFIO group fd array.
- *
- * This function is only relevant to linux and will return
- * an error on BSD.
- *
- * @param vfio_group_fd
- * VFIO Group FD.
- *
- * @return
- * 0 on success.
- * <0 on failure.
- */
-__rte_internal
-int
-rte_vfio_clear_group(int vfio_group_fd);
-
/**
* @internal
* Parse IOMMU group number for a device.
@@ -218,24 +200,6 @@ __rte_internal
int
rte_vfio_get_container_fd(void);
-/**
- * @internal
- * Open VFIO group fd or get an existing one.
- *
- * This function is only relevant to linux and will return
- * an error on BSD.
- *
- * @param iommu_group_num
- * iommu group number
- *
- * @return
- * > 0 group fd
- * < 0 for errors
- */
-__rte_internal
-int
-rte_vfio_get_group_fd(int iommu_group_num);
-
/**
* @internal
* Create a new container for device binding.
@@ -297,42 +261,6 @@ int
rte_vfio_container_assign_device(int vfio_container_fd, const char *sysfs_base,
const char *dev_addr);
-/**
- * @internal
- * Bind a IOMMU group to a container.
- *
- * @param container_fd
- * the container's fd
- *
- * @param iommu_group_num
- * the iommu group number to bind to container
- *
- * @return
- * group fd if successful
- * <0 if failed
- */
-__rte_internal
-int
-rte_vfio_container_group_bind(int container_fd, int iommu_group_num);
-
-/**
- * @internal
- * Unbind a IOMMU group from a container.
- *
- * @param container_fd
- * the container fd of container
- *
- * @param iommu_group_num
- * the iommu group number to delete from container
- *
- * @return
- * 0 if successful
- * <0 if failed
- */
-__rte_internal
-int
-rte_vfio_container_group_unbind(int container_fd, int iommu_group_num);
-
/**
* @internal
* Perform DMA mapping for devices in a container.
diff --git a/lib/eal/linux/eal_vfio.c b/lib/eal/linux/eal_vfio.c
index 02fec64658..7893d334eb 100644
--- a/lib/eal/linux/eal_vfio.c
+++ b/lib/eal/linux/eal_vfio.c
@@ -67,6 +67,9 @@ static int vfio_noiommu_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
static int vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr,
uint64_t iova, uint64_t len, int do_map);
+static int vfio_container_group_bind(int container_fd, int iommu_group_num);
+static int vfio_container_group_unbind(int container_fd, int iommu_group_num);
+
/* IOMMU types we support */
static const struct vfio_iommu_type iommu_types[] = {
/* x86 IOMMU, otherwise known as type 1 */
@@ -532,9 +535,8 @@ get_vfio_cfg_by_container_fd(int container_fd)
return NULL;
}
-RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_get_group_fd)
int
-rte_vfio_get_group_fd(int iommu_group_num)
+vfio_get_group_fd_by_num(int iommu_group_num)
{
struct vfio_config *vfio_cfg;
@@ -731,9 +733,8 @@ vfio_sync_default_container(void)
return -1;
}
-RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_clear_group)
-int
-rte_vfio_clear_group(int vfio_group_fd)
+static int
+vfio_clear_group(int vfio_group_fd)
{
int i;
struct vfio_config *vfio_cfg;
@@ -787,7 +788,7 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
return -1;
/* get the actual group fd */
- vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num);
+ vfio_group_fd = vfio_get_group_fd_by_num(iommu_group_num);
if (vfio_group_fd < 0 && vfio_group_fd != -ENOENT)
return -1;
@@ -813,14 +814,14 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
EAL_LOG(ERR, "%s cannot get VFIO group status, "
"error %i (%s)", dev_addr, errno, strerror(errno));
close(vfio_group_fd);
- rte_vfio_clear_group(vfio_group_fd);
+ vfio_clear_group(vfio_group_fd);
return -1;
} else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
EAL_LOG(ERR, "%s VFIO group is not viable! "
"Not all devices in IOMMU group bound to VFIO or unbound",
dev_addr);
close(vfio_group_fd);
- rte_vfio_clear_group(vfio_group_fd);
+ vfio_clear_group(vfio_group_fd);
return -1;
}
@@ -841,7 +842,7 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
"%s cannot add VFIO group to container, error "
"%i (%s)", dev_addr, errno, strerror(errno));
close(vfio_group_fd);
- rte_vfio_clear_group(vfio_group_fd);
+ vfio_clear_group(vfio_group_fd);
return -1;
}
@@ -865,7 +866,7 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
"%s failed to select IOMMU type",
dev_addr);
close(vfio_group_fd);
- rte_vfio_clear_group(vfio_group_fd);
+ vfio_clear_group(vfio_group_fd);
return -1;
}
/* lock memory hotplug before mapping and release it
@@ -882,7 +883,7 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
"%i (%s)",
dev_addr, errno, strerror(errno));
close(vfio_group_fd);
- rte_vfio_clear_group(vfio_group_fd);
+ vfio_clear_group(vfio_group_fd);
rte_mcfg_mem_read_unlock();
return -1;
}
@@ -951,7 +952,7 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
if (ret < 0) {
EAL_LOG(ERR, "Could not sync default VFIO container");
close(vfio_group_fd);
- rte_vfio_clear_group(vfio_group_fd);
+ vfio_clear_group(vfio_group_fd);
return -1;
}
/* we have successfully initialized VFIO, notify user */
@@ -988,7 +989,7 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
EAL_LOG(WARNING, "Getting a vfio_dev_fd for %s failed",
dev_addr);
close(vfio_group_fd);
- rte_vfio_clear_group(vfio_group_fd);
+ vfio_clear_group(vfio_group_fd);
return -1;
}
@@ -1026,9 +1027,9 @@ rte_vfio_release_device(const char *sysfs_base, const char *dev_addr,
}
/* get the actual group fd */
- vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num);
+ vfio_group_fd = vfio_get_group_fd_by_num(iommu_group_num);
if (vfio_group_fd < 0) {
- EAL_LOG(INFO, "rte_vfio_get_group_fd failed for %s",
+ EAL_LOG(INFO, "vfio_get_group_fd_by_num failed for %s",
dev_addr);
ret = vfio_group_fd;
goto out;
@@ -1064,7 +1065,7 @@ rte_vfio_release_device(const char *sysfs_base, const char *dev_addr,
goto out;
}
- if (rte_vfio_clear_group(vfio_group_fd) < 0) {
+ if (vfio_clear_group(vfio_group_fd) < 0) {
EAL_LOG(INFO, "Error when clearing group for %s",
dev_addr);
ret = -1;
@@ -2091,7 +2092,7 @@ rte_vfio_container_destroy(int container_fd)
for (i = 0; i < RTE_DIM(vfio_cfg->vfio_groups); i++)
if (vfio_cfg->vfio_groups[i].group_num != -1)
- rte_vfio_container_group_unbind(container_fd,
+ vfio_container_group_unbind(container_fd,
vfio_cfg->vfio_groups[i].group_num);
close(container_fd);
@@ -2122,7 +2123,7 @@ rte_vfio_container_assign_device(int vfio_container_fd, const char *sysfs_base,
return -1;
}
- ret = rte_vfio_container_group_bind(vfio_container_fd,
+ ret = vfio_container_group_bind(vfio_container_fd,
iommu_group_num);
if (ret < 0) {
EAL_LOG(ERR,
@@ -2134,9 +2135,8 @@ rte_vfio_container_assign_device(int vfio_container_fd, const char *sysfs_base,
return 0;
}
-RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_group_bind)
-int
-rte_vfio_container_group_bind(int container_fd, int iommu_group_num)
+static int
+vfio_container_group_bind(int container_fd, int iommu_group_num)
{
struct vfio_config *vfio_cfg;
@@ -2149,9 +2149,8 @@ rte_vfio_container_group_bind(int container_fd, int iommu_group_num)
return vfio_get_group_fd(vfio_cfg, iommu_group_num);
}
-RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_group_unbind)
-int
-rte_vfio_container_group_unbind(int container_fd, int iommu_group_num)
+static int
+vfio_container_group_unbind(int container_fd, int iommu_group_num)
{
struct vfio_group *cur_grp = NULL;
struct vfio_config *vfio_cfg;
diff --git a/lib/eal/linux/eal_vfio.h b/lib/eal/linux/eal_vfio.h
index 89c4b5ba45..30389fb274 100644
--- a/lib/eal/linux/eal_vfio.h
+++ b/lib/eal/linux/eal_vfio.h
@@ -51,6 +51,8 @@ vfio_set_iommu_type(int vfio_container_fd);
int
vfio_get_iommu_type(void);
+int vfio_get_group_fd_by_num(int iommu_group_num);
+
/* check if we have any supported extensions */
int
vfio_has_supported_extensions(int vfio_container_fd);
diff --git a/lib/eal/linux/eal_vfio_mp_sync.c b/lib/eal/linux/eal_vfio_mp_sync.c
index 22136f2e8b..3eaeef2fc8 100644
--- a/lib/eal/linux/eal_vfio_mp_sync.c
+++ b/lib/eal/linux/eal_vfio_mp_sync.c
@@ -34,7 +34,7 @@ vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer)
case SOCKET_REQ_GROUP:
r->req = SOCKET_REQ_GROUP;
r->group_num = m->group_num;
- fd = rte_vfio_get_group_fd(m->group_num);
+ fd = vfio_get_group_fd_by_num(m->group_num);
if (fd < 0 && fd != -ENOENT)
r->result = SOCKET_ERR;
else if (fd == -ENOENT)
--
2.47.3
^ permalink raw reply related
* [PATCH v8 10/18] vhost: remove group-related API from drivers
From: Anatoly Burakov @ 2026-06-11 15:09 UTC (permalink / raw)
To: dev, Maxime Coquelin, Chenbo Xia, Matan Azrad,
Viacheslav Ovsiienko, Chaoyong He
In-Reply-To: <cover.1781190151.git.anatoly.burakov@intel.com>
Some vDPA drivers have "get_vfio_group_fd" call in their internal driver
API structure, but it is not used for anything beyond device assignment
to containers which can now be achieved via other means, so remove this
API and all its usages.
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
doc/guides/prog_guide/vhost_lib.rst | 4 ----
doc/guides/rel_notes/deprecation.rst | 4 ----
drivers/vdpa/ifc/ifcvf_vdpa.c | 19 -------------------
drivers/vdpa/mlx5/mlx5_vdpa.c | 1 -
drivers/vdpa/nfp/nfp_vdpa.c | 20 --------------------
lib/vhost/vdpa_driver.h | 3 ---
6 files changed, 51 deletions(-)
diff --git a/doc/guides/prog_guide/vhost_lib.rst b/doc/guides/prog_guide/vhost_lib.rst
index 0c2b4d020a..2f80cf4072 100644
--- a/doc/guides/prog_guide/vhost_lib.rst
+++ b/doc/guides/prog_guide/vhost_lib.rst
@@ -471,10 +471,6 @@ Finally, a set of device ops is defined for device specific operations:
Called to allow the device to response to RARP sending.
-* ``get_vfio_group_fd``
-
- Called to get the VFIO group fd of the device.
-
* ``get_vfio_device_fd``
Called to get the VFIO device fd of the device.
diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index f2901064f5..c520129ac3 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -30,10 +30,6 @@ Deprecation Notices
Use the ``-S <service-corelist>`` parameter instead
to specify the cores to be used for background services in DPDK.
-* vdpa: The vDPA driver API will no longer offer ``get_vfio_group_fd``
- as part of its internal API. All drivers will be adjusted
- to use the new unified VFIO container device assignment API.
-
* rte_atomicNN_xxx: These APIs do not take memory order parameter. This does
not allow for writing optimized code for all the CPU architectures supported
in DPDK. DPDK has adopted the atomic operations from
diff --git a/drivers/vdpa/ifc/ifcvf_vdpa.c b/drivers/vdpa/ifc/ifcvf_vdpa.c
index 6f1c050787..63f4172da5 100644
--- a/drivers/vdpa/ifc/ifcvf_vdpa.c
+++ b/drivers/vdpa/ifc/ifcvf_vdpa.c
@@ -58,7 +58,6 @@ struct ifcvf_internal {
struct ifcvf_hw hw;
int configured;
int vfio_container_fd;
- int vfio_group_fd;
int vfio_dev_fd;
rte_thread_t tid; /* thread for notify relay */
rte_thread_t intr_tid; /* thread for config space change interrupt relay */
@@ -1204,22 +1203,6 @@ ifcvf_set_features(int vid)
return 0;
}
-static int
-ifcvf_get_vfio_group_fd(int vid)
-{
- struct rte_vdpa_device *vdev;
- struct internal_list *list;
-
- vdev = rte_vhost_get_vdpa_device(vid);
- list = find_internal_resource_by_vdev(vdev);
- if (list == NULL) {
- DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
- return -1;
- }
-
- return list->internal->vfio_group_fd;
-}
-
static int
ifcvf_get_vfio_device_fd(int vid)
{
@@ -1465,7 +1448,6 @@ static struct rte_vdpa_dev_ops ifcvf_net_ops = {
.set_vring_state = ifcvf_set_vring_state,
.set_features = ifcvf_set_features,
.migration_done = NULL,
- .get_vfio_group_fd = ifcvf_get_vfio_group_fd,
.get_vfio_device_fd = ifcvf_get_vfio_device_fd,
.get_notify_area = ifcvf_get_notify_area,
.get_dev_type = ifcvf_get_device_type,
@@ -1596,7 +1578,6 @@ static struct rte_vdpa_dev_ops ifcvf_blk_ops = {
.dev_close = ifcvf_dev_close,
.set_vring_state = ifcvf_set_vring_state,
.migration_done = NULL,
- .get_vfio_group_fd = ifcvf_get_vfio_group_fd,
.get_vfio_device_fd = ifcvf_get_vfio_device_fd,
.get_notify_area = ifcvf_get_notify_area,
.get_config = ifcvf_blk_get_config,
diff --git a/drivers/vdpa/mlx5/mlx5_vdpa.c b/drivers/vdpa/mlx5/mlx5_vdpa.c
index 11708e2005..1bb5de51b6 100644
--- a/drivers/vdpa/mlx5/mlx5_vdpa.c
+++ b/drivers/vdpa/mlx5/mlx5_vdpa.c
@@ -523,7 +523,6 @@ static struct rte_vdpa_dev_ops mlx5_vdpa_ops = {
.set_vring_state = mlx5_vdpa_set_vring_state,
.set_features = mlx5_vdpa_features_set,
.migration_done = NULL,
- .get_vfio_group_fd = NULL,
.get_vfio_device_fd = mlx5_vdpa_get_device_fd,
.get_notify_area = mlx5_vdpa_get_notify_area,
.get_stats_names = mlx5_vdpa_get_stats_names,
diff --git a/drivers/vdpa/nfp/nfp_vdpa.c b/drivers/vdpa/nfp/nfp_vdpa.c
index 4885fa5cbc..efd7ee7d95 100644
--- a/drivers/vdpa/nfp/nfp_vdpa.c
+++ b/drivers/vdpa/nfp/nfp_vdpa.c
@@ -36,9 +36,7 @@ struct nfp_vdpa_dev {
struct nfp_vdpa_hw hw;
int vfio_container_fd;
- int vfio_group_fd;
int vfio_dev_fd;
- int iommu_group;
rte_thread_t tid; /**< Thread for notify relay */
int epoll_fd;
@@ -152,7 +150,6 @@ static void
nfp_vdpa_vfio_teardown(struct nfp_vdpa_dev *device)
{
rte_pci_unmap_device(device->pci_dev);
- rte_vfio_container_group_unbind(device->vfio_container_fd, device->iommu_group);
rte_vfio_container_destroy(device->vfio_container_fd);
}
@@ -1018,22 +1015,6 @@ nfp_vdpa_dev_close(int vid)
return 0;
}
-static int
-nfp_vdpa_get_vfio_group_fd(int vid)
-{
- struct rte_vdpa_device *vdev;
- struct nfp_vdpa_dev_node *node;
-
- vdev = rte_vhost_get_vdpa_device(vid);
- node = nfp_vdpa_find_node_by_vdev(vdev);
- if (node == NULL) {
- DRV_VDPA_LOG(ERR, "Invalid vDPA device: %p.", vdev);
- return -ENODEV;
- }
-
- return node->device->vfio_group_fd;
-}
-
static int
nfp_vdpa_get_vfio_device_fd(int vid)
{
@@ -1185,7 +1166,6 @@ struct rte_vdpa_dev_ops nfp_vdpa_ops = {
.dev_close = nfp_vdpa_dev_close,
.set_vring_state = nfp_vdpa_set_vring_state,
.set_features = nfp_vdpa_set_features,
- .get_vfio_group_fd = nfp_vdpa_get_vfio_group_fd,
.get_vfio_device_fd = nfp_vdpa_get_vfio_device_fd,
.get_notify_area = nfp_vdpa_get_notify_area,
};
diff --git a/lib/vhost/vdpa_driver.h b/lib/vhost/vdpa_driver.h
index 42392a0d14..c7b9be09fb 100644
--- a/lib/vhost/vdpa_driver.h
+++ b/lib/vhost/vdpa_driver.h
@@ -50,9 +50,6 @@ struct rte_vdpa_dev_ops {
/** Destination operations when migration done */
int (*migration_done)(int vid);
- /** Get the vfio group fd */
- int (*get_vfio_group_fd)(int vid);
-
/** Get the vfio device fd */
int (*get_vfio_device_fd)(int vid);
--
2.47.3
^ permalink raw reply related
* [PATCH v8 09/18] vdpa/sfc: use container device assignment API
From: Anatoly Burakov @ 2026-06-11 15:09 UTC (permalink / raw)
To: dev, Vijay Kumar Srivastava
In-Reply-To: <cover.1781190151.git.anatoly.burakov@intel.com>
The SFC vDPA driver uses VFIO group bind/unbind functionality for container
device assignment purposes. Use the new container device assignment API
instead to provide clearer semantics.
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
drivers/vdpa/sfc/sfc_vdpa.c | 39 +++++++------------------------------
drivers/vdpa/sfc/sfc_vdpa.h | 2 --
2 files changed, 7 insertions(+), 34 deletions(-)
diff --git a/drivers/vdpa/sfc/sfc_vdpa.c b/drivers/vdpa/sfc/sfc_vdpa.c
index eda111954f..99b4ced3f4 100644
--- a/drivers/vdpa/sfc/sfc_vdpa.c
+++ b/drivers/vdpa/sfc/sfc_vdpa.c
@@ -80,22 +80,12 @@ sfc_vdpa_vfio_setup(struct sfc_vdpa_adapter *sva)
goto fail_container_create;
}
- rc = rte_vfio_get_group_num(rte_pci_get_sysfs_path(), dev_name,
- &sva->iommu_group_num);
- if (rc <= 0) {
- sfc_vdpa_err(sva, "failed to get IOMMU group for %s : %s",
- dev_name, rte_strerror(-rc));
- goto fail_get_group_num;
- }
-
- sva->vfio_group_fd =
- rte_vfio_container_group_bind(sva->vfio_container_fd,
- sva->iommu_group_num);
- if (sva->vfio_group_fd < 0) {
- sfc_vdpa_err(sva,
- "failed to bind IOMMU group %d to container %d",
- sva->iommu_group_num, sva->vfio_container_fd);
- goto fail_group_bind;
+ rc = rte_vfio_container_assign_device(sva->vfio_container_fd,
+ rte_pci_get_sysfs_path(), dev_name);
+ if (rc < 0) {
+ sfc_vdpa_err(sva, "failed to assign device %s to container %d",
+ dev_name, sva->vfio_container_fd);
+ goto fail_device_assign;
}
if (rte_pci_map_device(dev) != 0) {
@@ -109,15 +99,7 @@ sfc_vdpa_vfio_setup(struct sfc_vdpa_adapter *sva)
return 0;
fail_pci_map_device:
- if (rte_vfio_container_group_unbind(sva->vfio_container_fd,
- sva->iommu_group_num) != 0) {
- sfc_vdpa_err(sva,
- "failed to unbind IOMMU group %d from container %d",
- sva->iommu_group_num, sva->vfio_container_fd);
- }
-
-fail_group_bind:
-fail_get_group_num:
+fail_device_assign:
if (rte_vfio_container_destroy(sva->vfio_container_fd) != 0) {
sfc_vdpa_err(sva, "failed to destroy container %d",
sva->vfio_container_fd);
@@ -132,13 +114,6 @@ sfc_vdpa_vfio_teardown(struct sfc_vdpa_adapter *sva)
{
rte_pci_unmap_device(sva->pdev);
- if (rte_vfio_container_group_unbind(sva->vfio_container_fd,
- sva->iommu_group_num) != 0) {
- sfc_vdpa_err(sva,
- "failed to unbind IOMMU group %d from container %d",
- sva->iommu_group_num, sva->vfio_container_fd);
- }
-
if (rte_vfio_container_destroy(sva->vfio_container_fd) != 0) {
sfc_vdpa_err(sva,
"failed to destroy container %d",
diff --git a/drivers/vdpa/sfc/sfc_vdpa.h b/drivers/vdpa/sfc/sfc_vdpa.h
index 2b843e563d..99a81fd1b0 100644
--- a/drivers/vdpa/sfc/sfc_vdpa.h
+++ b/drivers/vdpa/sfc/sfc_vdpa.h
@@ -70,10 +70,8 @@ struct sfc_vdpa_adapter {
sfc_vdpa_filter_t filters;
- int vfio_group_fd;
int vfio_dev_fd;
int vfio_container_fd;
- int iommu_group_num;
struct sfc_vdpa_ops_data *ops_data;
};
--
2.47.3
^ permalink raw reply related
* [PATCH v8 08/18] vdpa/nfp: use container device assignment API
From: Anatoly Burakov @ 2026-06-11 15:09 UTC (permalink / raw)
To: dev, Chaoyong He
In-Reply-To: <cover.1781190151.git.anatoly.burakov@intel.com>
The NFP vDPA driver uses VFIO group bind/unbind functionality for container
device assignment purposes. Use the new container device assignment API
instead to provide clearer semantics.
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
drivers/vdpa/nfp/nfp_vdpa.c | 17 +++++------------
1 file changed, 5 insertions(+), 12 deletions(-)
diff --git a/drivers/vdpa/nfp/nfp_vdpa.c b/drivers/vdpa/nfp/nfp_vdpa.c
index f4fd5c92ec..4885fa5cbc 100644
--- a/drivers/vdpa/nfp/nfp_vdpa.c
+++ b/drivers/vdpa/nfp/nfp_vdpa.c
@@ -122,33 +122,26 @@ nfp_vdpa_vfio_setup(struct nfp_vdpa_dev *device)
rte_pci_unmap_device(pci_dev);
rte_pci_device_name(&pci_dev->addr, dev_name, RTE_DEV_NAME_MAX_LEN);
- ret = rte_vfio_get_group_num(rte_pci_get_sysfs_path(), dev_name,
- &device->iommu_group);
- if (ret <= 0)
- return -1;
device->vfio_container_fd = rte_vfio_container_create();
if (device->vfio_container_fd < 0)
return -1;
- device->vfio_group_fd = rte_vfio_container_group_bind(
- device->vfio_container_fd, device->iommu_group);
- if (device->vfio_group_fd < 0)
+ ret = rte_vfio_container_assign_device(device->vfio_container_fd,
+ rte_pci_get_sysfs_path(), dev_name);
+ if (ret < 0)
goto container_destroy;
- DRV_VDPA_LOG(DEBUG, "The container_fd=%d, group_fd=%d.",
- device->vfio_container_fd, device->vfio_group_fd);
+ DRV_VDPA_LOG(DEBUG, "container_fd=%d", device->vfio_container_fd);
ret = rte_pci_map_device(pci_dev);
if (ret != 0)
- goto group_unbind;
+ goto container_destroy;
device->vfio_dev_fd = rte_intr_dev_fd_get(pci_dev->intr_handle);
return 0;
-group_unbind:
- rte_vfio_container_group_unbind(device->vfio_container_fd, device->iommu_group);
container_destroy:
rte_vfio_container_destroy(device->vfio_container_fd);
--
2.47.3
^ permalink raw reply related
* [PATCH v8 07/18] vdpa/ifc: use container device assignment API
From: Anatoly Burakov @ 2026-06-11 15:08 UTC (permalink / raw)
To: dev
In-Reply-To: <cover.1781190151.git.anatoly.burakov@intel.com>
The IFC vDPA driver uses VFIO group bind/unbind functionality for container
device assignment purposes. Use the new container device assignment API
instead to provide clearer semantics.
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
drivers/vdpa/ifc/ifcvf_vdpa.c | 15 +++------------
1 file changed, 3 insertions(+), 12 deletions(-)
diff --git a/drivers/vdpa/ifc/ifcvf_vdpa.c b/drivers/vdpa/ifc/ifcvf_vdpa.c
index f319d455ba..6f1c050787 100644
--- a/drivers/vdpa/ifc/ifcvf_vdpa.c
+++ b/drivers/vdpa/ifc/ifcvf_vdpa.c
@@ -174,28 +174,19 @@ ifcvf_vfio_setup(struct ifcvf_internal *internal)
{
struct rte_pci_device *dev = internal->pdev;
char devname[RTE_DEV_NAME_MAX_LEN] = {0};
- int iommu_group_num;
- int i, ret;
+ int i;
internal->vfio_dev_fd = -1;
- internal->vfio_group_fd = -1;
internal->vfio_container_fd = -1;
rte_pci_device_name(&dev->addr, devname, RTE_DEV_NAME_MAX_LEN);
- ret = rte_vfio_get_group_num(rte_pci_get_sysfs_path(), devname,
- &iommu_group_num);
- if (ret <= 0) {
- DRV_LOG(ERR, "%s failed to get IOMMU group", devname);
- return -1;
- }
internal->vfio_container_fd = rte_vfio_container_create();
if (internal->vfio_container_fd < 0)
return -1;
- internal->vfio_group_fd = rte_vfio_container_group_bind(
- internal->vfio_container_fd, iommu_group_num);
- if (internal->vfio_group_fd < 0)
+ if (rte_vfio_container_assign_device(internal->vfio_container_fd,
+ rte_pci_get_sysfs_path(), devname) < 0)
goto err;
if (rte_pci_map_device(dev))
--
2.47.3
^ permalink raw reply related
* [PATCH v8 04/18] vfio: add container device assignment API
From: Anatoly Burakov @ 2026-06-11 15:08 UTC (permalink / raw)
To: dev, Bruce Richardson
In-Reply-To: <cover.1781190151.git.anatoly.burakov@intel.com>
Currently, VFIO has explicit group bind API's, but the way they're used is
such that no one actually cares about VFIO groups, and the real goal of
everyone using VFIO group bind API is to bind devices to particular VFIO
container, such that when `rte_vfio_setup_device` call eventually comes,
VFIO will pick up the correct container.
To remove dependency on group API's, add a new "container assign device"
API call that will do the same thing, but will not depend on managing VFIO
group fd's.
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
lib/eal/freebsd/eal.c | 10 ++++++++++
lib/eal/include/rte_vfio.h | 26 ++++++++++++++++++++++++++
lib/eal/linux/eal_vfio.c | 32 ++++++++++++++++++++++++++++++++
3 files changed, 68 insertions(+)
diff --git a/lib/eal/freebsd/eal.c b/lib/eal/freebsd/eal.c
index f8ab932962..0c64a62c5a 100644
--- a/lib/eal/freebsd/eal.c
+++ b/lib/eal/freebsd/eal.c
@@ -944,3 +944,13 @@ rte_vfio_container_dma_unmap(__rte_unused int container_fd,
rte_errno = ENOTSUP;
return -1;
}
+
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_assign_device)
+int
+rte_vfio_container_assign_device(__rte_unused int vfio_container_fd,
+ __rte_unused const char *sysfs_base,
+ __rte_unused const char *dev_addr)
+{
+ rte_errno = ENOTSUP;
+ return -1;
+}
diff --git a/lib/eal/include/rte_vfio.h b/lib/eal/include/rte_vfio.h
index fb666141f6..e7e2ee950b 100644
--- a/lib/eal/include/rte_vfio.h
+++ b/lib/eal/include/rte_vfio.h
@@ -271,6 +271,32 @@ __rte_internal
int
rte_vfio_container_destroy(int container_fd);
+/**
+ * @internal
+ *
+ * Assign a device to a VFIO container.
+ *
+ * Doing so will cause `rte_vfio_setup_device()` to set up the device with the VFIO container
+ * specified in this assign operation.
+ *
+ * This function is only relevant on Linux.
+ *
+ * @param vfio_container_fd
+ * VFIO container file descriptor.
+ * @param sysfs_base
+ * Sysfs path prefix.
+ * @param dev_addr
+ * Device identifier.
+ *
+ * @return
+ * 0 on success.
+ * <0 on failure, rte_errno is set.
+ */
+__rte_internal
+int
+rte_vfio_container_assign_device(int vfio_container_fd, const char *sysfs_base,
+ const char *dev_addr);
+
/**
* @internal
* Bind a IOMMU group to a container.
diff --git a/lib/eal/linux/eal_vfio.c b/lib/eal/linux/eal_vfio.c
index 47c973e49a..02fec64658 100644
--- a/lib/eal/linux/eal_vfio.c
+++ b/lib/eal/linux/eal_vfio.c
@@ -2102,6 +2102,38 @@ rte_vfio_container_destroy(int container_fd)
return 0;
}
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_assign_device)
+int
+rte_vfio_container_assign_device(int vfio_container_fd, const char *sysfs_base,
+ const char *dev_addr)
+{
+ int iommu_group_num;
+ int ret;
+
+ ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num);
+ if (ret < 0) {
+ EAL_LOG(ERR, "Cannot get IOMMU group number for device %s",
+ dev_addr);
+ return -1;
+ } else if (ret == 0) {
+ EAL_LOG(ERR,
+ "Device %s is not assigned to any IOMMU group",
+ dev_addr);
+ return -1;
+ }
+
+ ret = rte_vfio_container_group_bind(vfio_container_fd,
+ iommu_group_num);
+ if (ret < 0) {
+ EAL_LOG(ERR,
+ "Cannot bind IOMMU group %d for device %s",
+ iommu_group_num, dev_addr);
+ return -1;
+ }
+
+ return 0;
+}
+
RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_group_bind)
int
rte_vfio_container_group_bind(int container_fd, int iommu_group_num)
--
2.47.3
^ permalink raw reply related
* [PATCH v8 06/18] net/ntnic: use container device assignment API
From: Anatoly Burakov @ 2026-06-11 15:08 UTC (permalink / raw)
To: dev, Christian Koue Muf, Serhii Iliushyk
In-Reply-To: <cover.1781190151.git.anatoly.burakov@intel.com>
The NTNIC driver uses VFIO group bind/unbind functionality for container
device assignment purposes. Use the new container device assignment API
instead to provide clearer semantics.
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
drivers/net/ntnic/ntnic_vfio.c | 30 +++++++++---------------------
1 file changed, 9 insertions(+), 21 deletions(-)
diff --git a/drivers/net/ntnic/ntnic_vfio.c b/drivers/net/ntnic/ntnic_vfio.c
index 439468b3a2..c746b300b2 100644
--- a/drivers/net/ntnic/ntnic_vfio.c
+++ b/drivers/net/ntnic/ntnic_vfio.c
@@ -28,7 +28,6 @@ nt_vfio_vf_num(const struct rte_pci_device *pdev)
/* Internal API */
struct vfio_dev {
int container_fd;
- int group_fd;
int dev_fd;
uint64_t iova_addr;
};
@@ -50,7 +49,6 @@ nthw_vfio_setup(struct rte_pci_device *dev)
{
int ret;
char devname[RTE_DEV_NAME_MAX_LEN] = { 0 };
- int iommu_group_num;
int vf_num;
struct vfio_dev *vfio;
@@ -66,14 +64,9 @@ nthw_vfio_setup(struct rte_pci_device *dev)
}
vfio->dev_fd = -1;
- vfio->group_fd = -1;
vfio->iova_addr = START_VF_IOVA;
rte_pci_device_name(&dev->addr, devname, RTE_DEV_NAME_MAX_LEN);
- ret = rte_vfio_get_group_num(rte_pci_get_sysfs_path(), devname, &iommu_group_num);
- if (ret <= 0)
- return -1;
-
if (vf_num == 0) {
/* use default container for pf0 */
vfio->container_fd = RTE_VFIO_DEFAULT_CONTAINER_FD;
@@ -86,17 +79,14 @@ nthw_vfio_setup(struct rte_pci_device *dev)
"VFIO device setup failed. VFIO container creation failed.");
return -1;
}
- }
+ ret = rte_vfio_container_assign_device(vfio->container_fd,
+ rte_pci_get_sysfs_path(), devname);
+ if (ret < 0) {
+ NT_LOG(ERR, NTNIC,
+ "VFIO device setup failed. Assign device to container failed.");
+ goto err;
+ }
- vfio->group_fd = rte_vfio_container_group_bind(vfio->container_fd, iommu_group_num);
-
- if (vfio->group_fd < 0) {
- NT_LOG(ERR, NTNIC,
- "VFIO device setup failed. VFIO container group bind failed.");
- goto err;
- }
-
- if (vf_num > 0) {
if (rte_pci_map_device(dev)) {
NT_LOG(ERR, NTNIC,
"Map VFIO device failed. is the vfio-pci driver loaded?");
@@ -106,10 +96,8 @@ nthw_vfio_setup(struct rte_pci_device *dev)
vfio->dev_fd = rte_intr_dev_fd_get(dev->intr_handle);
- NT_LOG(DBG, NTNIC,
- "%s: VFIO id=%d, dev_fd=%d, container_fd=%d, group_fd=%d, iommu_group_num=%d",
- dev->name, vf_num, vfio->dev_fd, vfio->container_fd, vfio->group_fd,
- iommu_group_num);
+ NT_LOG(DBG, NTNIC, "%s: VFIO id=%d, dev_fd=%d, container_fd=%d",
+ dev->name, vf_num, vfio->dev_fd, vfio->container_fd);
return vf_num;
--
2.47.3
^ permalink raw reply related
* [PATCH v8 05/18] net/nbl: do not use VFIO group bind API
From: Anatoly Burakov @ 2026-06-11 15:08 UTC (permalink / raw)
To: dev, Dimon Zhao, Leon Yu, Sam Chen
In-Reply-To: <cover.1781190151.git.anatoly.burakov@intel.com>
The NBL driver currently uses group bind API, but it is using it only to
get group fd and nothing else. In context of NBL driver, this is the only
usage of VFIO API's in the driver, and it is not necessary to use it for
what NBL driver is trying to accomplish.
Use a direct `open()` call instead, and store the group fd in common
structure.
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
drivers/net/nbl/nbl_common/nbl_userdev.c | 18 +++++++++++-------
drivers/net/nbl/nbl_include/nbl_include.h | 1 +
2 files changed, 12 insertions(+), 7 deletions(-)
diff --git a/drivers/net/nbl/nbl_common/nbl_userdev.c b/drivers/net/nbl/nbl_common/nbl_userdev.c
index 96f0d2e264..fb256e543f 100644
--- a/drivers/net/nbl/nbl_common/nbl_userdev.c
+++ b/drivers/net/nbl/nbl_common/nbl_userdev.c
@@ -387,6 +387,13 @@ nbl_userdev_mem_event_callback(enum rte_mem_event type, const void *addr, size_t
}
}
+static int nbl_open_group_fd(int iommu_group_num)
+{
+ char path[PATH_MAX];
+ snprintf(path, sizeof(path), RTE_VFIO_GROUP_FMT, iommu_group_num);
+ return open(path, O_RDWR);
+}
+
static int nbl_mdev_map_device(struct nbl_adapter *adapter)
{
const struct rte_pci_device *pci_dev = adapter->pci_dev;
@@ -424,11 +431,12 @@ static int nbl_mdev_map_device(struct nbl_adapter *adapter)
}
NBL_LOG(DEBUG, "nbl vfio container %d", container);
- vfio_group_fd = rte_vfio_container_group_bind(container, common->iommu_group_num);
+ vfio_group_fd = nbl_open_group_fd(common->iommu_group_num);
if (vfio_group_fd < 0) {
NBL_LOG(ERR, "nbl vfio group bind failed, %d", vfio_group_fd);
goto free_container;
}
+ common->groupfd = vfio_group_fd;
/* check if the group is viable */
ret = ioctl(vfio_group_fd, VFIO_GROUP_GET_STATUS, &group_status);
@@ -535,7 +543,6 @@ static int nbl_mdev_map_device(struct nbl_adapter *adapter)
}
free_group:
close(vfio_group_fd);
- rte_vfio_clear_group(vfio_group_fd);
free_container:
if (container_create)
rte_vfio_container_destroy(container);
@@ -549,17 +556,14 @@ static int nbl_mdev_unmap_device(struct nbl_adapter *adapter)
close(common->devfd);
rte_mcfg_mem_read_lock();
- vfio_group_fd = rte_vfio_container_group_bind(nbl_default_container,
- common->iommu_group_num);
+ vfio_group_fd = common->groupfd;
NBL_LOG(DEBUG, "close vfio_group_fd %d", vfio_group_fd);
ret = ioctl(vfio_group_fd, VFIO_GROUP_UNSET_CONTAINER, &nbl_default_container);
if (ret)
NBL_LOG(ERR, "unset container, error %i (%s) %d",
errno, strerror(errno), ret);
nbl_group_count--;
- ret = rte_vfio_container_group_unbind(nbl_default_container, common->iommu_group_num);
- if (ret)
- NBL_LOG(ERR, "vfio container group unbind failed %d", ret);
+ close(vfio_group_fd);
if (!nbl_group_count) {
rte_mem_event_callback_unregister(NBL_USERDEV_EVENT_CLB_NAME, NULL);
nbl_userdev_dma_free();
diff --git a/drivers/net/nbl/nbl_include/nbl_include.h b/drivers/net/nbl/nbl_include/nbl_include.h
index eeae6a3301..ba99b9f8e7 100644
--- a/drivers/net/nbl/nbl_include/nbl_include.h
+++ b/drivers/net/nbl/nbl_include/nbl_include.h
@@ -132,6 +132,7 @@ struct nbl_common_info {
u16 vsi_id;
u16 instance_id;
int devfd;
+ int groupfd;
int eventfd;
int ifindex;
int iommu_group_num;
--
2.47.3
^ permalink raw reply related
* [PATCH v8 03/18] vfio: split get device info from setup
From: Anatoly Burakov @ 2026-06-11 15:08 UTC (permalink / raw)
To: dev, Nipun Gupta, Nikhil Agarwal, Chenbo Xia, Tomasz Duszynski,
Ajit Khaparde, Vikas Gupta, Bruce Richardson
In-Reply-To: <cover.1781190151.git.anatoly.burakov@intel.com>
Currently, setup gets device info as part of setup, while the separate get
device info API also calls setup if the fd is zero. Untangle these two APIs
and make each do one thing, and adjust all existing callers.
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
drivers/bus/cdx/cdx_vfio.c | 12 ++++++++--
drivers/bus/pci/linux/pci_vfio.c | 18 ++++++++++----
drivers/bus/platform/platform.c | 9 ++++++-
drivers/crypto/bcmfs/bcmfs_vfio.c | 8 ++++++-
lib/eal/freebsd/eal.c | 3 +--
lib/eal/include/rte_vfio.h | 23 +++++++-----------
lib/eal/linux/eal_vfio.c | 40 +++++++++----------------------
7 files changed, 59 insertions(+), 54 deletions(-)
diff --git a/drivers/bus/cdx/cdx_vfio.c b/drivers/bus/cdx/cdx_vfio.c
index 11fe3265d2..9bae264409 100644
--- a/drivers/bus/cdx/cdx_vfio.c
+++ b/drivers/bus/cdx/cdx_vfio.c
@@ -401,10 +401,14 @@ cdx_vfio_map_resource_primary(struct rte_cdx_device *dev)
return -1;
ret = rte_vfio_setup_device(RTE_CDX_BUS_DEVICES_PATH, dev_name,
- &vfio_dev_fd, &device_info);
+ &vfio_dev_fd);
if (ret)
return ret;
+ ret = rte_vfio_get_device_info(vfio_dev_fd, &device_info);
+ if (ret)
+ goto err_vfio_dev_fd;
+
/* allocate vfio_res and get region info */
vfio_res = rte_zmalloc("VFIO_RES", sizeof(*vfio_res), 0);
if (vfio_res == NULL) {
@@ -510,10 +514,14 @@ cdx_vfio_map_resource_secondary(struct rte_cdx_device *dev)
}
ret = rte_vfio_setup_device(RTE_CDX_BUS_DEVICES_PATH, dev_name,
- &vfio_dev_fd, &device_info);
+ &vfio_dev_fd);
if (ret)
return ret;
+ ret = rte_vfio_get_device_info(vfio_dev_fd, &device_info);
+ if (ret)
+ goto err_vfio_dev_fd;
+
/* map MMIO regions */
maps = vfio_res->maps;
diff --git a/drivers/bus/pci/linux/pci_vfio.c b/drivers/bus/pci/linux/pci_vfio.c
index bc5c5c2499..54e9506058 100644
--- a/drivers/bus/pci/linux/pci_vfio.c
+++ b/drivers/bus/pci/linux/pci_vfio.c
@@ -753,10 +753,14 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev)
loc->domain, loc->bus, loc->devid, loc->function);
ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr,
- &vfio_dev_fd, &device_info);
+ &vfio_dev_fd);
if (ret)
return ret;
+ ret = rte_vfio_get_device_info(vfio_dev_fd, &device_info);
+ if (ret)
+ goto err_vfio_dev_fd;
+
if (rte_intr_dev_fd_set(dev->intr_handle, vfio_dev_fd))
goto err_vfio_dev_fd;
@@ -962,10 +966,14 @@ pci_vfio_map_resource_secondary(struct rte_pci_device *dev)
}
ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr,
- &vfio_dev_fd, &device_info);
+ &vfio_dev_fd);
if (ret)
return ret;
+ ret = rte_vfio_get_device_info(vfio_dev_fd, &device_info);
+ if (ret)
+ goto err_vfio_dev_fd;
+
ret = pci_vfio_fill_regions(dev, vfio_dev_fd, &device_info);
if (ret)
goto err_vfio_dev;
@@ -1195,12 +1203,14 @@ pci_vfio_ioport_map(struct rte_pci_device *dev, int bar,
if (vfio_dev_fd < 0) {
return -1;
} else if (vfio_dev_fd == 0) {
- if (rte_vfio_get_device_info(rte_pci_get_sysfs_path(), pci_addr,
- &vfio_dev_fd, &device_info) != 0)
+ if (rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr,
+ &vfio_dev_fd) != 0)
return -1;
/* save vfio_dev_fd so it can be used during release */
if (rte_intr_dev_fd_set(dev->intr_handle, vfio_dev_fd) != 0)
return -1;
+ if (rte_vfio_get_device_info(vfio_dev_fd, &device_info) != 0)
+ return -1;
if (pci_vfio_fill_regions(dev, vfio_dev_fd, &device_info) != 0)
return -1;
diff --git a/drivers/bus/platform/platform.c b/drivers/bus/platform/platform.c
index 170a2e03d0..3ee4b76781 100644
--- a/drivers/bus/platform/platform.c
+++ b/drivers/bus/platform/platform.c
@@ -292,12 +292,19 @@ device_setup(struct rte_platform_device *pdev)
const char *name = pdev->name;
int ret;
- ret = rte_vfio_setup_device(PLATFORM_BUS_DEVICES_PATH, name, &pdev->dev_fd, &dev_info);
+ ret = rte_vfio_setup_device(PLATFORM_BUS_DEVICES_PATH, name, &pdev->dev_fd);
if (ret) {
PLATFORM_LOG_LINE(ERR, "failed to setup %s", name);
return -ENODEV;
}
+ ret = rte_vfio_get_device_info(pdev->dev_fd, &dev_info);
+ if (ret) {
+ PLATFORM_LOG_LINE(ERR, "failed to get device info for %s", name);
+ ret = -ENODEV;
+ goto out;
+ }
+
/* This is an extra check to confirm that platform device was initialized
* by a kernel vfio-platform driver. On kernels that predate vfio-platform
* driver this flag obviously does not exist. In such scenarios this
diff --git a/drivers/crypto/bcmfs/bcmfs_vfio.c b/drivers/crypto/bcmfs/bcmfs_vfio.c
index e7f7ed994c..d00aaf1bb7 100644
--- a/drivers/crypto/bcmfs/bcmfs_vfio.c
+++ b/drivers/crypto/bcmfs/bcmfs_vfio.c
@@ -25,12 +25,18 @@ vfio_map_dev_obj(const char *path, const char *dev_obj,
struct vfio_device_info d_info = { .argsz = sizeof(d_info) };
struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
- ret = rte_vfio_setup_device(path, dev_obj, dev_fd, &d_info);
+ ret = rte_vfio_setup_device(path, dev_obj, dev_fd);
if (ret) {
BCMFS_LOG(ERR, "VFIO Setting for device failed");
return ret;
}
+ ret = rte_vfio_get_device_info(*dev_fd, &d_info);
+ if (ret) {
+ BCMFS_LOG(ERR, "VFIO Getting device info failed");
+ goto map_failed;
+ }
+
/* getting device region info*/
ret = ioctl(*dev_fd, VFIO_DEVICE_GET_REGION_INFO, ®_info);
if (ret < 0) {
diff --git a/lib/eal/freebsd/eal.c b/lib/eal/freebsd/eal.c
index 0fe54a9dd7..f8ab932962 100644
--- a/lib/eal/freebsd/eal.c
+++ b/lib/eal/freebsd/eal.c
@@ -822,8 +822,7 @@ rte_eal_vfio_get_vf_token(__rte_unused rte_uuid_t vf_token)
RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_setup_device)
int rte_vfio_setup_device(__rte_unused const char *sysfs_base,
__rte_unused const char *dev_addr,
- __rte_unused int *vfio_dev_fd,
- __rte_unused struct vfio_device_info *device_info)
+ __rte_unused int *vfio_dev_fd)
{
rte_errno = ENOTSUP;
return -1;
diff --git a/lib/eal/include/rte_vfio.h b/lib/eal/include/rte_vfio.h
index 0ddeb08f94..fb666141f6 100644
--- a/lib/eal/include/rte_vfio.h
+++ b/lib/eal/include/rte_vfio.h
@@ -55,10 +55,7 @@ struct vfio_device_info;
* device location.
*
* @param vfio_dev_fd
- * VFIO fd.
- *
- * @param device_info
- * Device information.
+ * Pointer to VFIO fd, will be set to the opened device fd on success.
*
* @return
* 0 on success.
@@ -67,7 +64,7 @@ struct vfio_device_info;
*/
__rte_internal
int rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
- int *vfio_dev_fd, struct vfio_device_info *device_info);
+ int *vfio_dev_fd);
/**
* @internal
@@ -187,19 +184,16 @@ rte_vfio_get_group_num(const char *sysfs_base,
* @internal
* Get device information.
*
+ * This function retrieves VFIO device information from an already opened
+ * device. The device must be opened with rte_vfio_setup_device() first.
+ *
* This function is only relevant to Linux and will return an error on BSD.
*
- * @param sysfs_base
- * sysfs path prefix.
- *
- * @param dev_addr
- * device location.
- *
* @param vfio_dev_fd
- * VFIO fd.
+ * VFIO device fd (must be a valid, already opened fd).
*
* @param device_info
- * Device information.
+ * Pointer to device information structure to be filled.
*
* @return
* 0 on success.
@@ -207,8 +201,7 @@ rte_vfio_get_group_num(const char *sysfs_base,
*/
__rte_internal
int
-rte_vfio_get_device_info(const char *sysfs_base, const char *dev_addr,
- int *vfio_dev_fd, struct vfio_device_info *device_info);
+rte_vfio_get_device_info(int vfio_dev_fd, struct vfio_device_info *device_info);
/**
* @internal
diff --git a/lib/eal/linux/eal_vfio.c b/lib/eal/linux/eal_vfio.c
index 33fa04feaf..47c973e49a 100644
--- a/lib/eal/linux/eal_vfio.c
+++ b/lib/eal/linux/eal_vfio.c
@@ -758,7 +758,7 @@ rte_vfio_clear_group(int vfio_group_fd)
RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_setup_device)
int
rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
- int *vfio_dev_fd, struct vfio_device_info *device_info)
+ int *vfio_dev_fd)
{
struct vfio_group_status group_status = {
.argsz = sizeof(group_status)
@@ -975,7 +975,7 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
*vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD,
dev);
if (*vfio_dev_fd >= 0)
- goto dev_get_info;
+ goto out;
}
/* get a file descriptor for the device */
@@ -992,18 +992,8 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
return -1;
}
- /* test and setup the device */
-dev_get_info:
- ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info);
- if (ret) {
- EAL_LOG(ERR, "%s cannot get device info, "
- "error %i (%s)", dev_addr, errno,
- strerror(errno));
- close(*vfio_dev_fd);
- close(vfio_group_fd);
- rte_vfio_clear_group(vfio_group_fd);
- return -1;
- }
+ /* device is now set up */
+out:
vfio_group_device_get(vfio_group_fd);
return 0;
@@ -1217,26 +1207,18 @@ vfio_set_iommu_type(int vfio_container_fd)
RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_get_device_info)
int
-rte_vfio_get_device_info(const char *sysfs_base, const char *dev_addr,
- int *vfio_dev_fd, struct vfio_device_info *device_info)
+rte_vfio_get_device_info(int vfio_dev_fd, struct vfio_device_info *device_info)
{
int ret;
- if (device_info == NULL || *vfio_dev_fd < 0)
+ if (device_info == NULL || vfio_dev_fd < 0)
return -1;
- if (*vfio_dev_fd == 0) {
- ret = rte_vfio_setup_device(sysfs_base, dev_addr,
- vfio_dev_fd, device_info);
- if (ret)
- return -1;
- } else {
- ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info);
- if (ret) {
- EAL_LOG(ERR, "%s cannot get device info, error %i (%s)",
- dev_addr, errno, strerror(errno));
- return -1;
- }
+ ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info);
+ if (ret) {
+ EAL_LOG(ERR, "Cannot get device info, error %i (%s)",
+ errno, strerror(errno));
+ return -1;
}
return 0;
--
2.47.3
^ permalink raw reply related
* [PATCH v8 02/18] vfio: make all functions internal
From: Anatoly Burakov @ 2026-06-11 15:08 UTC (permalink / raw)
To: dev, Bruce Richardson, Dmitry Kozlyuk
In-Reply-To: <cover.1781190151.git.anatoly.burakov@intel.com>
The VFIO API is an externally exported API because the original intent was
to offer DMA mapping facilities to applications. However, practical usage
of this API seems to be centered around drivers, so keeping this API
exported to applications only creates needless API/ABI stability surface
that has no added value.
Make the entire VFIO API internal-only and visible only to drivers.
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
doc/guides/rel_notes/deprecation.rst | 6 ----
lib/eal/freebsd/eal.c | 30 +++++++++---------
lib/eal/include/rte_vfio.h | 47 ++++++++++++++++++++++++----
lib/eal/linux/eal_vfio.c | 32 +++++++++----------
lib/eal/windows/eal.c | 4 +--
5 files changed, 74 insertions(+), 45 deletions(-)
diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index c7f8230278..f2901064f5 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -30,12 +30,6 @@ Deprecation Notices
Use the ``-S <service-corelist>`` parameter instead
to specify the cores to be used for background services in DPDK.
-* eal: The entire VFIO API (``rte_vfio_*``) will be made internal only,
- and will only be available to EAL and drivers.
- Group-based API (``rte_vfio_*_group_*``) will be removed
- and replaced with unified container device assignment API.
- This change will be made in 26.11 release.
-
* vdpa: The vDPA driver API will no longer offer ``get_vfio_group_fd``
as part of its internal API. All drivers will be adjusted
to use the new unified VFIO container device assignment API.
diff --git a/lib/eal/freebsd/eal.c b/lib/eal/freebsd/eal.c
index 8b1ba5b99b..0fe54a9dd7 100644
--- a/lib/eal/freebsd/eal.c
+++ b/lib/eal/freebsd/eal.c
@@ -819,7 +819,7 @@ rte_eal_vfio_get_vf_token(__rte_unused rte_uuid_t vf_token)
{
}
-RTE_EXPORT_SYMBOL(rte_vfio_setup_device)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_setup_device)
int rte_vfio_setup_device(__rte_unused const char *sysfs_base,
__rte_unused const char *dev_addr,
__rte_unused int *vfio_dev_fd,
@@ -829,7 +829,7 @@ int rte_vfio_setup_device(__rte_unused const char *sysfs_base,
return -1;
}
-RTE_EXPORT_SYMBOL(rte_vfio_release_device)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_release_device)
int rte_vfio_release_device(__rte_unused const char *sysfs_base,
__rte_unused const char *dev_addr,
__rte_unused int fd)
@@ -838,33 +838,33 @@ int rte_vfio_release_device(__rte_unused const char *sysfs_base,
return -1;
}
-RTE_EXPORT_SYMBOL(rte_vfio_enable)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_enable)
int rte_vfio_enable(__rte_unused const char *modname)
{
rte_errno = ENOTSUP;
return -1;
}
-RTE_EXPORT_SYMBOL(rte_vfio_is_enabled)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_is_enabled)
int rte_vfio_is_enabled(__rte_unused const char *modname)
{
return 0;
}
-RTE_EXPORT_SYMBOL(rte_vfio_noiommu_is_enabled)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_noiommu_is_enabled)
int rte_vfio_noiommu_is_enabled(void)
{
return 0;
}
-RTE_EXPORT_SYMBOL(rte_vfio_clear_group)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_clear_group)
int rte_vfio_clear_group(__rte_unused int vfio_group_fd)
{
rte_errno = ENOTSUP;
return -1;
}
-RTE_EXPORT_SYMBOL(rte_vfio_get_group_num)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_get_group_num)
int
rte_vfio_get_group_num(__rte_unused const char *sysfs_base,
__rte_unused const char *dev_addr,
@@ -874,7 +874,7 @@ rte_vfio_get_group_num(__rte_unused const char *sysfs_base,
return -1;
}
-RTE_EXPORT_SYMBOL(rte_vfio_get_container_fd)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_get_container_fd)
int
rte_vfio_get_container_fd(void)
{
@@ -882,7 +882,7 @@ rte_vfio_get_container_fd(void)
return -1;
}
-RTE_EXPORT_SYMBOL(rte_vfio_get_group_fd)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_get_group_fd)
int
rte_vfio_get_group_fd(__rte_unused int iommu_group_num)
{
@@ -890,7 +890,7 @@ rte_vfio_get_group_fd(__rte_unused int iommu_group_num)
return -1;
}
-RTE_EXPORT_SYMBOL(rte_vfio_container_create)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_create)
int
rte_vfio_container_create(void)
{
@@ -898,7 +898,7 @@ rte_vfio_container_create(void)
return -1;
}
-RTE_EXPORT_SYMBOL(rte_vfio_container_destroy)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_destroy)
int
rte_vfio_container_destroy(__rte_unused int container_fd)
{
@@ -906,7 +906,7 @@ rte_vfio_container_destroy(__rte_unused int container_fd)
return -1;
}
-RTE_EXPORT_SYMBOL(rte_vfio_container_group_bind)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_group_bind)
int
rte_vfio_container_group_bind(__rte_unused int container_fd,
__rte_unused int iommu_group_num)
@@ -915,7 +915,7 @@ rte_vfio_container_group_bind(__rte_unused int container_fd,
return -1;
}
-RTE_EXPORT_SYMBOL(rte_vfio_container_group_unbind)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_group_unbind)
int
rte_vfio_container_group_unbind(__rte_unused int container_fd,
__rte_unused int iommu_group_num)
@@ -924,7 +924,7 @@ rte_vfio_container_group_unbind(__rte_unused int container_fd,
return -1;
}
-RTE_EXPORT_SYMBOL(rte_vfio_container_dma_map)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_dma_map)
int
rte_vfio_container_dma_map(__rte_unused int container_fd,
__rte_unused uint64_t vaddr,
@@ -935,7 +935,7 @@ rte_vfio_container_dma_map(__rte_unused int container_fd,
return -1;
}
-RTE_EXPORT_SYMBOL(rte_vfio_container_dma_unmap)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_dma_unmap)
int
rte_vfio_container_dma_unmap(__rte_unused int container_fd,
__rte_unused uint64_t vaddr,
diff --git a/lib/eal/include/rte_vfio.h b/lib/eal/include/rte_vfio.h
index d1e8bce56b..0ddeb08f94 100644
--- a/lib/eal/include/rte_vfio.h
+++ b/lib/eal/include/rte_vfio.h
@@ -7,7 +7,11 @@
/**
* @file
- * RTE VFIO. This library provides various VFIO related utility functions.
+ * @internal
+ *
+ * RTE VFIO internal API.
+ *
+ * This library provides VFIO related utility functions for use by drivers.
*/
#include <stdbool.h>
@@ -36,6 +40,7 @@ struct vfio_device_info;
#define RTE_VFIO_DEFAULT_CONTAINER_FD (-1)
/**
+ * @internal
* Setup vfio_cfg for the device identified by its address.
* It discovers the configured I/O MMU groups or sets a new one for the device.
* If a new groups is assigned, the DMA mapping is performed.
@@ -60,10 +65,12 @@ struct vfio_device_info;
* <0 on failure.
* >1 if the device cannot be managed this way.
*/
+__rte_internal
int rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
int *vfio_dev_fd, struct vfio_device_info *device_info);
/**
+ * @internal
* Release a device mapped to a VFIO-managed I/O MMU group.
*
* This function is only relevant to linux and will return
@@ -82,9 +89,11 @@ int rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
* 0 on success.
* <0 on failure.
*/
+__rte_internal
int rte_vfio_release_device(const char *sysfs_base, const char *dev_addr, int fd);
/**
+ * @internal
* Enable a VFIO-related kmod.
*
* This function is only relevant to linux and will return
@@ -97,9 +106,11 @@ int rte_vfio_release_device(const char *sysfs_base, const char *dev_addr, int fd
* 0 on success.
* <0 on failure.
*/
+__rte_internal
int rte_vfio_enable(const char *modname);
/**
+ * @internal
* Check whether a VFIO-related kmod is enabled.
*
* This function is only relevant to Linux.
@@ -111,9 +122,11 @@ int rte_vfio_enable(const char *modname);
* 1 if true.
* 0 otherwise.
*/
+__rte_internal
int rte_vfio_is_enabled(const char *modname);
/**
+ * @internal
* Whether VFIO NOIOMMU mode is enabled.
*
* This function is only relevant to Linux.
@@ -123,10 +136,12 @@ int rte_vfio_is_enabled(const char *modname);
* 0 if false.
* <0 for errors.
*/
+__rte_internal
int rte_vfio_noiommu_is_enabled(void);
/**
- * Remove group fd from internal VFIO group fd array/
+ * @internal
+ * Remove group fd from internal VFIO group fd array.
*
* This function is only relevant to linux and will return
* an error on BSD.
@@ -138,11 +153,13 @@ int rte_vfio_noiommu_is_enabled(void);
* 0 on success.
* <0 on failure.
*/
+__rte_internal
int
rte_vfio_clear_group(int vfio_group_fd);
/**
- * Parse IOMMU group number for a device
+ * @internal
+ * Parse IOMMU group number for a device.
*
* This function is only relevant to linux and will return
* an error on BSD.
@@ -161,12 +178,14 @@ rte_vfio_clear_group(int vfio_group_fd);
* 0 for non-existent group or VFIO
* <0 for errors
*/
+__rte_internal
int
rte_vfio_get_group_num(const char *sysfs_base,
const char *dev_addr, int *iommu_group_num);
/**
- * Get device information
+ * @internal
+ * Get device information.
*
* This function is only relevant to Linux and will return an error on BSD.
*
@@ -186,12 +205,13 @@ rte_vfio_get_group_num(const char *sysfs_base,
* 0 on success.
* <0 on failure.
*/
-__rte_experimental
+__rte_internal
int
rte_vfio_get_device_info(const char *sysfs_base, const char *dev_addr,
int *vfio_dev_fd, struct vfio_device_info *device_info);
/**
+ * @internal
* Get the default VFIO container fd
*
* This function is only relevant to linux and will return
@@ -201,11 +221,13 @@ rte_vfio_get_device_info(const char *sysfs_base, const char *dev_addr,
* > 0 default container fd
* < 0 if VFIO is not enabled or not supported
*/
+__rte_internal
int
rte_vfio_get_container_fd(void);
/**
- * Open VFIO group fd or get an existing one
+ * @internal
+ * Open VFIO group fd or get an existing one.
*
* This function is only relevant to linux and will return
* an error on BSD.
@@ -217,10 +239,12 @@ rte_vfio_get_container_fd(void);
* > 0 group fd
* < 0 for errors
*/
+__rte_internal
int
rte_vfio_get_group_fd(int iommu_group_num);
/**
+ * @internal
* Create a new container for device binding.
*
* @note Any newly allocated DPDK memory will not be mapped into these
@@ -235,10 +259,12 @@ rte_vfio_get_group_fd(int iommu_group_num);
* the container fd if successful
* <0 if failed
*/
+__rte_internal
int
rte_vfio_container_create(void);
/**
+ * @internal
* Destroy the container, unbind all vfio groups within it.
*
* @param container_fd
@@ -248,10 +274,12 @@ rte_vfio_container_create(void);
* 0 if successful
* <0 if failed
*/
+__rte_internal
int
rte_vfio_container_destroy(int container_fd);
/**
+ * @internal
* Bind a IOMMU group to a container.
*
* @param container_fd
@@ -264,10 +292,12 @@ rte_vfio_container_destroy(int container_fd);
* group fd if successful
* <0 if failed
*/
+__rte_internal
int
rte_vfio_container_group_bind(int container_fd, int iommu_group_num);
/**
+ * @internal
* Unbind a IOMMU group from a container.
*
* @param container_fd
@@ -280,10 +310,12 @@ rte_vfio_container_group_bind(int container_fd, int iommu_group_num);
* 0 if successful
* <0 if failed
*/
+__rte_internal
int
rte_vfio_container_group_unbind(int container_fd, int iommu_group_num);
/**
+ * @internal
* Perform DMA mapping for devices in a container.
*
* @param container_fd
@@ -303,11 +335,13 @@ rte_vfio_container_group_unbind(int container_fd, int iommu_group_num);
* 0 if successful
* <0 if failed
*/
+__rte_internal
int
rte_vfio_container_dma_map(int container_fd, uint64_t vaddr,
uint64_t iova, uint64_t len);
/**
+ * @internal
* Perform DMA unmapping for devices in a container.
*
* @param container_fd
@@ -327,6 +361,7 @@ rte_vfio_container_dma_map(int container_fd, uint64_t vaddr,
* 0 if successful
* <0 if failed
*/
+__rte_internal
int
rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr,
uint64_t iova, uint64_t len);
diff --git a/lib/eal/linux/eal_vfio.c b/lib/eal/linux/eal_vfio.c
index f1050ffa60..33fa04feaf 100644
--- a/lib/eal/linux/eal_vfio.c
+++ b/lib/eal/linux/eal_vfio.c
@@ -532,7 +532,7 @@ get_vfio_cfg_by_container_fd(int container_fd)
return NULL;
}
-RTE_EXPORT_SYMBOL(rte_vfio_get_group_fd)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_get_group_fd)
int
rte_vfio_get_group_fd(int iommu_group_num)
{
@@ -731,7 +731,7 @@ vfio_sync_default_container(void)
return -1;
}
-RTE_EXPORT_SYMBOL(rte_vfio_clear_group)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_clear_group)
int
rte_vfio_clear_group(int vfio_group_fd)
{
@@ -755,7 +755,7 @@ rte_vfio_clear_group(int vfio_group_fd)
return 0;
}
-RTE_EXPORT_SYMBOL(rte_vfio_setup_device)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_setup_device)
int
rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
int *vfio_dev_fd, struct vfio_device_info *device_info)
@@ -1009,7 +1009,7 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
return 0;
}
-RTE_EXPORT_SYMBOL(rte_vfio_release_device)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_release_device)
int
rte_vfio_release_device(const char *sysfs_base, const char *dev_addr,
int vfio_dev_fd)
@@ -1098,7 +1098,7 @@ rte_vfio_release_device(const char *sysfs_base, const char *dev_addr,
return ret;
}
-RTE_EXPORT_SYMBOL(rte_vfio_enable)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_enable)
int
rte_vfio_enable(const char *modname)
{
@@ -1175,7 +1175,7 @@ rte_vfio_enable(const char *modname)
return 0;
}
-RTE_EXPORT_SYMBOL(rte_vfio_is_enabled)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_is_enabled)
int
rte_vfio_is_enabled(const char *modname)
{
@@ -1215,7 +1215,7 @@ vfio_set_iommu_type(int vfio_container_fd)
return NULL;
}
-RTE_EXPORT_EXPERIMENTAL_SYMBOL(rte_vfio_get_device_info, 24.03)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_get_device_info)
int
rte_vfio_get_device_info(const char *sysfs_base, const char *dev_addr,
int *vfio_dev_fd, struct vfio_device_info *device_info)
@@ -1349,7 +1349,7 @@ vfio_open_container_fd(bool mp_request)
return -1;
}
-RTE_EXPORT_SYMBOL(rte_vfio_get_container_fd)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_get_container_fd)
int
rte_vfio_get_container_fd(void)
{
@@ -1363,7 +1363,7 @@ rte_vfio_get_container_fd(void)
return default_vfio_cfg->vfio_container_fd;
}
-RTE_EXPORT_SYMBOL(rte_vfio_get_group_num)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_get_group_num)
int
rte_vfio_get_group_num(const char *sysfs_base,
const char *dev_addr, int *iommu_group_num)
@@ -2034,7 +2034,7 @@ container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
return ret;
}
-RTE_EXPORT_SYMBOL(rte_vfio_noiommu_is_enabled)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_noiommu_is_enabled)
int
rte_vfio_noiommu_is_enabled(void)
{
@@ -2067,7 +2067,7 @@ rte_vfio_noiommu_is_enabled(void)
return c == 'Y';
}
-RTE_EXPORT_SYMBOL(rte_vfio_container_create)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_create)
int
rte_vfio_container_create(void)
{
@@ -2094,7 +2094,7 @@ rte_vfio_container_create(void)
return vfio_cfgs[i].vfio_container_fd;
}
-RTE_EXPORT_SYMBOL(rte_vfio_container_destroy)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_destroy)
int
rte_vfio_container_destroy(int container_fd)
{
@@ -2120,7 +2120,7 @@ rte_vfio_container_destroy(int container_fd)
return 0;
}
-RTE_EXPORT_SYMBOL(rte_vfio_container_group_bind)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_group_bind)
int
rte_vfio_container_group_bind(int container_fd, int iommu_group_num)
{
@@ -2135,7 +2135,7 @@ rte_vfio_container_group_bind(int container_fd, int iommu_group_num)
return vfio_get_group_fd(vfio_cfg, iommu_group_num);
}
-RTE_EXPORT_SYMBOL(rte_vfio_container_group_unbind)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_group_unbind)
int
rte_vfio_container_group_unbind(int container_fd, int iommu_group_num)
{
@@ -2176,7 +2176,7 @@ rte_vfio_container_group_unbind(int container_fd, int iommu_group_num)
return 0;
}
-RTE_EXPORT_SYMBOL(rte_vfio_container_dma_map)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_dma_map)
int
rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, uint64_t iova,
uint64_t len)
@@ -2197,7 +2197,7 @@ rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, uint64_t iova,
return container_dma_map(vfio_cfg, vaddr, iova, len);
}
-RTE_EXPORT_SYMBOL(rte_vfio_container_dma_unmap)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_dma_unmap)
int
rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr, uint64_t iova,
uint64_t len)
diff --git a/lib/eal/windows/eal.c b/lib/eal/windows/eal.c
index 6dacae7235..de7a89a829 100644
--- a/lib/eal/windows/eal.c
+++ b/lib/eal/windows/eal.c
@@ -453,7 +453,7 @@ eal_asprintf(char **buffer, const char *format, ...)
return ret;
}
-RTE_EXPORT_SYMBOL(rte_vfio_container_dma_map)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_dma_map)
int
rte_vfio_container_dma_map(__rte_unused int container_fd,
__rte_unused uint64_t vaddr,
@@ -464,7 +464,7 @@ rte_vfio_container_dma_map(__rte_unused int container_fd,
return -1;
}
-RTE_EXPORT_SYMBOL(rte_vfio_container_dma_unmap)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_dma_unmap)
int
rte_vfio_container_dma_unmap(__rte_unused int container_fd,
__rte_unused uint64_t vaddr,
--
2.47.3
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox