From: Samar Yadav <samaryadav5@gmail.com>
To: dev@dpdk.org
Cc: maxime.coquelin@redhat.com, chenbox@nvidia.com,
Samar Yadav <samaryadav5@gmail.com>,
tiwei.bie@intel.com, stable@dpdk.org
Subject: [PATCH] net/virtio-user: fix eventfd sharing in secondary process
Date: Wed, 24 Jun 2026 08:57:41 +0000 [thread overview]
Message-ID: <20260624085741.2195573-1-samaryadav5@gmail.com> (raw)
virtio_user secondary processes cannot communicate with the vhost
backend: the kick/call eventfds are opened by the primary and never
shared, so a secondary's queue notification writes to an invalid fd
and traffic stalls.
Share the fds over a dedicated virtio-user multiprocess channel. The
primary registers a process-wide MP action that returns a port's
kick/call fds (looked up by port name); a secondary requests them at
probe time, before the port is announced.
The received fds are stored in eth_dev->process_private, which is
per-process, instead of the primary-owned shared dev->kickfds and
dev->callfds arrays; the secondary data path notifies the backend using
its own kickfd. In the primary, the MP handler reads the fd arrays under
dev->mutex, and the teardown path takes the same lock while closing and
freeing them, so the two cannot race.
Fixes: 1c8489da561b ("net/virtio-user: fix multi-process support")
Cc: tiwei.bie@intel.com
Cc: stable@dpdk.org
Signed-off-by: Samar Yadav <samaryadav5@gmail.com>
---
.mailmap | 1 +
.../net/virtio/virtio_user/virtio_user_dev.c | 56 +++-
.../net/virtio/virtio_user/virtio_user_dev.h | 20 ++
drivers/net/virtio/virtio_user_ethdev.c | 260 +++++++++++++++++-
4 files changed, 333 insertions(+), 4 deletions(-)
diff --git a/.mailmap b/.mailmap
index 4001e5fb0e70..8f921d4b9f46 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1448,6 +1448,7 @@ Salem Sol <salems@nvidia.com>
Sam Andrew <samandrew@microsoft.com>
Sam Chen <sam.chen@nebula-matrix.com>
Sam Grove <sam.grove@sifive.com>
+Samar Yadav <samaryadav5@gmail.com> <samar.yadav@broadcom.com>
Sameer Vaze <svaze@qti.qualcomm.com>
Sameh Gobriel <sameh.gobriel@intel.com>
Samik Gupta <samik.gupta@broadcom.com>
diff --git a/drivers/net/virtio/virtio_user/virtio_user_dev.c b/drivers/net/virtio/virtio_user/virtio_user_dev.c
index f3df73c1f0ca..5e431ebc6511 100644
--- a/drivers/net/virtio/virtio_user/virtio_user_dev.c
+++ b/drivers/net/virtio/virtio_user/virtio_user_dev.c
@@ -34,6 +34,54 @@ const char * const virtio_user_backend_strings[] = {
[VIRTIO_USER_BACKEND_VHOST_VDPA] = "VHOST_VDPA",
};
+/*
+ * Collect the primary device's kick/call fds (interleaved kick,call per queue)
+ * for sharing with a secondary process. Caller must serialize against the
+ * control path (dev->mutex) so the fd arrays are not freed concurrently.
+ */
+int
+virtio_user_get_eventfds_from_dev(struct virtio_user_dev *dev,
+ int fds[VIRTIO_USER_MAX_EVENTFDS])
+{
+ uint32_t max_queues;
+ int i, total_fds = 0;
+ int kickfd, callfd;
+
+ if (dev == NULL || fds == NULL)
+ return -EINVAL;
+
+ if (dev->kickfds == NULL || dev->callfds == NULL) {
+ PMD_INIT_LOG(ERR, "Device eventfd arrays not initialized");
+ return -EINVAL;
+ }
+
+ max_queues = dev->max_queue_pairs * 2;
+ if (dev->hw_cvq)
+ max_queues += 1;
+
+ /* each queue contributes a kick and a call fd */
+ if (max_queues * 2 > VIRTIO_USER_MAX_EVENTFDS) {
+ PMD_INIT_LOG(ERR,
+ "Device needs %u eventfds, exceeds MP limit %d",
+ max_queues * 2, VIRTIO_USER_MAX_EVENTFDS);
+ return -E2BIG;
+ }
+
+ for (i = 0; i < (int)max_queues; i++) {
+ kickfd = dev->kickfds[i];
+ callfd = dev->callfds[i];
+ if (kickfd < 0 || callfd < 0) {
+ PMD_INIT_LOG(ERR, "Queue %d has invalid fds (kick=%d call=%d)",
+ i, kickfd, callfd);
+ return -EINVAL;
+ }
+ fds[total_fds++] = kickfd;
+ fds[total_fds++] = callfd;
+ }
+
+ return total_fds;
+}
+
static int
virtio_user_uninit_notify_queue(struct virtio_user_dev *dev, uint32_t queue_sel)
{
@@ -865,9 +913,15 @@ virtio_user_dev_uninit(struct virtio_user_dev *dev)
rte_mem_event_callback_unregister(VIRTIO_USER_MEM_EVENT_CLB_NAME, dev);
+ /*
+ * Serialize closing/freeing the kick/call fd arrays against the MP
+ * handler, which reads them under the same lock to share them with
+ * secondary processes.
+ */
+ pthread_mutex_lock(&dev->mutex);
virtio_user_dev_uninit_notify(dev);
-
virtio_user_free_vrings(dev);
+ pthread_mutex_unlock(&dev->mutex);
free(dev->ifname);
diff --git a/drivers/net/virtio/virtio_user/virtio_user_dev.h b/drivers/net/virtio/virtio_user/virtio_user_dev.h
index 66400b3b6295..c00297c79ed8 100644
--- a/drivers/net/virtio/virtio_user/virtio_user_dev.h
+++ b/drivers/net/virtio/virtio_user/virtio_user_dev.h
@@ -11,6 +11,11 @@
#include "../virtio.h"
#include "../virtio_ring.h"
+#include <rte_eal.h>
+
+/* Max eventfds shareable over the MP channel (bounded by SCM_RIGHTS). */
+#define VIRTIO_USER_MAX_EVENTFDS RTE_MP_MAX_FD_NUM
+
enum virtio_user_backend_type {
VIRTIO_USER_BACKEND_UNKNOWN,
VIRTIO_USER_BACKEND_VHOST_USER,
@@ -89,5 +94,20 @@ int virtio_user_dev_get_rss_config(struct virtio_user_dev *dev, void *dst, size_
int length);
void virtio_user_dev_delayed_disconnect_handler(void *param);
int virtio_user_dev_server_reconnect(struct virtio_user_dev *dev);
+
+/**
+ * Collect a primary device's kick/call eventfds for sharing with a
+ * secondary process over the multiprocess channel.
+ *
+ * @param dev
+ * Pointer to the virtio_user device (primary).
+ * @param fds
+ * Output array, must hold at least VIRTIO_USER_MAX_EVENTFDS elements.
+ * @return
+ * Number of fds written on success, negative errno on error.
+ */
+int virtio_user_get_eventfds_from_dev(struct virtio_user_dev *dev,
+ int fds[VIRTIO_USER_MAX_EVENTFDS]);
+
extern const char * const virtio_user_backend_strings[];
#endif
diff --git a/drivers/net/virtio/virtio_user_ethdev.c b/drivers/net/virtio/virtio_user_ethdev.c
index 747dddeb2eba..1c724ad59ea6 100644
--- a/drivers/net/virtio/virtio_user_ethdev.c
+++ b/drivers/net/virtio/virtio_user_ethdev.c
@@ -27,6 +27,35 @@
#include "virtio_rxtx.h"
#include "virtio_user/virtio_user_dev.h"
#include "virtio_user/vhost.h"
+#include <errno.h>
+#include <rte_errno.h>
+#include <rte_string_fns.h>
+#include <rte_spinlock.h>
+
+/* Virtio-user multiprocess communication channel */
+#define VIRTIO_USER_MP_NAME "virtio_user_mp"
+
+struct virtio_user_mp_param {
+ char port_name[RTE_DEV_NAME_MAX_LEN];
+};
+
+/*
+ * Per-process private data, referenced by eth_dev->process_private which (unlike
+ * dev_private) is NOT shared between primary and secondary processes. A secondary
+ * stores the kick/call fds it receives from the primary here, so it never mutates
+ * the primary-owned shared dev->kickfds/dev->callfds arrays. callfds are kept for
+ * a complete per-process view of the backend fds; only kickfds are used by the
+ * secondary data path today.
+ */
+struct virtio_user_proc_priv {
+ uint32_t nr_queues;
+ int *kickfds;
+ int *callfds;
+};
+
+/* Guards one-time registration of the process-wide MP action. */
+static rte_spinlock_t virtio_user_mp_lock = RTE_SPINLOCK_INITIALIZER;
+static bool virtio_user_mp_registered;
#define virtio_user_get_dev(hwp) container_of(hwp, struct virtio_user_dev, hw)
@@ -269,6 +298,26 @@ virtio_user_del_queue(struct virtio_hw *hw, struct virtqueue *vq)
virtio_user_dev_destroy_shadow_cvq(dev);
}
+/*
+ * Return the kick fd to notify the backend for a queue in the running process.
+ * The secondary uses its own fds (process_private); the primary owns dev->kickfds.
+ */
+static int
+virtio_user_get_kickfd(struct virtio_hw *hw, struct virtio_user_dev *dev,
+ uint16_t queue_idx)
+{
+ if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+ struct rte_eth_dev *eth_dev = &rte_eth_devices[hw->port_id];
+ struct virtio_user_proc_priv *pp = eth_dev->process_private;
+
+ if (pp == NULL || queue_idx >= pp->nr_queues)
+ return -1;
+ return pp->kickfds[queue_idx];
+ }
+
+ return dev->kickfds[queue_idx];
+}
+
static void
virtio_user_notify_queue(struct virtio_hw *hw, struct virtqueue *vq)
{
@@ -282,8 +331,10 @@ virtio_user_notify_queue(struct virtio_hw *hw, struct virtqueue *vq)
}
if (!dev->notify_area) {
- if (write(dev->kickfds[vq->vq_queue_index], ¬ify_data,
- sizeof(notify_data)) < 0)
+ int kickfd = virtio_user_get_kickfd(hw, dev, vq->vq_queue_index);
+
+ if (kickfd < 0 || write(kickfd, ¬ify_data,
+ sizeof(notify_data)) < 0)
PMD_DRV_LOG(ERR, "failed to kick backend: %s",
strerror(errno));
return;
@@ -495,6 +546,166 @@ virtio_user_eth_dev_free(struct rte_eth_dev *eth_dev)
rte_eth_dev_release_port(eth_dev);
}
+/* Close and free a secondary's per-process eventfd storage. */
+static void
+virtio_user_free_proc_priv(struct rte_eth_dev *eth_dev)
+{
+ struct virtio_user_proc_priv *pp = eth_dev->process_private;
+ uint32_t i;
+
+ if (pp == NULL)
+ return;
+
+ for (i = 0; i < pp->nr_queues; i++) {
+ if (pp->kickfds != NULL && pp->kickfds[i] >= 0)
+ close(pp->kickfds[i]);
+ if (pp->callfds != NULL && pp->callfds[i] >= 0)
+ close(pp->callfds[i]);
+ }
+
+ rte_free(pp->kickfds);
+ rte_free(pp->callfds);
+ rte_free(pp);
+ eth_dev->process_private = NULL;
+}
+
+/*
+ * Primary-side MP handler: reply with this port's kick/call eventfds so the
+ * requesting secondary can talk to the vhost backend. Always sends a reply
+ * (num_fds == 0 on error) so the secondary fails fast instead of timing out.
+ */
+static int
+virtio_user_mp_primary_handler(const struct rte_mp_msg *msg, const void *peer)
+{
+ const struct virtio_user_mp_param *param =
+ (const struct virtio_user_mp_param *)msg->param;
+ int eventfds[VIRTIO_USER_MAX_EVENTFDS];
+ struct rte_eth_dev *eth_dev;
+ struct virtio_user_dev *dev;
+ struct rte_mp_msg reply;
+ int num_fds;
+ int i;
+
+ memset(&reply, 0, sizeof(reply));
+ strlcpy(reply.name, msg->name, sizeof(reply.name));
+ reply.len_param = 0;
+ reply.num_fds = 0;
+
+ eth_dev = rte_eth_dev_get_by_name(param->port_name);
+ if (eth_dev == NULL || eth_dev->data->dev_private == NULL) {
+ PMD_INIT_LOG(ERR, "Failed to find virtio_user port: %s",
+ param->port_name);
+ return rte_mp_reply(&reply, peer);
+ }
+
+ dev = eth_dev->data->dev_private;
+
+ /* serialize against control-path changes to the fd arrays */
+ pthread_mutex_lock(&dev->mutex);
+ num_fds = virtio_user_get_eventfds_from_dev(dev, eventfds);
+ if (num_fds >= 0 && num_fds <= RTE_MP_MAX_FD_NUM) {
+ reply.num_fds = num_fds;
+ for (i = 0; i < num_fds; i++)
+ reply.fds[i] = eventfds[i];
+ } else {
+ PMD_INIT_LOG(ERR, "Cannot share eventfds for %s (ret=%d)",
+ param->port_name, num_fds);
+ }
+ pthread_mutex_unlock(&dev->mutex);
+
+ return rte_mp_reply(&reply, peer);
+}
+
+/*
+ * Secondary-side: request the primary's kick/call eventfds and store them in
+ * this process's eth_dev->process_private. The shared dev->kickfds/dev->callfds
+ * arrays (owned by the primary) are never touched.
+ */
+static int
+virtio_user_sync_eventfds(struct rte_eth_dev *eth_dev, struct virtio_user_dev *dev)
+{
+ struct rte_mp_msg mp_req, *mp_rep;
+ struct rte_mp_reply mp_reply = {0};
+ struct virtio_user_mp_param *req_param;
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct virtio_user_proc_priv *pp;
+ uint32_t total_queues, i;
+ int nr_fds, ret = 0;
+
+ if (dev == NULL)
+ return -EINVAL;
+
+ if (rte_eal_process_type() != RTE_PROC_SECONDARY)
+ return -EINVAL;
+
+ total_queues = dev->max_queue_pairs * 2 + (dev->hw_cvq ? 1 : 0);
+
+ pp = rte_zmalloc("virtio_user_proc_priv", sizeof(*pp), 0);
+ if (pp == NULL)
+ return -ENOMEM;
+
+ pp->kickfds = rte_malloc("virtio_user_proc_priv",
+ total_queues * sizeof(int), 0);
+ pp->callfds = rte_malloc("virtio_user_proc_priv",
+ total_queues * sizeof(int), 0);
+ if (pp->kickfds == NULL || pp->callfds == NULL) {
+ ret = -ENOMEM;
+ goto err_free;
+ }
+ for (i = 0; i < total_queues; i++) {
+ pp->kickfds[i] = -1;
+ pp->callfds[i] = -1;
+ }
+
+ memset(&mp_req, 0, sizeof(mp_req));
+ req_param = (struct virtio_user_mp_param *)mp_req.param;
+ strlcpy(req_param->port_name, eth_dev->data->name,
+ sizeof(req_param->port_name));
+ strlcpy(mp_req.name, VIRTIO_USER_MP_NAME, RTE_MP_MAX_NAME_LEN);
+ mp_req.len_param = sizeof(*req_param);
+ mp_req.num_fds = 0;
+
+ if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) < 0 ||
+ mp_reply.nb_received != 1) {
+ PMD_INIT_LOG(ERR, "Failed to request eventfds from primary");
+ free(mp_reply.msgs);
+ ret = -EIO;
+ goto err_free;
+ }
+
+ mp_rep = &mp_reply.msgs[0];
+ nr_fds = mp_rep->num_fds;
+
+ /* a partially-synced device cannot work: treat any mismatch as fatal */
+ if (nr_fds != (int)total_queues * 2) {
+ PMD_INIT_LOG(ERR, "Expected %u eventfds, received %d",
+ total_queues * 2, nr_fds);
+ for (i = 0; i < (uint32_t)nr_fds; i++)
+ close(mp_rep->fds[i]);
+ free(mp_reply.msgs);
+ ret = -EPROTO;
+ goto err_free;
+ }
+
+ for (i = 0; i < total_queues; i++) {
+ pp->kickfds[i] = mp_rep->fds[i * 2];
+ pp->callfds[i] = mp_rep->fds[i * 2 + 1];
+ }
+ pp->nr_queues = total_queues;
+ free(mp_reply.msgs);
+
+ eth_dev->process_private = pp;
+ PMD_INIT_LOG(DEBUG, "Synced %u queue eventfds for secondary port %s",
+ total_queues, eth_dev->data->name);
+ return 0;
+
+err_free:
+ rte_free(pp->kickfds);
+ rte_free(pp->callfds);
+ rte_free(pp);
+ return ret;
+}
+
/* Dev initialization routine. Invoked once for each virtio vdev at
* EAL init time, see rte_bus_probe().
* Returns 0 on success.
@@ -542,6 +753,17 @@ virtio_user_pmd_probe(struct rte_vdev_device *vdev)
eth_dev->dev_ops = &virtio_user_secondary_eth_dev_ops;
eth_dev->device = &vdev->device;
+
+ /* populate this process's eventfds before announcing the port */
+ ret = virtio_user_sync_eventfds(eth_dev, dev);
+ if (ret < 0) {
+ PMD_INIT_LOG(ERR,
+ "Failed to sync eventfds in secondary: %d",
+ ret);
+ rte_eth_dev_release_port(eth_dev);
+ return ret;
+ }
+
rte_eth_dev_probing_finish(eth_dev);
return 0;
}
@@ -722,6 +944,36 @@ virtio_user_pmd_probe(struct rte_vdev_device *vdev)
}
}
+ /*
+ * Register the process-wide MP action once so secondaries can fetch a
+ * port's eventfds by name. It is intentionally left registered for the
+ * lifetime of the process (cleaned up at exit): unregistering per device
+ * cannot drain handler calls already dispatched on the EAL MP thread.
+ */
+ rte_spinlock_lock(&virtio_user_mp_lock);
+ if (!virtio_user_mp_registered) {
+ ret = rte_mp_action_register(VIRTIO_USER_MP_NAME,
+ virtio_user_mp_primary_handler);
+ if (ret < 0 && rte_errno != EEXIST) {
+ rte_spinlock_unlock(&virtio_user_mp_lock);
+ if (rte_errno == ENOTSUP) {
+ PMD_INIT_LOG(WARNING,
+ "MP unsupported, secondary eventfd sharing disabled");
+ rte_eth_dev_probing_finish(eth_dev);
+ ret = 0;
+ goto end;
+ }
+ PMD_INIT_LOG(ERR, "Failed to register MP handler: %s",
+ strerror(rte_errno));
+ virtio_user_dev_uninit(dev);
+ virtio_user_eth_dev_free(eth_dev);
+ ret = -1;
+ goto end;
+ }
+ virtio_user_mp_registered = true;
+ }
+ rte_spinlock_unlock(&virtio_user_mp_lock);
+
rte_eth_dev_probing_finish(eth_dev);
ret = 0;
@@ -749,8 +1001,10 @@ virtio_user_pmd_remove(struct rte_vdev_device *vdev)
if (!eth_dev)
return 0;
- if (rte_eal_process_type() != RTE_PROC_PRIMARY)
+ if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
+ virtio_user_free_proc_priv(eth_dev);
return rte_eth_dev_release_port(eth_dev);
+ }
/* make sure the device is stopped, queues freed */
return rte_eth_dev_close(eth_dev->data->port_id);
--
2.52.0
next reply other threads:[~2026-06-24 12:54 UTC|newest]
Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-06-24 8:57 Samar Yadav [this message]
2026-06-24 15:10 ` [PATCH] net/virtio-user: fix eventfd sharing in secondary process Stephen Hemminger
2026-06-24 15:16 ` Stephen Hemminger
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260624085741.2195573-1-samaryadav5@gmail.com \
--to=samaryadav5@gmail.com \
--cc=chenbox@nvidia.com \
--cc=dev@dpdk.org \
--cc=maxime.coquelin@redhat.com \
--cc=stable@dpdk.org \
--cc=tiwei.bie@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.