* [PATCH 01/14] examples/vhost_user_rdma: implement core application initialization for supporting vhost_user_rdma device
2025-12-17 8:49 Implement initial driver for virtio-RDMA devices(kernel), virtio-rdma device model(qemu) and vhost-user-RDMA backend device(dpdk) Xiong Weimin
@ 2025-12-17 8:49 ` Xiong Weimin
2025-12-17 8:49 ` [PATCH] hw/rdma: Implement vhost-user RDMA device with PCI support Xiong Weimin
` (9 subsequent siblings)
10 siblings, 0 replies; 17+ messages in thread
From: Xiong Weimin @ 2025-12-17 8:49 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann, David S . Miller,
Jakub Kicinski, Jesper Dangaard Brouer, John Fastabend,
Stanislav Fomichev
Cc: linux-kernel, netdev, xiongweimin
[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset=y, Size: 108628 bytes --]
From: xiongweimin <xiongweimin@kylinos.cn>
This commit introduces the main initialization routine for the vhost-user
RDMA application built on DPDK. The implementation includes:
1. DPDK EAL environment initialization with proper signal handling
2. Argument parsing for application-specific configuration
3. Creation of shared memory resources:
- Packet buffer pools with per-core caching
- Optimized ring buffers for RX/TX with SP/MC synchronization flags
4. Backend network device detection and initialization
5. Worker thread launch across available cores
6. Multi-device support with shared/dedicated resource allocation
7. vHost device construction and driver registration
Key features:
- NUMA-aware resource allocation using rte_socket_id()
- Optimized ring flags (SP_ENQ, MC_HTS_DEQ) for lockless operation
- Graceful shutdown handling through signal interception
- Resource isolation for multi-device configurations
Signed-off-by: Xiong Weimin <xiongweimin@kylinos.cn>
Change-Id: I1a42aeaa04595d13fc392452c1c9ca3f97442acc
---
examples/meson.build | 1 +
examples/vhost_user_rdma/main.c | 607 ++++++++++++++++++
examples/vhost_user_rdma/meson.build | 45 ++
examples/vhost_user_rdma/vhost_rdma.c | 697 +++++++++++++++++++++
examples/vhost_user_rdma/vhost_rdma.h | 444 ++++++++++++++
examples/vhost_user_rdma/vhost_rdma_ib.c | 647 ++++++++++++++++++++
examples/vhost_user_rdma/vhost_rdma_ib.h | 710 ++++++++++++++++++++++
examples/vhost_user_rdma/vhost_rdma_log.h | 52 ++
examples/vhost_user_rdma/vhost_rdma_pkt.h | 296 +++++++++
9 files changed, 3499 insertions(+)
create mode 100644 examples/vhost_user_rdma/main.c
create mode 100644 examples/vhost_user_rdma/meson.build
create mode 100644 examples/vhost_user_rdma/vhost_rdma.c
create mode 100644 examples/vhost_user_rdma/vhost_rdma.h
create mode 100644 examples/vhost_user_rdma/vhost_rdma_ib.c
create mode 100644 examples/vhost_user_rdma/vhost_rdma_ib.h
create mode 100644 examples/vhost_user_rdma/vhost_rdma_log.h
create mode 100644 examples/vhost_user_rdma/vhost_rdma_pkt.h
diff --git a/examples/meson.build b/examples/meson.build
index 8e8968a1fa..780d49d4b4 100644
--- a/examples/meson.build
+++ b/examples/meson.build
@@ -54,6 +54,7 @@ all_examples = [
'vdpa',
'vhost',
'vhost_blk',
+ 'vhost_user_rdma',
'vhost_crypto',
'vm_power_manager',
'vm_power_manager/guest_cli',
diff --git a/examples/vhost_user_rdma/main.c b/examples/vhost_user_rdma/main.c
new file mode 100644
index 0000000000..d5dda47e4e
--- /dev/null
+++ b/examples/vhost_user_rdma/main.c
@@ -0,0 +1,607 @@
+/*
+ * Vhost-user RDMA Device - Initialization and Packet Forwarding
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (C) 2025 KylinSoft Inc. All rights reserved.
+ *
+ * Author: Xiong Weimin <xiongweimin@kylinos.cn>
+ *
+ */
+
+#include <signal.h>
+#include <getopt.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <sys/types.h>
+#include <sys/queue.h>
+#include <stdarg.h>
+#include <ctype.h>
+#include <errno.h>
+
+/* DPDK headers */
+#include <rte_memory.h>
+#include <rte_launch.h>
+#include <rte_eal.h>
+#include <rte_per_lcore.h>
+#include <rte_lcore.h>
+#include <rte_debug.h>
+#include <rte_log.h>
+#include <rte_ethdev.h>
+#include <rte_mbuf.h>
+#include <rte_ring.h>
+#include <rte_malloc.h>
+#include <dev_driver.h>
+
+/* Local headers */
+#include "vhost_rdma_ib.h"
+#include "vhost_rdma.h"
+#include "vhost_rdma_pkt.h"
+#include "vhost_rdma_log.h"
+
+/**
+ * Size in bytes of one socket-path slot, including the terminating NUL.
+ * Paths whose length reaches this value are rejected by the parser.
+ */
+#define SOCKET_PATH_MAX 64
+
+/**
+ * Requested number of RX/TX descriptors for the backend port queues;
+ * vhost_rdma_init_port() lets the driver adjust them before queue setup.
+ */
+#define MAX_NB_RXD 1024
+#define MAX_NB_TXD 1024
+
+/**
+ * Number of entries requested for every rte_ring created by main()
+ */
+#define MAX_RING_COUNT 1024
+
+/**
+ * Default number of mbufs in each memory pool (an unsigned long constant)
+ */
+#define NUM_MBUFS_DEFAULT (1UL << 16) // 65536
+
+/**
+ * Cache size for per-lcore mbuf cache
+ */
+#define MBUF_CACHE_SIZE 256
+
+/**
+ * Data buffer size in each mbuf
+ */
+#define MBUF_DATA_SIZE RTE_MBUF_DEFAULT_BUF_SIZE
+
+/* Device table, defined in vhost_rdma.c and indexed by socket/device number */
+extern struct vhost_rdma_device g_vhost_rdma_dev[];
+
+/* Global configuration */
+static char *socket_path; /* nb_sockets paths packed in SOCKET_PATH_MAX-byte slots (heap) */
+static int nb_sockets = 0; /* Number of vhost sockets stored in socket_path */
+static uint16_t pair_port_id = UINT16_MAX; /* Backend ethdev port ID; UINT16_MAX until main() finds one */
+static volatile bool force_quit; /* Set by the signal handler; polled by the main loop */
+
+/* Stats and feature flags */
+static uint32_t enable_stats; /* Value of --stats: print interval in seconds, 0 = disabled */
+static uint32_t enable_tx_csum; /* Value of --tx-csum: 1 enables TX checksum offload */
+static int total_num_mbufs = NUM_MBUFS_DEFAULT;/* Value of --total-num-mbufs; used as the size of each mbuf pool */
+
+/* Shared resources */
+static struct rte_ring *vhost_rdma_rx_ring;
+static struct rte_ring *vhost_rdma_tx_ring;
+static struct rte_mempool *vhost_rdma_mbuf_pool;
+
+/* Per-lcore info for device management */
+struct lcore_info {
+ uint32_t device_num;
+ TAILQ_HEAD(vhost_dev_tailq_list, vhost_rdma_device) vdev_list;
+};
+
+static struct lcore_info lcore_info[RTE_MAX_LCORE];
+static unsigned int lcore_ids[RTE_MAX_LCORE]; /* IDs of the enabled lcores, filled by main() */
+
+/* Port configuration templates: zeroed default, or TX checksum offloads enabled */
+static struct rte_eth_conf default_port_config;
+
+static struct rte_eth_conf offload_port_config = {
+ .txmode = {
+ .offloads = RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
+ RTE_ETH_TX_OFFLOAD_UDP_CKSUM |
+ RTE_ETH_TX_OFFLOAD_TCP_CKSUM,
+ },
+};
+
+enum { /* long-option names and the values getopt_long() returns for them */
+#define OPT_STATS "stats"
+ OPT_STATS_NUM,
+#define OPT_SOCKET_FILE "socket-file"
+ OPT_SOCKET_FILE_NUM,
+#define OPT_TX_CSUM "tx-csum"
+ OPT_TX_CSUM_NUM,
+#define OPT_NUM_MBUFS "total-num-mbufs"
+ OPT_NUM_MBUFS_NUM,
+};
+
+/**
+ * @brief Unregister the vhost driver of every configured socket.
+ *
+ * Walks the first @p socket_num slots of the global socket_path buffer
+ * (each slot is SOCKET_PATH_MAX bytes) and calls
+ * rte_vhost_driver_unregister() on each one, logging the outcome.
+ *
+ * @param socket_num Number of socket paths to unregister
+ */
+static void
+unregister_drivers(int socket_num)
+{
+	const char *cur = socket_path;
+	int n;
+
+	for (n = 0; n < socket_num; n++, cur += SOCKET_PATH_MAX) {
+		if (rte_vhost_driver_unregister(cur) == 0) {
+			RDMA_LOG_INFO("Unregistered socket: %s\n", cur);
+			continue;
+		}
+		RDMA_LOG_ERR("Failed to unregister vhost driver for socket %s\n", cur);
+	}
+}
+
+/**
+ * @brief Signal handler for shutdown requests (SIGINT / SIGTERM).
+ *
+ * Sets force_quit for SIGINT and SIGTERM, unregisters every vhost driver
+ * listed in socket_path and then terminates the process with exit(0), so
+ * control never returns to the interrupted code.
+ *
+ * NOTE(review): logging, rte_vhost_driver_unregister() and exit() are not
+ * async-signal-safe. A more robust design would only set force_quit here
+ * and perform the teardown on the main thread - confirm before changing,
+ * since main() currently relies on this handler to unregister the sockets.
+ *
+ * @param signum Number of the delivered signal
+ */
+static void
+vhost_rdma_signal_handler(int signum)
+{
+	RDMA_LOG_INFO("Received signal %d, shutting down...\n", signum);
+
+	if (signum == SIGINT || signum == SIGTERM)
+		force_quit = true;
+
+	unregister_drivers(nb_sockets);
+	exit(0);
+}
+
+/**
+ * @brief Initialize an Ethernet port with given offload settings.
+ *
+ * Configures one RX and one TX queue, sets up the descriptor rings (RX
+ * buffers come from vhost_rdma_mbuf_pool) and starts the port.
+ *
+ * Descriptor-count adjustment, promiscuous mode and reading the MAC
+ * address are best-effort: a failure there is logged as a warning and does
+ * not affect the return value.
+ *
+ * @param port_id The port identifier
+ * @param offload Whether to enable the TX checksum offloads from
+ *                offload_port_config
+ * @return 0 on success, negative value from the failing ethdev call otherwise
+ */
+static int
+vhost_rdma_init_port(uint16_t port_id, bool offload)
+{
+	int ret;
+	uint16_t nb_rxd = MAX_NB_RXD;
+	uint16_t nb_txd = MAX_NB_TXD;
+	struct rte_eth_dev_info dev_info;
+	struct rte_eth_conf port_conf = offload ? offload_port_config : default_port_config;
+	struct rte_eth_txconf txconf;
+	struct rte_ether_addr addr;
+	char mac_str[RTE_ETHER_ADDR_FMT_SIZE];
+
+	RDMA_LOG_INFO("Initializing port %u with %s offloads\n", port_id,
+		      offload ? "enabled" : "disabled");
+
+	ret = rte_eth_dev_info_get(port_id, &dev_info);
+	if (ret < 0) {
+		RDMA_LOG_ERR("Failed to get device info for port %u\n", port_id);
+		goto out;
+	}
+
+	ret = rte_eth_dev_configure(port_id, 1, 1, &port_conf);
+	if (ret < 0) {
+		RDMA_LOG_ERR("Failed to configure port %u\n", port_id);
+		goto out;
+	}
+
+	/* Best-effort: on failure keep the requested descriptor counts. */
+	if (rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd) < 0) {
+		LOG_WARN("Failed to adjust number of descriptors for port %u\n", port_id);
+	}
+
+	ret = rte_eth_rx_queue_setup(port_id, 0, nb_rxd,
+				     rte_eth_dev_socket_id(port_id),
+				     NULL,
+				     vhost_rdma_mbuf_pool);
+	if (ret < 0) {
+		RDMA_LOG_ERR("Failed to setup RX queue for port %u\n", port_id);
+		goto out;
+	}
+
+	/* Start from the driver defaults and apply the selected TX offloads. */
+	txconf = dev_info.default_txconf;
+	txconf.offloads = port_conf.txmode.offloads;
+	ret = rte_eth_tx_queue_setup(port_id, 0, nb_txd,
+				     rte_eth_dev_socket_id(port_id),
+				     &txconf);
+	if (ret < 0) {
+		RDMA_LOG_ERR("Failed to setup TX queue for port %u\n", port_id);
+		goto out;
+	}
+
+	ret = rte_eth_dev_start(port_id);
+	if (ret < 0) {
+		RDMA_LOG_ERR("Failed to start port %u\n", port_id);
+		goto out;
+	}
+
+	/*
+	 * The port is running from here on. The remaining steps are
+	 * informational, so their status must not leak into the return value.
+	 */
+	if (rte_eth_promiscuous_enable(port_id) < 0) {
+		LOG_WARN("Failed to enable promiscuous mode on port %u\n", port_id);
+	}
+
+	if (rte_eth_macaddr_get(port_id, &addr) == 0) {
+		rte_ether_format_addr(mac_str, sizeof(mac_str), &addr);
+		RDMA_LOG_INFO("Port %u MAC address: %s\n", port_id, mac_str);
+	} else {
+		LOG_WARN("Could not read MAC address for port %u\n", port_id);
+	}
+
+	ret = 0;
+
+out:
+	return ret;
+}
+
+/**
+ * @brief Print usage information to stdout.
+ *
+ * Lists only the long options accepted by vhost_rdma_parse_args().
+ *
+ * @param prgname Program name shown at the start of the usage line
+ */
+static void
+vhost_rdma_usage(const char *prgname)
+{
+	/* NUM_MBUFS_DEFAULT is an unsigned long constant, hence %lu. */
+	printf("%s [EAL options] --\n"
+	       " --socket-file <path> : Path to vhost-user socket (can be repeated)\n"
+	       " --stats <N> : Print stats every N seconds (0=disable)\n"
+	       " --tx-csum <0|1> : Disable/enable TX checksum offload\n"
+	       " --total-num-mbufs <N> : Total number of mbufs in pool (default: %lu)\n",
+	       prgname, NUM_MBUFS_DEFAULT);
+}
+
+/**
+ * @brief Parse a non-negative decimal option value.
+ *
+ * The whole string must consist of decimal digits: empty input, leading
+ * whitespace, a sign, or trailing characters are all rejected. The result
+ * must not exceed @p max_valid_value and must fit in the int return type.
+ *
+ * @param q_arg Input string (NULL is rejected)
+ * @param max_valid_value Maximum allowed value
+ * @return Parsed integer (>= 0), or -1 on error
+ */
+static int
+vhost_rdma_parse_num_opt(const char *q_arg, uint32_t max_valid_value)
+{
+	char *end = NULL;
+	unsigned long num;
+
+	/*
+	 * Validate before calling strtoul(): it must not see NULL, and it
+	 * would silently skip whitespace and accept a leading '+' or '-'.
+	 */
+	if (q_arg == NULL || !isdigit((unsigned char)q_arg[0]))
+		return -1;
+
+	errno = 0;
+	num = strtoul(q_arg, &end, 10);
+	if (errno != 0 || end == q_arg || *end != '\0')
+		return -1;
+
+	/* The second bound keeps the cast below from producing a negative. */
+	if (num > max_valid_value || num > (unsigned long)INT32_MAX)
+		return -1;
+
+	return (int)num;
+}
+
+/**
+ * @brief Parse and store one vhost socket path.
+ *
+ * Supports multiple sockets via repeated --socket-file. Paths are kept in
+ * the global socket_path buffer, one SOCKET_PATH_MAX-byte slot per socket;
+ * nb_sockets is incremented on success.
+ *
+ * On failure the previously stored paths and nb_sockets are left untouched.
+ *
+ * @param q_arg Socket file path (must be non-empty and shorter than
+ *              SOCKET_PATH_MAX)
+ * @return 0 on success, -1 on failure
+ */
+static int
+vhost_rdma_parse_socket_path(const char *q_arg)
+{
+	char *grown;
+	char *slot;
+
+	if (q_arg == NULL || q_arg[0] == '\0') {
+		RTE_LOG(ERR, VHOST_CONFIG, "Socket path must not be empty\n");
+		return -1;
+	}
+
+	if (strnlen(q_arg, SOCKET_PATH_MAX) >= SOCKET_PATH_MAX) {
+		RTE_LOG(ERR, VHOST_CONFIG, "Socket path too long: %s\n", q_arg);
+		return -1;
+	}
+
+	/*
+	 * Grow through a temporary so that a failed realloc() neither leaks
+	 * nor discards the paths that were already registered.
+	 */
+	grown = realloc(socket_path, (size_t)SOCKET_PATH_MAX * ((size_t)nb_sockets + 1));
+	if (grown == NULL) {
+		RTE_LOG(ERR, VHOST_CONFIG, "Out of memory while storing socket path\n");
+		return -1;
+	}
+	socket_path = grown;
+
+	/* The length check above guarantees snprintf() does not truncate. */
+	slot = socket_path + (size_t)nb_sockets * SOCKET_PATH_MAX;
+	snprintf(slot, SOCKET_PATH_MAX, "%s", q_arg);
+
+	RDMA_LOG_INFO("Registered socket[%d]: %s\n", nb_sockets, slot);
+
+	nb_sockets++;
+	return 0;
+}
+
+/**
+ * @brief Parse the application (non-EAL) command line.
+ *
+ * Recognised long options, all taking an argument:
+ * --socket-file, --stats, --tx-csum, --total-num-mbufs
+ *
+ * Results are stored in the file-scope configuration variables. Any
+ * invalid value or unknown option prints the usage text and fails; so
+ * does a command line without at least one --socket-file.
+ *
+ * @param argc Argument count
+ * @param argv Argument vector
+ * @return 0 on success, -1 on failure
+ */
+static int
+vhost_rdma_parse_args(int argc, char **argv)
+{
+	static struct option long_opts[] = {
+		{ OPT_STATS, required_argument, NULL, OPT_STATS_NUM },
+		{ OPT_SOCKET_FILE, required_argument, NULL, OPT_SOCKET_FILE_NUM },
+		{ OPT_TX_CSUM, required_argument, NULL, OPT_TX_CSUM_NUM },
+		{ OPT_NUM_MBUFS, required_argument, NULL, OPT_NUM_MBUFS_NUM },
+		{ NULL, 0, NULL, 0 }
+	};
+	const char *prgname = argv[0];
+	int opt_index;
+	int value;
+	int c;
+
+	for (;;) {
+		c = getopt_long(argc, argv, "", long_opts, &opt_index);
+		if (c == EOF)
+			break;
+
+		if (c == OPT_STATS_NUM) {
+			value = vhost_rdma_parse_num_opt(optarg, INT32_MAX);
+			if (value < 0) {
+				RTE_LOG(ERR, VHOST_CONFIG, "Invalid value for --stats\n");
+				goto fail_usage;
+			}
+			enable_stats = value;
+		} else if (c == OPT_NUM_MBUFS_NUM) {
+			value = vhost_rdma_parse_num_opt(optarg, INT32_MAX);
+			if (value <= 0) {
+				RTE_LOG(ERR, VHOST_CONFIG, "Invalid value for --total-num-mbufs\n");
+				goto fail_usage;
+			}
+			total_num_mbufs = value;
+		} else if (c == OPT_SOCKET_FILE_NUM) {
+			if (vhost_rdma_parse_socket_path(optarg) < 0) {
+				RTE_LOG(ERR, VHOST_CONFIG, "Invalid socket path: %s\n", optarg);
+				goto fail_usage;
+			}
+		} else if (c == OPT_TX_CSUM_NUM) {
+			value = vhost_rdma_parse_num_opt(optarg, 1);
+			if (value < 0) {
+				RTE_LOG(ERR, VHOST_CONFIG, "Invalid value for --tx-csum (must be 0 or 1)\n");
+				goto fail_usage;
+			}
+			enable_tx_csum = value;
+		} else {
+			/* Unknown or malformed option. */
+			goto fail_usage;
+		}
+	}
+
+	if (nb_sockets == 0) {
+		RTE_LOG(ERR, VHOST_CONFIG, "At least one --socket-file must be specified.\n");
+		goto fail_usage;
+	}
+
+	return 0;
+
+fail_usage:
+	vhost_rdma_usage(prgname);
+	return -1;
+}
+
+/*
+ * Spin until force_quit becomes true, then return 0.
+ * The loop body is empty; the argument is ignored.
+ */
+static int
+vhost_rdma_main_loop(__rte_unused void *arg)
+{
+	for (;;) {
+		if (force_quit)
+			break;
+	}
+
+	return 0;
+}
+
+/**
+ * @brief Application entry point.
+ *
+ * Initializes EAL, parses the application arguments, creates the shared
+ * mbuf pool and RX/TX rings, initializes the first net_tap/net_vhost port
+ * as backend, then builds and starts one vhost device per --socket-file.
+ *
+ * No worker is launched here: rte_eal_wait_lcore() is called for every
+ * worker lcore and the main lcore then runs vhost_rdma_main_loop() until
+ * force_quit is set.
+ *
+ * @return 0 on normal termination; fatal setup errors end the process via
+ *         rte_exit()/rte_panic()
+ */
+int main(int argc, char **argv)
+{
+	unsigned lcore_id, core_id = 0;
+	int ret;
+	uint16_t port_id;
+	bool pair_found = false;
+	struct rte_eth_dev_info dev_info;
+
+	force_quit = false;
+	enable_stats = 0;
+	enable_tx_csum = 0;
+
+	/* Register signal handler for clean shutdown */
+	signal(SIGINT, vhost_rdma_signal_handler);
+	signal(SIGTERM, vhost_rdma_signal_handler);
+
+	/* Initialize DPDK Environment Abstraction Layer */
+	ret = rte_eal_init(argc, argv);
+	if (ret < 0)
+		rte_panic("Unable to initialize DPDK EAL\n");
+
+	/* Skip the arguments consumed by EAL */
+	argc -= ret;
+	argv += ret;
+
+	rte_log_set_global_level(RTE_LOG_NOTICE);
+
+	/* Parse application-specific arguments */
+	if (vhost_rdma_parse_args(argc, argv) != 0) {
+		rte_exit(EXIT_FAILURE, "Argument parsing failed\n");
+	}
+
+	/*
+	 * Each socket index is used below as an index into g_vhost_rdma_dev[],
+	 * which holds MAX_VHOST_RDMA_DEV_NUM entries; refuse anything larger
+	 * instead of writing past the end of the table.
+	 */
+	if (nb_sockets > MAX_VHOST_RDMA_DEV_NUM) {
+		rte_exit(EXIT_FAILURE, "Too many --socket-file options: %d (max %d)\n",
+			 nb_sockets, (int)MAX_VHOST_RDMA_DEV_NUM);
+	}
+
+	/* Initialize per-lcore data structures */
+	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
+		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
+		if (rte_lcore_is_enabled(lcore_id)) {
+			lcore_ids[core_id++] = lcore_id;
+		}
+	}
+
+	if (rte_lcore_count() < 2) {
+		rte_exit(EXIT_FAILURE, "At least two cores required (one main + one worker)\n");
+	}
+
+	/*
+	 * Create shared memory pool for mbufs
+	 * Used by both RX and TX paths
+	 */
+	vhost_rdma_mbuf_pool = rte_pktmbuf_pool_create(
+		"mbuf_pool_shared",
+		total_num_mbufs,
+		MBUF_CACHE_SIZE,
+		sizeof(struct vhost_rdma_pkt_info),
+		MBUF_DATA_SIZE,
+		rte_socket_id());
+
+	if (vhost_rdma_mbuf_pool == NULL) {
+		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool: %s\n", rte_strerror(rte_errno));
+	}
+
+	/*
+	 * Create shared rings for packet exchange
+	 * SP_ENQ: Single-producer enqueue (from NIC)
+	 * MC_HTS_DEQ: Multi-consumer with HTS dequeue (to workers)
+	 */
+	vhost_rdma_rx_ring = rte_ring_create(
+		"ring_rx_shared",
+		MAX_RING_COUNT,
+		rte_socket_id(),
+		RING_F_SP_ENQ | RING_F_MC_HTS_DEQ
+	);
+	if (vhost_rdma_rx_ring == NULL)
+		rte_exit(EXIT_FAILURE, "Failed to create RX ring: %s\n", rte_strerror(rte_errno));
+
+	vhost_rdma_tx_ring = rte_ring_create(
+		"ring_tx_shared",
+		MAX_RING_COUNT,
+		rte_socket_id(),
+		RING_F_MP_HTS_ENQ | RING_F_SC_DEQ
+	);
+	if (vhost_rdma_tx_ring == NULL)
+		rte_exit(EXIT_FAILURE, "Failed to create TX ring: %s\n", rte_strerror(rte_errno));
+
+	/*
+	 * Find and initialize backend Ethernet device (e.g., net_tap or net_vhost).
+	 * Only the first matching port is used.
+	 */
+	RTE_ETH_FOREACH_DEV(port_id) {
+		ret = rte_eth_dev_info_get(port_id, &dev_info);
+		if (ret != 0) {
+			RDMA_LOG_ERR("Failed to get info for port %u\n", port_id);
+			continue;
+		}
+
+		if (!pair_found &&
+		    (strcmp(dev_info.driver_name, "net_tap") == 0 ||
+		     strcmp(dev_info.driver_name, "net_vhost") == 0)) {
+
+			pair_port_id = port_id;
+			pair_found = true;
+
+			ret = vhost_rdma_init_port(port_id, !!enable_tx_csum);
+			if (ret != 0) {
+				rte_exit(EXIT_FAILURE, "Failed to initialize port %u: %s\n",
+					 port_id, rte_strerror(-ret));
+			}
+
+			RDMA_LOG_INFO("Using device %s (port %u) as backend interface\n",
+				      dev_info.device->name, port_id);
+		}
+	}
+
+	if (!pair_found) {
+		rte_exit(EXIT_FAILURE, "No suitable backend Ethernet device found\n");
+	}
+
+	/*
+	 * Setup per-vhost-device resources and register vhost drivers
+	 */
+	char name_buf[SOCKET_PATH_MAX];
+	for (int i = 0; i < nb_sockets; i++) {
+		const char *sock_path = socket_path + i * SOCKET_PATH_MAX;
+		struct vhost_rdma_device *dev = &g_vhost_rdma_dev[i];
+
+		dev->vid = i;
+
+		if (i == 0) {
+			/* Use shared resources for first device */
+			dev->rx_ring = vhost_rdma_rx_ring;
+			dev->tx_ring = vhost_rdma_tx_ring;
+			dev->mbuf_pool = vhost_rdma_mbuf_pool;
+		} else {
+			/* Create dedicated resources for additional devices */
+			snprintf(name_buf, sizeof(name_buf), "dev%d_rx_ring", i);
+			dev->rx_ring = rte_ring_create(name_buf, MAX_RING_COUNT,
+				rte_socket_id(), RING_F_SP_ENQ | RING_F_MC_HTS_DEQ);
+			if (!dev->rx_ring)
+				rte_exit(EXIT_FAILURE, "Failed to create RX ring %d\n", i);
+
+			snprintf(name_buf, sizeof(name_buf), "dev%d_tx_ring", i);
+			dev->tx_ring = rte_ring_create(name_buf, MAX_RING_COUNT,
+				rte_socket_id(), RING_F_MP_HTS_ENQ | RING_F_SC_DEQ);
+			if (!dev->tx_ring)
+				rte_exit(EXIT_FAILURE, "Failed to create TX ring %d\n", i);
+
+			snprintf(name_buf, sizeof(name_buf), "dev%d_mbuf_pool", i);
+			dev->mbuf_pool = rte_pktmbuf_pool_create(name_buf,
+				total_num_mbufs,
+				MBUF_CACHE_SIZE,
+				sizeof(struct vhost_rdma_pkt_info),
+				MBUF_DATA_SIZE,
+				rte_socket_id());
+			if (!dev->mbuf_pool)
+				rte_exit(EXIT_FAILURE, "Failed to create mbuf pool %d\n", i);
+		}
+
+		snprintf(name_buf, sizeof(name_buf), "dev%d_task_ring", i);
+		dev->task_ring = rte_ring_create(name_buf, MAX_RING_COUNT,
+			rte_socket_id(),
+			RING_F_MP_HTS_ENQ | RING_F_MC_HTS_DEQ);
+		if (!dev->task_ring)
+			rte_exit(EXIT_FAILURE, "Failed to create task ring %d\n", i);
+
+		/* Construct and register vhost device */
+		ret = vhost_rdma_construct(dev, sock_path, i);
+		if (ret < 0) {
+			RDMA_LOG_ERR("Failed to construct vhost device %d\n", i);
+			continue;
+		}
+
+		ret = rte_vhost_driver_start(sock_path);
+		if (ret < 0) {
+			RDMA_LOG_ERR("Failed to start vhost driver for %s\n", sock_path);
+		} else {
+			RDMA_LOG_INFO("Successfully started vhost driver: %s\n", sock_path);
+		}
+	}
+
+	/* Wait for all worker threads to complete (they won't unless forced) */
+	RTE_LCORE_FOREACH_WORKER(lcore_id) {
+		rte_eal_wait_lcore(lcore_id);
+	}
+
+	vhost_rdma_main_loop(NULL);
+
+	/* Cleanup */
+	rte_eal_cleanup();
+	free(socket_path);
+
+	RDMA_LOG_INFO("Application terminated gracefully.\n");
+	return 0;
+}
diff --git a/examples/vhost_user_rdma/meson.build b/examples/vhost_user_rdma/meson.build
new file mode 100644
index 0000000000..d6ccaf32a4
--- /dev/null
+++ b/examples/vhost_user_rdma/meson.build
@@ -0,0 +1,45 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2017 Intel Corporation
+
+# meson file, for building this example as part of a main DPDK build.
+#
+# To build this example as a standalone application with an already-installed
+# DPDK instance, use 'make'
+
+# The example is only built on Linux.
+if not is_linux
+ build = false
+ subdir_done()
+endif
+
+# DPDK libraries this example depends on.
+deps += ['vhost', 'timer']
+
+allow_experimental_apis = true
+
+# Extra compiler flags: several warning suppressions plus optimisation, debug
+# info and the DEBUG_RDMA* defines. Each one is added only if the compiler
+# accepts it (see the loop below).
+cflags_options = [
+ '-std=c11',
+ '-Wno-strict-prototypes',
+ '-Wno-pointer-arith',
+ '-Wno-maybe-uninitialized',
+ '-Wno-discarded-qualifiers',
+ '-Wno-old-style-definition',
+ '-Wno-sign-compare',
+ '-Wno-stringop-overflow',
+ '-O3',
+ '-g',
+ '-DALLOW_EXPERIMENTAL_API',
+ '-DDEBUG_RDMA',
+ '-DDEBUG_RDMA_DP',
+]
+
+foreach option:cflags_options
+ if cc.has_argument(option)
+ cflags += option
+ endif
+endforeach
+
+# Translation units that make up the example binary.
+sources = files(
+ 'main.c',
+ 'vhost_rdma.c',
+ 'vhost_rdma_ib.c',
+)
+
diff --git a/examples/vhost_user_rdma/vhost_rdma.c b/examples/vhost_user_rdma/vhost_rdma.c
new file mode 100644
index 0000000000..2cf47a6baa
--- /dev/null
+++ b/examples/vhost_user_rdma/vhost_rdma.c
@@ -0,0 +1,697 @@
+/*
+ * Vhost-user RDMA device : init and packets forwarding
+ *
+ * Copyright (C) 2025 KylinSoft Inc. and/or its affiliates. All rights reserved.
+ *
+ * Author: Xiong Weimin <xiongweimin@kylinos.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+#include <unistd.h>
+#include <stdlib.h>
+
+#include <rte_malloc.h>
+#include <rte_bitmap.h>
+#include <rte_common.h>
+#include <rte_ring.h>
+#include <rte_vhost.h>
+#include <rte_malloc.h>
+
+#include "vhost_rdma.h"
+#include "vhost_rdma_ib.h"
+#include "vhost_rdma_pkt.h"
+#include "vhost_rdma_log.h"
+
+#define VHOST_MAX_DEVICES 32
+
+struct vhost_rdma_device g_vhost_rdma_dev[MAX_VHOST_RDMA_DEV_NUM];
+struct vhost_rdma_net_dev g_vhost_rdma_net_dev[MAX_VHOST_RDMA_DEV_NUM];
+
+/**
+ * @brief Turn on the vhost-user protocol features the RDMA device needs.
+ *
+ * Reads the protocol feature mask currently set for @p path (falling back
+ * to 0 if it cannot be read), ORs in VHOST_USER_PROTOCOL_F_CONFIG and
+ * VHOST_USER_PROTOCOL_F_MQ, and writes the mask back. Failures are only
+ * logged.
+ *
+ * @param path Socket path the vhost driver was registered with
+ */
+static void
+vhost_rdma_install_rte_compat_hooks(const char *path)
+{
+	/* CONFIG: GET/SET_CONFIG messages; MQ: multi-queue support */
+	const uint64_t required = (1ULL << VHOST_USER_PROTOCOL_F_CONFIG) |
+				  (1ULL << VHOST_USER_PROTOCOL_F_MQ);
+	uint64_t features = 0;
+	int rc;
+
+	if (path == NULL) {
+		RDMA_LOG_ERR("Invalid path parameter");
+		return;
+	}
+
+	rc = rte_vhost_driver_get_protocol_features(path, &features);
+	if (rc < 0) {
+		RDMA_LOG_DEBUG("Failed to get protocol features for %s, assuming 0", path);
+		features = 0;
+	}
+
+	features |= required;
+
+	rc = rte_vhost_driver_set_protocol_features(path, features);
+	if (rc >= 0) {
+		RDMA_LOG_DEBUG("Enabled CONFIG and MQ features for %s", path);
+		return;
+	}
+
+	RDMA_LOG_ERR("Failed to set protocol features on %s", path);
+}
+
+/**
+ * @brief Bind a queue array to the net device slot @p idx.
+ *
+ * Stores @p queues in g_vhost_rdma_net_dev[idx] and marks the slot as not
+ * started. Invalid arguments are logged and ignored.
+ *
+ * @param queues Array of vhost-user queues (must not be NULL)
+ * @param idx Device index, 0 <= idx < number of entries in
+ *            g_vhost_rdma_net_dev
+ */
+void
+vhost_rdma_net_construct(struct vhost_user_queue *queues, int idx)
+{
+	/*
+	 * Bound the index by the real table size rather than by an unrelated
+	 * constant, so the check stays correct if the table is resized.
+	 */
+	if (idx < 0 || idx >= (int)RTE_DIM(g_vhost_rdma_net_dev)) {
+		RDMA_LOG_ERR("Invalid device index: %d", idx);
+		return;
+	}
+
+	if (!queues) {
+		RDMA_LOG_ERR("NULL queues pointer for device %d", idx);
+		return;
+	}
+
+	g_vhost_rdma_net_dev[idx].queues = queues;
+	g_vhost_rdma_net_dev[idx].started = false;
+
+	/* %p requires a void pointer argument. */
+	RDMA_LOG_DEBUG("Net device %d constructed with queues=%p", idx, (void *)queues);
+}
+
+/**
+ * @brief Initialize an object pool with bitmap-based allocation tracking.
+ *
+ * Allocates zeroed, contiguous storage for the objects and an rte_bitmap
+ * in which a SET bit means "slot free". When @p start_zero is false one
+ * extra slot is allocated and index 0 is marked as used, so it is never
+ * handed out.
+ *
+ * On failure nothing is leaked and @p pool is left unmodified.
+ *
+ * @param pool [out] Pool structure to initialize
+ * @param name Name used in memory allocation (can be NULL)
+ * @param num Number of usable objects to allocate
+ * @param size Size of each object in bytes
+ * @param start_zero If true, index 0 is usable; else reserved
+ * @param cleanup Optional callback invoked by vhost_rdma_pool_free()
+ *                (can be NULL)
+ *
+ * @return 0 on success, -1 on failure
+ */
+int
+vhost_rdma_pool_init(struct vhost_rdma_pool *pool,
+		     const char *name,
+		     uint32_t num,
+		     uint32_t size,
+		     bool start_zero,
+		     void (*cleanup)(void *))
+{
+	const char *pool_name = name ? name : "vhost_rdma_pool";
+	struct rte_bitmap *bmp;
+	uint32_t actual_num;
+	uint32_t bmp_size;
+	uint32_t i;
+	void *objs;
+	void *mem;
+
+	if (!pool || num == 0 || size == 0) {
+		RDMA_LOG_ERR("Invalid parameters: pool=%p, num=%u, size=%u",
+			     (void *)pool, num, size);
+		return -1;
+	}
+
+	/* Reserving index 0 needs one extra slot; refuse if that would wrap. */
+	if (!start_zero && num == UINT32_MAX) {
+		RDMA_LOG_ERR("Invalid parameters: pool=%p, num=%u, size=%u",
+			     (void *)pool, num, size);
+		return -1;
+	}
+	actual_num = start_zero ? num : num + 1;
+
+	/* Allocate object storage; size_t math avoids 32-bit overflow. */
+	objs = rte_zmalloc(pool_name, (size_t)actual_num * size, RTE_CACHE_LINE_SIZE);
+	if (!objs) {
+		RDMA_LOG_ERR("Failed to allocate %u * %u bytes for objects", actual_num, size);
+		goto err_objs;
+	}
+
+	/* Allocate bitmap metadata */
+	bmp_size = rte_bitmap_get_memory_footprint(actual_num);
+	mem = rte_zmalloc(pool_name, bmp_size, RTE_CACHE_LINE_SIZE);
+	if (!mem) {
+		RDMA_LOG_ERR("Failed to allocate %u bytes for bitmap", bmp_size);
+		goto err_bmp_mem;
+	}
+
+	/* Initialize bitmap */
+	bmp = rte_bitmap_init(actual_num, mem, bmp_size);
+	if (!bmp) {
+		RDMA_LOG_ERR("Failed to init bitmap with %u bits", actual_num);
+		goto err_bmp_init;
+	}
+
+	/* Mark all slots as FREE (bitmap: SET = free) */
+	for (i = 0; i < actual_num; i++) {
+		rte_bitmap_set(bmp, i);
+	}
+
+	/* Reserve index 0 if not starting from zero */
+	if (!start_zero) {
+		rte_bitmap_clear(bmp, 0); /* Now allocated/reserved */
+	}
+
+	/* Publish the pool only once everything has succeeded */
+	pool->objs = objs;
+	pool->bitmap = bmp;
+	pool->bitmap_mem = mem;
+	pool->num = actual_num;
+	pool->size = size;
+	pool->cleanup = cleanup;
+
+	RDMA_LOG_DEBUG("Pool '%s' initialized: %u entries, obj_size=%u, start_zero=%d",
+		       pool_name, actual_num, size, start_zero);
+
+	return 0;
+
+err_bmp_init:
+	rte_free(mem);
+err_bmp_mem:
+	rte_free(objs);
+err_objs:
+	return -1;
+}
+
+/**
+ * @brief Get pointer to object at given index if it is currently allocated.
+ *
+ * Performs no locking of its own.
+ *
+ * @param pool Pool instance
+ * @param idx Object index
+ * @return Pointer to the object if its slot is marked allocated; NULL if
+ *         the slot is free, the index is out of bounds, or the pool is
+ *         NULL or has no bitmap
+ */
+void *
+vhost_rdma_pool_get(struct vhost_rdma_pool *pool, uint32_t idx)
+{
+	if (!pool || !pool->bitmap || idx >= pool->num) {
+		RDMA_LOG_DEBUG("Invalid pool or index: pool=%p, idx=%u, num=%u",
+			       (void *)pool, idx, pool ? pool->num : 0);
+		return NULL;
+	}
+
+	/* Bitmap: SET = free, CLEAR = allocated */
+	if (rte_bitmap_get(pool->bitmap, idx)) {
+		RDMA_LOG_DEBUG("Object at index %u is free, cannot get", idx);
+		return NULL;
+	}
+
+	/* size_t math: idx * size may not fit in 32 bits for large pools. */
+	return RTE_PTR_ADD(pool->objs, (size_t)idx * pool->size);
+}
+
+/**
+ * @brief Allocate a new object from the pool.
+ *
+ * Restarts the bitmap scan, takes the lowest set (free) bit of the first
+ * non-empty slab, zeroes the object, clears the bit (marks it used) and
+ * returns a pointer to the object. Performs no locking of its own.
+ *
+ * @param pool Pool to allocate from
+ * @param idx [out] Assigned index (optional, pass NULL if not needed)
+ * @return Pointer to allocated object, or NULL if the pool is NULL, has
+ *         no bitmap/storage, or has no free slot
+ */
+void *
+vhost_rdma_pool_alloc(struct vhost_rdma_pool *pool, uint32_t *idx)
+{
+	uint32_t pos = 0;
+	uint64_t slab = 0;
+	uint32_t allocated_idx;
+	void *obj;
+
+	if (!pool) {
+		RDMA_LOG_ERR("NULL pool");
+		return NULL;
+	}
+
+	/* A destroyed or never-initialized pool has nothing to scan. */
+	if (!pool->bitmap || !pool->objs) {
+		RDMA_LOG_DEBUG("No free objects in pool");
+		return NULL;
+	}
+
+	/* Rewind the scan so each allocation starts from the first slab. */
+	__rte_bitmap_scan_init(pool->bitmap);
+	if (!rte_bitmap_scan(pool->bitmap, &pos, &slab)) {
+		RDMA_LOG_DEBUG("No free objects in pool");
+		return NULL;
+	}
+
+	/* pos is combined with the lowest set bit of the returned slab. */
+	allocated_idx = pos + __builtin_ctzll(slab);
+	obj = RTE_PTR_ADD(pool->objs, (size_t)allocated_idx * pool->size);
+
+	/* Zero-initialize new object */
+	memset(obj, 0, pool->size);
+
+	/* Mark as allocated */
+	rte_bitmap_clear(pool->bitmap, allocated_idx);
+
+	if (idx) {
+		*idx = allocated_idx;
+	}
+
+	RDMA_LOG_DEBUG("Allocated object at index %u", allocated_idx);
+	return obj;
+}
+
+/**
+ * @brief Return the object at @p idx to the pool.
+ *
+ * If the slot is currently allocated, the pool's cleanup callback (when
+ * set) is invoked on the object and the slot's bitmap bit is set again,
+ * marking it free. A slot that is already free is left untouched.
+ * Performs no locking of its own.
+ *
+ * @param pool Pool containing the object
+ * @param idx Index of object to free
+ */
+void
+vhost_rdma_pool_free(struct vhost_rdma_pool *pool, uint32_t idx)
+{
+	void *victim;
+
+	if (pool != NULL && idx < pool->num) {
+		victim = vhost_rdma_pool_get(pool, idx);
+		if (victim == NULL) {
+			/* Nothing to do: the slot is not allocated */
+			RDMA_LOG_DEBUG("Index %u already free, skipping", idx);
+			return;
+		}
+
+		/* Let the owner release per-object state first */
+		if (pool->cleanup != NULL) {
+			pool->cleanup(victim);
+		}
+
+		/* SET bit == free slot */
+		rte_bitmap_set(pool->bitmap, idx);
+
+		RDMA_LOG_DEBUG("Freed object at index %u", idx);
+		return;
+	}
+
+	RDMA_LOG_ERR("Invalid pool or index: pool=%p, idx=%u", pool, idx);
+}
+
+/**
+ * @brief Tear down a pool and release its memory.
+ *
+ * Frees the bitmap memory and the object storage, then resets every field
+ * of @p pool. The cleanup callback is NOT invoked for objects that are
+ * still allocated; the caller must make sure none are in use.
+ *
+ * @param pool Pool to destroy (NULL is ignored)
+ */
+void
+vhost_rdma_pool_destroy(struct vhost_rdma_pool *pool)
+{
+	if (pool == NULL)
+		return;
+
+	if (pool->bitmap_mem != NULL) {
+		/*
+		 * NOTE(review): the backing memory is released by the
+		 * rte_free() below; what rte_bitmap_free() itself releases is
+		 * not visible from here - confirm against rte_bitmap.h.
+		 */
+		rte_bitmap_free(pool->bitmap);
+		rte_free(pool->bitmap_mem);
+		pool->bitmap = NULL;
+		pool->bitmap_mem = NULL;
+	}
+
+	if (pool->objs != NULL) {
+		rte_free(pool->objs);
+		pool->objs = NULL;
+	}
+
+	pool->cleanup = NULL;
+	pool->size = 0;
+	pool->num = 0;
+
+	RDMA_LOG_DEBUG("Pool destroyed");
+}
+
+/**
+ * @brief Set up the vhost-user network backend for a given device.
+ *
+ * Records the negotiated features, derives the virtio-net header length
+ * from them, fetches the guest memory table, disables guest notification
+ * on the RX and TX queues and finally marks the net device as started.
+ *
+ * The function returns early (leaving `started` false) if the negotiated
+ * features or the memory table cannot be obtained; callers can inspect
+ * g_vhost_rdma_net_dev[vid].started to detect that.
+ *
+ * @param vid Vhost device ID (from rte_vhost driver), used as index into
+ *            g_vhost_rdma_net_dev
+ */
+void
+vs_vhost_rdma_net_setup(int vid)
+{
+	struct vhost_rdma_net_dev *dev;
+	uint64_t negotiated_features = 0;
+	int ret;
+
+	/* Validate input against the real size of the device table */
+	if (vid < 0 || vid >= (int)RTE_DIM(g_vhost_rdma_net_dev)) {
+		RDMA_LOG_ERR("Invalid vhost device ID: %d", vid);
+		return;
+	}
+
+	dev = &g_vhost_rdma_net_dev[vid];
+
+	/* Initialize device context */
+	dev->vid = vid;
+	dev->started = false;
+
+	/* Step 1: Get negotiated VirtIO features */
+	if (rte_vhost_get_negotiated_features(vid, &negotiated_features) < 0) {
+		RDMA_LOG_ERR("Failed to get negotiated features for vid=%d", vid);
+		return;
+	}
+	dev->features = negotiated_features;
+
+	/* Step 2: Determine virtio-net header size based on features */
+	if (negotiated_features & ((1ULL << VIRTIO_NET_F_MRG_RXBUF) |
+				   (1ULL << VIRTIO_F_VERSION_1))) {
+		dev->hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+		RDMA_LOG_DEBUG("Using merged RX buffer header (size=%zu) for vid=%d",
+			       dev->hdr_len, vid);
+	} else {
+		dev->hdr_len = sizeof(struct virtio_net_hdr);
+		RDMA_LOG_DEBUG("Using standard net header (size=%zu) for vid=%d",
+			       dev->hdr_len, vid);
+	}
+
+	/*
+	 * Step 3: Get guest memory table.
+	 * A table left over from a previous session is released first. The
+	 * vhost library allocates it with malloc(), so it must be released
+	 * with free(), not rte_free().
+	 */
+	free(dev->mem);
+	dev->mem = NULL;
+
+	ret = rte_vhost_get_mem_table(vid, &dev->mem);
+	if (ret < 0 || dev->mem == NULL) {
+		RDMA_LOG_ERR("Failed to retrieve guest memory layout for vid=%d", vid);
+		return;
+	}
+
+	RDMA_LOG_INFO("Guest memory table acquired: %u regions mapped", dev->mem->nregions);
+
+	/* Step 4: Disable guest notification during initial setup */
+	ret = rte_vhost_enable_guest_notification(vid, VHOST_NET_RXQ, 0);
+	if (ret < 0) {
+		RDMA_LOG_ERR("Failed to disable RX queue kick suppression for vid=%d", vid);
+	}
+
+	ret = rte_vhost_enable_guest_notification(vid, VHOST_NET_TXQ, 0);
+	if (ret < 0) {
+		RDMA_LOG_ERR("Failed to disable TX queue kick suppression for vid=%d", vid);
+	}
+
+	/* Final step: Mark device as ready */
+	dev->started = true;
+
+	RDMA_LOG_INFO("vhost-user net device vid=%d setup completed successfully", vid);
+}
+
+/**
+ * @brief Callback: a vhost-user device has been negotiated and is ready.
+ *
+ * Runs the network-layer setup for @p vid and, if that succeeded, marks
+ * the matching entry of g_vhost_rdma_dev as started. A device that is
+ * already started is left untouched.
+ *
+ * NOTE(review): @p vid is used directly as the index into the device
+ * tables; this presumes the vhost library's vid matches the socket index
+ * used when the device was constructed - confirm against the caller.
+ *
+ * @param vid Vhost device identifier assigned by rte_vhost
+ * @return 0 on success, -1 if @p vid is out of range or the network setup
+ *         failed
+ */
+static int
+vhost_rdma_new_device(int vid)
+{
+	struct vhost_rdma_device *dev;
+
+	/* Validate device ID against the real size of the device table */
+	if (vid < 0 || vid >= (int)RTE_DIM(g_vhost_rdma_dev)) {
+		RDMA_LOG_ERR("Invalid vhost device ID: %d", vid);
+		return -1;
+	}
+
+	dev = &g_vhost_rdma_dev[vid];
+
+	/* Avoid re-initializing an already started device */
+	if (dev->started) {
+		RDMA_LOG_DEBUG("Device vid=%d already started, skipping initialization", vid);
+		return 0;
+	}
+
+	/* Setup network layer: features, header size, memory table */
+	vs_vhost_rdma_net_setup(vid);
+
+	/*
+	 * The setup routine returns void; it reports success by setting the
+	 * net device's started flag. Do not bring the RDMA device up on top
+	 * of a backend without features or a guest memory table.
+	 */
+	if (!g_vhost_rdma_net_dev[vid].started) {
+		RDMA_LOG_ERR("Network setup failed for vid=%d", vid);
+		return -1;
+	}
+
+	/* Finalize device state */
+	dev->vid = vid;
+	dev->started = true;
+	dev->stopped = false;
+
+	RDMA_LOG_INFO("New vhost-RDMA device created: vid=%d", vid);
+	return 0;
+}
+
+/**
+ * @brief Release the guest memory table held by the net-device slot of @p vid.
+ *
+ * The table comes from rte_vhost_get_mem_table(), which allocates it with the
+ * C library allocator; it therefore has to be released with free(), not
+ * rte_free().  The pointer is cleared afterwards, so calling this function
+ * again for the same device is harmless.
+ *
+ * @param vid Device ID; index into g_vhost_rdma_net_dev[].
+ */
+static void
+vs_vhost_rdma_net_remove(int vid)
+{
+	struct vhost_rdma_net_dev *net_dev;
+
+	/* g_vhost_rdma_net_dev[] has MAX_VHOST_RDMA_DEV_NUM entries. */
+	if (vid < 0 || vid >= MAX_VHOST_RDMA_DEV_NUM) {
+		RDMA_LOG_ERR("Invalid device ID in net_remove: %d", vid);
+		return;
+	}
+
+	net_dev = &g_vhost_rdma_net_dev[vid];
+
+	if (net_dev->mem == NULL) {
+		RDMA_LOG_DEBUG("No memory table to free for vid=%d", vid);
+		return;
+	}
+
+	RDMA_LOG_DEBUG("Freeing guest memory table for vid=%d", vid);
+	free(net_dev->mem);
+	net_dev->mem = NULL;
+}
+
+/**
+ * @brief vhost callback: tear down the vhost-RDMA device identified by @p vid.
+ *
+ * Sequence:
+ *  1. mark the device as stopped and wait until dev->inuse drops to zero;
+ *  2. unregister the control-queue interrupt callback, so it can no longer
+ *     run against state released below and can be re-armed on reconnect;
+ *  3. release the net-device memory table and the IB resources;
+ *  4. hand the last avail/used indices of every enabled vring back to the
+ *     vhost library and mark the vring disabled;
+ *  5. release the device's own guest memory table.
+ *
+ * Nothing is done for an out-of-range @p vid or a device that was not started.
+ *
+ * @param vid Vhost device ID; index into g_vhost_rdma_dev[].
+ */
+static void
+vhost_rdma_destroy_device(int vid)
+{
+	struct vhost_rdma_device *dev;
+	struct vhost_user_queue *vq;
+	int ret;
+
+	/* g_vhost_rdma_dev[] has MAX_VHOST_RDMA_DEV_NUM entries. */
+	if (vid < 0 || vid >= MAX_VHOST_RDMA_DEV_NUM) {
+		RDMA_LOG_ERR("Attempted to destroy invalid device ID: %d", vid);
+		return;
+	}
+
+	dev = &g_vhost_rdma_dev[vid];
+
+	if (!dev->started) {
+		RDMA_LOG_DEBUG("Device vid=%d not started, nothing to destroy", vid);
+		return;
+	}
+
+	/* Mark device as stopping */
+	dev->started = false;
+	dev->stopped = true;
+
+	RDMA_LOG_INFO("Destroying vhost-RDMA device: vid=%d", vid);
+
+	/*
+	 * Wait until nobody uses the device any more.  rte_lcore_id() yields
+	 * LCORE_ID_ANY on a thread without an lcore id; such a thread backs
+	 * off with a delay, an lcore thread only pauses.
+	 */
+	while (dev->inuse > 0) {
+		if (rte_lcore_id() != LCORE_ID_ANY)
+			rte_pause();
+		else
+			rte_delay_us_block(100);
+	}
+
+	/*
+	 * The ctrl-vq handler is registered in vring_state_changed() and uses
+	 * dev->mem; drop it before that memory goes away.  Clearing the flag
+	 * lets the next connection register it again.
+	 */
+	if (dev->ctrl_intr_registered) {
+		ret = rte_intr_callback_unregister(&dev->ctrl_intr_handle,
+						   vhost_rdma_handle_ctrl_vq, dev);
+		if (ret < 0)
+			RDMA_LOG_ERR("Failed to unregister ctrl vq interrupt for vid=%d (ret=%d)",
+				     vid, ret);
+		else
+			dev->ctrl_intr_registered = 0;
+	}
+
+	/* Step 1: Remove from network subsystem */
+	vs_vhost_rdma_net_remove(vid);
+
+	/* Step 2: Destroy InfiniBand/RDMA components (QP, CQ, MR cleanup) */
+	vhost_rdma_destroy_ib(dev);
+
+	/* Step 3: Persist vring indices before shutdown */
+	for (int i = 0; i < NUM_VHOST_QUEUES; i++) {
+		vq = &dev->vqs[i];
+
+		if (!vq->enabled)
+			continue;
+
+		ret = rte_vhost_set_vring_base(dev->vid, i,
+					       vq->last_avail_idx,
+					       vq->last_used_idx);
+		if (ret < 0)
+			RDMA_LOG_ERR("Failed to save vring base for queue %d", i);
+
+		vq->enabled = false;
+		RDMA_LOG_DEBUG("Disabled vring %d", i);
+	}
+
+	/*
+	 * Step 4: Free per-device memory table (if any).  It was obtained from
+	 * rte_vhost_get_mem_table(), which uses malloc(), hence free().
+	 */
+	if (dev->mem) {
+		RDMA_LOG_DEBUG("Freeing device memory table for vid=%d", vid);
+		free(dev->mem);
+		dev->mem = NULL;
+	}
+
+	RDMA_LOG_INFO("vhost-RDMA device destroyed successfully: vid=%d", vid);
+}
+
+/**
+ * @brief Pre-processing hook for vhost-user messages.
+ *
+ * Only VHOST_USER_GET_CONFIG is answered here: the device's rdma_config is
+ * copied into the message's config region and the library is told to send the
+ * reply.  Every other request is left to the vhost library.
+ *
+ * @param vid  Vhost device ID; index into g_vhost_rdma_dev[].
+ * @param _msg The vhost-user message, viewed as struct vhost_user_rdma_msg.
+ * @return RTE_VHOST_MSG_RESULT_REPLY for GET_CONFIG,
+ *         RTE_VHOST_MSG_RESULT_NOT_HANDLED otherwise (including an
+ *         out-of-range @p vid or a NULL message).
+ */
+static enum rte_vhost_msg_result extern_vhost_pre_msg_handler(int vid, void *_msg)
+{
+	struct vhost_user_rdma_msg *msg = _msg;
+	struct vhost_rdma_device *dev;
+
+	/* vid indexes g_vhost_rdma_dev[], which has MAX_VHOST_RDMA_DEV_NUM entries. */
+	if (msg == NULL || vid < 0 || vid >= MAX_VHOST_RDMA_DEV_NUM) {
+		RDMA_LOG_ERR("Ignoring vhost-user message for invalid vid=%d", vid);
+		return RTE_VHOST_MSG_RESULT_NOT_HANDLED;
+	}
+
+	dev = &g_vhost_rdma_dev[vid];
+
+	switch ((int)msg->request) {
+	case VHOST_USER_GET_CONFIG:
+		/* NOTE(review): the copy assumes the region holds sizeof(rdma_config) bytes - confirm. */
+		rte_memcpy(msg->payload.cfg.region, &dev->rdma_config, sizeof(dev->rdma_config));
+		return RTE_VHOST_MSG_RESULT_REPLY;
+	case VHOST_USER_GET_VRING_BASE:
+	case VHOST_USER_SET_VRING_BASE:
+	case VHOST_USER_SET_VRING_ADDR:
+	case VHOST_USER_SET_VRING_NUM:
+	case VHOST_USER_SET_VRING_KICK:
+	case VHOST_USER_SET_VRING_CALL:
+	case VHOST_USER_SET_MEM_TABLE:
+	case VHOST_USER_SET_CONFIG:
+	default:
+		/* Nothing to do before the library handles these. */
+		break;
+	}
+
+	return RTE_VHOST_MSG_RESULT_NOT_HANDLED;
+}
+
+/*
+ * Extern vhost-user message hooks.  Registered for each connection by
+ * vhost_rdma_new_connection() via rte_vhost_extern_callback_register();
+ * only the pre-message hook is set.
+ */
+struct rte_vhost_user_extern_ops g_extern_vhost_ops = {
+ .pre_msg_handle = extern_vhost_pre_msg_handler,
+};
+
+/**
+ * @brief vhost callback: a front-end connected to the socket.
+ *
+ * Installs the extern message hooks (g_extern_vhost_ops) for the connection
+ * and stores @p vid in the matching g_vhost_rdma_dev[] slot.
+ *
+ * @param vid Vhost device ID; index into g_vhost_rdma_dev[].
+ * @return 0 on success, -1 for an out-of-range @p vid, otherwise the non-zero
+ *         result of rte_vhost_extern_callback_register().
+ */
+static int vhost_rdma_new_connection(int vid)
+{
+	int ret;
+
+	/* Reject ids that do not fit the device table before indexing it. */
+	if (vid < 0 || vid >= MAX_VHOST_RDMA_DEV_NUM) {
+		RDMA_LOG_ERR("Invalid vhost device ID in new_connection: %d", vid);
+		return -1;
+	}
+
+	ret = rte_vhost_extern_callback_register(vid, &g_extern_vhost_ops, NULL);
+	if (ret != 0)
+		RDMA_LOG_ERR(
+			"rte_vhost_extern_callback_register failed for vid = %d\n",
+			vid);
+
+	/* Recorded even on failure, as before; the caller sees ret. */
+	g_vhost_rdma_dev[vid].vid = vid;
+	return ret;
+}
+
+/**
+ * @brief vhost callback: a virtqueue was enabled or disabled.
+ *
+ * On enable, the vring description and its avail/used base indices are
+ * fetched from the vhost library and cached in dev->vqs[queue_id].  When the
+ * control queue is enabled for the first time, the guest memory table is
+ * fetched and vhost_rdma_handle_ctrl_vq() is registered on the queue's kick
+ * fd.  A disable request is accepted without any action.
+ *
+ * The library calls are made unconditionally (not inside assert()), so they
+ * are still executed when the build defines NDEBUG.
+ *
+ * @param vid      Vhost device ID; index into g_vhost_rdma_dev[].
+ * @param queue_id Virtqueue index; must be below NUM_VHOST_QUEUES.
+ * @param enable   Non-zero when the queue is being enabled.
+ * @return 0 on success or when there is nothing to do, -1 on invalid
+ *         arguments or when a vhost/interrupt call fails.
+ */
+static int vhost_rdma_vring_state_changed(int vid, uint16_t queue_id, int enable)
+{
+	struct vhost_rdma_device *dev;
+	struct vhost_user_queue *vq;
+	int ret;
+
+	if (vid < 0 || vid >= MAX_VHOST_RDMA_DEV_NUM) {
+		RDMA_LOG_ERR("Invalid vhost device ID in vring_state_changed: %d", vid);
+		return -1;
+	}
+	if (queue_id >= NUM_VHOST_QUEUES) {
+		RDMA_LOG_ERR("Invalid queue %u for vid=%d", (unsigned int)queue_id, vid);
+		return -1;
+	}
+
+	dev = &g_vhost_rdma_dev[vid];
+
+	/* Pure invariant check: the slot was bound to vid in new_connection(). */
+	assert(dev->vid == vid);
+
+	if (!enable)
+		return 0;
+
+	vq = &dev->vqs[queue_id];
+	if (vq->enabled)
+		return 0;
+
+	vq->id = queue_id;
+
+	ret = rte_vhost_get_vhost_vring(dev->vid, queue_id, &vq->vring);
+	if (ret != 0) {
+		RDMA_LOG_ERR("Failed to get vring %u for vid=%d", (unsigned int)queue_id, vid);
+		return -1;
+	}
+
+	ret = rte_vhost_get_vring_base(dev->vid, queue_id,
+				       &vq->last_avail_idx,
+				       &vq->last_used_idx);
+	if (ret != 0) {
+		RDMA_LOG_ERR("Failed to get vring base %u for vid=%d", (unsigned int)queue_id, vid);
+		return -1;
+	}
+
+	vq->enabled = true;
+
+	/*
+	 * ctrl_handler MUST start when the virtqueue is enabled,
+	 * NOT start in new_device(). because driver will query some
+	 * informations through ctrl vq in ib_register_device() when
+	 * the device is not enabled.
+	 */
+	if (queue_id != VHOST_NET_ROCE_CTRL_QUEUE || dev->ctrl_intr_registered)
+		return 0;
+
+	/* Drop a stale table (malloc'ed by the vhost library) before refetching. */
+	free(dev->mem);
+	dev->mem = NULL;
+
+	ret = rte_vhost_get_mem_table(vid, &dev->mem);
+	if (ret != 0 || dev->mem == NULL) {
+		RDMA_LOG_ERR("Failed to get guest memory table for vid=%d", vid);
+		dev->mem = NULL;
+		vq->enabled = false;
+		return -1;
+	}
+
+	dev->ctrl_intr_handle.fd = dev->vqs[VHOST_NET_ROCE_CTRL_QUEUE].vring.kickfd;
+	dev->ctrl_intr_handle.type = RTE_INTR_HANDLE_EXT;
+	ret = rte_intr_callback_register(&dev->ctrl_intr_handle,
+					 vhost_rdma_handle_ctrl_vq, dev);
+	if (ret != 0) {
+		RDMA_LOG_ERR("Failed to register ctrl vq interrupt for vid=%d (ret=%d)", vid, ret);
+		free(dev->mem);
+		dev->mem = NULL;
+		vq->enabled = false;
+		return -1;
+	}
+	dev->ctrl_intr_registered = 1;
+
+	return 0;
+}
+
+/*
+ * vhost device callbacks; handed to rte_vhost_driver_callback_register() by
+ * vhost_rdma_construct().
+ */
+static const struct rte_vhost_device_ops vhost_rdma_device_ops = {
+ .new_device = vhost_rdma_new_device,
+ .destroy_device = vhost_rdma_destroy_device,
+ .new_connection = vhost_rdma_new_connection,
+ .vring_state_changed = vhost_rdma_vring_state_changed,
+};
+
+/**
+ * @brief Register a vhost-user socket and prepare the RDMA device behind it.
+ *
+ * Steps: remove a stale socket file, register the vhost driver for @p path,
+ * set the feature bits, install the compat hooks and the device callbacks,
+ * then initialise the queue shortcuts, the net side, the IB side, the port
+ * lock and the statistics counters.  If the device has a TX ring, the task
+ * scheduler is launched on the worker lcores.
+ *
+ * The device callbacks are registered before any device state is built so a
+ * registration failure needs no teardown beyond unregistering the driver.
+ *
+ * @param dev  Device context to initialise.
+ * @param path Path of the vhost-user socket.
+ * @param idx  Device index, passed on to vhost_rdma_net_construct().
+ * @return 0 on success, -1 on NULL arguments, otherwise the non-zero result
+ *         of the failing rte_vhost_driver_* call.
+ */
+int vhost_rdma_construct(struct vhost_rdma_device *dev, const char *path, int idx)
+{
+	int ret;
+
+	if (dev == NULL || path == NULL) {
+		RDMA_LOG_ERR("vhost_rdma_construct: invalid arguments");
+		return -1;
+	}
+
+	/* Best effort: a leftover socket file would make registration fail. */
+	unlink(path);
+
+	ret = rte_vhost_driver_register(path, 0);
+	if (ret != 0) {
+		RDMA_LOG_ERR("Failed to register vhost driver for socket %s (ret=%d)", path, ret);
+		return ret;
+	}
+
+	ret = rte_vhost_driver_set_features(path, VHOST_RDMA_FEATURE);
+	if (ret != 0) {
+		RDMA_LOG_ERR("Set vhost driver features failed\n");
+		rte_vhost_driver_unregister(path);
+		return ret;
+	}
+
+	dev->stopped = false;
+	dev->inuse = 0;
+
+	/* set vhost user protocol features */
+	vhost_rdma_install_rte_compat_hooks(path);
+
+	ret = rte_vhost_driver_callback_register(path, &vhost_rdma_device_ops);
+	if (ret != 0) {
+		RDMA_LOG_ERR("Register vhost driver callbacks failed for socket %s (ret=%d)", path, ret);
+		rte_vhost_driver_unregister(path);
+		return ret;
+	}
+
+	/* vhost_rdma_init_ib() derives cq_vqs/qp_vqs from rdma_vqs: set it first. */
+	dev->rdma_vqs = &dev->vqs[VHOST_NET_ROCE_CTRL_QUEUE];
+
+	vhost_rdma_net_construct(dev->vqs, idx);
+
+	vhost_rdma_init_ib(dev);
+	rte_spinlock_init(&dev->port_lock);
+
+	for (int i = 0; i < VHOST_RDMA_NUM_OF_COUNTERS; i++)
+		rte_atomic64_init(&dev->stats_counters[i]);
+
+	if (dev->tx_ring) {
+		ret = rte_eal_mp_remote_launch(vhost_rdma_task_scheduler, dev, SKIP_MAIN);
+		/* Reported only; the return value of construct is unchanged. */
+		if (ret != 0)
+			RDMA_LOG_ERR("Failed to launch task scheduler for socket %s (ret=%d)", path, ret);
+	}
+
+	return 0;
+}
\ No newline at end of file
diff --git a/examples/vhost_user_rdma/vhost_rdma.h b/examples/vhost_user_rdma/vhost_rdma.h
new file mode 100644
index 0000000000..c1531d1a7a
--- /dev/null
+++ b/examples/vhost_user_rdma/vhost_rdma.h
@@ -0,0 +1,444 @@
+/*
+ * Vhost-user RDMA device : init and packets forwarding
+ *
+ * Copyright (C) 2025 KylinSoft Inc. and/or its affiliates. All rights reserved.
+ *
+ * Author: Xiong Weimin <xiongweimin@kylinos.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef VHOST_RDMA_H_
+#define VHOST_RDMA_H_
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <rte_byteorder.h>
+#include <rte_common.h>
+#include <rte_vhost.h>
+#include <rte_interrupts.h>
+#include <rte_atomic.h>
+#include <rte_spinlock.h>
+#include <rte_mempool.h>
+#include <rte_ring.h>
+#include <rte_bitmap.h>
+
+#include "vhost_rdma_ib.h"
+#include "eal_interrupts.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @brief Number of vhost queues.
+ *
+ * One CTRL VQ + 64 CQ + 64 TX + 64 RX event queues (1 + 3 * 64 = 193).
+ * This is the length of vhost_rdma_device::vqs[].
+ */
+#define NUM_VHOST_QUEUES 193
+
+/**
+ * @brief Maximum GID table length
+ */
+#define VHOST_MAX_GID_TBL_LEN 512
+
+/**
+ * @brief Port PKey table length (single entry for default)
+ */
+#define VHOST_PORT_PKEY_TBL_LEN 1
+
+/**
+ * @brief Number of RDMA ports supported (currently only one)
+ */
+#define NUM_OF_VHOST_RDMA_PORT 1
+
+
+/** @brief Number of entries in g_vhost_rdma_dev[] and g_vhost_rdma_net_dev[]. */
+#define MAX_VHOST_RDMA_DEV_NUM 16
+
+/** @brief Feature bit number for RoCE; part of VHOST_RDMA_FEATURE. */
+#define VIRTIO_NET_F_ROCE 48
+
+/** @brief Index of the control queue inside vhost_rdma_device::vqs[]. */
+#define VHOST_NET_ROCE_CTRL_QUEUE 0
+
+/**
+ * @brief GID type written to every GID table entry by vhost_rdma_init_ib().
+ *
+ * The spelling ("ILLIGAL") is kept because the name is referenced by code.
+ * VHOST_RDMA_GID_TYPE_INVALID in struct vhost_rdma_gid has the same value
+ * where unsigned int is 32 bits wide.
+ */
+#define VHOST_RDMA_GID_TYPE_ILLIGAL (-1u)
+
+/** @brief MTU enum value from which vhost_rdma_init_ib() derives mtu_cap. */
+#define DEFAULT_IB_MTU VHOST_RDMA_IB_MTU_1024
+
+/** @brief Queue ids passed to rte_vhost_enable_guest_notification() during net setup. */
+#define VHOST_NET_RXQ 0
+#define VHOST_NET_TXQ 1
+
+/*
+ * Feature mask passed to rte_vhost_driver_set_features().
+ * NOTE(review): VHOST_USER_PROTOCOL_F_STATUS looks like a vhost-user
+ * protocol-feature bit rather than a virtio feature bit - confirm it is meant
+ * to be part of this mask.
+ */
+/* VIRTIO_F_EVENT_IDX is NOT supported now */
+#define VHOST_RDMA_FEATURE ((1ULL << VIRTIO_F_VERSION_1) |\
+ (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | \
+ (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \
+ (1ULL << VHOST_USER_PROTOCOL_F_STATUS) | \
+ (1ULL << VIRTIO_NET_F_CTRL_VQ) | \
+ (1ULL << VIRTIO_NET_F_ROCE))
+
+/**
+ * @brief Round @p n up to the next power of two.
+ *
+ * 0 and 1 are returned unchanged.  Values above 2^31 have no 32-bit power of
+ * two at or above them; 0 is returned in that case instead of performing an
+ * undefined 32-bit shift by 32.
+ *
+ * Declared static like the other inline helpers of this header, so every
+ * translation unit that includes it gets its own copy.
+ *
+ * @param n Value to round up.
+ * @return Smallest power of two >= @p n, @p n itself for n < 2, or 0 when the
+ *         result does not fit in 32 bits.
+ */
+static __rte_always_inline uint32_t
+roundup_pow_of_two(uint32_t n)
+{
+	if (n < 2)
+		return n;
+
+	/* n - 1 >= 2^31 would make the shift count below equal to 32. */
+	if (n > (UINT32_C(1) << 31))
+		return 0;
+
+	return UINT32_C(1) << (32 - __builtin_clz(n - 1));
+}
+
+/**
+ * @brief Counter types for statistics in vhost RDMA device
+ *
+ * Each enumerator is an index into vhost_rdma_device::stats_counters[].
+ * VHOST_RDMA_NUM_OF_COUNTERS is not a counter: it is the length of that array
+ * and therefore has to stay the last enumerator.
+ */
+enum vhost_rdma_counters {
+ VHOST_RDMA_CNT_SENT_PKTS,
+ VHOST_RDMA_CNT_RCVD_PKTS,
+ VHOST_RDMA_CNT_DUP_REQ,
+ VHOST_RDMA_CNT_OUT_OF_SEQ_REQ,
+ VHOST_RDMA_CNT_RCV_RNR,
+ VHOST_RDMA_CNT_SND_RNR,
+ VHOST_RDMA_CNT_RCV_SEQ_ERR,
+ VHOST_RDMA_CNT_COMPLETER_SCHED,
+ VHOST_RDMA_CNT_RETRY_EXCEEDED,
+ VHOST_RDMA_CNT_RNR_RETRY_EXCEEDED,
+ VHOST_RDMA_CNT_COMP_RETRY,
+ VHOST_RDMA_CNT_SEND_ERR,
+ VHOST_RDMA_CNT_LINK_DOWNED,
+ VHOST_RDMA_CNT_RDMA_SEND,
+ VHOST_RDMA_CNT_RDMA_RECV,
+ VHOST_RDMA_NUM_OF_COUNTERS /* count sentinel, keep last */
+};
+
+/*
+ * Per-device state of the net side; one entry of g_vhost_rdma_net_dev[] per
+ * vhost device id.
+ * NOTE(review): the field notes below follow the visible tail of
+ * vs_vhost_rdma_net_setup(); confirm against the full function.
+ */
+struct vhost_rdma_net_dev {
+ int vid;
+ uint64_t features; /* negotiated feature bits */
+ size_t hdr_len; /* size of the virtio-net header in use */
+ bool started; /* set once net setup has completed */
+ struct rte_vhost_memory *mem; /* guest memory table; released in vs_vhost_rdma_net_remove() */
+ struct vhost_user_queue *queues; /* presumably set by vhost_rdma_net_construct() - TODO confirm */
+}__rte_cache_aligned;
+
+/*
+ * Host-side bookkeeping for one virtqueue of a vhost-RDMA device.
+ */
+struct vhost_user_queue {
+ struct rte_vhost_vring vring; /* filled by rte_vhost_get_vhost_vring() when the queue is enabled */
+ uint16_t last_avail_idx; /* next avail-ring position to consume; loaded/saved via the vring base calls */
+ uint16_t last_used_idx; /* loaded/saved together with last_avail_idx */
+ uint16_t id; /* queue index, as passed to rte_vhost_vring_call() */
+ bool enabled; /* true between enable and device destruction */
+};
+
+/**
+ * @brief Configuration structure exposed to guest via virtio config space
+ *
+ * All fields are in little-endian byte order.
+ *
+ * The whole structure is copied verbatim into the reply of
+ * VHOST_USER_GET_CONFIG, so its in-memory layout is what the guest sees.
+ * NOTE(review): the struct is not packed; on common 64-bit ABIs the compiler
+ * inserts padding after phys_port_cnt (and wherever a 32-bit field precedes a
+ * 64-bit one). Confirm the guest driver uses the same layout.
+ * NOTE(review): vhost_rdma_init_ib() stores host-order values here; that
+ * matches the little-endian statement only on little-endian hosts.
+ */
+struct vhost_rdma_config {
+ uint32_t phys_port_cnt; /**< Physical port count */
+ uint64_t sys_image_guid; /**< System image GUID */
+ uint32_t vendor_id; /**< Vendor ID */
+ uint32_t vendor_part_id; /**< Vendor part number */
+ uint32_t hw_ver; /**< Hardware version */
+ uint64_t max_mr_size; /**< Max memory region size */
+ uint64_t page_size_cap; /**< Page size capabilities */
+ uint32_t max_qp; /**< Max number of QPs */
+ uint32_t max_qp_wr; /**< Max work requests per QP */
+ uint64_t device_cap_flag; /**< Device capability flags */
+ uint32_t max_send_sge; /**< Max SGEs in send WR */
+ uint32_t max_recv_sge; /**< Max SGEs in recv WR */
+ uint32_t max_sge_rd; /**< Max SGEs for RD operations */
+ uint32_t max_cq; /**< Max completion queues */
+ uint32_t max_cqe; /**< Max entries per CQ */
+ uint32_t max_mr; /**< Max memory regions */
+ uint32_t max_pd; /**< Max protection domains */
+ uint32_t max_qp_rd_atom; /**< Max RDMA read-atoms per QP */
+ uint32_t max_res_rd_atom; /**< Max responder resources */
+ uint32_t max_qp_init_rd_atom; /**< Max initiator RD atoms */
+ uint32_t atomic_cap; /**< Atomic operation support */
+ uint32_t max_mw; /**< Max memory windows */
+ uint32_t max_mcast_grp; /**< Max multicast groups */
+ uint32_t max_mcast_qp_attach; /**< Max QPs per multicast group */
+ uint32_t max_total_mcast_qp_attach;/**< Total multicast attachments */
+ uint32_t max_ah; /**< Max address handles */
+ uint32_t max_fast_reg_page_list_len; /**< Fast registration page list len */
+ uint32_t max_pi_fast_reg_page_list_len; /**< PI fast reg list len */
+ uint16_t max_pkeys; /**< Max partition keys */
+ uint8_t local_ca_ack_delay; /**< Local CA ACK delay */
+ uint8_t reserved[5]; /* Pad to 8-byte alignment before variable area */
+ uint8_t reserved1[64]; /**< Reserved for future use */
+};
+
+/**
+ * @brief Device attributes (host-native format, not exposed directly)
+ *
+ * Filled with fixed values by vhost_rdma_init_ib(), which then copies most of
+ * them into the guest-visible struct vhost_rdma_config.  The max_* counts
+ * also size the resource pools created there and bound the loops in
+ * vhost_rdma_destroy_ib().
+ */
+struct vhost_rdma_dev_attr {
+ uint64_t max_mr_size;
+ uint64_t page_size_cap;
+ uint32_t hw_ver;
+ uint32_t max_qp_wr;
+ uint64_t device_cap_flags;
+ uint32_t max_qps; /* number of slots in qp_pool */
+ uint32_t max_cqs; /* number of slots in cq_pool; also the offset of qp_vqs behind cq_vqs */
+ uint32_t max_send_sge; /* also used to derive max_inline_data */
+ uint32_t max_recv_sge;
+ uint32_t max_sge_rd;
+ uint32_t max_cqe;
+ uint32_t max_mr; /* number of slots in mr_pool */
+ uint32_t max_mw;
+ uint32_t max_pd; /* number of slots in pd_pool */
+ uint32_t max_qp_rd_atom;
+ uint32_t max_qp_init_rd_atom;
+ uint32_t max_ah; /* number of slots in ah_pool */
+ uint32_t max_fast_reg_page_list_len;
+ uint8_t local_ca_ack_delay;
+};
+
+/**
+ * @brief Port-level attributes
+ *
+ * Both counters are reset to 0 by vhost_rdma_init_ib().
+ */
+struct vhost_rdma_port_attr {
+ uint32_t bad_pkey_cntr; /**< Bad PKey counter */
+ uint32_t qkey_viol_cntr; /**< QKey violation counter */
+};
+
+/**
+ * @brief GID entry with type indicator
+ *
+ * vhost_rdma_init_ib() marks every table entry with
+ * VHOST_RDMA_GID_TYPE_ILLIGAL; VHOST_RDMA_GID_TYPE_INVALID below is a second
+ * name for the same all-ones value.
+ */
+struct vhost_rdma_gid {
+#define VHOST_RDMA_GID_TYPE_INVALID ((uint32_t)(-1))
+ uint32_t type; /**< GID type: RoCEv1, RoCEv2, etc. */
+ uint8_t gid[16]; /**< 128-bit GID value */
+};
+
+/**
+ * @brief Generic object pool for managing RDMA resources
+ *
+ * Managed through the vhost_rdma_pool_*() functions declared in this header;
+ * objects are addressed by a uint32_t index.
+ */
+struct vhost_rdma_pool {
+ void *objs; /**< Array of allocated objects */
+ uint32_t num; /**< Number of objects in pool */
+ uint32_t size; /**< Size of each object */
+
+ struct rte_bitmap *bitmap; /**< Bitmap tracking free slots */
+ void *bitmap_mem; /**< Memory backing the bitmap */
+
+ /* Called by vhost_rdma_drop_ref() on the object before its slot is freed. */
+ void (*cleanup)(void *arg); /**< Optional cleanup function */
+};
+
+/**
+ * @brief Main RDMA vhost device structure
+ *
+ * One entry of g_vhost_rdma_dev[] per vhost device id.
+ */
+struct vhost_rdma_device {
+ int vid; /**< Vhost-Rdma device ID */
+ int started; /**< Device start state (used as a boolean) */
+ volatile bool stopped; /**< Stop flag for threads */
+ volatile int inuse; /**< Reference count; destroy_device waits for it to reach 0 */
+
+ /* Memory and resource management */
+ struct rte_vhost_memory *mem; /**< Guest physical memory map */
+ struct rte_mempool *mbuf_pool; /**< mbuf pool for packet I/O */
+ struct rte_ring *tx_ring; /**< TX ring for outbound packets */
+ struct rte_ring *rx_ring; /**< RX ring for inbound packets */
+
+ /* Queues */
+ struct vhost_user_queue vqs[NUM_VHOST_QUEUES]; /**< All vhost queues */
+ struct vhost_user_queue *rdma_vqs; /**< &vqs[VHOST_NET_ROCE_CTRL_QUEUE], set in vhost_rdma_construct() */
+ struct vhost_user_queue *cq_vqs; /**< &rdma_vqs[1], set in vhost_rdma_init_ib() */
+ struct vhost_user_queue *qp_vqs; /**< &rdma_vqs[1 + attr.max_cqs], set in vhost_rdma_init_ib() */
+ struct rte_ring *task_ring; /**< Task scheduling ring */
+
+ /* Interrupt handling for control plane */
+ struct rte_intr_handle ctrl_intr_handle; /**< Control interrupt handle (kick fd of the ctrl queue) */
+ int ctrl_intr_registered; /**< Whether interrupt is registered */
+
+ /* Virtio-net configuration (exposed to guest) */
+ struct virtio_net_config config; /**< Generic virtio-net config */
+ struct vhost_rdma_config rdma_config; /**< RDMA-specific config, returned for VHOST_USER_GET_CONFIG */
+ uint32_t max_inline_data; /**< Max inline data size */
+
+ /* Device attributes (cached from config) */
+ struct vhost_rdma_dev_attr attr; /**< Cached device attributes */
+
+ /* Single port support */
+ struct vhost_rdma_port_attr port_attr; /**< Port-level counters */
+ rte_spinlock_t port_lock; /**< Lock for port access */
+ unsigned int mtu_cap; /**< MTU capability */
+ struct vhost_rdma_gid gid_tbl[VHOST_MAX_GID_TBL_LEN]; /**< GID table */
+ struct vhost_rdma_qp *qp_gsi; /**< GSI (General Services Interface) QP, allocated in vhost_rdma_init_ib() */
+
+ /* Resource pools */
+ struct vhost_rdma_pool pd_pool; /**< Protection domain pool */
+ struct vhost_rdma_pool mr_pool; /**< Memory region pool */
+ struct vhost_rdma_pool cq_pool; /**< Completion queue pool */
+ struct vhost_rdma_pool qp_pool; /**< Queue pair pool */
+ struct vhost_rdma_pool ah_pool; /**< Address handle pool */
+
+ /* Statistics counters */
+ rte_atomic64_t stats_counters[VHOST_RDMA_NUM_OF_COUNTERS]; /* indexed by enum vhost_rdma_counters */
+};
+
+/*
+ * Drop one reference on @obj.  When the count reaches zero, run the pool's
+ * cleanup callback (if any) on the object and return its slot to the pool.
+ *
+ * @type is a token, not an expression: it selects the pool member
+ * (dev)-><type>_pool and the index member (obj)-><type>n, e.g. "mr" uses
+ * mr_pool and mrn.  @obj is expanded several times, so it must not have side
+ * effects.
+ */
+#define vhost_rdma_drop_ref(obj, dev, type) \
+ do { \
+ if (rte_atomic32_dec_and_test(&(obj)->refcnt)) { \
+ struct vhost_rdma_pool* pool = &(dev)->type##_pool; \
+ if (pool->cleanup) { \
+ pool->cleanup(obj); \
+ } \
+ vhost_rdma_pool_free(pool, (obj)->type##n); \
+ } \
+ }while(0)
+
+/* Take one more reference on @obj (atomic increment of obj->refcnt). */
+#define vhost_rdma_add_ref(obj) rte_atomic32_inc(&(obj)->refcnt)
+
+/**
+ * @brief Tell whether the guest has published descriptors not yet consumed.
+ *
+ * The avail ring index written by the guest is compared with the position the
+ * host has processed up to (last_avail_idx); any difference means work is
+ * pending.
+ *
+ * @param vq Pointer to the virtual queue.
+ * @return true if at least one new descriptor is available, false otherwise.
+ */
+static __rte_always_inline bool
+vhost_rdma_vq_is_avail(struct vhost_user_queue *vq)
+{
+	const uint16_t guest_idx = vq->vring.avail->idx;
+
+	return guest_idx != vq->last_avail_idx;
+}
+
+/**
+ * @brief Locate element @p idx inside a queue's data buffer.
+ *
+ * Elements have a fixed size (queue->elem_size) and are laid out back to
+ * back, so the element starts elem_size * idx past queue->data.  No bounds
+ * check is performed.
+ *
+ * @param queue Pointer to the queue containing data buffer.
+ * @param idx Index of the desired element.
+ * @return Pointer to the data at position idx.
+ */
+static __rte_always_inline void *
+vhost_rdma_queue_get_data(struct vhost_rdma_queue *queue, size_t idx)
+{
+	const size_t offset = queue->elem_size * idx;
+
+	return queue->data + offset;
+}
+
+/**
+ * @brief Pop the next head descriptor index from the avail ring.
+ *
+ * The slot is last_avail_idx masked with (ring size - 1); after reading the
+ * slot, last_avail_idx is advanced by one.  The caller is expected to have
+ * checked that an entry is available.
+ *
+ * @param vq Pointer to the virtual queue.
+ * @return Index of the first descriptor in the incoming request chain.
+ */
+static __rte_always_inline uint16_t
+vhost_rdma_vq_get_desc_idx(struct vhost_user_queue *vq)
+{
+	/* Free-running index reduced to a ring slot */
+	const uint16_t slot = vq->last_avail_idx & (vq->vring.size - 1);
+	const uint16_t head = vq->vring.avail->ring[slot];
+
+	/* Consume the entry */
+	vq->last_avail_idx++;
+
+	return head;
+}
+
+/**
+ * @brief Follow a descriptor chain by one step.
+ *
+ * When VRING_DESC_F_NEXT is set in @p desc, its 'next' field is used as an
+ * index into @p table.  The index is not range-checked here.
+ *
+ * @param table Base address of the descriptor table.
+ * @param desc Current descriptor.
+ * @return Pointer to next descriptor, or NULL if end of chain.
+ */
+static __rte_always_inline struct vring_desc *
+vhost_rdma_vring_get_next_desc(struct vring_desc *table, struct vring_desc *desc)
+{
+	return (desc->flags & VRING_DESC_F_NEXT) ? &table[desc->next] : NULL;
+}
+
+/**
+ * @brief Add a used descriptor entry to the used ring.
+ *
+ * Records that a buffer has been consumed by the host/device, including its
+ * original descriptor index and the number of bytes written.
+ *
+ * Uses memory barriers to ensure ordering before updating used->idx.
+ * The guest is not notified here; see vhost_rdma_queue_notify().
+ *
+ * @param vq Virtual queue.
+ * @param idx Descriptor index being returned.
+ * @param len Number of bytes written (for writeable descriptors).
+ */
+static __rte_always_inline void
+vhost_rdma_queue_push(struct vhost_user_queue *vq, uint16_t idx, uint32_t len)
+{
+ struct vring_used *used = vq->vring.used;
+ /* Slot = current used index masked with (ring size - 1) */
+ uint16_t slot = used->idx & (vq->vring.size - 1);
+
+ used->ring[slot].id = idx;
+ used->ring[slot].len = len;
+
+ /* The entry must be fully written before the new index becomes visible */
+ rte_smp_mb();
+ used->idx++;
+ /* Second barrier: the index update is ordered before whatever the caller does next */
+ rte_smp_mb();
+}
+
+/**
+ * @brief Signal the guest that the used ring of @p vq changed.
+ *
+ * Thin wrapper around rte_vhost_vring_call() using the queue's stored id.
+ * The result of the call is not reported to the caller.
+ *
+ * @param vid Virtual host device ID.
+ * @param vq Pointer to the virtual queue that needs notification.
+ */
+static __rte_always_inline void
+vhost_rdma_queue_notify(int vid, struct vhost_user_queue *vq)
+{
+	const uint16_t ring_id = vq->id;
+
+	rte_vhost_vring_call(vid, ring_id);
+}
+
+/**
+ * @brief Translate a guest physical address into a host virtual address.
+ *
+ * Delegates to rte_vhost_va_from_guest_pa() with the given memory table.
+ * @p mem must not be NULL (checked with assert()).
+ *
+ * @param mem Pointer to vhost memory region mapping.
+ * @param gpa Guest physical address to translate.
+ * @param len [in/out] On input: requested length; on output: actual mapped length.
+ * @return Host virtual address corresponding to GPA, or 0 on failure.
+ */
+static __rte_always_inline uint64_t
+gpa_to_vva(struct rte_vhost_memory *mem, uint64_t gpa, uint64_t *len)
+{
+	uint64_t host_va;
+
+	assert(mem != NULL);
+	host_va = rte_vhost_va_from_guest_pa(mem, gpa, len);
+
+	return host_va;
+}
+
+/* Device construction and net-side setup (defined in vhost_rdma.c). */
+int vhost_rdma_construct(struct vhost_rdma_device *dev, const char *path, int idx);
+void vhost_rdma_net_construct(struct vhost_user_queue *queues, int idx);
+void vs_vhost_rdma_net_setup(int vid);
+
+
+void vhost_rdma_destroy(const char* path);
+
+/*
+ * Object pool API for struct vhost_rdma_pool.  Objects are addressed by a
+ * uint32_t index; vhost_rdma_pool_alloc() reports the index of the new object
+ * through @idx.
+ * NOTE(review): the implementations are not part of this header; confirm the
+ * return conventions (e.g. 0 / negative for init, NULL for a free slot in
+ * get) against them.
+ */
+int vhost_rdma_pool_init(struct vhost_rdma_pool* pool,
+ const char* name,
+ uint32_t num,
+ uint32_t size,
+ bool start_zero,
+ void (*cleanup)(void*));
+void* vhost_rdma_pool_get(struct vhost_rdma_pool* pool, uint32_t idx);
+void vhost_rdma_pool_free(struct vhost_rdma_pool* pool, uint32_t idx);
+void* vhost_rdma_pool_alloc(struct vhost_rdma_pool* pool, uint32_t *idx);
+void vhost_rdma_pool_destroy(struct vhost_rdma_pool* pool);
+
+/* Device tables, indexed by vhost device id (vid). */
+extern struct vhost_rdma_device g_vhost_rdma_dev[MAX_VHOST_RDMA_DEV_NUM];
+extern struct vhost_rdma_net_dev g_vhost_rdma_net_dev[MAX_VHOST_RDMA_DEV_NUM];
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* VHOST_RDMA_H_ */
\ No newline at end of file
diff --git a/examples/vhost_user_rdma/vhost_rdma_ib.c b/examples/vhost_user_rdma/vhost_rdma_ib.c
new file mode 100644
index 0000000000..5535a8696b
--- /dev/null
+++ b/examples/vhost_user_rdma/vhost_rdma_ib.c
@@ -0,0 +1,647 @@
+/*
+ * Vhost-user RDMA device : init and packets forwarding
+ *
+ * Copyright (C) 2025 KylinSoft Inc. and/or its affiliates. All rights reserved.
+ *
+ * Author: Xiong Weimin <xiongweimin@kylinos.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <unistd.h>
+#include <sys/uio.h>
+#include <stdlib.h>
+
+#include <rte_ethdev.h>
+#include <rte_spinlock.h>
+#include <rte_malloc.h>
+
+#include "vhost_rdma.h"
+#include "vhost_rdma_ib.h"
+#include "vhost_rdma_log.h"
+#include "vhost_rdma_pkt.h"
+
+/*
+ * Point @tp at the payload of iovec @iov after checking that the iovec holds
+ * at least sizeof(*tp) bytes; otherwise log the two sizes and make the
+ * *calling* function return -1.
+ *
+ * Both sizes are size_t, hence %zu.  The expansion deliberately still ends in
+ * ';' so call sites written with or without a trailing semicolon keep
+ * compiling.
+ */
+#define CHK_IOVEC(tp, iov) \
+	do { \
+		if ((iov)->iov_len < sizeof(*(tp))) { \
+			RDMA_LOG_ERR("%s: " #iov " iovec is too small : %zu, %zu", __func__, sizeof(*(tp)), (iov)->iov_len); \
+			return -1; \
+		} \
+		(tp) = (iov)->iov_base; \
+	} while(0);
+
+/*
+ * Designated initializer for a command table entry: slot [cmd] gets the
+ * handler followed by the command's name as a string.
+ */
+#define DEFINE_VIRTIO_RDMA_CMD(cmd, handler) [cmd] = {handler, #cmd}
+
+/*
+ * Parameter declarations for handlers that ignore their input (CTRL_NO_CMD)
+ * or output (CTRL_NO_RSP) iovec.
+ * NOTE(review): identifiers starting with a double underscore are reserved
+ * for the implementation.
+ */
+#define CTRL_NO_CMD __rte_unused struct iovec *__in
+#define CTRL_NO_RSP __rte_unused struct iovec *__out
+
+/**
+ * @brief Free resources held by a response entry in the RDMA responder path.
+ *
+ * For an ATOMIC entry the attached mbuf is freed; for an RDMA READ entry the
+ * reference on the memory region is dropped.  In every case the entry's type
+ * is reset to 0 afterwards, which marks it as free.
+ *
+ * @param qp  Queue Pair owning the entry.  Needed for RDMA READ entries,
+ *            where qp->dev selects the MR pool; if it is NULL the MR
+ *            reference cannot be dropped and an error is logged.
+ * @param res Response resource to free (in/out); NULL is rejected with an
+ *            error log.
+ */
+void
+free_rd_atomic_resource(struct vhost_rdma_qp *qp,
+			struct vhost_rdma_resp_res *res)
+{
+	if (!res) {
+		RDMA_LOG_ERR("Cannot free NULL response resource");
+		return;
+	}
+
+	switch (res->type) {
+	case VHOST_ATOMIC_MASK: {
+		struct rte_mbuf *mbuf = res->atomic.mbuf;
+
+		if (mbuf) {
+			RDMA_LOG_DEBUG("Freeing mbuf=%p from ATOMIC response", mbuf);
+			rte_pktmbuf_free(mbuf);
+			res->atomic.mbuf = NULL;
+		}
+		break;
+	}
+
+	case VHOST_READ_MASK: {
+		struct vhost_rdma_mr *mr = res->read.mr;
+
+		if (!mr)
+			break;
+
+		if (!qp) {
+			/* Without the QP there is no device, hence no pool to return the MR to. */
+			RDMA_LOG_ERR("Cannot drop MR reference %p: qp is NULL", mr);
+			break;
+		}
+
+		RDMA_LOG_DEBUG("Dropping MR reference %p from RDMA READ response", mr);
+		vhost_rdma_drop_ref(mr, qp->dev, mr);
+		res->read.mr = NULL;
+		break;
+	}
+
+	case 0:
+		/* Already freed - silent no-op */
+		break;
+
+	default:
+		RDMA_LOG_ERR("Unknown response resource type %u (possible memory corruption)", res->type);
+		break;
+	}
+
+	/* Reset type to mark as free */
+	res->type = 0;
+}
+
+/**
+ * @brief Release the whole RD/Atomic response tracking array of a Queue Pair.
+ *
+ * Every entry (qp->attr.max_dest_rd_atomic of them) is passed to
+ * free_rd_atomic_resource(), then the array itself is freed and the pointer
+ * cleared, so a second call finds nothing to do.
+ *
+ * @param qp Pointer to the Queue Pair whose response resources should be freed
+ */
+void
+free_rd_atomic_resources(struct vhost_rdma_qp *qp)
+{
+	uint32_t slot_cnt;
+	uint32_t slot;
+
+	if (qp == NULL) {
+		RDMA_LOG_ERR("Cannot free response resources: qp is NULL");
+		return;
+	}
+
+	if (qp->resp.resources == NULL) {
+		RDMA_LOG_DEBUG("No response resources to free for QP %u", qp->qpn);
+		return;
+	}
+
+	slot_cnt = qp->attr.max_dest_rd_atomic;
+
+	RDMA_LOG_DEBUG("Freeing %u RD/Atomic response resources for QP %u",
+		       slot_cnt, qp->qpn);
+
+	/* Per-entry payload (mbuf or MR reference) first ... */
+	slot = 0;
+	while (slot < slot_cnt) {
+		free_rd_atomic_resource(qp, &qp->resp.resources[slot]);
+		slot++;
+	}
+
+	/* ... then the array that held the entries. */
+	rte_free(qp->resp.resources);
+	qp->resp.resources = NULL;
+
+	RDMA_LOG_DEBUG("Successfully freed response resource array for QP %u", qp->qpn);
+}
+
+
+/**
+ * @brief Release what a vhost RDMA queue holds.
+ *
+ * If the queue has a callback and a QP is given, the callback is
+ * unregistered from the queue's interrupt handle; the data buffer is then
+ * freed and its pointer cleared.  A NULL queue is ignored.
+ */
+void
+vhost_rdma_queue_cleanup(struct vhost_rdma_qp *qp, struct vhost_rdma_queue *queue)
+{
+	if (queue == NULL)
+		return;
+
+	if (qp && queue->cb) {
+		rte_intr_callback_unregister(&queue->intr_handle, queue->cb, qp);
+	}
+
+	rte_free(queue->data);
+	queue->data = NULL;
+}
+
+/**
+ * @brief Pool cleanup callback for memory regions.
+ *
+ * Marks the MR as unused by resetting its type to VHOST_MR_TYPE_NONE.
+ * A NULL argument is ignored.
+ */
+void
+vhost_rdma_mr_cleanup(void *arg)
+{
+	struct vhost_rdma_mr *region = arg;
+
+	if (region == NULL)
+		return;
+
+	region->type = VHOST_MR_TYPE_NONE;
+}
+
+/**
+ * @brief Pool cleanup callback for queue pairs.
+ *
+ * Gives back the references the QP holds on its send CQ, receive CQ,
+ * protection domain and responder MR (each pointer is cleared after the
+ * drop), then frees the RD/Atomic response resources.  A NULL argument is
+ * ignored.
+ */
+void
+vhost_rdma_qp_cleanup(void *arg)
+{
+	struct vhost_rdma_qp *qp = arg;
+
+	if (qp == NULL)
+		return;
+
+	/* Send completion queue */
+	if (qp->scq != NULL) {
+		vhost_rdma_drop_ref(qp->scq, qp->dev, cq);
+		qp->scq = NULL;
+	}
+
+	/* Receive completion queue */
+	if (qp->rcq != NULL) {
+		vhost_rdma_drop_ref(qp->rcq, qp->dev, cq);
+		qp->rcq = NULL;
+	}
+
+	/* Protection domain */
+	if (qp->pd != NULL) {
+		vhost_rdma_drop_ref(qp->pd, qp->dev, pd);
+		qp->pd = NULL;
+	}
+
+	/* Memory region referenced by the responder */
+	if (qp->resp.mr != NULL) {
+		vhost_rdma_drop_ref(qp->resp.mr, qp->dev, mr);
+		qp->resp.mr = NULL;
+	}
+
+	free_rd_atomic_resources(qp);
+}
+
+/**
+ * @brief Initialise the IB/RDMA side of a vhost-RDMA device.
+ *
+ * Fills dev->attr with the fixed capabilities of the emulated device, mirrors
+ * them into the guest-visible dev->rdma_config, resets the port counters and
+ * the GID table, derives the CQ/QP queue shortcuts from dev->rdma_vqs (which
+ * the caller must have set), creates the PD/MR/CQ/QP/AH pools and finally
+ * allocates the GSI queue pair.
+ *
+ * The function returns void; a failure is logged and leaves dev->qp_gsi NULL,
+ * which is how a caller can detect it.
+ *
+ * @param dev Device to initialise; NULL is ignored.
+ */
+void
+vhost_rdma_init_ib(struct vhost_rdma_device *dev)
+{
+	struct vhost_rdma_config *cfg;
+	uint32_t qpn = 0;
+
+	if (!dev)
+		return;
+
+	dev->qp_gsi = NULL;
+
+	/* Initialize device attributes (virtio-rdma IB capability) */
+	dev->attr.max_qps = 64;
+	dev->attr.max_cqs = 64;
+	dev->attr.max_mr_size = UINT64_MAX;
+	dev->attr.page_size_cap = 0xFFFFF000U;
+	dev->attr.max_qp_wr = 1024;
+	dev->attr.device_cap_flags = VIRTIO_IB_DEVICE_RC_RNR_NAK_GEN;
+	dev->attr.max_send_sge = 32;
+	dev->attr.max_recv_sge = 32;
+	dev->attr.max_sge_rd = 32;
+	dev->attr.max_cqe = 1024;
+	dev->attr.max_mr = 0x00001000;
+	dev->attr.max_mw = 0;
+	dev->attr.max_pd = 0x7FFC;
+	dev->attr.max_qp_rd_atom = 128;
+	dev->attr.max_qp_init_rd_atom = 128;
+	dev->attr.max_ah = 100;
+	dev->attr.max_fast_reg_page_list_len = 512;
+	dev->attr.local_ca_ack_delay = 15;
+
+	/* Guest-visible copy of the limits above */
+	cfg = &dev->rdma_config;
+
+	cfg->max_qp = dev->attr.max_qps;
+	cfg->max_cq = dev->attr.max_cqs;
+	cfg->max_mr = dev->attr.max_mr;
+	cfg->max_pd = dev->attr.max_pd;
+	cfg->max_ah = dev->attr.max_ah;
+	cfg->max_cqe = dev->attr.max_cqe;
+	cfg->max_qp_wr = dev->attr.max_qp_wr;
+	cfg->max_send_sge = dev->attr.max_send_sge;
+	cfg->max_recv_sge = dev->attr.max_recv_sge;
+	cfg->max_sge_rd = dev->attr.max_sge_rd;
+	cfg->max_qp_rd_atom = dev->attr.max_qp_rd_atom;
+	cfg->max_qp_init_rd_atom = dev->attr.max_qp_init_rd_atom;
+	cfg->max_mr_size = dev->attr.max_mr_size;
+	cfg->max_mw = dev->attr.max_mw;
+	cfg->max_fast_reg_page_list_len = dev->attr.max_fast_reg_page_list_len;
+	cfg->page_size_cap = dev->attr.page_size_cap;
+	cfg->device_cap_flag = dev->attr.device_cap_flags;
+	cfg->local_ca_ack_delay = dev->attr.local_ca_ack_delay;
+	cfg->phys_port_cnt = 1;
+	cfg->max_pkeys = 1;
+	cfg->vendor_id = 0x1AF4;
+	cfg->vendor_part_id = 0x0042;
+	cfg->sys_image_guid = 1;
+
+	/* Derived capabilities */
+	cfg->max_res_rd_atom = cfg->max_qp_rd_atom * cfg->max_qp;
+	cfg->max_total_mcast_qp_attach = 8192UL * 56UL;
+	cfg->max_pi_fast_reg_page_list_len = cfg->max_fast_reg_page_list_len / 2;
+
+	/* Inline data and MTU settings */
+	dev->max_inline_data = dev->attr.max_send_sge * sizeof(struct vhost_user_rdma_sge);
+	dev->mtu_cap = ib_mtu_enum_to_int(DEFAULT_IB_MTU);
+
+	/* Reset port counters */
+	dev->port_attr.bad_pkey_cntr = 0;
+	dev->port_attr.qkey_viol_cntr = 0;
+
+	/* No GID is valid until one is configured */
+	for (int i = 0; i < VHOST_MAX_GID_TBL_LEN; i++)
+		dev->gid_tbl[i].type = VHOST_RDMA_GID_TYPE_ILLIGAL;
+
+	/*
+	 * Virtual queue mappings: rdma_vqs[0] is the control queue, the CQ
+	 * queues start at index 1 and the QP queues follow after all CQs.
+	 */
+	dev->cq_vqs = &dev->rdma_vqs[1];
+	dev->qp_vqs = &dev->rdma_vqs[1 + dev->attr.max_cqs];
+
+	/*
+	 * Resource pools.  A negative result is treated as failure.
+	 * NOTE(review): assumes vhost_rdma_pool_init() reports errors with a
+	 * negative value - confirm against its definition.
+	 */
+	if (vhost_rdma_pool_init(&dev->pd_pool, "pd_pool", dev->attr.max_pd,
+				 sizeof(struct vhost_rdma_pd), false, NULL) < 0) {
+		RDMA_LOG_ERR("Failed to initialize pd_pool");
+		return;
+	}
+
+	if (vhost_rdma_pool_init(&dev->mr_pool, "mr_pool", dev->attr.max_mr,
+				 sizeof(struct vhost_rdma_mr), false,
+				 vhost_rdma_mr_cleanup) < 0) {
+		RDMA_LOG_ERR("Failed to initialize mr_pool");
+		return;
+	}
+
+	if (vhost_rdma_pool_init(&dev->cq_pool, "cq_pool", dev->attr.max_cqs,
+				 sizeof(struct vhost_rdma_cq), true, NULL) < 0) {
+		RDMA_LOG_ERR("Failed to initialize cq_pool");
+		return;
+	}
+
+	if (vhost_rdma_pool_init(&dev->qp_pool, "qp_pool", dev->attr.max_qps,
+				 sizeof(struct vhost_rdma_qp), false,
+				 vhost_rdma_qp_cleanup) < 0) {
+		RDMA_LOG_ERR("Failed to initialize qp_pool");
+		return;
+	}
+
+	if (vhost_rdma_pool_init(&dev->ah_pool, "ah_pool", dev->attr.max_ah,
+				 sizeof(struct vhost_rdma_av), false, NULL) < 0) {
+		RDMA_LOG_ERR("Failed to initialize ah_pool");
+		return;
+	}
+
+	/* The GSI QP is the first QP taken from the pool and must get QPN 1 */
+	dev->qp_gsi = vhost_rdma_pool_alloc(&dev->qp_pool, &qpn);
+	if (!dev->qp_gsi) {
+		RDMA_LOG_ERR("Failed to allocate GSI QP");
+		return;
+	}
+	vhost_rdma_add_ref(dev->qp_gsi); /* Hold a reference */
+
+	/* Logged as well, since assert() disappears in NDEBUG builds */
+	if (qpn != 1)
+		RDMA_LOG_ERR("GSI QP got QPN %u instead of 1", qpn);
+	assert(qpn == 1);
+}
+
+/**
+ * @brief Release the RDMA objects of a device and destroy their pools.
+ *
+ * Every index of the MR, PD, CQ, QP and AH pools is probed with
+ * vhost_rdma_pool_get(); entries that exist are handed to
+ * vhost_rdma_pool_free().  For QPs the send and receive queues are cleaned up
+ * first.  Afterwards the five pools are destroyed and dev->qp_gsi, which
+ * pointed into the QP pool, is cleared so it cannot dangle.
+ *
+ * Note: It assumes no external references exist to these objects.
+ * NOTE(review): vhost_rdma_drop_ref() runs the pool's cleanup callback itself
+ * before calling vhost_rdma_pool_free(); whether vhost_rdma_pool_free() alone
+ * also runs it is not visible here - confirm, otherwise QP/MR cleanup is
+ * skipped on this path.
+ *
+ * @param dev Device to tear down; NULL is ignored.
+ */
+void
+vhost_rdma_destroy_ib(struct vhost_rdma_device *dev)
+{
+	struct vhost_rdma_mr *mr;
+	struct vhost_rdma_pd *pd;
+	struct vhost_rdma_cq *cq;
+	struct vhost_rdma_qp *qp;
+	struct vhost_rdma_av *av;
+	uint32_t i;
+
+	if (!dev)
+		return;
+
+	/* Memory Regions (MR) */
+	for (i = 0; i < dev->attr.max_mr; i++) {
+		mr = vhost_rdma_pool_get(&dev->mr_pool, i);
+		if (mr)
+			vhost_rdma_pool_free(&dev->mr_pool, i);
+	}
+
+	/* Protection Domains (PD) */
+	for (i = 0; i < dev->attr.max_pd; i++) {
+		pd = vhost_rdma_pool_get(&dev->pd_pool, i);
+		if (pd)
+			vhost_rdma_pool_free(&dev->pd_pool, i);
+	}
+
+	/* Completion Queues (CQ) */
+	for (i = 0; i < dev->attr.max_cqs; i++) {
+		cq = vhost_rdma_pool_get(&dev->cq_pool, i);
+		if (cq)
+			vhost_rdma_pool_free(&dev->cq_pool, i);
+	}
+
+	/* Queue Pairs (QP): release SQ/RQ before the slot goes back to the pool */
+	for (i = 0; i < dev->attr.max_qps; i++) {
+		qp = vhost_rdma_pool_get(&dev->qp_pool, i);
+		if (!qp)
+			continue;
+
+		/* Unregisters the queue callbacks and frees the queue buffers */
+		vhost_rdma_queue_cleanup(qp, &qp->sq.queue);
+		vhost_rdma_queue_cleanup(qp, &qp->rq.queue);
+
+		vhost_rdma_pool_free(&dev->qp_pool, i);
+	}
+
+	/* Address Handles (AH / AV) */
+	for (i = 0; i < dev->attr.max_ah; i++) {
+		av = vhost_rdma_pool_get(&dev->ah_pool, i);
+		if (av)
+			vhost_rdma_pool_free(&dev->ah_pool, i);
+	}
+
+	/* Destroy the pools themselves; they should be empty at this point. */
+	vhost_rdma_pool_destroy(&dev->mr_pool);
+	vhost_rdma_pool_destroy(&dev->pd_pool);
+	vhost_rdma_pool_destroy(&dev->cq_pool);
+	vhost_rdma_pool_destroy(&dev->qp_pool);
+	vhost_rdma_pool_destroy(&dev->ah_pool);
+
+	/* The GSI QP lived in qp_pool, which no longer exists. */
+	dev->qp_gsi = NULL;
+}
+
+/**
+ * @brief Translate one guest-physical buffer into host iovec entries.
+ *
+ * Starting at 'payload', asks gpa_to_vva() to translate the remaining range.
+ * gpa_to_vva() reports through its length argument how many bytes the
+ * returned mapping covers; each such piece is stored in the next free slot
+ * of 'iovs' and the loop continues until the whole range is covered. One
+ * buffer can therefore consume more than one iovec.
+ *
+ * The address is carried as a 64-bit value because vring descriptors hold
+ * 64-bit guest addresses; a pointer-sized type would truncate them on hosts
+ * with 32-bit pointers.
+ *
+ * @param mem Vhost memory table used for GPA->VVA translation.
+ * @param iovs Array of iovec structures to fill.
+ * @param iov_index In/out: next free slot in 'iovs'. Advanced once per
+ *                  stored piece, including pieces stored before a failure.
+ * @param payload Guest physical address (GPA) of the data.
+ * @param remaining Total number of bytes to translate.
+ * @param num_iovs Capacity of 'iovs'.
+ * @return 0 on success, -1 if 'iovs' is full or a piece cannot be translated
+ *         (a NULL address or a zero mapped length counts as failure).
+ */
+static int
+desc_payload_to_iovs(struct rte_vhost_memory *mem,
+		     struct iovec *iovs,
+		     uint32_t *iov_index,
+		     uint64_t payload,
+		     uint64_t remaining,
+		     uint16_t num_iovs)
+{
+	void *vva;
+	uint64_t len;
+
+	do {
+		if (*iov_index >= num_iovs) {
+			RDMA_LOG_ERR("MAX_IOVS reached");
+			return -1;
+		}
+
+		/* Ask for everything that is left; 'len' comes back as the mapped size. */
+		len = remaining;
+		vva = (void *)(uintptr_t)gpa_to_vva(mem, payload, &len);
+		if (!vva || !len) {
+			RDMA_LOG_ERR("failed to translate desc address.");
+			return -1;
+		}
+
+		iovs[*iov_index].iov_base = vva;
+		iovs[*iov_index].iov_len = len;
+
+		payload += len;
+		remaining -= len;
+		(*iov_index)++;
+	} while (remaining);
+
+	return 0;
+}
+
+/**
+ * @brief Build an iovec array from the descriptor chain of one request.
+ *
+ * Walks the chain that starts at 'req_idx'. If the first descriptor has
+ * VRING_DESC_F_INDIRECT set, its buffer is translated and used as the
+ * descriptor table for the walk; otherwise the ring's own table is used.
+ * Every descriptor is translated with desc_payload_to_iovs() and counted as
+ * "in" (VRING_DESC_F_WRITE set) or "out" (flag clear).
+ *
+ * Note that 'num_in' / 'num_out' count descriptors while the return value
+ * counts iovecs; the two differ whenever one descriptor is split into
+ * several iovecs by the address translation.
+ *
+ * An indirect table that cannot be mapped in one piece is rejected with an
+ * error. This is checked at run time rather than with assert(), because the
+ * table address and length are supplied by the guest: an assert would abort
+ * the whole process, or disappear entirely in NDEBUG builds.
+ *
+ * NOTE(review): 'req_idx' and the 'next' links followed by
+ * vhost_rdma_vring_get_next_desc() are not range-checked here - confirm the
+ * callers/helper validate them.
+ *
+ * @param mem Vhost memory configuration for address translation.
+ * @param vq Virtual queue containing the descriptor ring.
+ * @param req_idx Index of the first descriptor in the chain.
+ * @param iovs Pre-allocated iovec array to populate.
+ * @param num_iovs Size of the iovs array (maximum entries).
+ * @param num_in Output: number of device-writable descriptors; written only
+ *               on success.
+ * @param num_out Output: number of device-readable descriptors; written only
+ *                on success.
+ * @return Number of filled iovecs on success, -1 on error.
+ */
+int
+setup_iovs_from_descs(struct rte_vhost_memory *mem,
+		      struct vhost_user_queue *vq,
+		      uint16_t req_idx,
+		      struct iovec *iovs,
+		      uint16_t num_iovs,
+		      uint16_t *num_in,
+		      uint16_t *num_out)
+{
+	struct vring_desc *desc = &vq->vring.desc[req_idx];
+	struct vring_desc *desc_table;
+	uint32_t iovs_idx = 0;
+	uint32_t table_len;
+	uint64_t len;
+	uint16_t in = 0, out = 0;
+
+	/* Handle indirect descriptors */
+	if (desc->flags & VRING_DESC_F_INDIRECT) {
+		table_len = desc->len;
+		len = table_len;
+		desc_table = (struct vring_desc *)(uintptr_t)gpa_to_vva(mem, desc->addr, &len);
+		/* The whole table must be mapped contiguously to be indexable. */
+		if (!desc_table || !len || len != table_len) {
+			RDMA_LOG_ERR("failed to translate desc address.");
+			return -1;
+		}
+		desc = desc_table;
+	} else {
+		desc_table = vq->vring.desc;
+	}
+
+	/* Walk through descriptor chain */
+	do {
+		/*
+		 * Each descriptor yields at least one iovec, so this also
+		 * bounds the walk to num_iovs descriptors.
+		 */
+		if (iovs_idx >= num_iovs) {
+			RDMA_LOG_ERR("MAX_IOVS reached");
+			return -1;
+		}
+
+		if (desc->flags & VRING_DESC_F_WRITE) {
+			in++; /* Device-writable buffer (input) */
+		} else {
+			out++; /* Device-readable buffer (output) */
+		}
+
+		/* Translate payload (address + length) into iovec(s) */
+		if (desc_payload_to_iovs(mem, iovs,
+					 &iovs_idx,
+					 desc->addr,
+					 desc->len,
+					 num_iovs) != 0) {
+			RDMA_LOG_ERR("Failed to convert desc payload to iovs");
+			return -1;
+		}
+
+		/* Move to next descriptor in chain */
+		desc = vhost_rdma_vring_get_next_desc(desc_table, desc);
+	} while (desc != NULL);
+
+	*num_in = in;
+	*num_out = out;
+	return iovs_idx;
+}
+
+/**
+ * @brief Control handler for VHOST_RDMA_CTRL_ROCE_QUERY_DEVICE.
+ *
+ * Copies the device limits from dev->attr into the response buffer. The
+ * response is zeroed first so that the members not backed by a device
+ * attribute (hw_ver, padding, reserved) carry a defined value instead of
+ * whatever the guest buffer happened to contain.
+ *
+ * @param dev Device whose attributes are reported.
+ * @param out Response iovec; validated and mapped to 'rsp' by CHK_IOVEC
+ *            (presumably bails out of the handler when the buffer is
+ *            unusable - see the macro definition).
+ * @return 0 on success.
+ */
+static int
+vhost_rdma_query_device(struct vhost_rdma_device *dev, CTRL_NO_CMD,
+			struct iovec *out)
+{
+	struct vhost_rdma_ack_query_device *rsp;
+
+	CHK_IOVEC(rsp, out);
+
+	memset(rsp, 0, sizeof(*rsp));
+
+	rsp->max_mr_size = dev->attr.max_mr_size;
+	rsp->page_size_cap = dev->attr.page_size_cap;
+	rsp->max_qp_wr = dev->attr.max_qp_wr;
+	rsp->device_cap_flags = dev->attr.device_cap_flags;
+	rsp->max_send_sge = dev->attr.max_send_sge;
+	rsp->max_recv_sge = dev->attr.max_recv_sge;
+	rsp->max_sge_rd = dev->attr.max_sge_rd;
+	rsp->max_cqe = dev->attr.max_cqe;
+	rsp->max_mr = dev->attr.max_mr;
+	rsp->max_pd = dev->attr.max_pd;
+	rsp->max_qp_rd_atom = dev->attr.max_qp_rd_atom;
+	rsp->max_qp_init_rd_atom = dev->attr.max_qp_init_rd_atom;
+	rsp->max_ah = dev->attr.max_ah;
+	rsp->local_ca_ack_delay = dev->attr.local_ca_ack_delay;
+
+	return 0;
+}
+
+/*
+ * Control command dispatch table.
+ *
+ * vhost_rdma_handle_ctrl_vq() indexes this table with the command byte taken
+ * from the request header and calls the matching handler.
+ *
+ * The table is file-local and read-only: it is only read by the dispatcher
+ * below, and its generic name should not be exported from this object file.
+ */
+static const struct {
+	int (*handler)(struct vhost_rdma_device *dev, struct iovec *in, struct iovec *out);
+	const char *name; /* Name of the command (for logging) */
+} cmd_tbl[] = {
+	DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_QUERY_DEVICE, vhost_rdma_query_device),
+};
+
+/**
+ * @brief Main handler for control virtqueue events.
+ *
+ * Reads the kick eventfd of the control queue (retrying on EINTR, EAGAIN and
+ * EWOULDBLOCK; any other read error is logged and processing continues), then
+ * drains the queue. For every request it:
+ *   1. turns the descriptor chain into iovecs,
+ *   2. treats the first device-readable buffer as the command header and the
+ *      first device-writable buffer as the one-byte status,
+ *   3. dispatches to the handler registered in cmd_tbl,
+ *   4. writes VIRTIO_NET_OK or VIRTIO_NET_ERR to the status byte, and
+ *   5. returns the descriptor to the guest and notifies it.
+ *
+ * All request contents come from the guest, so nothing is dereferenced before
+ * it has been validated: a request without a readable header or without a
+ * writable status buffer is completed as an error, the command byte is read
+ * once and range-checked before it is used as a table index (including for
+ * the log line), and empty table slots are treated as unknown commands.
+ *
+ * NOTE(review): the out/in split below assumes one iovec per descriptor and
+ * that all readable descriptors precede the writable ones;
+ * setup_iovs_from_descs() can return more iovecs than descriptors - confirm
+ * this cannot happen for control requests.
+ *
+ * @param arg Pointer to vhost_rdma_device instance.
+ */
+void
+vhost_rdma_handle_ctrl_vq(void *arg)
+{
+	struct vhost_rdma_device *dev = arg;
+	struct vhost_rdma_ctrl_hdr *hdr;
+	struct vhost_user_queue *ctrl_vq = &dev->rdma_vqs[0];
+	struct iovec data_iovs[4]; /* Fixed-size iovec buffer */
+	struct iovec *in_iovs, *out_iovs;
+	uint16_t desc_idx, num_in, num_out;
+	const char *cmd_name;
+	uint8_t *status;
+	uint8_t result;
+	int kick_fd, nbytes, i, in_len, cmd;
+
+	kick_fd = ctrl_vq->vring.kickfd;
+
+	/* Wait until we get a valid kick (notification) */
+	do {
+		uint64_t kick_data;
+
+		nbytes = eventfd_read(kick_fd, &kick_data);
+		if (nbytes < 0) {
+			if (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN) {
+				continue; /* Retry on transient errors */
+			}
+			RDMA_LOG_ERR("Failed to read kickfd of ctrl virtq: %s", strerror(errno));
+		}
+		break;
+	} while (1);
+
+	/* Process all available requests in the control queue */
+	while (vhost_rdma_vq_is_avail(ctrl_vq)) {
+		desc_idx = vhost_rdma_vq_get_desc_idx(ctrl_vq);
+
+		/*
+		 * Build iovecs from descriptor chain.
+		 * NOTE(review): on failure the descriptor fetched above is not
+		 * returned to the guest - confirm whether it should be pushed
+		 * back with length 0.
+		 */
+		if (setup_iovs_from_descs(dev->mem, ctrl_vq,
+					  desc_idx, data_iovs,
+					  sizeof(data_iovs) / sizeof(data_iovs[0]),
+					  &num_in, &num_out) < 0) {
+			RDMA_LOG_ERR("read from desc failed");
+			break;
+		}
+
+		/* Split iovecs into output (device reads) and input (device writes) */
+		out_iovs = data_iovs;
+		in_iovs = &data_iovs[num_out];
+
+		/* Calculate total input data length */
+		in_len = 0;
+		for (i = 0; i < num_in; i++)
+			in_len += in_iovs[i].iov_len;
+
+		/* Without a writable buffer there is nowhere to report status. */
+		status = num_in > 0 ? (uint8_t *)in_iovs[0].iov_base : NULL;
+		result = VIRTIO_NET_ERR;
+		cmd = -1;
+		cmd_name = "unknown";
+
+		/* The first output iovec must be exactly the control header */
+		if (num_out == 0 || status == NULL ||
+		    out_iovs[0].iov_len != sizeof(*hdr)) {
+			RDMA_LOG_ERR("invalid header");
+			goto pushq;
+		}
+
+		/* Read the guest-owned command byte once and use the copy from here on. */
+		hdr = (struct vhost_rdma_ctrl_hdr *)out_iovs[0].iov_base;
+		cmd = hdr->cmd;
+
+		/* Reject commands outside the table and slots without a handler */
+		if ((size_t)cmd >= (sizeof(cmd_tbl) / sizeof(cmd_tbl[0])) ||
+		    cmd_tbl[cmd].handler == NULL) {
+			RDMA_LOG_ERR("unknown cmd %d", cmd);
+			goto pushq;
+		}
+
+		if (cmd_tbl[cmd].name != NULL)
+			cmd_name = cmd_tbl[cmd].name;
+
+		/* Dispatch; buffers after the header / status byte carry the payloads */
+		if (cmd_tbl[cmd].handler(dev,
+					 num_out > 1 ? &out_iovs[1] : NULL,
+					 num_in > 1 ? &in_iovs[1] : NULL) == 0)
+			result = VIRTIO_NET_OK;
+
+pushq:
+		if (status != NULL)
+			*status = result;
+
+		/* Log command execution result */
+		RDMA_LOG_INFO("cmd=%d %s status: %d", cmd, cmd_name, result);
+
+		/* Return the descriptor through the used ring and notify frontend */
+		vhost_rdma_queue_push(ctrl_vq, desc_idx, in_len);
+		vhost_rdma_queue_notify(dev->vid, ctrl_vq);
+	}
+}
+
+/*
+ * Task scheduler entry point. Currently a stub: the argument is ignored and
+ * the function reports success immediately.
+ */
+int
+vhost_rdma_task_scheduler(void *arg)
+{
+	(void)arg;
+
+	return 0;
+}
\ No newline at end of file
diff --git a/examples/vhost_user_rdma/vhost_rdma_ib.h b/examples/vhost_user_rdma/vhost_rdma_ib.h
new file mode 100644
index 0000000000..4ac896d82e
--- /dev/null
+++ b/examples/vhost_user_rdma/vhost_rdma_ib.h
@@ -0,0 +1,710 @@
+/**
+ * @file vhost_rdma_ib.h
+ * @brief Vhost-user RDMA device: IB emulation layer and control path definitions.
+ *
+ * This header defines the internal data structures, constants, and function interfaces
+ * used by the vhost-user RDMA backend to emulate InfiniBand/RoCE semantics over virtio.
+ *
+ * Copyright (C) 2025 KylinSoft Inc. and/or its affiliates. All rights reserved.
+ *
+ * Author: Xiong Weimin <xiongweimin@kylinos.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __VHOST_RDMA_IB_H__
+#define __VHOST_RDMA_IB_H__
+
+#include <netinet/in.h>
+#include <linux/virtio_net.h>
+
+#include <rte_spinlock.h>
+#include <rte_atomic.h>
+#include <rte_timer.h>
+#include <rte_mbuf.h>
+#include <rte_ring.h>
+#include <rte_vhost.h>
+#include <linux/vhost_types.h>
+
+#include "eal_interrupts.h"
+
+/* Forward declarations */
+struct vhost_rdma_device;
+struct vhost_queue;
+
+/**
+ * @defgroup constants Constants & Limits
+ * @{
+ */
+
+/** Invalid opcode marker */
+#define OPCODE_NONE (-1)
+
+/** Device capability flags */
+#define VIRTIO_IB_DEVICE_RC_RNR_NAK_GEN (1 << 0)
+
+/** Maximum number of memory regions in vhost-user memory table */
+#define VHOST_USER_MEMORY_MAX_NREGIONS 8
+
+/** Maximum size for config space read/write operations */
+#define VHOST_USER_MAX_CONFIG_SIZE 256
+
+/**
+ * ROCE control command types (virtio-rdma extension).
+ *
+ * The VHOST_RDMA_CTRL_ROCE_* values are listed in ascending numeric order.
+ * They are part of the guest-visible interface, so the numbers must not
+ * change.
+ */
+#define VHOST_RDMA_CTRL_ROCE			6
+#define VHOST_RDMA_CTRL_ROCE_QUERY_DEVICE	0
+#define VHOST_RDMA_CTRL_ROCE_QUERY_PORT		1
+#define VHOST_RDMA_CTRL_ROCE_CREATE_CQ		2
+#define VHOST_RDMA_CTRL_ROCE_DESTROY_CQ		3
+#define VHOST_RDMA_CTRL_ROCE_CREATE_PD		4
+#define VHOST_RDMA_CTRL_ROCE_DESTROY_PD		5
+#define VHOST_RDMA_CTRL_ROCE_GET_DMA_MR		6
+#define VHOST_RDMA_CTRL_ROCE_ALLOC_MR		7
+#define VHOST_RDMA_CTRL_ROCE_MAP_MR_SG		8
+#define VHOST_RDMA_CTRL_ROCE_REG_USER_MR	9
+#define VHOST_RDMA_CTRL_ROCE_DEREG_MR		10
+#define VHOST_RDMA_CTRL_ROCE_CREATE_QP		11
+#define VHOST_RDMA_CTRL_ROCE_MODIFY_QP		12
+#define VHOST_RDMA_CTRL_ROCE_QUERY_QP		13
+#define VHOST_RDMA_CTRL_ROCE_DESTROY_QP		14
+#define VHOST_RDMA_CTRL_ROCE_QUERY_PKEY		15
+#define VHOST_RDMA_CTRL_ROCE_ADD_GID		16
+#define VHOST_RDMA_CTRL_ROCE_DEL_GID		17
+#define VHOST_RDMA_CTRL_ROCE_REQ_NOTIFY_CQ	18
+
+/**
+ * @brief Device attributes returned for VHOST_RDMA_CTRL_ROCE_QUERY_DEVICE.
+ *
+ * vhost_rdma_query_device() fills this structure from dev->attr. It does not
+ * assign hw_ver, padding or reserved from any device attribute.
+ *
+ * The bit values for device_cap_flags (VIRTIO_IB_DEVICE_RC_RNR_NAK_GEN) are
+ * defined once, with the other constants near the top of this header.
+ */
+struct vhost_rdma_ack_query_device {
+	/* Capabilities mask */
+	uint64_t device_cap_flags;
+	/* Largest contiguous block that can be registered */
+	uint64_t max_mr_size;
+	/* Supported memory shift sizes */
+	uint64_t page_size_cap;
+	/* Hardware version */
+	uint32_t hw_ver;
+	/* Maximum number of outstanding Work Requests (WR) on Send Queue (SQ) and Receive Queue (RQ) */
+	uint32_t max_qp_wr;
+	/* Maximum number of scatter/gather (s/g) elements per WR for SQ for non RDMA Read operations */
+	uint32_t max_send_sge;
+	/* Maximum number of s/g elements per WR for RQ for non RDMA Read operations */
+	uint32_t max_recv_sge;
+	/* Maximum number of s/g per WR for RDMA Read operations */
+	uint32_t max_sge_rd;
+	/* Maximum size of Completion Queue (CQ) */
+	uint32_t max_cqe;
+	/* Maximum number of Memory Regions (MR) */
+	uint32_t max_mr;
+	/* Maximum number of Protection Domains (PD) */
+	uint32_t max_pd;
+	/* Maximum number of RDMA Read operations that can be outstanding per Queue Pair (QP) */
+	uint32_t max_qp_rd_atom;
+	/* Maximum depth per QP for initiation of RDMA Read operations */
+	uint32_t max_qp_init_rd_atom;
+	/* Maximum number of Address Handles (AH) */
+	uint32_t max_ah;
+	/* Local CA ack delay */
+	uint8_t local_ca_ack_delay;
+	/* Padding */
+	uint8_t padding[3];
+	/* Reserved for future */
+	uint32_t reserved[14];
+};
+
+
+/**
+ * @defgroup qp_states Queue Pair States
+ *
+ * Enumerators are numbered implicitly from 0 in declaration order.
+ * @{
+ */
+enum vhost_rdma_ib_qp_state {
+ VHOST_RDMA_IB_QPS_RESET, /**< Reset */
+ VHOST_RDMA_IB_QPS_INIT, /**< Initialized */
+ VHOST_RDMA_IB_QPS_RTR, /**< Ready To Receive */
+ VHOST_RDMA_IB_QPS_RTS, /**< Ready To Send */
+ VHOST_RDMA_IB_QPS_SQD, /**< Send Queue Drain */
+ VHOST_RDMA_IB_QPS_SQE, /**< Send Queue Error */
+ VHOST_RDMA_IB_QPS_ERR /**< Error */
+};
+/** @} */
+
+/**
+ * @defgroup mtu_sizes IB MTU Sizes
+ *
+ * Encoded MTU values; ib_mtu_enum_to_int() converts them to byte counts.
+ * @{
+ */
+enum vhost_rdma_ib_mtu {
+ VHOST_RDMA_IB_MTU_256 = 1, /**< 256 bytes */
+ VHOST_RDMA_IB_MTU_512 = 2, /**< 512 bytes */
+ VHOST_RDMA_IB_MTU_1024 = 3, /**< 1024 bytes */
+ VHOST_RDMA_IB_MTU_2048 = 4, /**< 2048 bytes */
+ VHOST_RDMA_IB_MTU_4096 = 5 /**< 4096 bytes */
+};
+/** @} */
+
+/**
+ * @defgroup wc_status Work Completion Status Codes
+ * @{
+ */
+enum vhost_rdma_ib_wc_status {
+ VHOST_RDMA_IB_WC_SUCCESS,
+ VHOST_RDMA_IB_WC_LOC_LEN_ERR,
+ VHOST_RDMA_IB_WC_LOC_QP_OP_ERR,
+ VHOST_RDMA_IB_WC_LOC_PROT_ERR,
+ VHOST_RDMA_IB_WC_WR_FLUSH_ERR,
+ VHOST_RDMA_IB_WC_BAD_RESP_ERR,
+ VHOST_RDMA_IB_WC_LOC_ACCESS_ERR,
+ VHOST_RDMA_IB_WC_REM_INV_REQ_ERR,
+ VHOST_RDMA_IB_WC_REM_ACCESS_ERR,
+ VHOST_RDMA_IB_WC_REM_OP_ERR,
+ VHOST_RDMA_IB_WC_RETRY_EXC_ERR,
+ VHOST_RDMA_IB_WC_RNR_RETRY_EXC_ERR,
+ VHOST_RDMA_IB_WC_REM_ABORT_ERR,
+ VHOST_RDMA_IB_WC_FATAL_ERR,
+ VHOST_RDMA_IB_WC_RESP_TIMEOUT_ERR,
+ VHOST_RDMA_IB_WC_GENERAL_ERR
+};
+/** @} */
+
+/**
+ * @defgroup res_state Responder Resource States
+ * @{
+ */
+enum vhost_rdma_res_state {
+ VHOST_RDMA_RES_STATE_NEXT,
+ VHOST_RDMA_RES_STATE_NEW,
+ VHOST_RDMA_RES_STATE_REPLAY,
+};
+/** @} */
+
+/**
+ * @defgroup vhost_user_requests Vhost-user Message Types
+ *
+ * Request codes carried in struct vhost_user_rdma_msg. Codes 19 to 23 are
+ * not defined here, and VHOST_USER_MAX takes the value following
+ * VHOST_USER_SET_CONFIG (26).
+ * @{
+ */
+enum vhost_user_rdma_request {
+ VHOST_USER_NONE = 0,
+ VHOST_USER_GET_FEATURES = 1,
+ VHOST_USER_SET_FEATURES = 2,
+ VHOST_USER_SET_OWNER = 3,
+ VHOST_USER_RESET_OWNER = 4,
+ VHOST_USER_SET_MEM_TABLE = 5,
+ VHOST_USER_SET_LOG_BASE = 6,
+ VHOST_USER_SET_LOG_FD = 7,
+ VHOST_USER_SET_VRING_NUM = 8,
+ VHOST_USER_SET_VRING_ADDR = 9,
+ VHOST_USER_SET_VRING_BASE = 10,
+ VHOST_USER_GET_VRING_BASE = 11,
+ VHOST_USER_SET_VRING_KICK = 12,
+ VHOST_USER_SET_VRING_CALL = 13,
+ VHOST_USER_SET_VRING_ERR = 14,
+ VHOST_USER_GET_PROTOCOL_FEATURES = 15,
+ VHOST_USER_SET_PROTOCOL_FEATURES = 16,
+ VHOST_USER_GET_QUEUE_NUM = 17,
+ VHOST_USER_SET_VRING_ENABLE = 18,
+ /* 19..23 intentionally absent from this list */
+ VHOST_USER_GET_CONFIG = 24,
+ VHOST_USER_SET_CONFIG = 25,
+ VHOST_USER_MAX /* one past the highest defined request */
+};
+/** @} */
+
+/**
+ * @brief QP capabilities structure
+ */
+struct vhost_rdma_qp_cap {
+ uint32_t max_send_wr; /**< Max work requests in send queue */
+ uint32_t max_send_sge; /**< Max scatter-gather elements per send WR */
+ uint32_t max_recv_wr; /**< Max work requests in receive queue */
+ uint32_t max_recv_sge; /**< Max SGEs per receive WR */
+ uint32_t max_inline_data; /**< Max inline data size supported */
+};
+
+/**
+ * @brief Global route attributes (used in AH/GRH)
+ */
+struct vhost_rdma_global_route {
+ uint8_t dgid[16]; /**< Destination GID or MGID */
+ uint32_t flow_label; /**< IPv6-style flow label */
+ uint8_t sgid_index; /**< Source GID table index */
+ uint8_t hop_limit; /**< TTL/Hop Limit */
+ uint8_t traffic_class; /**< Traffic class field */
+};
+
+/**
+ * @brief Address Handle (AH) attributes
+ */
+struct vhost_rdma_ah_attr {
+ struct vhost_rdma_global_route grh; /**< GRH fields */
+ uint8_t sl; /**< Service Level */
+ uint8_t static_rate; /**< Static rate (encoding) */
+ uint8_t port_num; /**< Physical port number */
+ uint8_t ah_flags; /**< Flags (e.g., GRH present) */
+ uint8_t dmac[6]; /**< Destination MAC address (for RoCE) */
+} __rte_packed;
+
+/**
+ * @brief Queue Pair attributes
+ */
+struct vhost_rdma_qp_attr {
+ enum vhost_rdma_ib_qp_state qp_state; /**< Target QP state */
+ enum vhost_rdma_ib_qp_state cur_qp_state; /**< Current QP state */
+ enum vhost_rdma_ib_mtu path_mtu; /**< Path MTU */
+ uint32_t qkey; /**< QKey for UD/RC */
+ uint32_t rq_psn; /**< Receive PSN */
+ uint32_t sq_psn; /**< Send PSN */
+ uint32_t dest_qp_num; /**< Remote QPN */
+ uint32_t qp_access_flags; /**< Access permissions */
+ uint8_t sq_draining; /**< Is SQ draining? */
+ uint8_t max_rd_atomic; /**< Max outstanding RDMA reads/atomics */
+ uint8_t max_dest_rd_atomic; /**< Max at responder side */
+ uint8_t min_rnr_timer; /**< Minimum RNR NAK timer value */
+ uint8_t timeout; /**< Timeout exponent for ACKs */
+ uint8_t retry_cnt; /**< Retry counter limit */
+ uint8_t rnr_retry; /**< RNR retry count */
+ uint32_t rate_limit; /**< Rate limit (Mb/s) */
+ struct vhost_rdma_qp_cap cap; /**< QP capacity limits */
+ struct vhost_rdma_ah_attr ah_attr; /**< AH attributes for RC/UC */
+};
+
+/**
+ * @brief Protection Domain (PD)
+ */
+struct vhost_rdma_pd {
+ struct vhost_rdma_device *dev; /**< Backing device */
+ uint32_t pdn; /**< PD identifier */
+ rte_atomic32_t refcnt; /**< Reference count */
+};
+
+/**
+ * @brief Generic queue abstraction (used for SQ/RQ)
+ */
+struct vhost_rdma_queue {
+ struct vhost_user_queue *vq; /**< Associated vhost vring */
+ void *data; /**< Ring buffer base pointer */
+ size_t elem_size; /**< Size of each element */
+ size_t num_elems; /**< Number of elements */
+ uint16_t consumer_index; /**< Consumer index (local) */
+ uint16_t producer_index; /**< Producer index (from guest) */
+
+ struct rte_intr_handle intr_handle; /**< Interrupt handler */
+ rte_intr_callback_fn cb; /**< Optional callback on kick */
+};
+
+/**
+ * @brief Padded memory region layout (fixed-size vhost_memory)
+ */
+struct vhost_memory_padded {
+ uint32_t nregions; /**< Number of valid regions */
+ uint32_t padding; /**< Alignment padding */
+ struct vhost_memory_region regions[VHOST_USER_MEMORY_MAX_NREGIONS];
+};
+
+/**
+ * @brief Configuration access payload
+ */
+struct vhost_user_rdma_config {
+ uint32_t offset; /**< Offset in config space */
+ uint32_t size; /**< Data size */
+ uint32_t flags; /**< Reserved/flags */
+ uint8_t region[VHOST_USER_MAX_CONFIG_SIZE]; /**< Config data */
+};
+
+/**
+ * @brief Vhost-user RDMA message structure
+ */
+struct vhost_user_rdma_msg {
+ enum vhost_user_rdma_request request;
+
+#define VHOST_USER_VERSION_MASK 0x3
+#define VHOST_USER_REPLY_MASK (0x1 << 2)
+ uint32_t flags; /**< Version and reply flag */
+ uint32_t size; /**< Payload size */
+
+ union {
+#define VHOST_USER_VRING_IDX_MASK 0xff
+#define VHOST_USER_VRING_NOFD_MASK (0x1 << 8)
+ uint64_t u64;
+ struct vhost_vring_state state;
+ struct vhost_vring_addr addr;
+ struct vhost_memory_padded memory;
+ struct vhost_user_rdma_config cfg;
+ } payload;
+} __rte_packed;
+
+/**
+ * @brief Completion Queue (CQ)
+ *
+ * NOTE(review): @c vq is typed as `struct vhost_queue`, which this header
+ * only forward-declares, while struct vhost_rdma_queue and
+ * setup_iovs_from_descs() use `struct vhost_user_queue` - confirm which type
+ * is intended.
+ */
+struct vhost_rdma_cq {
+ struct vhost_queue *vq; /**< Notification V-ring */
+ rte_spinlock_t cq_lock; /**< Protect CQ operations */
+ uint8_t notify; /**< Notify pending flag */
+ bool is_dying; /**< Being destroyed */
+
+ uint32_t cqn; /**< CQ identifier */
+ rte_atomic32_t refcnt; /**< Reference count */
+};
+
+/**
+ * @brief Send Queue (SQ) container
+ */
+struct vhost_rdma_sq {
+ rte_spinlock_t lock; /**< Guard SQ access */
+ struct vhost_rdma_queue queue; /**< Underlying ring */
+};
+
+/**
+ * @brief Receive Queue (RQ) container
+ */
+struct vhost_rdma_rq {
+ rte_spinlock_t lock; /**< Guard RQ access */
+ struct vhost_rdma_queue queue; /**< Underlying ring */
+};
+
+/**
+ * @brief Address Vector (AV) - cached routing info
+ */
+struct vhost_rdma_av {
+ uint8_t network_type; /**< e.g., IPv4/IPv6/Ethernet */
+ uint8_t dmac[6]; /**< Destination MAC */
+ struct vhost_rdma_global_route grh; /**< GRH fields */
+
+ union {
+ struct sockaddr_in _sockaddr_in;
+ struct sockaddr_in6 _sockaddr_in6;
+ } sgid_addr, dgid_addr; /**< GID resolution cache (optional) */
+};
+
+/**
+ * @brief Lightweight task abstraction with scheduling support
+ */
+struct vhost_rdma_task {
+ char name[8]; /**< Task name (debug) */
+ int state; /**< Execution state */
+ bool destroyed; /**< Marked for cleanup */
+ rte_atomic16_t sched; /**< Scheduled flag */
+ rte_spinlock_t state_lock; /**< Lock for state transitions */
+ struct rte_ring *task_ring; /**< Work submission ring */
+
+ int (*func)(void *arg); /**< Task function */
+ void *arg; /**< Argument to func */
+ int ret; /**< Return code */
+};
+
+/**
+ * @brief Requester-side operation tracking
+ */
+struct vhost_rdma_req_info {
+ enum vhost_rdma_ib_qp_state state;
+ int wqe_index; /**< Current WQE index */
+ uint32_t psn; /**< Packet Sequence Number */
+ int opcode; /**< Operation type */
+ rte_atomic32_t rd_atomic; /**< Outstanding RDMA read/atomic count */
+ int wait_fence; /**< Fence required */
+ int need_rd_atomic; /**< Need atomic resource */
+ int wait_psn; /**< Waiting for PSN gap */
+ int need_retry; /**< Should retry */
+ int noack_pkts; /**< Packets sent without ACK */
+ struct vhost_rdma_task task; /**< Retransmission task */
+};
+
+/**
+ * @brief Completer-side retry and retransmit context
+ */
+struct vhost_rdma_comp_info {
+ uint32_t psn; /**< Last packet PSN */
+ int opcode;
+ int timeout; /**< Timeout occurred */
+ int timeout_retry;
+ int started_retry;
+ uint32_t retry_cnt;
+ uint32_t rnr_retry;
+ struct vhost_rdma_task task; /**< RNR/retry handling task */
+};
+
+/**
+ * @brief Scatter-Gather Element (SGE)
+ */
+struct vhost_rdma_sge {
+ __le64 addr; /**< Guest virtual address */
+ __le32 length; /**< Length in bytes */
+ __le32 lkey; /**< Local key */
+};
+
+/**
+ * @brief DMA transfer context
+ */
+struct vhost_rdma_dma_info {
+ uint32_t length; /**< Total transfer length */
+ uint32_t resid; /**< Remaining bytes */
+ uint32_t cur_sge; /**< Current SGE index */
+ uint32_t num_sge; /**< Total SGE count */
+ uint32_t sge_offset; /**< Offset within current SGE */
+ uint32_t reserved;
+ union {
+ uint8_t *inline_data; /**< Inline data pointer */
+ struct vhost_rdma_sge *sge; /**< SGE array */
+ void *raw; /**< Generic pointer */
+ };
+};
+
+/**
+ * @brief Receive Work Queue Entry (WQE)
+ */
+struct vhost_rdma_recv_wqe {
+ __aligned_u64 wr_id; /**< User-defined WR ID */
+ __u32 num_sge;
+ __u32 padding;
+ struct vhost_rdma_dma_info dma; /**< DMA context */
+};
+
+/**
+ * @brief Memory Region (MR) types
+ */
+enum vhost_rdma_mr_type {
+ VHOST_MR_TYPE_NONE,
+ VHOST_MR_TYPE_DMA,
+ VHOST_MR_TYPE_MR,
+};
+
+/**
+ * @brief MR lifecycle states
+ */
+enum vhost_rdma_mr_state {
+ VHOST_MR_STATE_ZOMBIE,
+ VHOST_MR_STATE_INVALID,
+ VHOST_MR_STATE_FREE,
+ VHOST_MR_STATE_VALID,
+};
+
+/**
+ * @brief Memory Region (MR) object
+ */
+struct vhost_rdma_mr {
+ struct vhost_rdma_pd *pd; /**< Owning PD */
+ enum vhost_rdma_mr_type type; /**< Type of MR */
+ enum vhost_rdma_mr_state state; /**< State machine */
+ uint64_t va; /**< Virtual address (host VA) */
+ uint64_t iova; /**< IOVA / virtual address in guest */
+ size_t length; /**< Length of mapping */
+ uint32_t offset; /**< Offset in page array */
+ int access; /**< Access flags (e.g., LOCAL_WRITE) */
+
+ uint32_t lkey; /**< Local key */
+ uint32_t rkey; /**< Remote key */
+
+ uint32_t npages; /**< Number of mapped pages */
+ uint32_t max_pages; /**< Allocated page array size */
+ uint64_t *pages; /**< Array of page addresses */
+
+ uint32_t mrn; /**< MR identifier */
+ rte_atomic32_t refcnt; /**< Reference counter */
+};
+
+/**
+ * @brief Responder resource (used for replay and ACK handling)
+ */
+struct vhost_rdma_resp_res {
+ int type; /**< Resource type */
+ int replay; /**< Is this a replay? */
+ uint32_t first_psn;
+ uint32_t last_psn;
+ uint32_t cur_psn;
+ enum vhost_rdma_res_state state;
+
+ union {
+ struct {
+ struct rte_mbuf *mbuf; /**< Packet buffer */
+ } atomic;
+ struct {
+ struct vhost_rdma_mr *mr;
+ uint64_t va_org; /**< Original VA */
+ uint32_t rkey;
+ uint32_t length;
+ uint64_t va; /**< Current VA */
+ uint32_t resid; /**< Residual length */
+ } read;
+ };
+};
+
+/**
+ * @brief Response processing context (responder side)
+ */
+struct vhost_rdma_resp_info {
+ enum vhost_rdma_ib_qp_state state;
+ uint32_t msn; /**< Message sequence number */
+ uint32_t psn; /**< Current PSN */
+ uint32_t ack_psn; /**< Acknowledged PSN */
+ int opcode;
+ int drop_msg; /**< Drop current message */
+ int goto_error; /**< Transition to error state */
+ int sent_psn_nak; /**< Has sent NAK */
+ enum vhost_rdma_ib_wc_status status;
+ uint8_t aeth_syndrome; /**< AETH error code */
+
+ /* Receive path only */
+ struct vhost_rdma_recv_wqe *wqe;
+
+ /* RDMA read / atomic operations */
+ uint64_t va;
+ uint64_t offset;
+ struct vhost_rdma_mr *mr;
+ uint32_t resid;
+ uint32_t rkey;
+ uint32_t length;
+ uint64_t atomic_orig;
+
+ /* Circular buffer of responder resources */
+ struct vhost_rdma_resp_res *resources;
+ unsigned int res_head;
+ unsigned int res_tail;
+ struct vhost_rdma_resp_res *res;
+
+ struct vhost_rdma_task task; /**< Timeout/retry task */
+};
+
+/**
+ * @brief Queue Pair (QP)
+ */
+struct vhost_rdma_qp {
+ struct vhost_rdma_device *dev; /**< Parent device */
+ struct vhost_rdma_qp_attr attr; /**< Current attributes */
+ uint32_t qpn; /**< Queue Pair Number */
+ uint8_t type; /**< QP type (RC/UC/UD) */
+ unsigned int valid; /**< Is QP active? */
+ unsigned int mtu; /**< Effective MTU in bytes */
+
+ struct vhost_rdma_pd *pd; /**< Owning PD */
+ struct vhost_rdma_cq *scq; /**< Send CQ */
+ struct vhost_rdma_cq *rcq; /**< Receive CQ */
+
+ uint8_t sq_sig_all; /**< Every send WQE signals completion */
+
+ struct vhost_rdma_sq sq; /**< Send Queue */
+ struct vhost_rdma_rq rq; /**< Receive Queue */
+ void *srq; /**< Shared Receive Queue (reserved) */
+
+ uint32_t dst_cookie; /**< Cookie from destination */
+ uint16_t src_port; /**< Source UDP port (RoCE) */
+
+ struct vhost_rdma_av av; /**< Cached path information */
+
+ struct rte_ring *req_pkts; /**< Request packets ring (from guest) */
+ struct rte_mbuf *req_pkts_head; /**< Head for peeking packets */
+ struct rte_ring *resp_pkts; /**< Response packets ring (to guest) */
+
+ struct vhost_rdma_req_info req; /**< Requester context */
+ struct vhost_rdma_comp_info comp; /**< Completer context */
+ struct vhost_rdma_resp_info resp; /**< Responder context */
+
+ rte_atomic32_t ssn; /**< Send Sequence Number */
+ rte_atomic32_t mbuf_out; /**< Number of mbufs in flight */
+ int need_req_mbuf; /**< Need more mbufs for requests */
+
+ /* Retransmission timer (RC only) */
+ struct rte_timer retrans_timer;
+ uint64_t qp_timeout_ticks;
+
+ /* RNR NAK handling timer */
+ struct rte_timer rnr_nak_timer;
+
+ rte_spinlock_t state_lock; /**< Protect state changes */
+ rte_atomic32_t refcnt; /**< Reference count */
+};
+
+/**
+ * @brief User-space SGE (control path)
+ */
+struct vhost_user_rdma_sge {
+ uint64_t addr; /**< Host/user virtual address */
+ uint32_t length;
+ uint32_t lkey;
+};
+
+/**
+ * @brief Header at the start of every control-virtqueue request.
+ *
+ * vhost_rdma_handle_ctrl_vq() expects the first device-readable buffer of a
+ * request to be exactly sizeof(struct vhost_rdma_ctrl_hdr) bytes and uses
+ * @c cmd as the index into its command table.
+ */
+struct vhost_rdma_ctrl_hdr {
+ uint8_t cmd; /**< Command identifier */
+};
+
+/**
+ * @brief Translate an encoded IB MTU into its size in bytes.
+ *
+ * The encodings 1..5 stand for 256, 512, 1024, 2048 and 4096 bytes, i.e.
+ * each step doubles the size, so the result is 256 shifted left by the
+ * distance from the smallest encoding.
+ *
+ * @param mtu The MTU enum value
+ * @return Size in bytes, or -1 when 'mtu' is not one of the defined values
+ */
+static inline int
+ib_mtu_enum_to_int(enum vhost_rdma_ib_mtu mtu)
+{
+	if (mtu < VHOST_RDMA_IB_MTU_256 || mtu > VHOST_RDMA_IB_MTU_4096)
+		return -1;
+
+	return 256 << (mtu - VHOST_RDMA_IB_MTU_256);
+}
+
+/* Function declarations */
+
+/**
+ * @brief Initialize RDMA device's IB attributes and resource pools
+ * @param dev RDMA device instance
+ */
+void vhost_rdma_init_ib(struct vhost_rdma_device *dev);
+
+/**
+ * @brief Destroy all IB resources and release memory pools
+ * @param dev RDMA device instance
+ */
+void vhost_rdma_destroy_ib(struct vhost_rdma_device *dev);
+
+/**
+ * @brief Handle control virtqueue messages (device configuration)
+ * @param arg Pointer to device or thread context
+ */
+void vhost_rdma_handle_ctrl_vq(void *arg);
+
+/**
+ * @brief Main scheduler loop for RDMA tasks (retries, timeouts)
+ * @param arg Device context
+ * @return 0 on exit
+ */
+int vhost_rdma_task_scheduler(void *arg);
+
+/**
+ * @brief Cleanup callback for MR pool objects
+ * @param arg Pointer to struct vhost_rdma_mr
+ */
+void vhost_rdma_mr_cleanup(void *arg);
+
+/**
+ * @brief Cleanup callback for QP pool objects
+ * @param arg Pointer to struct vhost_rdma_qp
+ */
+void vhost_rdma_qp_cleanup(void *arg);
+
+/**
+ * @brief Clean up a vhost_rdma_queue (drain rings, unregister interrupts)
+ * @param qp Owning QP
+ * @param queue Queue to clean
+ */
+void vhost_rdma_queue_cleanup(struct vhost_rdma_qp *qp, struct vhost_rdma_queue *queue);
+
+/**
+ * @brief Release one RDMA read/atomic responder resource
+ * @param qp QP owning the resource
+ * @param res Resource to free
+ */
+void free_rd_atomic_resource(struct vhost_rdma_qp *qp, struct vhost_rdma_resp_res *res);
+
+/**
+ * @brief Release all RDMA read/atomic responder resources
+ * @param qp QP whose resources to free
+ */
+void free_rd_atomic_resources(struct vhost_rdma_qp *qp);
+
+/**
+ * @brief Build an iovec array from the descriptor chain of one request
+ * @param mem Vhost memory table used for address translation
+ * @param vq Virtqueue that owns the descriptor ring
+ * @param req_idx Index of the first descriptor of the chain
+ * @param iovs Array to fill
+ * @param num_iovs Capacity of @p iovs
+ * @param num_in Output: number of device-writable descriptors
+ * @param num_out Output: number of device-readable descriptors
+ * @return Number of filled iovecs on success, -1 on error
+ */
+int setup_iovs_from_descs(struct rte_vhost_memory *mem,
+ struct vhost_user_queue *vq,
+ uint16_t req_idx,
+ struct iovec *iovs,
+ uint16_t num_iovs,
+ uint16_t *num_in,
+ uint16_t *num_out);
+
+#endif /* __VHOST_RDMA_IB_H__ */
\ No newline at end of file
diff --git a/examples/vhost_user_rdma/vhost_rdma_log.h b/examples/vhost_user_rdma/vhost_rdma_log.h
new file mode 100644
index 0000000000..dfb4d1adae
--- /dev/null
+++ b/examples/vhost_user_rdma/vhost_rdma_log.h
@@ -0,0 +1,52 @@
+/*
+ * Vhost-user RDMA device : init and packets forwarding
+ *
+ * Copyright (C) 2025 KylinSoft Inc. and/or its affiliates. All rights reserved.
+ *
+ * Author: Xiong Weimin <xiongweimin@kylinos.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+#ifndef __VHOST_RDMA_LOG_H__
+#define __VHOST_RDMA_LOG_H__
+
+#include <rte_lcore.h>
+#include <rte_log.h>
+
+/* Map the application's log categories onto DPDK's user log types. */
+#define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER2
+#define RTE_LOGTYPE_ETHER RTE_LOGTYPE_USER3
+#define RTE_LOGTYPE_RDMA RTE_LOGTYPE_USER1
+
+/*
+ * Every macro below appends "\n" to the format string, so callers must not
+ * end their messages with a newline of their own.
+ */
+
+/* ETHER category, data-path variants (built on RTE_LOG_DP). */
+#define LOG_DEBUG_DP(f, ...) RTE_LOG_DP(DEBUG, ETHER, f "\n", ##__VA_ARGS__)
+#define LOG_INFO_DP(f, ...) RTE_LOG_DP(INFO, ETHER, f "\n", ##__VA_ARGS__)
+#define LOG_WARN_DP(f, ...) RTE_LOG_DP(WARNING, ETHER, f "\n", ##__VA_ARGS__)
+#define LOG_ERR_DP(f, ...) RTE_LOG_DP(ERR, ETHER, f "\n", ##__VA_ARGS__)
+
+/* ETHER category, control-path variants (built on RTE_LOG). */
+#define LOG_DEBUG(f, ...) RTE_LOG(DEBUG, ETHER, f "\n", ##__VA_ARGS__)
+#define LOG_INFO(f, ...) RTE_LOG(INFO, ETHER, f "\n", ##__VA_ARGS__)
+#define LOG_WARN(f, ...) RTE_LOG(WARNING, ETHER, f "\n", ##__VA_ARGS__)
+#define LOG_ERR(f, ...) RTE_LOG(ERR, ETHER, f "\n", ##__VA_ARGS__)
+
+/* RDMA category; each message is prefixed with the calling function's name. */
+#define RDMA_LOG_DEBUG(f, ...) RTE_LOG(DEBUG, RDMA, "[ %s ]: " f "\n", __func__, ##__VA_ARGS__)
+#define RDMA_LOG_INFO(f, ...) RTE_LOG(INFO, RDMA, "[ %s ]: " f "\n", __func__, ##__VA_ARGS__)
+#define RDMA_LOG_ERR(f, ...) RTE_LOG(ERR, RDMA, "[ %s ]: " f "\n", __func__, ##__VA_ARGS__)
+
+/*
+ * RDMA category, data-path variants; each message is prefixed with the
+ * calling lcore id. With DEBUG_RDMA_DP defined they expand to RTE_LOG,
+ * otherwise to RTE_LOG_DP; the two branches are otherwise identical.
+ */
+#ifdef DEBUG_RDMA_DP
+#define RDMA_LOG_DEBUG_DP(f, ...) RTE_LOG(DEBUG, RDMA, "[%u] " f "\n", \
+ rte_lcore_id(), ##__VA_ARGS__)
+#define RDMA_LOG_INFO_DP(f, ...) RTE_LOG(INFO, RDMA, "[%u] " f "\n", \
+ rte_lcore_id(), ##__VA_ARGS__)
+#define RDMA_LOG_ERR_DP(f, ...) RTE_LOG(ERR, RDMA, "[%u] " f "\n", \
+ rte_lcore_id(), ##__VA_ARGS__)
+#else
+#define RDMA_LOG_DEBUG_DP(f, ...) RTE_LOG_DP(DEBUG, RDMA, "[%u] " f "\n", \
+ rte_lcore_id(), ##__VA_ARGS__)
+#define RDMA_LOG_INFO_DP(f, ...) RTE_LOG_DP(INFO, RDMA, "[%u] " f "\n", \
+ rte_lcore_id(), ##__VA_ARGS__)
+#define RDMA_LOG_ERR_DP(f, ...) RTE_LOG_DP(ERR, RDMA, "[%u] " f "\n", \
+ rte_lcore_id(), ##__VA_ARGS__)
+#endif
+
+#endif
\ No newline at end of file
diff --git a/examples/vhost_user_rdma/vhost_rdma_pkt.h b/examples/vhost_user_rdma/vhost_rdma_pkt.h
new file mode 100644
index 0000000000..2bbc030e0a
--- /dev/null
+++ b/examples/vhost_user_rdma/vhost_rdma_pkt.h
@@ -0,0 +1,296 @@
+/**
+ * @file vhost_rdma_pkt.h
+ * @brief Vhost-user RDMA packet format and opcode definitions.
+ *
+ * This header defines the internal packet representation, InfiniBand/RoCE header layout,
+ * opcode mapping, and control flags used during packet parsing and transmission
+ * in the vhost-user RDMA backend.
+ *
+ * Copyright (C) 2025 KylinSoft Inc. and/or its affiliates. All rights reserved.
+ *
+ * Author: Xiong Weimin <xiongweimin@kylinos.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __VHOST_RDMA_PKT_H__
+#define __VHOST_RDMA_PKT_H__
+
+#include <stdint.h>
+#include <stddef.h>
+
+#include <rte_byteorder.h>
+#include <rte_mbuf.h> /* For struct rte_mbuf if needed later */
+
+/* Forward declarations */
+struct vhost_rdma_dev;
+struct vhost_rdma_qp;
+struct vhost_rdma_send_wqe;
+
+#ifndef BIT
+#define BIT(x) (1U << (x)) /**< Generate bitmask from bit index */
+#endif
+
+/**
+ * @defgroup constants Constants & Limits
+ * @{
+ */
+
+/** Maximum number of QP types supported for WR mask dispatching */
+#define WR_MAX_QPT 8
+
+/** Invalid opcode marker */
+#define OPCODE_NONE (-1)
+
+/** Total number of defined opcodes (must be power-of-2 >= 256) */
+#define VHOST_NUM_OPCODE 256
+
+/** @} */
+
+/**
+ * @defgroup wr_masks Work Request Type Masks
+ * @{
+ */
+enum vhost_rdma_wr_mask {
+ WR_INLINE_MASK = BIT(0), /**< WR contains inline data */
+ WR_ATOMIC_MASK = BIT(1), /**< WR is an atomic operation */
+ WR_SEND_MASK = BIT(2), /**< WR is a send-type operation */
+ WR_READ_MASK = BIT(3), /**< WR initiates RDMA read */
+ WR_WRITE_MASK = BIT(4), /**< WR performs RDMA write */
+ WR_LOCAL_OP_MASK = BIT(5), /**< WR triggers local memory op */
+
+ WR_READ_OR_WRITE_MASK = WR_READ_MASK | WR_WRITE_MASK,
+ WR_READ_WRITE_OR_SEND_MASK = WR_READ_OR_WRITE_MASK | WR_SEND_MASK,
+ WR_WRITE_OR_SEND_MASK = WR_WRITE_MASK | WR_SEND_MASK,
+ WR_ATOMIC_OR_READ_MASK = WR_ATOMIC_MASK | WR_READ_MASK,
+};
+/** @} */
+
+/**
+ * @brief Metadata about each Work Request (WR) opcode
+ *
+ * Used to determine which operations are valid per QP type.
+ */
+struct vhost_rdma_wr_opcode_info {
+ const char *name; /**< Human-readable name */
+ enum vhost_rdma_wr_mask mask[WR_MAX_QPT]; /**< Validity per QP type */
+};
+
+/* Extern declaration of global opcode metadata table */
+extern struct vhost_rdma_wr_opcode_info vhost_rdma_wr_opcode_info[];
+
+/**
+ * @defgroup hdr_types Header Types (for offset tracking)
+ * @{
+ */
+enum vhost_rdma_hdr_type {
+ VHOST_RDMA_LRH, /**< Local Route Header (InfiniBand link layer only) */
+ VHOST_RDMA_GRH, /**< Global Route Header (IPv6-style GIDs) */
+ VHOST_RDMA_BTH, /**< Base Transport Header */
+ VHOST_RDMA_RETH, /**< RDMA Extended Transport Header */
+ VHOST_RDMA_AETH, /**< ACK Extended Transport Header */
+ VHOST_RDMA_ATMETH, /**< Atomic Extended Transport Header (atomic request) */
+ VHOST_RDMA_ATMACK, /**< Atomic Acknowledge Extended Transport Header (atomic response) */
+ VHOST_RDMA_IETH, /**< Invalidate Extended Transport Header */
+ VHOST_RDMA_RDETH, /**< Reliable Datagram Extended Transport Header */
+ VHOST_RDMA_DETH, /**< Datagram Extended Transport Header */
+ VHOST_RDMA_IMMDT, /**< Immediate Data Extended Transport Header */
+ VHOST_RDMA_PAYLOAD, /**< Payload section */
+ NUM_HDR_TYPES /**< Number of known header types; also the first bit index used for semantic flags */
+};
+/** @} */
+
+/**
+ * @defgroup hdr_masks Header Presence and Semantic Flags
+ * @{
+ */
+enum vhost_rdma_hdr_mask {
+ VHOST_LRH_MASK = BIT(VHOST_RDMA_LRH),
+ VHOST_GRH_MASK = BIT(VHOST_RDMA_GRH),
+ VHOST_BTH_MASK = BIT(VHOST_RDMA_BTH),
+ VHOST_IMMDT_MASK = BIT(VHOST_RDMA_IMMDT),
+ VHOST_RETH_MASK = BIT(VHOST_RDMA_RETH),
+ VHOST_AETH_MASK = BIT(VHOST_RDMA_AETH),
+ VHOST_ATMETH_MASK = BIT(VHOST_RDMA_ATMETH),
+ VHOST_ATMACK_MASK = BIT(VHOST_RDMA_ATMACK),
+ VHOST_IETH_MASK = BIT(VHOST_RDMA_IETH),
+ VHOST_RDETH_MASK = BIT(VHOST_RDMA_RDETH),
+ VHOST_DETH_MASK = BIT(VHOST_RDMA_DETH),
+ VHOST_PAYLOAD_MASK = BIT(VHOST_RDMA_PAYLOAD),
+
+ /* Semantic packet type flags (bits start right after the header-presence bits) */
+ VHOST_REQ_MASK = BIT(NUM_HDR_TYPES + 0), /**< Request packet */
+ VHOST_ACK_MASK = BIT(NUM_HDR_TYPES + 1), /**< ACK/NACK packet */
+ VHOST_SEND_MASK = BIT(NUM_HDR_TYPES + 2), /**< Send operation */
+ VHOST_WRITE_MASK = BIT(NUM_HDR_TYPES + 3), /**< RDMA Write */
+ VHOST_READ_MASK = BIT(NUM_HDR_TYPES + 4), /**< RDMA Read */
+ VHOST_ATOMIC_MASK = BIT(NUM_HDR_TYPES + 5), /**< Atomic operation */
+
+ /* Per-opcode processing flags (these two are not fragmentation flags) */
+ VHOST_RWR_MASK = BIT(NUM_HDR_TYPES + 6), /**< NOTE(review): presumably "receive WQE required", cf. RXE_RWR_MASK in Linux rxe; confirm */
+ VHOST_COMP_MASK = BIT(NUM_HDR_TYPES + 7), /**< Completion required */
+ /* Position of the packet within a multi-packet message */
+ VHOST_START_MASK = BIT(NUM_HDR_TYPES + 8), /**< First fragment */
+ VHOST_MIDDLE_MASK = BIT(NUM_HDR_TYPES + 9), /**< Middle fragment */
+ VHOST_END_MASK = BIT(NUM_HDR_TYPES + 10), /**< Last fragment */
+
+ VHOST_LOOPBACK_MASK = BIT(NUM_HDR_TYPES + 12), /**< Loopback within host; bit NUM_HDR_TYPES + 11 is left unassigned */
+
+ /* Composite masks */
+ VHOST_READ_OR_ATOMIC = (VHOST_READ_MASK | VHOST_ATOMIC_MASK),
+ VHOST_WRITE_OR_SEND = (VHOST_WRITE_MASK | VHOST_SEND_MASK),
+};
+/** @} */
+
+/**
+ * @brief Per-opcode metadata for parsing and validation
+ */
+struct vhost_rdma_opcode_info {
+ const char *name; /**< Opcode name (e.g., "RC SEND_FIRST") */
+ int length; /**< Fixed payload length (if any) */
+ int offset[NUM_HDR_TYPES]; /**< Offset of each header within packet */
+ enum vhost_rdma_hdr_mask mask; /**< Header presence and semantic flags */
+};
+
+/* Global opcode info table (indexed by IB opcode byte) */
+extern struct vhost_rdma_opcode_info vhost_rdma_opcode[VHOST_NUM_OPCODE];
+
+/**
+ * @brief Helper macro to define IB opcodes by transport and operation
+ *
+ * Expands to e.g.: `IB_OPCODE_RC_SEND_FIRST = IB_OPCODE_RC + IB_OPCODE_SEND_FIRST`
+ */
+#define IB_OPCODE(transport, op) \
+ IB_OPCODE_ ## transport ## _ ## op = \
+ (IB_OPCODE_ ## transport + IB_OPCODE_ ## op)
+
+/**
+ * @defgroup ib_opcodes InfiniBand OpCode Definitions
+ *
+ * Based on IBTA Vol 1 Table 38 and extended for RoCE semantics.
+ * @{
+ */
+
+enum {
+ /* Transport types (base values) */
+ IB_OPCODE_RC = 0x00, /**< Reliable Connection */
+ IB_OPCODE_UC = 0x20, /**< Unreliable Connection */
+ IB_OPCODE_RD = 0x40, /**< Reliable Datagram */
+ IB_OPCODE_UD = 0x60, /**< Unreliable Datagram */
+ IB_OPCODE_CNP = 0x80, /**< Congestion Notification Packet */
+ IB_OPCODE_MSP = 0xe0, /**< Manufacturer Specific Protocol */
+
+ /* Operation subtypes */
+ IB_OPCODE_SEND_FIRST = 0x00,
+ IB_OPCODE_SEND_MIDDLE = 0x01,
+ IB_OPCODE_SEND_LAST = 0x02,
+ IB_OPCODE_SEND_LAST_WITH_IMMEDIATE = 0x03,
+ IB_OPCODE_SEND_ONLY = 0x04,
+ IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE = 0x05,
+ IB_OPCODE_RDMA_WRITE_FIRST = 0x06,
+ IB_OPCODE_RDMA_WRITE_MIDDLE = 0x07,
+ IB_OPCODE_RDMA_WRITE_LAST = 0x08,
+ IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE = 0x09,
+ IB_OPCODE_RDMA_WRITE_ONLY = 0x0a,
+ IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE = 0x0b,
+ IB_OPCODE_RDMA_READ_REQUEST = 0x0c,
+ IB_OPCODE_RDMA_READ_RESPONSE_FIRST = 0x0d,
+ IB_OPCODE_RDMA_READ_RESPONSE_MIDDLE = 0x0e,
+ IB_OPCODE_RDMA_READ_RESPONSE_LAST = 0x0f,
+ IB_OPCODE_RDMA_READ_RESPONSE_ONLY = 0x10,
+ IB_OPCODE_ACKNOWLEDGE = 0x11,
+ IB_OPCODE_ATOMIC_ACKNOWLEDGE = 0x12,
+ IB_OPCODE_COMPARE_SWAP = 0x13,
+ IB_OPCODE_FETCH_ADD = 0x14,
+ /* 0x15 is reserved */
+ IB_OPCODE_SEND_LAST_WITH_INVALIDATE = 0x16,
+ IB_OPCODE_SEND_ONLY_WITH_INVALIDATE = 0x17,
+
+ /* Real opcodes generated via IB_OPCODE() macro */
+ IB_OPCODE(RC, SEND_FIRST),
+ IB_OPCODE(RC, SEND_MIDDLE),
+ IB_OPCODE(RC, SEND_LAST),
+ IB_OPCODE(RC, SEND_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(RC, SEND_ONLY),
+ IB_OPCODE(RC, SEND_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(RC, RDMA_WRITE_FIRST),
+ IB_OPCODE(RC, RDMA_WRITE_MIDDLE),
+ IB_OPCODE(RC, RDMA_WRITE_LAST),
+ IB_OPCODE(RC, RDMA_WRITE_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(RC, RDMA_WRITE_ONLY),
+ IB_OPCODE(RC, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(RC, RDMA_READ_REQUEST),
+ IB_OPCODE(RC, RDMA_READ_RESPONSE_FIRST),
+ IB_OPCODE(RC, RDMA_READ_RESPONSE_MIDDLE),
+ IB_OPCODE(RC, RDMA_READ_RESPONSE_LAST),
+ IB_OPCODE(RC, RDMA_READ_RESPONSE_ONLY),
+ IB_OPCODE(RC, ACKNOWLEDGE),
+ IB_OPCODE(RC, ATOMIC_ACKNOWLEDGE),
+ IB_OPCODE(RC, COMPARE_SWAP),
+ IB_OPCODE(RC, FETCH_ADD),
+ IB_OPCODE(RC, SEND_LAST_WITH_INVALIDATE),
+ IB_OPCODE(RC, SEND_ONLY_WITH_INVALIDATE),
+
+ /* UC opcodes */
+ IB_OPCODE(UC, SEND_FIRST),
+ IB_OPCODE(UC, SEND_MIDDLE),
+ IB_OPCODE(UC, SEND_LAST),
+ IB_OPCODE(UC, SEND_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(UC, SEND_ONLY),
+ IB_OPCODE(UC, SEND_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(UC, RDMA_WRITE_FIRST),
+ IB_OPCODE(UC, RDMA_WRITE_MIDDLE),
+ IB_OPCODE(UC, RDMA_WRITE_LAST),
+ IB_OPCODE(UC, RDMA_WRITE_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(UC, RDMA_WRITE_ONLY),
+ IB_OPCODE(UC, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
+
+ /* RD opcodes */
+ IB_OPCODE(RD, SEND_FIRST),
+ IB_OPCODE(RD, SEND_MIDDLE),
+ IB_OPCODE(RD, SEND_LAST),
+ IB_OPCODE(RD, SEND_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(RD, SEND_ONLY),
+ IB_OPCODE(RD, SEND_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(RD, RDMA_WRITE_FIRST),
+ IB_OPCODE(RD, RDMA_WRITE_MIDDLE),
+ IB_OPCODE(RD, RDMA_WRITE_LAST),
+ IB_OPCODE(RD, RDMA_WRITE_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(RD, RDMA_WRITE_ONLY),
+ IB_OPCODE(RD, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(RD, RDMA_READ_REQUEST),
+ IB_OPCODE(RD, RDMA_READ_RESPONSE_FIRST),
+ IB_OPCODE(RD, RDMA_READ_RESPONSE_MIDDLE),
+ IB_OPCODE(RD, RDMA_READ_RESPONSE_LAST),
+ IB_OPCODE(RD, RDMA_READ_RESPONSE_ONLY),
+ IB_OPCODE(RD, ACKNOWLEDGE),
+ IB_OPCODE(RD, ATOMIC_ACKNOWLEDGE),
+ IB_OPCODE(RD, COMPARE_SWAP),
+ IB_OPCODE(RD, FETCH_ADD),
+
+ /* UD opcodes */
+ IB_OPCODE(UD, SEND_ONLY),
+ IB_OPCODE(UD, SEND_ONLY_WITH_IMMEDIATE)
+};
+/** @} */
+
+/**
+ * @brief Runtime packet context used during processing
+ */
+struct vhost_rdma_pkt_info {
+ struct vhost_rdma_dev *dev; /**< Owning device */
+ struct vhost_rdma_qp *qp; /**< Associated QP */
+ struct vhost_rdma_send_wqe *wqe; /**< Corresponding send WQE (if applicable) */
+ uint8_t *hdr; /**< Pointer to BTH (Base Transport Header) */
+ uint32_t mask; /**< Semantic flags (from vhost_rdma_hdr_mask) */
+ uint32_t psn; /**< Packet Sequence Number from BTH */
+ uint16_t pkey_index; /**< Partition key index */
+ uint16_t paylen; /**< Payload length (BTH to ICRC) */
+ uint8_t port_num; /**< Port this packet was received on */
+ uint8_t opcode; /**< BTH opcode field */
+};
+
+#endif /* __VHOST_RDMA_PKT_H__ */
\ No newline at end of file
--
2.43.0
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCH] hw/rdma: Implement vhost-user RDMA device with PCI support
2025-12-17 8:49 Implement initial driver for virtio-RDMA devices(kernel), virtio-rdma device model(qemu) and vhost-user-RDMA backend device(dpdk) Xiong Weimin
2025-12-17 8:49 ` [PATCH 01/14] examples/vhost_user_rdma: implement core application initialization for supporting vhost_user_rdma device Xiong Weimin
@ 2025-12-17 8:49 ` Xiong Weimin
2025-12-17 8:49 ` [PATCH 02/14] examples/vhost_user_rdma: implement device and port query commands Xiong Weimin
` (8 subsequent siblings)
10 siblings, 0 replies; 17+ messages in thread
From: Xiong Weimin @ 2025-12-17 8:49 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann, David S . Miller,
Jakub Kicinski, Jesper Dangaard Brouer, John Fastabend,
Stanislav Fomichev
Cc: linux-kernel, netdev, xiongweimin, qemu-devel
From: xiongweimin <xiongweimin@kylinos.cn>
This commit introduces a complete vhost-user RDMA device implementation
including PCI interface bindings. The implementation enables RDMA operations
through a vhost-user backend and provides a PCI device interface for guests.
Key components included:
1. PCI device binding layer:
- Automatic MSI-X vector allocation (queues + 1)
- Virtio 1.0 compliance (no legacy support)
- Standard Red Hat vendor/device IDs
- Bootindex property passthrough
- Transitional/non-transitional device variants
2. Core vhost-user RDMA device:
- Chardev-based backend communication
- Dynamic connection management with reconnect
- 256 virtqueues (512 entries each)
- Feature negotiation (VIRTIO_F_VERSION_1, INDIRECT_DESC, etc.)
- Config space handling with live updates
- VM state preservation for migration
- Graceful start/stop sequences
- Host notifier management
3. Key functionalities:
- Automatic backend connection management
- Config change notification handling
- Queue enablement on guest activity
- Error handling for backend disconnections
- Resource cleanup on device unrealize
The implementation follows virtio and vhost-user standards, providing
a foundation for RDMA virtualization using user-space backends.
CC: qemu-devel@nongnu.org
Signed-off-by: Xiong Weimin <xiongweimin@kylinos.cn>
Change-Id: I3299219282bc98800422e132298006ed1b3637da
---
hw/rdma/Kconfig | 5 +
hw/rdma/meson.build | 5 +
hw/rdma/vhost-user-rdma.c | 463 ++++++++++++++++++++
hw/virtio/meson.build | 1 +
hw/virtio/vhost-user-rdma-pci.c | 93 ++++
hw/virtio/vhost-user.c | 11 +
hw/virtio/vhost.c | 2 +
hw/virtio/virtio.c | 1 +
include/hw/pci/pci.h | 1 +
include/hw/virtio/vhost-user-rdma.h | 43 ++
include/hw/virtio/virtio.h | 2 +-
include/standard-headers/linux/virtio_ids.h | 1 +
include/standard-headers/rdma/virtio_rdma.h | 60 +++
13 files changed, 687 insertions(+), 1 deletion(-)
create mode 100644 hw/rdma/vhost-user-rdma.c
create mode 100644 hw/virtio/vhost-user-rdma-pci.c
create mode 100644 include/hw/virtio/vhost-user-rdma.h
create mode 100644 include/standard-headers/rdma/virtio_rdma.h
diff --git a/hw/rdma/Kconfig b/hw/rdma/Kconfig
index 840320bdc0..1cb7ee72ab 100644
--- a/hw/rdma/Kconfig
+++ b/hw/rdma/Kconfig
@@ -1,3 +1,8 @@
config VMW_PVRDMA
default y if PCI_DEVICES
depends on PVRDMA && MSI_NONBROKEN && VMXNET3_PCI
+
+config VHOST_USER_RDMA
+ bool
+ default y if VIRTIO_PCI
+ depends on VIRTIO && VHOST_USER && LINUX
diff --git a/hw/rdma/meson.build b/hw/rdma/meson.build
index 363c9b8c83..51c47b2d44 100644
--- a/hw/rdma/meson.build
+++ b/hw/rdma/meson.build
@@ -10,3 +10,8 @@ specific_ss.add(when: 'CONFIG_VMW_PVRDMA', if_true: files(
'vmw/pvrdma_dev_ring.c',
'vmw/pvrdma_main.c',
))
+
+
+specific_ss.add(when: 'CONFIG_VHOST_USER_RDMA', if_true: files(
+ 'vhost-user-rdma.c',
+))
diff --git a/hw/rdma/vhost-user-rdma.c b/hw/rdma/vhost-user-rdma.c
new file mode 100644
index 0000000000..e54b349ec4
--- /dev/null
+++ b/hw/rdma/vhost-user-rdma.c
@@ -0,0 +1,463 @@
+/*
+ * RDMA device interface
+ *
+ * Copyright (C) 2025 Kylinsoft
+ *
+ * Authors:
+ * Xiong Weimin <xiongweimin@kylinos.cn>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qemu/cutils.h"
+#include "hw/qdev-core.h"
+#include "hw/qdev-properties.h"
+#include "hw/virtio/vhost.h"
+#include "hw/virtio/vhost-user-rdma.h"
+#include "hw/virtio/virtio.h"
+#include "hw/virtio/virtio-bus.h"
+#include "hw/virtio/virtio-access.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/runstate.h"
+
+#define VHOST_USER_RDMA_NUM_QUEUES 256
+#define VHOST_USER_RDMA_QUEUE_SIZE 512
+
+static const int user_feature_bits[] = {
+ VIRTIO_F_VERSION_1,
+ VIRTIO_RING_F_INDIRECT_DESC,
+ VIRTIO_RING_F_EVENT_IDX,
+ VIRTIO_F_NOTIFY_ON_EMPTY,
+ VHOST_INVALID_FEATURE_BIT
+};
+
+static void vhost_user_rdma_event(void *opaque, QEMUChrEvent event);
+
+/*
+ * Start the vhost backend for @vdev.
+ *
+ * Enables host notifiers, binds guest notifiers for every vhost virtqueue,
+ * hands the guest-acked features to the vhost device, starts it and finally
+ * unmasks all virtqueues.  Each step is undone in reverse order on failure.
+ *
+ * Returns 0 (the vhost_dev_start() result) on success, a negative errno
+ * value otherwise.
+ */
+static int vhost_user_rdma_start(VirtIODevice *vdev)
+{
+    VHostUserRdma *rdma = VHOST_USER_RDMA(vdev);
+    BusState *bus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
+    VirtioBusClass *bus_class = VIRTIO_BUS_GET_CLASS(bus);
+    int rc;
+    int vq;
+
+    if (bus_class->set_guest_notifiers == NULL) {
+        error_report("binding does not support guest notifiers");
+        return -ENOSYS;
+    }
+
+    rc = vhost_dev_enable_notifiers(&rdma->dev, vdev);
+    if (rc < 0) {
+        error_report("Error enabling host notifiers: %d", -rc);
+        return rc;
+    }
+
+    rc = bus_class->set_guest_notifiers(bus->parent, rdma->dev.nvqs, true);
+    if (rc < 0) {
+        error_report("Error binding guest notifier: %d", -rc);
+        goto undo_host_notifiers;
+    }
+
+    /* The backend must see exactly what the guest driver acknowledged. */
+    rdma->dev.acked_features = vdev->guest_features;
+
+    rc = vhost_dev_start(&rdma->dev, vdev, true);
+    if (rc < 0) {
+        error_report("Error starting vhost: %d", -rc);
+        goto undo_guest_notifiers;
+    }
+    rdma->started_vu = true;
+
+    vq = 0;
+    while (vq < rdma->dev.nvqs) {
+        vhost_virtqueue_mask(&rdma->dev, vdev, vq, false);
+        vq++;
+    }
+
+    return rc;
+
+undo_guest_notifiers:
+    bus_class->set_guest_notifiers(bus->parent, rdma->dev.nvqs, false);
+undo_host_notifiers:
+    vhost_dev_disable_notifiers(&rdma->dev, vdev);
+    return rc;
+}
+
+/*
+ * Stop the vhost backend for @vdev if vhost_user_rdma_start() had started it.
+ *
+ * The started_vu flag is cleared before anything else.  Host notifiers are
+ * only disabled when unbinding the guest notifiers succeeded.
+ */
+static void vhost_user_rdma_stop(VirtIODevice *vdev)
+{
+    VHostUserRdma *rdma = VHOST_USER_RDMA(vdev);
+    BusState *bus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
+    VirtioBusClass *bus_class = VIRTIO_BUS_GET_CLASS(bus);
+    int rc;
+
+    if (!rdma->started_vu) {
+        return;
+    }
+    rdma->started_vu = false;
+
+    if (bus_class->set_guest_notifiers == NULL) {
+        return;
+    }
+
+    vhost_dev_stop(&rdma->dev, vdev, true);
+
+    rc = bus_class->set_guest_notifiers(bus->parent, rdma->dev.nvqs, false);
+    if (rc >= 0) {
+        vhost_dev_disable_notifiers(&rdma->dev, vdev);
+        return;
+    }
+
+    error_report("vhost guest notifier cleanup failed: %d", rc);
+}
+
+/*
+ * Config-change notifier installed through rdma_ops.
+ *
+ * Re-reads the device config from the backend into r->rdmacfg and, on
+ * success, raises a config-change notification towards the guest.
+ *
+ * Returns 0 on success, -1 if the config could not be fetched.
+ */
+static int vhost_user_rdma_handle_config_change(struct vhost_dev *dev)
+{
+    VHostUserRdma *r = VHOST_USER_RDMA(dev->vdev);
+    Error *local_err = NULL;
+    int ret;
+
+    ret = vhost_dev_get_config(dev, (uint8_t *)&r->rdmacfg,
+                               sizeof(struct virtio_rdma_config), &local_err);
+    if (ret < 0) {
+        /*
+         * The Error object is owned by us: report it (which also frees it)
+         * instead of dropping it on the floor.
+         */
+        if (local_err) {
+            error_reportf_err(local_err, "get config space failed: ");
+        } else {
+            error_report("get config space failed");
+        }
+        return -1;
+    }
+
+    virtio_notify_config(dev->vdev);
+    return 0;
+}
+
+const VhostDevConfigOps rdma_ops = {
+ .vhost_dev_config_notifier = vhost_user_rdma_handle_config_change,
+};
+
+/*
+ * Set up the vhost device after the chardev has been opened.
+ *
+ * Does nothing when the device is already marked connected.  Otherwise
+ * describes the virtqueues to the vhost layer, installs the config-change
+ * notifier, initialises the vhost-user backend and, if the guest driver had
+ * already started the device, restarts vhost.
+ *
+ * r->connected is only set once vhost_dev_init() has succeeded, so callers
+ * that test the flag (device realize does) can tell a failed initialisation
+ * from a successful one.
+ *
+ * Returns 0 on success, a negative errno value on failure.
+ */
+static int vhost_user_rdma_connect(DeviceState *dev)
+{
+    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+    VHostUserRdma *r = VHOST_USER_RDMA(vdev);
+    Error *local_err = NULL;
+    int ret;
+
+    info_report("vhost_user_rdma: vhost connect");
+
+    if (r->connected) {
+        return 0;
+    }
+
+    r->dev.nvqs = r->num_queues;
+    r->dev.vqs = r->vhost_vqs;
+    r->dev.vq_index = 0;
+    r->dev.backend_features = 0;
+
+    vhost_dev_set_config_notifier(&r->dev, &rdma_ops);
+
+    ret = vhost_dev_init(&r->dev, &r->vhost_user,
+                         VHOST_BACKEND_TYPE_USER, 0, &local_err);
+    if (ret < 0) {
+        /* Prefer the detailed message when the vhost layer supplied one. */
+        error_report("vhost-user-rdma: vhost initialization failed: %s",
+                     local_err ? error_get_pretty(local_err)
+                               : strerror(-ret));
+        error_free(local_err);
+        return ret;
+    }
+
+    /* From here on there is backend state for the disconnect path to undo. */
+    r->connected = true;
+
+    /* restore vhost state */
+    if (virtio_device_started(vdev, vdev->status)) {
+        info_report("vhost_user_rdma: vhost ss?");
+        ret = vhost_user_rdma_start(vdev);
+        if (ret < 0) {
+            error_report("vhost-user-rdma: vhost start failed: %s",
+                         strerror(-ret));
+            return ret;
+        }
+    }
+    info_report("vhost_user_rdma: vhost connect success");
+    return 0;
+}
+
+/*
+ * Tear down the vhost device after the backend connection went away.
+ * A no-op when the device is not marked connected.
+ */
+static void vhost_user_rdma_disconnect(DeviceState *dev)
+{
+    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+    VHostUserRdma *rdma = VHOST_USER_RDMA(vdev);
+
+    if (rdma->connected) {
+        rdma->connected = false;
+
+        vhost_user_rdma_stop(vdev);
+        vhost_dev_cleanup(&rdma->dev);
+    }
+}
+
+/*
+ * Bottom half scheduled from the CHR_EVENT_CLOSED handler: performs the
+ * deferred disconnect and then re-installs the chardev event handler that
+ * the CLOSED path had removed.
+ */
+static void vhost_user_rdma_chr_closed_bh(void *opaque)
+{
+    DeviceState *dev = opaque;
+    VHostUserRdma *rdma = VHOST_USER_RDMA(VIRTIO_DEVICE(dev));
+
+    vhost_user_rdma_disconnect(dev);
+
+    qemu_chr_fe_set_handlers(&rdma->chardev,
+                             NULL, NULL, vhost_user_rdma_event, NULL,
+                             opaque, NULL, true);
+}
+
+/*
+ * Chardev event callback.
+ *
+ * OPENED: try to bring up the vhost device; drop the connection on failure.
+ * CLOSED: clear dev.started.  While the VM is running the handlers are
+ *         removed and the actual disconnect is deferred to a bottom half;
+ *         when the VM is not running no teardown is scheduled here.
+ * Other events are ignored.
+ */
+static void vhost_user_rdma_event(void *opaque, QEMUChrEvent event)
+{
+    DeviceState *dev = opaque;
+    VHostUserRdma *rdma = VHOST_USER_RDMA(VIRTIO_DEVICE(dev));
+
+    switch (event) {
+    case CHR_EVENT_OPENED:
+        if (vhost_user_rdma_connect(dev) < 0) {
+            qemu_chr_fe_disconnect(&rdma->chardev);
+        }
+        return;
+
+    case CHR_EVENT_CLOSED:
+        if (runstate_is_running()) {
+            AioContext *aio = qemu_get_current_aio_context();
+
+            qemu_chr_fe_set_handlers(&rdma->chardev,
+                                     NULL, NULL, NULL, NULL,
+                                     NULL, NULL, false);
+            aio_bh_schedule_oneshot(aio, vhost_user_rdma_chr_closed_bh,
+                                    opaque);
+        }
+        rdma->dev.started = false;
+        return;
+
+    case CHR_EVENT_BREAK:
+    case CHR_EVENT_MUX_IN:
+    case CHR_EVENT_MUX_OUT:
+        /* Ignore */
+        return;
+    }
+}
+
+/*
+ * Virtqueue kick handler.
+ *
+ * Only acts for start-on-kick devices that are connected but whose vhost
+ * device is not started yet: it starts vhost and then signals the host
+ * notifier of every virtqueue that has a descriptor table address, so the
+ * backend processes anything the guest queued in the meantime.
+ */
+static void vhost_user_rdma_handle_output(VirtIODevice *vdev, VirtQueue *vq)
+{
+    VHostUserRdma *rdma = VHOST_USER_RDMA(vdev);
+    int idx;
+
+    if (!vdev->start_on_kick || !rdma->connected || rdma->dev.started) {
+        return;
+    }
+
+    if (vhost_user_rdma_start(vdev) < 0) {
+        qemu_chr_fe_disconnect(&rdma->chardev);
+        return;
+    }
+
+    idx = 0;
+    while (idx < rdma->dev.nvqs) {
+        VirtQueue *queue = virtio_get_queue(vdev, idx);
+
+        /* Queues the guest never set up have no descriptor address. */
+        if (virtio_queue_get_desc_addr(vdev, idx)) {
+            event_notifier_set(virtio_queue_get_host_notifier(queue));
+        }
+        idx++;
+    }
+}
+
+/*
+ * get_config callback: copy the cached backend config (r->rdmacfg) into the
+ * caller-provided @config buffer.
+ */
+static void vhost_user_rdma_update_config(VirtIODevice *vdev, uint8_t *config)
+{
+    const VHostUserRdma *rdma = VHOST_USER_RDMA(vdev);
+    const size_t cfg_len = sizeof(struct virtio_rdma_config);
+
+    memcpy(config, &rdma->rdmacfg, cfg_len);
+}
+
+static void vhost_user_rdma_set_config(VirtIODevice *vdev,
+ const uint8_t *config)
+{
+ /* Guest writes to the config space are accepted but deliberately ignored. */
+}
+
+/*
+ * get_features callback: filter the offered @features through the vhost
+ * device, restricted to the bits listed in user_feature_bits.
+ * @errp is not used.
+ */
+static uint64_t vhost_user_rdma_get_features(VirtIODevice *vdev,
+                                             uint64_t features,
+                                             Error **errp)
+{
+    VHostUserRdma *rdma = VHOST_USER_RDMA(vdev);
+    uint64_t negotiated;
+
+    negotiated = vhost_get_features(&rdma->dev, user_feature_bits, features);
+    return negotiated;
+}
+
+/*
+ * set_status callback: start or stop vhost so that it matches the state
+ * implied by @status and by whether the VM is running.
+ *
+ * Nothing happens while the backend is disconnected or when vhost is
+ * already in the wanted state.  A failed start drops the chardev
+ * connection.
+ */
+static void vhost_user_rdma_set_status(VirtIODevice *vdev, uint8_t status)
+{
+    VHostUserRdma *rdma = VHOST_USER_RDMA(vdev);
+    bool want_running = virtio_device_started(vdev, status) &&
+                        vdev->vm_running;
+    int rc;
+
+    if (!rdma->connected || rdma->dev.started == want_running) {
+        return;
+    }
+
+    if (!want_running) {
+        vhost_user_rdma_stop(vdev);
+        return;
+    }
+
+    rc = vhost_user_rdma_start(vdev);
+    if (rc < 0) {
+        error_report("vhost-user-rdma: vhost start failed: %s",
+                     strerror(-rc));
+        qemu_chr_fe_disconnect(&rdma->chardev);
+    }
+}
+
+/*
+ * Realize the vhost-user RDMA virtio device.
+ *
+ * Requires a chardev.  Creates VHOST_USER_RDMA_NUM_QUEUES virtqueues of
+ * VHOST_USER_RDMA_QUEUE_SIZE entries each, registers the chardev event
+ * handler, then waits for the backend connection and fetches the initial
+ * device config, retrying the connection whenever either step fails.
+ *
+ * On failure @errp is set and everything allocated here is released.
+ */
+static void vhost_user_rdma_device_realize(DeviceState *dev, Error **errp)
+{
+    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+    VHostUserRdma *r = VHOST_USER_RDMA(vdev);
+    Error *err = NULL;
+    int i, ret;
+
+    if (!r->chardev.chr) {
+        error_setg(errp, "vhost-user-rdma: chardev is mandatory");
+        return;
+    }
+
+    r->num_queues = VHOST_USER_RDMA_NUM_QUEUES;
+
+    if (r->num_queues > VIRTIO_QUEUE_MAX) {
+        error_setg(errp, "vhost-user-rdma: invalid number of IO queues");
+        return;
+    }
+
+    if (!vhost_user_init(&r->vhost_user, &r->chardev, errp)) {
+        return;
+    }
+
+    virtio_init(vdev, VIRTIO_ID_RDMA, sizeof(struct virtio_rdma_config));
+
+    r->virtqs = g_new(VirtQueue *, r->num_queues);
+
+    for (i = 0; i < r->num_queues; i++) {
+        r->virtqs[i] = virtio_add_queue(vdev, VHOST_USER_RDMA_QUEUE_SIZE,
+                                        vhost_user_rdma_handle_output);
+    }
+
+    r->vhost_vqs = g_new0(struct vhost_virtqueue, r->num_queues);
+    r->connected = false;
+
+    qemu_chr_fe_set_handlers(&r->chardev, NULL, NULL, vhost_user_rdma_event,
+                             NULL, (void *)dev, NULL, true);
+
+reconnect:
+    if (qemu_chr_fe_wait_connected(&r->chardev, &err) < 0) {
+        /*
+         * Hand the failure to the caller; merely printing it would make
+         * realize look successful although the device is torn down below.
+         */
+        error_propagate(errp, err);
+        goto virtio_err;
+    }
+
+    /* check whether vhost_user_rdma_connect() failed or not */
+    if (!r->connected) {
+        goto reconnect;
+    }
+
+    ret = vhost_dev_get_config(&r->dev, (uint8_t *)&r->rdmacfg,
+                               sizeof(struct virtio_rdma_config), &err);
+    if (ret < 0) {
+        error_report("vhost-user-rdma: get rdma config failed");
+        /*
+         * err is reused by the next wait_connected() call and the Error
+         * API requires it to be NULL there, so release and reset it.
+         */
+        error_free(err);
+        err = NULL;
+        goto reconnect;
+    }
+
+    return;
+
+virtio_err:
+    /* Releases the vhost device if an earlier connect had initialised it. */
+    vhost_user_rdma_disconnect(dev);
+    g_free(r->vhost_vqs);
+    r->vhost_vqs = NULL;
+    for (i = 0; i < r->num_queues; i++) {
+        virtio_delete_queue(r->virtqs[i]);
+    }
+    g_free(r->virtqs);
+    virtio_cleanup(vdev);
+    vhost_user_cleanup(&r->vhost_user);
+}
+
+/*
+ * Unrealize the device: reset the virtio status, detach the chardev
+ * handlers, clean up the vhost device and free the virtqueues and the
+ * vhost-user state created by realize.
+ */
+static void vhost_user_rdma_device_unrealize(DeviceState *dev)
+{
+    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+    VHostUserRdma *rdma = VHOST_USER_RDMA(dev);
+    int q = 0;
+
+    virtio_set_status(vdev, 0);
+    qemu_chr_fe_set_handlers(&rdma->chardev,
+                             NULL, NULL, NULL, NULL,
+                             NULL, NULL, false);
+    vhost_dev_cleanup(&rdma->dev);
+
+    g_free(rdma->vhost_vqs);
+    rdma->vhost_vqs = NULL;
+
+    while (q < rdma->num_queues) {
+        virtio_delete_queue(rdma->virtqs[q]);
+        q++;
+    }
+    g_free(rdma->virtqs);
+
+    virtio_cleanup(vdev);
+    vhost_user_cleanup(&rdma->vhost_user);
+}
+
+/*
+ * Instance initialiser: seeds the queue count and adds the "bootindex"
+ * property.
+ */
+static void vhost_user_rdma_instance_init(Object *obj)
+{
+    VHostUserRdma *r = VHOST_USER_RDMA(obj);
+
+    /*
+     * The PCI proxy derives its default MSI-X vector count from num_queues
+     * before this device is realized, so the value has to be valid as soon
+     * as the object exists rather than only after realize.
+     */
+    r->num_queues = VHOST_USER_RDMA_NUM_QUEUES;
+
+    device_add_bootindex_property(obj, &r->bootindex, "bootindex",
+                                  "bootindex", DEVICE(obj));
+}
+
+static const VMStateDescription vmstate_vhost_user_rdma = {
+ .name = "vhost-user-rdma",
+ .minimum_version_id = 1,
+ .version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_VIRTIO_DEVICE,
+ VMSTATE_END_OF_LIST()
+ },
+};
+
+static Property vhost_user_rdma_properties[] = {
+ DEFINE_PROP_CHR("chardev", VHostUserRdma, chardev),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+/*
+ * Class initialiser: wires up properties, migration state, the device
+ * category and the virtio device callbacks.  @data is unused.
+ */
+static void vhost_user_rdma_class_init(ObjectClass *klass, void *data)
+{
+    VirtioDeviceClass *virtio_class = VIRTIO_DEVICE_CLASS(klass);
+    DeviceClass *device_class = DEVICE_CLASS(klass);
+
+    /* Generic qdev setup */
+    device_class_set_props(device_class, vhost_user_rdma_properties);
+    device_class->vmsd = &vmstate_vhost_user_rdma;
+    set_bit(DEVICE_CATEGORY_NETWORK, device_class->categories);
+
+    /* Lifecycle */
+    virtio_class->realize = vhost_user_rdma_device_realize;
+    virtio_class->unrealize = vhost_user_rdma_device_unrealize;
+
+    /* Config space, feature negotiation and status */
+    virtio_class->get_config = vhost_user_rdma_update_config;
+    virtio_class->set_config = vhost_user_rdma_set_config;
+    virtio_class->get_features = vhost_user_rdma_get_features;
+    virtio_class->set_status = vhost_user_rdma_set_status;
+}
+
+static const TypeInfo vhost_user_rdma_info = {
+ .name = TYPE_VHOST_USER_RDMA,
+ .parent = TYPE_VIRTIO_DEVICE,
+ .instance_size = sizeof(VHostUserRdma),
+ .instance_init = vhost_user_rdma_instance_init,
+ .class_init = vhost_user_rdma_class_init,
+};
+
+static void virtio_register_types(void)
+{
+ type_register_static(&vhost_user_rdma_info);
+}
+
+type_init(virtio_register_types)
diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
index d7f18c96e6..3f0a7da910 100644
--- a/hw/virtio/meson.build
+++ b/hw/virtio/meson.build
@@ -63,6 +63,7 @@ virtio_pci_ss.add(when: 'CONFIG_VHOST_VSOCK', if_true: files('vhost-vsock-pci.c'
virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_VSOCK', if_true: files('vhost-user-vsock-pci.c'))
virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_BLK', if_true: files('vhost-user-blk-pci.c'))
virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_SCSI', if_true: files('vhost-user-scsi-pci.c'))
+virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_RDMA', if_true: files('vhost-user-rdma-pci.c'))
virtio_pci_ss.add(when: 'CONFIG_VHOST_SCSI', if_true: files('vhost-scsi-pci.c'))
virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_FS', if_true: files('vhost-user-fs-pci.c'))
diff --git a/hw/virtio/vhost-user-rdma-pci.c b/hw/virtio/vhost-user-rdma-pci.c
new file mode 100644
index 0000000000..6b95949c07
--- /dev/null
+++ b/hw/virtio/vhost-user-rdma-pci.c
@@ -0,0 +1,93 @@
+/*
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version. See the COPYING file in the
+ * top-level directory.
+ */
+
+#include "qemu/osdep.h"
+
+#include "standard-headers/rdma/virtio_rdma.h"
+#include "hw/virtio/virtio.h"
+#include "hw/virtio/vhost-user-rdma.h"
+#include "hw/pci/pci.h"
+#include "hw/qdev-properties.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qemu/module.h"
+#include "hw/virtio/virtio-pci.h"
+#include "qom/object.h"
+
+typedef struct VHostUserRdmaPCI VHostUserRdmaPCI;
+
+#define TYPE_VHOST_USER_RDMA_PCI "vhost-user-rdma-pci-base"
+DECLARE_INSTANCE_CHECKER(VHostUserRdmaPCI, VHOST_USER_RDMA_PCI,
+ TYPE_VHOST_USER_RDMA_PCI)
+
+struct VHostUserRdmaPCI {
+ VirtIOPCIProxy parent_obj;
+ VHostUserRdma vdev;
+};
+
+static Property vhost_user_rdma_pci_properties[] = {
+ DEFINE_PROP_UINT32("class", VirtIOPCIProxy, class_code, 0),
+ DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors,
+ DEV_NVECTORS_UNSPECIFIED),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+/*
+ * Realize the PCI proxy: choose a default MSI-X vector count when the user
+ * gave none, force virtio 1.0 mode and realize the embedded vhost-user RDMA
+ * device on the proxy's virtio bus.
+ *
+ * NOTE(review): num_queues is read here before the embedded device has been
+ * realized, so the default only equals "queues + 1" if num_queues is
+ * already populated at this point; confirm.
+ */
+static void vhost_user_rdma_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+    VHostUserRdmaPCI *proxy = VHOST_USER_RDMA_PCI(vpci_dev);
+    DeviceState *child = DEVICE(&proxy->vdev);
+    const bool vectors_unset =
+        vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED;
+
+    if (vectors_unset) {
+        /* One vector per virtqueue plus one more. */
+        vpci_dev->nvectors = proxy->vdev.num_queues + 1;
+    }
+
+    virtio_pci_force_virtio_1(vpci_dev);
+
+    qdev_realize(child, BUS(&vpci_dev->bus), errp);
+}
+
+/*
+ * Class initialiser for the PCI proxy: sets the realize hook, properties,
+ * device category and the PCI identification fields.  @data is unused.
+ */
+static void vhost_user_rdma_pci_class_init(ObjectClass *klass, void *data)
+{
+    PCIDeviceClass *pci_class = PCI_DEVICE_CLASS(klass);
+    VirtioPCIClass *vpci_class = VIRTIO_PCI_CLASS(klass);
+    DeviceClass *device_class = DEVICE_CLASS(klass);
+
+    set_bit(DEVICE_CATEGORY_NETWORK, device_class->categories);
+    device_class_set_props(device_class, vhost_user_rdma_pci_properties);
+
+    vpci_class->realize = vhost_user_rdma_pci_realize;
+
+    /* PCI identity */
+    pci_class->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
+    pci_class->device_id = PCI_DEVICE_ID_VIRTIO_RDMA;
+    pci_class->revision = VIRTIO_PCI_ABI_VERSION;
+    pci_class->class_id = PCI_CLASS_NETWORK_OTHER;
+}
+
+/*
+ * Instance initialiser for the PCI proxy: initialises the embedded
+ * vhost-user RDMA device and exposes its "bootindex" property on the proxy
+ * through an alias.
+ */
+static void vhost_user_rdma_pci_instance_init(Object *obj)
+{
+    VHostUserRdmaPCI *proxy = VHOST_USER_RDMA_PCI(obj);
+    Object *child = OBJECT(&proxy->vdev);
+
+    virtio_instance_init_common(obj, &proxy->vdev, sizeof(proxy->vdev),
+                                TYPE_VHOST_USER_RDMA);
+
+    object_property_add_alias(obj, "bootindex", child, "bootindex");
+}
+
+static const VirtioPCIDeviceTypeInfo vhost_user_rdma_pci_info = {
+ .base_name = TYPE_VHOST_USER_RDMA_PCI,
+ .generic_name = "vhost-user-rdma-pci",
+ .transitional_name = "vhost-user-rdma-pci-transitional",
+ .non_transitional_name = "vhost-user-rdma-pci-non-transitional",
+ .instance_size = sizeof(VHostUserRdmaPCI),
+ .instance_init = vhost_user_rdma_pci_instance_init,
+ .class_init = vhost_user_rdma_pci_class_init,
+};
+
+static void vhost_user_rdma_pci_register(void)
+{
+ virtio_pci_types_register(&vhost_user_rdma_pci_info);
+}
+
+type_init(vhost_user_rdma_pci_register)
diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index cdf9af4a4b..eb0813bddd 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -460,6 +460,7 @@ static int vhost_user_set_log_base(struct vhost_dev *dev, uint64_t base,
}
if (shmfd) {
+ error_report("vhost_user_read: vhost_user_set_log_base");
msg.hdr.size = 0;
ret = vhost_user_read(dev, &msg);
if (ret < 0) {
@@ -753,6 +754,7 @@ static int send_add_regions(struct vhost_dev *dev,
if (track_ramblocks) {
uint64_t reply_gpa;
+ error_report("vhost_user_read: send_add_regions");
ret = vhost_user_read(dev, &msg_reply);
if (ret < 0) {
return ret;
@@ -930,6 +932,7 @@ static int vhost_user_set_mem_table_postcopy(struct vhost_dev *dev,
return ret;
}
+ error_report("vhost_user_read: vhost_user_set_mem_table_postcopy");
ret = vhost_user_read(dev, &msg_reply);
if (ret < 0) {
return ret;
@@ -1287,6 +1290,7 @@ static int vhost_user_get_vring_base(struct vhost_dev *dev,
return ret;
}
+ error_report("vhost_user_read: vhost_user_get_vring_base");
ret = vhost_user_read(dev, &msg);
if (ret < 0) {
return ret;
@@ -1433,6 +1437,7 @@ static int vhost_user_set_features(struct vhost_dev *dev,
* VHOST_USER_F_PROTOCOL_FEATURES bit for enabling protocol
* features.
*/
+
ret = vhost_user_set_u64(dev, VHOST_USER_SET_FEATURES,
features | dev->backend_features,
log_enabled);
@@ -1673,6 +1678,7 @@ int vhost_user_get_shared_object(struct vhost_dev *dev, unsigned char *uuid,
return ret;
}
+ error_report("vhost_user_read: vhost_user_get_shared_object");
ret = vhost_user_read(dev, &msg);
if (ret < 0) {
return ret;
@@ -1998,6 +2004,7 @@ static int vhost_user_postcopy_advise(struct vhost_dev *dev, Error **errp)
return ret;
}
+ error_report("vhost_user_read: vhost_user_postcopy_advise");
ret = vhost_user_read(dev, &msg);
if (ret < 0) {
error_setg(errp, "Failed to get postcopy_advise reply from vhost");
@@ -2435,6 +2442,7 @@ static int vhost_user_get_config(struct vhost_dev *dev, uint8_t *config,
return ret;
}
+ error_report("vhost_user_read: vhost_user_get_config");
ret = vhost_user_read(dev, &msg);
if (ret < 0) {
error_setg_errno(errp, -ret, "vhost_get_config failed");
@@ -2578,6 +2586,7 @@ static int vhost_user_crypto_create_session(struct vhost_dev *dev,
return ret;
}
+ error_report("vhost_user_read: vhost_user_crypto_create_session");
ret = vhost_user_read(dev, &msg);
if (ret < 0) {
error_report("vhost_user_read() return %d, create session failed",
@@ -2923,6 +2932,7 @@ static int vhost_user_set_device_state_fd(struct vhost_dev *dev,
return ret;
}
+ error_report("vhost_user_read: vhost_user_set_device_state_fd");
ret = vhost_user_read(dev, &msg);
if (ret < 0) {
error_setg_errno(errp, -ret,
@@ -2985,6 +2995,7 @@ static int vhost_user_check_device_state(struct vhost_dev *dev, Error **errp)
return ret;
}
+ error_report("vhost_user_read: vhost_user_check_device_state");
ret = vhost_user_read(dev, &msg);
if (ret < 0) {
error_setg_errno(errp, -ret,
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index f50180e60e..87cec36828 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -934,6 +934,7 @@ static int vhost_dev_set_features(struct vhost_dev *dev,
features |= 0x1ULL << VIRTIO_F_IOMMU_PLATFORM;
}
}
+
r = dev->vhost_ops->vhost_set_features(dev, features);
if (r < 0) {
VHOST_OPS_DEBUG(r, "vhost_set_features failed");
@@ -1804,6 +1805,7 @@ void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits,
uint64_t features)
{
const int *bit = feature_bits;
+ info_report("vhost_ack_features");
while (*bit != VHOST_INVALID_FEATURE_BIT) {
uint64_t bit_mask = (1ULL << *bit);
if (features & bit_mask) {
diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index fd2dfe3a6b..bee7156e6d 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -181,6 +181,7 @@ const char *virtio_device_names[] = {
[VIRTIO_ID_FS] = "virtio-user-fs",
[VIRTIO_ID_PMEM] = "virtio-pmem",
[VIRTIO_ID_RPMB] = "virtio-rpmb",
+ [VIRTIO_ID_RDMA] = "virtio-rdma",
[VIRTIO_ID_MAC80211_HWSIM] = "virtio-mac-hwsim",
[VIRTIO_ID_VIDEO_ENCODER] = "virtio-vid-encoder",
[VIRTIO_ID_VIDEO_DECODER] = "virtio-vid-decoder",
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index eaa3fc99d8..a1eccfb78b 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -85,6 +85,7 @@ extern bool pci_available;
#define PCI_DEVICE_ID_VIRTIO_RNG 0x1005
#define PCI_DEVICE_ID_VIRTIO_9P 0x1009
#define PCI_DEVICE_ID_VIRTIO_VSOCK 0x1012
+#define PCI_DEVICE_ID_VIRTIO_RDMA 0x1016
/*
* modern virtio-pci devices get their id assigned automatically,
diff --git a/include/hw/virtio/vhost-user-rdma.h b/include/hw/virtio/vhost-user-rdma.h
new file mode 100644
index 0000000000..2d522cd676
--- /dev/null
+++ b/include/hw/virtio/vhost-user-rdma.h
@@ -0,0 +1,43 @@
+/*
+ * vhost-user-rdma host device
+ * Copyright(C) 2021 Bytedance Inc. All rights reserved.
+ *
+ * Authors:
+ * Junji Wei <weijunji@bytedance.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#ifndef VHOST_USER_RDMA_H
+#define VHOST_USER_RDMA_H
+
+#include "standard-headers/rdma/virtio_rdma.h"
+#include "chardev/char-fe.h"
+#include "hw/virtio/vhost.h"
+#include "hw/virtio/vhost-user.h"
+#include "qom/object.h"
+
+#define TYPE_VHOST_USER_RDMA "vhost-user-rdma"
+OBJECT_DECLARE_SIMPLE_TYPE(VHostUserRdma, VHOST_USER_RDMA)
+
+struct VHostUserRdma {
+ VirtIODevice parent_obj;
+ CharBackend chardev;
+ int32_t bootindex;
+ struct virtio_rdma_config rdmacfg;
+ struct vhost_dev dev;
+ VhostUserState vhost_user;
+ struct vhost_virtqueue *vhost_vqs;
+ VirtQueue **virtqs;
+
+ int num_queues;
+
+ /* vhost_user_rdma_connect/vhost_user_rdma_disconnect */
+ bool connected;
+ /* vhost_user_rdma_start/vhost_user_rdma_stop */
+ bool started_vu;
+};
+
+#endif
diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index 7d5ffdc145..f74da61477 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -369,7 +369,7 @@ typedef struct VirtIORNGConf VirtIORNGConf;
DEFINE_PROP_BIT64("packed", _state, _field, \
VIRTIO_F_RING_PACKED, false), \
DEFINE_PROP_BIT64("queue_reset", _state, _field, \
- VIRTIO_F_RING_RESET, true)
+ VIRTIO_F_RING_RESET, false)
hwaddr virtio_queue_get_desc_addr(VirtIODevice *vdev, int n);
bool virtio_queue_enabled_legacy(VirtIODevice *vdev, int n);
diff --git a/include/standard-headers/linux/virtio_ids.h b/include/standard-headers/linux/virtio_ids.h
index 7aa2eb7662..ff2d0b01b4 100644
--- a/include/standard-headers/linux/virtio_ids.h
+++ b/include/standard-headers/linux/virtio_ids.h
@@ -68,6 +68,7 @@
#define VIRTIO_ID_AUDIO_POLICY 39 /* virtio audio policy */
#define VIRTIO_ID_BT 40 /* virtio bluetooth */
#define VIRTIO_ID_GPIO 41 /* virtio gpio */
+#define VIRTIO_ID_RDMA 42 /* virtio rdma */
/*
* Virtio Transitional IDs
diff --git a/include/standard-headers/rdma/virtio_rdma.h b/include/standard-headers/rdma/virtio_rdma.h
new file mode 100644
index 0000000000..b493f973d8
--- /dev/null
+++ b/include/standard-headers/rdma/virtio_rdma.h
@@ -0,0 +1,60 @@
+/*
+ * Virtio RDMA Device
+ *
+ * Copyright (C) 2021 Bytedance Inc.
+ *
+ * Authors:
+ * Junji Wei <weijunji@bytedance.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef _LINUX_VIRTIO_RDMA_H
+#define _LINUX_VIRTIO_RDMA_H
+
+#include <linux/types.h>
+#include <infiniband/verbs.h>
+
+#include "standard-headers/linux/virtio_ids.h"
+#include "standard-headers/linux/virtio_config.h"
+#include "standard-headers/linux/virtio_types.h"
+
+struct virtio_rdma_config {
+ __le32 phys_port_cnt;
+
+ __le64 sys_image_guid;
+ __le32 vendor_id;
+ __le32 vendor_part_id;
+ __le32 hw_ver;
+ __le64 max_mr_size;
+ __le64 page_size_cap;
+ __le32 max_qp;
+ __le32 max_qp_wr;
+ __le64 device_cap_flags;
+ __le32 max_send_sge;
+ __le32 max_recv_sge;
+ __le32 max_sge_rd;
+ __le32 max_cq;
+ __le32 max_cqe;
+ __le32 max_mr;
+ __le32 max_pd;
+ __le32 max_qp_rd_atom;
+ __le32 max_res_rd_atom;
+ __le32 max_qp_init_rd_atom;
+ __le32 atomic_cap;
+ __le32 max_mw;
+ __le32 max_mcast_grp;
+ __le32 max_mcast_qp_attach;
+ __le32 max_total_mcast_qp_attach;
+ __le32 max_ah;
+ __le32 max_fast_reg_page_list_len;
+ __le32 max_pi_fast_reg_page_list_len;
+ __le16 max_pkeys;
+ uint8_t local_ca_ack_delay;
+
+ uint8_t reserved[64];
+} QEMU_PACKED;
+
+#endif
--
2.43.0
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCH 02/14] examples/vhost_user_rdma: implement device and port query commands
2025-12-17 8:49 Implement initial driver for virtio-RDMA devices(kernel), virtio-rdma device model(qemu) and vhost-user-RDMA backend device(dpdk) Xiong Weimin
2025-12-17 8:49 ` [PATCH 01/14] examples/vhost_user_rdma: implement core application initialization for supporting vhost_user_rdma device Xiong Weimin
2025-12-17 8:49 ` [PATCH] hw/rdma: Implement vhost-user RDMA device with PCI support Xiong Weimin
@ 2025-12-17 8:49 ` Xiong Weimin
2025-12-17 8:49 ` [PATCH 03/14] examples/vhost_user_rdma: implement create and destroy completion queue commands Xiong Weimin
` (7 subsequent siblings)
10 siblings, 0 replies; 17+ messages in thread
From: Xiong Weimin @ 2025-12-17 8:49 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann, David S . Miller,
Jakub Kicinski, Jesper Dangaard Brouer, John Fastabend,
Stanislav Fomichev
Cc: linux-kernel, netdev, xiongweimin
From: xiongweimin <xiongweimin@kylinos.cn>
Added RDMA control command handlers for:
- VHOST_RDMA_CTRL_ROCE_QUERY_DEVICE
- VHOST_RDMA_CTRL_ROCE_QUERY_PORT
Key features:
1. Device capability reporting:
- Maximum MR size and page size capabilities
- Queue Pair (QP) limits (max WR, SGE, CQE)
- Resource limits (MR, PD, AH counts)
- RDMA protocol capabilities
2. Port attribute reporting:
- GID table length and port state
- MTU settings (active, physical, maximum)
- Link speed and width capabilities
- Error counters and security attributes
3. Response validation:
- CHK_IOVEC macro ensures response buffer safety
- Fixed attribute values for standard RoCE v2 compliance
- Structured response formats matching IB specification
Signed-off-by: Xiong Weimin <xiongweimin@kylinos.cn>
Change-Id: I17ac65a0801ebf5e0b0d83a50877004a54840365
---
examples/vhost_user_rdma/vhost_rdma_ib.c | 27 ++++++++++++
examples/vhost_user_rdma/vhost_rdma_ib.h | 56 +++++++++++++++++++++++-
2 files changed, 82 insertions(+), 1 deletion(-)
diff --git a/examples/vhost_user_rdma/vhost_rdma_ib.c b/examples/vhost_user_rdma/vhost_rdma_ib.c
index 5535a8696b..edb6e3fea3 100644
--- a/examples/vhost_user_rdma/vhost_rdma_ib.c
+++ b/examples/vhost_user_rdma/vhost_rdma_ib.c
@@ -537,12 +537,39 @@ vhost_rdma_query_device(struct vhost_rdma_device *dev, CTRL_NO_CMD,
return 0;
}
+/**
+ * @brief Handle VHOST_RDMA_CTRL_ROCE_QUERY_PORT.
+ *
+ * Populates the response with a fixed set of port attributes. The device
+ * argument is unused and the command carries no request payload.
+ * CHK_IOVEC is defined outside this hunk and may return early on an
+ * unsuitable buffer -- TODO confirm its exact behaviour.
+ *
+ * @param dev Unused.
+ * @param out Response buffer receiving struct vhost_rdma_ack_query_port.
+ * @return 0 once every attribute has been written.
+ */
+static int
+vhost_rdma_query_port(__rte_unused struct vhost_rdma_device *dev,
+		CTRL_NO_CMD,
+		struct iovec *out)
+{
+	struct vhost_rdma_ack_query_port *rsp;
+
+	CHK_IOVEC(rsp, out);
+
+	/* Logical and physical link state */
+	rsp->state = VHOST_RDMA_IB_PORT_DOWN;
+	rsp->phys_state = VHOST_RDMA_IB_PORT_PHYS_STATE_POLLING;
+
+	/* MTU limits */
+	rsp->max_mtu = VHOST_RDMA_IB_MTU_4096;
+	rsp->active_mtu = VHOST_RDMA_IB_MTU_256;
+	rsp->phys_mtu = VHOST_RDMA_IB_MTU_256;
+
+	/* Link speed and width */
+	rsp->active_speed = 1UL;
+	rsp->active_width = VHOST_RDMA_IB_WIDTH_1X;
+
+	/* Table sizes and capabilities */
+	rsp->gid_tbl_len = VHOST_MAX_GID_TBL_LEN;
+	rsp->pkey_tbl_len = 1UL;
+	rsp->port_cap_flags = 65536UL;
+	rsp->max_msg_sz = 0x800000;
+
+	/* Violation counters start out cleared */
+	rsp->bad_pkey_cntr = 0UL;
+	rsp->qkey_viol_cntr = 0UL;
+
+	return 0;
+}
+
/* Command handler table declaration */
struct {
int (*handler)(struct vhost_rdma_device *dev, struct iovec *in, struct iovec *out);
const char *name; /* Name of the command (for logging) */
} cmd_tbl[] = {
DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_QUERY_DEVICE, vhost_rdma_query_device),
+ DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_QUERY_PORT, vhost_rdma_query_port),
};
/**
diff --git a/examples/vhost_user_rdma/vhost_rdma_ib.h b/examples/vhost_user_rdma/vhost_rdma_ib.h
index 4ac896d82e..664067b024 100644
--- a/examples/vhost_user_rdma/vhost_rdma_ib.h
+++ b/examples/vhost_user_rdma/vhost_rdma_ib.h
@@ -204,7 +204,44 @@ enum vhost_user_rdma_request {
VHOST_USER_SET_CONFIG = 25,
VHOST_USER_MAX
};
-/** @} */
+
+enum vhost_rdma_ib_port_state {
+ VHOST_RDMA_IB_PORT_NOP = 0,
+ VHOST_RDMA_IB_PORT_DOWN = 1,
+ VHOST_RDMA_IB_PORT_INIT = 2,
+ VHOST_RDMA_IB_PORT_ARMED = 3,
+ VHOST_RDMA_IB_PORT_ACTIVE = 4,
+ VHOST_RDMA_IB_PORT_ACTIVE_DEFER = 5
+};
+
+enum vhost_rdma_ib_port_phys_state {
+ VHOST_RDMA_IB_PORT_PHYS_STATE_SLEEP = 1,
+ VHOST_RDMA_IB_PORT_PHYS_STATE_POLLING = 2,
+ VHOST_RDMA_IB_PORT_PHYS_STATE_DISABLED = 3,
+ VHOST_RDMA_IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING = 4,
+ VHOST_RDMA_IB_PORT_PHYS_STATE_LINK_UP = 5,
+ VHOST_RDMA_IB_PORT_PHYS_STATE_LINK_ERROR_RECOVERY = 6,
+ VHOST_RDMA_IB_PORT_PHYS_STATE_PHY_TEST = 7,
+};
+
+enum ib_port_width { /* Link width codes reported in vhost_rdma_ack_query_port.active_width */
+ VHOST_RDMA_IB_WIDTH_1X = 1,
+ VHOST_RDMA_IB_WIDTH_2X = 16, /* note: 2X is encoded as 16, out of sequence with the other widths */
+ VHOST_RDMA_IB_WIDTH_4X = 2,
+ VHOST_RDMA_IB_WIDTH_8X = 4,
+ VHOST_RDMA_IB_WIDTH_12X = 8
+};
+
+enum ib_port_speed {
+ VHOST_RDMA_IB_SPEED_SDR = 1,
+ VHOST_RDMA_IB_SPEED_DDR = 2,
+ VHOST_RDMA_IB_SPEED_QDR = 4,
+ VHOST_RDMA_IB_SPEED_FDR10 = 8,
+ VHOST_RDMA_IB_SPEED_FDR = 16,
+ VHOST_RDMA_IB_SPEED_EDR = 32,
+ VHOST_RDMA_IB_SPEED_HDR = 64,
+ VHOST_RDMA_IB_SPEED_NDR = 128,
+};
/**
* @brief QP capabilities structure
@@ -622,6 +659,23 @@ struct vhost_rdma_ctrl_hdr {
uint8_t cmd;
};
+struct vhost_rdma_ack_query_port {
+ enum vhost_rdma_ib_port_state state;
+ enum vhost_rdma_ib_mtu max_mtu;
+ enum vhost_rdma_ib_mtu active_mtu;
+ uint32_t phys_mtu;
+ int gid_tbl_len;
+ uint32_t port_cap_flags;
+ uint32_t max_msg_sz;
+ uint32_t bad_pkey_cntr;
+ uint32_t qkey_viol_cntr;
+ uint16_t pkey_tbl_len;
+ uint16_t active_speed;
+ uint8_t active_width;
+ uint8_t phys_state;
+ uint32_t reserved[32]; /* For future extensions */
+}__rte_packed;
+
/**
* @brief Convert IB MTU enum to byte size
* @param mtu The MTU enum value
--
2.43.0
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCH 03/14] examples/vhost_user_rdma: implement create and destroy completion queue commands
2025-12-17 8:49 Implement initial driver for virtio-RDMA devices(kernel), virtio-rdma device model(qemu) and vhost-user-RDMA backend device(dpdk) Xiong Weimin
` (2 preceding siblings ...)
2025-12-17 8:49 ` [PATCH 02/14] examples/vhost_user_rdma: implement device and port query commands Xiong Weimin
@ 2025-12-17 8:49 ` Xiong Weimin
2025-12-17 8:49 ` [PATCH 04/14] examples/vhost_user_rdma: implement protection domain create/destroy commands Xiong Weimin
` (6 subsequent siblings)
10 siblings, 0 replies; 17+ messages in thread
From: Xiong Weimin @ 2025-12-17 8:49 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann, David S . Miller,
Jakub Kicinski, Jesper Dangaard Brouer, John Fastabend,
Stanislav Fomichev
Cc: linux-kernel, netdev, xiongweimin
From: xiongweimin <xiongweimin@kylinos.cn>
This commit adds core functionality for managing RDMA Completion Queues (CQs):
1. CREATE_CQ command handler with resource allocation and initialization
2. DESTROY_CQ command with safe teardown procedures
3. Reference counting for lifecycle management
4. Concurrency control via spinlocks
5. Integration with device resource pools
Key features:
- Strict validation of CQ size against device capabilities
- Spinlock-protected state management with `is_dying` flag
- Virtual queue index reset during destruction
- Error logging for allocation failures
- Memory-safe buffer handling with CHK_IOVEC
Signed-off-by: xiongweimin <xiongweimin@kylinos.cn>
Change-Id: Ie4b51c90f36a1ceadfe4dbc622dc6fcaaaaf4261
---
examples/vhost_user_rdma/vhost_rdma_ib.c | 59 +++++++++++++++++++++++-
examples/vhost_user_rdma/vhost_rdma_ib.h | 33 ++++++++++++-
2 files changed, 89 insertions(+), 3 deletions(-)
diff --git a/examples/vhost_user_rdma/vhost_rdma_ib.c b/examples/vhost_user_rdma/vhost_rdma_ib.c
index edb6e3fea3..5ec0de8ae7 100644
--- a/examples/vhost_user_rdma/vhost_rdma_ib.c
+++ b/examples/vhost_user_rdma/vhost_rdma_ib.c
@@ -563,13 +563,68 @@ vhost_rdma_query_port(__rte_unused struct vhost_rdma_device *dev,
return 0;
}
+/**
+ * @brief Handle VHOST_RDMA_CTRL_ROCE_CREATE_CQ.
+ *
+ * Rejects requests whose cqe count exceeds dev->attr.max_cqe, allocates a CQ
+ * from dev->cq_pool, initialises it and writes the allocated index into the
+ * response. CHK_IOVEC is defined outside this hunk and may return early on an
+ * unsuitable buffer -- TODO confirm its exact behaviour.
+ *
+ * @param dev Device owning the CQ pool and the CQ virtqueue array.
+ * @param in  Request buffer holding struct vhost_rdma_cmd_create_cq.
+ * @param out Response buffer receiving struct vhost_rdma_ack_create_cq.
+ * @return 0 on success, -EINVAL if the requested size exceeds the device
+ *         limit, -ENOMEM if no CQ could be allocated from the pool.
+ */
+static int
+vhost_rdma_create_cq(struct vhost_rdma_device *dev,
+		struct iovec *in,
+		struct iovec *out)
+{
+	struct vhost_rdma_cmd_create_cq *create_cmd;
+	struct vhost_rdma_ack_create_cq *create_rsp;
+	struct vhost_rdma_cq *cq;
+	uint32_t cqn;
+
+	CHK_IOVEC(create_cmd, in);
+	if (create_cmd->cqe > dev->attr.max_cqe)
+		return -EINVAL;
+
+	CHK_IOVEC(create_rsp, out);
+
+	cq = vhost_rdma_pool_alloc(&dev->cq_pool, &cqn);
+	if (cq == NULL) {
+		RDMA_LOG_ERR("cq alloc failed");
+		/* Everything below dereferences cq, so fail the command here. */
+		return -ENOMEM;
+	}
+	vhost_rdma_ref_init(cq);
+
+	rte_spinlock_init(&cq->cq_lock);
+	cq->is_dying = false;
+	cq->notify = 0;
+	/* The CQ's notification ring is the virtqueue at the same index. */
+	cq->vq = &dev->cq_vqs[cqn];
+	cq->cqn = cqn;
+	create_rsp->cqn = cqn;
+
+	return 0;
+}
+
+/**
+ * @brief Handle VHOST_RDMA_CTRL_ROCE_DESTROY_CQ.
+ *
+ * Looks up the CQ named by the request, marks it as dying and resets the
+ * indices of its virtqueue under cq_lock, then drops one reference on it.
+ * CHK_IOVEC is defined outside this hunk and may return early on an
+ * unsuitable buffer -- TODO confirm its exact behaviour.
+ *
+ * @param dev Device owning the CQ pool.
+ * @param in  Request buffer holding struct vhost_rdma_cmd_destroy_cq.
+ * @return 0 on success, -EINVAL if the CQ index does not resolve to a CQ.
+ */
+static int
+vhost_rdma_destroy_cq(struct vhost_rdma_device *dev, struct iovec *in, CTRL_NO_RSP)
+{
+	struct vhost_rdma_cmd_destroy_cq *destroy_cmd;
+	struct vhost_rdma_cq *cq;
+
+	CHK_IOVEC(destroy_cmd, in);
+
+	/* cqn comes from the request; the lookup can fail and must be checked. */
+	cq = vhost_rdma_pool_get(&dev->cq_pool, destroy_cmd->cqn);
+	if (cq == NULL) {
+		RDMA_LOG_ERR("cq is not found");
+		return -EINVAL;
+	}
+
+	rte_spinlock_lock(&cq->cq_lock);
+	cq->is_dying = true;
+	cq->vq->last_avail_idx = 0;
+	cq->vq->last_used_idx = 0;
+	rte_spinlock_unlock(&cq->cq_lock);
+
+	vhost_rdma_drop_ref(cq, dev, cq);
+
+	return 0;
+}
+
/* Command handler table declaration */
struct {
int (*handler)(struct vhost_rdma_device *dev, struct iovec *in, struct iovec *out);
const char *name; /* Name of the command (for logging) */
} cmd_tbl[] = {
- DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_QUERY_DEVICE, vhost_rdma_query_device),
- DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_QUERY_PORT, vhost_rdma_query_port),
+ DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_QUERY_DEVICE, vhost_rdma_query_device),
+ DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_QUERY_PORT, vhost_rdma_query_port),
+ DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_CREATE_CQ, vhost_rdma_create_cq),
+ DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_DESTROY_CQ, vhost_rdma_destroy_cq),
};
/**
diff --git a/examples/vhost_user_rdma/vhost_rdma_ib.h b/examples/vhost_user_rdma/vhost_rdma_ib.h
index 664067b024..6420c8c7e2 100644
--- a/examples/vhost_user_rdma/vhost_rdma_ib.h
+++ b/examples/vhost_user_rdma/vhost_rdma_ib.h
@@ -31,6 +31,12 @@
#include "eal_interrupts.h"
+/* Initialise obj->refcnt, then take the first reference on the object. */
+#define vhost_rdma_ref_init(obj) \
+	do { \
+		rte_atomic32_init(&(obj)->refcnt); \
+		rte_atomic32_inc(&(obj)->refcnt); \
+	} while (0)
+
/* Forward declarations */
struct vhost_rdma_device;
struct vhost_queue;
@@ -370,7 +376,7 @@ struct vhost_user_rdma_msg {
* @brief Completion Queue (CQ)
*/
struct vhost_rdma_cq {
- struct vhost_queue *vq; /**< Notification V-ring */
+ struct vhost_user_queue *vq; /**< Notification V-ring */
rte_spinlock_t cq_lock; /**< Protect CQ operations */
uint8_t notify; /**< Notify pending flag */
bool is_dying; /**< Being destroyed */
@@ -676,6 +682,31 @@ struct vhost_rdma_ack_query_port {
uint32_t reserved[32]; /* For future extensions */
}__rte_packed;
+struct vhost_rdma_cmd_create_cq {
+ /* Size of CQ */
+ uint32_t cqe;
+};
+
+struct vhost_rdma_ack_create_cq {
+ /* The index of CQ */
+ uint32_t cqn;
+};
+
+struct vhost_rdma_cmd_destroy_cq {
+ /* The index of CQ */
+ uint32_t cqn;
+};
+
+struct vhost_rdma_ack_create_pd {
+ /* The handle of PD */
+ uint32_t pdn;
+};
+
+struct vhost_rdma_cmd_destroy_pd {
+ /* The handle of PD */
+ uint32_t pdn;
+};
+
/**
* @brief Convert IB MTU enum to byte size
* @param mtu The MTU enum value
--
2.43.0
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCH 04/14] examples/vhost_user_rdma: implement protection domain create/destroy commands
2025-12-17 8:49 Implement initial driver for virtio-RDMA devices(kernel), virtio-rdma device model(qemu) and vhost-user-RDMA backend device(dpdk) Xiong Weimin
` (3 preceding siblings ...)
2025-12-17 8:49 ` [PATCH 03/14] examples/vhost_user_rdma: implement create and destroy completion queue commands Xiong Weimin
@ 2025-12-17 8:49 ` Xiong Weimin
2025-12-17 8:49 ` [PATCH 05/14] examples/vhost_user_rdma: implement comprehensive memory region management Xiong Weimin
` (5 subsequent siblings)
10 siblings, 0 replies; 17+ messages in thread
From: Xiong Weimin @ 2025-12-17 8:49 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann, David S . Miller,
Jakub Kicinski, Jesper Dangaard Brouer, John Fastabend,
Stanislav Fomichev
Cc: linux-kernel, netdev, xiongweimin
From: xiongweimin <xiongweimin@kylinos.cn>
Added core functionality for managing RDMA Protection Domains (PDs):
1. CREATE_PD command for resource allocation and initialization
2. DESTROY_PD command with reference-counted teardown
3. Integration with device-specific PD resource pool
4. Minimalist state management for security domains
5. Robust input validation and error handling
Key features:
- PD identifier (pdn) generation and return to guest
- Atomic reference counting for lifecycle management
- Device association for resource tracking
- Memory-safe buffer handling with CHK_IOVEC
- ENOMEM handling for allocation failures
Signed-off-by: Xiong Weimin <xiongweimin@kylinos.cn>
Change-Id: I36d841a76067813c1880069c71b2eba90337609b
---
examples/vhost_user_rdma/vhost_rdma_ib.c | 38 ++++++++++++++++++++++++
examples/vhost_user_rdma/vhost_rdma_ib.h | 30 +++++++++----------
2 files changed, 53 insertions(+), 15 deletions(-)
diff --git a/examples/vhost_user_rdma/vhost_rdma_ib.c b/examples/vhost_user_rdma/vhost_rdma_ib.c
index 5ec0de8ae7..e590b555d3 100644
--- a/examples/vhost_user_rdma/vhost_rdma_ib.c
+++ b/examples/vhost_user_rdma/vhost_rdma_ib.c
@@ -616,6 +616,42 @@ vhost_rdma_destroy_cq(struct vhost_rdma_device *dev, struct iovec *in, CTRL_NO_R
return 0;
}
+/**
+ * @brief Handle VHOST_RDMA_CTRL_ROCE_CREATE_PD.
+ *
+ * Allocates a protection domain from dev->pd_pool, takes its initial
+ * reference, ties it to the device and reports its index in the response.
+ * CHK_IOVEC is defined outside this hunk and may return early on an
+ * unsuitable buffer -- TODO confirm its exact behaviour.
+ *
+ * @param dev Device owning the PD pool.
+ * @param out Response buffer receiving struct vhost_rdma_ack_create_pd.
+ * @return 0 on success, -ENOMEM if the pool allocation fails.
+ */
+static int
+vhost_rdma_create_pd(struct vhost_rdma_device *dev, CTRL_NO_CMD, struct iovec *out)
+{
+	struct vhost_rdma_ack_create_pd *create_rsp;
+	struct vhost_rdma_pd *new_pd;
+	uint32_t pdn;
+
+	CHK_IOVEC(create_rsp, out);
+
+	new_pd = vhost_rdma_pool_alloc(&dev->pd_pool, &pdn);
+	if (!new_pd)
+		return -ENOMEM;
+
+	vhost_rdma_ref_init(new_pd);
+	new_pd->dev = dev;
+	new_pd->pdn = pdn;
+
+	/* Hand the pool index back as the PD handle. */
+	create_rsp->pdn = pdn;
+	return 0;
+}
+
+/**
+ * @brief Handle VHOST_RDMA_CTRL_ROCE_DESTROY_PD.
+ *
+ * Looks up the protection domain named by the request and drops one
+ * reference on it. CHK_IOVEC is defined outside this hunk and may return
+ * early on an unsuitable buffer -- TODO confirm its exact behaviour.
+ *
+ * @param dev Device owning the PD pool.
+ * @param in  Request buffer holding struct vhost_rdma_cmd_destroy_pd.
+ * @return 0 on success, -EINVAL if the PD handle does not resolve to a PD.
+ */
+static int
+vhost_rdma_destroy_pd(struct vhost_rdma_device *dev, struct iovec *in, CTRL_NO_RSP)
+{
+	struct vhost_rdma_cmd_destroy_pd *destroy_cmd;
+	struct vhost_rdma_pd *pd;
+
+	CHK_IOVEC(destroy_cmd, in);
+
+	/* pdn comes from the request; the lookup can fail and must be checked. */
+	pd = vhost_rdma_pool_get(&dev->pd_pool, destroy_cmd->pdn);
+	if (pd == NULL) {
+		RDMA_LOG_ERR("pd is not found");
+		return -EINVAL;
+	}
+
+	vhost_rdma_drop_ref(pd, dev, pd);
+
+	return 0;
+}
+
/* Command handler table declaration */
struct {
int (*handler)(struct vhost_rdma_device *dev, struct iovec *in, struct iovec *out);
@@ -625,6 +661,8 @@ struct {
DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_QUERY_PORT, vhost_rdma_query_port),
DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_CREATE_CQ, vhost_rdma_create_cq),
DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_DESTROY_CQ, vhost_rdma_destroy_cq),
+ DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_CREATE_PD, vhost_rdma_create_pd),
+ DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_DESTROY_PD, vhost_rdma_destroy_pd),
};
/**
diff --git a/examples/vhost_user_rdma/vhost_rdma_ib.h b/examples/vhost_user_rdma/vhost_rdma_ib.h
index 6420c8c7e2..6356abc65a 100644
--- a/examples/vhost_user_rdma/vhost_rdma_ib.h
+++ b/examples/vhost_user_rdma/vhost_rdma_ib.h
@@ -667,44 +667,44 @@ struct vhost_rdma_ctrl_hdr {
struct vhost_rdma_ack_query_port {
enum vhost_rdma_ib_port_state state;
- enum vhost_rdma_ib_mtu max_mtu;
- enum vhost_rdma_ib_mtu active_mtu;
+ enum vhost_rdma_ib_mtu max_mtu;
+ enum vhost_rdma_ib_mtu active_mtu;
uint32_t phys_mtu;
- int gid_tbl_len;
+ int gid_tbl_len;
uint32_t port_cap_flags;
uint32_t max_msg_sz;
uint32_t bad_pkey_cntr;
uint32_t qkey_viol_cntr;
uint16_t pkey_tbl_len;
uint16_t active_speed;
- uint8_t active_width;
- uint8_t phys_state;
+ uint8_t active_width;
+ uint8_t phys_state;
uint32_t reserved[32]; /* For future extensions */
}__rte_packed;
struct vhost_rdma_cmd_create_cq {
- /* Size of CQ */
- uint32_t cqe;
+ /* Size of CQ */
+ uint32_t cqe;
};
struct vhost_rdma_ack_create_cq {
- /* The index of CQ */
- uint32_t cqn;
+ /* The index of CQ */
+ uint32_t cqn;
};
struct vhost_rdma_cmd_destroy_cq {
- /* The index of CQ */
- uint32_t cqn;
+ /* The index of CQ */
+ uint32_t cqn;
};
struct vhost_rdma_ack_create_pd {
- /* The handle of PD */
- uint32_t pdn;
+ /* The handle of PD */
+ uint32_t pdn;
};
struct vhost_rdma_cmd_destroy_pd {
- /* The handle of PD */
- uint32_t pdn;
+ /* The handle of PD */
+ uint32_t pdn;
};
/**
--
2.43.0
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCH 05/14] examples/vhost_user_rdma: implement comprehensive memory region management
2025-12-17 8:49 Implement initial driver for virtio-RDMA devices(kernel), virtio-rdma device model(qemu) and vhost-user-RDMA backend device(dpdk) Xiong Weimin
` (4 preceding siblings ...)
2025-12-17 8:49 ` [PATCH 04/14] examples/vhost_user_rdma: implement protection domain create/destroy commands Xiong Weimin
@ 2025-12-17 8:49 ` Xiong Weimin
2025-12-17 8:49 ` [PATCH 06/14] examples/vhost_user_rdma: implement comprehensive queue pair lifecycle management Xiong Weimin
` (4 subsequent siblings)
10 siblings, 0 replies; 17+ messages in thread
From: Xiong Weimin @ 2025-12-17 8:49 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann, David S . Miller,
Jakub Kicinski, Jesper Dangaard Brouer, John Fastabend,
Stanislav Fomichev
Cc: linux-kernel, netdev, xiongweimin
From: xiongweimin <xiongweimin@kylinos.cn>
This commit adds core functionality for RDMA Memory Region (MR) handling:
1. DMA MR registration for physical memory access
2. Pre-allocated MR creation for optimized buffer handling
3. User-space MR registration with GPA->VVA translation
4. MR deregistration with reference-counted cleanup
5. Secure key generation and validation mechanisms
Key features:
- Random lkey/rkey generation with collision avoidance
- Three MR types: DMA, pre-allocated, and user-mapped
- Page mapping for user-space memory regions
- State management (VALID/ZOMBIE) for safe deregistration
- Reference counting integration with PDs
- Comprehensive error handling and logging
Signed-off-by: Xiong Weimin <xiongweimin@kylinos.cn>
Change-Id: I4c26d47181f895c05b8ba125fdf0959bd0827d99
---
examples/vhost_user_rdma/vhost_rdma_ib.c | 199 +++++++++++++++++++++++
examples/vhost_user_rdma/vhost_rdma_ib.h | 74 +++++++++
2 files changed, 273 insertions(+)
diff --git a/examples/vhost_user_rdma/vhost_rdma_ib.c b/examples/vhost_user_rdma/vhost_rdma_ib.c
index e590b555d3..3002498151 100644
--- a/examples/vhost_user_rdma/vhost_rdma_ib.c
+++ b/examples/vhost_user_rdma/vhost_rdma_ib.c
@@ -18,6 +18,7 @@
#include <rte_ethdev.h>
#include <rte_spinlock.h>
#include <rte_malloc.h>
+#include <rte_random.h>
#include "vhost_rdma.h"
#include "vhost_rdma_ib.h"
@@ -652,6 +653,200 @@ vhost_rdma_destroy_pd(struct vhost_rdma_device *dev, struct iovec *in, CTRL_NO_R
return 0;
}
+/**
+ * @brief Draw a random 8-bit key that differs from last_key.
+ *
+ * The candidate is the low 8 bits of rte_rand(). Because the candidate is a
+ * uint8_t compared against a uint32_t, any last_key above 255 can never
+ * match, so the first draw is returned in that case.
+ *
+ * @param last_key Value the returned key must not equal.
+ * @return An 8-bit key different from last_key.
+ */
+uint8_t
+vhost_rdma_get_next_key(uint32_t last_key)
+{
+	uint8_t candidate;
+
+	for (;;) {
+		candidate = rte_rand();
+		if (candidate != last_key)
+			return candidate;
+	}
+}
+
+/**
+ * @brief Derive and store the lkey/rkey of a memory region.
+ *
+ * The key places the MR index in the bits above the low byte and a random
+ * byte from vhost_rdma_get_next_key() in the low byte. The rkey equals the
+ * lkey only when mr->access contains a VHOST_RDMA_IB_ACCESS_REMOTE bit and
+ * is 0 otherwise, so mr->access must be set before this is called.
+ *
+ * @param mr  Memory region whose lkey and rkey are written.
+ * @param mrn Index of the memory region.
+ */
+void
+vhost_rdma_mr_init_key(struct vhost_rdma_mr *mr, uint32_t mrn)
+{
+	uint32_t key = (mrn << 8) | vhost_rdma_get_next_key(-1);
+
+	mr->lkey = key;
+	if (mr->access & VHOST_RDMA_IB_ACCESS_REMOTE)
+		mr->rkey = key;
+	else
+		mr->rkey = 0;
+}
+
+/**
+ * @brief Handle VHOST_RDMA_CTRL_ROCE_GET_DMA_MR.
+ *
+ * Resolves the PD named in the request, allocates an MR from dev->mr_pool,
+ * marks it as a valid DMA-type MR bound to that PD (taking a PD reference)
+ * and returns its index and keys. CHK_IOVEC is defined outside this hunk and
+ * may return early on an unsuitable buffer -- TODO confirm.
+ *
+ * @param dev Device owning the PD and MR pools.
+ * @param in  Request buffer holding struct vhost_rdma_cmd_get_dma_mr.
+ * @param out Response buffer receiving struct vhost_rdma_ack_get_dma_mr.
+ * @return 0 on success, -EINVAL if the PD is unknown, -ENOMEM if no MR
+ *         could be allocated.
+ */
+static int
+vhost_rdma_get_dma_mr(struct vhost_rdma_device *dev, struct iovec *in,
+		struct iovec *out)
+{
+	struct vhost_rdma_cmd_get_dma_mr *get_cmd;
+	struct vhost_rdma_ack_get_dma_mr *ack_rsp;
+	struct vhost_rdma_pd *owner;
+	struct vhost_rdma_mr *region;
+	uint32_t idx;
+
+	CHK_IOVEC(get_cmd, in);
+	CHK_IOVEC(ack_rsp, out);
+
+	owner = vhost_rdma_pool_get(&dev->pd_pool, get_cmd->pdn);
+	if (unlikely(owner == NULL)) {
+		RDMA_LOG_ERR("pd is not found");
+		return -EINVAL;
+	}
+
+	region = vhost_rdma_pool_alloc(&dev->mr_pool, &idx);
+	if (region == NULL) {
+		RDMA_LOG_ERR("mr alloc failed");
+		return -ENOMEM;
+	}
+
+	/* First reference on the MR; the MR in turn pins its PD. */
+	vhost_rdma_ref_init(region);
+	vhost_rdma_add_ref(owner);
+
+	region->pd = owner;
+	region->type = VHOST_MR_TYPE_DMA;
+	region->state = VHOST_MR_STATE_VALID;
+	/* access has to be in place before the keys are derived from it */
+	region->access = get_cmd->access_flags;
+	vhost_rdma_mr_init_key(region, idx);
+	region->mrn = idx;
+
+	ack_rsp->mrn = idx;
+	ack_rsp->lkey = region->lkey;
+	ack_rsp->rkey = region->rkey;
+
+	return 0;
+}
+
+/**
+ * @brief Handle VHOST_RDMA_CTRL_ROCE_ALLOC_MR.
+ *
+ * Resolves the PD named in the request, allocates an MR from dev->mr_pool,
+ * records the requested access flags and page limit, binds it to the PD
+ * (taking a PD reference) and returns its index and keys. CHK_IOVEC is
+ * defined outside this hunk and may return early on an unsuitable buffer --
+ * TODO confirm.
+ *
+ * NOTE(review): the MR is tagged VHOST_MR_TYPE_DMA, the same type the
+ * GET_DMA_MR handler uses, and the response reuses
+ * struct vhost_rdma_ack_get_dma_mr -- confirm both are intended.
+ *
+ * @param dev Device owning the PD and MR pools.
+ * @param in  Request buffer holding struct vhost_rdma_cmd_alloc_mr.
+ * @param out Response buffer receiving struct vhost_rdma_ack_get_dma_mr.
+ * @return 0 on success, -EINVAL if the PD is unknown, -ENOMEM if no MR
+ *         could be allocated.
+ */
+static int
+vhost_rdma_alloc_mr(struct vhost_rdma_device *dev, struct iovec *in,
+		struct iovec *out)
+{
+	struct vhost_rdma_cmd_alloc_mr *alloc_cmd;
+	struct vhost_rdma_ack_get_dma_mr *ack_rsp;
+	struct vhost_rdma_pd *owner;
+	struct vhost_rdma_mr *region;
+	uint32_t idx;
+
+	CHK_IOVEC(alloc_cmd, in);
+	CHK_IOVEC(ack_rsp, out);
+
+	owner = vhost_rdma_pool_get(&dev->pd_pool, alloc_cmd->pdn);
+	if (unlikely(owner == NULL)) {
+		RDMA_LOG_ERR("pd is not found");
+		return -EINVAL;
+	}
+
+	region = vhost_rdma_pool_alloc(&dev->mr_pool, &idx);
+	if (region == NULL) {
+		RDMA_LOG_ERR("mr alloc failed");
+		return -ENOMEM;
+	}
+
+	/* First reference on the MR; the MR in turn pins its PD. */
+	vhost_rdma_ref_init(region);
+	vhost_rdma_add_ref(owner);
+
+	region->pd = owner;
+	region->type = VHOST_MR_TYPE_DMA;
+	region->state = VHOST_MR_STATE_VALID;
+	region->max_pages = alloc_cmd->max_num_sg;
+	/* access has to be in place before the keys are derived from it */
+	region->access = alloc_cmd->access_flags;
+	vhost_rdma_mr_init_key(region, idx);
+	region->mrn = idx;
+
+	ack_rsp->mrn = idx;
+	ack_rsp->lkey = region->lkey;
+	ack_rsp->rkey = region->rkey;
+
+	return 0;
+}
+
+/**
+ * @brief Translate an array of guest page addresses with gpa_to_vva().
+ *
+ * Each entry of dma_pages is translated for a span of
+ * USER_MMAP_TARGET_PAGE_SIZE bytes and the result is stored at the same
+ * index of pages. The translation must cover the whole page; this is
+ * enforced with assert().
+ *
+ * @param mem       Guest memory table handed to gpa_to_vva().
+ * @param pages     Output array of npages translated addresses.
+ * @param dma_pages Input array of npages guest addresses.
+ * @param npages    Number of entries in both arrays.
+ */
+void
+vhost_rdma_map_pages(struct rte_vhost_memory *mem, uint64_t *pages,
+		uint64_t *dma_pages, uint32_t npages)
+{
+	uint32_t i;
+
+	for (i = 0; i < npages; i++) {
+		/*
+		 * The length is passed by pointer, so gpa_to_vva() can presumably
+		 * shrink it. Start every page from a full page so that one short
+		 * translation cannot reduce the span requested for later pages
+		 * when assert() is compiled out.
+		 */
+		uint64_t len = USER_MMAP_TARGET_PAGE_SIZE;
+
+		pages[i] = gpa_to_vva(mem, dma_pages[i], &len);
+		assert(len == USER_MMAP_TARGET_PAGE_SIZE);
+	}
+}
+
+/**
+ * @brief Handle VHOST_RDMA_CTRL_ROCE_REG_USER_MR.
+ *
+ * Resolves the PD named in the request, translates the page list that
+ * trails the request header via vhost_rdma_map_pages(), and creates a valid
+ * MR describing the region. CHK_IOVEC is defined outside this hunk and may
+ * return early on an unsuitable buffer -- TODO confirm its exact behaviour.
+ *
+ * The page count is supplied by the guest, so it is bounded against the
+ * size of the request buffer before pages[] is read. The translated page
+ * array is allocated before the MR is taken from the pool so that no failure
+ * path leaves a pool entry allocated.
+ *
+ * @param dev Device owning the PD and MR pools and the guest memory table.
+ * @param in  Request buffer holding struct vhost_rdma_cmd_reg_user_mr
+ *            followed by npages page addresses.
+ * @param out Response buffer receiving struct vhost_rdma_ack_reg_user_mr.
+ * @return 0 on success, -EINVAL if the PD is unknown or the page list does
+ *         not fit in the request buffer, -ENOMEM on allocation failure.
+ */
+static int
+vhost_rdma_reg_user_mr(struct vhost_rdma_device *dev, struct iovec *in,
+		struct iovec *out)
+{
+	struct vhost_rdma_cmd_reg_user_mr *reg_cmd;
+	struct vhost_rdma_ack_reg_user_mr *ack_rsp;
+	struct vhost_rdma_mr *mr;
+	struct vhost_rdma_pd *pd;
+	uint64_t *pages;
+	uint32_t mrn;
+
+	CHK_IOVEC(reg_cmd, in);
+	CHK_IOVEC(ack_rsp, out);
+
+	/* pages[] is read straight from the request: it must lie inside it. */
+	if (in->iov_len < sizeof(*reg_cmd) ||
+	    reg_cmd->npages > (in->iov_len - sizeof(*reg_cmd)) / sizeof(reg_cmd->pages[0])) {
+		RDMA_LOG_ERR("mr page list exceeds request buffer");
+		return -EINVAL;
+	}
+
+	pd = vhost_rdma_pool_get(&dev->pd_pool, reg_cmd->pdn);
+	if (unlikely(pd == NULL)) {
+		RDMA_LOG_ERR("pd is not found");
+		return -EINVAL;
+	}
+
+	/* calloc() rejects a count * size overflow; NULL is fine for npages == 0. */
+	pages = calloc(reg_cmd->npages, sizeof(*pages));
+	if (pages == NULL && reg_cmd->npages != 0) {
+		RDMA_LOG_ERR("mr pages alloc failed");
+		return -ENOMEM;
+	}
+
+	mr = vhost_rdma_pool_alloc(&dev->mr_pool, &mrn);
+	if (mr == NULL) {
+		RDMA_LOG_ERR("mr alloc failed");
+		free(pages);
+		return -ENOMEM;
+	}
+	mr->pages = pages;
+
+	/* First reference on the MR; the MR in turn pins its PD. */
+	vhost_rdma_ref_init(mr);
+	vhost_rdma_add_ref(pd);
+
+	vhost_rdma_map_pages(dev->mem, mr->pages, (uint64_t *)reg_cmd->pages, reg_cmd->npages);
+
+	mr->pd = pd;
+	mr->access = reg_cmd->access_flags;
+	mr->length = reg_cmd->length;
+	/* The region start is stored aligned down to a page boundary. */
+	mr->iova = reg_cmd->virt_addr & USER_MMAP_PAGE_MASK;
+	mr->npages = reg_cmd->npages;
+	mr->type = VHOST_MR_TYPE_MR;
+	mr->state = VHOST_MR_STATE_VALID;
+	vhost_rdma_mr_init_key(mr, mrn);
+	mr->mrn = mrn;
+
+	ack_rsp->lkey = mr->lkey;
+	ack_rsp->rkey = mr->rkey;
+	ack_rsp->mrn = mrn;
+
+	return 0;
+}
+
+/**
+ * @brief Handle VHOST_RDMA_CTRL_ROCE_DEREG_MR.
+ *
+ * Looks up the MR named by the request, marks it as a zombie, then drops
+ * the reference the MR holds on its PD and one reference on the MR itself.
+ * CHK_IOVEC is defined outside this hunk and may return early on an
+ * unsuitable buffer -- TODO confirm its exact behaviour.
+ *
+ * @param dev Device owning the MR pool.
+ * @param in  Request buffer holding struct vhost_rdma_cmd_dereg_mr.
+ * @return 0 on success, -EINVAL if the MR handle does not resolve to an MR.
+ */
+static int
+vhost_rdma_dereg_mr(struct vhost_rdma_device *dev, struct iovec *in, CTRL_NO_RSP)
+{
+	struct vhost_rdma_cmd_dereg_mr *dereg_cmd;
+	struct vhost_rdma_mr *mr;
+
+	CHK_IOVEC(dereg_cmd, in);
+
+	mr = vhost_rdma_pool_get(&dev->mr_pool, dereg_cmd->mrn);
+	if (unlikely(mr == NULL)) {
+		RDMA_LOG_ERR("mr not found");
+		/* Everything below dereferences mr, so fail the command here. */
+		return -EINVAL;
+	}
+
+	mr->state = VHOST_MR_STATE_ZOMBIE;
+
+	vhost_rdma_drop_ref(mr->pd, dev, pd);
+	vhost_rdma_drop_ref(mr, dev, mr);
+
+	RDMA_LOG_DEBUG("destroy mr %u", dereg_cmd->mrn);
+
+	return 0;
+}
+
/* Command handler table declaration */
struct {
int (*handler)(struct vhost_rdma_device *dev, struct iovec *in, struct iovec *out);
@@ -663,6 +858,10 @@ struct {
DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_DESTROY_CQ, vhost_rdma_destroy_cq),
DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_CREATE_PD, vhost_rdma_create_pd),
DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_DESTROY_PD, vhost_rdma_destroy_pd),
+ DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_GET_DMA_MR, vhost_rdma_get_dma_mr),
+ DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_ALLOC_MR, vhost_rdma_alloc_mr),
+ DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_REG_USER_MR, vhost_rdma_reg_user_mr),
+ DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_DEREG_MR, vhost_rdma_dereg_mr),
};
/**
diff --git a/examples/vhost_user_rdma/vhost_rdma_ib.h b/examples/vhost_user_rdma/vhost_rdma_ib.h
index 6356abc65a..ddfdcf4917 100644
--- a/examples/vhost_user_rdma/vhost_rdma_ib.h
+++ b/examples/vhost_user_rdma/vhost_rdma_ib.h
@@ -58,6 +58,9 @@ struct vhost_queue;
/** Maximum size for config space read/write operations */
#define VHOST_USER_MAX_CONFIG_SIZE 256
+#define USER_MMAP_TARGET_PAGE_SIZE 4096
+#define USER_MMAP_PAGE_MASK (~(USER_MMAP_TARGET_PAGE_SIZE-1))
+
/** ROCE control command types (virtio-rdma extension) */
#define VHOST_RDMA_CTRL_ROCE 6
#define VHOST_RDMA_CTRL_ROCE_QUERY_DEVICE 0
@@ -249,6 +252,14 @@ enum ib_port_speed {
VHOST_RDMA_IB_SPEED_NDR = 128,
};
+enum vhost_ib_access_flags {
+ VHOST_RDMA_IB_ACCESS_LOCAL_WRITE = (1 << 0),
+ VHOST_RDMA_IB_ACCESS_REMOTE_WRITE = (1 << 1),
+ VHOST_RDMA_IB_ACCESS_REMOTE_READ = (1 << 2),
+};
+
+#define VHOST_RDMA_IB_ACCESS_REMOTE (VHOST_RDMA_IB_ACCESS_REMOTE_WRITE | VHOST_RDMA_IB_ACCESS_REMOTE_READ)
+
/**
* @brief QP capabilities structure
*/
@@ -707,6 +718,60 @@ struct vhost_rdma_cmd_destroy_pd {
uint32_t pdn;
};
+struct vhost_rdma_cmd_alloc_mr {
+ /* The handle of PD which the MR associated with */
+ uint32_t pdn;
+ /* MR's protection attributes, enum virtio_ib_access_flags */
+ uint32_t access_flags;
+ uint32_t max_num_sg;
+};
+struct vhost_rdma_cmd_get_dma_mr {
+ /* The handle of PD which the MR associated with */
+ uint32_t pdn;
+ /* MR's protection attributes, enum virtio_ib_access_flags */
+ uint32_t access_flags;
+};
+
+struct vhost_rdma_ack_get_dma_mr {
+ /* The handle of MR */
+ uint32_t mrn;
+ /* MR's local access key */
+ uint32_t lkey;
+ /* MR's remote access key */
+ uint32_t rkey;
+};
+
+struct vhost_rdma_cmd_reg_user_mr {
+ /* The handle of PD which the MR associated with */
+ uint32_t pdn;
+ /* MR's protection attributes, enum virtio_ib_access_flags */
+ uint32_t access_flags;
+ /* Starting virtual address of MR */
+ uint64_t virt_addr;
+ /* Length of MR */
+ uint64_t length;
+ /* Size of the below page array */
+ uint32_t npages;
+ /* Padding */
+ uint32_t padding;
+ /* Array to store physical address of each page in MR */
+ uint64_t pages[];
+};
+
+struct vhost_rdma_ack_reg_user_mr {
+ /* The handle of MR */
+ uint32_t mrn;
+ /* MR's local access key */
+ uint32_t lkey;
+ /* MR's remote access key */
+ uint32_t rkey;
+};
+
+struct vhost_rdma_cmd_dereg_mr {
+ /* The handle of MR */
+ uint32_t mrn;
+};
+
/**
* @brief Convert IB MTU enum to byte size
* @param mtu The MTU enum value
@@ -792,4 +857,13 @@ int setup_iovs_from_descs(struct rte_vhost_memory *mem,
uint16_t *num_in,
uint16_t *num_out);
+/*
+ * Initialise the keys of @mr for the MR number @mrn.
+ * NOTE(review): implementation is not visible from this header; presumably
+ * derives lkey/rkey from mrn - confirm against vhost_rdma_ib.c.
+ */
+void vhost_rdma_mr_init_key(struct vhost_rdma_mr *mr, uint32_t mrn);
+
+/*
+ * Produce the next key value following @last_key.
+ * NOTE(review): takes a 32-bit value but returns only 8 bits - presumably the
+ * low "key" byte of an lkey/rkey; confirm the truncation is intended.
+ */
+uint8_t vhost_rdma_get_next_key(uint32_t last_key);
+
+/*
+ * Fill @dma_pages from @pages for @npages entries using the guest memory
+ * table @mem.
+ * NOTE(review): presumably translates guest physical page addresses into
+ * host virtual addresses - confirm; no error is reported to the caller.
+ */
+void vhost_rdma_map_pages(struct rte_vhost_memory *mem,
+ uint64_t *pages,
+ uint64_t *dma_pages,
+ uint32_t npages);
+
#endif /* __VHOST_RDMA_IB_H__ */
\ No newline at end of file
--
2.43.0
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCH 06/14] examples/vhost_user_rdma: implement comprehensive queue pair lifecycle management
2025-12-17 8:49 Implement initial driver for virtio-RDMA devices(kernel), virtio-rdma device model(qemu) and vhost-user-RDMA backend device(dpdk) Xiong Weimin
` (5 preceding siblings ...)
2025-12-17 8:49 ` [PATCH 05/14] examples/vhost_user_rdma: implement comprehensive memory region management Xiong Weimin
@ 2025-12-17 8:49 ` Xiong Weimin
2025-12-17 8:49 ` [PATCH 07/14] examples/vhost_user_rdma: Implement high-performance requester engine with advanced flow control Xiong Weimin
` (3 subsequent siblings)
10 siblings, 0 replies; 17+ messages in thread
From: Xiong Weimin @ 2025-12-17 8:49 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann, David S . Miller,
Jakub Kicinski, Jesper Dangaard Brouer, John Fastabend,
Stanislav Fomichev
Cc: linux-kernel, netdev, xiongweimin
From: xiongweimin <xiongweimin@kylinos.cn>
This commit adds core functionality for managing RDMA Queue Pairs (QPs):
1. QP creation with type-specific handling (RC/UC/UD/GSI)
2. QP state modification and validation
3. QP attribute querying
4. QP destruction with resource cleanup
5. Address vector to attribute conversion
Key features:
- Special handling for General Service Interface (GSI) QPs
- Detailed QP state tracking (RESET, INIT, RTR, RTS, SQD, ERROR)
- Timer management for reliable connections (retransmit, RNR NAK)
- Virtual queue initialization and cleanup
- Atomic reference counting for lifecycle management
- Comprehensive attribute reporting for QP query
Signed-off-by: Xiong Weimin <xiongweimin@kylinos.cn>
Change-Id: I6bc5d82867e49ac1bfd83993b28620f91f17ce4f
---
examples/vhost_user_rdma/meson.build | 2 +
examples/vhost_user_rdma/vhost_rdma.h | 70 +-
examples/vhost_user_rdma/vhost_rdma_ib.c | 284 ++++-
examples/vhost_user_rdma/vhost_rdma_ib.h | 255 ++++-
examples/vhost_user_rdma/vhost_rdma_opcode.c | 894 +++++++++++++++
examples/vhost_user_rdma/vhost_rdma_opcode.h | 330 ++++++
examples/vhost_user_rdma/vhost_rdma_pkt.h | 238 ----
examples/vhost_user_rdma/vhost_rdma_queue.c | 1056 ++++++++++++++++++
examples/vhost_user_rdma/vhost_rdma_queue.h | 338 ++++++
9 files changed, 3169 insertions(+), 298 deletions(-)
create mode 100644 examples/vhost_user_rdma/vhost_rdma_opcode.c
create mode 100644 examples/vhost_user_rdma/vhost_rdma_opcode.h
create mode 100644 examples/vhost_user_rdma/vhost_rdma_queue.c
create mode 100644 examples/vhost_user_rdma/vhost_rdma_queue.h
diff --git a/examples/vhost_user_rdma/meson.build b/examples/vhost_user_rdma/meson.build
index d6ccaf32a4..a032a27767 100644
--- a/examples/vhost_user_rdma/meson.build
+++ b/examples/vhost_user_rdma/meson.build
@@ -41,5 +41,7 @@ sources = files(
'main.c',
'vhost_rdma.c',
'vhost_rdma_ib.c',
+ 'vhost_rdma_queue.c',
+ 'vhost_rdma_opcode.c',
)
diff --git a/examples/vhost_user_rdma/vhost_rdma.h b/examples/vhost_user_rdma/vhost_rdma.h
index c1531d1a7a..980bb74beb 100644
--- a/examples/vhost_user_rdma/vhost_rdma.h
+++ b/examples/vhost_user_rdma/vhost_rdma.h
@@ -16,6 +16,7 @@
#include <stdint.h>
#include <stdbool.h>
+#include <netinet/in.h>
#include <rte_byteorder.h>
#include <rte_common.h>
@@ -26,6 +27,7 @@
#include <rte_mempool.h>
#include <rte_ring.h>
#include <rte_bitmap.h>
+#include <rte_mbuf.h>
#include "vhost_rdma_ib.h"
#include "eal_interrupts.h"
@@ -106,6 +108,25 @@ enum vhost_rdma_counters {
VHOST_RDMA_NUM_OF_COUNTERS
};
+/*
+ * Network encapsulation of a GID entry, as returned by
+ * rdma_gid_attr_network_type(). Values are implicit (0..3).
+ */
+enum vhost_rdma_network_type {
+ VHOST_RDMA_NETWORK_IB,
+ VHOST_RDMA_NETWORK_ROCE_V1,
+ VHOST_RDMA_NETWORK_IPV4,
+ VHOST_RDMA_NETWORK_IPV6
+};
+
+/*
+ * IP-version selector constants.
+ * NOTE(review): these values (1, 2) differ from VHOST_RDMA_NETWORK_IPV4/IPV6
+ * in enum vhost_rdma_network_type (2, 3); the two sets are not
+ * interchangeable. Users of these constants are not visible here - confirm.
+ */
+enum {
+ VHOST_NETWORK_TYPE_IPV4 = 1,
+ VHOST_NETWORK_TYPE_IPV6 = 2,
+};
+
+/*
+ * Type of a GID table entry; compared against the type field of
+ * struct vhost_rdma_gid in rdma_gid_attr_network_type().
+ * VHOST_RDMA_IB_GID_TYPE_SIZE is the number of real types, not a type itself.
+ */
+enum vhost_rdma_ib_gid_type {
+ VHOST_RDMA_IB_GID_TYPE_IB,
+ VHOST_RDMA_IB_GID_TYPE_ROCE,
+ VHOST_RDMA_IB_GID_TYPE_ROCE_UDP_ENCAP,
+ VHOST_RDMA_IB_GID_TYPE_SIZE
+};
+
struct vhost_rdma_net_dev {
int vid;
uint64_t features;
@@ -299,21 +320,6 @@ vhost_rdma_vq_is_avail(struct vhost_user_queue *vq)
return vq->vring.avail->idx != vq->last_avail_idx;
}
-/**
- * @brief Get pointer to element at given index in a generic data ring.
- *
- * Used for accessing pre-allocated memory pools where each element has fixed size.
- *
- * @param queue Pointer to the queue containing data buffer.
- * @param idx Index of the desired element.
- * @return Pointer to the data at position idx.
- */
-static __rte_always_inline void *
-vhost_rdma_queue_get_data(struct vhost_rdma_queue *queue, size_t idx)
-{
- return queue->data + queue->elem_size * idx;
-}
-
/**
* @brief Retrieve the next available descriptor index from the avail ring.
*
@@ -417,6 +423,40 @@ gpa_to_vva(struct rte_vhost_memory *mem, uint64_t gpa, uint64_t *len)
return rte_vhost_va_from_guest_pa(mem, gpa, len);
}
+/* Return true when @a is an IPv4-mapped IPv6 address (::ffff:a.b.c.d). */
+static inline bool ipv6_addr_v4mapped(const struct in6_addr *a)
+{
+	const bool mapped = IN6_IS_ADDR_V4MAPPED(a);
+
+	return mapped;
+}
+
+/*
+ * Convert a 16-byte GID into a socket address.
+ *
+ * An IPv4-mapped GID yields an AF_INET sockaddr_in holding the last four GID
+ * bytes; any other GID yields an AF_INET6 sockaddr_in6 holding all 16 bytes.
+ * All other fields of the written structure are zeroed.
+ *
+ * @out must point to storage large enough for a struct sockaddr_in6 (e.g. a
+ * struct sockaddr_storage); @gid must point to at least 16 readable bytes.
+ */
+static inline void rdma_gid2ip(struct sockaddr *out, uint8_t *gid)
+{
+	struct in6_addr addr;
+
+	/*
+	 * Work on a local copy: @gid is a raw byte pointer with no alignment
+	 * guarantee, while the v4-mapped test may read struct in6_addr through
+	 * word-sized members.
+	 */
+	rte_memcpy(&addr, gid, sizeof(addr));
+
+	if (ipv6_addr_v4mapped(&addr)) {
+		struct sockaddr_in *out_in = (struct sockaddr_in *)out;
+
+		memset(out_in, 0, sizeof(*out_in));
+		out_in->sin_family = AF_INET;
+		/* The IPv4 address occupies the last 4 bytes of the GID. */
+		rte_memcpy(&out_in->sin_addr.s_addr, gid + 12, 4);
+	} else {
+		struct sockaddr_in6 *out_in = (struct sockaddr_in6 *)out;
+
+		memset(out_in, 0, sizeof(*out_in));
+		out_in->sin6_family = AF_INET6;
+		rte_memcpy(&out_in->sin6_addr.s6_addr, gid, 16);
+	}
+}
+
+/*
+ * Map a GID table entry to its network type: IB and RoCE(v1) entries map
+ * directly from the entry type; every other type is classified as IPv4 or
+ * IPv6 depending on whether the GID is an IPv4-mapped IPv6 address.
+ */
+static inline enum vhost_rdma_network_type rdma_gid_attr_network_type(const struct vhost_rdma_gid *attr)
+{
+	switch (attr->type) {
+	case VHOST_RDMA_IB_GID_TYPE_IB:
+		return VHOST_RDMA_NETWORK_IB;
+	case VHOST_RDMA_IB_GID_TYPE_ROCE:
+		return VHOST_RDMA_NETWORK_ROCE_V1;
+	default:
+		break;
+	}
+
+	return ipv6_addr_v4mapped((struct in6_addr *)&attr->gid) ?
+		VHOST_RDMA_NETWORK_IPV4 : VHOST_RDMA_NETWORK_IPV6;
+}
+
int vhost_rdma_construct(struct vhost_rdma_device *dev, const char *path, int idx);
void vhost_rdma_net_construct(struct vhost_user_queue *queues, int idx);
void vs_vhost_rdma_net_setup(int vid);
diff --git a/examples/vhost_user_rdma/vhost_rdma_ib.c b/examples/vhost_user_rdma/vhost_rdma_ib.c
index 3002498151..aac5c28e9a 100644
--- a/examples/vhost_user_rdma/vhost_rdma_ib.c
+++ b/examples/vhost_user_rdma/vhost_rdma_ib.c
@@ -1,5 +1,5 @@
/*
- * Vhost-user RDMA device : init and packets forwarding
+ * Vhost-user RDMA device : Main function of rdma device
*
* Copyright (C) 2025 KylinSoft Inc. and/or its affiliates. All rights reserved.
*
@@ -24,6 +24,8 @@
#include "vhost_rdma_ib.h"
#include "vhost_rdma_log.h"
#include "vhost_rdma_pkt.h"
+#include "vhost_rdma_queue.h"
+#include "vhost_rdma_opcode.h"
#define CHK_IOVEC(tp, iov) \
do { \
@@ -39,6 +41,22 @@
#define CTRL_NO_CMD __rte_unused struct iovec *__in
#define CTRL_NO_RSP __rte_unused struct iovec *__out
+/*
+ * Reset the responder resource ring indices of @qp and allocate a zeroed
+ * array of @n struct vhost_rdma_resp_res entries.
+ *
+ * With @n == 0 no array is allocated and qp->resp.resources is set to NULL.
+ *
+ * Returns 0 on success, -ENOMEM if the allocation fails.
+ */
+int alloc_rd_atomic_resources(struct vhost_rdma_qp *qp, unsigned int n)
+{
+	qp->resp.res_head = 0;
+	qp->resp.res_tail = 0;
+
+	if (n == 0) {
+		qp->resp.resources = NULL;
+		return 0;
+	}
+
+	qp->resp.resources = rte_zmalloc(NULL, sizeof(struct vhost_rdma_resp_res) * n, 0);
+
+	return qp->resp.resources ? 0 : -ENOMEM;
+}
+
/**
* @brief Free resources held by a response entry in the RDMA responder path.
*
@@ -134,23 +152,6 @@ free_rd_atomic_resources(struct vhost_rdma_qp *qp)
RDMA_LOG_DEBUG("Successfully freed response resource array for QP %u", qp->qpn);
}
-
-/**
- * @brief Clean up a vhost RDMA queue.
- */
-void
-vhost_rdma_queue_cleanup(struct vhost_rdma_qp *qp, struct vhost_rdma_queue *queue)
-{
- if (!queue)
- return;
-
- if (queue->cb && qp)
- rte_intr_callback_unregister(&queue->intr_handle, queue->cb, qp);
-
- rte_free(queue->data);
- queue->data = NULL;
-}
-
/**
* @brief Cleanup callback for MR: reset type.
*/
@@ -493,7 +494,7 @@ setup_iovs_from_descs(struct rte_vhost_memory *mem,
out++; /* Descriptor allows read (output) */
}
- /* Translate payload (address + length) into iovec(s) */
+ /* Translate payload (address + length) into iovec(s) */
if (desc_payload_to_iovs(mem, iovs,
&iovs_idx,
desc->addr,
@@ -847,6 +848,247 @@ vhost_rdma_dereg_mr(struct vhost_rdma_device *dev, struct iovec *in, CTRL_NO_RSP
return 0;
}
+/**
+ * vhost_rdma_create_qp - Create a Queue Pair (QP) for vhost RDMA device
+ * @dev: Pointer to the vhost RDMA device
+ * @in: Input iovec containing the create-QP command
+ * @out: Output iovec that receives the create-QP response
+ *
+ * A GSI request reuses the device's pre-allocated GSI QP (only one may be
+ * valid at a time) and gets the well-known QPN VHOST_RDMA_GSI_QPN; RC, UD and
+ * UC requests take a QP from the device QP pool. The QP is then initialised
+ * with vhost_rdma_qp_init() and its QPN is written to the response.
+ *
+ * Returns 0 on success, -EINVAL on bad arguments, an unsupported QP type, a
+ * duplicate GSI QP or an initialisation failure, and -ENOMEM when the pool is
+ * exhausted.
+ */
+static int
+vhost_rdma_create_qp(struct vhost_rdma_device *dev,
+		     struct iovec *in,
+		     struct iovec *out)
+{
+	struct vhost_rdma_cmd_create_qp *create_cmd;
+	struct vhost_rdma_ack_create_qp *ack_rsp;
+	struct vhost_rdma_qp *qp = NULL;
+	uint32_t qpn;
+	int ret = 0;
+
+	/* Validate input parameters */
+	if (!dev || !in || !out) {
+		RDMA_LOG_ERR("Invalid argument: null pointer detected");
+		return -EINVAL;
+	}
+
+	/* Map iovec buffers to command and response structures */
+	CHK_IOVEC(create_cmd, in);
+	CHK_IOVEC(ack_rsp, out);
+
+	/* Handle different QP types */
+	switch (create_cmd->qp_type) {
+	case VHOST_RDMA_IB_QPT_GSI:
+		/* Only one GSI QP is allowed, check if already created */
+		if (dev->qp_gsi->valid) {
+			RDMA_LOG_ERR("GSI QP already exists, cannot create duplicate");
+			return -EINVAL;
+		}
+		qp = dev->qp_gsi;		/* Use pre-allocated GSI QP */
+		qpn = VHOST_RDMA_GSI_QPN;	/* Well-known GSI QPN */
+		break;
+
+	case VHOST_RDMA_IB_QPT_RC:
+	case VHOST_RDMA_IB_QPT_UD:
+	case VHOST_RDMA_IB_QPT_UC:
+		/* RC, UD and UC QPs are all taken from the device QP pool */
+		qp = vhost_rdma_pool_alloc(&dev->qp_pool, &qpn);
+		if (!qp) {
+			RDMA_LOG_ERR("Failed to allocate QP from pool for type %d",
+				     create_cmd->qp_type);
+			return -ENOMEM;
+		}
+		break;
+
+	default:
+		/* Unsupported QP type */
+		RDMA_LOG_ERR("Unsupported QP type %d", create_cmd->qp_type);
+		return -EINVAL;
+	}
+
+	/* Initialize reference counter for the newly acquired QP */
+	vhost_rdma_ref_init(qp);
+
+	/* Set QP number */
+	qp->qpn = qpn;
+
+	/* Initialize QP internal state (queues, CQ bindings, etc.) */
+	if (vhost_rdma_qp_init(dev, qp, create_cmd)) {
+		RDMA_LOG_ERR("Failed to initialize QP %u", qpn);
+		ret = -EINVAL;
+		goto err_qp_init;
+	}
+
+	/* Populate acknowledgment response with allocated QPN */
+	ack_rsp->qpn = qpn;
+
+	/* Log successful QP creation with key attributes */
+	RDMA_LOG_INFO("Created QP %u | Type=%d | SQ_VQ_ID=%u | RQ_VQ_ID=%u | "
+		      "Send_CQN=%u | Recv_CQN=%u",
+		      qp->qpn, create_cmd->qp_type,
+		      qp->sq.queue.vq->id,
+		      qp->rq.queue.vq->id,
+		      create_cmd->send_cqn,
+		      create_cmd->recv_cqn);
+
+	return 0;
+
+err_qp_init:
+	/*
+	 * Release the reference taken above. The GSI QP was not obtained from
+	 * the QP pool, so it must not be dropped here - the same rule that
+	 * vhost_rdma_destroy_qp() applies. The request's qp_type is used
+	 * because qp->type may not have been set by the failed init.
+	 */
+	if (create_cmd->qp_type != VHOST_RDMA_IB_QPT_GSI)
+		vhost_rdma_drop_ref(qp, dev, qp);
+	return ret;
+}
+
+/**
+ * vhost_rdma_modify_qp - Handle the modify-QP control command
+ * @dev: Pointer to the vhost RDMA device
+ * @in: Input iovec containing struct vhost_rdma_cmd_modify_qp
+ *
+ * Looks up the QP named by the command, validates the requested attribute
+ * change with vhost_rdma_qp_validate() and applies it with
+ * vhost_rdma_qp_modify(). This command has no response payload.
+ *
+ * Returns 0 on success, -EINVAL if the QP does not exist, or the error
+ * returned by the validate/modify step.
+ */
+static int
+vhost_rdma_modify_qp(struct vhost_rdma_device *dev, struct iovec *in, CTRL_NO_RSP)
+{
+	struct vhost_rdma_cmd_modify_qp *cmd;
+	struct vhost_rdma_qp *qp;
+	int err;
+
+	CHK_IOVEC(cmd, in);
+
+	qp = vhost_rdma_pool_get(&dev->qp_pool, cmd->qpn);
+	if (unlikely(qp == NULL)) {
+		RDMA_LOG_ERR("qp not found");
+		/* Bail out: the QP is dereferenced by the calls below. */
+		return -EINVAL;
+	}
+
+	// FIXME: check in driver?
+	err = vhost_rdma_qp_validate(dev, qp, cmd);
+	if (err)
+		return err;
+
+	return vhost_rdma_qp_modify(dev, qp, cmd);
+}
+
+/*
+ * Copy the routing information of address vector @av into the AH attribute
+ * @attr: destination GID, flow label, source GID index, hop limit and traffic
+ * class of the global route, plus the destination MAC. No other field of
+ * @attr is written.
+ */
+void vhost_rdma_av_to_attr(struct vhost_rdma_av *av,
+			   struct vhost_rdma_ah_attr *attr)
+{
+	struct vhost_rdma_global_route *route = &attr->grh;
+
+	/* Scalar global-route fields */
+	route->traffic_class = av->grh.traffic_class;
+	route->hop_limit = av->grh.hop_limit;
+	route->sgid_index = av->grh.sgid_index;
+	route->flow_label = av->grh.flow_label;
+
+	/* Byte-array fields */
+	rte_memcpy(route->dgid, av->grh.dgid, sizeof(av->grh.dgid));
+	rte_memcpy(attr->dmac, av->dmac, ETH_ALEN);
+}
+
+/*
+ * Fill the query-QP response @rsp from the current state of @qp.
+ *
+ * Most fields are copied from qp->attr; the PSNs come from the live
+ * responder/requester contexts, the address handle is derived from qp->av,
+ * and sq_draining is 1 only while the requester is in QP_STATE_DRAIN.
+ *
+ * Always returns 0.
+ */
+int vhost_rdma_qp_query(struct vhost_rdma_qp *qp,
+			struct vhost_rdma_ack_query_qp *rsp)
+{
+	/* Attributes cached in the QP */
+	rsp->qp_state = qp->attr.qp_state;
+	rsp->path_mtu = qp->attr.path_mtu;
+	rsp->qkey = qp->attr.qkey;
+	rsp->dest_qp_num = qp->attr.dest_qp_num;
+	rsp->qp_access_flags = qp->attr.qp_access_flags;
+	rsp->rate_limit = qp->attr.rate_limit;
+
+	/* Reliability parameters */
+	rsp->max_rd_atomic = qp->attr.max_rd_atomic;
+	rsp->max_dest_rd_atomic = qp->attr.max_dest_rd_atomic;
+	rsp->min_rnr_timer = qp->attr.min_rnr_timer;
+	rsp->timeout = qp->attr.timeout;
+	rsp->retry_cnt = qp->attr.retry_cnt;
+	rsp->rnr_retry = qp->attr.rnr_retry;
+
+	/* Current sequence numbers */
+	rsp->sq_psn = qp->req.psn;
+	rsp->rq_psn = qp->resp.psn;
+
+	/* Queue capabilities */
+	rsp->cap.max_send_wr = qp->attr.cap.max_send_wr;
+	rsp->cap.max_send_sge = qp->attr.cap.max_send_sge;
+	rsp->cap.max_recv_wr = qp->attr.cap.max_recv_wr;
+	rsp->cap.max_recv_sge = qp->attr.cap.max_recv_sge;
+	rsp->cap.max_inline_data = qp->attr.cap.max_inline_data;
+
+	vhost_rdma_av_to_attr(&qp->av, &rsp->ah_attr);
+
+	rsp->sq_draining = (qp->req.state == QP_STATE_DRAIN) ? 1 : 0;
+
+	return 0;
+}
+
+/**
+ * vhost_rdma_query_qp - Handle the query-QP control command
+ * @dev: Pointer to the vhost RDMA device
+ * @in: Input iovec containing struct vhost_rdma_cmd_query_qp
+ * @out: Output iovec that receives struct vhost_rdma_ack_query_qp
+ *
+ * Returns 0 on success, -EINVAL if the requested QP does not exist.
+ */
+static int
+vhost_rdma_query_qp(struct vhost_rdma_device *dev,
+		    struct iovec *in,
+		    struct iovec *out)
+{
+	struct vhost_rdma_cmd_query_qp *cmd;
+	struct vhost_rdma_ack_query_qp *rsp;
+	struct vhost_rdma_qp *qp;
+
+	CHK_IOVEC(cmd, in);
+	CHK_IOVEC(rsp, out);
+
+	qp = vhost_rdma_pool_get(&dev->qp_pool, cmd->qpn);
+	if (unlikely(qp == NULL)) {
+		/* The QPN comes from the guest; never dereference a failed lookup. */
+		RDMA_LOG_ERR("qp not found");
+		return -EINVAL;
+	}
+
+	return vhost_rdma_qp_query(qp, rsp);
+}
+
+/*
+ * Tear down the runtime state of @qp.
+ *
+ * Marks the QP invalid, stops its tasks (and, for RC QPs, the retransmit and
+ * RNR-NAK timers), runs the requester/completer tasks one last time to flush
+ * pending work, cleans up both queues, rewinds the vring indices and frees
+ * the packet containers. The QP object itself is not released here.
+ */
+void vhost_rdma_qp_destroy(struct vhost_rdma_qp *qp)
+{
+	qp->valid = 0;
+	qp->qp_timeout_ticks = 0;
+	vhost_rdma_cleanup_task(&qp->resp.task);
+
+	if (qp->type == VHOST_RDMA_IB_QPT_RC) {
+		rte_timer_stop_sync(&qp->retrans_timer);
+		rte_timer_stop_sync(&qp->rnr_nak_timer);
+	}
+
+	vhost_rdma_cleanup_task(&qp->req.task);
+	vhost_rdma_cleanup_task(&qp->comp.task);
+
+	/* flush out any receive wr's or pending requests */
+	__vhost_rdma_do_task(&qp->req.task);
+	if (qp->sq.queue.vq) {
+		__vhost_rdma_do_task(&qp->comp.task);
+		__vhost_rdma_do_task(&qp->req.task);
+	}
+
+	vhost_rdma_queue_cleanup(qp, &qp->sq.queue);
+	vhost_rdma_queue_cleanup(qp, &qp->rq.queue);
+
+	/*
+	 * The vq pointers may be NULL (the flush above already allows for
+	 * that on the send side), so only rewind rings that exist.
+	 */
+	if (qp->sq.queue.vq) {
+		qp->sq.queue.vq->last_avail_idx = 0;
+		qp->sq.queue.vq->last_used_idx = 0;
+	}
+	if (qp->rq.queue.vq) {
+		qp->rq.queue.vq->last_avail_idx = 0;
+		qp->rq.queue.vq->last_used_idx = 0;
+	}
+
+	/* Clear the pointers so a repeated destroy cannot double-free. */
+	rte_free(qp->req_pkts);
+	qp->req_pkts = NULL;
+	rte_free(qp->resp_pkts);
+	qp->resp_pkts = NULL;
+}
+
+/**
+ * vhost_rdma_destroy_qp - Handle the destroy-QP control command
+ * @dev: Pointer to the vhost RDMA device
+ * @in: Input iovec containing struct vhost_rdma_cmd_destroy_qp
+ *
+ * Tears the QP down with vhost_rdma_qp_destroy() and, for every type except
+ * GSI, drops the reference on it. This command has no response payload.
+ *
+ * Returns 0 on success, -EINVAL if the requested QP does not exist.
+ */
+static int
+vhost_rdma_destroy_qp(struct vhost_rdma_device *dev, struct iovec *in, CTRL_NO_RSP)
+{
+	struct vhost_rdma_cmd_destroy_qp *cmd;
+	struct vhost_rdma_qp *qp;
+
+	CHK_IOVEC(cmd, in);
+
+	qp = vhost_rdma_pool_get(&dev->qp_pool, cmd->qpn);
+	if (unlikely(qp == NULL)) {
+		/* The QPN comes from the guest; never dereference a failed lookup. */
+		RDMA_LOG_ERR("qp not found");
+		return -EINVAL;
+	}
+
+	vhost_rdma_qp_destroy(qp);
+
+	/* The GSI QP is not dropped here; only the other QP types are. */
+	if (qp->type != VHOST_RDMA_IB_QPT_GSI)
+		vhost_rdma_drop_ref(qp, dev, qp);
+
+	return 0;
+}
+
/* Command handler table declaration */
struct {
int (*handler)(struct vhost_rdma_device *dev, struct iovec *in, struct iovec *out);
@@ -862,6 +1104,10 @@ struct {
DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_ALLOC_MR, vhost_rdma_alloc_mr),
DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_REG_USER_MR, vhost_rdma_reg_user_mr),
DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_DEREG_MR, vhost_rdma_dereg_mr),
+ DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_CREATE_QP, vhost_rdma_create_qp),
+ DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_MODIFY_QP, vhost_rdma_modify_qp),
+ DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_QUERY_QP, vhost_rdma_query_qp),
+ DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_DESTROY_QP, vhost_rdma_destroy_qp),
};
/**
diff --git a/examples/vhost_user_rdma/vhost_rdma_ib.h b/examples/vhost_user_rdma/vhost_rdma_ib.h
index ddfdcf4917..79575e735c 100644
--- a/examples/vhost_user_rdma/vhost_rdma_ib.h
+++ b/examples/vhost_user_rdma/vhost_rdma_ib.h
@@ -20,6 +20,7 @@
#include <netinet/in.h>
#include <linux/virtio_net.h>
+#include <linux/vhost_types.h>
#include <rte_spinlock.h>
#include <rte_atomic.h>
@@ -27,7 +28,7 @@
#include <rte_mbuf.h>
#include <rte_ring.h>
#include <rte_vhost.h>
-#include <linux/vhost_types.h>
+#include <rte_interrupts.h>
#include "eal_interrupts.h"
@@ -61,6 +62,8 @@ struct vhost_queue;
#define USER_MMAP_TARGET_PAGE_SIZE 4096
#define USER_MMAP_PAGE_MASK (~(USER_MMAP_TARGET_PAGE_SIZE-1))
+#define VHOST_RDMA_GSI_QPN 1
+
/** ROCE control command types (virtio-rdma extension) */
#define VHOST_RDMA_CTRL_ROCE 6
#define VHOST_RDMA_CTRL_ROCE_QUERY_DEVICE 0
@@ -121,6 +124,14 @@ struct vhost_rdma_ack_query_device {
uint32_t reserved[14];
};
+/*
+ * Internal processing state of a QP's requester/responder context; this is
+ * the type of the state field in struct vhost_rdma_req_info and
+ * struct vhost_rdma_resp_info. It is distinct from
+ * enum vhost_rdma_ib_qp_state, which the modify-QP command uses.
+ */
+enum vhost_rdma_qp_state {
+ QP_STATE_RESET,
+ QP_STATE_INIT,
+ QP_STATE_READY,
+ QP_STATE_DRAIN, /* req only */
+ QP_STATE_DRAINED, /* req only */
+ QP_STATE_ERROR
+};
/**
* @defgroup qp_states Queue Pair States
@@ -252,25 +263,43 @@ enum ib_port_speed {
VHOST_RDMA_IB_SPEED_NDR = 128,
};
+/*
+ * Bit flags, one bit per QP attribute. Presumably these are the bits of the
+ * attr_mask field in the modify-QP and query-QP commands, selecting which
+ * attributes a request refers to - confirm against the handlers.
+ * Bits 21-24 are reserved placeholders.
+ */
+enum vhost_rdma_ib_qp_attr_mask {
+ VHOST_RDMA_IB_QP_STATE = 1,
+ VHOST_RDMA_IB_QP_CUR_STATE = (1<<1),
+ VHOST_RDMA_IB_QP_EN_SQD_ASYNC_NOTIFY = (1<<2),
+ VHOST_RDMA_IB_QP_ACCESS_FLAGS = (1<<3),
+ VHOST_RDMA_IB_QP_PKEY_INDEX = (1<<4),
+ VHOST_RDMA_IB_QP_PORT = (1<<5),
+ VHOST_RDMA_IB_QP_QKEY = (1<<6),
+ VHOST_RDMA_IB_QP_AV = (1<<7),
+ VHOST_RDMA_IB_QP_PATH_MTU = (1<<8),
+ VHOST_RDMA_IB_QP_TIMEOUT = (1<<9),
+ VHOST_RDMA_IB_QP_RETRY_CNT = (1<<10),
+ VHOST_RDMA_IB_QP_RNR_RETRY = (1<<11),
+ VHOST_RDMA_IB_QP_RQ_PSN = (1<<12),
+ VHOST_RDMA_IB_QP_MAX_QP_RD_ATOMIC = (1<<13),
+ VHOST_RDMA_IB_QP_ALT_PATH = (1<<14),
+ VHOST_RDMA_IB_QP_MIN_RNR_TIMER = (1<<15),
+ VHOST_RDMA_IB_QP_SQ_PSN = (1<<16),
+ VHOST_RDMA_IB_QP_MAX_DEST_RD_ATOMIC = (1<<17),
+ VHOST_RDMA_IB_QP_PATH_MIG_STATE = (1<<18),
+ VHOST_RDMA_IB_QP_CAP = (1<<19),
+ VHOST_RDMA_IB_QP_DEST_QPN = (1<<20),
+ VHOST_RDMA_IB_QP_RESERVED1 = (1<<21),
+ VHOST_RDMA_IB_QP_RESERVED2 = (1<<22),
+ VHOST_RDMA_IB_QP_RESERVED3 = (1<<23),
+ VHOST_RDMA_IB_QP_RESERVED4 = (1<<24),
+ VHOST_RDMA_IB_QP_RATE_LIMIT = (1<<25),
+};
+
enum vhost_ib_access_flags {
- VHOST_RDMA_IB_ACCESS_LOCAL_WRITE = (1 << 0),
- VHOST_RDMA_IB_ACCESS_REMOTE_WRITE = (1 << 1),
- VHOST_RDMA_IB_ACCESS_REMOTE_READ = (1 << 2),
+ VHOST_RDMA_IB_ACCESS_LOCAL_WRITE = (1 << 0),
+ VHOST_RDMA_IB_ACCESS_REMOTE_WRITE = (1 << 1),
+ VHOST_RDMA_IB_ACCESS_REMOTE_READ = (1 << 2),
};
#define VHOST_RDMA_IB_ACCESS_REMOTE (VHOST_RDMA_IB_ACCESS_REMOTE_WRITE | VHOST_RDMA_IB_ACCESS_REMOTE_READ)
-/**
- * @brief QP capabilities structure
- */
-struct vhost_rdma_qp_cap {
- uint32_t max_send_wr; /**< Max work requests in send queue */
- uint32_t max_send_sge; /**< Max scatter-gather elements per send WR */
- uint32_t max_recv_wr; /**< Max work requests in receive queue */
- uint32_t max_recv_sge; /**< Max SGEs per receive WR */
- uint32_t max_inline_data; /**< Max inline data size supported */
-};
-
/**
* @brief Global route attributes (used in AH/GRH)
*/
@@ -292,7 +321,20 @@ struct vhost_rdma_ah_attr {
uint8_t port_num; /**< Physical port number */
uint8_t ah_flags; /**< Flags (e.g., GRH present) */
uint8_t dmac[6]; /**< Destination MAC address (for RoCE) */
-} __rte_packed;
+};
+
+/*
+ * QP capabilities: queue depths, scatter/gather limits and inline-data limit.
+ * Embedded in the modify-QP command and in the query-QP response.
+ */
+struct vhost_rdma_qp_cap {
+ /* Maximum number of outstanding WRs in SQ */
+ uint32_t max_send_wr;
+ /* Maximum number of s/g elements per WR in SQ */
+ uint32_t max_send_sge;
+ /* Maximum number of outstanding WRs in RQ */
+ uint32_t max_recv_wr;
+ /* Maximum number of s/g elements per WR in RQ */
+ uint32_t max_recv_sge;
+ /* Maximum amount of data (bytes) that can be posted inline to SQ */
+ uint32_t max_inline_data;
+};
/**
* @brief Queue Pair attributes
@@ -387,7 +429,7 @@ struct vhost_user_rdma_msg {
* @brief Completion Queue (CQ)
*/
struct vhost_rdma_cq {
- struct vhost_user_queue *vq; /**< Notification V-ring */
+ struct vhost_user_queue *vq; /**< Notification V-ring */
rte_spinlock_t cq_lock; /**< Protect CQ operations */
uint8_t notify; /**< Notify pending flag */
bool is_dying; /**< Being destroyed */
@@ -446,7 +488,7 @@ struct vhost_rdma_task {
* @brief Requester-side operation tracking
*/
struct vhost_rdma_req_info {
- enum vhost_rdma_ib_qp_state state;
+ enum vhost_rdma_qp_state state;
int wqe_index; /**< Current WQE index */
uint32_t psn; /**< Packet Sequence Number */
int opcode; /**< Operation type */
@@ -509,6 +551,28 @@ struct vhost_rdma_recv_wqe {
struct vhost_rdma_dma_info dma; /**< DMA context */
};
+/**
+ * @brief Internal representation of a Send Work Queue Entry (WQE)
+ *
+ * Holds a pointer to the originating send request plus the per-WQE
+ * bookkeeping (status, PSN range, acknowledged length, DMA progress).
+ * NOTE(review): the field descriptions below follow the names only; the code
+ * that reads and writes them is not part of this header - confirm.
+ */
+struct vhost_rdma_send_wqe {
+ struct vhost_rdma_sq_req *wr; /**< Original WR pointer (from ring) */
+ struct vhost_rdma_av av; /**< Address vector (path info) */
+ __u32 status; /**< Execution status (see ib_wc_status) */
+ __u32 state; /**< Processing state (e.g., active, done) */
+ __aligned_u64 iova; /**< IOVA base for DMA mapping */
+ __u32 mask; /**< Bitmask for PSN handling */
+ __u32 first_psn; /**< First Packet Sequence Number */
+ __u32 last_psn; /**< Last Packet Sequence Number */
+ __u32 ack_length; /**< Bytes acknowledged so far */
+ __u32 ssn; /**< Send Sequence Number */
+ __u32 has_rd_atomic; /**< Flag indicating RDMA read or atomic op */
+
+ /* DMA transfer progress */
+ struct vhost_rdma_dma_info dma;
+};
+
/**
* @brief Memory Region (MR) types
*/
@@ -582,7 +646,7 @@ struct vhost_rdma_resp_res {
* @brief Response processing context (responder side)
*/
struct vhost_rdma_resp_info {
- enum vhost_rdma_ib_qp_state state;
+ enum vhost_rdma_qp_state state;
uint32_t msn; /**< Message sequence number */
uint32_t psn; /**< Current PSN */
uint32_t ack_psn; /**< Acknowledged PSN */
@@ -772,6 +836,127 @@ struct vhost_rdma_cmd_dereg_mr {
uint32_t mrn;
};
+/*
+ * Command payload for the create-QP control request.
+ * NOTE(review): the two uint8_t fields are followed by a uint32_t, so the
+ * compiler inserts implicit padding after sq_sig_all; the layout must match
+ * whatever the guest driver sends - confirm.
+ */
+struct vhost_rdma_cmd_create_qp {
+ /* Handle of the PD that the QP is associated with */
+ uint32_t pdn;
+#define VHOST_RDMA_IB_QPT_SMI 0
+#define VHOST_RDMA_IB_QPT_GSI 1
+#define VHOST_RDMA_IB_QPT_RC 2
+#define VHOST_RDMA_IB_QPT_UC 3
+#define VHOST_RDMA_IB_QPT_UD 4
+ /* QP's type, one of the VHOST_RDMA_IB_QPT_* values above */
+ uint8_t qp_type;
+ /* If set, each WR submitted to the SQ generates a completion entry */
+ uint8_t sq_sig_all;
+ /* Requested send-queue depth and s/g limit, and the send CQ number */
+ uint32_t max_send_wr;
+ uint32_t max_send_sge;
+ uint32_t send_cqn;
+ /* Requested receive-queue depth and s/g limit, and the receive CQ number */
+ uint32_t max_recv_wr;
+ uint32_t max_recv_sge;
+ uint32_t recv_cqn;
+
+ /* Requested inline-data limit */
+ uint32_t max_inline_data;
+ /* Reserved for future */
+ //uint32_t reserved[4];
+};
+
+/* Response payload for the create-QP control request. */
+struct vhost_rdma_ack_create_qp {
+ /* QP number assigned to the newly created QP */
+ uint32_t qpn;
+};
+
+/* Response payload for the query-QP control request; filled by vhost_rdma_qp_query(). */
+struct vhost_rdma_ack_query_qp {
+ /* Current state of the QP */
+ uint8_t qp_state;
+ /* Path MTU (valid only for RC/UC QPs) */
+ uint8_t path_mtu;
+ /* Is the SQ draining */
+ uint8_t sq_draining;
+ /* Number of outstanding RDMA read operations on destination QP (valid only for RC QPs) */
+ uint8_t max_rd_atomic;
+ /* Number of responder resources for handling incoming RDMA read operations (valid only for RC QPs) */
+ uint8_t max_dest_rd_atomic;
+ /* Minimum RNR NAK timer (valid only for RC QPs) */
+ uint8_t min_rnr_timer;
+ /* Local ack timeout (valid only for RC QPs) */
+ uint8_t timeout;
+ /* Retry count (valid only for RC QPs) */
+ uint8_t retry_cnt;
+ /* RNR retry (valid only for RC QPs) */
+ uint8_t rnr_retry;
+ /* Padding: 9 one-byte fields + 7 bytes = 16, so qkey is 4-byte aligned */
+ uint8_t padding[7];
+ /* Q_Key for the QP (valid only for UD QPs) */
+ uint32_t qkey;
+ /* PSN for RQ (valid only for RC/UC QPs) */
+ uint32_t rq_psn;
+ /* PSN for SQ */
+ uint32_t sq_psn;
+ /* Destination QP number (valid only for RC/UC QPs) */
+ uint32_t dest_qp_num;
+ /* Mask of enabled remote access operations (valid only for RC/UC QPs) */
+ uint32_t qp_access_flags;
+ /* Rate limit in kbps for packet pacing */
+ uint32_t rate_limit;
+ /* QP capabilities */
+ struct vhost_rdma_qp_cap cap;
+ /* Address Vector (valid only for RC/UC QPs) */
+ struct vhost_rdma_ah_attr ah_attr;
+ /* Reserved for future */
+ uint32_t reserved[4];
+};
+
+/* Path-migration state; type of path_mig_state in the modify-QP command. */
+enum vhost_rdma_ib_mig_state {
+ VHOST_RDMA_IB_MIG_MIGRATED,
+ VHOST_RDMA_IB_MIG_REARM,
+ VHOST_RDMA_IB_MIG_ARMED
+};
+
+/*
+ * Command payload for the modify-QP control request.
+ * NOTE(review): several fields are enum-typed and the structure relies on
+ * compiler-chosen enum size and padding; since it is exchanged with the
+ * guest, confirm the layout matches the driver's definition.
+ */
+struct vhost_rdma_cmd_modify_qp {
+ /* The index of QP */
+ uint32_t qpn;
+
+ /* Presumably bits of enum vhost_rdma_ib_qp_attr_mask selecting the valid fields below - confirm */
+ uint32_t attr_mask;
+ /* Requested and current QP state */
+ enum vhost_rdma_ib_qp_state qp_state;
+ enum vhost_rdma_ib_qp_state cur_qp_state;
+ enum vhost_rdma_ib_mtu path_mtu;
+ enum vhost_rdma_ib_mig_state path_mig_state;
+ uint32_t qkey;
+ uint32_t rq_psn;
+ uint32_t sq_psn;
+ uint32_t dest_qp_num;
+ uint32_t qp_access_flags;
+ uint16_t pkey_index;
+ uint16_t alt_pkey_index;
+ uint8_t en_sqd_async_notify;
+ uint8_t sq_draining;
+ uint8_t max_rd_atomic;
+ uint8_t max_dest_rd_atomic;
+ uint8_t min_rnr_timer;
+ uint8_t port_num;
+ uint8_t timeout;
+ uint8_t retry_cnt;
+ uint8_t rnr_retry;
+ uint8_t alt_port_num;
+ uint8_t alt_timeout;
+ uint32_t rate_limit;
+ /* Requested QP capabilities */
+ struct vhost_rdma_qp_cap cap;
+ /* Primary and alternate address vectors */
+ struct vhost_rdma_ah_attr ah_attr;
+ struct vhost_rdma_ah_attr alt_ah_attr;
+};
+
+/* Command payload for the query-QP control request. */
+struct vhost_rdma_cmd_query_qp {
+ /* The index of QP */
+ uint32_t qpn;
+ /*
+  * Mask of the attributes to be queried.
+  * NOTE(review): the visible handler (vhost_rdma_query_qp) does not read
+  * this field and always fills the whole response.
+  */
+ uint32_t attr_mask;
+};
+
+/* Command payload for the destroy-QP control request. */
+struct vhost_rdma_cmd_destroy_qp {
+ /* QP number of the QP to destroy */
+ uint32_t qpn;
+};
+
/**
* @brief Convert IB MTU enum to byte size
* @param mtu The MTU enum value
@@ -790,6 +975,16 @@ ib_mtu_enum_to_int(enum vhost_rdma_ib_mtu mtu)
}
}
+/*
+ * Run @task synchronously: invoke task->func(task->arg) repeatedly for as
+ * long as it returns 0. The first non-zero result ends the loop, is stored
+ * in task->ret and is returned to the caller.
+ */
+static __rte_always_inline int
+__vhost_rdma_do_task(struct vhost_rdma_task *task)
+{
+	int rc;
+
+	do {
+		rc = task->func(task->arg);
+	} while (rc == 0);
+
+	task->ret = rc;
+	return rc;
+}
+
/* Function declarations */
/**
@@ -829,13 +1024,6 @@ void vhost_rdma_mr_cleanup(void *arg);
*/
void vhost_rdma_qp_cleanup(void *arg);
-/**
- * @brief Clean up a vhost_rdma_queue (drain rings, unregister interrupts)
- * @param qp Owning QP
- * @param queue Queue to clean
- */
-void vhost_rdma_queue_cleanup(struct vhost_rdma_qp *qp, struct vhost_rdma_queue *queue);
-
/**
* @brief Release one RDMA read/atomic responder resource
* @param qp QP owning the resource
@@ -843,6 +1031,8 @@ void vhost_rdma_queue_cleanup(struct vhost_rdma_qp *qp, struct vhost_rdma_queue
*/
void free_rd_atomic_resource(struct vhost_rdma_qp *qp, struct vhost_rdma_resp_res *res);
+int alloc_rd_atomic_resources(struct vhost_rdma_qp *qp, unsigned int n);
+
/**
* @brief Release all RDMA read/atomic responder resources
* @param qp QP whose resources to free
@@ -866,4 +1056,17 @@ void vhost_rdma_map_pages(struct rte_vhost_memory *mem,
uint64_t *dma_pages,
uint32_t npages);
+/* Fill the query-QP response @rsp from the current state of @qp; returns 0. */
+int vhost_rdma_qp_query(struct vhost_rdma_qp *qp,
+ struct vhost_rdma_ack_query_qp *rsp);
+
+/*
+ * Apply the attribute changes requested in @cmd to @qp.
+ * Returns 0 on success, non-zero on failure (treated as an error code by the
+ * modify-QP command handler).
+ */
+int vhost_rdma_qp_modify(struct vhost_rdma_device *dev, struct vhost_rdma_qp *qp,
+ struct vhost_rdma_cmd_modify_qp *cmd);
+/*
+ * Initialise @qp on @dev from the create-QP command @cmd.
+ * Returns 0 on success, non-zero on failure.
+ */
+int vhost_rdma_qp_init(struct vhost_rdma_device *dev,
+ struct vhost_rdma_qp *qp,
+ struct vhost_rdma_cmd_create_qp *cmd);
+/* Copy the global-route fields and destination MAC of @av into @attr. */
+void vhost_rdma_av_to_attr(struct vhost_rdma_av *av,
+ struct vhost_rdma_ah_attr *attr);
+
+/*
+ * Stop/clean up @task; called on the QP's tasks during QP destruction.
+ * NOTE(review): implementation not visible here - exact semantics to confirm.
+ */
+void vhost_rdma_cleanup_task(struct vhost_rdma_task *task);
+
#endif /* __VHOST_RDMA_IB_H__ */
\ No newline at end of file
diff --git a/examples/vhost_user_rdma/vhost_rdma_opcode.c b/examples/vhost_user_rdma/vhost_rdma_opcode.c
new file mode 100644
index 0000000000..4284a405f5
--- /dev/null
+++ b/examples/vhost_user_rdma/vhost_rdma_opcode.c
@@ -0,0 +1,894 @@
+/*
+ * Vhost-user RDMA device : rdma opcode
+ *
+ * Copyright (C) 2025 KylinSoft Inc. and/or its affiliates. All rights reserved.
+ *
+ * Author: Xiong Weimin <xiongweimin@kylinos.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include "vhost_rdma_opcode.h"
+#include "vhost_rdma_queue.h"
+#include "vhost_rdma_ib.h"
+
+struct vhost_rdma_wr_opcode_info vhost_rdma_wr_opcode_info[] = { /* WR opcode -> name and per-QP-type flags */
+	[VHOST_RDMA_IB_WR_SEND] = {
+		.name = "VHOST_RDMA_IB_WR_SEND",
+		.mask = {
+			[VHOST_RDMA_IB_QPT_RC] = WR_SEND_MASK | WR_INLINE_MASK,
+			[VHOST_RDMA_IB_QPT_UC] = WR_SEND_MASK | WR_INLINE_MASK,
+			[VHOST_RDMA_IB_QPT_UD] = WR_SEND_MASK | WR_INLINE_MASK,
+			[VHOST_RDMA_IB_QPT_SMI] = WR_SEND_MASK | WR_INLINE_MASK,
+			[VHOST_RDMA_IB_QPT_GSI] = WR_SEND_MASK | WR_INLINE_MASK,
+		},
+	},
+	[VHOST_RDMA_IB_WR_SEND_WITH_IMM] = {
+		.name = "VHOST_RDMA_IB_WR_SEND_WITH_IMM",
+		.mask = {
+			[VHOST_RDMA_IB_QPT_RC] = WR_SEND_MASK | WR_INLINE_MASK,
+			[VHOST_RDMA_IB_QPT_UC] = WR_SEND_MASK | WR_INLINE_MASK,
+			[VHOST_RDMA_IB_QPT_UD] = WR_SEND_MASK | WR_INLINE_MASK,
+			[VHOST_RDMA_IB_QPT_SMI] = WR_SEND_MASK | WR_INLINE_MASK,
+			[VHOST_RDMA_IB_QPT_GSI] = WR_SEND_MASK | WR_INLINE_MASK,
+		},
+	},
+	[VHOST_RDMA_IB_WR_RDMA_WRITE] = {
+		.name = "VHOST_RDMA_IB_WR_RDMA_WRITE",
+		.mask = {
+			[VHOST_RDMA_IB_QPT_RC] = WR_WRITE_MASK | WR_INLINE_MASK,
+			[VHOST_RDMA_IB_QPT_UC] = WR_WRITE_MASK | WR_INLINE_MASK,
+		},
+	},
+	[VHOST_RDMA_IB_WR_RDMA_WRITE_WITH_IMM] = {
+		.name = "VHOST_RDMA_IB_WR_RDMA_WRITE_WITH_IMM",
+		.mask = {
+			[VHOST_RDMA_IB_QPT_RC] = WR_WRITE_MASK | WR_INLINE_MASK,
+			[VHOST_RDMA_IB_QPT_UC] = WR_WRITE_MASK | WR_INLINE_MASK,
+		},
+	},
+	[VHOST_RDMA_IB_WR_RDMA_READ] = { /* only RC carries a READ mask */
+		.name = "VHOST_RDMA_IB_WR_RDMA_READ",
+		.mask = {
+			[VHOST_RDMA_IB_QPT_RC] = WR_READ_MASK,
+		},
+	},
+};
+
+struct vhost_rdma_opcode_info vhost_rdma_opcode[VHOST_NUM_OPCODE] = { /* slots not listed below stay zero-initialized */
+ [IB_OPCODE_RC_SEND_FIRST] = {
+ .name = "IB_OPCODE_RC_SEND_FIRST",
+ .mask = VHOST_PAYLOAD_MASK | VHOST_REQ_MASK | VHOST_RWR_MASK
+ | VHOST_SEND_MASK | VHOST_START_MASK,
+ .length = VHOST_BTH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_SEND_MIDDLE] = {
+ .name = "IB_OPCODE_RC_SEND_MIDDLE",
+ .mask = VHOST_PAYLOAD_MASK | VHOST_REQ_MASK | VHOST_SEND_MASK
+ | VHOST_MIDDLE_MASK,
+ .length = VHOST_BTH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_SEND_LAST] = {
+ .name = "IB_OPCODE_RC_SEND_LAST",
+ .mask = VHOST_PAYLOAD_MASK | VHOST_REQ_MASK | VHOST_COMP_MASK
+ | VHOST_SEND_MASK | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE",
+ .mask = VHOST_IMMDT_MASK | VHOST_PAYLOAD_MASK | VHOST_REQ_MASK
+ | VHOST_COMP_MASK | VHOST_SEND_MASK | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_IMMDT_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_IMMDT] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_IMMDT_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_SEND_ONLY] = {
+ .name = "IB_OPCODE_RC_SEND_ONLY",
+ .mask = VHOST_PAYLOAD_MASK | VHOST_REQ_MASK | VHOST_COMP_MASK
+ | VHOST_RWR_MASK | VHOST_SEND_MASK
+ | VHOST_START_MASK | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE",
+ .mask = VHOST_IMMDT_MASK | VHOST_PAYLOAD_MASK | VHOST_REQ_MASK
+ | VHOST_COMP_MASK | VHOST_RWR_MASK | VHOST_SEND_MASK
+ | VHOST_START_MASK | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_IMMDT_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_IMMDT] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_IMMDT_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_RDMA_WRITE_FIRST] = {
+ .name = "IB_OPCODE_RC_RDMA_WRITE_FIRST",
+ .mask = VHOST_RETH_MASK | VHOST_PAYLOAD_MASK | VHOST_REQ_MASK
+ | VHOST_WRITE_MASK | VHOST_START_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_RETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_RETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_RETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_RDMA_WRITE_MIDDLE] = {
+ .name = "IB_OPCODE_RC_RDMA_WRITE_MIDDLE",
+ .mask = VHOST_PAYLOAD_MASK | VHOST_REQ_MASK | VHOST_WRITE_MASK
+ | VHOST_MIDDLE_MASK,
+ .length = VHOST_BTH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_RDMA_WRITE_LAST] = {
+ .name = "IB_OPCODE_RC_RDMA_WRITE_LAST",
+ .mask = VHOST_PAYLOAD_MASK | VHOST_REQ_MASK | VHOST_WRITE_MASK
+ | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE",
+ .mask = VHOST_IMMDT_MASK | VHOST_PAYLOAD_MASK | VHOST_REQ_MASK
+ | VHOST_WRITE_MASK | VHOST_COMP_MASK | VHOST_RWR_MASK
+ | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_IMMDT_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_IMMDT] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_IMMDT_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_RDMA_WRITE_ONLY] = {
+ .name = "IB_OPCODE_RC_RDMA_WRITE_ONLY",
+ .mask = VHOST_RETH_MASK | VHOST_PAYLOAD_MASK | VHOST_REQ_MASK
+ | VHOST_WRITE_MASK | VHOST_START_MASK
+ | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_RETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_RETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_RETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE",
+ .mask = VHOST_RETH_MASK | VHOST_IMMDT_MASK | VHOST_PAYLOAD_MASK
+ | VHOST_REQ_MASK | VHOST_WRITE_MASK
+ | VHOST_COMP_MASK | VHOST_RWR_MASK
+ | VHOST_START_MASK | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_IMMDT_BYTES + VHOST_RETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_RETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_IMMDT] = VHOST_BTH_BYTES
+ + VHOST_RETH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_RETH_BYTES
+ + VHOST_IMMDT_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_RDMA_READ_REQUEST] = {
+ .name = "IB_OPCODE_RC_RDMA_READ_REQUEST",
+ .mask = VHOST_RETH_MASK | VHOST_REQ_MASK | VHOST_READ_MASK
+ | VHOST_START_MASK | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_RETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_RETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_RETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST] = {
+ .name = "IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST",
+ .mask = VHOST_AETH_MASK | VHOST_PAYLOAD_MASK | VHOST_ACK_MASK
+ | VHOST_START_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_AETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_AETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_AETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE] = {
+ .name = "IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE",
+ .mask = VHOST_PAYLOAD_MASK | VHOST_ACK_MASK | VHOST_MIDDLE_MASK,
+ .length = VHOST_BTH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST] = {
+ .name = "IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST",
+ .mask = VHOST_AETH_MASK | VHOST_PAYLOAD_MASK | VHOST_ACK_MASK
+ | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_AETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_AETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_AETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY] = {
+ .name = "IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY",
+ .mask = VHOST_AETH_MASK | VHOST_PAYLOAD_MASK | VHOST_ACK_MASK
+ | VHOST_START_MASK | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_AETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_AETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_AETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_ACKNOWLEDGE] = {
+ .name = "IB_OPCODE_RC_ACKNOWLEDGE",
+ .mask = VHOST_AETH_MASK | VHOST_ACK_MASK | VHOST_START_MASK
+ | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_AETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_AETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_AETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE] = {
+ .name = "IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE",
+ .mask = VHOST_AETH_MASK | VHOST_ATMACK_MASK | VHOST_ACK_MASK
+ | VHOST_START_MASK | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_ATMACK_BYTES + VHOST_AETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_AETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_ATMACK] = VHOST_BTH_BYTES
+ + VHOST_AETH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_ATMACK_BYTES + VHOST_AETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_COMPARE_SWAP] = {
+ .name = "IB_OPCODE_RC_COMPARE_SWAP",
+ .mask = VHOST_ATMETH_MASK | VHOST_REQ_MASK | VHOST_ATOMIC_MASK
+ | VHOST_START_MASK | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_ATMETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_ATMETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_ATMETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_FETCH_ADD] = {
+ .name = "IB_OPCODE_RC_FETCH_ADD",
+ .mask = VHOST_ATMETH_MASK | VHOST_REQ_MASK | VHOST_ATOMIC_MASK
+ | VHOST_START_MASK | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_ATMETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_ATMETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_ATMETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE] = {
+ .name = "IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE",
+ .mask = VHOST_IETH_MASK | VHOST_PAYLOAD_MASK | VHOST_REQ_MASK
+ | VHOST_COMP_MASK | VHOST_SEND_MASK | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_IETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_IETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_IETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = {
+ .name = "IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE",
+ .mask = VHOST_IETH_MASK | VHOST_PAYLOAD_MASK | VHOST_REQ_MASK
+ | VHOST_COMP_MASK | VHOST_RWR_MASK | VHOST_SEND_MASK
+ | VHOST_END_MASK | VHOST_START_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_IETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_IETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_IETH_BYTES,
+ }
+ },
+
+ /* UC (unreliable connection) opcodes */
+ [IB_OPCODE_UC_SEND_FIRST] = {
+ .name = "IB_OPCODE_UC_SEND_FIRST",
+ .mask = VHOST_PAYLOAD_MASK | VHOST_REQ_MASK | VHOST_RWR_MASK
+ | VHOST_SEND_MASK | VHOST_START_MASK,
+ .length = VHOST_BTH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_UC_SEND_MIDDLE] = {
+ .name = "IB_OPCODE_UC_SEND_MIDDLE",
+ .mask = VHOST_PAYLOAD_MASK | VHOST_REQ_MASK | VHOST_SEND_MASK
+ | VHOST_MIDDLE_MASK,
+ .length = VHOST_BTH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_UC_SEND_LAST] = {
+ .name = "IB_OPCODE_UC_SEND_LAST",
+ .mask = VHOST_PAYLOAD_MASK | VHOST_REQ_MASK | VHOST_COMP_MASK
+ | VHOST_SEND_MASK | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE",
+ .mask = VHOST_IMMDT_MASK | VHOST_PAYLOAD_MASK | VHOST_REQ_MASK
+ | VHOST_COMP_MASK | VHOST_SEND_MASK | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_IMMDT_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_IMMDT] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_IMMDT_BYTES,
+ }
+ },
+ [IB_OPCODE_UC_SEND_ONLY] = {
+ .name = "IB_OPCODE_UC_SEND_ONLY",
+ .mask = VHOST_PAYLOAD_MASK | VHOST_REQ_MASK | VHOST_COMP_MASK
+ | VHOST_RWR_MASK | VHOST_SEND_MASK
+ | VHOST_START_MASK | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE",
+ .mask = VHOST_IMMDT_MASK | VHOST_PAYLOAD_MASK | VHOST_REQ_MASK
+ | VHOST_COMP_MASK | VHOST_RWR_MASK | VHOST_SEND_MASK
+ | VHOST_START_MASK | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_IMMDT_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_IMMDT] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_IMMDT_BYTES,
+ }
+ },
+ [IB_OPCODE_UC_RDMA_WRITE_FIRST] = {
+ .name = "IB_OPCODE_UC_RDMA_WRITE_FIRST",
+ .mask = VHOST_RETH_MASK | VHOST_PAYLOAD_MASK | VHOST_REQ_MASK
+ | VHOST_WRITE_MASK | VHOST_START_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_RETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_RETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_RETH_BYTES,
+ }
+ },
+ [IB_OPCODE_UC_RDMA_WRITE_MIDDLE] = {
+ .name = "IB_OPCODE_UC_RDMA_WRITE_MIDDLE",
+ .mask = VHOST_PAYLOAD_MASK | VHOST_REQ_MASK | VHOST_WRITE_MASK
+ | VHOST_MIDDLE_MASK,
+ .length = VHOST_BTH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_UC_RDMA_WRITE_LAST] = {
+ .name = "IB_OPCODE_UC_RDMA_WRITE_LAST",
+ .mask = VHOST_PAYLOAD_MASK | VHOST_REQ_MASK | VHOST_WRITE_MASK
+ | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE",
+ .mask = VHOST_IMMDT_MASK | VHOST_PAYLOAD_MASK | VHOST_REQ_MASK
+ | VHOST_WRITE_MASK | VHOST_COMP_MASK | VHOST_RWR_MASK
+ | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_IMMDT_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_IMMDT] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_IMMDT_BYTES,
+ }
+ },
+ [IB_OPCODE_UC_RDMA_WRITE_ONLY] = {
+ .name = "IB_OPCODE_UC_RDMA_WRITE_ONLY",
+ .mask = VHOST_RETH_MASK | VHOST_PAYLOAD_MASK | VHOST_REQ_MASK
+ | VHOST_WRITE_MASK | VHOST_START_MASK
+ | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_RETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_RETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_RETH_BYTES,
+ }
+ },
+ [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE",
+ .mask = VHOST_RETH_MASK | VHOST_IMMDT_MASK | VHOST_PAYLOAD_MASK
+ | VHOST_REQ_MASK | VHOST_WRITE_MASK
+ | VHOST_COMP_MASK | VHOST_RWR_MASK
+ | VHOST_START_MASK | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_IMMDT_BYTES + VHOST_RETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_RETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_IMMDT] = VHOST_BTH_BYTES
+ + VHOST_RETH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_RETH_BYTES
+ + VHOST_IMMDT_BYTES,
+ }
+ },
+
+ /* RD (reliable datagram) opcodes */
+ [IB_OPCODE_RD_SEND_FIRST] = {
+ .name = "IB_OPCODE_RD_SEND_FIRST",
+ .mask = VHOST_RDETH_MASK | VHOST_DETH_MASK | VHOST_PAYLOAD_MASK
+ | VHOST_REQ_MASK | VHOST_RWR_MASK | VHOST_SEND_MASK
+ | VHOST_START_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_DETH_BYTES + VHOST_RDETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_RDETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_DETH] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES
+ + VHOST_DETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_SEND_MIDDLE] = {
+ .name = "IB_OPCODE_RD_SEND_MIDDLE",
+ .mask = VHOST_RDETH_MASK | VHOST_DETH_MASK | VHOST_PAYLOAD_MASK
+ | VHOST_REQ_MASK | VHOST_SEND_MASK
+ | VHOST_MIDDLE_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_DETH_BYTES + VHOST_RDETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_RDETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_DETH] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES
+ + VHOST_DETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_SEND_LAST] = {
+ .name = "IB_OPCODE_RD_SEND_LAST",
+ .mask = VHOST_RDETH_MASK | VHOST_DETH_MASK | VHOST_PAYLOAD_MASK
+ | VHOST_REQ_MASK | VHOST_COMP_MASK | VHOST_SEND_MASK
+ | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_DETH_BYTES + VHOST_RDETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_RDETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_DETH] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES
+ + VHOST_DETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_SEND_LAST_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_RD_SEND_LAST_WITH_IMMEDIATE",
+ .mask = VHOST_RDETH_MASK | VHOST_DETH_MASK | VHOST_IMMDT_MASK
+ | VHOST_PAYLOAD_MASK | VHOST_REQ_MASK
+ | VHOST_COMP_MASK | VHOST_SEND_MASK
+ | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_IMMDT_BYTES + VHOST_DETH_BYTES
+ + VHOST_RDETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_RDETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_DETH] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES,
+ [VHOST_RDMA_IMMDT] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES
+ + VHOST_DETH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES
+ + VHOST_DETH_BYTES
+ + VHOST_IMMDT_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_SEND_ONLY] = {
+ .name = "IB_OPCODE_RD_SEND_ONLY",
+ .mask = VHOST_RDETH_MASK | VHOST_DETH_MASK | VHOST_PAYLOAD_MASK
+ | VHOST_REQ_MASK | VHOST_COMP_MASK | VHOST_RWR_MASK
+ | VHOST_SEND_MASK | VHOST_START_MASK | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_DETH_BYTES + VHOST_RDETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_RDETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_DETH] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES
+ + VHOST_DETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_SEND_ONLY_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_RD_SEND_ONLY_WITH_IMMEDIATE",
+ .mask = VHOST_RDETH_MASK | VHOST_DETH_MASK | VHOST_IMMDT_MASK
+ | VHOST_PAYLOAD_MASK | VHOST_REQ_MASK
+ | VHOST_COMP_MASK | VHOST_RWR_MASK | VHOST_SEND_MASK
+ | VHOST_START_MASK | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_IMMDT_BYTES + VHOST_DETH_BYTES
+ + VHOST_RDETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_RDETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_DETH] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES,
+ [VHOST_RDMA_IMMDT] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES
+ + VHOST_DETH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES
+ + VHOST_DETH_BYTES
+ + VHOST_IMMDT_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_RDMA_WRITE_FIRST] = {
+ .name = "IB_OPCODE_RD_RDMA_WRITE_FIRST",
+ .mask = VHOST_RDETH_MASK | VHOST_DETH_MASK | VHOST_RETH_MASK
+ | VHOST_PAYLOAD_MASK | VHOST_REQ_MASK
+ | VHOST_WRITE_MASK | VHOST_START_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_RETH_BYTES + VHOST_DETH_BYTES
+ + VHOST_RDETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_RDETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_DETH] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES,
+ [VHOST_RDMA_RETH] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES
+ + VHOST_DETH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES
+ + VHOST_DETH_BYTES
+ + VHOST_RETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_RDMA_WRITE_MIDDLE] = {
+ .name = "IB_OPCODE_RD_RDMA_WRITE_MIDDLE",
+ .mask = VHOST_RDETH_MASK | VHOST_DETH_MASK | VHOST_PAYLOAD_MASK
+ | VHOST_REQ_MASK | VHOST_WRITE_MASK
+ | VHOST_MIDDLE_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_DETH_BYTES + VHOST_RDETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_RDETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_DETH] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES
+ + VHOST_DETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_RDMA_WRITE_LAST] = {
+ .name = "IB_OPCODE_RD_RDMA_WRITE_LAST",
+ .mask = VHOST_RDETH_MASK | VHOST_DETH_MASK | VHOST_PAYLOAD_MASK
+ | VHOST_REQ_MASK | VHOST_WRITE_MASK
+ | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_DETH_BYTES + VHOST_RDETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_RDETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_DETH] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES
+ + VHOST_DETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_RDMA_WRITE_LAST_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_RD_RDMA_WRITE_LAST_WITH_IMMEDIATE",
+ .mask = VHOST_RDETH_MASK | VHOST_DETH_MASK | VHOST_IMMDT_MASK
+ | VHOST_PAYLOAD_MASK | VHOST_REQ_MASK
+ | VHOST_WRITE_MASK | VHOST_COMP_MASK | VHOST_RWR_MASK
+ | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_IMMDT_BYTES + VHOST_DETH_BYTES
+ + VHOST_RDETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_RDETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_DETH] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES,
+ [VHOST_RDMA_IMMDT] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES
+ + VHOST_DETH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES
+ + VHOST_DETH_BYTES
+ + VHOST_IMMDT_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_RDMA_WRITE_ONLY] = {
+ .name = "IB_OPCODE_RD_RDMA_WRITE_ONLY",
+ .mask = VHOST_RDETH_MASK | VHOST_DETH_MASK | VHOST_RETH_MASK
+ | VHOST_PAYLOAD_MASK | VHOST_REQ_MASK
+ | VHOST_WRITE_MASK | VHOST_START_MASK
+ | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_RETH_BYTES + VHOST_DETH_BYTES
+ + VHOST_RDETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_RDETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_DETH] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES,
+ [VHOST_RDMA_RETH] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES
+ + VHOST_DETH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES
+ + VHOST_DETH_BYTES
+ + VHOST_RETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_RD_RDMA_WRITE_ONLY_WITH_IMMEDIATE",
+ .mask = VHOST_RDETH_MASK | VHOST_DETH_MASK | VHOST_RETH_MASK
+ | VHOST_IMMDT_MASK | VHOST_PAYLOAD_MASK
+ | VHOST_REQ_MASK | VHOST_WRITE_MASK
+ | VHOST_COMP_MASK | VHOST_RWR_MASK
+ | VHOST_START_MASK | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_IMMDT_BYTES + VHOST_RETH_BYTES
+ + VHOST_DETH_BYTES + VHOST_RDETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_RDETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_DETH] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES,
+ [VHOST_RDMA_RETH] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES
+ + VHOST_DETH_BYTES,
+ [VHOST_RDMA_IMMDT] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES
+ + VHOST_DETH_BYTES
+ + VHOST_RETH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES
+ + VHOST_DETH_BYTES
+ + VHOST_RETH_BYTES
+ + VHOST_IMMDT_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_RDMA_READ_REQUEST] = {
+ .name = "IB_OPCODE_RD_RDMA_READ_REQUEST",
+ .mask = VHOST_RDETH_MASK | VHOST_DETH_MASK | VHOST_RETH_MASK
+ | VHOST_REQ_MASK | VHOST_READ_MASK
+ | VHOST_START_MASK | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_RETH_BYTES + VHOST_DETH_BYTES
+ + VHOST_RDETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_RDETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_DETH] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES,
+ [VHOST_RDMA_RETH] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES
+ + VHOST_DETH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES
+ + VHOST_DETH_BYTES
+ + VHOST_RETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_RDMA_READ_RESPONSE_FIRST] = {
+ .name = "IB_OPCODE_RD_RDMA_READ_RESPONSE_FIRST",
+ .mask = VHOST_RDETH_MASK | VHOST_AETH_MASK
+ | VHOST_PAYLOAD_MASK | VHOST_ACK_MASK
+ | VHOST_START_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_AETH_BYTES + VHOST_RDETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_RDETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_AETH] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES
+ + VHOST_AETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_RDMA_READ_RESPONSE_MIDDLE] = {
+ .name = "IB_OPCODE_RD_RDMA_READ_RESPONSE_MIDDLE",
+ .mask = VHOST_RDETH_MASK | VHOST_PAYLOAD_MASK | VHOST_ACK_MASK
+ | VHOST_MIDDLE_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_RDETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_RDETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_RDMA_READ_RESPONSE_LAST] = {
+ .name = "IB_OPCODE_RD_RDMA_READ_RESPONSE_LAST",
+ .mask = VHOST_RDETH_MASK | VHOST_AETH_MASK | VHOST_PAYLOAD_MASK
+ | VHOST_ACK_MASK | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_AETH_BYTES + VHOST_RDETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_RDETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_AETH] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES
+ + VHOST_AETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_RDMA_READ_RESPONSE_ONLY] = {
+ .name = "IB_OPCODE_RD_RDMA_READ_RESPONSE_ONLY",
+ .mask = VHOST_RDETH_MASK | VHOST_AETH_MASK | VHOST_PAYLOAD_MASK
+ | VHOST_ACK_MASK | VHOST_START_MASK | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_AETH_BYTES + VHOST_RDETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_RDETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_AETH] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES
+ + VHOST_AETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_ACKNOWLEDGE] = {
+ .name = "IB_OPCODE_RD_ACKNOWLEDGE",
+ .mask = VHOST_RDETH_MASK | VHOST_AETH_MASK | VHOST_ACK_MASK
+ | VHOST_START_MASK | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_AETH_BYTES + VHOST_RDETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_RDETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_AETH] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_ATOMIC_ACKNOWLEDGE] = {
+ .name = "IB_OPCODE_RD_ATOMIC_ACKNOWLEDGE",
+ .mask = VHOST_RDETH_MASK | VHOST_AETH_MASK | VHOST_ATMACK_MASK
+ | VHOST_ACK_MASK | VHOST_START_MASK | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_ATMACK_BYTES + VHOST_AETH_BYTES
+ + VHOST_RDETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_RDETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_AETH] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES,
+ [VHOST_RDMA_ATMACK] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES
+ + VHOST_AETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_COMPARE_SWAP] = {
+ .name = "IB_OPCODE_RD_COMPARE_SWAP",
+ .mask = VHOST_RDETH_MASK | VHOST_DETH_MASK | VHOST_ATMETH_MASK
+ | VHOST_REQ_MASK | VHOST_ATOMIC_MASK
+ | VHOST_START_MASK | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_ATMETH_BYTES + VHOST_DETH_BYTES
+ + VHOST_RDETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_RDETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_DETH] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES,
+ [VHOST_RDMA_ATMETH] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES
+ + VHOST_DETH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES
+ + VHOST_DETH_BYTES
+ + VHOST_ATMETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_FETCH_ADD] = {
+ .name = "IB_OPCODE_RD_FETCH_ADD",
+ .mask = VHOST_RDETH_MASK | VHOST_DETH_MASK | VHOST_ATMETH_MASK
+ | VHOST_REQ_MASK | VHOST_ATOMIC_MASK
+ | VHOST_START_MASK | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_ATMETH_BYTES + VHOST_DETH_BYTES
+ + VHOST_RDETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_RDETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_DETH] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES,
+ [VHOST_RDMA_ATMETH] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES
+ + VHOST_DETH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_RDETH_BYTES
+ + VHOST_DETH_BYTES
+ + VHOST_ATMETH_BYTES,
+ }
+ },
+
+ /* UD (unreliable datagram) opcodes */
+ [IB_OPCODE_UD_SEND_ONLY] = {
+ .name = "IB_OPCODE_UD_SEND_ONLY",
+ .mask = VHOST_DETH_MASK | VHOST_PAYLOAD_MASK | VHOST_REQ_MASK
+ | VHOST_COMP_MASK | VHOST_RWR_MASK | VHOST_SEND_MASK
+ | VHOST_START_MASK | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_DETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_DETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_DETH_BYTES,
+ }
+ },
+ [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE",
+ .mask = VHOST_DETH_MASK | VHOST_IMMDT_MASK | VHOST_PAYLOAD_MASK
+ | VHOST_REQ_MASK | VHOST_COMP_MASK | VHOST_RWR_MASK
+ | VHOST_SEND_MASK | VHOST_START_MASK | VHOST_END_MASK,
+ .length = VHOST_BTH_BYTES + VHOST_IMMDT_BYTES + VHOST_DETH_BYTES,
+ .offset = {
+ [VHOST_RDMA_BTH] = 0,
+ [VHOST_RDMA_DETH] = VHOST_BTH_BYTES,
+ [VHOST_RDMA_IMMDT] = VHOST_BTH_BYTES
+ + VHOST_DETH_BYTES,
+ [VHOST_RDMA_PAYLOAD] = VHOST_BTH_BYTES
+ + VHOST_DETH_BYTES
+ + VHOST_IMMDT_BYTES,
+ }
+ },
+
+};
\ No newline at end of file
diff --git a/examples/vhost_user_rdma/vhost_rdma_opcode.h b/examples/vhost_user_rdma/vhost_rdma_opcode.h
new file mode 100644
index 0000000000..b8f48bcdf5
--- /dev/null
+++ b/examples/vhost_user_rdma/vhost_rdma_opcode.h
@@ -0,0 +1,330 @@
+/**
+ * @file vhost_rdma_opcode.h
+ * @brief Vhost-user RDMA packet format and opcode definitions.
+ *
+ * This header defines the internal packet representation, InfiniBand/RoCE header layout,
+ * opcode mapping, and control flags used during packet parsing and transmission
+ * in the vhost-user RDMA backend.
+ *
+ * Copyright (C) 2025 KylinSoft Inc. and/or its affiliates. All rights reserved.
+ *
+ * Author: Xiong Weimin <xiongweimin@kylinos.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+#ifndef __VHOST_RDMA_OPCODE_H__
+#define __VHOST_RDMA_OPCODE_H__
+
+#include <stdint.h>
+
+#include <rte_byteorder.h>
+#include <rte_interrupts.h>
+
+#include "vhost_rdma_ib.h"
+
+/** Maximum number of QP types supported for WR mask dispatching */
+#define WR_MAX_QPT 8
+
+/**
+ * Number of entries in the vhost_rdma_opcode[] table: one slot for every
+ * value the 8-bit BTH opcode field can take.
+ */
+#define VHOST_NUM_OPCODE 256
+
+/*
+ * Single-bit mask helper. The shift is done in type int, so only bit
+ * positions 0..30 are safe. Skipped when BIT is already defined elsewhere.
+ */
+#ifndef BIT
+ #define BIT(x) (1 << (x))
+#endif
+
+/* Invalid opcode marker */
+#define OPCODE_NONE (-1)
+
+/**
+ * Base Transport Header in wire layout (VHOST_BTH_BYTES is its sizeof).
+ * Multi-byte fields are stored big-endian.
+ */
+struct vhost_bth {
+ uint8_t opcode; /**< IB opcode byte */
+ uint8_t flags; /**< presumably SE/MigReq/PadCnt/TVer bits - TODO confirm */
+ rte_be16_t pkey; /**< Partition key */
+ rte_be32_t qpn; /**< presumably destination QP number - TODO confirm */
+ rte_be32_t apsn; /**< presumably AckReq bit plus PSN - TODO confirm */
+};
+
+/** Datagram Extended Transport Header in wire layout (big-endian fields). */
+struct vhost_deth {
+ rte_be32_t qkey; /**< Queue key */
+ rte_be32_t sqp; /**< presumably the source QP number - TODO confirm */
+};
+
+/** Immediate data header in wire layout. */
+struct vhost_immdt {
+ rte_be32_t imm; /**< 32-bit immediate value, big-endian */
+};
+
+/** RDMA Extended Transport Header in wire layout (big-endian fields). */
+struct vhost_reth {
+ rte_be64_t va; /**< Remote virtual address */
+ rte_be32_t rkey; /**< Remote key */
+ rte_be32_t len; /**< presumably the DMA length in bytes - TODO confirm */
+};
+
+/** ACK Extended Transport Header in wire layout. */
+struct vhost_aeth {
+ rte_be32_t smsn; /**< presumably syndrome plus MSN packed together - TODO confirm */
+};
+
+/** Atomic acknowledge header in wire layout. */
+struct vhost_atmack {
+ rte_be64_t orig; /**< presumably the original remote value - TODO confirm */
+};
+
+/**
+ * Atomic Extended Transport Header in wire layout (big-endian fields).
+ *
+ * Packed on purpose: the 32-bit rkey sits between 64-bit fields, so without
+ * packing the compiler would typically insert padding after it and
+ * VHOST_ATMETH_BYTES (sizeof this struct) would no longer be 28.
+ */
+struct vhost_atmeth {
+ rte_be64_t va; /**< Remote virtual address */
+ rte_be32_t rkey; /**< Remote key */
+ rte_be64_t swap_add; /**< Swap or add operand (one field serves both) */
+ rte_be64_t comp; /**< Compare operand */
+} __rte_packed;
+
+/** Invalidate Extended Transport Header in wire layout. */
+struct vhost_ieth {
+ rte_be32_t rkey; /**< Remote key, big-endian */
+};
+
+/** Reliable Datagram Extended Transport Header in wire layout. */
+struct vhost_rdeth {
+ rte_be32_t een; /**< presumably the EE-context number - TODO confirm */
+};
+
+/**
+ * Size in bytes of each wire header, taken directly from sizeof() of the
+ * matching struct so the constants cannot drift from the layouts.
+ */
+enum vhost_rdma_hdr_length {
+ VHOST_BTH_BYTES = sizeof(struct vhost_bth),
+ VHOST_DETH_BYTES = sizeof(struct vhost_deth),
+ VHOST_IMMDT_BYTES = sizeof(struct vhost_immdt),
+ VHOST_RETH_BYTES = sizeof(struct vhost_reth),
+ VHOST_AETH_BYTES = sizeof(struct vhost_aeth),
+ VHOST_ATMACK_BYTES = sizeof(struct vhost_atmack),
+ VHOST_ATMETH_BYTES = sizeof(struct vhost_atmeth),
+ VHOST_IETH_BYTES = sizeof(struct vhost_ieth),
+ VHOST_RDETH_BYTES = sizeof(struct vhost_rdeth),
+};
+
+/**
+ * @brief Helper macro to define IB opcodes by transport and operation
+ *
+ * Expands to e.g.: `IB_OPCODE_RC_SEND_FIRST = IB_OPCODE_RC + IB_OPCODE_SEND_FIRST`
+ *
+ * The expansion is an enumerator definition, so the macro is only usable
+ * inside an enum body, after both IB_OPCODE_<transport> and IB_OPCODE_<op>
+ * have been defined.
+ */
+#define IB_OPCODE(transport, op) \
+ IB_OPCODE_ ## transport ## _ ## op = \
+ (IB_OPCODE_ ## transport + IB_OPCODE_ ## op)
+
+/**
+ * @defgroup ib_opcodes InfiniBand OpCode Definitions
+ *
+ * Based on IBTA Vol 1 Table 38 and extended for RoCE semantics.
+ *
+ * Each real opcode is the sum of a transport base and an operation subtype.
+ * The transport bases below are multiples of 0x20 and every operation
+ * subtype is below 0x20, so the two parts never overlap.
+ * @{
+ */
+
+enum {
+ /* Transport types (base values) */
+ IB_OPCODE_RC = 0x00, /**< Reliable Connection */
+ IB_OPCODE_UC = 0x20, /**< Unreliable Connection */
+ IB_OPCODE_RD = 0x40, /**< Reliable Datagram */
+ IB_OPCODE_UD = 0x60, /**< Unreliable Datagram */
+ IB_OPCODE_CNP = 0x80, /**< Congestion Notification Packet */
+ IB_OPCODE_MSP = 0xe0, /**< Manufacturer Specific Protocol */
+
+ /* Operation subtypes (only meaningful when added to a transport base) */
+ IB_OPCODE_SEND_FIRST = 0x00,
+ IB_OPCODE_SEND_MIDDLE = 0x01,
+ IB_OPCODE_SEND_LAST = 0x02,
+ IB_OPCODE_SEND_LAST_WITH_IMMEDIATE = 0x03,
+ IB_OPCODE_SEND_ONLY = 0x04,
+ IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE = 0x05,
+ IB_OPCODE_RDMA_WRITE_FIRST = 0x06,
+ IB_OPCODE_RDMA_WRITE_MIDDLE = 0x07,
+ IB_OPCODE_RDMA_WRITE_LAST = 0x08,
+ IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE = 0x09,
+ IB_OPCODE_RDMA_WRITE_ONLY = 0x0a,
+ IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE = 0x0b,
+ IB_OPCODE_RDMA_READ_REQUEST = 0x0c,
+ IB_OPCODE_RDMA_READ_RESPONSE_FIRST = 0x0d,
+ IB_OPCODE_RDMA_READ_RESPONSE_MIDDLE = 0x0e,
+ IB_OPCODE_RDMA_READ_RESPONSE_LAST = 0x0f,
+ IB_OPCODE_RDMA_READ_RESPONSE_ONLY = 0x10,
+ IB_OPCODE_ACKNOWLEDGE = 0x11,
+ IB_OPCODE_ATOMIC_ACKNOWLEDGE = 0x12,
+ IB_OPCODE_COMPARE_SWAP = 0x13,
+ IB_OPCODE_FETCH_ADD = 0x14,
+ /* 0x15 is reserved */
+ IB_OPCODE_SEND_LAST_WITH_INVALIDATE = 0x16,
+ IB_OPCODE_SEND_ONLY_WITH_INVALIDATE = 0x17,
+
+ /* Real opcodes generated via IB_OPCODE() macro */
+ /* RC opcodes: the only transport that defines the WITH_INVALIDATE sends */
+ IB_OPCODE(RC, SEND_FIRST),
+ IB_OPCODE(RC, SEND_MIDDLE),
+ IB_OPCODE(RC, SEND_LAST),
+ IB_OPCODE(RC, SEND_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(RC, SEND_ONLY),
+ IB_OPCODE(RC, SEND_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(RC, RDMA_WRITE_FIRST),
+ IB_OPCODE(RC, RDMA_WRITE_MIDDLE),
+ IB_OPCODE(RC, RDMA_WRITE_LAST),
+ IB_OPCODE(RC, RDMA_WRITE_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(RC, RDMA_WRITE_ONLY),
+ IB_OPCODE(RC, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(RC, RDMA_READ_REQUEST),
+ IB_OPCODE(RC, RDMA_READ_RESPONSE_FIRST),
+ IB_OPCODE(RC, RDMA_READ_RESPONSE_MIDDLE),
+ IB_OPCODE(RC, RDMA_READ_RESPONSE_LAST),
+ IB_OPCODE(RC, RDMA_READ_RESPONSE_ONLY),
+ IB_OPCODE(RC, ACKNOWLEDGE),
+ IB_OPCODE(RC, ATOMIC_ACKNOWLEDGE),
+ IB_OPCODE(RC, COMPARE_SWAP),
+ IB_OPCODE(RC, FETCH_ADD),
+ IB_OPCODE(RC, SEND_LAST_WITH_INVALIDATE),
+ IB_OPCODE(RC, SEND_ONLY_WITH_INVALIDATE),
+
+ /* UC opcodes: sends and RDMA writes only (no read, ACK or atomic) */
+ IB_OPCODE(UC, SEND_FIRST),
+ IB_OPCODE(UC, SEND_MIDDLE),
+ IB_OPCODE(UC, SEND_LAST),
+ IB_OPCODE(UC, SEND_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(UC, SEND_ONLY),
+ IB_OPCODE(UC, SEND_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(UC, RDMA_WRITE_FIRST),
+ IB_OPCODE(UC, RDMA_WRITE_MIDDLE),
+ IB_OPCODE(UC, RDMA_WRITE_LAST),
+ IB_OPCODE(UC, RDMA_WRITE_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(UC, RDMA_WRITE_ONLY),
+ IB_OPCODE(UC, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
+
+ /* RD opcodes */
+ IB_OPCODE(RD, SEND_FIRST),
+ IB_OPCODE(RD, SEND_MIDDLE),
+ IB_OPCODE(RD, SEND_LAST),
+ IB_OPCODE(RD, SEND_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(RD, SEND_ONLY),
+ IB_OPCODE(RD, SEND_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(RD, RDMA_WRITE_FIRST),
+ IB_OPCODE(RD, RDMA_WRITE_MIDDLE),
+ IB_OPCODE(RD, RDMA_WRITE_LAST),
+ IB_OPCODE(RD, RDMA_WRITE_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(RD, RDMA_WRITE_ONLY),
+ IB_OPCODE(RD, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(RD, RDMA_READ_REQUEST),
+ IB_OPCODE(RD, RDMA_READ_RESPONSE_FIRST),
+ IB_OPCODE(RD, RDMA_READ_RESPONSE_MIDDLE),
+ IB_OPCODE(RD, RDMA_READ_RESPONSE_LAST),
+ IB_OPCODE(RD, RDMA_READ_RESPONSE_ONLY),
+ IB_OPCODE(RD, ACKNOWLEDGE),
+ IB_OPCODE(RD, ATOMIC_ACKNOWLEDGE),
+ IB_OPCODE(RD, COMPARE_SWAP),
+ IB_OPCODE(RD, FETCH_ADD),
+
+ /* UD opcodes: single-packet sends only */
+ IB_OPCODE(UD, SEND_ONLY),
+ IB_OPCODE(UD, SEND_ONLY_WITH_IMMEDIATE)
+};
+/** @} */
+
+/**
+ * @defgroup wr_masks Work Request Type Masks
+ *
+ * Bit flags classifying a work request; the composite values at the end are
+ * plain ORs of the single-bit flags.
+ * @{
+ */
+enum vhost_rdma_wr_mask {
+ WR_INLINE_MASK = BIT(0), /**< WR contains inline data */
+ WR_ATOMIC_MASK = BIT(1), /**< WR is an atomic operation */
+ WR_SEND_MASK = BIT(2), /**< WR is a send-type operation */
+ WR_READ_MASK = BIT(3), /**< WR initiates RDMA read */
+ WR_WRITE_MASK = BIT(4), /**< WR performs RDMA write */
+ WR_LOCAL_OP_MASK = BIT(5), /**< WR triggers local memory op */
+
+ WR_READ_OR_WRITE_MASK = WR_READ_MASK | WR_WRITE_MASK,
+ WR_READ_WRITE_OR_SEND_MASK = WR_READ_OR_WRITE_MASK | WR_SEND_MASK,
+ WR_WRITE_OR_SEND_MASK = WR_WRITE_MASK | WR_SEND_MASK,
+ WR_ATOMIC_OR_READ_MASK = WR_ATOMIC_MASK | WR_READ_MASK,
+};
+/** @} */
+
+/**
+ * @brief Metadata about each Work Request (WR) opcode
+ *
+ * Used to determine which operations are valid per QP type. The mask array
+ * is indexed by QP type and holds WR_MAX_QPT entries; slots that an
+ * initializer does not set are zero.
+ */
+struct vhost_rdma_wr_opcode_info {
+ const char *name; /**< Human-readable name */
+ enum vhost_rdma_wr_mask mask[WR_MAX_QPT]; /**< WR mask flags per QP type */
+};
+
+/* Extern declaration of global opcode metadata table */
+extern struct vhost_rdma_wr_opcode_info vhost_rdma_wr_opcode_info[];
+
+/**
+ * Look up the WR mask of a work-request opcode for the type of @qp.
+ *
+ * @param opcode  Work-request opcode, used as index into
+ *                vhost_rdma_wr_opcode_info[].
+ * @param qp      Queue pair; qp->type selects the per-type mask slot.
+ * @return        The mask flags for this opcode/QP-type pair, or 0 when
+ *                either index is obviously out of range.
+ *
+ * The opcode can originate from a guest-supplied work request, so values
+ * that would index outside the tables must not be dereferenced.
+ *
+ * NOTE(review): the table is declared with an unspecified bound, so the
+ * upper limit of @opcode cannot be checked here. Export the table size and
+ * add that check.
+ */
+static inline unsigned int wr_opcode_mask(int opcode, struct vhost_rdma_qp *qp)
+{
+    /* mask[] has exactly WR_MAX_QPT slots; a negative opcode underruns the table. */
+    if (opcode < 0 || (unsigned int)qp->type >= WR_MAX_QPT)
+        return 0;
+
+    return vhost_rdma_wr_opcode_info[opcode].mask[qp->type];
+}
+
+/**
+ * @defgroup hdr_types Header Types (for offset tracking)
+ *
+ * Each value is an index into vhost_rdma_opcode_info::offset[] and a bit
+ * position for the matching VHOST_*_MASK flag.
+ * @{
+ */
+enum vhost_rdma_hdr_type {
+ VHOST_RDMA_LRH, /**< Local Route Header (InfiniBand only) */
+ VHOST_RDMA_GRH, /**< Global Route Header (IPv6-style GIDs) */
+ VHOST_RDMA_BTH, /**< Base Transport Header */
+ VHOST_RDMA_RETH, /**< RDMA Extended Transport Header */
+ VHOST_RDMA_AETH, /**< ACK Extended Transport Header */
+ VHOST_RDMA_ATMETH, /**< Atomic Extended Transport Header (request) */
+ VHOST_RDMA_ATMACK, /**< Atomic ACK Extended Transport Header (response) */
+ VHOST_RDMA_IETH, /**< Invalidate Extended Transport Header (carries an rkey) */
+ VHOST_RDMA_RDETH, /**< Reliable Datagram Extended Transport Header */
+ VHOST_RDMA_DETH, /**< Datagram Extended Transport Header */
+ VHOST_RDMA_IMMDT, /**< Immediate Data Header */
+ VHOST_RDMA_PAYLOAD, /**< Payload section */
+ NUM_HDR_TYPES /**< Number of known header types */
+};
+/** @} */
+
+/**
+ * @defgroup hdr_masks Header Presence and Semantic Flags
+ *
+ * The low NUM_HDR_TYPES bits mirror enum vhost_rdma_hdr_type (one bit per
+ * header); the bits above them describe what kind of packet an opcode is.
+ * @{
+ */
+enum vhost_rdma_hdr_mask {
+ VHOST_LRH_MASK = BIT(VHOST_RDMA_LRH),
+ VHOST_GRH_MASK = BIT(VHOST_RDMA_GRH),
+ VHOST_BTH_MASK = BIT(VHOST_RDMA_BTH),
+ VHOST_IMMDT_MASK = BIT(VHOST_RDMA_IMMDT),
+ VHOST_RETH_MASK = BIT(VHOST_RDMA_RETH),
+ VHOST_AETH_MASK = BIT(VHOST_RDMA_AETH),
+ VHOST_ATMETH_MASK = BIT(VHOST_RDMA_ATMETH),
+ VHOST_ATMACK_MASK = BIT(VHOST_RDMA_ATMACK),
+ VHOST_IETH_MASK = BIT(VHOST_RDMA_IETH),
+ VHOST_RDETH_MASK = BIT(VHOST_RDMA_RDETH),
+ VHOST_DETH_MASK = BIT(VHOST_RDMA_DETH),
+ VHOST_PAYLOAD_MASK = BIT(VHOST_RDMA_PAYLOAD),
+
+ /* Semantic packet type flags */
+ VHOST_REQ_MASK = BIT(NUM_HDR_TYPES + 0), /**< Request packet */
+ VHOST_ACK_MASK = BIT(NUM_HDR_TYPES + 1), /**< ACK/NACK packet */
+ VHOST_SEND_MASK = BIT(NUM_HDR_TYPES + 2), /**< Send operation */
+ VHOST_WRITE_MASK = BIT(NUM_HDR_TYPES + 3), /**< RDMA Write */
+ VHOST_READ_MASK = BIT(NUM_HDR_TYPES + 4), /**< RDMA Read */
+ VHOST_ATOMIC_MASK = BIT(NUM_HDR_TYPES + 5), /**< Atomic operation */
+
+ /*
+  * Responder-side handling flags.
+  * NOTE(review): RWR is set on plain UD SEND_ONLY, which carries neither
+  * immediate data nor an invalidate, so it presumably means "consumes a
+  * receive work request" - confirm against the responder code.
+  */
+ VHOST_RWR_MASK = BIT(NUM_HDR_TYPES + 6), /**< presumably: needs a receive WR - TODO confirm */
+ VHOST_COMP_MASK = BIT(NUM_HDR_TYPES + 7), /**< Completion required */
+
+ /* Position of the packet within a (possibly multi-packet) message */
+ VHOST_START_MASK = BIT(NUM_HDR_TYPES + 8), /**< First fragment */
+ VHOST_MIDDLE_MASK = BIT(NUM_HDR_TYPES + 9), /**< Middle fragment */
+ VHOST_END_MASK = BIT(NUM_HDR_TYPES + 10), /**< Last fragment */
+
+ /* Bit NUM_HDR_TYPES + 11 is not assigned in this enum. */
+ VHOST_LOOPBACK_MASK = BIT(NUM_HDR_TYPES + 12), /**< Loopback within host */
+
+ /* Composite masks */
+ VHOST_READ_OR_ATOMIC = (VHOST_READ_MASK | VHOST_ATOMIC_MASK),
+ VHOST_WRITE_OR_SEND = (VHOST_WRITE_MASK | VHOST_SEND_MASK),
+};
+/** @} */
+
+/**
+ * @brief Per-opcode metadata for parsing and validation
+ *
+ * One entry describes the header stack of a single IB opcode. Offsets are
+ * measured from the start of the BTH (offset[VHOST_RDMA_BTH] is 0 in the
+ * table entries).
+ */
+struct vhost_rdma_opcode_info {
+ const char *name; /**< Opcode name (e.g., "IB_OPCODE_UD_SEND_ONLY") */
+ int length; /**< Combined size in bytes of all headers of this opcode (BTH included) */
+ int offset[NUM_HDR_TYPES]; /**< Offset of each header within packet */
+ enum vhost_rdma_hdr_mask mask; /**< Header presence and semantic flags */
+};
+
+/* Global opcode info table (indexed by IB opcode byte) */
+extern struct vhost_rdma_opcode_info vhost_rdma_opcode[VHOST_NUM_OPCODE];
+
+#endif
\ No newline at end of file
diff --git a/examples/vhost_user_rdma/vhost_rdma_pkt.h b/examples/vhost_user_rdma/vhost_rdma_pkt.h
index 2bbc030e0a..e6a605f574 100644
--- a/examples/vhost_user_rdma/vhost_rdma_pkt.h
+++ b/examples/vhost_user_rdma/vhost_rdma_pkt.h
@@ -39,244 +39,6 @@ struct vhost_rdma_send_wqe;
* @{
*/
-/** Maximum number of QP types supported for WR mask dispatching */
-#define WR_MAX_QPT 8
-
-/** Invalid opcode marker */
-#define OPCODE_NONE (-1)
-
-/** Total number of defined opcodes (must be power-of-2 >= 256) */
-#define VHOST_NUM_OPCODE 256
-
-/** @} */
-
-/**
- * @defgroup wr_masks Work Request Type Masks
- * @{
- */
-enum vhost_rdma_wr_mask {
- WR_INLINE_MASK = BIT(0), /**< WR contains inline data */
- WR_ATOMIC_MASK = BIT(1), /**< WR is an atomic operation */
- WR_SEND_MASK = BIT(2), /**< WR is a send-type operation */
- WR_READ_MASK = BIT(3), /**< WR initiates RDMA read */
- WR_WRITE_MASK = BIT(4), /**< WR performs RDMA write */
- WR_LOCAL_OP_MASK = BIT(5), /**< WR triggers local memory op */
-
- WR_READ_OR_WRITE_MASK = WR_READ_MASK | WR_WRITE_MASK,
- WR_READ_WRITE_OR_SEND_MASK = WR_READ_OR_WRITE_MASK | WR_SEND_MASK,
- WR_WRITE_OR_SEND_MASK = WR_WRITE_MASK | WR_SEND_MASK,
- WR_ATOMIC_OR_READ_MASK = WR_ATOMIC_MASK | WR_READ_MASK,
-};
-/** @} */
-
-/**
- * @brief Metadata about each Work Request (WR) opcode
- *
- * Used to determine which operations are valid per QP type.
- */
-struct vhost_rdma_wr_opcode_info {
- const char *name; /**< Human-readable name */
- enum vhost_rdma_wr_mask mask[WR_MAX_QPT]; /**< Validity per QP type */
-};
-
-/* Extern declaration of global opcode metadata table */
-extern struct vhost_rdma_wr_opcode_info vhost_rdma_wr_opcode_info[];
-
-/**
- * @defgroup hdr_types Header Types (for offset tracking)
- * @{
- */
-enum vhost_rdma_hdr_type {
- VHOST_RDMA_LRH, /**< Link Layer Header (InfiniBand only) */
- VHOST_RDMA_GRH, /**< Global Route Header (IPv6-style GIDs) */
- VHOST_RDMA_BTH, /**< Base Transport Header */
- VHOST_RDMA_RETH, /**< RDMA Extended Transport Header */
- VHOST_RDMA_AETH, /**< Acknowledge/Error Header */
- VHOST_RDMA_ATMETH, /**< Atomic Operation Request Header */
- VHOST_RDMA_ATMACK, /**< Atomic Operation Response Header */
- VHOST_RDMA_IETH, /**< Immediate Data + Error Code Header */
- VHOST_RDMA_RDETH, /**< Reliable Datagram Extended Transport Header */
- VHOST_RDMA_DETH, /**< Datagram Endpoint Identifier Header */
- VHOST_RDMA_IMMDT, /**< Immediate Data Header */
- VHOST_RDMA_PAYLOAD, /**< Payload section */
- NUM_HDR_TYPES /**< Number of known header types */
-};
-/** @} */
-
-/**
- * @defgroup hdr_masks Header Presence and Semantic Flags
- * @{
- */
-enum vhost_rdma_hdr_mask {
- VHOST_LRH_MASK = BIT(VHOST_RDMA_LRH),
- VHOST_GRH_MASK = BIT(VHOST_RDMA_GRH),
- VHOST_BTH_MASK = BIT(VHOST_RDMA_BTH),
- VHOST_IMMDT_MASK = BIT(VHOST_RDMA_IMMDT),
- VHOST_RETH_MASK = BIT(VHOST_RDMA_RETH),
- VHOST_AETH_MASK = BIT(VHOST_RDMA_AETH),
- VHOST_ATMETH_MASK = BIT(VHOST_RDMA_ATMETH),
- VHOST_ATMACK_MASK = BIT(VHOST_RDMA_ATMACK),
- VHOST_IETH_MASK = BIT(VHOST_RDMA_IETH),
- VHOST_RDETH_MASK = BIT(VHOST_RDMA_RDETH),
- VHOST_DETH_MASK = BIT(VHOST_RDMA_DETH),
- VHOST_PAYLOAD_MASK = BIT(VHOST_RDMA_PAYLOAD),
-
- /* Semantic packet type flags */
- VHOST_REQ_MASK = BIT(NUM_HDR_TYPES + 0), /**< Request packet */
- VHOST_ACK_MASK = BIT(NUM_HDR_TYPES + 1), /**< ACK/NACK packet */
- VHOST_SEND_MASK = BIT(NUM_HDR_TYPES + 2), /**< Send operation */
- VHOST_WRITE_MASK = BIT(NUM_HDR_TYPES + 3), /**< RDMA Write */
- VHOST_READ_MASK = BIT(NUM_HDR_TYPES + 4), /**< RDMA Read */
- VHOST_ATOMIC_MASK = BIT(NUM_HDR_TYPES + 5), /**< Atomic operation */
-
- /* Packet fragmentation flags */
- VHOST_RWR_MASK = BIT(NUM_HDR_TYPES + 6), /**< RDMA with Immediate + Invalidate */
- VHOST_COMP_MASK = BIT(NUM_HDR_TYPES + 7), /**< Completion required */
-
- VHOST_START_MASK = BIT(NUM_HDR_TYPES + 8), /**< First fragment */
- VHOST_MIDDLE_MASK = BIT(NUM_HDR_TYPES + 9), /**< Middle fragment */
- VHOST_END_MASK = BIT(NUM_HDR_TYPES + 10), /**< Last fragment */
-
- VHOST_LOOPBACK_MASK = BIT(NUM_HDR_TYPES + 12), /**< Loopback within host */
-
- /* Composite masks */
- VHOST_READ_OR_ATOMIC = (VHOST_READ_MASK | VHOST_ATOMIC_MASK),
- VHOST_WRITE_OR_SEND = (VHOST_WRITE_MASK | VHOST_SEND_MASK),
-};
-/** @} */
-
-/**
- * @brief Per-opcode metadata for parsing and validation
- */
-struct vhost_rdma_opcode_info {
- const char *name; /**< Opcode name (e.g., "RC SEND_FIRST") */
- int length; /**< Fixed payload length (if any) */
- int offset[NUM_HDR_TYPES]; /**< Offset of each header within packet */
- enum vhost_rdma_hdr_mask mask; /**< Header presence and semantic flags */
-};
-
-/* Global opcode info table (indexed by IB opcode byte) */
-extern struct vhost_rdma_opcode_info vhost_rdma_opcode[VHOST_NUM_OPCODE];
-
-/**
- * @brief Helper macro to define IB opcodes by transport and operation
- *
- * Expands to e.g.: `IB_OPCODE_RC_SEND_FIRST = IB_OPCODE_RC + IB_OPCODE_SEND_FIRST`
- */
-#define IB_OPCODE(transport, op) \
- IB_OPCODE_ ## transport ## _ ## op = \
- (IB_OPCODE_ ## transport + IB_OPCODE_ ## op)
-
-/**
- * @defgroup ib_opcodes InfiniBand OpCode Definitions
- *
- * Based on IBTA Vol 1 Table 38 and extended for RoCE semantics.
- * @{
- */
-
-enum {
- /* Transport types (base values) */
- IB_OPCODE_RC = 0x00, /**< Reliable Connection */
- IB_OPCODE_UC = 0x20, /**< Unreliable Connection */
- IB_OPCODE_RD = 0x40, /**< Reliable Datagram */
- IB_OPCODE_UD = 0x60, /**< Unreliable Datagram */
- IB_OPCODE_CNP = 0x80, /**< Congestion Notification Packet */
- IB_OPCODE_MSP = 0xe0, /**< Manufacturer Specific Protocol */
-
- /* Operation subtypes */
- IB_OPCODE_SEND_FIRST = 0x00,
- IB_OPCODE_SEND_MIDDLE = 0x01,
- IB_OPCODE_SEND_LAST = 0x02,
- IB_OPCODE_SEND_LAST_WITH_IMMEDIATE = 0x03,
- IB_OPCODE_SEND_ONLY = 0x04,
- IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE = 0x05,
- IB_OPCODE_RDMA_WRITE_FIRST = 0x06,
- IB_OPCODE_RDMA_WRITE_MIDDLE = 0x07,
- IB_OPCODE_RDMA_WRITE_LAST = 0x08,
- IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE = 0x09,
- IB_OPCODE_RDMA_WRITE_ONLY = 0x0a,
- IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE = 0x0b,
- IB_OPCODE_RDMA_READ_REQUEST = 0x0c,
- IB_OPCODE_RDMA_READ_RESPONSE_FIRST = 0x0d,
- IB_OPCODE_RDMA_READ_RESPONSE_MIDDLE = 0x0e,
- IB_OPCODE_RDMA_READ_RESPONSE_LAST = 0x0f,
- IB_OPCODE_RDMA_READ_RESPONSE_ONLY = 0x10,
- IB_OPCODE_ACKNOWLEDGE = 0x11,
- IB_OPCODE_ATOMIC_ACKNOWLEDGE = 0x12,
- IB_OPCODE_COMPARE_SWAP = 0x13,
- IB_OPCODE_FETCH_ADD = 0x14,
- /* 0x15 is reserved */
- IB_OPCODE_SEND_LAST_WITH_INVALIDATE = 0x16,
- IB_OPCODE_SEND_ONLY_WITH_INVALIDATE = 0x17,
-
- /* Real opcodes generated via IB_OPCODE() macro */
- IB_OPCODE(RC, SEND_FIRST),
- IB_OPCODE(RC, SEND_MIDDLE),
- IB_OPCODE(RC, SEND_LAST),
- IB_OPCODE(RC, SEND_LAST_WITH_IMMEDIATE),
- IB_OPCODE(RC, SEND_ONLY),
- IB_OPCODE(RC, SEND_ONLY_WITH_IMMEDIATE),
- IB_OPCODE(RC, RDMA_WRITE_FIRST),
- IB_OPCODE(RC, RDMA_WRITE_MIDDLE),
- IB_OPCODE(RC, RDMA_WRITE_LAST),
- IB_OPCODE(RC, RDMA_WRITE_LAST_WITH_IMMEDIATE),
- IB_OPCODE(RC, RDMA_WRITE_ONLY),
- IB_OPCODE(RC, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
- IB_OPCODE(RC, RDMA_READ_REQUEST),
- IB_OPCODE(RC, RDMA_READ_RESPONSE_FIRST),
- IB_OPCODE(RC, RDMA_READ_RESPONSE_MIDDLE),
- IB_OPCODE(RC, RDMA_READ_RESPONSE_LAST),
- IB_OPCODE(RC, RDMA_READ_RESPONSE_ONLY),
- IB_OPCODE(RC, ACKNOWLEDGE),
- IB_OPCODE(RC, ATOMIC_ACKNOWLEDGE),
- IB_OPCODE(RC, COMPARE_SWAP),
- IB_OPCODE(RC, FETCH_ADD),
- IB_OPCODE(RC, SEND_LAST_WITH_INVALIDATE),
- IB_OPCODE(RC, SEND_ONLY_WITH_INVALIDATE),
-
- /* UC opcodes */
- IB_OPCODE(UC, SEND_FIRST),
- IB_OPCODE(UC, SEND_MIDDLE),
- IB_OPCODE(UC, SEND_LAST),
- IB_OPCODE(UC, SEND_LAST_WITH_IMMEDIATE),
- IB_OPCODE(UC, SEND_ONLY),
- IB_OPCODE(UC, SEND_ONLY_WITH_IMMEDIATE),
- IB_OPCODE(UC, RDMA_WRITE_FIRST),
- IB_OPCODE(UC, RDMA_WRITE_MIDDLE),
- IB_OPCODE(UC, RDMA_WRITE_LAST),
- IB_OPCODE(UC, RDMA_WRITE_LAST_WITH_IMMEDIATE),
- IB_OPCODE(UC, RDMA_WRITE_ONLY),
- IB_OPCODE(UC, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
-
- /* RD opcodes */
- IB_OPCODE(RD, SEND_FIRST),
- IB_OPCODE(RD, SEND_MIDDLE),
- IB_OPCODE(RD, SEND_LAST),
- IB_OPCODE(RD, SEND_LAST_WITH_IMMEDIATE),
- IB_OPCODE(RD, SEND_ONLY),
- IB_OPCODE(RD, SEND_ONLY_WITH_IMMEDIATE),
- IB_OPCODE(RD, RDMA_WRITE_FIRST),
- IB_OPCODE(RD, RDMA_WRITE_MIDDLE),
- IB_OPCODE(RD, RDMA_WRITE_LAST),
- IB_OPCODE(RD, RDMA_WRITE_LAST_WITH_IMMEDIATE),
- IB_OPCODE(RD, RDMA_WRITE_ONLY),
- IB_OPCODE(RD, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
- IB_OPCODE(RD, RDMA_READ_REQUEST),
- IB_OPCODE(RD, RDMA_READ_RESPONSE_FIRST),
- IB_OPCODE(RD, RDMA_READ_RESPONSE_MIDDLE),
- IB_OPCODE(RD, RDMA_READ_RESPONSE_LAST),
- IB_OPCODE(RD, RDMA_READ_RESPONSE_ONLY),
- IB_OPCODE(RD, ACKNOWLEDGE),
- IB_OPCODE(RD, ATOMIC_ACKNOWLEDGE),
- IB_OPCODE(RD, COMPARE_SWAP),
- IB_OPCODE(RD, FETCH_ADD),
-
- /* UD opcodes */
- IB_OPCODE(UD, SEND_ONLY),
- IB_OPCODE(UD, SEND_ONLY_WITH_IMMEDIATE)
-};
-/** @} */
-
/**
* @brief Runtime packet context used during processing
*/
diff --git a/examples/vhost_user_rdma/vhost_rdma_queue.c b/examples/vhost_user_rdma/vhost_rdma_queue.c
new file mode 100644
index 0000000000..abce651fa5
--- /dev/null
+++ b/examples/vhost_user_rdma/vhost_rdma_queue.c
@@ -0,0 +1,1056 @@
+/*
+ * Vhost-user RDMA device : QP,SQ,RQ function
+ *
+ * Copyright (C) 2025 KylinSoft Inc. and/or its affiliates. All rights reserved.
+ *
+ * Author: Xiong Weimin <xiongweimin@kylinos.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+#include <rte_interrupts.h>
+#include <rte_malloc.h>
+#include <rte_vhost.h>
+
+#include "vhost_rdma_queue.h"
+#include "vhost_rdma_pkt.h"
+#include "vhost_rdma_log.h"
+#include "vhost_rdma.h"
+#include "vhost_rdma_ib.h"
+#include "vhost_rdma_opcode.h"
+
+/*
+ * QP state transition table.
+ *
+ * Two-dimensional, both dimensions indexed by QP state. The first index is
+ * presumably the current state and the second the requested state (the
+ * populated entries follow the RESET -> INIT -> RTR -> RTS bring-up order);
+ * confirm against the code that consults the table.
+ *
+ *  - valid:     non-zero for an allowed transition. Pairs that are not
+ *               listed are zero-initialised and therefore not valid.
+ *  - req_param: per-QP-type attribute mask, presumably the attributes that
+ *               must accompany the transition.
+ *  - opt_param: per-QP-type attribute mask, presumably the attributes that
+ *               may accompany the transition.
+ *
+ * The per-type arrays are sized VHOST_RDMA_IB_QPT_UD + 1, so every QP type
+ * used as a designator here must not be numbered above UD; likewise
+ * VHOST_RDMA_IB_QPS_ERR must be the highest-numbered state.
+ */
+static const struct {
+ int valid;
+ enum vhost_rdma_ib_qp_attr_mask req_param[VHOST_RDMA_IB_QPT_UD + 1];
+ enum vhost_rdma_ib_qp_attr_mask opt_param[VHOST_RDMA_IB_QPT_UD + 1];
+} qp_state_table[VHOST_RDMA_IB_QPS_ERR + 1][VHOST_RDMA_IB_QPS_ERR + 1] =
+{
+ [VHOST_RDMA_IB_QPS_RESET] = {
+ [VHOST_RDMA_IB_QPS_RESET] = { .valid = 1 },
+ [VHOST_RDMA_IB_QPS_INIT] = {
+ .valid = 1,
+ .req_param = {
+ [VHOST_RDMA_IB_QPT_UD] = VHOST_RDMA_IB_QP_QKEY | VHOST_RDMA_IB_QP_PKEY_INDEX | VHOST_RDMA_IB_QP_PORT,
+ [VHOST_RDMA_IB_QPT_UC] = VHOST_RDMA_IB_QP_ACCESS_FLAGS | VHOST_RDMA_IB_QP_PKEY_INDEX | VHOST_RDMA_IB_QP_PORT,
+ [VHOST_RDMA_IB_QPT_RC] = VHOST_RDMA_IB_QP_ACCESS_FLAGS | VHOST_RDMA_IB_QP_PKEY_INDEX | VHOST_RDMA_IB_QP_PORT,
+ [VHOST_RDMA_IB_QPT_SMI] = VHOST_RDMA_IB_QP_QKEY | VHOST_RDMA_IB_QP_PKEY_INDEX,
+ [VHOST_RDMA_IB_QPT_GSI] = VHOST_RDMA_IB_QP_QKEY | VHOST_RDMA_IB_QP_PKEY_INDEX,
+ }
+ },
+ },
+ [VHOST_RDMA_IB_QPS_INIT] = {
+ [VHOST_RDMA_IB_QPS_RESET] = { .valid = 1 },
+ [VHOST_RDMA_IB_QPS_ERR] = { .valid = 1 },
+ [VHOST_RDMA_IB_QPS_INIT] = {
+ .valid = 1,
+ .opt_param = {
+ [VHOST_RDMA_IB_QPT_UD] = VHOST_RDMA_IB_QP_QKEY | VHOST_RDMA_IB_QP_PKEY_INDEX | VHOST_RDMA_IB_QP_PORT,
+ [VHOST_RDMA_IB_QPT_UC] = VHOST_RDMA_IB_QP_ACCESS_FLAGS | VHOST_RDMA_IB_QP_PKEY_INDEX | VHOST_RDMA_IB_QP_PORT,
+ [VHOST_RDMA_IB_QPT_RC] = VHOST_RDMA_IB_QP_ACCESS_FLAGS | VHOST_RDMA_IB_QP_PKEY_INDEX | VHOST_RDMA_IB_QP_PORT,
+ [VHOST_RDMA_IB_QPT_SMI] = VHOST_RDMA_IB_QP_QKEY | VHOST_RDMA_IB_QP_PKEY_INDEX,
+ [VHOST_RDMA_IB_QPT_GSI] = VHOST_RDMA_IB_QP_QKEY | VHOST_RDMA_IB_QP_PKEY_INDEX,
+ }
+ },
+ [VHOST_RDMA_IB_QPS_RTR] = {
+ .valid = 1,
+ .req_param = {
+ [VHOST_RDMA_IB_QPT_UC] = (VHOST_RDMA_IB_QP_AV |
+ VHOST_RDMA_IB_QP_PATH_MTU |
+ VHOST_RDMA_IB_QP_DEST_QPN |
+ VHOST_RDMA_IB_QP_RQ_PSN),
+ [VHOST_RDMA_IB_QPT_RC] = (VHOST_RDMA_IB_QP_AV |
+ VHOST_RDMA_IB_QP_PATH_MTU |
+ VHOST_RDMA_IB_QP_DEST_QPN |
+ VHOST_RDMA_IB_QP_RQ_PSN |
+ VHOST_RDMA_IB_QP_MAX_DEST_RD_ATOMIC |
+ VHOST_RDMA_IB_QP_MIN_RNR_TIMER),
+ },
+ .opt_param = {
+ [VHOST_RDMA_IB_QPT_UD] = VHOST_RDMA_IB_QP_QKEY | VHOST_RDMA_IB_QP_PKEY_INDEX,
+ [VHOST_RDMA_IB_QPT_UC] = VHOST_RDMA_IB_QP_ACCESS_FLAGS | VHOST_RDMA_IB_QP_PKEY_INDEX,
+ [VHOST_RDMA_IB_QPT_RC] = VHOST_RDMA_IB_QP_ACCESS_FLAGS | VHOST_RDMA_IB_QP_PKEY_INDEX,
+ [VHOST_RDMA_IB_QPT_SMI] = VHOST_RDMA_IB_QP_QKEY | VHOST_RDMA_IB_QP_PKEY_INDEX,
+ [VHOST_RDMA_IB_QPT_GSI] = VHOST_RDMA_IB_QP_QKEY | VHOST_RDMA_IB_QP_PKEY_INDEX,
+ },
+ },
+ },
+ [VHOST_RDMA_IB_QPS_RTR] = {
+ [VHOST_RDMA_IB_QPS_RESET] = { .valid = 1 },
+ [VHOST_RDMA_IB_QPS_ERR] = { .valid = 1 },
+ [VHOST_RDMA_IB_QPS_RTS] = {
+ .valid = 1,
+ .req_param = {
+ [VHOST_RDMA_IB_QPT_UD] = VHOST_RDMA_IB_QP_SQ_PSN,
+ [VHOST_RDMA_IB_QPT_UC] = VHOST_RDMA_IB_QP_SQ_PSN,
+ [VHOST_RDMA_IB_QPT_RC] = (VHOST_RDMA_IB_QP_TIMEOUT |
+ VHOST_RDMA_IB_QP_RETRY_CNT |
+ VHOST_RDMA_IB_QP_RNR_RETRY |
+ VHOST_RDMA_IB_QP_SQ_PSN |
+ VHOST_RDMA_IB_QP_MAX_QP_RD_ATOMIC),
+ [VHOST_RDMA_IB_QPT_SMI] = VHOST_RDMA_IB_QP_SQ_PSN,
+ [VHOST_RDMA_IB_QPT_GSI] = VHOST_RDMA_IB_QP_SQ_PSN,
+ },
+ .opt_param = {
+ [VHOST_RDMA_IB_QPT_UD] = (VHOST_RDMA_IB_QP_CUR_STATE |
+ VHOST_RDMA_IB_QP_QKEY),
+ [VHOST_RDMA_IB_QPT_UC] = (VHOST_RDMA_IB_QP_CUR_STATE |
+ VHOST_RDMA_IB_QP_ACCESS_FLAGS),
+ [VHOST_RDMA_IB_QPT_RC] = (VHOST_RDMA_IB_QP_CUR_STATE |
+ VHOST_RDMA_IB_QP_ACCESS_FLAGS |
+ VHOST_RDMA_IB_QP_MIN_RNR_TIMER),
+ [VHOST_RDMA_IB_QPT_SMI] = (VHOST_RDMA_IB_QP_CUR_STATE |
+ VHOST_RDMA_IB_QP_QKEY),
+ [VHOST_RDMA_IB_QPT_GSI] = (VHOST_RDMA_IB_QP_CUR_STATE |
+ VHOST_RDMA_IB_QP_QKEY),
+ }
+ }
+ },
+ [VHOST_RDMA_IB_QPS_RTS] = {
+ [VHOST_RDMA_IB_QPS_RESET] = { .valid = 1 },
+ [VHOST_RDMA_IB_QPS_ERR] = { .valid = 1 },
+ [VHOST_RDMA_IB_QPS_RTS] = {
+ .valid = 1,
+ .opt_param = {
+ [VHOST_RDMA_IB_QPT_UD] = (VHOST_RDMA_IB_QP_CUR_STATE |
+ VHOST_RDMA_IB_QP_QKEY),
+ [VHOST_RDMA_IB_QPT_UC] = (VHOST_RDMA_IB_QP_CUR_STATE |
+ VHOST_RDMA_IB_QP_ACCESS_FLAGS),
+ [VHOST_RDMA_IB_QPT_RC] = (VHOST_RDMA_IB_QP_CUR_STATE |
+ VHOST_RDMA_IB_QP_ACCESS_FLAGS |
+ VHOST_RDMA_IB_QP_MIN_RNR_TIMER),
+ [VHOST_RDMA_IB_QPT_SMI] = (VHOST_RDMA_IB_QP_CUR_STATE |
+ VHOST_RDMA_IB_QP_QKEY),
+ [VHOST_RDMA_IB_QPT_GSI] = (VHOST_RDMA_IB_QP_CUR_STATE |
+ VHOST_RDMA_IB_QP_QKEY),
+ }
+ },
+ [VHOST_RDMA_IB_QPS_SQD] = {
+ .valid = 1,
+ },
+ },
+ [VHOST_RDMA_IB_QPS_SQD] = {
+ [VHOST_RDMA_IB_QPS_RESET] = { .valid = 1 },
+ [VHOST_RDMA_IB_QPS_ERR] = { .valid = 1 },
+ [VHOST_RDMA_IB_QPS_RTS] = {
+ .valid = 1,
+ .opt_param = {
+ [VHOST_RDMA_IB_QPT_UD] = (VHOST_RDMA_IB_QP_CUR_STATE |
+ VHOST_RDMA_IB_QP_QKEY),
+ [VHOST_RDMA_IB_QPT_UC] = (VHOST_RDMA_IB_QP_CUR_STATE |
+ VHOST_RDMA_IB_QP_ACCESS_FLAGS),
+ [VHOST_RDMA_IB_QPT_RC] = (VHOST_RDMA_IB_QP_CUR_STATE |
+ VHOST_RDMA_IB_QP_ACCESS_FLAGS |
+ VHOST_RDMA_IB_QP_MIN_RNR_TIMER),
+ [VHOST_RDMA_IB_QPT_SMI] = (VHOST_RDMA_IB_QP_CUR_STATE |
+ VHOST_RDMA_IB_QP_QKEY),
+ [VHOST_RDMA_IB_QPT_GSI] = (VHOST_RDMA_IB_QP_CUR_STATE |
+ VHOST_RDMA_IB_QP_QKEY),
+ }
+ },
+ [VHOST_RDMA_IB_QPS_SQD] = {
+ .valid = 1,
+ .opt_param = {
+ [VHOST_RDMA_IB_QPT_UD] = VHOST_RDMA_IB_QP_QKEY,
+ [VHOST_RDMA_IB_QPT_UC] = (VHOST_RDMA_IB_QP_AV |
+ VHOST_RDMA_IB_QP_ACCESS_FLAGS),
+ [VHOST_RDMA_IB_QPT_RC] = (VHOST_RDMA_IB_QP_AV |
+ VHOST_RDMA_IB_QP_TIMEOUT |
+ VHOST_RDMA_IB_QP_RETRY_CNT |
+ VHOST_RDMA_IB_QP_RNR_RETRY |
+ VHOST_RDMA_IB_QP_MAX_QP_RD_ATOMIC |
+ VHOST_RDMA_IB_QP_MAX_DEST_RD_ATOMIC |
+ VHOST_RDMA_IB_QP_ACCESS_FLAGS |
+ VHOST_RDMA_IB_QP_MIN_RNR_TIMER),
+ [VHOST_RDMA_IB_QPT_SMI] = VHOST_RDMA_IB_QP_QKEY,
+ [VHOST_RDMA_IB_QPT_GSI] = VHOST_RDMA_IB_QP_QKEY,
+ }
+ }
+ },
+ [VHOST_RDMA_IB_QPS_SQE] = {
+ [VHOST_RDMA_IB_QPS_RESET] = { .valid = 1 },
+ [VHOST_RDMA_IB_QPS_ERR] = { .valid = 1 },
+ /* No RC entry in opt_param below: its mask stays zero for this pair. */
+ [VHOST_RDMA_IB_QPS_RTS] = {
+ .valid = 1,
+ .opt_param = {
+ [VHOST_RDMA_IB_QPT_UD] = (VHOST_RDMA_IB_QP_CUR_STATE |
+ VHOST_RDMA_IB_QP_QKEY),
+ [VHOST_RDMA_IB_QPT_UC] = (VHOST_RDMA_IB_QP_CUR_STATE |
+ VHOST_RDMA_IB_QP_ACCESS_FLAGS),
+ [VHOST_RDMA_IB_QPT_SMI] = (VHOST_RDMA_IB_QP_CUR_STATE |
+ VHOST_RDMA_IB_QP_QKEY),
+ [VHOST_RDMA_IB_QPT_GSI] = (VHOST_RDMA_IB_QP_CUR_STATE |
+ VHOST_RDMA_IB_QP_QKEY),
+ }
+ }
+ },
+ [VHOST_RDMA_IB_QPS_ERR] = {
+ [VHOST_RDMA_IB_QPS_RESET] = { .valid = 1 },
+ [VHOST_RDMA_IB_QPS_ERR] = { .valid = 1 }
+ }
+};
+
+/**
+ * Copy the address vector stored under handle @ah into @dst.
+ *
+ * @param dev  Device whose ah_pool is searched.
+ * @param dst  Destination; receives a full copy of the pool entry.
+ * @param ah   Handle passed to vhost_rdma_pool_get().
+ *
+ * NOTE(review): a failed lookup is caught only by assert(); confirm that no
+ * caller can pass a handle that is missing from the pool.
+ */
+void
+init_av_from_vhost_rdma(struct vhost_rdma_device *dev, struct vhost_rdma_av *dst,
+                        uint32_t ah)
+{
+    struct vhost_rdma_av *av = vhost_rdma_pool_get(&dev->ah_pool, ah);
+
+    assert(av);
+    rte_memcpy(dst, av, sizeof(struct vhost_rdma_av));
+}
+
+/**
+ * Fill in a send WQE from a posted send work request.
+ *
+ * @param qp      Queue pair the request was posted on.
+ * @param wr      Send request; the WQE keeps a pointer to it.
+ * @param mask    WR mask flags for the request's opcode.
+ * @param length  Total data length recorded in the WQE's DMA state.
+ * @param wqe     WQE to initialise.
+ *
+ * Local operations (WR_LOCAL_OP_MASK) only get wr, mask and state set.
+ * Every other request also gets its DMA bookkeeping initialised and takes
+ * the next send sequence number from qp->ssn.
+ */
+void vhost_rdma_init_send_wqe(struct vhost_rdma_qp *qp,
+                              struct vhost_rdma_sq_req *wr,
+                              unsigned int mask,
+                              unsigned int length,
+                              struct vhost_rdma_send_wqe *wqe)
+{
+    int sge_count = wr->num_sge;
+
+    wqe->wr = wr;
+    wqe->mask = mask;
+
+    /* Local operations carry no address vector and no DMA state. */
+    if (unlikely(mask & WR_LOCAL_OP_MASK)) {
+        wqe->state = WQE_STATE_POSTED;
+        return;
+    }
+
+    /* These three QP types take their address vector from the AH handle in the request. */
+    switch (qp->type) {
+    case VHOST_RDMA_IB_QPT_UD:
+    case VHOST_RDMA_IB_QPT_SMI:
+    case VHOST_RDMA_IB_QPT_GSI:
+        init_av_from_vhost_rdma(qp->dev, &wqe->av, wr->ud.ah);
+        break;
+    default:
+        break;
+    }
+
+    /* Only RDMA read/write requests have a remote address. */
+    if (mask & WR_READ_OR_WRITE_MASK)
+        wqe->iova = wr->rdma.remote_addr;
+    else
+        wqe->iova = 0;
+
+    wqe->dma.length = length;
+    wqe->dma.resid = length;
+    wqe->dma.num_sge = sge_count;
+    wqe->dma.cur_sge = 0;
+    wqe->dma.sge_offset = 0;
+    wqe->dma.sge = wr->sg_list;
+
+    wqe->state = WQE_STATE_POSTED;
+    wqe->ssn = rte_atomic32_add_return(&qp->ssn, 1);
+}
+
+/**
+ * Initialise a task object.
+ *
+ * @param task       Task to initialise.
+ * @param task_ring  Ring stored in the task for scheduled execution.
+ * @param arg        Opaque argument handed to @func.
+ * @param func       Work function of the task.
+ * @param name       Task name; copied with rte_strscpy() using a size of 8.
+ * @return           Always 0.
+ *
+ * NOTE(review): the literal 8 presumably matches the size of task->name;
+ * confirm against the struct definition.
+ */
+int
+vhost_rdma_init_task(struct vhost_rdma_task *task, struct rte_ring *task_ring,
+                     void *arg, int (*func)(void *), const char *name)
+{
+    /* Scheduling state: lock ready, not queued, idle, alive. */
+    rte_spinlock_init(&task->state_lock);
+    rte_atomic16_clear(&task->sched);
+    task->state = TASK_STATE_START;
+    task->destroyed = false;
+
+    /* Work function, its argument and the ring stored for scheduling. */
+    task->func = func;
+    task->arg = arg;
+    task->task_ring = task_ring;
+    rte_strscpy(task->name, name, 8);
+
+    return 0;
+}
+
+/**
+ * Run a task's work function, making sure only one caller executes it at a
+ * time.
+ *
+ * task->state is a small state machine protected by task->state_lock:
+ *  - START: idle. The caller moves it to BUSY and becomes the runner.
+ *  - BUSY:  another caller is running func. The state is moved to ARMED so
+ *           the runner calls func once more, and this caller returns.
+ *  - ARMED: a re-run is already requested; this caller just returns.
+ *
+ * The runner keeps calling func while it returns 0 or while a re-run was
+ * requested in the meantime. The return value of the last call is stored
+ * in task->ret.
+ */
+void
+vhost_rdma_do_task(struct vhost_rdma_task *task)
+{
+ int cont;
+ int ret;
+
+ rte_spinlock_lock(&task->state_lock);
+ switch (task->state) {
+ case TASK_STATE_START:
+ /* Idle: claim the task and fall out of the switch to run it. */
+ task->state = TASK_STATE_BUSY;
+ rte_spinlock_unlock(&task->state_lock);
+ break;
+
+ case TASK_STATE_BUSY:
+ /* Someone else is running: ask the runner for one more pass. */
+ task->state = TASK_STATE_ARMED;
+ // fallthrough
+ case TASK_STATE_ARMED:
+ rte_spinlock_unlock(&task->state_lock);
+ return;
+
+ default:
+ rte_spinlock_unlock(&task->state_lock);
+ /* NOTE(review): task->state is read here after the lock was dropped. */
+ RDMA_LOG_INFO("%s failed with bad state %d\n", __func__, task->state);
+ return;
+ }
+
+ do {
+ cont = 0;
+ /* func runs without the lock held. */
+ ret = task->func(task->arg);
+ rte_spinlock_lock(&task->state_lock);
+ switch (task->state) {
+ case TASK_STATE_BUSY:
+ /* Non-zero return ends the run; zero means call func again. */
+ if (ret)
+ task->state = TASK_STATE_START;
+ else
+ cont = 1;
+ break;
+
+ /* someone tried to run the task since the last time we called
+ * func, so we will call one more time regardless of the
+ * return value
+ */
+ case TASK_STATE_ARMED:
+ task->state = TASK_STATE_BUSY;
+ cont = 1;
+ break;
+
+ default:
+ /* Unexpected state: cont stays 0, so the loop ends. */
+ RDMA_LOG_INFO("Failed with bad state %d\n", task->state);
+ }
+ rte_spinlock_unlock(&task->state_lock);
+ } while (cont);
+
+ task->ret = ret;
+}
+
+/**
+ * Run a task, either inline or by queueing it on its task ring.
+ *
+ * @param task   Task to run; ignored once task->destroyed is set.
+ * @param sched  Non-zero: enqueue the task on task->task_ring.
+ *               Zero: execute it right away via vhost_rdma_do_task().
+ *
+ * task->sched ensures the task is enqueued at most once while the flag is
+ * set. The code that clears the flag after dequeueing is not part of this
+ * function.
+ */
+void
+vhost_rdma_run_task(struct vhost_rdma_task *task, int sched)
+{
+    if (task->destroyed)
+        return;
+
+    RDMA_LOG_DEBUG("run task %s sched %d", task->name, sched);
+
+    if (!sched) {
+        vhost_rdma_do_task(task);
+        return;
+    }
+
+    /* Zero means the flag was already set: the task is already queued. */
+    if (!rte_atomic16_test_and_set(&task->sched))
+        return;
+
+    if (rte_ring_enqueue(task->task_ring, task) != 0) {
+        /*
+         * The ring had no room. Drop the flag again; otherwise every
+         * later scheduling attempt would be skipped as "already queued"
+         * although the task is not on the ring.
+         */
+        rte_atomic16_clear(&task->sched);
+        RDMA_LOG_ERR("failed to enqueue task %s", task->name);
+    }
+}
+
+/**
+ * Retire a task and wait until it is idle.
+ *
+ * Sets task->destroyed so vhost_rdma_run_task() ignores the task from now
+ * on, clears the scheduled flag, then busy-waits until the task state is
+ * TASK_STATE_START again. The state is sampled under task->state_lock.
+ */
+void
+vhost_rdma_cleanup_task(struct vhost_rdma_task *task)
+{
+    task->destroyed = true;
+    rte_atomic16_clear(&task->sched);
+
+    for (;;) {
+        bool finished;
+
+        rte_spinlock_lock(&task->state_lock);
+        finished = (task->state == TASK_STATE_START);
+        rte_spinlock_unlock(&task->state_lock);
+
+        if (finished)
+            break;
+    }
+}
+
+/**
+ * Kick handler for a QP's send queue.
+ *
+ * @param arg  The struct vhost_rdma_qp whose send queue was kicked.
+ *
+ * Reads the kick eventfd, then walks every avail-ring entry between
+ * queue->producer_index and the ring's avail index. Each send request is
+ * mapped, turned into a send WQE, and finally the requester task is
+ * scheduled (plus the completer task when the requester is in error state).
+ *
+ * A descriptor that cannot be mapped, has an unexpected layout, or is too
+ * short to hold a send request is logged and skipped. producer_index is
+ * advanced in that case as well, otherwise the loop would look at the same
+ * entry forever.
+ *
+ * NOTE(review): a skipped descriptor gets no completion here; decide
+ * whether the guest should receive an error completion instead.
+ * NOTE(review): wr->opcode and wr->num_sge come from guest memory and are
+ * not range-checked against the opcode table or the mapped buffer length.
+ */
+void vhost_rdma_handle_sq(void *arg)
+{
+    struct vhost_rdma_qp *qp = (struct vhost_rdma_qp *)arg;
+    struct vhost_rdma_queue *queue = &qp->sq.queue;
+    struct rte_vhost_vring *vring = &queue->vq->vring;
+    int kick_fd;
+    eventfd_t kick_data;
+
+    /* Consume the pending kick; the counter value itself is not used. */
+    kick_fd = queue->vq->vring.kickfd;
+    eventfd_read(kick_fd, &kick_data);
+
+    while (queue->producer_index != vring->avail->idx) {
+        uint16_t last_avail_idx = queue->producer_index & (vring->size - 1);
+        uint16_t desc_idx = vring->avail->ring[last_avail_idx];
+        struct iovec iov;
+        uint16_t num_in, num_out;
+        struct vhost_rdma_sq_req *wr;
+        unsigned int mask, length;
+
+        /*
+         * The descriptor chain is guest-controlled: validate it at run
+         * time instead of asserting, and never touch iov when the
+         * mapping itself failed.
+         */
+        if (setup_iovs_from_descs(qp->dev->mem, queue->vq, desc_idx,
+                                  &iov, 1, &num_in, &num_out) < 0 ||
+            num_in != 0 || num_out != 1 ||
+            iov.iov_len < sizeof(*wr)) {
+            RDMA_LOG_ERR("got bad send wqe");
+            queue->producer_index++;
+            continue;
+        }
+        wr = iov.iov_base;
+
+        mask = wr_opcode_mask(wr->opcode, qp);
+
+        RDMA_LOG_DEBUG_DP("got send wqe qpn: %u type: %d wr_id: %llu opcode: %d mask: %u",
+                          qp->qpn, qp->type, (unsigned long long)wr->wr_id,
+                          wr->opcode, mask);
+
+        /* Inline sends carry their length directly; otherwise sum the SGEs. */
+        length = 0;
+        if (unlikely(wr->send_flags & VHOST_RDMA_IB_SEND_INLINE)) {
+            length = wr->inline_len;
+        } else {
+            struct vhost_rdma_sge *sg_list = wr->sg_list;
+
+            for (uint32_t i = 0; i < wr->num_sge; i++)
+                length += sg_list[i].length;
+        }
+
+        vhost_rdma_init_send_wqe(qp, wr, mask, length,
+                                 vhost_rdma_queue_get_data(queue, desc_idx));
+
+        queue->producer_index++;
+    }
+
+    vhost_rdma_run_task(&qp->req.task, 1);
+    if (unlikely(qp->req.state == QP_STATE_ERROR))
+        vhost_rdma_run_task(&qp->comp.task, 1);
+}
+
+/**
+ * Kick handler for a QP's receive queue.
+ *
+ * @param arg  The struct vhost_rdma_qp whose receive queue was kicked.
+ *
+ * Reads the kick eventfd, then walks every avail-ring entry between
+ * queue->producer_index and the ring's avail index. Each posted receive
+ * request is copied into the receive WQE slot of its descriptor. The
+ * responder task is scheduled only when the responder is in error state.
+ *
+ * A descriptor that cannot be mapped, has an unexpected layout, or is too
+ * short to hold a receive request is logged and skipped. producer_index is
+ * advanced in that case as well, otherwise the loop would look at the same
+ * entry forever.
+ *
+ * NOTE(review): a skipped descriptor gets no completion here; decide
+ * whether the guest should receive an error completion instead.
+ * NOTE(review): wr->num_sge comes from guest memory and is not checked
+ * against the mapped buffer length before sg_list is walked.
+ */
+void vhost_rdma_handle_rq(void *arg)
+{
+    struct vhost_rdma_qp *qp = (struct vhost_rdma_qp *)arg;
+    struct vhost_rdma_queue *queue = &qp->rq.queue;
+    struct rte_vhost_vring *vring = &queue->vq->vring;
+    int kick_fd;
+    eventfd_t kick_data;
+
+    /* Consume the pending kick; the counter value itself is not used. */
+    kick_fd = queue->vq->vring.kickfd;
+    eventfd_read(kick_fd, &kick_data);
+
+    while (queue->producer_index != vring->avail->idx) {
+        uint16_t last_avail_idx = queue->producer_index & (vring->size - 1);
+        uint16_t desc_idx = vring->avail->ring[last_avail_idx];
+        struct iovec iov;
+        uint16_t num_in, num_out;
+        unsigned int length;
+        struct vhost_rdma_rq_req *wr;
+        struct vhost_rdma_sge *sg_list;
+        struct vhost_rdma_recv_wqe *recv_wqe;
+
+        /*
+         * The descriptor chain is guest-controlled: validate it at run
+         * time instead of asserting, and never touch iov when the
+         * mapping itself failed.
+         */
+        if (setup_iovs_from_descs(qp->dev->mem, queue->vq, desc_idx,
+                                  &iov, 1, &num_in, &num_out) < 0 ||
+            num_in != 0 || num_out != 1 ||
+            iov.iov_len < sizeof(*wr)) {
+            RDMA_LOG_ERR("got bad recv wqe");
+            queue->producer_index++;
+            continue;
+        }
+        wr = iov.iov_base;
+
+        /* Total receive buffer size is the sum of all SGE lengths. */
+        length = 0;
+        sg_list = wr->sg_list;
+
+        for (uint32_t i = 0; i < wr->num_sge; i++) {
+            length += sg_list[i].length;
+            RDMA_LOG_DEBUG(" length: %d %d", sg_list[i].length, length);
+        }
+
+        recv_wqe = vhost_rdma_queue_get_data(queue, desc_idx);
+
+        recv_wqe->wr_id = wr->wr_id;
+        recv_wqe->num_sge = wr->num_sge;
+        recv_wqe->dma.length = length;
+        recv_wqe->dma.resid = length;
+        recv_wqe->dma.num_sge = wr->num_sge;
+        recv_wqe->dma.cur_sge = 0;
+        recv_wqe->dma.sge_offset = 0;
+        recv_wqe->dma.raw = sg_list;
+
+        queue->producer_index++;
+    }
+
+    if (qp->resp.state == QP_STATE_ERROR)
+        vhost_rdma_run_task(&qp->resp.task, 1);
+}
+
+/**
+ * Copy one completion entry into the next available CQ descriptor.
+ *
+ * The descriptor lookup, the copy and the push onto the used ring are done
+ * under cq->cq_lock. The guest is notified afterwards when cq->notify equals
+ * VHOST_RDMA_IB_CQ_NEXT_COMP, or equals VHOST_RDMA_IB_NOTIFY_SOLICITED and
+ * the completion is solicited; cq->notify is cleared in that case.
+ * NOTE(review): those two constants come from different flag groups of
+ * struct vhost_rdma_cmd_req_notify -- confirm that the mix is intended.
+ *
+ * @param dev        Device owning the CQ (guest memory map and vid).
+ * @param cq         Completion queue to post to.
+ * @param cqe        Completion entry to copy to the guest.
+ * @param solicited  Non-zero when the completion is solicited.
+ *
+ * @return 0 on success, -EBUSY when no descriptor is available or it cannot
+ *         be mapped, -EIO when the guest buffer is smaller than a CQE.
+ */
+int vhost_rdma_cq_post(struct vhost_rdma_device *dev,
+		       struct vhost_rdma_cq *cq,
+		       struct vhost_rdma_cq_req *cqe,
+		       int solicited)
+{
+	bool avail;
+	uint16_t desc_idx;
+	struct iovec iovs[1];
+	uint16_t num_in, num_out;
+
+	rte_spinlock_lock(&cq->cq_lock);
+
+	avail = vhost_rdma_vq_is_avail(cq->vq);
+
+	if (unlikely(!avail)) {
+		rte_spinlock_unlock(&cq->cq_lock);
+		return -EBUSY;
+	}
+
+	desc_idx = vhost_rdma_vq_get_desc_idx(cq->vq);
+
+	if (setup_iovs_from_descs(dev->mem, cq->vq, desc_idx, iovs, 1,
+				  &num_in, &num_out) < 0) {
+		rte_spinlock_unlock(&cq->cq_lock);
+		RDMA_LOG_ERR("get from cq failed");
+		return -EBUSY;
+	}
+
+	if (iovs[0].iov_len < sizeof(*cqe)) {
+		/* every exit path must release the lock */
+		rte_spinlock_unlock(&cq->cq_lock);
+		RDMA_LOG_ERR_DP("cqe size is too small");
+		return -EIO;
+	}
+	rte_memcpy(iovs[0].iov_base, cqe, sizeof(*cqe));
+
+	RDMA_LOG_DEBUG("poll cqe cqn: %u wr_id: %llu opcode: %d status: %d",
+		       cq->cqn, cqe->wr_id, cqe->opcode, cqe->status);
+
+	vhost_rdma_queue_push(cq->vq, desc_idx, sizeof(*cqe));
+
+	rte_spinlock_unlock(&cq->cq_lock);
+
+	if ((cq->notify == VHOST_RDMA_IB_CQ_NEXT_COMP) ||
+	    (cq->notify == VHOST_RDMA_IB_NOTIFY_SOLICITED && solicited)) {
+		cq->notify = 0;
+		vhost_rdma_queue_notify(dev->vid, cq->vq);
+	}
+
+	return 0;
+}
+
+/**
+ * Set up the host-side state of a QP send or receive queue.
+ *
+ * Allocates zeroed storage for vq->vring.size elements of elem_size bytes,
+ * records the backing virtqueue, starts both indices at vq->last_avail_idx
+ * and registers the kick-fd interrupt callback that matches the queue type.
+ *
+ * @param qp         QP handed to the callback as its argument.
+ * @param queue      Queue to initialise.
+ * @param name       Allocation tag passed to rte_zmalloc().
+ * @param vq         Backing vhost-user virtqueue.
+ * @param elem_size  Size in bytes of one per-descriptor element.
+ * @param type       VHOST_RDMA_QUEUE_SQ or VHOST_RDMA_QUEUE_RQ.
+ *
+ * @return 0 on success, -EINVAL for an unknown queue type, -ENOMEM when the
+ *         element storage cannot be allocated, or the negative value
+ *         returned by rte_intr_callback_register().
+ */
+int vhost_rdma_queue_init(struct vhost_rdma_qp *qp,
+			  struct vhost_rdma_queue *queue,
+			  const char *name,
+			  struct vhost_user_queue *vq,
+			  size_t elem_size,
+			  enum vhost_rdma_queue_type type)
+{
+	int ret;
+
+	/* Pick the handler first so an unknown type allocates nothing. */
+	switch (type) {
+	case VHOST_RDMA_QUEUE_SQ:
+		queue->cb = vhost_rdma_handle_sq;
+		break;
+	case VHOST_RDMA_QUEUE_RQ:
+		queue->cb = vhost_rdma_handle_rq;
+		break;
+	default:
+		RDMA_LOG_ERR("Unknown queue type");
+		queue->cb = NULL;
+		return -EINVAL;
+	}
+
+	queue->data = rte_zmalloc(name, elem_size * vq->vring.size, RTE_CACHE_LINE_SIZE);
+	if (queue->data == NULL) {
+		queue->cb = NULL;
+		return -ENOMEM;
+	}
+
+	queue->vq = vq;
+	queue->num_elems = vq->vring.size;
+	queue->elem_size = elem_size;
+	queue->consumer_index = vq->last_avail_idx;
+	queue->producer_index = vq->last_avail_idx;
+
+	queue->intr_handle.fd = vq->vring.kickfd;
+	queue->intr_handle.type = RTE_INTR_HANDLE_EXT;
+	ret = rte_intr_callback_register(&queue->intr_handle, queue->cb, qp);
+	if (ret < 0) {
+		/* without the callback the queue would never be serviced */
+		RDMA_LOG_ERR("register queue callback failed");
+		rte_free(queue->data);
+		queue->data = NULL;
+		/* keeps vhost_rdma_queue_cleanup() from unregistering */
+		queue->cb = NULL;
+		return ret;
+	}
+
+	return 0;
+}
+
+/**
+ * @brief Tear down a queue set up by vhost_rdma_queue_init().
+ *
+ * Unregisters the kick-fd interrupt callback when both a callback and a QP
+ * are present, then frees the per-element storage and clears the pointer.
+ * A NULL queue is ignored.
+ *
+ * @param qp     QP that was registered as the callback argument.
+ * @param queue  Queue to tear down.
+ */
+void
+vhost_rdma_queue_cleanup(struct vhost_rdma_qp *qp, struct vhost_rdma_queue *queue)
+{
+	if (queue == NULL)
+		return;
+
+	if (qp != NULL && queue->cb != NULL) {
+		rte_intr_callback_unregister(&queue->intr_handle,
+					     queue->cb, qp);
+	}
+
+	rte_free(queue->data);
+	queue->data = NULL;
+}
+
+/* Requester task body: placeholder that does no work and reports success. */
+int vhost_rdma_requester(void *arg)
+{
+	/* TODO: handle request */
+	(void)arg;
+
+	return 0;
+}
+
+/* Completer task body: placeholder that does no work and reports success. */
+int vhost_rdma_completer(void *arg)
+{
+	/* TODO: handle complete */
+	(void)arg;
+
+	return 0;
+}
+
+/* Responder task body: placeholder that does no work and reports success. */
+int vhost_rdma_responder(void *arg)
+{
+	/* TODO: handle response */
+	(void)arg;
+
+	return 0;
+}
+
+/**
+ * Initialise the requester (send) side of a QP.
+ *
+ * Sets up the send queue on virtqueue qp->qpn * 2, resets the requester and
+ * completer state, creates the 512-entry req_pkts ring, initialises the
+ * requester and completer tasks and, for RC QPs, the two timers.
+ * On failure everything acquired here is released again and qp->req_pkts is
+ * left NULL.
+ *
+ * @param dev  Device providing the virtqueues and the task ring.
+ * @param qp   QP being created.
+ * @param cmd  Create-QP command carrying the requested capabilities.
+ *
+ * @return 0 on success, a negative errno value otherwise.
+ */
+static int vhost_rdma_qp_init_req(struct vhost_rdma_device *dev,
+				  struct vhost_rdma_qp *qp,
+				  struct vhost_rdma_cmd_create_qp *cmd)
+{
+	int wqe_size;
+	int err;
+
+	qp->src_port = 0xc000;
+
+	/* room for either the SGE list or the inline payload */
+	wqe_size = RTE_MAX(cmd->max_send_sge * sizeof(struct vhost_rdma_sge),
+			   cmd->max_inline_data);
+
+	err = vhost_rdma_queue_init(qp,
+				    &qp->sq.queue,
+				    "sq_queue",
+				    &dev->qp_vqs[qp->qpn * 2],
+				    sizeof(struct vhost_rdma_send_wqe) + wqe_size,
+				    VHOST_RDMA_QUEUE_SQ);
+	if (err) {
+		RDMA_LOG_ERR("sq queue init failed");
+		return err;
+	}
+
+	qp->req.state = QP_STATE_RESET;
+	qp->req.opcode = QP_OPCODE_INVAILD;
+	qp->comp.opcode = QP_OPCODE_INVAILD;
+
+	qp->req_pkts = rte_zmalloc(NULL, rte_ring_get_memsize(512), RTE_CACHE_LINE_SIZE);
+	if (qp->req_pkts == NULL) {
+		RDMA_LOG_ERR("req_pkts malloc failed");
+		err = -ENOMEM;
+		goto err_queue;
+	}
+
+	if (rte_ring_init(qp->req_pkts, "req_pkts", 512, RING_F_MP_HTS_ENQ | RING_F_MC_HTS_DEQ) != 0) {
+		RDMA_LOG_ERR("req_pkts init failed");
+		err = -ENOMEM;
+		goto err_ring;
+	}
+
+	qp->req_pkts_head = NULL;
+
+	vhost_rdma_init_task(&qp->req.task, dev->task_ring, qp,
+			     vhost_rdma_requester, "vhost_rdma_req");
+	vhost_rdma_init_task(&qp->comp.task, dev->task_ring, qp,
+			     vhost_rdma_completer, "vhost_rdma_comp");
+
+	qp->qp_timeout_ticks = 0; /* Can't be set for UD/UC in modify_qp */
+	if (cmd->qp_type == VHOST_RDMA_IB_QPT_RC) {
+		rte_timer_init(&qp->rnr_nak_timer); /* req_task */
+		rte_timer_init(&qp->retrans_timer); /* comp_task */
+	}
+	return 0;
+
+err_ring:
+	rte_free(qp->req_pkts);
+	qp->req_pkts = NULL;
+err_queue:
+	/* also unregisters the kick callback installed by queue_init */
+	vhost_rdma_queue_cleanup(qp, &qp->sq.queue);
+	return err;
+}
+
+/**
+ * Initialise the responder (receive) side of a QP.
+ *
+ * Unless the QP uses an SRQ, sets up the receive queue on virtqueue
+ * qp->qpn * 2 + 1. Then creates the 512-entry resp_pkts ring, initialises
+ * the responder task and resets the responder state.
+ * On failure everything acquired here is released again and qp->resp_pkts
+ * is left NULL.
+ *
+ * @param dev  Device providing the virtqueues and the task ring.
+ * @param qp   QP being created.
+ *
+ * @return 0 on success, a negative errno value otherwise.
+ */
+static int vhost_rdma_qp_init_resp(struct vhost_rdma_device *dev,
+				   struct vhost_rdma_qp *qp)
+{
+	int err;
+
+	if (!qp->srq) {
+		err = vhost_rdma_queue_init(qp, &qp->rq.queue, "rq_queue",
+					    &dev->qp_vqs[qp->qpn * 2 + 1],
+					    sizeof(struct vhost_rdma_recv_wqe),
+					    VHOST_RDMA_QUEUE_RQ);
+		if (err) {
+			RDMA_LOG_ERR("rq queue init failed");
+			return err;
+		}
+	}
+
+	qp->resp_pkts = rte_zmalloc(NULL, rte_ring_get_memsize(512), RTE_CACHE_LINE_SIZE);
+	if (qp->resp_pkts == NULL) {
+		RDMA_LOG_ERR("resp_pkts malloc failed");
+		err = -ENOMEM;
+		goto err_queue;
+	}
+
+	if (rte_ring_init(qp->resp_pkts, "resp_pkts", 512, RING_F_MP_HTS_ENQ | RING_F_MC_HTS_DEQ) != 0) {
+		RDMA_LOG_ERR("resp_pkts init failed");
+		err = -ENOMEM;
+		goto err_ring;
+	}
+
+	vhost_rdma_init_task(&qp->resp.task, dev->task_ring, qp,
+			     vhost_rdma_responder, "resp");
+
+	qp->resp.opcode = OPCODE_NONE;
+	qp->resp.msn = 0;
+	qp->resp.state = QP_STATE_RESET;
+
+	return 0;
+
+err_ring:
+	rte_free(qp->resp_pkts);
+	qp->resp_pkts = NULL;
+err_queue:
+	/* the receive queue exists only when no SRQ is attached */
+	if (!qp->srq)
+		vhost_rdma_queue_cleanup(qp, &qp->rq.queue);
+	return err;
+}
+
+/**
+ * Fill in the QP attributes that need no allocation.
+ *
+ * Copies the capabilities and sq_sig_all from the create command, selects
+ * DEFAULT_IB_MTU as path MTU (and its byte value as qp->mtu), initialises
+ * the state lock and zeroes the ssn and mbuf_out counters.
+ *
+ * @param dev  Unused.
+ * @param qp   QP being created.
+ * @param cmd  Create-QP command.
+ */
+static void vhost_rdma_qp_init_misc(__rte_unused struct vhost_rdma_device *dev,
+				    struct vhost_rdma_qp *qp,
+				    struct vhost_rdma_cmd_create_qp *cmd)
+{
+	/* synchronisation primitives and counters */
+	rte_spinlock_init(&qp->state_lock);
+	rte_atomic32_set(&qp->ssn, 0);
+	rte_atomic32_set(&qp->mbuf_out, 0);
+
+	/* capabilities requested by the guest */
+	qp->attr.cap.max_send_wr = cmd->max_send_wr;
+	qp->attr.cap.max_send_sge = cmd->max_send_sge;
+	qp->attr.cap.max_recv_wr = cmd->max_recv_wr;
+	qp->attr.cap.max_recv_sge = cmd->max_recv_sge;
+	qp->attr.cap.max_inline_data = cmd->max_inline_data;
+	qp->sq_sig_all = cmd->sq_sig_all;
+
+	/* start from the default path MTU */
+	qp->attr.path_mtu = DEFAULT_IB_MTU;
+	qp->mtu = ib_mtu_enum_to_int(qp->attr.path_mtu);
+}
+
+/**
+ * Initialise a freshly allocated QP from a create-QP command.
+ *
+ * Looks up the PD and both CQs named by the command and takes a reference
+ * on each, then initialises the miscellaneous attributes, the requester
+ * side and the responder side. On success the QP is in RESET state and
+ * marked valid. On failure the references are dropped, the pointers are
+ * cleared and whatever the requester side acquired is released.
+ *
+ * @param dev  Owning device.
+ * @param qp   QP to initialise.
+ * @param cmd  Create-QP command from the guest.
+ *
+ * @return 0 on success, a negative errno value otherwise.
+ */
+int vhost_rdma_qp_init(struct vhost_rdma_device *dev,
+		       struct vhost_rdma_qp *qp,
+		       struct vhost_rdma_cmd_create_qp *cmd)
+{
+	int err;
+
+	qp->pd = vhost_rdma_pool_get(&dev->pd_pool, cmd->pdn);
+	qp->scq = vhost_rdma_pool_get(&dev->cq_pool, cmd->send_cqn);
+	qp->rcq = vhost_rdma_pool_get(&dev->cq_pool, cmd->recv_cqn);
+
+	/*
+	 * The indices come from the guest; do not take a reference through
+	 * a failed lookup. NOTE(review): assumes vhost_rdma_pool_get()
+	 * reports an invalid index as NULL -- confirm.
+	 */
+	if (qp->pd == NULL || qp->scq == NULL || qp->rcq == NULL) {
+		RDMA_LOG_ERR("invalid pd or cq for qp");
+		qp->pd = NULL;
+		qp->scq = NULL;
+		qp->rcq = NULL;
+		return -EINVAL;
+	}
+
+	vhost_rdma_add_ref(qp->pd);
+	vhost_rdma_add_ref(qp->rcq);
+	vhost_rdma_add_ref(qp->scq);
+
+	/*
+	 * The queue kick handlers read qp->dev and qp->type and may run as
+	 * soon as the queues are initialised below, so set these first.
+	 */
+	qp->dev = dev;
+	qp->type = cmd->qp_type;
+
+	vhost_rdma_qp_init_misc(dev, qp, cmd);
+
+	err = vhost_rdma_qp_init_req(dev, qp, cmd);
+	if (err)
+		goto err_refs;
+
+	err = vhost_rdma_qp_init_resp(dev, qp);
+	if (err)
+		goto err_req;
+
+	qp->attr.qp_state = VHOST_RDMA_IB_QPS_RESET;
+	qp->valid = 1;
+
+	return 0;
+
+err_req:
+	/* undo what vhost_rdma_qp_init_req() acquired */
+	vhost_rdma_queue_cleanup(qp, &qp->sq.queue);
+	rte_free(qp->req_pkts);
+	qp->req_pkts = NULL;
+err_refs:
+	/* drop the references before the pointers are cleared */
+	vhost_rdma_drop_ref(qp->pd, dev, pd);
+	vhost_rdma_drop_ref(qp->rcq, dev, cq);
+	vhost_rdma_drop_ref(qp->scq, dev, cq);
+	qp->pd = NULL;
+	qp->rcq = NULL;
+	qp->scq = NULL;
+
+	return err;
+}
+
+/**
+ * Check a modify-QP request against the QP state transition table.
+ *
+ * The request is accepted when: an explicitly supplied current state is one
+ * of RTR, RTS, SQD or SQE; the transition is marked valid in
+ * qp_state_table; every attribute required for the QP type is present in
+ * the mask; and the mask holds nothing beyond the required and optional
+ * attributes plus VHOST_RDMA_IB_QP_STATE.
+ *
+ * NOTE(review): cur_state, next_state and type index qp_state_table without
+ * a range check here -- confirm that callers only pass in-range values.
+ *
+ * @param cur_state   State the transition starts from.
+ * @param next_state  Requested state.
+ * @param type        QP type, used to select the attribute sets.
+ * @param mask        Attribute mask of the request.
+ *
+ * @return true when the request is acceptable, false otherwise.
+ */
+bool vhost_rdma_ib_modify_qp_is_ok(enum vhost_rdma_ib_qp_state cur_state,
+				   enum vhost_rdma_ib_qp_state next_state,
+				   uint8_t type,
+				   enum vhost_rdma_ib_qp_attr_mask mask)
+{
+	enum vhost_rdma_ib_qp_attr_mask required, optional;
+
+	if (mask & VHOST_RDMA_IB_QP_CUR_STATE) {
+		switch (cur_state) {
+		case VHOST_RDMA_IB_QPS_RTR:
+		case VHOST_RDMA_IB_QPS_RTS:
+		case VHOST_RDMA_IB_QPS_SQD:
+		case VHOST_RDMA_IB_QPS_SQE:
+			break;
+		default:
+			return false;
+		}
+	}
+
+	if (!qp_state_table[cur_state][next_state].valid)
+		return false;
+
+	required = qp_state_table[cur_state][next_state].req_param[type];
+	optional = qp_state_table[cur_state][next_state].opt_param[type];
+
+	/* everything required is present ... */
+	if ((mask & required) != required)
+		return false;
+
+	/* ... and nothing outside the permitted set is */
+	return !(mask & ~(required | optional | VHOST_RDMA_IB_QP_STATE));
+}
+
+/**
+ * Check requested QP capabilities against the device limits.
+ *
+ * The limits are tested in a fixed order (send WRs, send SGEs, receive WRs,
+ * receive SGEs, inline data); the first one exceeded is logged.
+ *
+ * @param dev  Device whose limits apply.
+ * @param cap  Requested capabilities.
+ *
+ * @return 0 when every value is within its limit, -EINVAL otherwise.
+ */
+static int vhost_rdma_qp_chk_cap(struct vhost_rdma_device *dev,
+				 struct vhost_rdma_qp_cap *cap)
+{
+	int ret = -EINVAL;
+
+	if (cap->max_send_wr > dev->attr.max_qp_wr) {
+		RDMA_LOG_ERR("invalid send wr = %d > %d",
+			     cap->max_send_wr, dev->attr.max_qp_wr);
+	} else if (cap->max_send_sge > dev->attr.max_send_sge) {
+		RDMA_LOG_ERR("invalid send sge = %d > %d",
+			     cap->max_send_sge, dev->attr.max_send_sge);
+	} else if (cap->max_recv_wr > dev->attr.max_qp_wr) {
+		RDMA_LOG_ERR("invalid recv wr = %d > %d",
+			     cap->max_recv_wr, dev->attr.max_qp_wr);
+	} else if (cap->max_recv_sge > dev->attr.max_recv_sge) {
+		RDMA_LOG_ERR("invalid recv sge = %d > %d",
+			     cap->max_recv_sge, dev->attr.max_recv_sge);
+	} else if (cap->max_inline_data > dev->max_inline_data) {
+		RDMA_LOG_ERR("invalid max inline data = %d > %d",
+			     cap->max_inline_data, dev->max_inline_data);
+	} else {
+		ret = 0;
+	}
+
+	return ret;
+}
+
+/**
+ * Validate the address-handle attributes of a request.
+ *
+ * Only the network type of the source GID entry is checked: it has to lie
+ * in the range VHOST_RDMA_NETWORK_IPV4 .. VHOST_RDMA_NETWORK_IPV6.
+ *
+ * @param dev   Device owning the GID table.
+ * @param attr  Address-handle attributes to check.
+ *
+ * @return 0 when the network type is accepted, -EINVAL otherwise.
+ */
+int
+vhost_rdma_av_chk_attr(struct vhost_rdma_device *dev,
+		       struct vhost_rdma_ah_attr *attr)
+{
+	/*
+	 * Original author's note: sgid_index is a uint8 and therefore always
+	 * smaller than VHOST_MAX_GID_TBL_LEN, so it is not range-checked.
+	 */
+	int type = rdma_gid_attr_network_type(&dev->gid_tbl[attr->grh.sgid_index]);
+
+	if (type >= VHOST_RDMA_NETWORK_IPV4 &&
+	    type <= VHOST_RDMA_NETWORK_IPV6)
+		return 0;
+
+	RDMA_LOG_ERR("invalid network type = %d", type);
+	return -EINVAL;
+}
+
+/**
+ * Validate a modify-QP command before it is applied.
+ *
+ * The current state is taken from the command when
+ * VHOST_RDMA_IB_QP_CUR_STATE is set, otherwise from the QP; the new state
+ * is taken from the command when VHOST_RDMA_IB_QP_STATE is set, otherwise
+ * it equals the current state. The transition and mask are checked with
+ * vhost_rdma_ib_modify_qp_is_ok(); leaving SQD while the requester is still
+ * draining is refused unless the target is ERR; then the capabilities, the
+ * address vector, max_rd_atomic and the timeout are checked when present.
+ *
+ * @param dev  Device whose limits apply.
+ * @param qp   QP to be modified.
+ * @param cmd  Modify-QP command from the guest.
+ *
+ * @return 0 when the command is acceptable, -EINVAL otherwise.
+ */
+int vhost_rdma_qp_validate(struct vhost_rdma_device *dev,
+			   struct vhost_rdma_qp *qp,
+			   struct vhost_rdma_cmd_modify_qp *cmd)
+{
+	int mask = cmd->attr_mask;
+	enum vhost_rdma_ib_qp_state cur_state = qp->attr.qp_state;
+	enum vhost_rdma_ib_qp_state new_state;
+
+	if (mask & VHOST_RDMA_IB_QP_CUR_STATE)
+		cur_state = cmd->cur_qp_state;
+
+	new_state = cur_state;
+	if (mask & VHOST_RDMA_IB_QP_STATE)
+		new_state = cmd->qp_state;
+
+	if (!vhost_rdma_ib_modify_qp_is_ok(cur_state, new_state, qp->type, mask)) {
+		RDMA_LOG_ERR("invalid mask or state for qp");
+		return -EINVAL;
+	}
+
+	/* a draining send queue may only be moved to ERR */
+	if ((mask & VHOST_RDMA_IB_QP_STATE) &&
+	    cur_state == VHOST_RDMA_IB_QPS_SQD &&
+	    qp->req.state == QP_STATE_DRAIN &&
+	    new_state != VHOST_RDMA_IB_QPS_ERR)
+		return -EINVAL;
+
+	if ((mask & VHOST_RDMA_IB_QP_CAP) &&
+	    vhost_rdma_qp_chk_cap(dev, &cmd->cap) != 0)
+		return -EINVAL;
+
+	if ((mask & VHOST_RDMA_IB_QP_AV) &&
+	    vhost_rdma_av_chk_attr(dev, &cmd->ah_attr) != 0)
+		return -EINVAL;
+
+	if ((mask & VHOST_RDMA_IB_QP_MAX_QP_RD_ATOMIC) &&
+	    cmd->max_rd_atomic > dev->attr.max_qp_rd_atom) {
+		RDMA_LOG_ERR("invalid max_rd_atomic %d > %d",
+			     cmd->max_rd_atomic,
+			     dev->attr.max_qp_rd_atom);
+		return -EINVAL;
+	}
+
+	if ((mask & VHOST_RDMA_IB_QP_TIMEOUT) && cmd->timeout > 31) {
+		RDMA_LOG_ERR("invalid QP timeout %d > 31",
+			     cmd->timeout);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/**
+ * Build an address vector from address-handle attributes.
+ *
+ * Zeroes the address vector, then copies the destination GID, flow label,
+ * source GID index, hop limit and traffic class from the global route, and
+ * the destination MAC address.
+ *
+ * @param av    Address vector to fill; fully overwritten.
+ * @param attr  Source attributes.
+ */
+void vhost_rdma_av_from_attr(struct vhost_rdma_av *av,
+			     struct vhost_rdma_ah_attr *attr)
+{
+	memset(av, 0, sizeof(*av));
+
+	/* global route fields */
+	rte_memcpy(av->grh.dgid, attr->grh.dgid, sizeof(attr->grh.dgid));
+	av->grh.flow_label = attr->grh.flow_label;
+	av->grh.sgid_index = attr->grh.sgid_index;
+	av->grh.hop_limit = attr->grh.hop_limit;
+	av->grh.traffic_class = attr->grh.traffic_class;
+
+	/* link-layer destination */
+	rte_memcpy(av->dmac, attr->dmac, ETH_ALEN);
+}
+
+/**
+ * Fill in the IP-level part of an address vector.
+ *
+ * Converts the source GID (looked up by attr->grh.sgid_index) and the
+ * destination GID to socket addresses, and stores the network type derived
+ * from the source GID entry.
+ *
+ * @param dev   Device owning the GID table.
+ * @param av    Address vector to update.
+ * @param attr  Address-handle attributes.
+ */
+static void vhost_rdma_av_fill_ip_info(struct vhost_rdma_device *dev,
+				       struct vhost_rdma_av *av,
+				       struct vhost_rdma_ah_attr *attr)
+{
+	const struct vhost_rdma_gid *src_gid = &dev->gid_tbl[attr->grh.sgid_index];
+	int gid_type;
+	int net_type;
+
+	rdma_gid2ip((struct sockaddr *)&av->sgid_addr, &src_gid->gid[0]);
+	rdma_gid2ip((struct sockaddr *)&av->dgid_addr, attr->grh.dgid);
+
+	gid_type = rdma_gid_attr_network_type(src_gid);
+
+	if (gid_type == VHOST_RDMA_NETWORK_IPV4) {
+		net_type = VHOST_NETWORK_TYPE_IPV4;
+	} else if (gid_type == VHOST_RDMA_NETWORK_IPV6) {
+		net_type = VHOST_NETWORK_TYPE_IPV6;
+	} else {
+		/* not reached - checked in av_chk_attr */
+		net_type = 0;
+	}
+
+	av->network_type = net_type;
+}
+
+/**
+ * Initialise an address vector from address-handle attributes.
+ *
+ * vhost_rdma_av_from_attr() copies the global route fields and the
+ * destination MAC; vhost_rdma_av_fill_ip_info() adds the socket addresses
+ * and the network type.
+ *
+ * @param dev   Device owning the GID table.
+ * @param attr  Source attributes.
+ * @param av    Address vector to fill; fully overwritten.
+ */
+void vhost_rdma_init_av(struct vhost_rdma_device *dev,
+			struct vhost_rdma_ah_attr *attr,
+			struct vhost_rdma_av *av)
+{
+	/* copies dmac as well, so no separate copy is needed afterwards */
+	vhost_rdma_av_from_attr(av, attr);
+	vhost_rdma_av_fill_ip_info(dev, av, attr);
+}
+
+/**
+ * Move a QP to the error state and kick its tasks.
+ *
+ * Marks the requester, the responder and the QP attribute as being in
+ * error, then runs the responder, completer and requester tasks in that
+ * order. For non-RC QPs the completer is invoked through
+ * __vhost_rdma_do_task() instead of vhost_rdma_run_task().
+ *
+ * @param qp  QP to put into the error state.
+ */
+void vhost_rdma_qp_error(struct vhost_rdma_qp *qp)
+{
+	qp->req.state = QP_STATE_ERROR;
+	qp->resp.state = QP_STATE_ERROR;
+	qp->attr.qp_state = VHOST_RDMA_IB_QPS_ERR;
+
+	/* drain work and packet queues */
+	vhost_rdma_run_task(&qp->resp.task, 1);
+
+	if (qp->type != VHOST_RDMA_IB_QPT_RC) {
+		__vhost_rdma_do_task(&qp->comp.task);
+	} else {
+		vhost_rdma_run_task(&qp->comp.task, 1);
+	}
+
+	vhost_rdma_run_task(&qp->req.task, 1);
+}
+
+/**
+ * Apply a modify-QP command to a QP.
+ *
+ * Each attribute whose bit is set in cmd->attr_mask is copied into the QP
+ * and, where relevant, into the requester/completer/responder state. The
+ * command is not validated here.
+ * NOTE(review): presumably vhost_rdma_qp_validate() has been called first
+ * (it bounds cmd->timeout, which is used as a shift count) -- confirm.
+ *
+ * @param dev  Owning device.
+ * @param qp   QP to modify.
+ * @param cmd  Modify-QP command from the guest.
+ *
+ * @return 0 on success, or the error returned by
+ *         alloc_rd_atomic_resources().
+ */
+int vhost_rdma_qp_modify(struct vhost_rdma_device *dev,
+			 struct vhost_rdma_qp *qp,
+			 struct vhost_rdma_cmd_modify_qp *cmd)
+{
+	int err, mask = cmd->attr_mask;
+
+	if (mask & VHOST_RDMA_IB_QP_MAX_QP_RD_ATOMIC) {
+		int max_rd_atomic = cmd->max_rd_atomic ?
+			roundup_pow_of_two(cmd->max_rd_atomic) : 0;
+
+		qp->attr.max_rd_atomic = max_rd_atomic;
+		rte_atomic32_set(&qp->req.rd_atomic, max_rd_atomic);
+	}
+
+	if (mask & VHOST_RDMA_IB_QP_MAX_DEST_RD_ATOMIC) {
+		int max_dest_rd_atomic = cmd->max_dest_rd_atomic ?
+			roundup_pow_of_two(cmd->max_dest_rd_atomic) : 0;
+
+		qp->attr.max_dest_rd_atomic = max_dest_rd_atomic;
+
+		/* the resource array is rebuilt for the new depth */
+		free_rd_atomic_resources(qp);
+
+		err = alloc_rd_atomic_resources(qp, max_dest_rd_atomic);
+		if (err)
+			return err;
+	}
+
+	/*
+	 * With CUR_STATE set the caller supplies cur_qp_state (this is the
+	 * field vhost_rdma_qp_validate() reads); qp_state is only meaningful
+	 * when VHOST_RDMA_IB_QP_STATE is set.
+	 */
+	if (mask & VHOST_RDMA_IB_QP_CUR_STATE)
+		qp->attr.cur_qp_state = cmd->cur_qp_state;
+
+	if (mask & VHOST_RDMA_IB_QP_ACCESS_FLAGS)
+		qp->attr.qp_access_flags = cmd->qp_access_flags;
+
+	if (mask & VHOST_RDMA_IB_QP_QKEY)
+		qp->attr.qkey = cmd->qkey;
+
+	if (mask & VHOST_RDMA_IB_QP_AV)
+		vhost_rdma_init_av(dev, &cmd->ah_attr, &qp->av);
+
+	if (mask & VHOST_RDMA_IB_QP_PATH_MTU) {
+		qp->attr.path_mtu = cmd->path_mtu;
+		qp->mtu = ib_mtu_enum_to_int(cmd->path_mtu);
+	}
+
+	if (mask & VHOST_RDMA_IB_QP_TIMEOUT) {
+		qp->attr.timeout = cmd->timeout;
+		if (cmd->timeout == 0) {
+			qp->qp_timeout_ticks = 0;
+		} else {
+			/*
+			 * (4096 << timeout) / 1000 is scaled by timer ticks
+			 * per microsecond; a result of zero is bumped to one
+			 * tick.
+			 */
+			uint64_t ticks_per_us = rte_get_timer_hz() / 1000000;
+			uint64_t j = (4096ULL << cmd->timeout) / 1000 * ticks_per_us;
+			qp->qp_timeout_ticks = j ? j : 1;
+		}
+	}
+
+	if (mask & VHOST_RDMA_IB_QP_RETRY_CNT) {
+		qp->attr.retry_cnt = cmd->retry_cnt;
+		qp->comp.retry_cnt = cmd->retry_cnt;
+		RDMA_LOG_INFO("qp#%d set retry count = %d", qp->qpn,
+			      cmd->retry_cnt);
+	}
+
+	if (mask & VHOST_RDMA_IB_QP_RNR_RETRY) {
+		qp->attr.rnr_retry = cmd->rnr_retry;
+		qp->comp.rnr_retry = cmd->rnr_retry;
+		RDMA_LOG_INFO("qp#%d set rnr retry count = %d", qp->qpn,
+			      cmd->rnr_retry);
+	}
+
+	if (mask & VHOST_RDMA_IB_QP_RQ_PSN) {
+		qp->attr.rq_psn = (cmd->rq_psn & VHOST_RDMA_PSN_MASK);
+		qp->resp.psn = qp->attr.rq_psn;
+		RDMA_LOG_INFO("qp#%d set resp psn = 0x%x", qp->qpn,
+			      qp->resp.psn);
+	}
+
+	if (mask & VHOST_RDMA_IB_QP_MIN_RNR_TIMER) {
+		qp->attr.min_rnr_timer = cmd->min_rnr_timer;
+		RDMA_LOG_INFO("qp#%d set min rnr timer = 0x%x", qp->qpn,
+			      cmd->min_rnr_timer);
+	}
+
+	if (mask & VHOST_RDMA_IB_QP_SQ_PSN) {
+		qp->attr.sq_psn = (cmd->sq_psn & VHOST_RDMA_PSN_MASK);
+		qp->req.psn = qp->attr.sq_psn;
+		qp->comp.psn = qp->attr.sq_psn;
+		RDMA_LOG_INFO("qp#%d set req psn = 0x%x", qp->qpn, qp->req.psn);
+	}
+
+	if (mask & VHOST_RDMA_IB_QP_DEST_QPN)
+		qp->attr.dest_qp_num = cmd->dest_qp_num;
+
+	if (mask & VHOST_RDMA_IB_QP_STATE) {
+		qp->attr.qp_state = cmd->qp_state;
+
+		switch (cmd->qp_state) {
+		case VHOST_RDMA_IB_QPS_RESET:
+			RDMA_LOG_INFO("qp#%d state -> RESET", qp->qpn);
+			/* TODO: rxe_qp_reset(qp); */
+			break;
+
+		case VHOST_RDMA_IB_QPS_INIT:
+			RDMA_LOG_INFO("qp#%d state -> INIT", qp->qpn);
+			qp->req.state = QP_STATE_INIT;
+			qp->resp.state = QP_STATE_INIT;
+			break;
+
+		case VHOST_RDMA_IB_QPS_RTR:
+			RDMA_LOG_INFO("qp#%d state -> RTR", qp->qpn);
+			qp->resp.state = QP_STATE_READY;
+			break;
+
+		case VHOST_RDMA_IB_QPS_RTS:
+			RDMA_LOG_INFO("qp#%d state -> RTS", qp->qpn);
+			qp->req.state = QP_STATE_READY;
+			break;
+
+		case VHOST_RDMA_IB_QPS_SQD:
+			RDMA_LOG_INFO("qp#%d state -> SQD", qp->qpn);
+			/* TODO: rxe_qp_drain(qp); */
+			break;
+
+		case VHOST_RDMA_IB_QPS_SQE:
+			RDMA_LOG_INFO("qp#%d state -> SQE !!?", qp->qpn);
+			/* Not possible from modify_qp. */
+			break;
+
+		case VHOST_RDMA_IB_QPS_ERR:
+			RDMA_LOG_INFO("qp#%d state -> ERR", qp->qpn);
+			vhost_rdma_qp_error(qp);
+			break;
+
+		default:
+			/* no requester/responder state to update */
+			RDMA_LOG_ERR("qp#%d unknown state %d", qp->qpn,
+				     cmd->qp_state);
+			break;
+		}
+	}
+
+	return 0;
+}
diff --git a/examples/vhost_user_rdma/vhost_rdma_queue.h b/examples/vhost_user_rdma/vhost_rdma_queue.h
new file mode 100644
index 0000000000..260eea51f8
--- /dev/null
+++ b/examples/vhost_user_rdma/vhost_rdma_queue.h
@@ -0,0 +1,338 @@
+/*
+ * Vhost-user RDMA device: Queue management and work request handling
+ *
+ * Copyright (C) 2025 KylinSoft Inc. and/or its affiliates. All rights reserved.
+ *
+ * Author: Xiong Weimin <xiongweimin@kylinos.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef VHOST_RDMA_QUEUE_H_
+#define VHOST_RDMA_QUEUE_H_
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <linux/types.h>
+
+#include "vhost_rdma_ib.h"
+
+#define QP_OPCODE_INVAILD (-1)
+
+/******************************************************************************
+ * Base Transport Header
+ ******************************************************************************/
+struct vhost_rdma_bth {
+ uint8_t opcode; /**< Packet opcode */
+ uint8_t flags; /**< Flag byte; presumably split by the 8-bit SE/MIG/PAD/TVER masks below -- confirm */
+ rte_be16_t pkey; /**< Partition key, big-endian */
+ rte_be32_t qpn; /**< Big-endian word; presumably split by the FECN/BECN/RESV6A/QPN masks below -- confirm */
+ rte_be32_t apsn; /**< Big-endian word; presumably split by the ACK/RESV7/PSN masks below -- confirm */
+};
+
+#define VHOST_RDMA_TVER (0) /* transport version value */
+#define VHOST_RDMA_DEF_PKEY (0xffff) /* default partition key */
+
+#define VHOST_RDMA_SE_MASK (0x80) /* 8-bit masks: 0x80 | 0x40 | 0x30 | 0x0f cover one byte */
+#define VHOST_RDMA_MIG_MASK (0x40)
+#define VHOST_RDMA_PAD_MASK (0x30)
+#define VHOST_RDMA_TVER_MASK (0x0f)
+#define VHOST_RDMA_FECN_MASK (0x80000000) /* 32-bit masks: these four cover one word */
+#define VHOST_RDMA_BECN_MASK (0x40000000)
+#define VHOST_RDMA_RESV6A_MASK (0x3f000000)
+#define VHOST_RDMA_QPN_MASK (0x00ffffff)
+#define VHOST_RDMA_ACK_MASK (0x80000000) /* 32-bit masks: these three cover one word */
+#define VHOST_RDMA_RESV7_MASK (0x7f000000)
+#define VHOST_RDMA_PSN_MASK (0x00ffffff) /* also applied to rq_psn/sq_psn in vhost_rdma_qp_modify() */
+
+/**
+ * @brief Operation codes for Work Completions (WC)
+ *
+ * These represent the type of operation that has completed on a QP.
+ * The enumerators take the implicit values 0..4; the value is carried in the
+ * one-byte `opcode` field of struct vhost_rdma_cq_req.
+ */
+enum vhost_rdma_ib_wc_opcode {
+ VHOST_RDMA_IB_WC_SEND, /**< SEND operation completed */
+ VHOST_RDMA_IB_WC_RDMA_WRITE, /**< RDMA Write operation completed */
+ VHOST_RDMA_IB_WC_RDMA_READ, /**< RDMA Read operation completed */
+ VHOST_RDMA_IB_WC_RECV, /**< Receive operation completed */
+ VHOST_RDMA_IB_WC_RECV_RDMA_WITH_IMM, /**< RECV with immediate data */
+};
+
+/**
+ * @brief Operation codes for Work Requests (WR) posted to Send Queue (SQ)
+ *
+ * The enumerators take the implicit values 0..4; the value is carried in the
+ * `opcode` field of struct vhost_rdma_sq_req.
+ */
+enum vhost_rdma_ib_wr_opcode {
+ VHOST_RDMA_IB_WR_RDMA_WRITE, /**< RDMA Write request */
+ VHOST_RDMA_IB_WR_RDMA_WRITE_WITH_IMM, /**< RDMA Write with immediate data */
+ VHOST_RDMA_IB_WR_SEND, /**< Send message */
+ VHOST_RDMA_IB_WR_SEND_WITH_IMM, /**< Send with immediate data */
+ VHOST_RDMA_IB_WR_RDMA_READ, /**< RDMA Read request */
+};
+
+/**
+ * @brief Types of queues in a QP
+ *
+ * Passed to vhost_rdma_queue_init(), which uses it to select the kick
+ * handler registered for the queue.
+ */
+enum vhost_rdma_queue_type {
+ VHOST_RDMA_QUEUE_SQ, /**< Send Queue; serviced by vhost_rdma_handle_sq() */
+ VHOST_RDMA_QUEUE_RQ /**< Receive Queue; serviced by vhost_rdma_handle_rq() */
+};
+
+enum vhost_rdma_wqe_state { /* work queue element states; presumably walked in this order -- confirm against users */
+ WQE_STATE_POSTED,
+ WQE_STATE_PROCESSING,
+ WQE_STATE_PENDING,
+ WQE_STATE_DONE,
+ WQE_STATE_ERROR,
+};
+
+enum { /* task states; presumably for struct vhost_rdma_task -- confirm against users */
+ TASK_STATE_START = 0,
+ TASK_STATE_BUSY = 1,
+ TASK_STATE_ARMED = 2,
+};
+
+/**
+ * @brief Send Queue Work Request (WR) structure as placed in the SQ by the guest
+ *
+ * Represents a single WR submitted via the SQ. Contains metadata and SGE list.
+ * vhost_rdma_handle_sq() reads this layout directly from guest memory, so the
+ * field order and sizes are part of the guest/host interface.
+ */
+struct vhost_rdma_sq_req {
+ union {
+ __le32 num_sge; /**< Number of scatter-gather entries (read when SEND_INLINE is clear) */
+ __le16 inline_len; /**< Length of inline data (read when SEND_INLINE is set) */
+ };
+ __u8 send_flags; /**< Flags: FENCE, SIGNALED, SOLICITED, INLINE */
+ __u32 opcode; /**< Operation code (from vhost_rdma_ib_wr_opcode); follows a one-byte field, so padding precedes it on common ABIs */
+ __le64 wr_id; /**< User-defined WR identifier (passed back in CQE) */
+
+ /* Bit values for send_flags */
+#define VHOST_RDMA_IB_SEND_FENCE (1 << 0) /**< Fence: must wait for prior sends to complete */
+#define VHOST_RDMA_IB_SEND_SIGNALED (1 << 1) /**< Signaled: presumably requests a completion for this WR -- confirm */
+#define VHOST_RDMA_IB_SEND_SOLICITED (1 << 2) /**< Solicited event requested */
+#define VHOST_RDMA_IB_SEND_INLINE (1 << 3) /**< Data is inlined, not in MR */
+
+ __le32 imm_data; /**< Immediate data, used in WRITE/SEND_WITH_IMM. NOTE(review): declared __le32 although it was described as network byte order -- confirm */
+
+ union {
+ __le32 imm_data; /**< Immediate data. NOTE(review): duplicates the imm_data field above -- confirm which one is used */
+ __u32 invalidate_rkey; /**< For fast memory registration invalidation */
+ } ex;
+
+ union {
+ struct {
+ __le64 remote_addr; /**< Target address in remote memory */
+ __le32 rkey; /**< Remote key for memory region access */
+ } rdma; /**< Used by RDMA_WRITE/READ operations */
+
+ struct {
+ __u64 remote_addr; /**< Address for atomic target */
+ __u64 compare_add; /**< Compare value in CMP-and-SWAP */
+ __u64 swap; /**< Swap value in atomic operations */
+ __u32 rkey; /**< Remote key */
+ } atomic; /**< Atomic operations (not yet fully supported) */
+
+ struct {
+ __le32 remote_qpn; /**< Destination QPN (for UD QPs) */
+ __le32 remote_qkey; /**< Q_Key for UD packet validation */
+ __le32 ah; /**< Address Handle index (pre-configured path info) */
+ } ud; /**< Used only in UD (Unreliable Datagram) mode */
+
+ __le64 reserved[4]; /**< Reserved for future extensions */
+ };
+
+ __le32 reserved2[3]; /**< Padding/reserved fields */
+
+ /*
+ * Scatter/Gather Element list follows this structure.
+ * Actual number determined by num_sge.
+ * Inline data may also follow for SEND_INLINE requests.
+ */
+ struct vhost_rdma_sge sg_list[]; /**< Flexible array of SGEs */
+};
+
+/**
+ * @brief Receive Queue Work Request (RQ) structure
+ *
+ * Posted by consumers to indicate where incoming messages should be written.
+ * vhost_rdma_handle_rq() reads this layout directly from guest memory.
+ */
+struct vhost_rdma_rq_req {
+ __le32 qpn; /**< Local QP number (not read by vhost_rdma_handle_rq()) */
+ __le32 num_sge; /**< Number of valid SGEs in sg_list */
+ __le64 wr_id; /**< User-provided WR ID returned upon receive completion */
+
+ /*
+ * Scatter/Gather Element list for receiving incoming payload.
+ * The receive WQE keeps a pointer to this array rather than a copy.
+ */
+ struct vhost_rdma_sge sg_list[]; /**< Flexible array of receive buffers */
+};
+
+/**
+ * @brief Work Completion Entry (CQE) format
+ *
+ * Populated when a WR completes and posted to the Completion Queue (CQ).
+ * vhost_rdma_cq_post() copies the whole structure into the guest buffer, so
+ * that buffer must be at least sizeof(struct vhost_rdma_cq_req) bytes.
+ */
+struct vhost_rdma_cq_req {
+ __le64 wr_id; /**< Echoed from the original WR */
+ __u8 status; /**< Completion status (from vhost_rdma_ib_wc_status) */
+ __u8 opcode; /**< Completed operation type (from vhost_rdma_ib_wc_opcode) */
+ __le16 padding; /**< Align to 32-bit boundary */
+ __le32 vendor_err; /**< Vendor-specific error code (if any) */
+ __le32 byte_len; /**< Number of bytes transferred */
+ __le32 imm_data; /**< Immediate data received (for SEND_WITH_IMM) */
+ __le32 qp_num; /**< Local QP number where WR was executed */
+ __le32 src_qp; /**< Source QP (valid only for UD receives) */
+#define VHOST_RDMA_IB_WC_GRH (1 << 0) /**< GRH header present in received packet */
+#define VHOST_RDMA_WC_WITH_IMM (1 << 1) /**< Immediate data is valid */
+ __le32 wc_flags; /**< Bitwise OR of the two WC flag values defined above */
+ __le32 reserved[3]; /**< Future use */
+};
+
+struct vhost_rdma_cmd_req_notify { /* request-notification command for a CQ */
+ /* The index of CQ */
+ uint32_t cqn;
+#define VHOST_RDMA_IB_NOTIFY_SOLICITED (1 << 0)
+#define VHOST_RDMA_IB_NOTIFY_NEXT_COMPLETION (1 << 1)
+#define VHOST_RDMA_IB_CQ_NEXT_COMP (1 << 2)
+#define VHOST_RDMA_IB_CQ_SOLICITED (1 << 3)
+ /* Notify flags. NOTE(review): vhost_rdma_cq_post() compares cq->notify with CQ_NEXT_COMP and NOTIFY_SOLICITED, one from each pair above -- confirm which pair is meant */
+ uint32_t flags;
+};
+
+/**
+ * Return the element slot of a queue for a given index.
+ *
+ * The slot is located elem_size * idx bytes past queue->data. The index is
+ * not range-checked here.
+ *
+ * @param queue  Queue whose element storage is addressed.
+ * @param idx    Element index (the send/receive kick handlers pass the descriptor index).
+ *
+ * @return Pointer to the element slot.
+ */
+static __rte_always_inline void *
+vhost_rdma_queue_get_data(struct vhost_rdma_queue *queue, size_t idx)
+{
+	const size_t offset = queue->elem_size * idx;
+
+	return queue->data + offset;
+}
+
+/*
+ * Function declarations
+ */
+
+/**
+ * @brief Initialize an internal Send WQE from a user WR
+ *
+ * @param qp Pointer to the QP owning the WQE
+ * @param wr User-submitted SQ request (source WR)
+ * @param mask Opcode mask for the request, as computed by wr_opcode_mask()
+ * @param length Total data length of the request
+ * @param wqe Output: initialized internal WQE
+ */
+void vhost_rdma_init_send_wqe(struct vhost_rdma_qp *qp,
+ struct vhost_rdma_sq_req *wr,
+ unsigned int mask,
+ unsigned int length,
+ struct vhost_rdma_send_wqe *wqe);
+
+/**
+ * @brief Process pending work requests on the Send Queue (SQ)
+ *
+ * Runs in datapath context; handles posting RDMA ops, sending packets, etc.
+ *
+ * @param arg Pointer to QP (passed as void*)
+ */
+void vhost_rdma_handle_sq(void *arg);
+
+/**
+ * @brief Process receive work requests newly posted to the Receive Queue (RQ)
+ *
+ * Kick handler: fills one receive WQE slot per posted descriptor.
+ *
+ * @param arg Pointer to QP (passed as void*); used despite the __rte_unused marker
+ */
+void vhost_rdma_handle_rq(__rte_unused void *arg);
+
+/**
+ * @brief Post a completion entry to a Completion Queue (CQ)
+ *
+ * @param dev Pointer to the vhost RDMA device
+ * @param cq Target CQ to post to
+ * @param cqe Completion entry to post
+ * @param solicited Whether this is a solicited completion (triggers interrupt)
+ *
+ * @return 0 on success, -EBUSY when no CQ descriptor is available, -EIO when the guest buffer is too small
+ */
+int vhost_rdma_cq_post(struct vhost_rdma_device *dev,
+ struct vhost_rdma_cq *cq,
+ struct vhost_rdma_cq_req *cqe,
+ int solicited);
+
+/**
+ * @brief Initialize a queue (SQ or RQ) associated with a QP
+ *
+ * Allocates the per-element storage, records the virtqueue, and registers the kick-fd interrupt callback.
+ *
+ * @param qp Owning QP
+ * @param queue Queue structure to initialize
+ * @param name Human-readable name (e.g., "sq", "rq")
+ * @param vq Underlying vhost_user_queue (from backend)
+ * @param elem_size Size of each element (WR size)
+ * @param type Queue type: SQ or RQ
+ *
+ * @return 0 on success, negative error code on failure
+ */
+int vhost_rdma_queue_init(struct vhost_rdma_qp *qp,
+ struct vhost_rdma_queue *queue,
+ const char *name,
+ struct vhost_user_queue *vq,
+ size_t elem_size,
+ enum vhost_rdma_queue_type type);
+
+/**
+ * @brief Clean up resources associated with a queue
+ *
+ * Unregisters the kick-fd interrupt callback and frees the per-element storage.
+ *
+ * @param qp Owning QP
+ * @param queue Queue to clean up
+ */
+void vhost_rdma_queue_cleanup(struct vhost_rdma_qp *qp,
+ struct vhost_rdma_queue *queue);
+
+void init_av_from_vhost_rdma(struct vhost_rdma_device *dev,
+ struct vhost_rdma_av *dst,
+ uint32_t ah);
+
+int vhost_rdma_init_task(struct vhost_rdma_task *task,
+ struct rte_ring *task_ring,
+ void *arg, int (*func)(void *),
+ const char *name);
+
+void vhost_rdma_run_task(struct vhost_rdma_task *task, int sched);
+
+void vhost_rdma_do_task(struct vhost_rdma_task *task);
+
+void vhost_rdma_qp_destroy(struct vhost_rdma_qp *qp);
+
+int vhost_rdma_qp_validate(struct vhost_rdma_device *dev,
+ struct vhost_rdma_qp *qp,
+ struct vhost_rdma_cmd_modify_qp *cmd);
+
+void vhost_rdma_qp_error(struct vhost_rdma_qp *qp);
+void vhost_rdma_qp_cleanup(void* arg);
+
+int vhost_rdma_requester(void* arg);
+int vhost_rdma_completer(void* arg);
+int vhost_rdma_responder(void* arg);
+
+bool vhost_rdma_ib_modify_qp_is_ok(enum vhost_rdma_ib_qp_state cur_state,
+ enum vhost_rdma_ib_qp_state next_state,
+ uint8_t type,
+ enum vhost_rdma_ib_qp_attr_mask mask);
+
+void vhost_rdma_init_av(struct vhost_rdma_device *dev,
+ struct vhost_rdma_ah_attr *attr,
+ struct vhost_rdma_av *av);
+
+void vhost_rdma_av_from_attr(struct vhost_rdma_av *av,
+ struct vhost_rdma_ah_attr *attr);
+
+void vhost_rdma_qp_destroy(struct vhost_rdma_qp *qp);
+
+int vhost_rdma_av_chk_attr(struct vhost_rdma_device *dev,
+ struct vhost_rdma_ah_attr *attr);
+
+#endif /* VHOST_RDMA_QUEUE_H_ */
\ No newline at end of file
--
2.43.0
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCH 07/14] examples/vhost_user_rdma: Implement high-performance requester engine with advanced flow control
2025-12-17 8:49 Implement initial driver for virtio-RDMA devices(kernel), virtio-rdma device model(qemu) and vhost-user-RDMA backend device(dpdk) Xiong Weimin
` (6 preceding siblings ...)
2025-12-17 8:49 ` [PATCH 06/14] examples/vhost_user_rdma: implement comprehensive queue pair lifecycle management Xiong Weimin
@ 2025-12-17 8:49 ` Xiong Weimin
2025-12-17 8:49 ` [PATCH 08/14] examples/vhost_user_rdma: implement advanced completer engine with reliability features Xiong Weimin
` (2 subsequent siblings)
10 siblings, 0 replies; 17+ messages in thread
From: Xiong Weimin @ 2025-12-17 8:49 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann, David S . Miller,
Jakub Kicinski, Jesper Dangaard Brouer, John Fastabend,
Stanislav Fomichev
Cc: linux-kernel, netdev, xiongweimin
From: xiongweimin <xiongweimin@kylinos.cn>
This commit adds the core requester engine for RDMA operations:
1. Work Queue Element (WQE) processing state machine
2. Flow control with window-based congestion avoidance
3. MTU-aware packet segmentation
4. Error handling with automatic retry mechanisms
5. Atomic operation support and resource management
Key features:
- PSN-based flow control for reliable connections (RC)
- UD MTU handling with simulated success for oversize packets
- Work request state management (DONE, ERROR, RETRY)
- Packet construction and transmission pipeline
- Memory buffer (mbuf) accounting for congestion control
- Atomic reference counting for safe resource handling
Signed-off-by: Xiong Weimin <xiongweimin@kylinos.cn>
Change-Id: Ib0873f3d56ff71ed9f51e47edfa972054145f226
---
examples/vhost_user_rdma/meson.build | 2 +
examples/vhost_user_rdma/vhost_rdma.h | 9 +
examples/vhost_user_rdma/vhost_rdma_crc.c | 163 ++++
examples/vhost_user_rdma/vhost_rdma_opcode.c | 141 +++-
examples/vhost_user_rdma/vhost_rdma_opcode.h | 335 ++++++--
examples/vhost_user_rdma/vhost_rdma_pkt.c | 221 +++++
examples/vhost_user_rdma/vhost_rdma_pkt.h | 31 +-
examples/vhost_user_rdma/vhost_rdma_queue.c | 826 ++++++++++++++++++-
examples/vhost_user_rdma/vhost_rdma_queue.h | 221 ++++-
9 files changed, 1855 insertions(+), 94 deletions(-)
create mode 100644 examples/vhost_user_rdma/vhost_rdma_crc.c
create mode 100644 examples/vhost_user_rdma/vhost_rdma_pkt.c
diff --git a/examples/vhost_user_rdma/meson.build b/examples/vhost_user_rdma/meson.build
index a032a27767..2a0a6ffc15 100644
--- a/examples/vhost_user_rdma/meson.build
+++ b/examples/vhost_user_rdma/meson.build
@@ -43,5 +43,7 @@ sources = files(
'vhost_rdma_ib.c',
'vhost_rdma_queue.c',
'vhost_rdma_opcode.c',
+ 'vhost_rdma_pkt.c',
+ 'vhost_rdma_crc.c',
)
diff --git a/examples/vhost_user_rdma/vhost_rdma.h b/examples/vhost_user_rdma/vhost_rdma.h
index 980bb74beb..bf772283b8 100644
--- a/examples/vhost_user_rdma/vhost_rdma.h
+++ b/examples/vhost_user_rdma/vhost_rdma.h
@@ -72,6 +72,8 @@ extern "C" {
#define VHOST_NET_RXQ 0
#define VHOST_NET_TXQ 1
+#define ROCE_V2_UDP_DPORT 4791
+
/* VIRTIO_F_EVENT_IDX is NOT supported now */
#define VHOST_RDMA_FEATURE ((1ULL << VIRTIO_F_VERSION_1) |\
(1ULL << VIRTIO_RING_F_INDIRECT_DESC) | \
@@ -457,6 +459,13 @@ static inline enum vhost_rdma_network_type rdma_gid_attr_network_type(const stru
return VHOST_RDMA_NETWORK_IPV6;
}
+static __rte_always_inline void
+vhost_rdma_counter_inc(struct vhost_rdma_device *dev,
+ enum vhost_rdma_counters index)
+{
+ rte_atomic64_inc(&dev->stats_counters[index]); /* atomically bump the device counter selected by index; index is not range-checked */
+}
+
int vhost_rdma_construct(struct vhost_rdma_device *dev, const char *path, int idx);
void vhost_rdma_net_construct(struct vhost_user_queue *queues, int idx);
void vs_vhost_rdma_net_setup(int vid);
diff --git a/examples/vhost_user_rdma/vhost_rdma_crc.c b/examples/vhost_user_rdma/vhost_rdma_crc.c
new file mode 100644
index 0000000000..7802bc61e1
--- /dev/null
+++ b/examples/vhost_user_rdma/vhost_rdma_crc.c
@@ -0,0 +1,163 @@
+/*
+ * Vhost-user RDMA device : Calculating the CRC of data packet
+ *
+ * Copyright (C) 2025 KylinSoft Inc. and/or its affiliates. All rights reserved.
+ *
+ * Author: Xiong Weimin <xiongweimin@kylinos.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+#include <stdint.h>
+
+#include <rte_mbuf.h>
+#include <rte_ip.h>
+#include <rte_udp.h>
+
+#include "vhost_rdma_opcode.h"
+#include "vhost_rdma_ib.h"
+#include "vhost_rdma_queue.h"
+#include "vhost_rdma.h"
+#include "vhost_rdma_pkt.h"
+
+/*
+ * Byte-indexed lookup table used by crc32() below: each step looks up
+ * crc_table[(crc ^ byte) & 0xff] and XORs it with (crc >> 8).
+ *
+ * Entry 128 is 0xedb88320, which is consistent with the table of the
+ * common reflected CRC-32 polynomial -- TODO confirm against a generator.
+ *
+ * NOTE(review): the table has external linkage although no header visible
+ * here declares it; confirm whether another file needs it before making
+ * it static.
+ */
+const uint32_t crc_table[256] = {
+ 0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L,
+ 0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L,
+ 0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L,
+ 0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL,
+ 0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L,
+ 0x646ba8c0L, 0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L,
+ 0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L,
+ 0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL,
+ 0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L,
+ 0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL,
+ 0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L,
+ 0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L,
+ 0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L,
+ 0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL,
+ 0x9fbfe4a5L, 0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL,
+ 0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L,
+ 0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL,
+ 0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L,
+ 0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L,
+ 0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L,
+ 0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL,
+ 0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L,
+ 0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L,
+ 0xc90c2086L, 0x5768b525L, 0x206f85b3L, 0xb966d409L, 0xce61e49fL,
+ 0x5edef90eL, 0x29d9c998L, 0xb0d09822L, 0xc7d7a8b4L, 0x59b33d17L,
+ 0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L,
+ 0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L,
+ 0x73dc1683L, 0xe3630b12L, 0x94643b84L, 0x0d6d6a3eL, 0x7a6a5aa8L,
+ 0xe40ecf0bL, 0x9309ff9dL, 0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L,
+ 0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL,
+ 0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL,
+ 0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 0x60b08ed5L,
+ 0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L,
+ 0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL,
+ 0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL,
+ 0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L,
+ 0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL,
+ 0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L,
+ 0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL,
+ 0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L,
+ 0x95bf4a82L, 0xe2b87a14L, 0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL,
+ 0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L,
+ 0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L,
+ 0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 0x66063bcaL, 0x11010b5cL,
+ 0x8f659effL, 0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L,
+ 0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L,
+ 0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L,
+ 0x37d83bf0L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L,
+ 0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L,
+ 0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L,
+ 0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL,
+ 0x2d02ef8dL
+};
+
+/* Placeholder written into checksum fields that are masked out of the ICRC. */
+#define CSUM_MANGLED_0 0xffff
+
+/**
+ * Fold a byte buffer into a running table-driven CRC.
+ *
+ * No initial or final inversion is applied here; the caller supplies the
+ * seed and post-processes the result as needed.
+ *
+ * @param crc Running CRC value (seed on the first call).
+ * @param buf Start of the data to hash.
+ * @param len Number of bytes to hash; 0 returns @p crc unchanged.
+ * @return Updated CRC value.
+ */
+uint32_t
+crc32(uint32_t crc, void* buf, uint32_t len)
+{
+	const uint8_t *cursor = buf;
+	const uint8_t *end = cursor + len;
+
+	for (; cursor != end; cursor++)
+		crc = crc_table[(crc ^ *cursor) & 0xff] ^ (crc >> 8);
+
+	return crc;
+}
+
+/**
+ * Compute the header part of a packet's ICRC.
+ *
+ * A scratch copy of the IP + UDP + BTH headers is built with the mutable
+ * fields forced to all-ones (IPv4: TTL, TOS, header checksum; IPv6:
+ * traffic class, flow label, hop limit; UDP checksum; the top byte of the
+ * BTH qpn word), hashed, and then the remaining transport headers that
+ * follow the BTH are hashed straight from the packet.
+ *
+ * @param pkt  Packet context; pkt->hdr points at the BTH and the IP/UDP
+ *             headers are expected immediately in front of it.
+ * @param mbuf Buffer whose l3_type selects IPv4; any other value is
+ *             treated as IPv6.
+ * @return CRC state after the headers (payload not included).
+ */
+uint32_t
+vhost_rdma_icrc_hdr(struct vhost_rdma_pkt_info *pkt, struct rte_mbuf *mbuf)
+{
+	struct rte_ipv4_hdr *ip4h;
+	struct rte_ipv6_hdr *ip6h;
+	struct rte_udp_hdr *udph;
+	struct vhost_bth *bth;
+	/* Unsigned: the seed below does not fit in a signed 32-bit int. */
+	uint32_t crc;
+	int hdr_size = sizeof(struct rte_udp_hdr) +
+		(mbuf->l3_type == VHOST_NETWORK_TYPE_IPV4 ?
+		sizeof(struct rte_ipv4_hdr) : sizeof(struct rte_ipv6_hdr));
+	/* The pseudo header buffer is sized for the IPv6 header because it
+	 * is larger than the IPv4 one.
+	 */
+	uint8_t pshdr[sizeof(struct rte_udp_hdr) +
+		sizeof(struct rte_ipv6_hdr) +
+		VHOST_BTH_BYTES];
+
+	/* This seed is the result of computing a CRC with an all-ones seed
+	 * over 8 bytes of 0xff representing a masked LRH.
+	 */
+	crc = 0xdebb20e3;
+
+	if (mbuf->l3_type == VHOST_NETWORK_TYPE_IPV4) { /* IPv4 */
+		rte_memcpy(pshdr, ip_hdr(pkt), hdr_size);
+		ip4h = (struct rte_ipv4_hdr *)pshdr;
+		udph = (struct rte_udp_hdr *)(ip4h + 1);
+
+		ip4h->time_to_live = 0xff;
+		ip4h->hdr_checksum = CSUM_MANGLED_0;
+		ip4h->type_of_service = 0xff;
+	} else { /* IPv6 */
+		rte_memcpy(pshdr, ipv6_hdr(pkt), hdr_size);
+		ip6h = (struct rte_ipv6_hdr *)pshdr;
+		udph = (struct rte_udp_hdr *)(ip6h + 1);
+
+		/* Force traffic class and flow label to ones but keep the
+		 * version nibble: a plain assignment would zero it and
+		 * change the resulting CRC.
+		 */
+		ip6h->vtc_flow |= rte_cpu_to_be_32(RTE_IPV6_HDR_FL_MASK |
+						   RTE_IPV6_HDR_TC_MASK);
+		ip6h->hop_limits = 0xff;
+	}
+	udph->dgram_cksum = CSUM_MANGLED_0;
+
+	/* The BTH copy goes right behind the IP + UDP headers. */
+	rte_memcpy(&pshdr[hdr_size], pkt->hdr, VHOST_BTH_BYTES);
+	bth = (struct vhost_bth *)&pshdr[hdr_size];
+
+	/* exclude bth.resv8a */
+	bth->qpn |= rte_cpu_to_be_32(~VHOST_RDMA_QPN_MASK);
+
+	crc = crc32(crc, pshdr, hdr_size + VHOST_BTH_BYTES);
+
+	/* And finish computing the CRC on the remainder of the headers. */
+	crc = crc32(crc, pkt->hdr + VHOST_BTH_BYTES,
+		    vhost_rdma_opcode[pkt->opcode].length - VHOST_BTH_BYTES);
+	return crc;
+}
+
diff --git a/examples/vhost_user_rdma/vhost_rdma_opcode.c b/examples/vhost_user_rdma/vhost_rdma_opcode.c
index 4284a405f5..fbbed5b0e2 100644
--- a/examples/vhost_user_rdma/vhost_rdma_opcode.c
+++ b/examples/vhost_user_rdma/vhost_rdma_opcode.c
@@ -891,4 +891,143 @@ struct vhost_rdma_opcode_info vhost_rdma_opcode[VHOST_NUM_OPCODE] = {
}
},
-};
\ No newline at end of file
+};
+
+/**
+ * Pick the next RC wire opcode for a work request.
+ *
+ * @param qp     Queue pair; qp->req.opcode is the previously chosen opcode
+ *               and tells whether a multi-packet message is in progress.
+ * @param opcode Work-request opcode (VHOST_RDMA_IB_WR_*).
+ * @param fits   Non-zero when the remaining data fits in one packet.
+ * @return IB_OPCODE_RC_* value, or -EINVAL for an unhandled WR opcode.
+ */
+static int
+next_opcode_rc(struct vhost_rdma_qp *qp, uint32_t opcode, int fits)
+{
+	int continuing;
+
+	switch (opcode) {
+	case VHOST_RDMA_IB_WR_RDMA_WRITE:
+	case VHOST_RDMA_IB_WR_RDMA_WRITE_WITH_IMM:
+		continuing = (qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_FIRST ||
+			      qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_MIDDLE);
+
+		/* More data follows: FIRST or MIDDLE, immediate or not. */
+		if (!fits)
+			return continuing ? IB_OPCODE_RC_RDMA_WRITE_MIDDLE :
+					    IB_OPCODE_RC_RDMA_WRITE_FIRST;
+
+		if (opcode == VHOST_RDMA_IB_WR_RDMA_WRITE)
+			return continuing ? IB_OPCODE_RC_RDMA_WRITE_LAST :
+					    IB_OPCODE_RC_RDMA_WRITE_ONLY;
+
+		return continuing ?
+			IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE :
+			IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE;
+
+	case VHOST_RDMA_IB_WR_SEND:
+	case VHOST_RDMA_IB_WR_SEND_WITH_IMM:
+		continuing = (qp->req.opcode == IB_OPCODE_RC_SEND_FIRST ||
+			      qp->req.opcode == IB_OPCODE_RC_SEND_MIDDLE);
+
+		if (!fits)
+			return continuing ? IB_OPCODE_RC_SEND_MIDDLE :
+					    IB_OPCODE_RC_SEND_FIRST;
+
+		if (opcode == VHOST_RDMA_IB_WR_SEND)
+			return continuing ? IB_OPCODE_RC_SEND_LAST :
+					    IB_OPCODE_RC_SEND_ONLY;
+
+		return continuing ?
+			IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE :
+			IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE;
+
+	case VHOST_RDMA_IB_WR_RDMA_READ:
+		return IB_OPCODE_RC_RDMA_READ_REQUEST;
+
+	default:
+		break;
+	}
+
+	return -EINVAL;
+}
+
+/**
+ * Pick the next UC wire opcode for a work request.
+ *
+ * @param qp     Queue pair; qp->req.opcode is the previously chosen opcode
+ *               and tells whether a multi-packet message is in progress.
+ * @param opcode Work-request opcode (VHOST_RDMA_IB_WR_*).
+ * @param fits   Non-zero when the remaining data fits in one packet.
+ * @return IB_OPCODE_UC_* value, or -EINVAL for an unhandled WR opcode.
+ */
+static int
+next_opcode_uc(struct vhost_rdma_qp *qp, uint32_t opcode, int fits)
+{
+	int continuing;
+
+	switch (opcode) {
+	case VHOST_RDMA_IB_WR_RDMA_WRITE:
+	case VHOST_RDMA_IB_WR_RDMA_WRITE_WITH_IMM:
+		continuing = (qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_FIRST ||
+			      qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_MIDDLE);
+
+		/* More data follows: FIRST or MIDDLE, immediate or not. */
+		if (!fits)
+			return continuing ? IB_OPCODE_UC_RDMA_WRITE_MIDDLE :
+					    IB_OPCODE_UC_RDMA_WRITE_FIRST;
+
+		if (opcode == VHOST_RDMA_IB_WR_RDMA_WRITE)
+			return continuing ? IB_OPCODE_UC_RDMA_WRITE_LAST :
+					    IB_OPCODE_UC_RDMA_WRITE_ONLY;
+
+		return continuing ?
+			IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE :
+			IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE;
+
+	case VHOST_RDMA_IB_WR_SEND:
+	case VHOST_RDMA_IB_WR_SEND_WITH_IMM:
+		continuing = (qp->req.opcode == IB_OPCODE_UC_SEND_FIRST ||
+			      qp->req.opcode == IB_OPCODE_UC_SEND_MIDDLE);
+
+		if (!fits)
+			return continuing ? IB_OPCODE_UC_SEND_MIDDLE :
+					    IB_OPCODE_UC_SEND_FIRST;
+
+		if (opcode == VHOST_RDMA_IB_WR_SEND)
+			return continuing ? IB_OPCODE_UC_SEND_LAST :
+					    IB_OPCODE_UC_SEND_ONLY;
+
+		return continuing ?
+			IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE :
+			IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE;
+
+	default:
+		break;
+	}
+
+	return -EINVAL;
+}
+
+/**
+ * Choose the wire opcode for the next packet of a send work request.
+ *
+ * RC and UC queue pairs are delegated to their per-transport helpers;
+ * SMI, UD and GSI queue pairs only support single-packet sends.
+ *
+ * @param qp     Queue pair issuing the request.
+ * @param wqe    Send WQE; its remaining byte count is compared with the
+ *               QP MTU to decide whether the data fits in one packet.
+ * @param opcode Work-request opcode (VHOST_RDMA_IB_WR_*).
+ * @return IB opcode value, or -EINVAL when the QP type / WR opcode
+ *         combination is not handled.
+ */
+int vhost_rdma_next_opcode(struct vhost_rdma_qp *qp,
+			   struct vhost_rdma_send_wqe *wqe,
+			   uint32_t opcode)
+{
+	int fits = (wqe->dma.resid <= qp->mtu);
+	int result = -EINVAL;
+
+	switch (qp->type) {
+	case VHOST_RDMA_IB_QPT_RC:
+		result = next_opcode_rc(qp, opcode, fits);
+		break;
+
+	case VHOST_RDMA_IB_QPT_UC:
+		result = next_opcode_uc(qp, opcode, fits);
+		break;
+
+	case VHOST_RDMA_IB_QPT_SMI:
+	case VHOST_RDMA_IB_QPT_UD:
+	case VHOST_RDMA_IB_QPT_GSI:
+		if (opcode == VHOST_RDMA_IB_WR_SEND)
+			result = IB_OPCODE_UD_SEND_ONLY;
+		else if (opcode == VHOST_RDMA_IB_WR_SEND_WITH_IMM)
+			result = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+		break;
+
+	default:
+		break;
+	}
+
+	return result;
+}
\ No newline at end of file
diff --git a/examples/vhost_user_rdma/vhost_rdma_opcode.h b/examples/vhost_user_rdma/vhost_rdma_opcode.h
index b8f48bcdf5..6c3660f36b 100644
--- a/examples/vhost_user_rdma/vhost_rdma_opcode.h
+++ b/examples/vhost_user_rdma/vhost_rdma_opcode.h
@@ -24,6 +24,7 @@
#include <rte_interrupts.h>
#include "vhost_rdma_ib.h"
+#include "vhost_rdma_pkt.h"
/** Maximum number of QP types supported for WR mask dispatching */
#define WR_MAX_QPT 8
@@ -38,6 +39,92 @@
/* Invalid opcode marker */
#define OPCODE_NONE (-1)
+/*
+ * The next four masks together cover all 8 bits of a byte.
+ * VHOST_RDMA_PAD_MASK is applied to vhost_bth.flags by __bth_pad(); the
+ * other three presumably select the remaining bits of that same byte
+ * (TODO confirm).
+ */
+#define VHOST_RDMA_SE_MASK (0x80)
+#define VHOST_RDMA_MIG_MASK (0x40)
+#define VHOST_RDMA_PAD_MASK (0x30)
+#define VHOST_RDMA_TVER_MASK (0x0f)
+/*
+ * These four masks together cover all 32 bits of a word.
+ * VHOST_RDMA_QPN_MASK is applied to vhost_bth.qpn by the ICRC code.
+ */
+#define VHOST_RDMA_FECN_MASK (0x80000000)
+#define VHOST_RDMA_BECN_MASK (0x40000000)
+#define VHOST_RDMA_RESV6A_MASK (0x3f000000)
+#define VHOST_RDMA_QPN_MASK (0x00ffffff)
+/*
+ * These three masks also cover all 32 bits of a word, presumably
+ * vhost_bth.apsn (TODO confirm).
+ */
+#define VHOST_RDMA_ACK_MASK (0x80000000)
+#define VHOST_RDMA_RESV7_MASK (0x7f000000)
+#define VHOST_RDMA_PSN_MASK (0x00ffffff)
+
+/**
+ * @defgroup hdr_types Header Types (for offset tracking)
+ *
+ * Each enumerator is both an index into vhost_rdma_opcode_info.offset[]
+ * and the bit position of the matching VHOST_*_MASK presence flag.
+ * @{
+ */
+enum vhost_rdma_hdr_type {
+ VHOST_RDMA_LRH, /**< Link Layer Header (InfiniBand only) */
+ VHOST_RDMA_GRH, /**< Global Route Header (IPv6-style GIDs) */
+ VHOST_RDMA_BTH, /**< Base Transport Header */
+ VHOST_RDMA_RETH, /**< RDMA Extended Transport Header */
+ VHOST_RDMA_AETH, /**< Acknowledge/Error Header */
+ VHOST_RDMA_ATMETH, /**< Atomic Operation Request Header */
+ VHOST_RDMA_ATMACK, /**< Atomic Operation Response Header */
+ VHOST_RDMA_IETH, /**< Immediate Data + Error Code Header */
+ VHOST_RDMA_RDETH, /**< Reliable Datagram Extended Transport Header */
+ VHOST_RDMA_DETH, /**< Datagram Endpoint Identifier Header */
+ VHOST_RDMA_IMMDT, /**< Immediate Data Header */
+ VHOST_RDMA_PAYLOAD, /**< Payload section */
+ NUM_HDR_TYPES /**< Number of known header types */
+};
+/** @} */
+
+/**
+ * @defgroup hdr_masks Header Presence and Semantic Flags
+ *
+ * The low NUM_HDR_TYPES bits mirror enum vhost_rdma_hdr_type (one presence
+ * bit per header); the semantic flags are allocated above them.
+ * @{
+ */
+enum vhost_rdma_hdr_mask {
+ VHOST_LRH_MASK = BIT(VHOST_RDMA_LRH),
+ VHOST_GRH_MASK = BIT(VHOST_RDMA_GRH),
+ VHOST_BTH_MASK = BIT(VHOST_RDMA_BTH),
+ VHOST_IMMDT_MASK = BIT(VHOST_RDMA_IMMDT),
+ VHOST_RETH_MASK = BIT(VHOST_RDMA_RETH),
+ VHOST_AETH_MASK = BIT(VHOST_RDMA_AETH),
+ VHOST_ATMETH_MASK = BIT(VHOST_RDMA_ATMETH),
+ VHOST_ATMACK_MASK = BIT(VHOST_RDMA_ATMACK),
+ VHOST_IETH_MASK = BIT(VHOST_RDMA_IETH),
+ VHOST_RDETH_MASK = BIT(VHOST_RDMA_RDETH),
+ VHOST_DETH_MASK = BIT(VHOST_RDMA_DETH),
+ VHOST_PAYLOAD_MASK = BIT(VHOST_RDMA_PAYLOAD),
+
+ /* Semantic packet type flags */
+ VHOST_REQ_MASK = BIT(NUM_HDR_TYPES + 0), /**< Request packet */
+ VHOST_ACK_MASK = BIT(NUM_HDR_TYPES + 1), /**< ACK/NACK packet */
+ VHOST_SEND_MASK = BIT(NUM_HDR_TYPES + 2), /**< Send operation */
+ VHOST_WRITE_MASK = BIT(NUM_HDR_TYPES + 3), /**< RDMA Write */
+ VHOST_READ_MASK = BIT(NUM_HDR_TYPES + 4), /**< RDMA Read */
+ VHOST_ATOMIC_MASK = BIT(NUM_HDR_TYPES + 5), /**< Atomic operation */
+
+ /* Receive-side handling flags.
+  * NOTE(review): the RWR description below is kept from the original
+  * and could not be confirmed from the visible code.
+  */
+ VHOST_RWR_MASK = BIT(NUM_HDR_TYPES + 6), /**< RDMA with Immediate + Invalidate */
+ VHOST_COMP_MASK = BIT(NUM_HDR_TYPES + 7), /**< Completion required */
+
+ /* Packet fragmentation flags */
+ VHOST_START_MASK = BIT(NUM_HDR_TYPES + 8), /**< First fragment */
+ VHOST_MIDDLE_MASK = BIT(NUM_HDR_TYPES + 9), /**< Middle fragment */
+ VHOST_END_MASK = BIT(NUM_HDR_TYPES + 10), /**< Last fragment */
+
+ /* Bit NUM_HDR_TYPES + 11 is not assigned here. */
+ VHOST_LOOPBACK_MASK = BIT(NUM_HDR_TYPES + 12), /**< Loopback within host */
+
+ /* Composite masks */
+ VHOST_READ_OR_ATOMIC = (VHOST_READ_MASK | VHOST_ATOMIC_MASK),
+ VHOST_WRITE_OR_SEND = (VHOST_WRITE_MASK | VHOST_SEND_MASK),
+};
+/** @} */
+
+/**
+ * @brief Per-opcode metadata for parsing and validation
+ *
+ * One entry per IB opcode in the vhost_rdma_opcode[] table.
+ */
+struct vhost_rdma_opcode_info {
+ const char *name; /**< Opcode name (e.g., "RC SEND_FIRST") */
+ int length; /**< Header length in bytes counted from the BTH; the ICRC code hashes (length - VHOST_BTH_BYTES) bytes after the BTH */
+ int offset[NUM_HDR_TYPES]; /**< Offset of each header relative to the BTH pointer (indexed by enum vhost_rdma_hdr_type) */
+ enum vhost_rdma_hdr_mask mask; /**< Header presence and semantic flags */
+};
+
+/* Global opcode info table (indexed by IB opcode byte) */
+extern struct vhost_rdma_opcode_info vhost_rdma_opcode[VHOST_NUM_OPCODE];
+
struct vhost_bth {
uint8_t opcode;
uint8_t flags;
@@ -46,21 +133,192 @@ struct vhost_bth {
rte_be32_t apsn;
};
+/** Extract the pad field (bits 5:4 of the flags byte) from a raw BTH. */
+static inline uint8_t __bth_pad(void *arg)
+{
+	const struct vhost_bth *hdr = arg;
+	uint8_t pad_bits = hdr->flags & VHOST_RDMA_PAD_MASK;
+
+	return pad_bits >> 4;
+}
+
+/** Extract the pad field from the BTH that pkt->hdr points at. */
+static inline uint8_t bth_pad(struct vhost_rdma_pkt_info *pkt)
+{
+	void *bth = pkt->hdr;
+
+	return __bth_pad(bth);
+}
+
struct vhost_deth {
rte_be32_t qkey;
rte_be32_t sqp;
};
+#define GSI_QKEY (0x80010000)
+#define DETH_SQP_MASK (0x00ffffff)
+
+/** Read the Q_Key from a raw DETH, converted to host byte order. */
+static inline uint32_t __deth_qkey(void *arg)
+{
+	return rte_be_to_cpu_32(((struct vhost_deth *)arg)->qkey);
+}
+
+/** Store a host-order Q_Key into a raw DETH in big-endian form. */
+static inline void __deth_set_qkey(void *arg, uint32_t qkey)
+{
+	((struct vhost_deth *)arg)->qkey = rte_cpu_to_be_32(qkey);
+}
+
+/** Read the source QP number (low 24 bits) from a raw DETH. */
+static inline uint32_t __deth_sqp(void *arg)
+{
+	uint32_t word = rte_be_to_cpu_32(((struct vhost_deth *)arg)->sqp);
+
+	return word & DETH_SQP_MASK;
+}
+
+/** Store the source QP number into a raw DETH; upper 8 bits are cleared. */
+static inline void __deth_set_sqp(void *arg, uint32_t sqp)
+{
+	uint32_t word = sqp & DETH_SQP_MASK;
+
+	((struct vhost_deth *)arg)->sqp = rte_cpu_to_be_32(word);
+}
+
+/** Q_Key of the DETH located via the packet's opcode offset table. */
+static inline uint32_t deth_qkey(struct vhost_rdma_pkt_info *pkt)
+{
+	uint8_t *deth = pkt->hdr +
+		vhost_rdma_opcode[pkt->opcode].offset[VHOST_RDMA_DETH];
+
+	return __deth_qkey(deth);
+}
+
+/** Set the Q_Key of the DETH located via the packet's opcode offset table. */
+static inline void deth_set_qkey(struct vhost_rdma_pkt_info *pkt, uint32_t qkey)
+{
+	uint8_t *deth = pkt->hdr +
+		vhost_rdma_opcode[pkt->opcode].offset[VHOST_RDMA_DETH];
+
+	__deth_set_qkey(deth, qkey);
+}
+
+/** Source QP number of the DETH located via the packet's opcode offset table. */
+static inline uint32_t deth_sqp(struct vhost_rdma_pkt_info *pkt)
+{
+	uint8_t *deth = pkt->hdr +
+		vhost_rdma_opcode[pkt->opcode].offset[VHOST_RDMA_DETH];
+
+	return __deth_sqp(deth);
+}
+
+/** Set the source QP number of the DETH located via the opcode offset table. */
+static inline void deth_set_sqp(struct vhost_rdma_pkt_info *pkt, uint32_t sqp)
+{
+	uint8_t *deth = pkt->hdr +
+		vhost_rdma_opcode[pkt->opcode].offset[VHOST_RDMA_DETH];
+
+	__deth_set_sqp(deth, sqp);
+}
+
struct vhost_immdt {
rte_be32_t imm;
};
+/** Read the immediate value from a raw IMMDT header; no byte swap is done. */
+static inline rte_be32_t __immdt_imm(void *arg)
+{
+	return ((struct vhost_immdt *)arg)->imm;
+}
+
+/** Store an immediate value into a raw IMMDT header; no byte swap is done. */
+static inline void __immdt_set_imm(void *arg, rte_be32_t imm)
+{
+	((struct vhost_immdt *)arg)->imm = imm;
+}
+
+/** Immediate value of the IMMDT located via the packet's opcode offset table. */
+static inline rte_be32_t immdt_imm(struct vhost_rdma_pkt_info *pkt)
+{
+	uint8_t *immdt = pkt->hdr +
+		vhost_rdma_opcode[pkt->opcode].offset[VHOST_RDMA_IMMDT];
+
+	return __immdt_imm(immdt);
+}
+
+/** Set the immediate value of the IMMDT located via the opcode offset table. */
+static inline void immdt_set_imm(struct vhost_rdma_pkt_info *pkt, rte_be32_t imm)
+{
+	uint8_t *immdt = pkt->hdr +
+		vhost_rdma_opcode[pkt->opcode].offset[VHOST_RDMA_IMMDT];
+
+	__immdt_set_imm(immdt, imm);
+}
+
struct vhost_reth {
rte_be64_t va;
rte_be32_t rkey;
rte_be32_t len;
};
+/** Read the virtual address from a raw RETH, converted to host byte order. */
+static inline uint64_t __reth_va(void *arg)
+{
+	return rte_be_to_cpu_64(((struct vhost_reth *)arg)->va);
+}
+
+/** Store a host-order virtual address into a raw RETH. */
+static inline void __reth_set_va(void *arg, uint64_t va)
+{
+	((struct vhost_reth *)arg)->va = rte_cpu_to_be_64(va);
+}
+
+/** Read the remote key from a raw RETH, converted to host byte order. */
+static inline uint32_t __reth_rkey(void *arg)
+{
+	return rte_be_to_cpu_32(((struct vhost_reth *)arg)->rkey);
+}
+
+/** Store a host-order remote key into a raw RETH. */
+static inline void __reth_set_rkey(void *arg, uint32_t rkey)
+{
+	((struct vhost_reth *)arg)->rkey = rte_cpu_to_be_32(rkey);
+}
+
+/** Read the length field from a raw RETH, converted to host byte order. */
+static inline uint32_t __reth_len(void *arg)
+{
+	return rte_be_to_cpu_32(((struct vhost_reth *)arg)->len);
+}
+
+/** Store a host-order length into a raw RETH. */
+static inline void __reth_set_len(void *arg, uint32_t len)
+{
+	((struct vhost_reth *)arg)->len = rte_cpu_to_be_32(len);
+}
+
+/** Virtual address of the RETH located via the packet's opcode offset table. */
+static inline uint64_t reth_va(struct vhost_rdma_pkt_info *pkt)
+{
+	uint8_t *reth = pkt->hdr +
+		vhost_rdma_opcode[pkt->opcode].offset[VHOST_RDMA_RETH];
+
+	return __reth_va(reth);
+}
+
+/** Set the virtual address of the RETH located via the opcode offset table. */
+static inline void reth_set_va(struct vhost_rdma_pkt_info *pkt, uint64_t va)
+{
+	uint8_t *reth = pkt->hdr +
+		vhost_rdma_opcode[pkt->opcode].offset[VHOST_RDMA_RETH];
+
+	__reth_set_va(reth, va);
+}
+
+/** Remote key of the RETH located via the packet's opcode offset table. */
+static inline uint32_t reth_rkey(struct vhost_rdma_pkt_info *pkt)
+{
+	uint8_t *reth = pkt->hdr +
+		vhost_rdma_opcode[pkt->opcode].offset[VHOST_RDMA_RETH];
+
+	return __reth_rkey(reth);
+}
+
+/** Set the remote key of the RETH located via the opcode offset table. */
+static inline void reth_set_rkey(struct vhost_rdma_pkt_info *pkt, uint32_t rkey)
+{
+	uint8_t *reth = pkt->hdr +
+		vhost_rdma_opcode[pkt->opcode].offset[VHOST_RDMA_RETH];
+
+	__reth_set_rkey(reth, rkey);
+}
+
+/** Length field of the RETH located via the packet's opcode offset table. */
+static inline uint32_t reth_len(struct vhost_rdma_pkt_info *pkt)
+{
+	uint8_t *reth = pkt->hdr +
+		vhost_rdma_opcode[pkt->opcode].offset[VHOST_RDMA_RETH];
+
+	return __reth_len(reth);
+}
+
+/** Set the length field of the RETH located via the opcode offset table. */
+static inline void reth_set_len(struct vhost_rdma_pkt_info *pkt, uint32_t len)
+{
+	uint8_t *reth = pkt->hdr +
+		vhost_rdma_opcode[pkt->opcode].offset[VHOST_RDMA_RETH];
+
+	__reth_set_len(reth, len);
+}
+
struct vhost_aeth {
rte_be32_t smsn;
};
@@ -252,79 +510,8 @@ static inline unsigned int wr_opcode_mask(int opcode, struct vhost_rdma_qp *qp)
return vhost_rdma_wr_opcode_info[opcode].mask[qp->type];
}
-/**
- * @defgroup hdr_types Header Types (for offset tracking)
- * @{
- */
-enum vhost_rdma_hdr_type {
- VHOST_RDMA_LRH, /**< Link Layer Header (InfiniBand only) */
- VHOST_RDMA_GRH, /**< Global Route Header (IPv6-style GIDs) */
- VHOST_RDMA_BTH, /**< Base Transport Header */
- VHOST_RDMA_RETH, /**< RDMA Extended Transport Header */
- VHOST_RDMA_AETH, /**< Acknowledge/Error Header */
- VHOST_RDMA_ATMETH, /**< Atomic Operation Request Header */
- VHOST_RDMA_ATMACK, /**< Atomic Operation Response Header */
- VHOST_RDMA_IETH, /**< Immediate Data + Error Code Header */
- VHOST_RDMA_RDETH, /**< Reliable Datagram Extended Transport Header */
- VHOST_RDMA_DETH, /**< Datagram Endpoint Identifier Header */
- VHOST_RDMA_IMMDT, /**< Immediate Data Header */
- VHOST_RDMA_PAYLOAD, /**< Payload section */
- NUM_HDR_TYPES /**< Number of known header types */
-};
-
-/**
- * @defgroup hdr_masks Header Presence and Semantic Flags
- * @{
- */
-enum vhost_rdma_hdr_mask {
- VHOST_LRH_MASK = BIT(VHOST_RDMA_LRH),
- VHOST_GRH_MASK = BIT(VHOST_RDMA_GRH),
- VHOST_BTH_MASK = BIT(VHOST_RDMA_BTH),
- VHOST_IMMDT_MASK = BIT(VHOST_RDMA_IMMDT),
- VHOST_RETH_MASK = BIT(VHOST_RDMA_RETH),
- VHOST_AETH_MASK = BIT(VHOST_RDMA_AETH),
- VHOST_ATMETH_MASK = BIT(VHOST_RDMA_ATMETH),
- VHOST_ATMACK_MASK = BIT(VHOST_RDMA_ATMACK),
- VHOST_IETH_MASK = BIT(VHOST_RDMA_IETH),
- VHOST_RDETH_MASK = BIT(VHOST_RDMA_RDETH),
- VHOST_DETH_MASK = BIT(VHOST_RDMA_DETH),
- VHOST_PAYLOAD_MASK = BIT(VHOST_RDMA_PAYLOAD),
-
- /* Semantic packet type flags */
- VHOST_REQ_MASK = BIT(NUM_HDR_TYPES + 0), /**< Request packet */
- VHOST_ACK_MASK = BIT(NUM_HDR_TYPES + 1), /**< ACK/NACK packet */
- VHOST_SEND_MASK = BIT(NUM_HDR_TYPES + 2), /**< Send operation */
- VHOST_WRITE_MASK = BIT(NUM_HDR_TYPES + 3), /**< RDMA Write */
- VHOST_READ_MASK = BIT(NUM_HDR_TYPES + 4), /**< RDMA Read */
- VHOST_ATOMIC_MASK = BIT(NUM_HDR_TYPES + 5), /**< Atomic operation */
-
- /* Packet fragmentation flags */
- VHOST_RWR_MASK = BIT(NUM_HDR_TYPES + 6), /**< RDMA with Immediate + Invalidate */
- VHOST_COMP_MASK = BIT(NUM_HDR_TYPES + 7), /**< Completion required */
-
- VHOST_START_MASK = BIT(NUM_HDR_TYPES + 8), /**< First fragment */
- VHOST_MIDDLE_MASK = BIT(NUM_HDR_TYPES + 9), /**< Middle fragment */
- VHOST_END_MASK = BIT(NUM_HDR_TYPES + 10), /**< Last fragment */
-
- VHOST_LOOPBACK_MASK = BIT(NUM_HDR_TYPES + 12), /**< Loopback within host */
-
- /* Composite masks */
- VHOST_READ_OR_ATOMIC = (VHOST_READ_MASK | VHOST_ATOMIC_MASK),
- VHOST_WRITE_OR_SEND = (VHOST_WRITE_MASK | VHOST_SEND_MASK),
-};
-/** @} */
-
-/**
- * @brief Per-opcode metadata for parsing and validation
- */
-struct vhost_rdma_opcode_info {
- const char *name; /**< Opcode name (e.g., "RC SEND_FIRST") */
- int length; /**< Fixed payload length (if any) */
- int offset[NUM_HDR_TYPES]; /**< Offset of each header within packet */
- enum vhost_rdma_hdr_mask mask; /**< Header presence and semantic flags */
-};
-
-/* Global opcode info table (indexed by IB opcode byte) */
-extern struct vhost_rdma_opcode_info vhost_rdma_opcode[VHOST_NUM_OPCODE];
+int vhost_rdma_next_opcode(struct vhost_rdma_qp *qp,
+ struct vhost_rdma_send_wqe *wqe,
+ uint32_t opcode);
#endif
\ No newline at end of file
diff --git a/examples/vhost_user_rdma/vhost_rdma_pkt.c b/examples/vhost_user_rdma/vhost_rdma_pkt.c
new file mode 100644
index 0000000000..27f7dd0647
--- /dev/null
+++ b/examples/vhost_user_rdma/vhost_rdma_pkt.c
@@ -0,0 +1,221 @@
+/*
+ * Vhost-user RDMA device : handling ipv4 or ipv6 hdr and data
+ *
+ * Copyright (C) 2025 KylinSoft Inc. and/or its affiliates. All rights reserved.
+ *
+ * Author: Xiong Weimin <xiongweimin@kylinos.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+#include <rte_mbuf.h>
+#include <rte_ether.h>
+#include <rte_ip.h>
+#include <rte_udp.h>
+
+#include "vhost_rdma_pkt.h"
+#include "vhost_rdma.h"
+#include "vhost_rdma_opcode.h"
+#include "vhost_rdma_queue.h"
+
+/**
+ * Derive a 6-byte MAC address from the device's first GID table entry.
+ *
+ * GID bytes 8, 9, 10, 13, 14 and 15 are copied in that order; bytes 11
+ * and 12 are skipped.
+ * NOTE(review): no bit of byte 8 is flipped, unlike the usual EUI-64 to
+ * MAC conversion -- confirm this matches how the GID is generated.
+ *
+ * @param dev Device providing gid_tbl[0].
+ * @param mac Output buffer of at least 6 bytes.
+ */
+static __rte_always_inline
+void default_gid_to_mac(struct vhost_rdma_device *dev, char *mac)
+{
+	const unsigned int gid_byte[6] = { 8, 9, 10, 13, 14, 15 };
+	struct vhost_rdma_gid *entry = &dev->gid_tbl[0];
+	unsigned int i;
+
+	for (i = 0; i < 6; i++)
+		mac[i] = entry->gid[gid_byte[i]];
+}
+
+/**
+ * Prepend a UDP header to the mbuf.
+ *
+ * The datagram length is taken from m->data_len after the prepend, so it
+ * covers the UDP header plus everything already in the first segment.
+ * The checksum field is written as 0.
+ * NOTE(review): the rte_pktmbuf_prepend() result is used unchecked, so the
+ * caller must guarantee enough headroom.
+ *
+ * @param m        Packet buffer to extend.
+ * @param src_port Source port, already in network byte order.
+ * @param dst_port Destination port, already in network byte order.
+ */
+static void prepare_udp_hdr(struct rte_mbuf *m,
+			    rte_be16_t src_port,
+			    rte_be16_t dst_port)
+{
+	struct rte_udp_hdr *hdr = (struct rte_udp_hdr *)
+		rte_pktmbuf_prepend(m, sizeof(struct rte_udp_hdr));
+	rte_be16_t wire_len = rte_cpu_to_be_16(m->data_len);
+
+	hdr->src_port = src_port;
+	hdr->dst_port = dst_port;
+	hdr->dgram_len = wire_len;
+	hdr->dgram_cksum = 0;
+}
+
+/**
+ * Prepend an IPv4 header (no options) to the mbuf.
+ *
+ * Every field of the header is written. packet_id and hdr_checksum are
+ * set to 0 so that no stale headroom bytes reach the wire; the checksum
+ * is expected to be filled in later by TX checksum offload.
+ * NOTE(review): the rte_pktmbuf_prepend() result is used unchecked, so the
+ * caller must guarantee enough headroom.
+ *
+ * @param m     Packet buffer to extend; total_length is m->data_len after
+ *              the prepend.
+ * @param saddr Source address, network byte order.
+ * @param daddr Destination address, network byte order.
+ * @param proto Next protocol id.
+ * @param tos   Type-of-service byte.
+ * @param ttl   Time to live.
+ * @param df    Value for the fragment_offset field, network byte order.
+ */
+static void prepare_ipv4_hdr(struct rte_mbuf *m,
+			     rte_be32_t saddr,
+			     rte_be32_t daddr,
+			     uint8_t proto,
+			     uint8_t tos,
+			     uint8_t ttl,
+			     rte_be16_t df)
+{
+	struct rte_ipv4_hdr *iph;
+
+	iph = (struct rte_ipv4_hdr *)rte_pktmbuf_prepend(m, sizeof(*iph));
+
+	iph->version_ihl = RTE_IPV4_VHL_DEF;
+	iph->type_of_service = tos;
+	iph->total_length = rte_cpu_to_be_16(m->data_len);
+	iph->packet_id = 0;
+	iph->fragment_offset = df;
+	iph->time_to_live = ttl;
+	iph->next_proto_id = proto;
+	/* Must be zero for hardware IP checksum offload. */
+	iph->hdr_checksum = 0;
+	iph->src_addr = saddr;
+	iph->dst_addr = daddr;
+}
+
+/**
+ * Write the first 32-bit word of an IPv6 header.
+ *
+ * @param hdr       Header to update.
+ * @param tclass    Traffic class, placed at bit 20 below the version (6).
+ * @param flowlabel Flow label already in network byte order; OR-ed in.
+ */
+static inline void ip6_flow_hdr(struct rte_ipv6_hdr *hdr, unsigned int tclass,
+				rte_be32_t flowlabel)
+{
+	rte_be32_t ver_tc = rte_cpu_to_be_32(0x60000000 | (tclass << 20));
+
+	hdr->vtc_flow = ver_tc | flowlabel;
+}
+
+/**
+ * Prepend an IPv6 header to the mbuf.
+ *
+ * The payload length is m->data_len after the prepend minus the IPv6
+ * header itself. The flow label is written as 0.
+ * NOTE(review): the rte_pktmbuf_prepend() result is used unchecked, so the
+ * caller must guarantee enough headroom.
+ *
+ * @param m     Packet buffer to extend.
+ * @param saddr Source address.
+ * @param daddr Destination address.
+ * @param proto Next-header value.
+ * @param prio  Traffic class.
+ * @param ttl   Hop limit.
+ */
+static void
+prepare_ipv6_hdr(struct rte_mbuf *m,
+		 struct in6_addr *saddr,
+		 struct in6_addr *daddr,
+		 uint8_t proto,
+		 uint8_t prio,
+		 uint8_t ttl)
+{
+	struct rte_ipv6_hdr *hdr = (struct rte_ipv6_hdr *)
+		rte_pktmbuf_prepend(m, sizeof(struct rte_ipv6_hdr));
+
+	ip6_flow_hdr(hdr, prio, rte_cpu_to_be_32(0));
+	hdr->payload_len = rte_cpu_to_be_16(m->data_len - sizeof(*hdr));
+	hdr->proto = proto;
+	hdr->hop_limits = ttl;
+	rte_memcpy(hdr->src_addr, saddr, sizeof(struct in6_addr));
+	rte_memcpy(hdr->dst_addr, daddr, sizeof(struct in6_addr));
+}
+
+/**
+ * Prepend UDP and IPv4 headers for a packet.
+ *
+ * Ports come from the QP source port and ROCE_V2_UDP_DPORT; addresses,
+ * traffic class and hop limit come from the packet's address vector. The
+ * don't-fragment flag is always set.
+ *
+ * @param pkt Packet context supplying the QP and address vector.
+ * @param m   Packet buffer to extend.
+ * @return Always 0.
+ */
+static int
+prepare4(struct vhost_rdma_pkt_info *pkt, struct rte_mbuf *m)
+{
+	struct vhost_rdma_qp *qp = pkt->qp;
+	struct vhost_rdma_av *av = vhost_rdma_get_av(pkt);
+	rte_be16_t sport = rte_cpu_to_be_16(qp->src_port);
+	rte_be16_t dport = rte_cpu_to_be_16(ROCE_V2_UDP_DPORT);
+
+	prepare_udp_hdr(m, sport, dport);
+
+	/* FIXME: check addr */
+	prepare_ipv4_hdr(m,
+			 av->sgid_addr._sockaddr_in.sin_addr.s_addr,
+			 av->dgid_addr._sockaddr_in.sin_addr.s_addr,
+			 IPPROTO_UDP,
+			 av->grh.traffic_class,
+			 av->grh.hop_limit,
+			 rte_cpu_to_be_16(RTE_IPV4_HDR_DF_FLAG));
+
+	return 0;
+}
+
+/**
+ * Prepend UDP and IPv6 headers for a packet.
+ *
+ * Ports come from the QP source port and ROCE_V2_UDP_DPORT; addresses,
+ * traffic class and hop limit come from the packet's address vector.
+ *
+ * @param pkt Packet context supplying the QP and address vector.
+ * @param m   Packet buffer to extend.
+ * @return Always 0.
+ */
+static int
+prepare6(struct vhost_rdma_pkt_info *pkt, struct rte_mbuf *m)
+{
+	struct vhost_rdma_qp *qp = pkt->qp;
+	struct vhost_rdma_av *av = vhost_rdma_get_av(pkt);
+	rte_be16_t sport = rte_cpu_to_be_16(qp->src_port);
+	rte_be16_t dport = rte_cpu_to_be_16(ROCE_V2_UDP_DPORT);
+
+	prepare_udp_hdr(m, sport, dport);
+
+	prepare_ipv6_hdr(m,
+			 &av->sgid_addr._sockaddr_in6.sin6_addr,
+			 &av->dgid_addr._sockaddr_in6.sin6_addr,
+			 IPPROTO_UDP,
+			 av->grh.traffic_class,
+			 av->grh.hop_limit);
+
+	return 0;
+}
+
+/**
+ * Build the L3/L4 headers for a packet and start its ICRC.
+ *
+ * Prepends UDP plus IPv4 or IPv6 headers according to m->l3_type, stores
+ * the header part of the ICRC in *crc, and sets VHOST_LOOPBACK_MASK in
+ * pkt->mask when the destination MAC equals the device's own MAC.
+ *
+ * @param pkt Packet context (QP, device, address vector, BTH pointer).
+ * @param m   Packet buffer; l3_type must be IPv4 or IPv6.
+ * @param crc Output: ICRC state after the headers. Not written on error.
+ * @return 0 on success, -EINVAL when l3_type is neither IPv4 nor IPv6.
+ */
+int
+vhost_rdma_prepare(struct vhost_rdma_pkt_info *pkt,
+		   struct rte_mbuf *m,
+		   uint32_t *crc)
+{
+	int err = 0;
+	char dev_mac[6];
+
+	if (m->l3_type == VHOST_NETWORK_TYPE_IPV4) {
+		err = prepare4(pkt, m);
+	} else if (m->l3_type == VHOST_NETWORK_TYPE_IPV6) {
+		err = prepare6(pkt, m);
+	} else {
+		/* No IP/UDP headers were built, so there is nothing valid
+		 * in front of the BTH for the ICRC to hash.
+		 */
+		RDMA_LOG_ERR("Unknown layer 3 protocol: %u\n", m->l3_type);
+		return -EINVAL;
+	}
+
+	*crc = vhost_rdma_icrc_hdr(pkt, m);
+
+	default_gid_to_mac(pkt->dev, dev_mac);
+
+	/* Destination is this device itself: mark the packet for loopback. */
+	if (memcmp(dev_mac, vhost_rdma_get_av(pkt)->dmac, 6) == 0) {
+		pkt->mask |= VHOST_LOOPBACK_MASK;
+	}
+
+	return err;
+}
+
+/**
+ * Prepend the Ethernet header, set TX offload metadata and queue the
+ * packet on the device TX ring.
+ *
+ * The source MAC is derived from the device's default GID and the
+ * destination MAC comes from the packet's address vector. IPv4 header
+ * checksum offload is requested for IPv4 only; IPv6 has no header
+ * checksum.
+ *
+ * @param pkt  Packet context supplying the device and address vector.
+ * @param mbuf Packet that already carries its IP and UDP headers.
+ * @param type Ethertype in host byte order (RTE_ETHER_TYPE_IPV4 selects
+ *             the IPv4 settings, anything else the IPv6 ones).
+ * @return 0 on success, -ENOSPC when there is no headroom for the Ethernet
+ *         header, or the negative rte_ring_enqueue() error when the TX
+ *         ring is full. The mbuf is not freed on failure.
+ *         NOTE(review): confirm the caller releases it.
+ */
+static int
+ip_out(struct vhost_rdma_pkt_info *pkt, struct rte_mbuf* mbuf, uint16_t type)
+{
+	struct rte_ether_hdr *ether;
+
+	ether = (struct rte_ether_hdr *)rte_pktmbuf_prepend(mbuf, sizeof(*ether));
+	if (unlikely(ether == NULL))
+		return -ENOSPC;
+
+	ether->ether_type = rte_cpu_to_be_16(type);
+	default_gid_to_mac(pkt->dev, (char *)&ether->src_addr.addr_bytes[0]);
+	rte_memcpy(&ether->dst_addr.addr_bytes[0], vhost_rdma_get_av(pkt)->dmac, 6);
+
+	if (type == RTE_ETHER_TYPE_IPV4) {
+		/* IP checksum offload */
+		mbuf->ol_flags = RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_IP_CKSUM;
+		mbuf->l3_len = sizeof(struct rte_ipv4_hdr);
+	} else {
+		mbuf->ol_flags = RTE_MBUF_F_TX_IPV6;
+		mbuf->l3_len = sizeof(struct rte_ipv6_hdr);
+	}
+	mbuf->l4_len = sizeof(struct rte_udp_hdr);
+	mbuf->l2_len = sizeof(struct rte_ether_hdr);
+
+	/* A full ring must be reported so the sender can back off. */
+	return rte_ring_enqueue(pkt->dev->tx_ring, mbuf);
+}
+
+/**
+ * Hand a fully built packet to the TX path.
+ *
+ * The L3 type is validated before the QP reference and the in-flight
+ * counter are taken, so the error path leaves qp->mbuf_out untouched.
+ * After the packet is queued the counter is decremented again and, if the
+ * QP is waiting for buffers and the count fell below
+ * VHOST_INFLIGHT_SKBS_PER_QP_LOW, the requester task is kicked.
+ *
+ * @param pkt  Packet context; pkt->qp is the owning queue pair.
+ * @param mbuf Packet to send; freed here only on the -EINVAL path.
+ * @return 0 on success, -EINVAL for an unknown L3 type, -EAGAIN when the
+ *         packet could not be queued.
+ */
+int vhost_rdma_send(struct vhost_rdma_pkt_info *pkt,
+		    struct rte_mbuf *mbuf)
+{
+	struct vhost_rdma_qp *qp = pkt->qp;
+	uint16_t ether_type;
+	int mbuf_out;
+	int err;
+
+	if (mbuf->l3_type == VHOST_NETWORK_TYPE_IPV4) {
+		ether_type = RTE_ETHER_TYPE_IPV4;
+	} else if (mbuf->l3_type == VHOST_NETWORK_TYPE_IPV6) {
+		ether_type = RTE_ETHER_TYPE_IPV6;
+	} else {
+		RDMA_LOG_ERR("Unknown layer 3 protocol: %u\n", mbuf->l3_type);
+		rte_pktmbuf_free(mbuf);
+		return -EINVAL;
+	}
+
+	vhost_rdma_add_ref(qp);
+	rte_atomic32_inc(&qp->mbuf_out);
+
+	err = ip_out(pkt, mbuf, ether_type);
+
+	mbuf_out = rte_atomic32_sub_return(&qp->mbuf_out, 1);
+	if (unlikely(qp->need_req_mbuf &&
+		     mbuf_out < VHOST_INFLIGHT_SKBS_PER_QP_LOW))
+		vhost_rdma_run_task(&qp->req.task, 1);
+
+	vhost_rdma_drop_ref(qp, qp->dev, qp);
+
+	if (unlikely(err)) {
+		RDMA_LOG_ERR("ip out failed");
+		return -EAGAIN;
+	}
+
+	return 0;
+}
diff --git a/examples/vhost_user_rdma/vhost_rdma_pkt.h b/examples/vhost_user_rdma/vhost_rdma_pkt.h
index e6a605f574..f012edd8ec 100644
--- a/examples/vhost_user_rdma/vhost_rdma_pkt.h
+++ b/examples/vhost_user_rdma/vhost_rdma_pkt.h
@@ -22,9 +22,13 @@
#include <stdint.h>
#include <stddef.h>
+#include <netinet/in.h>
#include <rte_byteorder.h>
#include <rte_mbuf.h> /* For struct rte_mbuf if needed later */
+#include "vhost_rdma.h"
+#include "vhost_rdma_ib.h"
+
/* Forward declarations */
struct vhost_rdma_dev;
struct vhost_rdma_qp;
@@ -34,16 +38,23 @@ struct vhost_rdma_send_wqe;
#define BIT(x) (1U << (x)) /**< Generate bitmask from bit index */
#endif
+/*
+ * Locate the IPv4 / IPv6 header of a packet whose pkt->hdr points at the
+ * BTH: the header is assumed to sit directly in front of the UDP header,
+ * which itself directly precedes the BTH. The argument is parenthesized
+ * so that any pointer expression can be passed.
+ */
+#define ip_hdr(p) ((struct rte_ipv4_hdr*) \
+	(RTE_PTR_SUB((p)->hdr, \
+		sizeof(struct rte_udp_hdr) + sizeof(struct rte_ipv4_hdr))))
+#define ipv6_hdr(p) ((struct rte_ipv6_hdr*) \
+	(RTE_PTR_SUB((p)->hdr, \
+		sizeof(struct rte_udp_hdr) + sizeof(struct rte_ipv6_hdr))))
+
/**
- * @defgroup constants Constants & Limits
- * @{
- */
+* @defgroup constants Constants & Limits
+* @{
+*/
/**
- * @brief Runtime packet context used during processing
- */
+* @brief Runtime packet context used during processing
+*/
struct vhost_rdma_pkt_info {
- struct vhost_rdma_dev *dev; /**< Owning device */
+ struct vhost_rdma_device *dev; /**< Owning device */
struct vhost_rdma_qp *qp; /**< Associated QP */
struct vhost_rdma_send_wqe *wqe; /**< Corresponding send WQE (if applicable) */
uint8_t *hdr; /**< Pointer to BTH (Base Transport Header) */
@@ -55,4 +66,12 @@ struct vhost_rdma_pkt_info {
uint8_t opcode; /**< BTH opcode field */
};
+int vhost_rdma_prepare(struct vhost_rdma_pkt_info *pkt,
+ struct rte_mbuf *m,
+ uint32_t *crc);
+
+uint32_t vhost_rdma_icrc_hdr(struct vhost_rdma_pkt_info *pkt, struct rte_mbuf *mbuf);
+
+uint32_t crc32(uint32_t crc, void* buf, uint32_t len);
+
#endif /* __VHOST_RDMA_PKT_H__ */
\ No newline at end of file
diff --git a/examples/vhost_user_rdma/vhost_rdma_queue.c b/examples/vhost_user_rdma/vhost_rdma_queue.c
index abce651fa5..7d0c45592c 100644
--- a/examples/vhost_user_rdma/vhost_rdma_queue.c
+++ b/examples/vhost_user_rdma/vhost_rdma_queue.c
@@ -13,6 +13,11 @@
#include <rte_interrupts.h>
#include <rte_malloc.h>
#include <rte_vhost.h>
+#include <rte_mbuf.h>
+#include <rte_ether.h>
+#include <rte_ip.h>
+#include <rte_udp.h>
+#include <rte_timer.h>
#include "vhost_rdma_queue.h"
#include "vhost_rdma_pkt.h"
@@ -560,12 +565,829 @@ vhost_rdma_queue_cleanup(struct vhost_rdma_qp *qp, struct vhost_rdma_queue *queu
queue->data = NULL;
}
-int vhost_rdma_requester(void *arg)
+/**
+ * Advance a DMA cursor by @length bytes without copying any data.
+ *
+ * Walks dma->sge[] starting at dma->cur_sge / dma->sge_offset, moving on to
+ * the next SGE whenever the current one is exhausted, and reduces the
+ * residual byte count by the number of bytes skipped.
+ *
+ * @return 0 on success; -ENOSPC when the SGE list runs out before @length
+ *         bytes were skipped. In the error case dma->cur_sge has already
+ *         been advanced while dma->sge_offset and dma->resid are not
+ *         written back.
+ */
+int
+vhost_rdma_advance_dma_data(struct vhost_rdma_dma_info *dma, unsigned int length)
{
- //TODO: handle request
+ struct vhost_rdma_sge *sge = &dma->sge[dma->cur_sge];
+ uint32_t offset = dma->sge_offset;
+ int resid = dma->resid;
+
+ while (length) {
+ unsigned int bytes;
+
+ /* Current SGE fully consumed: move to the next one. */
+ if (offset >= sge->length) {
+ sge++;
+ dma->cur_sge++;
+ offset = 0;
+ if (dma->cur_sge >= dma->num_sge)
+ return -ENOSPC;
+ }
+
+ bytes = length;
+
+ /* Never step past the end of the current SGE in one iteration. */
+ if (bytes > sge->length - offset)
+ bytes = sge->length - offset;
+
+ offset += bytes;
+ resid -= bytes;
+ length -= bytes;
+ }
+
+ /* Publish the new position only after the whole skip succeeded. */
+ dma->sge_offset = offset;
+ dma->resid = resid;
+
return 0;
}
+/*
+ * Fast-forward a SEND/WRITE WQE past its first @npsn packets.
+ *
+ * For every skipped packet the requester opcode is recomputed and up to one
+ * MTU is consumed from the WQE: inline payloads only adjust the residual
+ * count and offset, SGE-backed payloads go through
+ * vhost_rdma_advance_dma_data(). When WR_WRITE_MASK is set in @mask the
+ * remote address advances by one MTU per packet.
+ */
+static __rte_always_inline void
+vhost_rdma_retry_first_write_send(struct vhost_rdma_qp *qp,
+		struct vhost_rdma_send_wqe *wqe,
+		unsigned int mask, int npsn)
+{
+	int remaining = npsn;
+
+	while (remaining > 0) {
+		/* One packet carries at most one MTU of the residual data. */
+		int chunk = wqe->dma.resid;
+
+		if (wqe->dma.resid > qp->mtu)
+			chunk = qp->mtu;
+
+		qp->req.opcode = vhost_rdma_next_opcode(qp, wqe,
+							wqe->wr->opcode);
+
+		if (!(wqe->wr->send_flags & VHOST_RDMA_IB_SEND_INLINE)) {
+			vhost_rdma_advance_dma_data(&wqe->dma, chunk);
+		} else {
+			wqe->dma.resid -= chunk;
+			wqe->dma.sge_offset += chunk;
+		}
+
+		if (mask & WR_WRITE_MASK)
+			wqe->iova += qp->mtu;
+
+		remaining--;
+	}
+}
+
+/*
+ * Rewind the requester so that work which was issued but not completed is
+ * sent again.
+ *
+ * The requester restarts at the send queue's consumer index with the
+ * completer's PSN and an invalid (-1) opcode. Every WQE between the
+ * consumer and producer index that had already been issued is put back into
+ * WQE_STATE_POSTED; the walk stops at the first WQE that is still POSTED
+ * and skips WQEs that are DONE.
+ */
+static void vhost_rdma_req_retry(struct vhost_rdma_qp *qp)
+{
+ struct vhost_rdma_send_wqe *wqe;
+ struct vhost_rdma_queue *q = &qp->sq.queue;
+ unsigned int cons;
+ unsigned int prod;
+ unsigned int wqe_index;
+ unsigned int mask;
+ int npsn;
+ int first = 1;
+
+ cons = q->consumer_index;
+ prod = q->producer_index;
+
+ qp->req.wqe_index = cons;
+ qp->req.psn = qp->comp.psn;
+ qp->req.opcode = -1;
+
+ for (wqe_index = cons; wqe_index != prod; wqe_index++) {
+ wqe = addr_from_index(&qp->sq.queue, wqe_index);
+ mask = wr_opcode_mask(wqe->wr->opcode, qp);
+
+ /* Nothing from this WQE onwards has been sent yet. */
+ if (wqe->state == WQE_STATE_POSTED)
+ break;
+
+ if (wqe->state == WQE_STATE_DONE)
+ continue;
+
+ /* Only READ/WRITE requests carry a remote address. */
+ wqe->iova = (mask & WR_READ_OR_WRITE_MASK) ?
+ wqe->wr->rdma.remote_addr : 0;
+
+ /*
+  * Reset the DMA cursor, except for a READ that is the first WQE
+  * being retried: its current progress is kept and used below.
+  */
+ if (!first || (mask & WR_READ_MASK) == 0) {
+ wqe->dma.resid = wqe->dma.length;
+ wqe->dma.cur_sge = 0;
+ wqe->dma.sge_offset = 0;
+ }
+
+ if (first) {
+ first = 0;
+
+ /*
+  * SEND/WRITE: skip the packets between the WQE's first PSN
+  * and the completer's PSN.
+  */
+ if (mask & WR_WRITE_OR_SEND_MASK) {
+ npsn = (qp->comp.psn - wqe->first_psn) & VHOST_RDMA_PSN_MASK;
+ vhost_rdma_retry_first_write_send(qp, wqe, mask, npsn);
+ }
+
+ /*
+  * READ: advance the remote address by the whole MTUs that the
+  * DMA cursor has already consumed.
+  */
+ if (mask & WR_READ_MASK) {
+ npsn = (wqe->dma.length - wqe->dma.resid) / qp->mtu;
+ wqe->iova += npsn * qp->mtu;
+ }
+ }
+ wqe->state = WQE_STATE_POSTED;
+ }
+}
+
+/*
+ * Return the send WQE at qp->req.wqe_index, or NULL when the requester must
+ * not issue anything right now.
+ *
+ * NULL is returned when the requester index has caught up with the producer
+ * index, when the requester is in QP_STATE_DRAIN/QP_STATE_DRAINED and the
+ * WQE is not already in WQE_STATE_PROCESSING, or when a fenced WQE is not at
+ * the consumer index (qp->req.wait_fence is set in that case).
+ *
+ * On success wqe->mask is refreshed from the work request opcode.
+ */
+static struct vhost_rdma_send_wqe* vhost_rdma_req_next_wqe(struct vhost_rdma_qp *qp)
+{
+ struct vhost_rdma_send_wqe *wqe;
+ struct vhost_rdma_queue *q = &qp->sq.queue;
+ unsigned int index = qp->req.wqe_index;
+ unsigned int cons;
+ unsigned int prod;
+
+ /* Head of the send queue, or NULL when the queue is empty. */
+ wqe = queue_head(q);
+ cons = q->consumer_index;
+ prod = q->producer_index;
+
+ if (unlikely(qp->req.state == QP_STATE_DRAIN)) {
+ /* Re-check under the lock; do { } while (0) only provides the early exits. */
+ rte_spinlock_lock(&qp->state_lock);
+ do {
+ if (qp->req.state != QP_STATE_DRAIN) {
+ /* comp just finished */
+ rte_spinlock_unlock(&qp->state_lock);
+ break;
+ }
+
+ if (wqe && ((index != cons) ||
+ (wqe->state != WQE_STATE_POSTED))) {
+ /* comp not done yet */
+ rte_spinlock_unlock(&qp->state_lock);
+ break;
+ }
+
+ /* Queue empty, or requester sits at an untouched head WQE: draining is complete. */
+ qp->req.state = QP_STATE_DRAINED;
+ rte_spinlock_unlock(&qp->state_lock);
+ } while (0);
+ }
+
+ if (index == prod)
+ return NULL;
+
+ wqe = addr_from_index(q, index);
+
+ /* While draining, only a WQE that is already mid-transmission may continue. */
+ if (unlikely((qp->req.state == QP_STATE_DRAIN ||
+ qp->req.state == QP_STATE_DRAINED) &&
+ (wqe->state != WQE_STATE_PROCESSING)))
+ return NULL;
+
+ /* A fenced WQE is held back until it is the one at the consumer index. */
+ if (unlikely((wqe->wr->send_flags & VHOST_RDMA_IB_SEND_FENCE) &&
+ (index != cons))) {
+ qp->req.wait_fence = 1;
+ return NULL;
+ }
+
+ wqe->mask = wr_opcode_mask(wqe->wr->opcode, qp);
+ return wqe;
+}
+
+/*
+ * Select the address vector to use for a packet.
+ *
+ * RC and UC queue pairs use the address vector stored in the QP; any other
+ * QP type uses the one stored in the packet's send WQE. Returns NULL when
+ * @pkt or its QP is NULL, or when a WQE-based lookup finds no WQE.
+ */
+struct vhost_rdma_av *vhost_rdma_get_av(struct vhost_rdma_pkt_info *pkt)
+{
+	int connected;
+
+	if (pkt == NULL)
+		return NULL;
+	if (pkt->qp == NULL)
+		return NULL;
+
+	connected = (pkt->qp->type == VHOST_RDMA_IB_QPT_RC ||
+		     pkt->qp->type == VHOST_RDMA_IB_QPT_UC);
+	if (connected)
+		return &pkt->qp->av;
+
+	if (pkt->wqe == NULL)
+		return NULL;
+
+	return &pkt->wqe->av;
+}
+
+/**
+ * Allocate an mbuf for an outgoing packet and bind it to @pkt.
+ *
+ * The source GID selected by av->grh.sgid_index must be valid, the mbuf
+ * headroom must be large enough for the lower-layer headers and the data
+ * room must be able to hold @paylen bytes. On success pkt->dev,
+ * pkt->port_num and pkt->hdr are set, VHOST_GRH_MASK is OR-ed into
+ * pkt->mask, and the mbuf is sized to @paylen bytes.
+ *
+ * @param dev     device providing the GID table and the mbuf pool
+ * @param av      address vector (network type and source GID index)
+ * @param paylen  number of bytes to reserve in the mbuf data area
+ * @param pkt     packet context to fill in
+ * @return the new mbuf, or NULL on invalid GID, allocation failure or
+ *         insufficient head/data room (the mbuf is freed in those cases).
+ */
+struct rte_mbuf *vhost_rdma_init_packet(struct vhost_rdma_device *dev,
+		struct vhost_rdma_av *av,
+		int paylen,
+		struct vhost_rdma_pkt_info *pkt)
+{
+	const struct vhost_rdma_gid *attr;
+	unsigned int hdr_len;
+	struct rte_mbuf *mbuf;
+	const int port_num = 1;
+	uint16_t data_room;
+
+	attr = &dev->gid_tbl[av->grh.sgid_index];
+
+	if (attr->type == VHOST_RDMA_GID_TYPE_ILLIGAL)
+		return NULL;
+
+	if (av->network_type == VHOST_NETWORK_TYPE_IPV4)
+		hdr_len = ETH_HLEN + sizeof(struct rte_udp_hdr) +
+			sizeof(struct rte_ipv4_hdr);
+	else
+		hdr_len = ETH_HLEN + sizeof(struct rte_udp_hdr) +
+			sizeof(struct rte_ipv6_hdr);
+
+	/*
+	 * NOTE(review): ETH_HLEN is already included above, so the Ethernet
+	 * header is counted twice here. This only makes the headroom check
+	 * below stricter than necessary; confirm whether the extra bytes are
+	 * meant as reserved link-layer space.
+	 */
+	hdr_len += sizeof(struct rte_ether_hdr);
+
+	mbuf = rte_pktmbuf_alloc(dev->mbuf_pool);
+	if (unlikely(mbuf == NULL))
+		return NULL;
+
+	if (unlikely(hdr_len > rte_pktmbuf_headroom(mbuf))) {
+		RDMA_LOG_ERR("no enough head room %u > %u", hdr_len, rte_pktmbuf_headroom(mbuf));
+		rte_pktmbuf_free(mbuf);
+		return NULL;
+	}
+
+	data_room = mbuf->buf_len - rte_pktmbuf_headroom(mbuf);
+	/* A negative length would wrap when stored in the 16-bit data_len. */
+	if (unlikely(paylen < 0 || paylen > data_room)) {
+		RDMA_LOG_ERR("no enough data room %u > %u", paylen, data_room);
+		rte_pktmbuf_free(mbuf);
+		return NULL;
+	}
+
+	if (av->network_type == VHOST_NETWORK_TYPE_IPV4)
+		mbuf->l3_type = VHOST_NETWORK_TYPE_IPV4;
+	else
+		mbuf->l3_type = VHOST_NETWORK_TYPE_IPV6;
+
+	pkt->dev = dev;
+	pkt->port_num = port_num;
+	/* Trimming zero bytes returns the current start of the data area. */
+	pkt->hdr = (uint8_t *)rte_pktmbuf_adj(mbuf, 0);
+	pkt->mask |= VHOST_GRH_MASK;
+
+	/*
+	 * Single-segment mbuf: keep pkt_len equal to data_len so that later
+	 * header prepends, which grow both fields, leave them consistent.
+	 */
+	rte_pktmbuf_data_len(mbuf) = paylen;
+	rte_pktmbuf_pkt_len(mbuf) = paylen;
+
+	return mbuf;
+}
+
+
+/*
+ * Build the request packet for one chunk of a send WQE.
+ *
+ * Fills @pkt from the QP and WQE, allocates the mbuf through
+ * vhost_rdma_init_packet(), then writes the BTH and, depending on the opcode
+ * mask, the RETH, immediate data and DETH headers.
+ *
+ * @param opcode   wire opcode, used as index into vhost_rdma_opcode[]
+ * @param payload  payload bytes carried by this packet (before padding)
+ * @return the mbuf, or NULL when vhost_rdma_init_packet() fails.
+ */
+static struct rte_mbuf* vhost_rdma_init_req_packet(struct vhost_rdma_qp *qp,
+ struct vhost_rdma_send_wqe *wqe,
+ int opcode,
+ int payload,
+ struct vhost_rdma_pkt_info *pkt)
+{
+ struct vhost_rdma_device *dev = qp->dev;
+ struct rte_mbuf *mbuf;
+ struct vhost_rdma_sq_req *wr = wqe->wr;
+ struct vhost_rdma_av *av;
+ /* Bytes needed to round the payload up to a multiple of four. */
+ int pad = (-payload) & 0x3;
+ int paylen;
+ int solicited;
+ uint16_t pkey;
+ uint32_t qp_num;
+ int ack_req;
+
+ /* length from start of bth to end of icrc */
+ paylen = vhost_rdma_opcode[opcode].length + payload + pad + VHOST_ICRC_SIZE;
+
+ /* pkt->dev, pkt->port_num and pkt->hdr are set, and VHOST_GRH_MASK is
+ * OR-ed into pkt->mask, by vhost_rdma_init_packet() below
+ */
+ pkt->opcode = opcode;
+ pkt->qp = qp;
+ pkt->psn = qp->req.psn;
+ pkt->mask = vhost_rdma_opcode[opcode].mask;
+ pkt->paylen = paylen;
+ pkt->wqe = wqe;
+
+ /* init mbuf */
+ av = vhost_rdma_get_av(pkt);
+ mbuf = vhost_rdma_init_packet(dev, av, paylen, pkt);
+ if (unlikely(!mbuf))
+ return NULL;
+
+ /* init bth */
+ /* Solicited event: only on the last packet of a SEND or of a WRITE with immediate. */
+ solicited = (wr->send_flags & VHOST_RDMA_IB_SEND_SOLICITED) &&
+ (pkt->mask & VHOST_END_MASK) &&
+ ((pkt->mask & (VHOST_SEND_MASK)) ||
+ (pkt->mask & (VHOST_WRITE_MASK | VHOST_IMMDT_MASK)) ==
+ (VHOST_WRITE_MASK | VHOST_IMMDT_MASK));
+
+ pkey = IB_DEFAULT_PKEY_FULL;
+
+ /* Packets with a DETH take the destination QPN from the work request. */
+ qp_num = (pkt->mask & VHOST_DETH_MASK) ? wr->ud.remote_qpn :
+ qp->attr.dest_qp_num;
+
+ /*
+  * Request an ACK on the last packet of a message, or once more than
+  * VHOST_MAX_PKT_PER_ACK packets went out without one. The counter is
+  * only incremented when the first test is false (short-circuit) and is
+  * reset whenever an ACK is requested.
+  */
+ ack_req = ((pkt->mask & VHOST_END_MASK) ||
+ (qp->req.noack_pkts++ > VHOST_MAX_PKT_PER_ACK));
+ if (ack_req)
+ qp->req.noack_pkts = 0;
+
+ bth_init(pkt, pkt->opcode, solicited, 0,
+ pad, pkey, qp_num,
+ ack_req, pkt->psn);
+
+ /* init optional headers */
+ if (pkt->mask & VHOST_RETH_MASK) {
+ reth_set_rkey(pkt, wr->rdma.rkey);
+ reth_set_va(pkt, wqe->iova);
+ reth_set_len(pkt, wqe->dma.resid);
+ }
+
+ if (pkt->mask & VHOST_IMMDT_MASK)
+ immdt_set_imm(pkt, wr->imm_data);
+ if (pkt->mask & VHOST_DETH_MASK) {
+ /* QP number 1 always uses GSI_QKEY instead of the key from the work request. */
+ if (qp->qpn == 1)
+ deth_set_qkey(pkt, GSI_QKEY);
+ else
+ deth_set_qkey(pkt, wr->ud.remote_qkey);
+ deth_set_sqp(pkt, qp->qpn);
+ }
+
+ return mbuf;
+}
+
+/*
+ * Look up a memory region by key and take a reference on it.
+ *
+ * The pool index is the key shifted right by eight bits. The region is
+ * accepted only if the key matches its lkey (local lookup) or rkey (remote
+ * lookup), it belongs to @pd, it grants at least one of the requested
+ * @access bits (when @access is non-zero) and it is in
+ * VHOST_MR_STATE_VALID. Otherwise the reference is dropped again.
+ *
+ * @return the referenced region, or NULL when not found or not acceptable.
+ */
+struct vhost_rdma_mr* lookup_mr(struct vhost_rdma_pd *pd,
+		int access,
+		uint32_t key,
+		enum vhost_rdma_mr_lookup_type type)
+{
+	struct vhost_rdma_mr *mr;
+	int slot = key >> 8;
+	int rejected = 0;
+
+	mr = vhost_rdma_pool_get(&pd->dev->mr_pool, slot);
+	if (mr == NULL)
+		return NULL;
+	vhost_rdma_add_ref(mr);
+
+	if (type == VHOST_LOOKUP_LOCAL && mr->lkey != key)
+		rejected = 1;
+	else if (type == VHOST_LOOKUP_REMOTE && mr->rkey != key)
+		rejected = 1;
+	else if (mr->pd != pd)
+		rejected = 1;
+	else if (access && !(access & mr->access))
+		rejected = 1;
+	else if (mr->state != VHOST_MR_STATE_VALID)
+		rejected = 1;
+
+	if (unlikely(rejected)) {
+		vhost_rdma_drop_ref(mr, pd->dev, mr);
+		return NULL;
+	}
+
+	return mr;
+}
+
+/*
+ * Check that [iova, iova + length) lies inside a memory region.
+ *
+ * DMA regions accept every range. Regions of type VHOST_MR_TYPE_MR are
+ * checked against mr->iova and mr->length. Any other type is rejected.
+ *
+ * @return 0 when the range is acceptable, -EFAULT otherwise.
+ */
+int
+mr_check_range(struct vhost_rdma_mr *mr, uint64_t iova, size_t length)
+{
+	if (mr->type == VHOST_MR_TYPE_DMA)
+		return 0;
+
+	if (mr->type != VHOST_MR_TYPE_MR)
+		return -EFAULT;
+
+	if (iova < mr->iova)
+		return -EFAULT;
+	if (length > mr->length)
+		return -EFAULT;
+	if (iova > mr->iova + mr->length - length)
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * Translate an I/O virtual address inside @mr: the distance from mr->iova
+ * selects an entry of mr->pages[] (one per USER_MMAP_TARGET_PAGE_SIZE) and
+ * the in-page remainder is added to that entry.
+ */
+static __rte_always_inline uint64_t
+lookup_iova(struct vhost_rdma_mr *mr, uint64_t iova)
+{
+	const size_t page = (iova - mr->iova) / USER_MMAP_TARGET_PAGE_SIZE;
+	const size_t within = (iova - mr->iova) & ~USER_MMAP_PAGE_MASK;
+
+	return mr->pages[page] + within;
+}
+
+/**
+ * Copy @length bytes between a caller buffer and the memory behind @mr.
+ *
+ * With VHOST_RDMA_TO_MR_OBJ the data flows from @addr into the region,
+ * otherwise from the region into @addr. DMA-type regions translate @iova
+ * with gpa_to_vva() and copy in one go; other regions are range-checked and
+ * copied page by page through lookup_iova().
+ *
+ * @param crcp  optional running CRC; when non-NULL it is updated with the
+ *              bytes written to the destination.
+ * @return 0 on success (including @length == 0), -EFAULT when the range
+ *         check fails.
+ */
+int
+vhost_rdma_mr_copy(struct rte_vhost_memory *mem,
+		struct vhost_rdma_mr *mr,
+		uint64_t iova,
+		void *addr,
+		uint64_t length,
+		enum vhost_rdma_mr_copy_dir dir,
+		uint32_t *crcp)
+{
+	/* Byte cursor: arithmetic on "void *" is a GNU extension. */
+	uint8_t *buf = addr;
+	uint32_t crc = crcp ? (*crcp) : 0;
+	uint64_t bytes;
+	uint8_t *va;
+
+	if (length == 0)
+		return 0;
+
+	if (mr->type == VHOST_MR_TYPE_DMA) {
+		uint8_t *src, *dest;
+
+		/*
+		 * for dma addr, need to translate
+		 * NOTE(review): gpa_to_vva() receives &length and may shorten
+		 * it, and its result is not checked for failure - confirm the
+		 * helper's contract.
+		 */
+		iova = gpa_to_vva(mem, iova, &length);
+
+		src = (dir == VHOST_RDMA_TO_MR_OBJ) ? buf : ((uint8_t *)(uintptr_t)iova);
+		dest = (dir == VHOST_RDMA_TO_MR_OBJ) ? ((uint8_t *)(uintptr_t)iova) : buf;
+
+		rte_memcpy(dest, src, length);
+
+		if (crcp)
+			*crcp = crc32(crc, dest, length);
+
+		return 0;
+	}
+
+	if (mr_check_range(mr, iova, length))
+		return -EFAULT;
+
+	while (length > 0) {
+		uint8_t *src, *dest;
+
+		va = (uint8_t *)lookup_iova(mr, iova);
+		src = (dir == VHOST_RDMA_TO_MR_OBJ) ? buf : va;
+		dest = (dir == VHOST_RDMA_TO_MR_OBJ) ? va : buf;
+
+		/* Copy at most up to the end of the current page. */
+		bytes = USER_MMAP_TARGET_PAGE_SIZE - ((uint64_t)va & ~ USER_MMAP_PAGE_MASK);
+
+		if (bytes > length)
+			bytes = length;
+
+		/* Casts make the arguments match %p and %lu on every ABI. */
+		RDMA_LOG_DEBUG_DP("copy %p <- %p %lu", (void *)dest, (void *)src,
+				  (unsigned long)bytes);
+		rte_memcpy(dest, src, bytes);
+
+		if (crcp)
+			crc = crc32(crc, dest, bytes);
+
+		length -= bytes;
+		buf += bytes;
+		iova += bytes;
+	}
+
+	if (crcp)
+		*crcp = crc;
+
+	return 0;
+}
+
+/**
+ * Copy @length bytes between @addr and the scatter/gather list of @dma.
+ *
+ * Starts at dma->cur_sge / dma->sge_offset, looks up the memory region of
+ * each non-empty SGE through its lkey and copies with vhost_rdma_mr_copy()
+ * in direction @dir. On success dma->sge_offset and dma->resid are updated.
+ *
+ * @param access  access bits the memory regions must grant (0: no check)
+ * @param crcp    optional running CRC, passed through to the copy routine
+ * @return 0 on success (including @length == 0); -EINVAL when @length
+ *         exceeds the residual count or a region lookup fails; -ENOSPC when
+ *         the SGE list is exhausted; otherwise the error from
+ *         vhost_rdma_mr_copy(). On error dma->cur_sge may already have been
+ *         advanced while the offset and residual are not written back.
+ */
+int
+copy_data(struct vhost_rdma_pd *pd, int access,
+		struct vhost_rdma_dma_info *dma, void *addr,
+		int length, enum vhost_rdma_mr_copy_dir dir, uint32_t *crcp)
+{
+	struct vhost_rdma_sge *sge = &dma->sge[dma->cur_sge];
+	struct vhost_rdma_mr *mr = NULL;
+	/* Byte cursor: arithmetic on "void *" is a GNU extension. */
+	uint8_t *buf = addr;
+	uint32_t offset = dma->sge_offset;
+	int resid = dma->resid;
+	uint32_t bytes;
+	uint64_t iova;
+	int err = 0;
+
+	if (length == 0)
+		return 0;
+
+	if (length > resid)
+		return -EINVAL;
+
+	/* The cast makes the argument match %llx on every ABI. */
+	RDMA_LOG_DEBUG("sge %llx %u offset %u %d", (unsigned long long)sge->addr,
+		       sge->length, offset, length);
+	if (sge->length && (offset < sge->length)) {
+		mr = lookup_mr(pd, access, sge->lkey, VHOST_LOOKUP_LOCAL);
+		if (!mr)
+			return -EINVAL;
+	}
+
+	while (length > 0) {
+		bytes = length;
+
+		if (offset >= sge->length) {
+			/* Current SGE is used up: release its region and move on. */
+			if (mr) {
+				vhost_rdma_drop_ref(mr, pd->dev, mr);
+				mr = NULL;
+			}
+			sge++;
+			dma->cur_sge++;
+			offset = 0;
+
+			if (dma->cur_sge >= dma->num_sge) {
+				err = -ENOSPC;
+				goto out;
+			}
+
+			/* Zero-length SGEs are skipped without a region lookup. */
+			if (!sge->length)
+				continue;
+
+			mr = lookup_mr(pd, access, sge->lkey, VHOST_LOOKUP_LOCAL);
+			if (!mr) {
+				err = -EINVAL;
+				goto out;
+			}
+		}
+
+		if (bytes > sge->length - offset)
+			bytes = sge->length - offset;
+
+		if (bytes > 0) {
+			iova = sge->addr + offset;
+
+			err = vhost_rdma_mr_copy(pd->dev->mem, mr, iova, buf, bytes, dir, crcp);
+			if (err)
+				goto out;
+
+			offset += bytes;
+			resid -= bytes;
+			length -= bytes;
+			buf += bytes;
+		}
+	}
+
+	dma->sge_offset = offset;
+	dma->resid = resid;
+
+out:
+	/* Single exit: drop whichever region reference is still held. */
+	if (mr)
+		vhost_rdma_drop_ref(mr, pd->dev, mr);
+
+	return err;
+}
+
+/*
+ * Fill in the payload, padding and ICRC of a request packet.
+ *
+ * After vhost_rdma_prepare() has seeded the CRC, SEND/WRITE packets get
+ * @paylen bytes of payload: inline data is copied straight from the WQE,
+ * anything else is gathered from the WQE's SGE list (memory region ->
+ * packet, i.e. VHOST_RDMA_FROM_MR_OBJ). Pad bytes are zeroed and included
+ * in the CRC. The inverted CRC is stored right behind payload and padding.
+ *
+ * @return 0 on success, otherwise the error from vhost_rdma_prepare() or
+ *         copy_data(). The mbuf is not freed here.
+ */
+static int
+vhost_rdma_finish_packet(struct vhost_rdma_qp *qp,
+		struct vhost_rdma_send_wqe *wqe,
+		struct vhost_rdma_pkt_info *pkt,
+		struct rte_mbuf *skb, int paylen)
+{
+	uint8_t *payload;
+	uint32_t crc = 0;
+	uint32_t icrc;
+	int err;
+
+	err = vhost_rdma_prepare(pkt, skb, &crc);
+	if (err)
+		return err;
+
+	/* Byte pointer: arithmetic on the "void *" from payload_addr() is a GNU extension. */
+	payload = payload_addr(pkt);
+
+	if (pkt->mask & VHOST_WRITE_OR_SEND) {
+		if (wqe->wr->send_flags & VHOST_RDMA_IB_SEND_INLINE) {
+			uint8_t *tmp = &wqe->dma.inline_data[wqe->dma.sge_offset];
+
+			crc = crc32(crc, tmp, paylen);
+			memcpy(payload, tmp, paylen);
+
+			wqe->dma.resid -= paylen;
+			wqe->dma.sge_offset += paylen;
+		} else {
+			/*
+			 * The packet buffer is the destination, so the data
+			 * must come FROM the memory region. The TO direction
+			 * would overwrite the sender's buffers with the
+			 * still-empty packet payload.
+			 */
+			err = copy_data(qp->pd, 0, &wqe->dma,
+					payload, paylen,
+					VHOST_RDMA_FROM_MR_OBJ,
+					&crc);
+			if (err)
+				return err;
+		}
+		if (bth_pad(pkt)) {
+			uint8_t *pad = payload + paylen;
+
+			memset(pad, 0, bth_pad(pkt));
+			crc = crc32(crc, pad, bth_pad(pkt));
+		}
+	}
+
+	/* memcpy() stores the ICRC without assuming 4-byte alignment. */
+	icrc = ~crc;
+	memcpy(payload + paylen + bth_pad(pkt), &icrc, sizeof(icrc));
+
+	return 0;
+}
+
+/*
+ * Snapshot the WQE state and PSN range, and the requester PSN, so that
+ * rollback_state() can restore them.
+ */
+static void
+save_state(struct vhost_rdma_send_wqe *wqe,
+		struct vhost_rdma_qp *qp,
+		struct vhost_rdma_send_wqe *rollback_wqe,
+		uint32_t *rollback_psn)
+{
+	*rollback_psn = qp->req.psn;
+
+	rollback_wqe->last_psn = wqe->last_psn;
+	rollback_wqe->first_psn = wqe->first_psn;
+	rollback_wqe->state = wqe->state;
+}
+
+/*
+ * Restore the WQE state and PSN range, and the requester PSN, from a
+ * snapshot taken by save_state().
+ */
+static void
+rollback_state(struct vhost_rdma_send_wqe *wqe,
+		struct vhost_rdma_qp *qp,
+		struct vhost_rdma_send_wqe *rollback_wqe,
+		uint32_t rollback_psn)
+{
+	qp->req.psn = rollback_psn;
+
+	wqe->last_psn = rollback_wqe->last_psn;
+	wqe->first_psn = rollback_wqe->first_psn;
+	wqe->state = rollback_wqe->state;
+}
+
+/*
+ * Retransmission timer callback: for a QP that is still valid, flag a
+ * completer timeout and schedule the completer task. @arg is the QP.
+ */
+void
+retransmit_timer(__rte_unused struct rte_timer *timer, void* arg)
+{
+	struct vhost_rdma_qp *qp = arg;
+
+	if (!qp->valid)
+		return;
+
+	qp->comp.timeout = 1;
+	vhost_rdma_run_task(&qp->comp.task, 1);
+}
+
+/*
+ * Requester bookkeeping after a packet was handed to the transmit path:
+ * remember the opcode, step to the next WQE after the last packet of a
+ * message, clear the "need mbuf" flag and arm the retransmission timer if
+ * a timeout is configured and the timer is not already pending.
+ */
+static void
+update_state(struct vhost_rdma_qp *qp, struct vhost_rdma_pkt_info *pkt)
+{
+	const int last_pkt = (pkt->mask & VHOST_END_MASK) != 0;
+
+	qp->req.opcode = pkt->opcode;
+	if (last_pkt)
+		qp->req.wqe_index += 1;
+	qp->need_req_mbuf = 0;
+
+	if (!qp->qp_timeout_ticks)
+		return;
+	if (rte_timer_pending(&qp->retrans_timer))
+		return;
+
+	rte_timer_reset(&qp->retrans_timer, qp->qp_timeout_ticks, SINGLE,
+			rte_lcore_id(), retransmit_timer, qp);
+}
+
+/*
+ * Move the WQE to its next state: PROCESSING while more packets follow;
+ * after the last packet, PENDING for RC queue pairs (other QP types keep
+ * their current state here).
+ */
+static __rte_always_inline void
+update_wqe_state(struct vhost_rdma_qp *qp,
+		struct vhost_rdma_send_wqe *wqe,
+		struct vhost_rdma_pkt_info *pkt)
+{
+	if (!(pkt->mask & VHOST_END_MASK)) {
+		wqe->state = WQE_STATE_PROCESSING;
+		return;
+	}
+
+	if (qp->type == VHOST_RDMA_IB_QPT_RC)
+		wqe->state = WQE_STATE_PENDING;
+}
+
+/*
+ * Update the PSN bookkeeping for the packet about to be sent.
+ *
+ * On the first packet of a message the WQE's first/last PSN are recorded.
+ * The requester PSN then jumps past all packets of a READ, or advances by
+ * one for everything else. All PSNs wrap at VHOST_RDMA_PSN_MASK.
+ */
+static __rte_always_inline void
+update_wqe_psn(struct vhost_rdma_qp *qp, struct vhost_rdma_send_wqe *wqe,
+		struct vhost_rdma_pkt_info *pkt, int payload)
+{
+	/* packets still to send, counting the current one */
+	int pkts_left = (wqe->dma.resid + payload + qp->mtu - 1) / qp->mtu;
+
+	/* a zero-length message still occupies one packet */
+	if (!pkts_left)
+		pkts_left = 1;
+
+	if (pkt->mask & VHOST_START_MASK) {
+		wqe->first_psn = qp->req.psn;
+		wqe->last_psn = (qp->req.psn + pkts_left - 1) & VHOST_RDMA_PSN_MASK;
+	}
+
+	if (!(pkt->mask & VHOST_READ_MASK)) {
+		qp->req.psn = (qp->req.psn + 1) & VHOST_RDMA_PSN_MASK;
+		return;
+	}
+
+	qp->req.psn = (wqe->first_psn + pkts_left) & VHOST_RDMA_PSN_MASK;
+}
+
+/**
+ * Requester task: turn posted send WQEs into request packets.
+ *
+ * Holds a reference on the QP while running. For each WQE returned by
+ * vhost_rdma_req_next_wqe() it selects the next opcode, builds a packet
+ * carrying at most one MTU of payload, fills in payload and ICRC, updates
+ * the WQE/PSN bookkeeping and transmits through vhost_rdma_xmit_packet().
+ * It stops when there is no WQE to issue, the RC PSN window or the
+ * in-flight mbuf limit is exceeded, the QP is invalid/in error/reset, or an
+ * error occurs (the WQE is then put into WQE_STATE_ERROR and the completer
+ * is run).
+ *
+ * @param arg  the struct vhost_rdma_qp to service
+ * @return 0 after faking the completion of a UD send larger than the MTU;
+ *         -EAGAIN on every other exit path.
+ */
+int vhost_rdma_requester(void *arg)
+{
+	struct vhost_rdma_qp *qp = (struct vhost_rdma_qp *)arg;
+	struct vhost_rdma_pkt_info pkt;
+	struct rte_mbuf *mbuf;
+	struct vhost_rdma_send_wqe *wqe;
+	enum vhost_rdma_hdr_mask mask;
+	struct vhost_rdma_send_wqe rollback_wqe;
+	struct vhost_rdma_queue *q = &qp->sq.queue;
+	uint32_t rollback_psn;
+	int payload;
+	int mtu;
+	int opcode;
+	int ret;
+
+	vhost_rdma_add_ref(qp);
+
+next_wqe:
+	if (unlikely(!qp->valid || qp->req.state == QP_STATE_ERROR))
+		goto exit;
+
+	/* A reset QP restarts from the consumer index with clean flags. */
+	if (unlikely(qp->req.state == QP_STATE_RESET)) {
+		qp->req.wqe_index = q->consumer_index;
+		qp->req.opcode = -1;
+		qp->req.need_rd_atomic = 0;
+		qp->req.wait_psn = 0;
+		qp->req.need_retry = 0;
+		goto exit;
+	}
+
+	if (unlikely(qp->req.need_retry)) {
+		vhost_rdma_req_retry(qp);
+		qp->req.need_retry = 0;
+	}
+
+	wqe = vhost_rdma_req_next_wqe(qp);
+	if (unlikely(!wqe))
+		goto exit;
+
+	assert(!(wqe->mask & WR_LOCAL_OP_MASK));
+
+	/* RC: do not run further ahead of the completer than the PSN window. */
+	if (unlikely(qp->type == VHOST_RDMA_IB_QPT_RC &&
+		     psn_compare(qp->req.psn, (qp->comp.psn +
+				 VHOST_MAX_UNACKED_PSNS)) > 0)) {
+		qp->req.wait_psn = 1;
+		goto exit;
+	}
+
+	/* Limit the number of inflight mbufs per QP */
+	if (unlikely(rte_atomic32_read(&qp->mbuf_out) >
+		     VHOST_INFLIGHT_SKBS_PER_QP_HIGH)) {
+		qp->need_req_mbuf = 1;
+		goto exit;
+	}
+
+	opcode = vhost_rdma_next_opcode(qp, wqe, wqe->wr->opcode);
+	if (unlikely(opcode < 0)) {
+		/*
+		 * NOTE(review): only the status is recorded here; the WQE is
+		 * not moved to WQE_STATE_ERROR as on the "err" path - confirm
+		 * this is intended.
+		 */
+		wqe->status = VHOST_RDMA_IB_WC_LOC_QP_OP_ERR;
+		goto exit;
+	}
+
+	mask = vhost_rdma_opcode[opcode].mask;
+	if (unlikely(mask & VHOST_READ_OR_ATOMIC)) {
+		if (check_init_depth(qp, wqe))
+			goto exit;
+	}
+
+	mtu = get_mtu(qp);
+	payload = (mask & VHOST_WRITE_OR_SEND) ? wqe->dma.resid : 0;
+
+	if (payload > mtu) {
+		if (qp->type == VHOST_RDMA_IB_QPT_UD) {
+			/* C10-93.1.1: If the total sum of all the buffer lengths specified for a
+			 * UD message exceeds the MTU of the port as returned by QueryHCA, the CI
+			 * shall not emit any packets for this message. Further, the CI shall not
+			 * generate an error due to this condition.
+			 */
+
+			/* fake a successful UD send */
+			wqe->first_psn = qp->req.psn;
+			wqe->last_psn = qp->req.psn;
+			qp->req.psn = (qp->req.psn + 1) & VHOST_RDMA_PSN_MASK;
+			qp->req.opcode = IB_OPCODE_UD_SEND_ONLY;
+			qp->req.wqe_index += 1;
+			wqe->state = WQE_STATE_DONE;
+			wqe->status = VHOST_RDMA_IB_WC_SUCCESS;
+			__vhost_rdma_do_task(&qp->comp.task);
+			vhost_rdma_drop_ref(qp, qp->dev, qp);
+			return 0;
+		}
+		payload = mtu;
+	}
+
+	mbuf = vhost_rdma_init_req_packet(qp, wqe, opcode, payload, &pkt);
+	if (unlikely(!mbuf)) {
+		RDMA_LOG_ERR_DP("qp#%d Failed allocating mbuf", qp->qpn);
+		wqe->status = VHOST_RDMA_IB_WC_LOC_QP_OP_ERR;
+		goto err;
+	}
+
+	ret = vhost_rdma_finish_packet(qp, wqe, &pkt, mbuf, payload);
+	if (unlikely(ret)) {
+		RDMA_LOG_DEBUG("qp#%d Error during finish packet", qp->qpn);
+		if (ret == -EFAULT)
+			wqe->status = VHOST_RDMA_IB_WC_LOC_PROT_ERR;
+		else
+			wqe->status = VHOST_RDMA_IB_WC_LOC_QP_OP_ERR;
+		rte_pktmbuf_free(mbuf);
+		goto err;
+	}
+
+	/*
+	 * To prevent a race on wqe access between requester and completer,
+	 * wqe members state and psn need to be set before calling
+	 * vhost_rdma_xmit_packet().
+	 * Otherwise, completer might initiate an unjustified retry flow.
+	 */
+	save_state(wqe, qp, &rollback_wqe, &rollback_psn);
+	update_wqe_state(qp, wqe, &pkt);
+	update_wqe_psn(qp, wqe, &pkt, payload);
+	ret = vhost_rdma_xmit_packet(qp, &pkt, mbuf);
+	if (ret) {
+		qp->need_req_mbuf = 1;
+
+		/* Transmission failed: undo the speculative WQE/PSN update. */
+		rollback_state(wqe, qp, &rollback_wqe, rollback_psn);
+
+		if (ret == -EAGAIN) {
+			vhost_rdma_run_task(&qp->req.task, 1);
+			goto exit;
+		}
+
+		wqe->status = VHOST_RDMA_IB_WC_LOC_QP_OP_ERR;
+		goto err;
+	}
+
+	update_state(qp, &pkt);
+
+	goto next_wqe;
+
+err:
+	wqe->state = WQE_STATE_ERROR;
+	__vhost_rdma_do_task(&qp->comp.task);
+
+exit:
+	vhost_rdma_drop_ref(qp, qp->dev, qp);
+	return -EAGAIN;
+}
+
int vhost_rdma_completer(void* arg)
{
//TODO: handle complete
diff --git a/examples/vhost_user_rdma/vhost_rdma_queue.h b/examples/vhost_user_rdma/vhost_rdma_queue.h
index 260eea51f8..fb5a90235f 100644
--- a/examples/vhost_user_rdma/vhost_rdma_queue.h
+++ b/examples/vhost_user_rdma/vhost_rdma_queue.h
@@ -19,6 +19,10 @@
#include <linux/types.h>
#include "vhost_rdma_ib.h"
+#include "vhost_rdma_pkt.h"
+#include "vhost_rdma_opcode.h"
+#include "vhost_rdma.h"
+#include "vhost_rdma_log.h"
#define QP_OPCODE_INVAILD (-1)
@@ -36,17 +40,15 @@ struct vhost_rdma_bth {
#define VHOST_RDMA_TVER (0)
#define VHOST_RDMA_DEF_PKEY (0xffff)
-#define VHOST_RDMA_SE_MASK (0x80)
-#define VHOST_RDMA_MIG_MASK (0x40)
-#define VHOST_RDMA_PAD_MASK (0x30)
-#define VHOST_RDMA_TVER_MASK (0x0f)
-#define VHOST_RDMA_FECN_MASK (0x80000000)
-#define VHOST_RDMA_BECN_MASK (0x40000000)
-#define VHOST_RDMA_RESV6A_MASK (0x3f000000)
-#define VHOST_RDMA_QPN_MASK (0x00ffffff)
-#define VHOST_RDMA_ACK_MASK (0x80000000)
-#define VHOST_RDMA_RESV7_MASK (0x7f000000)
-#define VHOST_RDMA_PSN_MASK (0x00ffffff)
+#define VHOST_MAX_UNACKED_PSNS 128
+#define VHOST_INFLIGHT_SKBS_PER_QP_HIGH 64
+#define VHOST_INFLIGHT_SKBS_PER_QP_LOW 16
+#define VHOST_MAX_PKT_PER_ACK 64
+
+#define VHOST_ICRC_SIZE (4)
+#define VHOST_MAX_HDR_LENGTH (80)
+
+#define IB_DEFAULT_PKEY_FULL 0xFFFF
/**
* @brief Operation codes for Work Completions (WC)
@@ -94,6 +96,16 @@ enum {
TASK_STATE_ARMED = 2,
};
+/* Direction of a copy between a caller-supplied buffer and memory-region memory. */
+enum vhost_rdma_mr_copy_dir {
+ VHOST_RDMA_TO_MR_OBJ, /* caller buffer is the source, region memory the destination */
+ VHOST_RDMA_FROM_MR_OBJ, /* region memory is the source, caller buffer the destination */
+};
+
+/* Which key of a memory region lookup_mr() compares against the supplied key. */
+enum vhost_rdma_mr_lookup_type {
+ VHOST_LOOKUP_LOCAL, /* compare with the region's lkey */
+ VHOST_LOOKUP_REMOTE, /* compare with the region's rkey */
+};
+
/**
* @brief Send Queue Work Request (WR) structure from userspace
*
@@ -208,10 +220,129 @@ vhost_rdma_queue_get_data(struct vhost_rdma_queue *queue, size_t idx)
return queue->data + queue->elem_size * idx;
}
+/*
+ * Return the queue element for ring position @index: the index is masked
+ * with (num_elems - 1), the descriptor index is read from that slot of the
+ * avail ring, and the matching element of the queue data is returned.
+ */
+static __rte_always_inline void*
+addr_from_index(struct vhost_rdma_queue *q, unsigned int index)
+{
+	const uint16_t slot = index & (q->num_elems - 1);
+	const uint16_t desc = q->vq->vring.avail->ring[slot];
+
+	return vhost_rdma_queue_get_data(q, desc);
+}
+
+/* True when producer and consumer index are equal in their low 16 bits. */
+static __rte_always_inline bool queue_empty(struct vhost_rdma_queue *q)
+{
+	const uint16_t head = q->consumer_index;
+	const uint16_t tail = q->producer_index;
+
+	if (tail != head)
+		return false;
+
+	return true;
+}
+
+/*
+ * Return the queue element at the consumer index: the index is masked with
+ * (num_elems - 1) and the descriptor index found in that avail-ring slot
+ * selects the element.
+ */
+static __rte_always_inline void*
+consumer_addr(struct vhost_rdma_queue *q)
+{
+	const uint16_t slot = q->consumer_index & (q->num_elems - 1);
+	const uint16_t desc = q->vq->vring.avail->ring[slot];
+
+	return vhost_rdma_queue_get_data(q, desc);
+}
+
+/* Element at the consumer index, or NULL when the queue is empty. */
+static __rte_always_inline void*
+queue_head(struct vhost_rdma_queue *q)
+{
+	if (queue_empty(q))
+		return NULL;
+
+	return consumer_addr(q);
+}
+
+/*
+ * Compare two 24-bit packet sequence numbers with wrap-around.
+ *
+ * The difference is shifted left by eight bits so that bit 23 of the PSN
+ * difference becomes the sign bit of a 32-bit integer. The result is
+ * negative, zero or positive when psn_a is behind, equal to or ahead of
+ * psn_b.
+ */
+static inline int psn_compare(uint32_t psn_a, uint32_t psn_b)
+{
+	const uint32_t shifted = (psn_a - psn_b) << 8;
+	const int32_t ordering = shifted;
+
+	return ordering;
+}
+
+/*
+ * Reserve one read/atomic credit for @wqe.
+ *
+ * A WQE that already holds a credit passes immediately. Otherwise the
+ * credit counter is decremented; if it went negative the decrement is
+ * undone, qp->req.need_rd_atomic stays set and -EAGAIN is returned.
+ *
+ * @return 0 when the WQE holds a credit, -EAGAIN when none is available.
+ */
+static __rte_always_inline int
+check_init_depth(struct vhost_rdma_qp *qp, struct vhost_rdma_send_wqe *wqe)
+{
+	if (wqe->has_rd_atomic)
+		return 0;
+
+	qp->req.need_rd_atomic = 1;
+
+	if (rte_atomic32_sub_return(&qp->req.rd_atomic, 1) < 0) {
+		/* No credit left: give back what was just taken. */
+		rte_atomic32_inc(&qp->req.rd_atomic);
+		return -EAGAIN;
+	}
+
+	qp->req.need_rd_atomic = 0;
+	wqe->has_rd_atomic = 1;
+	return 0;
+}
+
+/* MTU to use for a QP: the QP's own MTU for RC/UC, the device's mtu_cap otherwise. */
+static __rte_always_inline int
+get_mtu(struct vhost_rdma_qp *qp)
+{
+	if (qp->type != VHOST_RDMA_IB_QPT_RC &&
+	    qp->type != VHOST_RDMA_IB_QPT_UC)
+		return qp->dev->mtu_cap;
+
+	return qp->mtu;
+}
+
+/*
+ * Write the Base Transport Header at pkt->hdr.
+ *
+ * The flags byte combines the pad count (shifted into VHOST_RDMA_PAD_MASK)
+ * with the solicited-event and migration bits. P_Key, QPN and the combined
+ * ack-request/PSN word are stored in big-endian byte order.
+ */
+static inline void bth_init(struct vhost_rdma_pkt_info *pkt, uint8_t opcode, int se,
+		int mig, int pad, uint16_t pkey, uint32_t qpn, int ack_req,
+		uint32_t psn)
+{
+	struct vhost_bth *bth = (struct vhost_bth *)(pkt->hdr);
+	unsigned int flags = (pad << 4) & VHOST_RDMA_PAD_MASK;
+	uint32_t apsn = psn & VHOST_RDMA_PSN_MASK;
+
+	if (se)
+		flags |= VHOST_RDMA_SE_MASK;
+	if (mig)
+		flags |= VHOST_RDMA_MIG_MASK;
+	if (ack_req)
+		apsn |= VHOST_RDMA_ACK_MASK;
+
+	bth->opcode = opcode;
+	bth->flags = flags;
+	bth->pkey = rte_cpu_to_be_16(pkey);
+	bth->qpn = rte_cpu_to_be_32(qpn & VHOST_RDMA_QPN_MASK);
+	bth->apsn = rte_cpu_to_be_32(apsn);
+}
+
+/* Length recorded in vhost_rdma_opcode[] for the packet's opcode. */
+static inline size_t header_size(struct vhost_rdma_pkt_info *pkt)
+{
+	const size_t len = vhost_rdma_opcode[pkt->opcode].length;
+
+	return len;
+}
+
+/* Address of the payload: pkt->hdr plus the opcode's payload offset. */
+static inline void *payload_addr(struct vhost_rdma_pkt_info *pkt)
+{
+	uint8_t *start = pkt->hdr;
+
+	start += vhost_rdma_opcode[pkt->opcode].offset[VHOST_RDMA_PAYLOAD];
+	return start;
+}
+
+/*
+ * Payload bytes in the packet: pkt->paylen minus the opcode's payload
+ * offset, the BTH pad count and the ICRC.
+ */
+static inline size_t payload_size(struct vhost_rdma_pkt_info *pkt)
+{
+	return pkt->paylen
+		- vhost_rdma_opcode[pkt->opcode].offset[VHOST_RDMA_PAYLOAD]
+		- bth_pad(pkt)
+		- VHOST_ICRC_SIZE;
+}
+
/*
* Function declarations
*/
+int vhost_rdma_advance_dma_data(struct vhost_rdma_dma_info *dma, unsigned int length);
+
/**
* @brief Initialize an internal Send WQE from a user WR
*
@@ -335,4 +466,72 @@ void vhost_rdma_qp_destroy(struct vhost_rdma_qp *qp);
int vhost_rdma_av_chk_attr(struct vhost_rdma_device *dev,
struct vhost_rdma_ah_attr *attr);
+/* Address vector for a packet: the QP's for RC/UC, otherwise the send WQE's; NULL if unavailable. */
+struct vhost_rdma_av *vhost_rdma_get_av(struct vhost_rdma_pkt_info *pkt);
+/* Allocate an mbuf with room for paylen bytes and bind it to pkt; NULL on failure. */
+struct rte_mbuf* vhost_rdma_init_packet(struct vhost_rdma_device *dev,
+ struct vhost_rdma_av *av,
+ int paylen,
+ struct vhost_rdma_pkt_info *pkt);
+
+/* Called by vhost_rdma_xmit_packet(); a non-zero return is counted as a send error. */
+int vhost_rdma_send(struct vhost_rdma_pkt_info *pkt, struct rte_mbuf *mbuf);
+
+/* Copy length bytes between addr and the SGE list of dma, in direction dir; 0 or a negative errno. */
+int copy_data(struct vhost_rdma_pd *pd, int access,
+ struct vhost_rdma_dma_info *dma,
+ void *addr, int length,
+ enum vhost_rdma_mr_copy_dir dir, uint32_t *crcp);
+
+/* Find the memory region for key (index is key >> 8) and take a reference; NULL if not acceptable. */
+struct vhost_rdma_mr* lookup_mr(struct vhost_rdma_pd *pd,
+ int access,
+ uint32_t key,
+ enum vhost_rdma_mr_lookup_type type);
+
+/* 0 when [iova, iova + length) is acceptable for mr, -EFAULT otherwise. */
+int mr_check_range(struct vhost_rdma_mr *mr,
+ uint64_t iova,
+ size_t length);
+
+/* Copy length bytes between addr and the memory behind mr at iova; optionally updates *crcp. */
+int vhost_rdma_mr_copy(struct rte_vhost_memory *mem,
+ struct vhost_rdma_mr *mr,
+ uint64_t iova,
+ void *addr,
+ uint64_t length,
+ enum vhost_rdma_mr_copy_dir dir,
+ uint32_t *crcp);
+
+/* rte_timer callback; arg is the QP whose completer is flagged with a timeout and scheduled. */
+void retransmit_timer(__rte_unused struct rte_timer *timer, void* arg);
+
+/*
+ * Transmit one packet on behalf of @qp.
+ *
+ * The packet is silently dropped (mbuf freed, 0 returned) when the
+ * requester side (for request packets) or the responder side (for all
+ * others) is not in QP_STATE_READY. A failing vhost_rdma_send() bumps the
+ * send-error counter and its error is returned. On success, the last
+ * packet of a message on a non-RC QP marks the WQE done and schedules the
+ * completer, and the sent-packets counter is incremented.
+ *
+ * @return 0 on success or drop, otherwise the error from vhost_rdma_send().
+ */
+static __rte_always_inline int
+vhost_rdma_xmit_packet(struct vhost_rdma_qp *qp,
+		struct vhost_rdma_pkt_info *pkt, struct rte_mbuf *m)
+{
+	struct vhost_rdma_device *dev = qp->dev;
+	int is_request = pkt->mask & VHOST_REQ_MASK;
+	int ready;
+	int err;
+
+	if (is_request)
+		ready = (qp->req.state == QP_STATE_READY);
+	else
+		ready = (qp->resp.state == QP_STATE_READY);
+
+	if (!ready) {
+		RDMA_LOG_ERR("Packet dropped. QP is not in ready state\n");
+		rte_pktmbuf_free(m);
+		return 0;
+	}
+
+	err = vhost_rdma_send(pkt, m);
+	if (err) {
+		vhost_rdma_counter_inc(dev, VHOST_RDMA_CNT_SEND_ERR);
+		return err;
+	}
+
+	/* Only RC waits for an acknowledgement before the WQE is done. */
+	if ((pkt->mask & VHOST_END_MASK) &&
+	    (qp->type != VHOST_RDMA_IB_QPT_RC)) {
+		pkt->wqe->state = WQE_STATE_DONE;
+		vhost_rdma_run_task(&qp->comp.task, 1);
+	}
+
+	vhost_rdma_counter_inc(dev, VHOST_RDMA_CNT_SENT_PKTS);
+	return 0;
+}
+
#endif /* VHOST_RDMA_QUEUE_H_ */
\ No newline at end of file
--
2.43.0
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCH 08/14] examples/vhost_user_rdma: implement advanced completer engine with reliability features
2025-12-17 8:49 Implement initial driver for virtio-RDMA devices(kernel), virtio-rdma device model(qemu) and vhost-user-RDMA backend device(dpdk) Xiong Weimin
` (7 preceding siblings ...)
2025-12-17 8:49 ` [PATCH 07/14] examples/vhost_user_rdma: Implement high-performance requester engine with advanced flow control Xiong Weimin
@ 2025-12-17 8:49 ` Xiong Weimin
2025-12-17 8:49 ` [PATCH 09/14] examples/vhost_user_rdma: implement P_Key query operation with default partition key Xiong Weimin
2025-12-18 16:20 ` Implement initial driver for virtio-RDMA devices(kernel), virtio-rdma device model(qemu) and vhost-user-RDMA backend device(dpdk) Leon Romanovsky
10 siblings, 0 replies; 17+ messages in thread
From: Xiong Weimin @ 2025-12-17 8:49 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann, David S . Miller,
Jakub Kicinski, Jesper Dangaard Brouer, John Fastabend,
Stanislav Fomichev
Cc: linux-kernel, netdev, xiongweimin
[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset=y, Size: 48005 bytes --]
From: xiongweimin <xiongweimin@kylinos.cn>
This commit adds the completer engine for RDMA operations with:
1. State machine for ACK packet processing
2. PSN-based sequence validation
3. Reliability mechanisms (retry, RNR backoff)
4. Atomic operation execution
5. Comprehensive error handling
6. Performance counters for diagnostics
Key features:
- 11-state processing pipeline for response handling
- Dynamic retransmission timer management
- RNR NAK timer for flow control
- Packet lifetime tracking (mbuf release)
- Work completion error propagation
- Congestion-aware task scheduling
Signed-off-by: Xiong Weimin <xiongweimin@kylinos.cn>
Change-Id: I12a7baf03edffcd66da7bdc84218001c6bf3a0de
---
examples/vhost_user_rdma/meson.build | 1 +
.../vhost_user_rdma/vhost_rdma_complete.c | 850 ++++++++++++++++++
examples/vhost_user_rdma/vhost_rdma_opcode.h | 437 +++++----
examples/vhost_user_rdma/vhost_rdma_queue.c | 6 -
examples/vhost_user_rdma/vhost_rdma_queue.h | 5 +
5 files changed, 1096 insertions(+), 203 deletions(-)
create mode 100644 examples/vhost_user_rdma/vhost_rdma_complete.c
diff --git a/examples/vhost_user_rdma/meson.build b/examples/vhost_user_rdma/meson.build
index 2a0a6ffc15..4948f709d9 100644
--- a/examples/vhost_user_rdma/meson.build
+++ b/examples/vhost_user_rdma/meson.build
@@ -45,5 +45,6 @@ sources = files(
'vhost_rdma_opcode.c',
'vhost_rdma_pkt.c',
'vhost_rdma_crc.c',
+ 'vhost_rdma_complete.c',
)
diff --git a/examples/vhost_user_rdma/vhost_rdma_complete.c b/examples/vhost_user_rdma/vhost_rdma_complete.c
new file mode 100644
index 0000000000..623b8dd2a0
--- /dev/null
+++ b/examples/vhost_user_rdma/vhost_rdma_complete.c
@@ -0,0 +1,850 @@
+/*
+ * Vhost-user RDMA device: Completion Queue Handler (Completer)
+ *
+ * This module handles the completion of Send Queue Work Queue Entries (WQEs)
+ * based on incoming response packets such as ACKs, Read Responses, or NAKs.
+ * It ensures reliable delivery for RC QPs by checking PSN, handling retries,
+ * and posting completions to the CQ.
+ *
+ * Copyright (C) 2025 KylinSoft Inc. and/or its affiliates. All rights reserved.
+ *
+ * Author: Xiong Weimin <xiongweimin@kylinos.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <rte_mbuf.h>
+#include <rte_timer.h>
+#include <rte_atomic.h>
+#include <rte_log.h>
+
+#include "vhost_rdma_opcode.h"
+#include "vhost_rdma_ib.h"
+#include "vhost_rdma_queue.h"
+#include "vhost_rdma.h"
+#include "vhost_rdma_pkt.h"
+
+/**
+ * enum comp_state - State machine for RDMA completer
+ *
+ * The completer processes incoming responses using a state machine to handle:
+ * - Packet validation (PSN, opcode)
+ * - Retry logic (timeout, RNR NAK)
+ * - Data operations (READ, ATOMIC)
+ * - Completion generation
+ *
+ * comp_state_name[] is indexed by these values; add a name there whenever a
+ * state is added here.
+ */
+enum comp_state {
+ VHOST_RDMA_COMPST_GET_ACK, /* dequeue one response mbuf from qp->resp_pkts, if any */
+ VHOST_RDMA_COMPST_GET_WQE, /* inspect the WQE at the head of the send queue */
+ VHOST_RDMA_COMPST_COMP_WQE, /* complete the head WQE, then go back to GET_WQE */
+ VHOST_RDMA_COMPST_COMP_ACK, /* complete the head WQE in response to the current packet */
+ VHOST_RDMA_COMPST_CHECK_PSN, /* compare packet PSN with the WQE and the expected PSN */
+ VHOST_RDMA_COMPST_CHECK_ACK, /* validate response opcode and AETH syndrome */
+ VHOST_RDMA_COMPST_READ, /* copy read-response payload into the WQE buffer */
+ /* NOTE(review): no transition into ATOMIC is visible in this file */
+ VHOST_RDMA_COMPST_ATOMIC, /* copy the atomic ack's original value into the WQE buffer */
+ VHOST_RDMA_COMPST_WRITE_SEND, /* plain ACK received */
+ VHOST_RDMA_COMPST_UPDATE_COMP, /* record packet opcode/PSN as completer progress */
+ VHOST_RDMA_COMPST_ERROR_RETRY, /* request a retransmission while retry_cnt allows */
+ VHOST_RDMA_COMPST_RNR_RETRY, /* arm the RNR NAK timer while rnr_retry allows */
+ VHOST_RDMA_COMPST_ERROR, /* complete the WQE with its error status, fail the QP */
+ VHOST_RDMA_COMPST_EXIT, /* nothing to process; may re-arm the retransmit timer */
+ VHOST_RDMA_COMPST_DONE, /* finished; leave the state machine loop */
+};
+
+/*
+ * Human-readable state names for debugging, indexed by enum comp_state.
+ * Both the strings and the table itself are read-only.
+ */
+static const char * const comp_state_name[] = {
+    [VHOST_RDMA_COMPST_GET_ACK]     = "GET ACK",
+    [VHOST_RDMA_COMPST_GET_WQE]     = "GET WQE",
+    [VHOST_RDMA_COMPST_COMP_WQE]    = "COMP WQE",
+    [VHOST_RDMA_COMPST_COMP_ACK]    = "COMP ACK",
+    [VHOST_RDMA_COMPST_CHECK_PSN]   = "CHECK PSN",
+    [VHOST_RDMA_COMPST_CHECK_ACK]   = "CHECK ACK",
+    [VHOST_RDMA_COMPST_READ]        = "READ",
+    [VHOST_RDMA_COMPST_ATOMIC]      = "ATOMIC",
+    [VHOST_RDMA_COMPST_WRITE_SEND]  = "WRITE/SEND",
+    [VHOST_RDMA_COMPST_UPDATE_COMP] = "UPDATE COMP",
+    [VHOST_RDMA_COMPST_ERROR_RETRY] = "ERROR RETRY",
+    [VHOST_RDMA_COMPST_RNR_RETRY]   = "RNR RETRY",
+    [VHOST_RDMA_COMPST_ERROR]       = "ERROR",
+    [VHOST_RDMA_COMPST_EXIT]        = "EXIT",
+    [VHOST_RDMA_COMPST_DONE]        = "DONE",
+};
+
+/**
+ * enum ib_rnr_timeout - RNR NAK timer codes
+ *
+ * Each enumerator's value is a 5-bit timer code (0..31); the completer takes
+ * it from the non-type bits of the AETH syndrome of an RNR NAK. The name
+ * suffix spells the delay in milliseconds, e.g. IB_RNR_TIMER_000_01 is
+ * 0.01 ms and IB_RNR_TIMER_655_36 is 655.36 ms. Note that code 0 is the
+ * longest delay; codes 1..31 grow monotonically.
+ *
+ * The matching delays in microseconds are stored in rnrnak_usec[].
+ */
+enum ib_rnr_timeout {
+ IB_RNR_TIMER_655_36 = 0,
+ IB_RNR_TIMER_000_01 = 1,
+ IB_RNR_TIMER_000_02 = 2,
+ IB_RNR_TIMER_000_03 = 3,
+ IB_RNR_TIMER_000_04 = 4,
+ IB_RNR_TIMER_000_06 = 5,
+ IB_RNR_TIMER_000_08 = 6,
+ IB_RNR_TIMER_000_12 = 7,
+ IB_RNR_TIMER_000_16 = 8,
+ IB_RNR_TIMER_000_24 = 9,
+ IB_RNR_TIMER_000_32 = 10,
+ IB_RNR_TIMER_000_48 = 11,
+ IB_RNR_TIMER_000_64 = 12,
+ IB_RNR_TIMER_000_96 = 13,
+ IB_RNR_TIMER_001_28 = 14,
+ IB_RNR_TIMER_001_92 = 15,
+ IB_RNR_TIMER_002_56 = 16,
+ IB_RNR_TIMER_003_84 = 17,
+ IB_RNR_TIMER_005_12 = 18,
+ IB_RNR_TIMER_007_68 = 19,
+ IB_RNR_TIMER_010_24 = 20,
+ IB_RNR_TIMER_015_36 = 21,
+ IB_RNR_TIMER_020_48 = 22,
+ IB_RNR_TIMER_030_72 = 23,
+ IB_RNR_TIMER_040_96 = 24,
+ IB_RNR_TIMER_061_44 = 25,
+ IB_RNR_TIMER_081_92 = 26,
+ IB_RNR_TIMER_122_88 = 27,
+ IB_RNR_TIMER_163_84 = 28,
+ IB_RNR_TIMER_245_76 = 29,
+ IB_RNR_TIMER_327_68 = 30,
+ IB_RNR_TIMER_491_52 = 31
+};
+
+/**
+ * rnrnak_usec - Microsecond delay lookup for RNR timeout codes
+ *
+ * Indexed by enum ib_rnr_timeout. Used to schedule RNR retry timers.
+ * Each entry is the delay named by the enumerator (in ms) times 1000.
+ */
+static const unsigned long rnrnak_usec[32] = {
+    [IB_RNR_TIMER_655_36] = 655360,
+    [IB_RNR_TIMER_000_01] = 10,
+    [IB_RNR_TIMER_000_02] = 20,
+    [IB_RNR_TIMER_000_03] = 30,
+    [IB_RNR_TIMER_000_04] = 40,
+    [IB_RNR_TIMER_000_06] = 60,
+    [IB_RNR_TIMER_000_08] = 80,
+    [IB_RNR_TIMER_000_12] = 120,
+    [IB_RNR_TIMER_000_16] = 160,
+    [IB_RNR_TIMER_000_24] = 240,
+    [IB_RNR_TIMER_000_32] = 320,
+    [IB_RNR_TIMER_000_48] = 480,
+    [IB_RNR_TIMER_000_64] = 640,
+    [IB_RNR_TIMER_000_96] = 960,
+    [IB_RNR_TIMER_001_28] = 1280,
+    [IB_RNR_TIMER_001_92] = 1920,
+    [IB_RNR_TIMER_002_56] = 2560,
+    [IB_RNR_TIMER_003_84] = 3840,
+    [IB_RNR_TIMER_005_12] = 5120,
+    [IB_RNR_TIMER_007_68] = 7680,
+    [IB_RNR_TIMER_010_24] = 10240,
+    [IB_RNR_TIMER_015_36] = 15360,
+    [IB_RNR_TIMER_020_48] = 20480,
+    [IB_RNR_TIMER_030_72] = 30720,
+    [IB_RNR_TIMER_040_96] = 40960,
+    [IB_RNR_TIMER_061_44] = 61440,  /* 61.44 ms; was mistyped as 61410 */
+    [IB_RNR_TIMER_081_92] = 81920,
+    [IB_RNR_TIMER_122_88] = 122880,
+    [IB_RNR_TIMER_163_84] = 163840,
+    [IB_RNR_TIMER_245_76] = 245760,
+    [IB_RNR_TIMER_327_68] = 327680,
+    [IB_RNR_TIMER_491_52] = 491520,
+};
+
+/**
+ * vhost_rdma_get_wqe - Look up the WQE at the head of the send queue
+ * @qp: Queue pair
+ * @pkt: Response packet currently being processed (may be NULL)
+ * @wqe_p: Receives the head WQE, or NULL when the queue is empty
+ *
+ * Picks the next completer state from the head WQE's state and from whether
+ * a response packet is in hand.
+ */
+static __rte_always_inline enum comp_state
+vhost_rdma_get_wqe(struct vhost_rdma_qp *qp, struct vhost_rdma_pkt_info *pkt,
+                   struct vhost_rdma_send_wqe **wqe_p)
+{
+    struct vhost_rdma_send_wqe *head = queue_head(&qp->sq.queue);
+
+    *wqe_p = head;
+
+    if (head != NULL && head->state != WQE_STATE_POSTED) {
+        /* Finished locally: only the completion is outstanding */
+        if (head->state == WQE_STATE_DONE)
+            return VHOST_RDMA_COMPST_COMP_WQE;
+
+        /* Marked as failed earlier */
+        if (head->state == WQE_STATE_ERROR)
+            return VHOST_RDMA_COMPST_ERROR;
+
+        /* In flight: a response packet, if any, needs its PSN checked */
+        return pkt ? VHOST_RDMA_COMPST_CHECK_PSN : VHOST_RDMA_COMPST_EXIT;
+    }
+
+    /* Queue empty, or the requester has not picked this WQE up yet */
+    return pkt ? VHOST_RDMA_COMPST_DONE : VHOST_RDMA_COMPST_EXIT;
+}
+
+/**
+ * reset_retry_counters - Reload the completer's retry budget
+ * @qp: Queue pair; budgets are copied from qp->attr into qp->comp
+ *
+ * Also clears the "retry already started" marker.
+ */
+static __rte_always_inline void
+reset_retry_counters(struct vhost_rdma_qp *qp)
+{
+    qp->comp.started_retry = 0;
+    qp->comp.rnr_retry = qp->attr.rnr_retry;
+    qp->comp.retry_cnt = qp->attr.retry_cnt;
+}
+
+/**
+ * vhost_rdma_check_psn - Classify a response by its packet sequence number
+ * @qp: Queue pair
+ * @pkt: Response packet
+ * @wqe: Head WQE of the send queue
+ *
+ * Compares the packet PSN first with the last PSN of @wqe and then with the
+ * PSN the completer expects next, and returns the resulting state.
+ */
+static __rte_always_inline enum comp_state
+vhost_rdma_check_psn(struct vhost_rdma_qp *qp,
+                     struct vhost_rdma_pkt_info *pkt,
+                     struct vhost_rdma_send_wqe *wqe)
+{
+    int32_t delta;
+
+    /* Response lies beyond the last packet of the current WQE */
+    if (psn_compare(pkt->psn, wqe->last_psn) > 0) {
+        if (wqe->state != WQE_STATE_PENDING)
+            return VHOST_RDMA_COMPST_DONE;
+
+        /* A read/atomic cannot be acked implicitly: retry it */
+        if (wqe->mask & WR_ATOMIC_OR_READ_MASK)
+            return VHOST_RDMA_COMPST_ERROR_RETRY;
+
+        /* Later ack covers this WQE: fresh retry budget, complete it */
+        reset_retry_counters(qp);
+        return VHOST_RDMA_COMPST_COMP_WQE;
+    }
+
+    /* Position relative to the PSN the completer expects next */
+    delta = psn_compare(pkt->psn, qp->comp.psn);
+
+    if (delta < 0) {
+        /* Older packet: only useful if it closes the current WQE */
+        return (pkt->psn == wqe->last_psn) ?
+            VHOST_RDMA_COMPST_COMP_ACK : VHOST_RDMA_COMPST_DONE;
+    }
+
+    /* Read/atomic response ahead of the expected PSN is ignored */
+    if (delta > 0 && (wqe->mask & WR_ATOMIC_OR_READ_MASK))
+        return VHOST_RDMA_COMPST_DONE;
+
+    return VHOST_RDMA_COMPST_CHECK_ACK;
+}
+
+/**
+ * vhost_rdma_check_ack - Validate response opcode and AETH status
+ * @qp: Queue pair
+ * @pkt: Incoming packet
+ * @wqe: Associated WQE
+ *
+ * First checks that the packet is an acceptable successor of the opcode
+ * recorded in qp->comp.opcode (-1 means the start of a message is expected),
+ * then dispatches on the packet opcode and its AETH syndrome.
+ *
+ * Returns the next completer state. For NAKs that map to a completion
+ * status, wqe->status is set before VHOST_RDMA_COMPST_ERROR is returned.
+ *
+ * NOTE(review): no case here handles an atomic acknowledge opcode, so
+ * VHOST_RDMA_COMPST_ATOMIC is never returned and such packets end up in the
+ * "Unexpected opcode" branch - confirm whether that is intended.
+ */
+static __rte_always_inline enum comp_state
+vhost_rdma_check_ack(struct vhost_rdma_qp *qp,
+                     struct vhost_rdma_pkt_info *pkt,
+                     struct vhost_rdma_send_wqe *wqe)
+{
+    struct vhost_rdma_device *dev = qp->dev;
+    unsigned int mask = pkt->mask;
+    uint8_t syn;
+
+    /* Handle initial opcode expectations */
+    switch (qp->comp.opcode) {
+    case -1:
+        /* Expecting start of message */
+        if (!(mask & VHOST_START_MASK))
+            return VHOST_RDMA_COMPST_ERROR;
+        break;
+
+    case IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST:
+    case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE:
+        if (pkt->opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE &&
+            pkt->opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST) {
+            /* Allow retry from first or only segment */
+            if ((pkt->psn == wqe->first_psn &&
+                 pkt->opcode == IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) ||
+                (wqe->first_psn == wqe->last_psn &&
+                 pkt->opcode == IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY))
+                break;
+
+            return VHOST_RDMA_COMPST_ERROR;
+        }
+        break;
+    default:
+        RDMA_LOG_ERR("Invalid comp opcode state: %d", qp->comp.opcode);
+        return VHOST_RDMA_COMPST_ERROR;
+    }
+
+    /* Parse AETH syndrome for ACK/NAK types */
+    syn = aeth_syn(pkt);
+
+    switch (pkt->opcode) {
+    case IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST:
+    case IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST:
+    case IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY:
+        if ((syn & AETH_TYPE_MASK) != AETH_ACK)
+            return VHOST_RDMA_COMPST_ERROR;
+        /* Fall through */
+    case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE:
+        /* Read responses are only valid for an RDMA READ request */
+        if (wqe->wr->opcode != VHOST_RDMA_IB_WR_RDMA_READ) {
+            wqe->status = VHOST_RDMA_IB_WC_FATAL_ERR;
+            return VHOST_RDMA_COMPST_ERROR;
+        }
+        reset_retry_counters(qp);
+        return VHOST_RDMA_COMPST_READ;
+
+    case IB_OPCODE_RC_ACKNOWLEDGE:
+        switch (syn & AETH_TYPE_MASK) {
+        case AETH_ACK:
+            reset_retry_counters(qp);
+            return VHOST_RDMA_COMPST_WRITE_SEND;
+
+        case AETH_RNR_NAK:
+            vhost_rdma_counter_inc(dev, VHOST_RDMA_CNT_RCV_RNR);
+            return VHOST_RDMA_COMPST_RNR_RETRY;
+
+        case AETH_NAK:
+            switch (syn) {
+            case AETH_NAK_PSN_SEQ_ERROR:
+                /*
+                 * A declaration may not directly follow a case label
+                 * before C23, so the comparison is done inline.
+                 * A NAK ahead of the expected PSN moves the completer
+                 * forward and wakes a requester waiting on PSN.
+                 */
+                if (psn_compare(pkt->psn, qp->comp.psn) > 0) {
+                    vhost_rdma_counter_inc(dev, VHOST_RDMA_CNT_RCV_SEQ_ERR);
+                    qp->comp.psn = pkt->psn;
+                    if (qp->req.wait_psn) {
+                        qp->req.wait_psn = 0;
+                        vhost_rdma_run_task(&qp->req.task, 0);
+                    }
+                }
+                return VHOST_RDMA_COMPST_ERROR_RETRY;
+
+            case AETH_NAK_INVALID_REQ:
+                wqe->status = VHOST_RDMA_IB_WC_REM_INV_REQ_ERR;
+                return VHOST_RDMA_COMPST_ERROR;
+
+            case AETH_NAK_REM_ACC_ERR:
+                wqe->status = VHOST_RDMA_IB_WC_REM_ACCESS_ERR;
+                return VHOST_RDMA_COMPST_ERROR;
+
+            case AETH_NAK_REM_OP_ERR:
+                wqe->status = VHOST_RDMA_IB_WC_REM_OP_ERR;
+                return VHOST_RDMA_COMPST_ERROR;
+
+            default:
+                RDMA_LOG_ERR("Unexpected NAK type: 0x%x", syn);
+                wqe->status = VHOST_RDMA_IB_WC_REM_OP_ERR;
+                return VHOST_RDMA_COMPST_ERROR;
+            }
+
+        default:
+            RDMA_LOG_ERR("Unknown AETH type: 0x%x", syn);
+            return VHOST_RDMA_COMPST_ERROR;
+        }
+        break;
+
+    default:
+        RDMA_LOG_ERR("Unexpected opcode: %u", pkt->opcode);
+        return VHOST_RDMA_COMPST_ERROR;
+    }
+}
+
+/**
+ * vhost_rdma_do_read - Store the payload of a read response
+ * @qp: Queue pair
+ * @pkt: Read response packet
+ * @wqe: WQE whose DMA state describes the destination
+ *
+ * Copies the packet payload through copy_data(). A copy failure sets
+ * wqe->status to VHOST_RDMA_IB_WC_LOC_PROT_ERR.
+ */
+static __rte_always_inline enum comp_state
+vhost_rdma_do_read(struct vhost_rdma_qp *qp,
+                   struct vhost_rdma_pkt_info *pkt,
+                   struct vhost_rdma_send_wqe *wqe)
+{
+    if (copy_data(qp->pd, VHOST_RDMA_IB_ACCESS_LOCAL_WRITE,
+                  &wqe->dma, payload_addr(pkt),
+                  payload_size(pkt), VHOST_RDMA_TO_MR_OBJ, NULL) != 0) {
+        wqe->status = VHOST_RDMA_IB_WC_LOC_PROT_ERR;
+        return VHOST_RDMA_COMPST_ERROR;
+    }
+
+    /* Last packet of the message and nothing left to receive */
+    if ((pkt->mask & VHOST_END_MASK) && wqe->dma.resid == 0)
+        return VHOST_RDMA_COMPST_COMP_ACK;
+
+    /* More data expected: just record progress */
+    return VHOST_RDMA_COMPST_UPDATE_COMP;
+}
+
+/**
+ * vhost_rdma_do_atomic - Store the original value carried by an atomic ack
+ * @qp: Queue pair
+ * @pkt: Atomic ACK packet
+ * @wqe: WQE whose DMA state describes the destination
+ *
+ * Copies the 64-bit value returned by atmack_orig() through copy_data().
+ * A copy failure sets wqe->status to VHOST_RDMA_IB_WC_LOC_PROT_ERR.
+ */
+static __rte_always_inline enum comp_state
+vhost_rdma_do_atomic(struct vhost_rdma_qp *qp,
+                     struct vhost_rdma_pkt_info *pkt,
+                     struct vhost_rdma_send_wqe *wqe)
+{
+    uint64_t orig_val = atmack_orig(pkt);
+
+    if (copy_data(qp->pd, VHOST_RDMA_IB_ACCESS_LOCAL_WRITE,
+                  &wqe->dma, &orig_val,
+                  sizeof(uint64_t), VHOST_RDMA_TO_MR_OBJ, NULL) != 0) {
+        wqe->status = VHOST_RDMA_IB_WC_LOC_PROT_ERR;
+        return VHOST_RDMA_COMPST_ERROR;
+    }
+
+    return VHOST_RDMA_COMPST_COMP_ACK;
+}
+
+/**
+ * wr_to_wc_opcode - Map a work request opcode to a work completion opcode
+ * @opcode: WR opcode
+ *
+ * Write, send and read requests (with or without immediate data) map to
+ * their WC counterpart; every other opcode yields 0xff.
+ */
+static enum vhost_rdma_ib_wc_opcode
+wr_to_wc_opcode(enum vhost_rdma_ib_wr_opcode opcode)
+{
+    enum vhost_rdma_ib_wc_opcode wc = 0xff;
+
+    switch (opcode) {
+    case VHOST_RDMA_IB_WR_RDMA_WRITE:
+    case VHOST_RDMA_IB_WR_RDMA_WRITE_WITH_IMM:
+        wc = VHOST_RDMA_IB_WC_RDMA_WRITE;
+        break;
+    case VHOST_RDMA_IB_WR_SEND:
+    case VHOST_RDMA_IB_WR_SEND_WITH_IMM:
+        wc = VHOST_RDMA_IB_WC_SEND;
+        break;
+    case VHOST_RDMA_IB_WR_RDMA_READ:
+        wc = VHOST_RDMA_IB_WC_RDMA_READ;
+        break;
+    default:
+        /* keep the 0xff "unknown" marker */
+        break;
+    }
+
+    return wc;
+}
+
+/**
+ * make_send_cqe - Fill a completion entry from a send WQE
+ * @qp: Queue pair the WQE belongs to
+ * @wqe: WQE being completed
+ * @cqe: Completion entry to fill; it is zeroed first
+ */
+static void
+make_send_cqe(struct vhost_rdma_qp *qp,
+              struct vhost_rdma_send_wqe *wqe,
+              struct vhost_rdma_cq_req *cqe)
+{
+    memset(cqe, 0, sizeof(*cqe));
+
+    /* Identity of the completed request */
+    cqe->qp_num = qp->qpn;
+    cqe->wr_id = wqe->wr->wr_id;
+
+    /* Outcome */
+    cqe->opcode = wr_to_wc_opcode(wqe->wr->opcode);
+    cqe->status = wqe->status;
+    cqe->byte_len = wqe->dma.length;
+
+    /* Flag requests that carried immediate data */
+    switch (wqe->wr->opcode) {
+    case VHOST_RDMA_IB_WR_RDMA_WRITE_WITH_IMM:
+    case VHOST_RDMA_IB_WR_SEND_WITH_IMM:
+        cqe->wc_flags |= VHOST_RDMA_WC_WITH_IMM;
+        break;
+    default:
+        break;
+    }
+}
+
+/**
+ * advance_consumer - Hand the head descriptor back and move the consumer on
+ * @q: Queue structure
+ *
+ * Reads the descriptor index from the avail ring slot selected by the
+ * consumer index, passes it to vhost_rdma_queue_push() with length 0, then
+ * increments both the queue's consumer index and the virtqueue's
+ * last_avail_idx, which are asserted to be equal on entry.
+ */
+static __rte_always_inline void
+advance_consumer(struct vhost_rdma_queue *q)
+{
+    uint16_t slot;
+    uint16_t head_desc;
+
+    assert(q->vq->last_avail_idx == q->consumer_index);
+
+    /* The mask only wraps correctly if num_elems is a power of two */
+    slot = q->consumer_index & (q->num_elems - 1);
+    head_desc = q->vq->vring.avail->ring[slot];
+
+    vhost_rdma_queue_push(q->vq, head_desc, 0);
+
+    q->vq->last_avail_idx++;
+    q->consumer_index++;
+}
+
+/**
+ * vhost_rdma_do_complete - Retire the head send WQE and report it if required
+ * @qp: Queue pair
+ * @wqe: WQE to complete
+ *
+ * A CQE is posted when the QP signals every WQE, when this WQE requested a
+ * signaled completion, or when it finished with a non-success status (per
+ * IB spec, even unsignaled WQEs must generate a CQE on error).
+ */
+static void
+vhost_rdma_do_complete(struct vhost_rdma_qp *qp,
+                       struct vhost_rdma_send_wqe *wqe)
+{
+    struct vhost_rdma_device *dev = qp->dev;
+    struct vhost_rdma_cq_req cqe;
+    bool need_cqe;
+
+    need_cqe = (qp->sq_sig_all ||
+                (wqe->wr->send_flags & VHOST_RDMA_IB_SEND_SIGNALED) ||
+                wqe->status != VHOST_RDMA_IB_WC_SUCCESS);
+
+    if (need_cqe) {
+        /* Build the CQE before the consumer index moves past this WQE */
+        make_send_cqe(qp, wqe, &cqe);
+        advance_consumer(&qp->sq.queue);
+        vhost_rdma_cq_post(dev, qp->scq, &cqe, 0);
+    } else {
+        advance_consumer(&qp->sq.queue);
+    }
+
+    /* Count completed send operations */
+    if (wqe->wr->opcode == VHOST_RDMA_IB_WR_SEND ||
+        wqe->wr->opcode == VHOST_RDMA_IB_WR_SEND_WITH_IMM)
+        vhost_rdma_counter_inc(dev, VHOST_RDMA_CNT_RDMA_SEND);
+
+    /* Wake up the requester if it is waiting on a fence */
+    if (qp->req.wait_fence) {
+        qp->req.wait_fence = 0;
+        vhost_rdma_run_task(&qp->req.task, 0);
+    }
+}
+
+/**
+ * vhost_rdma_complete_wqe - Complete the head WQE and move on to the next
+ * @qp: Queue pair
+ * @pkt: Response packet (may be NULL)
+ * @wqe: WQE
+ *
+ * When a packet is in hand and the WQE is still pending, the expected PSN
+ * is moved past the WQE (if it is not already beyond it) and a requester
+ * waiting on PSN is woken. Always returns VHOST_RDMA_COMPST_GET_WQE.
+ */
+static __rte_always_inline enum comp_state
+vhost_rdma_complete_wqe(struct vhost_rdma_qp *qp,
+                        struct vhost_rdma_pkt_info *pkt,
+                        struct vhost_rdma_send_wqe *wqe)
+{
+    if (pkt != NULL && wqe->state == WQE_STATE_PENDING) {
+        const int at_or_ahead =
+            psn_compare(wqe->last_psn, qp->comp.psn) >= 0;
+
+        if (at_or_ahead) {
+            /* Next expected PSN follows this WQE; a new message starts */
+            qp->comp.opcode = -1;
+            qp->comp.psn = (wqe->last_psn + 1) & VHOST_RDMA_PSN_MASK;
+        }
+
+        if (qp->req.wait_psn) {
+            qp->req.wait_psn = 0;
+            vhost_rdma_run_task(&qp->req.task, 1);
+        }
+    }
+
+    vhost_rdma_do_complete(qp, wqe);
+
+    return VHOST_RDMA_COMPST_GET_WQE;
+}
+
+/**
+ * vhost_rdma_rnr_nak_timer - RNR NAK timer expiry callback
+ * @timer: Timer instance (unused)
+ * @arg: The QP that armed the timer
+ *
+ * Schedules the QP's requester task.
+ */
+static void
+vhost_rdma_rnr_nak_timer(__rte_unused struct rte_timer *timer, void *arg)
+{
+    struct vhost_rdma_qp *qp = (struct vhost_rdma_qp *)arg;
+
+    RDMA_LOG_DEBUG_DP("QP#%d RNR NAK timer expired", qp->qpn);
+
+    vhost_rdma_run_task(&qp->req.task, 1);
+}
+
+/**
+ * vhost_rdma_complete_ack - Complete the WQE acknowledged by a response
+ * @qp: Queue pair
+ * @pkt: ACK packet
+ * @wqe: WQE
+ *
+ * Returns the read/atomic credit held by the WQE, performs the
+ * DRAIN -> DRAINED transition once the completer has caught up with the
+ * requester, completes the WQE and then decides whether the packet still
+ * has to update the completer's PSN/opcode tracking.
+ */
+static __rte_always_inline enum comp_state
+vhost_rdma_complete_ack(struct vhost_rdma_qp *qp,
+                        struct vhost_rdma_pkt_info *pkt,
+                        struct vhost_rdma_send_wqe *wqe)
+{
+    if (wqe->has_rd_atomic) {
+        /* Give the credit back and restart a requester waiting for one */
+        wqe->has_rd_atomic = 0;
+        rte_atomic32_inc(&qp->req.rd_atomic);
+
+        if (qp->req.need_rd_atomic) {
+            qp->comp.timeout_retry = 0;
+            qp->req.need_rd_atomic = 0;
+            vhost_rdma_run_task(&qp->req.task, 0);
+        }
+    }
+
+    /* Handle SQ drain transition */
+    if (unlikely(qp->req.state == QP_STATE_DRAIN)) {
+        rte_spinlock_lock(&qp->state_lock);
+
+        /* Re-check under the lock before changing the state */
+        if (qp->req.state == QP_STATE_DRAIN &&
+            qp->comp.psn == qp->req.psn)
+            qp->req.state = QP_STATE_DRAINED;
+
+        rte_spinlock_unlock(&qp->state_lock);
+
+        /* TODO: Trigger IB_EVENT_SQ_DRAINED when the QP became DRAINED */
+    }
+
+    vhost_rdma_do_complete(qp, wqe);
+
+    return (psn_compare(pkt->psn, qp->comp.psn) >= 0) ?
+        VHOST_RDMA_COMPST_UPDATE_COMP : VHOST_RDMA_COMPST_DONE;
+}
+
+/**
+ * free_pkt - Drop the packet's QP reference and free its mbuf
+ * @pkt: Packet info to release
+ */
+static __rte_always_inline void
+free_pkt(struct vhost_rdma_pkt_info *pkt)
+{
+    struct rte_mbuf *m = PKT_TO_MBUF(pkt);
+    struct vhost_rdma_qp *owner = pkt->qp;
+
+    vhost_rdma_drop_ref(owner, owner->dev, qp);
+    rte_pktmbuf_free(m);
+}
+
+/**
+ * rnrnak_ticks - Convert RNR timeout code to timer ticks
+ * @timeout: Timeout code; only as many low bits as index rnrnak_usec[]
+ *           are used, so an out-of-range value cannot read past the table
+ *
+ * Multiplies before dividing so that the fractional part of the
+ * ticks-per-microsecond ratio is not lost (and timers slower than 1 MHz do
+ * not collapse to zero). The largest product, 655360 us times a multi-GHz
+ * frequency, fits comfortably in 64 bits.
+ *
+ * Returns the delay in timer ticks, at least 1.
+ */
+static __rte_always_inline unsigned long
+rnrnak_ticks(uint8_t timeout)
+{
+    const uint64_t nb_codes = sizeof(rnrnak_usec) / sizeof(rnrnak_usec[0]);
+    const uint64_t usec = rnrnak_usec[timeout & (nb_codes - 1)];
+    const uint64_t ticks = usec * rte_get_timer_hz() / 1000000ULL;
+
+    return RTE_MAX(ticks, (uint64_t)1);
+}
+
+/**
+ * vhost_rdma_drain_resp_pkts - Discard queued responses and empty the SQ
+ * @qp: Queue pair
+ * @notify: When true, each remaining WQE is completed with
+ *          VHOST_RDMA_IB_WC_WR_FLUSH_ERR; otherwise it is consumed silently
+ *
+ * Every response mbuf taken from qp->resp_pkts has its QP reference dropped
+ * and is freed.
+ */
+static void
+vhost_rdma_drain_resp_pkts(struct vhost_rdma_qp *qp, bool notify)
+{
+    struct vhost_rdma_queue *sq = &qp->sq.queue;
+    struct vhost_rdma_send_wqe *wqe;
+    struct rte_mbuf *mbuf;
+
+    /* Throw away every response still waiting in the ring */
+    for (;;) {
+        if (rte_ring_dequeue(qp->resp_pkts, (void **)&mbuf) != 0)
+            break;
+
+        vhost_rdma_drop_ref(qp, qp->dev, qp);
+        rte_pktmbuf_free(mbuf);
+    }
+
+    /* Retire every WQE left on the send queue */
+    for (wqe = queue_head(sq); wqe != NULL; wqe = queue_head(sq)) {
+        if (!notify) {
+            advance_consumer(sq);
+            continue;
+        }
+
+        wqe->status = VHOST_RDMA_IB_WC_WR_FLUSH_ERR;
+        vhost_rdma_do_complete(qp, wqe);
+    }
+}
+
+/**
+ * vhost_rdma_completer - Main completer function (run per QP)
+ * @arg: Pointer to vhost_rdma_qp
+ *
+ * Processes incoming response packets and completes WQEs accordingly.
+ * Implements reliability mechanisms: retry, RNR backoff, PSN tracking.
+ *
+ * At most one response packet is dequeued per invocation: GET_ACK is only
+ * the initial state and no other state transitions back to it.
+ *
+ * A QP reference is held for the whole call, and any dequeued packet is
+ * freed on exit.
+ *
+ * Return: 0 when the state machine stopped via DONE (or an early stop in
+ * ERROR_RETRY), -EAGAIN on the EXIT, RNR-timer, error and drain paths.
+ * NOTE(review): how the task runner interprets these values is not visible
+ * here - confirm that "0" and "-EAGAIN" match its rescheduling convention.
+ */
+int
+vhost_rdma_completer(void *arg)
+{
+ struct vhost_rdma_qp *qp = arg;
+ struct vhost_rdma_device *dev = qp->dev;
+ struct vhost_rdma_send_wqe *wqe = NULL;
+ struct rte_mbuf *mbuf = NULL;
+ struct vhost_rdma_pkt_info *pkt = NULL;
+ enum comp_state state;
+ int ret = 0;
+
+ /* Keep the QP alive while the completer runs; dropped at done: */
+ vhost_rdma_add_ref(qp);
+
+ /*
+ * Invalid, errored or reset QP: flush queued responses and the send
+ * queue. Flush-error completions are generated only for a valid QP in
+ * the ERROR state.
+ */
+ if (!qp->valid || qp->req.state == QP_STATE_ERROR ||
+ qp->req.state == QP_STATE_RESET) {
+ vhost_rdma_drain_resp_pkts(qp, qp->valid &&
+ qp->req.state == QP_STATE_ERROR);
+ ret = -EAGAIN;
+ goto done;
+ }
+
+ /* Latch the one-shot timeout flag into timeout_retry for this run */
+ if (qp->comp.timeout) {
+ qp->comp.timeout_retry = 1;
+ qp->comp.timeout = 0;
+ } else {
+ qp->comp.timeout_retry = 0;
+ }
+
+ /* A retry has been requested and not yet cleared: do nothing now */
+ if (qp->req.need_retry) {
+ ret = -EAGAIN;
+ goto done;
+ }
+
+ state = VHOST_RDMA_COMPST_GET_ACK;
+
+ while (1) {
+ RDMA_LOG_DEBUG_DP("QP#%d state=%s", qp->qpn, comp_state_name[state]);
+
+ switch (state) {
+ case VHOST_RDMA_COMPST_GET_ACK:
+ /* Receiving a packet cancels a pending timeout-triggered retry */
+ if (rte_ring_dequeue(qp->resp_pkts, (void **)&mbuf) == 0) {
+ pkt = MBUF_TO_PKT(mbuf);
+ qp->comp.timeout_retry = 0;
+ } else {
+ mbuf = NULL;
+ }
+ state = VHOST_RDMA_COMPST_GET_WQE;
+ break;
+
+ case VHOST_RDMA_COMPST_GET_WQE:
+ state = vhost_rdma_get_wqe(qp, pkt, &wqe);
+ break;
+
+ case VHOST_RDMA_COMPST_CHECK_PSN:
+ state = vhost_rdma_check_psn(qp, pkt, wqe);
+ break;
+
+ case VHOST_RDMA_COMPST_CHECK_ACK:
+ state = vhost_rdma_check_ack(qp, pkt, wqe);
+ break;
+
+ case VHOST_RDMA_COMPST_READ:
+ state = vhost_rdma_do_read(qp, pkt, wqe);
+ break;
+
+ case VHOST_RDMA_COMPST_ATOMIC:
+ state = vhost_rdma_do_atomic(qp, pkt, wqe);
+ break;
+
+ case VHOST_RDMA_COMPST_WRITE_SEND:
+ /* The ACK completes the WQE only if it names its last PSN */
+ if (wqe && wqe->state == WQE_STATE_PENDING &&
+ wqe->last_psn == pkt->psn)
+ state = VHOST_RDMA_COMPST_COMP_ACK;
+ else
+ state = VHOST_RDMA_COMPST_UPDATE_COMP;
+ break;
+
+ case VHOST_RDMA_COMPST_COMP_ACK:
+ state = vhost_rdma_complete_ack(qp, pkt, wqe);
+ break;
+
+ case VHOST_RDMA_COMPST_COMP_WQE:
+ state = vhost_rdma_complete_wqe(qp, pkt, wqe);
+ break;
+
+ case VHOST_RDMA_COMPST_UPDATE_COMP:
+ /* -1 means the next packet must start a new message */
+ if (pkt->mask & VHOST_END_MASK)
+ qp->comp.opcode = -1;
+ else
+ qp->comp.opcode = pkt->opcode;
+
+ /* Advance the expected PSN past this packet (24-bit wrap) */
+ if (psn_compare(pkt->psn, qp->comp.psn) >= 0)
+ qp->comp.psn = (pkt->psn + 1) & VHOST_RDMA_PSN_MASK;
+
+ if (qp->req.wait_psn) {
+ qp->req.wait_psn = 0;
+ vhost_rdma_run_task(&qp->req.task, 1);
+ }
+ state = VHOST_RDMA_COMPST_DONE;
+ break;
+
+ case VHOST_RDMA_COMPST_DONE:
+ goto done;
+
+ case VHOST_RDMA_COMPST_EXIT:
+ /* No packet arrived but a timeout fired: try a retransmission */
+ if (qp->comp.timeout_retry && wqe) {
+ state = VHOST_RDMA_COMPST_ERROR_RETRY;
+ break;
+ }
+
+ /* Restart retransmit timer if conditions met */
+ if ((qp->type == VHOST_RDMA_IB_QPT_RC) &&
+ (qp->req.state == QP_STATE_READY) &&
+ (psn_compare(qp->req.psn, qp->comp.psn) > 0) &&
+ qp->qp_timeout_ticks) {
+ rte_timer_reset(&qp->retrans_timer,
+ qp->qp_timeout_ticks,
+ SINGLE, rte_lcore_id(),
+ retransmit_timer, qp);
+ }
+ ret = -EAGAIN;
+ goto done;
+
+ case VHOST_RDMA_COMPST_ERROR_RETRY:
+ /* Nothing in flight to retry */
+ if (!wqe || wqe->state == WQE_STATE_POSTED)
+ goto done;
+
+ /* A retry is already under way and this one is not timeout-driven */
+ if (qp->comp.started_retry && !qp->comp.timeout_retry)
+ goto done;
+
+ if (qp->comp.retry_cnt > 0) {
+ /* A budget of 7 is never decremented, i.e. never runs out */
+ if (qp->comp.retry_cnt != 7)
+ qp->comp.retry_cnt--;
+
+ /* Only ask for a retry if packets are actually unacknowledged */
+ if (psn_compare(qp->req.psn, qp->comp.psn) > 0) {
+ vhost_rdma_counter_inc(dev, VHOST_RDMA_CNT_COMP_RETRY);
+ qp->req.need_retry = 1;
+ qp->comp.started_retry = 1;
+ vhost_rdma_run_task(&qp->req.task, 0);
+ }
+ goto done;
+ } else {
+ vhost_rdma_counter_inc(dev, VHOST_RDMA_CNT_RETRY_EXCEEDED);
+ wqe->status = VHOST_RDMA_IB_WC_RETRY_EXC_ERR;
+ state = VHOST_RDMA_COMPST_ERROR;
+ }
+ break;
+
+ case VHOST_RDMA_COMPST_RNR_RETRY:
+ if (qp->comp.rnr_retry > 0) {
+ /* A budget of 7 is never decremented, i.e. never runs out */
+ if (qp->comp.rnr_retry != 7)
+ qp->comp.rnr_retry--;
+
+ /* The non-type syndrome bits carry the RNR timer code */
+ qp->req.need_retry = 1;
+ RDMA_LOG_DEBUG_DP("QP#%d setting RNR NAK timer", qp->qpn);
+ rte_timer_reset(&qp->rnr_nak_timer,
+ rnrnak_ticks(aeth_syn(pkt) & ~AETH_TYPE_MASK),
+ SINGLE, rte_lcore_id(),
+ vhost_rdma_rnr_nak_timer, qp);
+ ret = -EAGAIN;
+ goto done;
+ } else {
+ vhost_rdma_counter_inc(dev, VHOST_RDMA_CNT_RNR_RETRY_EXCEEDED);
+ wqe->status = VHOST_RDMA_IB_WC_RNR_RETRY_EXC_ERR;
+ state = VHOST_RDMA_COMPST_ERROR;
+ }
+ break;
+
+ case VHOST_RDMA_COMPST_ERROR:
+ /* Report the failed WQE, then move the whole QP to error */
+ RDMA_LOG_ERR_DP("WQE Error: %u", wqe->status);
+ vhost_rdma_do_complete(qp, wqe);
+ vhost_rdma_qp_error(qp);
+ ret = -EAGAIN;
+ goto done;
+ }
+ }
+
+done:
+ /* Release the dequeued packet (if any) and the reference taken on entry */
+ if (pkt)
+ free_pkt(pkt);
+ vhost_rdma_drop_ref(qp, dev, qp);
+
+ return ret;
+}
diff --git a/examples/vhost_user_rdma/vhost_rdma_opcode.h b/examples/vhost_user_rdma/vhost_rdma_opcode.h
index 6c3660f36b..0c2961d5cd 100644
--- a/examples/vhost_user_rdma/vhost_rdma_opcode.h
+++ b/examples/vhost_user_rdma/vhost_rdma_opcode.h
@@ -27,28 +27,28 @@
#include "vhost_rdma_pkt.h"
/** Maximum number of QP types supported for WR mask dispatching */
-#define WR_MAX_QPT 8
+#define WR_MAX_QPT 8
/** Total number of defined opcodes (must be power-of-2 >= 256) */
-#define VHOST_NUM_OPCODE 256
+#define VHOST_NUM_OPCODE 256
#ifndef BIT
#define BIT(x) (1 << (x))
#endif
/* Invalid opcode marker */
-#define OPCODE_NONE (-1)
+#define OPCODE_NONE (-1)
#define VHOST_RDMA_SE_MASK (0x80)
#define VHOST_RDMA_MIG_MASK (0x40)
#define VHOST_RDMA_PAD_MASK (0x30)
-#define VHOST_RDMA_TVER_MASK (0x0f)
-#define VHOST_RDMA_FECN_MASK (0x80000000)
-#define VHOST_RDMA_BECN_MASK (0x40000000)
-#define VHOST_RDMA_RESV6A_MASK (0x3f000000)
+#define VHOST_RDMA_TVER_MASK (0x0f)
+#define VHOST_RDMA_FECN_MASK (0x80000000)
+#define VHOST_RDMA_BECN_MASK (0x40000000)
+#define VHOST_RDMA_RESV6A_MASK (0x3f000000)
#define VHOST_RDMA_QPN_MASK (0x00ffffff)
#define VHOST_RDMA_ACK_MASK (0x80000000)
-#define VHOST_RDMA_RESV7_MASK (0x7f000000)
+#define VHOST_RDMA_RESV7_MASK (0x7f000000)
#define VHOST_RDMA_PSN_MASK (0x00ffffff)
/**
@@ -56,19 +56,19 @@
* @{
*/
enum vhost_rdma_hdr_type {
- VHOST_RDMA_LRH, /**< Link Layer Header (InfiniBand only) */
- VHOST_RDMA_GRH, /**< Global Route Header (IPv6-style GIDs) */
- VHOST_RDMA_BTH, /**< Base Transport Header */
- VHOST_RDMA_RETH, /**< RDMA Extended Transport Header */
- VHOST_RDMA_AETH, /**< Acknowledge/Error Header */
- VHOST_RDMA_ATMETH, /**< Atomic Operation Request Header */
- VHOST_RDMA_ATMACK, /**< Atomic Operation Response Header */
- VHOST_RDMA_IETH, /**< Immediate Data + Error Code Header */
- VHOST_RDMA_RDETH, /**< Reliable Datagram Extended Transport Header */
- VHOST_RDMA_DETH, /**< Datagram Endpoint Identifier Header */
- VHOST_RDMA_IMMDT, /**< Immediate Data Header */
- VHOST_RDMA_PAYLOAD, /**< Payload section */
- NUM_HDR_TYPES /**< Number of known header types */
+ VHOST_RDMA_LRH, /**< Link Layer Header (InfiniBand only) */
+ VHOST_RDMA_GRH, /**< Global Route Header (IPv6-style GIDs) */
+ VHOST_RDMA_BTH, /**< Base Transport Header */
+ VHOST_RDMA_RETH, /**< RDMA Extended Transport Header */
+ VHOST_RDMA_AETH, /**< Acknowledge/Error Header */
+ VHOST_RDMA_ATMETH, /**< Atomic Operation Request Header */
+ VHOST_RDMA_ATMACK, /**< Atomic Operation Response Header */
+ VHOST_RDMA_IETH, /**< Immediate Data + Error Code Header */
+ VHOST_RDMA_RDETH, /**< Reliable Datagram Extended Transport Header */
+ VHOST_RDMA_DETH, /**< Datagram Endpoint Identifier Header */
+ VHOST_RDMA_IMMDT, /**< Immediate Data Header */
+ VHOST_RDMA_PAYLOAD, /**< Payload section */
+ NUM_HDR_TYPES /**< Number of known header types */
};
/**
@@ -76,50 +76,50 @@ enum vhost_rdma_hdr_type {
* @{
*/
enum vhost_rdma_hdr_mask {
- VHOST_LRH_MASK = BIT(VHOST_RDMA_LRH),
- VHOST_GRH_MASK = BIT(VHOST_RDMA_GRH),
- VHOST_BTH_MASK = BIT(VHOST_RDMA_BTH),
- VHOST_IMMDT_MASK = BIT(VHOST_RDMA_IMMDT),
- VHOST_RETH_MASK = BIT(VHOST_RDMA_RETH),
- VHOST_AETH_MASK = BIT(VHOST_RDMA_AETH),
- VHOST_ATMETH_MASK = BIT(VHOST_RDMA_ATMETH),
- VHOST_ATMACK_MASK = BIT(VHOST_RDMA_ATMACK),
- VHOST_IETH_MASK = BIT(VHOST_RDMA_IETH),
- VHOST_RDETH_MASK = BIT(VHOST_RDMA_RDETH),
- VHOST_DETH_MASK = BIT(VHOST_RDMA_DETH),
- VHOST_PAYLOAD_MASK = BIT(VHOST_RDMA_PAYLOAD),
-
- /* Semantic packet type flags */
- VHOST_REQ_MASK = BIT(NUM_HDR_TYPES + 0), /**< Request packet */
- VHOST_ACK_MASK = BIT(NUM_HDR_TYPES + 1), /**< ACK/NACK packet */
- VHOST_SEND_MASK = BIT(NUM_HDR_TYPES + 2), /**< Send operation */
- VHOST_WRITE_MASK = BIT(NUM_HDR_TYPES + 3), /**< RDMA Write */
- VHOST_READ_MASK = BIT(NUM_HDR_TYPES + 4), /**< RDMA Read */
- VHOST_ATOMIC_MASK = BIT(NUM_HDR_TYPES + 5), /**< Atomic operation */
-
- /* Packet fragmentation flags */
- VHOST_RWR_MASK = BIT(NUM_HDR_TYPES + 6), /**< RDMA with Immediate + Invalidate */
- VHOST_COMP_MASK = BIT(NUM_HDR_TYPES + 7), /**< Completion required */
-
- VHOST_START_MASK = BIT(NUM_HDR_TYPES + 8), /**< First fragment */
- VHOST_MIDDLE_MASK = BIT(NUM_HDR_TYPES + 9), /**< Middle fragment */
- VHOST_END_MASK = BIT(NUM_HDR_TYPES + 10), /**< Last fragment */
-
- VHOST_LOOPBACK_MASK = BIT(NUM_HDR_TYPES + 12), /**< Loopback within host */
-
- /* Composite masks */
- VHOST_READ_OR_ATOMIC = (VHOST_READ_MASK | VHOST_ATOMIC_MASK),
- VHOST_WRITE_OR_SEND = (VHOST_WRITE_MASK | VHOST_SEND_MASK),
+ VHOST_LRH_MASK = BIT(VHOST_RDMA_LRH),
+ VHOST_GRH_MASK = BIT(VHOST_RDMA_GRH),
+ VHOST_BTH_MASK = BIT(VHOST_RDMA_BTH),
+ VHOST_IMMDT_MASK = BIT(VHOST_RDMA_IMMDT),
+ VHOST_RETH_MASK = BIT(VHOST_RDMA_RETH),
+ VHOST_AETH_MASK = BIT(VHOST_RDMA_AETH),
+ VHOST_ATMETH_MASK = BIT(VHOST_RDMA_ATMETH),
+ VHOST_ATMACK_MASK = BIT(VHOST_RDMA_ATMACK),
+ VHOST_IETH_MASK = BIT(VHOST_RDMA_IETH),
+ VHOST_RDETH_MASK = BIT(VHOST_RDMA_RDETH),
+ VHOST_DETH_MASK = BIT(VHOST_RDMA_DETH),
+ VHOST_PAYLOAD_MASK = BIT(VHOST_RDMA_PAYLOAD),
+
+ /* Semantic packet type flags */
+ VHOST_REQ_MASK = BIT(NUM_HDR_TYPES + 0), /**< Request packet */
+ VHOST_ACK_MASK = BIT(NUM_HDR_TYPES + 1), /**< ACK/NACK packet */
+ VHOST_SEND_MASK = BIT(NUM_HDR_TYPES + 2), /**< Send operation */
+ VHOST_WRITE_MASK = BIT(NUM_HDR_TYPES + 3), /**< RDMA Write */
+ VHOST_READ_MASK = BIT(NUM_HDR_TYPES + 4), /**< RDMA Read */
+ VHOST_ATOMIC_MASK = BIT(NUM_HDR_TYPES + 5), /**< Atomic operation */
+
+ /* Packet fragmentation flags */
+ VHOST_RWR_MASK = BIT(NUM_HDR_TYPES + 6), /**< RDMA with Immediate + Invalidate */
+ VHOST_COMP_MASK = BIT(NUM_HDR_TYPES + 7), /**< Completion required */
+
+ VHOST_START_MASK = BIT(NUM_HDR_TYPES + 8), /**< First fragment */
+ VHOST_MIDDLE_MASK = BIT(NUM_HDR_TYPES + 9), /**< Middle fragment */
+ VHOST_END_MASK = BIT(NUM_HDR_TYPES + 10), /**< Last fragment */
+
+ VHOST_LOOPBACK_MASK = BIT(NUM_HDR_TYPES + 12), /**< Loopback within host */
+
+ /* Composite masks */
+ VHOST_READ_OR_ATOMIC = (VHOST_READ_MASK | VHOST_ATOMIC_MASK),
+ VHOST_WRITE_OR_SEND = (VHOST_WRITE_MASK | VHOST_SEND_MASK),
};
/**
* @brief Per-opcode metadata for parsing and validation
*/
struct vhost_rdma_opcode_info {
- const char *name; /**< Opcode name (e.g., "RC SEND_FIRST") */
- int length; /**< Fixed payload length (if any) */
- int offset[NUM_HDR_TYPES]; /**< Offset of each header within packet */
- enum vhost_rdma_hdr_mask mask; /**< Header presence and semantic flags */
+ const char *name; /**< Opcode name (e.g., "RC SEND_FIRST") */
+ int length; /**< Fixed payload length (if any) */
+ int offset[NUM_HDR_TYPES]; /**< Offset of each header within packet */
+ enum vhost_rdma_hdr_mask mask; /**< Header presence and semantic flags */
};
/* Global opcode info table (indexed by IB opcode byte) */
@@ -146,8 +146,8 @@ static inline uint8_t bth_pad(struct vhost_rdma_pkt_info *pkt)
}
struct vhost_deth {
- rte_be32_t qkey;
- rte_be32_t sqp;
+ rte_be32_t qkey;
+ rte_be32_t sqp;
};
#define GSI_QKEY (0x80010000)
@@ -206,7 +206,7 @@ static inline void deth_set_sqp(struct vhost_rdma_pkt_info *pkt, uint32_t sqp)
}
struct vhost_immdt {
- rte_be32_t imm;
+ rte_be32_t imm;
};
static inline rte_be32_t __immdt_imm(void *arg)
@@ -236,9 +236,9 @@ static inline void immdt_set_imm(struct vhost_rdma_pkt_info *pkt, rte_be32_t imm
}
struct vhost_reth {
- rte_be64_t va;
- rte_be32_t rkey;
- rte_be32_t len;
+ rte_be64_t va;
+ rte_be32_t rkey;
+ rte_be32_t len;
};
static inline uint64_t __reth_va(void *arg)
@@ -323,35 +323,65 @@ struct vhost_aeth {
rte_be32_t smsn;
};
+#define AETH_SYN_MASK (0xff000000)
+#define AETH_MSN_MASK (0x00ffffff)
+
+/*
+ * AETH syndrome byte: the top three bits (AETH_TYPE_MASK) select the
+ * response type; the AETH_NAK_* values are complete syndrome bytes, i.e.
+ * the AETH_NAK type combined with a NAK code in the low bits.
+ */
+enum aeth_syndrome {
+ AETH_TYPE_MASK = 0xe0, /* selects the type bits of the syndrome */
+ AETH_ACK = 0x00, /* type: positive acknowledge */
+ AETH_RNR_NAK = 0x20, /* type: receiver-not-ready NAK */
+ AETH_RSVD = 0x40, /* type: reserved */
+ AETH_NAK = 0x60, /* type: NAK, code in the low bits */
+ AETH_ACK_UNLIMITED = 0x1f, /* presumably the "unlimited credits" ACK value - TODO confirm */
+ AETH_NAK_PSN_SEQ_ERROR = 0x60, /* NAK code 0 */
+ AETH_NAK_INVALID_REQ = 0x61, /* NAK code 1 */
+ AETH_NAK_REM_ACC_ERR = 0x62, /* NAK code 2 */
+ AETH_NAK_REM_OP_ERR = 0x63, /* NAK code 3 */
+ AETH_NAK_INV_RD_REQ = 0x64, /* NAK code 4 */
+};
+
+/* Extract the syndrome (top byte of the big-endian smsn word) from an AETH. */
+static inline uint8_t __aeth_syn(void *arg)
+{
+    struct vhost_aeth *hdr = arg;
+    uint32_t smsn = rte_be_to_cpu_32(hdr->smsn);
+
+    return (smsn & AETH_SYN_MASK) >> 24;
+}
+
+/*
+ * Return the AETH syndrome of a packet; the AETH is located through the
+ * per-opcode header offset table.
+ */
+static inline uint8_t aeth_syn(struct vhost_rdma_pkt_info *pkt)
+{
+    int aeth_off = vhost_rdma_opcode[pkt->opcode].offset[VHOST_RDMA_AETH];
+
+    return __aeth_syn(pkt->hdr + aeth_off);
+}
+
struct vhost_atmack {
- rte_be64_t orig;
+ rte_be64_t orig;
};
struct vhost_atmeth {
- rte_be64_t va;
- rte_be32_t rkey;
- rte_be64_t swap_add;
- rte_be64_t comp;
+ rte_be64_t va;
+ rte_be32_t rkey;
+ rte_be64_t swap_add;
+ rte_be64_t comp;
} __rte_packed;
struct vhost_ieth {
- rte_be32_t rkey;
+ rte_be32_t rkey;
};
struct vhost_rdeth {
- rte_be32_t een;
+ rte_be32_t een;
};
enum vhost_rdma_hdr_length {
- VHOST_BTH_BYTES = sizeof(struct vhost_bth),
- VHOST_DETH_BYTES = sizeof(struct vhost_deth),
- VHOST_IMMDT_BYTES = sizeof(struct vhost_immdt),
- VHOST_RETH_BYTES = sizeof(struct vhost_reth),
- VHOST_AETH_BYTES = sizeof(struct vhost_aeth),
- VHOST_ATMACK_BYTES = sizeof(struct vhost_atmack),
- VHOST_ATMETH_BYTES = sizeof(struct vhost_atmeth),
- VHOST_IETH_BYTES = sizeof(struct vhost_ieth),
- VHOST_RDETH_BYTES = sizeof(struct vhost_rdeth),
+ VHOST_BTH_BYTES = sizeof(struct vhost_bth),
+ VHOST_DETH_BYTES = sizeof(struct vhost_deth),
+ VHOST_IMMDT_BYTES = sizeof(struct vhost_immdt),
+ VHOST_RETH_BYTES = sizeof(struct vhost_reth),
+ VHOST_AETH_BYTES = sizeof(struct vhost_aeth),
+ VHOST_ATMACK_BYTES = sizeof(struct vhost_atmack),
+ VHOST_ATMETH_BYTES = sizeof(struct vhost_atmeth),
+ VHOST_IETH_BYTES = sizeof(struct vhost_ieth),
+ VHOST_RDETH_BYTES = sizeof(struct vhost_rdeth),
};
/**
@@ -360,8 +390,8 @@ enum vhost_rdma_hdr_length {
* Expands to e.g.: `IB_OPCODE_RC_SEND_FIRST = IB_OPCODE_RC + IB_OPCODE_SEND_FIRST`
*/
#define IB_OPCODE(transport, op) \
- IB_OPCODE_ ## transport ## _ ## op = \
- (IB_OPCODE_ ## transport + IB_OPCODE_ ## op)
+ IB_OPCODE_ ## transport ## _ ## op = \
+ (IB_OPCODE_ ## transport + IB_OPCODE_ ## op)
/**
* @defgroup ib_opcodes InfiniBand OpCode Definitions
@@ -371,105 +401,105 @@ enum vhost_rdma_hdr_length {
*/
enum {
- /* Transport types (base values) */
- IB_OPCODE_RC = 0x00, /**< Reliable Connection */
- IB_OPCODE_UC = 0x20, /**< Unreliable Connection */
- IB_OPCODE_RD = 0x40, /**< Reliable Datagram */
- IB_OPCODE_UD = 0x60, /**< Unreliable Datagram */
- IB_OPCODE_CNP = 0x80, /**< Congestion Notification Packet */
- IB_OPCODE_MSP = 0xe0, /**< Manufacturer Specific Protocol */
-
- /* Operation subtypes */
- IB_OPCODE_SEND_FIRST = 0x00,
- IB_OPCODE_SEND_MIDDLE = 0x01,
- IB_OPCODE_SEND_LAST = 0x02,
- IB_OPCODE_SEND_LAST_WITH_IMMEDIATE = 0x03,
- IB_OPCODE_SEND_ONLY = 0x04,
- IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE = 0x05,
- IB_OPCODE_RDMA_WRITE_FIRST = 0x06,
- IB_OPCODE_RDMA_WRITE_MIDDLE = 0x07,
- IB_OPCODE_RDMA_WRITE_LAST = 0x08,
- IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE = 0x09,
- IB_OPCODE_RDMA_WRITE_ONLY = 0x0a,
- IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE = 0x0b,
- IB_OPCODE_RDMA_READ_REQUEST = 0x0c,
- IB_OPCODE_RDMA_READ_RESPONSE_FIRST = 0x0d,
- IB_OPCODE_RDMA_READ_RESPONSE_MIDDLE = 0x0e,
- IB_OPCODE_RDMA_READ_RESPONSE_LAST = 0x0f,
- IB_OPCODE_RDMA_READ_RESPONSE_ONLY = 0x10,
- IB_OPCODE_ACKNOWLEDGE = 0x11,
- IB_OPCODE_ATOMIC_ACKNOWLEDGE = 0x12,
- IB_OPCODE_COMPARE_SWAP = 0x13,
- IB_OPCODE_FETCH_ADD = 0x14,
- /* 0x15 is reserved */
- IB_OPCODE_SEND_LAST_WITH_INVALIDATE = 0x16,
- IB_OPCODE_SEND_ONLY_WITH_INVALIDATE = 0x17,
-
- /* Real opcodes generated via IB_OPCODE() macro */
- IB_OPCODE(RC, SEND_FIRST),
- IB_OPCODE(RC, SEND_MIDDLE),
- IB_OPCODE(RC, SEND_LAST),
- IB_OPCODE(RC, SEND_LAST_WITH_IMMEDIATE),
- IB_OPCODE(RC, SEND_ONLY),
- IB_OPCODE(RC, SEND_ONLY_WITH_IMMEDIATE),
- IB_OPCODE(RC, RDMA_WRITE_FIRST),
- IB_OPCODE(RC, RDMA_WRITE_MIDDLE),
- IB_OPCODE(RC, RDMA_WRITE_LAST),
- IB_OPCODE(RC, RDMA_WRITE_LAST_WITH_IMMEDIATE),
- IB_OPCODE(RC, RDMA_WRITE_ONLY),
- IB_OPCODE(RC, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
- IB_OPCODE(RC, RDMA_READ_REQUEST),
- IB_OPCODE(RC, RDMA_READ_RESPONSE_FIRST),
- IB_OPCODE(RC, RDMA_READ_RESPONSE_MIDDLE),
- IB_OPCODE(RC, RDMA_READ_RESPONSE_LAST),
- IB_OPCODE(RC, RDMA_READ_RESPONSE_ONLY),
- IB_OPCODE(RC, ACKNOWLEDGE),
- IB_OPCODE(RC, ATOMIC_ACKNOWLEDGE),
- IB_OPCODE(RC, COMPARE_SWAP),
- IB_OPCODE(RC, FETCH_ADD),
- IB_OPCODE(RC, SEND_LAST_WITH_INVALIDATE),
- IB_OPCODE(RC, SEND_ONLY_WITH_INVALIDATE),
-
- /* UC opcodes */
- IB_OPCODE(UC, SEND_FIRST),
- IB_OPCODE(UC, SEND_MIDDLE),
- IB_OPCODE(UC, SEND_LAST),
- IB_OPCODE(UC, SEND_LAST_WITH_IMMEDIATE),
- IB_OPCODE(UC, SEND_ONLY),
- IB_OPCODE(UC, SEND_ONLY_WITH_IMMEDIATE),
- IB_OPCODE(UC, RDMA_WRITE_FIRST),
- IB_OPCODE(UC, RDMA_WRITE_MIDDLE),
- IB_OPCODE(UC, RDMA_WRITE_LAST),
- IB_OPCODE(UC, RDMA_WRITE_LAST_WITH_IMMEDIATE),
- IB_OPCODE(UC, RDMA_WRITE_ONLY),
- IB_OPCODE(UC, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
-
- /* RD opcodes */
- IB_OPCODE(RD, SEND_FIRST),
- IB_OPCODE(RD, SEND_MIDDLE),
- IB_OPCODE(RD, SEND_LAST),
- IB_OPCODE(RD, SEND_LAST_WITH_IMMEDIATE),
- IB_OPCODE(RD, SEND_ONLY),
- IB_OPCODE(RD, SEND_ONLY_WITH_IMMEDIATE),
- IB_OPCODE(RD, RDMA_WRITE_FIRST),
- IB_OPCODE(RD, RDMA_WRITE_MIDDLE),
- IB_OPCODE(RD, RDMA_WRITE_LAST),
- IB_OPCODE(RD, RDMA_WRITE_LAST_WITH_IMMEDIATE),
- IB_OPCODE(RD, RDMA_WRITE_ONLY),
- IB_OPCODE(RD, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
- IB_OPCODE(RD, RDMA_READ_REQUEST),
- IB_OPCODE(RD, RDMA_READ_RESPONSE_FIRST),
- IB_OPCODE(RD, RDMA_READ_RESPONSE_MIDDLE),
- IB_OPCODE(RD, RDMA_READ_RESPONSE_LAST),
- IB_OPCODE(RD, RDMA_READ_RESPONSE_ONLY),
- IB_OPCODE(RD, ACKNOWLEDGE),
- IB_OPCODE(RD, ATOMIC_ACKNOWLEDGE),
- IB_OPCODE(RD, COMPARE_SWAP),
- IB_OPCODE(RD, FETCH_ADD),
-
- /* UD opcodes */
- IB_OPCODE(UD, SEND_ONLY),
- IB_OPCODE(UD, SEND_ONLY_WITH_IMMEDIATE)
+ /* Transport types (base values) */
+ IB_OPCODE_RC = 0x00, /**< Reliable Connection */
+ IB_OPCODE_UC = 0x20, /**< Unreliable Connection */
+ IB_OPCODE_RD = 0x40, /**< Reliable Datagram */
+ IB_OPCODE_UD = 0x60, /**< Unreliable Datagram */
+ IB_OPCODE_CNP = 0x80, /**< Congestion Notification Packet */
+ IB_OPCODE_MSP = 0xe0, /**< Manufacturer Specific Protocol */
+
+ /* Operation subtypes */
+ IB_OPCODE_SEND_FIRST = 0x00,
+ IB_OPCODE_SEND_MIDDLE = 0x01,
+ IB_OPCODE_SEND_LAST = 0x02,
+ IB_OPCODE_SEND_LAST_WITH_IMMEDIATE = 0x03,
+ IB_OPCODE_SEND_ONLY = 0x04,
+ IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE = 0x05,
+ IB_OPCODE_RDMA_WRITE_FIRST = 0x06,
+ IB_OPCODE_RDMA_WRITE_MIDDLE = 0x07,
+ IB_OPCODE_RDMA_WRITE_LAST = 0x08,
+ IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE = 0x09,
+ IB_OPCODE_RDMA_WRITE_ONLY = 0x0a,
+ IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE = 0x0b,
+ IB_OPCODE_RDMA_READ_REQUEST = 0x0c,
+ IB_OPCODE_RDMA_READ_RESPONSE_FIRST = 0x0d,
+ IB_OPCODE_RDMA_READ_RESPONSE_MIDDLE = 0x0e,
+ IB_OPCODE_RDMA_READ_RESPONSE_LAST = 0x0f,
+ IB_OPCODE_RDMA_READ_RESPONSE_ONLY = 0x10,
+ IB_OPCODE_ACKNOWLEDGE = 0x11,
+ IB_OPCODE_ATOMIC_ACKNOWLEDGE = 0x12,
+ IB_OPCODE_COMPARE_SWAP = 0x13,
+ IB_OPCODE_FETCH_ADD = 0x14,
+ /* 0x15 is reserved */
+ IB_OPCODE_SEND_LAST_WITH_INVALIDATE = 0x16,
+ IB_OPCODE_SEND_ONLY_WITH_INVALIDATE = 0x17,
+
+ /* Real opcodes generated via IB_OPCODE() macro */
+ IB_OPCODE(RC, SEND_FIRST),
+ IB_OPCODE(RC, SEND_MIDDLE),
+ IB_OPCODE(RC, SEND_LAST),
+ IB_OPCODE(RC, SEND_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(RC, SEND_ONLY),
+ IB_OPCODE(RC, SEND_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(RC, RDMA_WRITE_FIRST),
+ IB_OPCODE(RC, RDMA_WRITE_MIDDLE),
+ IB_OPCODE(RC, RDMA_WRITE_LAST),
+ IB_OPCODE(RC, RDMA_WRITE_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(RC, RDMA_WRITE_ONLY),
+ IB_OPCODE(RC, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(RC, RDMA_READ_REQUEST),
+ IB_OPCODE(RC, RDMA_READ_RESPONSE_FIRST),
+ IB_OPCODE(RC, RDMA_READ_RESPONSE_MIDDLE),
+ IB_OPCODE(RC, RDMA_READ_RESPONSE_LAST),
+ IB_OPCODE(RC, RDMA_READ_RESPONSE_ONLY),
+ IB_OPCODE(RC, ACKNOWLEDGE),
+ IB_OPCODE(RC, ATOMIC_ACKNOWLEDGE),
+ IB_OPCODE(RC, COMPARE_SWAP),
+ IB_OPCODE(RC, FETCH_ADD),
+ IB_OPCODE(RC, SEND_LAST_WITH_INVALIDATE),
+ IB_OPCODE(RC, SEND_ONLY_WITH_INVALIDATE),
+
+ /* UC opcodes */
+ IB_OPCODE(UC, SEND_FIRST),
+ IB_OPCODE(UC, SEND_MIDDLE),
+ IB_OPCODE(UC, SEND_LAST),
+ IB_OPCODE(UC, SEND_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(UC, SEND_ONLY),
+ IB_OPCODE(UC, SEND_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(UC, RDMA_WRITE_FIRST),
+ IB_OPCODE(UC, RDMA_WRITE_MIDDLE),
+ IB_OPCODE(UC, RDMA_WRITE_LAST),
+ IB_OPCODE(UC, RDMA_WRITE_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(UC, RDMA_WRITE_ONLY),
+ IB_OPCODE(UC, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
+
+ /* RD opcodes */
+ IB_OPCODE(RD, SEND_FIRST),
+ IB_OPCODE(RD, SEND_MIDDLE),
+ IB_OPCODE(RD, SEND_LAST),
+ IB_OPCODE(RD, SEND_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(RD, SEND_ONLY),
+ IB_OPCODE(RD, SEND_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(RD, RDMA_WRITE_FIRST),
+ IB_OPCODE(RD, RDMA_WRITE_MIDDLE),
+ IB_OPCODE(RD, RDMA_WRITE_LAST),
+ IB_OPCODE(RD, RDMA_WRITE_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(RD, RDMA_WRITE_ONLY),
+ IB_OPCODE(RD, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(RD, RDMA_READ_REQUEST),
+ IB_OPCODE(RD, RDMA_READ_RESPONSE_FIRST),
+ IB_OPCODE(RD, RDMA_READ_RESPONSE_MIDDLE),
+ IB_OPCODE(RD, RDMA_READ_RESPONSE_LAST),
+ IB_OPCODE(RD, RDMA_READ_RESPONSE_ONLY),
+ IB_OPCODE(RD, ACKNOWLEDGE),
+ IB_OPCODE(RD, ATOMIC_ACKNOWLEDGE),
+ IB_OPCODE(RD, COMPARE_SWAP),
+ IB_OPCODE(RD, FETCH_ADD),
+
+ /* UD opcodes */
+ IB_OPCODE(UD, SEND_ONLY),
+ IB_OPCODE(UD, SEND_ONLY_WITH_IMMEDIATE)
};
/** @} */
@@ -478,17 +508,17 @@ enum {
* @{
*/
enum vhost_rdma_wr_mask {
- WR_INLINE_MASK = BIT(0), /**< WR contains inline data */
- WR_ATOMIC_MASK = BIT(1), /**< WR is an atomic operation */
- WR_SEND_MASK = BIT(2), /**< WR is a send-type operation */
- WR_READ_MASK = BIT(3), /**< WR initiates RDMA read */
- WR_WRITE_MASK = BIT(4), /**< WR performs RDMA write */
- WR_LOCAL_OP_MASK = BIT(5), /**< WR triggers local memory op */
-
- WR_READ_OR_WRITE_MASK = WR_READ_MASK | WR_WRITE_MASK,
- WR_READ_WRITE_OR_SEND_MASK = WR_READ_OR_WRITE_MASK | WR_SEND_MASK,
- WR_WRITE_OR_SEND_MASK = WR_WRITE_MASK | WR_SEND_MASK,
- WR_ATOMIC_OR_READ_MASK = WR_ATOMIC_MASK | WR_READ_MASK,
+ WR_INLINE_MASK = BIT(0), /**< WR contains inline data */
+ WR_ATOMIC_MASK = BIT(1), /**< WR is an atomic operation */
+ WR_SEND_MASK = BIT(2), /**< WR is a send-type operation */
+ WR_READ_MASK = BIT(3), /**< WR initiates RDMA read */
+ WR_WRITE_MASK = BIT(4), /**< WR performs RDMA write */
+ WR_LOCAL_OP_MASK = BIT(5), /**< WR triggers local memory op */
+
+ WR_READ_OR_WRITE_MASK = WR_READ_MASK | WR_WRITE_MASK,
+ WR_READ_WRITE_OR_SEND_MASK = WR_READ_OR_WRITE_MASK | WR_SEND_MASK,
+ WR_WRITE_OR_SEND_MASK = WR_WRITE_MASK | WR_SEND_MASK,
+ WR_ATOMIC_OR_READ_MASK = WR_ATOMIC_MASK | WR_READ_MASK,
};
/**
@@ -497,8 +527,8 @@ enum vhost_rdma_wr_mask {
* Used to determine which operations are valid per QP type.
*/
struct vhost_rdma_wr_opcode_info {
- const char *name; /**< Human-readable name */
- enum vhost_rdma_wr_mask mask[WR_MAX_QPT]; /**< Validity per QP type */
+ const char *name; /**< Human-readable name */
+ enum vhost_rdma_wr_mask mask[WR_MAX_QPT]; /**< Validity per QP type */
};
/* Extern declaration of global opcode metadata table */
@@ -510,8 +540,21 @@ static inline unsigned int wr_opcode_mask(int opcode, struct vhost_rdma_qp *qp)
return vhost_rdma_wr_opcode_info[opcode].mask[qp->type];
}
+static inline uint64_t __atmack_orig(void *arg)
+{
+ struct vhost_atmack *atmack = arg;
+
+ return rte_be_to_cpu_64(atmack->orig);
+}
+
+static inline uint64_t atmack_orig(struct vhost_rdma_pkt_info *pkt)
+{
+ return __atmack_orig(pkt->hdr +
+ vhost_rdma_opcode[pkt->opcode].offset[VHOST_RDMA_ATMACK]);
+}
+
int vhost_rdma_next_opcode(struct vhost_rdma_qp *qp,
- struct vhost_rdma_send_wqe *wqe,
- uint32_t opcode);
+ struct vhost_rdma_send_wqe *wqe,
+ uint32_t opcode);
#endif
\ No newline at end of file
diff --git a/examples/vhost_user_rdma/vhost_rdma_queue.c b/examples/vhost_user_rdma/vhost_rdma_queue.c
index 7d0c45592c..5f9f7fd3c7 100644
--- a/examples/vhost_user_rdma/vhost_rdma_queue.c
+++ b/examples/vhost_user_rdma/vhost_rdma_queue.c
@@ -1388,12 +1388,6 @@ int vhost_rdma_requester(void *arg)
return -EAGAIN;
}
-int vhost_rdma_completer(void* arg)
-{
- //TODO: handle complete
- return 0;
-}
-
int vhost_rdma_responder(void* arg)
{
//TODO: handle response
diff --git a/examples/vhost_user_rdma/vhost_rdma_queue.h b/examples/vhost_user_rdma/vhost_rdma_queue.h
index fb5a90235f..d8af86cdf2 100644
--- a/examples/vhost_user_rdma/vhost_rdma_queue.h
+++ b/examples/vhost_user_rdma/vhost_rdma_queue.h
@@ -24,6 +24,11 @@
#include "vhost_rdma.h"
#include "vhost_rdma_log.h"
+#define PKT_TO_MBUF(p) ((struct rte_mbuf *) \
+ (RTE_PTR_SUB(p, sizeof(struct rte_mbuf))))
+#define MBUF_TO_PKT(m) ((struct vhost_rdma_pkt_info *) \
+ (RTE_PTR_ADD(m, sizeof(struct rte_mbuf))))
+
#define QP_OPCODE_INVAILD (-1)
/******************************************************************************
--
2.43.0
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCH 09/14] examples/vhost_user_rdma: implement P_Key query operation with default partition key
2025-12-17 8:49 Implement initial driver for virtio-RDMA devices(kernel), virtio-rdma device model(qemu) and vhost-user-RDMA backend device(dpdk) Xiong Weimin
` (8 preceding siblings ...)
2025-12-17 8:49 ` [PATCH 08/14] examples/vhost_user_rdma: implement advanced completer engine with reliability features Xiong Weimin
@ 2025-12-17 8:49 ` Xiong Weimin
2025-12-18 16:20 ` Implement initial driver for virtio-RDMA devices(kernel), virtio-rdma device model(qemu) and vhost-user-RDMA backend device(dpdk) Leon Romanovsky
10 siblings, 0 replies; 17+ messages in thread
From: Xiong Weimin @ 2025-12-17 8:49 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann, David S . Miller,
Jakub Kicinski, Jesper Dangaard Brouer, John Fastabend,
Stanislav Fomichev
Cc: linux-kernel, netdev, xiongweimin
From: xiongweimin <xiongweimin@kylinos.cn>
This commit adds support for the IB_QUERY_PKEY command:
1. Implements mandatory InfiniBand partition key query
2. Provides default full-membership P_Key (0xFFFF)
3. Includes I/O vector safety validation
4. Maintains compatibility with standard IB management tools
Key features:
- Hardcoded default P_Key for simplified management
- Buffer size validation using CHK_IOVEC macro
- Zero-copy response writing via iovec
- Minimal overhead for frequent management operations
Signed-off-by: Xiong Weimin <xiongweimin@kylinos.cn>
Change-Id: Ibc7be3488989285da205aff7400be38995a435fd
---
examples/vhost_user_rdma/meson.build | 52 ++++++++++++------------
examples/vhost_user_rdma/vhost_rdma_ib.c | 46 ++++++++++++++-------
examples/vhost_user_rdma/vhost_rdma_ib.h | 4 ++
3 files changed, 61 insertions(+), 41 deletions(-)
diff --git a/examples/vhost_user_rdma/meson.build b/examples/vhost_user_rdma/meson.build
index 4948f709d9..89ff4fbbf1 100644
--- a/examples/vhost_user_rdma/meson.build
+++ b/examples/vhost_user_rdma/meson.build
@@ -7,8 +7,8 @@
# DPDK instance, use 'make'
if not is_linux
- build = false
- subdir_done()
+ build = false
+ subdir_done()
endif
deps += ['vhost', 'timer']
@@ -16,35 +16,35 @@ deps += ['vhost', 'timer']
allow_experimental_apis = true
cflags_options = [
- '-std=c11',
- '-Wno-strict-prototypes',
- '-Wno-pointer-arith',
- '-Wno-maybe-uninitialized',
- '-Wno-discarded-qualifiers',
- '-Wno-old-style-definition',
- '-Wno-sign-compare',
- '-Wno-stringop-overflow',
- '-O3',
- '-g',
- '-DALLOW_EXPERIMENTAL_API',
- '-DDEBUG_RDMA',
- '-DDEBUG_RDMA_DP',
+ '-std=c11',
+ '-Wno-strict-prototypes',
+ '-Wno-pointer-arith',
+ '-Wno-maybe-uninitialized',
+ '-Wno-discarded-qualifiers',
+ '-Wno-old-style-definition',
+ '-Wno-sign-compare',
+ '-Wno-stringop-overflow',
+ '-O3',
+ '-g',
+ '-DALLOW_EXPERIMENTAL_API',
+ '-DDEBUG_RDMA',
+ '-DDEBUG_RDMA_DP',
]
foreach option:cflags_options
- if cc.has_argument(option)
- cflags += option
- endif
+ if cc.has_argument(option)
+ cflags += option
+ endif
endforeach
sources = files(
- 'main.c',
- 'vhost_rdma.c',
- 'vhost_rdma_ib.c',
- 'vhost_rdma_queue.c',
- 'vhost_rdma_opcode.c',
- 'vhost_rdma_pkt.c',
- 'vhost_rdma_crc.c',
- 'vhost_rdma_complete.c',
+ 'main.c',
+ 'vhost_rdma.c',
+ 'vhost_rdma_ib.c',
+ 'vhost_rdma_queue.c',
+ 'vhost_rdma_opcode.c',
+ 'vhost_rdma_pkt.c',
+ 'vhost_rdma_crc.c',
+ 'vhost_rdma_complete.c',
)
diff --git a/examples/vhost_user_rdma/vhost_rdma_ib.c b/examples/vhost_user_rdma/vhost_rdma_ib.c
index aac5c28e9a..437d45c5ce 100644
--- a/examples/vhost_user_rdma/vhost_rdma_ib.c
+++ b/examples/vhost_user_rdma/vhost_rdma_ib.c
@@ -36,7 +36,7 @@
tp = iov->iov_base; \
} while(0); \
-#define DEFINE_VIRTIO_RDMA_CMD(cmd, handler) [cmd] = {handler, #cmd}
+#define DEFINE_VHOST_RDMA_CMD(cmd, handler) [cmd] = {handler, #cmd}
#define CTRL_NO_CMD __rte_unused struct iovec *__in
#define CTRL_NO_RSP __rte_unused struct iovec *__out
@@ -1089,25 +1089,41 @@ vhost_rdma_destroy_qp(struct vhost_rdma_device *dev, struct iovec *in, CTRL_NO_R
return 0;
}
+static int
+vhost_rdma_query_pkey(__rte_unused struct vhost_rdma_device *dev,
+ CTRL_NO_CMD, struct iovec *out)
+{
+ struct vhost_rdma_cmd_query_pkey *pkey_rsp;
+ uint16_t pkey = IB_DEFAULT_PKEY_FULL;
+
+ CHK_IOVEC(pkey_rsp, out);
+
+ pkey_rsp->pkey = pkey;
+
+ return 0;
+
+}
+
/* Command handler table declaration */
struct {
int (*handler)(struct vhost_rdma_device *dev, struct iovec *in, struct iovec *out);
const char *name; /* Name of the command (for logging) */
} cmd_tbl[] = {
- DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_QUERY_DEVICE, vhost_rdma_query_device),
- DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_QUERY_PORT, vhost_rdma_query_port),
- DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_CREATE_CQ, vhost_rdma_create_cq),
- DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_DESTROY_CQ, vhost_rdma_destroy_cq),
- DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_CREATE_PD, vhost_rdma_create_pd),
- DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_DESTROY_PD, vhost_rdma_destroy_pd),
- DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_GET_DMA_MR, vhost_rdma_get_dma_mr),
- DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_ALLOC_MR, vhost_rdma_alloc_mr),
- DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_REG_USER_MR, vhost_rdma_reg_user_mr),
- DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_DEREG_MR, vhost_rdma_dereg_mr),
- DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_CREATE_QP, vhost_rdma_create_qp),
- DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_MODIFY_QP, vhost_rdma_modify_qp),
- DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_QUERY_QP, vhost_rdma_query_qp),
- DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_DESTROY_QP, vhost_rdma_destroy_qp),
+ DEFINE_VHOST_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_QUERY_DEVICE, vhost_rdma_query_device),
+ DEFINE_VHOST_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_QUERY_PORT, vhost_rdma_query_port),
+ DEFINE_VHOST_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_CREATE_CQ, vhost_rdma_create_cq),
+ DEFINE_VHOST_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_DESTROY_CQ, vhost_rdma_destroy_cq),
+ DEFINE_VHOST_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_CREATE_PD, vhost_rdma_create_pd),
+ DEFINE_VHOST_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_DESTROY_PD, vhost_rdma_destroy_pd),
+ DEFINE_VHOST_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_GET_DMA_MR, vhost_rdma_get_dma_mr),
+ DEFINE_VHOST_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_ALLOC_MR, vhost_rdma_alloc_mr),
+ DEFINE_VHOST_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_REG_USER_MR, vhost_rdma_reg_user_mr),
+ DEFINE_VHOST_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_DEREG_MR, vhost_rdma_dereg_mr),
+ DEFINE_VHOST_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_CREATE_QP, vhost_rdma_create_qp),
+ DEFINE_VHOST_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_MODIFY_QP, vhost_rdma_modify_qp),
+ DEFINE_VHOST_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_QUERY_QP, vhost_rdma_query_qp),
+ DEFINE_VHOST_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_DESTROY_QP, vhost_rdma_destroy_qp),
+ DEFINE_VHOST_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_QUERY_PKEY, vhost_rdma_query_pkey),
};
/**
diff --git a/examples/vhost_user_rdma/vhost_rdma_ib.h b/examples/vhost_user_rdma/vhost_rdma_ib.h
index 79575e735c..5a1787fabe 100644
--- a/examples/vhost_user_rdma/vhost_rdma_ib.h
+++ b/examples/vhost_user_rdma/vhost_rdma_ib.h
@@ -957,6 +957,10 @@ struct vhost_rdma_cmd_destroy_qp {
uint32_t qpn;
};
+struct vhost_rdma_cmd_query_pkey{
+ uint16_t pkey;
+};
+
/**
* @brief Convert IB MTU enum to byte size
* @param mtu The MTU enum value
--
2.43.0
^ permalink raw reply related [flat|nested] 17+ messages in thread* Re: Implement initial driver for virtio-RDMA devices(kernel), virtio-rdma device model(qemu) and vhost-user-RDMA backend device(dpdk)
2025-12-17 8:49 Implement initial driver for virtio-RDMA devices(kernel), virtio-rdma device model(qemu) and vhost-user-RDMA backend device(dpdk) Xiong Weimin
` (9 preceding siblings ...)
2025-12-17 8:49 ` [PATCH 09/14] examples/vhost_user_rdma: implement P_Key query operation with default partition key Xiong Weimin
@ 2025-12-18 16:20 ` Leon Romanovsky
2025-12-19 2:26 ` 熊伟民
10 siblings, 1 reply; 17+ messages in thread
From: Leon Romanovsky @ 2025-12-18 16:20 UTC (permalink / raw)
To: Xiong Weimin
Cc: Alexei Starovoitov, Daniel Borkmann, David S . Miller,
Jakub Kicinski, Jesper Dangaard Brouer, John Fastabend,
Stanislav Fomichev, linux-kernel, netdev
On Wed, Dec 17, 2025 at 04:49:47PM +0800, Xiong Weimin wrote:
> Hi all,
>
> This testing instructions aims to introduce an emulating a soft ROCE
> device with normal NIC(no RDMA).
What is it? We already have one soft RoCE device implemented in the
kernel (drivers/infiniband/sw/rxe), which doesn't require any QEMU
changes at all.
Thanks
^ permalink raw reply [flat|nested] 17+ messages in thread* Re:Re: Implement initial driver for virtio-RDMA devices(kernel), virtio-rdma device model(qemu) and vhost-user-RDMA backend device(dpdk)
2025-12-18 16:20 ` Implement initial driver for virtio-RDMA devices(kernel), virtio-rdma device model(qemu) and vhost-user-RDMA backend device(dpdk) Leon Romanovsky
@ 2025-12-19 2:26 ` 熊伟民
0 siblings, 0 replies; 17+ messages in thread
From: 熊伟民 @ 2025-12-19 2:26 UTC (permalink / raw)
To: Leon Romanovsky
Cc: Alexei Starovoitov, Daniel Borkmann, David S . Miller,
Jakub Kicinski, Jesper Dangaard Brouer, John Fastabend,
Stanislav Fomichev, linux-kernel, netdev
At 2025-12-19 00:20:28, "Leon Romanovsky" <leon@kernel.org> wrote:
>On Wed, Dec 17, 2025 at 04:49:47PM +0800, Xiong Weimin wrote:
>> Hi all,
>>
>> This testing instructions aims to introduce an emulating a soft ROCE
>> device with normal NIC(no RDMA).
>
>What is it? We already have one soft RoCE device implemented in the
>kernel (drivers/infiniband/sw/rxe), which doesn't require any QEMU
>changes at all.
>
>Thanks
>
The framework of vhost_user_rdma (DPDK) / virtio-rdma driver (kernel) is actually
a userspace RDMA backend optimized for virtualization, while rxe (Soft-RoCE)
is a kernel-based software RDMA implementation. Key advantages include:
1. Zero-Copy Architecture: vhost_user_rdma uses shared memory between VMs and
host processes, eliminating data copies. rxe requires kernel-mediated data copies,
adding latency.
2. Polling Mode: Avoids VM-Exit interrupts by using busy-wait polling, reducing
CPU context switches.
3. QEMU/KVM Native Support: vhost_user_rdma integrates directly with hypervisors
via the vhost-user protocol. rxe requires PCI device passthrough (e.g., VFIO),
complicating deployment.
4. Features Support: vhost_user_rdma enables live migration, multi-queue virtio,
and NUMA-aware I/O processing.
5. Userspace Processing: Operates entirely in userspace (e.g., with SPDK), bypassing
the kernel network stack. rxe relies on the Linux kernel network stack, consuming more
CPU resources.
6. Resource Efficiency: Achieves lower latency in benchmarks for VM-to-VM communication.
7. vhost-user Backend: DPDK provides a vhost-user library that implements the vhost-user
protocol in userspace. This library enables efficient communication between the hypervisor
(QEMU) and the userspace networking stack (like a DPDK-based application). For RDMA, this
means that the vhost-user backend can directly handle RDMA operations without going through
the kernel.
Thanks
^ permalink raw reply [flat|nested] 17+ messages in thread