From: Cliff Burdick <cburdick@nvidia.com>
To: <dev@dpdk.org>
Cc: <anatoly.burakov@intel.com>, Thomas Monjalon <thomas@monjalon.net>
Subject: [PATCH 1/2] eal: support dmabuf
Date: Tue, 27 Jan 2026 17:44:08 +0000 [thread overview]
Message-ID: <20260127174429.1504288-2-cburdick@nvidia.com> (raw)
In-Reply-To: <20260127174429.1504288-1-cburdick@nvidia.com>
dmabuf is a modern Linux kernel feature that allows DMA buffer sharing
between two drivers. Common examples of usage are streaming video devices
and NIC-to-GPU transfers. Prior to dmabuf, users had to load proprietary
drivers to expose the DMA mappings. With dmabuf, the proprietary drivers
are no longer required.
A new API function rte_extmem_register_dmabuf is introduced to create
the mapping from a dmabuf file descriptor. dmabuf uses a file descriptor
and an offset that have been pre-opened with the kernel. The kernel uses
the file descriptor to map to a VA pointer. To avoid ABI changes, a
static struct is used inside eal_common_memory.c, and lookups are
done on this struct rather than from the rte_memseg_list.
Ideally we would like to add both the dmabuf file descriptor and offset
to rte_memseg_list, but it's not clear if we can reuse existing fields
when using the dmabuf API.
We could rename the "external" flag to a more generic "properties" field where
"external" is the lowest bit; then we can use the second bit to indicate the
presence of dmabuf. When the dmabuf flag is set, we could
reuse the base_va address field for the dmabuf offset, and the socket_id
for the file descriptor.
Which option is preferred?
Signed-off-by: Cliff Burdick <cburdick@nvidia.com>
---
.mailmap | 1 +
lib/eal/common/eal_common_memory.c | 168 +++++++++++++++++++++++++++++
lib/eal/common/eal_memalloc.h | 21 ++++
lib/eal/common/malloc_heap.c | 27 +++++
lib/eal/common/malloc_heap.h | 5 +
lib/eal/include/rte_memory.h | 125 +++++++++++++++++++++
6 files changed, 347 insertions(+)
diff --git a/.mailmap b/.mailmap
index 2f089326ff..4c2b2f921d 100644
--- a/.mailmap
+++ b/.mailmap
@@ -291,6 +291,7 @@ Cian Ferriter <cian.ferriter@intel.com>
Ciara Loftus <ciara.loftus@intel.com>
Ciara Power <ciara.power@intel.com>
Claire Murphy <claire.k.murphy@intel.com>
+Cliff Burdick <cburdick@nvidia.com>
Clemens Famulla-Conrad <cfamullaconrad@suse.com>
Cody Doucette <doucette@bu.edu>
Congwen Zhang <zhang.congwen@zte.com.cn>
diff --git a/lib/eal/common/eal_common_memory.c b/lib/eal/common/eal_common_memory.c
index c62edf5e55..304ed18396 100644
--- a/lib/eal/common/eal_common_memory.c
+++ b/lib/eal/common/eal_common_memory.c
@@ -45,6 +45,18 @@
static void *next_baseaddr;
static uint64_t system_page_sz;
+/* Internal storage for dmabuf info, indexed by memseg list index.
+ * This keeps dmabuf metadata out of the public rte_memseg_list structure
+ * to preserve ABI compatibility.
+ */
+static struct {
+ int fd; /**< dmabuf fd, -1 if not dmabuf backed */
+ uint64_t offset; /**< offset within dmabuf */
+ } dmabuf_info[RTE_MAX_MEMSEG_LISTS] = {
+ [0 ... RTE_MAX_MEMSEG_LISTS - 1] = { .fd = -1, .offset = 0 }
+};
+
+
#define MAX_MMAP_WITH_DEFINED_ADDR_TRIES 5
void *
eal_get_virtual_area(void *requested_addr, size_t *size,
@@ -930,6 +942,109 @@ rte_memseg_get_fd_offset(const struct rte_memseg *ms, size_t *offset)
return ret;
}
+/* Internal dmabuf info functions */
+int
+eal_memseg_list_set_dmabuf_info(int list_idx, int fd, uint64_t offset)
+{
+ if (list_idx < 0 || list_idx >= RTE_MAX_MEMSEG_LISTS)
+ return -EINVAL;
+
+ dmabuf_info[list_idx].fd = fd;
+ dmabuf_info[list_idx].offset = offset;
+ return 0;
+}
+
+int
+eal_memseg_list_get_dmabuf_fd(int list_idx)
+{
+ if (list_idx < 0 || list_idx >= RTE_MAX_MEMSEG_LISTS)
+ return -EINVAL;
+
+ return dmabuf_info[list_idx].fd;
+}
+
+int
+eal_memseg_list_get_dmabuf_offset(int list_idx, uint64_t *offset)
+{
+ if (list_idx < 0 || list_idx >= RTE_MAX_MEMSEG_LISTS || offset == NULL)
+ return -EINVAL;
+
+ *offset = dmabuf_info[list_idx].offset;
+ return 0;
+}
+
+/* Public dmabuf info API functions */
+RTE_EXPORT_SYMBOL(rte_memseg_list_get_dmabuf_fd_thread_unsafe)
+int
+rte_memseg_list_get_dmabuf_fd_thread_unsafe(const struct rte_memseg_list *msl)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int msl_idx;
+
+ if (msl == NULL) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ msl_idx = msl - mcfg->memsegs;
+ if (msl_idx < 0 || msl_idx >= RTE_MAX_MEMSEG_LISTS) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ return dmabuf_info[msl_idx].fd;
+}
+
+RTE_EXPORT_SYMBOL(rte_memseg_list_get_dmabuf_fd)
+int
+rte_memseg_list_get_dmabuf_fd(const struct rte_memseg_list *msl)
+{
+ int ret;
+
+ rte_mcfg_mem_read_lock();
+ ret = rte_memseg_list_get_dmabuf_fd_thread_unsafe(msl);
+ rte_mcfg_mem_read_unlock();
+
+ return ret;
+}
+
+RTE_EXPORT_SYMBOL(rte_memseg_list_get_dmabuf_offset_thread_unsafe)
+int
+rte_memseg_list_get_dmabuf_offset_thread_unsafe(const struct rte_memseg_list *msl,
+ uint64_t *offset)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int msl_idx;
+
+ if (msl == NULL || offset == NULL) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ msl_idx = msl - mcfg->memsegs;
+ if (msl_idx < 0 || msl_idx >= RTE_MAX_MEMSEG_LISTS) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ *offset = dmabuf_info[msl_idx].offset;
+ return 0;
+}
+
+RTE_EXPORT_SYMBOL(rte_memseg_list_get_dmabuf_offset)
+int
+rte_memseg_list_get_dmabuf_offset(const struct rte_memseg_list *msl,
+ uint64_t *offset)
+{
+ int ret;
+
+ rte_mcfg_mem_read_lock();
+ ret = rte_memseg_list_get_dmabuf_offset_thread_unsafe(msl, offset);
+ rte_mcfg_mem_read_unlock();
+
+ return ret;
+}
+
RTE_EXPORT_SYMBOL(rte_extmem_register)
int
rte_extmem_register(void *va_addr, size_t len, rte_iova_t iova_addrs[],
@@ -980,6 +1095,59 @@ rte_extmem_register(void *va_addr, size_t len, rte_iova_t iova_addrs[],
return ret;
}
+RTE_EXPORT_SYMBOL(rte_extmem_register_dmabuf)
+int
+rte_extmem_register_dmabuf(void *va_addr, size_t len,
+ int dmabuf_fd, uint64_t dmabuf_offset,
+ rte_iova_t iova_addrs[], unsigned int n_pages, size_t page_sz)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ unsigned int socket_id, n;
+ int ret = 0;
+
+ if (va_addr == NULL || page_sz == 0 || len == 0 ||
+ !rte_is_power_of_2(page_sz) ||
+ RTE_ALIGN(len, page_sz) != len ||
+ ((len / page_sz) != n_pages && iova_addrs != NULL) ||
+ !rte_is_aligned(va_addr, page_sz) ||
+ dmabuf_fd < 0) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+ rte_mcfg_mem_write_lock();
+
+ /* make sure the segment doesn't already exist */
+ if (malloc_heap_find_external_seg(va_addr, len) != NULL) {
+ rte_errno = EEXIST;
+ ret = -1;
+ goto unlock;
+ }
+
+ /* get next available socket ID */
+ socket_id = mcfg->next_socket_id;
+ if (socket_id > INT32_MAX) {
+ EAL_LOG(ERR, "Cannot assign new socket ID's");
+ rte_errno = ENOSPC;
+ ret = -1;
+ goto unlock;
+ }
+
+ /* we can create a new memseg with dma-buf info */
+ n = len / page_sz;
+ if (malloc_heap_create_external_seg_dmabuf(va_addr, iova_addrs, n,
+ page_sz, "extmem_dmabuf", socket_id,
+ dmabuf_fd, dmabuf_offset) == NULL) {
+ ret = -1;
+ goto unlock;
+ }
+
+ /* memseg list successfully created - increment next socket ID */
+ mcfg->next_socket_id++;
+unlock:
+ rte_mcfg_mem_write_unlock();
+ return ret;
+}
+
RTE_EXPORT_SYMBOL(rte_extmem_unregister)
int
rte_extmem_unregister(void *va_addr, size_t len)
diff --git a/lib/eal/common/eal_memalloc.h b/lib/eal/common/eal_memalloc.h
index 0c267066d9..bb2cfa0717 100644
--- a/lib/eal/common/eal_memalloc.h
+++ b/lib/eal/common/eal_memalloc.h
@@ -90,6 +90,27 @@ eal_memalloc_set_seg_list_fd(int list_idx, int fd);
int
eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset);
+/*
+ * Set dmabuf info for a memseg list.
+ * Returns 0 on success, -errno on failure.
+ */
+int
+eal_memseg_list_set_dmabuf_info(int list_idx, int fd, uint64_t offset);
+
+/*
+ * Get dmabuf fd for a memseg list.
+ * Returns fd (>= 0) on success, -1 if not dmabuf backed, -errno on error.
+ */
+int
+eal_memseg_list_get_dmabuf_fd(int list_idx);
+
+/*
+ * Get dmabuf offset for a memseg list.
+ * Returns 0 on success, -errno on failure.
+ */
+int
+eal_memseg_list_get_dmabuf_offset(int list_idx, uint64_t *offset);
+
int
eal_memalloc_init(void)
__rte_requires_shared_capability(rte_mcfg_mem_get_lock());
diff --git a/lib/eal/common/malloc_heap.c b/lib/eal/common/malloc_heap.c
index 39240c261c..fd0376d13b 100644
--- a/lib/eal/common/malloc_heap.c
+++ b/lib/eal/common/malloc_heap.c
@@ -1232,6 +1232,33 @@ malloc_heap_create_external_seg(void *va_addr, rte_iova_t iova_addrs[],
msl->version = 0;
msl->external = 1;
+ /* initialize dmabuf info to "not dmabuf backed" */
+ eal_memseg_list_set_dmabuf_info(i, -1, 0);
+
+ return msl;
+}
+
+struct rte_memseg_list *
+malloc_heap_create_external_seg_dmabuf(void *va_addr, rte_iova_t iova_addrs[],
+ unsigned int n_pages, size_t page_sz, const char *seg_name,
+ unsigned int socket_id, int dmabuf_fd, uint64_t dmabuf_offset)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *msl;
+ int msl_idx;
+
+ /* Create the base external segment */
+ msl = malloc_heap_create_external_seg(va_addr, iova_addrs, n_pages,
+ page_sz, seg_name, socket_id);
+ if (msl == NULL)
+ return NULL;
+
+ /* Get memseg list index */
+ msl_idx = msl - mcfg->memsegs;
+
+ /* Set dma-buf info in the internal side-table */
+ eal_memseg_list_set_dmabuf_info(msl_idx, dmabuf_fd, dmabuf_offset);
+
return msl;
}
diff --git a/lib/eal/common/malloc_heap.h b/lib/eal/common/malloc_heap.h
index dfc56d4ae3..87525d1a68 100644
--- a/lib/eal/common/malloc_heap.h
+++ b/lib/eal/common/malloc_heap.h
@@ -51,6 +51,11 @@ malloc_heap_create_external_seg(void *va_addr, rte_iova_t iova_addrs[],
unsigned int n_pages, size_t page_sz, const char *seg_name,
unsigned int socket_id);
+struct rte_memseg_list *
+malloc_heap_create_external_seg_dmabuf(void *va_addr, rte_iova_t iova_addrs[],
+ unsigned int n_pages, size_t page_sz, const char *seg_name,
+ unsigned int socket_id, int dmabuf_fd, uint64_t dmabuf_offset);
+
struct rte_memseg_list *
malloc_heap_find_external_seg(void *va_addr, size_t len);
diff --git a/lib/eal/include/rte_memory.h b/lib/eal/include/rte_memory.h
index b6e97ad695..d1c2fc8aa5 100644
--- a/lib/eal/include/rte_memory.h
+++ b/lib/eal/include/rte_memory.h
@@ -405,6 +405,82 @@ int
rte_memseg_get_fd_offset_thread_unsafe(const struct rte_memseg *ms,
size_t *offset);
+/**
+ * Get dma-buf file descriptor associated with a memseg list.
+ *
+ * @note This function read-locks the memory hotplug subsystem, and thus cannot
+ * be used within memory-related callback functions.
+ *
+ * @param msl
+ * A pointer to memseg list for which to get dma-buf fd.
+ *
+ * @return
+ * Valid dma-buf file descriptor (>= 0) in case of success.
+ * -1 if not dma-buf backed or in case of error, with ``rte_errno`` set to:
+ * - EINVAL - ``msl`` pointer was NULL or did not point to a valid memseg list
+ */
+int
+rte_memseg_list_get_dmabuf_fd(const struct rte_memseg_list *msl);
+
+/**
+ * Get dma-buf file descriptor associated with a memseg list.
+ *
+ * @note This function does not perform any locking, and is only safe to call
+ * from within memory-related callback functions.
+ *
+ * @param msl
+ * A pointer to memseg list for which to get dma-buf fd.
+ *
+ * @return
+ * Valid dma-buf file descriptor (>= 0) in case of success.
+ * -1 if not dma-buf backed or in case of error, with ``rte_errno`` set to:
+ * - EINVAL - ``msl`` pointer was NULL or did not point to a valid memseg list
+ */
+int
+rte_memseg_list_get_dmabuf_fd_thread_unsafe(const struct rte_memseg_list *msl);
+
+/**
+ * Get dma-buf offset associated with a memseg list.
+ *
+ * @note This function read-locks the memory hotplug subsystem, and thus cannot
+ * be used within memory-related callback functions.
+ *
+ * @param msl
+ * A pointer to memseg list for which to get dma-buf offset.
+ * @param offset
+ * A pointer to offset value where the result will be stored.
+ *
+ * @return
+ * 0 on success.
+ * -1 in case of error, with ``rte_errno`` set to:
+ * - EINVAL - ``msl`` pointer was NULL or did not point to a valid memseg list
+ * - EINVAL - ``offset`` pointer was NULL
+ */
+int
+rte_memseg_list_get_dmabuf_offset(const struct rte_memseg_list *msl,
+ uint64_t *offset);
+
+/**
+ * Get dma-buf offset associated with a memseg list.
+ *
+ * @note This function does not perform any locking, and is only safe to call
+ * from within memory-related callback functions.
+ *
+ * @param msl
+ * A pointer to memseg list for which to get dma-buf offset.
+ * @param offset
+ * A pointer to offset value where the result will be stored.
+ *
+ * @return
+ * 0 on success.
+ * -1 in case of error, with ``rte_errno`` set to:
+ * - EINVAL - ``msl`` pointer was NULL or did not point to a valid memseg list
+ * - EINVAL - ``offset`` pointer was NULL
+ */
+int
+rte_memseg_list_get_dmabuf_offset_thread_unsafe(const struct rte_memseg_list *msl,
+ uint64_t *offset);
+
/**
* Register external memory chunk with DPDK.
*
@@ -443,6 +519,55 @@ int
rte_extmem_register(void *va_addr, size_t len, rte_iova_t iova_addrs[],
unsigned int n_pages, size_t page_sz);
+/**
+ * Register external memory chunk backed by a dma-buf with DPDK.
+ *
+ * This is similar to rte_extmem_register() but additionally stores dma-buf
+ * file descriptor information, allowing drivers to use dma-buf based
+ * memory registration (e.g., ibv_reg_dmabuf_mr for RDMA devices).
+ *
+ * @note Using this API is mutually exclusive with ``rte_malloc`` family of
+ * APIs.
+ *
+ * @note This API will not perform any DMA mapping. It is expected that user
+ * will do that themselves via rte_dev_dma_map().
+ *
+ * @note Before accessing this memory in other processes, it needs to be
+ * attached in each of those processes by calling ``rte_extmem_attach`` in
+ * each other process.
+ *
+ * @param va_addr
+ * Start of virtual area to register (mmap'd address of the dma-buf).
+ * Must be aligned by ``page_sz``.
+ * @param len
+ * Length of virtual area to register. Must be aligned by ``page_sz``.
+ * This is independent of dmabuf_offset.
+ * @param dmabuf_fd
+ * File descriptor of the dma-buf.
+ * @param dmabuf_offset
+ * Offset within the dma-buf where the registered region starts.
+ * @param iova_addrs
+ * Array of page IOVA addresses corresponding to each page in this memory
+ * area. Can be NULL, in which case page IOVA addresses will be set to
+ * RTE_BAD_IOVA.
+ * @param n_pages
+ * Number of elements in the iova_addrs array. Ignored if ``iova_addrs``
+ * is NULL.
+ * @param page_sz
+ *   Page size of the underlying memory.
+ *
+ * @return
+ * - 0 on success
+ * - -1 in case of error, with rte_errno set to one of the following:
+ * EINVAL - one of the parameters was invalid
+ * EEXIST - memory chunk is already registered
+ * ENOSPC - no more space in internal config to store a new memory chunk
+ */
+int
+rte_extmem_register_dmabuf(void *va_addr, size_t len,
+ int dmabuf_fd, uint64_t dmabuf_offset,
+ rte_iova_t iova_addrs[], unsigned int n_pages, size_t page_sz);
+
/**
* Unregister external memory chunk with DPDK.
*
--
2.52.0
next prev parent reply other threads:[~2026-01-27 17:45 UTC|newest]
Thread overview: 27+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-01-27 17:44 [PATCH 0/2] support dmabuf Cliff Burdick
2026-01-27 17:44 ` Cliff Burdick [this message]
2026-01-29 1:48 ` [PATCH 1/2] eal: " Stephen Hemminger
2026-01-29 1:51 ` Stephen Hemminger
2026-01-27 17:44 ` [PATCH 2/2] common/mlx5: " Cliff Burdick
2026-01-27 19:21 ` [REVIEW] " Stephen Hemminger
2026-01-28 14:30 ` David Marchand
2026-01-28 17:10 ` Stephen Hemminger
2026-01-28 17:43 ` Stephen Hemminger
2026-02-03 17:34 ` Cliff Burdick
2026-01-29 1:51 ` [PATCH 2/2] " Stephen Hemminger
2026-01-28 0:04 ` [PATCH 0/2] " Stephen Hemminger
2026-02-03 17:18 ` Cliff Burdick
2026-02-03 22:26 ` [PATCH v2 " Cliff Burdick
2026-02-03 22:26 ` [PATCH v2 1/2] eal: " Cliff Burdick
2026-02-03 22:26 ` [PATCH v2 2/2] common/mlx5: " Cliff Burdick
2026-02-03 23:02 ` [PATCH v3 0/2] " Cliff Burdick
2026-02-03 23:02 ` [PATCH v3 1/2] eal: " Cliff Burdick
2026-02-03 23:02 ` [PATCH v3 2/2] common/mlx5: " Cliff Burdick
2026-02-04 15:50 ` [PATCH v4 0/2] " Cliff Burdick
2026-02-04 15:50 ` [PATCH v4 1/2] eal: " Cliff Burdick
2026-02-12 13:57 ` Burakov, Anatoly
2026-02-04 15:50 ` [PATCH v4 2/2] common/mlx5: " Cliff Burdick
2026-02-05 18:48 ` [PATCH v4 0/2] " Stephen Hemminger
2026-02-05 20:25 ` Cliff Burdick
2026-02-05 22:50 ` Stephen Hemminger
2026-03-31 3:15 ` Stephen Hemminger
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260127174429.1504288-2-cburdick@nvidia.com \
--to=cburdick@nvidia.com \
--cc=anatoly.burakov@intel.com \
--cc=dev@dpdk.org \
--cc=thomas@monjalon.net \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox