From: Ming Lei <ming.lei@redhat.com>
To: Jens Axboe <axboe@kernel.dk>, linux-block@vger.kernel.org
Cc: Caleb Sander Mateos <csander@purestorage.com>,
Ming Lei <ming.lei@redhat.com>
Subject: [PATCH v2 01/10] ublk: add UBLK_U_CMD_REG_BUF/UNREG_BUF control commands
Date: Tue, 31 Mar 2026 23:31:52 +0800 [thread overview]
Message-ID: <20260331153207.3635125-2-ming.lei@redhat.com> (raw)
In-Reply-To: <20260331153207.3635125-1-ming.lei@redhat.com>
Add control commands for registering and unregistering shared memory
buffers for zero-copy I/O:
- UBLK_U_CMD_REG_BUF (0x18): pins pages from userspace, inserts PFN
ranges into a per-device maple tree for O(log n) lookup during I/O.
Buffer pointers are tracked in a per-device xarray. Returns the
assigned buffer index.
- UBLK_U_CMD_UNREG_BUF (0x19): removes PFN entries and unpins pages.
Queue freeze/unfreeze is handled internally so userspace need not
quiesce the device during registration.
Also adds:
- UBLK_IO_F_SHMEM_ZC flag and addr encoding helpers in UAPI header
(16-bit buffer index supporting up to 65536 buffers)
- Data structures (ublk_buf, ublk_buf_range) and xarray/maple tree
- __ublk_ctrl_reg_buf() helper for PFN insertion with error unwinding
- __ublk_ctrl_unreg_buf() helper for cleanup reuse
- ublk_support_shmem_zc() / ublk_dev_support_shmem_zc() stubs
(returning false — feature not enabled yet)
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
drivers/block/ublk_drv.c | 300 ++++++++++++++++++++++++++++++++++
include/uapi/linux/ublk_cmd.h | 72 ++++++++
2 files changed, 372 insertions(+)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 71c7c56b38ca..ac6ccc174d44 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -46,6 +46,8 @@
#include <linux/kref.h>
#include <linux/kfifo.h>
#include <linux/blk-integrity.h>
+#include <linux/maple_tree.h>
+#include <linux/xarray.h>
#include <uapi/linux/fs.h>
#include <uapi/linux/ublk_cmd.h>
@@ -58,6 +60,8 @@
#define UBLK_CMD_UPDATE_SIZE _IOC_NR(UBLK_U_CMD_UPDATE_SIZE)
#define UBLK_CMD_QUIESCE_DEV _IOC_NR(UBLK_U_CMD_QUIESCE_DEV)
#define UBLK_CMD_TRY_STOP_DEV _IOC_NR(UBLK_U_CMD_TRY_STOP_DEV)
+#define UBLK_CMD_REG_BUF _IOC_NR(UBLK_U_CMD_REG_BUF)
+#define UBLK_CMD_UNREG_BUF _IOC_NR(UBLK_U_CMD_UNREG_BUF)
#define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF)
#define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF)
@@ -289,6 +293,20 @@ struct ublk_queue {
struct ublk_io ios[] __counted_by(q_depth);
};
+/*
+ * Per-registered shared memory buffer. Instances are stored in
+ * ublk_device.bufs_xa under the buffer index returned to userspace.
+ */
+struct ublk_buf {
+ struct page **pages; /* pinned user pages, in buffer order */
+ unsigned int nr_pages; /* number of entries in pages[] */
+};
+
+/*
+ * Maple tree value: maps a PFN range to buffer location. One entry
+ * describes one run of consecutive PFNs inside a registered buffer.
+ */
+struct ublk_buf_range {
+ unsigned long base_pfn; /* first PFN of the run */
+ unsigned short buf_index; /* index of the owning buffer in bufs_xa */
+ unsigned short flags; /* flags the buffer was registered with */
+ unsigned int base_offset; /* byte offset within buffer */
+};
+
struct ublk_device {
struct gendisk *ub_disk;
@@ -323,6 +341,10 @@ struct ublk_device {
bool block_open; /* protected by open_mutex */
+ /* shared memory zero copy */
+ struct maple_tree buf_tree;
+ struct xarray bufs_xa;
+
struct ublk_queue *queues[];
};
@@ -334,6 +356,7 @@ struct ublk_params_header {
static void ublk_io_release(void *priv);
static void ublk_stop_dev_unlocked(struct ublk_device *ub);
+static void ublk_buf_cleanup(struct ublk_device *ub);
static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
u16 q_id, u16 tag, struct ublk_io *io);
@@ -398,6 +421,16 @@ static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub)
return ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY;
}
+/* Stub: shared memory zero copy is not enabled yet, so always false. */
+static inline bool ublk_support_shmem_zc(const struct ublk_queue *ubq)
+{
+ return false;
+}
+
+/* Device-level stub: shared memory zero copy is not enabled yet. */
+static inline bool ublk_dev_support_shmem_zc(const struct ublk_device *ub)
+{
+ return false;
+}
+
static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq)
{
return ubq->flags & UBLK_F_AUTO_BUF_REG;
@@ -1460,6 +1493,7 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
iod->op_flags = ublk_op | ublk_req_build_flags(req);
iod->nr_sectors = blk_rq_sectors(req);
iod->start_sector = blk_rq_pos(req);
+
iod->addr = io->buf.addr;
return BLK_STS_OK;
@@ -1665,6 +1699,7 @@ static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
{
unsigned mapped_bytes = ublk_map_io(ubq, req, io);
+
/* partially mapped, update io descriptor */
if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
/*
@@ -4206,6 +4241,7 @@ static void ublk_cdev_rel(struct device *dev)
{
struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev);
+ ublk_buf_cleanup(ub);
blk_mq_free_tag_set(&ub->tag_set);
ublk_deinit_queues(ub);
ublk_free_dev_number(ub);
@@ -4625,6 +4661,8 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
mutex_init(&ub->mutex);
spin_lock_init(&ub->lock);
mutex_init(&ub->cancel_mutex);
+ mt_init(&ub->buf_tree);
+ xa_init_flags(&ub->bufs_xa, XA_FLAGS_ALLOC);
INIT_WORK(&ub->partition_scan_work, ublk_partition_scan_work);
ret = ublk_alloc_dev_number(ub, header->dev_id);
@@ -5168,6 +5206,260 @@ static int ublk_char_dev_permission(struct ublk_device *ub,
return err;
}
+/*
+ * Drain inflight I/O and leave the queue quiesced. Freeze drains all
+ * inflight requests, quiesce_nowait marks the queue so no new requests
+ * dispatch, then unfreeze allows new submissions (which won't dispatch
+ * due to quiesce). Dropping the freeze before returning keeps freeze and
+ * ub->mutex non-nested. Callers pair this with
+ * ublk_unquiesce_and_resume().
+ */
+static void ublk_quiesce_and_release(struct gendisk *disk)
+{
+ unsigned int memflags;
+
+ memflags = blk_mq_freeze_queue(disk->queue);
+ blk_mq_quiesce_queue_nowait(disk->queue);
+ blk_mq_unfreeze_queue(disk->queue, memflags);
+}
+
+/* Counterpart of ublk_quiesce_and_release(): unquiesce the queue. */
+static void ublk_unquiesce_and_resume(struct gendisk *disk)
+{
+ blk_mq_unquiesce_queue(disk->queue);
+}
+
+/*
+ * Erase coalesced PFN ranges from the maple tree for pages [0, nr_pages).
+ * Runs of consecutive PFNs are recomputed the same way
+ * __ublk_ctrl_reg_buf() computed them when inserting.
+ */
+static void ublk_buf_erase_ranges(struct ublk_device *ub,
+				  struct ublk_buf *ubuf,
+				  unsigned long nr_pages)
+{
+	unsigned long i;
+
+	for (i = 0; i < nr_pages; ) {
+		unsigned long pfn = page_to_pfn(ubuf->pages[i]);
+		unsigned long start = i;
+
+		while (i + 1 < nr_pages &&
+		       page_to_pfn(ubuf->pages[i + 1]) == pfn + (i - start) + 1)
+			i++;
+		i++;
+		/* The run was inserted at its first PFN; free its entry. */
+		kfree(mtree_erase(&ub->buf_tree, pfn));
+	}
+}
+
+/*
+ * Insert PFN ranges of a registered buffer into the maple tree,
+ * coalescing consecutive PFNs into single range entries.
+ * Returns 0 on success, negative error with partial insertions unwound.
+ */
+static int __ublk_ctrl_reg_buf(struct ublk_device *ub,
+			       struct ublk_buf *ubuf, int index,
+			       unsigned short flags)
+{
+	unsigned long nr_pages = ubuf->nr_pages;
+	unsigned long i, end;
+	int ret;
+
+	/*
+	 * i is only advanced once a run has been inserted, so on failure it
+	 * still names the first page of the run that was NOT inserted.
+	 */
+	for (i = 0; i < nr_pages; i = end) {
+		unsigned long pfn = page_to_pfn(ubuf->pages[i]);
+		struct ublk_buf_range *range;
+
+		/* Find run of consecutive PFNs: pages [i, end) */
+		end = i + 1;
+		while (end < nr_pages &&
+		       page_to_pfn(ubuf->pages[end]) == pfn + (end - i))
+			end++;
+
+		range = kzalloc(sizeof(*range), GFP_KERNEL);
+		if (!range) {
+			ret = -ENOMEM;
+			goto unwind;
+		}
+		range->buf_index = index;
+		range->flags = flags;
+		range->base_pfn = pfn;
+		range->base_offset = i << PAGE_SHIFT;
+
+		ret = mtree_insert_range(&ub->buf_tree, pfn,
+					 pfn + (end - i) - 1,
+					 range, GFP_KERNEL);
+		if (ret) {
+			kfree(range);
+			goto unwind;
+		}
+	}
+	return 0;
+
+unwind:
+	/*
+	 * Erase only the runs this call inserted, i.e. pages [0, i). The
+	 * failed run must be excluded: if the insert failed because its
+	 * PFNs are already in the tree (e.g. -EEXIST), that entry belongs
+	 * to another registered buffer and erasing it here would drop and
+	 * free the other buffer's range.
+	 */
+	ublk_buf_erase_ranges(ub, ubuf, i);
+	return ret;
+}
+
+/*
+ * Register a shared memory buffer for zero-copy I/O.
+ * Pins pages, builds PFN maple tree, freezes/unfreezes the queue
+ * internally. Returns buffer index (>= 0) on success.
+ *
+ * header->addr points to a struct ublk_shmem_buf_reg in userspace and
+ * header->len is its size; if fewer bytes are supplied, the remaining
+ * fields are treated as zero.
+ *
+ * Errors returned here: -EOPNOTSUPP if the device does not support shmem
+ * zero copy, -EFAULT if the argument cannot be copied or not every page
+ * could be pinned, -EINVAL for unknown flags, a zero len, or an addr/len
+ * that is not page-aligned, -ENODEV if there is no disk, -ENOMEM on
+ * allocation failure. Errors from pin_user_pages_fast(), xa_alloc() and
+ * __ublk_ctrl_reg_buf() are passed through.
+ */
+static int ublk_ctrl_reg_buf(struct ublk_device *ub,
+ struct ublksrv_ctrl_cmd *header)
+{
+ void __user *argp = (void __user *)(unsigned long)header->addr;
+ struct ublk_shmem_buf_reg buf_reg;
+ unsigned long addr, size, nr_pages;
+ unsigned int gup_flags;
+ struct gendisk *disk;
+ struct ublk_buf *ubuf;
+ long pinned;
+ u32 index;
+ int ret;
+
+ if (!ublk_dev_support_shmem_zc(ub))
+ return -EOPNOTSUPP;
+
+ /* Zero first so a short user copy leaves the tail fields as 0 */
+ memset(&buf_reg, 0, sizeof(buf_reg));
+ if (copy_from_user(&buf_reg, argp,
+ min_t(size_t, header->len, sizeof(buf_reg))))
+ return -EFAULT;
+
+ /* Reject any flag bit other than UBLK_SHMEM_BUF_READ_ONLY */
+ if (buf_reg.flags & ~UBLK_SHMEM_BUF_READ_ONLY)
+ return -EINVAL;
+
+ addr = buf_reg.addr;
+ size = buf_reg.len;
+ nr_pages = size >> PAGE_SHIFT;
+
+ if (!size || !PAGE_ALIGNED(size) || !PAGE_ALIGNED(addr))
+ return -EINVAL;
+
+ disk = ublk_get_disk(ub);
+ if (!disk)
+ return -ENODEV;
+
+ /* Pin pages before quiescing (may sleep) */
+ ubuf = kzalloc(sizeof(*ubuf), GFP_KERNEL);
+ if (!ubuf) {
+ ret = -ENOMEM;
+ goto put_disk;
+ }
+
+ ubuf->pages = kvmalloc_array(nr_pages, sizeof(*ubuf->pages),
+ GFP_KERNEL);
+ if (!ubuf->pages) {
+ ret = -ENOMEM;
+ goto err_free;
+ }
+
+ /* Read-only registrations pin without FOLL_WRITE */
+ gup_flags = FOLL_LONGTERM;
+ if (!(buf_reg.flags & UBLK_SHMEM_BUF_READ_ONLY))
+ gup_flags |= FOLL_WRITE;
+
+ pinned = pin_user_pages_fast(addr, nr_pages, gup_flags, ubuf->pages);
+ if (pinned < 0) {
+ ret = pinned;
+ goto err_free_pages;
+ }
+ /* A short pin is treated as failure; the pinned part is released */
+ if (pinned != nr_pages) {
+ ret = -EFAULT;
+ goto err_unpin;
+ }
+ ubuf->nr_pages = nr_pages;
+
+ /*
+ * Drain inflight I/O and quiesce the queue so no new requests
+ * are dispatched while we modify the maple tree. Keep freeze
+ * and mutex non-nested to avoid lock dependency.
+ */
+ ublk_quiesce_and_release(disk);
+
+ mutex_lock(&ub->mutex);
+
+ /* xa_limit_16b keeps the index within 16 bits */
+ ret = xa_alloc(&ub->bufs_xa, &index, ubuf, xa_limit_16b, GFP_KERNEL);
+ if (ret)
+ goto err_unlock;
+
+ ret = __ublk_ctrl_reg_buf(ub, ubuf, index, buf_reg.flags);
+ if (ret) {
+ xa_erase(&ub->bufs_xa, index);
+ goto err_unlock;
+ }
+
+ mutex_unlock(&ub->mutex);
+
+ ublk_unquiesce_and_resume(disk);
+ ublk_put_disk(disk);
+ return index;
+
+err_unlock:
+ mutex_unlock(&ub->mutex);
+ ublk_unquiesce_and_resume(disk);
+err_unpin:
+ /* pinned == nr_pages when falling through from err_unlock */
+ unpin_user_pages(ubuf->pages, pinned);
+err_free_pages:
+ kvfree(ubuf->pages);
+err_free:
+ kfree(ubuf);
+put_disk:
+ ublk_put_disk(disk);
+ return ret;
+}
+
+/*
+ * Tear down one buffer: erase its PFN ranges from the maple tree, unpin
+ * its pages and free both the page array and the ublk_buf itself.
+ * The bufs_xa entry is not touched here; that is left to the caller.
+ */
+static void __ublk_ctrl_unreg_buf(struct ublk_device *ub,
+ struct ublk_buf *ubuf)
+{
+ ublk_buf_erase_ranges(ub, ubuf, ubuf->nr_pages);
+ unpin_user_pages(ubuf->pages, ubuf->nr_pages);
+ kvfree(ubuf->pages);
+ kfree(ubuf);
+}
+
+/*
+ * Unregister a shared memory buffer.
+ * header->data[0] is the buffer index returned by UBLK_U_CMD_REG_BUF.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if the device does not support shmem
+ * zero copy, -ENODEV if there is no disk, -ENOENT if no buffer is
+ * registered at that index.
+ */
+static int ublk_ctrl_unreg_buf(struct ublk_device *ub,
+			       struct ublksrv_ctrl_cmd *header)
+{
+	u64 index = header->data[0];
+	struct gendisk *disk;
+	struct ublk_buf *ubuf;
+	int ret = -ENOENT;
+
+	if (!ublk_dev_support_shmem_zc(ub))
+		return -EOPNOTSUPP;
+
+	disk = ublk_get_disk(ub);
+	if (!disk)
+		return -ENODEV;
+
+	/*
+	 * Buffer indices are allocated with xa_limit_16b. Check the full
+	 * 64-bit value rather than truncating it to int, so that e.g.
+	 * 1ULL << 32 is rejected instead of unregistering buffer 0.
+	 */
+	if (index > UBLK_SHMEM_ZC_IDX_MASK)
+		goto put_disk;
+
+	/* Drain inflight I/O before modifying the maple tree */
+	ublk_quiesce_and_release(disk);
+
+	mutex_lock(&ub->mutex);
+	ubuf = xa_erase(&ub->bufs_xa, index);
+	if (ubuf) {
+		__ublk_ctrl_unreg_buf(ub, ubuf);
+		ret = 0;
+	}
+	mutex_unlock(&ub->mutex);
+
+	ublk_unquiesce_and_resume(disk);
+put_disk:
+	ublk_put_disk(disk);
+	return ret;
+}
+
+/*
+ * Release every buffer still registered on the device, then destroy the
+ * xarray and the maple tree. Called from ublk_cdev_rel().
+ */
+static void ublk_buf_cleanup(struct ublk_device *ub)
+{
+ struct ublk_buf *ubuf;
+ unsigned long index;
+
+ /* Entries are freed in place; xa_destroy() below drops the slots */
+ xa_for_each(&ub->bufs_xa, index, ubuf)
+ __ublk_ctrl_unreg_buf(ub, ubuf);
+ xa_destroy(&ub->bufs_xa);
+ mtree_destroy(&ub->buf_tree);
+}
+
+
+
static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
u32 cmd_op, struct ublksrv_ctrl_cmd *header)
{
@@ -5225,6 +5517,8 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
case UBLK_CMD_UPDATE_SIZE:
case UBLK_CMD_QUIESCE_DEV:
case UBLK_CMD_TRY_STOP_DEV:
+ case UBLK_CMD_REG_BUF:
+ case UBLK_CMD_UNREG_BUF:
mask = MAY_READ | MAY_WRITE;
break;
default:
@@ -5350,6 +5644,12 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
case UBLK_CMD_TRY_STOP_DEV:
ret = ublk_ctrl_try_stop_dev(ub);
break;
+ case UBLK_CMD_REG_BUF:
+ ret = ublk_ctrl_reg_buf(ub, &header);
+ break;
+ case UBLK_CMD_UNREG_BUF:
+ ret = ublk_ctrl_unreg_buf(ub, &header);
+ break;
default:
ret = -EOPNOTSUPP;
break;
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index a88876756805..52bb9b843d73 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -57,6 +57,44 @@
_IOWR('u', 0x16, struct ublksrv_ctrl_cmd)
#define UBLK_U_CMD_TRY_STOP_DEV \
_IOWR('u', 0x17, struct ublksrv_ctrl_cmd)
+/*
+ * Register a shared memory buffer for zero-copy I/O.
+ * Input: ctrl_cmd.addr points to struct ublk_shmem_buf_reg (buffer VA + size)
+ * ctrl_cmd.len = sizeof(struct ublk_shmem_buf_reg)
+ * Result: >= 0 is the assigned buffer index, < 0 is error
+ *
+ * The kernel pins pages from the calling process's address space
+ * and inserts PFN ranges into a per-device maple tree. When a block
+ * request's pages match registered pages, the driver sets
+ * UBLK_IO_F_SHMEM_ZC and encodes the buffer index + offset in addr,
+ * allowing the server to access the data via its own mapping of the
+ * same shared memory — true zero copy.
+ *
+ * The memory can be backed by memfd, hugetlbfs, or any GUP-compatible
+ * shared mapping. Queue freeze is handled internally.
+ *
+ * The buffer VA and size are passed via a user buffer (not inline in
+ * ctrl_cmd) so that unprivileged devices can prepend the device path
+ * to ctrl_cmd.addr without corrupting the VA.
+ */
+#define UBLK_U_CMD_REG_BUF \
+ _IOWR('u', 0x18, struct ublksrv_ctrl_cmd)
+/*
+ * Unregister a shared memory buffer.
+ * Input: ctrl_cmd.data[0] = buffer index
+ */
+#define UBLK_U_CMD_UNREG_BUF \
+ _IOWR('u', 0x19, struct ublksrv_ctrl_cmd)
+
+/* Parameter buffer for UBLK_U_CMD_REG_BUF, pointed to by ctrl_cmd.addr */
+struct ublk_shmem_buf_reg {
+ __u64 addr; /* userspace virtual address of shared memory (page-aligned) */
+ __u32 len; /* buffer size in bytes (non-zero, page-aligned, below 4GB) */
+ __u32 flags; /* UBLK_SHMEM_BUF_* flags; unknown bits are rejected */
+};
+
+/* Pin pages without FOLL_WRITE; usable with write-sealed memfd */
+#define UBLK_SHMEM_BUF_READ_ONLY (1U << 0)
/*
* 64bits are enough now, and it should be easy to extend in case of
* running out of feature flags
@@ -370,6 +408,7 @@
/* Disable automatic partition scanning when device is started */
#define UBLK_F_NO_AUTO_PART_SCAN (1ULL << 18)
+
/* device state */
#define UBLK_S_DEV_DEAD 0
#define UBLK_S_DEV_LIVE 1
@@ -469,6 +508,12 @@ struct ublksrv_ctrl_dev_info {
#define UBLK_IO_F_NEED_REG_BUF (1U << 17)
/* Request has an integrity data buffer */
#define UBLK_IO_F_INTEGRITY (1UL << 18)
+/*
+ * I/O buffer is in a registered shared memory buffer. When set, the addr
+ * field in ublksrv_io_desc encodes buffer index and byte offset instead
+ * of a userspace virtual address.
+ */
+#define UBLK_IO_F_SHMEM_ZC (1U << 19)
/*
* io cmd is described by this structure, and stored in share memory, indexed
@@ -743,4 +788,31 @@ struct ublk_params {
struct ublk_param_integrity integrity;
};
+/*
+ * Shared memory zero-copy addr encoding for UBLK_IO_F_SHMEM_ZC.
+ *
+ * When UBLK_IO_F_SHMEM_ZC is set, ublksrv_io_desc.addr is encoded as:
+ * bits [0:31] = byte offset within the buffer (up to 4GB)
+ * bits [32:47] = buffer index (16 bits, up to 65536 buffers)
+ * bits [48:63] = reserved (must be zero)
+ */
+#define UBLK_SHMEM_ZC_OFF_MASK 0xffffffffULL
+#define UBLK_SHMEM_ZC_IDX_OFF 32
+#define UBLK_SHMEM_ZC_IDX_MASK 0xffffULL
+
+/* Encode a buffer index and byte offset into a ublksrv_io_desc.addr value */
+static inline __u64 ublk_shmem_zc_addr(__u16 index, __u32 offset)
+{
+ return ((__u64)index << UBLK_SHMEM_ZC_IDX_OFF) | offset;
+}
+
+/* Extract the buffer index (bits 32-47) from an encoded addr */
+static inline __u16 ublk_shmem_zc_index(__u64 addr)
+{
+ return (addr >> UBLK_SHMEM_ZC_IDX_OFF) & UBLK_SHMEM_ZC_IDX_MASK;
+}
+
+/* Extract the byte offset within the buffer (bits 0-31) from an encoded addr */
+static inline __u32 ublk_shmem_zc_offset(__u64 addr)
+{
+ return (__u32)(addr & UBLK_SHMEM_ZC_OFF_MASK);
+}
+
#endif
--
2.53.0
next prev parent reply other threads:[~2026-03-31 15:32 UTC|newest]
Thread overview: 19+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-31 15:31 [PATCH v2 00/10] ublk: add shared memory zero-copy support Ming Lei
2026-03-31 15:31 ` Ming Lei [this message]
2026-04-07 19:35 ` [PATCH v2 01/10] ublk: add UBLK_U_CMD_REG_BUF/UNREG_BUF control commands Caleb Sander Mateos
2026-03-31 15:31 ` [PATCH v2 02/10] ublk: add PFN-based buffer matching in I/O path Ming Lei
2026-04-07 19:47 ` Caleb Sander Mateos
2026-03-31 15:31 ` [PATCH v2 03/10] ublk: enable UBLK_F_SHMEM_ZC feature flag Ming Lei
2026-04-07 19:47 ` Caleb Sander Mateos
2026-03-31 15:31 ` [PATCH v2 04/10] ublk: eliminate permanent pages[] array from struct ublk_buf Ming Lei
2026-04-07 19:50 ` Caleb Sander Mateos
2026-03-31 15:31 ` [PATCH v2 05/10] selftests/ublk: add shared memory zero-copy support in kublk Ming Lei
2026-03-31 15:31 ` [PATCH v2 06/10] selftests/ublk: add UBLK_F_SHMEM_ZC support for loop target Ming Lei
2026-03-31 15:31 ` [PATCH v2 07/10] selftests/ublk: add shared memory zero-copy test Ming Lei
2026-03-31 15:31 ` [PATCH v2 08/10] selftests/ublk: add hugetlbfs shmem_zc test for loop target Ming Lei
2026-03-31 15:32 ` [PATCH v2 09/10] selftests/ublk: add filesystem fio verify test for shmem_zc Ming Lei
2026-03-31 15:32 ` [PATCH v2 10/10] selftests/ublk: add read-only buffer registration test Ming Lei
2026-04-07 2:38 ` [PATCH v2 00/10] ublk: add shared memory zero-copy support Ming Lei
2026-04-07 13:34 ` Jens Axboe
2026-04-07 19:29 ` Caleb Sander Mateos
2026-04-07 13:44 ` Jens Axboe
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260331153207.3635125-2-ming.lei@redhat.com \
--to=ming.lei@redhat.com \
--cc=axboe@kernel.dk \
--cc=csander@purestorage.com \
--cc=linux-block@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox