public inbox for linux-block@vger.kernel.org
 help / color / mirror / Atom feed
From: Ming Lei <ming.lei@redhat.com>
To: Jens Axboe <axboe@kernel.dk>, linux-block@vger.kernel.org
Cc: Caleb Sander Mateos <csander@purestorage.com>,
	Ming Lei <ming.lei@redhat.com>
Subject: [PATCH v2 05/10] selftests/ublk: add shared memory zero-copy support in kublk
Date: Tue, 31 Mar 2026 23:31:56 +0800	[thread overview]
Message-ID: <20260331153207.3635125-6-ming.lei@redhat.com> (raw)
In-Reply-To: <20260331153207.3635125-1-ming.lei@redhat.com>

Add infrastructure for UBLK_F_SHMEM_ZC shared memory zero-copy:

- kublk.h: struct ublk_shmem_entry and table for tracking registered
  shared memory buffers
- kublk.c: per-device unix socket listener that accepts memfd
  registrations from clients via SCM_RIGHTS fd passing. The listener
  mmaps the memfd and registers the VA range with the kernel for PFN
  matching. Also adds --shmem_zc command line option.
- kublk.c: --htlb <path> option to open a pre-allocated hugetlbfs
  file, mmap it with MAP_SHARED|MAP_POPULATE, and register it with
  the kernel via ublk_ctrl_reg_buf(). Any process that mmaps the same
  hugetlbfs file shares the same physical pages, enabling zero-copy
  without socket-based fd passing.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 tools/testing/selftests/ublk/kublk.c | 340 ++++++++++++++++++++++++++-
 tools/testing/selftests/ublk/kublk.h |  14 ++
 2 files changed, 352 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c
index e1c3b3c55e56..bd97e34f131b 100644
--- a/tools/testing/selftests/ublk/kublk.c
+++ b/tools/testing/selftests/ublk/kublk.c
@@ -4,6 +4,7 @@
  */
 
 #include <linux/fs.h>
+#include <sys/un.h>
 #include "kublk.h"
 
 #define MAX_NR_TGT_ARG 	64
@@ -1085,13 +1086,312 @@ static int ublk_send_dev_event(const struct dev_ctx *ctx, struct ublk_dev *dev,
 }
 
 
+/*
+ * Shared memory registration socket listener.
+ *
+ * The parent daemon context listens on a per-device unix socket at
+ * /run/ublk/ublkb<dev_id>.sock for shared memory registration requests
+ * from clients. Clients send a memfd via SCM_RIGHTS; the server mmaps
+ * it, registers the mapping with the kernel, and returns the assigned index.
+ */
+#define UBLK_SHMEM_SOCK_DIR	"/run/ublk"
+
+/* declared in kublk.h; defined here, shared with file_backed.c (loop target) */
+struct ublk_shmem_entry shmem_table[UBLK_BUF_MAX];
+int shmem_count;	/* number of used entries in shmem_table */
+
+static void ublk_shmem_sock_path(int dev_id, char *buf, size_t len)
+{	/* formats "<UBLK_SHMEM_SOCK_DIR>/ublkb<dev_id>.sock" into buf, bounded by len */
+	snprintf(buf, len, "%s/ublkb%d.sock", UBLK_SHMEM_SOCK_DIR, dev_id);
+}
+
+/* Create, bind and listen on the per-device socket; returns its fd or -1. */
+static int ublk_shmem_sock_create(int dev_id)
+{
+	struct sockaddr_un addr = { .sun_family = AF_UNIX };
+	int fd;
+
+	mkdir(UBLK_SHMEM_SOCK_DIR, 0755);	/* best effort: may already exist */
+	ublk_shmem_sock_path(dev_id, addr.sun_path, sizeof(addr.sun_path));
+	unlink(addr.sun_path);			/* remove a stale socket file */
+
+	fd = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0);
+	if (fd < 0)
+		return -1;
+
+	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
+	    listen(fd, 4) < 0) {	/* bound but not listening is useless */
+		close(fd);
+		unlink(addr.sun_path);
+		return -1;
+	}
+
+	ublk_dbg(UBLK_DBG_DEV, "shmem socket created: %s\n", addr.sun_path);
+	return fd;
+}
+
+static void ublk_shmem_sock_destroy(int dev_id, int sock_fd)
+{
+	char path[108];
+
+	if (sock_fd >= 0)	/* negative when ublk_shmem_sock_create() failed */
+		close(sock_fd);
+	ublk_shmem_sock_path(dev_id, path, sizeof(path));
+	unlink(path);		/* remove the socket file regardless of sock_fd */
+}
+
+/* Receive one fd sent via SCM_RIGHTS; returns it (caller owns it) or -1 */
+static int ublk_shmem_recv_fd(int client_fd)
+{
+	char buf[1];
+	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
+	union {
+		char cmsg_buf[CMSG_SPACE(sizeof(int))];
+		struct cmsghdr align;
+	} u;
+	struct msghdr msg = {
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+		.msg_control = u.cmsg_buf,
+		.msg_controllen = sizeof(u.cmsg_buf),
+	};
+	struct cmsghdr *cmsg;
+	int fd;
+
+	if (recvmsg(client_fd, &msg, 0) <= 0)
+		return -1;
+	cmsg = CMSG_FIRSTHDR(&msg);
+	if (!cmsg || cmsg->cmsg_len < CMSG_LEN(sizeof(fd)) ||
+	    cmsg->cmsg_level != SOL_SOCKET || cmsg->cmsg_type != SCM_RIGHTS)
+		return -1;
+	memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));	/* data may be unaligned */
+	return fd;
+}
+
+/* Record shmem_fd in shmem_table: mmap it, store fd/base/size, return index or -1 */
+static int ublk_shmem_register(int shmem_fd)
+{
+	off_t size;
+	void *base;
+	int idx;
+
+	if (shmem_count >= UBLK_BUF_MAX)	/* table is full */
+		return -1;
+
+	size = lseek(shmem_fd, 0, SEEK_END);	/* end offset is used as map length */
+	if (size <= 0)
+		return -1;
+
+	base = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
+		    shmem_fd, 0);
+	if (base == MAP_FAILED)
+		return -1;
+
+	idx = shmem_count++;	/* claim the next free slot */
+	shmem_table[idx].fd = shmem_fd;
+	shmem_table[idx].mmap_base = base;
+	shmem_table[idx].size = size;
+
+	ublk_dbg(UBLK_DBG_DEV, "shmem registered: index=%d fd=%d size=%zu\n",
+		 idx, shmem_fd, (size_t)size);
+	return idx;
+}
+
+static void ublk_shmem_unregister_all(void)
+{	/* unmap and close every entry recorded in shmem_table, then reset the count */
+	int i;
+
+	for (i = 0; i < shmem_count; i++) {
+		if (shmem_table[i].mmap_base) {	/* skip entries without a mapping */
+			munmap(shmem_table[i].mmap_base,
+			       shmem_table[i].size);
+			close(shmem_table[i].fd);
+			shmem_table[i].mmap_base = NULL;	/* mark slot released */
+		}
+	}
+	shmem_count = 0;	/* table is empty again */
+}
+
+static int ublk_ctrl_reg_buf(struct ublk_dev *dev, void *addr, size_t size)
+{	/* issues UBLK_U_CMD_REG_BUF for addr/size; returns __ublk_ctrl_cmd()'s result */
+	struct ublk_buf_reg buf_reg = {
+		.addr = (unsigned long)addr,
+		.len = size,
+	};
+	struct ublk_ctrl_cmd_data data = {
+		.cmd_op = UBLK_U_CMD_REG_BUF,
+		.flags = CTRL_CMD_HAS_BUF,
+		.addr = (unsigned long)&buf_reg,	/* command payload is buf_reg */
+		.len = sizeof(buf_reg),
+	};
+
+	return __ublk_ctrl_cmd(dev, &data);
+}
+
+/*
+ * Handle one client connection: receive memfd, mmap it, register the VA
+ * range with kernel, and reply with the table index (negative on failure).
+ */
+static void ublk_shmem_handle_client(int sock_fd, struct ublk_dev *dev)
+{
+	int client_fd, memfd, idx, ret;
+	int32_t reply = -1;
+	void *base, *extra;
+	off_t size;
+
+	client_fd = accept(sock_fd, NULL, NULL);
+	if (client_fd < 0)
+		return;
+
+	memfd = ublk_shmem_recv_fd(client_fd);
+	if (memfd < 0)
+		goto out;
+	if (shmem_count >= UBLK_BUF_MAX)	/* no slot: fail before kernel reg */
+		goto out_close;
+
+	/* mmap the memfd in server address space */
+	size = lseek(memfd, 0, SEEK_END);
+	if (size <= 0)
+		goto out_close;
+	base = mmap(NULL, size, PROT_READ | PROT_WRITE,
+		    MAP_SHARED | MAP_POPULATE, memfd, 0);
+	if (base == MAP_FAILED)
+		goto out_close;
+
+	/* Register server's VA range with kernel for PFN matching */
+	ret = ublk_ctrl_reg_buf(dev, base, size);
+	if (ret < 0) {
+		ublk_dbg(UBLK_DBG_DEV,
+			 "shmem_zc: kernel reg failed %d\n", ret);
+		reply = ret;
+		goto out_unmap;
+	}
+
+	/* Store in table for I/O handling */
+	idx = ublk_shmem_register(memfd);
+	if (idx < 0)
+		goto out_unmap;	/* NOTE: the kernel registration is not undone */
+	/* it mapped the memfd a second time: keep only the registered mapping */
+	extra = shmem_table[idx].mmap_base;
+	shmem_table[idx].mmap_base = base;
+	munmap(extra, shmem_table[idx].size);
+	shmem_table[idx].size = size;
+	reply = idx;
+	goto out;
+out_unmap:
+	munmap(base, size);
+out_close:
+	close(memfd);
+out:
+	send(client_fd, &reply, sizeof(reply), 0);
+	close(client_fd);
+}
+
+struct shmem_listener_info {
+	int dev_id;		/* device id used to build the socket path */
+	int stop_efd;		/* eventfd to signal listener to stop */
+	int sock_fd;		/* listener socket fd (output) */
+	struct ublk_dev *dev;	/* device passed to ublk_shmem_handle_client() */
+};
+
+/*
+ * Socket listener thread: runs in the parent daemon context alongside
+ * the I/O threads. Accepts shared memory registration requests from
+ * clients via SCM_RIGHTS. Exits when stop_efd is signaled or poll() fails.
+ */
+static void *ublk_shmem_listener_fn(void *data)
+{
+	struct shmem_listener_info *info = data;
+	struct pollfd pfds[2];
+
+	info->sock_fd = ublk_shmem_sock_create(info->dev_id);
+	if (info->sock_fd < 0)
+		return NULL;
+
+	pfds[0].fd = info->sock_fd;
+	pfds[0].events = POLLIN;
+	pfds[1].fd = info->stop_efd;
+	pfds[1].events = POLLIN;
+
+	while (1) {
+		if (poll(pfds, 2, -1) < 0) {
+			if (errno == EINTR)	/* a signal must not end the listener */
+				continue;
+			break;
+		}
+
+		/* Stop signal from parent */
+		if (pfds[1].revents & POLLIN)
+			break;
+		/* Client connection */
+		if (pfds[0].revents & POLLIN)
+			ublk_shmem_handle_client(info->sock_fd, info->dev);
+	}
+
+	return NULL;
+}
+
+/* Map the hugetlbfs file at ctx->htlb_path and register it; returns 0 or -errno. */
+static int ublk_shmem_htlb_setup(const struct dev_ctx *ctx,
+				 struct ublk_dev *dev)
+{
+	int fd, idx, ret;
+	struct stat st;
+	void *base;
+
+	/* no free slot: fail before anything is registered with the kernel */
+	if (shmem_count >= UBLK_BUF_MAX)
+		return -ENOMEM;
+
+	fd = open(ctx->htlb_path, O_RDWR);
+	if (fd < 0) {
+		ret = -errno;	/* read errno before ublk_err() can clobber it */
+		ublk_err("htlb: can't open %s\n", ctx->htlb_path);
+		return ret;
+	}
+
+	if (fstat(fd, &st) < 0 || st.st_size <= 0) {
+		ublk_err("htlb: invalid file size\n");
+		close(fd);
+		return -EINVAL;
+	}
+
+	base = mmap(NULL, st.st_size, PROT_READ | PROT_WRITE,
+		    MAP_SHARED | MAP_POPULATE, fd, 0);
+	if (base == MAP_FAILED) {
+		ublk_err("htlb: mmap failed\n");
+		close(fd);
+		return -ENOMEM;
+	}
+
+	ret = ublk_ctrl_reg_buf(dev, base, st.st_size);
+	if (ret < 0) {
+		ublk_err("htlb: reg_buf failed: %d\n", ret);
+		munmap(base, st.st_size);
+		close(fd);
+		return ret;
+	}
+
+	idx = shmem_count++;
+	shmem_table[idx].fd = fd;
+	shmem_table[idx].mmap_base = base;
+	shmem_table[idx].size = st.st_size;
+
+	ublk_dbg(UBLK_DBG_DEV, "htlb registered: index=%d size=%zu\n",
+		 idx, (size_t)st.st_size);
+	return 0;
+}
+
 static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
 {
 	const struct ublksrv_ctrl_dev_info *dinfo = &dev->dev_info;
+	struct shmem_listener_info linfo = {};
 	struct ublk_thread_info *tinfo;
 	unsigned long long extra_flags = 0;
 	cpu_set_t *affinity_buf;
 	unsigned char (*q_thread_map)[UBLK_MAX_QUEUES] = NULL;
+	uint64_t stop_val = 1;
+	pthread_t listener;
 	void *thread_ret;
 	sem_t ready;
 	int ret, i;
@@ -1180,15 +1480,44 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
 		goto fail_start;
 	}
 
+	if (ctx->htlb_path) {
+		ret = ublk_shmem_htlb_setup(ctx, dev);
+		if (ret < 0) {
+			ublk_err("htlb setup failed: %d\n", ret);
+			ublk_ctrl_stop_dev(dev);
+			goto fail_start;
+		}
+	}
+
 	ublk_ctrl_get_info(dev);
 	if (ctx->fg)
 		ublk_ctrl_dump(dev);
 	else
 		ublk_send_dev_event(ctx, dev, dev->dev_info.dev_id);
 fail_start:
-	/* wait until we are terminated */
-	for (i = 0; i < dev->nthreads; i++)
+	/*
+	 * Wait for I/O threads to exit. While waiting, a listener
+	 * thread accepts shared memory registration requests from
+	 * clients via a per-device unix socket (SCM_RIGHTS fd passing).
+	 */
+	linfo.dev_id = dinfo->dev_id;
+	linfo.dev = dev;
+	linfo.stop_efd = eventfd(0, 0);
+	if (linfo.stop_efd >= 0)
+		pthread_create(&listener, NULL,
+			       ublk_shmem_listener_fn, &linfo);
+
+	for (i = 0; i < (int)dev->nthreads; i++)
 		pthread_join(tinfo[i].thread, &thread_ret);
+
+	/* Signal listener thread to stop and wait for it */
+	if (linfo.stop_efd >= 0) {
+		write(linfo.stop_efd, &stop_val, sizeof(stop_val));
+		pthread_join(listener, NULL);
+		close(linfo.stop_efd);
+		ublk_shmem_sock_destroy(dinfo->dev_id, linfo.sock_fd);
+	}
+	ublk_shmem_unregister_all();
 	free(tinfo);
  fail:
 	for (i = 0; i < dinfo->nr_hw_queues; i++)
@@ -1618,6 +1947,7 @@ static int cmd_dev_get_features(void)
 		FEAT_NAME(UBLK_F_SAFE_STOP_DEV),
 		FEAT_NAME(UBLK_F_BATCH_IO),
 		FEAT_NAME(UBLK_F_NO_AUTO_PART_SCAN),
+		FEAT_NAME(UBLK_F_SHMEM_ZC),
 	};
 	struct ublk_dev *dev;
 	__u64 features = 0;
@@ -1790,6 +2120,8 @@ int main(int argc, char *argv[])
 		{ "safe",		0,	NULL,  0 },
 		{ "batch",              0,      NULL, 'b'},
 		{ "no_auto_part_scan",	0,	NULL,  0 },
+		{ "shmem_zc",		0,	NULL,  0  },
+		{ "htlb",		1,	NULL,  0  },
 		{ 0, 0, 0, 0 }
 	};
 	const struct ublk_tgt_ops *ops = NULL;
@@ -1905,6 +2237,10 @@ int main(int argc, char *argv[])
 				ctx.safe_stop = 1;
 			if (!strcmp(longopts[option_idx].name, "no_auto_part_scan"))
 				ctx.flags |= UBLK_F_NO_AUTO_PART_SCAN;
+			if (!strcmp(longopts[option_idx].name, "shmem_zc"))
+				ctx.flags |= UBLK_F_SHMEM_ZC;
+			if (!strcmp(longopts[option_idx].name, "htlb"))
+				ctx.htlb_path = strdup(optarg);
 			break;
 		case '?':
 			/*
diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h
index 02f0c55d006b..20d0a1eab41f 100644
--- a/tools/testing/selftests/ublk/kublk.h
+++ b/tools/testing/selftests/ublk/kublk.h
@@ -95,6 +95,8 @@ struct dev_ctx {
 	/* for 'update_size' command */
 	unsigned long long size;
 
+	char *htlb_path;
+
 	union {
 		struct stripe_ctx 	stripe;
 		struct fault_inject_ctx fault_inject;
@@ -599,6 +601,18 @@ static inline void ublk_queued_tgt_io(struct ublk_thread *t, struct ublk_queue *
 	}
 }
 
+/* shared memory zero-copy support */
+#define UBLK_BUF_MAX		256	/* capacity of shmem_table */
+
+struct ublk_shmem_entry {
+	int fd;			/* fd backing the mapping */
+	void *mmap_base;	/* start of the mapping, NULL when unused */
+	size_t size;		/* length of the mapping in bytes */
+};
+
+extern struct ublk_shmem_entry shmem_table[UBLK_BUF_MAX];	/* defined in kublk.c */
+extern int shmem_count;		/* number of used entries in shmem_table */
+
 extern const struct ublk_tgt_ops null_tgt_ops;
 extern const struct ublk_tgt_ops loop_tgt_ops;
 extern const struct ublk_tgt_ops stripe_tgt_ops;
-- 
2.53.0


  parent reply	other threads:[~2026-03-31 15:32 UTC|newest]

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-03-31 15:31 [PATCH v2 00/10] ublk: add shared memory zero-copy support Ming Lei
2026-03-31 15:31 ` [PATCH v2 01/10] ublk: add UBLK_U_CMD_REG_BUF/UNREG_BUF control commands Ming Lei
2026-04-07 19:35   ` Caleb Sander Mateos
2026-04-08  2:23     ` Ming Lei
2026-03-31 15:31 ` [PATCH v2 02/10] ublk: add PFN-based buffer matching in I/O path Ming Lei
2026-04-07 19:47   ` Caleb Sander Mateos
2026-04-08  2:36     ` Ming Lei
2026-03-31 15:31 ` [PATCH v2 03/10] ublk: enable UBLK_F_SHMEM_ZC feature flag Ming Lei
2026-04-07 19:47   ` Caleb Sander Mateos
2026-04-08  2:50     ` Ming Lei
2026-03-31 15:31 ` [PATCH v2 04/10] ublk: eliminate permanent pages[] array from struct ublk_buf Ming Lei
2026-04-07 19:50   ` Caleb Sander Mateos
2026-04-08  2:58     ` Ming Lei
2026-03-31 15:31 ` Ming Lei [this message]
2026-03-31 15:31 ` [PATCH v2 06/10] selftests/ublk: add UBLK_F_SHMEM_ZC support for loop target Ming Lei
2026-03-31 15:31 ` [PATCH v2 07/10] selftests/ublk: add shared memory zero-copy test Ming Lei
2026-03-31 15:31 ` [PATCH v2 08/10] selftests/ublk: add hugetlbfs shmem_zc test for loop target Ming Lei
2026-03-31 15:32 ` [PATCH v2 09/10] selftests/ublk: add filesystem fio verify test for shmem_zc Ming Lei
2026-03-31 15:32 ` [PATCH v2 10/10] selftests/ublk: add read-only buffer registration test Ming Lei
2026-04-07  2:38 ` [PATCH v2 00/10] ublk: add shared memory zero-copy support Ming Lei
2026-04-07 13:34   ` Jens Axboe
2026-04-07 19:29   ` Caleb Sander Mateos
2026-04-08  3:03     ` Ming Lei
2026-04-07 13:44 ` Jens Axboe

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260331153207.3635125-6-ming.lei@redhat.com \
    --to=ming.lei@redhat.com \
    --cc=axboe@kernel.dk \
    --cc=csander@purestorage.com \
    --cc=linux-block@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox