* [PATCH 1/8] ublk: check cmd_op first
2025-06-06 21:40 [PATCH 0/8] ublk: allow off-daemon zero-copy buffer registration Caleb Sander Mateos
@ 2025-06-06 21:40 ` Caleb Sander Mateos
2025-06-09 6:57 ` Ming Lei
2025-06-06 21:40 ` [PATCH 2/8] ublk: handle UBLK_IO_FETCH_REQ first Caleb Sander Mateos
` (6 subsequent siblings)
7 siblings, 1 reply; 26+ messages in thread
From: Caleb Sander Mateos @ 2025-06-06 21:40 UTC (permalink / raw)
To: Ming Lei; +Cc: Uday Shankar, linux-block, Caleb Sander Mateos
In preparation for skipping some of the other checks for certain IO
opcodes, move the cmd_op check earlier.
Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
---
drivers/block/ublk_drv.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index c637ea010d34..e7e2163dcb3b 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -2175,16 +2175,21 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
struct task_struct *task;
struct ublk_queue *ubq;
struct ublk_io *io;
u32 cmd_op = cmd->cmd_op;
unsigned tag = ub_cmd->tag;
- int ret = -EINVAL;
+ int ret;
pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
__func__, cmd->cmd_op, ub_cmd->q_id, tag,
ub_cmd->result);
+ ret = ublk_check_cmd_op(cmd_op);
+ if (ret)
+ goto out;
+
+ ret = -EINVAL;
if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues)
goto out;
ubq = ublk_get_queue(ub, ub_cmd->q_id);
@@ -2213,15 +2218,10 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
*/
if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA))
^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
goto out;
- ret = ublk_check_cmd_op(cmd_op);
- if (ret)
- goto out;
-
- ret = -EINVAL;
switch (_IOC_NR(cmd_op)) {
case UBLK_IO_REGISTER_IO_BUF:
return ublk_register_io_buf(cmd, ubq, tag, ub_cmd->addr, issue_flags);
case UBLK_IO_UNREGISTER_IO_BUF:
return ublk_unregister_io_buf(cmd, ubq, ub_cmd->addr, issue_flags);
--
2.45.2
^ permalink raw reply related [flat|nested] 26+ messages in thread* [PATCH 2/8] ublk: handle UBLK_IO_FETCH_REQ first
2025-06-06 21:40 [PATCH 0/8] ublk: allow off-daemon zero-copy buffer registration Caleb Sander Mateos
2025-06-06 21:40 ` [PATCH 1/8] ublk: check cmd_op first Caleb Sander Mateos
@ 2025-06-06 21:40 ` Caleb Sander Mateos
2025-06-09 7:01 ` Ming Lei
2025-06-06 21:40 ` [PATCH 3/8] ublk: remove task variable from __ublk_ch_uring_cmd() Caleb Sander Mateos
` (5 subsequent siblings)
7 siblings, 1 reply; 26+ messages in thread
From: Caleb Sander Mateos @ 2025-06-06 21:40 UTC (permalink / raw)
To: Ming Lei; +Cc: Uday Shankar, linux-block, Caleb Sander Mateos
Check for UBLK_IO_FETCH_REQ first in __ublk_ch_uring_cmd() and return
early. This allows removing the allowances for NULL io->task and
UBLK_IO_FLAG_OWNED_BY_SRV unset that only apply to FETCH.
Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
---
drivers/block/ublk_drv.c | 21 ++++++++++++---------
1 file changed, 12 insertions(+), 9 deletions(-)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index e7e2163dcb3b..c4598f99be71 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -2195,23 +2195,31 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
if (tag >= ubq->q_depth)
goto out;
io = &ubq->ios[tag];
+ /* UBLK_IO_FETCH_REQ can be handled on any task, which sets io->task */
+ if (unlikely(_IOC_NR(cmd_op) == UBLK_IO_FETCH_REQ)) {
+ ret = ublk_fetch(cmd, ubq, io, ub_cmd->addr);
+ if (ret)
+ goto out;
+
+ ublk_prep_cancel(cmd, issue_flags, ubq, tag);
+ return -EIOCBQUEUED;
+ }
+
task = READ_ONCE(io->task);
- if (task && task != current)
+ if (task != current)
goto out;
/* there is pending io cmd, something must be wrong */
if (io->flags & UBLK_IO_FLAG_ACTIVE) {
ret = -EBUSY;
goto out;
}
- /* only UBLK_IO_FETCH_REQ is allowed if io is not OWNED_BY_SRV */
- if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV) &&
- _IOC_NR(cmd_op) != UBLK_IO_FETCH_REQ)
+ if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
goto out;
/*
* ensure that the user issues UBLK_IO_NEED_GET_DATA
* iff the driver have set the UBLK_IO_FLAG_NEED_GET_DATA.
@@ -2223,15 +2231,10 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
switch (_IOC_NR(cmd_op)) {
case UBLK_IO_REGISTER_IO_BUF:
return ublk_register_io_buf(cmd, ubq, tag, ub_cmd->addr, issue_flags);
case UBLK_IO_UNREGISTER_IO_BUF:
return ublk_unregister_io_buf(cmd, ubq, ub_cmd->addr, issue_flags);
- case UBLK_IO_FETCH_REQ:
- ret = ublk_fetch(cmd, ubq, io, ub_cmd->addr);
- if (ret)
- goto out;
- break;
case UBLK_IO_COMMIT_AND_FETCH_REQ:
ret = ublk_commit_and_fetch(ubq, io, cmd, ub_cmd, issue_flags);
if (ret)
goto out;
break;
--
2.45.2
^ permalink raw reply related [flat|nested] 26+ messages in thread* [PATCH 3/8] ublk: remove task variable from __ublk_ch_uring_cmd()
2025-06-06 21:40 [PATCH 0/8] ublk: allow off-daemon zero-copy buffer registration Caleb Sander Mateos
2025-06-06 21:40 ` [PATCH 1/8] ublk: check cmd_op first Caleb Sander Mateos
2025-06-06 21:40 ` [PATCH 2/8] ublk: handle UBLK_IO_FETCH_REQ first Caleb Sander Mateos
@ 2025-06-06 21:40 ` Caleb Sander Mateos
2025-06-09 7:02 ` Ming Lei
2025-06-06 21:40 ` [PATCH 4/8] ublk: consolidate UBLK_IO_FLAG_{ACTIVE,OWNED_BY_SRV} checks Caleb Sander Mateos
` (4 subsequent siblings)
7 siblings, 1 reply; 26+ messages in thread
From: Caleb Sander Mateos @ 2025-06-06 21:40 UTC (permalink / raw)
To: Ming Lei; +Cc: Uday Shankar, linux-block, Caleb Sander Mateos
The variable is computed from a simple expression and used once, so just
replace it with the expression.
Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
---
drivers/block/ublk_drv.c | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index c4598f99be71..76148145d8fe 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -2170,11 +2170,10 @@ static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io)
static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
unsigned int issue_flags,
const struct ublksrv_io_cmd *ub_cmd)
{
struct ublk_device *ub = cmd->file->private_data;
- struct task_struct *task;
struct ublk_queue *ubq;
struct ublk_io *io;
u32 cmd_op = cmd->cmd_op;
unsigned tag = ub_cmd->tag;
int ret;
@@ -2205,12 +2204,11 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
ublk_prep_cancel(cmd, issue_flags, ubq, tag);
return -EIOCBQUEUED;
}
- task = READ_ONCE(io->task);
- if (task != current)
+ if (READ_ONCE(io->task) != current)
goto out;
/* there is pending io cmd, something must be wrong */
if (io->flags & UBLK_IO_FLAG_ACTIVE) {
ret = -EBUSY;
--
2.45.2
^ permalink raw reply related [flat|nested] 26+ messages in thread* [PATCH 4/8] ublk: consolidate UBLK_IO_FLAG_{ACTIVE,OWNED_BY_SRV} checks
2025-06-06 21:40 [PATCH 0/8] ublk: allow off-daemon zero-copy buffer registration Caleb Sander Mateos
` (2 preceding siblings ...)
2025-06-06 21:40 ` [PATCH 3/8] ublk: remove task variable from __ublk_ch_uring_cmd() Caleb Sander Mateos
@ 2025-06-06 21:40 ` Caleb Sander Mateos
2025-06-09 7:19 ` Ming Lei
2025-06-06 21:40 ` [PATCH 5/8] ublk: move ublk_prep_cancel() to case UBLK_IO_COMMIT_AND_FETCH_REQ Caleb Sander Mateos
` (3 subsequent siblings)
7 siblings, 1 reply; 26+ messages in thread
From: Caleb Sander Mateos @ 2025-06-06 21:40 UTC (permalink / raw)
To: Ming Lei; +Cc: Uday Shankar, linux-block, Caleb Sander Mateos
UBLK_IO_FLAG_ACTIVE and UBLK_IO_FLAG_OWNED_BY_SRV are mutually
exclusive. So just check that UBLK_IO_FLAG_OWNED_BY_SRV is set in
__ublk_ch_uring_cmd(); that implies UBLK_IO_FLAG_ACTIVE is unset.
Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
---
drivers/block/ublk_drv.c | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 76148145d8fe..295beb6ab4a5 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -2208,18 +2208,15 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
if (READ_ONCE(io->task) != current)
goto out;
/* there is pending io cmd, something must be wrong */
- if (io->flags & UBLK_IO_FLAG_ACTIVE) {
+ if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) {
ret = -EBUSY;
goto out;
}
- if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
- goto out;
-
/*
* ensure that the user issues UBLK_IO_NEED_GET_DATA
* iff the driver have set the UBLK_IO_FLAG_NEED_GET_DATA.
*/
if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA))
--
2.45.2
^ permalink raw reply related [flat|nested] 26+ messages in thread* [PATCH 5/8] ublk: move ublk_prep_cancel() to case UBLK_IO_COMMIT_AND_FETCH_REQ
2025-06-06 21:40 [PATCH 0/8] ublk: allow off-daemon zero-copy buffer registration Caleb Sander Mateos
` (3 preceding siblings ...)
2025-06-06 21:40 ` [PATCH 4/8] ublk: consolidate UBLK_IO_FLAG_{ACTIVE,OWNED_BY_SRV} checks Caleb Sander Mateos
@ 2025-06-06 21:40 ` Caleb Sander Mateos
2025-06-09 7:21 ` Ming Lei
2025-06-06 21:40 ` [PATCH 6/8] ublk: allow UBLK_IO_(UN)REGISTER_IO_BUF on any task Caleb Sander Mateos
` (2 subsequent siblings)
7 siblings, 1 reply; 26+ messages in thread
From: Caleb Sander Mateos @ 2025-06-06 21:40 UTC (permalink / raw)
To: Ming Lei; +Cc: Uday Shankar, linux-block, Caleb Sander Mateos
UBLK_IO_COMMIT_AND_FETCH_REQ is the only one of __ublk_ch_uring_cmd()'s
switch cases that doesn't return or goto. Move the logic following the
switch into this case so it also returns. Drop the now unneeded default
case.
Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
---
drivers/block/ublk_drv.c | 8 +++-----
1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 295beb6ab4a5..a8030818f74a 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -2230,22 +2230,20 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
return ublk_unregister_io_buf(cmd, ubq, ub_cmd->addr, issue_flags);
case UBLK_IO_COMMIT_AND_FETCH_REQ:
ret = ublk_commit_and_fetch(ubq, io, cmd, ub_cmd, issue_flags);
if (ret)
goto out;
- break;
+
+ ublk_prep_cancel(cmd, issue_flags, ubq, tag);
+ return -EIOCBQUEUED;
case UBLK_IO_NEED_GET_DATA:
io->addr = ub_cmd->addr;
if (!ublk_get_data(ubq, io))
return -EIOCBQUEUED;
return UBLK_IO_RES_OK;
- default:
- goto out;
}
- ublk_prep_cancel(cmd, issue_flags, ubq, tag);
- return -EIOCBQUEUED;
out:
pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
__func__, cmd_op, tag, ret, io->flags);
return ret;
--
2.45.2
^ permalink raw reply related [flat|nested] 26+ messages in thread* [PATCH 6/8] ublk: allow UBLK_IO_(UN)REGISTER_IO_BUF on any task
2025-06-06 21:40 [PATCH 0/8] ublk: allow off-daemon zero-copy buffer registration Caleb Sander Mateos
` (4 preceding siblings ...)
2025-06-06 21:40 ` [PATCH 5/8] ublk: move ublk_prep_cancel() to case UBLK_IO_COMMIT_AND_FETCH_REQ Caleb Sander Mateos
@ 2025-06-06 21:40 ` Caleb Sander Mateos
2025-06-09 7:34 ` Ming Lei
2025-06-09 12:34 ` Ming Lei
2025-06-06 21:40 ` [PATCH 7/8] ublk: optimize UBLK_IO_REGISTER_IO_BUF on daemon task Caleb Sander Mateos
2025-06-06 21:40 ` [PATCH 8/8] ublk: remove ubq checks from ublk_{get,put}_req_ref() Caleb Sander Mateos
7 siblings, 2 replies; 26+ messages in thread
From: Caleb Sander Mateos @ 2025-06-06 21:40 UTC (permalink / raw)
To: Ming Lei; +Cc: Uday Shankar, linux-block, Caleb Sander Mateos
Currently, UBLK_IO_REGISTER_IO_BUF and UBLK_IO_UNREGISTER_IO_BUF are
only permitted on the ublk_io's daemon task. But this restriction is
unnecessary. ublk_register_io_buf() calls __ublk_check_and_get_req() to
look up the request from the tagset and atomically take a reference on
the request without accessing the ublk_io. ublk_unregister_io_buf()
doesn't use the q_id or tag at all.
So allow these opcodes even on tasks other than io->task.
Handle UBLK_IO_UNREGISTER_IO_BUF before obtaining the ubq and io since
the buffer index being unregistered is not necessarily related to the
specified q_id and tag.
Add a feature flag UBLK_F_BUF_REG_OFF_DAEMON that userspace can use to
determine whether the kernel supports off-daemon buffer registration.
Suggested-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
---
drivers/block/ublk_drv.c | 37 +++++++++++++++++++++--------------
include/uapi/linux/ublk_cmd.h | 8 ++++++++
2 files changed, 30 insertions(+), 15 deletions(-)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index a8030818f74a..2084bbdd2cbb 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -68,11 +68,12 @@
| UBLK_F_ZONED \
| UBLK_F_USER_RECOVERY_FAIL_IO \
| UBLK_F_UPDATE_SIZE \
| UBLK_F_AUTO_BUF_REG \
| UBLK_F_QUIESCE \
- | UBLK_F_PER_IO_DAEMON)
+ | UBLK_F_PER_IO_DAEMON \
+ | UBLK_F_BUF_REG_OFF_DAEMON)
#define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
| UBLK_F_USER_RECOVERY_REISSUE \
| UBLK_F_USER_RECOVERY_FAIL_IO)
@@ -2018,20 +2019,10 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd,
}
return 0;
}
-static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
- const struct ublk_queue *ubq,
- unsigned int index, unsigned int issue_flags)
-{
- if (!ublk_support_zero_copy(ubq))
- return -EINVAL;
-
- return io_buffer_unregister_bvec(cmd, index, issue_flags);
-}
-
static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq,
struct ublk_io *io, __u64 buf_addr)
{
struct ublk_device *ub = ubq->dev;
int ret = 0;
@@ -2184,10 +2175,18 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
ret = ublk_check_cmd_op(cmd_op);
if (ret)
goto out;
+ /*
+ * io_buffer_unregister_bvec() doesn't access the ubq or io,
+ * so no need to validate the q_id, tag, or task
+ */
+ if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF)
+ return io_buffer_unregister_bvec(cmd, ub_cmd->addr,
+ issue_flags);
+
ret = -EINVAL;
if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues)
goto out;
ubq = ublk_get_queue(ub, ub_cmd->q_id);
@@ -2204,12 +2203,21 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
ublk_prep_cancel(cmd, issue_flags, ubq, tag);
return -EIOCBQUEUED;
}
- if (READ_ONCE(io->task) != current)
+ if (READ_ONCE(io->task) != current) {
+ /*
+ * ublk_register_io_buf() accesses only the request, not io,
+ * so can be handled on any task
+ */
+ if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF)
+ return ublk_register_io_buf(cmd, ubq, tag, ub_cmd->addr,
+ issue_flags);
+
goto out;
+ }
/* there is pending io cmd, something must be wrong */
if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) {
ret = -EBUSY;
goto out;
@@ -2224,12 +2232,10 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
goto out;
switch (_IOC_NR(cmd_op)) {
case UBLK_IO_REGISTER_IO_BUF:
return ublk_register_io_buf(cmd, ubq, tag, ub_cmd->addr, issue_flags);
- case UBLK_IO_UNREGISTER_IO_BUF:
- return ublk_unregister_io_buf(cmd, ubq, ub_cmd->addr, issue_flags);
case UBLK_IO_COMMIT_AND_FETCH_REQ:
ret = ublk_commit_and_fetch(ubq, io, cmd, ub_cmd, issue_flags);
if (ret)
goto out;
@@ -2918,11 +2924,12 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
*/
ub->dev_info.flags &= UBLK_F_ALL;
ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE |
UBLK_F_URING_CMD_COMP_IN_TASK |
- UBLK_F_PER_IO_DAEMON;
+ UBLK_F_PER_IO_DAEMON |
+ UBLK_F_BUF_REG_OFF_DAEMON;
/* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */
if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
UBLK_F_AUTO_BUF_REG))
ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index 77d9d6af46da..034497ec3d2a 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -279,10 +279,18 @@
* (qid1,tag1) and (qid2,tag2), if qid1 == qid2, then the same task must
* be responsible for handling (qid1,tag1) and (qid2,tag2).
*/
#define UBLK_F_PER_IO_DAEMON (1ULL << 13)
+/*
+ * If this feature is set, UBLK_U_IO_REGISTER_IO_BUF/UBLK_U_IO_UNREGISTER_IO_BUF
+ * can be issued for an I/O on any task.
+ * If it is unset, zero-copy buffers can only be registered and unregistered by
+ * the I/O's daemon task.
+ */
+#define UBLK_F_BUF_REG_OFF_DAEMON (1ULL << 14)
+
/* device state */
#define UBLK_S_DEV_DEAD 0
#define UBLK_S_DEV_LIVE 1
#define UBLK_S_DEV_QUIESCED 2
#define UBLK_S_DEV_FAIL_IO 3
--
2.45.2
^ permalink raw reply related [flat|nested] 26+ messages in thread* Re: [PATCH 6/8] ublk: allow UBLK_IO_(UN)REGISTER_IO_BUF on any task
2025-06-06 21:40 ` [PATCH 6/8] ublk: allow UBLK_IO_(UN)REGISTER_IO_BUF on any task Caleb Sander Mateos
@ 2025-06-09 7:34 ` Ming Lei
2025-06-09 17:39 ` Caleb Sander Mateos
2025-06-09 12:34 ` Ming Lei
1 sibling, 1 reply; 26+ messages in thread
From: Ming Lei @ 2025-06-09 7:34 UTC (permalink / raw)
To: Caleb Sander Mateos; +Cc: Uday Shankar, linux-block
On Fri, Jun 06, 2025 at 03:40:09PM -0600, Caleb Sander Mateos wrote:
> Currently, UBLK_IO_REGISTER_IO_BUF and UBLK_IO_UNREGISTER_IO_BUF are
> only permitted on the ublk_io's daemon task. But this restriction is
> unnecessary. ublk_register_io_buf() calls __ublk_check_and_get_req() to
> look up the request from the tagset and atomically take a reference on
> the request without accessing the ublk_io. ublk_unregister_io_buf()
> doesn't use the q_id or tag at all.
>
> So allow these opcodes even on tasks other than io->task.
>
> Handle UBLK_IO_UNREGISTER_IO_BUF before obtaining the ubq and io since
> the buffer index being unregistered is not necessarily related to the
> specified q_id and tag.
>
> Add a feature flag UBLK_F_BUF_REG_OFF_DAEMON that userspace can use to
> determine whether the kernel supports off-daemon buffer registration.
>
> Suggested-by: Ming Lei <ming.lei@redhat.com>
> Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
> ---
> drivers/block/ublk_drv.c | 37 +++++++++++++++++++++--------------
> include/uapi/linux/ublk_cmd.h | 8 ++++++++
> 2 files changed, 30 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> index a8030818f74a..2084bbdd2cbb 100644
> --- a/drivers/block/ublk_drv.c
> +++ b/drivers/block/ublk_drv.c
> @@ -68,11 +68,12 @@
> | UBLK_F_ZONED \
> | UBLK_F_USER_RECOVERY_FAIL_IO \
> | UBLK_F_UPDATE_SIZE \
> | UBLK_F_AUTO_BUF_REG \
> | UBLK_F_QUIESCE \
> - | UBLK_F_PER_IO_DAEMON)
> + | UBLK_F_PER_IO_DAEMON \
> + | UBLK_F_BUF_REG_OFF_DAEMON)
>
> #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
> | UBLK_F_USER_RECOVERY_REISSUE \
> | UBLK_F_USER_RECOVERY_FAIL_IO)
>
> @@ -2018,20 +2019,10 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd,
> }
>
> return 0;
> }
>
> -static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
> - const struct ublk_queue *ubq,
> - unsigned int index, unsigned int issue_flags)
> -{
> - if (!ublk_support_zero_copy(ubq))
> - return -EINVAL;
> -
> - return io_buffer_unregister_bvec(cmd, index, issue_flags);
> -}
> -
> static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq,
> struct ublk_io *io, __u64 buf_addr)
> {
> struct ublk_device *ub = ubq->dev;
> int ret = 0;
> @@ -2184,10 +2175,18 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
>
> ret = ublk_check_cmd_op(cmd_op);
> if (ret)
> goto out;
>
> + /*
> + * io_buffer_unregister_bvec() doesn't access the ubq or io,
> + * so no need to validate the q_id, tag, or task
> + */
> + if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF)
> + return io_buffer_unregister_bvec(cmd, ub_cmd->addr,
> + issue_flags);
> +
Yeah, the behavior looks correct, but I'd suggest validating the q_id
too, to make the code more robust.
Also, you removed the ublk_support_zero_copy() check for the unregister
io buffer command, which isn't expected in this patch.
> ret = -EINVAL;
> if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues)
> goto out;
>
> ubq = ublk_get_queue(ub, ub_cmd->q_id);
> @@ -2204,12 +2203,21 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
>
> ublk_prep_cancel(cmd, issue_flags, ubq, tag);
> return -EIOCBQUEUED;
> }
>
> - if (READ_ONCE(io->task) != current)
> + if (READ_ONCE(io->task) != current) {
> + /*
> + * ublk_register_io_buf() accesses only the request, not io,
> + * so can be handled on any task
> + */
> + if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF)
> + return ublk_register_io_buf(cmd, ubq, tag, ub_cmd->addr,
> + issue_flags);
Maybe you can move UBLK_IO_UNREGISTER_IO_BUF handling here, which seems
more readable.
Thanks,
Ming
^ permalink raw reply [flat|nested] 26+ messages in thread* Re: [PATCH 6/8] ublk: allow UBLK_IO_(UN)REGISTER_IO_BUF on any task
2025-06-09 7:34 ` Ming Lei
@ 2025-06-09 17:39 ` Caleb Sander Mateos
0 siblings, 0 replies; 26+ messages in thread
From: Caleb Sander Mateos @ 2025-06-09 17:39 UTC (permalink / raw)
To: Ming Lei; +Cc: Uday Shankar, linux-block
On Mon, Jun 9, 2025 at 12:34 AM Ming Lei <ming.lei@redhat.com> wrote:
>
> On Fri, Jun 06, 2025 at 03:40:09PM -0600, Caleb Sander Mateos wrote:
> > Currently, UBLK_IO_REGISTER_IO_BUF and UBLK_IO_UNREGISTER_IO_BUF are
> > only permitted on the ublk_io's daemon task. But this restriction is
> > unnecessary. ublk_register_io_buf() calls __ublk_check_and_get_req() to
> > look up the request from the tagset and atomically take a reference on
> > the request without accessing the ublk_io. ublk_unregister_io_buf()
> > doesn't use the q_id or tag at all.
> >
> > So allow these opcodes even on tasks other than io->task.
> >
> > Handle UBLK_IO_UNREGISTER_IO_BUF before obtaining the ubq and io since
> > the buffer index being unregistered is not necessarily related to the
> > specified q_id and tag.
> >
> > Add a feature flag UBLK_F_BUF_REG_OFF_DAEMON that userspace can use to
> > determine whether the kernel supports off-daemon buffer registration.
> >
> > Suggested-by: Ming Lei <ming.lei@redhat.com>
> > Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
> > ---
> > drivers/block/ublk_drv.c | 37 +++++++++++++++++++++--------------
> > include/uapi/linux/ublk_cmd.h | 8 ++++++++
> > 2 files changed, 30 insertions(+), 15 deletions(-)
> >
> > diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> > index a8030818f74a..2084bbdd2cbb 100644
> > --- a/drivers/block/ublk_drv.c
> > +++ b/drivers/block/ublk_drv.c
> > @@ -68,11 +68,12 @@
> > | UBLK_F_ZONED \
> > | UBLK_F_USER_RECOVERY_FAIL_IO \
> > | UBLK_F_UPDATE_SIZE \
> > | UBLK_F_AUTO_BUF_REG \
> > | UBLK_F_QUIESCE \
> > - | UBLK_F_PER_IO_DAEMON)
> > + | UBLK_F_PER_IO_DAEMON \
> > + | UBLK_F_BUF_REG_OFF_DAEMON)
> >
> > #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
> > | UBLK_F_USER_RECOVERY_REISSUE \
> > | UBLK_F_USER_RECOVERY_FAIL_IO)
> >
> > @@ -2018,20 +2019,10 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd,
> > }
> >
> > return 0;
> > }
> >
> > -static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
> > - const struct ublk_queue *ubq,
> > - unsigned int index, unsigned int issue_flags)
> > -{
> > - if (!ublk_support_zero_copy(ubq))
> > - return -EINVAL;
> > -
> > - return io_buffer_unregister_bvec(cmd, index, issue_flags);
> > -}
> > -
> > static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq,
> > struct ublk_io *io, __u64 buf_addr)
> > {
> > struct ublk_device *ub = ubq->dev;
> > int ret = 0;
> > @@ -2184,10 +2175,18 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
> >
> > ret = ublk_check_cmd_op(cmd_op);
> > if (ret)
> > goto out;
> >
> > + /*
> > + * io_buffer_unregister_bvec() doesn't access the ubq or io,
> > + * so no need to validate the q_id, tag, or task
> > + */
> > + if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF)
> > + return io_buffer_unregister_bvec(cmd, ub_cmd->addr,
> > + issue_flags);
> > +
>
> Yeah, the behavior looks correct, but I'd suggest to validate the q_id
> too for making code more robust.
What value do you see in validating the q_id parameter? It's not used,
only the addr parameter is. I would rather document that the ublk
server doesn't need to provide q_id nor tag for a
UBLK_IO_UNREGISTER_IO_BUF command.
>
> Also you removed ublk_support_zero_copy() check for unregistering io buffer
> command, which isn't expected for this patch.
I can move that change into its own patch. I don't think the check
adds much value, though. There's nothing that requires the registered
buffer index passed in addr to belong to this q_id (or even this ublk
device). If you do want to provide a sanity check for the ublk server
that the ublk device supports zero-copy, I would rather just check the
flags on the ublk_device so the ublk server doesn't have to provide a
q_id.
>
> > ret = -EINVAL;
> > if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues)
> > goto out;
> >
> > ubq = ublk_get_queue(ub, ub_cmd->q_id);
> > @@ -2204,12 +2203,21 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
> >
> > ublk_prep_cancel(cmd, issue_flags, ubq, tag);
> > return -EIOCBQUEUED;
> > }
> >
> > - if (READ_ONCE(io->task) != current)
> > + if (READ_ONCE(io->task) != current) {
> > + /*
> > + * ublk_register_io_buf() accesses only the request, not io,
> > + * so can be handled on any task
> > + */
> > + if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF)
> > + return ublk_register_io_buf(cmd, ubq, tag, ub_cmd->addr,
> > + issue_flags);
>
> Maybe you can move UBLK_IO_UNREGISTER_IO_BUF handling here, which seems
> more readable.
I would rather allow the ublk server to omit the q_id and tag
parameters for UBLK_IO_UNREGISTER_IO_BUF since they aren't really
used, and there's no enforcement that they correspond to the
registered buffer index (addr). In which case, there isn't any
io->task to check here. It would also be nice to skip the overhead of
looking up the ubq and io for UBLK_IO_UNREGISTER_IO_BUF since they
aren't used.
In the following patch I add an optimized case for
UBLK_IO_REGISTER_IO_BUF when called on the daemon task, which is why I
duplicate the UBLK_IO_REGISTER_IO_BUF handling here in this patch. But
UBLK_IO_UNREGISTER_IO_BUF doesn't need this special case, so I don't
see the need to handle UBLK_IO_UNREGISTER_IO_BUF in 2 different
places.
Best,
Caleb
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH 6/8] ublk: allow UBLK_IO_(UN)REGISTER_IO_BUF on any task
2025-06-06 21:40 ` [PATCH 6/8] ublk: allow UBLK_IO_(UN)REGISTER_IO_BUF on any task Caleb Sander Mateos
2025-06-09 7:34 ` Ming Lei
@ 2025-06-09 12:34 ` Ming Lei
2025-06-09 17:49 ` Caleb Sander Mateos
1 sibling, 1 reply; 26+ messages in thread
From: Ming Lei @ 2025-06-09 12:34 UTC (permalink / raw)
To: Caleb Sander Mateos; +Cc: Uday Shankar, linux-block
On Fri, Jun 06, 2025 at 03:40:09PM -0600, Caleb Sander Mateos wrote:
> Currently, UBLK_IO_REGISTER_IO_BUF and UBLK_IO_UNREGISTER_IO_BUF are
> only permitted on the ublk_io's daemon task. But this restriction is
> unnecessary. ublk_register_io_buf() calls __ublk_check_and_get_req() to
> look up the request from the tagset and atomically take a reference on
> the request without accessing the ublk_io. ublk_unregister_io_buf()
> doesn't use the q_id or tag at all.
>
> So allow these opcodes even on tasks other than io->task.
>
> Handle UBLK_IO_UNREGISTER_IO_BUF before obtaining the ubq and io since
> the buffer index being unregistered is not necessarily related to the
> specified q_id and tag.
>
> Add a feature flag UBLK_F_BUF_REG_OFF_DAEMON that userspace can use to
> determine whether the kernel supports off-daemon buffer registration.
>
> Suggested-by: Ming Lei <ming.lei@redhat.com>
> Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
> ---
> drivers/block/ublk_drv.c | 37 +++++++++++++++++++++--------------
> include/uapi/linux/ublk_cmd.h | 8 ++++++++
> 2 files changed, 30 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> index a8030818f74a..2084bbdd2cbb 100644
> --- a/drivers/block/ublk_drv.c
> +++ b/drivers/block/ublk_drv.c
> @@ -68,11 +68,12 @@
> | UBLK_F_ZONED \
> | UBLK_F_USER_RECOVERY_FAIL_IO \
> | UBLK_F_UPDATE_SIZE \
> | UBLK_F_AUTO_BUF_REG \
> | UBLK_F_QUIESCE \
> - | UBLK_F_PER_IO_DAEMON)
> + | UBLK_F_PER_IO_DAEMON \
> + | UBLK_F_BUF_REG_OFF_DAEMON)
>
> #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
> | UBLK_F_USER_RECOVERY_REISSUE \
> | UBLK_F_USER_RECOVERY_FAIL_IO)
>
> @@ -2018,20 +2019,10 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd,
> }
>
> return 0;
> }
>
> -static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
> - const struct ublk_queue *ubq,
> - unsigned int index, unsigned int issue_flags)
> -{
> - if (!ublk_support_zero_copy(ubq))
> - return -EINVAL;
> -
> - return io_buffer_unregister_bvec(cmd, index, issue_flags);
> -}
> -
> static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq,
> struct ublk_io *io, __u64 buf_addr)
> {
> struct ublk_device *ub = ubq->dev;
> int ret = 0;
> @@ -2184,10 +2175,18 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
>
> ret = ublk_check_cmd_op(cmd_op);
> if (ret)
> goto out;
>
> + /*
> + * io_buffer_unregister_bvec() doesn't access the ubq or io,
> + * so no need to validate the q_id, tag, or task
> + */
> + if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF)
> + return io_buffer_unregister_bvec(cmd, ub_cmd->addr,
> + issue_flags);
> +
> ret = -EINVAL;
> if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues)
> goto out;
>
> ubq = ublk_get_queue(ub, ub_cmd->q_id);
> @@ -2204,12 +2203,21 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
>
> ublk_prep_cancel(cmd, issue_flags, ubq, tag);
> return -EIOCBQUEUED;
> }
>
> - if (READ_ONCE(io->task) != current)
> + if (READ_ONCE(io->task) != current) {
> + /*
> + * ublk_register_io_buf() accesses only the request, not io,
> + * so can be handled on any task
> + */
> + if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF)
> + return ublk_register_io_buf(cmd, ubq, tag, ub_cmd->addr,
> + issue_flags);
> +
> goto out;
> + }
>
> /* there is pending io cmd, something must be wrong */
> if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) {
> ret = -EBUSY;
It also skips the check on UBLK_IO_FLAG_OWNED_BY_SRV for both UBLK_IO_REGISTER_IO_BUF
and UBLK_IO_UNREGISTER_IO_BUF, :-(
thanks,
Ming
^ permalink raw reply [flat|nested] 26+ messages in thread* Re: [PATCH 6/8] ublk: allow UBLK_IO_(UN)REGISTER_IO_BUF on any task
2025-06-09 12:34 ` Ming Lei
@ 2025-06-09 17:49 ` Caleb Sander Mateos
2025-06-10 1:34 ` Ming Lei
0 siblings, 1 reply; 26+ messages in thread
From: Caleb Sander Mateos @ 2025-06-09 17:49 UTC (permalink / raw)
To: Ming Lei; +Cc: Uday Shankar, linux-block
On Mon, Jun 9, 2025 at 5:34 AM Ming Lei <ming.lei@redhat.com> wrote:
>
> On Fri, Jun 06, 2025 at 03:40:09PM -0600, Caleb Sander Mateos wrote:
> > Currently, UBLK_IO_REGISTER_IO_BUF and UBLK_IO_UNREGISTER_IO_BUF are
> > only permitted on the ublk_io's daemon task. But this restriction is
> > unnecessary. ublk_register_io_buf() calls __ublk_check_and_get_req() to
> > look up the request from the tagset and atomically take a reference on
> > the request without accessing the ublk_io. ublk_unregister_io_buf()
> > doesn't use the q_id or tag at all.
> >
> > So allow these opcodes even on tasks other than io->task.
> >
> > Handle UBLK_IO_UNREGISTER_IO_BUF before obtaining the ubq and io since
> > the buffer index being unregistered is not necessarily related to the
> > specified q_id and tag.
> >
> > Add a feature flag UBLK_F_BUF_REG_OFF_DAEMON that userspace can use to
> > determine whether the kernel supports off-daemon buffer registration.
> >
> > Suggested-by: Ming Lei <ming.lei@redhat.com>
> > Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
> > ---
> > drivers/block/ublk_drv.c | 37 +++++++++++++++++++++--------------
> > include/uapi/linux/ublk_cmd.h | 8 ++++++++
> > 2 files changed, 30 insertions(+), 15 deletions(-)
> >
> > diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> > index a8030818f74a..2084bbdd2cbb 100644
> > --- a/drivers/block/ublk_drv.c
> > +++ b/drivers/block/ublk_drv.c
> > @@ -68,11 +68,12 @@
> > | UBLK_F_ZONED \
> > | UBLK_F_USER_RECOVERY_FAIL_IO \
> > | UBLK_F_UPDATE_SIZE \
> > | UBLK_F_AUTO_BUF_REG \
> > | UBLK_F_QUIESCE \
> > - | UBLK_F_PER_IO_DAEMON)
> > + | UBLK_F_PER_IO_DAEMON \
> > + | UBLK_F_BUF_REG_OFF_DAEMON)
> >
> > #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
> > | UBLK_F_USER_RECOVERY_REISSUE \
> > | UBLK_F_USER_RECOVERY_FAIL_IO)
> >
> > @@ -2018,20 +2019,10 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd,
> > }
> >
> > return 0;
> > }
> >
> > -static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
> > - const struct ublk_queue *ubq,
> > - unsigned int index, unsigned int issue_flags)
> > -{
> > - if (!ublk_support_zero_copy(ubq))
> > - return -EINVAL;
> > -
> > - return io_buffer_unregister_bvec(cmd, index, issue_flags);
> > -}
> > -
> > static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq,
> > struct ublk_io *io, __u64 buf_addr)
> > {
> > struct ublk_device *ub = ubq->dev;
> > int ret = 0;
> > @@ -2184,10 +2175,18 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
> >
> > ret = ublk_check_cmd_op(cmd_op);
> > if (ret)
> > goto out;
> >
> > + /*
> > + * io_buffer_unregister_bvec() doesn't access the ubq or io,
> > + * so no need to validate the q_id, tag, or task
> > + */
> > + if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF)
> > + return io_buffer_unregister_bvec(cmd, ub_cmd->addr,
> > + issue_flags);
> > +
> > ret = -EINVAL;
> > if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues)
> > goto out;
> >
> > ubq = ublk_get_queue(ub, ub_cmd->q_id);
> > @@ -2204,12 +2203,21 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
> >
> > ublk_prep_cancel(cmd, issue_flags, ubq, tag);
> > return -EIOCBQUEUED;
> > }
> >
> > - if (READ_ONCE(io->task) != current)
> > + if (READ_ONCE(io->task) != current) {
> > + /*
> > + * ublk_register_io_buf() accesses only the request, not io,
> > + * so can be handled on any task
> > + */
> > + if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF)
> > + return ublk_register_io_buf(cmd, ubq, tag, ub_cmd->addr,
> > + issue_flags);
> > +
> > goto out;
> > + }
> >
> > /* there is pending io cmd, something must be wrong */
> > if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) {
> > ret = -EBUSY;
>
> It also skips check on UBLK_IO_FLAG_OWNED_BY_SRV for both UBLK_IO_REGISTER_IO_BUF
> and UBLK_IO_UNREGISTER_IO_BUF, :-(
As we've discussed before[1], accessing io->flags on tasks other than
the io's daemon would be a race condition. So I don't see how it's
possible to keep this check for off-daemon
UBLK_IO_(UN)REGISTER_IO_BUF. What value do you see in checking for
UBLK_IO_FLAG_OWNED_BY_SRV? My understanding is that the
refcount_inc_not_zero() already ensures the ublk I/O has been
dispatched to the ublk server and either hasn't been completed or has
other registered buffers still in use, which is pretty similar to
UBLK_IO_FLAG_OWNED_BY_SRV.
For UBLK_IO_UNREGISTER_IO_BUF, I don't think checking io->flags &
UBLK_IO_FLAG_OWNED_BY_SRV is sufficient to prevent misuse, since
there's no requirement that the buffer index (addr) being unregistered
matches the q_id, tag, or even ublk device specified in the command.
[1]: https://lore.kernel.org/linux-block/CADUfDZqZ_9O7vUAYtxrrujWqPBuP05nBhCbzNuNsc9kJTmX2sA@mail.gmail.com/
Best,
Caleb
^ permalink raw reply [flat|nested] 26+ messages in thread* Re: [PATCH 6/8] ublk: allow UBLK_IO_(UN)REGISTER_IO_BUF on any task
2025-06-09 17:49 ` Caleb Sander Mateos
@ 2025-06-10 1:34 ` Ming Lei
2025-06-11 15:47 ` Caleb Sander Mateos
0 siblings, 1 reply; 26+ messages in thread
From: Ming Lei @ 2025-06-10 1:34 UTC (permalink / raw)
To: Caleb Sander Mateos; +Cc: Uday Shankar, linux-block
On Mon, Jun 09, 2025 at 10:49:09AM -0700, Caleb Sander Mateos wrote:
> On Mon, Jun 9, 2025 at 5:34 AM Ming Lei <ming.lei@redhat.com> wrote:
> >
> > On Fri, Jun 06, 2025 at 03:40:09PM -0600, Caleb Sander Mateos wrote:
> > > Currently, UBLK_IO_REGISTER_IO_BUF and UBLK_IO_UNREGISTER_IO_BUF are
> > > only permitted on the ublk_io's daemon task. But this restriction is
> > > unnecessary. ublk_register_io_buf() calls __ublk_check_and_get_req() to
> > > look up the request from the tagset and atomically take a reference on
> > > the request without accessing the ublk_io. ublk_unregister_io_buf()
> > > doesn't use the q_id or tag at all.
> > >
> > > So allow these opcodes even on tasks other than io->task.
> > >
> > > Handle UBLK_IO_UNREGISTER_IO_BUF before obtaining the ubq and io since
> > > the buffer index being unregistered is not necessarily related to the
> > > specified q_id and tag.
> > >
> > > Add a feature flag UBLK_F_BUF_REG_OFF_DAEMON that userspace can use to
> > > determine whether the kernel supports off-daemon buffer registration.
> > >
> > > Suggested-by: Ming Lei <ming.lei@redhat.com>
> > > Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
> > > ---
> > > drivers/block/ublk_drv.c | 37 +++++++++++++++++++++--------------
> > > include/uapi/linux/ublk_cmd.h | 8 ++++++++
> > > 2 files changed, 30 insertions(+), 15 deletions(-)
> > >
> > > diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> > > index a8030818f74a..2084bbdd2cbb 100644
> > > --- a/drivers/block/ublk_drv.c
> > > +++ b/drivers/block/ublk_drv.c
> > > @@ -68,11 +68,12 @@
> > > | UBLK_F_ZONED \
> > > | UBLK_F_USER_RECOVERY_FAIL_IO \
> > > | UBLK_F_UPDATE_SIZE \
> > > | UBLK_F_AUTO_BUF_REG \
> > > | UBLK_F_QUIESCE \
> > > - | UBLK_F_PER_IO_DAEMON)
> > > + | UBLK_F_PER_IO_DAEMON \
> > > + | UBLK_F_BUF_REG_OFF_DAEMON)
> > >
> > > #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
> > > | UBLK_F_USER_RECOVERY_REISSUE \
> > > | UBLK_F_USER_RECOVERY_FAIL_IO)
> > >
> > > @@ -2018,20 +2019,10 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd,
> > > }
> > >
> > > return 0;
> > > }
> > >
> > > -static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
> > > - const struct ublk_queue *ubq,
> > > - unsigned int index, unsigned int issue_flags)
> > > -{
> > > - if (!ublk_support_zero_copy(ubq))
> > > - return -EINVAL;
> > > -
> > > - return io_buffer_unregister_bvec(cmd, index, issue_flags);
> > > -}
> > > -
> > > static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq,
> > > struct ublk_io *io, __u64 buf_addr)
> > > {
> > > struct ublk_device *ub = ubq->dev;
> > > int ret = 0;
> > > @@ -2184,10 +2175,18 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
> > >
> > > ret = ublk_check_cmd_op(cmd_op);
> > > if (ret)
> > > goto out;
> > >
> > > + /*
> > > + * io_buffer_unregister_bvec() doesn't access the ubq or io,
> > > + * so no need to validate the q_id, tag, or task
> > > + */
> > > + if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF)
> > > + return io_buffer_unregister_bvec(cmd, ub_cmd->addr,
> > > + issue_flags);
> > > +
> > > ret = -EINVAL;
> > > if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues)
> > > goto out;
> > >
> > > ubq = ublk_get_queue(ub, ub_cmd->q_id);
> > > @@ -2204,12 +2203,21 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
> > >
> > > ublk_prep_cancel(cmd, issue_flags, ubq, tag);
> > > return -EIOCBQUEUED;
> > > }
> > >
> > > - if (READ_ONCE(io->task) != current)
> > > + if (READ_ONCE(io->task) != current) {
> > > + /*
> > > + * ublk_register_io_buf() accesses only the request, not io,
> > > + * so can be handled on any task
> > > + */
> > > + if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF)
> > > + return ublk_register_io_buf(cmd, ubq, tag, ub_cmd->addr,
> > > + issue_flags);
> > > +
> > > goto out;
> > > + }
> > >
> > > /* there is pending io cmd, something must be wrong */
> > > if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) {
> > > ret = -EBUSY;
> >
> > It also skips check on UBLK_IO_FLAG_OWNED_BY_SRV for both UBLK_IO_REGISTER_IO_BUF
> > and UBLK_IO_UNREGISTER_IO_BUF, :-(
>
> As we've discussed before[1], accessing io->flags on tasks other than
> the io's daemon would be a race condition. So I don't see how it's
> possible to keep this check for off-daemon
> UBLK_IO_(UN)REGISTER_IO_BUF. What value do you see in checking for
> UBLK_IO_FLAG_OWNED_BY_SRV? My understanding is that the
> refcount_inc_not_zero() already ensures the ublk I/O has been
> dispatched to the ublk server and either hasn't been completed or has
> other registered buffers still in use, which is pretty similar to
> UBLK_IO_FLAG_OWNED_BY_SRV.
The request can't be trusted any more for UBLK_F_BUF_REG_OFF_DAEMON because it
may be freed from the elevator switch code or the device-stopping code path, so we
can't rely on refcount_inc_not_zero(pdu(req)) alone.
However, the existing per-io-task restriction together with the check on
UBLK_IO_FLAG_OWNED_BY_SRV can guarantee that the request is valid.
> For UBLK_IO_UNREGISTER_IO_BUF, I don't think checking io->flags &
> UBLK_IO_FLAG_OWNED_BY_SRV is sufficient to prevent misuse, since
> there's no requirement that the buffer index (addr) being unregistered
> matches the q_id, tag, or even ublk device specified in the command.
It should be fine to skip the check for UBLK_IO_UNREGISTER_IO_BUF because it
doesn't touch the io or the request.
However, I think it is still correct to validate the ZERO_COPY flag for
UBLK_IO_UNREGISTER_IO_BUF, because ZERO_COPY is only allowed for privileged
userspace.
Thanks,
Ming
^ permalink raw reply [flat|nested] 26+ messages in thread* Re: [PATCH 6/8] ublk: allow UBLK_IO_(UN)REGISTER_IO_BUF on any task
2025-06-10 1:34 ` Ming Lei
@ 2025-06-11 15:47 ` Caleb Sander Mateos
2025-06-12 2:31 ` Ming Lei
0 siblings, 1 reply; 26+ messages in thread
From: Caleb Sander Mateos @ 2025-06-11 15:47 UTC (permalink / raw)
To: Ming Lei; +Cc: Uday Shankar, linux-block
On Mon, Jun 9, 2025 at 6:34 PM Ming Lei <ming.lei@redhat.com> wrote:
>
> On Mon, Jun 09, 2025 at 10:49:09AM -0700, Caleb Sander Mateos wrote:
> > On Mon, Jun 9, 2025 at 5:34 AM Ming Lei <ming.lei@redhat.com> wrote:
> > >
> > > On Fri, Jun 06, 2025 at 03:40:09PM -0600, Caleb Sander Mateos wrote:
> > > > Currently, UBLK_IO_REGISTER_IO_BUF and UBLK_IO_UNREGISTER_IO_BUF are
> > > > only permitted on the ublk_io's daemon task. But this restriction is
> > > > unnecessary. ublk_register_io_buf() calls __ublk_check_and_get_req() to
> > > > look up the request from the tagset and atomically take a reference on
> > > > the request without accessing the ublk_io. ublk_unregister_io_buf()
> > > > doesn't use the q_id or tag at all.
> > > >
> > > > So allow these opcodes even on tasks other than io->task.
> > > >
> > > > Handle UBLK_IO_UNREGISTER_IO_BUF before obtaining the ubq and io since
> > > > the buffer index being unregistered is not necessarily related to the
> > > > specified q_id and tag.
> > > >
> > > > Add a feature flag UBLK_F_BUF_REG_OFF_DAEMON that userspace can use to
> > > > determine whether the kernel supports off-daemon buffer registration.
> > > >
> > > > Suggested-by: Ming Lei <ming.lei@redhat.com>
> > > > Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
> > > > ---
> > > > drivers/block/ublk_drv.c | 37 +++++++++++++++++++++--------------
> > > > include/uapi/linux/ublk_cmd.h | 8 ++++++++
> > > > 2 files changed, 30 insertions(+), 15 deletions(-)
> > > >
> > > > diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> > > > index a8030818f74a..2084bbdd2cbb 100644
> > > > --- a/drivers/block/ublk_drv.c
> > > > +++ b/drivers/block/ublk_drv.c
> > > > @@ -68,11 +68,12 @@
> > > > | UBLK_F_ZONED \
> > > > | UBLK_F_USER_RECOVERY_FAIL_IO \
> > > > | UBLK_F_UPDATE_SIZE \
> > > > | UBLK_F_AUTO_BUF_REG \
> > > > | UBLK_F_QUIESCE \
> > > > - | UBLK_F_PER_IO_DAEMON)
> > > > + | UBLK_F_PER_IO_DAEMON \
> > > > + | UBLK_F_BUF_REG_OFF_DAEMON)
> > > >
> > > > #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
> > > > | UBLK_F_USER_RECOVERY_REISSUE \
> > > > | UBLK_F_USER_RECOVERY_FAIL_IO)
> > > >
> > > > @@ -2018,20 +2019,10 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd,
> > > > }
> > > >
> > > > return 0;
> > > > }
> > > >
> > > > -static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
> > > > - const struct ublk_queue *ubq,
> > > > - unsigned int index, unsigned int issue_flags)
> > > > -{
> > > > - if (!ublk_support_zero_copy(ubq))
> > > > - return -EINVAL;
> > > > -
> > > > - return io_buffer_unregister_bvec(cmd, index, issue_flags);
> > > > -}
> > > > -
> > > > static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq,
> > > > struct ublk_io *io, __u64 buf_addr)
> > > > {
> > > > struct ublk_device *ub = ubq->dev;
> > > > int ret = 0;
> > > > @@ -2184,10 +2175,18 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
> > > >
> > > > ret = ublk_check_cmd_op(cmd_op);
> > > > if (ret)
> > > > goto out;
> > > >
> > > > + /*
> > > > + * io_buffer_unregister_bvec() doesn't access the ubq or io,
> > > > + * so no need to validate the q_id, tag, or task
> > > > + */
> > > > + if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF)
> > > > + return io_buffer_unregister_bvec(cmd, ub_cmd->addr,
> > > > + issue_flags);
> > > > +
> > > > ret = -EINVAL;
> > > > if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues)
> > > > goto out;
> > > >
> > > > ubq = ublk_get_queue(ub, ub_cmd->q_id);
> > > > @@ -2204,12 +2203,21 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
> > > >
> > > > ublk_prep_cancel(cmd, issue_flags, ubq, tag);
> > > > return -EIOCBQUEUED;
> > > > }
> > > >
> > > > - if (READ_ONCE(io->task) != current)
> > > > + if (READ_ONCE(io->task) != current) {
> > > > + /*
> > > > + * ublk_register_io_buf() accesses only the request, not io,
> > > > + * so can be handled on any task
> > > > + */
> > > > + if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF)
> > > > + return ublk_register_io_buf(cmd, ubq, tag, ub_cmd->addr,
> > > > + issue_flags);
> > > > +
> > > > goto out;
> > > > + }
> > > >
> > > > /* there is pending io cmd, something must be wrong */
> > > > if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) {
> > > > ret = -EBUSY;
> > >
> > > It also skips check on UBLK_IO_FLAG_OWNED_BY_SRV for both UBLK_IO_REGISTER_IO_BUF
> > > and UBLK_IO_UNREGISTER_IO_BUF, :-(
> >
> > As we've discussed before[1], accessing io->flags on tasks other than
> > the io's daemon would be a race condition. So I don't see how it's
> > possible to keep this check for off-daemon
> > UBLK_IO_(UN)REGISTER_IO_BUF. What value do you see in checking for
> > UBLK_IO_FLAG_OWNED_BY_SRV? My understanding is that the
> > refcount_inc_not_zero() already ensures the ublk I/O has been
> > dispatched to the ublk server and either hasn't been completed or has
> > other registered buffers still in use, which is pretty similar to
> > UBLK_IO_FLAG_OWNED_BY_SRV.
>
> request can't be trusted any more for UBLK_F_BUF_REG_OFF_DAEMON because it
> may be freed from elevator switch code or stopping dev code path, so we
> can't rely on refcount_inc_not_zero(pdu(req)) only.
I don't know much about how an elevator switch works, could you
explain a bit more how the request can be freed? Is this not already a
concern for user copy, where ublk_ch_read_iter() and
ublk_ch_write_iter() can be issued on any thread? Those code paths
also seem to be relying on the refcount_inc_not_zero() (plus the
blk_mq_request_started(req) and req->tag checks) in
__ublk_check_and_get_req() to prevent use-after-free.
>
> However the existing per-io-task and checking UBLK_IO_FLAG_OWNED_BY_SRV can
> guarantee that the request is valid.
>
> > For UBLK_IO_UNREGISTER_IO_BUF, I don't think checking io->flags &
> > UBLK_IO_FLAG_OWNED_BY_SRV is sufficient to prevent misuse, since
> > there's no requirement that the buffer index (addr) being unregistered
> > matches the q_id, tag, or even ublk device specified in the command.
>
> It should be fine to skip the check for UBLK_IO_UNREGISTER_IO_BUF because it
> doesn't touch io & request.
>
> However I think it is still correct to validate ZERO_COPY flag for
> UBLK_IO_UNREGISTER_IO_BUF cause ZERO_COPY is only allowed for privileged
> userspace.
Okay, I will check for UBLK_F_SUPPORT_ZERO_COPY on the ublk_device
instead to avoid the ublk_queue lookup.
Best,
Caleb
^ permalink raw reply [flat|nested] 26+ messages in thread* Re: [PATCH 6/8] ublk: allow UBLK_IO_(UN)REGISTER_IO_BUF on any task
2025-06-11 15:47 ` Caleb Sander Mateos
@ 2025-06-12 2:31 ` Ming Lei
0 siblings, 0 replies; 26+ messages in thread
From: Ming Lei @ 2025-06-12 2:31 UTC (permalink / raw)
To: Caleb Sander Mateos; +Cc: Uday Shankar, linux-block
On Wed, Jun 11, 2025 at 08:47:15AM -0700, Caleb Sander Mateos wrote:
> On Mon, Jun 9, 2025 at 6:34 PM Ming Lei <ming.lei@redhat.com> wrote:
> >
> > On Mon, Jun 09, 2025 at 10:49:09AM -0700, Caleb Sander Mateos wrote:
> > > On Mon, Jun 9, 2025 at 5:34 AM Ming Lei <ming.lei@redhat.com> wrote:
> > > >
> > > > On Fri, Jun 06, 2025 at 03:40:09PM -0600, Caleb Sander Mateos wrote:
> > > > > Currently, UBLK_IO_REGISTER_IO_BUF and UBLK_IO_UNREGISTER_IO_BUF are
> > > > > only permitted on the ublk_io's daemon task. But this restriction is
> > > > > unnecessary. ublk_register_io_buf() calls __ublk_check_and_get_req() to
> > > > > look up the request from the tagset and atomically take a reference on
> > > > > the request without accessing the ublk_io. ublk_unregister_io_buf()
> > > > > doesn't use the q_id or tag at all.
> > > > >
> > > > > So allow these opcodes even on tasks other than io->task.
> > > > >
> > > > > Handle UBLK_IO_UNREGISTER_IO_BUF before obtaining the ubq and io since
> > > > > the buffer index being unregistered is not necessarily related to the
> > > > > specified q_id and tag.
> > > > >
> > > > > Add a feature flag UBLK_F_BUF_REG_OFF_DAEMON that userspace can use to
> > > > > determine whether the kernel supports off-daemon buffer registration.
> > > > >
> > > > > Suggested-by: Ming Lei <ming.lei@redhat.com>
> > > > > Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
> > > > > ---
> > > > > drivers/block/ublk_drv.c | 37 +++++++++++++++++++++--------------
> > > > > include/uapi/linux/ublk_cmd.h | 8 ++++++++
> > > > > 2 files changed, 30 insertions(+), 15 deletions(-)
> > > > >
> > > > > diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> > > > > index a8030818f74a..2084bbdd2cbb 100644
> > > > > --- a/drivers/block/ublk_drv.c
> > > > > +++ b/drivers/block/ublk_drv.c
> > > > > @@ -68,11 +68,12 @@
> > > > > | UBLK_F_ZONED \
> > > > > | UBLK_F_USER_RECOVERY_FAIL_IO \
> > > > > | UBLK_F_UPDATE_SIZE \
> > > > > | UBLK_F_AUTO_BUF_REG \
> > > > > | UBLK_F_QUIESCE \
> > > > > - | UBLK_F_PER_IO_DAEMON)
> > > > > + | UBLK_F_PER_IO_DAEMON \
> > > > > + | UBLK_F_BUF_REG_OFF_DAEMON)
> > > > >
> > > > > #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
> > > > > | UBLK_F_USER_RECOVERY_REISSUE \
> > > > > | UBLK_F_USER_RECOVERY_FAIL_IO)
> > > > >
> > > > > @@ -2018,20 +2019,10 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd,
> > > > > }
> > > > >
> > > > > return 0;
> > > > > }
> > > > >
> > > > > -static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
> > > > > - const struct ublk_queue *ubq,
> > > > > - unsigned int index, unsigned int issue_flags)
> > > > > -{
> > > > > - if (!ublk_support_zero_copy(ubq))
> > > > > - return -EINVAL;
> > > > > -
> > > > > - return io_buffer_unregister_bvec(cmd, index, issue_flags);
> > > > > -}
> > > > > -
> > > > > static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq,
> > > > > struct ublk_io *io, __u64 buf_addr)
> > > > > {
> > > > > struct ublk_device *ub = ubq->dev;
> > > > > int ret = 0;
> > > > > @@ -2184,10 +2175,18 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
> > > > >
> > > > > ret = ublk_check_cmd_op(cmd_op);
> > > > > if (ret)
> > > > > goto out;
> > > > >
> > > > > + /*
> > > > > + * io_buffer_unregister_bvec() doesn't access the ubq or io,
> > > > > + * so no need to validate the q_id, tag, or task
> > > > > + */
> > > > > + if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF)
> > > > > + return io_buffer_unregister_bvec(cmd, ub_cmd->addr,
> > > > > + issue_flags);
> > > > > +
> > > > > ret = -EINVAL;
> > > > > if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues)
> > > > > goto out;
> > > > >
> > > > > ubq = ublk_get_queue(ub, ub_cmd->q_id);
> > > > > @@ -2204,12 +2203,21 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
> > > > >
> > > > > ublk_prep_cancel(cmd, issue_flags, ubq, tag);
> > > > > return -EIOCBQUEUED;
> > > > > }
> > > > >
> > > > > - if (READ_ONCE(io->task) != current)
> > > > > + if (READ_ONCE(io->task) != current) {
> > > > > + /*
> > > > > + * ublk_register_io_buf() accesses only the request, not io,
> > > > > + * so can be handled on any task
> > > > > + */
> > > > > + if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF)
> > > > > + return ublk_register_io_buf(cmd, ubq, tag, ub_cmd->addr,
> > > > > + issue_flags);
> > > > > +
> > > > > goto out;
> > > > > + }
> > > > >
> > > > > /* there is pending io cmd, something must be wrong */
> > > > > if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) {
> > > > > ret = -EBUSY;
> > > >
> > > > It also skips check on UBLK_IO_FLAG_OWNED_BY_SRV for both UBLK_IO_REGISTER_IO_BUF
> > > > and UBLK_IO_UNREGISTER_IO_BUF, :-(
> > >
> > > As we've discussed before[1], accessing io->flags on tasks other than
> > > the io's daemon would be a race condition. So I don't see how it's
> > > possible to keep this check for off-daemon
> > > UBLK_IO_(UN)REGISTER_IO_BUF. What value do you see in checking for
> > > UBLK_IO_FLAG_OWNED_BY_SRV? My understanding is that the
> > > refcount_inc_not_zero() already ensures the ublk I/O has been
> > > dispatched to the ublk server and either hasn't been completed or has
> > > other registered buffers still in use, which is pretty similar to
> > > UBLK_IO_FLAG_OWNED_BY_SRV.
> >
> > request can't be trusted any more for UBLK_F_BUF_REG_OFF_DAEMON because it
> > may be freed from elevator switch code or stopping dev code path, so we
> > can't rely on refcount_inc_not_zero(pdu(req)) only.
>
> I don't know much about how an elevator switch works, could you
> explain a bit more how the request can be freed? Is this not already a
If an elevator is attached, any request is allocated from the elevator, and it will
be freed when the elevator is switched off, so a request retrieved from
ub->tag_set.tags[tag] may become stale since the elevator may be switched out
at any time.
> concern for user copy, where ublk_ch_read_iter() and
> ublk_ch_write_iter() can be issued on any thread? Those code paths
> also seem to be relying on the refcount_inc_not_zero() (plus the
> blk_mq_request_started(req) and req->tag checks) in
> __ublk_check_and_get_req() to prevent use-after-free.
It looks like that risk does exist.
It could be solved by grabbing the queue usage counter in __ublk_check_and_get_req(),
or by moving the reference counter into 'ublk_io'.
Thanks,
Ming
^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH 7/8] ublk: optimize UBLK_IO_REGISTER_IO_BUF on daemon task
2025-06-06 21:40 [PATCH 0/8] ublk: allow off-daemon zero-copy buffer registration Caleb Sander Mateos
` (5 preceding siblings ...)
2025-06-06 21:40 ` [PATCH 6/8] ublk: allow UBLK_IO_(UN)REGISTER_IO_BUF on any task Caleb Sander Mateos
@ 2025-06-06 21:40 ` Caleb Sander Mateos
2025-06-09 9:02 ` Ming Lei
2025-06-06 21:40 ` [PATCH 8/8] ublk: remove ubq checks from ublk_{get,put}_req_ref() Caleb Sander Mateos
7 siblings, 1 reply; 26+ messages in thread
From: Caleb Sander Mateos @ 2025-06-06 21:40 UTC (permalink / raw)
To: Ming Lei; +Cc: Uday Shankar, linux-block, Caleb Sander Mateos
ublk_register_io_buf() performs an expensive atomic refcount increment,
as well as a lot of pointer chasing to look up the struct request.
Create a separate ublk_daemon_register_io_buf() for the daemon task to
call. Initialize ublk_rq_data's reference count to a large number, count
the number of buffers registered on the daemon task nonatomically, and
atomically subtract the large number minus the number of registered
buffers in ublk_commit_and_fetch().
Also obtain the struct request directly from ublk_io's req field instead
of looking it up on the tagset.
Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
---
drivers/block/ublk_drv.c | 59 ++++++++++++++++++++++++++++++++++------
1 file changed, 50 insertions(+), 9 deletions(-)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 2084bbdd2cbb..ec9e0fd21b0e 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -81,12 +81,20 @@
#define UBLK_PARAM_TYPE_ALL \
(UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \
UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT)
+/*
+ * Initialize refcount to a large number to include any registered buffers.
+ * UBLK_IO_COMMIT_AND_FETCH_REQ will release these references minus those for
+ * any buffers registered on the io daemon task.
+ */
+#define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2)
+
struct ublk_rq_data {
refcount_t ref;
+ unsigned buffers_registered;
/* for auto-unregister buffer in case of UBLK_F_AUTO_BUF_REG */
u16 buf_index;
void *buf_ctx_handle;
};
@@ -677,11 +685,12 @@ static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
struct request *req)
{
if (ublk_need_req_ref(ubq)) {
struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
- refcount_set(&data->ref, 1);
+ refcount_set(&data->ref, UBLK_REFCOUNT_INIT);
+ data->buffers_registered = 0;
}
}
static inline bool ublk_get_req_ref(const struct ublk_queue *ubq,
struct request *req)
@@ -706,10 +715,19 @@ static inline void ublk_put_req_ref(const struct ublk_queue *ubq,
} else {
__ublk_complete_rq(req);
}
}
+static inline void ublk_sub_req_ref(struct request *req)
+{
+ struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
+ unsigned sub_refs = UBLK_REFCOUNT_INIT - data->buffers_registered;
+
+ if (refcount_sub_and_test(sub_refs, &data->ref))
+ __ublk_complete_rq(req);
+}
+
static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
{
return ubq->flags & UBLK_F_NEED_GET_DATA;
}
@@ -1184,14 +1202,12 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq,
static void ublk_auto_buf_reg_fallback(struct request *req)
{
const struct ublk_queue *ubq = req->mq_hctx->driver_data;
struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
- struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
- refcount_set(&data->ref, 1);
}
static bool ublk_auto_buf_reg(struct request *req, struct ublk_io *io,
unsigned int issue_flags)
{
@@ -1207,13 +1223,12 @@ static bool ublk_auto_buf_reg(struct request *req, struct ublk_io *io,
return true;
}
blk_mq_end_request(req, BLK_STS_IOERR);
return false;
}
- /* one extra reference is dropped by ublk_io_release */
- refcount_set(&data->ref, 2);
+ data->buffers_registered = 1;
data->buf_ctx_handle = io_uring_cmd_ctx_handle(io->cmd);
/* store buffer index in request payload */
data->buf_index = pdu->buf.index;
io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
return true;
@@ -1221,14 +1236,14 @@ static bool ublk_auto_buf_reg(struct request *req, struct ublk_io *io,
static bool ublk_prep_auto_buf_reg(struct ublk_queue *ubq,
struct request *req, struct ublk_io *io,
unsigned int issue_flags)
{
+ ublk_init_req_ref(ubq, req);
if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req))
return ublk_auto_buf_reg(req, io, issue_flags);
- ublk_init_req_ref(ubq, req);
return true;
}
static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
struct ublk_io *io)
@@ -2019,10 +2034,31 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd,
}
return 0;
}
+static int ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
+ const struct ublk_queue *ubq,
+ const struct ublk_io *io,
+ unsigned index, unsigned issue_flags)
+{
+ struct request *req = io->req;
+ struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
+ int ret;
+
+ if (!ublk_support_zero_copy(ubq) || !ublk_rq_has_data(req))
+ return -EINVAL;
+
+ ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
+ issue_flags);
+ if (ret)
+ return ret;
+
+ data->buffers_registered++;
+ return 0;
+}
+
static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq,
struct ublk_io *io, __u64 buf_addr)
{
struct ublk_device *ub = ubq->dev;
int ret = 0;
@@ -2131,13 +2167,17 @@ static int ublk_commit_and_fetch(const struct ublk_queue *ubq,
io->res = ub_cmd->result;
if (req_op(req) == REQ_OP_ZONE_APPEND)
req->__sector = ub_cmd->zone_append_lba;
- if (likely(!blk_should_fake_timeout(req->q)))
- ublk_put_req_ref(ubq, req);
+ if (unlikely(blk_should_fake_timeout(req->q)))
+ return 0;
+ if (ublk_need_req_ref(ubq))
+ ublk_sub_req_ref(req);
+ else
+ __ublk_complete_rq(req);
return 0;
}
static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io)
{
@@ -2231,11 +2271,12 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
goto out;
switch (_IOC_NR(cmd_op)) {
case UBLK_IO_REGISTER_IO_BUF:
- return ublk_register_io_buf(cmd, ubq, tag, ub_cmd->addr, issue_flags);
+ return ublk_daemon_register_io_buf(cmd, ubq, io, ub_cmd->addr,
+ issue_flags);
case UBLK_IO_COMMIT_AND_FETCH_REQ:
ret = ublk_commit_and_fetch(ubq, io, cmd, ub_cmd, issue_flags);
if (ret)
goto out;
--
2.45.2
^ permalink raw reply related [flat|nested] 26+ messages in thread* Re: [PATCH 7/8] ublk: optimize UBLK_IO_REGISTER_IO_BUF on daemon task
2025-06-06 21:40 ` [PATCH 7/8] ublk: optimize UBLK_IO_REGISTER_IO_BUF on daemon task Caleb Sander Mateos
@ 2025-06-09 9:02 ` Ming Lei
2025-06-09 17:14 ` Caleb Sander Mateos
0 siblings, 1 reply; 26+ messages in thread
From: Ming Lei @ 2025-06-09 9:02 UTC (permalink / raw)
To: Caleb Sander Mateos; +Cc: Uday Shankar, linux-block
On Fri, Jun 06, 2025 at 03:40:10PM -0600, Caleb Sander Mateos wrote:
> ublk_register_io_buf() performs an expensive atomic refcount increment,
> as well as a lot of pointer chasing to look up the struct request.
>
> Create a separate ublk_daemon_register_io_buf() for the daemon task to
> call. Initialize ublk_rq_data's reference count to a large number, count
> the number of buffers registered on the daemon task nonatomically, and
> atomically subtract the large number minus the number of registered
> buffers in ublk_commit_and_fetch().
>
> Also obtain the struct request directly from ublk_io's req field instead
> of looking it up on the tagset.
>
> Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
> ---
> drivers/block/ublk_drv.c | 59 ++++++++++++++++++++++++++++++++++------
> 1 file changed, 50 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> index 2084bbdd2cbb..ec9e0fd21b0e 100644
> --- a/drivers/block/ublk_drv.c
> +++ b/drivers/block/ublk_drv.c
> @@ -81,12 +81,20 @@
> #define UBLK_PARAM_TYPE_ALL \
> (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
> UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \
> UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT)
>
> +/*
> + * Initialize refcount to a large number to include any registered buffers.
> + * UBLK_IO_COMMIT_AND_FETCH_REQ will release these references minus those for
> + * any buffers registered on the io daemon task.
> + */
> +#define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2)
> +
> struct ublk_rq_data {
> refcount_t ref;
> + unsigned buffers_registered;
>
> /* for auto-unregister buffer in case of UBLK_F_AUTO_BUF_REG */
> u16 buf_index;
> void *buf_ctx_handle;
> };
> @@ -677,11 +685,12 @@ static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
> struct request *req)
> {
> if (ublk_need_req_ref(ubq)) {
> struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
>
> - refcount_set(&data->ref, 1);
> + refcount_set(&data->ref, UBLK_REFCOUNT_INIT);
> + data->buffers_registered = 0;
> }
> }
>
> static inline bool ublk_get_req_ref(const struct ublk_queue *ubq,
> struct request *req)
> @@ -706,10 +715,19 @@ static inline void ublk_put_req_ref(const struct ublk_queue *ubq,
> } else {
> __ublk_complete_rq(req);
> }
> }
>
> +static inline void ublk_sub_req_ref(struct request *req)
> +{
> + struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
> + unsigned sub_refs = UBLK_REFCOUNT_INIT - data->buffers_registered;
> +
> + if (refcount_sub_and_test(sub_refs, &data->ref))
> + __ublk_complete_rq(req);
> +}
> +
> static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
> {
> return ubq->flags & UBLK_F_NEED_GET_DATA;
> }
>
> @@ -1184,14 +1202,12 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq,
>
> static void ublk_auto_buf_reg_fallback(struct request *req)
> {
> const struct ublk_queue *ubq = req->mq_hctx->driver_data;
> struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
> - struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
>
> iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
> - refcount_set(&data->ref, 1);
> }
>
> static bool ublk_auto_buf_reg(struct request *req, struct ublk_io *io,
> unsigned int issue_flags)
> {
> @@ -1207,13 +1223,12 @@ static bool ublk_auto_buf_reg(struct request *req, struct ublk_io *io,
> return true;
> }
> blk_mq_end_request(req, BLK_STS_IOERR);
> return false;
> }
> - /* one extra reference is dropped by ublk_io_release */
> - refcount_set(&data->ref, 2);
>
> + data->buffers_registered = 1;
> data->buf_ctx_handle = io_uring_cmd_ctx_handle(io->cmd);
> /* store buffer index in request payload */
> data->buf_index = pdu->buf.index;
> io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
> return true;
> @@ -1221,14 +1236,14 @@ static bool ublk_auto_buf_reg(struct request *req, struct ublk_io *io,
>
> static bool ublk_prep_auto_buf_reg(struct ublk_queue *ubq,
> struct request *req, struct ublk_io *io,
> unsigned int issue_flags)
> {
> + ublk_init_req_ref(ubq, req);
> if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req))
> return ublk_auto_buf_reg(req, io, issue_flags);
>
> - ublk_init_req_ref(ubq, req);
> return true;
> }
>
> static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
> struct ublk_io *io)
> @@ -2019,10 +2034,31 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd,
> }
>
> return 0;
> }
>
> +static int ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
> + const struct ublk_queue *ubq,
> + const struct ublk_io *io,
> + unsigned index, unsigned issue_flags)
> +{
> + struct request *req = io->req;
> + struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
> + int ret;
> +
> + if (!ublk_support_zero_copy(ubq) || !ublk_rq_has_data(req))
> + return -EINVAL;
> +
> + ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
> + issue_flags);
> + if (ret)
> + return ret;
> +
> + data->buffers_registered++;
This optimization replaces one ublk_get_req_ref()/refcount_inc_not_zero()
with data->buffers_registered++ in case of registering io buffer from
daemon context.
And in typical implementation, the unregistering io buffer should be done
in daemon context too, then I am wondering if any user-visible improvement
can be observed in this more complicated & fragile way:
- __ublk_check_and_get_req() is bypassed.
- buggy application may overflow ->buffers_registered
So can you share any data about this optimization on a workload with local
registering & remote un-registering of io buffers? Also, is this usage
really a common case?
Thanks,
Ming
^ permalink raw reply [flat|nested] 26+ messages in thread* Re: [PATCH 7/8] ublk: optimize UBLK_IO_REGISTER_IO_BUF on daemon task
2025-06-09 9:02 ` Ming Lei
@ 2025-06-09 17:14 ` Caleb Sander Mateos
2025-06-10 1:22 ` Ming Lei
0 siblings, 1 reply; 26+ messages in thread
From: Caleb Sander Mateos @ 2025-06-09 17:14 UTC (permalink / raw)
To: Ming Lei; +Cc: Uday Shankar, linux-block
On Mon, Jun 9, 2025 at 2:02 AM Ming Lei <ming.lei@redhat.com> wrote:
>
> On Fri, Jun 06, 2025 at 03:40:10PM -0600, Caleb Sander Mateos wrote:
> > ublk_register_io_buf() performs an expensive atomic refcount increment,
> > as well as a lot of pointer chasing to look up the struct request.
> >
> > Create a separate ublk_daemon_register_io_buf() for the daemon task to
> > call. Initialize ublk_rq_data's reference count to a large number, count
> > the number of buffers registered on the daemon task nonatomically, and
> > atomically subtract the large number minus the number of registered
> > buffers in ublk_commit_and_fetch().
> >
> > Also obtain the struct request directly from ublk_io's req field instead
> > of looking it up on the tagset.
> >
> > Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
> > ---
> > drivers/block/ublk_drv.c | 59 ++++++++++++++++++++++++++++++++++------
> > 1 file changed, 50 insertions(+), 9 deletions(-)
> >
> > diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> > index 2084bbdd2cbb..ec9e0fd21b0e 100644
> > --- a/drivers/block/ublk_drv.c
> > +++ b/drivers/block/ublk_drv.c
> > @@ -81,12 +81,20 @@
> > #define UBLK_PARAM_TYPE_ALL \
> > (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
> > UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \
> > UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT)
> >
> > +/*
> > + * Initialize refcount to a large number to include any registered buffers.
> > + * UBLK_IO_COMMIT_AND_FETCH_REQ will release these references minus those for
> > + * any buffers registered on the io daemon task.
> > + */
> > +#define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2)
> > +
> > struct ublk_rq_data {
> > refcount_t ref;
> > + unsigned buffers_registered;
> >
> > /* for auto-unregister buffer in case of UBLK_F_AUTO_BUF_REG */
> > u16 buf_index;
> > void *buf_ctx_handle;
> > };
> > @@ -677,11 +685,12 @@ static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
> > struct request *req)
> > {
> > if (ublk_need_req_ref(ubq)) {
> > struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
> >
> > - refcount_set(&data->ref, 1);
> > + refcount_set(&data->ref, UBLK_REFCOUNT_INIT);
> > + data->buffers_registered = 0;
> > }
> > }
> >
> > static inline bool ublk_get_req_ref(const struct ublk_queue *ubq,
> > struct request *req)
> > @@ -706,10 +715,19 @@ static inline void ublk_put_req_ref(const struct ublk_queue *ubq,
> > } else {
> > __ublk_complete_rq(req);
> > }
> > }
> >
> > +static inline void ublk_sub_req_ref(struct request *req)
> > +{
> > + struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
> > + unsigned sub_refs = UBLK_REFCOUNT_INIT - data->buffers_registered;
> > +
> > + if (refcount_sub_and_test(sub_refs, &data->ref))
> > + __ublk_complete_rq(req);
> > +}
> > +
> > static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
> > {
> > return ubq->flags & UBLK_F_NEED_GET_DATA;
> > }
> >
> > @@ -1184,14 +1202,12 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq,
> >
> > static void ublk_auto_buf_reg_fallback(struct request *req)
> > {
> > const struct ublk_queue *ubq = req->mq_hctx->driver_data;
> > struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
> > - struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
> >
> > iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
> > - refcount_set(&data->ref, 1);
> > }
> >
> > static bool ublk_auto_buf_reg(struct request *req, struct ublk_io *io,
> > unsigned int issue_flags)
> > {
> > @@ -1207,13 +1223,12 @@ static bool ublk_auto_buf_reg(struct request *req, struct ublk_io *io,
> > return true;
> > }
> > blk_mq_end_request(req, BLK_STS_IOERR);
> > return false;
> > }
> > - /* one extra reference is dropped by ublk_io_release */
> > - refcount_set(&data->ref, 2);
> >
> > + data->buffers_registered = 1;
> > data->buf_ctx_handle = io_uring_cmd_ctx_handle(io->cmd);
> > /* store buffer index in request payload */
> > data->buf_index = pdu->buf.index;
> > io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
> > return true;
> > @@ -1221,14 +1236,14 @@ static bool ublk_auto_buf_reg(struct request *req, struct ublk_io *io,
> >
> > static bool ublk_prep_auto_buf_reg(struct ublk_queue *ubq,
> > struct request *req, struct ublk_io *io,
> > unsigned int issue_flags)
> > {
> > + ublk_init_req_ref(ubq, req);
> > if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req))
> > return ublk_auto_buf_reg(req, io, issue_flags);
> >
> > - ublk_init_req_ref(ubq, req);
> > return true;
> > }
> >
> > static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
> > struct ublk_io *io)
> > @@ -2019,10 +2034,31 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd,
> > }
> >
> > return 0;
> > }
> >
> > +static int ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
> > + const struct ublk_queue *ubq,
> > + const struct ublk_io *io,
> > + unsigned index, unsigned issue_flags)
> > +{
> > + struct request *req = io->req;
> > + struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
> > + int ret;
> > +
> > + if (!ublk_support_zero_copy(ubq) || !ublk_rq_has_data(req))
> > + return -EINVAL;
> > +
> > + ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
> > + issue_flags);
> > + if (ret)
> > + return ret;
> > +
> > + data->buffers_registered++;
>
> This optimization replaces one ublk_get_req_ref()/refcount_inc_not_zero()
> with data->buffers_registered++ in case of registering io buffer from
> daemon context.
>
> And in typical implementation, the unregistering io buffer should be done
> in daemon context too, then I am wondering if any user-visible improvement
> can be observed in this more complicated & fragile way:
Yes, generally I would expect the unregister to happen on the daemon
task too. But it's possible (with patch "ublk: allow
UBLK_IO_(UN)REGISTER_IO_BUF on any task") for the
UBLK_IO_UNREGISTER_IO_BUF to be issued on another task. And even if
the unregister happens on the daemon task, it's up to the io_uring
layer to actually call ublk_io_release() once all requests using the
registered buffer have completed. Assuming only the daemon task issues
io_uring requests using the buffer, I believe ublk_io_release() will
always currently be called on that task. But I'd rather not make
assumptions about the io_uring layer. It's also possible in principle
for ublk_io_release() to check whether it's called on the daemon task
and have a fast path in that case (similar to UBLK_IO_REGISTER_IO_BUF).
But I'm not sure it's worth the cost of an additional ubq/io lookup.
>
> - __ublk_check_and_get_req() is bypassed.
>
> - buggy application may overflow ->buffers_registered
Isn't it already possible in principle for a ublk server to overflow
ublk_rq_data's refcount_t by registering the same ublk request with
lots of buffers? But sure, if you're concerned about this overflow, I
can easily add a check for it.
>
> So can you share any data about this optimization on workload with local
> registering & remote un-registering io buffer? Also is this usage
> really one common case?
Sure, I can provide some performance measurements for this
optimization. From looking at CPU profiles in the past, the reference
counting and request lookup accounted for around 2% of the ublk server
CPU time. To be clear, in our use case, both the register and
unregister happen on the daemon task. I just haven't bothered
optimizing the reference-counting for the unregister yet because it
doesn't appear nearly as expensive.
Best,
Caleb
^ permalink raw reply [flat|nested] 26+ messages in thread* Re: [PATCH 7/8] ublk: optimize UBLK_IO_REGISTER_IO_BUF on daemon task
2025-06-09 17:14 ` Caleb Sander Mateos
@ 2025-06-10 1:22 ` Ming Lei
2025-06-11 15:36 ` Caleb Sander Mateos
0 siblings, 1 reply; 26+ messages in thread
From: Ming Lei @ 2025-06-10 1:22 UTC (permalink / raw)
To: Caleb Sander Mateos; +Cc: Uday Shankar, linux-block
On Mon, Jun 09, 2025 at 10:14:21AM -0700, Caleb Sander Mateos wrote:
> On Mon, Jun 9, 2025 at 2:02 AM Ming Lei <ming.lei@redhat.com> wrote:
> >
> > On Fri, Jun 06, 2025 at 03:40:10PM -0600, Caleb Sander Mateos wrote:
> > > ublk_register_io_buf() performs an expensive atomic refcount increment,
> > > as well as a lot of pointer chasing to look up the struct request.
> > >
> > > Create a separate ublk_daemon_register_io_buf() for the daemon task to
> > > call. Initialize ublk_rq_data's reference count to a large number, count
> > > the number of buffers registered on the daemon task nonatomically, and
> > > atomically subtract the large number minus the number of registered
> > > buffers in ublk_commit_and_fetch().
> > >
> > > Also obtain the struct request directly from ublk_io's req field instead
> > > of looking it up on the tagset.
> > >
> > > Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
> > > ---
> > > drivers/block/ublk_drv.c | 59 ++++++++++++++++++++++++++++++++++------
> > > 1 file changed, 50 insertions(+), 9 deletions(-)
> > >
> > > diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> > > index 2084bbdd2cbb..ec9e0fd21b0e 100644
> > > --- a/drivers/block/ublk_drv.c
> > > +++ b/drivers/block/ublk_drv.c
> > > @@ -81,12 +81,20 @@
> > > #define UBLK_PARAM_TYPE_ALL \
> > > (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
> > > UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \
> > > UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT)
> > >
> > > +/*
> > > + * Initialize refcount to a large number to include any registered buffers.
> > > + * UBLK_IO_COMMIT_AND_FETCH_REQ will release these references minus those for
> > > + * any buffers registered on the io daemon task.
> > > + */
> > > +#define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2)
> > > +
> > > struct ublk_rq_data {
> > > refcount_t ref;
> > > + unsigned buffers_registered;
> > >
> > > /* for auto-unregister buffer in case of UBLK_F_AUTO_BUF_REG */
> > > u16 buf_index;
> > > void *buf_ctx_handle;
> > > };
> > > @@ -677,11 +685,12 @@ static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
> > > struct request *req)
> > > {
> > > if (ublk_need_req_ref(ubq)) {
> > > struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
> > >
> > > - refcount_set(&data->ref, 1);
> > > + refcount_set(&data->ref, UBLK_REFCOUNT_INIT);
> > > + data->buffers_registered = 0;
> > > }
> > > }
> > >
> > > static inline bool ublk_get_req_ref(const struct ublk_queue *ubq,
> > > struct request *req)
> > > @@ -706,10 +715,19 @@ static inline void ublk_put_req_ref(const struct ublk_queue *ubq,
> > > } else {
> > > __ublk_complete_rq(req);
> > > }
> > > }
> > >
> > > +static inline void ublk_sub_req_ref(struct request *req)
> > > +{
> > > + struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
> > > + unsigned sub_refs = UBLK_REFCOUNT_INIT - data->buffers_registered;
> > > +
> > > + if (refcount_sub_and_test(sub_refs, &data->ref))
> > > + __ublk_complete_rq(req);
> > > +}
> > > +
> > > static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
> > > {
> > > return ubq->flags & UBLK_F_NEED_GET_DATA;
> > > }
> > >
> > > @@ -1184,14 +1202,12 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq,
> > >
> > > static void ublk_auto_buf_reg_fallback(struct request *req)
> > > {
> > > const struct ublk_queue *ubq = req->mq_hctx->driver_data;
> > > struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
> > > - struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
> > >
> > > iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
> > > - refcount_set(&data->ref, 1);
> > > }
> > >
> > > static bool ublk_auto_buf_reg(struct request *req, struct ublk_io *io,
> > > unsigned int issue_flags)
> > > {
> > > @@ -1207,13 +1223,12 @@ static bool ublk_auto_buf_reg(struct request *req, struct ublk_io *io,
> > > return true;
> > > }
> > > blk_mq_end_request(req, BLK_STS_IOERR);
> > > return false;
> > > }
> > > - /* one extra reference is dropped by ublk_io_release */
> > > - refcount_set(&data->ref, 2);
> > >
> > > + data->buffers_registered = 1;
> > > data->buf_ctx_handle = io_uring_cmd_ctx_handle(io->cmd);
> > > /* store buffer index in request payload */
> > > data->buf_index = pdu->buf.index;
> > > io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
> > > return true;
> > > @@ -1221,14 +1236,14 @@ static bool ublk_auto_buf_reg(struct request *req, struct ublk_io *io,
> > >
> > > static bool ublk_prep_auto_buf_reg(struct ublk_queue *ubq,
> > > struct request *req, struct ublk_io *io,
> > > unsigned int issue_flags)
> > > {
> > > + ublk_init_req_ref(ubq, req);
> > > if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req))
> > > return ublk_auto_buf_reg(req, io, issue_flags);
> > >
> > > - ublk_init_req_ref(ubq, req);
> > > return true;
> > > }
> > >
> > > static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
> > > struct ublk_io *io)
> > > @@ -2019,10 +2034,31 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd,
> > > }
> > >
> > > return 0;
> > > }
> > >
> > > +static int ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
> > > + const struct ublk_queue *ubq,
> > > + const struct ublk_io *io,
> > > + unsigned index, unsigned issue_flags)
> > > +{
> > > + struct request *req = io->req;
> > > + struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
> > > + int ret;
> > > +
> > > + if (!ublk_support_zero_copy(ubq) || !ublk_rq_has_data(req))
> > > + return -EINVAL;
> > > +
> > > + ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
> > > + issue_flags);
> > > + if (ret)
> > > + return ret;
> > > +
> > > + data->buffers_registered++;
> >
> > This optimization replaces one ublk_get_req_ref()/refcount_inc_not_zero()
> > with data->buffers_registered++ in case of registering io buffer from
> > daemon context.
> >
> > And in typical implementation, the unregistering io buffer should be done
> > in daemon context too, then I am wondering if any user-visible improvement
> > can be observed in this more complicated & fragile way:
>
> Yes, generally I would expect the unregister to happen on the daemon
> task too. But it's possible (with patch "ublk: allow
> UBLK_IO_(UN)REGISTER_IO_BUF on any task") for the
> UBLK_IO_UNREGISTER_IO_BUF to be issued on another task. And even if
> the unregister happens on the daemon task, it's up to the io_uring
> layer to actually call ublk_io_release() once all requests using the
> registered buffer have completed. Assuming only the daemon task issues
> io_uring requests using the buffer, I believe ublk_io_release() will
> always currently be called on that task. But I'd rather not make
> assumptions about the io_uring layer. It's also possible in principle
> for ublk_io_release() to check whether it's called on the daemon task
> and have a fast path in that case (similar to UBLK_IO_REGISTER_IO_BUF).
Yes, my point is that optimization should focus on common case.
> But I'm not sure it's worth the cost of an additional ubq/io lookup.
>
> >
> > - __ublk_check_and_get_req() is bypassed.
> >
> > - buggy application may overflow ->buffers_registered
>
> Isn't it already possible in principle for a ublk server to overflow
> ublk_rq_data's refcount_t by registering the same ublk request with
> lots of buffers? But sure, if you're concerned about this overflow, I
> can easily add a check for it.
At least refcount_inc_not_zero() will warn if it happens.
>
> >
> > So can you share any data about this optimization on workload with local
> > registering & remote un-registering io buffer? Also is this usage
> > really one common case?
>
> Sure, I can provide some performance measurements for this
> optimization. From looking at CPU profiles in the past, the reference
> counting and request lookup accounted for around 2% of the ublk server
> CPU time. To be clear, in our use case, both the register and
> unregister happen on the daemon task. I just haven't bothered
> optimizing the reference-counting for the unregister yet because it
> doesn't appear nearly as expensive.
If you are talking about per-io-task, I guess it may not make a difference
from the user's viewpoint in either iops or cpu utilization, since the counter
is basically a per-task variable, and the atomic cost shouldn't mean much for us.
Thanks,
Ming
^ permalink raw reply [flat|nested] 26+ messages in thread* Re: [PATCH 7/8] ublk: optimize UBLK_IO_REGISTER_IO_BUF on daemon task
2025-06-10 1:22 ` Ming Lei
@ 2025-06-11 15:36 ` Caleb Sander Mateos
2025-06-12 4:25 ` Ming Lei
0 siblings, 1 reply; 26+ messages in thread
From: Caleb Sander Mateos @ 2025-06-11 15:36 UTC (permalink / raw)
To: Ming Lei; +Cc: Uday Shankar, linux-block
On Mon, Jun 9, 2025 at 6:22 PM Ming Lei <ming.lei@redhat.com> wrote:
>
> On Mon, Jun 09, 2025 at 10:14:21AM -0700, Caleb Sander Mateos wrote:
> > On Mon, Jun 9, 2025 at 2:02 AM Ming Lei <ming.lei@redhat.com> wrote:
> > >
> > > On Fri, Jun 06, 2025 at 03:40:10PM -0600, Caleb Sander Mateos wrote:
> > > > ublk_register_io_buf() performs an expensive atomic refcount increment,
> > > > as well as a lot of pointer chasing to look up the struct request.
> > > >
> > > > Create a separate ublk_daemon_register_io_buf() for the daemon task to
> > > > call. Initialize ublk_rq_data's reference count to a large number, count
> > > > the number of buffers registered on the daemon task nonatomically, and
> > > > atomically subtract the large number minus the number of registered
> > > > buffers in ublk_commit_and_fetch().
> > > >
> > > > Also obtain the struct request directly from ublk_io's req field instead
> > > > of looking it up on the tagset.
> > > >
> > > > Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
> > > > ---
> > > > drivers/block/ublk_drv.c | 59 ++++++++++++++++++++++++++++++++++------
> > > > 1 file changed, 50 insertions(+), 9 deletions(-)
> > > >
> > > > diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> > > > index 2084bbdd2cbb..ec9e0fd21b0e 100644
> > > > --- a/drivers/block/ublk_drv.c
> > > > +++ b/drivers/block/ublk_drv.c
> > > > @@ -81,12 +81,20 @@
> > > > #define UBLK_PARAM_TYPE_ALL \
> > > > (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
> > > > UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \
> > > > UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT)
> > > >
> > > > +/*
> > > > + * Initialize refcount to a large number to include any registered buffers.
> > > > + * UBLK_IO_COMMIT_AND_FETCH_REQ will release these references minus those for
> > > > + * any buffers registered on the io daemon task.
> > > > + */
> > > > +#define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2)
> > > > +
> > > > struct ublk_rq_data {
> > > > refcount_t ref;
> > > > + unsigned buffers_registered;
> > > >
> > > > /* for auto-unregister buffer in case of UBLK_F_AUTO_BUF_REG */
> > > > u16 buf_index;
> > > > void *buf_ctx_handle;
> > > > };
> > > > @@ -677,11 +685,12 @@ static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
> > > > struct request *req)
> > > > {
> > > > if (ublk_need_req_ref(ubq)) {
> > > > struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
> > > >
> > > > - refcount_set(&data->ref, 1);
> > > > + refcount_set(&data->ref, UBLK_REFCOUNT_INIT);
> > > > + data->buffers_registered = 0;
> > > > }
> > > > }
> > > >
> > > > static inline bool ublk_get_req_ref(const struct ublk_queue *ubq,
> > > > struct request *req)
> > > > @@ -706,10 +715,19 @@ static inline void ublk_put_req_ref(const struct ublk_queue *ubq,
> > > > } else {
> > > > __ublk_complete_rq(req);
> > > > }
> > > > }
> > > >
> > > > +static inline void ublk_sub_req_ref(struct request *req)
> > > > +{
> > > > + struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
> > > > + unsigned sub_refs = UBLK_REFCOUNT_INIT - data->buffers_registered;
> > > > +
> > > > + if (refcount_sub_and_test(sub_refs, &data->ref))
> > > > + __ublk_complete_rq(req);
> > > > +}
> > > > +
> > > > static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
> > > > {
> > > > return ubq->flags & UBLK_F_NEED_GET_DATA;
> > > > }
> > > >
> > > > @@ -1184,14 +1202,12 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq,
> > > >
> > > > static void ublk_auto_buf_reg_fallback(struct request *req)
> > > > {
> > > > const struct ublk_queue *ubq = req->mq_hctx->driver_data;
> > > > struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
> > > > - struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
> > > >
> > > > iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
> > > > - refcount_set(&data->ref, 1);
> > > > }
> > > >
> > > > static bool ublk_auto_buf_reg(struct request *req, struct ublk_io *io,
> > > > unsigned int issue_flags)
> > > > {
> > > > @@ -1207,13 +1223,12 @@ static bool ublk_auto_buf_reg(struct request *req, struct ublk_io *io,
> > > > return true;
> > > > }
> > > > blk_mq_end_request(req, BLK_STS_IOERR);
> > > > return false;
> > > > }
> > > > - /* one extra reference is dropped by ublk_io_release */
> > > > - refcount_set(&data->ref, 2);
> > > >
> > > > + data->buffers_registered = 1;
> > > > data->buf_ctx_handle = io_uring_cmd_ctx_handle(io->cmd);
> > > > /* store buffer index in request payload */
> > > > data->buf_index = pdu->buf.index;
> > > > io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
> > > > return true;
> > > > @@ -1221,14 +1236,14 @@ static bool ublk_auto_buf_reg(struct request *req, struct ublk_io *io,
> > > >
> > > > static bool ublk_prep_auto_buf_reg(struct ublk_queue *ubq,
> > > > struct request *req, struct ublk_io *io,
> > > > unsigned int issue_flags)
> > > > {
> > > > + ublk_init_req_ref(ubq, req);
> > > > if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req))
> > > > return ublk_auto_buf_reg(req, io, issue_flags);
> > > >
> > > > - ublk_init_req_ref(ubq, req);
> > > > return true;
> > > > }
> > > >
> > > > static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
> > > > struct ublk_io *io)
> > > > @@ -2019,10 +2034,31 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd,
> > > > }
> > > >
> > > > return 0;
> > > > }
> > > >
> > > > +static int ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
> > > > + const struct ublk_queue *ubq,
> > > > + const struct ublk_io *io,
> > > > + unsigned index, unsigned issue_flags)
> > > > +{
> > > > + struct request *req = io->req;
> > > > + struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
> > > > + int ret;
> > > > +
> > > > + if (!ublk_support_zero_copy(ubq) || !ublk_rq_has_data(req))
> > > > + return -EINVAL;
> > > > +
> > > > + ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
> > > > + issue_flags);
> > > > + if (ret)
> > > > + return ret;
> > > > +
> > > > + data->buffers_registered++;
> > >
> > > This optimization replaces one ublk_get_req_ref()/refcount_inc_not_zero()
> > > with data->buffers_registered++ in case of registering io buffer from
> > > daemon context.
> > >
> > > And in typical implementation, the unregistering io buffer should be done
> > > in daemon context too, then I am wondering if any user-visible improvement
> > > can be observed in this more complicated & fragile way:
> >
> > Yes, generally I would expect the unregister to happen on the daemon
> > task too. But it's possible (with patch "ublk: allow
> > UBLK_IO_(UN)REGISTER_IO_BUF on any task") for the
> > UBLK_IO_UNREGISTER_IO_BUF to be issued on another task. And even if
> > the unregister happens on the daemon task, it's up to the io_uring
> > layer to actually call ublk_io_release() once all requests using the
> > registered buffer have completed. Assuming only the daemon task issues
> > io_uring requests using the buffer, I believe ublk_io_release() will
> > always currently be called on that task. But I'd rather not make
> > assumptions about the io_uring layer. It's also possible in principle
> > for ublk_io_release() to check whether it's called on the daemon task
> > and have a fast path in that case (similar to UBLK_IO_REGISTER_IO_BUF).
>
> Yes, my point is that optimization should focus on common case.
Of course, I agree. I think the common case is for both register and
unregister to be issued on the daemon task. I only optimized the
register so far because it appears significantly more expensive (due
to the request lookup on the tagset and the CAS loop to increment the
refcount). I can test optimizing the unregister as well and see if
it's a net win.
>
> > But I'm not sure it's worth the cost of an additional ubq/io lookup.
> >
> > >
> > > - __ublk_check_and_get_req() is bypassed.
> > >
> > > - buggy application may overflow ->buffers_registered
> >
> > Isn't it already possible in principle for a ublk server to overflow
> > ublk_rq_data's refcount_t by registering the same ublk request with
> > lots of buffers? But sure, if you're concerned about this overflow, I
> > can easily add a check for it.
>
> At least refcount_inc_not_zero() will warn if it happens.
>
> >
> > >
> > > So can you share any data about this optimization on workload with local
> > > registering & remote un-registering io buffer? Also is this usage
> > > really one common case?
> >
> > Sure, I can provide some performance measurements for this
> > optimization. From looking at CPU profiles in the past, the reference
> > counting and request lookup accounted for around 2% of the ublk server
> > CPU time. To be clear, in our use case, both the register and
> > unregister happen on the daemon task. I just haven't bothered
> > optimizing the reference-counting for the unregister yet because it
> > doesn't appear nearly as expensive.
>
> If you are talking about per-io-task, I guess it may not make a difference
> from user viewpoint from both iops and cpu utilization since the counter is
> basically per-task variable, and atomic cost shouldn't mean something for us.
Atomic RMW operations are more expensive than non-atomic ones even
when the cache line isn't contended (at least in my experience on
x86). The effect is greater in systems with large numbers of CPUs (and
especially with multiple NUMA nodes). Let me collect some precise
performance numbers, but I recall it being a significant hotspot.
Best,
Caleb
^ permalink raw reply [flat|nested] 26+ messages in thread* Re: [PATCH 7/8] ublk: optimize UBLK_IO_REGISTER_IO_BUF on daemon task
2025-06-11 15:36 ` Caleb Sander Mateos
@ 2025-06-12 4:25 ` Ming Lei
0 siblings, 0 replies; 26+ messages in thread
From: Ming Lei @ 2025-06-12 4:25 UTC (permalink / raw)
To: Caleb Sander Mateos; +Cc: Uday Shankar, linux-block
On Wed, Jun 11, 2025 at 08:36:43AM -0700, Caleb Sander Mateos wrote:
> On Mon, Jun 9, 2025 at 6:22 PM Ming Lei <ming.lei@redhat.com> wrote:
> >
> > On Mon, Jun 09, 2025 at 10:14:21AM -0700, Caleb Sander Mateos wrote:
> > > On Mon, Jun 9, 2025 at 2:02 AM Ming Lei <ming.lei@redhat.com> wrote:
> > > >
> > > > On Fri, Jun 06, 2025 at 03:40:10PM -0600, Caleb Sander Mateos wrote:
> > > > > ublk_register_io_buf() performs an expensive atomic refcount increment,
> > > > > as well as a lot of pointer chasing to look up the struct request.
> > > > >
> > > > > Create a separate ublk_daemon_register_io_buf() for the daemon task to
> > > > > call. Initialize ublk_rq_data's reference count to a large number, count
> > > > > the number of buffers registered on the daemon task nonatomically, and
> > > > > atomically subtract the large number minus the number of registered
> > > > > buffers in ublk_commit_and_fetch().
> > > > >
> > > > > Also obtain the struct request directly from ublk_io's req field instead
> > > > > of looking it up on the tagset.
> > > > >
> > > > > Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
> > > > > ---
> > > > > drivers/block/ublk_drv.c | 59 ++++++++++++++++++++++++++++++++++------
> > > > > 1 file changed, 50 insertions(+), 9 deletions(-)
> > > > >
> > > > > diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> > > > > index 2084bbdd2cbb..ec9e0fd21b0e 100644
> > > > > --- a/drivers/block/ublk_drv.c
> > > > > +++ b/drivers/block/ublk_drv.c
> > > > > @@ -81,12 +81,20 @@
> > > > > #define UBLK_PARAM_TYPE_ALL \
> > > > > (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
> > > > > UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \
> > > > > UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT)
> > > > >
> > > > > +/*
> > > > > + * Initialize refcount to a large number to include any registered buffers.
> > > > > + * UBLK_IO_COMMIT_AND_FETCH_REQ will release these references minus those for
> > > > > + * any buffers registered on the io daemon task.
> > > > > + */
> > > > > +#define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2)
> > > > > +
> > > > > struct ublk_rq_data {
> > > > > refcount_t ref;
> > > > > + unsigned buffers_registered;
> > > > >
> > > > > /* for auto-unregister buffer in case of UBLK_F_AUTO_BUF_REG */
> > > > > u16 buf_index;
> > > > > void *buf_ctx_handle;
> > > > > };
> > > > > @@ -677,11 +685,12 @@ static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
> > > > > struct request *req)
> > > > > {
> > > > > if (ublk_need_req_ref(ubq)) {
> > > > > struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
> > > > >
> > > > > - refcount_set(&data->ref, 1);
> > > > > + refcount_set(&data->ref, UBLK_REFCOUNT_INIT);
> > > > > + data->buffers_registered = 0;
> > > > > }
> > > > > }
> > > > >
> > > > > static inline bool ublk_get_req_ref(const struct ublk_queue *ubq,
> > > > > struct request *req)
> > > > > @@ -706,10 +715,19 @@ static inline void ublk_put_req_ref(const struct ublk_queue *ubq,
> > > > > } else {
> > > > > __ublk_complete_rq(req);
> > > > > }
> > > > > }
> > > > >
> > > > > +static inline void ublk_sub_req_ref(struct request *req)
> > > > > +{
> > > > > + struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
> > > > > + unsigned sub_refs = UBLK_REFCOUNT_INIT - data->buffers_registered;
> > > > > +
> > > > > + if (refcount_sub_and_test(sub_refs, &data->ref))
> > > > > + __ublk_complete_rq(req);
> > > > > +}
> > > > > +
> > > > > static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
> > > > > {
> > > > > return ubq->flags & UBLK_F_NEED_GET_DATA;
> > > > > }
> > > > >
> > > > > @@ -1184,14 +1202,12 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq,
> > > > >
> > > > > static void ublk_auto_buf_reg_fallback(struct request *req)
> > > > > {
> > > > > const struct ublk_queue *ubq = req->mq_hctx->driver_data;
> > > > > struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
> > > > > - struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
> > > > >
> > > > > iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
> > > > > - refcount_set(&data->ref, 1);
> > > > > }
> > > > >
> > > > > static bool ublk_auto_buf_reg(struct request *req, struct ublk_io *io,
> > > > > unsigned int issue_flags)
> > > > > {
> > > > > @@ -1207,13 +1223,12 @@ static bool ublk_auto_buf_reg(struct request *req, struct ublk_io *io,
> > > > > return true;
> > > > > }
> > > > > blk_mq_end_request(req, BLK_STS_IOERR);
> > > > > return false;
> > > > > }
> > > > > - /* one extra reference is dropped by ublk_io_release */
> > > > > - refcount_set(&data->ref, 2);
> > > > >
> > > > > + data->buffers_registered = 1;
> > > > > data->buf_ctx_handle = io_uring_cmd_ctx_handle(io->cmd);
> > > > > /* store buffer index in request payload */
> > > > > data->buf_index = pdu->buf.index;
> > > > > io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
> > > > > return true;
> > > > > @@ -1221,14 +1236,14 @@ static bool ublk_auto_buf_reg(struct request *req, struct ublk_io *io,
> > > > >
> > > > > static bool ublk_prep_auto_buf_reg(struct ublk_queue *ubq,
> > > > > struct request *req, struct ublk_io *io,
> > > > > unsigned int issue_flags)
> > > > > {
> > > > > + ublk_init_req_ref(ubq, req);
> > > > > if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req))
> > > > > return ublk_auto_buf_reg(req, io, issue_flags);
> > > > >
> > > > > - ublk_init_req_ref(ubq, req);
> > > > > return true;
> > > > > }
> > > > >
> > > > > static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
> > > > > struct ublk_io *io)
> > > > > @@ -2019,10 +2034,31 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd,
> > > > > }
> > > > >
> > > > > return 0;
> > > > > }
> > > > >
> > > > > +static int ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
> > > > > + const struct ublk_queue *ubq,
> > > > > + const struct ublk_io *io,
> > > > > + unsigned index, unsigned issue_flags)
> > > > > +{
> > > > > + struct request *req = io->req;
> > > > > + struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
> > > > > + int ret;
> > > > > +
> > > > > + if (!ublk_support_zero_copy(ubq) || !ublk_rq_has_data(req))
> > > > > + return -EINVAL;
> > > > > +
> > > > > + ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
> > > > > + issue_flags);
> > > > > + if (ret)
> > > > > + return ret;
> > > > > +
> > > > > + data->buffers_registered++;
> > > >
> > > > This optimization replaces one ublk_get_req_ref()/refcount_inc_not_zero()
> > > > with data->buffers_registered++ in case of registering io buffer from
> > > > daemon context.
> > > >
> > > > And in typical implementation, the unregistering io buffer should be done
> > > > in daemon context too, then I am wondering if any user-visible improvement
> > > > can be observed in this more complicated & fragile way:
> > >
> > > Yes, generally I would expect the unregister to happen on the daemon
> > > task too. But it's possible (with patch "ublk: allow
> > > UBLK_IO_(UN)REGISTER_IO_BUF on any task") for the
> > > UBLK_IO_UNREGISTER_IO_BUF to be issued on another task. And even if
> > > the unregister happens on the daemon task, it's up to the io_uring
> > > layer to actually call ublk_io_release() once all requests using the
> > > registered buffer have completed. Assuming only the daemon task issues
> > > io_uring requests using the buffer, I believe ublk_io_release() will
> > > always currently be called on that task. But I'd rather not make
> > > assumptions about the io_uring layer. It's also possible in principle
> > > to check in ublk_io_release() whether it's called on the daemon task
> > > and have a fast path in that case (similar to UBLK_IO_REGISTER_IO_BUF).
> >
> > Yes, my point is that optimization should focus on common case.
>
> Of course, I agree. I think the common case is for both register and
> unregister to be issued on the daemon task. I only optimized the
> register so far because it appears significantly more expensive (due
> to the request lookup on the tagset and the CAS loop to increment the
> refcount). I can test optimizing the unregister as well and see if
> it's a net win.
Looks good!
>
> >
> > > But I'm not sure it's worth the cost of an additional ubq/io lookup.
> > >
> > > >
> > > > - __ublk_check_and_get_req() is bypassed.
> > > >
> > > > - buggy application may overflow ->buffers_registered
> > >
> > > Isn't it already possible in principle for a ublk server to overflow
> > > ublk_rq_data's refcount_t by registering the same ublk request with
> > > lots of buffers? But sure, if you're concerned about this overflow, I
> > > can easily add a check for it.
> >
> > At least refcount_inc_not_zero() will warn if it happens.
> >
> > >
> > > >
> > > > So can you share any data about this optimization on workload with local
> > > > registering & remote un-registering io buffer? Also is this usage
> > > > really one common case?
> > >
> > > Sure, I can provide some performance measurements for this
> > > optimization. From looking at CPU profiles in the past, the reference
> > > counting and request lookup accounted for around 2% of the ublk server
> > > CPU time. To be clear, in our use case, both the register and
> > > unregister happen on the daemon task. I just haven't bothered
> > > optimizing the reference-counting for the unregister yet because it
> > > doesn't appear nearly as expensive.
> >
> > If you are talking about per-io-task, I guess it may not make a difference
> > from user viewpoint from both iops and cpu utilization since the counter is
> > basically per-task variable, and atomic cost shouldn't mean something for us.
>
> Atomic RMW operations are more expensive than non-atomic ones even
> when the cache line isn't contended (at least in my experience on
> x86). The effect is greater in systems with large numbers of CPUs (and
> especially with multiple NUMA nodes). Let me collect some precise
> performance numbers, but I recall it being a significant hotspot.
Yes, as a micro benchmark, RMW is much slower than a plain inc.
I just compared the following two on my laptop by running the counting
until the counter reached 2G.
1) plain inc
- (*(volatile unsigned int *)&val)++;
- 1640M/sec
2) atomic inc:
- atomic_fetch_add(&aval, 1)
- 231M/sec
So it looks like the optimization makes sense at IOPS in the millions.
Thanks,
Ming
^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH 8/8] ublk: remove ubq checks from ublk_{get,put}_req_ref()
2025-06-06 21:40 [PATCH 0/8] ublk: allow off-daemon zero-copy buffer registration Caleb Sander Mateos
` (6 preceding siblings ...)
2025-06-06 21:40 ` [PATCH 7/8] ublk: optimize UBLK_IO_REGISTER_IO_BUF on daemon task Caleb Sander Mateos
@ 2025-06-06 21:40 ` Caleb Sander Mateos
7 siblings, 0 replies; 26+ messages in thread
From: Caleb Sander Mateos @ 2025-06-06 21:40 UTC (permalink / raw)
To: Ming Lei; +Cc: Uday Shankar, linux-block, Caleb Sander Mateos
ublk_get_req_ref() and ublk_put_req_ref() currently call
ublk_need_req_ref(ubq) to check whether the ublk device features require
reference counting of its requests. However, all callers already know
that reference counting is required:
- __ublk_check_and_get_req() is only called from
ublk_check_and_get_req() if user copy is enabled, and from
ublk_register_io_buf() if zero copy is enabled
- ublk_io_release() is only called for requests registered by
ublk_register_io_buf(), which requires zero copy
- ublk_ch_read_iter() and ublk_ch_write_iter() only call
ublk_put_req_ref() if ublk_check_and_get_req() succeeded, which
requires user copy to be enabled
So drop the ublk_need_req_ref() check and the ubq argument in
ublk_get_req_ref() and ublk_put_req_ref().
Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
---
drivers/block/ublk_drv.c | 41 +++++++++++++---------------------------
1 file changed, 13 insertions(+), 28 deletions(-)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index ec9e0fd21b0e..f9a6b2abfd20 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -690,33 +690,23 @@ static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
refcount_set(&data->ref, UBLK_REFCOUNT_INIT);
data->buffers_registered = 0;
}
}
-static inline bool ublk_get_req_ref(const struct ublk_queue *ubq,
- struct request *req)
+static inline bool ublk_get_req_ref(struct request *req)
{
- if (ublk_need_req_ref(ubq)) {
- struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
-
- return refcount_inc_not_zero(&data->ref);
- }
+ struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
- return true;
+ return refcount_inc_not_zero(&data->ref);
}
-static inline void ublk_put_req_ref(const struct ublk_queue *ubq,
- struct request *req)
+static inline void ublk_put_req_ref(struct request *req)
{
- if (ublk_need_req_ref(ubq)) {
- struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
+ struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
- if (refcount_dec_and_test(&data->ref))
- __ublk_complete_rq(req);
- } else {
+ if (refcount_dec_and_test(&data->ref))
__ublk_complete_rq(req);
- }
}
static inline void ublk_sub_req_ref(struct request *req)
{
struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
@@ -2004,13 +1994,12 @@ static inline int ublk_set_auto_buf_reg(struct io_uring_cmd *cmd)
}
static void ublk_io_release(void *priv)
{
struct request *rq = priv;
- struct ublk_queue *ubq = rq->mq_hctx->driver_data;
- ublk_put_req_ref(ubq, rq);
+ ublk_put_req_ref(rq);
}
static int ublk_register_io_buf(struct io_uring_cmd *cmd,
const struct ublk_queue *ubq, unsigned int tag,
unsigned int index, unsigned int issue_flags)
@@ -2027,11 +2016,11 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd,
return -EINVAL;
ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
issue_flags);
if (ret) {
- ublk_put_req_ref(ubq, req);
+ ublk_put_req_ref(req);
return ret;
}
return 0;
}
@@ -2303,11 +2292,11 @@ static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
if (!req)
return NULL;
- if (!ublk_get_req_ref(ubq, req))
+ if (!ublk_get_req_ref(req))
return NULL;
if (unlikely(!blk_mq_request_started(req) || req->tag != tag))
goto fail_put;
@@ -2317,11 +2306,11 @@ static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
if (offset > blk_rq_bytes(req))
goto fail_put;
return req;
fail_put:
- ublk_put_req_ref(ubq, req);
+ ublk_put_req_ref(req);
return NULL;
}
static inline int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
unsigned int issue_flags)
@@ -2431,46 +2420,42 @@ static struct request *ublk_check_and_get_req(struct kiocb *iocb,
goto fail;
*off = buf_off;
return req;
fail:
- ublk_put_req_ref(ubq, req);
+ ublk_put_req_ref(req);
return ERR_PTR(-EACCES);
}
static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
- struct ublk_queue *ubq;
struct request *req;
size_t buf_off;
size_t ret;
req = ublk_check_and_get_req(iocb, to, &buf_off, ITER_DEST);
if (IS_ERR(req))
return PTR_ERR(req);
ret = ublk_copy_user_pages(req, buf_off, to, ITER_DEST);
- ubq = req->mq_hctx->driver_data;
- ublk_put_req_ref(ubq, req);
+ ublk_put_req_ref(req);
return ret;
}
static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
- struct ublk_queue *ubq;
struct request *req;
size_t buf_off;
size_t ret;
req = ublk_check_and_get_req(iocb, from, &buf_off, ITER_SOURCE);
if (IS_ERR(req))
return PTR_ERR(req);
ret = ublk_copy_user_pages(req, buf_off, from, ITER_SOURCE);
- ubq = req->mq_hctx->driver_data;
- ublk_put_req_ref(ubq, req);
+ ublk_put_req_ref(req);
return ret;
}
static const struct file_operations ublk_ch_fops = {
--
2.45.2
^ permalink raw reply related [flat|nested] 26+ messages in thread