From: Ming Lei <ming.lei@redhat.com>
To: Caleb Sander Mateos <csander@purestorage.com>
Cc: Uday Shankar <ushankar@purestorage.com>, linux-block@vger.kernel.org
Subject: Re: [PATCH 7/8] ublk: optimize UBLK_IO_REGISTER_IO_BUF on daemon task
Date: Tue, 10 Jun 2025 09:22:28 +0800 [thread overview]
Message-ID: <aEeI1LoMQKgatFuk@fedora> (raw)
In-Reply-To: <CADUfDZpUwCDXGOfo4RGzqre4AC_nR2jkYwQETbD7jLPwP5cVow@mail.gmail.com>
On Mon, Jun 09, 2025 at 10:14:21AM -0700, Caleb Sander Mateos wrote:
> On Mon, Jun 9, 2025 at 2:02 AM Ming Lei <ming.lei@redhat.com> wrote:
> >
> > On Fri, Jun 06, 2025 at 03:40:10PM -0600, Caleb Sander Mateos wrote:
> > > ublk_register_io_buf() performs an expensive atomic refcount increment,
> > > as well as a lot of pointer chasing to look up the struct request.
> > >
> > > Create a separate ublk_daemon_register_io_buf() for the daemon task to
> > > call. Initialize ublk_rq_data's reference count to a large number, count
> > > the number of buffers registered on the daemon task nonatomically, and
> > > atomically subtract the large number minus the number of registered
> > > buffers in ublk_commit_and_fetch().
> > >
> > > Also obtain the struct request directly from ublk_io's req field instead
> > > of looking it up on the tagset.
> > >
> > > Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
> > > ---
> > > drivers/block/ublk_drv.c | 59 ++++++++++++++++++++++++++++++++++------
> > > 1 file changed, 50 insertions(+), 9 deletions(-)
> > >
> > > diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> > > index 2084bbdd2cbb..ec9e0fd21b0e 100644
> > > --- a/drivers/block/ublk_drv.c
> > > +++ b/drivers/block/ublk_drv.c
> > > @@ -81,12 +81,20 @@
> > > #define UBLK_PARAM_TYPE_ALL \
> > > (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
> > > UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \
> > > UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT)
> > >
> > > +/*
> > > + * Initialize refcount to a large number to include any registered buffers.
> > > + * UBLK_IO_COMMIT_AND_FETCH_REQ will release these references minus those for
> > > + * any buffers registered on the io daemon task.
> > > + */
> > > +#define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2)
> > > +
> > > struct ublk_rq_data {
> > > refcount_t ref;
> > > + unsigned buffers_registered;
> > >
> > > /* for auto-unregister buffer in case of UBLK_F_AUTO_BUF_REG */
> > > u16 buf_index;
> > > void *buf_ctx_handle;
> > > };
> > > @@ -677,11 +685,12 @@ static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
> > > struct request *req)
> > > {
> > > if (ublk_need_req_ref(ubq)) {
> > > struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
> > >
> > > - refcount_set(&data->ref, 1);
> > > + refcount_set(&data->ref, UBLK_REFCOUNT_INIT);
> > > + data->buffers_registered = 0;
> > > }
> > > }
> > >
> > > static inline bool ublk_get_req_ref(const struct ublk_queue *ubq,
> > > struct request *req)
> > > @@ -706,10 +715,19 @@ static inline void ublk_put_req_ref(const struct ublk_queue *ubq,
> > > } else {
> > > __ublk_complete_rq(req);
> > > }
> > > }
> > >
> > > +static inline void ublk_sub_req_ref(struct request *req)
> > > +{
> > > + struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
> > > + unsigned sub_refs = UBLK_REFCOUNT_INIT - data->buffers_registered;
> > > +
> > > + if (refcount_sub_and_test(sub_refs, &data->ref))
> > > + __ublk_complete_rq(req);
> > > +}
> > > +
> > > static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
> > > {
> > > return ubq->flags & UBLK_F_NEED_GET_DATA;
> > > }
> > >
> > > @@ -1184,14 +1202,12 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq,
> > >
> > > static void ublk_auto_buf_reg_fallback(struct request *req)
> > > {
> > > const struct ublk_queue *ubq = req->mq_hctx->driver_data;
> > > struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
> > > - struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
> > >
> > > iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
> > > - refcount_set(&data->ref, 1);
> > > }
> > >
> > > static bool ublk_auto_buf_reg(struct request *req, struct ublk_io *io,
> > > unsigned int issue_flags)
> > > {
> > > @@ -1207,13 +1223,12 @@ static bool ublk_auto_buf_reg(struct request *req, struct ublk_io *io,
> > > return true;
> > > }
> > > blk_mq_end_request(req, BLK_STS_IOERR);
> > > return false;
> > > }
> > > - /* one extra reference is dropped by ublk_io_release */
> > > - refcount_set(&data->ref, 2);
> > >
> > > + data->buffers_registered = 1;
> > > data->buf_ctx_handle = io_uring_cmd_ctx_handle(io->cmd);
> > > /* store buffer index in request payload */
> > > data->buf_index = pdu->buf.index;
> > > io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
> > > return true;
> > > @@ -1221,14 +1236,14 @@ static bool ublk_auto_buf_reg(struct request *req, struct ublk_io *io,
> > >
> > > static bool ublk_prep_auto_buf_reg(struct ublk_queue *ubq,
> > > struct request *req, struct ublk_io *io,
> > > unsigned int issue_flags)
> > > {
> > > + ublk_init_req_ref(ubq, req);
> > > if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req))
> > > return ublk_auto_buf_reg(req, io, issue_flags);
> > >
> > > - ublk_init_req_ref(ubq, req);
> > > return true;
> > > }
> > >
> > > static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
> > > struct ublk_io *io)
> > > @@ -2019,10 +2034,31 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd,
> > > }
> > >
> > > return 0;
> > > }
> > >
> > > +static int ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
> > > + const struct ublk_queue *ubq,
> > > + const struct ublk_io *io,
> > > + unsigned index, unsigned issue_flags)
> > > +{
> > > + struct request *req = io->req;
> > > + struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
> > > + int ret;
> > > +
> > > + if (!ublk_support_zero_copy(ubq) || !ublk_rq_has_data(req))
> > > + return -EINVAL;
> > > +
> > > + ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
> > > + issue_flags);
> > > + if (ret)
> > > + return ret;
> > > +
> > > + data->buffers_registered++;
> >
> > This optimization replaces one ublk_get_req_ref()/refcount_inc_not_zero()
> > with data->buffers_registered++ in case of registering io buffer from
> > daemon context.
> >
> > And in typical implementation, the unregistering io buffer should be done
> > in daemon context too, then I am wondering if any user-visible improvement
> > can be observed in this more complicated & fragile way:
>
> Yes, generally I would expect the unregister to happen on the daemon
> task too. But it's possible (with patch "ublk: allow
> UBLK_IO_(UN)REGISTER_IO_BUF on any task") for the
> UBLK_IO_UNREGISTER_IO_BUF to be issued on another task. And even if
> the unregister happens on the daemon task, it's up to the io_uring
> layer to actually call ublk_io_release() once all requests using the
> registered buffer have completed. Assuming only the daemon task issues
> io_uring requests using the buffer, I believe ublk_io_release() will
> always currently be called on that task. But I'd rather not make
> assumptions about the io_uring layer. It's also possible in principle
> for ublk_io_release() whether it's called on the daemon task and have
> a fast path in that case (similar to UBLK_IO_REGISTER_IO_BUF).
Yes, my point is that optimization should focus on common case.
> But I'm not sure it's worth the cost of an additional ubq/io lookup.
>
> >
> > - __ublk_check_and_get_req() is bypassed.
> >
> > - buggy application may overflow ->buffers_registered
>
> Isn't it already possible in principle for a ublk server to overflow
> ublk_rq_data's refcount_t by registering the same ublk request with
> lots of buffers? But sure, if you're concerned about this overflow, I
> can easily add a check for it.
At least refcount_inc_not_zero() will warn if it happens.
>
> >
> > So can you share any data about this optimization on workload with local
> > registering & remote un-registering io buffer? Also is this usage
> > really one common case?
>
> Sure, I can provide some performance measurements for this
> optimization. From looking at CPU profiles in the past, the reference
> counting and request lookup accounted for around 2% of the ublk server
> CPU time. To be clear, in our use case, both the register and
> unregister happen on the daemon task. I just haven't bothered
> optimizing the reference-counting for the unregister yet because it
> doesn't appear nearly as expensive.
If you are talking about per-io-task, I guess it may not make a difference
from user viewpoint from both iops and cpu utilization since the counter is
basically per-task variable, and atomic cost shouldn't mean something for us.
Thanks,
Ming
next prev parent reply other threads:[~2025-06-10 1:22 UTC|newest]
Thread overview: 26+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-06-06 21:40 [PATCH 0/8] ublk: allow off-daemon zero-copy buffer registration Caleb Sander Mateos
2025-06-06 21:40 ` [PATCH 1/8] ublk: check cmd_op first Caleb Sander Mateos
2025-06-09 6:57 ` Ming Lei
2025-06-06 21:40 ` [PATCH 2/8] ublk: handle UBLK_IO_FETCH_REQ first Caleb Sander Mateos
2025-06-09 7:01 ` Ming Lei
2025-06-06 21:40 ` [PATCH 3/8] ublk: remove task variable from __ublk_ch_uring_cmd() Caleb Sander Mateos
2025-06-09 7:02 ` Ming Lei
2025-06-06 21:40 ` [PATCH 4/8] ublk: consolidate UBLK_IO_FLAG_{ACTIVE,OWNED_BY_SRV} checks Caleb Sander Mateos
2025-06-09 7:19 ` Ming Lei
2025-06-06 21:40 ` [PATCH 5/8] ublk: move ublk_prep_cancel() to case UBLK_IO_COMMIT_AND_FETCH_REQ Caleb Sander Mateos
2025-06-09 7:21 ` Ming Lei
2025-06-06 21:40 ` [PATCH 6/8] ublk: allow UBLK_IO_(UN)REGISTER_IO_BUF on any task Caleb Sander Mateos
2025-06-09 7:34 ` Ming Lei
2025-06-09 17:39 ` Caleb Sander Mateos
2025-06-09 12:34 ` Ming Lei
2025-06-09 17:49 ` Caleb Sander Mateos
2025-06-10 1:34 ` Ming Lei
2025-06-11 15:47 ` Caleb Sander Mateos
2025-06-12 2:31 ` Ming Lei
2025-06-06 21:40 ` [PATCH 7/8] ublk: optimize UBLK_IO_REGISTER_IO_BUF on daemon task Caleb Sander Mateos
2025-06-09 9:02 ` Ming Lei
2025-06-09 17:14 ` Caleb Sander Mateos
2025-06-10 1:22 ` Ming Lei [this message]
2025-06-11 15:36 ` Caleb Sander Mateos
2025-06-12 4:25 ` Ming Lei
2025-06-06 21:40 ` [PATCH 8/8] ublk: remove ubq checks from ublk_{get,put}_req_ref() Caleb Sander Mateos
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=aEeI1LoMQKgatFuk@fedora \
--to=ming.lei@redhat.com \
--cc=csander@purestorage.com \
--cc=linux-block@vger.kernel.org \
--cc=ushankar@purestorage.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.