From: Ming Lei <ming.lei@redhat.com>
To: Caleb Sander Mateos <csander@purestorage.com>
Cc: Uday Shankar <ushankar@purestorage.com>, linux-block@vger.kernel.org
Subject: Re: [PATCH 7/8] ublk: optimize UBLK_IO_REGISTER_IO_BUF on daemon task
Date: Tue, 10 Jun 2025 09:22:28 +0800 [thread overview]
Message-ID: <aEeI1LoMQKgatFuk@fedora> (raw)
In-Reply-To: <CADUfDZpUwCDXGOfo4RGzqre4AC_nR2jkYwQETbD7jLPwP5cVow@mail.gmail.com>
On Mon, Jun 09, 2025 at 10:14:21AM -0700, Caleb Sander Mateos wrote:
> On Mon, Jun 9, 2025 at 2:02 AM Ming Lei <ming.lei@redhat.com> wrote:
> >
> > On Fri, Jun 06, 2025 at 03:40:10PM -0600, Caleb Sander Mateos wrote:
> > > ublk_register_io_buf() performs an expensive atomic refcount increment,
> > > as well as a lot of pointer chasing to look up the struct request.
> > >
> > > Create a separate ublk_daemon_register_io_buf() for the daemon task to
> > > call. Initialize ublk_rq_data's reference count to a large number, count
> > > the number of buffers registered on the daemon task nonatomically, and
> > > atomically subtract the large number minus the number of registered
> > > buffers in ublk_commit_and_fetch().
> > >
> > > Also obtain the struct request directly from ublk_io's req field instead
> > > of looking it up on the tagset.
> > >
> > > Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
> > > ---
> > > drivers/block/ublk_drv.c | 59 ++++++++++++++++++++++++++++++++++------
> > > 1 file changed, 50 insertions(+), 9 deletions(-)
> > >
> > > diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> > > index 2084bbdd2cbb..ec9e0fd21b0e 100644
> > > --- a/drivers/block/ublk_drv.c
> > > +++ b/drivers/block/ublk_drv.c
> > > @@ -81,12 +81,20 @@
> > > #define UBLK_PARAM_TYPE_ALL \
> > > (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
> > > UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \
> > > UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT)
> > >
> > > +/*
> > > + * Initialize refcount to a large number to include any registered buffers.
> > > + * UBLK_IO_COMMIT_AND_FETCH_REQ will release these references minus those for
> > > + * any buffers registered on the io daemon task.
> > > + */
> > > +#define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2)
> > > +
> > > struct ublk_rq_data {
> > > refcount_t ref;
> > > + unsigned buffers_registered;
> > >
> > > /* for auto-unregister buffer in case of UBLK_F_AUTO_BUF_REG */
> > > u16 buf_index;
> > > void *buf_ctx_handle;
> > > };
> > > @@ -677,11 +685,12 @@ static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
> > > struct request *req)
> > > {
> > > if (ublk_need_req_ref(ubq)) {
> > > struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
> > >
> > > - refcount_set(&data->ref, 1);
> > > + refcount_set(&data->ref, UBLK_REFCOUNT_INIT);
> > > + data->buffers_registered = 0;
> > > }
> > > }
> > >
> > > static inline bool ublk_get_req_ref(const struct ublk_queue *ubq,
> > > struct request *req)
> > > @@ -706,10 +715,19 @@ static inline void ublk_put_req_ref(const struct ublk_queue *ubq,
> > > } else {
> > > __ublk_complete_rq(req);
> > > }
> > > }
> > >
> > > +static inline void ublk_sub_req_ref(struct request *req)
> > > +{
> > > + struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
> > > + unsigned sub_refs = UBLK_REFCOUNT_INIT - data->buffers_registered;
> > > +
> > > + if (refcount_sub_and_test(sub_refs, &data->ref))
> > > + __ublk_complete_rq(req);
> > > +}
> > > +
> > > static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
> > > {
> > > return ubq->flags & UBLK_F_NEED_GET_DATA;
> > > }
> > >
> > > @@ -1184,14 +1202,12 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq,
> > >
> > > static void ublk_auto_buf_reg_fallback(struct request *req)
> > > {
> > > const struct ublk_queue *ubq = req->mq_hctx->driver_data;
> > > struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
> > > - struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
> > >
> > > iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
> > > - refcount_set(&data->ref, 1);
> > > }
> > >
> > > static bool ublk_auto_buf_reg(struct request *req, struct ublk_io *io,
> > > unsigned int issue_flags)
> > > {
> > > @@ -1207,13 +1223,12 @@ static bool ublk_auto_buf_reg(struct request *req, struct ublk_io *io,
> > > return true;
> > > }
> > > blk_mq_end_request(req, BLK_STS_IOERR);
> > > return false;
> > > }
> > > - /* one extra reference is dropped by ublk_io_release */
> > > - refcount_set(&data->ref, 2);
> > >
> > > + data->buffers_registered = 1;
> > > data->buf_ctx_handle = io_uring_cmd_ctx_handle(io->cmd);
> > > /* store buffer index in request payload */
> > > data->buf_index = pdu->buf.index;
> > > io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
> > > return true;
> > > @@ -1221,14 +1236,14 @@ static bool ublk_auto_buf_reg(struct request *req, struct ublk_io *io,
> > >
> > > static bool ublk_prep_auto_buf_reg(struct ublk_queue *ubq,
> > > struct request *req, struct ublk_io *io,
> > > unsigned int issue_flags)
> > > {
> > > + ublk_init_req_ref(ubq, req);
> > > if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req))
> > > return ublk_auto_buf_reg(req, io, issue_flags);
> > >
> > > - ublk_init_req_ref(ubq, req);
> > > return true;
> > > }
> > >
> > > static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
> > > struct ublk_io *io)
> > > @@ -2019,10 +2034,31 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd,
> > > }
> > >
> > > return 0;
> > > }
> > >
> > > +static int ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
> > > + const struct ublk_queue *ubq,
> > > + const struct ublk_io *io,
> > > + unsigned index, unsigned issue_flags)
> > > +{
> > > + struct request *req = io->req;
> > > + struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
> > > + int ret;
> > > +
> > > + if (!ublk_support_zero_copy(ubq) || !ublk_rq_has_data(req))
> > > + return -EINVAL;
> > > +
> > > + ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
> > > + issue_flags);
> > > + if (ret)
> > > + return ret;
> > > +
> > > + data->buffers_registered++;
> >
> > This optimization replaces one ublk_get_req_ref()/refcount_inc_not_zero()
> > with data->buffers_registered++ in case of registering io buffer from
> > daemon context.
> >
> > And in typical implementation, the unregistering io buffer should be done
> > in daemon context too, then I am wondering if any user-visible improvement
> > can be observed in this more complicated & fragile way:
>
> Yes, generally I would expect the unregister to happen on the daemon
> task too. But it's possible (with patch "ublk: allow
> UBLK_IO_(UN)REGISTER_IO_BUF on any task") for the
> UBLK_IO_UNREGISTER_IO_BUF to be issued on another task. And even if
> the unregister happens on the daemon task, it's up to the io_uring
> layer to actually call ublk_io_release() once all requests using the
> registered buffer have completed. Assuming only the daemon task issues
> io_uring requests using the buffer, I believe ublk_io_release() will
> always currently be called on that task. But I'd rather not make
> assumptions about the io_uring layer. It's also possible in principle
> for ublk_io_release() whether it's called on the daemon task and have
> a fast path in that case (similar to UBLK_IO_REGISTER_IO_BUF).
Yes, my point is that optimization should focus on common case.
> But I'm not sure it's worth the cost of an additional ubq/io lookup.
>
> >
> > - __ublk_check_and_get_req() is bypassed.
> >
> > - buggy application may overflow ->buffers_registered
>
> Isn't it already possible in principle for a ublk server to overflow
> ublk_rq_data's refcount_t by registering the same ublk request with
> lots of buffers? But sure, if you're concerned about this overflow, I
> can easily add a check for it.
At least refcount_inc_not_zero() will warn if it happens.
>
> >
> > So can you share any data about this optimization on workload with local
> > registering & remote un-registering io buffer? Also is this usage
> > really one common case?
>
> Sure, I can provide some performance measurements for this
> optimization. From looking at CPU profiles in the past, the reference
> counting and request lookup accounted for around 2% of the ublk server
> CPU time. To be clear, in our use case, both the register and
> unregister happen on the daemon task. I just haven't bothered
> optimizing the reference-counting for the unregister yet because it
> doesn't appear nearly as expensive.
If you are talking about per-io-task, I guess it may not make a difference
from user viewpoint from both iops and cpu utilization since the counter is
basically per-task variable, and atomic cost shouldn't mean something for us.
Thanks,
Ming
next prev parent reply other threads:[~2025-06-10 1:22 UTC|newest]
Thread overview: 26+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-06-06 21:40 [PATCH 0/8] ublk: allow off-daemon zero-copy buffer registration Caleb Sander Mateos
2025-06-06 21:40 ` [PATCH 1/8] ublk: check cmd_op first Caleb Sander Mateos
2025-06-09 6:57 ` Ming Lei
2025-06-06 21:40 ` [PATCH 2/8] ublk: handle UBLK_IO_FETCH_REQ first Caleb Sander Mateos
2025-06-09 7:01 ` Ming Lei
2025-06-06 21:40 ` [PATCH 3/8] ublk: remove task variable from __ublk_ch_uring_cmd() Caleb Sander Mateos
2025-06-09 7:02 ` Ming Lei
2025-06-06 21:40 ` [PATCH 4/8] ublk: consolidate UBLK_IO_FLAG_{ACTIVE,OWNED_BY_SRV} checks Caleb Sander Mateos
2025-06-09 7:19 ` Ming Lei
2025-06-06 21:40 ` [PATCH 5/8] ublk: move ublk_prep_cancel() to case UBLK_IO_COMMIT_AND_FETCH_REQ Caleb Sander Mateos
2025-06-09 7:21 ` Ming Lei
2025-06-06 21:40 ` [PATCH 6/8] ublk: allow UBLK_IO_(UN)REGISTER_IO_BUF on any task Caleb Sander Mateos
2025-06-09 7:34 ` Ming Lei
2025-06-09 17:39 ` Caleb Sander Mateos
2025-06-09 12:34 ` Ming Lei
2025-06-09 17:49 ` Caleb Sander Mateos
2025-06-10 1:34 ` Ming Lei
2025-06-11 15:47 ` Caleb Sander Mateos
2025-06-12 2:31 ` Ming Lei
2025-06-06 21:40 ` [PATCH 7/8] ublk: optimize UBLK_IO_REGISTER_IO_BUF on daemon task Caleb Sander Mateos
2025-06-09 9:02 ` Ming Lei
2025-06-09 17:14 ` Caleb Sander Mateos
2025-06-10 1:22 ` Ming Lei [this message]
2025-06-11 15:36 ` Caleb Sander Mateos
2025-06-12 4:25 ` Ming Lei
2025-06-06 21:40 ` [PATCH 8/8] ublk: remove ubq checks from ublk_{get,put}_req_ref() Caleb Sander Mateos
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=aEeI1LoMQKgatFuk@fedora \
--to=ming.lei@redhat.com \
--cc=csander@purestorage.com \
--cc=linux-block@vger.kernel.org \
--cc=ushankar@purestorage.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox