public inbox for linux-block@vger.kernel.org
 help / color / mirror / Atom feed
From: Ming Lei <ming.lei@redhat.com>
To: Caleb Sander Mateos <csander@purestorage.com>
Cc: Jens Axboe <axboe@kernel.dk>, linux-block@vger.kernel.org
Subject: Re: [PATCH v2 04/10] ublk: eliminate permanent pages[] array from struct ublk_buf
Date: Wed, 8 Apr 2026 10:58:51 +0800	[thread overview]
Message-ID: <adXEa9bcCUQfPCDo@fedora> (raw)
In-Reply-To: <CADUfDZow_gmwzQk41kn=Hw4w4bhsF9ogm2ZPGNgRuz9d5aF_kQ@mail.gmail.com>

On Tue, Apr 07, 2026 at 12:50:15PM -0700, Caleb Sander Mateos wrote:
> On Tue, Mar 31, 2026 at 8:32 AM Ming Lei <ming.lei@redhat.com> wrote:
> >
> > The pages[] array (kvmalloc'd, 8 bytes per page = 2MB for a 1GB buffer)
> > was stored permanently in struct ublk_buf but only needed during
> > pin_user_pages_fast() and maple tree construction. Since the maple tree
> > already stores PFN ranges via ublk_buf_range, struct page pointers can
> > be recovered via pfn_to_page() during unregistration.
> >
> > Make pages[] a temporary allocation in ublk_ctrl_reg_buf(), freed
> > immediately after the maple tree is built. Rewrite __ublk_ctrl_unreg_buf()
> > to iterate the maple tree for matching buf_index entries, recovering
> > struct page pointers via pfn_to_page() and unpinning in batches of 32.
> > Simplify ublk_buf_erase_ranges() to iterate the maple tree by buf_index
> > instead of walking the now-removed pages[] array.
> >
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > ---
> >  drivers/block/ublk_drv.c | 87 +++++++++++++++++++++++++---------------
> >  1 file changed, 55 insertions(+), 32 deletions(-)
> >
> > diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> > index c2b9992503a4..2e475bdc54dd 100644
> > --- a/drivers/block/ublk_drv.c
> > +++ b/drivers/block/ublk_drv.c
> > @@ -296,7 +296,6 @@ struct ublk_queue {
> >
> >  /* Per-registered shared memory buffer */
> >  struct ublk_buf {
> > -       struct page **pages;
> >         unsigned int nr_pages;
> >  };
> 
> It looks like nr_pages doesn't need to be stored either, it could just
> be passed to __ublk_ctrl_reg_buf(). Then I think we could get rid of
> struct ublk_buf and the xarray entirely. We really just need a bitmap
> for allocating buffer indices.

Maybe idr_alloc().

> 
> >
> > @@ -5261,27 +5260,25 @@ static void ublk_unquiesce_and_resume(struct gendisk *disk)
> >   * coalescing consecutive PFNs into single range entries.
> >   * Returns 0 on success, negative error with partial insertions unwound.
> >   */
> > -/* Erase coalesced PFN ranges from the maple tree for pages [0, nr_pages) */
> > -static void ublk_buf_erase_ranges(struct ublk_device *ub,
> > -                                 struct ublk_buf *ubuf,
> > -                                 unsigned long nr_pages)
> > +/* Erase coalesced PFN ranges from the maple tree matching buf_index */
> > +static void ublk_buf_erase_ranges(struct ublk_device *ub, int buf_index)
> >  {
> > -       unsigned long i;
> > -
> > -       for (i = 0; i < nr_pages; ) {
> > -               unsigned long pfn = page_to_pfn(ubuf->pages[i]);
> > -               unsigned long start = i;
> > +       MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX);
> > +       struct ublk_buf_range *range;
> >
> > -               while (i + 1 < nr_pages &&
> > -                      page_to_pfn(ubuf->pages[i + 1]) == pfn + (i - start) + 1)
> > -                       i++;
> > -               i++;
> > -               kfree(mtree_erase(&ub->buf_tree, pfn));
> > +       mas_lock(&mas);
> > +       mas_for_each(&mas, range, ULONG_MAX) {
> > +               if (range->buf_index == buf_index) {
> > +                       mas_erase(&mas);
> > +                       kfree(range);
> > +               }
> >         }
> > +       mas_unlock(&mas);
> >  }
> >
> >  static int __ublk_ctrl_reg_buf(struct ublk_device *ub,
> > -                              struct ublk_buf *ubuf, int index,
> > +                              struct ublk_buf *ubuf,
> > +                              struct page **pages, int index,
> >                                unsigned short flags)
> >  {
> >         unsigned long nr_pages = ubuf->nr_pages;
> > @@ -5289,13 +5286,13 @@ static int __ublk_ctrl_reg_buf(struct ublk_device *ub,
> >         int ret;
> >
> >         for (i = 0; i < nr_pages; ) {
> > -               unsigned long pfn = page_to_pfn(ubuf->pages[i]);
> > +               unsigned long pfn = page_to_pfn(pages[i]);
> >                 unsigned long start = i;
> >                 struct ublk_buf_range *range;
> >
> >                 /* Find run of consecutive PFNs */
> >                 while (i + 1 < nr_pages &&
> > -                      page_to_pfn(ubuf->pages[i + 1]) == pfn + (i - start) + 1)
> > +                      page_to_pfn(pages[i + 1]) == pfn + (i - start) + 1)
> >                         i++;
> >                 i++;    /* past the last page in this run */
> >
> > @@ -5320,7 +5317,7 @@ static int __ublk_ctrl_reg_buf(struct ublk_device *ub,
> >         return 0;
> >
> >  unwind:
> > -       ublk_buf_erase_ranges(ub, ubuf, i);
> > +       ublk_buf_erase_ranges(ub, index);
> >         return ret;
> >  }
> >
> > @@ -5335,6 +5332,7 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
> >         void __user *argp = (void __user *)(unsigned long)header->addr;
> >         struct ublk_shmem_buf_reg buf_reg;
> >         unsigned long addr, size, nr_pages;
> > +       struct page **pages = NULL;
> >         unsigned int gup_flags;
> >         struct gendisk *disk;
> >         struct ublk_buf *ubuf;
> > @@ -5371,9 +5369,8 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
> >                 goto put_disk;
> >         }
> >
> > -       ubuf->pages = kvmalloc_array(nr_pages, sizeof(*ubuf->pages),
> > -                                    GFP_KERNEL);
> > -       if (!ubuf->pages) {
> > +       pages = kvmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL);
> > +       if (!pages) {
> >                 ret = -ENOMEM;
> >                 goto err_free;
> >         }
> > @@ -5382,7 +5379,7 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
> >         if (!(buf_reg.flags & UBLK_SHMEM_BUF_READ_ONLY))
> >                 gup_flags |= FOLL_WRITE;
> >
> > -       pinned = pin_user_pages_fast(addr, nr_pages, gup_flags, ubuf->pages);
> > +       pinned = pin_user_pages_fast(addr, nr_pages, gup_flags, pages);
> >         if (pinned < 0) {
> >                 ret = pinned;
> >                 goto err_free_pages;
> > @@ -5406,7 +5403,7 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
> >         if (ret)
> >                 goto err_unlock;
> >
> > -       ret = __ublk_ctrl_reg_buf(ub, ubuf, index, buf_reg.flags);
> > +       ret = __ublk_ctrl_reg_buf(ub, ubuf, pages, index, buf_reg.flags);
> >         if (ret) {
> >                 xa_erase(&ub->bufs_xa, index);
> >                 goto err_unlock;
> > @@ -5414,6 +5411,7 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
> >
> >         mutex_unlock(&ub->mutex);
> >
> > +       kvfree(pages);
> >         ublk_unquiesce_and_resume(disk);
> >         ublk_put_disk(disk);
> >         return index;
> > @@ -5422,9 +5420,9 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
> >         mutex_unlock(&ub->mutex);
> >         ublk_unquiesce_and_resume(disk);
> >  err_unpin:
> > -       unpin_user_pages(ubuf->pages, pinned);
> > +       unpin_user_pages(pages, pinned);
> >  err_free_pages:
> > -       kvfree(ubuf->pages);
> > +       kvfree(pages);
> >  err_free:
> >         kfree(ubuf);
> >  put_disk:
> > @@ -5433,11 +5431,36 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
> >  }
> >
> >  static void __ublk_ctrl_unreg_buf(struct ublk_device *ub,
> > -                                 struct ublk_buf *ubuf)
> > +                                 struct ublk_buf *ubuf, int buf_index)
> 
> ubuf is only passed to kfree() now, maybe it would make sense to move
> that to the caller so the argument can be dropped?

Yeah, looks fine.


Thanks,
Ming


  reply	other threads:[~2026-04-08  2:59 UTC|newest]

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-03-31 15:31 [PATCH v2 00/10] ublk: add shared memory zero-copy support Ming Lei
2026-03-31 15:31 ` [PATCH v2 01/10] ublk: add UBLK_U_CMD_REG_BUF/UNREG_BUF control commands Ming Lei
2026-04-07 19:35   ` Caleb Sander Mateos
2026-04-08  2:23     ` Ming Lei
2026-03-31 15:31 ` [PATCH v2 02/10] ublk: add PFN-based buffer matching in I/O path Ming Lei
2026-04-07 19:47   ` Caleb Sander Mateos
2026-04-08  2:36     ` Ming Lei
2026-03-31 15:31 ` [PATCH v2 03/10] ublk: enable UBLK_F_SHMEM_ZC feature flag Ming Lei
2026-04-07 19:47   ` Caleb Sander Mateos
2026-04-08  2:50     ` Ming Lei
2026-03-31 15:31 ` [PATCH v2 04/10] ublk: eliminate permanent pages[] array from struct ublk_buf Ming Lei
2026-04-07 19:50   ` Caleb Sander Mateos
2026-04-08  2:58     ` Ming Lei [this message]
2026-03-31 15:31 ` [PATCH v2 05/10] selftests/ublk: add shared memory zero-copy support in kublk Ming Lei
2026-03-31 15:31 ` [PATCH v2 06/10] selftests/ublk: add UBLK_F_SHMEM_ZC support for loop target Ming Lei
2026-03-31 15:31 ` [PATCH v2 07/10] selftests/ublk: add shared memory zero-copy test Ming Lei
2026-03-31 15:31 ` [PATCH v2 08/10] selftests/ublk: add hugetlbfs shmem_zc test for loop target Ming Lei
2026-03-31 15:32 ` [PATCH v2 09/10] selftests/ublk: add filesystem fio verify test for shmem_zc Ming Lei
2026-03-31 15:32 ` [PATCH v2 10/10] selftests/ublk: add read-only buffer registration test Ming Lei
2026-04-07  2:38 ` [PATCH v2 00/10] ublk: add shared memory zero-copy support Ming Lei
2026-04-07 13:34   ` Jens Axboe
2026-04-07 19:29   ` Caleb Sander Mateos
2026-04-08  3:03     ` Ming Lei
2026-04-07 13:44 ` Jens Axboe

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=adXEa9bcCUQfPCDo@fedora \
    --to=ming.lei@redhat.com \
    --cc=axboe@kernel.dk \
    --cc=csander@purestorage.com \
    --cc=linux-block@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox