All of lore.kernel.org
 help / color / mirror / Atom feed
From: Ming Lei <ming.lei@redhat.com>
To: Caleb Sander Mateos <csander@purestorage.com>
Cc: Jens Axboe <axboe@kernel.dk>, linux-block@vger.kernel.org
Subject: Re: [PATCH v2 04/10] ublk: eliminate permanent pages[] array from struct ublk_buf
Date: Wed, 8 Apr 2026 10:58:51 +0800	[thread overview]
Message-ID: <adXEa9bcCUQfPCDo@fedora> (raw)
In-Reply-To: <CADUfDZow_gmwzQk41kn=Hw4w4bhsF9ogm2ZPGNgRuz9d5aF_kQ@mail.gmail.com>

On Tue, Apr 07, 2026 at 12:50:15PM -0700, Caleb Sander Mateos wrote:
> On Tue, Mar 31, 2026 at 8:32 AM Ming Lei <ming.lei@redhat.com> wrote:
> >
> > The pages[] array (kvmalloc'd, 8 bytes per page = 2MB for a 1GB buffer)
> > was stored permanently in struct ublk_buf but only needed during
> > pin_user_pages_fast() and maple tree construction. Since the maple tree
> > already stores PFN ranges via ublk_buf_range, struct page pointers can
> > be recovered via pfn_to_page() during unregistration.
> >
> > Make pages[] a temporary allocation in ublk_ctrl_reg_buf(), freed
> > immediately after the maple tree is built. Rewrite __ublk_ctrl_unreg_buf()
> > to iterate the maple tree for matching buf_index entries, recovering
> > struct page pointers via pfn_to_page() and unpinning in batches of 32.
> > Simplify ublk_buf_erase_ranges() to iterate the maple tree by buf_index
> > instead of walking the now-removed pages[] array.
> >
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > ---
> >  drivers/block/ublk_drv.c | 87 +++++++++++++++++++++++++---------------
> >  1 file changed, 55 insertions(+), 32 deletions(-)
> >
> > diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> > index c2b9992503a4..2e475bdc54dd 100644
> > --- a/drivers/block/ublk_drv.c
> > +++ b/drivers/block/ublk_drv.c
> > @@ -296,7 +296,6 @@ struct ublk_queue {
> >
> >  /* Per-registered shared memory buffer */
> >  struct ublk_buf {
> > -       struct page **pages;
> >         unsigned int nr_pages;
> >  };
> 
> It looks like nr_pages doesn't need to be stored either, it could just
> be passed to __ublk_ctrl_reg_buf(). Then I think we could get rid of
> struct ublk_buf and the xarray entirely. We really just need a bitmap
> for allocating buffer indices.

Maybe idr_alloc().

> 
> >
> > @@ -5261,27 +5260,25 @@ static void ublk_unquiesce_and_resume(struct gendisk *disk)
> >   * coalescing consecutive PFNs into single range entries.
> >   * Returns 0 on success, negative error with partial insertions unwound.
> >   */
> > -/* Erase coalesced PFN ranges from the maple tree for pages [0, nr_pages) */
> > -static void ublk_buf_erase_ranges(struct ublk_device *ub,
> > -                                 struct ublk_buf *ubuf,
> > -                                 unsigned long nr_pages)
> > +/* Erase coalesced PFN ranges from the maple tree matching buf_index */
> > +static void ublk_buf_erase_ranges(struct ublk_device *ub, int buf_index)
> >  {
> > -       unsigned long i;
> > -
> > -       for (i = 0; i < nr_pages; ) {
> > -               unsigned long pfn = page_to_pfn(ubuf->pages[i]);
> > -               unsigned long start = i;
> > +       MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX);
> > +       struct ublk_buf_range *range;
> >
> > -               while (i + 1 < nr_pages &&
> > -                      page_to_pfn(ubuf->pages[i + 1]) == pfn + (i - start) + 1)
> > -                       i++;
> > -               i++;
> > -               kfree(mtree_erase(&ub->buf_tree, pfn));
> > +       mas_lock(&mas);
> > +       mas_for_each(&mas, range, ULONG_MAX) {
> > +               if (range->buf_index == buf_index) {
> > +                       mas_erase(&mas);
> > +                       kfree(range);
> > +               }
> >         }
> > +       mas_unlock(&mas);
> >  }
> >
> >  static int __ublk_ctrl_reg_buf(struct ublk_device *ub,
> > -                              struct ublk_buf *ubuf, int index,
> > +                              struct ublk_buf *ubuf,
> > +                              struct page **pages, int index,
> >                                unsigned short flags)
> >  {
> >         unsigned long nr_pages = ubuf->nr_pages;
> > @@ -5289,13 +5286,13 @@ static int __ublk_ctrl_reg_buf(struct ublk_device *ub,
> >         int ret;
> >
> >         for (i = 0; i < nr_pages; ) {
> > -               unsigned long pfn = page_to_pfn(ubuf->pages[i]);
> > +               unsigned long pfn = page_to_pfn(pages[i]);
> >                 unsigned long start = i;
> >                 struct ublk_buf_range *range;
> >
> >                 /* Find run of consecutive PFNs */
> >                 while (i + 1 < nr_pages &&
> > -                      page_to_pfn(ubuf->pages[i + 1]) == pfn + (i - start) + 1)
> > +                      page_to_pfn(pages[i + 1]) == pfn + (i - start) + 1)
> >                         i++;
> >                 i++;    /* past the last page in this run */
> >
> > @@ -5320,7 +5317,7 @@ static int __ublk_ctrl_reg_buf(struct ublk_device *ub,
> >         return 0;
> >
> >  unwind:
> > -       ublk_buf_erase_ranges(ub, ubuf, i);
> > +       ublk_buf_erase_ranges(ub, index);
> >         return ret;
> >  }
> >
> > @@ -5335,6 +5332,7 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
> >         void __user *argp = (void __user *)(unsigned long)header->addr;
> >         struct ublk_shmem_buf_reg buf_reg;
> >         unsigned long addr, size, nr_pages;
> > +       struct page **pages = NULL;
> >         unsigned int gup_flags;
> >         struct gendisk *disk;
> >         struct ublk_buf *ubuf;
> > @@ -5371,9 +5369,8 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
> >                 goto put_disk;
> >         }
> >
> > -       ubuf->pages = kvmalloc_array(nr_pages, sizeof(*ubuf->pages),
> > -                                    GFP_KERNEL);
> > -       if (!ubuf->pages) {
> > +       pages = kvmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL);
> > +       if (!pages) {
> >                 ret = -ENOMEM;
> >                 goto err_free;
> >         }
> > @@ -5382,7 +5379,7 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
> >         if (!(buf_reg.flags & UBLK_SHMEM_BUF_READ_ONLY))
> >                 gup_flags |= FOLL_WRITE;
> >
> > -       pinned = pin_user_pages_fast(addr, nr_pages, gup_flags, ubuf->pages);
> > +       pinned = pin_user_pages_fast(addr, nr_pages, gup_flags, pages);
> >         if (pinned < 0) {
> >                 ret = pinned;
> >                 goto err_free_pages;
> > @@ -5406,7 +5403,7 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
> >         if (ret)
> >                 goto err_unlock;
> >
> > -       ret = __ublk_ctrl_reg_buf(ub, ubuf, index, buf_reg.flags);
> > +       ret = __ublk_ctrl_reg_buf(ub, ubuf, pages, index, buf_reg.flags);
> >         if (ret) {
> >                 xa_erase(&ub->bufs_xa, index);
> >                 goto err_unlock;
> > @@ -5414,6 +5411,7 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
> >
> >         mutex_unlock(&ub->mutex);
> >
> > +       kvfree(pages);
> >         ublk_unquiesce_and_resume(disk);
> >         ublk_put_disk(disk);
> >         return index;
> > @@ -5422,9 +5420,9 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
> >         mutex_unlock(&ub->mutex);
> >         ublk_unquiesce_and_resume(disk);
> >  err_unpin:
> > -       unpin_user_pages(ubuf->pages, pinned);
> > +       unpin_user_pages(pages, pinned);
> >  err_free_pages:
> > -       kvfree(ubuf->pages);
> > +       kvfree(pages);
> >  err_free:
> >         kfree(ubuf);
> >  put_disk:
> > @@ -5433,11 +5431,36 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
> >  }
> >
> >  static void __ublk_ctrl_unreg_buf(struct ublk_device *ub,
> > -                                 struct ublk_buf *ubuf)
> > +                                 struct ublk_buf *ubuf, int buf_index)
> 
> ubuf is only passed to kfree() now, maybe it would make sense to move
> that to the caller so the argument can be dropped?

Yeah, looks fine.


Thanks,
Ming


  reply	other threads:[~2026-04-08  2:59 UTC|newest]

Thread overview: 32+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-03-31 15:31 [PATCH v2 00/10] ublk: add shared memory zero-copy support Ming Lei
2026-03-31 15:31 ` [PATCH v2 01/10] ublk: add UBLK_U_CMD_REG_BUF/UNREG_BUF control commands Ming Lei
2026-04-07 19:35   ` Caleb Sander Mateos
2026-04-08  2:23     ` Ming Lei
2026-04-08 15:20       ` Caleb Sander Mateos
2026-04-09 12:18         ` Ming Lei
2026-04-09 21:22           ` Caleb Sander Mateos
2026-04-10  2:28             ` Ming Lei
2026-03-31 15:31 ` [PATCH v2 02/10] ublk: add PFN-based buffer matching in I/O path Ming Lei
2026-04-07 19:47   ` Caleb Sander Mateos
2026-04-08  2:36     ` Ming Lei
2026-04-08 15:28       ` Caleb Sander Mateos
2026-03-31 15:31 ` [PATCH v2 03/10] ublk: enable UBLK_F_SHMEM_ZC feature flag Ming Lei
2026-04-07 19:47   ` Caleb Sander Mateos
2026-04-08  2:50     ` Ming Lei
2026-03-31 15:31 ` [PATCH v2 04/10] ublk: eliminate permanent pages[] array from struct ublk_buf Ming Lei
2026-04-07 19:50   ` Caleb Sander Mateos
2026-04-08  2:58     ` Ming Lei [this message]
2026-03-31 15:31 ` [PATCH v2 05/10] selftests/ublk: add shared memory zero-copy support in kublk Ming Lei
2026-03-31 15:31 ` [PATCH v2 06/10] selftests/ublk: add UBLK_F_SHMEM_ZC support for loop target Ming Lei
2026-03-31 15:31 ` [PATCH v2 07/10] selftests/ublk: add shared memory zero-copy test Ming Lei
2026-03-31 15:31 ` [PATCH v2 08/10] selftests/ublk: add hugetlbfs shmem_zc test for loop target Ming Lei
2026-03-31 15:32 ` [PATCH v2 09/10] selftests/ublk: add filesystem fio verify test for shmem_zc Ming Lei
2026-03-31 15:32 ` [PATCH v2 10/10] selftests/ublk: add read-only buffer registration test Ming Lei
2026-04-07  2:38 ` [PATCH v2 00/10] ublk: add shared memory zero-copy support Ming Lei
2026-04-07 13:34   ` Jens Axboe
2026-04-07 19:29   ` Caleb Sander Mateos
2026-04-08  3:03     ` Ming Lei
2026-04-08 12:52       ` Jens Axboe
2026-04-07 13:44 ` Jens Axboe
2026-04-14 18:56 ` Keith Busch
2026-04-15  8:38   ` Ming Lei

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=adXEa9bcCUQfPCDo@fedora \
    --to=ming.lei@redhat.com \
    --cc=axboe@kernel.dk \
    --cc=csander@purestorage.com \
    --cc=linux-block@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.