From: Ming Lei <ming.lei@redhat.com>
To: Jens Axboe <axboe@kernel.dk>, linux-block@vger.kernel.org
Cc: Caleb Sander Mateos <csander@purestorage.com>,
Ming Lei <ming.lei@redhat.com>
Subject: [PATCH v2 04/10] ublk: eliminate permanent pages[] array from struct ublk_buf
Date: Tue, 31 Mar 2026 23:31:55 +0800 [thread overview]
Message-ID: <20260331153207.3635125-5-ming.lei@redhat.com> (raw)
In-Reply-To: <20260331153207.3635125-1-ming.lei@redhat.com>
The pages[] array (kvmalloc'd at 8 bytes per 4KB page, i.e. 2MB for a
1GB buffer) was stored permanently in struct ublk_buf but is only needed
during pin_user_pages_fast() and maple tree construction. Since the maple
tree already stores PFN ranges via ublk_buf_range, the struct page
pointers can be recovered via pfn_to_page() during unregistration.
Make pages[] a temporary allocation in ublk_ctrl_reg_buf(), freed
immediately after the maple tree is built. Rewrite __ublk_ctrl_unreg_buf()
to iterate the maple tree for matching buf_index entries, recovering
struct page pointers via pfn_to_page() and unpinning in batches of 32.
Simplify ublk_buf_erase_ranges() to iterate the maple tree by buf_index
instead of walking the now-removed pages[] array.
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
drivers/block/ublk_drv.c | 87 +++++++++++++++++++++++++---------------
1 file changed, 55 insertions(+), 32 deletions(-)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index c2b9992503a4..2e475bdc54dd 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -296,7 +296,6 @@ struct ublk_queue {
/* Per-registered shared memory buffer */
struct ublk_buf {
- struct page **pages;
unsigned int nr_pages;
};
@@ -5261,27 +5260,25 @@ static void ublk_unquiesce_and_resume(struct gendisk *disk)
* coalescing consecutive PFNs into single range entries.
* Returns 0 on success, negative error with partial insertions unwound.
*/
-/* Erase coalesced PFN ranges from the maple tree for pages [0, nr_pages) */
-static void ublk_buf_erase_ranges(struct ublk_device *ub,
- struct ublk_buf *ubuf,
- unsigned long nr_pages)
+/* Erase coalesced PFN ranges from the maple tree matching buf_index */
+static void ublk_buf_erase_ranges(struct ublk_device *ub, int buf_index)
{
- unsigned long i;
-
- for (i = 0; i < nr_pages; ) {
- unsigned long pfn = page_to_pfn(ubuf->pages[i]);
- unsigned long start = i;
+ MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX);
+ struct ublk_buf_range *range;
- while (i + 1 < nr_pages &&
- page_to_pfn(ubuf->pages[i + 1]) == pfn + (i - start) + 1)
- i++;
- i++;
- kfree(mtree_erase(&ub->buf_tree, pfn));
+ mas_lock(&mas);
+ mas_for_each(&mas, range, ULONG_MAX) {
+ if (range->buf_index == buf_index) {
+ mas_erase(&mas);
+ kfree(range);
+ }
}
+ mas_unlock(&mas);
}
static int __ublk_ctrl_reg_buf(struct ublk_device *ub,
- struct ublk_buf *ubuf, int index,
+ struct ublk_buf *ubuf,
+ struct page **pages, int index,
unsigned short flags)
{
unsigned long nr_pages = ubuf->nr_pages;
@@ -5289,13 +5286,13 @@ static int __ublk_ctrl_reg_buf(struct ublk_device *ub,
int ret;
for (i = 0; i < nr_pages; ) {
- unsigned long pfn = page_to_pfn(ubuf->pages[i]);
+ unsigned long pfn = page_to_pfn(pages[i]);
unsigned long start = i;
struct ublk_buf_range *range;
/* Find run of consecutive PFNs */
while (i + 1 < nr_pages &&
- page_to_pfn(ubuf->pages[i + 1]) == pfn + (i - start) + 1)
+ page_to_pfn(pages[i + 1]) == pfn + (i - start) + 1)
i++;
i++; /* past the last page in this run */
@@ -5320,7 +5317,7 @@ static int __ublk_ctrl_reg_buf(struct ublk_device *ub,
return 0;
unwind:
- ublk_buf_erase_ranges(ub, ubuf, i);
+ ublk_buf_erase_ranges(ub, index);
return ret;
}
@@ -5335,6 +5332,7 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
void __user *argp = (void __user *)(unsigned long)header->addr;
struct ublk_shmem_buf_reg buf_reg;
unsigned long addr, size, nr_pages;
+ struct page **pages = NULL;
unsigned int gup_flags;
struct gendisk *disk;
struct ublk_buf *ubuf;
@@ -5371,9 +5369,8 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
goto put_disk;
}
- ubuf->pages = kvmalloc_array(nr_pages, sizeof(*ubuf->pages),
- GFP_KERNEL);
- if (!ubuf->pages) {
+ pages = kvmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL);
+ if (!pages) {
ret = -ENOMEM;
goto err_free;
}
@@ -5382,7 +5379,7 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
if (!(buf_reg.flags & UBLK_SHMEM_BUF_READ_ONLY))
gup_flags |= FOLL_WRITE;
- pinned = pin_user_pages_fast(addr, nr_pages, gup_flags, ubuf->pages);
+ pinned = pin_user_pages_fast(addr, nr_pages, gup_flags, pages);
if (pinned < 0) {
ret = pinned;
goto err_free_pages;
@@ -5406,7 +5403,7 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
if (ret)
goto err_unlock;
- ret = __ublk_ctrl_reg_buf(ub, ubuf, index, buf_reg.flags);
+ ret = __ublk_ctrl_reg_buf(ub, ubuf, pages, index, buf_reg.flags);
if (ret) {
xa_erase(&ub->bufs_xa, index);
goto err_unlock;
@@ -5414,6 +5411,7 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
mutex_unlock(&ub->mutex);
+ kvfree(pages);
ublk_unquiesce_and_resume(disk);
ublk_put_disk(disk);
return index;
@@ -5422,9 +5420,9 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
mutex_unlock(&ub->mutex);
ublk_unquiesce_and_resume(disk);
err_unpin:
- unpin_user_pages(ubuf->pages, pinned);
+ unpin_user_pages(pages, pinned);
err_free_pages:
- kvfree(ubuf->pages);
+ kvfree(pages);
err_free:
kfree(ubuf);
put_disk:
@@ -5433,11 +5431,36 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
}
static void __ublk_ctrl_unreg_buf(struct ublk_device *ub,
- struct ublk_buf *ubuf)
+ struct ublk_buf *ubuf, int buf_index)
{
- ublk_buf_erase_ranges(ub, ubuf, ubuf->nr_pages);
- unpin_user_pages(ubuf->pages, ubuf->nr_pages);
- kvfree(ubuf->pages);
+ MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX);
+ struct ublk_buf_range *range;
+ struct page *pages[32];
+
+ mas_lock(&mas);
+ mas_for_each(&mas, range, ULONG_MAX) {
+ unsigned long base, nr, off;
+
+ if (range->buf_index != buf_index)
+ continue;
+
+ base = range->base_pfn;
+ nr = mas.last - mas.index + 1;
+ mas_erase(&mas);
+
+ for (off = 0; off < nr; ) {
+ unsigned int batch = min_t(unsigned long,
+ nr - off, 32);
+ unsigned int j;
+
+ for (j = 0; j < batch; j++)
+ pages[j] = pfn_to_page(base + off + j);
+ unpin_user_pages(pages, batch);
+ off += batch;
+ }
+ kfree(range);
+ }
+ mas_unlock(&mas);
kfree(ubuf);
}
@@ -5468,7 +5491,7 @@ static int ublk_ctrl_unreg_buf(struct ublk_device *ub,
return -ENOENT;
}
- __ublk_ctrl_unreg_buf(ub, ubuf);
+ __ublk_ctrl_unreg_buf(ub, ubuf, index);
mutex_unlock(&ub->mutex);
@@ -5483,7 +5506,7 @@ static void ublk_buf_cleanup(struct ublk_device *ub)
unsigned long index;
xa_for_each(&ub->bufs_xa, index, ubuf)
- __ublk_ctrl_unreg_buf(ub, ubuf);
+ __ublk_ctrl_unreg_buf(ub, ubuf, index);
xa_destroy(&ub->bufs_xa);
mtree_destroy(&ub->buf_tree);
}
--
2.53.0
next prev parent reply other threads:[~2026-03-31 15:32 UTC|newest]
Thread overview: 32+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-31 15:31 [PATCH v2 00/10] ublk: add shared memory zero-copy support Ming Lei
2026-03-31 15:31 ` [PATCH v2 01/10] ublk: add UBLK_U_CMD_REG_BUF/UNREG_BUF control commands Ming Lei
2026-04-07 19:35 ` Caleb Sander Mateos
2026-04-08 2:23 ` Ming Lei
2026-04-08 15:20 ` Caleb Sander Mateos
2026-04-09 12:18 ` Ming Lei
2026-04-09 21:22 ` Caleb Sander Mateos
2026-04-10 2:28 ` Ming Lei
2026-03-31 15:31 ` [PATCH v2 02/10] ublk: add PFN-based buffer matching in I/O path Ming Lei
2026-04-07 19:47 ` Caleb Sander Mateos
2026-04-08 2:36 ` Ming Lei
2026-04-08 15:28 ` Caleb Sander Mateos
2026-03-31 15:31 ` [PATCH v2 03/10] ublk: enable UBLK_F_SHMEM_ZC feature flag Ming Lei
2026-04-07 19:47 ` Caleb Sander Mateos
2026-04-08 2:50 ` Ming Lei
2026-03-31 15:31 ` Ming Lei [this message]
2026-04-07 19:50 ` [PATCH v2 04/10] ublk: eliminate permanent pages[] array from struct ublk_buf Caleb Sander Mateos
2026-04-08 2:58 ` Ming Lei
2026-03-31 15:31 ` [PATCH v2 05/10] selftests/ublk: add shared memory zero-copy support in kublk Ming Lei
2026-03-31 15:31 ` [PATCH v2 06/10] selftests/ublk: add UBLK_F_SHMEM_ZC support for loop target Ming Lei
2026-03-31 15:31 ` [PATCH v2 07/10] selftests/ublk: add shared memory zero-copy test Ming Lei
2026-03-31 15:31 ` [PATCH v2 08/10] selftests/ublk: add hugetlbfs shmem_zc test for loop target Ming Lei
2026-03-31 15:32 ` [PATCH v2 09/10] selftests/ublk: add filesystem fio verify test for shmem_zc Ming Lei
2026-03-31 15:32 ` [PATCH v2 10/10] selftests/ublk: add read-only buffer registration test Ming Lei
2026-04-07 2:38 ` [PATCH v2 00/10] ublk: add shared memory zero-copy support Ming Lei
2026-04-07 13:34 ` Jens Axboe
2026-04-07 19:29 ` Caleb Sander Mateos
2026-04-08 3:03 ` Ming Lei
2026-04-08 12:52 ` Jens Axboe
2026-04-07 13:44 ` Jens Axboe
2026-04-14 18:56 ` Keith Busch
2026-04-15 8:38 ` Ming Lei
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260331153207.3635125-5-ming.lei@redhat.com \
--to=ming.lei@redhat.com \
--cc=axboe@kernel.dk \
--cc=csander@purestorage.com \
--cc=linux-block@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.