public inbox for linux-block@vger.kernel.org
 help / color / mirror / Atom feed
From: Ming Lei <ming.lei@redhat.com>
To: Jens Axboe <axboe@kernel.dk>, linux-block@vger.kernel.org
Cc: Caleb Sander Mateos <csander@purestorage.com>,
	Ming Lei <ming.lei@redhat.com>
Subject: [PATCH v2 04/10] ublk: eliminate permanent pages[] array from struct ublk_buf
Date: Tue, 31 Mar 2026 23:31:55 +0800	[thread overview]
Message-ID: <20260331153207.3635125-5-ming.lei@redhat.com> (raw)
In-Reply-To: <20260331153207.3635125-1-ming.lei@redhat.com>

The pages[] array (kvmalloc'd, 8 bytes per page = 2MB for a 1GB buffer
with 4KB pages) was stored permanently in struct ublk_buf, but it is only
needed during pin_user_pages_fast() and maple tree construction. Since the
maple tree already stores PFN ranges via ublk_buf_range, struct page
pointers can be recovered via pfn_to_page() during unregistration.

Make pages[] a temporary allocation in ublk_ctrl_reg_buf(), freed
immediately after the maple tree is built. Rewrite __ublk_ctrl_unreg_buf()
to iterate the maple tree for matching buf_index entries, recovering
struct page pointers via pfn_to_page() and unpinning in batches of 32.
Simplify ublk_buf_erase_ranges() to iterate the maple tree by buf_index
instead of walking the now-removed pages[] array.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 drivers/block/ublk_drv.c | 87 +++++++++++++++++++++++++---------------
 1 file changed, 55 insertions(+), 32 deletions(-)

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index c2b9992503a4..2e475bdc54dd 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -296,7 +296,6 @@ struct ublk_queue {
 
 /* Per-registered shared memory buffer */
 struct ublk_buf {
-	struct page **pages;
 	unsigned int nr_pages;
 };
 
@@ -5261,27 +5260,25 @@ static void ublk_unquiesce_and_resume(struct gendisk *disk)
  * coalescing consecutive PFNs into single range entries.
  * Returns 0 on success, negative error with partial insertions unwound.
  */
-/* Erase coalesced PFN ranges from the maple tree for pages [0, nr_pages) */
-static void ublk_buf_erase_ranges(struct ublk_device *ub,
-				  struct ublk_buf *ubuf,
-				  unsigned long nr_pages)
+/* Erase coalesced PFN ranges from the maple tree matching buf_index */
+static void ublk_buf_erase_ranges(struct ublk_device *ub, int buf_index)
 {
-	unsigned long i;
-
-	for (i = 0; i < nr_pages; ) {
-		unsigned long pfn = page_to_pfn(ubuf->pages[i]);
-		unsigned long start = i;
+	MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX);
+	struct ublk_buf_range *range;
 
-		while (i + 1 < nr_pages &&
-		       page_to_pfn(ubuf->pages[i + 1]) == pfn + (i - start) + 1)
-			i++;
-		i++;
-		kfree(mtree_erase(&ub->buf_tree, pfn));
+	mas_lock(&mas);
+	mas_for_each(&mas, range, ULONG_MAX) {
+		if (range->buf_index == buf_index) {
+			mas_erase(&mas);
+			kfree(range);
+		}
 	}
+	mas_unlock(&mas);
 }
 
 static int __ublk_ctrl_reg_buf(struct ublk_device *ub,
-			       struct ublk_buf *ubuf, int index,
+			       struct ublk_buf *ubuf,
+			       struct page **pages, int index,
 			       unsigned short flags)
 {
 	unsigned long nr_pages = ubuf->nr_pages;
@@ -5289,13 +5286,13 @@ static int __ublk_ctrl_reg_buf(struct ublk_device *ub,
 	int ret;
 
 	for (i = 0; i < nr_pages; ) {
-		unsigned long pfn = page_to_pfn(ubuf->pages[i]);
+		unsigned long pfn = page_to_pfn(pages[i]);
 		unsigned long start = i;
 		struct ublk_buf_range *range;
 
 		/* Find run of consecutive PFNs */
 		while (i + 1 < nr_pages &&
-		       page_to_pfn(ubuf->pages[i + 1]) == pfn + (i - start) + 1)
+		       page_to_pfn(pages[i + 1]) == pfn + (i - start) + 1)
 			i++;
 		i++;	/* past the last page in this run */
 
@@ -5320,7 +5317,7 @@ static int __ublk_ctrl_reg_buf(struct ublk_device *ub,
 	return 0;
 
 unwind:
-	ublk_buf_erase_ranges(ub, ubuf, i);
+	ublk_buf_erase_ranges(ub, index);
 	return ret;
 }
 
@@ -5335,6 +5332,7 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
 	void __user *argp = (void __user *)(unsigned long)header->addr;
 	struct ublk_shmem_buf_reg buf_reg;
 	unsigned long addr, size, nr_pages;
+	struct page **pages = NULL;
 	unsigned int gup_flags;
 	struct gendisk *disk;
 	struct ublk_buf *ubuf;
@@ -5371,9 +5369,8 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
 		goto put_disk;
 	}
 
-	ubuf->pages = kvmalloc_array(nr_pages, sizeof(*ubuf->pages),
-				     GFP_KERNEL);
-	if (!ubuf->pages) {
+	pages = kvmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL);
+	if (!pages) {
 		ret = -ENOMEM;
 		goto err_free;
 	}
@@ -5382,7 +5379,7 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
 	if (!(buf_reg.flags & UBLK_SHMEM_BUF_READ_ONLY))
 		gup_flags |= FOLL_WRITE;
 
-	pinned = pin_user_pages_fast(addr, nr_pages, gup_flags, ubuf->pages);
+	pinned = pin_user_pages_fast(addr, nr_pages, gup_flags, pages);
 	if (pinned < 0) {
 		ret = pinned;
 		goto err_free_pages;
@@ -5406,7 +5403,7 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
 	if (ret)
 		goto err_unlock;
 
-	ret = __ublk_ctrl_reg_buf(ub, ubuf, index, buf_reg.flags);
+	ret = __ublk_ctrl_reg_buf(ub, ubuf, pages, index, buf_reg.flags);
 	if (ret) {
 		xa_erase(&ub->bufs_xa, index);
 		goto err_unlock;
@@ -5414,6 +5411,7 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
 
 	mutex_unlock(&ub->mutex);
 
+	kvfree(pages);
 	ublk_unquiesce_and_resume(disk);
 	ublk_put_disk(disk);
 	return index;
@@ -5422,9 +5420,9 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
 	mutex_unlock(&ub->mutex);
 	ublk_unquiesce_and_resume(disk);
 err_unpin:
-	unpin_user_pages(ubuf->pages, pinned);
+	unpin_user_pages(pages, pinned);
 err_free_pages:
-	kvfree(ubuf->pages);
+	kvfree(pages);
 err_free:
 	kfree(ubuf);
 put_disk:
@@ -5433,11 +5431,36 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
 }
 
 static void __ublk_ctrl_unreg_buf(struct ublk_device *ub,
-				  struct ublk_buf *ubuf)
+				  struct ublk_buf *ubuf, int buf_index)
 {
-	ublk_buf_erase_ranges(ub, ubuf, ubuf->nr_pages);
-	unpin_user_pages(ubuf->pages, ubuf->nr_pages);
-	kvfree(ubuf->pages);
+	MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX);
+	struct ublk_buf_range *range;
+	struct page *pages[32];
+
+	mas_lock(&mas);
+	mas_for_each(&mas, range, ULONG_MAX) {
+		unsigned long base, nr, off;
+
+		if (range->buf_index != buf_index)
+			continue;
+
+		base = range->base_pfn;
+		nr = mas.last - mas.index + 1;
+		mas_erase(&mas);
+
+		for (off = 0; off < nr; ) {
+			unsigned int batch = min_t(unsigned long,
+						   nr - off, 32);
+			unsigned int j;
+
+			for (j = 0; j < batch; j++)
+				pages[j] = pfn_to_page(base + off + j);
+			unpin_user_pages(pages, batch);
+			off += batch;
+		}
+		kfree(range);
+	}
+	mas_unlock(&mas);
 	kfree(ubuf);
 }
 
@@ -5468,7 +5491,7 @@ static int ublk_ctrl_unreg_buf(struct ublk_device *ub,
 		return -ENOENT;
 	}
 
-	__ublk_ctrl_unreg_buf(ub, ubuf);
+	__ublk_ctrl_unreg_buf(ub, ubuf, index);
 
 	mutex_unlock(&ub->mutex);
 
@@ -5483,7 +5506,7 @@ static void ublk_buf_cleanup(struct ublk_device *ub)
 	unsigned long index;
 
 	xa_for_each(&ub->bufs_xa, index, ubuf)
-		__ublk_ctrl_unreg_buf(ub, ubuf);
+		__ublk_ctrl_unreg_buf(ub, ubuf, index);
 	xa_destroy(&ub->bufs_xa);
 	mtree_destroy(&ub->buf_tree);
 }
-- 
2.53.0


  parent reply	other threads:[~2026-03-31 15:32 UTC|newest]

Thread overview: 19+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-03-31 15:31 [PATCH v2 00/10] ublk: add shared memory zero-copy support Ming Lei
2026-03-31 15:31 ` [PATCH v2 01/10] ublk: add UBLK_U_CMD_REG_BUF/UNREG_BUF control commands Ming Lei
2026-04-07 19:35   ` Caleb Sander Mateos
2026-03-31 15:31 ` [PATCH v2 02/10] ublk: add PFN-based buffer matching in I/O path Ming Lei
2026-04-07 19:47   ` Caleb Sander Mateos
2026-03-31 15:31 ` [PATCH v2 03/10] ublk: enable UBLK_F_SHMEM_ZC feature flag Ming Lei
2026-04-07 19:47   ` Caleb Sander Mateos
2026-03-31 15:31 ` Ming Lei [this message]
2026-04-07 19:50   ` [PATCH v2 04/10] ublk: eliminate permanent pages[] array from struct ublk_buf Caleb Sander Mateos
2026-03-31 15:31 ` [PATCH v2 05/10] selftests/ublk: add shared memory zero-copy support in kublk Ming Lei
2026-03-31 15:31 ` [PATCH v2 06/10] selftests/ublk: add UBLK_F_SHMEM_ZC support for loop target Ming Lei
2026-03-31 15:31 ` [PATCH v2 07/10] selftests/ublk: add shared memory zero-copy test Ming Lei
2026-03-31 15:31 ` [PATCH v2 08/10] selftests/ublk: add hugetlbfs shmem_zc test for loop target Ming Lei
2026-03-31 15:32 ` [PATCH v2 09/10] selftests/ublk: add filesystem fio verify test for shmem_zc Ming Lei
2026-03-31 15:32 ` [PATCH v2 10/10] selftests/ublk: add read-only buffer registration test Ming Lei
2026-04-07  2:38 ` [PATCH v2 00/10] ublk: add shared memory zero-copy support Ming Lei
2026-04-07 13:34   ` Jens Axboe
2026-04-07 19:29   ` Caleb Sander Mateos
2026-04-07 13:44 ` Jens Axboe

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260331153207.3635125-5-ming.lei@redhat.com \
    --to=ming.lei@redhat.com \
    --cc=axboe@kernel.dk \
    --cc=csander@purestorage.com \
    --cc=linux-block@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox