From: Hannes Reinecke <hare@suse.de>
To: Matthew Wilcox <willy@infradead.org>
Cc: linux-fsdevel@vger.kernel.org, linux-block@vger.kernel.org,
	Andrew Morton <akpm@linux-foundation.org>,
	Christoph Hellwig <hch@lst.de>,
	Luis Chamberlain <mcgrof@kernel.org>,
	Pankaj Raghav <p.raghav@samsung.com>
Subject: [PATCH 1/7] brd: use XArray instead of radix-tree to index backing pages
Date: Wed, 14 Jun 2023 13:46:31 +0200	[thread overview]
Message-ID: <20230614114637.89759-2-hare@suse.de> (raw)
In-Reply-To: <20230614114637.89759-1-hare@suse.de>

From: Pankaj Raghav <p.raghav@samsung.com>

XArray was introduced to hold a large array of pointers behind a simple API.
The XArray API also provides array semantics, which simplifies the way we
store and access the backing pages and makes the code significantly easier
to understand.
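
For readers new to the API, here is a minimal sketch of the array-style
calls involved (illustrative only; demo_pages and demo_xarray_usage are
hypothetical names, not code from this patch):

	#include <linux/xarray.h>
	#include <linux/mm.h>

	static DEFINE_XARRAY(demo_pages);

	static void demo_xarray_usage(pgoff_t idx, struct page *page)
	{
		struct page *p;
		unsigned long i;

		/* store: the XArray allocates its internal nodes itself */
		if (xa_is_err(xa_store(&demo_pages, idx, page, GFP_KERNEL)))
			return;

		/* lookup: a plain indexed load, no preload or external lock */
		p = xa_load(&demo_pages, idx);

		/* iterate over every present entry, in index order */
		xa_for_each(&demo_pages, i, p)
			__free_page(p);

		xa_destroy(&demo_pages);
	}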

No performance difference was noticed between the two implementations
using fio with direct=1 [1].

[1] Performance in KIOPS:

          |  radix-tree |    XArray  |   Diff
          |             |            |
write     |    315      |     313    |   -0.6%
randwrite |    286      |     290    |   +1.3%
read      |    330      |     335    |   +1.5%
randread  |    309      |     312    |   +0.9%
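
The one place in the conversion that needs care is the insert path, which
has to cope with a racing insert at the same index. A simplified sketch of
that pattern (illustrative only; demo_insert_page is a hypothetical helper,
the real change is in brd_insert_page() in the diff below):

	#include <linux/xarray.h>
	#include <linux/mm.h>

	/* Install page at idx unless another thread got there first. */
	static int demo_insert_page(struct xarray *pages, pgoff_t idx,
				    struct page *page, gfp_t gfp)
	{
		struct page *cur;
		int ret = 0;

		xa_lock(pages);
		/* Only succeeds if the slot is still empty (old == NULL). */
		cur = __xa_cmpxchg(pages, idx, NULL, page, gfp);
		if (cur) {
			/* Allocation failure or somebody beat us to it. */
			__free_page(page);
			ret = xa_err(cur);
		}
		xa_unlock(pages);

		return ret;
	}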

Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
---
 drivers/block/brd.c | 93 ++++++++++++---------------------------------
 1 file changed, 24 insertions(+), 69 deletions(-)

diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index bcad9b926b0c..2f71376afc71 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -19,7 +19,7 @@
 #include <linux/highmem.h>
 #include <linux/mutex.h>
 #include <linux/pagemap.h>
-#include <linux/radix-tree.h>
+#include <linux/xarray.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/backing-dev.h>
@@ -28,7 +28,7 @@
 #include <linux/uaccess.h>
 
 /*
- * Each block ramdisk device has a radix_tree brd_pages of pages that stores
+ * Each block ramdisk device has an xarray brd_pages of pages that stores
  * the pages containing the block device's contents. A brd page's ->index is
  * its offset in PAGE_SIZE units. This is similar to, but in no way connected
  * with, the kernel's pagecache or buffer cache (which sit above our block
@@ -40,11 +40,9 @@ struct brd_device {
 	struct list_head	brd_list;
 
 	/*
-	 * Backing store of pages and lock to protect it. This is the contents
-	 * of the block device.
+	 * Backing store of pages. This is the contents of the block device.
 	 */
-	spinlock_t		brd_lock;
-	struct radix_tree_root	brd_pages;
+	struct xarray	        brd_pages;
 	u64			brd_nr_pages;
 };
 
@@ -56,21 +54,8 @@ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
 	pgoff_t idx;
 	struct page *page;
 
-	/*
-	 * The page lifetime is protected by the fact that we have opened the
-	 * device node -- brd pages will never be deleted under us, so we
-	 * don't need any further locking or refcounting.
-	 *
-	 * This is strictly true for the radix-tree nodes as well (ie. we
-	 * don't actually need the rcu_read_lock()), however that is not a
-	 * documented feature of the radix-tree API so it is better to be
-	 * safe here (we don't have total exclusion from radix tree updates
-	 * here, only deletes).
-	 */
-	rcu_read_lock();
 	idx = sector >> PAGE_SECTORS_SHIFT; /* sector to page index */
-	page = radix_tree_lookup(&brd->brd_pages, idx);
-	rcu_read_unlock();
+	page = xa_load(&brd->brd_pages, idx);
 
 	BUG_ON(page && page->index != idx);
 
@@ -83,7 +68,7 @@ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
 static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp)
 {
 	pgoff_t idx;
-	struct page *page;
+	struct page *page, *cur;
 	int ret = 0;
 
 	page = brd_lookup_page(brd, sector);
@@ -94,71 +79,42 @@ static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp)
 	if (!page)
 		return -ENOMEM;
 
-	if (radix_tree_maybe_preload(gfp)) {
-		__free_page(page);
-		return -ENOMEM;
-	}
+	xa_lock(&brd->brd_pages);
 
-	spin_lock(&brd->brd_lock);
 	idx = sector >> PAGE_SECTORS_SHIFT;
 	page->index = idx;
-	if (radix_tree_insert(&brd->brd_pages, idx, page)) {
+
+	cur = __xa_cmpxchg(&brd->brd_pages, idx, NULL, page, gfp);
+
+	if (unlikely(cur)) {
 		__free_page(page);
-		page = radix_tree_lookup(&brd->brd_pages, idx);
-		if (!page)
-			ret = -ENOMEM;
-		else if (page->index != idx)
+		ret = xa_err(cur);
+		if (!ret && (cur->index != idx))
 			ret = -EIO;
 	} else {
 		brd->brd_nr_pages++;
 	}
-	spin_unlock(&brd->brd_lock);
 
-	radix_tree_preload_end();
+	xa_unlock(&brd->brd_pages);
+
 	return ret;
 }
 
 /*
- * Free all backing store pages and radix tree. This must only be called when
+ * Free all backing store pages and xarray. This must only be called when
  * there are no other users of the device.
  */
-#define FREE_BATCH 16
 static void brd_free_pages(struct brd_device *brd)
 {
-	unsigned long pos = 0;
-	struct page *pages[FREE_BATCH];
-	int nr_pages;
-
-	do {
-		int i;
-
-		nr_pages = radix_tree_gang_lookup(&brd->brd_pages,
-				(void **)pages, pos, FREE_BATCH);
-
-		for (i = 0; i < nr_pages; i++) {
-			void *ret;
-
-			BUG_ON(pages[i]->index < pos);
-			pos = pages[i]->index;
-			ret = radix_tree_delete(&brd->brd_pages, pos);
-			BUG_ON(!ret || ret != pages[i]);
-			__free_page(pages[i]);
-		}
-
-		pos++;
+	struct page *page;
+	pgoff_t idx;
 
-		/*
-		 * It takes 3.4 seconds to remove 80GiB ramdisk.
-		 * So, we need cond_resched to avoid stalling the CPU.
-		 */
-		cond_resched();
+	xa_for_each(&brd->brd_pages, idx, page) {
+		__free_page(page);
+		cond_resched_rcu();
+	}
 
-		/*
-		 * This assumes radix_tree_gang_lookup always returns as
-		 * many pages as possible. If the radix-tree code changes,
-		 * so will this have to.
-		 */
-	} while (nr_pages == FREE_BATCH);
+	xa_destroy(&brd->brd_pages);
 }
 
 /*
@@ -372,8 +328,7 @@ static int brd_alloc(int i)
 	brd->brd_number		= i;
 	list_add_tail(&brd->brd_list, &brd_devices);
 
-	spin_lock_init(&brd->brd_lock);
-	INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC);
+	xa_init(&brd->brd_pages);
 
 	snprintf(buf, DISK_NAME_LEN, "ram%d", i);
 	if (!IS_ERR_OR_NULL(brd_debugfs_dir))
-- 
2.35.3


Thread overview: 40+ messages
2023-06-14 11:46 [PATCH 0/7] RFC: high-order folio support for I/O Hannes Reinecke
2023-06-14 11:46 ` Hannes Reinecke [this message]
2023-06-14 12:45   ` [PATCH 1/7] brd: use XArray instead of radix-tree to index backing pages Matthew Wilcox
2023-06-14 12:50     ` Pankaj Raghav
2023-06-14 13:03       ` Hannes Reinecke
2023-06-14 11:46 ` [PATCH 2/7] brd: convert to folios Hannes Reinecke
2023-06-14 13:45   ` Matthew Wilcox
2023-06-14 13:50     ` Hannes Reinecke
2023-06-14 11:46 ` [PATCH 3/7] brd: abstract page_size conventions Hannes Reinecke
2023-06-14 11:46 ` [PATCH 4/7] brd: make sector size configurable Hannes Reinecke
2023-06-14 12:55   ` Matthew Wilcox
2023-06-14 13:02     ` Hannes Reinecke
2023-06-15  2:17   ` Dave Chinner
2023-06-15  5:55     ` Christoph Hellwig
2023-06-15  6:33       ` Hannes Reinecke
2023-06-15  6:23     ` Hannes Reinecke
2023-06-14 11:46 ` [PATCH 5/7] brd: make logical " Hannes Reinecke
2023-06-14 11:46 ` [PATCH 6/7] mm/filemap: allocate folios with mapping blocksize Hannes Reinecke
     [not found]   ` <CGME20230619080901eucas1p224e67aa31866d2ad8d259b2209c2db67@eucas1p2.samsung.com>
2023-06-19  8:08     ` Pankaj Raghav
2023-06-19  8:42       ` Hannes Reinecke
2023-06-19 22:57         ` Dave Chinner
2023-06-20  0:00           ` Matthew Wilcox
2023-06-20  5:57           ` Hannes Reinecke
2023-06-14 11:46 ` [PATCH 7/7] mm/readahead: align readahead down to " Hannes Reinecke
2023-06-14 13:17 ` [PATCH 0/7] RFC: high-order folio support for I/O Hannes Reinecke
2023-06-14 13:53   ` Matthew Wilcox
2023-06-14 15:06     ` Hannes Reinecke
2023-06-14 15:35       ` Hannes Reinecke
2023-06-14 17:46         ` Matthew Wilcox
2023-06-14 23:53       ` Dave Chinner
2023-06-15  6:21         ` Hannes Reinecke
2023-06-15  8:51           ` Dave Chinner
2023-06-16 16:06             ` Kent Overstreet
2023-06-15  3:44       ` Dave Chinner
2023-06-14 13:48 ` [PATCH 1/2] highmem: Add memcpy_to_folio() Matthew Wilcox (Oracle)
2023-06-14 18:38   ` kernel test robot
2023-06-14 19:30   ` kernel test robot
2023-06-15  5:58   ` Christoph Hellwig
2023-06-15 12:16     ` Matthew Wilcox
2023-06-14 13:48 ` [PATCH 2/2] highmem: Add memcpy_from_folio() Matthew Wilcox (Oracle)
