[PATCH 0/2] Allocate eb in 16k an 64k slabs, cache folio address

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 0/2] Allocate eb in 16k an 64k slabs, cache folio address
@ 2025-08-28 21:53 David Sterba
  2025-08-28 21:53 ` [PATCH 1/2] btrfs: add 16K and 64K slabs for extent buffers David Sterba
  2025-08-28 21:53 ` [PATCH 2/2] btrfs: use cached extent buffer folio address everywhere David Sterba
  0 siblings, 2 replies; 11+ messages in thread
From: David Sterba @ 2025-08-28 21:53 UTC (permalink / raw)
  To: linux-btrfs; +Cc: David Sterba

Optimiztion of eb folio access.

NOTE: does not work with both patches applied, btrfs/002 hangs and I
      don't see why. I'm sending this out in case somebody can spot it.

The extent buffer folio array is not used effectively on 16K node size,
so there are two slabs allocated for the default size and 64k for
everything else. With some pointer magic we can also cache the folio
addresses and this allows object code reduction. Numbers are in the
patches.

David Sterba (2):
  btrfs: add 16K and 64K slabs for extent buffers
  btrfs: use cached extent buffer folio address everywhere

 fs/btrfs/accessors.c | 12 +++---
 fs/btrfs/accessors.h |  5 +--
 fs/btrfs/ctree.c     |  2 +-
 fs/btrfs/disk-io.c   |  6 +--
 fs/btrfs/extent_io.c | 96 ++++++++++++++++++++++++++++++--------------
 fs/btrfs/extent_io.h | 20 ++++++---
 6 files changed, 92 insertions(+), 49 deletions(-)

-- 
2.51.0


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 1/2] btrfs: add 16K and 64K slabs for extent buffers
  2025-08-28 21:53 [PATCH 0/2] Allocate eb in 16k an 64k slabs, cache folio address David Sterba
@ 2025-08-28 21:53 ` David Sterba
  2025-08-28 22:26   ` Qu Wenruo
  2025-08-28 21:53 ` [PATCH 2/2] btrfs: use cached extent buffer folio address everywhere David Sterba
  1 sibling, 1 reply; 11+ messages in thread
From: David Sterba @ 2025-08-28 21:53 UTC (permalink / raw)
  To: linux-btrfs; +Cc: David Sterba

This is preparatory work to use cache folio address in the extent buffer
to avoid reading folio_address() all the time (this is hidden in the
accessors).

However this is not as straightforward as just adding the array, the
extent buffers are of variable size depending on the node size and
adding 16*8=128 bytes would bloat the whole structure too much.

We already waste 3/4 of the folios array on x86_64 and default node size
16K so we need to make the arrays variable size. This is allowed only
for one such array and it must be at the end of the structure. And we
need two.

With one indirection this is possible. For N folios in a node the
variable sized array will be 2N:

	faddr     ----+
	folios[]      |
	[0]           |
	[1]           |
	...           |
	[N]       <---+
	[N+1]
	...

There are 5 node sizes supported in total, but not all of them are used.
Create slabs only for 2 sizes where 16K is for the default size on
x86_64 and 64K. The one that contains the node size will be used,
possibly with some slack space.

In case of 16K node size we'll gain a few more objects per slab:

Before this change:

  sizeof (struct extent_buffer) = 240
  objects in 8K slab = 34

After, the base size of extent buffer is 120 bytes:

For 16K:

  size = 120 + 2 * 4 * 8 = 184
  objects in 8K slab = 44

For 64K:

  size = 120 + 2 * 16 * 8 = 376
  objects in 8K slab = 21

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 71 +++++++++++++++++++++++++++++++++-----------
 fs/btrfs/extent_io.h | 20 +++++++++----
 2 files changed, 68 insertions(+), 23 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index ca7174fa024056..4c906e5ea8ac70 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -35,7 +35,21 @@
 #include "super.h"
 #include "transaction.h"
 
-static struct kmem_cache *extent_buffer_cache;
+static struct kmem_cache *extent_buffer_cache_16k;
+static struct kmem_cache *extent_buffer_cache_64k;
+
+static void free_eb(struct extent_buffer *eb)
+{
+	if (!eb)
+		return;
+
+	if (test_bit(EXTENT_BUFFER_16K, &eb->bflags))
+		kmem_cache_free(extent_buffer_cache_16k, eb);
+	else if (test_bit(EXTENT_BUFFER_64K, &eb->bflags))
+		kmem_cache_free(extent_buffer_cache_64k, eb);
+	else
+		DEBUG_WARN("eb size mismatch");
+}
 
 #ifdef CONFIG_BTRFS_DEBUG
 static inline void btrfs_leak_debug_add_eb(struct extent_buffer *eb)
@@ -81,7 +95,7 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
 		       btrfs_header_owner(eb));
 		list_del(&eb->leak_list);
 		WARN_ON_ONCE(1);
-		kmem_cache_free(extent_buffer_cache, eb);
+		free_eb(eb);
 	}
 	spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
 }
@@ -221,11 +235,24 @@ static void submit_write_bio(struct btrfs_bio_ctrl *bio_ctrl, int ret)
 
 int __init extent_buffer_init_cachep(void)
 {
-	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
-						sizeof(struct extent_buffer), 0, 0,
-						NULL);
-	if (!extent_buffer_cache)
+	size_t size;
+
+	size  = sizeof(struct extent_buffer);
+	size += (SZ_16K >> PAGE_SHIFT) * sizeof(struct folio *);
+	size += (SZ_16K >> PAGE_SHIFT) * sizeof(void *);
+	extent_buffer_cache_16k = kmem_cache_create("btrfs_extent_buffer_16k",
+						    size, 0, 0, NULL);
+
+	size  = sizeof(struct extent_buffer);
+	size += (SZ_64K >> PAGE_SHIFT) * sizeof(struct folio *);
+	size += (SZ_64K >> PAGE_SHIFT) * sizeof(void *);
+	extent_buffer_cache_64k = kmem_cache_create("btrfs_extent_buffer_64k",
+						    size, 0, 0, NULL);
+
+	if (!extent_buffer_cache_16k || !extent_buffer_cache_64k) {
+		extent_buffer_free_cachep();
 		return -ENOMEM;
+	}
 
 	return 0;
 }
@@ -237,7 +264,8 @@ void __cold extent_buffer_free_cachep(void)
 	 * destroy caches.
 	 */
 	rcu_barrier();
-	kmem_cache_destroy(extent_buffer_cache);
+	kmem_cache_destroy(extent_buffer_cache_16k);
+	kmem_cache_destroy(extent_buffer_cache_64k);
 }
 
 static void process_one_folio(struct btrfs_fs_info *fs_info,
@@ -692,8 +720,10 @@ static int alloc_eb_folio_array(struct extent_buffer *eb, bool nofail)
 	if (ret < 0)
 		return ret;
 
-	for (int i = 0; i < num_pages; i++)
+	for (int i = 0; i < num_pages; i++) {
 		eb->folios[i] = page_folio(page_array[i]);
+		eb->faddr[i] = folio_address(eb->folios[i]);
+	}
 	eb->folio_size = PAGE_SIZE;
 	eb->folio_shift = PAGE_SHIFT;
 	return 0;
@@ -2192,7 +2222,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
 
 	prepare_eb_write(eb);
 
-	bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES,
+	bbio = btrfs_bio_alloc(num_extent_folios(eb),
 			       REQ_OP_WRITE | REQ_META | wbc_to_write_flags(wbc),
 			       eb->fs_info, end_bbio_meta_write, eb);
 	bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT;
@@ -2929,7 +2959,7 @@ static void btrfs_release_extent_buffer_folios(const struct extent_buffer *eb)
 {
 	ASSERT(!extent_buffer_under_io(eb));
 
-	for (int i = 0; i < INLINE_EXTENT_BUFFER_PAGES; i++) {
+	for (int i = 0; i < num_extent_folios(eb); i++) {
 		struct folio *folio = eb->folios[i];
 
 		if (!folio)
@@ -2946,7 +2976,7 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
 {
 	btrfs_release_extent_buffer_folios(eb);
 	btrfs_leak_debug_del_eb(eb);
-	kmem_cache_free(extent_buffer_cache, eb);
+	free_eb(eb);
 }
 
 static struct extent_buffer *__alloc_extent_buffer(struct btrfs_fs_info *fs_info,
@@ -2954,7 +2984,16 @@ static struct extent_buffer *__alloc_extent_buffer(struct btrfs_fs_info *fs_info
 {
 	struct extent_buffer *eb = NULL;
 
-	eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
+	ASSERT(fs_info->nodesize <= BTRFS_MAX_METADATA_BLOCKSIZE);
+	if (fs_info->nodesize <= SZ_16K) {
+		eb = kmem_cache_zalloc(extent_buffer_cache_16k, GFP_NOFS | __GFP_NOFAIL);
+		__set_bit(EXTENT_BUFFER_16K, &eb->bflags);
+		eb->faddr = (void **)(eb->folios + (SZ_16K >> PAGE_SHIFT));
+	} else {
+		eb = kmem_cache_zalloc(extent_buffer_cache_64k, GFP_NOFS | __GFP_NOFAIL);
+		__set_bit(EXTENT_BUFFER_64K, &eb->bflags);
+		eb->faddr = (void **)(eb->folios + (SZ_64K >> PAGE_SHIFT));
+	}
 	eb->start = start;
 	eb->len = fs_info->nodesize;
 	eb->fs_info = fs_info;
@@ -2965,8 +3004,6 @@ static struct extent_buffer *__alloc_extent_buffer(struct btrfs_fs_info *fs_info
 	spin_lock_init(&eb->refs_lock);
 	refcount_set(&eb->refs, 1);
 
-	ASSERT(eb->len <= BTRFS_MAX_METADATA_BLOCKSIZE);
-
 	return eb;
 }
 
@@ -3550,7 +3587,7 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
 	struct extent_buffer *eb =
 			container_of(head, struct extent_buffer, rcu_head);
 
-	kmem_cache_free(extent_buffer_cache, eb);
+	free_eb(eb);
 }
 
 static int release_extent_buffer(struct extent_buffer *eb)
@@ -3586,7 +3623,7 @@ static int release_extent_buffer(struct extent_buffer *eb)
 		btrfs_release_extent_buffer_folios(eb);
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 		if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) {
-			kmem_cache_free(extent_buffer_cache, eb);
+			free_eb(eb);
 			return 1;
 		}
 #endif
@@ -3837,7 +3874,7 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num,
 	check_buffer_tree_ref(eb);
 	refcount_inc(&eb->refs);
 
-	bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES,
+	bbio = btrfs_bio_alloc(num_extent_folios(eb),
 			       REQ_OP_READ | REQ_META, eb->fs_info,
 			       end_bbio_meta_read, eb);
 	bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 61130786b9a3ad..3903913924f02c 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -48,6 +48,9 @@ enum {
 	EXTENT_BUFFER_ZONED_ZEROOUT,
 	/* Indicate that extent buffer pages a being read */
 	EXTENT_BUFFER_READING,
+	/* Size of slab derived from fs_info->nodesize. */
+	EXTENT_BUFFER_16K,
+	EXTENT_BUFFER_64K,
 };
 
 /* these are flags for __process_pages_contig */
@@ -107,16 +110,21 @@ struct extent_buffer {
 
 	struct rw_semaphore lock;
 
-	/*
-	 * Pointers to all the folios of the extent buffer.
-	 *
-	 * For now the folio is always order 0 (aka, a single page).
-	 */
-	struct folio *folios[INLINE_EXTENT_BUFFER_PAGES];
 #ifdef CONFIG_BTRFS_DEBUG
 	struct list_head leak_list;
 	pid_t lock_owner;
 #endif
+	/*
+	 * Pointers to all folios of the extent buffer.
+	 *
+	 * For now the folio is always order 0 (a single page).
+	 *
+	 * Extent buffer size is determined at runtime, and allocated from the
+	 * right slab depending on "nodesize <= 16K". Cached folio address array
+	 * is stored after folios, @faddr is set up at allocation time.
+	 */
+	void **faddr;
+	struct folio *folios[];
 };
 
 struct btrfs_eb_write_context {
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/2] btrfs: add 16K and 64K slabs for extent buffers
  2025-08-28 21:53 ` [PATCH 1/2] btrfs: add 16K and 64K slabs for extent buffers David Sterba
@ 2025-08-28 22:26   ` Qu Wenruo
  2025-08-28 23:17     ` Qu Wenruo
  2025-08-28 23:30     ` David Sterba
  0 siblings, 2 replies; 11+ messages in thread
From: Qu Wenruo @ 2025-08-28 22:26 UTC (permalink / raw)
  To: David Sterba, linux-btrfs



在 2025/8/29 07:23, David Sterba 写道:
> This is preparatory work to use cache folio address in the extent buffer
> to avoid reading folio_address() all the time (this is hidden in the
> accessors).
> 
> However this is not as straightforward as just adding the array, the
> extent buffers are of variable size depending on the node size and
> adding 16*8=128 bytes would bloat the whole structure too much.
> 
> We already waste 3/4 of the folios array on x86_64 and default node size
> 16K so we need to make the arrays variable size. This is allowed only
> for one such array and it must be at the end of the structure. And we
> need two.
> 
> With one indirection this is possible. For N folios in a node the
> variable sized array will be 2N:

I'm OK with the variable folio pointer array, but why faddrs?

It looks like a little overkilled just to get rid of folio_address().


And to be honest, if we really want to reduce the complexity of 
folio_address(), we should go direct to large folios, which is much 
easier and less risky.
> 
> 	faddr     ----+
> 	folios[]      |
> 	[0]           |
> 	[1]           |
> 	...           |
> 	[N]       <---+
> 	[N+1]
> 	...
> 
> There are 5 node sizes supported in total, but not all of them are used.
> Create slabs only for 2 sizes where 16K is for the default size on
> x86_64 and 64K. The one that contains the node size will be used,
> possibly with some slack space.

I'm not a fan of two magic numbers.

Can we just make folios[] variable size first without bothering the 
cached folio address?
Especially you're mentioning the series do not pass btrfs/002.

Thanks,
Qu

> 
> In case of 16K node size we'll gain a few more objects per slab:
> 
> Before this change:
> 
>    sizeof (struct extent_buffer) = 240
>    objects in 8K slab = 34
> 
> After, the base size of extent buffer is 120 bytes:
> 
> For 16K:
> 
>    size = 120 + 2 * 4 * 8 = 184
>    objects in 8K slab = 44
> 
> For 64K:
> 
>    size = 120 + 2 * 16 * 8 = 376
>    objects in 8K slab = 21
> 
> Signed-off-by: David Sterba <dsterba@suse.com>
> ---
>   fs/btrfs/extent_io.c | 71 +++++++++++++++++++++++++++++++++-----------
>   fs/btrfs/extent_io.h | 20 +++++++++----
>   2 files changed, 68 insertions(+), 23 deletions(-)
> 
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index ca7174fa024056..4c906e5ea8ac70 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -35,7 +35,21 @@
>   #include "super.h"
>   #include "transaction.h"
>   
> -static struct kmem_cache *extent_buffer_cache;
> +static struct kmem_cache *extent_buffer_cache_16k;
> +static struct kmem_cache *extent_buffer_cache_64k;
> +
> +static void free_eb(struct extent_buffer *eb)
> +{
> +	if (!eb)
> +		return;
> +
> +	if (test_bit(EXTENT_BUFFER_16K, &eb->bflags))
> +		kmem_cache_free(extent_buffer_cache_16k, eb);
> +	else if (test_bit(EXTENT_BUFFER_64K, &eb->bflags))
> +		kmem_cache_free(extent_buffer_cache_64k, eb);
> +	else
> +		DEBUG_WARN("eb size mismatch");
> +}
>   
>   #ifdef CONFIG_BTRFS_DEBUG
>   static inline void btrfs_leak_debug_add_eb(struct extent_buffer *eb)
> @@ -81,7 +95,7 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
>   		       btrfs_header_owner(eb));
>   		list_del(&eb->leak_list);
>   		WARN_ON_ONCE(1);
> -		kmem_cache_free(extent_buffer_cache, eb);
> +		free_eb(eb);
>   	}
>   	spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
>   }
> @@ -221,11 +235,24 @@ static void submit_write_bio(struct btrfs_bio_ctrl *bio_ctrl, int ret)
>   
>   int __init extent_buffer_init_cachep(void)
>   {
> -	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
> -						sizeof(struct extent_buffer), 0, 0,
> -						NULL);
> -	if (!extent_buffer_cache)
> +	size_t size;
> +
> +	size  = sizeof(struct extent_buffer);
> +	size += (SZ_16K >> PAGE_SHIFT) * sizeof(struct folio *);
> +	size += (SZ_16K >> PAGE_SHIFT) * sizeof(void *);
> +	extent_buffer_cache_16k = kmem_cache_create("btrfs_extent_buffer_16k",
> +						    size, 0, 0, NULL);
> +
> +	size  = sizeof(struct extent_buffer);
> +	size += (SZ_64K >> PAGE_SHIFT) * sizeof(struct folio *);
> +	size += (SZ_64K >> PAGE_SHIFT) * sizeof(void *);
> +	extent_buffer_cache_64k = kmem_cache_create("btrfs_extent_buffer_64k",
> +						    size, 0, 0, NULL);
> +
> +	if (!extent_buffer_cache_16k || !extent_buffer_cache_64k) {
> +		extent_buffer_free_cachep();
>   		return -ENOMEM;
> +	}
>   
>   	return 0;
>   }
> @@ -237,7 +264,8 @@ void __cold extent_buffer_free_cachep(void)
>   	 * destroy caches.
>   	 */
>   	rcu_barrier();
> -	kmem_cache_destroy(extent_buffer_cache);
> +	kmem_cache_destroy(extent_buffer_cache_16k);
> +	kmem_cache_destroy(extent_buffer_cache_64k);
>   }
>   
>   static void process_one_folio(struct btrfs_fs_info *fs_info,
> @@ -692,8 +720,10 @@ static int alloc_eb_folio_array(struct extent_buffer *eb, bool nofail)
>   	if (ret < 0)
>   		return ret;
>   
> -	for (int i = 0; i < num_pages; i++)
> +	for (int i = 0; i < num_pages; i++) {
>   		eb->folios[i] = page_folio(page_array[i]);
> +		eb->faddr[i] = folio_address(eb->folios[i]);
> +	}
>   	eb->folio_size = PAGE_SIZE;
>   	eb->folio_shift = PAGE_SHIFT;
>   	return 0;
> @@ -2192,7 +2222,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
>   
>   	prepare_eb_write(eb);
>   
> -	bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES,
> +	bbio = btrfs_bio_alloc(num_extent_folios(eb),
>   			       REQ_OP_WRITE | REQ_META | wbc_to_write_flags(wbc),
>   			       eb->fs_info, end_bbio_meta_write, eb);
>   	bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT;
> @@ -2929,7 +2959,7 @@ static void btrfs_release_extent_buffer_folios(const struct extent_buffer *eb)
>   {
>   	ASSERT(!extent_buffer_under_io(eb));
>   
> -	for (int i = 0; i < INLINE_EXTENT_BUFFER_PAGES; i++) {
> +	for (int i = 0; i < num_extent_folios(eb); i++) {
>   		struct folio *folio = eb->folios[i];
>   
>   		if (!folio)
> @@ -2946,7 +2976,7 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
>   {
>   	btrfs_release_extent_buffer_folios(eb);
>   	btrfs_leak_debug_del_eb(eb);
> -	kmem_cache_free(extent_buffer_cache, eb);
> +	free_eb(eb);
>   }
>   
>   static struct extent_buffer *__alloc_extent_buffer(struct btrfs_fs_info *fs_info,
> @@ -2954,7 +2984,16 @@ static struct extent_buffer *__alloc_extent_buffer(struct btrfs_fs_info *fs_info
>   {
>   	struct extent_buffer *eb = NULL;
>   
> -	eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
> +	ASSERT(fs_info->nodesize <= BTRFS_MAX_METADATA_BLOCKSIZE);
> +	if (fs_info->nodesize <= SZ_16K) {
> +		eb = kmem_cache_zalloc(extent_buffer_cache_16k, GFP_NOFS | __GFP_NOFAIL);
> +		__set_bit(EXTENT_BUFFER_16K, &eb->bflags);
> +		eb->faddr = (void **)(eb->folios + (SZ_16K >> PAGE_SHIFT));
> +	} else {
> +		eb = kmem_cache_zalloc(extent_buffer_cache_64k, GFP_NOFS | __GFP_NOFAIL);
> +		__set_bit(EXTENT_BUFFER_64K, &eb->bflags);
> +		eb->faddr = (void **)(eb->folios + (SZ_64K >> PAGE_SHIFT));
> +	}
>   	eb->start = start;
>   	eb->len = fs_info->nodesize;
>   	eb->fs_info = fs_info;
> @@ -2965,8 +3004,6 @@ static struct extent_buffer *__alloc_extent_buffer(struct btrfs_fs_info *fs_info
>   	spin_lock_init(&eb->refs_lock);
>   	refcount_set(&eb->refs, 1);
>   
> -	ASSERT(eb->len <= BTRFS_MAX_METADATA_BLOCKSIZE);
> -
>   	return eb;
>   }
>   
> @@ -3550,7 +3587,7 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
>   	struct extent_buffer *eb =
>   			container_of(head, struct extent_buffer, rcu_head);
>   
> -	kmem_cache_free(extent_buffer_cache, eb);
> +	free_eb(eb);
>   }
>   
>   static int release_extent_buffer(struct extent_buffer *eb)
> @@ -3586,7 +3623,7 @@ static int release_extent_buffer(struct extent_buffer *eb)
>   		btrfs_release_extent_buffer_folios(eb);
>   #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
>   		if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) {
> -			kmem_cache_free(extent_buffer_cache, eb);
> +			free_eb(eb);
>   			return 1;
>   		}
>   #endif
> @@ -3837,7 +3874,7 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num,
>   	check_buffer_tree_ref(eb);
>   	refcount_inc(&eb->refs);
>   
> -	bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES,
> +	bbio = btrfs_bio_alloc(num_extent_folios(eb),
>   			       REQ_OP_READ | REQ_META, eb->fs_info,
>   			       end_bbio_meta_read, eb);
>   	bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT;
> diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
> index 61130786b9a3ad..3903913924f02c 100644
> --- a/fs/btrfs/extent_io.h
> +++ b/fs/btrfs/extent_io.h
> @@ -48,6 +48,9 @@ enum {
>   	EXTENT_BUFFER_ZONED_ZEROOUT,
>   	/* Indicate that extent buffer pages a being read */
>   	EXTENT_BUFFER_READING,
> +	/* Size of slab derived from fs_info->nodesize. */
> +	EXTENT_BUFFER_16K,
> +	EXTENT_BUFFER_64K,
>   };
>   
>   /* these are flags for __process_pages_contig */
> @@ -107,16 +110,21 @@ struct extent_buffer {
>   
>   	struct rw_semaphore lock;
>   
> -	/*
> -	 * Pointers to all the folios of the extent buffer.
> -	 *
> -	 * For now the folio is always order 0 (aka, a single page).
> -	 */
> -	struct folio *folios[INLINE_EXTENT_BUFFER_PAGES];
>   #ifdef CONFIG_BTRFS_DEBUG
>   	struct list_head leak_list;
>   	pid_t lock_owner;
>   #endif
> +	/*
> +	 * Pointers to all folios of the extent buffer.
> +	 *
> +	 * For now the folio is always order 0 (a single page).
> +	 *
> +	 * Extent buffer size is determined at runtime, and allocated from the
> +	 * right slab depending on "nodesize <= 16K". Cached folio address array
> +	 * is stored after folios, @faddr is set up at allocation time.
> +	 */
> +	void **faddr;
> +	struct folio *folios[];
>   };
>   
>   struct btrfs_eb_write_context {


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/2] btrfs: add 16K and 64K slabs for extent buffers
  2025-08-28 22:26   ` Qu Wenruo
@ 2025-08-28 23:17     ` Qu Wenruo
  2025-08-28 23:30     ` David Sterba
  1 sibling, 0 replies; 11+ messages in thread
From: Qu Wenruo @ 2025-08-28 23:17 UTC (permalink / raw)
  To: David Sterba, linux-btrfs



在 2025/8/29 07:56, Qu Wenruo 写道:
> 
> 
> 在 2025/8/29 07:23, David Sterba 写道:
>> This is preparatory work to use cache folio address in the extent buffer
>> to avoid reading folio_address() all the time (this is hidden in the
>> accessors).
>>
>> However this is not as straightforward as just adding the array, the
>> extent buffers are of variable size depending on the node size and
>> adding 16*8=128 bytes would bloat the whole structure too much.
>>
>> We already waste 3/4 of the folios array on x86_64 and default node size
>> 16K so we need to make the arrays variable size. This is allowed only
>> for one such array and it must be at the end of the structure. And we
>> need two.
>>
>> With one indirection this is possible. For N folios in a node the
>> variable sized array will be 2N:
> 
> I'm OK with the variable folio pointer array, but why faddrs?
> 
> It looks like a little overkilled just to get rid of folio_address().
> 
> 
> And to be honest, if we really want to reduce the complexity of 
> folio_address(), we should go direct to large folios, which is much 
> easier and less risky.
>>
>>     faddr     ----+
>>     folios[]      |
>>     [0]           |
>>     [1]           |
>>     ...           |
>>     [N]       <---+
>>     [N+1]
>>     ...
>>
>> There are 5 node sizes supported in total, but not all of them are used.
>> Create slabs only for 2 sizes where 16K is for the default size on
>> x86_64 and 64K. The one that contains the node size will be used,
>> possibly with some slack space.
> 
> I'm not a fan of two magic numbers.
> 
> Can we just make folios[] variable size first without bothering the 
> cached folio address?
> Especially you're mentioning the series do not pass btrfs/002.
> 
> Thanks,
> Qu
> 
>>
>> In case of 16K node size we'll gain a few more objects per slab:
>>
>> Before this change:
>>
>>    sizeof (struct extent_buffer) = 240
>>    objects in 8K slab = 34
>>
>> After, the base size of extent buffer is 120 bytes:
>>
>> For 16K:
>>
>>    size = 120 + 2 * 4 * 8 = 184
>>    objects in 8K slab = 44
>>
>> For 64K:
>>
>>    size = 120 + 2 * 16 * 8 = 376
>>    objects in 8K slab = 21
>>
>> Signed-off-by: David Sterba <dsterba@suse.com>
>> ---
>>   fs/btrfs/extent_io.c | 71 +++++++++++++++++++++++++++++++++-----------
>>   fs/btrfs/extent_io.h | 20 +++++++++----
>>   2 files changed, 68 insertions(+), 23 deletions(-)
>>
>> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
>> index ca7174fa024056..4c906e5ea8ac70 100644
>> --- a/fs/btrfs/extent_io.c
>> +++ b/fs/btrfs/extent_io.c
>> @@ -35,7 +35,21 @@
>>   #include "super.h"
>>   #include "transaction.h"
>> -static struct kmem_cache *extent_buffer_cache;
>> +static struct kmem_cache *extent_buffer_cache_16k;
>> +static struct kmem_cache *extent_buffer_cache_64k;
>> +
>> +static void free_eb(struct extent_buffer *eb)
>> +{
>> +    if (!eb)
>> +        return;
>> +
>> +    if (test_bit(EXTENT_BUFFER_16K, &eb->bflags))
>> +        kmem_cache_free(extent_buffer_cache_16k, eb);
>> +    else if (test_bit(EXTENT_BUFFER_64K, &eb->bflags))
>> +        kmem_cache_free(extent_buffer_cache_64k, eb);
>> +    else
>> +        DEBUG_WARN("eb size mismatch");
>> +}
>>   #ifdef CONFIG_BTRFS_DEBUG
>>   static inline void btrfs_leak_debug_add_eb(struct extent_buffer *eb)
>> @@ -81,7 +95,7 @@ void btrfs_extent_buffer_leak_debug_check(struct 
>> btrfs_fs_info *fs_info)
>>                  btrfs_header_owner(eb));
>>           list_del(&eb->leak_list);
>>           WARN_ON_ONCE(1);
>> -        kmem_cache_free(extent_buffer_cache, eb);
>> +        free_eb(eb);
>>       }
>>       spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
>>   }
>> @@ -221,11 +235,24 @@ static void submit_write_bio(struct 
>> btrfs_bio_ctrl *bio_ctrl, int ret)
>>   int __init extent_buffer_init_cachep(void)
>>   {
>> -    extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
>> -                        sizeof(struct extent_buffer), 0, 0,
>> -                        NULL);
>> -    if (!extent_buffer_cache)
>> +    size_t size;
>> +
>> +    size  = sizeof(struct extent_buffer);
>> +    size += (SZ_16K >> PAGE_SHIFT) * sizeof(struct folio *);
>> +    size += (SZ_16K >> PAGE_SHIFT) * sizeof(void *);

And this won't work for 64K page sized systems at all.

In fact there is no space to save on 64K page sized system, the folios[] 
array is always allocated with one single slot.

Please just use a folios pointer instead.

Thanks,
Qu

>> +    extent_buffer_cache_16k = 
>> kmem_cache_create("btrfs_extent_buffer_16k",
>> +                            size, 0, 0, NULL);
>> +
>> +    size  = sizeof(struct extent_buffer);
>> +    size += (SZ_64K >> PAGE_SHIFT) * sizeof(struct folio *);
>> +    size += (SZ_64K >> PAGE_SHIFT) * sizeof(void *);
>> +    extent_buffer_cache_64k = 
>> kmem_cache_create("btrfs_extent_buffer_64k",
>> +                            size, 0, 0, NULL);
>> +
>> +    if (!extent_buffer_cache_16k || !extent_buffer_cache_64k) {
>> +        extent_buffer_free_cachep();
>>           return -ENOMEM;
>> +    }
>>       return 0;
>>   }
>> @@ -237,7 +264,8 @@ void __cold extent_buffer_free_cachep(void)
>>        * destroy caches.
>>        */
>>       rcu_barrier();
>> -    kmem_cache_destroy(extent_buffer_cache);
>> +    kmem_cache_destroy(extent_buffer_cache_16k);
>> +    kmem_cache_destroy(extent_buffer_cache_64k);
>>   }
>>   static void process_one_folio(struct btrfs_fs_info *fs_info,
>> @@ -692,8 +720,10 @@ static int alloc_eb_folio_array(struct 
>> extent_buffer *eb, bool nofail)
>>       if (ret < 0)
>>           return ret;
>> -    for (int i = 0; i < num_pages; i++)
>> +    for (int i = 0; i < num_pages; i++) {
>>           eb->folios[i] = page_folio(page_array[i]);
>> +        eb->faddr[i] = folio_address(eb->folios[i]);
>> +    }
>>       eb->folio_size = PAGE_SIZE;
>>       eb->folio_shift = PAGE_SHIFT;
>>       return 0;
>> @@ -2192,7 +2222,7 @@ static noinline_for_stack void 
>> write_one_eb(struct extent_buffer *eb,
>>       prepare_eb_write(eb);
>> -    bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES,
>> +    bbio = btrfs_bio_alloc(num_extent_folios(eb),
>>                      REQ_OP_WRITE | REQ_META | wbc_to_write_flags(wbc),
>>                      eb->fs_info, end_bbio_meta_write, eb);
>>       bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT;
>> @@ -2929,7 +2959,7 @@ static void 
>> btrfs_release_extent_buffer_folios(const struct extent_buffer *eb)
>>   {
>>       ASSERT(!extent_buffer_under_io(eb));
>> -    for (int i = 0; i < INLINE_EXTENT_BUFFER_PAGES; i++) {
>> +    for (int i = 0; i < num_extent_folios(eb); i++) {
>>           struct folio *folio = eb->folios[i];
>>           if (!folio)
>> @@ -2946,7 +2976,7 @@ static inline void 
>> btrfs_release_extent_buffer(struct extent_buffer *eb)
>>   {
>>       btrfs_release_extent_buffer_folios(eb);
>>       btrfs_leak_debug_del_eb(eb);
>> -    kmem_cache_free(extent_buffer_cache, eb);
>> +    free_eb(eb);
>>   }
>>   static struct extent_buffer *__alloc_extent_buffer(struct 
>> btrfs_fs_info *fs_info,
>> @@ -2954,7 +2984,16 @@ static struct extent_buffer 
>> *__alloc_extent_buffer(struct btrfs_fs_info *fs_info
>>   {
>>       struct extent_buffer *eb = NULL;
>> -    eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
>> +    ASSERT(fs_info->nodesize <= BTRFS_MAX_METADATA_BLOCKSIZE);
>> +    if (fs_info->nodesize <= SZ_16K) {
>> +        eb = kmem_cache_zalloc(extent_buffer_cache_16k, GFP_NOFS | 
>> __GFP_NOFAIL);
>> +        __set_bit(EXTENT_BUFFER_16K, &eb->bflags);
>> +        eb->faddr = (void **)(eb->folios + (SZ_16K >> PAGE_SHIFT));
>> +    } else {
>> +        eb = kmem_cache_zalloc(extent_buffer_cache_64k, GFP_NOFS | 
>> __GFP_NOFAIL);
>> +        __set_bit(EXTENT_BUFFER_64K, &eb->bflags);
>> +        eb->faddr = (void **)(eb->folios + (SZ_64K >> PAGE_SHIFT));
>> +    }
>>       eb->start = start;
>>       eb->len = fs_info->nodesize;
>>       eb->fs_info = fs_info;
>> @@ -2965,8 +3004,6 @@ static struct extent_buffer 
>> *__alloc_extent_buffer(struct btrfs_fs_info *fs_info
>>       spin_lock_init(&eb->refs_lock);
>>       refcount_set(&eb->refs, 1);
>> -    ASSERT(eb->len <= BTRFS_MAX_METADATA_BLOCKSIZE);
>> -
>>       return eb;
>>   }
>> @@ -3550,7 +3587,7 @@ static inline void 
>> btrfs_release_extent_buffer_rcu(struct rcu_head *head)
>>       struct extent_buffer *eb =
>>               container_of(head, struct extent_buffer, rcu_head);
>> -    kmem_cache_free(extent_buffer_cache, eb);
>> +    free_eb(eb);
>>   }
>>   static int release_extent_buffer(struct extent_buffer *eb)
>> @@ -3586,7 +3623,7 @@ static int release_extent_buffer(struct 
>> extent_buffer *eb)
>>           btrfs_release_extent_buffer_folios(eb);
>>   #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
>>           if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) {
>> -            kmem_cache_free(extent_buffer_cache, eb);
>> +            free_eb(eb);
>>               return 1;
>>           }
>>   #endif
>> @@ -3837,7 +3874,7 @@ int read_extent_buffer_pages_nowait(struct 
>> extent_buffer *eb, int mirror_num,
>>       check_buffer_tree_ref(eb);
>>       refcount_inc(&eb->refs);
>> -    bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES,
>> +    bbio = btrfs_bio_alloc(num_extent_folios(eb),
>>                      REQ_OP_READ | REQ_META, eb->fs_info,
>>                      end_bbio_meta_read, eb);
>>       bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT;
>> diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
>> index 61130786b9a3ad..3903913924f02c 100644
>> --- a/fs/btrfs/extent_io.h
>> +++ b/fs/btrfs/extent_io.h
>> @@ -48,6 +48,9 @@ enum {
>>       EXTENT_BUFFER_ZONED_ZEROOUT,
>>       /* Indicate that extent buffer pages a being read */
>>       EXTENT_BUFFER_READING,
>> +    /* Size of slab derived from fs_info->nodesize. */
>> +    EXTENT_BUFFER_16K,
>> +    EXTENT_BUFFER_64K,
>>   };
>>   /* these are flags for __process_pages_contig */
>> @@ -107,16 +110,21 @@ struct extent_buffer {
>>       struct rw_semaphore lock;
>> -    /*
>> -     * Pointers to all the folios of the extent buffer.
>> -     *
>> -     * For now the folio is always order 0 (aka, a single page).
>> -     */
>> -    struct folio *folios[INLINE_EXTENT_BUFFER_PAGES];
>>   #ifdef CONFIG_BTRFS_DEBUG
>>       struct list_head leak_list;
>>       pid_t lock_owner;
>>   #endif
>> +    /*
>> +     * Pointers to all folios of the extent buffer.
>> +     *
>> +     * For now the folio is always order 0 (a single page).
>> +     *
>> +     * Extent buffer size is determined at runtime, and allocated 
>> from the
>> +     * right slab depending on "nodesize <= 16K". Cached folio 
>> address array
>> +     * is stored after folios, @faddr is set up at allocation time.
>> +     */
>> +    void **faddr;
>> +    struct folio *folios[];
>>   };
>>   struct btrfs_eb_write_context {
> 
> 


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/2] btrfs: add 16K and 64K slabs for extent buffers
  2025-08-28 22:26   ` Qu Wenruo
  2025-08-28 23:17     ` Qu Wenruo
@ 2025-08-28 23:30     ` David Sterba
  2025-08-28 23:34       ` Qu Wenruo
  1 sibling, 1 reply; 11+ messages in thread
From: David Sterba @ 2025-08-28 23:30 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: David Sterba, linux-btrfs

On Fri, Aug 29, 2025 at 07:56:31AM +0930, Qu Wenruo wrote:
> 在 2025/8/29 07:23, David Sterba 写道:
> > This is preparatory work to use cache folio address in the extent buffer
> > to avoid reading folio_address() all the time (this is hidden in the
> > accessors).
> > 
> > However this is not as straightforward as just adding the array, the
> > extent buffers are of variable size depending on the node size and
> > adding 16*8=128 bytes would bloat the whole structure too much.
> > 
> > We already waste 3/4 of the folios array on x86_64 and default node size
> > 16K so we need to make the arrays variable size. This is allowed only
> > for one such array and it must be at the end of the structure. And we
> > need two.
> > 
> > With one indirection this is possible. For N folios in a node the
> > variable sized array will be 2N:
> 
> I'm OK with the variable folio pointer array, but why faddrs?

It's explained in the 2nd patch, it saves the lookups and translation of
page/folio -> address

> It looks like a little overkilled just to get rid of folio_address().

The folio_address does not show up in profiles becaues it's completely
inlined but it's in each and every accessor.

> And to be honest, if we really want to reduce the complexity of 
> folio_address(), we should go direct to large folios, which is much 
> easier and less risky.

In terms of code perhaps it is but it's still a major step compared to
the individual pages or folios so I'd rather make a few more steps in
between.

> > 	faddr     ----+
> > 	folios[]      |
> > 	[0]           |
> > 	[1]           |
> > 	...           |
> > 	[N]       <---+
> > 	[N+1]
> > 	...
> > 
> > There are 5 node sizes supported in total, but not all of them are used.
> > Create slabs only for 2 sizes where 16K is for the default size on
> > x86_64 and 64K. The one that contains the node size will be used,
> > possibly with some slack space.
> 
> I'm not a fan of two magic numbers.

It's not magic rather than optimizing for the most common case which is
default of 16k and a fallback. It does not make sense to create a slab
for each supported node size while only 16k would be in use.

> Can we just make folios[] variable size first without bothering the 
> cached folio address?

Yes the first patch can be taken separately.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/2] btrfs: add 16K and 64K slabs for extent buffers
  2025-08-28 23:30     ` David Sterba
@ 2025-08-28 23:34       ` Qu Wenruo
  2025-08-28 23:45         ` David Sterba
  0 siblings, 1 reply; 11+ messages in thread
From: Qu Wenruo @ 2025-08-28 23:34 UTC (permalink / raw)
  To: dsterba, Qu Wenruo; +Cc: David Sterba, linux-btrfs



在 2025/8/29 09:00, David Sterba 写道:
> On Fri, Aug 29, 2025 at 07:56:31AM +0930, Qu Wenruo wrote:
>> 在 2025/8/29 07:23, David Sterba 写道:
>>> This is preparatory work to use cache folio address in the extent buffer
>>> to avoid reading folio_address() all the time (this is hidden in the
>>> accessors).
>>>
>>> However this is not as straightforward as just adding the array, the
>>> extent buffers are of variable size depending on the node size and
>>> adding 16*8=128 bytes would bloat the whole structure too much.
>>>
>>> We already waste 3/4 of the folios array on x86_64 and default node size
>>> 16K so we need to make the arrays variable size. This is allowed only
>>> for one such array and it must be at the end of the structure. And we
>>> need two.
>>>
>>> With one indirection this is possible. For N folios in a node the
>>> variable sized array will be 2N:
>>
>> I'm OK with the variable folio pointer array, but why faddrs?
> 
> It's explained in the 2nd patch, it saves the lookups and translation of
> page/folio -> address
> 
>> It looks like a little overkilled just to get rid of folio_address().
> 
> The folio_address does not show up in profiles becaues it's completely
> inlined but it's in each and every accessor.
> 
>> And to be honest, if we really want to reduce the complexity of
>> folio_address(), we should go direct to large folios, which is much
>> easier and less risky.
> 
> In terms of code perhaps it is but it's still a major step compared to
> the individual pages or folios so I'd rather make a few more steps in
> between.
> 
>>> 	faddr     ----+
>>> 	folios[]      |
>>> 	[0]           |
>>> 	[1]           |
>>> 	...           |
>>> 	[N]       <---+
>>> 	[N+1]
>>> 	...
>>>
>>> There are 5 node sizes supported in total, but not all of them are used.
>>> Create slabs only for 2 sizes where 16K is for the default size on
>>> x86_64 and 64K. The one that contains the node size will be used,
>>> possibly with some slack space.
>>
>> I'm not a fan of two magic numbers.
> 
> It's not magic rather than optimizing for the most common case which is
> default of 16k and a fallback. It does not make sense to create a slab
> for each supported node size while only 16k would be in use.

It is. And all the two magic numbers are all based on 4K page sized systems.

This doesn't look good for anything with larger page sizes.

E.g. on 64K page size systmes, the 16K makes no sense and will not 
allocate any space for the folios[] array.

That's why I hate this two magic numbers based solution.

> 
>> Can we just make folios[] variable size first without bothering the
>> cached folio address?
> 
> Yes the first patch can be taken separately.

Not really until it can properly handle all page sizes.

Thanks,
Qu


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/2] btrfs: add 16K and 64K slabs for extent buffers
  2025-08-28 23:34       ` Qu Wenruo
@ 2025-08-28 23:45         ` David Sterba
  2025-08-28 23:50           ` Qu Wenruo
  0 siblings, 1 reply; 11+ messages in thread
From: David Sterba @ 2025-08-28 23:45 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: dsterba, Qu Wenruo, David Sterba, linux-btrfs

On Fri, Aug 29, 2025 at 09:04:01AM +0930, Qu Wenruo wrote:
> >>> There are 5 node sizes supported in total, but not all of them are used.
> >>> Create slabs only for 2 sizes where 16K is for the default size on
> >>> x86_64 and 64K. The one that contains the node size will be used,
> >>> possibly with some slack space.
> >>
> >> I'm not a fan of two magic numbers.
> > 
> > It's not magic rather than optimizing for the most common case which is
> > default of 16k and a fallback. It does not make sense to create a slab
> > for each supported node size while only 16k would be in use.
> 
> It is. And all the two magic numbers are all based on 4K page sized systems.

No, it's based on default nodesize in mkfs.btrfs 16k.

> This doesn't look good for anything with larger page sizes.

Which is either 16k on macs and mayb ARMs or 64K on ARMs and Powerpcs.

> E.g. on 64K page size systmes, the 16K makes no sense and will not 
> allocate any space for the folios[] array.
> 
> That's why I hate this two magic numbers based solution.

On 16k page systems the array will have one entry, similar to what we
have now with 64k page systems because of how INLINE_EXTENT_BUFFER_PAGES
is calculated as BTRFS_MAX_METADATA_BLOCKSIZE / PAGE_SIZE

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/2] btrfs: add 16K and 64K slabs for extent buffers
  2025-08-28 23:45         ` David Sterba
@ 2025-08-28 23:50           ` Qu Wenruo
  2025-08-29  0:50             ` David Sterba
  0 siblings, 1 reply; 11+ messages in thread
From: Qu Wenruo @ 2025-08-28 23:50 UTC (permalink / raw)
  To: dsterba, Qu Wenruo; +Cc: David Sterba, linux-btrfs



在 2025/8/29 09:15, David Sterba 写道:
> On Fri, Aug 29, 2025 at 09:04:01AM +0930, Qu Wenruo wrote:
>>>>> There are 5 node sizes supported in total, but not all of them are used.
>>>>> Create slabs only for 2 sizes where 16K is for the default size on
>>>>> x86_64 and 64K. The one that contains the node size will be used,
>>>>> possibly with some slack space.
>>>>
>>>> I'm not a fan of two magic numbers.
>>>
>>> It's not magic rather than optimizing for the most common case which is
>>> default of 16k and a fallback. It does not make sense to create a slab
>>> for each supported node size while only 16k would be in use.
>>
>> It is. And all the two magic numbers are all based on 4K page sized systems.
> 
> No, it's based on default nodesize in mkfs.btrfs 16k.
> 
>> This doesn't look good for anything with larger page sizes.
> 
> Which is either 16k on macs and mayb ARMs or 64K on ARMs and Powerpcs.
> 
>> E.g. on 64K page size systmes, the 16K makes no sense and will not
>> allocate any space for the folios[] array.
>>
>> That's why I hate this two magic numbers based solution.
> 
> On 16k page systems the array will have one entry, similar to what we
> have now with 64k page systems because of how INLINE_EXTENT_BUFFER_PAGES
> is calculated as BTRFS_MAX_METADATA_BLOCKSIZE / PAGE_SIZE
> 

It still doesn't work well for 64K page sized systems. As I mentioned, 
the 16K cachep will have no space for any folio entry, and only 64K one 
makes sense.

If you really want to use kmem_cache infrastructure with variable sized 
array, please move the extent buffer cache to a per-fs basis, so we 
don't need all those weird "optimization", just a single eb cache no 
matter the page size/node size.

And also, please fix the size calculation to use round_up() to handle 
large page size more correctly.

Thanks,
Qu

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/2] btrfs: add 16K and 64K slabs for extent buffers
  2025-08-28 23:50           ` Qu Wenruo
@ 2025-08-29  0:50             ` David Sterba
  2025-08-29  1:07               ` Qu Wenruo
  0 siblings, 1 reply; 11+ messages in thread
From: David Sterba @ 2025-08-29  0:50 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: dsterba, Qu Wenruo, David Sterba, linux-btrfs

On Fri, Aug 29, 2025 at 09:20:13AM +0930, Qu Wenruo wrote:
> > On 16k page systems the array will have one entry, similar to what we
> > have now with 64k page systems because of how INLINE_EXTENT_BUFFER_PAGES
> > is calculated as BTRFS_MAX_METADATA_BLOCKSIZE / PAGE_SIZE
> 
> It still doesn't work well for 64K page sized systems. As I mentioned, 
> the 16K cachep will have no space for any folio entry, and only 64K one 
> makes sense.
> 
> If you really want to use kmem_cache infrastructure with variable sized 
> array, please move the extent buffer cache to a per-fs basis, so we 
> don't need all those weird "optimization", just a single eb cache no 
> matter the page size/node size.

This shifts away from the idea I wanted to implement, per-fs kmem cache
does not make sense to me due to the internal overhead of a kmem cache.
I hoped that 2 sizes would capture the most common page sizes combined
with node size with a little overhead of the indirection, I did not try
to do most efficient representation of the pages and ebs. I'll think if
there's an intersection what we both could find reasonable to implement
or will drop the patches.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/2] btrfs: add 16K and 64K slabs for extent buffers
  2025-08-29  0:50             ` David Sterba
@ 2025-08-29  1:07               ` Qu Wenruo
  0 siblings, 0 replies; 11+ messages in thread
From: Qu Wenruo @ 2025-08-29  1:07 UTC (permalink / raw)
  To: dsterba; +Cc: Qu Wenruo, David Sterba, linux-btrfs

在 2025/8/29 10:20, David Sterba 写道:
> On Fri, Aug 29, 2025 at 09:20:13AM +0930, Qu Wenruo wrote:
>>> On 16k page systems the array will have one entry, similar to what we
>>> have now with 64k page systems because of how INLINE_EXTENT_BUFFER_PAGES
>>> is calculated as BTRFS_MAX_METADATA_BLOCKSIZE / PAGE_SIZE
>>
>> It still doesn't work well for 64K page sized systems. As I mentioned,
>> the 16K cachep will have no space for any folio entry, and only 64K one
>> makes sense.
>>
>> If you really want to use kmem_cache infrastructure with variable sized
>> array, please move the extent buffer cache to a per-fs basis, so we
>> don't need all those weird "optimization", just a single eb cache no
>> matter the page size/node size.
> 
> This shifts away from the idea I wanted to implement, per-fs kmem cache
> does not make sense to me due to the internal overhead of a kmem cache.

Personally speaking, I do not think the current per-module kmem cache is 
that reasonable.

The per-module one makes eb leakage detection much harder, it can only 
be done at module unload time, not fs unmount time.

Yes, the current per-module one makes it much easier to share the eb 
pool across all fses, but it's not a perfect logical fit when node size 
is a per-fs attribute.

And to be honest, I prefer a logically easier-to-understand structure 
other than something that focuses on "optimization" (which sometimes are 
even immature), and easier-to-understand one always wins for long term 
maintainability (and more flex).

> I hoped that 2 sizes would capture the most common page sizes combined
> with node size with a little overhead of the indirection, I did not try
> to do most efficient representation of the pages and ebs. I'll think if
> there's an intersection what we both could find reasonable to implement
> or will drop the patches.

Or go with a kcallocated pointer. It will get the space saving (at least 
for x86_64) but still uses the per-module pool, but that also means 
extra error handling and reduce the benefit of kmem cache (we still need 
to extra memory allocation).

I can not find a perfect solution, but with the recent bs > ps support, 
I really start doubting about a lot of per-module designs.

I mean if the eb cache is already per-fs, we will never go with the 
compromise of 16K/64K solution in the first place.

Thanks,
Qu

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 2/2] btrfs: use cached extent buffer folio address everywhere
  2025-08-28 21:53 [PATCH 0/2] Allocate eb in 16k an 64k slabs, cache folio address David Sterba
  2025-08-28 21:53 ` [PATCH 1/2] btrfs: add 16K and 64K slabs for extent buffers David Sterba
@ 2025-08-28 21:53 ` David Sterba
  1 sibling, 0 replies; 11+ messages in thread
From: David Sterba @ 2025-08-28 21:53 UTC (permalink / raw)
  To: linux-btrfs; +Cc: David Sterba

Now that the eb folio address is cached, use it instead of calling
folio_address(). This avoids reading data from the vmemmap_base array
and using local pointers in eb that are possibly in the same or a near
cacheline.

This saves noticeable amount of .ko size (x86_64 release config):

     text	   data	    bss	    dec	    hex	filename
  1649205	 136841	  15592	1801638	 1b7da6	pre/btrfs.ko
  1639267	 136841	  15592	1791700	 1b56d4	post/btrfs.ko

  DELTA: -9938

Several core functions also have nice stack savings (selection);

  btrfs_get_16                                     -8 (16 -> 8)
  btrfs_get_32                                     -8 (24 -> 16)
  btrfs_get_64                                     -8 (24 -> 16)

Some functions have been inlined completely: btrfs_header_generation,
btrfs_header_bytenr, btrfs_header_owner, btrfs_header_level.

The use of folio_address() is hidden in all the accessors. The
translation of folio to its address involves looking up vmemmap_base,
bit shifts and dereference of page_offset_base. An example:

  3e:   4a 8b 44 d7 78          mov    0x78(%rdi,%r10,8),%rax
  43:   48 2b 05 00 00 00 00    sub    0x0(%rip),%rax        # 4a <btrfs_get_8+0x4a>
                        46: R_X86_64_PC32       vmemmap_base-0x4
  4a:   49 21 c9                and    %rcx,%r9
  4d:   48 83 c2 01             add    $0x1,%rdx
  51:   48 c1 f8 06             sar    $0x6,%rax
  55:   8b 4f 08                mov    0x8(%rdi),%ecx
  58:   48 c1 e0 0c             shl    $0xc,%rax
  5c:   48 03 05 00 00 00 00    add    0x0(%rip),%rax        # 63 <btrfs_get_8+0x63>
                        5f: R_X86_64_PC32       page_offset_base-0x4

While in the new code it's:

  44:   48 8b 77 70             mov    0x70(%rdi),%rsi
  48:   8b 4f 08                mov    0x8(%rdi),%ecx
  4b:   48 83 c2 01             add    $0x1,%rdx
  4f:   4a 03 04 d6             add    (%rsi,%r10,8),%rax

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/accessors.c | 12 ++++++------
 fs/btrfs/accessors.h |  5 ++---
 fs/btrfs/ctree.c     |  2 +-
 fs/btrfs/disk-io.c   |  6 +++---
 fs/btrfs/extent_io.c | 25 ++++++++++++-------------
 5 files changed, 24 insertions(+), 26 deletions(-)

diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c
index 1248aa2535d306..f1b47b9b5d9d80 100644
--- a/fs/btrfs/accessors.c
+++ b/fs/btrfs/accessors.c
@@ -56,7 +56,7 @@ u##bits btrfs_get_##bits(const struct extent_buffer *eb,		\
 	const unsigned long idx = get_eb_folio_index(eb, member_offset);\
 	const unsigned long oif = get_eb_offset_in_folio(eb,		\
 							 member_offset);\
-	char *kaddr = folio_address(eb->folios[idx]) + oif;		\
+	char *kaddr = eb->faddr[idx] + oif;				\
 	const int part = eb->folio_size - oif;				\
 	u8 lebytes[sizeof(u##bits)];					\
 									\
@@ -70,11 +70,11 @@ u##bits btrfs_get_##bits(const struct extent_buffer *eb,		\
 									\
 	if (sizeof(u##bits) == 2) {					\
 		lebytes[0] = *kaddr;					\
-		kaddr = folio_address(eb->folios[idx + 1]);		\
+		kaddr = eb->faddr[idx + 1];				\
 		lebytes[1] = *kaddr;					\
 	} else {							\
 		memcpy_split_src(lebytes, kaddr,			\
-				 folio_address(eb->folios[idx + 1]),	\
+				 eb->faddr[idx + 1],			\
 				 part, sizeof(u##bits));		\
 	}								\
 	return get_unaligned_le##bits(lebytes);				\
@@ -86,7 +86,7 @@ void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr,	\
 	const unsigned long idx = get_eb_folio_index(eb, member_offset);\
 	const unsigned long oif = get_eb_offset_in_folio(eb,		\
 							 member_offset);\
-	char *kaddr = folio_address(eb->folios[idx]) + oif;		\
+	char *kaddr = eb->faddr[idx] + oif;				\
 	const int part = eb->folio_size - oif;				\
 	u8 lebytes[sizeof(u##bits)];					\
 									\
@@ -102,11 +102,11 @@ void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr,	\
 	put_unaligned_le##bits(val, lebytes);				\
 	if (sizeof(u##bits) == 2) {					\
 		*kaddr = lebytes[0];					\
-		kaddr = folio_address(eb->folios[idx + 1]);		\
+		kaddr = eb->faddr[idx + 1];				\
 		*kaddr = lebytes[1];					\
 	} else {							\
 		memcpy(kaddr, lebytes, part);				\
-		kaddr = folio_address(eb->folios[idx + 1]);		\
+		kaddr = eb->faddr[idx + 1];				\
 		memcpy(kaddr, lebytes + part, sizeof(u##bits) - part);	\
 	}								\
 }
diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
index 99b3ced12805bb..bf0e82e87214dc 100644
--- a/fs/btrfs/accessors.h
+++ b/fs/btrfs/accessors.h
@@ -75,14 +75,13 @@ static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \
 #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits)		\
 static inline u##bits btrfs_##name(const struct extent_buffer *eb)	\
 {									\
-	const type *p = folio_address(eb->folios[0]) +			\
-			offset_in_page(eb->start);			\
+	const type *p = eb->faddr[0] + offset_in_page(eb->start);	\
 	return get_unaligned_le##bits(&p->member);			\
 }									\
 static inline void btrfs_set_##name(const struct extent_buffer *eb,	\
 				    u##bits val)			\
 {									\
-	type *p = folio_address(eb->folios[0]) + offset_in_page(eb->start); \
+	type *p = eb->faddr[0] + offset_in_page(eb->start);		\
 	put_unaligned_le##bits(val, &p->member);			\
 }
 
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6f9465d4ce54ce..1b095570747756 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -779,7 +779,7 @@ int btrfs_bin_search(const struct extent_buffer *eb, int first_slot,
 
 		if (oil + key_size <= unit_size) {
 			const unsigned long idx = get_eb_folio_index(eb, offset);
-			char *kaddr = folio_address(eb->folios[idx]);
+			char *kaddr = eb->faddr[idx];
 
 			oil = get_eb_offset_in_folio(eb, offset);
 			tmp = (struct btrfs_disk_key *)(kaddr + oil);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7b06bbc4089878..a12544c3fe62b2 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -88,7 +88,7 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result)
 		first_page_part = fs_info->nodesize;
 		num_pages = 1;
 	} else {
-		kaddr = folio_address(buf->folios[0]);
+		kaddr = buf->faddr[0];
 		first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
 		num_pages = num_extent_pages(buf);
 	}
@@ -103,7 +103,7 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result)
 	 * crypto_shash_update() already.
 	 */
 	for (i = 1; i < num_pages && INLINE_EXTENT_BUFFER_PAGES > 1; i++) {
-		kaddr = folio_address(buf->folios[i]);
+		kaddr = buf->faddr[i];
 		crypto_shash_update(shash, kaddr, PAGE_SIZE);
 	}
 	memset(result, 0, BTRFS_CSUM_SIZE);
@@ -393,7 +393,7 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb,
 	}
 
 	csum_tree_block(eb, result);
-	header_csum = folio_address(eb->folios[0]) +
+	header_csum = eb->faddr[0] +
 		get_eb_offset_in_folio(eb, offsetof(struct btrfs_header, csum));
 
 	if (memcmp(result, header_csum, csum_size) != 0) {
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4c906e5ea8ac70..c14ba0009c7a93 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3508,7 +3508,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 	/* All pages are physically contiguous, can skip cross page handling. */
 	if (page_contig)
-		eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start);
+		eb->addr = eb->faddr[0] + offset_in_page(eb->start);
 again:
 	xa_lock_irq(&fs_info->buffer_tree);
 	existing_eb = __xa_cmpxchg(&fs_info->buffer_tree,
@@ -3968,7 +3968,7 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
 		char *kaddr;
 
 		cur = min(len, unit_size - offset);
-		kaddr = folio_address(eb->folios[i]);
+		kaddr = eb->faddr[i];
 		memcpy(dst, kaddr + offset, cur);
 
 		dst += cur;
@@ -4004,7 +4004,7 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
 		char *kaddr;
 
 		cur = min(len, unit_size - offset);
-		kaddr = folio_address(eb->folios[i]);
+		kaddr = eb->faddr[i];
 		if (copy_to_user_nofault(dst, kaddr + offset, cur)) {
 			ret = -EFAULT;
 			break;
@@ -4040,7 +4040,7 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
 
 	while (len > 0) {
 		cur = min(len, unit_size - offset);
-		kaddr = folio_address(eb->folios[i]);
+		kaddr = eb->faddr[i];
 		ret = memcmp(ptr, kaddr + offset, cur);
 		if (ret)
 			break;
@@ -4119,7 +4119,7 @@ static void __write_extent_buffer(const struct extent_buffer *eb,
 			assert_eb_folio_uptodate(eb, i);
 
 		cur = min(len, unit_size - offset);
-		kaddr = folio_address(eb->folios[i]);
+		kaddr = eb->faddr[i];
 		if (use_memmove)
 			memmove(kaddr + offset, src, cur);
 		else
@@ -4155,7 +4155,7 @@ static void memset_extent_buffer(const struct extent_buffer *eb, int c,
 		unsigned int cur_len = min(start + len - cur, unit_size - offset);
 
 		assert_eb_folio_uptodate(eb, index);
-		memset(folio_address(eb->folios[index]) + offset, c, cur_len);
+		memset(eb->faddr[index] + offset, c, cur_len);
 
 		cur += cur_len;
 	}
@@ -4181,7 +4181,7 @@ void copy_extent_buffer_full(const struct extent_buffer *dst,
 		unsigned long index = get_eb_folio_index(src, cur);
 		unsigned long offset = get_eb_offset_in_folio(src, cur);
 		unsigned long cur_len = min(src->len, unit_size - offset);
-		void *addr = folio_address(src->folios[index]) + offset;
+		void *addr = src->faddr[index] + offset;
 
 		write_extent_buffer(dst, addr, cur, cur_len);
 
@@ -4214,7 +4214,7 @@ void copy_extent_buffer(const struct extent_buffer *dst,
 
 		cur = min(len, (unsigned long)(unit_size - offset));
 
-		kaddr = folio_address(dst->folios[i]);
+		kaddr = dst->faddr[i];
 		read_extent_buffer(src, kaddr + offset, src_offset, cur);
 
 		src_offset += cur;
@@ -4272,7 +4272,7 @@ bool extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
 
 	eb_bitmap_offset(eb, start, nr, &i, &offset);
 	assert_eb_folio_uptodate(eb, i);
-	kaddr = folio_address(eb->folios[i]);
+	kaddr = eb->faddr[i];
 	return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
 }
 
@@ -4282,7 +4282,7 @@ static u8 *extent_buffer_get_byte(const struct extent_buffer *eb, unsigned long
 
 	if (check_eb_range(eb, bytenr, 1))
 		return NULL;
-	return folio_address(eb->folios[index]) + get_eb_offset_in_folio(eb, bytenr);
+	return eb->faddr[index] + get_eb_offset_in_folio(eb, bytenr);
 }
 
 /*
@@ -4390,7 +4390,7 @@ void memcpy_extent_buffer(const struct extent_buffer *dst,
 		unsigned long folio_off = get_eb_offset_in_folio(dst, cur_src);
 		unsigned long cur_len = min(src_offset + len - cur_src,
 					    unit_size - folio_off);
-		void *src_addr = folio_address(dst->folios[folio_index]) + folio_off;
+		void *src_addr = dst->faddr[folio_index] + folio_off;
 		const bool use_memmove = areas_overlap(src_offset + cur_off,
 						       dst_offset + cur_off, cur_len);
 
@@ -4437,8 +4437,7 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
 		cur = min_t(unsigned long, len, src_off_in_folio + 1);
 		cur = min(cur, dst_off_in_folio + 1);
 
-		src_addr = folio_address(dst->folios[src_i]) + src_off_in_folio -
-					 cur + 1;
+		src_addr = dst->faddr[src_i] + src_off_in_folio - cur + 1;
 		use_memmove = areas_overlap(src_end - cur + 1, dst_end - cur + 1,
 					    cur);
 
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2025-08-29  1:07 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-08-28 21:53 [PATCH 0/2] Allocate eb in 16k an 64k slabs, cache folio address David Sterba
2025-08-28 21:53 ` [PATCH 1/2] btrfs: add 16K and 64K slabs for extent buffers David Sterba
2025-08-28 22:26   ` Qu Wenruo
2025-08-28 23:17     ` Qu Wenruo
2025-08-28 23:30     ` David Sterba
2025-08-28 23:34       ` Qu Wenruo
2025-08-28 23:45         ` David Sterba
2025-08-28 23:50           ` Qu Wenruo
2025-08-29  0:50             ` David Sterba
2025-08-29  1:07               ` Qu Wenruo
2025-08-28 21:53 ` [PATCH 2/2] btrfs: use cached extent buffer folio address everywhere David Sterba

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.