* [PATCH 2/5] btrfs: add struct btrfs_eb_prealloc
2026-06-23 22:35 [PATCH 0/5] allocate extent_buffer GFP_NOFAIL with unlocked retry Boris Burkov
2026-06-23 22:35 ` [PATCH 1/5] btrfs: factor init_extent_buffer from __alloc_extent_buffer Boris Burkov
@ 2026-06-23 22:35 ` Boris Burkov
2026-06-23 22:35 ` [PATCH 3/5] btrfs: enable unlocked NOFAIL retry for eb allocations Boris Burkov
` (2 subsequent siblings)
4 siblings, 0 replies; 6+ messages in thread
From: Boris Burkov @ 2026-06-23 22:35 UTC (permalink / raw)
To: linux-btrfs, kernel-team
In further preparation for supporting NOFAIL allocations with retries
outside the critical section, add a struct to carry the extent_buffer
and btrfs_folio_state we need to allocate.
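Refactor the allocation pathways to use the new struct with no intended
functional change, and wire empty prealloc structs in from callers.
One visible difference: a folio order mismatch (-EAGAIN from
attach_eb_folio_to_filemap()) now fails the allocation instead of looping
back to reallocate. This is unreachable while the btree inode only uses
0-order folios.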
Signed-off-by: Boris Burkov <boris@bur.io>
---
fs/btrfs/ctree.c | 21 +++--
fs/btrfs/disk-io.c | 6 +-
fs/btrfs/disk-io.h | 2 +
fs/btrfs/extent-tree.c | 6 +-
fs/btrfs/extent_io.c | 179 ++++++++++++++++++++++++++++-------------
fs/btrfs/extent_io.h | 19 +++++
fs/btrfs/tree-log.c | 3 +-
7 files changed, 168 insertions(+), 68 deletions(-)
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 49fb6b816aa9..261ef4ec7d1b 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1460,6 +1460,7 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
*/
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
+ struct btrfs_eb_prealloc *pa,
struct extent_buffer **eb_ret, int slot,
const struct btrfs_key *key)
{
@@ -1546,7 +1547,8 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
if (p->reada != READA_NONE)
reada_for_search(fs_info, p, parent_level, slot, key->objectid);
- tmp = btrfs_find_create_tree_block(fs_info, blocknr, check.owner_root, check.level);
+ tmp = btrfs_find_create_tree_block(fs_info, pa, blocknr,
+ check.owner_root, check.level);
if (IS_ERR(tmp)) {
ret = PTR_ERR(tmp);
tmp = NULL;
@@ -2004,6 +2006,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
u8 lowest_level = 0;
int min_write_lock_level;
int prev_cmp;
+ struct btrfs_eb_prealloc pa = { 0 };
if (!root)
return -EINVAL;
@@ -2187,7 +2190,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
goto done;
}
- ret2 = read_block_for_search(root, p, &b, slot, key);
+ ret2 = read_block_for_search(root, p, &pa, &b, slot, key);
if (ret2 == -EAGAIN && !p->nowait) {
trace_btrfs_search_slot_restart(root, level, "read_block");
goto again;
@@ -2234,6 +2237,8 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
ret = ret2;
}
+ btrfs_free_eb_prealloc(&pa);
+
return ret;
}
ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
@@ -2259,6 +2264,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
int level;
int lowest_unlock = 1;
u8 lowest_level = 0;
+ struct btrfs_eb_prealloc pa = { 0 };
lowest_level = p->lowest_level;
WARN_ON(p->nodes[0] != NULL);
@@ -2316,7 +2322,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
goto done;
}
- ret2 = read_block_for_search(root, p, &b, slot, key);
+ ret2 = read_block_for_search(root, p, &pa, &b, slot, key);
if (ret2 == -EAGAIN && !p->nowait)
goto again;
if (ret2) {
@@ -2339,6 +2345,8 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
if (ret < 0)
btrfs_release_path(p);
+ btrfs_free_eb_prealloc(&pa);
+
return ret;
}
@@ -4780,6 +4788,7 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
struct extent_buffer *next;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
+ struct btrfs_eb_prealloc pa = { 0 };
bool need_commit_sem = false;
u32 nritems;
int ret;
@@ -4880,7 +4889,7 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
}
next = c;
- ret = read_block_for_search(root, path, &next, slot, &key);
+ ret = read_block_for_search(root, path, &pa, &next, slot, &key);
if (ret == -EAGAIN && !path->nowait)
goto again;
@@ -4923,7 +4932,7 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
if (!level)
break;
- ret = read_block_for_search(root, path, &next, 0, &key);
+ ret = read_block_for_search(root, path, &pa, &next, 0, &key);
if (ret == -EAGAIN && !path->nowait)
goto again;
@@ -4956,6 +4965,8 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
ret = ret2;
}
+ btrfs_free_eb_prealloc(&pa);
+
return ret;
}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0a7d80da9c94..60307fe6a685 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -590,12 +590,13 @@ static const struct address_space_operations btree_aops = {
struct extent_buffer *btrfs_find_create_tree_block(
struct btrfs_fs_info *fs_info,
+ struct btrfs_eb_prealloc *pa,
u64 bytenr, u64 owner_root,
int level)
{
if (btrfs_is_testing(fs_info))
return alloc_test_extent_buffer(fs_info, bytenr);
- return alloc_extent_buffer(fs_info, bytenr, owner_root, level);
+ return alloc_extent_buffer(fs_info, pa, bytenr, owner_root, level);
}
/*
@@ -608,12 +609,13 @@ struct extent_buffer *btrfs_find_create_tree_block(
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
struct btrfs_tree_parent_check *check)
{
+ struct btrfs_eb_prealloc pa = { 0 };
struct extent_buffer *buf = NULL;
int ret;
ASSERT(check);
- buf = btrfs_find_create_tree_block(fs_info, bytenr, check->owner_root,
+ buf = btrfs_find_create_tree_block(fs_info, &pa, bytenr, check->owner_root,
check->level);
if (IS_ERR(buf))
return buf;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 9185f8f02eeb..290508894f7c 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -15,6 +15,7 @@
struct block_device;
struct super_block;
struct extent_buffer;
+struct btrfs_eb_prealloc;
struct btrfs_device;
struct btrfs_fs_devices;
struct btrfs_fs_info;
@@ -48,6 +49,7 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
struct btrfs_tree_parent_check *check);
struct extent_buffer *btrfs_find_create_tree_block(
struct btrfs_fs_info *fs_info,
+ struct btrfs_eb_prealloc *pa,
u64 bytenr, u64 owner_root,
int level);
int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 624d76e0ca01..9f7e50f53f92 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5260,10 +5260,11 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_eb_prealloc pa = { 0 };
struct extent_buffer *buf;
u64 lockdep_owner = owner;
- buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level);
+ buf = btrfs_find_create_tree_block(fs_info, &pa, bytenr, owner, level);
if (IS_ERR(buf))
return buf;
@@ -5917,6 +5918,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
struct walk_control *wc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_eb_prealloc pa = { 0 };
u64 bytenr;
u64 generation;
u64 owner_root = 0;
@@ -5939,7 +5941,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
- next = btrfs_find_create_tree_block(fs_info, bytenr, btrfs_root_id(root),
+ next = btrfs_find_create_tree_block(fs_info, &pa, bytenr, btrfs_root_id(root),
level - 1);
if (IS_ERR(next))
return PTR_ERR(next);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index fa4cc8bcd1af..4d6bd7535a65 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3374,7 +3374,7 @@ static bool check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
* The caller needs to free the existing folios and retry using the same order.
*/
static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
- struct btrfs_folio_state *prealloc,
+ struct btrfs_eb_prealloc *pa,
struct extent_buffer **found_eb_ret)
{
@@ -3396,6 +3396,7 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
if (!ret)
goto finish;
+ /* ret == -EEXIST: a folio already lives at this index. */
existing_folio = filemap_lock_folio(mapping, index + i);
/* The page cache only exists for a very short time, just retry. */
if (IS_ERR(existing_folio))
@@ -3404,7 +3405,27 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
/* For now, we should only have single-page folios for btree inode. */
ASSERT(folio_nr_pages(existing_folio) == 1);
+ /*
+ * TODO: Special handling for a corner case where the order of
+ * folios mismatch between the new eb and filemap.
+ *
+ * This happens when:
+ *
+ * - the new eb is using higher order folio
+ *
+ * - the filemap is still using 0-order folios for the range
+ * This can happen at the previous eb allocation, and we don't
+ * have higher order folio for the call.
+ *
+ * - the existing eb has already been freed
+ *
+ * In this case, we have to free the existing folios first, and
+ * re-allocate using the same order.
+ * Thankfully this is not going to happen yet, as we're still
+ * using 0-order folios.
+ */
if (folio_size(existing_folio) != eb->folio_size) {
+ DEBUG_WARN("folio order mismatch between new eb and filemap");
folio_unlock(existing_folio);
folio_put(existing_folio);
return -EAGAIN;
@@ -3435,8 +3456,10 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
eb->folio_size = folio_size(eb->folios[i]);
eb->folio_shift = folio_shift(eb->folios[i]);
/* Should not fail, as we have preallocated the memory. */
- ret = attach_extent_buffer_folio(eb, eb->folios[i], prealloc);
+ ret = attach_extent_buffer_folio(eb, eb->folios[i], pa->bfs);
ASSERT(!ret);
+ /* The subpage state, if any, is now attached to the folio or freed. */
+ pa->bfs = NULL;
/*
* To inform we have an extra eb under allocation, so that
* detach_extent_buffer_page() won't release the folio private when the
@@ -3451,17 +3474,94 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
return 0;
}
+/*
+ * Allocate the extent_buffer, its folios, and btrfs_folio_state, if needed.
+ *
+ * Return 0 on success and a negative errno otherwise. On failure, pa->eb/bfs
+ * will be NULL.
+ */
+int btrfs_init_eb_prealloc(struct btrfs_fs_info *fs_info,
+ struct btrfs_eb_prealloc *pa)
+{
+ int ret;
+
+ ASSERT(!pa->eb, "unexpected non-null eb: %p", pa->eb);
+ ASSERT(!pa->bfs, "unexpected non-null bfs: %p", pa->bfs);
+
+ pa->eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS | __GFP_NOFAIL);
+ /* alloc_eb_folio_array() needs len; init_extent_buffer() sets it again later. */
+ pa->eb->len = fs_info->nodesize;
+
+ /*
+ * Preallocate folio private for subpage case, so that we won't
+ * allocate memory with i_private_lock or page lock held.
+ *
+ * The memory will be attached or freed by attach_extent_buffer_folio(),
+ * or freed manually if we exit earlier.
+ */
+ if (btrfs_meta_is_subpage(fs_info)) {
+ pa->bfs = btrfs_alloc_folio_state(fs_info, PAGE_SIZE,
+ BTRFS_SUBPAGE_METADATA);
+ if (IS_ERR(pa->bfs)) {
+ ret = PTR_ERR(pa->bfs);
+ pa->bfs = NULL;
+ goto free_eb;
+ }
+ }
+
+ /*
+ * Allocate all pages first. These will be attached to btree_inode->i_mapping
+ * below (added to LRU, served by btree_migrate_folio), so request
+ * __GFP_MOVABLE so the page allocator places them in MOVABLE pageblocks.
+ */
+ ret = alloc_eb_folio_array(pa->eb, GFP_NOFS | __GFP_NOFAIL | __GFP_MOVABLE);
+ if (ret < 0)
+ goto free_bfs;
+
+ return 0;
+
+free_bfs:
+ btrfs_free_folio_state(pa->bfs);
+ pa->bfs = NULL;
+free_eb:
+ kmem_cache_free(extent_buffer_cache, pa->eb);
+ pa->eb = NULL;
+ return ret;
+}
+
+/*
+ * Clean up a btrfs_eb_prealloc whose contents were allocated but never used:
+ * folios not yet attached, eb/bfs not yet consumed, and refs still 0.
+ *
+ * Safe to call on a fully used btrfs_eb_prealloc as the internal structs will
+ * be null once they are owned by the context using them.
+ */
+void btrfs_free_eb_prealloc(struct btrfs_eb_prealloc *pa)
+{
+ if (!pa->eb)
+ return;
+
+ for (int i = 0; i < num_extent_pages(pa->eb); i++) {
+ if (pa->eb->folios[i])
+ folio_put(pa->eb->folios[i]);
+ }
+ btrfs_free_folio_state(pa->bfs);
+ kmem_cache_free(extent_buffer_cache, pa->eb);
+ pa->eb = NULL;
+ pa->bfs = NULL;
+}
+
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
+ struct btrfs_eb_prealloc *pa,
u64 start, u64 owner_root, int level)
{
int attached = 0;
struct extent_buffer *eb;
struct extent_buffer *existing_eb = NULL;
- struct btrfs_folio_state *prealloc = NULL;
u64 lockdep_owner = owner_root;
bool page_contig = true;
bool uptodate = true;
- int ret;
+ int ret = 0;
if (check_eb_alignment(fs_info, start))
return ERR_PTR(-EINVAL);
@@ -3481,7 +3581,13 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
if (eb)
return eb;
- eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS | __GFP_NOFAIL);
+ if (!pa->eb) {
+ ret = btrfs_init_eb_prealloc(fs_info, pa);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+ eb = pa->eb;
+ pa->eb = NULL;
init_extent_buffer(fs_info, eb, start);
/*
@@ -3493,66 +3599,18 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level);
- /*
- * Preallocate folio private for subpage case, so that we won't
- * allocate memory with i_private_lock nor page lock hold.
- *
- * The memory will be freed by attach_extent_buffer_page() or freed
- * manually if we exit earlier.
- */
- if (btrfs_meta_is_subpage(fs_info)) {
- prealloc = btrfs_alloc_folio_state(fs_info, PAGE_SIZE, BTRFS_SUBPAGE_METADATA);
- if (IS_ERR(prealloc)) {
- ret = PTR_ERR(prealloc);
- goto out;
- }
- }
-
-reallocate:
- /*
- * Allocate all pages first. These will be attached to btree_inode->i_mapping
- * below (added to LRU, served by btree_migrate_folio), so request
- * __GFP_MOVABLE so the page allocator places them in MOVABLE pageblocks.
- */
- ret = alloc_eb_folio_array(eb, GFP_NOFS | __GFP_NOFAIL | __GFP_MOVABLE);
- if (ret < 0) {
- btrfs_free_folio_state(prealloc);
- goto out;
- }
-
/* Attach all pages to the filemap. */
for (int i = 0; i < num_extent_folios(eb); i++) {
struct folio *folio;
- ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb);
+ ret = attach_eb_folio_to_filemap(eb, i, pa, &existing_eb);
if (ret > 0) {
ASSERT(existing_eb);
goto out;
}
-
- /*
- * TODO: Special handling for a corner case where the order of
- * folios mismatch between the new eb and filemap.
- *
- * This happens when:
- *
- * - the new eb is using higher order folio
- *
- * - the filemap is still using 0-order folios for the range
- * This can happen at the previous eb allocation, and we don't
- * have higher order folio for the call.
- *
- * - the existing eb has already been freed
- *
- * In this case, we have to free the existing folios first, and
- * re-allocate using the same order.
- * Thankfully this is not going to happen yet, as we're still
- * using 0-order folios.
- */
- if (unlikely(ret == -EAGAIN)) {
- DEBUG_WARN("folio order mismatch between new eb and filemap");
- goto reallocate;
- }
+ /* -EAGAIN: folio order mismatch, unreachable with 0-order folios. */
+ if (ret < 0)
+ goto out;
attached++;
/*
@@ -3629,6 +3687,10 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
out:
WARN_ON(!refcount_dec_and_test(&eb->refs));
+ /* attach hands off pa->bfs; free it if we bailed first. */
+ btrfs_free_folio_state(pa->bfs);
+ pa->bfs = NULL;
+
/*
* Any attached folios need to be detached before we unlock them. This
* is because when we're inserting our new folios into the mapping, and
@@ -4660,6 +4722,7 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
.level = level,
.transid = gen
};
+ struct btrfs_eb_prealloc pa = { 0 };
struct extent_buffer *eb;
int ret;
@@ -4668,7 +4731,7 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
check.has_first_key = true;
}
- eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
+ eb = btrfs_find_create_tree_block(fs_info, &pa, bytenr, owner_root, level);
if (IS_ERR(eb))
return;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 9896e15ddc40..26d941504ae1 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -119,6 +119,21 @@ struct extent_buffer {
#endif
};
+/*
+ * Wrapper struct for managing the preallocation of an extent_buffer, its
+ * folios and a btrfs_folio_state if needed.
+ *
+ * Only used to mediate allocation, do not refer to the eb directly if not
+ * returned from a successful eb allocating API.
+ *
+ * The eb folios and bfs should generally not be fully attached, except briefly
+ * before they are NULLed in the struct after successful attachment.
+ */
+struct btrfs_eb_prealloc {
+ struct extent_buffer *eb;
+ struct btrfs_folio_state *bfs;
+};
+
struct btrfs_eb_write_context {
struct writeback_control *wbc;
struct extent_buffer *eb;
@@ -266,7 +281,11 @@ int set_folio_extent_mapped(struct folio *folio);
void clear_folio_extent_mapped(struct folio *folio);
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
+ struct btrfs_eb_prealloc *pa,
u64 start, u64 owner_root, int level);
+int btrfs_init_eb_prealloc(struct btrfs_fs_info *fs_info,
+ struct btrfs_eb_prealloc *pa);
+void btrfs_free_eb_prealloc(struct btrfs_eb_prealloc *pa);
struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 875e4ddc68ea..fd5bd593389c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2986,6 +2986,7 @@ static noinline int walk_down_log_tree(struct btrfs_path *path, int *level,
{
struct btrfs_trans_handle *trans = wc->trans;
struct btrfs_fs_info *fs_info = wc->log->fs_info;
+ struct btrfs_eb_prealloc pa = { 0 };
u64 bytenr;
u64 ptr_gen;
struct extent_buffer *next;
@@ -3010,7 +3011,7 @@ static noinline int walk_down_log_tree(struct btrfs_path *path, int *level,
check.has_first_key = true;
btrfs_node_key_to_cpu(cur, &check.first_key, path->slots[*level]);
- next = btrfs_find_create_tree_block(fs_info, bytenr,
+ next = btrfs_find_create_tree_block(fs_info, &pa, bytenr,
btrfs_header_owner(cur),
*level - 1);
if (IS_ERR(next)) {
--
2.54.0
^ permalink raw reply related [flat|nested] 6+ messages in thread* [PATCH 3/5] btrfs: enable unlocked NOFAIL retry for eb allocations
2026-06-23 22:35 [PATCH 0/5] allocate extent_buffer GFP_NOFAIL with unlocked retry Boris Burkov
2026-06-23 22:35 ` [PATCH 1/5] btrfs: factor init_extent_buffer from __alloc_extent_buffer Boris Burkov
2026-06-23 22:35 ` [PATCH 2/5] btrfs: add struct btrfs_eb_prealloc Boris Burkov
@ 2026-06-23 22:35 ` Boris Burkov
2026-06-23 22:35 ` [PATCH 4/5] btrfs: probe with GFP_NOWAIT for tree block readahead Boris Burkov
2026-06-23 22:35 ` [PATCH 5/5] btrfs: use GFP_NOWAIT when inhibiting eb writeback Boris Burkov
4 siblings, 0 replies; 6+ messages in thread
From: Boris Burkov @ 2026-06-23 22:35 UTC (permalink / raw)
To: linux-btrfs, kernel-team
Now that we have the btrfs_eb_prealloc struct to carry the allocation
and the "needs prealloc" signal, wire that up between the various
search_slot style callers down into alloc_extent_buffer.
If the prealloc struct indicates that it supports a nowait attempt, then
alloc_extent_buffer() first tries to allocate with GFP_NOWAIT. If that
succeeds, great. Otherwise, it returns -EAGAIN and signals via the struct
that preallocation is required. The caller then does the allocation with
GFP_NOFS | __GFP_NOFAIL at its retry point and tries again with the eb,
bfs, and folios wired through in the prealloc struct.
If unlock-and-allocate retries are not supported then we just use the
normal gfp flags like before.
Note that there are still two GFP_NOFS allocations, as far as I know,
that happen under the lock and cannot be preallocated:
- the __xa_cmpxchg to insert the eb into the eb xarray
- the xarray allocations for filemap_add_folio to add the folios to
the btree_inode mapping.
We could handle the former with xa_reserve() if we signaled the "prealloc
start" back up to the retry point. However, since the filemap xarray has no
concept of reservation, the latter allocation would remain under the lock
either way, so reserving only the former did not seem worthwhile. Both are
relatively small, cached slab allocations, so hopefully we can move the
needle on reclaim stalls without reserving them.
Signed-off-by: Boris Burkov <boris@bur.io>
---
fs/btrfs/ctree.c | 21 ++++++++++++++++++---
fs/btrfs/extent_io.c | 27 +++++++++++++++++++++------
fs/btrfs/extent_io.h | 6 +++++-
fs/btrfs/subpage.c | 7 ++++---
fs/btrfs/subpage.h | 3 ++-
5 files changed, 50 insertions(+), 14 deletions(-)
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 261ef4ec7d1b..8fe330d81b8f 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2006,7 +2006,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
u8 lowest_level = 0;
int min_write_lock_level;
int prev_cmp;
- struct btrfs_eb_prealloc pa = { 0 };
+ struct btrfs_eb_prealloc pa = { .supports_nowait = true };
if (!root)
return -EINVAL;
@@ -2061,6 +2061,11 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
}
again:
+ if (pa.needs_prealloc) {
+ ret = btrfs_init_eb_prealloc(fs_info, &pa, false);
+ if (ret)
+ goto done;
+ }
prev_cmp = -1;
b = btrfs_search_slot_get_root(root, p, write_lock_level);
if (IS_ERR(b)) {
@@ -2264,7 +2269,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
int level;
int lowest_unlock = 1;
u8 lowest_level = 0;
- struct btrfs_eb_prealloc pa = { 0 };
+ struct btrfs_eb_prealloc pa = { .supports_nowait = true };
lowest_level = p->lowest_level;
WARN_ON(p->nodes[0] != NULL);
@@ -2276,6 +2281,11 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
}
again:
+ if (pa.needs_prealloc) {
+ ret = btrfs_init_eb_prealloc(fs_info, &pa, false);
+ if (ret)
+ goto done;
+ }
b = btrfs_get_old_root(root, time_seq);
if (unlikely(!b)) {
ret = -EIO;
@@ -4788,7 +4798,7 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
struct extent_buffer *next;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
- struct btrfs_eb_prealloc pa = { 0 };
+ struct btrfs_eb_prealloc pa = { .supports_nowait = true };
bool need_commit_sem = false;
u32 nritems;
int ret;
@@ -4807,6 +4817,11 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
+ if (pa.needs_prealloc) {
+ ret = btrfs_init_eb_prealloc(fs_info, &pa, false);
+ if (ret)
+ goto done;
+ }
level = 1;
next = NULL;
btrfs_release_path(path);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4d6bd7535a65..94fa9a6c3978 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3477,18 +3477,28 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
/*
* Allocate the extent_buffer, its folios, and btrfs_folio_state, if needed.
*
+ * @pa: The holder struct to do the allocation in.
+ * @nowait: Whether to do a speculative GFP_NOWAIT allocation while holding locks.
+ *
* Return 0 on success and a negative errno otherwise. On failure, pa->eb/bfs
- * will be NULL.
+ * will be NULL. If @nowait=true, then on ENOMEM, mark @pa->needs_prealloc and
+ * return -EAGAIN to signal the caller to unlock and retry.
*/
int btrfs_init_eb_prealloc(struct btrfs_fs_info *fs_info,
- struct btrfs_eb_prealloc *pa)
+ struct btrfs_eb_prealloc *pa, bool nowait)
{
+ gfp_t gfp = nowait ? GFP_NOWAIT : GFP_NOFS | __GFP_NOFAIL;
int ret;
ASSERT(!pa->eb, "unexpected non-null eb: %p", pa->eb);
ASSERT(!pa->bfs, "unexpected non-null bfs: %p", pa->bfs);
+ pa->needs_prealloc = false;
- pa->eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS | __GFP_NOFAIL);
+ pa->eb = kmem_cache_zalloc(extent_buffer_cache, gfp);
+ if (!pa->eb) {
+ ret = -ENOMEM;
+ goto out;
+ }
/* alloc_eb_folio_array() needs len; init_extent_buffer() sets it again later. */
pa->eb->len = fs_info->nodesize;
@@ -3501,7 +3511,7 @@ int btrfs_init_eb_prealloc(struct btrfs_fs_info *fs_info,
*/
if (btrfs_meta_is_subpage(fs_info)) {
pa->bfs = btrfs_alloc_folio_state(fs_info, PAGE_SIZE,
- BTRFS_SUBPAGE_METADATA);
+ BTRFS_SUBPAGE_METADATA, gfp);
if (IS_ERR(pa->bfs)) {
ret = PTR_ERR(pa->bfs);
pa->bfs = NULL;
@@ -3514,7 +3524,7 @@ int btrfs_init_eb_prealloc(struct btrfs_fs_info *fs_info,
* below (added to LRU, served by btree_migrate_folio), so request
* __GFP_MOVABLE so the page allocator places them in MOVABLE pageblocks.
*/
- ret = alloc_eb_folio_array(pa->eb, GFP_NOFS | __GFP_NOFAIL | __GFP_MOVABLE);
+ ret = alloc_eb_folio_array(pa->eb, gfp | __GFP_MOVABLE);
if (ret < 0)
goto free_bfs;
@@ -3526,6 +3536,11 @@ int btrfs_init_eb_prealloc(struct btrfs_fs_info *fs_info,
free_eb:
kmem_cache_free(extent_buffer_cache, pa->eb);
pa->eb = NULL;
+out:
+ if (nowait && ret == -ENOMEM) {
+ pa->needs_prealloc = true;
+ ret = -EAGAIN;
+ }
return ret;
}
@@ -3582,7 +3597,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
return eb;
if (!pa->eb) {
- ret = btrfs_init_eb_prealloc(fs_info, pa);
+ ret = btrfs_init_eb_prealloc(fs_info, pa, pa->supports_nowait);
if (ret)
return ERR_PTR(ret);
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 26d941504ae1..a8bbd2e0aff0 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -132,6 +132,10 @@ struct extent_buffer {
struct btrfs_eb_prealloc {
struct extent_buffer *eb;
struct btrfs_folio_state *bfs;
+ /* eb alloc may use GFP_NOWAIT; caller can drop locks and retry. */
+ bool supports_nowait;
+ /* GFP_NOWAIT eb alloc failed; preallocate again and retry. */
+ bool needs_prealloc;
};
struct btrfs_eb_write_context {
@@ -284,7 +288,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
struct btrfs_eb_prealloc *pa,
u64 start, u64 owner_root, int level);
int btrfs_init_eb_prealloc(struct btrfs_fs_info *fs_info,
- struct btrfs_eb_prealloc *pa);
+ struct btrfs_eb_prealloc *pa, bool nowait);
void btrfs_free_eb_prealloc(struct btrfs_eb_prealloc *pa);
struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index 56060acac2e9..13323d94a56e 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -59,7 +59,7 @@ int btrfs_attach_folio_state(const struct btrfs_fs_info *fs_info,
if (type == BTRFS_SUBPAGE_DATA && !btrfs_is_subpage(fs_info, folio))
return 0;
- bfs = btrfs_alloc_folio_state(fs_info, folio_size(folio), type);
+ bfs = btrfs_alloc_folio_state(fs_info, folio_size(folio), type, GFP_NOFS);
if (IS_ERR(bfs))
return PTR_ERR(bfs);
@@ -86,7 +86,8 @@ void btrfs_detach_folio_state(const struct btrfs_fs_info *fs_info, struct folio
}
struct btrfs_folio_state *btrfs_alloc_folio_state(const struct btrfs_fs_info *fs_info,
- size_t fsize, enum btrfs_folio_type type)
+ size_t fsize, enum btrfs_folio_type type,
+ gfp_t gfp)
{
struct btrfs_folio_state *ret;
unsigned int real_size;
@@ -96,7 +97,7 @@ struct btrfs_folio_state *btrfs_alloc_folio_state(const struct btrfs_fs_info *fs
real_size = struct_size(ret, bitmaps,
BITS_TO_LONGS(btrfs_bitmap_nr_max *
(fsize >> fs_info->sectorsize_bits)));
- ret = kzalloc(real_size, GFP_NOFS);
+ ret = kzalloc(real_size, gfp);
if (!ret)
return ERR_PTR(-ENOMEM);
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index c6d7394e6418..dc0e9c33210e 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -102,7 +102,8 @@ void btrfs_detach_folio_state(const struct btrfs_fs_info *fs_info, struct folio
/* Allocate additional data where page represents more than one sector */
struct btrfs_folio_state *btrfs_alloc_folio_state(const struct btrfs_fs_info *fs_info,
- size_t fsize, enum btrfs_folio_type type);
+ size_t fsize, enum btrfs_folio_type type,
+ gfp_t gfp);
static inline void btrfs_free_folio_state(struct btrfs_folio_state *bfs)
{
kfree(bfs);
--
2.54.0
^ permalink raw reply related [flat|nested] 6+ messages in thread