From: Qu Wenruo <wqu@suse.com>
To: linux-btrfs@vger.kernel.org
Subject: [PATCH PoC 2/6] btrfs: add delayed ordered extent support
Date: Mon, 9 Mar 2026 09:32:51 +1030 [thread overview]
Message-ID: <a51572ec3227b356f5808bcfcfdfd0ede065932e.1773009120.git.wqu@suse.com> (raw)
In-Reply-To: <cover.1773009120.git.wqu@suse.com>
A delayed ordered extent acts as a place holder ordered extent, thus its
disk_bytenr/disk_num_bytes will all be zero.
A real OE inside the delayed OE will be added to the child list of the
delayed OE.
When the parent delayed OE finished, child OEs will be finished and have
their valid file extents inserted.
And if some range has no child OE, e.g. beyond i_size, the range without
child OE will be manually released.
Signed-off-by: Qu Wenruo <wqu@suse.com>
---
fs/btrfs/inode.c | 66 +++++++++++++++++++++++++++++++++++++
fs/btrfs/ordered-data.c | 73 ++++++++++++++++++++++++++++++++++++++---
fs/btrfs/ordered-data.h | 14 ++++++++
3 files changed, 149 insertions(+), 4 deletions(-)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0551b8e755ed..4876c136f819 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3162,6 +3162,69 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
update_inode_bytes, oe->qgroup_rsv);
}
+static int finish_delayed_ordered(struct btrfs_ordered_extent *oe)
+{
+ struct btrfs_inode *inode = oe->inode;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_ordered_extent *child;
+ struct btrfs_ordered_extent *tmp;
+ const u32 nr_bits = oe->num_bytes >> fs_info->sectorsize_bits;
+ bool io_error = test_bit(BTRFS_ORDERED_IOERR, &oe->flags);
+ u64 cur = oe->file_offset;
+ int ret = 0;
+ int saved_ret = 0;
+
+ /* Finish each child OE. */
+ list_for_each_entry_safe(child, tmp, &oe->child_list, child_list) {
+ list_del_init(&child->child_list);
+ refcount_inc(&child->refs);
+
+ /* The range should have been marked in the bitmap. */
+ ASSERT(bitmap_test_range_all_set(oe->child_bitmap,
+ (child->file_offset - oe->file_offset) >> fs_info->sectorsize_bits,
+ child->num_bytes >> fs_info->sectorsize_bits));
+
+ if (io_error)
+ set_bit(BTRFS_ORDERED_IOERR, &child->flags);
+
+ ret = btrfs_finish_one_ordered(child);
+ if (ret && !saved_ret)
+ saved_ret = ret;
+ }
+
+ /* For ranges that doesn't have a child OE, manually clean them up. */
+ while (cur < oe->file_offset + oe->num_bytes) {
+ const u32 cur_bit = (cur - oe->file_offset) >> fs_info->sectorsize_bits;
+ u32 first_zero;
+ u32 next_set;
+ u64 range_start;
+ u64 range_end;
+ u32 range_len;
+
+ first_zero = find_next_zero_bit(oe->child_bitmap, nr_bits, cur_bit);
+ if (first_zero >= nr_bits)
+ break;
+ next_set = find_next_bit(oe->child_bitmap, nr_bits, first_zero);
+ ASSERT(next_set > first_zero);
+
+ range_start = oe->file_offset + (first_zero << fs_info->sectorsize_bits);
+ range_len = (next_set - first_zero) << fs_info->sectorsize_bits;
+ range_end = range_start + range_len - 1;
+
+ /*
+ * The range has no real OE created, thus its reserved data/meta space
+ * needs to be manually released by EXTENT_DO_ACCOUNTING.
+ */
+ btrfs_clear_extent_bit(&inode->io_tree, range_start, range_end,
+ EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, NULL);
+ cur = range_end + 1;
+ }
+ btrfs_remove_ordered_extent(oe);
+ btrfs_put_ordered_extent(oe);
+ btrfs_put_ordered_extent(oe);
+ return saved_ret;
+}
+
/*
* As ordered data IO finishes, this gets called so we can finish
* an ordered extent if the range of bytes in the file it covers are
@@ -3184,6 +3247,9 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
bool clear_reserved_extent = true;
unsigned int clear_bits = EXTENT_DEFRAG;
+ if (test_bit(BTRFS_ORDERED_DELAYED, &ordered_extent->flags))
+ return finish_delayed_ordered(ordered_extent);
+
start = ordered_extent->file_offset;
end = start + ordered_extent->num_bytes - 1;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index bc88b904d024..f28f8779ad85 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -169,6 +169,17 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
if (test_bit(BTRFS_ORDERED_ENCODED, &flags))
ASSERT(test_bit(BTRFS_ORDERED_COMPRESSED, &flags));
+ /*
+ * DELAYED can only be set with REGULAR, no DIRECT/ENCODED, and should
+ * not exceed BTRFS_MAX_COMPRESSED size.
+ */
+ if (test_bit(BTRFS_ORDERED_DELAYED, &flags)) {
+ ASSERT(test_bit(BTRFS_ORDERED_REGULAR, &flags));
+ ASSERT(!test_bit(BTRFS_ORDERED_DIRECT, &flags));
+ ASSERT(!test_bit(BTRFS_ORDERED_ENCODED, &flags));
+ ASSERT(num_bytes <= BTRFS_MAX_COMPRESSED);
+ }
+
/*
* For a NOCOW write we can free the qgroup reserve right now. For a COW
* one we transfer the reserved space from the inode's iotree into the
@@ -215,6 +226,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
INIT_LIST_HEAD(&entry->root_extent_list);
INIT_LIST_HEAD(&entry->work_list);
INIT_LIST_HEAD(&entry->bioc_list);
+ INIT_LIST_HEAD(&entry->child_list);
init_completion(&entry->completion);
/*
@@ -235,12 +247,43 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
return entry;
}
+static void add_child_oe(struct btrfs_ordered_extent *parent,
+ struct btrfs_ordered_extent *child)
+{
+ struct btrfs_inode *inode = parent->inode;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ const u32 start_bit = (child->file_offset - parent->file_offset) >>
+ fs_info->sectorsize_bits;
+ const u32 nr_bits = child->num_bytes >> fs_info->sectorsize_bits;
+
+ lockdep_assert_held(&inode->ordered_tree_lock);
+ /* Basic flags check for parent and child. */
+ ASSERT(test_bit(BTRFS_ORDERED_DELAYED, &parent->flags));
+ ASSERT(!test_bit(BTRFS_ORDERED_DELAYED, &child->flags));
+
+ /* Child should not belong to any parent yet. */
+ ASSERT(list_empty(&child->child_list));
+
+ /* Child should be fully inside parent's range. */
+ ASSERT(child->file_offset >= parent->file_offset);
+ ASSERT(child->file_offset + child->num_bytes <=
+ parent->file_offset + parent->num_bytes);
+
+ /* There should be no existing child in the range. */
+ ASSERT(bitmap_test_range_all_zero(parent->child_bitmap, start_bit, nr_bits));
+
+ list_add_tail(&parent->child_list, &child->child_list);
+
+ bitmap_set(parent->child_bitmap, start_bit, nr_bits);
+}
+
static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
{
struct btrfs_inode *inode = entry->inode;
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *node;
+ bool is_child = false;
trace_btrfs_ordered_extent_add(inode, entry);
@@ -253,17 +296,25 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
spin_lock(&inode->ordered_tree_lock);
node = tree_insert(&inode->ordered_tree, entry->file_offset,
&entry->rb_node);
- if (unlikely(node)) {
+ if (node) {
struct btrfs_ordered_extent *exist =
rb_entry(node, struct btrfs_ordered_extent, rb_node);
- btrfs_panic(fs_info, -EEXIST,
+ if (test_bit(BTRFS_ORDERED_DELAYED, &exist->flags)) {
+ add_child_oe(exist, entry);
+ is_child = true;
+ } else {
+ btrfs_panic(fs_info, -EEXIST,
"existing oe file_offset=%llu num_bytes=%llu flags=0x%lx new oe file_offset=%llu num_bytes=%llu flags=0x%lx",
- exist->file_offset, exist->num_bytes, exist->flags,
- entry->file_offset, entry->num_bytes, entry->flags);
+ exist->file_offset, exist->num_bytes, exist->flags,
+ entry->file_offset, entry->num_bytes, entry->flags);
+ }
}
spin_unlock(&inode->ordered_tree_lock);
+ /* Child OE shouldn't be added to per-root oe list. */
+ if (is_child)
+ return;
spin_lock(&root->ordered_extent_lock);
list_add_tail(&entry->root_extent_list,
&root->ordered_extents);
@@ -336,6 +387,20 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
return entry;
}
+struct btrfs_ordered_extent *btrfs_alloc_delayed_ordered_extent(
+ struct btrfs_inode *inode, u64 file_offset, u32 length)
+{
+ struct btrfs_ordered_extent *entry;
+
+ entry = alloc_ordered_extent(inode, file_offset, length, length, 0, 0, 0,
+ (1UL << BTRFS_ORDERED_REGULAR) |
+ (1UL << BTRFS_ORDERED_DELAYED),
+ BTRFS_COMPRESS_NONE);
+ if (!IS_ERR(entry))
+ insert_ordered_extent(entry);
+ return entry;
+}
+
/*
* Add a struct btrfs_ordered_sum into the list of checksums to be inserted
* when an ordered extent is finished. If the list covers more than one
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 2664ea455229..8a1800f109e8 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -13,6 +13,7 @@
#include <linux/rbtree.h>
#include <linux/wait.h>
#include "async-thread.h"
+#include "compression.h"
struct inode;
struct page;
@@ -87,6 +88,12 @@ enum {
*/
BTRFS_ORDERED_DIRECT,
+ /*
+ * Extra bit for delayed OE, can only be set for REGULAR.
+ * Can not be set with COMPRESSED/ENCODED/DIRECT.
+ */
+ BTRFS_ORDERED_DELAYED,
+
BTRFS_ORDERED_NR_FLAGS,
};
static_assert(BTRFS_ORDERED_NR_FLAGS <= BITS_PER_LONG);
@@ -155,6 +162,11 @@ struct btrfs_ordered_extent {
/* a per root list of all the pending ordered extents */
struct list_head root_extent_list;
+ /* Child ordered extent list for delayed OE. */
+ struct list_head child_list;
+
+ unsigned long child_bitmap[BITS_TO_LONGS(BTRFS_MAX_COMPRESSED / BTRFS_MIN_BLOCKSIZE)];
+
struct btrfs_work work;
struct completion completion;
@@ -192,6 +204,8 @@ struct btrfs_file_extent {
struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
struct btrfs_inode *inode, u64 file_offset,
const struct btrfs_file_extent *file_extent, unsigned long flags);
+struct btrfs_ordered_extent *btrfs_alloc_delayed_ordered_extent(
+ struct btrfs_inode *inode, u64 file_offset, u32 length);
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum);
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
--
2.53.0
next prev parent reply other threads:[~2026-03-08 23:03 UTC|newest]
Thread overview: 7+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-08 23:02 [PATCH PoC 0/6] btrfs: delay compression to bbio submission time Qu Wenruo
2026-03-08 23:02 ` [PATCH PoC 1/6] btrfs: add skeleton for delayed btrfs bio Qu Wenruo
2026-03-08 23:02 ` Qu Wenruo [this message]
2026-03-08 23:02 ` [PATCH PoC 3/6] btrfs: introduce the skeleton of delayed bbio endio function Qu Wenruo
2026-03-08 23:02 ` [PATCH PoC 4/6] btrfs: introduce compression for delayed bbio Qu Wenruo
2026-03-08 23:02 ` [PATCH PoC 5/6] btrfs: implement uncompressed fallback " Qu Wenruo
2026-03-08 23:02 ` [PATCH PoC 6/6] btrfs: enable experimental delayed compression support Qu Wenruo
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=a51572ec3227b356f5808bcfcfdfd0ede065932e.1773009120.git.wqu@suse.com \
--to=wqu@suse.com \
--cc=linux-btrfs@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox