From: Paul Richards <paul.richards@gmail.com>
To: linux-btrfs@vger.kernel.org
Cc: dsterba@suse.com, Paul Richards <paul.richards@gmail.com>
Subject: [PATCH 2/3] btrfs: support for FALLOC_FL_COLLAPSE_RANGE in btrfs_fallocate()
Date: Sat, 18 Apr 2026 15:38:07 +0100 [thread overview]
Message-ID: <20260418143808.199603-3-paul.richards@gmail.com> (raw)
In-Reply-To: <20260418143808.199603-1-paul.richards@gmail.com>
Assisted-by: Amazon Q Developer:auto/unknown
Signed-off-by: Paul Richards <paul.richards@gmail.com>
---
fs/btrfs/file.c | 300 ++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 300 insertions(+)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0b5cc3cec675..99d24bef5f88 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -39,6 +39,13 @@
#include "super.h"
#include "print-tree.h"
+/*
+ * When we shift extents as part of fallocate insert or collapse we commit
+ * and cycle the transaction every BTRFS_INSERT_COLLAPSE_TRANSACTION_CYCLE_INTERVAL
+ * extents to avoid accumulating too many changes in one transaction.
+ */
+#define BTRFS_INSERT_COLLAPSE_TRANSACTION_CYCLE_INTERVAL (32)
+
/*
* Unlock folio after btrfs_file_write() is done with it.
*/
@@ -2648,6 +2655,296 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
return ret;
}
+/*
+ * Move the data extent back-reference for one BTRFS_EXTENT_DATA_KEY item
+ * from its old logical file offset to a new one: the reference recorded at
+ * @old_file_offset is dropped and an equivalent one is added at
+ * @new_file_offset.
+ *
+ * Must not be called for holes (disk_bytenr == 0) or inline extents, since
+ * neither has a back-reference in the extent tree.
+ *
+ * On failure the transaction is aborted and a negative errno is returned.
+ */
+static int btrfs_shift_extent_backref(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root, u64 ino,
+				      u64 disk_bytenr, u64 num_bytes,
+				      u64 old_file_offset, u64 new_file_offset)
+{
+	const u64 root_id = btrfs_root_id(root);
+	struct btrfs_ref ref = { 0 };
+	int ret;
+
+	ref.bytenr = disk_bytenr;
+	ref.num_bytes = num_bytes;
+	ref.parent = 0;
+	ref.owning_root = root_id;
+	ref.ref_root = root_id;
+
+	/* Drop the reference recorded at the old file offset. */
+	ref.action = BTRFS_DROP_DELAYED_REF;
+	btrfs_init_data_ref(&ref, ino, old_file_offset, 0, false);
+	ret = btrfs_free_extent(trans, &ref);
+	if (unlikely(ret)) {
+		btrfs_abort_transaction(trans, ret);
+		return ret;
+	}
+
+	/* Add an equivalent reference at the new file offset. */
+	ref.action = BTRFS_ADD_DELAYED_REF;
+	btrfs_init_data_ref(&ref, ino, new_file_offset, 0, false);
+	ret = btrfs_inc_extent_ref(trans, &ref);
+	if (unlikely(ret))
+		btrfs_abort_transaction(trans, ret);
+
+	return ret;
+}
+
+/*
+ * Collapse the file range [offset, offset + len): remove the range and shift
+ * everything after it leftward by @len bytes, shrinking the file by @len.
+ *
+ * The caller (btrfs_fallocate()) holds the inode lock with BTRFS_ILOCK_MMAP,
+ * so no new page faults can dirty the affected range while we work.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int btrfs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
+{
+	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 end = offset + len;
+	struct btrfs_path *path;
+	struct btrfs_trans_handle *trans = NULL;
+	struct extent_state *cached_state = NULL;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	struct btrfs_key new_key;
+	u64 ino = btrfs_ino(BTRFS_I(inode));
+	int nr_shifted = 0;
+	int ret;
+
+	if (!IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL))
+		return -EOPNOTSUPP;
+
+	/* Both boundaries must be aligned to the sector size. */
+	if (!IS_ALIGNED(offset | len, fs_info->sectorsize))
+		return -EINVAL;
+
+	/* Collapsing up to or past EOF is rejected - use ftruncate instead. */
+	if (end >= inode->i_size)
+		return -EINVAL;
+
+	/* Wait for any ordered extents in [offset, i_size) to complete. */
+	ret = btrfs_wait_ordered_range(BTRFS_I(inode), offset,
+				       inode->i_size - offset);
+	if (ret)
+		return ret;
+
+	/*
+	 * Flush dirty pages and invalidate the page cache for [offset, EOF)
+	 * before any extent manipulation, following the ext4/xfs pattern.
+	 * We hold BTRFS_ILOCK_MMAP so no new dirty pages can appear during
+	 * the operation.  The page cache must be empty before we shift extent
+	 * keys so that stale pages at the old offsets cannot be read back
+	 * after the collapse.
+	 */
+	ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
+	if (ret)
+		return ret;
+	truncate_pagecache_range(inode, offset, LLONG_MAX);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	/*
+	 * Lock the extent range [offset, end) and invalidate the page cache
+	 * within it.  btrfs_punch_hole_lock_range() retries until the range
+	 * is free of pages and ordered extents.
+	 *
+	 * NOTE(review): only [offset, end) is locked in the io tree while the
+	 * key-shift loop below also moves extents in [end, i_size) - confirm
+	 * that BTRFS_ILOCK_MMAP plus the upfront flush keeps that tail range
+	 * stable, or extend the locked range to (u64)-1.
+	 */
+	btrfs_punch_hole_lock_range(inode, offset, end - 1, &cached_state);
+
+	/*
+	 * Remove all extents in [offset, end).  Passing NULL for extent_info
+	 * means we are punching a hole.  btrfs_replace_file_extents() splits
+	 * any extent straddling the boundaries, drops extent refs, and
+	 * returns a transaction handle for us to reuse.
+	 */
+	ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
+					 offset, end - 1, NULL, &trans);
+	if (ret)
+		goto out_unlock;
+
+	/*
+	 * Shift all BTRFS_EXTENT_DATA_KEY items with key.offset >= end
+	 * leftward by len bytes.
+	 *
+	 * We iterate forward (lowest offset first), which is safe for a
+	 * left-shift because the new key is always smaller than the old one,
+	 * so we never collide with a key we have not visited yet.
+	 */
+	key.objectid = ino;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = end;
+
+	while (1) {
+		struct btrfs_file_extent_item *fi;
+		u64 disk_bytenr;
+		u64 num_bytes;
+		u64 extent_offset;
+		int extent_type;
+
+		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+		if (ret < 0)
+			goto out_trans;
+
+		/* Without an exact match the slot points at the next item. */
+		leaf = path->nodes[0];
+		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				goto out_trans;
+			if (ret > 0) {
+				/* No more items - we are done shifting. */
+				ret = 0;
+				break;
+			}
+			leaf = path->nodes[0];
+		}
+
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+		/* Stop once past this inode's extent data items. */
+		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
+			ret = 0;
+			break;
+		}
+
+		fi = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_file_extent_item);
+		extent_type = btrfs_file_extent_type(leaf, fi);
+		disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+		extent_offset = btrfs_file_extent_offset(leaf, fi);
+
+		memcpy(&new_key, &key, sizeof(new_key));
+		new_key.offset -= len;
+		btrfs_set_item_key_safe(trans, path, &new_key);
+
+		/*
+		 * Update the back-reference in the extent tree to reflect the
+		 * new logical file offset.  Holes (disk_bytenr == 0) and
+		 * inline extents have no back-references to update.
+		 */
+		if (extent_type != BTRFS_FILE_EXTENT_INLINE && disk_bytenr > 0) {
+			ret = btrfs_shift_extent_backref(trans, root, ino,
+							 disk_bytenr, num_bytes,
+							 key.offset - extent_offset,
+							 new_key.offset - extent_offset);
+			if (unlikely(ret))
+				goto out_trans;
+		}
+
+		/*
+		 * Drop our tree locks before possibly ending the transaction
+		 * below: updating the inode item and ending/restarting the
+		 * transaction while still holding locked tree blocks from the
+		 * search above can deadlock.
+		 */
+		btrfs_release_path(path);
+
+		/* Resume the search just past the item we moved. */
+		key.offset = new_key.offset + 1;
+		nr_shifted++;
+
+		/*
+		 * Cycle the transaction every N items to avoid holding a
+		 * single transaction open across a large number of extent
+		 * items.  Update the inode at each cycle point so it is
+		 * consistent on disk if a crash occurs mid-loop.
+		 */
+		if (nr_shifted % BTRFS_INSERT_COLLAPSE_TRANSACTION_CYCLE_INTERVAL == 0) {
+			inode_inc_iversion(inode);
+			inode_set_mtime_to_ts(inode,
+					      inode_set_ctime_current(inode));
+			ret = btrfs_update_inode(trans, BTRFS_I(inode));
+			if (ret)
+				goto out_trans;
+			ret = btrfs_end_transaction(trans);
+			trans = NULL;
+			if (ret)
+				goto out_unlock;
+			btrfs_btree_balance_dirty(fs_info);
+			trans = btrfs_start_transaction(root, 1);
+			if (IS_ERR(trans)) {
+				ret = PTR_ERR(trans);
+				trans = NULL;
+				goto out_unlock;
+			}
+		}
+	}
+
+	if (ret)
+		goto out_trans;
+
+	/*
+	 * Drop the extent map cache for [offset, EOF) so that subsequent
+	 * reads re-load the correct mappings from the btree.  The key-shift
+	 * loop updated the btree but the in-memory extent map cache still
+	 * has entries at the old logical offsets.
+	 */
+	btrfs_drop_extent_map_range(BTRFS_I(inode), offset, (u64)-1, false);
+
+	/* Update i_size and the on-disk inode. */
+	inode_inc_iversion(inode);
+	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+	i_size_write(inode, inode->i_size - len);
+	btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
+	ret = btrfs_update_inode(trans, BTRFS_I(inode));
+
+out_trans:
+	if (trans) {
+		if (ret)
+			btrfs_end_transaction(trans);
+		else
+			ret = btrfs_end_transaction(trans);
+	}
+out_unlock:
+	btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, offset, end - 1,
+			    &cached_state);
+	btrfs_free_path(path);
+	return ret;
+}
+
static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
@@ -3296,6 +3593,9 @@ static long btrfs_fallocate(struct file *file, int mode,
case FALLOC_FL_PUNCH_HOLE:
ret = btrfs_punch_hole(file, offset, len);
break;
+ case FALLOC_FL_COLLAPSE_RANGE:
+ ret = btrfs_collapse_range(inode, offset, len);
+ break;
default:
ret = -EOPNOTSUPP;
}
--
2.53.0
next prev parent reply other threads:[~2026-04-18 14:38 UTC|newest]
Thread overview: 11+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-18 14:38 [RFC PATCH 0/3] btrfs: implement FALLOC_FL_COLLAPSE_RANGE and FALLOC_FL_INSERT_RANGE Paul Richards
2026-04-18 14:38 ` [PATCH 1/3] btrfs: refactor btrfs_fallocate() ahead of supporting more modes Paul Richards
2026-04-19 0:57 ` Qu Wenruo
2026-04-18 14:38 ` Paul Richards [this message]
2026-04-19 1:29 ` [PATCH 2/3] btrfs: support for FALLOC_FL_COLLAPSE_RANGE in btrfs_fallocate() Qu Wenruo
2026-04-18 14:38 ` [PATCH 3/3] btrfs: support for FALLOC_FL_INSERT_RANGE " Paul Richards
2026-04-19 4:44 ` Qu Wenruo
2026-04-19 0:25 ` [RFC PATCH 0/3] btrfs: implement FALLOC_FL_COLLAPSE_RANGE and FALLOC_FL_INSERT_RANGE Qu Wenruo
2026-04-19 5:08 ` Qu Wenruo
2026-04-19 18:40 ` Paul Richards
2026-04-19 22:30 ` Qu Wenruo
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260418143808.199603-3-paul.richards@gmail.com \
--to=paul.richards@gmail.com \
--cc=dsterba@suse.com \
--cc=linux-btrfs@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox