From: Paul Richards <paul.richards@gmail.com>
To: linux-btrfs@vger.kernel.org
Cc: dsterba@suse.com, Paul Richards <paul.richards@gmail.com>
Subject: [PATCH 2/3] btrfs: support for FALLOC_FL_COLLAPSE_RANGE in btrfs_fallocate()
Date: Sat, 18 Apr 2026 15:38:07 +0100 [thread overview]
Message-ID: <20260418143808.199603-3-paul.richards@gmail.com> (raw)
In-Reply-To: <20260418143808.199603-1-paul.richards@gmail.com>
Assisted-by: Amazon Q Developer
Signed-off-by: Paul Richards <paul.richards@gmail.com>
---
fs/btrfs/file.c | 300 ++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 300 insertions(+)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0b5cc3cec675..99d24bef5f88 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -39,6 +39,13 @@
#include "super.h"
#include "print-tree.h"
+/*
+ * When shifting extent items for fallocate insert/collapse range, commit
+ * and restart the transaction after every this many shifted items, to
+ * avoid accumulating too many changes in a single transaction.
+ */
+#define BTRFS_INSERT_COLLAPSE_TRANSACTION_CYCLE_INTERVAL (32)
+
/*
* Unlock folio after btrfs_file_write() is done with it.
*/
@@ -2648,6 +2655,296 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
return ret;
}
+/*
+ * Update the extent back-reference in the extent tree when a
+ * BTRFS_EXTENT_DATA_KEY item is shifted to a new logical file offset.
+ * Drops the back-reference at old_file_offset and adds one at new_file_offset.
+ * Holes (disk_bytenr == 0) and inline extents have no back-references and
+ * must not be passed to this function.
+ *
+ * @trans:           transaction handle; aborted here on any failure
+ * @root:            root that owns the inode
+ * @ino:             inode number owning the extent item
+ * @disk_bytenr:     start of the extent on disk
+ * @num_bytes:       disk size of the extent
+ * @old_file_offset: file offset the data ref is currently keyed under
+ * @new_file_offset: file offset the data ref should be keyed under
+ *
+ * Returns 0 on success or a negative errno on failure. On failure the
+ * transaction has already been aborted, the caller must only bail out.
+ */
+static int btrfs_shift_extent_backref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 ino,
+ u64 disk_bytenr, u64 num_bytes,
+ u64 old_file_offset, u64 new_file_offset)
+{
+ struct btrfs_ref ref = {
+ .bytenr = disk_bytenr,
+ .num_bytes = num_bytes,
+ .parent = 0,
+ .owning_root = btrfs_root_id(root),
+ .ref_root = btrfs_root_id(root),
+ };
+ int ret;
+
+ /* Queue a delayed ref dropping the data ref keyed at the old offset. */
+ ref.action = BTRFS_DROP_DELAYED_REF;
+ btrfs_init_data_ref(&ref, ino, old_file_offset, 0, false);
+ ret = btrfs_free_extent(trans, &ref);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+
+ /* And queue the matching ref addition keyed at the new offset. */
+ ref.action = BTRFS_ADD_DELAYED_REF;
+ btrfs_init_data_ref(&ref, ino, new_file_offset, 0, false);
+ ret = btrfs_inc_extent_ref(trans, &ref);
+ if (unlikely(ret))
+ btrfs_abort_transaction(trans, ret);
+
+ return ret;
+}
+
+/*
+ * Collapse the range [offset, offset + len) out of the file: remove the
+ * extents in the range, shift everything beyond it down by @len bytes and
+ * shrink i_size accordingly.
+ *
+ * @inode:  inode to operate on; caller holds BTRFS_ILOCK_MMAP so no new
+ *          dirty pages or mmap writes can appear during the operation
+ * @offset: start of the range to remove, must be sector aligned
+ * @len:    length of the range to remove, must be sector aligned
+ *
+ * Returns 0 on success or a negative errno on failure.
+ */
+static int btrfs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
+{
+	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 end = offset + len;
+	struct btrfs_path *path;
+	struct btrfs_trans_handle *trans = NULL;
+	struct extent_state *cached_state = NULL;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	struct btrfs_key new_key;
+	u64 ino = btrfs_ino(BTRFS_I(inode));
+	int nr_shifted = 0;
+	int ret;
+
+	if (!IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL))
+		return -EOPNOTSUPP;
+
+	/* offset and len must be sector-aligned */
+	if (!IS_ALIGNED(offset | len, fs_info->sectorsize))
+		return -EINVAL;
+
+	/* collapse range must not reach or pass EOF - use ftruncate instead */
+	if (end >= inode->i_size)
+		return -EINVAL;
+
+	/* wait for any ordered extents in [offset, i_size) to complete */
+	ret = btrfs_wait_ordered_range(BTRFS_I(inode), offset,
+				       inode->i_size - offset);
+	if (ret)
+		return ret;
+
+	/*
+	 * Flush dirty pages and invalidate the page cache for [offset, i_size)
+	 * before any extent manipulation, following the ext4/xfs pattern.
+	 * We hold BTRFS_ILOCK_MMAP so no new dirty pages can appear during
+	 * the operation. The page cache must be empty before we shift extent
+	 * keys so that stale pages at the old offsets cannot be read back
+	 * after the collapse. Pages before @offset are unaffected.
+	 */
+	ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
+	if (ret)
+		return ret;
+	truncate_pagecache_range(inode, offset, LLONG_MAX);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	/*
+	 * Lock the range [offset, end) and invalidate the page cache within
+	 * it. btrfs_punch_hole_lock_range() calls truncate_pagecache_range()
+	 * internally in a retry loop.
+	 */
+	btrfs_punch_hole_lock_range(inode, offset, end - 1, &cached_state);
+
+	/*
+	 * Remove all extents in [offset, end). Passing NULL for extent_info
+	 * means we are punching a hole. btrfs_replace_file_extents() splits
+	 * any extent straddling the boundaries, drops extent refs, and
+	 * returns a transaction handle for us to reuse.
+	 */
+	ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
+					 offset, end - 1, NULL, &trans);
+	if (ret)
+		goto out_unlock;
+
+	/*
+	 * Shift all BTRFS_EXTENT_DATA_KEY items with key.offset >= end
+	 * leftward by len bytes.
+	 *
+	 * We iterate forward (lowest offset first) which is safe for a
+	 * left-shift because the new key is always less than the old one,
+	 * so we never collide with a key we haven't visited yet.
+	 */
+	key.objectid = ino;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = end;
+
+	while (1) {
+		struct btrfs_file_extent_item *fi;
+		u64 disk_bytenr;
+		u64 num_bytes;
+		u64 extent_offset;
+		int extent_type;
+
+		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+		if (ret < 0)
+			goto out_trans;
+
+		/* If no exact match, slot points at the next item - that's fine */
+		leaf = path->nodes[0];
+		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				goto out_trans;
+			if (ret > 0) {
+				/* No more items - we are done shifting */
+				ret = 0;
+				break;
+			}
+			leaf = path->nodes[0];
+		}
+
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+		/* Stop if we've moved past this inode's extent data items */
+		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
+			ret = 0;
+			break;
+		}
+
+		fi = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_file_extent_item);
+		extent_type = btrfs_file_extent_type(leaf, fi);
+		disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+		extent_offset = btrfs_file_extent_offset(leaf, fi);
+
+		memcpy(&new_key, &key, sizeof(new_key));
+		new_key.offset -= len;
+		btrfs_set_item_key_safe(trans, path, &new_key);
+
+		/*
+		 * Update the back-reference in the extent tree to reflect the
+		 * new logical file offset. Holes (disk_bytenr == 0) and inline
+		 * extents have no back-references to update.
+		 */
+		if (extent_type != BTRFS_FILE_EXTENT_INLINE && disk_bytenr > 0) {
+			ret = btrfs_shift_extent_backref(trans, root, ino,
+							 disk_bytenr, num_bytes,
+							 key.offset - extent_offset,
+							 new_key.offset - extent_offset);
+			if (unlikely(ret))
+				goto out_trans;
+		}
+
+		/* Advance past the item we just shifted on the next search */
+		key.offset = new_key.offset + 1;
+		nr_shifted++;
+
+		/*
+		 * Cycle the transaction every N items to avoid holding a
+		 * single transaction open across a large number of extent
+		 * items. btrfs_set_item_key_safe() modifies leaves in-place
+		 * so we won't hit -ENOSPC; we use a simple counter instead.
+		 * Update the inode at each cycle point so it is consistent
+		 * on disk if a crash occurs mid-loop.
+		 */
+		if (nr_shifted % BTRFS_INSERT_COLLAPSE_TRANSACTION_CYCLE_INTERVAL == 0) {
+			inode_inc_iversion(inode);
+			inode_set_mtime_to_ts(inode,
+					      inode_set_ctime_current(inode));
+			ret = btrfs_update_inode(trans, BTRFS_I(inode));
+			if (ret) {
+				btrfs_release_path(path);
+				goto out_trans;
+			}
+			btrfs_end_transaction(trans);
+			btrfs_btree_balance_dirty(fs_info);
+			trans = btrfs_start_transaction(root, 1);
+			if (IS_ERR(trans)) {
+				ret = PTR_ERR(trans);
+				trans = NULL;
+				btrfs_release_path(path);
+				goto out_unlock;
+			}
+		}
+
+		btrfs_release_path(path);
+	}
+
+	/*
+	 * Drop any tree locks still held from the last search before touching
+	 * the inode item: btrfs_update_inode() searches the same fs tree and
+	 * must not run while we hold locks on one of its leaves.
+	 */
+	btrfs_release_path(path);
+
+	if (ret)
+		goto out_trans;
+
+	/*
+	 * Drop the extent map cache for [offset, i_size) so that subsequent
+	 * reads re-load the correct mappings from the btree. The key-shift
+	 * loop updated the btree but the in-memory extent map cache still
+	 * has entries at the old logical offsets.
+	 */
+	btrfs_drop_extent_map_range(BTRFS_I(inode), offset, (u64)-1, false);
+
+	/* Update i_size and the on-disk inode */
+	inode_inc_iversion(inode);
+	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+	i_size_write(inode, inode->i_size - len);
+	btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
+	ret = btrfs_update_inode(trans, BTRFS_I(inode));
+
+out_trans:
+	/* Error paths may arrive here with the path still holding locks. */
+	btrfs_release_path(path);
+	if (trans) {
+		if (ret)
+			btrfs_end_transaction(trans);
+		else
+			ret = btrfs_end_transaction(trans);
+	}
+out_unlock:
+	btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, offset, end - 1,
+			    &cached_state);
+	if (IS_ENABLED(CONFIG_BTRFS_DEBUG) && !ret) {
+		/*
+		 * Ordered extents were drained at the start of this function
+		 * and BTRFS_ILOCK_MMAP has been held throughout, so the page
+		 * cache from @offset onwards must still be empty. Pages
+		 * before @offset are unaffected and may still be cached.
+		 */
+		ASSERT(filemap_range_has_page(inode->i_mapping, offset,
+					      LLONG_MAX) == false);
+	}
+	btrfs_free_path(path);
+	return ret;
+}
+
static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
@@ -3296,6 +3593,9 @@ static long btrfs_fallocate(struct file *file, int mode,
case FALLOC_FL_PUNCH_HOLE:
ret = btrfs_punch_hole(file, offset, len);
break;
+ case FALLOC_FL_COLLAPSE_RANGE:
+ ret = btrfs_collapse_range(inode, offset, len);
+ break;
default:
ret = -EOPNOTSUPP;
}
--
2.53.0
next prev parent reply other threads:[~2026-04-18 14:38 UTC|newest]
Thread overview: 11+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-18 14:38 [RFC PATCH 0/3] btrfs: implement FALLOC_FL_COLLAPSE_RANGE and FALLOC_FL_INSERT_RANGE Paul Richards
2026-04-18 14:38 ` [PATCH 1/3] btrfs: refactor btrfs_fallocate() ahead of supporting more modes Paul Richards
2026-04-19 0:57 ` Qu Wenruo
2026-04-18 14:38 ` Paul Richards [this message]
2026-04-19 1:29 ` [PATCH 2/3] btrfs: support for FALLOC_FL_COLLAPSE_RANGE in btrfs_fallocate() Qu Wenruo
2026-04-18 14:38 ` [PATCH 3/3] btrfs: support for FALLOC_FL_INSERT_RANGE " Paul Richards
2026-04-19 4:44 ` Qu Wenruo
2026-04-19 0:25 ` [RFC PATCH 0/3] btrfs: implement FALLOC_FL_COLLAPSE_RANGE and FALLOC_FL_INSERT_RANGE Qu Wenruo
2026-04-19 5:08 ` Qu Wenruo
2026-04-19 18:40 ` Paul Richards
2026-04-19 22:30 ` Qu Wenruo
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260418143808.199603-3-paul.richards@gmail.com \
--to=paul.richards@gmail.com \
--cc=dsterba@suse.com \
--cc=linux-btrfs@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.