[PATCH v2] btrfs: add BTRFS_IOC_GET_CSUMS ioctl

public inbox for linux-btrfs@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH v2] btrfs: add BTRFS_IOC_GET_CSUMS ioctl
@ 2026-04-08 17:46 Mark Harmstone
  2026-04-08 17:51 ` Mark Harmstone
  2026-04-09 11:08 ` Qu Wenruo
  0 siblings, 2 replies; 6+ messages in thread
From: Mark Harmstone @ 2026-04-08 17:46 UTC (permalink / raw)
  To: linux-btrfs, wqu, boris; +Cc: Mark Harmstone

Add a new unprivileged BTRFS_IOC_GET_CSUMS ioctl, which can be used to
query the on-disk csums for a file.

This is done by userspace passing a struct btrfs_ioctl_get_csums_args to
the kernel, which details the offset and length we're interested in, and
a buffer for the kernel to write its results into. The kernel writes a
struct btrfs_ioctl_get_csums_entry into the buffer, followed by the
csums if available.

If the extent is an uncompressed, non-nodatasum extent, the kernel sets
the entry type to BTRFS_GET_CSUMS_HAS_CSUMS and follows it with the
csums. If it is sparse, preallocated, or beyond the EOF, it sets the
type to BTRFS_GET_CSUMS_ZEROED - this is so userspace knows it can use
the precomputed hash of the zero sector. Otherwise, it sets the type to
BTRFS_GET_CSUMS_NO_CSUMS.

We do store the csums of compressed extents, but we deliberately don't
return them here: they're hashed over the compressed data, not the
uncompressed data that's returned to userspace.

The main use case for this is for speeding up mkfs.btrfs --rootdir. For
the case when the source FS is btrfs and using the same csum algorithm,
we can avoid having to recalculate the csums - in my synthetic
benchmarks (16GB file on a spinning-rust drive), this resulted in a ~11%
speed-up (218s to 196s).

When using the --reflink option added in btrfs-progs v6.16.1, we can forgo
reading the data entirely, resulting in a ~2200% speed-up on the same test
(128s to 6s).

    # mkdir rootdir
    # dd if=/dev/urandom of=rootdir/file bs=4096 count=4194304

    (without ioctl)
    # echo 3 > /proc/sys/vm/drop_caches
    # time mkfs.btrfs --rootdir rootdir testimg
    ...
    real    3m37.965s
    user    0m5.496s
    sys     0m6.125s

    # echo 3 > /proc/sys/vm/drop_caches
    # time mkfs.btrfs --rootdir rootdir --reflink testimg
    ...
    real    2m8.342s
    user    0m5.472s
    sys     0m1.667s

    (with ioctl)
    # echo 3 > /proc/sys/vm/drop_caches
    # time mkfs.btrfs --rootdir rootdir testimg
    ...
    real    3m15.865s
    user    0m4.258s
    sys     0m6.261s

    # echo 3 > /proc/sys/vm/drop_caches
    # time mkfs.btrfs --rootdir rootdir --reflink testimg
    ...
    real    0m5.847s
    user    0m2.899s
    sys     0m0.097s

Signed-off-by: Mark Harmstone <mark@harmstone.com>
---
 fs/btrfs/ioctl.c           | 330 +++++++++++++++++++++++++++++++++++++
 include/uapi/linux/btrfs.h |  21 +++
 2 files changed, 351 insertions(+)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index b2e447f5005c16..5cdda33eeaf05a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -56,6 +56,7 @@
 #include "uuid-tree.h"
 #include "ioctl.h"
 #include "file.h"
+#include "file-item.h"
 #include "scrub.h"
 #include "super.h"
 
@@ -5139,6 +5140,333 @@ static int btrfs_ioctl_shutdown(struct btrfs_fs_info *fs_info, unsigned long arg
 }
 #endif
 
+#define GET_CSUMS_BUF_MAX	(16 * 1024 * 1024)	/* cap on user buffer bytes used per call */
+/* Write the csums of [disk_bytenr, disk_bytenr + len) to @buf; returns 0 or a negative errno. */
+static int copy_csums_to_user(struct btrfs_fs_info *fs_info, u64 disk_bytenr,
+			      u64 len, u8 __user *buf)
+{
+	struct btrfs_root *csum_root;
+	struct btrfs_ordered_sum *sums;
+	LIST_HEAD(list);	/* csum chunks handed back by the lookup; freed below */
+	const u32 csum_size = fs_info->csum_size;
+	int ret;
+
+	csum_root = btrfs_csum_root(fs_info, disk_bytenr);
+
+	ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
+				      disk_bytenr + len - 1, &list, false);	/* last byte of range */
+	if (ret < 0)
+		return ret;
+
+	/* Zero the whole csum area first so sectors with no csum found stay zero-filled. */
+	if (clear_user(buf, (len >> fs_info->sectorsize_bits) * csum_size)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	ret = 0;	/* normalize any non-negative lookup result to success */
+	while (!list_empty(&list)) {
+		u64 offset;
+		size_t copy_size;
+
+		sums = list_first_entry(&list, struct btrfs_ordered_sum, list);
+		list_del(&sums->list);
+		/* Place this chunk at its sector index (relative to disk_bytenr) times csum_size. */
+		offset = ((sums->logical - disk_bytenr) >> fs_info->sectorsize_bits) * csum_size;
+		copy_size = (sums->len >> fs_info->sectorsize_bits) * csum_size;
+
+		if (copy_to_user(buf + offset, sums->sums, copy_size)) {
+			kfree(sums);
+			ret = -EFAULT;
+			goto out;
+		}
+
+		kfree(sums);
+	}
+
+out:
+	while (!list_empty(&list)) {	/* free chunks still queued after an early exit */
+		sums = list_first_entry(&list, struct btrfs_ordered_sum, list);
+		list_del(&sums->list);
+		kfree(sums);
+	}
+	return ret;
+}
+/* BTRFS_IOC_GET_CSUMS handler: report a file range as entries (plus csums) in the user buffer. */
+static int btrfs_ioctl_get_csums(struct file *file, void __user *argp)
+{
+	struct inode *inode = file_inode(file);
+	struct btrfs_inode *bi = BTRFS_I(inode);
+	struct btrfs_fs_info *fs_info = bi->root->fs_info;
+	struct btrfs_root *root = bi->root;
+	struct btrfs_ioctl_get_csums_args args;
+	BTRFS_PATH_AUTO_FREE(path);
+	const u64 ino = btrfs_ino(bi);
+	const u32 sectorsize = fs_info->sectorsize;
+	const u32 csum_size = fs_info->csum_size;
+	u8 __user *ubuf;	/* userspace address of args->buf */
+	u64 buf_limit;		/* usable bytes of the user buffer */
+	u64 buf_used = 0;	/* bytes written to ubuf so far */
+	u64 cur_offset;		/* first file offset not yet reported */
+	u64 end_offset;		/* exclusive end of the requested range */
+	u64 prev_extent_end;	/* end of the last extent seen; used to detect gaps */
+	struct btrfs_key key;
+	int ret;
+
+	if (!(file->f_mode & FMODE_READ))
+		return -EBADF;
+
+	if (!S_ISREG(inode->i_mode))
+		return -EINVAL;
+
+	if (copy_from_user(&args, argp, sizeof(args)))
+		return -EFAULT;
+
+	if (!IS_ALIGNED(args.offset, sectorsize) ||
+	    !IS_ALIGNED(args.length, sectorsize))
+		return -EINVAL;
+	if (args.length == 0)
+		return -EINVAL;
+	if (args.offset + args.length < args.offset)
+		return -EOVERFLOW;	/* offset + length wrapped around */
+	if (args.buf_size < sizeof(struct btrfs_ioctl_get_csums_entry))
+		return -EINVAL;	/* no room for even one entry */
+
+	buf_limit = min_t(u64, args.buf_size, GET_CSUMS_BUF_MAX);
+	ubuf = (u8 __user *)(argp + offsetof(struct btrfs_ioctl_get_csums_args, buf));
+	cur_offset = args.offset;
+	end_offset = args.offset + args.length;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = btrfs_wait_ordered_range(bi, cur_offset, args.length);
+	if (ret)
+		return ret;
+
+	btrfs_inode_lock(bi, BTRFS_ILOCK_SHARED);
+	/* Wait again with the inode lock held (presumably for I/O queued since the first wait). */
+	ret = btrfs_wait_ordered_range(bi, cur_offset, args.length);
+	if (ret)
+		goto out_unlock;
+
+	/* NODATASUM inode: report the whole range as a single NO_CSUMS entry. */
+	if (bi->flags & BTRFS_INODE_NODATASUM) {
+		struct btrfs_ioctl_get_csums_entry entry = {
+			.offset = cur_offset,
+			.length = end_offset - cur_offset,
+			.type = BTRFS_GET_CSUMS_NO_CSUMS,
+		};
+
+		if (copy_to_user(ubuf, &entry, sizeof(entry))) {
+			ret = -EFAULT;
+			goto out_unlock;
+		}
+
+		buf_used = sizeof(entry);	/* fits: buf_size was checked above */
+		cur_offset = end_offset;
+		goto done;
+	}
+
+	prev_extent_end = cur_offset;	/* nothing before the range counts as a gap */
+
+	while (cur_offset < end_offset) {
+		struct btrfs_file_extent_item *ei;
+		struct extent_buffer *leaf;
+		struct btrfs_ioctl_get_csums_entry entry;
+		u64 extent_end;
+		u64 disk_bytenr = 0;
+		u64 extent_offset = 0;
+		u64 range_start, range_len;
+		u64 entry_csum_size;
+		u64 key_offset;
+		int extent_type;
+		u8 compression;
+
+		/* Search for the extent at or before cur_offset. */
+		key.objectid = ino;
+		key.type = BTRFS_EXTENT_DATA_KEY;
+		key.offset = cur_offset;
+
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		if (ret < 0)
+			goto out_unlock;
+
+		if (ret > 0 && path->slots[0] > 0) {	/* look at the preceding item */
+			btrfs_item_key_to_cpu(path->nodes[0], &key,
+					      path->slots[0] - 1);
+			if (key.objectid == ino &&
+			    key.type == BTRFS_EXTENT_DATA_KEY) {
+				path->slots[0]--;
+				if (btrfs_file_extent_end(path) <= cur_offset)
+					path->slots[0]++;	/* ends at or before cur_offset: skip it */
+			}
+		}
+
+		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+			ret = btrfs_next_leaf(root, path);	/* slot ran past this leaf */
+			if (ret < 0)
+				goto out_unlock;
+			if (ret > 0) {	/* treated as end of items, not an error */
+				ret = 0;
+				btrfs_release_path(path);
+				break;
+			}
+		}
+
+		leaf = path->nodes[0];
+
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
+			btrfs_release_path(path);
+			break;	/* item is not a file extent of this inode */
+		}
+
+		extent_end = btrfs_file_extent_end(path);
+		key_offset = key.offset;
+
+		/* Read extent fields before releasing the path. */
+		ei = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_file_extent_item);
+		extent_type = btrfs_file_extent_type(leaf, ei);
+		compression = btrfs_file_extent_compression(leaf, ei);
+
+		if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
+			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+			if (disk_bytenr && compression == BTRFS_COMPRESS_NONE)
+				extent_offset = btrfs_file_extent_offset(leaf, ei);
+		}
+
+		btrfs_release_path(path);
+
+		/* Gap before this extent with no item (NO_HOLES feature): report it as ZEROED. */
+		if (prev_extent_end < key_offset) {
+			u64 hole_end = min(key_offset, end_offset);
+			u64 hole_len = hole_end - prev_extent_end;
+
+			if (prev_extent_end >= cur_offset) {
+				memset(&entry, 0, sizeof(entry));
+				entry.offset = prev_extent_end;
+				entry.length = hole_len;
+				entry.type = BTRFS_GET_CSUMS_ZEROED;
+
+				if (buf_used + sizeof(entry) > buf_limit)
+					goto done;	/* buffer full: return what we have */
+				if (copy_to_user(ubuf + buf_used, &entry,
+						 sizeof(entry))) {
+					ret = -EFAULT;
+					goto out_unlock;
+				}
+				buf_used += sizeof(entry);
+				cur_offset = hole_end;
+			}
+
+			if (key_offset >= end_offset) {	/* extent starts past the requested range */
+				cur_offset = end_offset;
+				break;
+			}
+		}
+
+		/* Clamp the extent to the part that overlaps our query range. */
+		range_start = max(cur_offset, key_offset);
+		range_len = min(extent_end, end_offset) - range_start;
+
+		memset(&entry, 0, sizeof(entry));
+		entry.offset = range_start;
+		entry.length = range_len;
+
+		if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+			entry.type = BTRFS_GET_CSUMS_NO_CSUMS;
+			entry_csum_size = 0;
+		} else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+			entry.type = BTRFS_GET_CSUMS_ZEROED;
+			entry_csum_size = 0;
+		} else {
+			/* BTRFS_FILE_EXTENT_REG */
+			if (disk_bytenr == 0) {
+				/* Explicit hole. */
+				entry.type = BTRFS_GET_CSUMS_ZEROED;
+				entry_csum_size = 0;
+			} else if (compression != BTRFS_COMPRESS_NONE) {
+				entry.type = BTRFS_GET_CSUMS_NO_CSUMS;	/* csums cover compressed bytes */
+				entry_csum_size = 0;
+			} else {
+				entry.type = BTRFS_GET_CSUMS_HAS_CSUMS;
+				entry_csum_size = (range_len >> fs_info->sectorsize_bits) * csum_size;
+			}
+		}
+
+		/* Check if this entry (+ csum data) fits in the buffer. */
+		if (buf_used + sizeof(entry) + entry_csum_size > buf_limit) {
+			if (buf_used == 0) {	/* even the first entry does not fit */
+				ret = -EOVERFLOW;
+				goto out_unlock;
+			}
+			goto done;
+		}
+
+		if (copy_to_user(ubuf + buf_used, &entry, sizeof(entry))) {
+			ret = -EFAULT;
+			goto out_unlock;
+		}
+		buf_used += sizeof(entry);
+
+		if (entry.type == BTRFS_GET_CSUMS_HAS_CSUMS) {
+			ret = copy_csums_to_user(fs_info,	/* csums go right after the entry */
+				disk_bytenr + extent_offset + (range_start - key_offset),
+				range_len, ubuf + buf_used);
+			if (ret)
+				goto out_unlock;
+			buf_used += entry_csum_size;
+		}
+
+		cur_offset = range_start + range_len;
+		prev_extent_end = extent_end;
+
+		if (fatal_signal_pending(current)) {
+			if (buf_used == 0) {
+				ret = -EINTR;
+				goto out_unlock;
+			}
+			goto done;	/* stop early but still report partial progress */
+		}
+
+		cond_resched();
+	}
+
+	/* Range extends past the last extent found: report the remainder as ZEROED. */
+	if (cur_offset < end_offset) {
+		struct btrfs_ioctl_get_csums_entry entry = {
+			.offset = prev_extent_end,
+			.length = end_offset - prev_extent_end,
+			.type = BTRFS_GET_CSUMS_ZEROED,
+		};
+
+		if (buf_used + sizeof(entry) <= buf_limit) {	/* else the tail stays unreported */
+			if (copy_to_user(ubuf + buf_used, &entry,
+					 sizeof(entry))) {
+				ret = -EFAULT;
+				goto out_unlock;
+			}
+			buf_used += sizeof(entry);
+			cur_offset = end_offset;
+		}
+	}
+
+done:	/* Write progress back: first unreported offset, remaining length, bytes used. */
+	args.offset = cur_offset;
+	args.length = (cur_offset < end_offset) ? end_offset - cur_offset : 0;
+	args.buf_size = buf_used;
+
+	if (copy_to_user(argp, &args, sizeof(args)))
+		ret = -EFAULT;
+
+out_unlock:
+	btrfs_inode_unlock(bi, BTRFS_ILOCK_SHARED);
+	return ret;
+}
+
 long btrfs_ioctl(struct file *file, unsigned int
 		cmd, unsigned long arg)
 {
@@ -5294,6 +5622,8 @@ long btrfs_ioctl(struct file *file, unsigned int
 #endif
 	case BTRFS_IOC_SUBVOL_SYNC_WAIT:
 		return btrfs_ioctl_subvol_sync(fs_info, argp);
+	case BTRFS_IOC_GET_CSUMS:
+		return btrfs_ioctl_get_csums(file, argp);
 #ifdef CONFIG_BTRFS_EXPERIMENTAL
 	case BTRFS_IOC_SHUTDOWN:
 		return btrfs_ioctl_shutdown(fs_info, arg);
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index 9165154a274d94..d079e8b67fd740 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -1100,6 +1100,25 @@ enum btrfs_err_code {
 	BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
 };
 
+/* Types for struct btrfs_ioctl_get_csums_entry::type */
+#define BTRFS_GET_CSUMS_HAS_CSUMS	0	/* csums for the range follow the entry */
+#define BTRFS_GET_CSUMS_ZEROED		1	/* hole, prealloc, or past the last extent */
+#define BTRFS_GET_CSUMS_NO_CSUMS	2	/* inline, compressed, or NODATASUM inode */
+
+struct btrfs_ioctl_get_csums_entry {
+	__u64 offset;		/* file offset of this range */
+	__u64 length;		/* length in bytes */
+	__u32 type;		/* BTRFS_GET_CSUMS_* type */
+	__u32 reserved;		/* padding, written as 0 by the kernel */
+};
+
+struct btrfs_ioctl_get_csums_args {
+	__u64 offset;		/* in: sector-aligned start; out: first unreported offset */
+	__u64 length;		/* in: sector-aligned length; out: length left unreported */
+	__u64 buf_size;		/* in: capacity of buf; out: bytes written to buf */
+	__u8 buf[];		/* out: entries; a HAS_CSUMS entry is followed by its csums */
+};
+
 /* Flags for IOC_SHUTDOWN, must match XFS_FSOP_GOING_FLAGS_* flags. */
 #define BTRFS_SHUTDOWN_FLAGS_DEFAULT			0x0
 #define BTRFS_SHUTDOWN_FLAGS_LOGFLUSH			0x1
@@ -1226,6 +1245,8 @@ enum btrfs_err_code {
 				     struct btrfs_ioctl_encoded_io_args)
 #define BTRFS_IOC_SUBVOL_SYNC_WAIT _IOW(BTRFS_IOCTL_MAGIC, 65, \
 					struct btrfs_ioctl_subvol_wait)
+#define BTRFS_IOC_GET_CSUMS _IOWR(BTRFS_IOCTL_MAGIC, 66, \
+				  struct btrfs_ioctl_get_csums_args)
 
 /* Shutdown ioctl should follow XFS's interfaces, thus not using btrfs magic. */
 #define BTRFS_IOC_SHUTDOWN	_IOR('X', 125, __u32)
-- 
2.52.0


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] btrfs: add BTRFS_IOC_GET_CSUMS ioctl
  2026-04-08 17:46 [PATCH v2] btrfs: add BTRFS_IOC_GET_CSUMS ioctl Mark Harmstone
@ 2026-04-08 17:51 ` Mark Harmstone
  2026-04-09 11:08 ` Qu Wenruo
  1 sibling, 0 replies; 6+ messages in thread
From: Mark Harmstone @ 2026-04-08 17:51 UTC (permalink / raw)
  To: linux-btrfs, wqu, boris

The only change here is renaming SPARSE to ZEROED, to make it clearer 
what the meaning is.

Qu suggested that we could make the output more structured, i.e. not 
"__u8 buf[];", but we can return multiple csum entries in one call. So 
for instance we could return [ZEROED, HAS_CSUMS, NO_CSUMS], for a file 
with a sparse extent at the start, an uncompressed extent, and then a 
compressed extent.

The progs PR https://github.com/kdave/btrfs-progs/pull/1096 needs to be 
updated to rename SPARSE to ZEROED... but it also conflicts with Qu's PR 
https://github.com/kdave/btrfs-progs/pull/1103, so one or the other has 
to be rebased anyway.

On 08/04/2026 6.46 pm, Mark Harmstone wrote:
> Add a new unprivileged BTRFS_IOC_GET_CSUMS ioctl, which can be used to
> query the on-disk csums for a file.
> 
> This is done by userspace passing a struct btrfs_ioctl_get_csums_args to
> the kernel, which details the offset and length we're interested in, and
> a buffer for the kernel to write its results into. The kernel writes a
> struct btrfs_ioctl_get_csums_entry into the buffer, followed by the
> csums if available.
> 
> If the extent is an uncompressed, non-nodatasum extent, the kernel sets
> the entry type to BTRFS_GET_CSUMS_HAS_CSUMS and follows it with the
> csums. If it is sparse, preallocated, or beyond the EOF, it sets the
> type to BTRFS_GET_CSUMS_ZEROED - this is so userspace knows it can use
> the precomputed hash of the zero sector. Otherwise, it sets the type to
> BTRFS_GET_CSUMS_NO_CSUMS.
> 
> We do store the csums of compressed extents, but we deliberately don't
> return them here: they're hashed over the compressed data, not the
> uncompressed data that's returned to userspace.
> 
> The main use case for this is for speeding up mkfs.btrfs --rootdir. For
> the case when the source FS is btrfs and using the same csum algorithm,
> we can avoid having to recalculate the csums - in my synthetic
> benchmarks (16GB file on a spinning-rust drive), this resulted in a ~11%
> speed-up (218s to 196s).
> 
> When using the --reflink option added in btrfs-progs v6.16.1, we can forgo
> reading the data entirely, resulting a ~2200% speed-up on the same test
> (128s to 6s).
> 
>      # mkdir rootdir
>      # dd if=/dev/urandom of=rootdir/file bs=4096 count=4194304
> 
>      (without ioctl)
>      # echo 3 > /proc/sys/vm/drop_caches
>      # time mkfs.btrfs --rootdir rootdir testimg
>      ...
>      real    3m37.965s
>      user    0m5.496s
>      sys     0m6.125s
> 
>      # echo 3 > /proc/sys/vm/drop_caches
>      # time mkfs.btrfs --rootdir rootdir --reflink testimg
>      ...
>      real    2m8.342s
>      user    0m5.472s
>      sys     0m1.667s
> 
>      (with ioctl)
>      # echo 3 > /proc/sys/vm/drop_caches
>      # time mkfs.btrfs --rootdir rootdir testimg
>      ...
>      real    3m15.865s
>      user    0m4.258s
>      sys     0m6.261s
> 
>      # echo 3 > /proc/sys/vm/drop_caches
>      # time mkfs.btrfs --rootdir rootdir --reflink testimg
>      ...
>      real    0m5.847s
>      user    0m2.899s
>      sys     0m0.097s
> 
> Signed-off-by: Mark Harmstone <mark@harmstone.com>
> ---
>   fs/btrfs/ioctl.c           | 330 +++++++++++++++++++++++++++++++++++++
>   include/uapi/linux/btrfs.h |  21 +++
>   2 files changed, 351 insertions(+)
> 
> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> index b2e447f5005c16..5cdda33eeaf05a 100644
> --- a/fs/btrfs/ioctl.c
> +++ b/fs/btrfs/ioctl.c
> @@ -56,6 +56,7 @@
>   #include "uuid-tree.h"
>   #include "ioctl.h"
>   #include "file.h"
> +#include "file-item.h"
>   #include "scrub.h"
>   #include "super.h"
>   
> @@ -5139,6 +5140,333 @@ static int btrfs_ioctl_shutdown(struct btrfs_fs_info *fs_info, unsigned long arg
>   }
>   #endif
>   
> +#define GET_CSUMS_BUF_MAX	(16 * 1024 * 1024)
> +
> +static int copy_csums_to_user(struct btrfs_fs_info *fs_info, u64 disk_bytenr,
> +			      u64 len, u8 __user *buf)
> +{
> +	struct btrfs_root *csum_root;
> +	struct btrfs_ordered_sum *sums;
> +	LIST_HEAD(list);
> +	const u32 csum_size = fs_info->csum_size;
> +	int ret;
> +
> +	csum_root = btrfs_csum_root(fs_info, disk_bytenr);
> +
> +	ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
> +				      disk_bytenr + len - 1, &list, false);
> +	if (ret < 0)
> +		return ret;
> +
> +	/* Clear the output buffer to handle potential gaps in csum coverage. */
> +	if (clear_user(buf, (len >> fs_info->sectorsize_bits) * csum_size)) {
> +		ret = -EFAULT;
> +		goto out;
> +	}
> +
> +	ret = 0;
> +	while (!list_empty(&list)) {
> +		u64 offset;
> +		size_t copy_size;
> +
> +		sums = list_first_entry(&list, struct btrfs_ordered_sum, list);
> +		list_del(&sums->list);
> +
> +		offset = ((sums->logical - disk_bytenr) >> fs_info->sectorsize_bits) * csum_size;
> +		copy_size = (sums->len >> fs_info->sectorsize_bits) * csum_size;
> +
> +		if (copy_to_user(buf + offset, sums->sums, copy_size)) {
> +			kfree(sums);
> +			ret = -EFAULT;
> +			goto out;
> +		}
> +
> +		kfree(sums);
> +	}
> +
> +out:
> +	while (!list_empty(&list)) {
> +		sums = list_first_entry(&list, struct btrfs_ordered_sum, list);
> +		list_del(&sums->list);
> +		kfree(sums);
> +	}
> +	return ret;
> +}
> +
> +static int btrfs_ioctl_get_csums(struct file *file, void __user *argp)
> +{
> +	struct inode *inode = file_inode(file);
> +	struct btrfs_inode *bi = BTRFS_I(inode);
> +	struct btrfs_fs_info *fs_info = bi->root->fs_info;
> +	struct btrfs_root *root = bi->root;
> +	struct btrfs_ioctl_get_csums_args args;
> +	BTRFS_PATH_AUTO_FREE(path);
> +	const u64 ino = btrfs_ino(bi);
> +	const u32 sectorsize = fs_info->sectorsize;
> +	const u32 csum_size = fs_info->csum_size;
> +	u8 __user *ubuf;
> +	u64 buf_limit;
> +	u64 buf_used = 0;
> +	u64 cur_offset;
> +	u64 end_offset;
> +	u64 prev_extent_end;
> +	struct btrfs_key key;
> +	int ret;
> +
> +	if (!(file->f_mode & FMODE_READ))
> +		return -EBADF;
> +
> +	if (!S_ISREG(inode->i_mode))
> +		return -EINVAL;
> +
> +	if (copy_from_user(&args, argp, sizeof(args)))
> +		return -EFAULT;
> +
> +	if (!IS_ALIGNED(args.offset, sectorsize) ||
> +	    !IS_ALIGNED(args.length, sectorsize))
> +		return -EINVAL;
> +	if (args.length == 0)
> +		return -EINVAL;
> +	if (args.offset + args.length < args.offset)
> +		return -EOVERFLOW;
> +	if (args.buf_size < sizeof(struct btrfs_ioctl_get_csums_entry))
> +		return -EINVAL;
> +
> +	buf_limit = min_t(u64, args.buf_size, GET_CSUMS_BUF_MAX);
> +	ubuf = (u8 __user *)(argp + offsetof(struct btrfs_ioctl_get_csums_args, buf));
> +	cur_offset = args.offset;
> +	end_offset = args.offset + args.length;
> +
> +	path = btrfs_alloc_path();
> +	if (!path)
> +		return -ENOMEM;
> +
> +	ret = btrfs_wait_ordered_range(bi, cur_offset, args.length);
> +	if (ret)
> +		return ret;
> +
> +	btrfs_inode_lock(bi, BTRFS_ILOCK_SHARED);
> +
> +	ret = btrfs_wait_ordered_range(bi, cur_offset, args.length);
> +	if (ret)
> +		goto out_unlock;
> +
> +	/* NODATASUM early exit. */
> +	if (bi->flags & BTRFS_INODE_NODATASUM) {
> +		struct btrfs_ioctl_get_csums_entry entry = {
> +			.offset = cur_offset,
> +			.length = end_offset - cur_offset,
> +			.type = BTRFS_GET_CSUMS_NO_CSUMS,
> +		};
> +
> +		if (copy_to_user(ubuf, &entry, sizeof(entry))) {
> +			ret = -EFAULT;
> +			goto out_unlock;
> +		}
> +
> +		buf_used = sizeof(entry);
> +		cur_offset = end_offset;
> +		goto done;
> +	}
> +
> +	prev_extent_end = cur_offset;
> +
> +	while (cur_offset < end_offset) {
> +		struct btrfs_file_extent_item *ei;
> +		struct extent_buffer *leaf;
> +		struct btrfs_ioctl_get_csums_entry entry;
> +		u64 extent_end;
> +		u64 disk_bytenr = 0;
> +		u64 extent_offset = 0;
> +		u64 range_start, range_len;
> +		u64 entry_csum_size;
> +		u64 key_offset;
> +		int extent_type;
> +		u8 compression;
> +
> +		/* Search for the extent at or before cur_offset. */
> +		key.objectid = ino;
> +		key.type = BTRFS_EXTENT_DATA_KEY;
> +		key.offset = cur_offset;
> +
> +		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
> +		if (ret < 0)
> +			goto out_unlock;
> +
> +		if (ret > 0 && path->slots[0] > 0) {
> +			btrfs_item_key_to_cpu(path->nodes[0], &key,
> +					      path->slots[0] - 1);
> +			if (key.objectid == ino &&
> +			    key.type == BTRFS_EXTENT_DATA_KEY) {
> +				path->slots[0]--;
> +				if (btrfs_file_extent_end(path) <= cur_offset)
> +					path->slots[0]++;
> +			}
> +		}
> +
> +		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
> +			ret = btrfs_next_leaf(root, path);
> +			if (ret < 0)
> +				goto out_unlock;
> +			if (ret > 0) {
> +				ret = 0;
> +				btrfs_release_path(path);
> +				break;
> +			}
> +		}
> +
> +		leaf = path->nodes[0];
> +
> +		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
> +		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
> +			btrfs_release_path(path);
> +			break;
> +		}
> +
> +		extent_end = btrfs_file_extent_end(path);
> +		key_offset = key.offset;
> +
> +		/* Read extent fields before releasing the path. */
> +		ei = btrfs_item_ptr(leaf, path->slots[0],
> +				    struct btrfs_file_extent_item);
> +		extent_type = btrfs_file_extent_type(leaf, ei);
> +		compression = btrfs_file_extent_compression(leaf, ei);
> +
> +		if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
> +			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
> +			if (disk_bytenr && compression == BTRFS_COMPRESS_NONE)
> +				extent_offset = btrfs_file_extent_offset(leaf, ei);
> +		}
> +
> +		btrfs_release_path(path);
> +
> +		/* Implicit hole (NO_HOLES feature). */
> +		if (prev_extent_end < key_offset) {
> +			u64 hole_end = min(key_offset, end_offset);
> +			u64 hole_len = hole_end - prev_extent_end;
> +
> +			if (prev_extent_end >= cur_offset) {
> +				memset(&entry, 0, sizeof(entry));
> +				entry.offset = prev_extent_end;
> +				entry.length = hole_len;
> +				entry.type = BTRFS_GET_CSUMS_ZEROED;
> +
> +				if (buf_used + sizeof(entry) > buf_limit)
> +					goto done;
> +				if (copy_to_user(ubuf + buf_used, &entry,
> +						 sizeof(entry))) {
> +					ret = -EFAULT;
> +					goto out_unlock;
> +				}
> +				buf_used += sizeof(entry);
> +				cur_offset = hole_end;
> +			}
> +
> +			if (key_offset >= end_offset) {
> +				cur_offset = end_offset;
> +				break;
> +			}
> +		}
> +
> +		/* Clamp to our query range. */
> +		range_start = max(cur_offset, key_offset);
> +		range_len = min(extent_end, end_offset) - range_start;
> +
> +		memset(&entry, 0, sizeof(entry));
> +		entry.offset = range_start;
> +		entry.length = range_len;
> +
> +		if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
> +			entry.type = BTRFS_GET_CSUMS_NO_CSUMS;
> +			entry_csum_size = 0;
> +		} else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
> +			entry.type = BTRFS_GET_CSUMS_ZEROED;
> +			entry_csum_size = 0;
> +		} else {
> +			/* BTRFS_FILE_EXTENT_REG */
> +			if (disk_bytenr == 0) {
> +				/* Explicit hole. */
> +				entry.type = BTRFS_GET_CSUMS_ZEROED;
> +				entry_csum_size = 0;
> +			} else if (compression != BTRFS_COMPRESS_NONE) {
> +				entry.type = BTRFS_GET_CSUMS_NO_CSUMS;
> +				entry_csum_size = 0;
> +			} else {
> +				entry.type = BTRFS_GET_CSUMS_HAS_CSUMS;
> +				entry_csum_size = (range_len >> fs_info->sectorsize_bits) * csum_size;
> +			}
> +		}
> +
> +		/* Check if this entry (+ csum data) fits in the buffer. */
> +		if (buf_used + sizeof(entry) + entry_csum_size > buf_limit) {
> +			if (buf_used == 0) {
> +				ret = -EOVERFLOW;
> +				goto out_unlock;
> +			}
> +			goto done;
> +		}
> +
> +		if (copy_to_user(ubuf + buf_used, &entry, sizeof(entry))) {
> +			ret = -EFAULT;
> +			goto out_unlock;
> +		}
> +		buf_used += sizeof(entry);
> +
> +		if (entry.type == BTRFS_GET_CSUMS_HAS_CSUMS) {
> +			ret = copy_csums_to_user(fs_info,
> +				disk_bytenr + extent_offset + (range_start - key_offset),
> +				range_len, ubuf + buf_used);
> +			if (ret)
> +				goto out_unlock;
> +			buf_used += entry_csum_size;
> +		}
> +
> +		cur_offset = range_start + range_len;
> +		prev_extent_end = extent_end;
> +
> +		if (fatal_signal_pending(current)) {
> +			if (buf_used == 0) {
> +				ret = -EINTR;
> +				goto out_unlock;
> +			}
> +			goto done;
> +		}
> +
> +		cond_resched();
> +	}
> +
> +	/* Handle trailing implicit hole. */
> +	if (cur_offset < end_offset) {
> +		struct btrfs_ioctl_get_csums_entry entry = {
> +			.offset = prev_extent_end,
> +			.length = end_offset - prev_extent_end,
> +			.type = BTRFS_GET_CSUMS_ZEROED,
> +		};
> +
> +		if (buf_used + sizeof(entry) <= buf_limit) {
> +			if (copy_to_user(ubuf + buf_used, &entry,
> +					 sizeof(entry))) {
> +				ret = -EFAULT;
> +				goto out_unlock;
> +			}
> +			buf_used += sizeof(entry);
> +			cur_offset = end_offset;
> +		}
> +	}
> +
> +done:
> +	args.offset = cur_offset;
> +	args.length = (cur_offset < end_offset) ? end_offset - cur_offset : 0;
> +	args.buf_size = buf_used;
> +
> +	if (copy_to_user(argp, &args, sizeof(args)))
> +		ret = -EFAULT;
> +
> +out_unlock:
> +	btrfs_inode_unlock(bi, BTRFS_ILOCK_SHARED);
> +	return ret;
> +}
> +
>   long btrfs_ioctl(struct file *file, unsigned int
>   		cmd, unsigned long arg)
>   {
> @@ -5294,6 +5622,8 @@ long btrfs_ioctl(struct file *file, unsigned int
>   #endif
>   	case BTRFS_IOC_SUBVOL_SYNC_WAIT:
>   		return btrfs_ioctl_subvol_sync(fs_info, argp);
> +	case BTRFS_IOC_GET_CSUMS:
> +		return btrfs_ioctl_get_csums(file, argp);
>   #ifdef CONFIG_BTRFS_EXPERIMENTAL
>   	case BTRFS_IOC_SHUTDOWN:
>   		return btrfs_ioctl_shutdown(fs_info, arg);
> diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
> index 9165154a274d94..d079e8b67fd740 100644
> --- a/include/uapi/linux/btrfs.h
> +++ b/include/uapi/linux/btrfs.h
> @@ -1100,6 +1100,25 @@ enum btrfs_err_code {
>   	BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
>   };
>   
> +/* Types for struct btrfs_ioctl_get_csums_entry::type */
> +#define BTRFS_GET_CSUMS_HAS_CSUMS	0
> +#define BTRFS_GET_CSUMS_ZEROED		1
> +#define BTRFS_GET_CSUMS_NO_CSUMS	2
> +
> +struct btrfs_ioctl_get_csums_entry {
> +	__u64 offset;		/* file offset of this range */
> +	__u64 length;		/* length in bytes */
> +	__u32 type;		/* BTRFS_GET_CSUMS_* type */
> +	__u32 reserved;		/* padding, must be 0 */
> +};
> +
> +struct btrfs_ioctl_get_csums_args {
> +	__u64 offset;		/* in/out: file offset */
> +	__u64 length;		/* in/out: range length */
> +	__u64 buf_size;		/* in/out: buffer capacity / bytes written */
> +	__u8 buf[];		/* out: entries + csum data */
> +};
> +
>   /* Flags for IOC_SHUTDOWN, must match XFS_FSOP_GOING_FLAGS_* flags. */
>   #define BTRFS_SHUTDOWN_FLAGS_DEFAULT			0x0
>   #define BTRFS_SHUTDOWN_FLAGS_LOGFLUSH			0x1
> @@ -1226,6 +1245,8 @@ enum btrfs_err_code {
>   				     struct btrfs_ioctl_encoded_io_args)
>   #define BTRFS_IOC_SUBVOL_SYNC_WAIT _IOW(BTRFS_IOCTL_MAGIC, 65, \
>   					struct btrfs_ioctl_subvol_wait)
> +#define BTRFS_IOC_GET_CSUMS _IOWR(BTRFS_IOCTL_MAGIC, 66, \
> +				  struct btrfs_ioctl_get_csums_args)
>   
>   /* Shutdown ioctl should follow XFS's interfaces, thus not using btrfs magic. */
>   #define BTRFS_IOC_SHUTDOWN	_IOR('X', 125, __u32)


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] btrfs: add BTRFS_IOC_GET_CSUMS ioctl
  2026-04-08 17:46 [PATCH v2] btrfs: add BTRFS_IOC_GET_CSUMS ioctl Mark Harmstone
  2026-04-08 17:51 ` Mark Harmstone
@ 2026-04-09 11:08 ` Qu Wenruo
  2026-04-13 13:14   ` Mark Harmstone
  1 sibling, 1 reply; 6+ messages in thread
From: Qu Wenruo @ 2026-04-09 11:08 UTC (permalink / raw)
  To: Mark Harmstone, linux-btrfs, boris

在 2026/4/9 03:16, Mark Harmstone 写道:
> Add a new unprivileged BTRFS_IOC_GET_CSUMS ioctl, which can be used to
> query the on-disk csums for a file.

After some more discussion, now I understand why you want an 
unprivileged ioctl instead of splitting the workload into fiemap + csum 
tree search ioctl.

You want to do extra permission checks, which is impossible for the csum 
tree search ioctl.

And if we allow unprivileged csum tree search, it will expose all the 
data checksum to an attacker.
The csum itself is not enough to re-construct the plaintext even for the 
weakest CRC32C.

But it is still enough info to know other aspects of some data, e.g. if 
some blocks are all zero, or some two blocks are (possibly) the same etc.

Not sure if you want to include some short words on this design decision 
though.

> 
> This is done by userspace passing a struct btrfs_ioctl_get_csums_args to
> the kernel, which details the offset and length we're interested in, and
> a buffer for the kernel to write its results into. The kernel writes a
> struct btrfs_ioctl_get_csums_entry into the buffer, followed by the
> csums if available.
> 
> If the extent is an uncompressed, non-nodatasum extent, the kernel sets
> the entry type to BTRFS_GET_CSUMS_HAS_CSUMS and follows it with the
> csums. If it is sparse, preallocated, or beyond the EOF, it sets the
> type to BTRFS_GET_CSUMS_ZEROED - this is so userspace knows it can use
> the precomputed hash of the zero sector.

Well, for mkfs it's going to skip the range as a hole, which is even 
faster than using any precalculated csum.

That said, keeping the ZEROED flag may be useful for future users, so I 
would not mind keeping this flag.

> Otherwise, it sets the type to
> BTRFS_GET_CSUMS_NO_CSUMS.
> 
> We do store the csums of compressed extents, but we deliberately don't
> return them here: they're hashed over the compressed data, not the
> uncompressed data that's returned to userspace.

Considering we're already treating prealloc/hole with a dedicated ZEROED 
flag, just to keep things consistent, it may be better to provide an 
ENCODED flag, to indicate the range is either compressed or encrypted 
(for the upcoming encryption feature).

We still don't provide the csum, but just let user space know why.

>   
> +#define GET_CSUMS_BUF_MAX	(16 * 1024 * 1024)

SZ_16M.

[...]
>   long btrfs_ioctl(struct file *file, unsigned int
>   		cmd, unsigned long arg)
>   {
> @@ -5294,6 +5622,8 @@ long btrfs_ioctl(struct file *file, unsigned int
>   #endif
>   	case BTRFS_IOC_SUBVOL_SYNC_WAIT:
>   		return btrfs_ioctl_subvol_sync(fs_info, argp);
> +	case BTRFS_IOC_GET_CSUMS:
> +		return btrfs_ioctl_get_csums(file, argp);
>   #ifdef CONFIG_BTRFS_EXPERIMENTAL
>   	case BTRFS_IOC_SHUTDOWN:
>   		return btrfs_ioctl_shutdown(fs_info, arg);
> diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
> index 9165154a274d94..d079e8b67fd740 100644
> --- a/include/uapi/linux/btrfs.h
> +++ b/include/uapi/linux/btrfs.h
> @@ -1100,6 +1100,25 @@ enum btrfs_err_code {
>   	BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
>   };
>   
> +/* Types for struct btrfs_ioctl_get_csums_entry::type */
> +#define BTRFS_GET_CSUMS_HAS_CSUMS	0
> +#define BTRFS_GET_CSUMS_ZEROED		1
> +#define BTRFS_GET_CSUMS_NO_CSUMS	2
> +
> +struct btrfs_ioctl_get_csums_entry {
> +	__u64 offset;		/* file offset of this range */
> +	__u64 length;		/* length in bytes */
> +	__u32 type;		/* BTRFS_GET_CSUMS_* type */
> +	__u32 reserved;		/* padding, must be 0 */
> +};
> +
> +struct btrfs_ioctl_get_csums_args {
> +	__u64 offset;		/* in/out: file offset */
> +	__u64 length;		/* in/out: range length */
> +	__u64 buf_size;		/* in/out: buffer capacity / bytes written */
> +	__u8 buf[];		/* out: entries + csum data */

Maybe you want to push more explanation on the output buffer format.

The resulted buffer would be something like the following example:

Input:

  inode has [0, 4K) hole, [4K, 12K) data, isize 12K.

  args.offset = 0
  args.length = 1M
  args.buf_size = 1M

Output:

  args.offset = 0
  args.length = 1M
  args.buf_size = buf_size_out
  buf:

  | [0, 4K) ZEROED | [4K, 12K) HAS_CSUM | CSUM | [12K, 1M) ZEROED |
  |<------------------------ buf_size_out ----------------------->|

I suggest this because it took me some time to understand the output 
buffer format from the code, which is different from my initial impression.

Another thing is, it may be better to add a flag/version member to 
btrfs_ioctl_get_csums_args.

If we need to add extra flags to entry->type, or utilize the reserved 
entry padding for something, or even introduce some new behavior to the 
output buffer format, we must have a way to tell the end users.

Otherwise looks good to me.

Thanks,
Qu

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] btrfs: add BTRFS_IOC_GET_CSUMS ioctl
  2026-04-09 11:08 ` Qu Wenruo
@ 2026-04-13 13:14   ` Mark Harmstone
  2026-04-13 14:12     ` Daniel Vacek
  0 siblings, 1 reply; 6+ messages in thread
From: Mark Harmstone @ 2026-04-13 13:14 UTC (permalink / raw)
  To: Qu Wenruo, linux-btrfs, boris; +Cc: neelx

On 09/04/2026 12.08 pm, Qu Wenruo wrote:
> 
> 
> 在 2026/4/9 03:16, Mark Harmstone 写道:
>> Add a new unprivileged BTRFS_IOC_GET_CSUMS ioctl, which can be used to
>> query the on-disk csums for a file.
> 
> After some more discussion, now I understand why you want an 
> unprivileged ioctl instead of splitting the workload into fiemap + csum 
> tree search ioctl.
> 
> You want to do extra permission checks, which is impossible for the csum 
> tree search ioctl.
> 
> And if we allow unprivileged csum tree search, it will expose all the 
> data checksum to an attacker.
> The csum itself is not enough to re-construct the plaintext even for the 
> weakest CRC32C.
> 
> But it is still enough info to know other aspects of some data, e.g. if 
> some blocks are all zero, or some two blocks are (possibly) the same etc.
> 
> Not sure if you want to include some short words on this design decision 
> though.

That's correct. I'll make sure the description is clearer for v3.

The reason arbitrary csums can't be returned to unprivileged users is 
principally because it is a good indication that a known block is 
somewhere on the system. An attack vector might be an unprivileged user 
scanning the whole filesystem to see if a known vulnerable ELF file is 
installed.

>> This is done by userspace passing a struct btrfs_ioctl_get_csums_args to
>> the kernel, which details the offset and length we're interested in, and
>> a buffer for the kernel to write its results into. The kernel writes a
>> struct btrfs_ioctl_get_csums_entry into the buffer, followed by the
>> csums if available.
>>
>> If the extent is an uncompressed, non-nodatasum extent, the kernel sets
>> the entry type to BTRFS_GET_CSUMS_HAS_CSUMS and follows it with the
>> csums. If it is sparse, preallocated, or beyond the EOF, it sets the
>> type to BTRFS_GET_CSUMS_ZEROED - this is so userspace knows it can use
>> the precomputed hash of the zero sector.
> 
> Well, for mkfs it's going to skip the range as a hole, which is even 
> faster than using any precalculated csum.
> 
> Although keeping the ZEROED flag may be useful for future users, I would 
> not mind to keep this flag.

I wrote this before you added your hole-scanning code to mkfs, but I'm 
keeping this because it's cheap and it might be useful... and you have 
to handle the case where userspace asks for the csum of a hole regardless.

One non-mkfs use that springs to mind is that XORing all the csums of a 
file together should give you a pretty fast ad-hoc checksum.

>> Otherwise, it sets the type to
>> BTRFS_GET_CSUMS_NO_CSUMS.
>>
>> We do store the csums of compressed extents, but we deliberately don't
>> return them here: they're hashed over the compressed data, not the
>> uncompressed data that's returned to userspace.
> 
> Consdiering we're already treating prealloc/hole with a dedicated ZEROED 
> flag, just to keep things consistent, it may be better to provide a 
> ENCODED flag, to indicate the range is either compressed or encrypted 
> for the incoming encyrption feature.
> 
> We still don't provide the csum, but just let the user space to know why.

This is free, so there's no reason not to.

At any rate, it ought to be erroring out if the encryption field is set 
on the file extent. From the looks of things Daniel's current encryption 
work is setting this.

There might one day be a use case for returning the csums of an 
encrypted extent, but as with compressed extents this shouldn't be done 
by default. At least there would be a one-to-one mapping between file 
blocks and encrypted sectors, so this could be done by extending this 
interface.

>> +#define GET_CSUMS_BUF_MAX    (16 * 1024 * 1024)
> 
> SZ_16M.
> 
> [...]
>>   long btrfs_ioctl(struct file *file, unsigned int
>>           cmd, unsigned long arg)
>>   {
>> @@ -5294,6 +5622,8 @@ long btrfs_ioctl(struct file *file, unsigned int
>>   #endif
>>       case BTRFS_IOC_SUBVOL_SYNC_WAIT:
>>           return btrfs_ioctl_subvol_sync(fs_info, argp);
>> +    case BTRFS_IOC_GET_CSUMS:
>> +        return btrfs_ioctl_get_csums(file, argp);
>>   #ifdef CONFIG_BTRFS_EXPERIMENTAL
>>       case BTRFS_IOC_SHUTDOWN:
>>           return btrfs_ioctl_shutdown(fs_info, arg);
>> diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
>> index 9165154a274d94..d079e8b67fd740 100644
>> --- a/include/uapi/linux/btrfs.h
>> +++ b/include/uapi/linux/btrfs.h
>> @@ -1100,6 +1100,25 @@ enum btrfs_err_code {
>>       BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
>>   };
>> +/* Types for struct btrfs_ioctl_get_csums_entry::type */
>> +#define BTRFS_GET_CSUMS_HAS_CSUMS    0
>> +#define BTRFS_GET_CSUMS_ZEROED        1
>> +#define BTRFS_GET_CSUMS_NO_CSUMS    2
>> +
>> +struct btrfs_ioctl_get_csums_entry {
>> +    __u64 offset;        /* file offset of this range */
>> +    __u64 length;        /* length in bytes */
>> +    __u32 type;        /* BTRFS_GET_CSUMS_* type */
>> +    __u32 reserved;        /* padding, must be 0 */
>> +};
>> +
>> +struct btrfs_ioctl_get_csums_args {
>> +    __u64 offset;        /* in/out: file offset */
>> +    __u64 length;        /* in/out: range length */
>> +    __u64 buf_size;        /* in/out: buffer capacity / bytes written */
>> +    __u8 buf[];        /* out: entries + csum data */
> 
> Maybe you want to push more explanation on the output buffer format.
> 
> The resulted buffer would be something like the following example:
> 
> Input:
> 
>   inode has [0, 4K) hole, [4K, 12K) data, isize 12K.
> 
>   args.offset = 0
>   args.length = 1M
>   args.buf_size = 1M
> 
> Output:
> 
>   args.offset = 0
>   args.length = 1M
>   args.buf_size = buf_size_out
>   buf:
> 
>   | [0, 4K) ZEROED | [4K, 12K) HAS_CSUM | CSUM | [12K, 1M) ZEROED |
>   |<------------------------ buf_size_out ----------------------->|
> 
> As it takes me some time to understand the output buffer format from the 
> code, which is different from my initial impression.

Yes, that's right.

> Another thing is, it may be better to add a flag/version member to 
> btrfs_ioctl_get_csums_args.
> 
> If we need to add extra flags to entry->type, or utilize the reserved 
> entry padding for something, or even introduce some new behavior to the 
> output buffer format, we must have a way to tell the end users.

No objections here, that sounds like a good idea. Thanks Qu. One obvious 
future flag would be to return the csums of encrypted extents rather 
than ignoring them, if there was a use case for this.

> Otherwise looks good to me.
> 
> Thanks,
> Qu


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] btrfs: add BTRFS_IOC_GET_CSUMS ioctl
  2026-04-13 13:14   ` Mark Harmstone
@ 2026-04-13 14:12     ` Daniel Vacek
  2026-04-13 14:31       ` Mark Harmstone
  0 siblings, 1 reply; 6+ messages in thread
From: Daniel Vacek @ 2026-04-13 14:12 UTC (permalink / raw)
  To: Mark Harmstone; +Cc: Qu Wenruo, linux-btrfs, boris

On Mon, 13 Apr 2026 at 15:14, Mark Harmstone <mark@harmstone.com> wrote:
> On 09/04/2026 12.08 pm, Qu Wenruo wrote:
> > 在 2026/4/9 03:16, Mark Harmstone 写道:
> >> Add a new unprivileged BTRFS_IOC_GET_CSUMS ioctl, which can be used to
> >> query the on-disk csums for a file.
> >
> > After some more discussion, now I understand why you want an
> > unprivileged ioctl instead of splitting the workload into fiemap + csum
> > tree search ioctl.
> >
> > You want to do extra permission checks, which is impossible for the csum
> > tree search ioctl.
> >
> > And if we allow unprivileged csum tree search, it will expose all the
> > data checksum to an attacker.
> > The csum itself is not enough to re-construct the plaintext even for the
> > weakest CRC32C.
> >
> > But it is still enough info to know other aspects of some data, e.g. if
> > some blocks are all zero, or some two blocks are (possibly) the same etc.
> >
> > Not sure if you want to include some short words on this design decision
> > though.
>
> That's correct. I'll make sure the description is clearer for v3.
>
> The reason arbitrary csums can't be returned to unprivileged users is
> principally because it is a good indication that a known block is
> somewhere on the system. An attack vector might be an unprivileged user
> scanning the whole filesystem to see if a known vulnerable ELF file is
> installed.
>
> >> This is done by userspace passing a struct btrfs_ioctl_get_csums_args to
> >> the kernel, which details the offset and length we're interested in, and
> >> a buffer for the kernel to write its results into. The kernel writes a
> >> struct btrfs_ioctl_get_csums_entry into the buffer, followed by the
> >> csums if available.
> >>
> >> If the extent is an uncompressed, non-nodatasum extent, the kernel sets
> >> the entry type to BTRFS_GET_CSUMS_HAS_CSUMS and follows it with the
> >> csums. If it is sparse, preallocated, or beyond the EOF, it sets the
> >> type to BTRFS_GET_CSUMS_ZEROED - this is so userspace knows it can use
> >> the precomputed hash of the zero sector.
> >
> > Well, for mkfs it's going to skip the range as a hole, which is even
> > faster than using any precalculated csum.
> >
> > Although keeping the ZEROED flag may be useful for future users, I would
> > not mind to keep this flag.
>
> I wrote this before you added your hole-scanning code to mkfs, but I'm
> keeping this because it's cheap and it might be useful... and you have
> to handle the case where userspace asks for the csum of a hole regardless.
>
> One non-mkfs use that springs to mind is that XORing all the csums of a
> file together should give you a pretty fast ad-hoc checksum.
>
> >> Otherwise, it sets the type to
> >> BTRFS_GET_CSUMS_NO_CSUMS.
> >>
> >> We do store the csums of compressed extents, but we deliberately don't
> >> return them here: they're hashed over the compressed data, not the
> >> uncompressed data that's returned to userspace.
> >
> > Consdiering we're already treating prealloc/hole with a dedicated ZEROED
> > flag, just to keep things consistent, it may be better to provide a
> > ENCODED flag, to indicate the range is either compressed or encrypted
> > for the incoming encyrption feature.
> >
> > We still don't provide the csum, but just let the user space to know why.
>
> This is free, so there's no reason not to.
>
> At any rate, it ought to be erroring out if the encryption field is set
> on the file extent. From the looks of things Daniel's current encryption
> work is setting this.
>
> There might one day be a use case for returning the csums of an
> encrypted extent, but as with compressed extents this shouldn't be done
> by default. At least there would be a one-to-one mapping between file
> blocks and encrypted sectors, so this could be done by extending this
> interface.

I'd say there is no reason not to return the csums. Even for
compressed or encrypted extents, you can flag them ENCODED and still
return the csum (given the permission check). Userspace is free to
ignore it.

Note that you can read/write the raw data with the
BTRFS_IOC_ENCODED_{READ|WRITE} ioctl.

Other than that it looks like a very nice speed-up.

--nX

> >> +#define GET_CSUMS_BUF_MAX    (16 * 1024 * 1024)
> >
> > SZ_16M.
> >
> > [...]
> >>   long btrfs_ioctl(struct file *file, unsigned int
> >>           cmd, unsigned long arg)
> >>   {
> >> @@ -5294,6 +5622,8 @@ long btrfs_ioctl(struct file *file, unsigned int
> >>   #endif
> >>       case BTRFS_IOC_SUBVOL_SYNC_WAIT:
> >>           return btrfs_ioctl_subvol_sync(fs_info, argp);
> >> +    case BTRFS_IOC_GET_CSUMS:
> >> +        return btrfs_ioctl_get_csums(file, argp);
> >>   #ifdef CONFIG_BTRFS_EXPERIMENTAL
> >>       case BTRFS_IOC_SHUTDOWN:
> >>           return btrfs_ioctl_shutdown(fs_info, arg);
> >> diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
> >> index 9165154a274d94..d079e8b67fd740 100644
> >> --- a/include/uapi/linux/btrfs.h
> >> +++ b/include/uapi/linux/btrfs.h
> >> @@ -1100,6 +1100,25 @@ enum btrfs_err_code {
> >>       BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
> >>   };
> >> +/* Types for struct btrfs_ioctl_get_csums_entry::type */
> >> +#define BTRFS_GET_CSUMS_HAS_CSUMS    0
> >> +#define BTRFS_GET_CSUMS_ZEROED        1
> >> +#define BTRFS_GET_CSUMS_NO_CSUMS    2
> >> +
> >> +struct btrfs_ioctl_get_csums_entry {
> >> +    __u64 offset;        /* file offset of this range */
> >> +    __u64 length;        /* length in bytes */
> >> +    __u32 type;        /* BTRFS_GET_CSUMS_* type */
> >> +    __u32 reserved;        /* padding, must be 0 */
> >> +};
> >> +
> >> +struct btrfs_ioctl_get_csums_args {
> >> +    __u64 offset;        /* in/out: file offset */
> >> +    __u64 length;        /* in/out: range length */
> >> +    __u64 buf_size;        /* in/out: buffer capacity / bytes written */
> >> +    __u8 buf[];        /* out: entries + csum data */
> >
> > Maybe you want to push more explanation on the output buffer format.
> >
> > The resulted buffer would be something like the following example:
> >
> > Input:
> >
> >   inode has [0, 4K) hole, [4K, 12K) data, isize 12K.
> >
> >   args.offset = 0
> >   args.length = 1M
> >   args.buf_size = 1M
> >
> > Output:
> >
> >   args.offset = 0
> >   args.length = 1M
> >   args.buf_size = buf_size_out
> >   buf:
> >
> >   | [0, 4K) ZEROED | [4K, 12K) HAS_CSUM | CSUM | [12K, 1M) ZEROED |
> >   |<------------------------ buf_size_out ----------------------->|
> >
> > As it takes me some time to understand the output buffer format from the
> > code, which is different from my initial impression.
>
> Yes, that's right.
>
> > Another thing is, it may be better to add a flag/version member to
> > btrfs_ioctl_get_csums_args.
> >
> > If we need to add extra flags to entry->type, or utilize the reserved
> > entry padding for something, or even introduce some new behavior to the
> > output buffer format, we must have a way to tell the end users.
>
> No objections here, that sounds like a good idea. Thanks Qu. One obvious
> future flag would be to return the csums of encrypted extents rather
> than ignoring them, if there was a use case for this.
>
> > Otherwise looks good to me.
> >
> > Thanks,
> > Qu
>

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] btrfs: add BTRFS_IOC_GET_CSUMS ioctl
  2026-04-13 14:12     ` Daniel Vacek
@ 2026-04-13 14:31       ` Mark Harmstone
  0 siblings, 0 replies; 6+ messages in thread
From: Mark Harmstone @ 2026-04-13 14:31 UTC (permalink / raw)
  To: Daniel Vacek; +Cc: Qu Wenruo, linux-btrfs, boris

On 13/04/2026 3.12 pm, Daniel Vacek wrote:
> On Mon, 13 Apr 2026 at 15:14, Mark Harmstone <mark@harmstone.com> wrote:
>> On 09/04/2026 12.08 pm, Qu Wenruo wrote:
>>> 在 2026/4/9 03:16, Mark Harmstone 写道:
>>>> Add a new unprivileged BTRFS_IOC_GET_CSUMS ioctl, which can be used to
>>>> query the on-disk csums for a file.
>>>
>>> After some more discussion, now I understand why you want an
>>> unprivileged ioctl instead of splitting the workload into fiemap + csum
>>> tree search ioctl.
>>>
>>> You want to do extra permission checks, which is impossible for the csum
>>> tree search ioctl.
>>>
>>> And if we allow unprivileged csum tree search, it will expose all the
>>> data checksum to an attacker.
>>> The csum itself is not enough to re-construct the plaintext even for the
>>> weakest CRC32C.
>>>
>>> But it is still enough info to know other aspects of some data, e.g. if
>>> some blocks are all zero, or some two blocks are (possibly) the same etc.
>>>
>>> Not sure if you want to include some short words on this design decision
>>> though.
>>
>> That's correct. I'll make sure the description is clearer for v3.
>>
>> The reason arbitrary csums can't be returned to unprivileged users is
>> principally because it is a good indication that a known block is
>> somewhere on the system. An attack vector might be an unprivileged user
>> scanning the whole filesystem to see if a known vulnerable ELF file is
>> installed.
>>
>>>> This is done by userspace passing a struct btrfs_ioctl_get_csums_args to
>>>> the kernel, which details the offset and length we're interested in, and
>>>> a buffer for the kernel to write its results into. The kernel writes a
>>>> struct btrfs_ioctl_get_csums_entry into the buffer, followed by the
>>>> csums if available.
>>>>
>>>> If the extent is an uncompressed, non-nodatasum extent, the kernel sets
>>>> the entry type to BTRFS_GET_CSUMS_HAS_CSUMS and follows it with the
>>>> csums. If it is sparse, preallocated, or beyond the EOF, it sets the
>>>> type to BTRFS_GET_CSUMS_ZEROED - this is so userspace knows it can use
>>>> the precomputed hash of the zero sector.
>>>
>>> Well, for mkfs it's going to skip the range as a hole, which is even
>>> faster than using any precalculated csum.
>>>
>>> Although keeping the ZEROED flag may be useful for future users, I would
>>> not mind to keep this flag.
>>
>> I wrote this before you added your hole-scanning code to mkfs, but I'm
>> keeping this because it's cheap and it might be useful... and you have
>> to handle the case where userspace asks for the csum of a hole regardless.
>>
>> One non-mkfs use that springs to mind is that XORing all the csums of a
>> file together should give you a pretty fast ad-hoc checksum.
>>
>>>> Otherwise, it sets the type to
>>>> BTRFS_GET_CSUMS_NO_CSUMS.
>>>>
>>>> We do store the csums of compressed extents, but we deliberately don't
>>>> return them here: they're hashed over the compressed data, not the
>>>> uncompressed data that's returned to userspace.
>>>
>>> Consdiering we're already treating prealloc/hole with a dedicated ZEROED
>>> flag, just to keep things consistent, it may be better to provide a
>>> ENCODED flag, to indicate the range is either compressed or encrypted
>>> for the incoming encyrption feature.
>>>
>>> We still don't provide the csum, but just let the user space to know why.
>>
>> This is free, so there's no reason not to.
>>
>> At any rate, it ought to be erroring out if the encryption field is set
>> on the file extent. From the looks of things Daniel's current encryption
>> work is setting this.
>>
>> There might one day be a use case for returning the csums of an
>> encrypted extent, but as with compressed extents this shouldn't be done
>> by default. At least there would be a one-to-one mapping between file
>> blocks and encrypted sectors, so this could be done by extending this
>> interface.
> 
> I'd say there is no reason not to return the csums. Even for
> compressed or encrypted extents, you can flag them ENCODED and still
> return the csum (given the permission check). Userspace is free to
> ignore it.
> 
> Note that you can read/write the raw data with the
> BTRFS_IOC_ENCODED_{READ|WRITE} ioctl.

Thanks Daniel. My concern there is that this probably won't be what 
userspace wants, so you'll be doing a copy for no reason. I think 
I'll just keep it for a later flag, so we can add this later if and when 
we need it.

One thing I'm working on at the moment is "encoded reflinks", so you can 
mark uncompressed data as being part of a compressed extent or vice 
versa. If we extended this to encryption, this would be a use case for 
this ioctl: you would be able to do mkfs.btrfs --rootdir --reflink on an 
encrypted directory without needing to have loaded the key.

> Other than that it looks like a very nice speed-up.
> 
> --nX
> 
>>>> +#define GET_CSUMS_BUF_MAX    (16 * 1024 * 1024)
>>>
>>> SZ_16M.
>>>
>>> [...]
>>>>    long btrfs_ioctl(struct file *file, unsigned int
>>>>            cmd, unsigned long arg)
>>>>    {
>>>> @@ -5294,6 +5622,8 @@ long btrfs_ioctl(struct file *file, unsigned int
>>>>    #endif
>>>>        case BTRFS_IOC_SUBVOL_SYNC_WAIT:
>>>>            return btrfs_ioctl_subvol_sync(fs_info, argp);
>>>> +    case BTRFS_IOC_GET_CSUMS:
>>>> +        return btrfs_ioctl_get_csums(file, argp);
>>>>    #ifdef CONFIG_BTRFS_EXPERIMENTAL
>>>>        case BTRFS_IOC_SHUTDOWN:
>>>>            return btrfs_ioctl_shutdown(fs_info, arg);
>>>> diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
>>>> index 9165154a274d94..d079e8b67fd740 100644
>>>> --- a/include/uapi/linux/btrfs.h
>>>> +++ b/include/uapi/linux/btrfs.h
>>>> @@ -1100,6 +1100,25 @@ enum btrfs_err_code {
>>>>        BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
>>>>    };
>>>> +/* Types for struct btrfs_ioctl_get_csums_entry::type */
>>>> +#define BTRFS_GET_CSUMS_HAS_CSUMS    0
>>>> +#define BTRFS_GET_CSUMS_ZEROED        1
>>>> +#define BTRFS_GET_CSUMS_NO_CSUMS    2
>>>> +
>>>> +struct btrfs_ioctl_get_csums_entry {
>>>> +    __u64 offset;        /* file offset of this range */
>>>> +    __u64 length;        /* length in bytes */
>>>> +    __u32 type;        /* BTRFS_GET_CSUMS_* type */
>>>> +    __u32 reserved;        /* padding, must be 0 */
>>>> +};
>>>> +
>>>> +struct btrfs_ioctl_get_csums_args {
>>>> +    __u64 offset;        /* in/out: file offset */
>>>> +    __u64 length;        /* in/out: range length */
>>>> +    __u64 buf_size;        /* in/out: buffer capacity / bytes written */
>>>> +    __u8 buf[];        /* out: entries + csum data */
>>>
>>> Maybe you want to push more explanation on the output buffer format.
>>>
>>> The resulted buffer would be something like the following example:
>>>
>>> Input:
>>>
>>>    inode has [0, 4K) hole, [4K, 12K) data, isize 12K.
>>>
>>>    args.offset = 0
>>>    args.length = 1M
>>>    args.buf_size = 1M
>>>
>>> Output:
>>>
>>>    args.offset = 0
>>>    args.length = 1M
>>>    args.buf_size = buf_size_out
>>>    buf:
>>>
>>>    | [0, 4K) ZEROED | [4K, 12K) HAS_CSUM | CSUM | [12K, 1M) ZEROED |
>>>    |<------------------------ buf_size_out ----------------------->|
>>>
>>> As it takes me some time to understand the output buffer format from the
>>> code, which is different from my initial impression.
>>
>> Yes, that's right.
>>
>>> Another thing is, it may be better to add a flag/version member to
>>> btrfs_ioctl_get_csums_args.
>>>
>>> If we need to add extra flags to entry->type, or utilize the reserved
>>> entry padding for something, or even introduce some new behavior to the
>>> output buffer format, we must have a way to tell the end users.
>>
>> No objections here, that sounds like a good idea. Thanks Qu. One obvious
>> future flag would be to return the csums of encrypted extents rather
>> than ignoring them, if there was a use case for this.
>>
>>> Otherwise looks good to me.
>>>
>>> Thanks,
>>> Qu
>>


^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2026-04-13 14:31 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-04-08 17:46 [PATCH v2] btrfs: add BTRFS_IOC_GET_CSUMS ioctl Mark Harmstone
2026-04-08 17:51 ` Mark Harmstone
2026-04-09 11:08 ` Qu Wenruo
2026-04-13 13:14   ` Mark Harmstone
2026-04-13 14:12     ` Daniel Vacek
2026-04-13 14:31       ` Mark Harmstone

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox