* Re: [PATCH 01/31] fuse: implement the basic iomap mechanisms [not found] ` <176169810371.1424854.3010195280915622081.stgit@frogsfrogsfrogs> @ 2026-01-21 19:34 ` Joanne Koong 2026-01-21 22:45 ` Darrick J. Wong 2026-02-05 19:22 ` Chris Mason 1 sibling, 1 reply; 52+ messages in thread From: Joanne Koong @ 2026-01-21 19:34 UTC (permalink / raw) To: Darrick J. Wong; +Cc: miklos, bernd, neal, linux-ext4, linux-fsdevel On Tue, Oct 28, 2025 at 5:45 PM Darrick J. Wong <djwong@kernel.org> wrote: > > From: Darrick J. Wong <djwong@kernel.org> > > Implement functions to enable upcalling of iomap_begin and iomap_end to > userspace fuse servers. > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org> > --- > fs/fuse/fuse_i.h | 22 ++ > fs/fuse/iomap_i.h | 36 ++++ > include/uapi/linux/fuse.h | 90 +++++++++ > fs/fuse/Kconfig | 32 +++ > fs/fuse/Makefile | 1 > fs/fuse/file_iomap.c | 434 +++++++++++++++++++++++++++++++++++++++++++++ > fs/fuse/inode.c | 8 + > 7 files changed, 621 insertions(+), 2 deletions(-) > create mode 100644 fs/fuse/iomap_i.h > create mode 100644 fs/fuse/file_iomap.c > > > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h > index 7c7d255d817f1e..45be59df7ae592 100644 > --- a/fs/fuse/fuse_i.h > +++ b/fs/fuse/fuse_i.h > @@ -929,6 +929,9 @@ struct fuse_conn { > /* Is synchronous FUSE_INIT allowed? */ > unsigned int sync_init:1; > > + /* Enable fs/iomap for file operations */ > + unsigned int iomap:1; > + > /* Use io_uring for communication */ > unsigned int io_uring; > > @@ -1053,12 +1056,17 @@ static inline struct fuse_mount *get_fuse_mount_super(struct super_block *sb) > return sb->s_fs_info; > } > > +static inline const struct fuse_mount *get_fuse_mount_super_c(const struct super_block *sb) > +{ > + return sb->s_fs_info; > +} I'm not seeing this getting used anywhere - did you mean to remove this? 
> + > static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) > { > return get_fuse_mount_super(sb)->fc; > } > > -static inline struct fuse_mount *get_fuse_mount(struct inode *inode) > +static inline struct fuse_mount *get_fuse_mount(const struct inode *inode) > { > return get_fuse_mount_super(inode->i_sb); > } > @@ -1683,4 +1691,16 @@ extern void fuse_sysctl_unregister(void); > #define fuse_sysctl_unregister() do { } while (0) > #endif /* CONFIG_SYSCTL */ > > +#if IS_ENABLED(CONFIG_FUSE_IOMAP) > +bool fuse_iomap_enabled(void); > + > +static inline bool fuse_has_iomap(const struct inode *inode) > +{ > + return get_fuse_conn(inode)->iomap; > +} > +#else > +# define fuse_iomap_enabled(...) (false) > +# define fuse_has_iomap(...) (false) > +#endif > + > #endif /* _FS_FUSE_I_H */ > diff --git a/fs/fuse/iomap_i.h b/fs/fuse/iomap_i.h > new file mode 100644 > index 00000000000000..d773f728579d1d > --- /dev/null > +++ b/fs/fuse/iomap_i.h > @@ -0,0 +1,36 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* > + * Copyright (C) 2025 Oracle. All Rights Reserved. > + * Author: Darrick J. 
Wong <djwong@kernel.org> > + */ > +#ifndef _FS_FUSE_IOMAP_I_H > +#define _FS_FUSE_IOMAP_I_H > + > +#if IS_ENABLED(CONFIG_FUSE_IOMAP) > +#if IS_ENABLED(CONFIG_FUSE_IOMAP_DEBUG) > +# define ASSERT(condition) do { \ > + int __cond = !!(condition); \ > + WARN(!__cond, "Assertion failed: %s, func: %s, line: %d", #condition, __func__, __LINE__); \ > +} while (0) > +# define BAD_DATA(condition) ({ \ > + int __cond = !!(condition); \ > + WARN(__cond, "Bad mapping: %s, func: %s, line: %d", #condition, __func__, __LINE__); \ > +}) > +#else > +# define ASSERT(condition) > +# define BAD_DATA(condition) ({ \ > + int __cond = !!(condition); \ > + unlikely(__cond); \ > +}) > +#endif /* CONFIG_FUSE_IOMAP_DEBUG */ > + > +enum fuse_iomap_iodir { > + READ_MAPPING, > + WRITE_MAPPING, > +}; > + > +#define EFSCORRUPTED EUCLEAN > + > +#endif /* CONFIG_FUSE_IOMAP */ > + > +#endif /* _FS_FUSE_IOMAP_I_H */ > diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h > index 18713cfaf09171..7d709cf12b41a7 100644 > --- a/include/uapi/linux/fuse.h > +++ b/include/uapi/linux/fuse.h > @@ -240,6 +240,9 @@ > * - add FUSE_COPY_FILE_RANGE_64 > * - add struct fuse_copy_file_range_out > * - add FUSE_NOTIFY_PRUNE > + * > + * 7.99 Should this be changed to something like 7.46 now that this patch is submitted for merging into the tree? > + * - add FUSE_IOMAP and iomap_{begin,end,ioend} for regular file operations > */ > > #ifndef _LINUX_FUSE_H > @@ -275,7 +278,7 @@ > #define FUSE_KERNEL_VERSION 7 > > /** Minor version number of this interface */ > -#define FUSE_KERNEL_MINOR_VERSION 45 > +#define FUSE_KERNEL_MINOR_VERSION 99 Same question here > > /** The node ID of the root inode */ > #define FUSE_ROOT_ID 1 > @@ -448,6 +451,7 @@ struct fuse_file_lock { > * FUSE_OVER_IO_URING: Indicate that client supports io-uring > * FUSE_REQUEST_TIMEOUT: kernel supports timing out requests. 
> * init_out.request_timeout contains the timeout (in secs) > + * FUSE_IOMAP: Client supports iomap for regular file operations. > */ > #define FUSE_ASYNC_READ (1 << 0) > #define FUSE_POSIX_LOCKS (1 << 1) > @@ -495,6 +499,7 @@ struct fuse_file_lock { > #define FUSE_ALLOW_IDMAP (1ULL << 40) > #define FUSE_OVER_IO_URING (1ULL << 41) > #define FUSE_REQUEST_TIMEOUT (1ULL << 42) > +#define FUSE_IOMAP (1ULL << 43) > > /** > * CUSE INIT request/reply flags > @@ -664,6 +669,9 @@ enum fuse_opcode { > FUSE_STATX = 52, > FUSE_COPY_FILE_RANGE_64 = 53, > > + FUSE_IOMAP_BEGIN = 4094, > + FUSE_IOMAP_END = 4095, > + > /* CUSE specific operations */ > CUSE_INIT = 4096, > > @@ -1314,4 +1322,84 @@ struct fuse_uring_cmd_req { > uint8_t padding[6]; > }; > > +/* mapping types; see corresponding IOMAP_TYPE_ */ > +#define FUSE_IOMAP_TYPE_HOLE (0) > +#define FUSE_IOMAP_TYPE_DELALLOC (1) > +#define FUSE_IOMAP_TYPE_MAPPED (2) > +#define FUSE_IOMAP_TYPE_UNWRITTEN (3) > +#define FUSE_IOMAP_TYPE_INLINE (4) > + > +/* fuse-specific mapping type indicating that writes use the read mapping */ > +#define FUSE_IOMAP_TYPE_PURE_OVERWRITE (255) > + > +#define FUSE_IOMAP_DEV_NULL (0U) /* null device cookie */ > + > +/* mapping flags passed back from iomap_begin; see corresponding IOMAP_F_ */ > +#define FUSE_IOMAP_F_NEW (1U << 0) > +#define FUSE_IOMAP_F_DIRTY (1U << 1) > +#define FUSE_IOMAP_F_SHARED (1U << 2) > +#define FUSE_IOMAP_F_MERGED (1U << 3) > +#define FUSE_IOMAP_F_BOUNDARY (1U << 4) > +#define FUSE_IOMAP_F_ANON_WRITE (1U << 5) > +#define FUSE_IOMAP_F_ATOMIC_BIO (1U << 6) Do you think it makes sense to have the fuse iomap constants mirror the in-kernel iomap ones? Maybe I'm mistaken but it seems like the fuse iomap capabilities won't diverge too much from fs/iomap ones? I like that if they're mirrored, then it makes it simpler instead of needing to convert back and forth. 
> + > +/* fuse-specific mapping flag asking for ->iomap_end call */ > +#define FUSE_IOMAP_F_WANT_IOMAP_END (1U << 7) > + > +/* mapping flags passed to iomap_end */ > +#define FUSE_IOMAP_F_SIZE_CHANGED (1U << 8) > +#define FUSE_IOMAP_F_STALE (1U << 9) > + > +/* operation flags from iomap; see corresponding IOMAP_* */ > +#define FUSE_IOMAP_OP_WRITE (1U << 0) > +#define FUSE_IOMAP_OP_ZERO (1U << 1) > +#define FUSE_IOMAP_OP_REPORT (1U << 2) > +#define FUSE_IOMAP_OP_FAULT (1U << 3) > +#define FUSE_IOMAP_OP_DIRECT (1U << 4) > +#define FUSE_IOMAP_OP_NOWAIT (1U << 5) > +#define FUSE_IOMAP_OP_OVERWRITE_ONLY (1U << 6) > +#define FUSE_IOMAP_OP_UNSHARE (1U << 7) > +#define FUSE_IOMAP_OP_DAX (1U << 8) > +#define FUSE_IOMAP_OP_ATOMIC (1U << 9) > +#define FUSE_IOMAP_OP_DONTCACHE (1U << 10) > + > +#define FUSE_IOMAP_NULL_ADDR (-1ULL) /* addr is not valid */ > + > +struct fuse_iomap_io { > + uint64_t offset; /* file offset of mapping, bytes */ > + uint64_t length; /* length of mapping, bytes */ > + uint64_t addr; /* disk offset of mapping, bytes */ > + uint16_t type; /* FUSE_IOMAP_TYPE_* */ > + uint16_t flags; /* FUSE_IOMAP_F_* */ > + uint32_t dev; /* device cookie */ Do you think it's a good idea to add a reserved field here in case we end up needing it in the future? 
> +}; > + > +struct fuse_iomap_begin_in { > + uint32_t opflags; /* FUSE_IOMAP_OP_* */ > + uint32_t reserved; /* zero */ > + uint64_t attr_ino; /* matches fuse_attr:ino */ > + uint64_t pos; /* file position, in bytes */ > + uint64_t count; /* operation length, in bytes */ > +}; > + > +struct fuse_iomap_begin_out { > + /* read file data from here */ > + struct fuse_iomap_io read; > + > + /* write file data to here, if applicable */ > + struct fuse_iomap_io write; Same question here > +}; > + > +struct fuse_iomap_end_in { > + uint32_t opflags; /* FUSE_IOMAP_OP_* */ > + uint32_t reserved; /* zero */ > + uint64_t attr_ino; /* matches fuse_attr:ino */ > + uint64_t pos; /* file position, in bytes */ > + uint64_t count; /* operation length, in bytes */ > + int64_t written; /* bytes processed */ On the fs/iomap side, I see that written is passed through by iomap_iter() to ->iomap_end through 'ssize_t advanced' but it's not clear to me why advanced needs to be signed. I think it used to also represent the error status, but it looks like now that's represented through iter->status and 'advanced' strictly reflects the number of bytes written. As such, do you think it makes sense to change 'advanced' to loff_t and have written be uint64_t instead? > + > + /* mapping that the kernel acted upon */ > + struct fuse_iomap_io map; > +}; > + > #endif /* _LINUX_FUSE_H */ > diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig > index 290d1c09e0b924..934d48076a010c 100644 > --- a/fs/fuse/Kconfig > +++ b/fs/fuse/Kconfig > @@ -69,6 +69,38 @@ config FUSE_PASSTHROUGH > config FUSE_BACKING > bool > > +config FUSE_IOMAP > + bool "FUSE file IO over iomap" > + default y > + depends on FUSE_FS > + depends on BLOCK > + select FS_IOMAP > + help > + Enable fuse servers to operate the regular file I/O path through > + the fs-iomap library in the kernel. 
This enables higher performance > + userspace filesystems by keeping the performance critical parts in > + the kernel while delegating the difficult metadata parsing parts to > + an easily-contained userspace program. > + > + This feature is considered EXPERIMENTAL. Use with caution! > + > + If unsure, say N. > + > +config FUSE_IOMAP_BY_DEFAULT > + bool "FUSE file I/O over iomap by default" > + default n > + depends on FUSE_IOMAP > + help > + Enable sending FUSE file I/O over iomap by default. I'm not really sure what the general linux preference is for adding new configs, but assuming it errs towards less configs than more, imo it seems easy enough to just set the enable_iomap module param to true manually instead of needing this config for it, especially since the param only needs to be set once. > + > +config FUSE_IOMAP_DEBUG > + bool "Debug FUSE file IO over iomap" > + default y > + depends on FUSE_IOMAP > + help > + Enable debugging assertions for the fuse iomap code paths and logging > + of bad iomap file mapping data being sent to the kernel. I'm wondering if it makes sense to make this a general FUSE_DEBUG config so we can reuse this more generally > + > config FUSE_IO_URING > bool "FUSE communication over io-uring" > default y > diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile > index 46041228e5be2c..27be39317701d6 100644 > --- a/fs/fuse/Makefile > +++ b/fs/fuse/Makefile > @@ -18,5 +18,6 @@ fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o > fuse-$(CONFIG_FUSE_BACKING) += backing.o > fuse-$(CONFIG_SYSCTL) += sysctl.o > fuse-$(CONFIG_FUSE_IO_URING) += dev_uring.o > +fuse-$(CONFIG_FUSE_IOMAP) += file_iomap.o > > virtiofs-y := virtio_fs.o > diff --git a/fs/fuse/file_iomap.c b/fs/fuse/file_iomap.c > new file mode 100644 > index 00000000000000..d564d60d0f1779 > --- /dev/null > +++ b/fs/fuse/file_iomap.c > @@ -0,0 +1,434 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* > + * Copyright (C) 2025 Oracle. All Rights Reserved. > + * Author: Darrick J. 
Wong <djwong@kernel.org> > + */ > +#include <linux/iomap.h> > +#include "fuse_i.h" > +#include "fuse_trace.h" > +#include "iomap_i.h" > + > +static bool __read_mostly enable_iomap = > +#if IS_ENABLED(CONFIG_FUSE_IOMAP_BY_DEFAULT) > + true; > +#else > + false; > +#endif > +module_param(enable_iomap, bool, 0644); > +MODULE_PARM_DESC(enable_iomap, "Enable file I/O through iomap"); > + > +bool fuse_iomap_enabled(void) > +{ > + /* Don't let anyone touch iomap until the end of the patchset. */ > + return false; > + > + /* > + * There are fears that a fuse+iomap server could somehow DoS the > + * system by doing things like going out to lunch during a writeback > + * related iomap request. Only allow iomap access if the fuse server > + * has rawio capabilities since those processes can mess things up > + * quite well even without our help. > + */ > + return enable_iomap && has_capability_noaudit(current, CAP_SYS_RAWIO); > +} > + > +/* Convert IOMAP_* mapping types to FUSE_IOMAP_TYPE_* */ > +#define XMAP(word) \ > + case IOMAP_##word: \ > + return FUSE_IOMAP_TYPE_##word > +static inline uint16_t fuse_iomap_type_to_server(uint16_t iomap_type) > +{ > + switch (iomap_type) { > + XMAP(HOLE); > + XMAP(DELALLOC); > + XMAP(MAPPED); > + XMAP(UNWRITTEN); > + XMAP(INLINE); > + default: > + ASSERT(0); > + } > + return 0; > +} > +#undef XMAP > + > +/* Convert FUSE_IOMAP_TYPE_* to IOMAP_* mapping types */ > +#define XMAP(word) \ > + case FUSE_IOMAP_TYPE_##word: \ > + return IOMAP_##word > +static inline uint16_t fuse_iomap_type_from_server(uint16_t fuse_type) > +{ > + switch (fuse_type) { > + XMAP(HOLE); > + XMAP(DELALLOC); > + XMAP(MAPPED); > + XMAP(UNWRITTEN); > + XMAP(INLINE); > + default: > + ASSERT(0); > + } > + return 0; > +} > +#undef XMAP > + > +/* Validate FUSE_IOMAP_TYPE_* */ > +static inline bool fuse_iomap_check_type(uint16_t fuse_type) > +{ > + switch (fuse_type) { > + case FUSE_IOMAP_TYPE_HOLE: > + case FUSE_IOMAP_TYPE_DELALLOC: > + case FUSE_IOMAP_TYPE_MAPPED: > + case 
FUSE_IOMAP_TYPE_UNWRITTEN: > + case FUSE_IOMAP_TYPE_INLINE: > + case FUSE_IOMAP_TYPE_PURE_OVERWRITE: > + return true; > + } > + > + return false; > +} > + > +#define FUSE_IOMAP_F_ALL (FUSE_IOMAP_F_NEW | \ > + FUSE_IOMAP_F_DIRTY | \ > + FUSE_IOMAP_F_SHARED | \ > + FUSE_IOMAP_F_MERGED | \ > + FUSE_IOMAP_F_BOUNDARY | \ > + FUSE_IOMAP_F_ANON_WRITE | \ > + FUSE_IOMAP_F_ATOMIC_BIO | \ > + FUSE_IOMAP_F_WANT_IOMAP_END) > + > +static inline bool fuse_iomap_check_flags(uint16_t flags) > +{ > + return (flags & ~FUSE_IOMAP_F_ALL) == 0; > +} > + > +/* Convert IOMAP_F_* mapping state flags to FUSE_IOMAP_F_* */ > +#define XMAP(word) \ > + if (iomap_f_flags & IOMAP_F_##word) \ > + ret |= FUSE_IOMAP_F_##word > +#define YMAP(iword, oword) \ > + if (iomap_f_flags & IOMAP_F_##iword) \ > + ret |= FUSE_IOMAP_F_##oword > +static inline uint16_t fuse_iomap_flags_to_server(uint16_t iomap_f_flags) > +{ > + uint16_t ret = 0; > + > + XMAP(NEW); > + XMAP(DIRTY); > + XMAP(SHARED); > + XMAP(MERGED); > + XMAP(BOUNDARY); > + XMAP(ANON_WRITE); > + XMAP(ATOMIC_BIO); > + YMAP(PRIVATE, WANT_IOMAP_END); > + > + XMAP(SIZE_CHANGED); > + XMAP(STALE); > + > + return ret; > +} > +#undef YMAP > +#undef XMAP > + > +/* Convert FUSE_IOMAP_F_* to IOMAP_F_* mapping state flags */ > +#define XMAP(word) \ > + if (fuse_f_flags & FUSE_IOMAP_F_##word) \ > + ret |= IOMAP_F_##word > +#define YMAP(iword, oword) \ > + if (fuse_f_flags & FUSE_IOMAP_F_##iword) \ > + ret |= IOMAP_F_##oword > +static inline uint16_t fuse_iomap_flags_from_server(uint16_t fuse_f_flags) > +{ > + uint16_t ret = 0; > + > + XMAP(NEW); > + XMAP(DIRTY); > + XMAP(SHARED); > + XMAP(MERGED); > + XMAP(BOUNDARY); > + XMAP(ANON_WRITE); > + XMAP(ATOMIC_BIO); > + YMAP(WANT_IOMAP_END, PRIVATE); > + > + return ret; > +} > +#undef YMAP > +#undef XMAP > + > +/* Convert IOMAP_* operation flags to FUSE_IOMAP_OP_* */ > +#define XMAP(word) \ > + if (iomap_op_flags & IOMAP_##word) \ > + ret |= FUSE_IOMAP_OP_##word > +static inline uint32_t 
fuse_iomap_op_to_server(unsigned iomap_op_flags) > +{ > + uint32_t ret = 0; > + > + XMAP(WRITE); > + XMAP(ZERO); > + XMAP(REPORT); > + XMAP(FAULT); > + XMAP(DIRECT); > + XMAP(NOWAIT); > + XMAP(OVERWRITE_ONLY); > + XMAP(UNSHARE); > + XMAP(DAX); > + XMAP(ATOMIC); > + XMAP(DONTCACHE); > + > + return ret; > +} > +#undef XMAP > + > +/* Validate an iomap mapping. */ > +static inline bool fuse_iomap_check_mapping(const struct inode *inode, > + const struct fuse_iomap_io *map, > + enum fuse_iomap_iodir iodir) > +{ > + const unsigned int blocksize = i_blocksize(inode); > + uint64_t end; > + > + /* Type and flags must be known */ > + if (BAD_DATA(!fuse_iomap_check_type(map->type))) > + return false; > + if (BAD_DATA(!fuse_iomap_check_flags(map->flags))) > + return false; > + > + /* No zero-length mappings */ > + if (BAD_DATA(map->length == 0)) > + return false; > + > + /* File range must be aligned to blocksize */ > + if (BAD_DATA(!IS_ALIGNED(map->offset, blocksize))) > + return false; > + if (BAD_DATA(!IS_ALIGNED(map->length, blocksize))) > + return false; > + > + /* No overflows in the file range */ > + if (BAD_DATA(check_add_overflow(map->offset, map->length, &end))) > + return false; > + > + /* File range cannot start past maxbytes */ > + if (BAD_DATA(map->offset >= inode->i_sb->s_maxbytes)) > + return false; > + > + switch (map->type) { > + case FUSE_IOMAP_TYPE_MAPPED: > + case FUSE_IOMAP_TYPE_UNWRITTEN: > + /* Mappings backed by space must have a device/addr */ > + if (BAD_DATA(map->dev == FUSE_IOMAP_DEV_NULL)) > + return false; > + if (BAD_DATA(map->addr == FUSE_IOMAP_NULL_ADDR)) > + return false; > + break; > + case FUSE_IOMAP_TYPE_DELALLOC: > + case FUSE_IOMAP_TYPE_HOLE: > + case FUSE_IOMAP_TYPE_INLINE: > + /* Mappings not backed by space cannot have a device addr. 
*/ > + if (BAD_DATA(map->dev != FUSE_IOMAP_DEV_NULL)) > + return false; > + if (BAD_DATA(map->addr != FUSE_IOMAP_NULL_ADDR)) > + return false; > + break; > + case FUSE_IOMAP_TYPE_PURE_OVERWRITE: > + /* "Pure overwrite" only allowed for write mapping */ > + if (BAD_DATA(iodir != WRITE_MAPPING)) > + return false; > + break; > + default: > + /* should have been caught already */ > + ASSERT(0); > + return false; > + } > + > + /* XXX: we don't support devices yet */ > + if (BAD_DATA(map->dev != FUSE_IOMAP_DEV_NULL)) > + return false; > + > + /* No overflows in the device range, if supplied */ > + if (map->addr != FUSE_IOMAP_NULL_ADDR && > + BAD_DATA(check_add_overflow(map->addr, map->length, &end))) > + return false; > + > + return true; > +} > + > +/* Convert a mapping from the server into something the kernel can use */ > +static inline void fuse_iomap_from_server(struct inode *inode, Maybe worth adding a const in front of struct inode? > + struct iomap *iomap, > + const struct fuse_iomap_io *fmap) > +{ > + iomap->addr = fmap->addr; > + iomap->offset = fmap->offset; > + iomap->length = fmap->length; > + iomap->type = fuse_iomap_type_from_server(fmap->type); > + iomap->flags = fuse_iomap_flags_from_server(fmap->flags); > + iomap->bdev = inode->i_sb->s_bdev; /* XXX */ > +} > + > +/* Convert a mapping from the kernel into something the server can use */ > +static inline void fuse_iomap_to_server(struct fuse_iomap_io *fmap, > + const struct iomap *iomap) > +{ > + fmap->addr = FUSE_IOMAP_NULL_ADDR; /* XXX */ > + fmap->offset = iomap->offset; > + fmap->length = iomap->length; > + fmap->type = fuse_iomap_type_to_server(iomap->type); > + fmap->flags = fuse_iomap_flags_to_server(iomap->flags); > + fmap->dev = FUSE_IOMAP_DEV_NULL; /* XXX */ AFAICT, this only gets used for sending the FUSE_IOMAP_END request. Is passing the iomap->addr to fmap->addr and inode->i_sb->s_bdev to fmap->dev not useful to the server here? Also, did you mean to leave in the /* XXX */ comments? 
> +} > + > +/* Check the incoming _begin mappings to make sure they're not nonsense. */ > +static inline int > +fuse_iomap_begin_validate(const struct inode *inode, > + unsigned opflags, loff_t pos, > + const struct fuse_iomap_begin_out *outarg) > +{ > + /* Make sure the mappings aren't garbage */ > + if (!fuse_iomap_check_mapping(inode, &outarg->read, READ_MAPPING)) > + return -EFSCORRUPTED; > + > + if (!fuse_iomap_check_mapping(inode, &outarg->write, WRITE_MAPPING)) > + return -EFSCORRUPTED; > + > + /* > + * Must have returned a mapping for at least the first byte in the > + * range. The main mapping check already validated that the length > + * is nonzero and there is no overflow in computing end. > + */ > + if (BAD_DATA(outarg->read.offset > pos)) > + return -EFSCORRUPTED; > + if (BAD_DATA(outarg->write.offset > pos)) > + return -EFSCORRUPTED; > + > + if (BAD_DATA(outarg->read.offset + outarg->read.length <= pos)) > + return -EFSCORRUPTED; > + if (BAD_DATA(outarg->write.offset + outarg->write.length <= pos)) > + return -EFSCORRUPTED; > + > + return 0; > +} > + > +static inline bool fuse_is_iomap_file_write(unsigned int opflags) > +{ > + return opflags & (IOMAP_WRITE | IOMAP_ZERO | IOMAP_UNSHARE); > +} > + > +static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t count, > + unsigned opflags, struct iomap *iomap, > + struct iomap *srcmap) > +{ > + struct fuse_inode *fi = get_fuse_inode(inode); > + struct fuse_iomap_begin_in inarg = { > + .attr_ino = fi->orig_ino, > + .opflags = fuse_iomap_op_to_server(opflags), > + .pos = pos, > + .count = count, > + }; > + struct fuse_iomap_begin_out outarg = { }; > + struct fuse_mount *fm = get_fuse_mount(inode); > + FUSE_ARGS(args); > + int err; > + > + args.opcode = FUSE_IOMAP_BEGIN; > + args.nodeid = get_node_id(inode); > + args.in_numargs = 1; > + args.in_args[0].size = sizeof(inarg); > + args.in_args[0].value = &inarg; > + args.out_numargs = 1; > + args.out_args[0].size = sizeof(outarg); > + 
args.out_args[0].value = &outarg; > + err = fuse_simple_request(fm, &args); > + if (err) > + return err; > + > + err = fuse_iomap_begin_validate(inode, opflags, pos, &outarg); > + if (err) > + return err; > + > + if (fuse_is_iomap_file_write(opflags) && > + outarg.write.type != FUSE_IOMAP_TYPE_PURE_OVERWRITE) { > + /* > + * For an out of place write, we must supply the write mapping > + * via @iomap, and the read mapping via @srcmap. > + */ > + fuse_iomap_from_server(inode, iomap, &outarg.write); > + fuse_iomap_from_server(inode, srcmap, &outarg.read); > + } else { > + /* > + * For everything else (reads, reporting, and pure overwrites), > + * we can return the sole mapping through @iomap and leave > + * @srcmap unchanged from its default (HOLE). > + */ > + fuse_iomap_from_server(inode, iomap, &outarg.read); > + } > + > + return 0; > +} > + > +/* Decide if we send FUSE_IOMAP_END to the fuse server */ > +static bool fuse_should_send_iomap_end(const struct iomap *iomap, > + unsigned int opflags, loff_t count, > + ssize_t written) > +{ > + /* fuse server demanded an iomap_end call. 
*/ > + if (iomap->flags & FUSE_IOMAP_F_WANT_IOMAP_END) > + return true; > + > + /* Reads and reporting should never affect the filesystem metadata */ > + if (!fuse_is_iomap_file_write(opflags)) > + return false; > + > + /* Appending writes get an iomap_end call */ > + if (iomap->flags & IOMAP_F_SIZE_CHANGED) > + return true; > + > + /* Short writes get an iomap_end call to clean up delalloc */ > + return written < count; > +} > + > +static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t count, > + ssize_t written, unsigned opflags, > + struct iomap *iomap) > +{ > + struct fuse_inode *fi = get_fuse_inode(inode); > + struct fuse_mount *fm = get_fuse_mount(inode); > + int err = 0; > + > + if (fuse_should_send_iomap_end(iomap, opflags, count, written)) { > + struct fuse_iomap_end_in inarg = { > + .opflags = fuse_iomap_op_to_server(opflags), > + .attr_ino = fi->orig_ino, > + .pos = pos, > + .count = count, > + .written = written, > + }; > + FUSE_ARGS(args); > + > + fuse_iomap_to_server(&inarg.map, iomap); > + > + args.opcode = FUSE_IOMAP_END; > + args.nodeid = get_node_id(inode); Just curious about this - does it make sense to set args.force here for this opcode? It seems like it serves the same sort of purpose a flush request (which sets args.force) does? > + args.in_numargs = 1; > + args.in_args[0].size = sizeof(inarg); > + args.in_args[0].value = &inarg; > + err = fuse_simple_request(fm, &args); > + switch (err) { > + case -ENOSYS: > + /* > + * libfuse returns ENOSYS for servers that don't > + * implement iomap_end > + */ > + err = 0; > + break; > + case 0: > + break; Is this case 0 needed separately from the default case? 
Thanks, Joanne > + default: > + break; > + } > + } > + > + return err; > +} > + > +const struct iomap_ops fuse_iomap_ops = { > + .iomap_begin = fuse_iomap_begin, > + .iomap_end = fuse_iomap_end, > +}; > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c > index 0cac7164afa298..1eea8dc6e723c6 100644 > --- a/fs/fuse/inode.c > +++ b/fs/fuse/inode.c > @@ -1457,6 +1457,12 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, > > if (flags & FUSE_REQUEST_TIMEOUT) > timeout = arg->request_timeout; > + > + if ((flags & FUSE_IOMAP) && fuse_iomap_enabled()) { > + fc->iomap = 1; > + pr_warn( > + "EXPERIMENTAL iomap feature enabled. Use at your own risk!"); > + } > } else { > ra_pages = fc->max_read / PAGE_SIZE; > fc->no_lock = 1; > @@ -1525,6 +1531,8 @@ static struct fuse_init_args *fuse_new_init(struct fuse_mount *fm) > */ > if (fuse_uring_enabled()) > flags |= FUSE_OVER_IO_URING; > + if (fuse_iomap_enabled()) > + flags |= FUSE_IOMAP; > > ia->in.flags = flags; > ia->in.flags2 = flags >> 32; > ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCH 01/31] fuse: implement the basic iomap mechanisms 2026-01-21 19:34 ` [PATCH 01/31] fuse: implement the basic iomap mechanisms Joanne Koong @ 2026-01-21 22:45 ` Darrick J. Wong 2026-01-22 0:06 ` Joanne Koong 0 siblings, 1 reply; 52+ messages in thread From: Darrick J. Wong @ 2026-01-21 22:45 UTC (permalink / raw) To: Joanne Koong; +Cc: miklos, bernd, neal, linux-ext4, linux-fsdevel On Wed, Jan 21, 2026 at 11:34:24AM -0800, Joanne Koong wrote: > On Tue, Oct 28, 2025 at 5:45 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > From: Darrick J. Wong <djwong@kernel.org> > > > > Implement functions to enable upcalling of iomap_begin and iomap_end to > > userspace fuse servers. > > > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org> > > --- > > fs/fuse/fuse_i.h | 22 ++ > > fs/fuse/iomap_i.h | 36 ++++ > > include/uapi/linux/fuse.h | 90 +++++++++ > > fs/fuse/Kconfig | 32 +++ > > fs/fuse/Makefile | 1 > > fs/fuse/file_iomap.c | 434 +++++++++++++++++++++++++++++++++++++++++++++ > > fs/fuse/inode.c | 8 + > > 7 files changed, 621 insertions(+), 2 deletions(-) > > create mode 100644 fs/fuse/iomap_i.h > > create mode 100644 fs/fuse/file_iomap.c > > > > > > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h > > index 7c7d255d817f1e..45be59df7ae592 100644 > > --- a/fs/fuse/fuse_i.h > > +++ b/fs/fuse/fuse_i.h > > @@ -929,6 +929,9 @@ struct fuse_conn { > > /* Is synchronous FUSE_INIT allowed? */ > > unsigned int sync_init:1; > > > > + /* Enable fs/iomap for file operations */ > > + unsigned int iomap:1; > > + > > /* Use io_uring for communication */ > > unsigned int io_uring; > > > > @@ -1053,12 +1056,17 @@ static inline struct fuse_mount *get_fuse_mount_super(struct super_block *sb) > > return sb->s_fs_info; > > } > > > > +static inline const struct fuse_mount *get_fuse_mount_super_c(const struct super_block *sb) > > +{ > > + return sb->s_fs_info; > > +} > > I'm not seeing this getting used anywhere - did you mean to remove this? Yeah. 
> > + > > static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) > > { > > return get_fuse_mount_super(sb)->fc; > > } > > > > -static inline struct fuse_mount *get_fuse_mount(struct inode *inode) > > +static inline struct fuse_mount *get_fuse_mount(const struct inode *inode) > > { > > return get_fuse_mount_super(inode->i_sb); > > } > > @@ -1683,4 +1691,16 @@ extern void fuse_sysctl_unregister(void); > > #define fuse_sysctl_unregister() do { } while (0) > > #endif /* CONFIG_SYSCTL */ > > > > +#if IS_ENABLED(CONFIG_FUSE_IOMAP) > > +bool fuse_iomap_enabled(void); > > + > > +static inline bool fuse_has_iomap(const struct inode *inode) > > +{ > > + return get_fuse_conn(inode)->iomap; > > +} > > +#else > > +# define fuse_iomap_enabled(...) (false) > > +# define fuse_has_iomap(...) (false) > > +#endif > > + > > #endif /* _FS_FUSE_I_H */ > > diff --git a/fs/fuse/iomap_i.h b/fs/fuse/iomap_i.h > > new file mode 100644 > > index 00000000000000..d773f728579d1d > > --- /dev/null > > +++ b/fs/fuse/iomap_i.h > > @@ -0,0 +1,36 @@ > > +// SPDX-License-Identifier: GPL-2.0 > > +/* > > + * Copyright (C) 2025 Oracle. All Rights Reserved. > > + * Author: Darrick J. 
Wong <djwong@kernel.org> > > + */ > > +#ifndef _FS_FUSE_IOMAP_I_H > > +#define _FS_FUSE_IOMAP_I_H > > + > > +#if IS_ENABLED(CONFIG_FUSE_IOMAP) > > +#if IS_ENABLED(CONFIG_FUSE_IOMAP_DEBUG) > > +# define ASSERT(condition) do { \ > > + int __cond = !!(condition); \ > > + WARN(!__cond, "Assertion failed: %s, func: %s, line: %d", #condition, __func__, __LINE__); \ > > +} while (0) > > +# define BAD_DATA(condition) ({ \ > > + int __cond = !!(condition); \ > > + WARN(__cond, "Bad mapping: %s, func: %s, line: %d", #condition, __func__, __LINE__); \ > > +}) > > +#else > > +# define ASSERT(condition) > > +# define BAD_DATA(condition) ({ \ > > + int __cond = !!(condition); \ > > + unlikely(__cond); \ > > +}) > > +#endif /* CONFIG_FUSE_IOMAP_DEBUG */ > > + > > +enum fuse_iomap_iodir { > > + READ_MAPPING, > > + WRITE_MAPPING, > > +}; > > + > > +#define EFSCORRUPTED EUCLEAN > > + > > +#endif /* CONFIG_FUSE_IOMAP */ > > + > > +#endif /* _FS_FUSE_IOMAP_I_H */ > > diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h > > index 18713cfaf09171..7d709cf12b41a7 100644 > > --- a/include/uapi/linux/fuse.h > > +++ b/include/uapi/linux/fuse.h > > @@ -240,6 +240,9 @@ > > * - add FUSE_COPY_FILE_RANGE_64 > > * - add struct fuse_copy_file_range_out > > * - add FUSE_NOTIFY_PRUNE > > + * > > + * 7.99 > > Should this be changed to something like 7.46 now that this patch is > submitted for merging into the tree? When review of this patchset nears completion I'll change the 99s to 46 or whatever the fuse/libfuse minor version happens to be at that point. Nobody's touched this series since 29 October (during 6.19 development) and I've been busy with xfs_healer so I'm not submitting this for 7.0 either. 
> > + * - add FUSE_IOMAP and iomap_{begin,end,ioend} for regular file operations > > */ > > > > #ifndef _LINUX_FUSE_H > > @@ -275,7 +278,7 @@ > > #define FUSE_KERNEL_VERSION 7 > > > > /** Minor version number of this interface */ > > -#define FUSE_KERNEL_MINOR_VERSION 45 > > +#define FUSE_KERNEL_MINOR_VERSION 99 > > Same question here > > > > > /** The node ID of the root inode */ > > #define FUSE_ROOT_ID 1 > > @@ -448,6 +451,7 @@ struct fuse_file_lock { > > * FUSE_OVER_IO_URING: Indicate that client supports io-uring > > * FUSE_REQUEST_TIMEOUT: kernel supports timing out requests. > > * init_out.request_timeout contains the timeout (in secs) > > + * FUSE_IOMAP: Client supports iomap for regular file operations. > > */ > > #define FUSE_ASYNC_READ (1 << 0) > > #define FUSE_POSIX_LOCKS (1 << 1) > > @@ -495,6 +499,7 @@ struct fuse_file_lock { > > #define FUSE_ALLOW_IDMAP (1ULL << 40) > > #define FUSE_OVER_IO_URING (1ULL << 41) > > #define FUSE_REQUEST_TIMEOUT (1ULL << 42) > > +#define FUSE_IOMAP (1ULL << 43) > > > > /** > > * CUSE INIT request/reply flags > > @@ -664,6 +669,9 @@ enum fuse_opcode { > > FUSE_STATX = 52, > > FUSE_COPY_FILE_RANGE_64 = 53, > > > > + FUSE_IOMAP_BEGIN = 4094, > > + FUSE_IOMAP_END = 4095, > > + > > /* CUSE specific operations */ > > CUSE_INIT = 4096, > > > > @@ -1314,4 +1322,84 @@ struct fuse_uring_cmd_req { > > uint8_t padding[6]; > > }; > > > > +/* mapping types; see corresponding IOMAP_TYPE_ */ > > +#define FUSE_IOMAP_TYPE_HOLE (0) > > +#define FUSE_IOMAP_TYPE_DELALLOC (1) > > +#define FUSE_IOMAP_TYPE_MAPPED (2) > > +#define FUSE_IOMAP_TYPE_UNWRITTEN (3) > > +#define FUSE_IOMAP_TYPE_INLINE (4) > > + > > +/* fuse-specific mapping type indicating that writes use the read mapping */ > > +#define FUSE_IOMAP_TYPE_PURE_OVERWRITE (255) > > + > > +#define FUSE_IOMAP_DEV_NULL (0U) /* null device cookie */ > > + > > +/* mapping flags passed back from iomap_begin; see corresponding IOMAP_F_ */ > > +#define FUSE_IOMAP_F_NEW (1U << 0) > > +#define 
FUSE_IOMAP_F_DIRTY (1U << 1) > > +#define FUSE_IOMAP_F_SHARED (1U << 2) > > +#define FUSE_IOMAP_F_MERGED (1U << 3) > > +#define FUSE_IOMAP_F_BOUNDARY (1U << 4) > > +#define FUSE_IOMAP_F_ANON_WRITE (1U << 5) > > +#define FUSE_IOMAP_F_ATOMIC_BIO (1U << 6) > > Do you think it makes sense to have the fuse iomap constants mirror > the in-kernel iomap ones? Maybe I'm mistaken but it seems like the > fuse iomap capabilities won't diverge too much from fs/iomap ones? I > like that if they're mirrored, then it makes it simpler instead of > needing to convert back and forth. "Mirrored"? As in, having the define use a symbol: #define FUSE_IOMAP_F_NEW IOMAP_F_NEW instead of defining it to be a specific numerical constant like it is here? <confused> This might not be answering your question, but as an old iomap maintainer I want the kernel iomap api and the fuse iomap uabi to be as decoupled as they can be; and trust the compiler to notice that the flag and enum constants are the same and not do anything too stupid with the translation. 
> > +/* fuse-specific mapping flag asking for ->iomap_end call */ > > +#define FUSE_IOMAP_F_WANT_IOMAP_END (1U << 7) > > + > > +/* mapping flags passed to iomap_end */ > > +#define FUSE_IOMAP_F_SIZE_CHANGED (1U << 8) > > +#define FUSE_IOMAP_F_STALE (1U << 9) > > + > > +/* operation flags from iomap; see corresponding IOMAP_* */ > > +#define FUSE_IOMAP_OP_WRITE (1U << 0) > > +#define FUSE_IOMAP_OP_ZERO (1U << 1) > > +#define FUSE_IOMAP_OP_REPORT (1U << 2) > > +#define FUSE_IOMAP_OP_FAULT (1U << 3) > > +#define FUSE_IOMAP_OP_DIRECT (1U << 4) > > +#define FUSE_IOMAP_OP_NOWAIT (1U << 5) > > +#define FUSE_IOMAP_OP_OVERWRITE_ONLY (1U << 6) > > +#define FUSE_IOMAP_OP_UNSHARE (1U << 7) > > +#define FUSE_IOMAP_OP_DAX (1U << 8) > > +#define FUSE_IOMAP_OP_ATOMIC (1U << 9) > > +#define FUSE_IOMAP_OP_DONTCACHE (1U << 10) > > + > > +#define FUSE_IOMAP_NULL_ADDR (-1ULL) /* addr is not valid */ > > + > > +struct fuse_iomap_io { > > + uint64_t offset; /* file offset of mapping, bytes */ > > + uint64_t length; /* length of mapping, bytes */ > > + uint64_t addr; /* disk offset of mapping, bytes */ > > + uint16_t type; /* FUSE_IOMAP_TYPE_* */ > > + uint16_t flags; /* FUSE_IOMAP_F_* */ > > + uint32_t dev; /* device cookie */ > > Do you think it's a good idea to add a reserved field here in case we > end up needing it in the future? I'm open to the idea of pre-padding the structs, though that's extra copy overhead until they get used for something. Does that fuse-iouring-zerocopy patchset that you're working on enable the kernel to avoid copying fuse command data around? I haven't read it in sufficient (or any) detail to know the answer to that question. Second: how easy is it to send a variable sized fuse command to userspace? 
It looks like some commands like FUSE_WRITE do things like: if (ff->fm->fc->minor < 9) args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE; else args->in_args[0].size = sizeof(ia->write.in); args->in_args[0].value = &ia->write.in; args->in_args[1].size = count; Which means that future expansion can (in theory) bump the minor version and send larger commands. It also looks like the kernel can support receiving variable-sized responses, like FUSE_READ does: args->out_argvar = true; args->out_numargs = 1; args->out_args[0].size = count; I think this means that if we ever needed to expand the _out struct to allow the fuse server to send back a more lengthy response, we could potentially do that without needing a minor protocol version bump. > > +}; > > + > > +struct fuse_iomap_begin_in { > > + uint32_t opflags; /* FUSE_IOMAP_OP_* */ > > + uint32_t reserved; /* zero */ > > + uint64_t attr_ino; /* matches fuse_attr:ino */ > > + uint64_t pos; /* file position, in bytes */ > > + uint64_t count; /* operation length, in bytes */ > > +}; > > + > > +struct fuse_iomap_begin_out { > > + /* read file data from here */ > > + struct fuse_iomap_io read; > > + > > + /* write file data to here, if applicable */ > > + struct fuse_iomap_io write; > > Same question here How much padding do you want? fuse_iomap_io is conveniently half a cacheline right now... > > +}; > > + > > +struct fuse_iomap_end_in { > > + uint32_t opflags; /* FUSE_IOMAP_OP_* */ > > + uint32_t reserved; /* zero */ > > + uint64_t attr_ino; /* matches fuse_attr:ino */ > > + uint64_t pos; /* file position, in bytes */ > > + uint64_t count; /* operation length, in bytes */ > > + int64_t written; /* bytes processed */ > > On the fs/iomap side, I see that written is passed through by > iomap_iter() to ->iomap_end through 'ssize_t advanced' but it's not > clear to me why advanced needs to be signed.
I think it used to also > represent the error status, but it looks like now that's represented > through iter->status and 'advanced' strictly reflects the number of > bytes written. As such, do you think it makes sense to change > 'advanced' to loff_t and have written be uint64_t instead? Not quite -- back in the bad old days, iomap_iter::processed was a s64 value that the iteration loop had to set to one of: * a positive number for positive progress * zero to stop the iteration * a negative errno to fail out Nowadays we just move iomap_iter::pos forward via iomap_iter_advance or set status to a negative number to end the iteration. So yes, I think @advanced should be widened to 64-bits since iomap operations can jump more than 2GB per iter step. Practically speaking I think this hasn't yet been a problem because the only operations that can do that (fiemap, seek, swap) also don't have any client filesystems that implement iomap_end; or they do but never send mappings large enough to cause problems. iomap iters can't go backwards so @advanced could be u64 as well. Also the name of the ->iomap_end parameter could be changed to "advanced" because iomap_end could in theory be called for any operation, not just writes. That's a throwback to the days when the iomap code was just part of xfs. It also is an unsigned quantity. > > + > > + /* mapping that the kernel acted upon */ > > + struct fuse_iomap_io map; > > +}; > > + > > #endif /* _LINUX_FUSE_H */ > > diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig > > index 290d1c09e0b924..934d48076a010c 100644 > > --- a/fs/fuse/Kconfig > > +++ b/fs/fuse/Kconfig > > @@ -69,6 +69,38 @@ config FUSE_PASSTHROUGH > > config FUSE_BACKING > > bool > > > > +config FUSE_IOMAP > > + bool "FUSE file IO over iomap" > > + default y > > + depends on FUSE_FS > > + depends on BLOCK > > + select FS_IOMAP > > + help > > + Enable fuse servers to operate the regular file I/O path through > > + the fs-iomap library in the kernel. 
This enables higher performance > > + userspace filesystems by keeping the performance critical parts in > > + the kernel while delegating the difficult metadata parsing parts to > > + an easily-contained userspace program. > > + > > + This feature is considered EXPERIMENTAL. Use with caution! > > + > > + If unsure, say N. > > + > > +config FUSE_IOMAP_BY_DEFAULT > > + bool "FUSE file I/O over iomap by default" > > + default n > > + depends on FUSE_IOMAP > > + help > > + Enable sending FUSE file I/O over iomap by default. > > I'm not really sure what the general linux preference is for adding > new configs, but assuming it errs towards less configs than more, imo > it seems easy enough to just set the enable_iomap module param to true > manually instead of needing this config for it, especially since the > param only needs to be set once. /me doesn't know what the norm is in fuse-land -- for xfs I've preferred to have a kconfig option for experimental code so that distros can turn off experimental stuff they don't want to support. OTOH they can also patch it out or affix the module param to 0. Also I'm not sure if the kernel tinyfication project is still active, for a while they were advocating strongly for more kconfig options so that people building embedded kernels could turn off big chunks of functionality they'd never need. > > + > > +config FUSE_IOMAP_DEBUG > > + bool "Debug FUSE file IO over iomap" > > + default y > > + depends on FUSE_IOMAP > > + help > > + Enable debugging assertions for the fuse iomap code paths and logging > > + of bad iomap file mapping data being sent to the kernel. > > I'm wondering if it makes sense to make this a general FUSE_DEBUG > config so we can reuse this more generally In general yes but I highly recommend that everyone look at the static labels and auto-ftracing stuff enabled by the next few debug patches before anyone commits to spreading that enhanced observability / brain disease to the rest of fuse. 
;) > > + > > config FUSE_IO_URING > > bool "FUSE communication over io-uring" > > default y > > diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile > > index 46041228e5be2c..27be39317701d6 100644 > > --- a/fs/fuse/Makefile > > +++ b/fs/fuse/Makefile > > @@ -18,5 +18,6 @@ fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o > > fuse-$(CONFIG_FUSE_BACKING) += backing.o > > fuse-$(CONFIG_SYSCTL) += sysctl.o > > fuse-$(CONFIG_FUSE_IO_URING) += dev_uring.o > > +fuse-$(CONFIG_FUSE_IOMAP) += file_iomap.o > > > > virtiofs-y := virtio_fs.o > > diff --git a/fs/fuse/file_iomap.c b/fs/fuse/file_iomap.c > > new file mode 100644 > > index 00000000000000..d564d60d0f1779 > > --- /dev/null > > +++ b/fs/fuse/file_iomap.c > > @@ -0,0 +1,434 @@ > > +// SPDX-License-Identifier: GPL-2.0 > > +/* > > + * Copyright (C) 2025 Oracle. All Rights Reserved. > > + * Author: Darrick J. Wong <djwong@kernel.org> > > + */ > > +#include <linux/iomap.h> > > +#include "fuse_i.h" > > +#include "fuse_trace.h" > > +#include "iomap_i.h" > > + > > +static bool __read_mostly enable_iomap = > > +#if IS_ENABLED(CONFIG_FUSE_IOMAP_BY_DEFAULT) > > + true; > > +#else > > + false; > > +#endif > > +module_param(enable_iomap, bool, 0644); > > +MODULE_PARM_DESC(enable_iomap, "Enable file I/O through iomap"); > > + > > +bool fuse_iomap_enabled(void) > > +{ > > + /* Don't let anyone touch iomap until the end of the patchset. */ > > + return false; > > + > > + /* > > + * There are fears that a fuse+iomap server could somehow DoS the > > + * system by doing things like going out to lunch during a writeback > > + * related iomap request. Only allow iomap access if the fuse server > > + * has rawio capabilities since those processes can mess things up > > + * quite well even without our help. 
> > + */ > > + return enable_iomap && has_capability_noaudit(current, CAP_SYS_RAWIO); > > +} > > + > > +/* Convert IOMAP_* mapping types to FUSE_IOMAP_TYPE_* */ > > +#define XMAP(word) \ > > + case IOMAP_##word: \ > > + return FUSE_IOMAP_TYPE_##word > > +static inline uint16_t fuse_iomap_type_to_server(uint16_t iomap_type) > > +{ > > + switch (iomap_type) { > > + XMAP(HOLE); > > + XMAP(DELALLOC); > > + XMAP(MAPPED); > > + XMAP(UNWRITTEN); > > + XMAP(INLINE); > > + default: > > + ASSERT(0); > > + } > > + return 0; > > +} > > +#undef XMAP > > + > > +/* Convert FUSE_IOMAP_TYPE_* to IOMAP_* mapping types */ > > +#define XMAP(word) \ > > + case FUSE_IOMAP_TYPE_##word: \ > > + return IOMAP_##word > > +static inline uint16_t fuse_iomap_type_from_server(uint16_t fuse_type) > > +{ > > + switch (fuse_type) { > > + XMAP(HOLE); > > + XMAP(DELALLOC); > > + XMAP(MAPPED); > > + XMAP(UNWRITTEN); > > + XMAP(INLINE); > > + default: > > + ASSERT(0); > > + } > > + return 0; > > +} > > +#undef XMAP > > + > > +/* Validate FUSE_IOMAP_TYPE_* */ > > +static inline bool fuse_iomap_check_type(uint16_t fuse_type) > > +{ > > + switch (fuse_type) { > > + case FUSE_IOMAP_TYPE_HOLE: > > + case FUSE_IOMAP_TYPE_DELALLOC: > > + case FUSE_IOMAP_TYPE_MAPPED: > > + case FUSE_IOMAP_TYPE_UNWRITTEN: > > + case FUSE_IOMAP_TYPE_INLINE: > > + case FUSE_IOMAP_TYPE_PURE_OVERWRITE: > > + return true; > > + } > > + > > + return false; > > +} > > + > > +#define FUSE_IOMAP_F_ALL (FUSE_IOMAP_F_NEW | \ > > + FUSE_IOMAP_F_DIRTY | \ > > + FUSE_IOMAP_F_SHARED | \ > > + FUSE_IOMAP_F_MERGED | \ > > + FUSE_IOMAP_F_BOUNDARY | \ > > + FUSE_IOMAP_F_ANON_WRITE | \ > > + FUSE_IOMAP_F_ATOMIC_BIO | \ > > + FUSE_IOMAP_F_WANT_IOMAP_END) > > + > > +static inline bool fuse_iomap_check_flags(uint16_t flags) > > +{ > > + return (flags & ~FUSE_IOMAP_F_ALL) == 0; > > +} > > + > > +/* Convert IOMAP_F_* mapping state flags to FUSE_IOMAP_F_* */ > > +#define XMAP(word) \ > > + if (iomap_f_flags & IOMAP_F_##word) \ > > + ret |= 
FUSE_IOMAP_F_##word > > +#define YMAP(iword, oword) \ > > + if (iomap_f_flags & IOMAP_F_##iword) \ > > + ret |= FUSE_IOMAP_F_##oword > > +static inline uint16_t fuse_iomap_flags_to_server(uint16_t iomap_f_flags) > > +{ > > + uint16_t ret = 0; > > + > > + XMAP(NEW); > > + XMAP(DIRTY); > > + XMAP(SHARED); > > + XMAP(MERGED); > > + XMAP(BOUNDARY); > > + XMAP(ANON_WRITE); > > + XMAP(ATOMIC_BIO); > > + YMAP(PRIVATE, WANT_IOMAP_END); > > + > > + XMAP(SIZE_CHANGED); > > + XMAP(STALE); > > + > > + return ret; > > +} > > +#undef YMAP > > +#undef XMAP > > + > > +/* Convert FUSE_IOMAP_F_* to IOMAP_F_* mapping state flags */ > > +#define XMAP(word) \ > > + if (fuse_f_flags & FUSE_IOMAP_F_##word) \ > > + ret |= IOMAP_F_##word > > +#define YMAP(iword, oword) \ > > + if (fuse_f_flags & FUSE_IOMAP_F_##iword) \ > > + ret |= IOMAP_F_##oword > > +static inline uint16_t fuse_iomap_flags_from_server(uint16_t fuse_f_flags) > > +{ > > + uint16_t ret = 0; > > + > > + XMAP(NEW); > > + XMAP(DIRTY); > > + XMAP(SHARED); > > + XMAP(MERGED); > > + XMAP(BOUNDARY); > > + XMAP(ANON_WRITE); > > + XMAP(ATOMIC_BIO); > > + YMAP(WANT_IOMAP_END, PRIVATE); > > + > > + return ret; > > +} > > +#undef YMAP > > +#undef XMAP > > + > > +/* Convert IOMAP_* operation flags to FUSE_IOMAP_OP_* */ > > +#define XMAP(word) \ > > + if (iomap_op_flags & IOMAP_##word) \ > > + ret |= FUSE_IOMAP_OP_##word > > +static inline uint32_t fuse_iomap_op_to_server(unsigned iomap_op_flags) > > +{ > > + uint32_t ret = 0; > > + > > + XMAP(WRITE); > > + XMAP(ZERO); > > + XMAP(REPORT); > > + XMAP(FAULT); > > + XMAP(DIRECT); > > + XMAP(NOWAIT); > > + XMAP(OVERWRITE_ONLY); > > + XMAP(UNSHARE); > > + XMAP(DAX); > > + XMAP(ATOMIC); > > + XMAP(DONTCACHE); > > + > > + return ret; > > +} > > +#undef XMAP > > + > > +/* Validate an iomap mapping. 
*/ > > +static inline bool fuse_iomap_check_mapping(const struct inode *inode, > > + const struct fuse_iomap_io *map, > > + enum fuse_iomap_iodir iodir) > > +{ > > + const unsigned int blocksize = i_blocksize(inode); > > + uint64_t end; > > + > > + /* Type and flags must be known */ > > + if (BAD_DATA(!fuse_iomap_check_type(map->type))) > > + return false; > > + if (BAD_DATA(!fuse_iomap_check_flags(map->flags))) > > + return false; > > + > > + /* No zero-length mappings */ > > + if (BAD_DATA(map->length == 0)) > > + return false; > > + > > + /* File range must be aligned to blocksize */ > > + if (BAD_DATA(!IS_ALIGNED(map->offset, blocksize))) > > + return false; > > + if (BAD_DATA(!IS_ALIGNED(map->length, blocksize))) > > + return false; > > + > > + /* No overflows in the file range */ > > + if (BAD_DATA(check_add_overflow(map->offset, map->length, &end))) > > + return false; > > + > > + /* File range cannot start past maxbytes */ > > + if (BAD_DATA(map->offset >= inode->i_sb->s_maxbytes)) > > + return false; > > + > > + switch (map->type) { > > + case FUSE_IOMAP_TYPE_MAPPED: > > + case FUSE_IOMAP_TYPE_UNWRITTEN: > > + /* Mappings backed by space must have a device/addr */ > > + if (BAD_DATA(map->dev == FUSE_IOMAP_DEV_NULL)) > > + return false; > > + if (BAD_DATA(map->addr == FUSE_IOMAP_NULL_ADDR)) > > + return false; > > + break; > > + case FUSE_IOMAP_TYPE_DELALLOC: > > + case FUSE_IOMAP_TYPE_HOLE: > > + case FUSE_IOMAP_TYPE_INLINE: > > + /* Mappings not backed by space cannot have a device addr. 
*/ > > + if (BAD_DATA(map->dev != FUSE_IOMAP_DEV_NULL)) > > + return false; > > + if (BAD_DATA(map->addr != FUSE_IOMAP_NULL_ADDR)) > > + return false; > > + break; > > + case FUSE_IOMAP_TYPE_PURE_OVERWRITE: > > + /* "Pure overwrite" only allowed for write mapping */ > > + if (BAD_DATA(iodir != WRITE_MAPPING)) > > + return false; > > + break; > > + default: > > + /* should have been caught already */ > > + ASSERT(0); > > + return false; > > + } > > + > > + /* XXX: we don't support devices yet */ > > > + if (BAD_DATA(map->dev != FUSE_IOMAP_DEV_NULL)) > > + return false; > > + > > + /* No overflows in the device range, if supplied */ > > + if (map->addr != FUSE_IOMAP_NULL_ADDR && > > + BAD_DATA(check_add_overflow(map->addr, map->length, &end))) > > + return false; > > + > > + return true; > > +} > > + > > +/* Convert a mapping from the server into something the kernel can use */ > > +static inline void fuse_iomap_from_server(struct inode *inode, > > Maybe worth adding a const in front of struct inode? It can go away in a patch or two when we wire up bdev support. Though considering that fuse_iomap_enabled returns false all the way to the end of the patchset I guess I could just set bdev to null and skip passing in the inode at all. 
> > + struct iomap *iomap, > > + const struct fuse_iomap_io *fmap) > > +{ > > + iomap->addr = fmap->addr; > > + iomap->offset = fmap->offset; > > + iomap->length = fmap->length; > > + iomap->type = fuse_iomap_type_from_server(fmap->type); > > + iomap->flags = fuse_iomap_flags_from_server(fmap->flags); > > + iomap->bdev = inode->i_sb->s_bdev; /* XXX */ > > +} > > + > > +/* Convert a mapping from the kernel into something the server can use */ > > +static inline void fuse_iomap_to_server(struct fuse_iomap_io *fmap, > > + const struct iomap *iomap) > > +{ > > + fmap->addr = FUSE_IOMAP_NULL_ADDR; /* XXX */ > > + fmap->offset = iomap->offset; > > + fmap->length = iomap->length; > > + fmap->type = fuse_iomap_type_to_server(iomap->type); > > + fmap->flags = fuse_iomap_flags_to_server(iomap->flags); > > + fmap->dev = FUSE_IOMAP_DEV_NULL; /* XXX */ > > AFAICT, this only gets used for sending the FUSE_IOMAP_END request. Is > passing the iomap->addr to fmap->addr and inode->i_sb->s_bdev to > fmap->dev not useful to the server here? So far the only fields I've needed in fuse4fs are the offset/count/written fields as provided by iomap_iter, and the flags field from the mapping. The addr field isn't necessary for fuse4fs because the fuse server would know if the mapping had changed. OTOH it's probably harmless to send it along. Hrm. I probably need a way to look up the backing_id from the iomap bdev. Looking further ahead at the ioend patch, I just realized that iomap ioends can tell you the new address of a write-append operation but they don't tell you which device. I guess you can read that from the ioend->io_bio.bi_bdev. > Also, did you mean to leave in the /* XXX */ comments? Yes, because they're a reminder to come back and check if I /ever/ needed them. > > +} > > + > > +/* Check the incoming _begin mappings to make sure they're not nonsense. 
*/ > > +static inline int > > +fuse_iomap_begin_validate(const struct inode *inode, > > + unsigned opflags, loff_t pos, > > + const struct fuse_iomap_begin_out *outarg) > > +{ > > + /* Make sure the mappings aren't garbage */ > > + if (!fuse_iomap_check_mapping(inode, &outarg->read, READ_MAPPING)) > > + return -EFSCORRUPTED; > > + > > + if (!fuse_iomap_check_mapping(inode, &outarg->write, WRITE_MAPPING)) > > + return -EFSCORRUPTED; > > + > > + /* > > + * Must have returned a mapping for at least the first byte in the > > + * range. The main mapping check already validated that the length > > + * is nonzero and there is no overflow in computing end. > > + */ > > + if (BAD_DATA(outarg->read.offset > pos)) > > + return -EFSCORRUPTED; > > + if (BAD_DATA(outarg->write.offset > pos)) > > + return -EFSCORRUPTED; > > + > > + if (BAD_DATA(outarg->read.offset + outarg->read.length <= pos)) > > + return -EFSCORRUPTED; > > + if (BAD_DATA(outarg->write.offset + outarg->write.length <= pos)) > > + return -EFSCORRUPTED; > > + > > + return 0; > > +} > > + > > +static inline bool fuse_is_iomap_file_write(unsigned int opflags) > > +{ > > + return opflags & (IOMAP_WRITE | IOMAP_ZERO | IOMAP_UNSHARE); > > +} > > + > > +static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t count, > > + unsigned opflags, struct iomap *iomap, > > + struct iomap *srcmap) > > +{ > > + struct fuse_inode *fi = get_fuse_inode(inode); > > + struct fuse_iomap_begin_in inarg = { > > + .attr_ino = fi->orig_ino, > > + .opflags = fuse_iomap_op_to_server(opflags), > > + .pos = pos, > > + .count = count, > > + }; > > + struct fuse_iomap_begin_out outarg = { }; > > + struct fuse_mount *fm = get_fuse_mount(inode); > > + FUSE_ARGS(args); > > + int err; > > + > > + args.opcode = FUSE_IOMAP_BEGIN; > > + args.nodeid = get_node_id(inode); > > + args.in_numargs = 1; > > + args.in_args[0].size = sizeof(inarg); > > + args.in_args[0].value = &inarg; > > + args.out_numargs = 1; > > + args.out_args[0].size = 
sizeof(outarg); > > + args.out_args[0].value = &outarg; > > + err = fuse_simple_request(fm, &args); > > + if (err) > > + return err; > > + > > + err = fuse_iomap_begin_validate(inode, opflags, pos, &outarg); > > + if (err) > > + return err; > > + > > + if (fuse_is_iomap_file_write(opflags) && > > + outarg.write.type != FUSE_IOMAP_TYPE_PURE_OVERWRITE) { > > + /* > > + * For an out of place write, we must supply the write mapping > > + * via @iomap, and the read mapping via @srcmap. > > + */ > > + fuse_iomap_from_server(inode, iomap, &outarg.write); > > + fuse_iomap_from_server(inode, srcmap, &outarg.read); > > + } else { > > + /* > > + * For everything else (reads, reporting, and pure overwrites), > > + * we can return the sole mapping through @iomap and leave > > + * @srcmap unchanged from its default (HOLE). > > + */ > > + fuse_iomap_from_server(inode, iomap, &outarg.read); > > + } > > + > > + return 0; > > +} > > + > > +/* Decide if we send FUSE_IOMAP_END to the fuse server */ > > +static bool fuse_should_send_iomap_end(const struct iomap *iomap, > > + unsigned int opflags, loff_t count, > > + ssize_t written) > > +{ > > + /* fuse server demanded an iomap_end call. 
*/ > > + if (iomap->flags & FUSE_IOMAP_F_WANT_IOMAP_END) > > + return true; > > + > > + /* Reads and reporting should never affect the filesystem metadata */ > > + if (!fuse_is_iomap_file_write(opflags)) > > + return false; > > + > > + /* Appending writes get an iomap_end call */ > > + if (iomap->flags & IOMAP_F_SIZE_CHANGED) > > + return true; > > + > > + /* Short writes get an iomap_end call to clean up delalloc */ > > + return written < count; > > +} > > + > > +static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t count, > > + ssize_t written, unsigned opflags, > > + struct iomap *iomap) > > +{ > > + struct fuse_inode *fi = get_fuse_inode(inode); > > + struct fuse_mount *fm = get_fuse_mount(inode); > > + int err = 0; > > + > > + if (fuse_should_send_iomap_end(iomap, opflags, count, written)) { > > + struct fuse_iomap_end_in inarg = { > > + .opflags = fuse_iomap_op_to_server(opflags), > > + .attr_ino = fi->orig_ino, > > + .pos = pos, > > + .count = count, > > + .written = written, > > + }; > > + FUSE_ARGS(args); > > + > > + fuse_iomap_to_server(&inarg.map, iomap); > > + > > + args.opcode = FUSE_IOMAP_END; > > + args.nodeid = get_node_id(inode); > > Just curious about this - does it make sense to set args.force here > for this opcode? It seems like it serves the same sort of purpose a > flush request (which sets args.force) does? What does args.force do? There's no documentation of what behaviors these fields are supposed to trigger. > > + args.in_numargs = 1; > > + args.in_args[0].size = sizeof(inarg); > > + args.in_args[0].value = &inarg; > > + err = fuse_simple_request(fm, &args); > > + switch (err) { > > + case -ENOSYS: > > + /* > > + * libfuse returns ENOSYS for servers that don't > > + * implement iomap_end > > + */ > > + err = 0; > > + break; > > + case 0: > > + break; > > Is this case 0 needed separately from the default case? Nah, that's just me absorbing functional brogrammerisms. 
;) --D > Thanks, > Joanne > > > + default: > > + break; > > + } > > + } > > + > > + return err; > > +} > > + > > +const struct iomap_ops fuse_iomap_ops = { > > + .iomap_begin = fuse_iomap_begin, > > + .iomap_end = fuse_iomap_end, > > +}; > > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c > > index 0cac7164afa298..1eea8dc6e723c6 100644 > > --- a/fs/fuse/inode.c > > +++ b/fs/fuse/inode.c > > @@ -1457,6 +1457,12 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, > > > > if (flags & FUSE_REQUEST_TIMEOUT) > > timeout = arg->request_timeout; > > + > > + if ((flags & FUSE_IOMAP) && fuse_iomap_enabled()) { > > + fc->iomap = 1; > > + pr_warn( > > + "EXPERIMENTAL iomap feature enabled. Use at your own risk!"); > > + } > > } else { > > ra_pages = fc->max_read / PAGE_SIZE; > > fc->no_lock = 1; > > @@ -1525,6 +1531,8 @@ static struct fuse_init_args *fuse_new_init(struct fuse_mount *fm) > > */ > > if (fuse_uring_enabled()) > > flags |= FUSE_OVER_IO_URING; > > + if (fuse_iomap_enabled()) > > + flags |= FUSE_IOMAP; > > > > ia->in.flags = flags; > > ia->in.flags2 = flags >> 32; > > ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCH 01/31] fuse: implement the basic iomap mechanisms 2026-01-21 22:45 ` Darrick J. Wong @ 2026-01-22 0:06 ` Joanne Koong 2026-01-22 0:34 ` Darrick J. Wong 0 siblings, 1 reply; 52+ messages in thread From: Joanne Koong @ 2026-01-22 0:06 UTC (permalink / raw) To: Darrick J. Wong; +Cc: miklos, bernd, neal, linux-ext4, linux-fsdevel On Wed, Jan 21, 2026 at 2:45 PM Darrick J. Wong <djwong@kernel.org> wrote: > > On Wed, Jan 21, 2026 at 11:34:24AM -0800, Joanne Koong wrote: > > On Tue, Oct 28, 2025 at 5:45 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > From: Darrick J. Wong <djwong@kernel.org> > > > > > > Implement functions to enable upcalling of iomap_begin and iomap_end to > > > userspace fuse servers. > > > > > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org> > > > --- > > > fs/fuse/fuse_i.h | 22 ++ > > > fs/fuse/iomap_i.h | 36 ++++ > > > include/uapi/linux/fuse.h | 90 +++++++++ > > > fs/fuse/Kconfig | 32 +++ > > > fs/fuse/Makefile | 1 > > > fs/fuse/file_iomap.c | 434 +++++++++++++++++++++++++++++++++++++++++++++ > > > fs/fuse/inode.c | 8 + > > > 7 files changed, 621 insertions(+), 2 deletions(-) > > > create mode 100644 fs/fuse/iomap_i.h > > > create mode 100644 fs/fuse/file_iomap.c > > > > > > > > > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h > > > index 7c7d255d817f1e..45be59df7ae592 100644 > > > --- a/fs/fuse/fuse_i.h > > > +++ b/fs/fuse/fuse_i.h > > > diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h > > > index 18713cfaf09171..7d709cf12b41a7 100644 > > > --- a/include/uapi/linux/fuse.h > > > +++ b/include/uapi/linux/fuse.h > > > @@ -240,6 +240,9 @@ > > > * - add FUSE_COPY_FILE_RANGE_64 > > > * - add struct fuse_copy_file_range_out > > > * - add FUSE_NOTIFY_PRUNE > > > + * > > > + * 7.99 > > > > Should this be changed to something like 7.46 now that this patch is > > submitted for merging into the tree? 
> > When review of this patchset nears completion I'll change the 99s to > 46 or whatever the fuse/libfuse minor version happens to be at that > point. Sounds good. > > Nobody's touched this series since 29 October (during 6.19 development) > and I've been busy with xfs_healer so I'm not submitting this for 7.0 > either. > > > > + * - add FUSE_IOMAP and iomap_{begin,end,ioend} for regular file operations > > > */ > > > > > > +/* fuse-specific mapping type indicating that writes use the read mapping */ > > > +#define FUSE_IOMAP_TYPE_PURE_OVERWRITE (255) > > > + > > > +#define FUSE_IOMAP_DEV_NULL (0U) /* null device cookie */ > > > + > > > +/* mapping flags passed back from iomap_begin; see corresponding IOMAP_F_ */ > > > +#define FUSE_IOMAP_F_NEW (1U << 0) > > > +#define FUSE_IOMAP_F_DIRTY (1U << 1) > > > +#define FUSE_IOMAP_F_SHARED (1U << 2) > > > +#define FUSE_IOMAP_F_MERGED (1U << 3) > > > +#define FUSE_IOMAP_F_BOUNDARY (1U << 4) > > > +#define FUSE_IOMAP_F_ANON_WRITE (1U << 5) > > > +#define FUSE_IOMAP_F_ATOMIC_BIO (1U << 6) > > > > Do you think it makes sense to have the fuse iomap constants mirror > > the in-kernel iomap ones? Maybe I'm mistaken but it seems like the > > fuse iomap capabilities won't diverge too much from fs/iomap ones? I > > like that if they're mirrored, then it makes it simpler instead of > > needing to convert back and forth. > > "Mirrored"? As in, having the define use a symbol: > > #define FUSE_IOMAP_F_NEW IOMAP_F_NEW > > instead of defining it to be a specific numerical constant like it is > here? I was thinking keeping it like it is with defining it to a specific numerical constant, but having the number correspond to the number iomap.h uses and having static asserts to ensure they match, and then being able to just pass struct fuse_iomap_io's flags directly to iomap->flags and vice versa. But I guess the iomap constants could change at any time since it's not a uapi. 
> > <confused> > > This might not be answering your question, but as an old iomap > maintainer I want the kernel iomap api and the fuse iomap uabi to > be as decoupled as they can be; and trust the compiler to notice that > the flag and enum constants are the same and not do anything too stupid > with the translation. Gotcha, that makes sense. > > > > +/* fuse-specific mapping flag asking for ->iomap_end call */ > > > +#define FUSE_IOMAP_F_WANT_IOMAP_END (1U << 7) > > > + > > > +/* mapping flags passed to iomap_end */ > > > +#define FUSE_IOMAP_F_SIZE_CHANGED (1U << 8) > > > +#define FUSE_IOMAP_F_STALE (1U << 9) > > > + > > > +/* operation flags from iomap; see corresponding IOMAP_* */ > > > +#define FUSE_IOMAP_OP_WRITE (1U << 0) > > > +#define FUSE_IOMAP_OP_ZERO (1U << 1) > > > +#define FUSE_IOMAP_OP_REPORT (1U << 2) > > > +#define FUSE_IOMAP_OP_FAULT (1U << 3) > > > +#define FUSE_IOMAP_OP_DIRECT (1U << 4) > > > +#define FUSE_IOMAP_OP_NOWAIT (1U << 5) > > > +#define FUSE_IOMAP_OP_OVERWRITE_ONLY (1U << 6) > > > +#define FUSE_IOMAP_OP_UNSHARE (1U << 7) > > > +#define FUSE_IOMAP_OP_DAX (1U << 8) > > > +#define FUSE_IOMAP_OP_ATOMIC (1U << 9) > > > +#define FUSE_IOMAP_OP_DONTCACHE (1U << 10) > > > + > > > +#define FUSE_IOMAP_NULL_ADDR (-1ULL) /* addr is not valid */ > > > + > > > +struct fuse_iomap_io { > > > + uint64_t offset; /* file offset of mapping, bytes */ > > > + uint64_t length; /* length of mapping, bytes */ > > > + uint64_t addr; /* disk offset of mapping, bytes */ > > > + uint16_t type; /* FUSE_IOMAP_TYPE_* */ > > > + uint16_t flags; /* FUSE_IOMAP_F_* */ > > > + uint32_t dev; /* device cookie */ > > > > Do you think it's a good idea to add a reserved field here in case we > > end up needing it in the future? > > I'm open to the idea of pre-padding the structs, though that's extra > copy overhead until they get used for something. 
Bernd would know better than me on this, but iirc, fuse generally tries to prepad structs to avoid having to deal with backwards compatibility issues if future fields get added. > > Does that fuse-iouring-zerocopy patchset that you're working on enable > the kernel to avoid copying fuse command data around? I haven't read it > in sufficient (or any) detail to know the answer to that question. No, only the payload bypasses the copy. All the header stuff would have to get copied out to the ring. > > Second: how easy is it to send a variable sized fuse command to > userspace? It looks like some commands like FUSE_WRITE do things like: > > if (ff->fm->fc->minor < 9) > args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE; > else > args->in_args[0].size = sizeof(ia->write.in); > args->in_args[0].value = &ia->write.in; > args->in_args[1].size = count; > > Which means that future expansion can (in theory) bump the minor version > and send larger commands. > > It also looks like the kernel can support receiving variable-sized > responses, like FUSE_READ does: > > args->out_argvar = true; > args->out_numargs = 1; > args->out_args[0].size = count; > > I think this means that if we ever needed to expand the _out struct to > allow the fuse server to send back a more lengthy response, we could > potentially do that without needing a minor protocol version bump. I'm not sure, Bernd or Miklos would know more, but my general impression has been that we try to avoid doing the FUSE_COMPAT_ stuff if we can.
> > > > +}; > > > + > > > +struct fuse_iomap_begin_in { > > > + uint32_t opflags; /* FUSE_IOMAP_OP_* */ > > > + uint32_t reserved; /* zero */ > > > + uint64_t attr_ino; /* matches fuse_attr:ino */ > > > + uint64_t pos; /* file position, in bytes */ > > > + uint64_t count; /* operation length, in bytes */ > > > +}; > > > + > > > +struct fuse_iomap_begin_out { > > > + /* read file data from here */ > > > + struct fuse_iomap_io read; > > > + > > > + /* write file data to here, if applicable */ > > > + struct fuse_iomap_io write; > > > > Same question here > > How much padding do you want? fuse_iomap_io is conveniently half a > cacheline right now... > > > > +}; > > > + > > > +struct fuse_iomap_end_in { > > > + uint32_t opflags; /* FUSE_IOMAP_OP_* */ > > > + uint32_t reserved; /* zero */ > > > + uint64_t attr_ino; /* matches fuse_attr:ino */ > > > + uint64_t pos; /* file position, in bytes */ > > > + uint64_t count; /* operation length, in bytes */ > > > + int64_t written; /* bytes processed */ > > > > On the fs/iomap side, I see that written is passed through by > > iomap_iter() to ->iomap_end through 'ssize_t advanced' but it's not > > clear to me why advanced needs to be signed. I think it used to also > > represent the error status, but it looks like now that's represented > > through iter->status and 'advanced' strictly reflects the number of > > bytes written. As such, do you think it makes sense to change > > 'advanced' to loff_t and have written be uint64_t instead? > > Not quite -- back in the bad old days, iomap_iter::processed was a s64 > value that the iteration loop had to set to one of: > > * a positive number for positive progress > * zero to stop the iteration > * a negative errno to fail out > > Nowadays we just move iomap_iter::pos forward via iomap_iter_advance or > set status to a negative number to end the iteration. > > So yes, I think @advanced should be widened to 64-bits since iomap > operations can jump more than 2GB per iter step. 
Practically speaking I > think this hasn't yet been a problem because the only operations that > can do that (fiemap, seek, swap) also don't have any client filesystems > that implement iomap_end; or they do but never send mappings large > enough to cause problems. > > iomap iters can't go backwards so @advanced could be u64 as well. > > Also the name of the ->iomap_end parameter could be changed to > "advanced" because iomap_end could in theory be called for any > operation, not just writes. That's a throwback to the days when the > iomap code was just part of xfs. It also is an unsigned quantity. That makes sense, thanks for the context. > > > > + > > > + /* mapping that the kernel acted upon */ > > > + struct fuse_iomap_io map; > > > +}; > > > + > > > #endif /* _LINUX_FUSE_H */ > > > diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig > > > index 290d1c09e0b924..934d48076a010c 100644 > > > --- a/fs/fuse/Kconfig > > > +++ b/fs/fuse/Kconfig > > > @@ -69,6 +69,38 @@ config FUSE_PASSTHROUGH > > > config FUSE_BACKING > > > bool > > > > > > +config FUSE_IOMAP > > > + bool "FUSE file IO over iomap" > > > + default y > > > + depends on FUSE_FS > > > + depends on BLOCK > > > + select FS_IOMAP > > > + help > > > + Enable fuse servers to operate the regular file I/O path through > > > + the fs-iomap library in the kernel. This enables higher performance > > > + userspace filesystems by keeping the performance critical parts in > > > + the kernel while delegating the difficult metadata parsing parts to > > > + an easily-contained userspace program. > > > + > > > + This feature is considered EXPERIMENTAL. Use with caution! > > > + > > > + If unsure, say N. > > > + > > > +config FUSE_IOMAP_BY_DEFAULT > > > + bool "FUSE file I/O over iomap by default" > > > + default n > > > + depends on FUSE_IOMAP > > > + help > > > + Enable sending FUSE file I/O over iomap by default. 
> > > > I'm not really sure what the general linux preference is for adding > > new configs, but assuming it errs towards less configs than more, imo > > it seems easy enough to just set the enable_iomap module param to true > > manually instead of needing this config for it, especially since the > > param only needs to be set once. > > /me doesn't know what the norm is in fuse-land -- for xfs I've preferred > to have a kconfig option for experimental code so that distros can turn > off experimental stuff they don't want to support. > > OTOH they can also patch it out or affix the module param to 0. > > Also I'm not sure if the kernel tinyfication project is still active, > for a while they were advocating strongly for more kconfig options so > that people building embedded kernels could turn off big chunks of > functionality they'd never need. > > > > + > > > +config FUSE_IOMAP_DEBUG > > > + bool "Debug FUSE file IO over iomap" > > > + default y > > > + depends on FUSE_IOMAP > > > + help > > > + Enable debugging assertions for the fuse iomap code paths and logging > > > + of bad iomap file mapping data being sent to the kernel. > > > > I'm wondering if it makes sense to make this a general FUSE_DEBUG > > config so we can reuse this more generally > > In general yes but I highly recommend that everyone look at the static > labels and auto-ftracing stuff enabled by the next few debug patches > before anyone commits to spreading that enhanced observability / brain > disease to the rest of fuse. 
;) > > > > + > > > config FUSE_IO_URING > > > bool "FUSE communication over io-uring" > > > default y > > > diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile > > > index 46041228e5be2c..27be39317701d6 100644 > > > --- a/fs/fuse/Makefile > > > +++ b/fs/fuse/Makefile > > > @@ -18,5 +18,6 @@ fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o > > > fuse-$(CONFIG_FUSE_BACKING) += backing.o > > > fuse-$(CONFIG_SYSCTL) += sysctl.o > > > fuse-$(CONFIG_FUSE_IO_URING) += dev_uring.o > > > +fuse-$(CONFIG_FUSE_IOMAP) += file_iomap.o > > > > > > virtiofs-y := virtio_fs.o > > > diff --git a/fs/fuse/file_iomap.c b/fs/fuse/file_iomap.c > > > new file mode 100644 > > > index 00000000000000..d564d60d0f1779 > > > --- /dev/null > > > +++ b/fs/fuse/file_iomap.c > > > @@ -0,0 +1,434 @@ > > > +// SPDX-License-Identifier: GPL-2.0 > > > +/* > > > + * Copyright (C) 2025 Oracle. All Rights Reserved. > > > + * Author: Darrick J. Wong <djwong@kernel.org> > > > + */ > > > +#include <linux/iomap.h> > > > +#include "fuse_i.h" > > > +#include "fuse_trace.h" > > > +#include "iomap_i.h" > > > + > > > +static bool __read_mostly enable_iomap = > > > +#if IS_ENABLED(CONFIG_FUSE_IOMAP_BY_DEFAULT) > > > + true; > > > +#else > > > + false; > > > +#endif > > > +module_param(enable_iomap, bool, 0644); > > > +MODULE_PARM_DESC(enable_iomap, "Enable file I/O through iomap"); > > > + > > > +bool fuse_iomap_enabled(void) > > > +{ > > > + /* Don't let anyone touch iomap until the end of the patchset. */ > > > + return false; > > > + > > > + /* > > > + * There are fears that a fuse+iomap server could somehow DoS the > > > + * system by doing things like going out to lunch during a writeback > > > + * related iomap request. Only allow iomap access if the fuse server > > > + * has rawio capabilities since those processes can mess things up > > > + * quite well even without our help. 
> > > + */ > > > + return enable_iomap && has_capability_noaudit(current, CAP_SYS_RAWIO); > > > +} > > > + > > > +/* Convert IOMAP_* mapping types to FUSE_IOMAP_TYPE_* */ > > > +#define XMAP(word) \ > > > + case IOMAP_##word: \ > > > + return FUSE_IOMAP_TYPE_##word > > > +static inline uint16_t fuse_iomap_type_to_server(uint16_t iomap_type) > > > +{ > > > + switch (iomap_type) { > > > + XMAP(HOLE); > > > + XMAP(DELALLOC); > > > + XMAP(MAPPED); > > > + XMAP(UNWRITTEN); > > > + XMAP(INLINE); > > > + default: > > > + ASSERT(0); > > > + } > > > + return 0; > > > +} > > > +#undef XMAP > > > + > > > +/* Convert FUSE_IOMAP_TYPE_* to IOMAP_* mapping types */ > > > +#define XMAP(word) \ > > > + case FUSE_IOMAP_TYPE_##word: \ > > > + return IOMAP_##word > > > +static inline uint16_t fuse_iomap_type_from_server(uint16_t fuse_type) > > > +{ > > > + switch (fuse_type) { > > > + XMAP(HOLE); > > > + XMAP(DELALLOC); > > > + XMAP(MAPPED); > > > + XMAP(UNWRITTEN); > > > + XMAP(INLINE); > > > + default: > > > + ASSERT(0); > > > + } > > > + return 0; > > > +} > > > +#undef XMAP > > > + > > > +/* Validate FUSE_IOMAP_TYPE_* */ > > > +static inline bool fuse_iomap_check_type(uint16_t fuse_type) > > > +{ > > > + switch (fuse_type) { > > > + case FUSE_IOMAP_TYPE_HOLE: > > > + case FUSE_IOMAP_TYPE_DELALLOC: > > > + case FUSE_IOMAP_TYPE_MAPPED: > > > + case FUSE_IOMAP_TYPE_UNWRITTEN: > > > + case FUSE_IOMAP_TYPE_INLINE: > > > + case FUSE_IOMAP_TYPE_PURE_OVERWRITE: > > > + return true; > > > + } > > > + > > > + return false; > > > +} > > > + > > > +#define FUSE_IOMAP_F_ALL (FUSE_IOMAP_F_NEW | \ > > > + FUSE_IOMAP_F_DIRTY | \ > > > + FUSE_IOMAP_F_SHARED | \ > > > + FUSE_IOMAP_F_MERGED | \ > > > + FUSE_IOMAP_F_BOUNDARY | \ > > > + FUSE_IOMAP_F_ANON_WRITE | \ > > > + FUSE_IOMAP_F_ATOMIC_BIO | \ > > > + FUSE_IOMAP_F_WANT_IOMAP_END) > > > + > > > +static inline bool fuse_iomap_check_flags(uint16_t flags) > > > +{ > > > + return (flags & ~FUSE_IOMAP_F_ALL) == 0; > > > +} > > > + > > > +/* Convert 
IOMAP_F_* mapping state flags to FUSE_IOMAP_F_* */ > > > +#define XMAP(word) \ > > > + if (iomap_f_flags & IOMAP_F_##word) \ > > > + ret |= FUSE_IOMAP_F_##word > > > +#define YMAP(iword, oword) \ > > > + if (iomap_f_flags & IOMAP_F_##iword) \ > > > + ret |= FUSE_IOMAP_F_##oword > > > +static inline uint16_t fuse_iomap_flags_to_server(uint16_t iomap_f_flags) > > > +{ > > > + uint16_t ret = 0; > > > + > > > + XMAP(NEW); > > > + XMAP(DIRTY); > > > + XMAP(SHARED); > > > + XMAP(MERGED); > > > + XMAP(BOUNDARY); > > > + XMAP(ANON_WRITE); > > > + XMAP(ATOMIC_BIO); > > > + YMAP(PRIVATE, WANT_IOMAP_END); > > > + > > > + XMAP(SIZE_CHANGED); > > > + XMAP(STALE); > > > + > > > + return ret; > > > +} > > > +#undef YMAP > > > +#undef XMAP > > > + > > > +/* Convert FUSE_IOMAP_F_* to IOMAP_F_* mapping state flags */ > > > +#define XMAP(word) \ > > > + if (fuse_f_flags & FUSE_IOMAP_F_##word) \ > > > + ret |= IOMAP_F_##word > > > +#define YMAP(iword, oword) \ > > > + if (fuse_f_flags & FUSE_IOMAP_F_##iword) \ > > > + ret |= IOMAP_F_##oword > > > +static inline uint16_t fuse_iomap_flags_from_server(uint16_t fuse_f_flags) > > > +{ > > > + uint16_t ret = 0; > > > + > > > + XMAP(NEW); > > > + XMAP(DIRTY); > > > + XMAP(SHARED); > > > + XMAP(MERGED); > > > + XMAP(BOUNDARY); > > > + XMAP(ANON_WRITE); > > > + XMAP(ATOMIC_BIO); > > > + YMAP(WANT_IOMAP_END, PRIVATE); > > > + > > > + return ret; > > > +} > > > +#undef YMAP > > > +#undef XMAP > > > + > > > +/* Convert IOMAP_* operation flags to FUSE_IOMAP_OP_* */ > > > +#define XMAP(word) \ > > > + if (iomap_op_flags & IOMAP_##word) \ > > > + ret |= FUSE_IOMAP_OP_##word > > > +static inline uint32_t fuse_iomap_op_to_server(unsigned iomap_op_flags) > > > +{ > > > + uint32_t ret = 0; > > > + > > > + XMAP(WRITE); > > > + XMAP(ZERO); > > > + XMAP(REPORT); > > > + XMAP(FAULT); > > > + XMAP(DIRECT); > > > + XMAP(NOWAIT); > > > + XMAP(OVERWRITE_ONLY); > > > + XMAP(UNSHARE); > > > + XMAP(DAX); > > > + XMAP(ATOMIC); > > > + XMAP(DONTCACHE); > > > + > > > 
+ return ret; > > > +} > > > +#undef XMAP > > > + > > > +/* Validate an iomap mapping. */ > > > +static inline bool fuse_iomap_check_mapping(const struct inode *inode, > > > + const struct fuse_iomap_io *map, > > > + enum fuse_iomap_iodir iodir) > > > +{ > > > + const unsigned int blocksize = i_blocksize(inode); > > > + uint64_t end; > > > + > > > + /* Type and flags must be known */ > > > + if (BAD_DATA(!fuse_iomap_check_type(map->type))) > > > + return false; > > > + if (BAD_DATA(!fuse_iomap_check_flags(map->flags))) > > > + return false; > > > + > > > + /* No zero-length mappings */ > > > + if (BAD_DATA(map->length == 0)) > > > + return false; > > > + > > > + /* File range must be aligned to blocksize */ > > > + if (BAD_DATA(!IS_ALIGNED(map->offset, blocksize))) > > > + return false; > > > + if (BAD_DATA(!IS_ALIGNED(map->length, blocksize))) > > > + return false; > > > + > > > + /* No overflows in the file range */ > > > + if (BAD_DATA(check_add_overflow(map->offset, map->length, &end))) > > > + return false; > > > + > > > + /* File range cannot start past maxbytes */ > > > + if (BAD_DATA(map->offset >= inode->i_sb->s_maxbytes)) > > > + return false; > > > + > > > + switch (map->type) { > > > + case FUSE_IOMAP_TYPE_MAPPED: > > > + case FUSE_IOMAP_TYPE_UNWRITTEN: > > > + /* Mappings backed by space must have a device/addr */ > > > + if (BAD_DATA(map->dev == FUSE_IOMAP_DEV_NULL)) > > > + return false; > > > + if (BAD_DATA(map->addr == FUSE_IOMAP_NULL_ADDR)) > > > + return false; > > > + break; > > > + case FUSE_IOMAP_TYPE_DELALLOC: > > > + case FUSE_IOMAP_TYPE_HOLE: > > > + case FUSE_IOMAP_TYPE_INLINE: > > > + /* Mappings not backed by space cannot have a device addr. 
*/ > > > + if (BAD_DATA(map->dev != FUSE_IOMAP_DEV_NULL)) > > > + return false; > > > + if (BAD_DATA(map->addr != FUSE_IOMAP_NULL_ADDR)) > > > + return false; > > > + break; > > > + case FUSE_IOMAP_TYPE_PURE_OVERWRITE: > > > + /* "Pure overwrite" only allowed for write mapping */ > > > + if (BAD_DATA(iodir != WRITE_MAPPING)) > > > + return false; > > > + break; > > > + default: > > > + /* should have been caught already */ > > > + ASSERT(0); > > > + return false; > > > + } > > > + > > > + /* XXX: we don't support devices yet */ > > > > > + if (BAD_DATA(map->dev != FUSE_IOMAP_DEV_NULL)) > > > + return false; > > > + > > > + /* No overflows in the device range, if supplied */ > > > + if (map->addr != FUSE_IOMAP_NULL_ADDR && > > > + BAD_DATA(check_add_overflow(map->addr, map->length, &end))) > > > + return false; > > > + > > > + return true; > > > +} > > > + > > > +/* Convert a mapping from the server into something the kernel can use */ > > > +static inline void fuse_iomap_from_server(struct inode *inode, > > > > Maybe worth adding a const in front of struct inode? > > It can go away in a patch or two when we wire up bdev support. > > Though considering that fuse_iomap_enabled returns false all the way to > the end of the patchset I guess I could just set bdev to null and skip > passing in the inode at all. 
> > > > + struct iomap *iomap, > > > + const struct fuse_iomap_io *fmap) > > > +{ > > > + iomap->addr = fmap->addr; > > > + iomap->offset = fmap->offset; > > > + iomap->length = fmap->length; > > > + iomap->type = fuse_iomap_type_from_server(fmap->type); > > > + iomap->flags = fuse_iomap_flags_from_server(fmap->flags); > > > + iomap->bdev = inode->i_sb->s_bdev; /* XXX */ > > > +} > > > + > > > +/* Convert a mapping from the kernel into something the server can use */ > > > +static inline void fuse_iomap_to_server(struct fuse_iomap_io *fmap, > > > + const struct iomap *iomap) > > > +{ > > > + fmap->addr = FUSE_IOMAP_NULL_ADDR; /* XXX */ > > > + fmap->offset = iomap->offset; > > > + fmap->length = iomap->length; > > > + fmap->type = fuse_iomap_type_to_server(iomap->type); > > > + fmap->flags = fuse_iomap_flags_to_server(iomap->flags); > > > + fmap->dev = FUSE_IOMAP_DEV_NULL; /* XXX */ > > > > AFAICT, this only gets used for sending the FUSE_IOMAP_END request. Is > > passing the iomap->addr to fmap->addr and inode->i_sb->s_bdev to > > fmap->dev not useful to the server here? > > So far the only fields I've needed in fuse4fs are the > offset/count/written fields as provided by iomap_iter, and the flags > field from the mapping. The addr field isn't necessary for fuse4fs > because the fuse server would know if the mapping had changed. OTOH > it's probably harmless to send it along. > > Hrm. I probably need a way to look up the backing_id from the iomap > bdev. > > Looking further ahead at the ioend patch, I just realized that iomap > ioends can tell you the new address of a write-append operation but they > don't tell you which device. I guess you can read that from the > ioend->io_bio.bi_bdev. > > > Also, did you mean to leave in the /* XXX */ comments? > > Yes, because they're a reminder to come back and check if I /ever/ > needed them. Makes sense, seems like you're planning to remove them when the patch is ready to merge, if I understand correctly. 
> > > > +} > > > + > > > +/* Check the incoming _begin mappings to make sure they're not nonsense. */ > > > +static inline int > > > +fuse_iomap_begin_validate(const struct inode *inode, > > > + unsigned opflags, loff_t pos, > > > + const struct fuse_iomap_begin_out *outarg) > > > +{ > > > + /* Make sure the mappings aren't garbage */ > > > + if (!fuse_iomap_check_mapping(inode, &outarg->read, READ_MAPPING)) > > > + return -EFSCORRUPTED; > > > + > > > + if (!fuse_iomap_check_mapping(inode, &outarg->write, WRITE_MAPPING)) > > > + return -EFSCORRUPTED; > > > + > > > + /* > > > + * Must have returned a mapping for at least the first byte in the > > > + * range. The main mapping check already validated that the length > > > + * is nonzero and there is no overflow in computing end. > > > + */ > > > + if (BAD_DATA(outarg->read.offset > pos)) > > > + return -EFSCORRUPTED; > > > + if (BAD_DATA(outarg->write.offset > pos)) > > > + return -EFSCORRUPTED; > > > + > > > + if (BAD_DATA(outarg->read.offset + outarg->read.length <= pos)) > > > + return -EFSCORRUPTED; > > > + if (BAD_DATA(outarg->write.offset + outarg->write.length <= pos)) > > > + return -EFSCORRUPTED; > > > + > > > + return 0; > > > +} > > > + > > > +static inline bool fuse_is_iomap_file_write(unsigned int opflags) > > > +{ > > > + return opflags & (IOMAP_WRITE | IOMAP_ZERO | IOMAP_UNSHARE); > > > +} > > > + > > > +static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t count, > > > + unsigned opflags, struct iomap *iomap, > > > + struct iomap *srcmap) > > > +{ > > > + struct fuse_inode *fi = get_fuse_inode(inode); > > > + struct fuse_iomap_begin_in inarg = { > > > + .attr_ino = fi->orig_ino, > > > + .opflags = fuse_iomap_op_to_server(opflags), > > > + .pos = pos, > > > + .count = count, > > > + }; > > > + struct fuse_iomap_begin_out outarg = { }; > > > + struct fuse_mount *fm = get_fuse_mount(inode); > > > + FUSE_ARGS(args); > > > + int err; > > > + > > > + args.opcode = FUSE_IOMAP_BEGIN; > > > + 
args.nodeid = get_node_id(inode); > > > + args.in_numargs = 1; > > > + args.in_args[0].size = sizeof(inarg); > > > + args.in_args[0].value = &inarg; > > > + args.out_numargs = 1; > > > + args.out_args[0].size = sizeof(outarg); > > > + args.out_args[0].value = &outarg; > > > + err = fuse_simple_request(fm, &args); > > > + if (err) > > > + return err; > > > + > > > + err = fuse_iomap_begin_validate(inode, opflags, pos, &outarg); > > > + if (err) > > > + return err; > > > + > > > + if (fuse_is_iomap_file_write(opflags) && > > > + outarg.write.type != FUSE_IOMAP_TYPE_PURE_OVERWRITE) { > > > + /* > > > + * For an out of place write, we must supply the write mapping > > > + * via @iomap, and the read mapping via @srcmap. > > > + */ > > > + fuse_iomap_from_server(inode, iomap, &outarg.write); > > > + fuse_iomap_from_server(inode, srcmap, &outarg.read); > > > + } else { > > > + /* > > > + * For everything else (reads, reporting, and pure overwrites), > > > + * we can return the sole mapping through @iomap and leave > > > + * @srcmap unchanged from its default (HOLE). > > > + */ > > > + fuse_iomap_from_server(inode, iomap, &outarg.read); > > > + } > > > + > > > + return 0; > > > +} > > > + > > > +/* Decide if we send FUSE_IOMAP_END to the fuse server */ > > > +static bool fuse_should_send_iomap_end(const struct iomap *iomap, > > > + unsigned int opflags, loff_t count, > > > + ssize_t written) > > > +{ > > > + /* fuse server demanded an iomap_end call. 
*/ > > > + if (iomap->flags & FUSE_IOMAP_F_WANT_IOMAP_END) > > > + return true; > > > + > > > + /* Reads and reporting should never affect the filesystem metadata */ > > > + if (!fuse_is_iomap_file_write(opflags)) > > > + return false; > > > + > > > + /* Appending writes get an iomap_end call */ > > > + if (iomap->flags & IOMAP_F_SIZE_CHANGED) > > > + return true; > > > + > > > + /* Short writes get an iomap_end call to clean up delalloc */ > > > + return written < count; > > > +} > > > + > > > +static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t count, > > > + ssize_t written, unsigned opflags, > > > + struct iomap *iomap) > > > +{ > > > + struct fuse_inode *fi = get_fuse_inode(inode); > > > + struct fuse_mount *fm = get_fuse_mount(inode); > > > + int err = 0; > > > + > > > + if (fuse_should_send_iomap_end(iomap, opflags, count, written)) { > > > + struct fuse_iomap_end_in inarg = { > > > + .opflags = fuse_iomap_op_to_server(opflags), > > > + .attr_ino = fi->orig_ino, > > > + .pos = pos, > > > + .count = count, > > > + .written = written, > > > + }; > > > + FUSE_ARGS(args); > > > + > > > + fuse_iomap_to_server(&inarg.map, iomap); > > > + > > > + args.opcode = FUSE_IOMAP_END; > > > + args.nodeid = get_node_id(inode); > > > > Just curious about this - does it make sense to set args.force here > > for this opcode? It seems like it serves the same sort of purpose a > > flush request (which sets args.force) does? > > What does args.force do? There's no documentation of what behaviors > these fields are supposed to trigger. The args.force forces the request to be sent even if it gets interrupted by a signal. It'll also bypass the fuse_block_alloc() check when sending the request, but I don't think that's too relevant to this case. 
Thanks, Joanne > > > > + args.in_numargs = 1; > > > + args.in_args[0].size = sizeof(inarg); > > > + args.in_args[0].value = &inarg; > > > + err = fuse_simple_request(fm, &args); > > > + switch (err) { > > > + case -ENOSYS: > > > + /* > > > + * libfuse returns ENOSYS for servers that don't > > > + * implement iomap_end > > > + */ > > > + err = 0; > > > + break; > > > + case 0: > > > + break; > > > > Is this case 0 needed separately from the default case? > > Nah, that's just me absorbing functional brogrammerisms. ;) > > --D > > > Thanks, > > Joanne > > > > > + default: > > > + break; > > > + } > > > + } > > > + > > > + return err; > > > +} > > > + > > > +const struct iomap_ops fuse_iomap_ops = { > > > + .iomap_begin = fuse_iomap_begin, > > > + .iomap_end = fuse_iomap_end, > > > +}; > > > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c > > > index 0cac7164afa298..1eea8dc6e723c6 100644 > > > --- a/fs/fuse/inode.c > > > +++ b/fs/fuse/inode.c > > > @@ -1457,6 +1457,12 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, > > > > > > if (flags & FUSE_REQUEST_TIMEOUT) > > > timeout = arg->request_timeout; > > > + > > > + if ((flags & FUSE_IOMAP) && fuse_iomap_enabled()) { > > > + fc->iomap = 1; > > > + pr_warn( > > > + "EXPERIMENTAL iomap feature enabled. Use at your own risk!"); > > > + } > > > } else { > > > ra_pages = fc->max_read / PAGE_SIZE; > > > fc->no_lock = 1; > > > @@ -1525,6 +1531,8 @@ static struct fuse_init_args *fuse_new_init(struct fuse_mount *fm) > > > */ > > > if (fuse_uring_enabled()) > > > flags |= FUSE_OVER_IO_URING; > > > + if (fuse_iomap_enabled()) > > > + flags |= FUSE_IOMAP; > > > > > > ia->in.flags = flags; > > > ia->in.flags2 = flags >> 32; > > > ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCH 01/31] fuse: implement the basic iomap mechanisms 2026-01-22 0:06 ` Joanne Koong @ 2026-01-22 0:34 ` Darrick J. Wong 0 siblings, 0 replies; 52+ messages in thread From: Darrick J. Wong @ 2026-01-22 0:34 UTC (permalink / raw) To: Joanne Koong; +Cc: miklos, bernd, neal, linux-ext4, linux-fsdevel On Wed, Jan 21, 2026 at 04:06:39PM -0800, Joanne Koong wrote: > On Wed, Jan 21, 2026 at 2:45 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > On Wed, Jan 21, 2026 at 11:34:24AM -0800, Joanne Koong wrote: > > > On Tue, Oct 28, 2025 at 5:45 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > > > From: Darrick J. Wong <djwong@kernel.org> > > > > > > > > Implement functions to enable upcalling of iomap_begin and iomap_end to > > > > userspace fuse servers. > > > > > > > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org> > > > > --- > > > > fs/fuse/fuse_i.h | 22 ++ > > > > fs/fuse/iomap_i.h | 36 ++++ > > > > include/uapi/linux/fuse.h | 90 +++++++++ > > > > fs/fuse/Kconfig | 32 +++ > > > > fs/fuse/Makefile | 1 > > > > fs/fuse/file_iomap.c | 434 +++++++++++++++++++++++++++++++++++++++++++++ > > > > fs/fuse/inode.c | 8 + > > > > 7 files changed, 621 insertions(+), 2 deletions(-) > > > > create mode 100644 fs/fuse/iomap_i.h > > > > create mode 100644 fs/fuse/file_iomap.c > > > > > > > > > > > > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h > > > > index 7c7d255d817f1e..45be59df7ae592 100644 > > > > --- a/fs/fuse/fuse_i.h > > > > +++ b/fs/fuse/fuse_i.h > > > > diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h > > > > index 18713cfaf09171..7d709cf12b41a7 100644 > > > > --- a/include/uapi/linux/fuse.h > > > > +++ b/include/uapi/linux/fuse.h > > > > @@ -240,6 +240,9 @@ > > > > * - add FUSE_COPY_FILE_RANGE_64 > > > > * - add struct fuse_copy_file_range_out > > > > * - add FUSE_NOTIFY_PRUNE > > > > + * > > > > + * 7.99 > > > > > > Should this be changed to something like 7.46 now that this patch is > > > submitted for merging into the tree? 
> > > > When review of this patchset nears completion I'll change the 99s to > > 46 or whatever the fuse/libfuse minor version happens to be at that > > point. > > Sounds good. I'll add another XXX comment here to increase the likelihood it doesn't get missed. > > > > Nobody's touched this series since 29 October (during 6.19 development) > > and I've been busy with xfs_healer so I'm not submitting this for 7.0 > > either. > > > > > > + * - add FUSE_IOMAP and iomap_{begin,end,ioend} for regular file operations > > > > */ > > > > > > > > +/* fuse-specific mapping type indicating that writes use the read mapping */ > > > > +#define FUSE_IOMAP_TYPE_PURE_OVERWRITE (255) > > > > + > > > > +#define FUSE_IOMAP_DEV_NULL (0U) /* null device cookie */ > > > > + > > > > +/* mapping flags passed back from iomap_begin; see corresponding IOMAP_F_ */ > > > > +#define FUSE_IOMAP_F_NEW (1U << 0) > > > > +#define FUSE_IOMAP_F_DIRTY (1U << 1) > > > > +#define FUSE_IOMAP_F_SHARED (1U << 2) > > > > +#define FUSE_IOMAP_F_MERGED (1U << 3) > > > > +#define FUSE_IOMAP_F_BOUNDARY (1U << 4) > > > > +#define FUSE_IOMAP_F_ANON_WRITE (1U << 5) > > > > +#define FUSE_IOMAP_F_ATOMIC_BIO (1U << 6) > > > > > > Do you think it makes sense to have the fuse iomap constants mirror > > > the in-kernel iomap ones? Maybe I'm mistaken but it seems like the > > > fuse iomap capabilities won't diverge too much from fs/iomap ones? I > > > like that if they're mirrored, then it makes it simpler instead of > > > needing to convert back and forth. > > > > "Mirrored"? As in, having the define use a symbol: > > > > #define FUSE_IOMAP_F_NEW IOMAP_F_NEW > > > > instead of defining it to be a specific numerical constant like it is > > here? 
> > I was thinking keeping it like it is with defining it to a specific > numerical constant, but having the number correspond to the number > iomap.h uses and having static asserts to ensure they match, and then > being able to just pass struct fuse_iomap_io's flags directly to > iomap->flags and vice versa. But I guess the iomap constants could > change at any time since it's not a uapi. Yep. iomap's api stability is only guaranteed until the mtime changes on include/linux/iomap.h. I actually /did/ do the static assert thing earlier in the lifetime of this patchset, but then I godbolted what the conversion functions were actually doing and observed that gcc and clang are smart enough to collapse all the C code into the appropriate masking if you compile with -O2. <snip> > > > > +struct fuse_iomap_io { > > > > + uint64_t offset; /* file offset of mapping, bytes */ > > > > + uint64_t length; /* length of mapping, bytes */ > > > > + uint64_t addr; /* disk offset of mapping, bytes */ > > > > + uint16_t type; /* FUSE_IOMAP_TYPE_* */ > > > > + uint16_t flags; /* FUSE_IOMAP_F_* */ > > > > + uint32_t dev; /* device cookie */ > > > > > > Do you think it's a good idea to add a reserved field here in case we > > > end up needing it in the future? > > > > I'm open to the idea of pre-padding the structs, though that's extra > > copy overhead until they get used for something. > > Bernd would know better than me on this, but iirc, fuse generally > tries to prepad structs to avoid having to deal with backwards > compatibility issues if future fields get added. <nod> for xfs I've generally added one u64 unless two would round us up to a cacheline... or just defined the struct size to be something insane like 512 bytes. > > > > Does that fuse-iouring-zerocopy patchset that you're working on enable > > the kernel to avoid copying fuse command data around? I haven't read it > > in sufficient (or any) detail to know the answer to that question. 
> > No, only the payload bypasses the copy. All the header stuff would > have to get copied out to the ring. D'oh! :/ > > > > Second: how easy is it to send a variable sized fuse command to > > userspace? It looks like some commands like FUSE_WRITE do things like: > > > > if (ff->fm->fc->minor < 9) > > args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE; > > else > > args->in_args[0].size = sizeof(ia->write.in); > > args->in_args[0].value = &ia->write.in; > > args->in_args[1].size = count; > > > > Which means that future expansion can (in theory) bump the minor version > > and send larer commands. > > > > It also looks like the kernel can support receiving variable-sized > > responses, like FUSE_READ does: > > > > args->out_argvar = true; > > args->out_numargs = 1; > > args->out_args[0].size = count; > > > > I think this means that if we ever needed to expand the _out struct to > > allow the fuse server to send back a more lengthy response, we could > > potentially do that without needing a minor protocol version bump. > > I'm not sure, Bernd or Miklos would know more, but my general > impression has been that we try to avoid doing the FUSE_COMPAT_ stuff > if we can. <nod> revving the minor protocol version will take time to propagate. <snip> > > > > +}; > > > > + > > > > +struct fuse_iomap_end_in { > > > > + uint32_t opflags; /* FUSE_IOMAP_OP_* */ > > > > + uint32_t reserved; /* zero */ > > > > + uint64_t attr_ino; /* matches fuse_attr:ino */ > > > > + uint64_t pos; /* file position, in bytes */ > > > > + uint64_t count; /* operation length, in bytes */ > > > > + int64_t written; /* bytes processed */ > > > > > > On the fs/iomap side, I see that written is passed through by > > > iomap_iter() to ->iomap_end through 'ssize_t advanced' but it's not > > > clear to me why advanced needs to be signed. 
I think it used to also > > > represent the error status, but it looks like now that's represented > > > through iter->status and 'advanced' strictly reflects the number of > > > bytes written. As such, do you think it makes sense to change > > > 'advanced' to loff_t and have written be uint64_t instead? > > > > Not quite -- back in the bad old days, iomap_iter::processed was a s64 > > value that the iteration loop had to set to one of: > > > > * a positive number for positive progress > > * zero to stop the iteration > > * a negative errno to fail out > > > > Nowadays we just move iomap_iter::pos forward via iomap_iter_advance or > > set status to a negative number to end the iteration. Slight inaccuracy: one sets iter->status to a negative number to fail out of the iteration. To end early, they should call iomap_iter without calling iomap_iter_advance. > > So yes, I think @advanced should be widened to 64-bits since iomap > > operations can jump more than 2GB per iter step. Practically speaking I > > think this hasn't yet been a problem because the only operations that > > can do that (fiemap, seek, swap) also don't have any client filesystems > > that implement iomap_end; or they do but never send mappings large > > enough to cause problems. > > > > iomap iters can't go backwards so @advanced could be u64 as well. > > > > Also the name of the ->iomap_end parameter could be changed to > > "advanced" because iomap_end could in theory be called for any > > operation, not just writes. That's a throwback to the days when the > > iomap code was just part of xfs. It also is an unsigned quantity. > > That makes sense, thanks for the context. <nod> <snip> > > > > +/* Convert a mapping from the server into something the kernel can use */ > > > > +static inline void fuse_iomap_from_server(struct inode *inode, > > > > > > Maybe worth adding a const in front of struct inode? > > > > It can go away in a patch or two when we wire up bdev support. 
> > > > Though considering that fuse_iomap_enabled returns false all the way to > > the end of the patchset I guess I could just set bdev to null and skip > > passing in the inode at all. Done. > > > > + struct iomap *iomap, > > > > + const struct fuse_iomap_io *fmap) > > > > +{ > > > > + iomap->addr = fmap->addr; > > > > + iomap->offset = fmap->offset; > > > > + iomap->length = fmap->length; > > > > + iomap->type = fuse_iomap_type_from_server(fmap->type); > > > > + iomap->flags = fuse_iomap_flags_from_server(fmap->flags); > > > > + iomap->bdev = inode->i_sb->s_bdev; /* XXX */ > > > > +} > > > > + > > > > +/* Convert a mapping from the kernel into something the server can use */ > > > > +static inline void fuse_iomap_to_server(struct fuse_iomap_io *fmap, > > > > + const struct iomap *iomap) > > > > +{ > > > > + fmap->addr = FUSE_IOMAP_NULL_ADDR; /* XXX */ > > > > + fmap->offset = iomap->offset; > > > > + fmap->length = iomap->length; > > > > + fmap->type = fuse_iomap_type_to_server(iomap->type); > > > > + fmap->flags = fuse_iomap_flags_to_server(iomap->flags); > > > > + fmap->dev = FUSE_IOMAP_DEV_NULL; /* XXX */ > > > > > > AFAICT, this only gets used for sending the FUSE_IOMAP_END request. Is > > > passing the iomap->addr to fmap->addr and inode->i_sb->s_bdev to > > > fmap->dev not useful to the server here? > > > > So far the only fields I've needed in fuse4fs are the > > offset/count/written fields as provided by iomap_iter, and the flags > > field from the mapping. The addr field isn't necessary for fuse4fs > > because the fuse server would know if the mapping had changed. OTOH > > it's probably harmless to send it along. > > > > Hrm. I probably need a way to look up the backing_id from the iomap > > bdev. > > > > Looking further ahead at the ioend patch, I just realized that iomap > > ioends can tell you the new address of a write-append operation but they > > don't tell you which device. I guess you can read that from the > > ioend->io_bio.bi_bdev. 
> > > > > Also, did you mean to leave in the /* XXX */ comments? > > > > Yes, because they're a reminder to come back and check if I /ever/ > > needed them. > > Makes sense, seems like you're planning to remove them when the patch > is ready to merge, if I understand correctly. Yeah. I also fixed this fuse_iomap_to_server to set fmap->dev. <snip> > > > > +static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t count, > > > > + ssize_t written, unsigned opflags, > > > > + struct iomap *iomap) > > > > +{ > > > > + struct fuse_inode *fi = get_fuse_inode(inode); > > > > + struct fuse_mount *fm = get_fuse_mount(inode); > > > > + int err = 0; > > > > + > > > > + if (fuse_should_send_iomap_end(iomap, opflags, count, written)) { > > > > + struct fuse_iomap_end_in inarg = { > > > > + .opflags = fuse_iomap_op_to_server(opflags), > > > > + .attr_ino = fi->orig_ino, > > > > + .pos = pos, > > > > + .count = count, > > > > + .written = written, > > > > + }; > > > > + FUSE_ARGS(args); > > > > + > > > > + fuse_iomap_to_server(&inarg.map, iomap); > > > > + > > > > + args.opcode = FUSE_IOMAP_END; > > > > + args.nodeid = get_node_id(inode); > > > > > > Just curious about this - does it make sense to set args.force here > > > for this opcode? It seems like it serves the same sort of purpose a > > > flush request (which sets args.force) does? > > > > What does args.force do? There's no documentation of what behaviors > > these fields are supposed to trigger. > > The args.force forces the request to be sent even if it gets > interrupted by a signal. It'll also bypass the fuse_block_alloc() > check when sending the request, but I don't think that's too relevant > to this case. Hrm. For iomap_begin I think it's ok if a signal kills the IO operation. For iomap_end ... I guess we really should force the command out to the server in case it needs to clean up, even if the user is hammering on kill -9. 
For iomap_ioend the same probably applies, but it's called from workqueue context so there's not going to be a fatal signal. But maybe we should do that, just in case someone develops motivation to make directio completions run in the caller's context or something. --D ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCH 01/31] fuse: implement the basic iomap mechanisms [not found] ` <176169810371.1424854.3010195280915622081.stgit@frogsfrogsfrogs> 2026-01-21 19:34 ` [PATCH 01/31] fuse: implement the basic iomap mechanisms Joanne Koong @ 2026-02-05 19:22 ` Chris Mason 2026-02-05 23:31 ` Darrick J. Wong 1 sibling, 1 reply; 52+ messages in thread From: Chris Mason @ 2026-02-05 19:22 UTC (permalink / raw) To: Darrick J. Wong Cc: miklos, joannelkoong, bernd, neal, linux-ext4, linux-fsdevel "Darrick J. Wong" <djwong@kernel.org> wrote: > From: Darrick J. Wong <djwong@kernel.org> > > Implement functions to enable upcalling of iomap_begin and iomap_end to > userspace fuse servers. > Hi everyone, I'm trying out my AI review prompts on a few more trees, and I ran it on the fuse-iomap-cache branch: https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-iomap-cache This was the latest email lore could find, apologies if I grabbed the wrong one. Hopefully the review below is useful, please let me know if there are places it has gone wildly wrong. I kept the header to make it obvious exactly which commit I'd run through: (I don't fully understand why it calls out commit b3f5bd5243, looks like it's probably just the last place that changes nearby code) commit 99dffae5966f9647a430fa4c9198e2d2dacae172 Author: Darrick J. Wong <djwong@kernel.org> fuse: implement the basic iomap mechanisms Implement functions to enable upcalling of iomap_begin and iomap_end to userspace fuse servers. > diff --git a/fs/fuse/fuse_iomap.c b/fs/fuse/fuse_iomap.c > new file mode 100644 > index 0000000000000..d1c2c2e1e68b8 > --- /dev/null > +++ b/fs/fuse/fuse_iomap.c [ ... ] > @@ -0,0 +1,430 @@ [ ... 
] > +/* Convert a mapping from the server into something the kernel can use */ > +static inline void fuse_iomap_from_server(struct iomap *iomap, > + const struct fuse_iomap_io *fmap) > +{ > + iomap->addr = fmap->addr; > + iomap->offset = fmap->offset; > + iomap->length = fmap->length; > + iomap->type = fuse_iomap_type_from_server(fmap->type); > + iomap->flags = fuse_iomap_flags_from_server(fmap->flags); > + iomap->bdev = NULL; /* XXX */ > +} > + > +/* Convert a mapping from the kernel into something the server can use */ > +static inline void fuse_iomap_to_server(struct fuse_iomap_io *fmap, > + const struct iomap *iomap) > +{ > + fmap->addr = fmap->addr; ^^^^ Does fmap->addr get the wrong value here? Looking at fuse_iomap_from_server() just above, it correctly does iomap->addr = fmap->addr. This function appears to have the reverse bug, assigning fmap->addr to itself instead of fmap->addr = iomap->addr. The other fields (offset, length, type, flags) all correctly copy from iomap. This would cause fuse_iomap_end() to send uninitialized or garbage data in the addr field when calling fuse_iomap_to_server(&inarg.map, iomap). This bug persists unfixed through commit b3f5bd524315 later in the series. > + fmap->offset = iomap->offset; > + fmap->length = iomap->length; > + fmap->type = fuse_iomap_type_to_server(iomap->type); > + fmap->flags = fuse_iomap_flags_to_server(iomap->flags); > + fmap->dev = FUSE_IOMAP_DEV_NULL; /* XXX */ > +} ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCH 01/31] fuse: implement the basic iomap mechanisms 2026-02-05 19:22 ` Chris Mason @ 2026-02-05 23:31 ` Darrick J. Wong 0 siblings, 0 replies; 52+ messages in thread From: Darrick J. Wong @ 2026-02-05 23:31 UTC (permalink / raw) To: Chris Mason; +Cc: miklos, joannelkoong, bernd, neal, linux-ext4, linux-fsdevel On Thu, Feb 05, 2026 at 11:22:44AM -0800, Chris Mason wrote: > "Darrick J. Wong" <djwong@kernel.org> wrote: > > From: Darrick J. Wong <djwong@kernel.org> > > > > Implement functions to enable upcalling of iomap_begin and iomap_end to > > userspace fuse servers. > > > > Hi everyone, > > I'm trying out my AI review prompts on a few more trees, and I ran it > on the fuse-iomap-cache branch: > > https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-iomap-cache > > This was the latest email lore could find, apologies if I grabbed the wrong one. > > Hopefully the review below is useful, please let me know if there are places it > has gone wildly wrong. I kept the header to make it obvious exactly which > commit I'd run through: > > (I don't fully understand why it calls out commit b3f5bd5243, looks like > it's probably just the last place that changes nearby code) > > commit 99dffae5966f9647a430fa4c9198e2d2dacae172 > Author: Darrick J. Wong <djwong@kernel.org> > > fuse: implement the basic iomap mechanisms > > Implement functions to enable upcalling of iomap_begin and iomap_end to > userspace fuse servers. > > > diff --git a/fs/fuse/fuse_iomap.c b/fs/fuse/fuse_iomap.c > > new file mode 100644 > > index 0000000000000..d1c2c2e1e68b8 > > --- /dev/null > > +++ b/fs/fuse/fuse_iomap.c > > [ ... ] > > > @@ -0,0 +1,430 @@ > > [ ... 
] > > > +/* Convert a mapping from the server into something the kernel can use */ > > +static inline void fuse_iomap_from_server(struct iomap *iomap, > > + const struct fuse_iomap_io *fmap) > > +{ > > + iomap->addr = fmap->addr; > > + iomap->offset = fmap->offset; > > + iomap->length = fmap->length; > > + iomap->type = fuse_iomap_type_from_server(fmap->type); > > + iomap->flags = fuse_iomap_flags_from_server(fmap->flags); > > + iomap->bdev = NULL; /* XXX */ > > +} > > + > > +/* Convert a mapping from the kernel into something the server can use */ > > +static inline void fuse_iomap_to_server(struct fuse_iomap_io *fmap, > > + const struct iomap *iomap) > > +{ > > + fmap->addr = fmap->addr; > ^^^^ > > Does fmap->addr get the wrong value here? Looking at > fuse_iomap_from_server() just above, it correctly does iomap->addr = > fmap->addr. This function appears to have the reverse bug, assigning > fmap->addr to itself instead of fmap->addr = iomap->addr. The other > fields (offset, length, type, flags) all correctly copy from iomap. > > This would cause fuse_iomap_end() to send uninitialized or garbage data > in the addr field when calling fuse_iomap_to_server(&inarg.map, iomap). > > This bug persists unfixed through commit b3f5bd524315 later in the > series. Yep, that's a copy-paste error from above; good catch! --D > > + fmap->offset = iomap->offset; > > + fmap->length = iomap->length; > > + fmap->type = fuse_iomap_type_to_server(iomap->type); > > + fmap->flags = fuse_iomap_flags_to_server(iomap->flags); > > + fmap->dev = FUSE_IOMAP_DEV_NULL; /* XXX */ > > +} > > ^ permalink raw reply [flat|nested] 52+ messages in thread
[parent not found: <176169810415.1424854.10373764649459618752.stgit@frogsfrogsfrogs>]
* Re: [PATCH 03/31] fuse: make debugging configurable at runtime [not found] ` <176169810415.1424854.10373764649459618752.stgit@frogsfrogsfrogs> @ 2026-01-21 23:42 ` Joanne Koong 2026-01-22 0:02 ` Darrick J. Wong 0 siblings, 1 reply; 52+ messages in thread From: Joanne Koong @ 2026-01-21 23:42 UTC (permalink / raw) To: Darrick J. Wong; +Cc: miklos, bernd, neal, linux-ext4, linux-fsdevel On Tue, Oct 28, 2025 at 5:45 PM Darrick J. Wong <djwong@kernel.org> wrote: > > From: Darrick J. Wong <djwong@kernel.org> > > Use static keys so that we can configure debugging assertions and dmesg > warnings at runtime. By default this is turned off so the cost is > merely scanning a nop sled. However, fuse server developers can turn > it on for their debugging systems. > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org> > --- > fs/fuse/fuse_i.h | 8 +++++ > fs/fuse/iomap_i.h | 16 ++++++++-- > fs/fuse/Kconfig | 15 +++++++++ > fs/fuse/file_iomap.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++ > fs/fuse/inode.c | 7 ++++ > 5 files changed, 124 insertions(+), 3 deletions(-) > > > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h > index 45be59df7ae592..61fb65f3604d61 100644 > --- a/fs/fuse/fuse_i.h > +++ b/fs/fuse/fuse_i.h > @@ -1691,6 +1691,14 @@ extern void fuse_sysctl_unregister(void); > #define fuse_sysctl_unregister() do { } while (0) > #endif /* CONFIG_SYSCTL */ > > +#if IS_ENABLED(CONFIG_FUSE_IOMAP_DEBUG) > +int fuse_iomap_sysfs_init(struct kobject *kobj); > +void fuse_iomap_sysfs_cleanup(struct kobject *kobj); > +#else > +# define fuse_iomap_sysfs_init(...) (0) > +# define fuse_iomap_sysfs_cleanup(...) 
((void)0) > +#endif > + > #if IS_ENABLED(CONFIG_FUSE_IOMAP) > bool fuse_iomap_enabled(void); > > diff --git a/fs/fuse/iomap_i.h b/fs/fuse/iomap_i.h > index 6d9ce9c0f40a04..3615ec76c0dec0 100644 > --- a/fs/fuse/iomap_i.h > +++ b/fs/fuse/iomap_i.h > @@ -6,19 +6,29 @@ > #ifndef _FS_FUSE_IOMAP_I_H > #define _FS_FUSE_IOMAP_I_H > > +#if IS_ENABLED(CONFIG_FUSE_IOMAP_DEBUG_DEFAULT) > +DECLARE_STATIC_KEY_TRUE(fuse_iomap_debug); > +#else > +DECLARE_STATIC_KEY_FALSE(fuse_iomap_debug); > +#endif > + > #if IS_ENABLED(CONFIG_FUSE_IOMAP) > #if IS_ENABLED(CONFIG_FUSE_IOMAP_DEBUG) > -# define ASSERT(condition) do { \ > +# define ASSERT(condition) \ > +while (static_branch_unlikely(&fuse_iomap_debug)) { \ > int __cond = !!(condition); \ > if (unlikely(!__cond)) \ > trace_fuse_iomap_assert(__func__, __LINE__, #condition); \ > WARN(!__cond, "Assertion failed: %s, func: %s, line: %d", #condition, __func__, __LINE__); \ > -} while (0) > + break; \ > +} > # define BAD_DATA(condition) ({ \ > int __cond = !!(condition); \ > if (unlikely(__cond)) \ > trace_fuse_iomap_bad_data(__func__, __LINE__, #condition); \ > - WARN(__cond, "Bad mapping: %s, func: %s, line: %d", #condition, __func__, __LINE__); \ > + if (static_branch_unlikely(&fuse_iomap_debug)) \ > + WARN(__cond, "Bad mapping: %s, func: %s, line: %d", #condition, __func__, __LINE__); \ > + unlikely(__cond); \ > }) > #else > # define ASSERT(condition) > diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig > index 934d48076a010c..bb867afe6e867c 100644 > --- a/fs/fuse/Kconfig > +++ b/fs/fuse/Kconfig > @@ -101,6 +101,21 @@ config FUSE_IOMAP_DEBUG > Enable debugging assertions for the fuse iomap code paths and logging > of bad iomap file mapping data being sent to the kernel. > > + Say N here if you don't want any debugging code code compiled in at > + all. 
> + > +config FUSE_IOMAP_DEBUG_BY_DEFAULT > + bool "Debug FUSE file IO over iomap at boot time" > + default n > + depends on FUSE_IOMAP_DEBUG > + help > + At boot time, enable debugging assertions for the fuse iomap code > + paths and warnings about bad iomap file mapping data. This enables > + fuse server authors to control debugging at runtime even on a > + distribution kernel while avoiding most of the overhead on production > + systems. The setting can be changed at runtime via > + /sys/fs/fuse/iomap/debug. > + > config FUSE_IO_URING > bool "FUSE communication over io-uring" > default y > diff --git a/fs/fuse/file_iomap.c b/fs/fuse/file_iomap.c > index a88f5d8d2bce15..b6fc70068c5542 100644 > --- a/fs/fuse/file_iomap.c > +++ b/fs/fuse/file_iomap.c > @@ -8,6 +8,12 @@ > #include "fuse_trace.h" > #include "iomap_i.h" > > +#if IS_ENABLED(CONFIG_FUSE_IOMAP_DEBUG_DEFAULT) > +DEFINE_STATIC_KEY_TRUE(fuse_iomap_debug); > +#else > +DEFINE_STATIC_KEY_FALSE(fuse_iomap_debug); > +#endif > + > static bool __read_mostly enable_iomap = > #if IS_ENABLED(CONFIG_FUSE_IOMAP_BY_DEFAULT) > true; > @@ -17,6 +23,81 @@ static bool __read_mostly enable_iomap = > module_param(enable_iomap, bool, 0644); > MODULE_PARM_DESC(enable_iomap, "Enable file I/O through iomap"); > > +#if IS_ENABLED(CONFIG_FUSE_IOMAP_DEBUG) > +static struct kobject *iomap_kobj; > + > +static ssize_t fuse_iomap_debug_show(struct kobject *kobject, > + struct kobj_attribute *a, char *buf) > +{ > + return sysfs_emit(buf, "%d\n", !!static_key_enabled(&fuse_iomap_debug)); > +} > + > +static ssize_t fuse_iomap_debug_store(struct kobject *kobject, > + struct kobj_attribute *a, > + const char *buf, size_t count) > +{ > + int ret; > + int val; > + > + ret = kstrtoint(buf, 0, &val); > + if (ret) > + return ret; > + > + if (val < 0 || val > 1) > + return -EINVAL; > + > + if (val) > + static_branch_enable(&fuse_iomap_debug); > + else > + static_branch_disable(&fuse_iomap_debug); > + > + return count; > +} > + > +#define 
__INIT_KOBJ_ATTR(_name, _mode, _show, _store) \ > +{ \ > + .attr = { .name = __stringify(_name), .mode = _mode }, \ > + .show = _show, \ > + .store = _store, \ > +} > + > +#define FUSE_ATTR_RW(_name, _show, _store) \ > + static struct kobj_attribute fuse_attr_##_name = \ > + __INIT_KOBJ_ATTR(_name, 0644, _show, _store) > + > +#define FUSE_ATTR_PTR(_name) \ > + (&fuse_attr_##_name.attr) > + > +FUSE_ATTR_RW(debug, fuse_iomap_debug_show, fuse_iomap_debug_store); > + > +static const struct attribute *fuse_iomap_attrs[] = { > + FUSE_ATTR_PTR(debug), > + NULL, > +}; > + > +int fuse_iomap_sysfs_init(struct kobject *fuse_kobj) > +{ > + int error; > + > + iomap_kobj = kobject_create_and_add("iomap", fuse_kobj); > + if (!iomap_kobj) > + return -ENOMEM; > + > + error = sysfs_create_files(iomap_kobj, fuse_iomap_attrs); > + if (error) { > + kobject_put(iomap_kobj); > + return error; > + } > + > + return 0; > +} > + > +void fuse_iomap_sysfs_cleanup(struct kobject *fuse_kobj) > +{ Is sysfs_remove_files() also needed here? > + kobject_put(iomap_kobj); > +} > +#endif /* IS_ENABLED(CONFIG_FUSE_IOMAP_DEBUG) */ > + > bool fuse_iomap_enabled(void) > { > /* Don't let anyone touch iomap until the end of the patchset. 
*/ > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c > index 1eea8dc6e723c6..eec711302a4a13 100644 > --- a/fs/fuse/inode.c > +++ b/fs/fuse/inode.c > @@ -2277,8 +2277,14 @@ static int fuse_sysfs_init(void) > if (err) > goto out_fuse_unregister; > > + err = fuse_iomap_sysfs_init(fuse_kobj); > + if (err) > + goto out_fuse_connections; > + > return 0; > > + out_fuse_connections: > + sysfs_remove_mount_point(fuse_kobj, "connections"); > out_fuse_unregister: > kobject_put(fuse_kobj); > out_err: > @@ -2287,6 +2293,7 @@ static int fuse_sysfs_init(void) > > static void fuse_sysfs_cleanup(void) > { > + fuse_iomap_sysfs_cleanup(fuse_kobj); > sysfs_remove_mount_point(fuse_kobj, "connections"); > kobject_put(fuse_kobj); > } > Could you explain why it's better that this goes through sysfs than through a module param? Thanks, Joanne ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCH 03/31] fuse: make debugging configurable at runtime 2026-01-21 23:42 ` [PATCH 03/31] fuse: make debugging configurable at runtime Joanne Koong @ 2026-01-22 0:02 ` Darrick J. Wong 2026-01-22 0:23 ` Joanne Koong 0 siblings, 1 reply; 52+ messages in thread From: Darrick J. Wong @ 2026-01-22 0:02 UTC (permalink / raw) To: Joanne Koong; +Cc: miklos, bernd, neal, linux-ext4, linux-fsdevel On Wed, Jan 21, 2026 at 03:42:04PM -0800, Joanne Koong wrote: > On Tue, Oct 28, 2025 at 5:45 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > From: Darrick J. Wong <djwong@kernel.org> > > > > Use static keys so that we can configure debugging assertions and dmesg > > warnings at runtime. By default this is turned off so the cost is > > merely scanning a nop sled. However, fuse server developers can turn > > it on for their debugging systems. > > > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org> > > --- > > fs/fuse/fuse_i.h | 8 +++++ > > fs/fuse/iomap_i.h | 16 ++++++++-- > > fs/fuse/Kconfig | 15 +++++++++ > > fs/fuse/file_iomap.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++ > > fs/fuse/inode.c | 7 ++++ > > 5 files changed, 124 insertions(+), 3 deletions(-) > > > > > > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h > > index 45be59df7ae592..61fb65f3604d61 100644 > > --- a/fs/fuse/fuse_i.h > > +++ b/fs/fuse/fuse_i.h > > @@ -1691,6 +1691,14 @@ extern void fuse_sysctl_unregister(void); > > #define fuse_sysctl_unregister() do { } while (0) > > #endif /* CONFIG_SYSCTL */ > > > > +#if IS_ENABLED(CONFIG_FUSE_IOMAP_DEBUG) > > +int fuse_iomap_sysfs_init(struct kobject *kobj); > > +void fuse_iomap_sysfs_cleanup(struct kobject *kobj); > > +#else > > +# define fuse_iomap_sysfs_init(...) (0) > > +# define fuse_iomap_sysfs_cleanup(...) 
((void)0) > > +#endif > > + > > #if IS_ENABLED(CONFIG_FUSE_IOMAP) > > bool fuse_iomap_enabled(void); > > > > diff --git a/fs/fuse/iomap_i.h b/fs/fuse/iomap_i.h > > index 6d9ce9c0f40a04..3615ec76c0dec0 100644 > > --- a/fs/fuse/iomap_i.h > > +++ b/fs/fuse/iomap_i.h > > @@ -6,19 +6,29 @@ > > #ifndef _FS_FUSE_IOMAP_I_H > > #define _FS_FUSE_IOMAP_I_H > > > > +#if IS_ENABLED(CONFIG_FUSE_IOMAP_DEBUG_DEFAULT) > > +DECLARE_STATIC_KEY_TRUE(fuse_iomap_debug); > > +#else > > +DECLARE_STATIC_KEY_FALSE(fuse_iomap_debug); > > +#endif > > + > > #if IS_ENABLED(CONFIG_FUSE_IOMAP) > > #if IS_ENABLED(CONFIG_FUSE_IOMAP_DEBUG) > > -# define ASSERT(condition) do { \ > > +# define ASSERT(condition) \ > > +while (static_branch_unlikely(&fuse_iomap_debug)) { \ > > int __cond = !!(condition); \ > > if (unlikely(!__cond)) \ > > trace_fuse_iomap_assert(__func__, __LINE__, #condition); \ > > WARN(!__cond, "Assertion failed: %s, func: %s, line: %d", #condition, __func__, __LINE__); \ > > -} while (0) > > + break; \ > > +} > > # define BAD_DATA(condition) ({ \ > > int __cond = !!(condition); \ > > if (unlikely(__cond)) \ > > trace_fuse_iomap_bad_data(__func__, __LINE__, #condition); \ > > - WARN(__cond, "Bad mapping: %s, func: %s, line: %d", #condition, __func__, __LINE__); \ > > + if (static_branch_unlikely(&fuse_iomap_debug)) \ > > + WARN(__cond, "Bad mapping: %s, func: %s, line: %d", #condition, __func__, __LINE__); \ > > + unlikely(__cond); \ > > }) > > #else > > # define ASSERT(condition) > > diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig > > index 934d48076a010c..bb867afe6e867c 100644 > > --- a/fs/fuse/Kconfig > > +++ b/fs/fuse/Kconfig > > @@ -101,6 +101,21 @@ config FUSE_IOMAP_DEBUG > > Enable debugging assertions for the fuse iomap code paths and logging > > of bad iomap file mapping data being sent to the kernel. > > > > + Say N here if you don't want any debugging code code compiled in at > > + all. 
> > + > > +config FUSE_IOMAP_DEBUG_BY_DEFAULT > > + bool "Debug FUSE file IO over iomap at boot time" > > + default n > > + depends on FUSE_IOMAP_DEBUG > > + help > > + At boot time, enable debugging assertions for the fuse iomap code > > + paths and warnings about bad iomap file mapping data. This enables > > + fuse server authors to control debugging at runtime even on a > > + distribution kernel while avoiding most of the overhead on production > > + systems. The setting can be changed at runtime via > > + /sys/fs/fuse/iomap/debug. > > + > > config FUSE_IO_URING > > bool "FUSE communication over io-uring" > > default y > > diff --git a/fs/fuse/file_iomap.c b/fs/fuse/file_iomap.c > > index a88f5d8d2bce15..b6fc70068c5542 100644 > > --- a/fs/fuse/file_iomap.c > > +++ b/fs/fuse/file_iomap.c > > @@ -8,6 +8,12 @@ > > #include "fuse_trace.h" > > #include "iomap_i.h" > > > > +#if IS_ENABLED(CONFIG_FUSE_IOMAP_DEBUG_DEFAULT) > > +DEFINE_STATIC_KEY_TRUE(fuse_iomap_debug); > > +#else > > +DEFINE_STATIC_KEY_FALSE(fuse_iomap_debug); > > +#endif > > + > > static bool __read_mostly enable_iomap = > > #if IS_ENABLED(CONFIG_FUSE_IOMAP_BY_DEFAULT) > > true; > > @@ -17,6 +23,81 @@ static bool __read_mostly enable_iomap = > > module_param(enable_iomap, bool, 0644); > > MODULE_PARM_DESC(enable_iomap, "Enable file I/O through iomap"); > > > > +#if IS_ENABLED(CONFIG_FUSE_IOMAP_DEBUG) > > +static struct kobject *iomap_kobj; > > + > > +static ssize_t fuse_iomap_debug_show(struct kobject *kobject, > > + struct kobj_attribute *a, char *buf) > > +{ > > + return sysfs_emit(buf, "%d\n", !!static_key_enabled(&fuse_iomap_debug)); > > +} > > + > > +static ssize_t fuse_iomap_debug_store(struct kobject *kobject, > > + struct kobj_attribute *a, > > + const char *buf, size_t count) > > +{ > > + int ret; > > + int val; > > + > > + ret = kstrtoint(buf, 0, &val); > > + if (ret) > > + return ret; > > + > > + if (val < 0 || val > 1) > > + return -EINVAL; > > + > > + if (val) > > + 
static_branch_enable(&fuse_iomap_debug); > > + else > > + static_branch_disable(&fuse_iomap_debug); > > + > > + return count; > > +} > > + > > +#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \ > > +{ \ > > + .attr = { .name = __stringify(_name), .mode = _mode }, \ > > + .show = _show, \ > > + .store = _store, \ > > +} > > + > > +#define FUSE_ATTR_RW(_name, _show, _store) \ > > + static struct kobj_attribute fuse_attr_##_name = \ > > + __INIT_KOBJ_ATTR(_name, 0644, _show, _store) > > + > > +#define FUSE_ATTR_PTR(_name) \ > > + (&fuse_attr_##_name.attr) > > + > > +FUSE_ATTR_RW(debug, fuse_iomap_debug_show, fuse_iomap_debug_store); > > + > > +static const struct attribute *fuse_iomap_attrs[] = { > > + FUSE_ATTR_PTR(debug), > > + NULL, > > +}; > > + > > +int fuse_iomap_sysfs_init(struct kobject *fuse_kobj) > > +{ > > + int error; > > + > > + iomap_kobj = kobject_create_and_add("iomap", fuse_kobj); > > + if (!iomap_kobj) > > + return -ENOMEM; > > + > > + error = sysfs_create_files(iomap_kobj, fuse_iomap_attrs); > > + if (error) { > > + kobject_put(iomap_kobj); > > + return error; > > + } > > + > > + return 0; > > +} > > + > > +void fuse_iomap_sysfs_cleanup(struct kobject *fuse_kobj) > > +{ > > Is sysfs_remove_files() also needed here? kobject_put is supposed to tear down the attrs that sysfs_create_files attaches to iomap_kobj. Though you're right to be suspicious -- there are a lot of places that explicitly call sysfs_remove_files to undo sysfs_create_files; and also a lot of places that just let kobject_put do the dirty work. > > + kobject_put(iomap_kobj); > > +} > > +#endif /* IS_ENABLED(CONFIG_FUSE_IOMAP_DEBUG) */ > > + > > bool fuse_iomap_enabled(void) > > { > > /* Don't let anyone touch iomap until the end of the patchset. 
*/ > > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c > > index 1eea8dc6e723c6..eec711302a4a13 100644 > > --- a/fs/fuse/inode.c > > +++ b/fs/fuse/inode.c > > @@ -2277,8 +2277,14 @@ static int fuse_sysfs_init(void) > > if (err) > > goto out_fuse_unregister; > > > > + err = fuse_iomap_sysfs_init(fuse_kobj); > > + if (err) > > + goto out_fuse_connections; > > + > > return 0; > > > > + out_fuse_connections: > > + sysfs_remove_mount_point(fuse_kobj, "connections"); > > out_fuse_unregister: > > kobject_put(fuse_kobj); > > out_err: > > @@ -2287,6 +2293,7 @@ static int fuse_sysfs_init(void) > > > > static void fuse_sysfs_cleanup(void) > > { > > + fuse_iomap_sysfs_cleanup(fuse_kobj); > > sysfs_remove_mount_point(fuse_kobj, "connections"); > > kobject_put(fuse_kobj); > > } > > > Could you explain why it's better that this goes through sysfs than > through a module param? You can dynamically enable debugging on a production system. I (by which I really mean the support org) wishes they could do that with XFS. Module parameters don't come with setter functions so you can't call static_branch_{enable,disable} when the parameter value updates. --D > Thanks, > Joanne ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCH 03/31] fuse: make debugging configurable at runtime 2026-01-22 0:02 ` Darrick J. Wong @ 2026-01-22 0:23 ` Joanne Koong 2026-01-22 0:40 ` Darrick J. Wong 0 siblings, 1 reply; 52+ messages in thread From: Joanne Koong @ 2026-01-22 0:23 UTC (permalink / raw) To: Darrick J. Wong; +Cc: miklos, bernd, neal, linux-ext4, linux-fsdevel On Wed, Jan 21, 2026 at 4:02 PM Darrick J. Wong <djwong@kernel.org> wrote: > > On Wed, Jan 21, 2026 at 03:42:04PM -0800, Joanne Koong wrote: > > On Tue, Oct 28, 2025 at 5:45 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > From: Darrick J. Wong <djwong@kernel.org> > > > > > > Use static keys so that we can configure debugging assertions and dmesg > > > warnings at runtime. By default this is turned off so the cost is > > > merely scanning a nop sled. However, fuse server developers can turn > > > it on for their debugging systems. > > > > > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org> > > > --- > > > fs/fuse/fuse_i.h | 8 +++++ > > > fs/fuse/iomap_i.h | 16 ++++++++-- > > > fs/fuse/Kconfig | 15 +++++++++ > > > fs/fuse/file_iomap.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++ > > > fs/fuse/inode.c | 7 ++++ > > > 5 files changed, 124 insertions(+), 3 deletions(-) > > > > > > > > > diff --git a/fs/fuse/file_iomap.c b/fs/fuse/file_iomap.c > > > index a88f5d8d2bce15..b6fc70068c5542 100644 > > > --- a/fs/fuse/file_iomap.c > > > +++ b/fs/fuse/file_iomap.c > > > @@ -8,6 +8,12 @@ > > > #include "fuse_trace.h" > > > #include "iomap_i.h" > > > > > > +#if IS_ENABLED(CONFIG_FUSE_IOMAP_DEBUG_DEFAULT) > > > +DEFINE_STATIC_KEY_TRUE(fuse_iomap_debug); > > > +#else > > > +DEFINE_STATIC_KEY_FALSE(fuse_iomap_debug); > > > +#endif > > > + > > > static bool __read_mostly enable_iomap = > > > #if IS_ENABLED(CONFIG_FUSE_IOMAP_BY_DEFAULT) > > > true; > > > @@ -17,6 +23,81 @@ static bool __read_mostly enable_iomap = > > > module_param(enable_iomap, bool, 0644); > > > MODULE_PARM_DESC(enable_iomap, "Enable file I/O through 
iomap"); > > > > > > +#if IS_ENABLED(CONFIG_FUSE_IOMAP_DEBUG) > > > +static struct kobject *iomap_kobj; > > > + > > > +static ssize_t fuse_iomap_debug_show(struct kobject *kobject, > > > + struct kobj_attribute *a, char *buf) > > > +{ > > > + return sysfs_emit(buf, "%d\n", !!static_key_enabled(&fuse_iomap_debug)); > > > +} > > > + > > > +static ssize_t fuse_iomap_debug_store(struct kobject *kobject, > > > + struct kobj_attribute *a, > > > + const char *buf, size_t count) > > > +{ > > > + int ret; > > > + int val; > > > + > > > + ret = kstrtoint(buf, 0, &val); > > > + if (ret) > > > + return ret; > > > + > > > + if (val < 0 || val > 1) > > > + return -EINVAL; > > > + > > > + if (val) > > > + static_branch_enable(&fuse_iomap_debug); > > > + else > > > + static_branch_disable(&fuse_iomap_debug); > > > + > > > + return count; > > > +} > > > + > > > +#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \ > > > +{ \ > > > + .attr = { .name = __stringify(_name), .mode = _mode }, \ > > > + .show = _show, \ > > > + .store = _store, \ > > > +} > > > + > > > +#define FUSE_ATTR_RW(_name, _show, _store) \ > > > + static struct kobj_attribute fuse_attr_##_name = \ > > > + __INIT_KOBJ_ATTR(_name, 0644, _show, _store) > > > + > > > +#define FUSE_ATTR_PTR(_name) \ > > > + (&fuse_attr_##_name.attr) > > > + > > > +FUSE_ATTR_RW(debug, fuse_iomap_debug_show, fuse_iomap_debug_store); > > > + > > > +static const struct attribute *fuse_iomap_attrs[] = { > > > + FUSE_ATTR_PTR(debug), > > > + NULL, > > > +}; > > > + > > > +int fuse_iomap_sysfs_init(struct kobject *fuse_kobj) > > > +{ > > > + int error; > > > + > > > + iomap_kobj = kobject_create_and_add("iomap", fuse_kobj); > > > + if (!iomap_kobj) > > > + return -ENOMEM; > > > + > > > + error = sysfs_create_files(iomap_kobj, fuse_iomap_attrs); > > > + if (error) { > > > + kobject_put(iomap_kobj); > > > + return error; > > > + } > > > + > > > + return 0; > > > +} > > > + > > > +void fuse_iomap_sysfs_cleanup(struct kobject *fuse_kobj) > > > 
+{ > > > > Is sysfs_remove_files() also needed here? > > kobject_put is supposed to tear down the attrs that sysfs_create_files > attaches to iomap_kobj. Though you're right to be suspicious -- there > are a lot of places that explicitly call sysfs_remove_files to undo > sysfs_create_files; and also a lot of places that just let kobject_put > do the dirty work. Makes sense, thanks for the context. > > > > + kobject_put(iomap_kobj); > > > +} > > > +#endif /* IS_ENABLED(CONFIG_FUSE_IOMAP_DEBUG) */ > > > + > > > bool fuse_iomap_enabled(void) > > > { > > > /* Don't let anyone touch iomap until the end of the patchset. */ > > > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c > > > index 1eea8dc6e723c6..eec711302a4a13 100644 > > > --- a/fs/fuse/inode.c > > > +++ b/fs/fuse/inode.c > > > @@ -2277,8 +2277,14 @@ static int fuse_sysfs_init(void) > > > if (err) > > > goto out_fuse_unregister; > > > > > > + err = fuse_iomap_sysfs_init(fuse_kobj); > > > + if (err) > > > + goto out_fuse_connections; > > > + > > > return 0; > > > > > > + out_fuse_connections: > > > + sysfs_remove_mount_point(fuse_kobj, "connections"); > > > out_fuse_unregister: > > > kobject_put(fuse_kobj); > > > out_err: > > > @@ -2287,6 +2293,7 @@ static int fuse_sysfs_init(void) > > > > > > static void fuse_sysfs_cleanup(void) > > > { > > > + fuse_iomap_sysfs_cleanup(fuse_kobj); > > > sysfs_remove_mount_point(fuse_kobj, "connections"); > > > kobject_put(fuse_kobj); > > > } > > > > > Could you explain why it's better that this goes through sysfs than > > through a module param? > > You can dynamically enable debugging on a production system. I (by > which I really mean the support org) wishes they could do that with XFS. > > Module parameters don't come with setter functions so you can't call > static_branch_{enable,disable} when the parameter value updates. > Ohh I thought the "module_param_cb()" stuff does let you do that and can be dynamically enabled/disabled as well? 
I mostly ask because it feels like it'd be nicer from a user POV if all the config stuff (eg enable uring, enable iomap, etc.) is in one place. Thanks, Joanne > --D > > > Thanks, > > Joanne ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCH 03/31] fuse: make debugging configurable at runtime 2026-01-22 0:23 ` Joanne Koong @ 2026-01-22 0:40 ` Darrick J. Wong 0 siblings, 0 replies; 52+ messages in thread From: Darrick J. Wong @ 2026-01-22 0:40 UTC (permalink / raw) To: Joanne Koong; +Cc: miklos, bernd, neal, linux-ext4, linux-fsdevel On Wed, Jan 21, 2026 at 04:23:08PM -0800, Joanne Koong wrote: > On Wed, Jan 21, 2026 at 4:02 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > On Wed, Jan 21, 2026 at 03:42:04PM -0800, Joanne Koong wrote: > > > On Tue, Oct 28, 2025 at 5:45 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > > > From: Darrick J. Wong <djwong@kernel.org> > > > > > > > > Use static keys so that we can configure debugging assertions and dmesg > > > > warnings at runtime. By default this is turned off so the cost is > > > > merely scanning a nop sled. However, fuse server developers can turn > > > > it on for their debugging systems. > > > > > > > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org> > > > > --- > > > > fs/fuse/fuse_i.h | 8 +++++ > > > > fs/fuse/iomap_i.h | 16 ++++++++-- > > > > fs/fuse/Kconfig | 15 +++++++++ > > > > fs/fuse/file_iomap.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++ > > > > fs/fuse/inode.c | 7 ++++ > > > > 5 files changed, 124 insertions(+), 3 deletions(-) > > > > > > > > > > > > diff --git a/fs/fuse/file_iomap.c b/fs/fuse/file_iomap.c > > > > index a88f5d8d2bce15..b6fc70068c5542 100644 > > > > --- a/fs/fuse/file_iomap.c > > > > +++ b/fs/fuse/file_iomap.c > > > > @@ -8,6 +8,12 @@ > > > > #include "fuse_trace.h" > > > > #include "iomap_i.h" > > > > > > > > +#if IS_ENABLED(CONFIG_FUSE_IOMAP_DEBUG_DEFAULT) > > > > +DEFINE_STATIC_KEY_TRUE(fuse_iomap_debug); > > > > +#else > > > > +DEFINE_STATIC_KEY_FALSE(fuse_iomap_debug); > > > > +#endif > > > > + > > > > static bool __read_mostly enable_iomap = > > > > #if IS_ENABLED(CONFIG_FUSE_IOMAP_BY_DEFAULT) > > > > true; > > > > @@ -17,6 +23,81 @@ static bool __read_mostly enable_iomap = > > 
> > module_param(enable_iomap, bool, 0644); > > > > MODULE_PARM_DESC(enable_iomap, "Enable file I/O through iomap"); > > > > > > > > +#if IS_ENABLED(CONFIG_FUSE_IOMAP_DEBUG) > > > > +static struct kobject *iomap_kobj; > > > > + > > > > +static ssize_t fuse_iomap_debug_show(struct kobject *kobject, > > > > + struct kobj_attribute *a, char *buf) > > > > +{ > > > > + return sysfs_emit(buf, "%d\n", !!static_key_enabled(&fuse_iomap_debug)); > > > > +} > > > > + > > > > +static ssize_t fuse_iomap_debug_store(struct kobject *kobject, > > > > + struct kobj_attribute *a, > > > > + const char *buf, size_t count) > > > > +{ > > > > + int ret; > > > > + int val; > > > > + > > > > + ret = kstrtoint(buf, 0, &val); > > > > + if (ret) > > > > + return ret; > > > > + > > > > + if (val < 0 || val > 1) > > > > + return -EINVAL; > > > > + > > > > + if (val) > > > > + static_branch_enable(&fuse_iomap_debug); > > > > + else > > > > + static_branch_disable(&fuse_iomap_debug); > > > > + > > > > + return count; > > > > +} > > > > + > > > > +#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \ > > > > +{ \ > > > > + .attr = { .name = __stringify(_name), .mode = _mode }, \ > > > > + .show = _show, \ > > > > + .store = _store, \ > > > > +} > > > > + > > > > +#define FUSE_ATTR_RW(_name, _show, _store) \ > > > > + static struct kobj_attribute fuse_attr_##_name = \ > > > > + __INIT_KOBJ_ATTR(_name, 0644, _show, _store) > > > > + > > > > +#define FUSE_ATTR_PTR(_name) \ > > > > + (&fuse_attr_##_name.attr) > > > > + > > > > +FUSE_ATTR_RW(debug, fuse_iomap_debug_show, fuse_iomap_debug_store); > > > > + > > > > +static const struct attribute *fuse_iomap_attrs[] = { > > > > + FUSE_ATTR_PTR(debug), > > > > + NULL, > > > > +}; > > > > + > > > > +int fuse_iomap_sysfs_init(struct kobject *fuse_kobj) > > > > +{ > > > > + int error; > > > > + > > > > + iomap_kobj = kobject_create_and_add("iomap", fuse_kobj); > > > > + if (!iomap_kobj) > > > > + return -ENOMEM; > > > > + > > > > + error = 
sysfs_create_files(iomap_kobj, fuse_iomap_attrs); > > > > + if (error) { > > > > + kobject_put(iomap_kobj); > > > > + return error; > > > > + } > > > > + > > > > + return 0; > > > > +} > > > > + > > > > +void fuse_iomap_sysfs_cleanup(struct kobject *fuse_kobj) > > > > +{ > > > > > > Is sysfs_remove_files() also needed here? > > > > kobject_put is supposed to tear down the attrs that sysfs_create_files > > attaches to iomap_kobj. Though you're right to be suspicious -- there > > are a lot of places that explicitly call sysfs_remove_files to undo > > sysfs_create_files; and also a lot of places that just let kobject_put > > do the dirty work. > > Makes sense, thanks for the context. > > > > > > + kobject_put(iomap_kobj); > > > > +} > > > > +#endif /* IS_ENABLED(CONFIG_FUSE_IOMAP_DEBUG) */ > > > > + > > > > bool fuse_iomap_enabled(void) > > > > { > > > > /* Don't let anyone touch iomap until the end of the patchset. */ > > > > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c > > > > index 1eea8dc6e723c6..eec711302a4a13 100644 > > > > --- a/fs/fuse/inode.c > > > > +++ b/fs/fuse/inode.c > > > > @@ -2277,8 +2277,14 @@ static int fuse_sysfs_init(void) > > > > if (err) > > > > goto out_fuse_unregister; > > > > > > > > + err = fuse_iomap_sysfs_init(fuse_kobj); > > > > + if (err) > > > > + goto out_fuse_connections; > > > > + > > > > return 0; > > > > > > > > + out_fuse_connections: > > > > + sysfs_remove_mount_point(fuse_kobj, "connections"); > > > > out_fuse_unregister: > > > > kobject_put(fuse_kobj); > > > > out_err: > > > > @@ -2287,6 +2293,7 @@ static int fuse_sysfs_init(void) > > > > > > > > static void fuse_sysfs_cleanup(void) > > > > { > > > > + fuse_iomap_sysfs_cleanup(fuse_kobj); > > > > sysfs_remove_mount_point(fuse_kobj, "connections"); > > > > kobject_put(fuse_kobj); > > > > } > > > > > > > Could you explain why it's better that this goes through sysfs than > > > through a module param? > > > > You can dynamically enable debugging on a production system. 
I (by > > which I really mean the support org) wishes they could do that with XFS. > > > > Module parameters don't come with setter functions so you can't call > > static_branch_{enable,disable} when the parameter value updates. > > > > Ohh I thought the "module_param_cb()" stuff does let you do that and > can be dynamically enabled/disabled as well? I mostly ask because it > feels like it'd be nicer from a user POV if all the config stuff (eg > enable uring, enable iomap, etc.) is in one place. TIL today. HAH well that's been there since 2.6.0. Silly me, that's been there forever. I'll switch it to a magic module parameter that has a setter. Much easier than thinking about /anything/ related to sysfs. --D > Thanks, > Joanne > > > --D > > > > > Thanks, > > > Joanne ^ permalink raw reply [flat|nested] 52+ messages in thread
[parent not found: <176169810502.1424854.13869957103489591272.stgit@frogsfrogsfrogs>]
* Re: [PATCH 07/31] fuse: create a per-inode flag for toggling iomap [not found] ` <176169810502.1424854.13869957103489591272.stgit@frogsfrogsfrogs> @ 2026-01-22 1:13 ` Joanne Koong 2026-01-22 22:22 ` Darrick J. Wong 0 siblings, 1 reply; 52+ messages in thread From: Joanne Koong @ 2026-01-22 1:13 UTC (permalink / raw) To: Darrick J. Wong; +Cc: miklos, bernd, neal, linux-ext4, linux-fsdevel On Tue, Oct 28, 2025 at 5:46 PM Darrick J. Wong <djwong@kernel.org> wrote: > > From: Darrick J. Wong <djwong@kernel.org> > > Create a per-inode flag to control whether or not this inode actually > uses iomap. This is required for non-regular files because iomap > doesn't apply there; and enables fuse filesystems to provide some > non-iomap files if desired. > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org> The logic in this makes sense to me, left just a few comments below. Reviewed-by: Joanne Koong <joannelkoong@gmail.com> > --- > fs/fuse/fuse_i.h | 17 ++++++++++++++++ > include/uapi/linux/fuse.h | 3 +++ > fs/fuse/file.c | 1 + > fs/fuse/file_iomap.c | 49 +++++++++++++++++++++++++++++++++++++++++++++ > fs/fuse/inode.c | 26 ++++++++++++++++++------ > 5 files changed, 90 insertions(+), 6 deletions(-) > > > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h > index 839d4f2ada4656..c7aeb324fe599e 100644 > --- a/fs/fuse/fuse_i.h > +++ b/fs/fuse/fuse_i.h > @@ -257,6 +257,8 @@ enum { > * or the fuse server has an exclusive "lease" on distributed fs > */ > FUSE_I_EXCLUSIVE, > + /* Use iomap for this inode */ > + FUSE_I_IOMAP, > }; > > struct fuse_conn; > @@ -1717,11 +1719,26 @@ extern const struct fuse_backing_ops fuse_iomap_backing_ops; > > void fuse_iomap_mount(struct fuse_mount *fm); > void fuse_iomap_unmount(struct fuse_mount *fm); > + > +void fuse_iomap_init_reg_inode(struct inode *inode, unsigned attr_flags); > +void fuse_iomap_init_nonreg_inode(struct inode *inode, unsigned attr_flags); > +void fuse_iomap_evict_inode(struct inode *inode); > + > +static inline bool 
fuse_inode_has_iomap(const struct inode *inode) > +{ > + const struct fuse_inode *fi = get_fuse_inode(inode); > + > + return test_bit(FUSE_I_IOMAP, &fi->state); > +} > #else > # define fuse_iomap_enabled(...) (false) > # define fuse_has_iomap(...) (false) > # define fuse_iomap_mount(...) ((void)0) > # define fuse_iomap_unmount(...) ((void)0) > +# define fuse_iomap_init_reg_inode(...) ((void)0) > +# define fuse_iomap_init_nonreg_inode(...) ((void)0) > +# define fuse_iomap_evict_inode(...) ((void)0) > +# define fuse_inode_has_iomap(...) (false) > #endif > > #endif /* _FS_FUSE_I_H */ > diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h > index e571f8ceecbfad..e949bfe022c3b0 100644 > --- a/include/uapi/linux/fuse.h > +++ b/include/uapi/linux/fuse.h > @@ -243,6 +243,7 @@ > * > * 7.99 > * - add FUSE_IOMAP and iomap_{begin,end,ioend} for regular file operations > + * - add FUSE_ATTR_IOMAP to enable iomap for specific inodes > */ > > #ifndef _LINUX_FUSE_H > @@ -583,9 +584,11 @@ struct fuse_file_lock { > * > * FUSE_ATTR_SUBMOUNT: Object is a submount root > * FUSE_ATTR_DAX: Enable DAX for this file in per inode DAX mode > + * FUSE_ATTR_IOMAP: Use iomap for this inode > */ > #define FUSE_ATTR_SUBMOUNT (1 << 0) > #define FUSE_ATTR_DAX (1 << 1) > +#define FUSE_ATTR_IOMAP (1 << 2) > > /** > * Open flags > diff --git a/fs/fuse/file.c b/fs/fuse/file.c > index f1ef77a0be05bb..42c85c19f3b13b 100644 > --- a/fs/fuse/file.c > +++ b/fs/fuse/file.c > @@ -3135,6 +3135,7 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags) > init_waitqueue_head(&fi->page_waitq); > init_waitqueue_head(&fi->direct_io_waitq); > > + fuse_iomap_init_reg_inode(inode, flags); imo it's a bit confusing to have this here when the rest of the fuse_iomap_init_nonreg_inode() calls happen inside the switch case statement. 
Maybe it makes sense to have this inside the switch case like the fuse_iomap_init_nonreg_inode() calls, or alternatively move the fuse_iomap_init_nonreg_inode() calls into their corresponding helpers (eg fuse_init_dir(), etc.), so that it's consistent? > if (IS_ENABLED(CONFIG_FUSE_DAX)) > fuse_dax_inode_init(inode, flags); > } > diff --git a/fs/fuse/file_iomap.c b/fs/fuse/file_iomap.c > index 1b9e1bf2f799a3..fc0d5f135bacf9 100644 > --- a/fs/fuse/file_iomap.c > +++ b/fs/fuse/file_iomap.c > @@ -635,3 +635,52 @@ void fuse_iomap_unmount(struct fuse_mount *fm) > fuse_flush_requests_and_wait(fc); > fuse_send_destroy(fm); > } > + > +static inline void fuse_inode_set_iomap(struct inode *inode) > +{ > + struct fuse_inode *fi = get_fuse_inode(inode); > + > + set_bit(FUSE_I_IOMAP, &fi->state); > +} > + > +static inline void fuse_inode_clear_iomap(struct inode *inode) > +{ > + struct fuse_inode *fi = get_fuse_inode(inode); > + > + clear_bit(FUSE_I_IOMAP, &fi->state); > +} > + > +void fuse_iomap_init_nonreg_inode(struct inode *inode, unsigned attr_flags) > +{ > + struct fuse_conn *conn = get_fuse_conn(inode); > + struct fuse_inode *fi = get_fuse_inode(inode); > + > + ASSERT(!S_ISREG(inode->i_mode)); > + > + if (conn->iomap && (attr_flags & FUSE_ATTR_IOMAP)) > + set_bit(FUSE_I_EXCLUSIVE, &fi->state); > +} > + > +void fuse_iomap_init_reg_inode(struct inode *inode, unsigned attr_flags) > +{ > + struct fuse_conn *conn = get_fuse_conn(inode); > + struct fuse_inode *fi = get_fuse_inode(inode); > + > + ASSERT(S_ISREG(inode->i_mode)); > + > + if (conn->iomap && (attr_flags & FUSE_ATTR_IOMAP)) { > + set_bit(FUSE_I_EXCLUSIVE, &fi->state); > + fuse_inode_set_iomap(inode); > + } > +} > + > +void fuse_iomap_evict_inode(struct inode *inode) > +{ > + struct fuse_conn *conn = get_fuse_conn(inode); > + struct fuse_inode *fi = get_fuse_inode(inode); > + > + if (fuse_inode_has_iomap(inode)) If I'm understanding this correctly, a fuse inode can't have FUSE_I_IOMAP set on it if conn>iomap is not 
enabled, correct? Maybe it makes sense to just return if (!conn->iomap) at the very beginning, to make that more clear? > + fuse_inode_clear_iomap(inode); > + if (conn->iomap && fuse_inode_is_exclusive(inode)) > + clear_bit(FUSE_I_EXCLUSIVE, &fi->state); > +} > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c > index 271356fa3be3ea..9b9e7b2dd0d928 100644 > --- a/fs/fuse/inode.c > +++ b/fs/fuse/inode.c > @@ -196,6 +196,8 @@ static void fuse_evict_inode(struct inode *inode) > WARN_ON(!list_empty(&fi->write_files)); > WARN_ON(!list_empty(&fi->queued_writes)); > } > + > + fuse_iomap_evict_inode(inode); > } > > static int fuse_reconfigure(struct fs_context *fsc) > @@ -428,20 +430,32 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr, > inode->i_size = attr->size; > inode_set_mtime(inode, attr->mtime, attr->mtimensec); > inode_set_ctime(inode, attr->ctime, attr->ctimensec); > - if (S_ISREG(inode->i_mode)) { > + switch (inode->i_mode & S_IFMT) { > + case S_IFREG: > fuse_init_common(inode); > fuse_init_file_inode(inode, attr->flags); > - } else if (S_ISDIR(inode->i_mode)) > + break; > + case S_IFDIR: > fuse_init_dir(inode); > - else if (S_ISLNK(inode->i_mode)) > + fuse_iomap_init_nonreg_inode(inode, attr->flags); > + break; > + case S_IFLNK: > fuse_init_symlink(inode); > - else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || > - S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { > + fuse_iomap_init_nonreg_inode(inode, attr->flags); > + break; > + case S_IFCHR: > + case S_IFBLK: > + case S_IFIFO: > + case S_IFSOCK: > fuse_init_common(inode); > init_special_inode(inode, inode->i_mode, > new_decode_dev(attr->rdev)); > - } else > + fuse_iomap_init_nonreg_inode(inode, attr->flags); > + break; > + default: > BUG(); Just thinking out loud here and curious to hear whether you like this idea or not: another option is calling if (conn->iomap) fuse_iomap_init_inode(); at the end, where fuse_iomap_init_inode() would be something like: void 
fuse_iomap_init_inode(struct inode *inode, unsigned attr_flags) { struct fuse_inode *fi = get_fuse_inode(inode); if (attr_flags & FUSE_ATTR_IOMAP) set_bit(FUSE_I_EXCLUSIVE, &fi->state); if (S_ISREG(inode->i_mode)) fuse_inode_set_iomap(inode); } which seems simpler to me than having both fuse_iomap_init_nonreg_inode() and fuse_iomap_init_reg_inode() functions and invoking them per i_mode case. Thanks, Joanne > + break; > + } > /* > * Ensure that we don't cache acls for daemons without FUSE_POSIX_ACL > * so they see the exact same behavior as before. > ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCH 07/31] fuse: create a per-inode flag for toggling iomap 2026-01-22 1:13 ` [PATCH 07/31] fuse: create a per-inode flag for toggling iomap Joanne Koong @ 2026-01-22 22:22 ` Darrick J. Wong 2026-01-23 18:05 ` Joanne Koong 0 siblings, 1 reply; 52+ messages in thread From: Darrick J. Wong @ 2026-01-22 22:22 UTC (permalink / raw) To: Joanne Koong; +Cc: miklos, bernd, neal, linux-ext4, linux-fsdevel On Wed, Jan 21, 2026 at 05:13:39PM -0800, Joanne Koong wrote: > On Tue, Oct 28, 2025 at 5:46 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > From: Darrick J. Wong <djwong@kernel.org> > > > > Create a per-inode flag to control whether or not this inode actually > > uses iomap. This is required for non-regular files because iomap > > doesn't apply there; and enables fuse filesystems to provide some > > non-iomap files if desired. > > > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org> > > The logic in this makes sense to me, left just a few comments below. > > Reviewed-by: Joanne Koong <joannelkoong@gmail.com> Thanks! 
> > --- > > fs/fuse/fuse_i.h | 17 ++++++++++++++++ > > include/uapi/linux/fuse.h | 3 +++ > > fs/fuse/file.c | 1 + > > fs/fuse/file_iomap.c | 49 +++++++++++++++++++++++++++++++++++++++++++++ > > fs/fuse/inode.c | 26 ++++++++++++++++++------ > > 5 files changed, 90 insertions(+), 6 deletions(-) > > > > > > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h > > index 839d4f2ada4656..c7aeb324fe599e 100644 > > --- a/fs/fuse/fuse_i.h > > +++ b/fs/fuse/fuse_i.h > > @@ -257,6 +257,8 @@ enum { > > * or the fuse server has an exclusive "lease" on distributed fs > > */ > > FUSE_I_EXCLUSIVE, > > + /* Use iomap for this inode */ > > + FUSE_I_IOMAP, > > }; > > > > struct fuse_conn; > > @@ -1717,11 +1719,26 @@ extern const struct fuse_backing_ops fuse_iomap_backing_ops; > > > > void fuse_iomap_mount(struct fuse_mount *fm); > > void fuse_iomap_unmount(struct fuse_mount *fm); > > + > > +void fuse_iomap_init_reg_inode(struct inode *inode, unsigned attr_flags); > > +void fuse_iomap_init_nonreg_inode(struct inode *inode, unsigned attr_flags); > > +void fuse_iomap_evict_inode(struct inode *inode); > > + > > +static inline bool fuse_inode_has_iomap(const struct inode *inode) > > +{ > > + const struct fuse_inode *fi = get_fuse_inode(inode); > > + > > + return test_bit(FUSE_I_IOMAP, &fi->state); > > +} > > #else > > # define fuse_iomap_enabled(...) (false) > > # define fuse_has_iomap(...) (false) > > # define fuse_iomap_mount(...) ((void)0) > > # define fuse_iomap_unmount(...) ((void)0) > > +# define fuse_iomap_init_reg_inode(...) ((void)0) > > +# define fuse_iomap_init_nonreg_inode(...) ((void)0) > > +# define fuse_iomap_evict_inode(...) ((void)0) > > +# define fuse_inode_has_iomap(...) 
(false) > > #endif > > > > #endif /* _FS_FUSE_I_H */ > > diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h > > index e571f8ceecbfad..e949bfe022c3b0 100644 > > --- a/include/uapi/linux/fuse.h > > +++ b/include/uapi/linux/fuse.h > > @@ -243,6 +243,7 @@ > > * > > * 7.99 > > * - add FUSE_IOMAP and iomap_{begin,end,ioend} for regular file operations > > + * - add FUSE_ATTR_IOMAP to enable iomap for specific inodes > > */ > > > > #ifndef _LINUX_FUSE_H > > @@ -583,9 +584,11 @@ struct fuse_file_lock { > > * > > * FUSE_ATTR_SUBMOUNT: Object is a submount root > > * FUSE_ATTR_DAX: Enable DAX for this file in per inode DAX mode > > + * FUSE_ATTR_IOMAP: Use iomap for this inode > > */ > > #define FUSE_ATTR_SUBMOUNT (1 << 0) > > #define FUSE_ATTR_DAX (1 << 1) > > +#define FUSE_ATTR_IOMAP (1 << 2) > > > > /** > > * Open flags > > diff --git a/fs/fuse/file.c b/fs/fuse/file.c > > index f1ef77a0be05bb..42c85c19f3b13b 100644 > > --- a/fs/fuse/file.c > > +++ b/fs/fuse/file.c > > @@ -3135,6 +3135,7 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags) > > init_waitqueue_head(&fi->page_waitq); > > init_waitqueue_head(&fi->direct_io_waitq); > > > > + fuse_iomap_init_reg_inode(inode, flags); > > imo it's a bit confusing to have this here when the rest of the > fuse_iomap_init_nonreg_inode() calls happen inside the switch case > statement. Maybe it makes sense to have this inside the switch case > like the fuse_iomap_init_nonreg_inode() calls, or alternatively move > the fuse_iomap_init_nonreg_inode() calls into their corresponding > helpers (eg fuse_init_dir(), etc.), so that it's consistent? Ah, that. Originally I /did/ have it in the switch statement in fuse_init_inode. Then I started trying to work on fsdax support (HA!) 
for which it became necessary to move the fuse_iomap_init_reg_inode call to fuse_init_file_inode and pass it a pointer to args->flags so that it could clear FUSE_ATTR_DAX so that the other fuse dax io paths wouldn't try to install themselves. I never got fsdax working properly so that's why it's never been attached to my fuse-iomap patches. Maybe that'll happen some day in the meantime ... should fuse_iomap_init_reg_inode move back to the switch? > > if (IS_ENABLED(CONFIG_FUSE_DAX)) > > fuse_dax_inode_init(inode, flags); > > } > > diff --git a/fs/fuse/file_iomap.c b/fs/fuse/file_iomap.c > > index 1b9e1bf2f799a3..fc0d5f135bacf9 100644 > > --- a/fs/fuse/file_iomap.c > > +++ b/fs/fuse/file_iomap.c > > @@ -635,3 +635,52 @@ void fuse_iomap_unmount(struct fuse_mount *fm) > > fuse_flush_requests_and_wait(fc); > > fuse_send_destroy(fm); > > } > > + > > +static inline void fuse_inode_set_iomap(struct inode *inode) > > +{ > > + struct fuse_inode *fi = get_fuse_inode(inode); > > + > > + set_bit(FUSE_I_IOMAP, &fi->state); > > +} > > + > > +static inline void fuse_inode_clear_iomap(struct inode *inode) > > +{ > > + struct fuse_inode *fi = get_fuse_inode(inode); > > + > > + clear_bit(FUSE_I_IOMAP, &fi->state); > > +} > > + > > +void fuse_iomap_init_nonreg_inode(struct inode *inode, unsigned attr_flags) > > +{ > > + struct fuse_conn *conn = get_fuse_conn(inode); > > + struct fuse_inode *fi = get_fuse_inode(inode); > > + > > + ASSERT(!S_ISREG(inode->i_mode)); > > + > > + if (conn->iomap && (attr_flags & FUSE_ATTR_IOMAP)) > > + set_bit(FUSE_I_EXCLUSIVE, &fi->state); > > +} > > + > > +void fuse_iomap_init_reg_inode(struct inode *inode, unsigned attr_flags) > > +{ > > + struct fuse_conn *conn = get_fuse_conn(inode); > > + struct fuse_inode *fi = get_fuse_inode(inode); > > + > > + ASSERT(S_ISREG(inode->i_mode)); > > + > > + if (conn->iomap && (attr_flags & FUSE_ATTR_IOMAP)) { > > + set_bit(FUSE_I_EXCLUSIVE, &fi->state); > > + fuse_inode_set_iomap(inode); > > + } > > +} > > + > > +void 
fuse_iomap_evict_inode(struct inode *inode) > > +{ > > + struct fuse_conn *conn = get_fuse_conn(inode); > > + struct fuse_inode *fi = get_fuse_inode(inode); > > + > > + if (fuse_inode_has_iomap(inode)) > > If I'm understanding this correctly, a fuse inode can't have > FUSE_I_IOMAP set on it if conn->iomap is not enabled, correct? Correct. > Maybe it makes sense to just return if (!conn->iomap) at the very > beginning, to make that more clear? <shrug> fuse_inode_has_iomap only checks FUSE_I_IOMAP... > > + fuse_inode_clear_iomap(inode); > > + if (conn->iomap && fuse_inode_is_exclusive(inode)) > > + clear_bit(FUSE_I_EXCLUSIVE, &fi->state); ...but I wasn't going to assume that iomap is the only way that FUSE_I_EXCLUSIVE could get set. On the other hand, for non-regular files we set FUSE_I_EXCLUSIVE only if conn->iomap is nonzero *and* attr->flags contains FUSE_ATTR_IOMAP. So this clearing code isn't quite the same as the setting code. I wonder if that means we should set FUSE_I_IOMAP for non-regular files? They don't use iomap itself, but I suppose it would be neat if "iomap directories" also meant that timestamps and whatnot worked in the same way as they do for regular files. 
> > +} > > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c > > index 271356fa3be3ea..9b9e7b2dd0d928 100644 > > --- a/fs/fuse/inode.c > > +++ b/fs/fuse/inode.c > > @@ -196,6 +196,8 @@ static void fuse_evict_inode(struct inode *inode) > > WARN_ON(!list_empty(&fi->write_files)); > > WARN_ON(!list_empty(&fi->queued_writes)); > > } > > + > > + fuse_iomap_evict_inode(inode); > > } > > > > static int fuse_reconfigure(struct fs_context *fsc) > > @@ -428,20 +430,32 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr, > > inode->i_size = attr->size; > > inode_set_mtime(inode, attr->mtime, attr->mtimensec); > > inode_set_ctime(inode, attr->ctime, attr->ctimensec); > > - if (S_ISREG(inode->i_mode)) { > > + switch (inode->i_mode & S_IFMT) { > > + case S_IFREG: > > fuse_init_common(inode); > > fuse_init_file_inode(inode, attr->flags); > > - } else if (S_ISDIR(inode->i_mode)) > > + break; > > + case S_IFDIR: > > fuse_init_dir(inode); > > - else if (S_ISLNK(inode->i_mode)) > > + fuse_iomap_init_nonreg_inode(inode, attr->flags); > > + break; > > + case S_IFLNK: > > fuse_init_symlink(inode); > > - else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || > > - S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { > > + fuse_iomap_init_nonreg_inode(inode, attr->flags); > > + break; > > + case S_IFCHR: > > + case S_IFBLK: > > + case S_IFIFO: > > + case S_IFSOCK: > > fuse_init_common(inode); > > init_special_inode(inode, inode->i_mode, > > new_decode_dev(attr->rdev)); > > - } else > > + fuse_iomap_init_nonreg_inode(inode, attr->flags); > > + break; > > + default: > > BUG(); > > Just thinking out loud here and curious to hear whether you like this > idea or not: another option is calling > > if (conn->iomap) > fuse_iomap_init_inode(); > > at the end, where fuse_iomap_init_inode() would be something like: > > void fuse_iomap_init_inode(struct inode *inode, unsigned attr_flags) > { > struct fuse_inode *fi = get_fuse_inode(inode); > > if (attr_flags & 
FUSE_ATTR_IOMAP) > set_bit(FUSE_I_EXCLUSIVE, &fi->state); > > if (S_ISREG(inode->i_mode)) > fuse_inode_set_iomap(inode); > } > > which seems simpler to me than having both > fuse_iomap_init_nonreg_inode() and fuse_iomap_init_reg_inode() > function and invoking it per i_mode case. Yeah that would be simpler, but for that weird fsdax enabling quirk I mentioned earlier. Hrmm, I could also modify fuse_dax_inode_init to return without doing anything if FUSE_ATTR_IOMAP is set. --D > Thanks, > Joanne > > > + break; > > + } > > /* > > * Ensure that we don't cache acls for daemons without FUSE_POSIX_ACL > > * so they see the exact same behavior as before. > > > ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCH 07/31] fuse: create a per-inode flag for toggling iomap 2026-01-22 22:22 ` Darrick J. Wong @ 2026-01-23 18:05 ` Joanne Koong 2026-01-24 16:54 ` Darrick J. Wong 0 siblings, 1 reply; 52+ messages in thread From: Joanne Koong @ 2026-01-23 18:05 UTC (permalink / raw) To: Darrick J. Wong; +Cc: miklos, bernd, neal, linux-ext4, linux-fsdevel On Thu, Jan 22, 2026 at 2:22 PM Darrick J. Wong <djwong@kernel.org> wrote: > > On Wed, Jan 21, 2026 at 05:13:39PM -0800, Joanne Koong wrote: > > On Tue, Oct 28, 2025 at 5:46 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > From: Darrick J. Wong <djwong@kernel.org> > > > > > > Create a per-inode flag to control whether or not this inode actually > > > uses iomap. This is required for non-regular files because iomap > > > doesn't apply there; and enables fuse filesystems to provide some > > > non-iomap files if desired. > > > > > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org> > > > > The logic in this makes sense to me, left just a few comments below. > > > > Reviewed-by: Joanne Koong <joannelkoong@gmail.com> > > Thanks! 
> > > > --- > > > fs/fuse/fuse_i.h | 17 ++++++++++++++++ > > > include/uapi/linux/fuse.h | 3 +++ > > > fs/fuse/file.c | 1 + > > > fs/fuse/file_iomap.c | 49 +++++++++++++++++++++++++++++++++++++++++++++ > > > fs/fuse/inode.c | 26 ++++++++++++++++++------ > > > 5 files changed, 90 insertions(+), 6 deletions(-) > > > > > > diff --git a/fs/fuse/file.c b/fs/fuse/file.c > > > index f1ef77a0be05bb..42c85c19f3b13b 100644 > > > --- a/fs/fuse/file.c > > > +++ b/fs/fuse/file.c > > > +void fuse_iomap_init_reg_inode(struct inode *inode, unsigned attr_flags) > > > +{ > > > + struct fuse_conn *conn = get_fuse_conn(inode); > > > + struct fuse_inode *fi = get_fuse_inode(inode); > > > + > > > + ASSERT(S_ISREG(inode->i_mode)); > > > + > > > + if (conn->iomap && (attr_flags & FUSE_ATTR_IOMAP)) { > > > + set_bit(FUSE_I_EXCLUSIVE, &fi->state); > > > + fuse_inode_set_iomap(inode); > > > + } > > > +} > > > + > > > +void fuse_iomap_evict_inode(struct inode *inode) > > > +{ > > > + struct fuse_conn *conn = get_fuse_conn(inode); > > > + struct fuse_inode *fi = get_fuse_inode(inode); > > > + > > > + if (fuse_inode_has_iomap(inode)) > > > > If I'm understanding this correctly, a fuse inode can't have > > FUSE_I_IOMAP set on it if conn>iomap is not enabled, correct? > > Correct. > > > Maybe it makes sense to just return if (!conn->iomap) at the very > > beginning, to make that more clear? > > <shrug> fuse_inode_has_iomap only checks FUSE_I_IOMAP... > > > > + fuse_inode_clear_iomap(inode); > > > + if (conn->iomap && fuse_inode_is_exclusive(inode)) > > > + clear_bit(FUSE_I_EXCLUSIVE, &fi->state); > > ...but I wasn't going to assume that iomap is the only way that > FUSE_I_EXCLUSIVE could get set. > > On the other hand, for non-regular files we set FUSE_I_EXCLUSIVE only if > conn->iomap is nonzero *and* attr->flags contains FUSE_ATTR_IOMAP. So > this clearing code isn't quite the same as the setting code. > > I wonder if that means we should set FUSE_I_IOMAP for non-regular files? 
> They don't use iomap itself, but I suppose it would be neat if "iomap > directories" also meant that timestamps and whatnot worked in the same > way as they do for regular files. > That seems like a good idea to me. I think that also makes the mental model (at least for me) simpler. Thanks, Joanne ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCH 07/31] fuse: create a per-inode flag for toggling iomap 2026-01-23 18:05 ` Joanne Koong @ 2026-01-24 16:54 ` Darrick J. Wong 2026-01-27 23:33 ` Darrick J. Wong 0 siblings, 1 reply; 52+ messages in thread From: Darrick J. Wong @ 2026-01-24 16:54 UTC (permalink / raw) To: Joanne Koong; +Cc: miklos, bernd, neal, linux-ext4, linux-fsdevel On Fri, Jan 23, 2026 at 10:05:32AM -0800, Joanne Koong wrote: > On Thu, Jan 22, 2026 at 2:22 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > On Wed, Jan 21, 2026 at 05:13:39PM -0800, Joanne Koong wrote: > > > On Tue, Oct 28, 2025 at 5:46 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > > > From: Darrick J. Wong <djwong@kernel.org> > > > > > > > > Create a per-inode flag to control whether or not this inode actually > > > > uses iomap. This is required for non-regular files because iomap > > > > doesn't apply there; and enables fuse filesystems to provide some > > > > non-iomap files if desired. > > > > > > > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org> > > > > > > The logic in this makes sense to me, left just a few comments below. > > > > > > Reviewed-by: Joanne Koong <joannelkoong@gmail.com> > > > > Thanks! 
> > > > > > --- > > > > fs/fuse/fuse_i.h | 17 ++++++++++++++++ > > > > include/uapi/linux/fuse.h | 3 +++ > > > > fs/fuse/file.c | 1 + > > > > fs/fuse/file_iomap.c | 49 +++++++++++++++++++++++++++++++++++++++++++++ > > > > fs/fuse/inode.c | 26 ++++++++++++++++++------ > > > > 5 files changed, 90 insertions(+), 6 deletions(-) > > > > > > > > diff --git a/fs/fuse/file.c b/fs/fuse/file.c > > > > index f1ef77a0be05bb..42c85c19f3b13b 100644 > > > > --- a/fs/fuse/file.c > > > > +++ b/fs/fuse/file.c > > > > +void fuse_iomap_init_reg_inode(struct inode *inode, unsigned attr_flags) > > > > +{ > > > > + struct fuse_conn *conn = get_fuse_conn(inode); > > > > + struct fuse_inode *fi = get_fuse_inode(inode); > > > > + > > > > + ASSERT(S_ISREG(inode->i_mode)); > > > > + > > > > + if (conn->iomap && (attr_flags & FUSE_ATTR_IOMAP)) { > > > > + set_bit(FUSE_I_EXCLUSIVE, &fi->state); > > > > + fuse_inode_set_iomap(inode); > > > > + } > > > > +} > > > > + > > > > +void fuse_iomap_evict_inode(struct inode *inode) > > > > +{ > > > > + struct fuse_conn *conn = get_fuse_conn(inode); > > > > + struct fuse_inode *fi = get_fuse_inode(inode); > > > > + > > > > + if (fuse_inode_has_iomap(inode)) > > > > > > If I'm understanding this correctly, a fuse inode can't have > > > FUSE_I_IOMAP set on it if conn>iomap is not enabled, correct? > > > > Correct. > > > > > Maybe it makes sense to just return if (!conn->iomap) at the very > > > beginning, to make that more clear? > > > > <shrug> fuse_inode_has_iomap only checks FUSE_I_IOMAP... > > > > > > + fuse_inode_clear_iomap(inode); > > > > + if (conn->iomap && fuse_inode_is_exclusive(inode)) > > > > + clear_bit(FUSE_I_EXCLUSIVE, &fi->state); > > > > ...but I wasn't going to assume that iomap is the only way that > > FUSE_I_EXCLUSIVE could get set. > > > > On the other hand, for non-regular files we set FUSE_I_EXCLUSIVE only if > > conn->iomap is nonzero *and* attr->flags contains FUSE_ATTR_IOMAP. 
So > > this clearing code isn't quite the same as the setting code. > > > > I wonder if that means we should set FUSE_I_IOMAP for non-regular files? > > They don't use iomap itself, but I suppose it would be neat if "iomap > > directories" also meant that timestamps and whatnot worked in the same > > way as they do for regular files. > > > > That seems like a good idea to me. I think that also makes the mental > model (at least for me) simpler. I tried that, and generic/476 immediately broke. I'll get back to that next week, but turning it on unconditionally is not trivial unfortunately. :/ --D > Thanks, > Joanne ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCH 07/31] fuse: create a per-inode flag for toggling iomap 2026-01-24 16:54 ` Darrick J. Wong @ 2026-01-27 23:33 ` Darrick J. Wong 0 siblings, 0 replies; 52+ messages in thread From: Darrick J. Wong @ 2026-01-27 23:33 UTC (permalink / raw) To: Joanne Koong; +Cc: miklos, bernd, neal, linux-ext4, linux-fsdevel On Sat, Jan 24, 2026 at 08:54:30AM -0800, Darrick J. Wong wrote: > On Fri, Jan 23, 2026 at 10:05:32AM -0800, Joanne Koong wrote: > > On Thu, Jan 22, 2026 at 2:22 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > On Wed, Jan 21, 2026 at 05:13:39PM -0800, Joanne Koong wrote: > > > > On Tue, Oct 28, 2025 at 5:46 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > > > > > From: Darrick J. Wong <djwong@kernel.org> > > > > > > > > > > Create a per-inode flag to control whether or not this inode actually > > > > > uses iomap. This is required for non-regular files because iomap > > > > > doesn't apply there; and enables fuse filesystems to provide some > > > > > non-iomap files if desired. > > > > > > > > > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org> > > > > > > > > The logic in this makes sense to me, left just a few comments below. > > > > > > > > Reviewed-by: Joanne Koong <joannelkoong@gmail.com> > > > > > > Thanks! 
> > > > > > > > --- > > > > > fs/fuse/fuse_i.h | 17 ++++++++++++++++ > > > > > include/uapi/linux/fuse.h | 3 +++ > > > > > fs/fuse/file.c | 1 + > > > > > fs/fuse/file_iomap.c | 49 +++++++++++++++++++++++++++++++++++++++++++++ > > > > > fs/fuse/inode.c | 26 ++++++++++++++++++------ > > > > > 5 files changed, 90 insertions(+), 6 deletions(-) > > > > > > > > > > diff --git a/fs/fuse/file.c b/fs/fuse/file.c > > > > > index f1ef77a0be05bb..42c85c19f3b13b 100644 > > > > > --- a/fs/fuse/file.c > > > > > +++ b/fs/fuse/file.c > > > > > +void fuse_iomap_init_reg_inode(struct inode *inode, unsigned attr_flags) > > > > > +{ > > > > > + struct fuse_conn *conn = get_fuse_conn(inode); > > > > > + struct fuse_inode *fi = get_fuse_inode(inode); > > > > > + > > > > > + ASSERT(S_ISREG(inode->i_mode)); > > > > > + > > > > > + if (conn->iomap && (attr_flags & FUSE_ATTR_IOMAP)) { > > > > > + set_bit(FUSE_I_EXCLUSIVE, &fi->state); > > > > > + fuse_inode_set_iomap(inode); > > > > > + } > > > > > +} > > > > > + > > > > > +void fuse_iomap_evict_inode(struct inode *inode) > > > > > +{ > > > > > + struct fuse_conn *conn = get_fuse_conn(inode); > > > > > + struct fuse_inode *fi = get_fuse_inode(inode); > > > > > + > > > > > + if (fuse_inode_has_iomap(inode)) > > > > > > > > If I'm understanding this correctly, a fuse inode can't have > > > > FUSE_I_IOMAP set on it if conn>iomap is not enabled, correct? > > > > > > Correct. > > > > > > > Maybe it makes sense to just return if (!conn->iomap) at the very > > > > beginning, to make that more clear? > > > > > > <shrug> fuse_inode_has_iomap only checks FUSE_I_IOMAP... > > > > > > > > + fuse_inode_clear_iomap(inode); > > > > > + if (conn->iomap && fuse_inode_is_exclusive(inode)) > > > > > + clear_bit(FUSE_I_EXCLUSIVE, &fi->state); > > > > > > ...but I wasn't going to assume that iomap is the only way that > > > FUSE_I_EXCLUSIVE could get set. 
> > > > > > On the other hand, for non-regular files we set FUSE_I_EXCLUSIVE only if > > > conn->iomap is nonzero *and* attr->flags contains FUSE_ATTR_IOMAP. So > > > this clearing code isn't quite the same as the setting code. > > > > > > I wonder if that means we should set FUSE_I_IOMAP for non-regular files? > > > They don't use iomap itself, but I suppose it would be neat if "iomap > > > directories" also meant that timestamps and whatnot worked in the same > > > way as they do for regular files. > > > > > > > That seems like a good idea to me. I think that also makes the mental > > model (at least for me) simpler. > > I tried that, and generic/476 immediately broke. I'll get back to that > next week, but turning it on unconditionally is not trivial > unfortunately. :/ I've tentatively fixed this by defining a FUSE_ATTR_EXCLUSIVE flag in the uapi so that the fuse server can tell the kernel which files are "exclusive" files, and hence which ones should have FUSE_I_EXCLUSIVE set. (These are, of course, files where the kernel can transmogrify ACLs into i_mode changes and do ACL inheritance because there is no other principal that could be writing to the ondisk metadata.) --D > --D > > > Thanks, > > Joanne > ^ permalink raw reply [flat|nested] 52+ messages in thread
[parent not found: <176169810568.1424854.4073875923015322741.stgit@frogsfrogsfrogs>]
* Re: [PATCH 10/31] fuse: implement basic iomap reporting such as FIEMAP and SEEK_{DATA,HOLE} [not found] ` <176169810568.1424854.4073875923015322741.stgit@frogsfrogsfrogs> @ 2026-01-22 2:07 ` Joanne Koong 2026-01-22 22:31 ` Darrick J. Wong 0 siblings, 1 reply; 52+ messages in thread From: Joanne Koong @ 2026-01-22 2:07 UTC (permalink / raw) To: Darrick J. Wong; +Cc: miklos, bernd, neal, linux-ext4, linux-fsdevel On Tue, Oct 28, 2025 at 5:47 PM Darrick J. Wong <djwong@kernel.org> wrote: > > From: Darrick J. Wong <djwong@kernel.org> > > Implement the basic file mapping reporting functions like FIEMAP, BMAP, > and SEEK_DATA/HOLE. > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org> > --- > fs/fuse/fuse_i.h | 8 ++++++ > fs/fuse/dir.c | 1 + > fs/fuse/file.c | 13 ++++++++++ > fs/fuse/file_iomap.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++- > 4 files changed, 89 insertions(+), 1 deletion(-) > > > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h > index c7aeb324fe599e..6fe8aa1845b98d 100644 > --- a/fs/fuse/fuse_i.h > +++ b/fs/fuse/fuse_i.h > @@ -1730,6 +1730,11 @@ static inline bool fuse_inode_has_iomap(const struct inode *inode) > > return test_bit(FUSE_I_IOMAP, &fi->state); > } > + > +int fuse_iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, > + u64 start, u64 length); > +loff_t fuse_iomap_lseek(struct file *file, loff_t offset, int whence); > +sector_t fuse_iomap_bmap(struct address_space *mapping, sector_t block); > #else > # define fuse_iomap_enabled(...) (false) > # define fuse_has_iomap(...) (false) > @@ -1739,6 +1744,9 @@ static inline bool fuse_inode_has_iomap(const struct inode *inode) > # define fuse_iomap_init_nonreg_inode(...) ((void)0) > # define fuse_iomap_evict_inode(...) ((void)0) > # define fuse_inode_has_iomap(...) (false) > +# define fuse_iomap_fiemap NULL > +# define fuse_iomap_lseek(...) (-ENOSYS) > +# define fuse_iomap_bmap(...) 
(-ENOSYS) > #endif > > #endif /* _FS_FUSE_I_H */ > diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c > index 18eb1bb192bb58..bafc386f2f4d3a 100644 > --- a/fs/fuse/dir.c > +++ b/fs/fuse/dir.c > @@ -2296,6 +2296,7 @@ static const struct inode_operations fuse_common_inode_operations = { > .set_acl = fuse_set_acl, > .fileattr_get = fuse_fileattr_get, > .fileattr_set = fuse_fileattr_set, > + .fiemap = fuse_iomap_fiemap, > }; > > static const struct inode_operations fuse_symlink_inode_operations = { > diff --git a/fs/fuse/file.c b/fs/fuse/file.c > index bd9c208a46c78d..8a981f41b1dbd0 100644 > --- a/fs/fuse/file.c > +++ b/fs/fuse/file.c > @@ -2512,6 +2512,12 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block) > struct fuse_bmap_out outarg; > int err; > > + if (fuse_inode_has_iomap(inode)) { > + sector_t alt_sec = fuse_iomap_bmap(mapping, block); > + if (alt_sec > 0) > + return alt_sec; > + } > + > if (!inode->i_sb->s_bdev || fm->fc->no_bmap) > return 0; > > @@ -2547,6 +2553,13 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) > struct fuse_lseek_out outarg; > int err; > > + if (fuse_inode_has_iomap(inode)) { > + loff_t alt_pos = fuse_iomap_lseek(file, offset, whence); > + > + if (alt_pos >= 0 || (alt_pos < 0 && alt_pos != -ENOSYS)) I don't think you technically need the "alt_pos < 0" part here since the "alt_pos >= 0 ||" part already accounts for that > + return alt_pos; > + } > + > if (fm->fc->no_lseek) > goto fallback; > > diff --git a/fs/fuse/file_iomap.c b/fs/fuse/file_iomap.c > index 66a7b8faa31ac2..ce64e7c4860ef8 100644 > --- a/fs/fuse/file_iomap.c > +++ b/fs/fuse/file_iomap.c > @@ -4,6 +4,7 @@ > * Author: Darrick J. 
Wong <djwong@kernel.org> > */ > #include <linux/iomap.h> > +#include <linux/fiemap.h> > #include "fuse_i.h" > #include "fuse_trace.h" > #include "iomap_i.h" > @@ -561,7 +562,7 @@ static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t count, > return err; > } > > -const struct iomap_ops fuse_iomap_ops = { > +static const struct iomap_ops fuse_iomap_ops = { > .iomap_begin = fuse_iomap_begin, > .iomap_end = fuse_iomap_end, > }; > @@ -690,3 +691,68 @@ void fuse_iomap_evict_inode(struct inode *inode) > if (conn->iomap && fuse_inode_is_exclusive(inode)) > clear_bit(FUSE_I_EXCLUSIVE, &fi->state); > } > + > +int fuse_iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, > + u64 start, u64 count) > +{ > + struct fuse_conn *fc = get_fuse_conn(inode); > + int error; > + > + /* > + * We are called directly from the vfs so we need to check per-inode > + * support here explicitly. > + */ > + if (!fuse_inode_has_iomap(inode)) > + return -EOPNOTSUPP; > + > + if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) I don't see where FIEMAP_FLAG_SYNC and FIEMAP_FLAG_CACHE are supported either, should these return -EOPNOTSUPP if they're set as well? 
> + return -EOPNOTSUPP; > + > + if (fuse_is_bad(inode)) > + return -EIO; > + > + if (!fuse_allow_current_process(fc)) > + return -EACCES; > + > + inode_lock_shared(inode); > + error = iomap_fiemap(inode, fieinfo, start, count, &fuse_iomap_ops); > + inode_unlock_shared(inode); > + > + return error; > +} > + > +sector_t fuse_iomap_bmap(struct address_space *mapping, sector_t block) > +{ > + ASSERT(fuse_inode_has_iomap(mapping->host)); > + > + return iomap_bmap(mapping, block, &fuse_iomap_ops); > +} > + > +loff_t fuse_iomap_lseek(struct file *file, loff_t offset, int whence) > +{ > + struct inode *inode = file->f_mapping->host; > + struct fuse_conn *fc = get_fuse_conn(inode); > + > + ASSERT(fuse_inode_has_iomap(inode)); > + > + if (fuse_is_bad(inode)) > + return -EIO; > + > + if (!fuse_allow_current_process(fc)) > + return -EACCES; > + > + switch (whence) { > + case SEEK_HOLE: > + offset = iomap_seek_hole(inode, offset, &fuse_iomap_ops); > + break; > + case SEEK_DATA: > + offset = iomap_seek_data(inode, offset, &fuse_iomap_ops); > + break; > + default: Does it make sense to have the default case just call generic_file_llseek()? Thanks, Joanne > + return -ENOSYS; > + } > + > + if (offset < 0) > + return offset; > + return vfs_setpos(file, offset, inode->i_sb->s_maxbytes); > +} > ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCH 10/31] fuse: implement basic iomap reporting such as FIEMAP and SEEK_{DATA,HOLE} 2026-01-22 2:07 ` [PATCH 10/31] fuse: implement basic iomap reporting such as FIEMAP and SEEK_{DATA,HOLE} Joanne Koong @ 2026-01-22 22:31 ` Darrick J. Wong 0 siblings, 0 replies; 52+ messages in thread From: Darrick J. Wong @ 2026-01-22 22:31 UTC (permalink / raw) To: Joanne Koong; +Cc: miklos, bernd, neal, linux-ext4, linux-fsdevel On Wed, Jan 21, 2026 at 06:07:12PM -0800, Joanne Koong wrote: > On Tue, Oct 28, 2025 at 5:47 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > From: Darrick J. Wong <djwong@kernel.org> > > > > Implement the basic file mapping reporting functions like FIEMAP, BMAP, > > and SEEK_DATA/HOLE. > > > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org> > > --- > > fs/fuse/fuse_i.h | 8 ++++++ > > fs/fuse/dir.c | 1 + > > fs/fuse/file.c | 13 ++++++++++ > > fs/fuse/file_iomap.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++- > > 4 files changed, 89 insertions(+), 1 deletion(-) > > > > > > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h > > index c7aeb324fe599e..6fe8aa1845b98d 100644 > > --- a/fs/fuse/fuse_i.h > > +++ b/fs/fuse/fuse_i.h > > @@ -1730,6 +1730,11 @@ static inline bool fuse_inode_has_iomap(const struct inode *inode) > > > > return test_bit(FUSE_I_IOMAP, &fi->state); > > } > > + > > +int fuse_iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, > > + u64 start, u64 length); > > +loff_t fuse_iomap_lseek(struct file *file, loff_t offset, int whence); > > +sector_t fuse_iomap_bmap(struct address_space *mapping, sector_t block); > > #else > > # define fuse_iomap_enabled(...) (false) > > # define fuse_has_iomap(...) (false) > > @@ -1739,6 +1744,9 @@ static inline bool fuse_inode_has_iomap(const struct inode *inode) > > # define fuse_iomap_init_nonreg_inode(...) ((void)0) > > # define fuse_iomap_evict_inode(...) ((void)0) > > # define fuse_inode_has_iomap(...) 
(false) > > +# define fuse_iomap_fiemap NULL > > +# define fuse_iomap_lseek(...) (-ENOSYS) > > +# define fuse_iomap_bmap(...) (-ENOSYS) > > #endif > > > > #endif /* _FS_FUSE_I_H */ > > diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c > > index 18eb1bb192bb58..bafc386f2f4d3a 100644 > > --- a/fs/fuse/dir.c > > +++ b/fs/fuse/dir.c > > @@ -2296,6 +2296,7 @@ static const struct inode_operations fuse_common_inode_operations = { > > .set_acl = fuse_set_acl, > > .fileattr_get = fuse_fileattr_get, > > .fileattr_set = fuse_fileattr_set, > > + .fiemap = fuse_iomap_fiemap, > > }; > > > > static const struct inode_operations fuse_symlink_inode_operations = { > > diff --git a/fs/fuse/file.c b/fs/fuse/file.c > > index bd9c208a46c78d..8a981f41b1dbd0 100644 > > --- a/fs/fuse/file.c > > +++ b/fs/fuse/file.c > > @@ -2512,6 +2512,12 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block) > > struct fuse_bmap_out outarg; > > int err; > > > > + if (fuse_inode_has_iomap(inode)) { > > + sector_t alt_sec = fuse_iomap_bmap(mapping, block); > > + if (alt_sec > 0) > > + return alt_sec; > > + } > > + > > if (!inode->i_sb->s_bdev || fm->fc->no_bmap) > > return 0; > > > > @@ -2547,6 +2553,13 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) > > struct fuse_lseek_out outarg; > > int err; > > > > + if (fuse_inode_has_iomap(inode)) { > > + loff_t alt_pos = fuse_iomap_lseek(file, offset, whence); > > + > > + if (alt_pos >= 0 || (alt_pos < 0 && alt_pos != -ENOSYS)) > > I don't think you technically need the "alt_pos < 0" part here since > the "alt_pos >= 0 ||" part already accounts for that alt_pos is loff_t, which is a signed type. 
But I think this could be more concise: alt_pos = fuse_iomap_lseek(...); if (alt_pos != -ENOSYS) return alt_pos; > > + return alt_pos; > > + } > > + > > if (fm->fc->no_lseek) > > goto fallback; > > > > diff --git a/fs/fuse/file_iomap.c b/fs/fuse/file_iomap.c > > index 66a7b8faa31ac2..ce64e7c4860ef8 100644 > > --- a/fs/fuse/file_iomap.c > > +++ b/fs/fuse/file_iomap.c > > @@ -4,6 +4,7 @@ > > * Author: Darrick J. Wong <djwong@kernel.org> > > */ > > #include <linux/iomap.h> > > +#include <linux/fiemap.h> > > #include "fuse_i.h" > > #include "fuse_trace.h" > > #include "iomap_i.h" > > @@ -561,7 +562,7 @@ static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t count, > > return err; > > } > > > > -const struct iomap_ops fuse_iomap_ops = { > > +static const struct iomap_ops fuse_iomap_ops = { > > .iomap_begin = fuse_iomap_begin, > > .iomap_end = fuse_iomap_end, > > }; > > @@ -690,3 +691,68 @@ void fuse_iomap_evict_inode(struct inode *inode) > > if (conn->iomap && fuse_inode_is_exclusive(inode)) > > clear_bit(FUSE_I_EXCLUSIVE, &fi->state); > > } > > + > > +int fuse_iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, > > + u64 start, u64 count) > > +{ > > + struct fuse_conn *fc = get_fuse_conn(inode); > > + int error; > > + > > + /* > > + * We are called directly from the vfs so we need to check per-inode > > + * support here explicitly. > > + */ > > + if (!fuse_inode_has_iomap(inode)) > > + return -EOPNOTSUPP; > > + > > + if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) > > I don't see where FIEMAP_FLAG_SYNC and FIEMAP_FLAG_CACHE are supported > either, should these return -EOPNOTSUPP if they're set as well? The vfs implements FIEMAP_FLAG_SYNC for us in fiemap_prep, which is called by iomap_fiemap. I'm not sure what FIEMAP_FLAG_CACHE means in this context. Its comment says "request caching of the extents" which doesn't sound like doing anything is mandatory. 
> > + return -EOPNOTSUPP; > > + > > + if (fuse_is_bad(inode)) > > + return -EIO; > > + > > + if (!fuse_allow_current_process(fc)) > > + return -EACCES; > > + > > + inode_lock_shared(inode); > > + error = iomap_fiemap(inode, fieinfo, start, count, &fuse_iomap_ops); > > + inode_unlock_shared(inode); > > + > > + return error; > > +} > > + > > +sector_t fuse_iomap_bmap(struct address_space *mapping, sector_t block) > > +{ > > + ASSERT(fuse_inode_has_iomap(mapping->host)); > > + > > + return iomap_bmap(mapping, block, &fuse_iomap_ops); > > +} > > + > > +loff_t fuse_iomap_lseek(struct file *file, loff_t offset, int whence) > > +{ > > + struct inode *inode = file->f_mapping->host; > > + struct fuse_conn *fc = get_fuse_conn(inode); > > + > > + ASSERT(fuse_inode_has_iomap(inode)); > > + > > + if (fuse_is_bad(inode)) > > + return -EIO; > > + > > + if (!fuse_allow_current_process(fc)) > > + return -EACCES; > > + > > + switch (whence) { > > + case SEEK_HOLE: > > + offset = iomap_seek_hole(inode, offset, &fuse_iomap_ops); > > + break; > > + case SEEK_DATA: > > + offset = iomap_seek_data(inode, offset, &fuse_iomap_ops); > > + break; > > + default: > > Does it make sense to have the default case just call generic_file_llseek()? Yes. Thanks for spotting that bug! --D > Thanks, > Joanne > > > + return -ENOSYS; > > + } > > + > > + if (offset < 0) > > + return offset; > > + return vfs_setpos(file, offset, inode->i_sb->s_maxbytes); > > +} > > > ^ permalink raw reply [flat|nested] 52+ messages in thread
[parent not found: <176169810700.1424854.5753715202341698632.stgit@frogsfrogsfrogs>]
* Re: [PATCH 16/31] fuse: implement large folios for iomap pagecache files [not found] ` <176169810700.1424854.5753715202341698632.stgit@frogsfrogsfrogs> @ 2026-01-23 21:50 ` Joanne Koong 0 siblings, 0 replies; 52+ messages in thread From: Joanne Koong @ 2026-01-23 21:50 UTC (permalink / raw) To: Darrick J. Wong; +Cc: miklos, bernd, neal, linux-ext4, linux-fsdevel On Tue, Oct 28, 2025 at 5:49 PM Darrick J. Wong <djwong@kernel.org> wrote: > > From: Darrick J. Wong <djwong@kernel.org> > > Use large folios when we're using iomap. > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org> Reviewed-by: Joanne Koong <joannelkoong@gmail.com> > --- > fs/fuse/file_iomap.c | 6 ++++++ > 1 file changed, 6 insertions(+) > > > diff --git a/fs/fuse/file_iomap.c b/fs/fuse/file_iomap.c > index 897a07f197c797..0bae356045638b 100644 > --- a/fs/fuse/file_iomap.c > +++ b/fs/fuse/file_iomap.c > @@ -1380,12 +1380,18 @@ static const struct address_space_operations fuse_iomap_aops = { > static inline void fuse_inode_set_iomap(struct inode *inode) > { > struct fuse_inode *fi = get_fuse_inode(inode); > + unsigned int min_order = 0; > > inode->i_data.a_ops = &fuse_iomap_aops; > > INIT_WORK(&fi->ioend_work, fuse_iomap_end_io); > INIT_LIST_HEAD(&fi->ioend_list); > spin_lock_init(&fi->ioend_lock); > + > + if (inode->i_blkbits > PAGE_SHIFT) > + min_order = inode->i_blkbits - PAGE_SHIFT; > + > + mapping_set_folio_min_order(inode->i_mapping, min_order); > set_bit(FUSE_I_IOMAP, &fi->state); > } > > ^ permalink raw reply [flat|nested] 52+ messages in thread
[parent not found: <176169810721.1424854.6150447623894591900.stgit@frogsfrogsfrogs>]
* Re: [PATCH 17/31] fuse: use an unrestricted backing device with iomap pagecache io [not found] ` <176169810721.1424854.6150447623894591900.stgit@frogsfrogsfrogs> @ 2026-01-26 22:03 ` Joanne Koong 2026-01-26 23:55 ` Darrick J. Wong 0 siblings, 1 reply; 52+ messages in thread From: Joanne Koong @ 2026-01-26 22:03 UTC (permalink / raw) To: Darrick J. Wong; +Cc: miklos, bernd, neal, linux-ext4, linux-fsdevel On Tue, Oct 28, 2025 at 5:49 PM Darrick J. Wong <djwong@kernel.org> wrote: > > From: Darrick J. Wong <djwong@kernel.org> > > With iomap support turned on for the pagecache, the kernel issues > writeback to directly to block devices and we no longer have to push all > those pages through the fuse device to userspace. Therefore, we don't > need the tight dirty limits (~1M) that are used for regular fuse. This > dramatically increases the performance of fuse's pagecache IO. > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org> > --- > fs/fuse/file_iomap.c | 21 +++++++++++++++++++++ > 1 file changed, 21 insertions(+) > > > diff --git a/fs/fuse/file_iomap.c b/fs/fuse/file_iomap.c > index 0bae356045638b..a9bacaa0991afa 100644 > --- a/fs/fuse/file_iomap.c > +++ b/fs/fuse/file_iomap.c > @@ -713,6 +713,27 @@ const struct fuse_backing_ops fuse_iomap_backing_ops = { > void fuse_iomap_mount(struct fuse_mount *fm) > { > struct fuse_conn *fc = fm->fc; > + struct super_block *sb = fm->sb; > + struct backing_dev_info *old_bdi = sb->s_bdi; > + char *suffix = sb->s_bdev ? "-fuseblk" : "-fuse"; > + int res; > + > + /* > + * sb->s_bdi points to the initial private bdi. However, we want to > + * redirect it to a new private bdi with default dirty and readahead > + * settings because iomap writeback won't be pushing a ton of dirty > + * data through the fuse device. If this fails we fall back to the > + * initial fuse bdi. 
> + */ > + sb->s_bdi = &noop_backing_dev_info; > + res = super_setup_bdi_name(sb, "%u:%u%s.iomap", MAJOR(fc->dev), > + MINOR(fc->dev), suffix); > + if (res) { > + sb->s_bdi = old_bdi; > + } else { > + bdi_unregister(old_bdi); > + bdi_put(old_bdi); > + } Maybe I'm missing something here, but isn't sb->s_bdi already set to noop_backing_dev_info when fuse_iomap_mount() is called? fuse_fill_super() -> fuse_fill_super_common() -> fuse_bdi_init() does this already before the fuse_iomap_mount() call, afaict. I think what we need to do is just unset BDI_CAP_STRICTLIMIT and adjust the bdi max ratio? This is more of a nit, but I think it'd also be nice if we swapped the ordering of this patch with the previous one enabling large folios, so that large folios gets enabled only when all the bdi stuff for it is ready. Thanks, Joanne > > /* > * Enable syncfs for iomap fuse servers so that we can send a final > ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCH 17/31] fuse: use an unrestricted backing device with iomap pagecache io 2026-01-26 22:03 ` [PATCH 17/31] fuse: use an unrestricted backing device with iomap pagecache io Joanne Koong @ 2026-01-26 23:55 ` Darrick J. Wong 2026-01-27 1:35 ` Joanne Koong 0 siblings, 1 reply; 52+ messages in thread From: Darrick J. Wong @ 2026-01-26 23:55 UTC (permalink / raw) To: Joanne Koong; +Cc: miklos, bernd, neal, linux-ext4, linux-fsdevel On Mon, Jan 26, 2026 at 02:03:35PM -0800, Joanne Koong wrote: > On Tue, Oct 28, 2025 at 5:49 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > From: Darrick J. Wong <djwong@kernel.org> > > > > With iomap support turned on for the pagecache, the kernel issues > > writeback to directly to block devices and we no longer have to push all > > those pages through the fuse device to userspace. Therefore, we don't > > need the tight dirty limits (~1M) that are used for regular fuse. This > > dramatically increases the performance of fuse's pagecache IO. > > > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org> > > --- > > fs/fuse/file_iomap.c | 21 +++++++++++++++++++++ > > 1 file changed, 21 insertions(+) > > > > > > diff --git a/fs/fuse/file_iomap.c b/fs/fuse/file_iomap.c > > index 0bae356045638b..a9bacaa0991afa 100644 > > --- a/fs/fuse/file_iomap.c > > +++ b/fs/fuse/file_iomap.c > > @@ -713,6 +713,27 @@ const struct fuse_backing_ops fuse_iomap_backing_ops = { > > void fuse_iomap_mount(struct fuse_mount *fm) > > { > > struct fuse_conn *fc = fm->fc; > > + struct super_block *sb = fm->sb; > > + struct backing_dev_info *old_bdi = sb->s_bdi; > > + char *suffix = sb->s_bdev ? "-fuseblk" : "-fuse"; > > + int res; > > + > > + /* > > + * sb->s_bdi points to the initial private bdi. However, we want to > > + * redirect it to a new private bdi with default dirty and readahead > > + * settings because iomap writeback won't be pushing a ton of dirty > > + * data through the fuse device. 
If this fails we fall back to the > > + * initial fuse bdi. > > + */ > > + sb->s_bdi = &noop_backing_dev_info; > > + res = super_setup_bdi_name(sb, "%u:%u%s.iomap", MAJOR(fc->dev), > > + MINOR(fc->dev), suffix); > > + if (res) { > > + sb->s_bdi = old_bdi; > > + } else { > > + bdi_unregister(old_bdi); > > + bdi_put(old_bdi); > > + } > > Maybe I'm missing something here, but isn't sb->s_bdi already set to > noop_backing_dev_info when fuse_iomap_mount() is called? > fuse_fill_super() -> fuse_fill_super_common() -> fuse_bdi_init() does > this already before the fuse_iomap_mount() call, afaict. Right. > I think what we need to do is just unset BDI_CAP_STRICTLIMIT and > adjust the bdi max ratio? That's sufficient to undo the effects of fuse_bdi_init, yes. However the BDI gets created with the name "$major:$minor{-fuseblk}" and there are "management" scripts that try to tweak fuse BDIs for better performance. I don't want some dumb script to mismanage a fuse-iomap filesystem because it can't tell the difference, so I create a new bdi with the name "$major:$minor.iomap" to make it obvious. But super_setup_bdi_name gets cranky if s_bdi isn't set to noop and we don't want to fail a mount here due to ENOMEM so ... I implemented this weird switcheroo code. > This is more of a nit, but I think it'd also be nice if we > swapped the ordering of this patch with the previous one enabling > large folios, so that large folios gets enabled only when all the bdi > stuff for it is ready. Will do, thanks for reading these patches! Also note that I've changed this part of the patchset quite a lot since this posting; iomap configuration is now a completely separate fuse command that gets triggered after the FUSE_INIT reply is received. --D > Thanks, > Joanne > > > > > /* > > * Enable syncfs for iomap fuse servers so that we can send a final > > > ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCH 17/31] fuse: use an unrestricted backing device with iomap pagecache io 2026-01-26 23:55 ` Darrick J. Wong @ 2026-01-27 1:35 ` Joanne Koong 2026-01-27 2:09 ` Darrick J. Wong 0 siblings, 1 reply; 52+ messages in thread From: Joanne Koong @ 2026-01-27 1:35 UTC (permalink / raw) To: Darrick J. Wong; +Cc: miklos, bernd, neal, linux-ext4, linux-fsdevel On Mon, Jan 26, 2026 at 3:55 PM Darrick J. Wong <djwong@kernel.org> wrote: > > On Mon, Jan 26, 2026 at 02:03:35PM -0800, Joanne Koong wrote: > > On Tue, Oct 28, 2025 at 5:49 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > From: Darrick J. Wong <djwong@kernel.org> > > > > > > With iomap support turned on for the pagecache, the kernel issues > > > writeback to directly to block devices and we no longer have to push all > > > those pages through the fuse device to userspace. Therefore, we don't > > > need the tight dirty limits (~1M) that are used for regular fuse. This > > > dramatically increases the performance of fuse's pagecache IO. > > > > > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org> > > > --- > > > fs/fuse/file_iomap.c | 21 +++++++++++++++++++++ > > > 1 file changed, 21 insertions(+) > > > > > > > > > diff --git a/fs/fuse/file_iomap.c b/fs/fuse/file_iomap.c > > > index 0bae356045638b..a9bacaa0991afa 100644 > > > --- a/fs/fuse/file_iomap.c > > > +++ b/fs/fuse/file_iomap.c > > > @@ -713,6 +713,27 @@ const struct fuse_backing_ops fuse_iomap_backing_ops = { > > > void fuse_iomap_mount(struct fuse_mount *fm) > > > { > > > struct fuse_conn *fc = fm->fc; > > > + struct super_block *sb = fm->sb; > > > + struct backing_dev_info *old_bdi = sb->s_bdi; > > > + char *suffix = sb->s_bdev ? "-fuseblk" : "-fuse"; > > > + int res; > > > + > > > + /* > > > + * sb->s_bdi points to the initial private bdi. 
However, we want to > > > + * redirect it to a new private bdi with default dirty and readahead > > > + * settings because iomap writeback won't be pushing a ton of dirty > > > + * data through the fuse device. If this fails we fall back to the > > > + * initial fuse bdi. > > > + */ > > > + sb->s_bdi = &noop_backing_dev_info; > > > + res = super_setup_bdi_name(sb, "%u:%u%s.iomap", MAJOR(fc->dev), > > > + MINOR(fc->dev), suffix); > > > + if (res) { > > > + sb->s_bdi = old_bdi; > > > + } else { > > > + bdi_unregister(old_bdi); > > > + bdi_put(old_bdi); > > > + } > > > > Maybe I'm missing something here, but isn't sb->s_bdi already set to > > noop_backing_dev_info when fuse_iomap_mount() is called? > > fuse_fill_super() -> fuse_fill_super_common() -> fuse_bdi_init() does > > this already before the fuse_iomap_mount() call, afaict. > > Right. > > > I think what we need to do is just unset BDI_CAP_STRICTLIMIT and > > adjust the bdi max ratio? > > That's sufficient to undo the effects of fuse_bdi_init, yes. However > the BDI gets created with the name "$major:$minor{-fuseblk}" and there > are "management" scripts that try to tweak fuse BDIs for better > performance. > > I don't want some dumb script to mismanage a fuse-iomap filesystem > because it can't tell the difference, so I create a new bdi with the > name "$major:$minor.iomap" to make it obvious. But super_setup_bdi_name > gets cranky if s_bdi isn't set to noop and we don't want to fail a mount > here due to ENOMEM so ... I implemented this weird switcheroo code. I see. It might be useful to copy/paste this into the commit message just for added context. I don't see a better way of doing it than what you have in this patch then since we rely on the init reply to know whether iomap should be used or not... If the new bdi setup fails, I wonder if the mount should just fail entirely then. 
That seems better to me than letting it succeed with strictlimiting enforced, especially since large folios will be enabled for fuse iomap. [1] has some numbers for the performance degradations I saw for writes with strictlimiting on and large folios enabled. Speaking of strictlimiting though, from a policy standpoint if we think strictlimiting is needed in general in fuse (there's a thread from last year [1] about removing strict limiting), then I think that would need to apply to iomap as well, at least for unprivileged servers. [1] https://lore.kernel.org/linux-fsdevel/CAJnrk1bwat_r4+pmhaWH-ThAi+zoAJFwmJG65ANj1Zv0O0s4_A@mail.gmail.com/ [2] https://lore.kernel.org/linux-fsdevel/20251010150113.GC6174@frogsfrogsfrogs/T/#ma34ff5ae338a83f8b2e946d7e5332ea835fa0ff6 > > > This is more of a nit, but I think it'd also be nice if we > > swapped the ordering of this patch with the previous one enabling > > large folios, so that large folios gets enabled only when all the bdi > > stuff for it is ready. > > Will do, thanks for reading these patches! > > Also note that I've changed this part of the patchset quite a lot since > this posting; iomap configuration is now a completely separate fuse > command that gets triggered after the FUSE_INIT reply is received. Great, I'll look at your upstream tree then for this part. Thanks, Joanne > > --D > > > Thanks, > > Joanne > > > > > > > > /* > > > * Enable syncfs for iomap fuse servers so that we can send a final > > > > > ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCH 17/31] fuse: use an unrestricted backing device with iomap pagecache io 2026-01-27 1:35 ` Joanne Koong @ 2026-01-27 2:09 ` Darrick J. Wong 2026-01-27 18:04 ` Joanne Koong 0 siblings, 1 reply; 52+ messages in thread From: Darrick J. Wong @ 2026-01-27 2:09 UTC (permalink / raw) To: Joanne Koong; +Cc: miklos, bernd, neal, linux-ext4, linux-fsdevel On Mon, Jan 26, 2026 at 05:35:05PM -0800, Joanne Koong wrote: > On Mon, Jan 26, 2026 at 3:55 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > On Mon, Jan 26, 2026 at 02:03:35PM -0800, Joanne Koong wrote: > > > On Tue, Oct 28, 2025 at 5:49 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > > > From: Darrick J. Wong <djwong@kernel.org> > > > > > > > > With iomap support turned on for the pagecache, the kernel issues > > > > writeback to directly to block devices and we no longer have to push all > > > > those pages through the fuse device to userspace. Therefore, we don't > > > > need the tight dirty limits (~1M) that are used for regular fuse. This > > > > dramatically increases the performance of fuse's pagecache IO. > > > > > > > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org> > > > > --- > > > > fs/fuse/file_iomap.c | 21 +++++++++++++++++++++ > > > > 1 file changed, 21 insertions(+) > > > > > > > > > > > > diff --git a/fs/fuse/file_iomap.c b/fs/fuse/file_iomap.c > > > > index 0bae356045638b..a9bacaa0991afa 100644 > > > > --- a/fs/fuse/file_iomap.c > > > > +++ b/fs/fuse/file_iomap.c > > > > @@ -713,6 +713,27 @@ const struct fuse_backing_ops fuse_iomap_backing_ops = { > > > > void fuse_iomap_mount(struct fuse_mount *fm) > > > > { > > > > struct fuse_conn *fc = fm->fc; > > > > + struct super_block *sb = fm->sb; > > > > + struct backing_dev_info *old_bdi = sb->s_bdi; > > > > + char *suffix = sb->s_bdev ? "-fuseblk" : "-fuse"; > > > > + int res; > > > > + > > > > + /* > > > > + * sb->s_bdi points to the initial private bdi. 
However, we want to > > > > + * redirect it to a new private bdi with default dirty and readahead > > > > + * settings because iomap writeback won't be pushing a ton of dirty > > > > + * data through the fuse device. If this fails we fall back to the > > > > + * initial fuse bdi. > > > > + */ > > > > + sb->s_bdi = &noop_backing_dev_info; > > > > + res = super_setup_bdi_name(sb, "%u:%u%s.iomap", MAJOR(fc->dev), > > > > + MINOR(fc->dev), suffix); > > > > + if (res) { > > > > + sb->s_bdi = old_bdi; > > > > + } else { > > > > + bdi_unregister(old_bdi); > > > > + bdi_put(old_bdi); > > > > + } > > > > > > Maybe I'm missing something here, but isn't sb->s_bdi already set to > > > noop_backing_dev_info when fuse_iomap_mount() is called? > > > fuse_fill_super() -> fuse_fill_super_common() -> fuse_bdi_init() does > > > this already before the fuse_iomap_mount() call, afaict. > > > > Right. > > > > > I think what we need to do is just unset BDI_CAP_STRICTLIMIT and > > > adjust the bdi max ratio? > > > > That's sufficient to undo the effects of fuse_bdi_init, yes. However > > the BDI gets created with the name "$major:$minor{-fuseblk}" and there > > are "management" scripts that try to tweak fuse BDIs for better > > performance. > > > > I don't want some dumb script to mismanage a fuse-iomap filesystem > > because it can't tell the difference, so I create a new bdi with the > > name "$major:$minor.iomap" to make it obvious. But super_setup_bdi_name > > gets cranky if s_bdi isn't set to noop and we don't want to fail a mount > > here due to ENOMEM so ... I implemented this weird switcheroo code. > > I see. It might be useful to copy/paste this into the commit message > just for added context. I don't see a better way of doing it than what > you have in this patch then since we rely on the init reply to know > whether iomap should be used or not... I'll do that. I will also add that as soon as any BDI is created, it will be exposed to userspace in sysfs. 
That means that running the code from fuse_bdi_init in reverse will not necessarily produce the same results as a freshly created BDI. > If the new bdi setup fails, I wonder if the mount should just fail > entirely then. That seems better to me than letting it succeed with Err, which new bdi setup? If fuse-iomap can't create a new BDI, it will set s_bdi back to the old one and move on. You'll get degraded performance, but that's not the end of the world. > strictlimiting enforced, especially since large folios will be enabled > for fuse iomap. [1] has some numbers for the performance degradations > I saw for writes with strictlimiting on and large folios enabled. If fuse_bdi_init can't set up a bdi it will fail the mount. That said... from reading [1], if strictlimiting is enabled with large folios, then can we figure out what is the effective max folio size and lower it to that? > Speaking of strictlimiting though, from a policy standpoint if we > think strictlimiting is needed in general in fuse (there's a thread > from last year [1] about removing strict limiting), then I think that (did you mean [2] here?) > would need to apply to iomap as well, at least for unprivileged > servers. iomap requires a privileged server, FWIW. > [1] https://lore.kernel.org/linux-fsdevel/CAJnrk1bwat_r4+pmhaWH-ThAi+zoAJFwmJG65ANj1Zv0O0s4_A@mail.gmail.com/ > [2] https://lore.kernel.org/linux-fsdevel/20251010150113.GC6174@frogsfrogsfrogs/T/#ma34ff5ae338a83f8b2e946d7e5332ea835fa0ff6 > > > > > > This is more of a nit, but I think it'd also be nice if we > > > swapped the ordering of this patch with the previous one enabling > > > large folios, so that large folios gets enabled only when all the bdi > > > stuff for it is ready. > > > > Will do, thanks for reading these patches! 
> > > > Also note that I've changed this part of the patchset quite a lot since > > this posting; iomap configuration is now a completely separate fuse > > command that gets triggered after the FUSE_INIT reply is received. > > Great, I'll look at your upstream tree then for this part. Ok. --D > Thanks, > Joanne > > > > > --D > > > > > Thanks, > > > Joanne > > > > > > > > > > > /* > > > > * Enable syncfs for iomap fuse servers so that we can send a final > > > > > > > > ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCH 17/31] fuse: use an unrestricted backing device with iomap pagecache io 2026-01-27 2:09 ` Darrick J. Wong @ 2026-01-27 18:04 ` Joanne Koong 2026-01-27 23:37 ` Darrick J. Wong 0 siblings, 1 reply; 52+ messages in thread From: Joanne Koong @ 2026-01-27 18:04 UTC (permalink / raw) To: Darrick J. Wong; +Cc: miklos, bernd, neal, linux-ext4, linux-fsdevel On Mon, Jan 26, 2026 at 6:09 PM Darrick J. Wong <djwong@kernel.org> wrote: > > On Mon, Jan 26, 2026 at 05:35:05PM -0800, Joanne Koong wrote: > > On Mon, Jan 26, 2026 at 3:55 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > On Mon, Jan 26, 2026 at 02:03:35PM -0800, Joanne Koong wrote: > > > > On Tue, Oct 28, 2025 at 5:49 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > > > > > From: Darrick J. Wong <djwong@kernel.org> > > > > > > > > > > With iomap support turned on for the pagecache, the kernel issues > > > > > writeback to directly to block devices and we no longer have to push all > > > > > those pages through the fuse device to userspace. Therefore, we don't > > > > > need the tight dirty limits (~1M) that are used for regular fuse. This > > > > > dramatically increases the performance of fuse's pagecache IO. > > > > > > > > > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org> > > > > > --- > > > > > fs/fuse/file_iomap.c | 21 +++++++++++++++++++++ > > > > > 1 file changed, 21 insertions(+) > > > > > > > > > > > > > > > diff --git a/fs/fuse/file_iomap.c b/fs/fuse/file_iomap.c > > > > > index 0bae356045638b..a9bacaa0991afa 100644 > > > > > --- a/fs/fuse/file_iomap.c > > > > > +++ b/fs/fuse/file_iomap.c > > > > > @@ -713,6 +713,27 @@ const struct fuse_backing_ops fuse_iomap_backing_ops = { > > > > > void fuse_iomap_mount(struct fuse_mount *fm) > > > > > { > > > > > struct fuse_conn *fc = fm->fc; > > > > > + struct super_block *sb = fm->sb; > > > > > + struct backing_dev_info *old_bdi = sb->s_bdi; > > > > > + char *suffix = sb->s_bdev ? 
"-fuseblk" : "-fuse"; > > > > > + int res; > > > > > + > > > > > + /* > > > > > + * sb->s_bdi points to the initial private bdi. However, we want to > > > > > + * redirect it to a new private bdi with default dirty and readahead > > > > > + * settings because iomap writeback won't be pushing a ton of dirty > > > > > + * data through the fuse device. If this fails we fall back to the > > > > > + * initial fuse bdi. > > > > > + */ > > > > > + sb->s_bdi = &noop_backing_dev_info; > > > > > + res = super_setup_bdi_name(sb, "%u:%u%s.iomap", MAJOR(fc->dev), > > > > > + MINOR(fc->dev), suffix); > > > > > + if (res) { > > > > > + sb->s_bdi = old_bdi; > > > > > + } else { > > > > > + bdi_unregister(old_bdi); > > > > > + bdi_put(old_bdi); > > > > > + } > > > > > > > > Maybe I'm missing something here, but isn't sb->s_bdi already set to > > > > noop_backing_dev_info when fuse_iomap_mount() is called? > > > > fuse_fill_super() -> fuse_fill_super_common() -> fuse_bdi_init() does > > > > this already before the fuse_iomap_mount() call, afaict. > > > > > > Right. > > > > > > > I think what we need to do is just unset BDI_CAP_STRICTLIMIT and > > > > adjust the bdi max ratio? > > > > > > That's sufficient to undo the effects of fuse_bdi_init, yes. However > > > the BDI gets created with the name "$major:$minor{-fuseblk}" and there > > > are "management" scripts that try to tweak fuse BDIs for better > > > performance. > > > > > > I don't want some dumb script to mismanage a fuse-iomap filesystem > > > because it can't tell the difference, so I create a new bdi with the > > > name "$major:$minor.iomap" to make it obvious. But super_setup_bdi_name > > > gets cranky if s_bdi isn't set to noop and we don't want to fail a mount > > > here due to ENOMEM so ... I implemented this weird switcheroo code. > > > > I see. It might be useful to copy/paste this into the commit message > > just for added context. 
I don't see a better way of doing it than what > > you have in this patch then since we rely on the init reply to know > > whether iomap should be used or not... > > I'll do that. I will also add that as soon as any BDI is created, it > will be exposed to userspace in sysfs. That means that running the code > from fuse_bdi_init in reverse will not necessarily produce the same > results as a freshly created BDI. > > > If the new bdi setup fails, I wonder if the mount should just fail > > entirely then. That seems better to me than letting it succeed with > > Err, which new bdi setup? If fuse-iomap can't create a new BDI, it will > set s_bdi back to the old one and move on. You'll get degraded > performance, but that's not the end of the world. I was thinking from the user POV, I'd rather the whole mount fail (which it seems like would only be a transient failure, eg running out of memory) and I retry, than it work but have writes potentially run 10x slower (10x comes from the benchmarks Jingbo saw in [1]) > > > strictlimiting enforced, especially since large folios will be enabled > > for fuse iomap. [1] has some numbers for the performance degradations > > I saw for writes with strictlimiting on and large folios enabled. > > If fuse_bdi_init can't set up a bdi it will fail the mount. > > That said... from reading [1], if strictlimiting is enabled with large > folios, then can we figure out what is the effective max folio size and > lower it to that? I'm not really sure how we figure that out, unless I guess we try to do it experimentally? The throttling logic for this is in balance_dirty_pages(). > > > Speaking of strictlimiting though, from a policy standpoint if we > > think strictlimiting is needed in general in fuse (there's a thread > > from last year [1] about removing strict limiting), then I think that > > (did you mean [2] here?) Ah yes sorry, I had meant [2]. > > > would need to apply to iomap as well, at least for unprivileged > > servers.
> > iomap requires a privileged server, FWIW. Oh right, I forgot iomap only runs with privileges enabled. In that case, I think that makes the whole strictlimiting thing a lot simpler then. imo for privileged servers, we should get rid of strictlimiting entirely. Though I'm not sure how Miklos feels about that. Thanks, Joanne > > > [1] https://lore.kernel.org/linux-fsdevel/CAJnrk1bwat_r4+pmhaWH-ThAi+zoAJFwmJG65ANj1Zv0O0s4_A@mail.gmail.com/ > > [2] https://lore.kernel.org/linux-fsdevel/20251010150113.GC6174@frogsfrogsfrogs/T/#ma34ff5ae338a83f8b2e946d7e5332ea835fa0ff6 > > > > > > > > > This is more of a nit, but I think it'd also be nice if we > > > > swapped the ordering of this patch with the previous one enabling > > > > large folios, so that large folios gets enabled only when all the bdi > > > > stuff for it is ready. > > > > > > Will do, thanks for reading these patches! > > > > > > Also note that I've changed this part of the patchset quite a lot since > > > this posting; iomap configuration is now a completely separate fuse > > > command that gets triggered after the FUSE_INIT reply is received. > > > > Great, I'll look at your upstream tree then for this part. > > Ok. > > --D > > > Thanks, > > Joanne > > > > > > > > --D > > > > > > > Thanks, > > > > Joanne > > > > > > > > > > > > > > /* > > > > > * Enable syncfs for iomap fuse servers so that we can send a final > > > > > > > > > > > ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCH 17/31] fuse: use an unrestricted backing device with iomap pagecache io 2026-01-27 18:04 ` Joanne Koong @ 2026-01-27 23:37 ` Darrick J. Wong 0 siblings, 0 replies; 52+ messages in thread From: Darrick J. Wong @ 2026-01-27 23:37 UTC (permalink / raw) To: Joanne Koong; +Cc: miklos, bernd, neal, linux-ext4, linux-fsdevel On Tue, Jan 27, 2026 at 10:04:28AM -0800, Joanne Koong wrote: > On Mon, Jan 26, 2026 at 6:09 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > On Mon, Jan 26, 2026 at 05:35:05PM -0800, Joanne Koong wrote: > > > On Mon, Jan 26, 2026 at 3:55 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > > > On Mon, Jan 26, 2026 at 02:03:35PM -0800, Joanne Koong wrote: > > > > > On Tue, Oct 28, 2025 at 5:49 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > > > > > > > From: Darrick J. Wong <djwong@kernel.org> > > > > > > > > > > > > With iomap support turned on for the pagecache, the kernel issues > > > > > > writeback to directly to block devices and we no longer have to push all > > > > > > those pages through the fuse device to userspace. Therefore, we don't > > > > > > need the tight dirty limits (~1M) that are used for regular fuse. This > > > > > > dramatically increases the performance of fuse's pagecache IO. > > > > > > > > > > > > Signed-off-by: "Darrick J. 
Wong" <djwong@kernel.org> > > > > > > --- > > > > > > fs/fuse/file_iomap.c | 21 +++++++++++++++++++++ > > > > > > 1 file changed, 21 insertions(+) > > > > > > > > > > > > > > > > > > diff --git a/fs/fuse/file_iomap.c b/fs/fuse/file_iomap.c > > > > > > index 0bae356045638b..a9bacaa0991afa 100644 > > > > > > --- a/fs/fuse/file_iomap.c > > > > > > +++ b/fs/fuse/file_iomap.c > > > > > > @@ -713,6 +713,27 @@ const struct fuse_backing_ops fuse_iomap_backing_ops = { > > > > > > void fuse_iomap_mount(struct fuse_mount *fm) > > > > > > { > > > > > > struct fuse_conn *fc = fm->fc; > > > > > > + struct super_block *sb = fm->sb; > > > > > > + struct backing_dev_info *old_bdi = sb->s_bdi; > > > > > > + char *suffix = sb->s_bdev ? "-fuseblk" : "-fuse"; > > > > > > + int res; > > > > > > + > > > > > > + /* > > > > > > + * sb->s_bdi points to the initial private bdi. However, we want to > > > > > > + * redirect it to a new private bdi with default dirty and readahead > > > > > > + * settings because iomap writeback won't be pushing a ton of dirty > > > > > > + * data through the fuse device. If this fails we fall back to the > > > > > > + * initial fuse bdi. > > > > > > + */ > > > > > > + sb->s_bdi = &noop_backing_dev_info; > > > > > > + res = super_setup_bdi_name(sb, "%u:%u%s.iomap", MAJOR(fc->dev), > > > > > > + MINOR(fc->dev), suffix); > > > > > > + if (res) { > > > > > > + sb->s_bdi = old_bdi; > > > > > > + } else { > > > > > > + bdi_unregister(old_bdi); > > > > > > + bdi_put(old_bdi); > > > > > > + } > > > > > > > > > > Maybe I'm missing something here, but isn't sb->s_bdi already set to > > > > > noop_backing_dev_info when fuse_iomap_mount() is called? > > > > > fuse_fill_super() -> fuse_fill_super_common() -> fuse_bdi_init() does > > > > > this already before the fuse_iomap_mount() call, afaict. > > > > > > > > Right. > > > > > > > > > I think what we need to do is just unset BDI_CAP_STRICTLIMIT and > > > > > adjust the bdi max ratio? 
> > > > > > > > That's sufficient to undo the effects of fuse_bdi_init, yes. However > > > > the BDI gets created with the name "$major:$minor{-fuseblk}" and there > > > > are "management" scripts that try to tweak fuse BDIs for better > > > > performance. > > > > > > > > I don't want some dumb script to mismanage a fuse-iomap filesystem > > > > because it can't tell the difference, so I create a new bdi with the > > > > name "$major:$minor.iomap" to make it obvious. But super_setup_bdi_name > > > > gets cranky if s_bdi isn't set to noop and we don't want to fail a mount > > > > here due to ENOMEM so ... I implemented this weird switcheroo code. > > > > > > I see. It might be useful to copy/paste this into the commit message > > > just for added context. I don't see a better way of doing it than what > > > you have in this patch then since we rely on the init reply to know > > > whether iomap should be used or not... > > > > I'll do that. I will also add that as soon as any BDI is created, it > > will be exposed to userspace in sysfs. That means that running the code > > from fuse_bdi_init in reverse will not necessarily produce the same > > results as a freshly created BDI. > > > > > If the new bdi setup fails, I wonder if the mount should just fail > > > entirely then. That seems better to me than letting it succeed with > > > > Err, which new bdi setup? If fuse-iomap can't create a new BDI, it will > > set s_bdi back to the old one and move on. You'll get degraded > > performance, but that's not the end of the world. > > I was thinking from the user POV, I'd rather the whole mount fail > (which it seems like would only be a transient failure, eg running out > of memory) and I retry, than it work but have writes potentially run > 10x slower (10x comes from the benchmarks Jingbo saw in [1]) Hrmm. 
The difficulty of preallocating the iomap bdi is that I think you'd have to do it in fuse_bdi_init, which occurs before the kernel has even seen the reply to FUSE_INIT and therefore knows if the fuse server even cares about iomap. > > > strictlimiting enforced, especially since large folios will be enabled > > > for fuse iomap. [1] has some numbers for the performance degradations > > > I saw for writes with strictlimiting on and large folios enabled. > > > > If fuse_bdi_init can't set up a bdi it will fail the mount. > > > > That said... from reading [1], if strictlimiting is enabled with large > > folios, then can we figure out what is the effective max folio size and > > lower it to that? > > I'm not really sure how we figure that out, unless I guess we try to > do it experimentally? The throttling logic for this is in > balance_dirty_pages(). Oh, I see, it's a dynamic limit. I don't know how to deal with that either. > > > Speaking of strictlimiting though, from a policy standpoint if we > > > think strictlimiting is needed in general in fuse (there's a thread > > > from last year [1] about removing strict limiting), then I think that > > > > (did you mean [2] here?) > > Ah yes sorry, I had meant [2]. > > > > > would need to apply to iomap as well, at least for unprivileged > > > servers. > > > > iomap requires a privileged server, FWIW. > > Oh right, I forgot iomap only runs with privileges enabled. In that > case, I think that makes the whole strictlimiting thing a lot simpler > then. imo for privileged servers, we should get rid of strictlimiting > entirely. Though I'm not sure how MIklos feels about that. 
<nod> --D > Thanks, > Joanne > > > > > > [1] https://lore.kernel.org/linux-fsdevel/CAJnrk1bwat_r4+pmhaWH-ThAi+zoAJFwmJG65ANj1Zv0O0s4_A@mail.gmail.com/ > > > [2] https://lore.kernel.org/linux-fsdevel/20251010150113.GC6174@frogsfrogsfrogs/T/#ma34ff5ae338a83f8b2e946d7e5332ea835fa0ff6 > > > > > > > > > > > > This is more of a nit, but I think it'd also be nice if we > > > > > swapped the ordering of this patch with the previous one enabling > > > > > large folios, so that large folios gets enabled only when all the bdi > > > > > stuff for it is ready. > > > > > > > > Will do, thanks for reading these patches! > > > > > > > > Also note that I've changed this part of the patchset quite a lot since > > > > this posting; iomap configuration is now a completely separate fuse > > > > command that gets triggered after the FUSE_INIT reply is received. > > > > > > Great, I'll look at your upstream tree then for this part. > > > > Ok. > > > > --D > > > > > Thanks, > > > Joanne > > > > > > > > > > > --D > > > > > > > > > Thanks, > > > > > Joanne > > > > > > > > > > > > > > > > > /* > > > > > > * Enable syncfs for iomap fuse servers so that we can send a final > > > > > > > > > > > > > > > ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCHSET v6 4/8] fuse: allow servers to use iomap for better file IO performance [not found] ` <176169810144.1424854.11439355400009006946.stgit@frogsfrogsfrogs> ` (5 preceding siblings ...) [not found] ` <176169810721.1424854.6150447623894591900.stgit@frogsfrogsfrogs> @ 2026-01-27 0:59 ` Joanne Koong 2026-01-27 2:22 ` Darrick J. Wong [not found] ` <176169810980.1424854.10557015500766654898.stgit@frogsfrogsfrogs> ` (5 subsequent siblings) 12 siblings, 1 reply; 52+ messages in thread From: Joanne Koong @ 2026-01-27 0:59 UTC (permalink / raw) To: Darrick J. Wong; +Cc: miklos, bernd, neal, linux-ext4, linux-fsdevel On Tue, Oct 28, 2025 at 5:38 PM Darrick J. Wong <djwong@kernel.org> wrote: > > Hi all, > > This series connects fuse (the userspace filesystem layer) to fs-iomap > to get fuse servers out of the business of handling file I/O themselves. > By keeping the IO path mostly within the kernel, we can dramatically > improve the speed of disk-based filesystems. This enables us to move > all the filesystem metadata parsing code out of the kernel and into > userspace, which means that we can containerize them for security > without losing a lot of performance. I haven't looked through how the fuse2fs or fuse4fs servers are implemented yet (also, could you explain the difference between the two? Which one should we look at to see how it all ties together?), but I wonder if having bpf infrastructure hooked up to fuse would be especially helpful for what you're doing here with fuse iomap. afaict, every read/write whether it's buffered or direct will incur at least 1 call to ->iomap_begin() to get the mapping metadata, which will be 2 context-switches (and if the server has ->iomap_end() implemented, then 2 more context-switches). But it seems like the logic for retrieving mapping offsets/lengths/metadata should be pretty straightforward? 
If the extent lookups are table lookups or tree traversals without complex side effects, then having ->iomap_begin()/->iomap_end() be executed as a bpf program would avoid the context switches and allow all the caching logic to be moved from the kernel to the server-side (eg using bpf maps). Is this your assessment of it as well or do you think the server-side logic for iomap_begin()/iomap_end() is too complicated to make this realistic? Asking because I'm curious whether this direction makes sense, not because I think it would be a blocker for your series. Thanks, Joanne > > If you're going to start using this code, I strongly recommend pulling > from my git trees, which are linked below. > > This has been running on the djcloud for months with no problems. Enjoy! > Comments and questions are, as always, welcome. > > --D > > kernel git tree: > https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-iomap-fileio > --- > Commits in this patchset: > * fuse: implement the basic iomap mechanisms > * fuse_trace: implement the basic iomap mechanisms > * fuse: make debugging configurable at runtime > * fuse: adapt FUSE_DEV_IOC_BACKING_{OPEN,CLOSE} to add new iomap devices > * fuse_trace: adapt FUSE_DEV_IOC_BACKING_{OPEN,CLOSE} to add new iomap devices > * fuse: flush events and send FUSE_SYNCFS and FUSE_DESTROY on unmount > * fuse: create a per-inode flag for toggling iomap > * fuse_trace: create a per-inode flag for toggling iomap > * fuse: isolate the other regular file IO paths from iomap > * fuse: implement basic iomap reporting such as FIEMAP and SEEK_{DATA,HOLE} > * fuse_trace: implement basic iomap reporting such as FIEMAP and SEEK_{DATA,HOLE} > * fuse: implement direct IO with iomap > * fuse_trace: implement direct IO with iomap > * fuse: implement buffered IO with iomap > * fuse_trace: implement buffered IO with iomap > * fuse: implement large folios for iomap pagecache files > * fuse: use an unrestricted backing device with iomap pagecache 
io > * fuse: advertise support for iomap > * fuse: query filesystem geometry when using iomap > * fuse_trace: query filesystem geometry when using iomap > * fuse: implement fadvise for iomap files > * fuse: invalidate ranges of block devices being used for iomap > * fuse_trace: invalidate ranges of block devices being used for iomap > * fuse: implement inline data file IO via iomap > * fuse_trace: implement inline data file IO via iomap > * fuse: allow more statx fields > * fuse: support atomic writes with iomap > * fuse_trace: support atomic writes with iomap > * fuse: disable direct reclaim for any fuse server that uses iomap > * fuse: enable swapfile activation on iomap > * fuse: implement freeze and shutdowns for iomap filesystems > --- > fs/fuse/fuse_i.h | 161 +++ > fs/fuse/fuse_trace.h | 939 +++++++++++++++++++ > fs/fuse/iomap_i.h | 52 + > include/uapi/linux/fuse.h | 219 ++++ > fs/fuse/Kconfig | 48 + > fs/fuse/Makefile | 1 > fs/fuse/backing.c | 12 > fs/fuse/dev.c | 30 + > fs/fuse/dir.c | 120 ++ > fs/fuse/file.c | 133 ++- > fs/fuse/file_iomap.c | 2230 +++++++++++++++++++++++++++++++++++++++++++++ > fs/fuse/inode.c | 162 +++ > fs/fuse/iomode.c | 2 > fs/fuse/trace.c | 2 > 14 files changed, 4056 insertions(+), 55 deletions(-) > create mode 100644 fs/fuse/iomap_i.h > create mode 100644 fs/fuse/file_iomap.c > ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCHSET v6 4/8] fuse: allow servers to use iomap for better file IO performance 2026-01-27 0:59 ` [PATCHSET v6 4/8] fuse: allow servers to use iomap for better file IO performance Joanne Koong @ 2026-01-27 2:22 ` Darrick J. Wong 2026-01-27 19:47 ` Joanne Koong 0 siblings, 1 reply; 52+ messages in thread From: Darrick J. Wong @ 2026-01-27 2:22 UTC (permalink / raw) To: Joanne Koong; +Cc: miklos, bernd, neal, linux-ext4, linux-fsdevel On Mon, Jan 26, 2026 at 04:59:16PM -0800, Joanne Koong wrote: > On Tue, Oct 28, 2025 at 5:38 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > Hi all, > > > > This series connects fuse (the userspace filesystem layer) to fs-iomap > > to get fuse servers out of the business of handling file I/O themselves. > > By keeping the IO path mostly within the kernel, we can dramatically > > improve the speed of disk-based filesystems. This enables us to move > > all the filesystem metadata parsing code out of the kernel and into > > userspace, which means that we can containerize them for security > > without losing a lot of performance. > > I haven't looked through how the fuse2fs or fuse4fs servers are > implemented yet (also, could you explain the difference between the > two? Which one should we look at to see how it all ties together?), fuse4fs is a lowlevel fuse server; fuse2fs is a high(?) level fuse server. fuse4fs is the successor to fuse2fs, at least on Linux and BSD. > but I wonder if having bpf infrastructure hooked up to fuse would be > especially helpful for what you're doing here with fuse iomap. afaict, > every read/write whether it's buffered or direct will incur at least 1 > call to ->iomap_begin() to get the mapping metadata, which will be 2 > context-switches (and if the server has ->iomap_end() implemented, > then 2 more context-switches). Yes, I agree that's a lot of context switching for file IO... > But it seems like the logic for retrieving mapping > offsets/lengths/metadata should be pretty straightforward? 
...but it gets very cheap if the fuse server can cache mappings in the kernel to avoid all that. That is, incidentally, what patchset #7 implements. https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-iomap-cache_2026-01-22 > If the extent lookups are table lookups or tree > traversals without complex side effects, then having > ->iomap_begin()/->iomap_end() be executed as a bpf program would avoid > the context switches and allow all the caching logic to be moved from > the kernel to the server-side (eg using bpf maps). Hrmm. Now that /is/ an interesting proposal. Does BPF have a data structure that supports interval mappings? I think the existing bpf map only does key -> value. Also, is there an upper limit on the size of a map? You could have hundreds of millions of maps for a very fragmented regular file. At one point I suggested to the famfs maintainer that it might be easier/better to implement the interleaved mapping lookups as bpf programs instead of being stuck with a fixed format in the fuse userspace abi, but I don't know if he ever implemented that. > Is this your > assessment of it as well or do you think the server-side logic for > iomap_begin()/iomap_end() is too complicated to make this realistic? > Asking because I'm curious whether this direction makes sense, not > because I think it would be a blocker for your series. For disk-based filesystems I think it would be difficult to model a bpf program to do mappings, since they can basically point anywhere and be of any size. OTOH it would be enormously hilarious to me if one could load a file mapping predictive model into the kernel as a bpf program and use that as a first tier before checking the in-memory btree mapping cache from patchset 7. Quite a few years ago now there was a FAST paper establishing that even a stupid linear regression model could in theory beat a disk btree lookup. 
--D > Thanks, > Joanne > > > > > If you're going to start using this code, I strongly recommend pulling > > from my git trees, which are linked below. > > > > This has been running on the djcloud for months with no problems. Enjoy! > > Comments and questions are, as always, welcome. > > > > --D > > > > kernel git tree: > > https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-iomap-fileio > > --- > > Commits in this patchset: > > * fuse: implement the basic iomap mechanisms > > * fuse_trace: implement the basic iomap mechanisms > > * fuse: make debugging configurable at runtime > > * fuse: adapt FUSE_DEV_IOC_BACKING_{OPEN,CLOSE} to add new iomap devices > > * fuse_trace: adapt FUSE_DEV_IOC_BACKING_{OPEN,CLOSE} to add new iomap devices > > * fuse: flush events and send FUSE_SYNCFS and FUSE_DESTROY on unmount > > * fuse: create a per-inode flag for toggling iomap > > * fuse_trace: create a per-inode flag for toggling iomap > > * fuse: isolate the other regular file IO paths from iomap > > * fuse: implement basic iomap reporting such as FIEMAP and SEEK_{DATA,HOLE} > > * fuse_trace: implement basic iomap reporting such as FIEMAP and SEEK_{DATA,HOLE} > > * fuse: implement direct IO with iomap > > * fuse_trace: implement direct IO with iomap > > * fuse: implement buffered IO with iomap > > * fuse_trace: implement buffered IO with iomap > > * fuse: implement large folios for iomap pagecache files > > * fuse: use an unrestricted backing device with iomap pagecache io > > * fuse: advertise support for iomap > > * fuse: query filesystem geometry when using iomap > > * fuse_trace: query filesystem geometry when using iomap > > * fuse: implement fadvise for iomap files > > * fuse: invalidate ranges of block devices being used for iomap > > * fuse_trace: invalidate ranges of block devices being used for iomap > > * fuse: implement inline data file IO via iomap > > * fuse_trace: implement inline data file IO via iomap > > * fuse: allow more statx fields > 
> * fuse: support atomic writes with iomap > > * fuse_trace: support atomic writes with iomap > > * fuse: disable direct reclaim for any fuse server that uses iomap > > * fuse: enable swapfile activation on iomap > > * fuse: implement freeze and shutdowns for iomap filesystems > > --- > > fs/fuse/fuse_i.h | 161 +++ > > fs/fuse/fuse_trace.h | 939 +++++++++++++++++++ > > fs/fuse/iomap_i.h | 52 + > > include/uapi/linux/fuse.h | 219 ++++ > > fs/fuse/Kconfig | 48 + > > fs/fuse/Makefile | 1 > > fs/fuse/backing.c | 12 > > fs/fuse/dev.c | 30 + > > fs/fuse/dir.c | 120 ++ > > fs/fuse/file.c | 133 ++- > > fs/fuse/file_iomap.c | 2230 +++++++++++++++++++++++++++++++++++++++++++++ > > fs/fuse/inode.c | 162 +++ > > fs/fuse/iomode.c | 2 > > fs/fuse/trace.c | 2 > > 14 files changed, 4056 insertions(+), 55 deletions(-) > > create mode 100644 fs/fuse/iomap_i.h > > create mode 100644 fs/fuse/file_iomap.c > > > ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCHSET v6 4/8] fuse: allow servers to use iomap for better file IO performance 2026-01-27 2:22 ` Darrick J. Wong @ 2026-01-27 19:47 ` Joanne Koong 2026-01-27 23:21 ` Darrick J. Wong 0 siblings, 1 reply; 52+ messages in thread From: Joanne Koong @ 2026-01-27 19:47 UTC (permalink / raw) To: Darrick J. Wong; +Cc: miklos, bernd, neal, linux-ext4, linux-fsdevel On Mon, Jan 26, 2026 at 6:22 PM Darrick J. Wong <djwong@kernel.org> wrote: > > On Mon, Jan 26, 2026 at 04:59:16PM -0800, Joanne Koong wrote: > > On Tue, Oct 28, 2025 at 5:38 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > Hi all, > > > > > > This series connects fuse (the userspace filesystem layer) to fs-iomap > > > to get fuse servers out of the business of handling file I/O themselves. > > > By keeping the IO path mostly within the kernel, we can dramatically > > > improve the speed of disk-based filesystems. This enables us to move > > > all the filesystem metadata parsing code out of the kernel and into > > > userspace, which means that we can containerize them for security > > > without losing a lot of performance. > > > > I haven't looked through how the fuse2fs or fuse4fs servers are > > implemented yet (also, could you explain the difference between the > > two? Which one should we look at to see how it all ties together?), > > fuse4fs is a lowlevel fuse server; fuse2fs is a high(?) level fuse > server. fuse4fs is the successor to fuse2fs, at least on Linux and BSD. Ah I see, thanks for the explanation. In that case, I'll just look at fuse4fs then. > > > but I wonder if having bpf infrastructure hooked up to fuse would be > > especially helpful for what you're doing here with fuse iomap. afaict, > > every read/write whether it's buffered or direct will incur at least 1 > > call to ->iomap_begin() to get the mapping metadata, which will be 2 > > context-switches (and if the server has ->iomap_end() implemented, > > then 2 more context-switches).
> > Yes, I agree that's a lot of context switching for file IO... > > > But it seems like the logic for retrieving mapping > > offsets/lengths/metadata should be pretty straightforward? > > ...but it gets very cheap if the fuse server can cache mappings in the > kernel to avoid all that. That is, incidentally, what patchset #7 > implements. > > https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-iomap-cache_2026-01-22 > > > If the extent lookups are table lookups or tree > > traversals without complex side effects, then having > > ->iomap_begin()/->iomap_end() be executed as a bpf program would avoid > > the context switches and allow all the caching logic to be moved from > > the kernel to the server-side (eg using bpf maps). > > Hrmm. Now that /is/ an interesting proposal. Does BPF have a data > structure that supports interval mappings? I think the existing bpf map Not yet but I don't see why a b+ tree like data structure couldn't be added. Maybe one workaround in the meantime that could work is using a sorted array map and doing binary search on that, until interval mappings can be natively supported? > only does key -> value. Also, is there an upper limit on the size of a > map? You could have hundreds of millions of maps for a very fragmented > regular file. If I'm remembering correctly, there's an upper limit on the number of map entries, which is bounded by u32 > > At one point I suggested to the famfs maintainer that it might be > easier/better to implement the interleaved mapping lookups as bpf > programs instead of being stuck with a fixed format in the fuse > userspace abi, but I don't know if he ever implemented that. This seems like a good use case for it too > > > Is this your > > assessment of it as well or do you think the server-side logic for > > iomap_begin()/iomap_end() is too complicated to make this realistic? 
> > Asking because I'm curious whether this direction makes sense, not > > because I think it would be a blocker for your series. > > For disk-based filesystems I think it would be difficult to model a bpf > program to do mappings, since they can basically point anywhere and be > of any size. Hmm I'm not familiar enough with disk-based filesystems to know what the "point anywhere and be of any size" means. For the mapping stuff, doesn't it just point to a block number? Or are you saying the problem would be there's too many mappings since a mapping could be any size? I was thinking the issue would be more that there might be other logic inside ->iomap_begin()/->iomap_end() besides the mapping stuff that would need to be done that would be too out-of-scope for bpf. But I think I need to read through the fuse4fs stuff to understand more what it's doing in those functions. Thanks, Joanne > > OTOH it would be enormously hilarious to me if one could load a file > mapping predictive model into the kernel as a bpf program and use that > as a first tier before checking the in-memory btree mapping cache from > patchset 7. Quite a few years ago now there was a FAST paper > establishing that even a stupid linear regression model could in theory > beat a disk btree lookup. > > --D > > > Thanks, > > Joanne > > > > > > > > If you're going to start using this code, I strongly recommend pulling > > > from my git trees, which are linked below. > > > > > > This has been running on the djcloud for months with no problems. Enjoy! > > > Comments and questions are, as always, welcome. 
> > > > > > --D > > > > > > kernel git tree: > > > https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-iomap-fileio > > > --- > > > Commits in this patchset: > > > * fuse: implement the basic iomap mechanisms > > > * fuse_trace: implement the basic iomap mechanisms > > > * fuse: make debugging configurable at runtime > > > * fuse: adapt FUSE_DEV_IOC_BACKING_{OPEN,CLOSE} to add new iomap devices > > > * fuse_trace: adapt FUSE_DEV_IOC_BACKING_{OPEN,CLOSE} to add new iomap devices > > > * fuse: flush events and send FUSE_SYNCFS and FUSE_DESTROY on unmount > > > * fuse: create a per-inode flag for toggling iomap > > > * fuse_trace: create a per-inode flag for toggling iomap > > > * fuse: isolate the other regular file IO paths from iomap > > > * fuse: implement basic iomap reporting such as FIEMAP and SEEK_{DATA,HOLE} > > > * fuse_trace: implement basic iomap reporting such as FIEMAP and SEEK_{DATA,HOLE} > > > * fuse: implement direct IO with iomap > > > * fuse_trace: implement direct IO with iomap > > > * fuse: implement buffered IO with iomap > > > * fuse_trace: implement buffered IO with iomap > > > * fuse: implement large folios for iomap pagecache files > > > * fuse: use an unrestricted backing device with iomap pagecache io > > > * fuse: advertise support for iomap > > > * fuse: query filesystem geometry when using iomap > > > * fuse_trace: query filesystem geometry when using iomap > > > * fuse: implement fadvise for iomap files > > > * fuse: invalidate ranges of block devices being used for iomap > > > * fuse_trace: invalidate ranges of block devices being used for iomap > > > * fuse: implement inline data file IO via iomap > > > * fuse_trace: implement inline data file IO via iomap > > > * fuse: allow more statx fields > > > * fuse: support atomic writes with iomap > > > * fuse_trace: support atomic writes with iomap > > > * fuse: disable direct reclaim for any fuse server that uses iomap > > > * fuse: enable swapfile activation on 
iomap > > > * fuse: implement freeze and shutdowns for iomap filesystems > > > --- > > > fs/fuse/fuse_i.h | 161 +++ > > > fs/fuse/fuse_trace.h | 939 +++++++++++++++++++ > > > fs/fuse/iomap_i.h | 52 + > > > include/uapi/linux/fuse.h | 219 ++++ > > > fs/fuse/Kconfig | 48 + > > > fs/fuse/Makefile | 1 > > > fs/fuse/backing.c | 12 > > > fs/fuse/dev.c | 30 + > > > fs/fuse/dir.c | 120 ++ > > > fs/fuse/file.c | 133 ++- > > > fs/fuse/file_iomap.c | 2230 +++++++++++++++++++++++++++++++++++++++++++++ > > > fs/fuse/inode.c | 162 +++ > > > fs/fuse/iomode.c | 2 > > > fs/fuse/trace.c | 2 > > > 14 files changed, 4056 insertions(+), 55 deletions(-) > > > create mode 100644 fs/fuse/iomap_i.h > > > create mode 100644 fs/fuse/file_iomap.c > > > > > ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCHSET v6 4/8] fuse: allow servers to use iomap for better file IO performance 2026-01-27 19:47 ` Joanne Koong @ 2026-01-27 23:21 ` Darrick J. Wong 2026-01-28 0:10 ` Joanne Koong 0 siblings, 1 reply; 52+ messages in thread From: Darrick J. Wong @ 2026-01-27 23:21 UTC (permalink / raw) To: Joanne Koong; +Cc: miklos, bernd, neal, linux-ext4, linux-fsdevel On Tue, Jan 27, 2026 at 11:47:31AM -0800, Joanne Koong wrote: > On Mon, Jan 26, 2026 at 6:22 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > On Mon, Jan 26, 2026 at 04:59:16PM -0800, Joanne Koong wrote: > > > On Tue, Oct 28, 2025 at 5:38 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > > > Hi all, > > > > > > > > This series connects fuse (the userspace filesystem layer) to fs-iomap > > > > to get fuse servers out of the business of handling file I/O themselves. > > > > By keeping the IO path mostly within the kernel, we can dramatically > > > > improve the speed of disk-based filesystems. This enables us to move > > > > all the filesystem metadata parsing code out of the kernel and into > > > > userspace, which means that we can containerize them for security > > > > without losing a lot of performance. > > > > > > I haven't looked through how the fuse2fs or fuse4fs servers are > > > implemented yet (also, could you explain the difference between the > > > two? Which one should we look at to see how it all ties together?), > > > > fuse4fs is a lowlevel fuse server; fuse2fs is a high(?) level fuse > > server. fuse4fs is the successor to fuse2fs, at least on Linux and BSD. > > Ah I see, thanks for the explanation. In that case, I'll just look at > fuse4fs then. > > > > > > but I wonder if having bpf infrastructure hooked up to fuse would be > > > especially helpful for what you're doing here with fuse iomap. 
afaict, > > > every read/write whether it's buffered or direct will incur at least 1 > > > call to ->iomap_begin() to get the mapping metadata, which will be 2 > > > context-switches (and if the server has ->iomap_end() implemented, > > > then 2 more context-switches). > > > > Yes, I agree that's a lot of context switching for file IO... > > > > > But it seems like the logic for retrieving mapping > > > offsets/lengths/metadata should be pretty straightforward? > > > > ...but it gets very cheap if the fuse server can cache mappings in the > > kernel to avoid all that. That is, incidentally, what patchset #7 > > implements. > > > > https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-iomap-cache_2026-01-22 > > > > > If the extent lookups are table lookups or tree > > > traversals without complex side effects, then having > > > ->iomap_begin()/->iomap_end() be executed as a bpf program would avoid > > > the context switches and allow all the caching logic to be moved from > > > the kernel to the server-side (eg using bpf maps). > > > > Hrmm. Now that /is/ an interesting proposal. Does BPF have a data > > structure that supports interval mappings? I think the existing bpf map > > Not yet but I don't see why a b+ tree like data strucutre couldn't be added. > Maybe one workaround in the meantime that could work is using a sorted > array map and doing binary search on that, until interval mappings can > be natively supported? I guess, though I already had a C structure to borrow from xfs ;) > > only does key -> value. Also, is there an upper limit on the size of a > > map? You could have hundreds of millions of maps for a very fragmented > > regular file. > > If I'm remembering correctly, there's an upper limit on the number of > map entries, which is bounded by u32 That's problematic, since files can have 64-bit logical block numbers. 
> > At one point I suggested to the famfs maintainer that it might be > > easier/better to implement the interleaved mapping lookups as bpf > > programs instead of being stuck with a fixed format in the fuse > > userspace abi, but I don't know if he ever implemented that. > > This seems like a good use case for it too > > > > > Is this your > > > assessment of it as well or do you think the server-side logic for > > > iomap_begin()/iomap_end() is too complicated to make this realistic? > > > Asking because I'm curious whether this direction makes sense, not > > > because I think it would be a blocker for your series. > > > > For disk-based filesystems I think it would be difficult to model a bpf > > program to do mappings, since they can basically point anywhere and be > > of any size. > > Hmm I'm not familiar enough with disk-based filesystems to know what > the "point anywhere and be of any size" means. For the mapping stuff, > doesn't it just point to a block number? Or are you saying the problem > would be there's too many mappings since a mapping could be any size? The second -- mappings can be any size, and unprivileged userspace can control the mappings. > I was thinking the issue would be more that there might be other logic > inside ->iomap_begin()/->iomap_end() besides the mapping stuff that > would need to be done that would be too out-of-scope for bpf. But I > think I need to read through the fuse4fs stuff to understand more what > it's doing in those functions. <nod> --D > > Thanks, > Joanne > > > > > OTOH it would be enormously hilarious to me if one could load a file > > mapping predictive model into the kernel as a bpf program and use that > > as a first tier before checking the in-memory btree mapping cache from > > patchset 7. Quite a few years ago now there was a FAST paper > > establishing that even a stupid linear regression model could in theory > > beat a disk btree lookup. 
> > > > --D > > > > > Thanks, > > > Joanne > > > > > > > > > > > If you're going to start using this code, I strongly recommend pulling > > > > from my git trees, which are linked below. > > > > > > > > This has been running on the djcloud for months with no problems. Enjoy! > > > > Comments and questions are, as always, welcome. > > > > > > > > --D > > > > > > > > kernel git tree: > > > > https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-iomap-fileio > > > > --- > > > > Commits in this patchset: > > > > * fuse: implement the basic iomap mechanisms > > > > * fuse_trace: implement the basic iomap mechanisms > > > > * fuse: make debugging configurable at runtime > > > > * fuse: adapt FUSE_DEV_IOC_BACKING_{OPEN,CLOSE} to add new iomap devices > > > > * fuse_trace: adapt FUSE_DEV_IOC_BACKING_{OPEN,CLOSE} to add new iomap devices > > > > * fuse: flush events and send FUSE_SYNCFS and FUSE_DESTROY on unmount > > > > * fuse: create a per-inode flag for toggling iomap > > > > * fuse_trace: create a per-inode flag for toggling iomap > > > > * fuse: isolate the other regular file IO paths from iomap > > > > * fuse: implement basic iomap reporting such as FIEMAP and SEEK_{DATA,HOLE} > > > > * fuse_trace: implement basic iomap reporting such as FIEMAP and SEEK_{DATA,HOLE} > > > > * fuse: implement direct IO with iomap > > > > * fuse_trace: implement direct IO with iomap > > > > * fuse: implement buffered IO with iomap > > > > * fuse_trace: implement buffered IO with iomap > > > > * fuse: implement large folios for iomap pagecache files > > > > * fuse: use an unrestricted backing device with iomap pagecache io > > > > * fuse: advertise support for iomap > > > > * fuse: query filesystem geometry when using iomap > > > > * fuse_trace: query filesystem geometry when using iomap > > > > * fuse: implement fadvise for iomap files > > > > * fuse: invalidate ranges of block devices being used for iomap > > > > * fuse_trace: invalidate ranges of block devices 
being used for iomap > > > > * fuse: implement inline data file IO via iomap > > > > * fuse_trace: implement inline data file IO via iomap > > > > * fuse: allow more statx fields > > > > * fuse: support atomic writes with iomap > > > > * fuse_trace: support atomic writes with iomap > > > > * fuse: disable direct reclaim for any fuse server that uses iomap > > > > * fuse: enable swapfile activation on iomap > > > > * fuse: implement freeze and shutdowns for iomap filesystems > > > > --- > > > > fs/fuse/fuse_i.h | 161 +++ > > > > fs/fuse/fuse_trace.h | 939 +++++++++++++++++++ > > > > fs/fuse/iomap_i.h | 52 + > > > > include/uapi/linux/fuse.h | 219 ++++ > > > > fs/fuse/Kconfig | 48 + > > > > fs/fuse/Makefile | 1 > > > > fs/fuse/backing.c | 12 > > > > fs/fuse/dev.c | 30 + > > > > fs/fuse/dir.c | 120 ++ > > > > fs/fuse/file.c | 133 ++- > > > > fs/fuse/file_iomap.c | 2230 +++++++++++++++++++++++++++++++++++++++++++++ > > > > fs/fuse/inode.c | 162 +++ > > > > fs/fuse/iomode.c | 2 > > > > fs/fuse/trace.c | 2 > > > > 14 files changed, 4056 insertions(+), 55 deletions(-) > > > > create mode 100644 fs/fuse/iomap_i.h > > > > create mode 100644 fs/fuse/file_iomap.c > > > > > > > ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCHSET v6 4/8] fuse: allow servers to use iomap for better file IO performance 2026-01-27 23:21 ` Darrick J. Wong @ 2026-01-28 0:10 ` Joanne Koong 2026-01-28 0:34 ` Darrick J. Wong 0 siblings, 1 reply; 52+ messages in thread From: Joanne Koong @ 2026-01-28 0:10 UTC (permalink / raw) To: Darrick J. Wong; +Cc: miklos, bernd, neal, linux-ext4, linux-fsdevel On Tue, Jan 27, 2026 at 3:21 PM Darrick J. Wong <djwong@kernel.org> wrote: > > On Tue, Jan 27, 2026 at 11:47:31AM -0800, Joanne Koong wrote: > > On Mon, Jan 26, 2026 at 6:22 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > On Mon, Jan 26, 2026 at 04:59:16PM -0800, Joanne Koong wrote: > > > > On Tue, Oct 28, 2025 at 5:38 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > > > > > Hi all, > > > > > > > > > > This series connects fuse (the userspace filesystem layer) to fs-iomap > > > > > to get fuse servers out of the business of handling file I/O themselves. > > > > > By keeping the IO path mostly within the kernel, we can dramatically > > > > > improve the speed of disk-based filesystems. This enables us to move > > > > > all the filesystem metadata parsing code out of the kernel and into > > > > > userspace, which means that we can containerize them for security > > > > > without losing a lot of performance. > > > > > > > > I haven't looked through how the fuse2fs or fuse4fs servers are > > > > implemented yet (also, could you explain the difference between the > > > > two? Which one should we look at to see how it all ties together?), > > > > > > fuse4fs is a lowlevel fuse server; fuse2fs is a high(?) level fuse > > > server. fuse4fs is the successor to fuse2fs, at least on Linux and BSD. > > > > Ah I see, thanks for the explanation. In that case, I'll just look at > > fuse4fs then. > > > > > > > > > but I wonder if having bpf infrastructure hooked up to fuse would be > > > > especially helpful for what you're doing here with fuse iomap. 
afaict, > > > > every read/write whether it's buffered or direct will incur at least 1 > > > > call to ->iomap_begin() to get the mapping metadata, which will be 2 > > > > context-switches (and if the server has ->iomap_end() implemented, > > > > then 2 more context-switches). > > > > > > Yes, I agree that's a lot of context switching for file IO... > > > > > > > But it seems like the logic for retrieving mapping > > > > offsets/lengths/metadata should be pretty straightforward? > > > > > > ...but it gets very cheap if the fuse server can cache mappings in the > > > kernel to avoid all that. That is, incidentally, what patchset #7 > > > implements. > > > > > > https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-iomap-cache_2026-01-22 > > > > > > > If the extent lookups are table lookups or tree > > > > traversals without complex side effects, then having > > > > ->iomap_begin()/->iomap_end() be executed as a bpf program would avoid > > > > the context switches and allow all the caching logic to be moved from > > > > the kernel to the server-side (eg using bpf maps). > > > > > > Hrmm. Now that /is/ an interesting proposal. Does BPF have a data > > > structure that supports interval mappings? I think the existing bpf map > > > > Not yet but I don't see why a b+ tree like data strucutre couldn't be added. > > Maybe one workaround in the meantime that could work is using a sorted > > array map and doing binary search on that, until interval mappings can > > be natively supported? > > I guess, though I already had a C structure to borrow from xfs ;) > > > > only does key -> value. Also, is there an upper limit on the size of a > > > map? You could have hundreds of millions of maps for a very fragmented > > > regular file. > > > > If I'm remembering correctly, there's an upper limit on the number of > > map entries, which is bounded by u32 > > That's problematic, since files can have 64-bit logical block numbers. 
The key size supports 64-bits. The u32 bound would be the limit on the number of extents for the file. > > > > At one point I suggested to the famfs maintainer that it might be > > > easier/better to implement the interleaved mapping lookups as bpf > > > programs instead of being stuck with a fixed format in the fuse > > > userspace abi, but I don't know if he ever implemented that. > > > > This seems like a good use case for it too > > > > > > > Is this your > > > > assessment of it as well or do you think the server-side logic for > > > > iomap_begin()/iomap_end() is too complicated to make this realistic? > > > > Asking because I'm curious whether this direction makes sense, not > > > > because I think it would be a blocker for your series. > > > > > > For disk-based filesystems I think it would be difficult to model a bpf > > > program to do mappings, since they can basically point anywhere and be > > > of any size. > > > > Hmm I'm not familiar enough with disk-based filesystems to know what > > the "point anywhere and be of any size" means. For the mapping stuff, > > doesn't it just point to a block number? Or are you saying the problem > > would be there's too many mappings since a mapping could be any size? > > The second -- mappings can be any size, and unprivileged userspace can > control the mappings. If I'm understanding what you're saying here, this is the same discussion as the one above about the u32 bound, correct? > > > I was thinking the issue would be more that there might be other logic > > inside ->iomap_begin()/->iomap_end() besides the mapping stuff that > > would need to be done that would be too out-of-scope for bpf. But I > > think I need to read through the fuse4fs stuff to understand more what > > it's doing in those functions. Looking at fuse4fs logic cursorily, it seems doable? 
What I like about offloading this to bpf too is it would also then allow John's famfs to just go through your iomap plumbing as a use case of it instead of being an entirely separate thing. Though maybe there's some other reason for that that you guys have discussed prior. In any case, I'll ask this on John's main famfs patchset. It kind of seems to me that you guys are pretty much doing the exact same thing conceptually. Thanks, Joanne > > <nod> > > --D > > > > > Thanks, > > Joanne > > > > > > > > OTOH it would be enormously hilarious to me if one could load a file > > > mapping predictive model into the kernel as a bpf program and use that > > > as a first tier before checking the in-memory btree mapping cache from > > > patchset 7. Quite a few years ago now there was a FAST paper > > > establishing that even a stupid linear regression model could in theory > > > beat a disk btree lookup. > > > > > > --D > > > > > > > Thanks, > > > > Joanne > > > > > > > > > > > > > > If you're going to start using this code, I strongly recommend pulling > > > > > from my git trees, which are linked below. > > > > > > > > > > This has been running on the djcloud for months with no problems. Enjoy! > > > > > Comments and questions are, as always, welcome. 
> > > > > > > > > > --D > > > > > > > > > > kernel git tree: > > > > > https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-iomap-fileio > > > > > --- > > > > > Commits in this patchset: > > > > > * fuse: implement the basic iomap mechanisms > > > > > * fuse_trace: implement the basic iomap mechanisms > > > > > * fuse: make debugging configurable at runtime > > > > > * fuse: adapt FUSE_DEV_IOC_BACKING_{OPEN,CLOSE} to add new iomap devices > > > > > * fuse_trace: adapt FUSE_DEV_IOC_BACKING_{OPEN,CLOSE} to add new iomap devices > > > > > * fuse: flush events and send FUSE_SYNCFS and FUSE_DESTROY on unmount > > > > > * fuse: create a per-inode flag for toggling iomap > > > > > * fuse_trace: create a per-inode flag for toggling iomap > > > > > * fuse: isolate the other regular file IO paths from iomap > > > > > * fuse: implement basic iomap reporting such as FIEMAP and SEEK_{DATA,HOLE} > > > > > * fuse_trace: implement basic iomap reporting such as FIEMAP and SEEK_{DATA,HOLE} > > > > > * fuse: implement direct IO with iomap > > > > > * fuse_trace: implement direct IO with iomap > > > > > * fuse: implement buffered IO with iomap > > > > > * fuse_trace: implement buffered IO with iomap > > > > > * fuse: implement large folios for iomap pagecache files > > > > > * fuse: use an unrestricted backing device with iomap pagecache io > > > > > * fuse: advertise support for iomap > > > > > * fuse: query filesystem geometry when using iomap > > > > > * fuse_trace: query filesystem geometry when using iomap > > > > > * fuse: implement fadvise for iomap files > > > > > * fuse: invalidate ranges of block devices being used for iomap > > > > > * fuse_trace: invalidate ranges of block devices being used for iomap > > > > > * fuse: implement inline data file IO via iomap > > > > > * fuse_trace: implement inline data file IO via iomap > > > > > * fuse: allow more statx fields > > > > > * fuse: support atomic writes with iomap > > > > > * fuse_trace: support 
atomic writes with iomap > > > > > * fuse: disable direct reclaim for any fuse server that uses iomap > > > > > * fuse: enable swapfile activation on iomap > > > > > * fuse: implement freeze and shutdowns for iomap filesystems > > > > > --- > > > > > fs/fuse/fuse_i.h | 161 +++ > > > > > fs/fuse/fuse_trace.h | 939 +++++++++++++++++++ > > > > > fs/fuse/iomap_i.h | 52 + > > > > > include/uapi/linux/fuse.h | 219 ++++ > > > > > fs/fuse/Kconfig | 48 + > > > > > fs/fuse/Makefile | 1 > > > > > fs/fuse/backing.c | 12 > > > > > fs/fuse/dev.c | 30 + > > > > > fs/fuse/dir.c | 120 ++ > > > > > fs/fuse/file.c | 133 ++- > > > > > fs/fuse/file_iomap.c | 2230 +++++++++++++++++++++++++++++++++++++++++++++ > > > > > fs/fuse/inode.c | 162 +++ > > > > > fs/fuse/iomode.c | 2 > > > > > fs/fuse/trace.c | 2 > > > > > 14 files changed, 4056 insertions(+), 55 deletions(-) > > > > > create mode 100644 fs/fuse/iomap_i.h > > > > > create mode 100644 fs/fuse/file_iomap.c > > > > > > > > > ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCHSET v6 4/8] fuse: allow servers to use iomap for better file IO performance 2026-01-28 0:10 ` Joanne Koong @ 2026-01-28 0:34 ` Darrick J. Wong 2026-01-29 1:12 ` Joanne Koong 0 siblings, 1 reply; 52+ messages in thread From: Darrick J. Wong @ 2026-01-28 0:34 UTC (permalink / raw) To: Joanne Koong; +Cc: miklos, bernd, neal, linux-ext4, linux-fsdevel On Tue, Jan 27, 2026 at 04:10:43PM -0800, Joanne Koong wrote: > On Tue, Jan 27, 2026 at 3:21 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > On Tue, Jan 27, 2026 at 11:47:31AM -0800, Joanne Koong wrote: > > > On Mon, Jan 26, 2026 at 6:22 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > > > On Mon, Jan 26, 2026 at 04:59:16PM -0800, Joanne Koong wrote: > > > > > On Tue, Oct 28, 2025 at 5:38 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > > > > > > > Hi all, > > > > > > > > > > > > This series connects fuse (the userspace filesystem layer) to fs-iomap > > > > > > to get fuse servers out of the business of handling file I/O themselves. > > > > > > By keeping the IO path mostly within the kernel, we can dramatically > > > > > > improve the speed of disk-based filesystems. This enables us to move > > > > > > all the filesystem metadata parsing code out of the kernel and into > > > > > > userspace, which means that we can containerize them for security > > > > > > without losing a lot of performance. > > > > > > > > > > I haven't looked through how the fuse2fs or fuse4fs servers are > > > > > implemented yet (also, could you explain the difference between the > > > > > two? Which one should we look at to see how it all ties together?), > > > > > > > > fuse4fs is a lowlevel fuse server; fuse2fs is a high(?) level fuse > > > > server. fuse4fs is the successor to fuse2fs, at least on Linux and BSD. > > > > > > Ah I see, thanks for the explanation. In that case, I'll just look at > > > fuse4fs then. 
> > > > > > > > > > > > but I wonder if having bpf infrastructure hooked up to fuse would be > > > > > especially helpful for what you're doing here with fuse iomap. afaict, > > > > > every read/write whether it's buffered or direct will incur at least 1 > > > > > call to ->iomap_begin() to get the mapping metadata, which will be 2 > > > > > context-switches (and if the server has ->iomap_end() implemented, > > > > > then 2 more context-switches). > > > > > > > > Yes, I agree that's a lot of context switching for file IO... > > > > > > > > > But it seems like the logic for retrieving mapping > > > > > offsets/lengths/metadata should be pretty straightforward? > > > > > > > > ...but it gets very cheap if the fuse server can cache mappings in the > > > > kernel to avoid all that. That is, incidentally, what patchset #7 > > > > implements. > > > > > > > > https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-iomap-cache_2026-01-22 > > > > > > > > > If the extent lookups are table lookups or tree > > > > > traversals without complex side effects, then having > > > > > ->iomap_begin()/->iomap_end() be executed as a bpf program would avoid > > > > > the context switches and allow all the caching logic to be moved from > > > > > the kernel to the server-side (eg using bpf maps). > > > > > > > > Hrmm. Now that /is/ an interesting proposal. Does BPF have a data > > > > structure that supports interval mappings? I think the existing bpf map > > > > > > Not yet but I don't see why a b+ tree like data strucutre couldn't be added. > > > Maybe one workaround in the meantime that could work is using a sorted > > > array map and doing binary search on that, until interval mappings can > > > be natively supported? > > > > I guess, though I already had a C structure to borrow from xfs ;) > > > > > > only does key -> value. Also, is there an upper limit on the size of a > > > > map? 
You could have hundreds of millions of maps for a very fragmented > > > > regular file. > > > > > > If I'm remembering correctly, there's an upper limit on the number of > > > map entries, which is bounded by u32 > > > > That's problematic, since files can have 64-bit logical block numbers. > > The key size supports 64-bits. The u32 bound would be the limit on the > number of extents for the file. Oh, ok. If one treats the incore map as a cache and evicts things when they get too old, then that would be fine. I misread that as an upper limit on the *range* of the map entry keys. :/ As it stands, I need to figure out a way to trim the iomap btree when memory gets tight. Right now it'll drop the cache whenever someone closes the file, but that won't help for long-life processes that open a heavily fragmented file and never close it. A coding-intensive way to do that would be to register a shrinker and deal with that, but ugh. A really stupid way would be to drop the whole cache once you get beyond (say) 64k of memory usage (~2000 mappings). > > > > At one point I suggested to the famfs maintainer that it might be > > > > easier/better to implement the interleaved mapping lookups as bpf > > > > programs instead of being stuck with a fixed format in the fuse > > > > userspace abi, but I don't know if he ever implemented that. > > > > > > This seems like a good use case for it too > > > > > > > > > Is this your > > > > > assessment of it as well or do you think the server-side logic for > > > > > iomap_begin()/iomap_end() is too complicated to make this realistic? > > > > > Asking because I'm curious whether this direction makes sense, not > > > > > because I think it would be a blocker for your series. > > > > > > > > For disk-based filesystems I think it would be difficult to model a bpf > > > > program to do mappings, since they can basically point anywhere and be > > > > of any size. 
> > > > > > Hmm I'm not familiar enough with disk-based filesystems to know what > > > the "point anywhere and be of any size" means. For the mapping stuff, > > > doesn't it just point to a block number? Or are you saying the problem > > > would be there's too many mappings since a mapping could be any size? > > > > The second -- mappings can be any size, and unprivileged userspace can > > control the mappings. > > If I'm understanding what you're saying here, this is the same > discussion as the one above about the u32 bound, correct? A different thing -- file data mappings are irregularly sized, can contain sparse holes, etc. Userspace controls the size and offset of each mapping record (thanks to magic things like fallocate) so it'd be very difficult to create a bpf program to generate mappings on the fly. Also you could have 2^33 mapping records for a file, so I think you can't even write a bpf program that large. > > > I was thinking the issue would be more that there might be other logic > > > inside ->iomap_begin()/->iomap_end() besides the mapping stuff that > > > would need to be done that would be too out-of-scope for bpf. But I > > > think I need to read through the fuse4fs stuff to understand more what > > > it's doing in those functions. > > Looking at fuse4fs logic cursorily, it seems doable? What I like about > offloading this to bpf too is it would also then allow John's famfs to > just go through your iomap plumbing as a use case of it instead of > being an entirely separate thing. Though maybe there's some other > reason for that that you guys have discussed prior. In any case, I'll > ask this on John's main famfs patchset. It kind of seems to me that > you guys are pretty much doing the exact same thing conceptually. Yes, though John's famfs has the nice property that memory controller interleaving is mathematically regular and likely makes for a compact bpf program. 
--D > Thanks, > Joanne > > > > > <nod> > > > > --D > > > > > > > > Thanks, > > > Joanne > > > > > > > > > > > OTOH it would be enormously hilarious to me if one could load a file > > > > mapping predictive model into the kernel as a bpf program and use that > > > > as a first tier before checking the in-memory btree mapping cache from > > > > patchset 7. Quite a few years ago now there was a FAST paper > > > > establishing that even a stupid linear regression model could in theory > > > > beat a disk btree lookup. > > > > > > > > --D > > > > > > > > > Thanks, > > > > > Joanne > > > > > > > > > > > > > > > > > If you're going to start using this code, I strongly recommend pulling > > > > > > from my git trees, which are linked below. > > > > > > > > > > > > This has been running on the djcloud for months with no problems. Enjoy! > > > > > > Comments and questions are, as always, welcome. > > > > > > > > > > > > --D > > > > > > > > > > > > kernel git tree: > > > > > > https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-iomap-fileio > > > > > > --- > > > > > > Commits in this patchset: > > > > > > * fuse: implement the basic iomap mechanisms > > > > > > * fuse_trace: implement the basic iomap mechanisms > > > > > > * fuse: make debugging configurable at runtime > > > > > > * fuse: adapt FUSE_DEV_IOC_BACKING_{OPEN,CLOSE} to add new iomap devices > > > > > > * fuse_trace: adapt FUSE_DEV_IOC_BACKING_{OPEN,CLOSE} to add new iomap devices > > > > > > * fuse: flush events and send FUSE_SYNCFS and FUSE_DESTROY on unmount > > > > > > * fuse: create a per-inode flag for toggling iomap > > > > > > * fuse_trace: create a per-inode flag for toggling iomap > > > > > > * fuse: isolate the other regular file IO paths from iomap > > > > > > * fuse: implement basic iomap reporting such as FIEMAP and SEEK_{DATA,HOLE} > > > > > > * fuse_trace: implement basic iomap reporting such as FIEMAP and SEEK_{DATA,HOLE} > > > > > > * fuse: implement direct IO with iomap 
> > > > > > * fuse_trace: implement direct IO with iomap > > > > > > * fuse: implement buffered IO with iomap > > > > > > * fuse_trace: implement buffered IO with iomap > > > > > > * fuse: implement large folios for iomap pagecache files > > > > > > * fuse: use an unrestricted backing device with iomap pagecache io > > > > > > * fuse: advertise support for iomap > > > > > > * fuse: query filesystem geometry when using iomap > > > > > > * fuse_trace: query filesystem geometry when using iomap > > > > > > * fuse: implement fadvise for iomap files > > > > > > * fuse: invalidate ranges of block devices being used for iomap > > > > > > * fuse_trace: invalidate ranges of block devices being used for iomap > > > > > > * fuse: implement inline data file IO via iomap > > > > > > * fuse_trace: implement inline data file IO via iomap > > > > > > * fuse: allow more statx fields > > > > > > * fuse: support atomic writes with iomap > > > > > > * fuse_trace: support atomic writes with iomap > > > > > > * fuse: disable direct reclaim for any fuse server that uses iomap > > > > > > * fuse: enable swapfile activation on iomap > > > > > > * fuse: implement freeze and shutdowns for iomap filesystems > > > > > > --- > > > > > > fs/fuse/fuse_i.h | 161 +++ > > > > > > fs/fuse/fuse_trace.h | 939 +++++++++++++++++++ > > > > > > fs/fuse/iomap_i.h | 52 + > > > > > > include/uapi/linux/fuse.h | 219 ++++ > > > > > > fs/fuse/Kconfig | 48 + > > > > > > fs/fuse/Makefile | 1 > > > > > > fs/fuse/backing.c | 12 > > > > > > fs/fuse/dev.c | 30 + > > > > > > fs/fuse/dir.c | 120 ++ > > > > > > fs/fuse/file.c | 133 ++- > > > > > > fs/fuse/file_iomap.c | 2230 +++++++++++++++++++++++++++++++++++++++++++++ > > > > > > fs/fuse/inode.c | 162 +++ > > > > > > fs/fuse/iomode.c | 2 > > > > > > fs/fuse/trace.c | 2 > > > > > > 14 files changed, 4056 insertions(+), 55 deletions(-) > > > > > > create mode 100644 fs/fuse/iomap_i.h > > > > > > create mode 100644 fs/fuse/file_iomap.c > > > > > > > > > > > > ^ permalink 
raw reply [flat|nested] 52+ messages in thread
* Re: [PATCHSET v6 4/8] fuse: allow servers to use iomap for better file IO performance 2026-01-28 0:34 ` Darrick J. Wong @ 2026-01-29 1:12 ` Joanne Koong 2026-01-29 20:02 ` Darrick J. Wong 0 siblings, 1 reply; 52+ messages in thread From: Joanne Koong @ 2026-01-29 1:12 UTC (permalink / raw) To: Darrick J. Wong; +Cc: miklos, bernd, neal, linux-ext4, linux-fsdevel On Tue, Jan 27, 2026 at 4:34 PM Darrick J. Wong <djwong@kernel.org> wrote: > > On Tue, Jan 27, 2026 at 04:10:43PM -0800, Joanne Koong wrote: > > On Tue, Jan 27, 2026 at 3:21 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > On Tue, Jan 27, 2026 at 11:47:31AM -0800, Joanne Koong wrote: > > > > On Mon, Jan 26, 2026 at 6:22 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > > > > > On Mon, Jan 26, 2026 at 04:59:16PM -0800, Joanne Koong wrote: > > > > > > On Tue, Oct 28, 2025 at 5:38 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > > > > > > > > > Hi all, > > > > > > > > > > > > > > This series connects fuse (the userspace filesystem layer) to fs-iomap > > > > > > > to get fuse servers out of the business of handling file I/O themselves. > > > > > > > By keeping the IO path mostly within the kernel, we can dramatically > > > > > > > improve the speed of disk-based filesystems. This enables us to move > > > > > > > all the filesystem metadata parsing code out of the kernel and into > > > > > > > userspace, which means that we can containerize them for security > > > > > > > without losing a lot of performance. > > > > > > > > > > > > I haven't looked through how the fuse2fs or fuse4fs servers are > > > > > > implemented yet (also, could you explain the difference between the > > > > > > two? Which one should we look at to see how it all ties together?), > > > > > > > > > > fuse4fs is a lowlevel fuse server; fuse2fs is a high(?) level fuse > > > > > server. fuse4fs is the successor to fuse2fs, at least on Linux and BSD. > > > > > > > > Ah I see, thanks for the explanation. 
In that case, I'll just look at > > > > fuse4fs then. > > > > > > > > > > > > > > > but I wonder if having bpf infrastructure hooked up to fuse would be > > > > > > especially helpful for what you're doing here with fuse iomap. afaict, > > > > > > every read/write whether it's buffered or direct will incur at least 1 > > > > > > call to ->iomap_begin() to get the mapping metadata, which will be 2 > > > > > > context-switches (and if the server has ->iomap_end() implemented, > > > > > > then 2 more context-switches). > > > > > > > > > > Yes, I agree that's a lot of context switching for file IO... > > > > > > > > > > > But it seems like the logic for retrieving mapping > > > > > > offsets/lengths/metadata should be pretty straightforward? > > > > > > > > > > ...but it gets very cheap if the fuse server can cache mappings in the > > > > > kernel to avoid all that. That is, incidentally, what patchset #7 > > > > > implements. > > > > > > > > > > https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-iomap-cache_2026-01-22 > > > > > > > > > > > If the extent lookups are table lookups or tree > > > > > > traversals without complex side effects, then having > > > > > > ->iomap_begin()/->iomap_end() be executed as a bpf program would avoid > > > > > > the context switches and allow all the caching logic to be moved from > > > > > > the kernel to the server-side (eg using bpf maps). > > > > > > > > > > Hrmm. Now that /is/ an interesting proposal. Does BPF have a data > > > > > structure that supports interval mappings? I think the existing bpf map > > > > > > > > Not yet but I don't see why a b+ tree like data strucutre couldn't be added. > > > > Maybe one workaround in the meantime that could work is using a sorted > > > > array map and doing binary search on that, until interval mappings can > > > > be natively supported? > > > > > > I guess, though I already had a C structure to borrow from xfs ;) > > > > > > > > only does key -> value. 
Also, is there an upper limit on the size of a > > > > > map? You could have hundreds of millions of maps for a very fragmented > > > > > regular file. > > > > > > > > If I'm remembering correctly, there's an upper limit on the number of > > > > map entries, which is bounded by u32 > > > > > > That's problematic, since files can have 64-bit logical block numbers. > > > > The key size supports 64-bits. The u32 bound would be the limit on the > > number of extents for the file. > > Oh, ok. If one treats the incore map as a cache and evicts things when > they get too old, then that would be fine. I misread that as an upper > limit on the *range* of the map entry keys. :/ I think for more complicated servers, the bpf prog handling for iomap_begin() would essentially just serve as a cache where if it's not found in the cache, then it sends off the FUSE_IOMAP_BEGIN request to the server. For servers that don't need as much complicated logic (eg famfs), the iomap_begin() logic would just be executed within the bpf prog itself. > > As it stands, I need to figure out a way to trim the iomap btree when > memory gets tight. Right now it'll drop the cache whenever someone > closes the file, but that won't help for long-life processes that open a > heavily fragmented file and never close it. > > A coding-intensive way to do that would be to register a shrinker and > deal with that, but ugh. A really stupid way would be to drop the whole > cache once you get beyond (say) 64k of memory usage (~2000 mappings). This kind of seems like another point in favor of giving userspace control of the caching layer. They could then implement whatever eviction policies they want. It also allows them to prepopulate the cache upfront (eg when servicing a file open request, if the file is below a certain size or if the server knows what'll be hot, it could put those extents into the map from the get-go). 
In my opinion, the fuse-iomap layer should try to be as simple/minimal and as generic as possible. I haven't read through iomap_cache.c yet but the header comment suggests it's adapted from the xfs extent tree cache. As I understand it, different filesystem implementations have different caching architectures that are better suited for their use cases (I'm guessing that's the case, otherwise there would just be one general cache inside iomap all the filesystems would use?). It seems a lot better to me to just let the userspace server define that themselves. And selfishly from the fuse perspective, it would be less code we would have to maintain. And I guess too if some servers don't need caching (like famfs?), they could avoid that overhead. > > > > > > At one point I suggested to the famfs maintainer that it might be > > > > > easier/better to implement the interleaved mapping lookups as bpf > > > > > programs instead of being stuck with a fixed format in the fuse > > > > > userspace abi, but I don't know if he ever implemented that. > > > > > > > > This seems like a good use case for it too > > > > > > > > > > > Is this your > > > > > > assessment of it as well or do you think the server-side logic for > > > > > > iomap_begin()/iomap_end() is too complicated to make this realistic? > > > > > > Asking because I'm curious whether this direction makes sense, not > > > > > > because I think it would be a blocker for your series. > > > > > > > > > > For disk-based filesystems I think it would be difficult to model a bpf > > > > > program to do mappings, since they can basically point anywhere and be > > > > > of any size. > > > > > > > > Hmm I'm not familiar enough with disk-based filesystems to know what > > > > the "point anywhere and be of any size" means. For the mapping stuff, > > > > doesn't it just point to a block number? Or are you saying the problem > > > > would be there's too many mappings since a mapping could be any size?
> > > > > > The second -- mappings can be any size, and unprivileged userspace can > > > control the mappings. > > > > If I'm understanding what you're saying here, this is the same > > discussion as the one above about the u32 bound, correct? > > A different thing -- file data mappings are irregularly sized, can > contain sparse holes, etc. Userspace controls the size and offset of > each mapping record (thanks to magic things like fallocate) so it'd be > very difficult to create a bpf program to generate mappings on the fly. Would the bpf prog have to generate mappings on the fly though? If the userspace does things like fallocate, those operations would still go through to the server as a regular request (eg FUSE_FALLOCATE) and on the server side, it'd add that to the map dynamically from userspace. > > Also you could have 2^33 mappings records for a file, so I think you > can't even write a bpf program that large. I think this depends on what map structure gets used. If there is native support added for b+ tree like data structures, I don't see why it wouldn't be able to. > > > > > I was thinking the issue would be more that there might be other logic > > > > inside ->iomap_begin()/->iomap_end() besides the mapping stuff that > > > > would need to be done that would be too out-of-scope for bpf. But I > > > > think I need to read through the fuse4fs stuff to understand more what > > > > it's doing in those functions. > > > > Looking at fuse4fs logic cursorily, it seems doable? What I like about > > offloading this to bpf too is it would also then allow John's famfs to > > just go through your iomap plumbing as a use case of it instead of > > being an entirely separate thing. Though maybe there's some other > > reason for that that you guys have discussed prior. In any case, I'll > > ask this on John's main famfs patchset. It kind of seems to me that > > you guys are pretty much doing the exact same thing conceptually. 
> > Yes, though John's famfs has the nice property that memory controller > interleaving is mathematically regular and likely makes for a compact > bpf program. I tried out integrating the bpf hooks into fuse for iomap_begin() just to see if it was realistic and it seems relatively straightforward so far (though maybe the devil is in the details...). I used the drivers/hid/bpf/hid_bpf_struct_ops.c program as a model for how to set up the fuse bpf struct ops on the kernel side. calling it from file_iomap.c looks something like static int fuse_iomap_begin(...) { ... struct fuse_bpf_ops *bpf_ops = fuse_get_bpf_ops(); ... err = -EOPNOTSUPP; if (bpf_ops && bpf_ops->iomap_begin) err = bpf_ops->iomap_begin(inode, pos, len, flags, &outarg); if (err) err = fuse_simple_request(fm, &args); ... } and I was able to verify that iomap_begin() is able to return back populated outarg fields from the bpf prog. If we were to actually implement it i'm sure it'd be more complicated (eg we'd need to make the fuse_bpf_ops registered per-connection, etc) but on the whole it seems doable. My worry is that if we land the iomap cache patchset now then we can't remove it in the future without breaking backwards compatibility for being a performance regression (though maybe we can since the fuse-iomap stuff is experimental?), so imo it'd be great if we figured out what direction we want to go before landing the cache stuff. And I think we need to have this conversation too on the main famfs patchset (eg whether it should go through your general iomap plumbing with bpf helpers vs. being a separate implementation) since once that lands, it'd be irrevocable. 
Thanks, Joanne > > --D > > > Thanks, > > Joanne > > > > > > > > <nod> > > > > > > --D > > > > > > > > > > > Thanks, > > > > Joanne > > > > > > > > > > > > > > OTOH it would be enormously hilarious to me if one could load a file > > > > > mapping predictive model into the kernel as a bpf program and use that > > > > > as a first tier before checking the in-memory btree mapping cache from > > > > > patchset 7. Quite a few years ago now there was a FAST paper > > > > > establishing that even a stupid linear regression model could in theory > > > > > beat a disk btree lookup. > > > > > > > > > > --D > > > > > > > > > > > Thanks, > > > > > > Joanne > > > > > > > > > > > > > > > > > > > > If you're going to start using this code, I strongly recommend pulling > > > > > > > from my git trees, which are linked below. > > > > > > > > > > > > > > This has been running on the djcloud for months with no problems. Enjoy! > > > > > > > Comments and questions are, as always, welcome. > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > kernel git tree: > > > > > > > https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-iomap-fileio > > > > > > > --- > > > > > > > Commits in this patchset: > > > > > > > * fuse: implement the basic iomap mechanisms > > > > > > > * fuse_trace: implement the basic iomap mechanisms > > > > > > > * fuse: make debugging configurable at runtime > > > > > > > * fuse: adapt FUSE_DEV_IOC_BACKING_{OPEN,CLOSE} to add new iomap devices > > > > > > > * fuse_trace: adapt FUSE_DEV_IOC_BACKING_{OPEN,CLOSE} to add new iomap devices > > > > > > > * fuse: flush events and send FUSE_SYNCFS and FUSE_DESTROY on unmount > > > > > > > * fuse: create a per-inode flag for toggling iomap > > > > > > > * fuse_trace: create a per-inode flag for toggling iomap > > > > > > > * fuse: isolate the other regular file IO paths from iomap > > > > > > > * fuse: implement basic iomap reporting such as FIEMAP and SEEK_{DATA,HOLE} > > > > > > > * fuse_trace: 
implement basic iomap reporting such as FIEMAP and SEEK_{DATA,HOLE} > > > > > > > * fuse: implement direct IO with iomap > > > > > > > * fuse_trace: implement direct IO with iomap > > > > > > > * fuse: implement buffered IO with iomap > > > > > > > * fuse_trace: implement buffered IO with iomap > > > > > > > * fuse: implement large folios for iomap pagecache files > > > > > > > * fuse: use an unrestricted backing device with iomap pagecache io > > > > > > > * fuse: advertise support for iomap > > > > > > > * fuse: query filesystem geometry when using iomap > > > > > > > * fuse_trace: query filesystem geometry when using iomap > > > > > > > * fuse: implement fadvise for iomap files > > > > > > > * fuse: invalidate ranges of block devices being used for iomap > > > > > > > * fuse_trace: invalidate ranges of block devices being used for iomap > > > > > > > * fuse: implement inline data file IO via iomap > > > > > > > * fuse_trace: implement inline data file IO via iomap > > > > > > > * fuse: allow more statx fields > > > > > > > * fuse: support atomic writes with iomap > > > > > > > * fuse_trace: support atomic writes with iomap > > > > > > > * fuse: disable direct reclaim for any fuse server that uses iomap > > > > > > > * fuse: enable swapfile activation on iomap > > > > > > > * fuse: implement freeze and shutdowns for iomap filesystems > > > > > > > --- > > > > > > > fs/fuse/fuse_i.h | 161 +++ > > > > > > > fs/fuse/fuse_trace.h | 939 +++++++++++++++++++ > > > > > > > fs/fuse/iomap_i.h | 52 + > > > > > > > include/uapi/linux/fuse.h | 219 ++++ > > > > > > > fs/fuse/Kconfig | 48 + > > > > > > > fs/fuse/Makefile | 1 > > > > > > > fs/fuse/backing.c | 12 > > > > > > > fs/fuse/dev.c | 30 + > > > > > > > fs/fuse/dir.c | 120 ++ > > > > > > > fs/fuse/file.c | 133 ++- > > > > > > > fs/fuse/file_iomap.c | 2230 +++++++++++++++++++++++++++++++++++++++++++++ > > > > > > > fs/fuse/inode.c | 162 +++ > > > > > > > fs/fuse/iomode.c | 2 > > > > > > > fs/fuse/trace.c | 2 > > > > > > > 
14 files changed, 4056 insertions(+), 55 deletions(-) > > > > > > > create mode 100644 fs/fuse/iomap_i.h > > > > > > > create mode 100644 fs/fuse/file_iomap.c > > > > > > > > > > > > > > > ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCHSET v6 4/8] fuse: allow servers to use iomap for better file IO performance 2026-01-29 1:12 ` Joanne Koong @ 2026-01-29 20:02 ` Darrick J. Wong 2026-01-29 22:41 ` Darrick J. Wong 2026-01-29 22:50 ` Joanne Koong 0 siblings, 2 replies; 52+ messages in thread From: Darrick J. Wong @ 2026-01-29 20:02 UTC (permalink / raw) To: Joanne Koong; +Cc: miklos, bernd, neal, linux-ext4, linux-fsdevel On Wed, Jan 28, 2026 at 05:12:54PM -0800, Joanne Koong wrote: <snip> > > > > > > Hrmm. Now that /is/ an interesting proposal. Does BPF have a data > > > > > > structure that supports interval mappings? I think the existing bpf map > > > > > > > > > > Not yet but I don't see why a b+ tree like data strucutre couldn't be added. > > > > > Maybe one workaround in the meantime that could work is using a sorted > > > > > array map and doing binary search on that, until interval mappings can > > > > > be natively supported? > > > > > > > > I guess, though I already had a C structure to borrow from xfs ;) > > > > > > > > > > only does key -> value. Also, is there an upper limit on the size of a > > > > > > map? You could have hundreds of millions of maps for a very fragmented > > > > > > regular file. > > > > > > > > > > If I'm remembering correctly, there's an upper limit on the number of > > > > > map entries, which is bounded by u32 > > > > > > > > That's problematic, since files can have 64-bit logical block numbers. > > > > > > The key size supports 64-bits. The u32 bound would be the limit on the > > > number of extents for the file. > > > > Oh, ok. If one treats the incore map as a cache and evicts things when > > they get too old, then that would be fine. I misread that as an upper > > limit on the *range* of the map entry keys. :/ > > I think for more complicated servers, the bpf prog handling for > iomap_begin() would essentially just serve as a cache where if it's > not found in the cache, then it sends off the FUSE_IOMAP_BEGIN request > to the server. 
For servers that don't need as much complicated logic > (eg famfs), the iomap_begin() logic would just be executed within the > bpf prog itself. Yes, I like the fuse_iomap_begin logic flow of: 1. Try to use a mapping in the iext tree 2. Call a BPF program to try to generate a mapping 3. Issue a fuse command to userspace wherein #2 and #3 can signal that #1 should be retried. (This is already provided by FUSE_IOMAP_TYPE_RETRY_CACHE, FWIW) That said, BPF doesn't expose an interval btree data structure. I think it would be better to add the iext mapping cache and make it so that bpf programs could call fuse_iomap_cache_{upsert,remove,lookup}. You could use the interval tree too, but the iext tree has the advantage of higher fanout factor. > > As it stands, I need to figure out a way to trim the iomap btree when > > memory gets tight. Right now it'll drop the cache whenever someone > > closes the file, but that won't help for long-life processes that open a > > heavily fragmented file and never close it. > > > > A coding-intensive way to do that would be to register a shrinker and > > deal with that, but ugh. A really stupid way would be to drop the whole > > cache once you get beyond (say) 64k of memory usage (~2000 mappings). > > This kind of seems like another point in favor of giving userspace > control of the caching layer. They could then implement whatever > eviction policies they want. Note that userspace already can control the cached iomappings -- FUSE_NOTIFY_IOMAP_UPSERT pushes a mapping into the iext tree, and FUSE_NOTIFY_IOMAP_INVAL removes them. The fuse server can decide to evict whenever it pleases, though admittedly the iext tree doesn't track usage information of any kind, so how would the fuse server know? The static limit is merely the kernel's means to establish a hard limit on the memory consumption of the iext tree, since it can't trust userspace completely. 
> It also allows them to prepopulate the cache upfront (eg when > servicing a file open request, if the file is below a certain size or > if the server knows what'll be hot, it could put those extents into > the map from the get-go). Hrm. I haven't tried issuing FUSE_NOTIFY_IOMAP_UPSERT during an open call, but I suppose it's possible. > in my opinion, the fuse-iomap layer should try to be as simple/minimal > and as generic as possible. I haven't read through iomap_cache.c yet > but the header comment suggests it's adapted from the xfs extent tree Rudely copied, not adapted ;) I actually wonder if I should make a horrible macro to generate the fuse_iext_* structures and functions, and then xfs_iext_tree.c and fuse_iomap_cache.c can "share" that hairba^Wcode. > cache. As I understand it, different filesystem implementations have > different caching architectures that are better suited for their use > cases Err. The way this evolved is ... way too long to go into in this email. Here's a truncated version; I can tell you the full story next week. Most filesystems store their file mapping data on disk in whatever format the designers specified. When the pagecache asks them to read or write the cache, they attach buffer heads to the folio, fill out the buffer heads with the minimum mapping information needed to map the folios to disk addresses. bios are constructed for each folio based on what's in the bufferhead. This was fine for filesystems that map each block individually, such as FFS/ext2/ext3/fat... > (I'm guessing that's the case, otherwise there would just be one > general cache inside iomap all the filesystems would use?). It seems a ...but newer filesystems such as xfs/ext4/btrfs map a bunch of blocks at a time. Each of them invented their own private incore mapping structures to mirror the ondisk structure. xfs kept using the old bufferheads into the early 2010s, ext4 is still using them, and btrfs went its own way from the start. 
Eventually XFS grew its own internal extent-to-bio mapping code that flipped the model -- rather than get a pagecache folio, map the folio to blocks, and issue IOs based on the blocks, it would get the file mapping, grab folios for the whole mapping, and issue bios for the batch of folios. That's more efficient, but at this point we have a legacy codebase problem for everything else in fs/. In 2019, hch and I decided to export the extent-to-bio mapping code from xfs so that new filesystems could start with something cleaner than bufferheads. In the past 7 years, nobody's added a new filesystem with complex mapping requirements; they've only ported existing filesystems to it, without further refactoring of their incore data structures. That's why there's no generic iomap cache. > lot better to me to just let the userspace server define that > themselves. And selfishly from the fuse perspective, would be less Well if I turned the iext code into a template then fuse would only need enough glue code to declare a template class and use it. The glue part is only ... 230LOC. > code we would have to maintain. And I guess too if some servers don't > need caching (like famfs?), they could avoid that overhead. Hrm. Right now the struct fuse_iomap_cache is embedded in struct fuse_inode, but that could be turned into a dynamic allocation. > > > > > > At one point I suggested to the famfs maintainer that it might be > > > > > > easier/better to implement the interleaved mapping lookups as bpf > > > > > > programs instead of being stuck with a fixed format in the fuse > > > > > > userspace abi, but I don't know if he ever implemented that. > > > > > > > > > > This seems like a good use case for it too > > > > > > > > > > > > > Is this your > > > > > > > assessment of it as well or do you think the server-side logic for > > > > > > > iomap_begin()/iomap_end() is too complicated to make this realistic? 
> > > > > > > Asking because I'm curious whether this direction makes sense, not > > > > > > > because I think it would be a blocker for your series. > > > > > > > > > > > > For disk-based filesystems I think it would be difficult to model a bpf > > > > > > program to do mappings, since they can basically point anywhere and be > > > > > > of any size. > > > > > > > > > > Hmm I'm not familiar enough with disk-based filesystems to know what > > > > > the "point anywhere and be of any size" means. For the mapping stuff, > > > > > doesn't it just point to a block number? Or are you saying the problem > > > > > would be there's too many mappings since a mapping could be any size? > > > > > > > > The second -- mappings can be any size, and unprivileged userspace can > > > > control the mappings. > > > > > > If I'm understanding what you're saying here, this is the same > > > discussion as the one above about the u32 bound, correct? > > > > A different thing -- file data mappings are irregularly sized, can > > contain sparse holes, etc. Userspace controls the size and offset of > > each mapping record (thanks to magic things like fallocate) so it'd be > > very difficult to create a bpf program to generate mappings on the fly. > > Would the bpf prog have to generate mappings on the fly though? If the > userspace does things like fallocate, those operations would still go > through to the server as a regular request (eg FUSE_FALLOCATE) and on > the server side, it'd add that to the map dynamically from userspace. That depends on the fuse server design. For simple things like famfs where the layout is bog simple and there's no fancy features like delayed allocation or unwritten extents, then you could probably get away a BPF program to generate the entire mapping set. 
I suspect an object-store type filesystem (aka write a file once, close it, snapshot it, and never change it again) might be good at landing all the file data in relatively few extent mappings, and it could actually compile a custom bpf program for that file and push it to the kernel. > > Also you could have 2^33 mappings records for a file, so I think you > > can't even write a bpf program that large. > > I think this depends on what map structure gets used. If there is > native support added for b+ tree like data structures, I don't see why > it wouldn't be able to. <nod> > > > > > I was thinking the issue would be more that there might be other logic > > > > > inside ->iomap_begin()/->iomap_end() besides the mapping stuff that > > > > > would need to be done that would be too out-of-scope for bpf. But I > > > > > think I need to read through the fuse4fs stuff to understand more what > > > > > it's doing in those functions. > > > > > > Looking at fuse4fs logic cursorily, it seems doable? What I like about > > > offloading this to bpf too is it would also then allow John's famfs to > > > just go through your iomap plumbing as a use case of it instead of > > > being an entirely separate thing. Though maybe there's some other > > > reason for that that you guys have discussed prior. In any case, I'll > > > ask this on John's main famfs patchset. It kind of seems to me that > > > you guys are pretty much doing the exact same thing conceptually. > > > > Yes, though John's famfs has the nice property that memory controller > > interleaving is mathematically regular and likely makes for a compact > > bpf program. > > I tried out integrating the bpf hooks into fuse for iomap_begin() just > to see if it was realistic and it seems relatively straightforward so > far (though maybe the devil is in the details...). I used the Ok, now *that's* interesting! I guess I had better push the latest fuse-iomap code ... 
but I cannot share a link, because I cannot get through the @!#%%!!! kernel.org anubis bullcrap. So I generated a pull request and I *think* this munged URL will work https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-service-container_2026-01-29 Or I guess you could just git-pull this: https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git tags/fuse-service-container_2026-01-29 > drivers/hid/bpf/hid_bpf_struct_ops.c program as a model for how to set > up the fuse bpf struct ops on the kernel side. calling it from > file_iomap.c looks something like > > static int fuse_iomap_begin(...) > { > ... > struct fuse_bpf_ops *bpf_ops = fuse_get_bpf_ops(); > ... > err = -EOPNOTSUPP; > if (bpf_ops && bpf_ops->iomap_begin) > err = bpf_ops->iomap_begin(inode, pos, len, flags, &outarg); > if (err) > err = fuse_simple_request(fm, &args); > ... > } I'm curious what the rest of the bpf integration code looks like. > and I was able to verify that iomap_begin() is able to return back > populated outarg fields from the bpf prog. If we were to actually > implement it i'm sure it'd be more complicated (eg we'd need to make > the fuse_bpf_ops registered per-connection, etc) but on the whole it What is a fuse_bpf_ops? I'm assuming that's the attachment point for a bpf program that the fuse server would compile? In which case, yes, I think that ought to be per-connection. So the bpf program can examine the struct inode, and the pos/len/opflags field; and from that information it has to write the appropriate fields in &outarg? That's new, I didn't think bpf was allowed to write to kernel memory. But it's been a few years since I last touched the bpf internals. Some bpf programs might just know how to fill out outarg on their own (e.g. famfs memory interleaving) but other bpf programs might perform a range query on some imaginary bpf-interval-tree wherein you can do a fast lookup based on (inumber, pos, len)? 
I guess that's an interesting question -- would each fuse connection have one big bpf-interval-tree? Or would you shard things by inode to reduce contention? And if you sharded like that, then would you need a fuse_bpf_ops per inode? (I'm imagining that the fuse_bpf_ops might be where you'd stash the root of the bpf data structure, but I know nothing of bpf internals ;)) Rolling on: how easy is it for a userspace program to compile and upload bpf programs into the kernel? I've played around with bcc enough to write some fairly stupid latency tracing tools for XFS, but at the end of the day it's still python scripts feeding a string full of maybe-C into whatever the BPF does under the hood. I /think/ it calls clang on the provided text, links that against the current kernel's header files, and pushes the compiled bpf binary into the kernel, right? So fuse4fs would have to learn how to do that; and now fuse4fs has a runtime dependency on libllvm. And while I'm on the topic of fuse-bpf uapi: It's ok for us to expose primitive-typed variables (pos/len/opflags) and existing fuse uapi directly to a bpf program, but I don't think we should expose struct inode/fuse_inode. Maybe just fuse_inode::nodeid? If we're careful not to allow #include'ing structured types in the fuse bpf code, then perhaps the bpf programs could be compiled at the same time as the fuse server. > seems doable. My worry is that if we land the iomap cache patchset now > then we can't remove it in the future without breaking backwards > compatibility for being a performance regression (though maybe we can > since the fuse-iomap stuff is experimental?), so imo it'd be great if I don't think it's a huge problem to remove functionality while the EXPERIMENTAL warnings are in place. We'd forever lose the command codes for FUSE_NOTIFY_IOMAP_UPSERT and FUSE_NOTIFY_IOMAP_INVAL, but we've only used 12 out of INT_MAX so that's not likely to be a concern. 
> we figured out what direction we want to go before landing the cache > stuff. And I think we need to have this conversation too on the main > famfs patchset (eg whether it should go through your general iomap > plumbing with bpf helpers vs. being a separate implementation) since > once that lands, it'd be irrevocable. I'm of two minds on that -- John got here first, so I don't want to delay his patchset whilst I slowly work on this thing. OTOH from an architecture standpoint we probably ought to push for three ways for a fuse server to upload mappings: 1. Upserting mappings with arbitrary offset and size into a cache 2. Self contained bpf program that can generate any mapping 3. Sprawling bpf program that can read any other artifacts that another bpf program might have set up for it But yeah, let's involve John. --D > > Thanks, > Joanne > > > > --D > > > > > Thanks, > > > Joanne > > > > > > > > > > > <nod> > > > > > > > > --D > > > > > > > > > > > > > > Thanks, > > > > > Joanne > > > > > > > > > > > > > > > > > OTOH it would be enormously hilarious to me if one could load a file > > > > > > mapping predictive model into the kernel as a bpf program and use that > > > > > > as a first tier before checking the in-memory btree mapping cache from > > > > > > patchset 7. Quite a few years ago now there was a FAST paper > > > > > > establishing that even a stupid linear regression model could in theory > > > > > > beat a disk btree lookup. > > > > > > > > > > > > --D > > > > > > > > > > > > > Thanks, > > > > > > > Joanne > > > > > > > > > > > > > > > > > > > > > > > If you're going to start using this code, I strongly recommend pulling > > > > > > > > from my git trees, which are linked below. > > > > > > > > > > > > > > > > This has been running on the djcloud for months with no problems. Enjoy! > > > > > > > > Comments and questions are, as always, welcome. 
> > > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > kernel git tree: > > > > > > > > https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-iomap-fileio > > > > > > > > --- > > > > > > > > Commits in this patchset: > > > > > > > > * fuse: implement the basic iomap mechanisms > > > > > > > > * fuse_trace: implement the basic iomap mechanisms > > > > > > > > * fuse: make debugging configurable at runtime > > > > > > > > * fuse: adapt FUSE_DEV_IOC_BACKING_{OPEN,CLOSE} to add new iomap devices > > > > > > > > * fuse_trace: adapt FUSE_DEV_IOC_BACKING_{OPEN,CLOSE} to add new iomap devices > > > > > > > > * fuse: flush events and send FUSE_SYNCFS and FUSE_DESTROY on unmount > > > > > > > > * fuse: create a per-inode flag for toggling iomap > > > > > > > > * fuse_trace: create a per-inode flag for toggling iomap > > > > > > > > * fuse: isolate the other regular file IO paths from iomap > > > > > > > > * fuse: implement basic iomap reporting such as FIEMAP and SEEK_{DATA,HOLE} > > > > > > > > * fuse_trace: implement basic iomap reporting such as FIEMAP and SEEK_{DATA,HOLE} > > > > > > > > * fuse: implement direct IO with iomap > > > > > > > > * fuse_trace: implement direct IO with iomap > > > > > > > > * fuse: implement buffered IO with iomap > > > > > > > > * fuse_trace: implement buffered IO with iomap > > > > > > > > * fuse: implement large folios for iomap pagecache files > > > > > > > > * fuse: use an unrestricted backing device with iomap pagecache io > > > > > > > > * fuse: advertise support for iomap > > > > > > > > * fuse: query filesystem geometry when using iomap > > > > > > > > * fuse_trace: query filesystem geometry when using iomap > > > > > > > > * fuse: implement fadvise for iomap files > > > > > > > > * fuse: invalidate ranges of block devices being used for iomap > > > > > > > > * fuse_trace: invalidate ranges of block devices being used for iomap > > > > > > > > * fuse: implement inline data file IO via iomap > > > > 
> > > > * fuse_trace: implement inline data file IO via iomap > > > > > > > > * fuse: allow more statx fields > > > > > > > > * fuse: support atomic writes with iomap > > > > > > > > * fuse_trace: support atomic writes with iomap > > > > > > > > * fuse: disable direct reclaim for any fuse server that uses iomap > > > > > > > > * fuse: enable swapfile activation on iomap > > > > > > > > * fuse: implement freeze and shutdowns for iomap filesystems > > > > > > > > --- > > > > > > > > fs/fuse/fuse_i.h | 161 +++ > > > > > > > > fs/fuse/fuse_trace.h | 939 +++++++++++++++++++ > > > > > > > > fs/fuse/iomap_i.h | 52 + > > > > > > > > include/uapi/linux/fuse.h | 219 ++++ > > > > > > > > fs/fuse/Kconfig | 48 + > > > > > > > > fs/fuse/Makefile | 1 > > > > > > > > fs/fuse/backing.c | 12 > > > > > > > > fs/fuse/dev.c | 30 + > > > > > > > > fs/fuse/dir.c | 120 ++ > > > > > > > > fs/fuse/file.c | 133 ++- > > > > > > > > fs/fuse/file_iomap.c | 2230 +++++++++++++++++++++++++++++++++++++++++++++ > > > > > > > > fs/fuse/inode.c | 162 +++ > > > > > > > > fs/fuse/iomode.c | 2 > > > > > > > > fs/fuse/trace.c | 2 > > > > > > > > 14 files changed, 4056 insertions(+), 55 deletions(-) > > > > > > > > create mode 100644 fs/fuse/iomap_i.h > > > > > > > > create mode 100644 fs/fuse/file_iomap.c > > > > > > > > > > > > > > > > > > > ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCHSET v6 4/8] fuse: allow servers to use iomap for better file IO performance 2026-01-29 20:02 ` Darrick J. Wong @ 2026-01-29 22:41 ` Darrick J. Wong 2026-01-29 22:50 ` Joanne Koong 1 sibling, 0 replies; 52+ messages in thread From: Darrick J. Wong @ 2026-01-29 22:41 UTC (permalink / raw) To: Joanne Koong; +Cc: miklos, bernd, neal, linux-ext4, linux-fsdevel On Thu, Jan 29, 2026 at 12:02:54PM -0800, Darrick J. Wong wrote: > On Wed, Jan 28, 2026 at 05:12:54PM -0800, Joanne Koong wrote: > > <snip> > > > > > > > > Hrmm. Now that /is/ an interesting proposal. Does BPF have a data > > > > > > > structure that supports interval mappings? I think the existing bpf map > > > > > > > > > > > > Not yet but I don't see why a b+ tree like data strucutre couldn't be added. > > > > > > Maybe one workaround in the meantime that could work is using a sorted > > > > > > array map and doing binary search on that, until interval mappings can > > > > > > be natively supported? > > > > > > > > > > I guess, though I already had a C structure to borrow from xfs ;) > > > > > > > > > > > > only does key -> value. Also, is there an upper limit on the size of a > > > > > > > map? You could have hundreds of millions of maps for a very fragmented > > > > > > > regular file. > > > > > > > > > > > > If I'm remembering correctly, there's an upper limit on the number of > > > > > > map entries, which is bounded by u32 > > > > > > > > > > That's problematic, since files can have 64-bit logical block numbers. > > > > > > > > The key size supports 64-bits. The u32 bound would be the limit on the > > > > number of extents for the file. > > > > > > Oh, ok. If one treats the incore map as a cache and evicts things when > > > they get too old, then that would be fine. I misread that as an upper > > > limit on the *range* of the map entry keys. 
:/ > > > > I think for more complicated servers, the bpf prog handling for > > iomap_begin() would essentially just serve as a cache where if it's > > not found in the cache, then it sends off the FUSE_IOMAP_BEGIN request > > to the server. For servers that don't need as much complicated logic > > (eg famfs), the iomap_begin() logic would just be executed within the > > bpf prog itself. > > Yes, I like the fuse_iomap_begin logic flow of: > > 1. Try to use a mapping in the iext tree > 2. Call a BPF program to try to generate a mapping > 3. Issue a fuse command to userspace > > wherein #2 and #3 can signal that #1 should be retried. (This is > already provided by FUSE_IOMAP_TYPE_RETRY_CACHE, FWIW) > > That said, BPF doesn't expose an interval btree data structure. I think > it would be better to add the iext mapping cache and make it so that bpf > programs could call fuse_iomap_cache_{upsert,remove,lookup}. You could > use the interval tree too, but the iext tree has the advantage of higher > fanout factor. > > > > As it stands, I need to figure out a way to trim the iomap btree when > > > memory gets tight. Right now it'll drop the cache whenever someone > > > closes the file, but that won't help for long-life processes that open a > > > heavily fragmented file and never close it. > > > > > > A coding-intensive way to do that would be to register a shrinker and > > > deal with that, but ugh. A really stupid way would be to drop the whole > > > cache once you get beyond (say) 64k of memory usage (~2000 mappings). > > > > This kind of seems like another point in favor of giving userspace > > control of the caching layer. They could then implement whatever > > eviction policies they want. > > Note that userspace already can control the cached iomappings -- > FUSE_NOTIFY_IOMAP_UPSERT pushes a mapping into the iext tree, and > FUSE_NOTIFY_IOMAP_INVAL removes them. 
The fuse server can decide to > evict whenever it pleases, though admittedly the iext tree doesn't track > usage information of any kind, so how would the fuse server know? > > The static limit is merely the kernel's means to establish a hard limit > on the memory consumption of the iext tree, since it can't trust > userspace completely. > > > It also allows them to prepopulate the cache upfront (eg when > > servicing a file open request, if the file is below a certain size or > > if the server knows what'll be hot, it could put those extents into > > the map from the get-go). > > Hrm. I haven't tried issuing FUSE_NOTIFY_IOMAP_UPSERT during an open > call, but I suppose it's possible. > > > in my opinion, the fuse-iomap layer should try to be as simple/minimal > > and as generic as possible. I haven't read through iomap_cache.c yet > > but the header comment suggests it's adapted from the xfs extent tree > > Rudely copied, not adapted ;) > > I actually wonder if I should make a horrible macro to generate the > fuse_iext_* structures and functions, and then xfs_iext_tree.c and > fuse_iomap_cache.c can "share" that hairba^Wcode. I tried templatizing this with cpp macros and very rapidly lost all sanity. :( --D > > cache. As I understand it, different filesystem implementations have > > different caching architectures that are better suited for their use > > cases > > Err. The way this evolved is ... way too long to go into in this email. > Here's a truncated version; I can tell you the full story next week. > > Most filesystems store their file mapping data on disk in whatever > format the designers specified. When the pagecache asks them to read > or write the cache, they attach buffer heads to the folio, fill out the > buffer heads with the minimum mapping information needed to map the > folios to disk addresses. bios are constructed for each folio based on > what's in the bufferhead. 
> > This was fine for filesystems that map each block individually, such as > FFS/ext2/ext3/fat... > > > (I'm guessing that's the case, otherwise there would just be one > > general cache inside iomap all the filesystems would use?). It seems a > > ...but newer filesystems such as xfs/ext4/btrfs map a bunch of blocks at > a time. Each of them invented their own private incore mapping > structures to mirror the ondisk structure. xfs kept using the old > bufferheads into the early 2010s, ext4 is still using them, and btrfs > went its own way from the start. > > Eventually XFS grew its own internal extent-to-bio mapping code that > flipped the model -- rather than get a pagecache folio, map the folio to > blocks, and issue IOs based on the blocks, it would get the file > mapping, grab folios for the whole mapping, and issue bios for the batch > of folios. That's more efficient, but at this point we have a legacy > codebase problem for everything else in fs/. > > In 2019, hch and I decided to export the extent-to-bio mapping code from > xfs so that new filesystems could start with something cleaner than > bufferheads. In the past 7 years, nobody's added a new filesystem with > complex mapping requirements; they've only ported existing filesystems > to it, without further refactoring of their incore data structures. > That's why there's no generic iomap cache. > > > lot better to me to just let the userspace server define that > > themselves. And selfishly from the fuse perspective, would be less > > Well if I turned the iext code into a template then fuse would only need > enough glue code to declare a template class and use it. The glue part > is only ... 230LOC. > > > code we would have to maintain. And I guess too if some servers don't > > need caching (like famfs?), they could avoid that overhead. > > Hrm. Right now the struct fuse_iomap_cache is embedded in struct > fuse_inode, but that could be turned into a dynamic allocation. 
> > > > > > > > At one point I suggested to the famfs maintainer that it might be > > > > > > > easier/better to implement the interleaved mapping lookups as bpf > > > > > > > programs instead of being stuck with a fixed format in the fuse > > > > > > > userspace abi, but I don't know if he ever implemented that. > > > > > > > > > > > > This seems like a good use case for it too > > > > > > > > > > > > > > > Is this your > > > > > > > > assessment of it as well or do you think the server-side logic for > > > > > > > > iomap_begin()/iomap_end() is too complicated to make this realistic? > > > > > > > > Asking because I'm curious whether this direction makes sense, not > > > > > > > > because I think it would be a blocker for your series. > > > > > > > > > > > > > > For disk-based filesystems I think it would be difficult to model a bpf > > > > > > > program to do mappings, since they can basically point anywhere and be > > > > > > > of any size. > > > > > > > > > > > > Hmm I'm not familiar enough with disk-based filesystems to know what > > > > > > the "point anywhere and be of any size" means. For the mapping stuff, > > > > > > doesn't it just point to a block number? Or are you saying the problem > > > > > > would be there's too many mappings since a mapping could be any size? > > > > > > > > > > The second -- mappings can be any size, and unprivileged userspace can > > > > > control the mappings. > > > > > > > > If I'm understanding what you're saying here, this is the same > > > > discussion as the one above about the u32 bound, correct? > > > > > > A different thing -- file data mappings are irregularly sized, can > > > contain sparse holes, etc. Userspace controls the size and offset of > > > each mapping record (thanks to magic things like fallocate) so it'd be > > > very difficult to create a bpf program to generate mappings on the fly. > > > > Would the bpf prog have to generate mappings on the fly though? 
If the > > userspace does things like fallocate, those operations would still go > > through to the server as a regular request (eg FUSE_FALLOCATE) and on > > the server side, it'd add that to the map dynamically from userspace. > > That depends on the fuse server design. For simple things like famfs > where the layout is bog simple and there's no fancy features like > delayed allocation or unwritten extents, then you could probably get > away a BPF program to generate the entire mapping set. I suspect an > object-store type filesystem (aka write a file once, close it, snapshot > it, and never change it again) might be good at landing all the file > data in relatively few extent mappings, and it could actually compile a > custom bpf program for that file and push it to the kernel. > > > > Also you could have 2^33 mappings records for a file, so I think you > > > can't even write a bpf program that large. > > > > I think this depends on what map structure gets used. If there is > > native support added for b+ tree like data structures, I don't see why > > it wouldn't be able to. > > <nod> > > > > > > > I was thinking the issue would be more that there might be other logic > > > > > > inside ->iomap_begin()/->iomap_end() besides the mapping stuff that > > > > > > would need to be done that would be too out-of-scope for bpf. But I > > > > > > think I need to read through the fuse4fs stuff to understand more what > > > > > > it's doing in those functions. > > > > > > > > Looking at fuse4fs logic cursorily, it seems doable? What I like about > > > > offloading this to bpf too is it would also then allow John's famfs to > > > > just go through your iomap plumbing as a use case of it instead of > > > > being an entirely separate thing. Though maybe there's some other > > > > reason for that that you guys have discussed prior. In any case, I'll > > > > ask this on John's main famfs patchset. 
It kind of seems to me that > > > > you guys are pretty much doing the exact same thing conceptually. > > > > > > Yes, though John's famfs has the nice property that memory controller > > > interleaving is mathematically regular and likely makes for a compact > > > bpf program. > > > > I tried out integrating the bpf hooks into fuse for iomap_begin() just > > to see if it was realistic and it seems relatively straightforward so > > far (though maybe the devil is in the details...). I used the > > Ok, now *that's* interesting! I guess I had better push the latest > fuse-iomap code ... but I cannot share a link, because I cannot get > through the @!#%%!!! kernel.org anubis bullcrap. > > So I generated a pull request and I *think* this munged URL will work > https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-service-container_2026-01-29 > > Or I guess you could just git-pull this: > https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git tags/fuse-service-container_2026-01-29 > > > drivers/hid/bpf/hid_bpf_struct_ops.c program as a model for how to set > > up the fuse bpf struct ops on the kernel side. calling it from > > file_iomap.c looks something like > > > > static int fuse_iomap_begin(...) > > { > > ... > > struct fuse_bpf_ops *bpf_ops = fuse_get_bpf_ops(); > > ... > > err = -EOPNOTSUPP; > > if (bpf_ops && bpf_ops->iomap_begin) > > err = bpf_ops->iomap_begin(inode, pos, len, flags, &outarg); > > if (err) > > err = fuse_simple_request(fm, &args); > > ... > > } > > I'm curious what the rest of the bpf integration code looks like. > > > and I was able to verify that iomap_begin() is able to return back > > populated outarg fields from the bpf prog. If we were to actually > > implement it i'm sure it'd be more complicated (eg we'd need to make > > the fuse_bpf_ops registered per-connection, etc) but on the whole it > > What is a fuse_bpf_ops? 
I'm assuming that's the attachment point for a > bpf program that the fuse server would compile? In which case, yes, I > think that ought to be per-connection. > > So the bpf program can examine the struct inode, and the pos/len/opflags > field; and from that information it has to write the appropriate fields > in &outarg? That's new, I didn't think bpf was allowed to write to > kernel memory. But it's been a few years since I last touched the bpf > internals. > > Some bpf programs might just know how to fill out outarg on their own > (e.g. famfs memory interleaving) but other bpf programs might perform a > range query on some imaginary bpf-interval-tree wherein you can do a > fast lookup based on (inumber, pos, len)? > > I guess that's an interesting question -- would each fuse connection > have one big bpf-interval-tree? Or would you shard things by inode to > reduce contention? And if you sharded like that, then would you need a > fuse_bpf_ops per inode? > > (I'm imagining that the fuse_bpf_ops might be where you'd stash the root > of the bpf data structure, but I know nothing of bpf internals ;)) > > Rolling on: how easy is it for a userspace program to compile and upload > bpf programs into the kernel? I've played around with bcc enough to > write some fairly stupid latency tracing tools for XFS, but at the end > of the day it still python scripts feeding a string full of maybe-C into > whatever the BPF does under the hood. > > I /think/ it calls clang on the provided text, links that against the > current kernel's header files, and pushes the compiled bpf binary into > the kernel, right? So fuse4fs would have to learn how to do that; and > now fuse4fs has a runtime dependency on libllvm. > > And while I'm on the topic of fuse-bpf uapi: It's ok for us to expose > primitive-typed variables (pos/len/opflags) and existing fuse uapi > directly to a bpf program, but I don't think we should expose struct > inode/fuse_inode. Maybe just fuse_inode::nodeid? 
If we're careful not > to allow #include'ing structured types in the fuse bpf code, then > perhaps the bpf programs could be compiled at the same time as the fuse > server. > > > seems doable. My worry is that if we land the iomap cache patchset now > > then we can't remove it in the future without breaking backwards > > compatibility for being a performance regression (though maybe we can > > since the fuse-iomap stuff is experimental?), so imo it'd be great if > > I don't think it's a huge problem to remove functionality while the > EXPERIMENTAL warnings are in place. We'd forever lose the command codes > for FUSE_NOTIFY_IOMAP_UPSERT and FUSE_NOTIFY_IOMAP_INVAL, but we've only > used 12 out of INT_MAX so that's not likely to be a concern. > > > we figured out what direction we want to go before landing the cache > > stuff. And I think we need to have this conversation too on the main > > famfs patchset (eg whether it should go through your general iomap > > plumbing with bpf helpers vs. being a separate implementation) since > > once that lands, it'd be irrevocable. > > I've of two minds on that -- John got here first, so I don't want to > delay his patchset whilst I slowly work on this thing. OTOH from an > architecture standpoint we probably ought to push for three ways for a > fuse server to upload mappings: > > 1. Upserting mappings with arbitrary offset and size into a cache > 2. Self contained bpf program that can generate any mapping > 3. Sprawling bpf program that can read any other artifacts that another > bpf program might have set up for it > > But yeah, let's involve John. 
> > --D > > > > > Thanks, > > Joanne > > > > > > --D > > > > > > > Thanks, > > > > Joanne > > > > > > > > > > > > > > <nod> > > > > > > > > > > --D > > > > > > > > > > > > > > > > > Thanks, > > > > > > Joanne > > > > > > > > > > > > > > > > > > > > OTOH it would be enormously hilarious to me if one could load a file > > > > > > > mapping predictive model into the kernel as a bpf program and use that > > > > > > > as a first tier before checking the in-memory btree mapping cache from > > > > > > > patchset 7. Quite a few years ago now there was a FAST paper > > > > > > > establishing that even a stupid linear regression model could in theory > > > > > > > beat a disk btree lookup. > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > Thanks, > > > > > > > > Joanne > > > > > > > > > > > > > > > > > > > > > > > > > > If you're going to start using this code, I strongly recommend pulling > > > > > > > > > from my git trees, which are linked below. > > > > > > > > > > > > > > > > > > This has been running on the djcloud for months with no problems. Enjoy! > > > > > > > > > Comments and questions are, as always, welcome. 
> > > > > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > > kernel git tree: > > > > > > > > > https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-iomap-fileio > > > > > > > > > --- > > > > > > > > > Commits in this patchset: > > > > > > > > > * fuse: implement the basic iomap mechanisms > > > > > > > > > * fuse_trace: implement the basic iomap mechanisms > > > > > > > > > * fuse: make debugging configurable at runtime > > > > > > > > > * fuse: adapt FUSE_DEV_IOC_BACKING_{OPEN,CLOSE} to add new iomap devices > > > > > > > > > * fuse_trace: adapt FUSE_DEV_IOC_BACKING_{OPEN,CLOSE} to add new iomap devices > > > > > > > > > * fuse: flush events and send FUSE_SYNCFS and FUSE_DESTROY on unmount > > > > > > > > > * fuse: create a per-inode flag for toggling iomap > > > > > > > > > * fuse_trace: create a per-inode flag for toggling iomap > > > > > > > > > * fuse: isolate the other regular file IO paths from iomap > > > > > > > > > * fuse: implement basic iomap reporting such as FIEMAP and SEEK_{DATA,HOLE} > > > > > > > > > * fuse_trace: implement basic iomap reporting such as FIEMAP and SEEK_{DATA,HOLE} > > > > > > > > > * fuse: implement direct IO with iomap > > > > > > > > > * fuse_trace: implement direct IO with iomap > > > > > > > > > * fuse: implement buffered IO with iomap > > > > > > > > > * fuse_trace: implement buffered IO with iomap > > > > > > > > > * fuse: implement large folios for iomap pagecache files > > > > > > > > > * fuse: use an unrestricted backing device with iomap pagecache io > > > > > > > > > * fuse: advertise support for iomap > > > > > > > > > * fuse: query filesystem geometry when using iomap > > > > > > > > > * fuse_trace: query filesystem geometry when using iomap > > > > > > > > > * fuse: implement fadvise for iomap files > > > > > > > > > * fuse: invalidate ranges of block devices being used for iomap > > > > > > > > > * fuse_trace: invalidate ranges of block devices being used for iomap > > > > > > 
> > > * fuse: implement inline data file IO via iomap > > > > > > > > > * fuse_trace: implement inline data file IO via iomap > > > > > > > > > * fuse: allow more statx fields > > > > > > > > > * fuse: support atomic writes with iomap > > > > > > > > > * fuse_trace: support atomic writes with iomap > > > > > > > > > * fuse: disable direct reclaim for any fuse server that uses iomap > > > > > > > > > * fuse: enable swapfile activation on iomap > > > > > > > > > * fuse: implement freeze and shutdowns for iomap filesystems > > > > > > > > > --- > > > > > > > > > fs/fuse/fuse_i.h | 161 +++ > > > > > > > > > fs/fuse/fuse_trace.h | 939 +++++++++++++++++++ > > > > > > > > > fs/fuse/iomap_i.h | 52 + > > > > > > > > > include/uapi/linux/fuse.h | 219 ++++ > > > > > > > > > fs/fuse/Kconfig | 48 + > > > > > > > > > fs/fuse/Makefile | 1 > > > > > > > > > fs/fuse/backing.c | 12 > > > > > > > > > fs/fuse/dev.c | 30 + > > > > > > > > > fs/fuse/dir.c | 120 ++ > > > > > > > > > fs/fuse/file.c | 133 ++- > > > > > > > > > fs/fuse/file_iomap.c | 2230 +++++++++++++++++++++++++++++++++++++++++++++ > > > > > > > > > fs/fuse/inode.c | 162 +++ > > > > > > > > > fs/fuse/iomode.c | 2 > > > > > > > > > fs/fuse/trace.c | 2 > > > > > > > > > 14 files changed, 4056 insertions(+), 55 deletions(-) > > > > > > > > > create mode 100644 fs/fuse/iomap_i.h > > > > > > > > > create mode 100644 fs/fuse/file_iomap.c > > > > > > > > > > > > > > > > > > > > > > > > ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCHSET v6 4/8] fuse: allow servers to use iomap for better file IO performance 2026-01-29 20:02 ` Darrick J. Wong 2026-01-29 22:41 ` Darrick J. Wong @ 2026-01-29 22:50 ` Joanne Koong 2026-01-29 23:12 ` Darrick J. Wong 1 sibling, 1 reply; 52+ messages in thread From: Joanne Koong @ 2026-01-29 22:50 UTC (permalink / raw) To: Darrick J. Wong; +Cc: miklos, bernd, neal, linux-ext4, linux-fsdevel On Thu, Jan 29, 2026 at 12:02 PM Darrick J. Wong <djwong@kernel.org> wrote: > > On Wed, Jan 28, 2026 at 05:12:54PM -0800, Joanne Koong wrote: > > <snip> > > > > > > > > Hrmm. Now that /is/ an interesting proposal. Does BPF have a data > > > > > > > structure that supports interval mappings? I think the existing bpf map > > > > > > > > > > > > Not yet but I don't see why a b+ tree like data strucutre couldn't be added. > > > > > > Maybe one workaround in the meantime that could work is using a sorted > > > > > > array map and doing binary search on that, until interval mappings can > > > > > > be natively supported? > > > > > > > > > > I guess, though I already had a C structure to borrow from xfs ;) > > > > > > > > > > > > only does key -> value. Also, is there an upper limit on the size of a > > > > > > > map? You could have hundreds of millions of maps for a very fragmented > > > > > > > regular file. > > > > > > > > > > > > If I'm remembering correctly, there's an upper limit on the number of > > > > > > map entries, which is bounded by u32 > > > > > > > > > > That's problematic, since files can have 64-bit logical block numbers. > > > > > > > > The key size supports 64-bits. The u32 bound would be the limit on the > > > > number of extents for the file. > > > > > > Oh, ok. If one treats the incore map as a cache and evicts things when > > > they get too old, then that would be fine. I misread that as an upper > > > limit on the *range* of the map entry keys. 
:/ > > > > I think for more complicated servers, the bpf prog handling for > > iomap_begin() would essentially just serve as a cache where if it's > > not found in the cache, then it sends off the FUSE_IOMAP_BEGIN request > > to the server. For servers that don't need as much complicated logic > > (eg famfs), the iomap_begin() logic would just be executed within the > > bpf prog itself. > > Yes, I like the fuse_iomap_begin logic flow of: > > 1. Try to use a mapping in the iext tree > 2. Call a BPF program to try to generate a mapping > 3. Issue a fuse command to userspace > > wherein #2 and #3 can signal that #1 should be retried. (This is > already provided by FUSE_IOMAP_TYPE_RETRY_CACHE, FWIW) > > That said, BPF doesn't expose an interval btree data structure. I think > it would be better to add the iext mapping cache and make it so that bpf > programs could call fuse_iomap_cache_{upsert,remove,lookup}. You could > use the interval tree too, but the iext tree has the advantage of higher > fanout factor. > > > > As it stands, I need to figure out a way to trim the iomap btree when > > > memory gets tight. Right now it'll drop the cache whenever someone > > > closes the file, but that won't help for long-life processes that open a > > > heavily fragmented file and never close it. > > > > > > A coding-intensive way to do that would be to register a shrinker and > > > deal with that, but ugh. A really stupid way would be to drop the whole > > > cache once you get beyond (say) 64k of memory usage (~2000 mappings). > > > > This kind of seems like another point in favor of giving userspace > > control of the caching layer. They could then implement whatever > > eviction policies they want. > > Note that userspace already can control the cached iomappings -- > FUSE_NOTIFY_IOMAP_UPSERT pushes a mapping into the iext tree, and > FUSE_NOTIFY_IOMAP_INVAL removes them. 
The fuse server can decide to This incurs round-trip context-switch costs though, which the bpf prog approach doesn't incur. > evict whenever it pleases, though admittedly the iext tree doesn't track > usage information of any kind, so how would the fuse server know? > > The static limit is merely the kernel's means to establish a hard limit > on the memory consumption of the iext tree, since it can't trust > userspace completely. > > > It also allows them to prepopulate the cache upfront (eg when > > servicing a file open request, if the file is below a certain size or > > if the server knows what'll be hot, it could put those extents into > > the map from the get-go). > > Hrm. I haven't tried issuing FUSE_NOTIFY_IOMAP_UPSERT during an open > call, but I suppose it's possible. > > > in my opinion, the fuse-iomap layer should try to be as simple/minimal > > and as generic as possible. I haven't read through iomap_cache.c yet > > but the header comment suggests it's adapted from the xfs extent tree > > Rudely copied, not adapted ;) > > I actually wonder if I should make a horrible macro to generate the > fuse_iext_* structures and functions, and then xfs_iext_tree.c and > fuse_iomap_cache.c can "share" that hairba^Wcode. > > > cache. As I understand it, different filesystem implementations have > > different caching architectures that are better suited for their use > > cases > > Err. The way this evolved is ... way too long to go into in this email. > Here's a truncated version; I can tell you the full story next week. > > Most filesystems store their file mapping data on disk in whatever > format the designers specified. When the pagecache asks them to read > or write the cache, they attach buffer heads to the folio, fill out the > buffer heads with the minimum mapping information needed to map the > folios to disk addresses. bios are constructed for each folio based on > what's in the bufferhead. 
> > This was fine for filesystems that map each block individually, such as > FFS/ext2/ext3/fat... > > > (I'm guessing that's the case, otherwise there would just be one > > general cache inside iomap all the filesystems would use?). It seems a > > ...but newer filesystems such as xfs/ext4/btrfs map a bunch of blocks at > a time. Each of them invented their own private incore mapping > structures to mirror the ondisk structure. xfs kept using the old > bufferheads into the early 2010s, ext4 is still using them, and btrfs > went its own way from the start. > > Eventually XFS grew its own internal extent-to-bio mapping code that > flipped the model -- rather than get a pagecache folio, map the folio to > blocks, and issue IOs based on the blocks, it would get the file > mapping, grab folios for the whole mapping, and issue bios for the batch > of folios. That's more efficient, but at this point we have a legacy > codebase problem for everything else in fs/. > > In 2019, hch and I decided to export the extent-to-bio mapping code from > xfs so that new filesystems could start with something cleaner than > bufferheads. In the past 7 years, nobody's added a new filesystem with > complex mapping requirements; they've only ported existing filesystems > to it, without further refactoring of their incore data structures. > That's why there's no generic iomap cache. Oh I see, so it actually *is* a generic cache? Just that the other filesystems haven't ported over to it yet? That changes my opinion a lot on this then. If it's a generic cache that pretty much any modern filesystem should use, then it seems reasonable to me to have it on the fuse iomap kernel side. Though in that case, it seems a lot cleaner imo if the cache could be ported to the iomap layer as a generic cache (which seems like it'd be useful anyways for other filesystems to use if/when they port over to it, if I'm understanding what you're saying correctly), and then fuse just call into that api. 
> > > lot better to me to just let the userspace server define that > > themselves. And selfishly from the fuse perspective, would be less > > Well if I turned the iext code into a template then fuse would only need > enough glue code to declare a template class and use it. The glue part > is only ... 230LOC. Nice, I think this would be a lot nicer / less of a headache in the future for fuse to maintain. > > > code we would have to maintain. And I guess too if some servers don't > > need caching (like famfs?), they could avoid that overhead. > > Hrm. Right now the struct fuse_iomap_cache is embedded in struct > fuse_inode, but that could be turned into a dynamic allocation. > > > > > > > > At one point I suggested to the famfs maintainer that it might be > > > > > > > easier/better to implement the interleaved mapping lookups as bpf > > > > > > > programs instead of being stuck with a fixed format in the fuse > > > > > > > userspace abi, but I don't know if he ever implemented that. > > > > > > > > > > > > This seems like a good use case for it too > > > > > > > > > > > > > > > Is this your > > > > > > > > assessment of it as well or do you think the server-side logic for > > > > > > > > iomap_begin()/iomap_end() is too complicated to make this realistic? > > > > > > > > Asking because I'm curious whether this direction makes sense, not > > > > > > > > because I think it would be a blocker for your series. > > > > > > > > > > > > > > For disk-based filesystems I think it would be difficult to model a bpf > > > > > > > program to do mappings, since they can basically point anywhere and be > > > > > > > of any size. > > > > > > > > > > > > Hmm I'm not familiar enough with disk-based filesystems to know what > > > > > > the "point anywhere and be of any size" means. For the mapping stuff, > > > > > > doesn't it just point to a block number? Or are you saying the problem > > > > > > would be there's too many mappings since a mapping could be any size? 
> > > > > > > > > > The second -- mappings can be any size, and unprivileged userspace can > > > > > control the mappings. > > > > > > > > If I'm understanding what you're saying here, this is the same > > > > discussion as the one above about the u32 bound, correct? > > > > > > A different thing -- file data mappings are irregularly sized, can > > > contain sparse holes, etc. Userspace controls the size and offset of > > > each mapping record (thanks to magic things like fallocate) so it'd be > > > very difficult to create a bpf program to generate mappings on the fly. > > > > Would the bpf prog have to generate mappings on the fly though? If the > > userspace does things like fallocate, those operations would still go > > through to the server as a regular request (eg FUSE_FALLOCATE) and on > > the server side, it'd add that to the map dynamically from userspace. > > That depends on the fuse server design. For simple things like famfs > where the layout is bog simple and there's no fancy features like > delayed allocation or unwritten extents, then you could probably get > away with a BPF program to generate the entire mapping set. I suspect an > object-store type filesystem (aka write a file once, close it, snapshot > it, and never change it again) might be good at landing all the file > data in relatively few extent mappings, and it could actually compile a > custom bpf program for that file and push it to the kernel. > > > Also you could have 2^33 mapping records for a file, so I think you > > > can't even write a bpf program that large. > > > > I think this depends on what map structure gets used. If there is > > native support added for b+ tree like data structures, I don't see why > > it wouldn't be able to. > > <nod> > > > > > > > I was thinking the issue would be more that there might be other logic > > > > > > inside ->iomap_begin()/->iomap_end() besides the mapping stuff that > > > > > > would need to be done that would be too out-of-scope for bpf.
But I > > > > > > think I need to read through the fuse4fs stuff to understand more what > > > > > > it's doing in those functions. > > > > > > > > Looking at fuse4fs logic cursorily, it seems doable? What I like about > > > > offloading this to bpf too is it would also then allow John's famfs to > > > > just go through your iomap plumbing as a use case of it instead of > > > > being an entirely separate thing. Though maybe there's some other > > > > reason for that that you guys have discussed prior. In any case, I'll > > > > ask this on John's main famfs patchset. It kind of seems to me that > > > > you guys are pretty much doing the exact same thing conceptually. > > > > > > Yes, though John's famfs has the nice property that memory controller > > > interleaving is mathematically regular and likely makes for a compact > > > bpf program. > > > > I tried out integrating the bpf hooks into fuse for iomap_begin() just > > to see if it was realistic and it seems relatively straightforward so > > far (though maybe the devil is in the details...). I used the > > Ok, now *that's* interesting! I guess I had better push the latest > fuse-iomap code ... but I cannot share a link, because I cannot get > through the @!#%%!!! kernel.org anubis bullcrap. > > So I generated a pull request and I *think* this munged URL will work > https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-service-container_2026-01-29 > > Or I guess you could just git-pull this: > https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git tags/fuse-service-container_2026-01-29 > > > drivers/hid/bpf/hid_bpf_struct_ops.c program as a model for how to set > > up the fuse bpf struct ops on the kernel side. calling it from > > file_iomap.c looks something like > > > > static int fuse_iomap_begin(...) > > { > > ... > > struct fuse_bpf_ops *bpf_ops = fuse_get_bpf_ops(); > > ... 
> > err = -EOPNOTSUPP; > > if (bpf_ops && bpf_ops->iomap_begin) > > err = bpf_ops->iomap_begin(inode, pos, len, flags, &outarg); > > if (err) > > err = fuse_simple_request(fm, &args); > > ... > > } > > I'm curious what the rest of the bpf integration code looks like. This is the code I had yesterday (I didn't know how to run the fuse4fs stuff, so I used passthrough_hp as the server and had the trigger go through statfs): diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 22ad9538dfc4..10c3939f4cf3 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -13,6 +13,7 @@ obj-$(CONFIG_VIRTIO_FS) += virtiofs.o fuse-y := trace.o # put trace.o first so we see ftrace errors sooner fuse-y += dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o fuse-y += iomode.o +fuse-y += fuse_bpf.o fuse-$(CONFIG_FUSE_DAX) += dax.o fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o backing.o fuse-$(CONFIG_SYSCTL) += sysctl.o diff --git a/fs/fuse/fuse_bpf.c b/fs/fuse/fuse_bpf.c new file mode 100644 index 000000000000..637cf152e997 --- /dev/null +++ b/fs/fuse/fuse_bpf.c @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/bpf.h> + +#include "fuse_i.h" +#include "fuse_dev_i.h" +#include "fuse_bpf.h" + +static struct btf *fuse_bpf_ops_btf; +static struct fuse_bpf_ops *active_bpf_ops; + +static int fuse_bpf_ops_init(struct btf *btf) +{ + fuse_bpf_ops_btf = btf; + return 0; +} + +static bool fuse_bpf_ops_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + return bpf_tracing_btf_ctx_access(off, size, type, prog, info); +} + +static int fuse_bpf_ops_check_member(const struct btf_type *t, + const struct btf_member *member, + const struct bpf_prog *prog) +{ + return 0; +} + +static int fuse_bpf_ops_btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, int size) +{ + return 0; +} + +static const struct bpf_verifier_ops fuse_bpf_verifier_ops 
= { + .get_func_proto = bpf_base_func_proto, + .is_valid_access = fuse_bpf_ops_is_valid_access, + .btf_struct_access = fuse_bpf_ops_btf_struct_access, +}; + +static int fuse_bpf_ops_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + const struct fuse_bpf_ops *u_ops = udata; + struct fuse_bpf_ops *ops = kdata; + u32 moff; + + moff = __btf_member_bit_offset(t, member) / 8; + switch (moff) { + case offsetof(struct fuse_bpf_ops, name): + if (bpf_obj_name_cpy(ops->name, u_ops->name, + sizeof(ops->name)) <= 0) + return -EINVAL; + return 1; /* Handled */ + } + + /* Not handled, use default */ + return 0; +} + +static int fuse_bpf_reg(void *kdata, struct bpf_link *link) +{ + struct fuse_bpf_ops *ops = kdata; + + active_bpf_ops = ops; + + printk("fuse_bpf: registered ops '%s'\n", ops->name); + + return 0; +} + +static void fuse_bpf_unreg(void *kdata, struct bpf_link *link) +{ + struct fuse_bpf_ops *ops = kdata; + + if (active_bpf_ops == ops) + active_bpf_ops = NULL; + + printk("fuse_bpf: unregistered ops '%s'\n", ops->name); +} + +static int __iomap_begin(struct inode *inode, loff_t pos, + loff_t length, unsigned int flags, + struct fuse_iomap_io *out_io) +{ + printk("stub __iomap_begin(). 
should never get called\n"); + return 0; +} + +static struct fuse_bpf_ops __fuse_bpf_ops = { + .iomap_begin = __iomap_begin, +}; + +static struct bpf_struct_ops fuse_bpf_struct_ops = { + .verifier_ops = &fuse_bpf_verifier_ops, + .init = fuse_bpf_ops_init, + .check_member = fuse_bpf_ops_check_member, + .init_member = fuse_bpf_ops_init_member, + .reg = fuse_bpf_reg, + .unreg = fuse_bpf_unreg, + .name = "fuse_bpf_ops", + .cfi_stubs = &__fuse_bpf_ops, + .owner = THIS_MODULE, +}; + +struct fuse_bpf_ops *fuse_get_bpf_ops(void) +{ + return active_bpf_ops; +} + +int fuse_bpf_init(void) +{ + return register_bpf_struct_ops(&fuse_bpf_struct_ops, fuse_bpf_ops); +} + +BTF_ID_LIST_GLOBAL_SINGLE(btf_fuse_bpf_ops_id, struct, fuse_bpf_ops) +BTF_ID_LIST_GLOBAL_SINGLE(btf_fuse_iomap_io_id, struct, fuse_iomap_io) diff --git a/fs/fuse/fuse_bpf.h b/fs/fuse/fuse_bpf.h new file mode 100644 index 000000000000..d9482b64642b --- /dev/null +++ b/fs/fuse/fuse_bpf.h @@ -0,0 +1,29 @@ +#ifndef _FS_FUSE_BPF_H +#define _FS_FUSE_BPF_H + +#include "fuse_i.h" +#include <linux/iomap.h> + +/* copied from darrick's iomap patchset */ +struct fuse_iomap_io { + uint64_t offset; /* file offset of mapping, bytes */ + uint64_t length; /* length of mapping, bytes */ + uint64_t addr; /* disk offset of mapping, bytes */ + uint16_t type; /* FUSE_IOMAP_TYPE_* */ + uint16_t flags; /* FUSE_IOMAP_F_* */ + uint32_t dev; /* device cookie */ +}; + +struct fuse_bpf_ops { + int (*iomap_begin)(struct inode *inode, loff_t pos, + loff_t length, unsigned int flags, + struct fuse_iomap_io *out_io__nullable); + + /* Required for bpf struct_ops */ + char name[16]; +}; + +struct fuse_bpf_ops *fuse_get_bpf_ops(void); +int fuse_bpf_init(void); + +#endif /* _FS_FUSE_BPF_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 819e50d66622..78ae4425e863 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -7,6 +7,7 @@ */ #include "fuse_i.h" +#include "fuse_bpf.h" #include "fuse_dev_i.h" #include "dev_uring_i.h" @@ -662,6 +663,21 @@ 
static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) struct fuse_statfs_out outarg; int err; + printk("fuse_statfs() called!\n"); + + struct fuse_bpf_ops *bpf_ops = fuse_get_bpf_ops(); + struct fuse_iomap_io out_io = {}; + + /* call BPF prog if attached */ + if (bpf_ops && bpf_ops->iomap_begin) { + err = bpf_ops->iomap_begin(d_inode(dentry), 111, 222, + 333, &out_io); + printk("bpf prog returned: err=%d, out_io->offset=%llu\n", + err, out_io.offset); + } else { + printk("did not find a bpf prog\n"); + } + if (!fuse_allow_current_process(fm->fc)) { buf->f_type = FUSE_SUPER_MAGIC; return 0; @@ -2194,6 +2210,12 @@ static int __init fuse_fs_init(void) if (!fuse_inode_cachep) goto out; + err = fuse_bpf_init(); + if (err) { + printk("fuse_bpf_init() failed %d\n", err); + goto out2; + } + err = register_fuseblk(); if (err) goto out2; These are the changes for the libfuse side: https://github.com/joannekoong/libfuse/commit/1a6198f17dd215c93fd82ec020641c079aae1241 To run it, run "make clean; make" from libfuse/example, and then sudo ninja from libfuse/build, and then sudo ~/libfuse/build/example/passthrough_hp ~/liburing ~/mounts/tmp2 --nopassthrough --foreground Then from ~/mounts/tmp2, run "stat -f filename" and that will show a few things: on the kernel side it'll print a statement like "bpf prog returned: err=0, out_io->offset=999" which shows that the prog can return back a "struct fuse_iomap_io" with all the requisite metadata filled out on the server-side, if you run " sudo cat /sys/kernel/debug/tracing/trace_pipe", that should print out " bpf_trace_printk: fuse iomap_begin: inode=ffff8a75cbe63800 pos=111 len=222 flags=333" which shows the prog can take in whatever pos/len/flags values you pass it from the fuse kernel > > > and I was able to verify that iomap_begin() is able to return back > > populated outarg fields from the bpf prog. 
If we were to actually > > implement it i'm sure it'd be more complicated (eg we'd need to make > > the fuse_bpf_ops registered per-connection, etc) but on the whole it > > What is a fuse_bpf_ops? I'm assuming that's the attachment point for a > bpf program that the fuse server would compile? In which case, yes, I > think that ought to be per-connection. > > So the bpf program can examine the struct inode, and the pos/len/opflags > field; and from that information it has to write the appropriate fields > in &outarg? That's new, I didn't think bpf was allowed to write to > kernel memory. But it's been a few years since I last touched the bpf > internals. It's been a few years since I looked at bpf as well but yes fuse_bpf_ops is basically the kernel-side struct_ops interface for getting fuse to trigger the attached bpf program's callback implementations. When the bpf program is loaded in, its callback functions get swapped in and fuse_bpf_ops's function pointers now point to the bpf's callback functions, so when you invoke fuse_bpf_ops's callbacks, it calls into the bpf prog's callback. > > Some bpf programs might just know how to fill out outarg on their own > (e.g. famfs memory interleaving) but other bpf programs might perform a > range query on some imaginary bpf-interval-tree wherein you can do a > fast lookup based on (inumber, pos, len)? > > I guess that's an interesting question -- would each fuse connection > have one big bpf-interval-tree? Or would you shard things by inode to > reduce contention? And if you sharded like that, then would you need a > fuse_bpf_ops per inode? Hmm the cache's tree is per-inode as I understand it, so probably each inode would have its own tree / map? > > (I'm imagining that the fuse_bpf_ops might be where you'd stash the root > of the bpf data structure, but I know nothing of bpf internals ;)) > > Rolling on: how easy is it for a userspace program to compile and upload > bpf programs into the kernel? 
I've played around with bcc enough to > write some fairly stupid latency tracing tools for XFS, but at the end > of the day it's still python scripts feeding a string full of maybe-C into > whatever the BPF does under the hood. I found it pretty easy with the libbpf library which will generate the skeletons and provide the api helpers to load it in and other stuff (the libfuse link I pasted above for the userspace side has the code for compiling it and loading it). > > I /think/ it calls clang on the provided text, links that against the > current kernel's header files, and pushes the compiled bpf binary into > the kernel, right? So fuse4fs would have to learn how to do that; and > now fuse4fs has a runtime dependency on libllvm. I think the libbpf library takes care of a lot of that for you. I think fuse4fs would just need to do the same thing as in that libfuse link above. > > And while I'm on the topic of fuse-bpf uapi: It's ok for us to expose > primitive-typed variables (pos/len/opflags) and existing fuse uapi > directly to a bpf program, but I don't think we should expose struct > inode/fuse_inode. Maybe just fuse_inode::nodeid? If we're careful not > to allow #include'ing structured types in the fuse bpf code, then > perhaps the bpf programs could be compiled at the same time as the fuse > server. I agree, I think if we do decide to go further with this approach, we'll need to define exactly what the interfaces should be that would be safe to expose. > > > seems doable. My worry is that if we land the iomap cache patchset now > > then we can't remove it in the future without breaking backwards > > compatibility for being a performance regression (though maybe we can > > since the fuse-iomap stuff is experimental?), so imo it'd be great if > > I don't think it's a huge problem to remove functionality while the > EXPERIMENTAL warnings are in place.
We'd forever lose the command codes > for FUSE_NOTIFY_IOMAP_UPSERT and FUSE_NOTIFY_IOMAP_INVAL, but we've only > used 12 out of INT_MAX so that's not likely to be a concern. > > > we figured out what direction we want to go before landing the cache > > stuff. And I think we need to have this conversation too on the main > > famfs patchset (eg whether it should go through your general iomap > > plumbing with bpf helpers vs. being a separate implementation) since > > once that lands, it'd be irrevocable. > > I've of two minds on that -- John got here first, so I don't want to > delay his patchset whilst I slowly work on this thing. OTOH from an > architecture standpoint we probably ought to push for three ways for a > fuse server to upload mappings: I think if John/Miklos wanted to go in this direction, all that would be needed from your series is the first one or two patches that define the basic fuse_iomap_io / fuse_iomap_begin / fuse_iomap_end structs and init config plumbing. Thanks, Joanne > > 1. Upserting mappings with arbitrary offset and size into a cache > 2. Self contained bpf program that can generate any mapping > 3. Sprawling bpf program that can read any other artifacts that another > bpf program might have set up for it > > But yeah, let's involve John. > > --D > > > > > Thanks, > > Joanne > > > > > > --D > > > > > > > Thanks, > > > > Joanne > > > > > > > > > > > > > > <nod> > > > > > > > > > > --D > > > > > > > > > > > > > > > > > Thanks, > > > > > > Joanne > > > > > > > > > > > > > > > > > > > > OTOH it would be enormously hilarious to me if one could load a file > > > > > > > mapping predictive model into the kernel as a bpf program and use that > > > > > > > as a first tier before checking the in-memory btree mapping cache from > > > > > > > patchset 7. Quite a few years ago now there was a FAST paper > > > > > > > establishing that even a stupid linear regression model could in theory > > > > > > > beat a disk btree lookup. 
> > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > Thanks, > > > > > > > > Joanne > > > > > > > > > > > > > > > > > > > > > > > > > > If you're going to start using this code, I strongly recommend pulling > > > > > > > > > from my git trees, which are linked below. > > > > > > > > > > > > > > > > > > This has been running on the djcloud for months with no problems. Enjoy! > > > > > > > > > Comments and questions are, as always, welcome. > > > > > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > > kernel git tree: > > > > > > > > > https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-iomap-fileio > > > > > > > > > --- > > > > > > > > > Commits in this patchset: > > > > > > > > > * fuse: implement the basic iomap mechanisms > > > > > > > > > * fuse_trace: implement the basic iomap mechanisms > > > > > > > > > * fuse: make debugging configurable at runtime > > > > > > > > > * fuse: adapt FUSE_DEV_IOC_BACKING_{OPEN,CLOSE} to add new iomap devices > > > > > > > > > * fuse_trace: adapt FUSE_DEV_IOC_BACKING_{OPEN,CLOSE} to add new iomap devices > > > > > > > > > * fuse: flush events and send FUSE_SYNCFS and FUSE_DESTROY on unmount > > > > > > > > > * fuse: create a per-inode flag for toggling iomap > > > > > > > > > * fuse_trace: create a per-inode flag for toggling iomap > > > > > > > > > * fuse: isolate the other regular file IO paths from iomap > > > > > > > > > * fuse: implement basic iomap reporting such as FIEMAP and SEEK_{DATA,HOLE} > > > > > > > > > * fuse_trace: implement basic iomap reporting such as FIEMAP and SEEK_{DATA,HOLE} > > > > > > > > > * fuse: implement direct IO with iomap > > > > > > > > > * fuse_trace: implement direct IO with iomap > > > > > > > > > * fuse: implement buffered IO with iomap > > > > > > > > > * fuse_trace: implement buffered IO with iomap > > > > > > > > > * fuse: implement large folios for iomap pagecache files > > > > > > > > > * fuse: use an unrestricted backing device with iomap 
pagecache io > > > > > > > > > * fuse: advertise support for iomap > > > > > > > > > * fuse: query filesystem geometry when using iomap > > > > > > > > > * fuse_trace: query filesystem geometry when using iomap > > > > > > > > > * fuse: implement fadvise for iomap files > > > > > > > > > * fuse: invalidate ranges of block devices being used for iomap > > > > > > > > > * fuse_trace: invalidate ranges of block devices being used for iomap > > > > > > > > > * fuse: implement inline data file IO via iomap > > > > > > > > > * fuse_trace: implement inline data file IO via iomap > > > > > > > > > * fuse: allow more statx fields > > > > > > > > > * fuse: support atomic writes with iomap > > > > > > > > > * fuse_trace: support atomic writes with iomap > > > > > > > > > * fuse: disable direct reclaim for any fuse server that uses iomap > > > > > > > > > * fuse: enable swapfile activation on iomap > > > > > > > > > * fuse: implement freeze and shutdowns for iomap filesystems > > > > > > > > > --- > > > > > > > > > fs/fuse/fuse_i.h | 161 +++ > > > > > > > > > fs/fuse/fuse_trace.h | 939 +++++++++++++++++++ > > > > > > > > > fs/fuse/iomap_i.h | 52 + > > > > > > > > > include/uapi/linux/fuse.h | 219 ++++ > > > > > > > > > fs/fuse/Kconfig | 48 + > > > > > > > > > fs/fuse/Makefile | 1 > > > > > > > > > fs/fuse/backing.c | 12 > > > > > > > > > fs/fuse/dev.c | 30 + > > > > > > > > > fs/fuse/dir.c | 120 ++ > > > > > > > > > fs/fuse/file.c | 133 ++- > > > > > > > > > fs/fuse/file_iomap.c | 2230 +++++++++++++++++++++++++++++++++++++++++++++ > > > > > > > > > fs/fuse/inode.c | 162 +++ > > > > > > > > > fs/fuse/iomode.c | 2 > > > > > > > > > fs/fuse/trace.c | 2 > > > > > > > > > 14 files changed, 4056 insertions(+), 55 deletions(-) > > > > > > > > > create mode 100644 fs/fuse/iomap_i.h > > > > > > > > > create mode 100644 fs/fuse/file_iomap.c > > > > > > > > > > > > > > > > > > > > > > > ^ permalink raw reply related [flat|nested] 52+ messages in thread
* Re: [PATCHSET v6 4/8] fuse: allow servers to use iomap for better file IO performance 2026-01-29 22:50 ` Joanne Koong @ 2026-01-29 23:12 ` Darrick J. Wong 0 siblings, 0 replies; 52+ messages in thread From: Darrick J. Wong @ 2026-01-29 23:12 UTC (permalink / raw) To: Joanne Koong; +Cc: miklos, bernd, neal, linux-ext4, linux-fsdevel On Thu, Jan 29, 2026 at 02:50:23PM -0800, Joanne Koong wrote: > On Thu, Jan 29, 2026 at 12:02 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > On Wed, Jan 28, 2026 at 05:12:54PM -0800, Joanne Koong wrote: > > > > <snip> > > > > > > > > > > Hrmm. Now that /is/ an interesting proposal. Does BPF have a data > > > > > > > > structure that supports interval mappings? I think the existing bpf map > > > > > > > > > > > > > > Not yet but I don't see why a b+ tree like data strucutre couldn't be added. > > > > > > > Maybe one workaround in the meantime that could work is using a sorted > > > > > > > array map and doing binary search on that, until interval mappings can > > > > > > > be natively supported? > > > > > > > > > > > > I guess, though I already had a C structure to borrow from xfs ;) > > > > > > > > > > > > > > only does key -> value. Also, is there an upper limit on the size of a > > > > > > > > map? You could have hundreds of millions of maps for a very fragmented > > > > > > > > regular file. > > > > > > > > > > > > > > If I'm remembering correctly, there's an upper limit on the number of > > > > > > > map entries, which is bounded by u32 > > > > > > > > > > > > That's problematic, since files can have 64-bit logical block numbers. > > > > > > > > > > The key size supports 64-bits. The u32 bound would be the limit on the > > > > > number of extents for the file. > > > > > > > > Oh, ok. If one treats the incore map as a cache and evicts things when > > > > they get too old, then that would be fine. I misread that as an upper > > > > limit on the *range* of the map entry keys. 
:/ > > > > > > I think for more complicated servers, the bpf prog handling for > > > iomap_begin() would essentially just serve as a cache where if it's > > > not found in the cache, then it sends off the FUSE_IOMAP_BEGIN request > > > to the server. For servers that don't need as much complicated logic > > > (eg famfs), the iomap_begin() logic would just be executed within the > > > bpf prog itself. > > > > Yes, I like the fuse_iomap_begin logic flow of: > > > > 1. Try to use a mapping in the iext tree > > 2. Call a BPF program to try to generate a mapping > > 3. Issue a fuse command to userspace > > > > wherein #2 and #3 can signal that #1 should be retried. (This is > > already provided by FUSE_IOMAP_TYPE_RETRY_CACHE, FWIW) > > > > That said, BPF doesn't expose an interval btree data structure. I think > > it would be better to add the iext mapping cache and make it so that bpf > > programs could call fuse_iomap_cache_{upsert,remove,lookup}. You could > > use the interval tree too, but the iext tree has the advantage of higher > > fanout factor. > > > > > > As it stands, I need to figure out a way to trim the iomap btree when > > > > memory gets tight. Right now it'll drop the cache whenever someone > > > > closes the file, but that won't help for long-life processes that open a > > > > heavily fragmented file and never close it. > > > > > > > > A coding-intensive way to do that would be to register a shrinker and > > > > deal with that, but ugh. A really stupid way would be to drop the whole > > > > cache once you get beyond (say) 64k of memory usage (~2000 mappings). > > > > > > This kind of seems like another point in favor of giving userspace > > > control of the caching layer. They could then implement whatever > > > eviction policies they want. > > > > Note that userspace already can control the cached iomappings -- > > FUSE_NOTIFY_IOMAP_UPSERT pushes a mapping into the iext tree, and > > FUSE_NOTIFY_IOMAP_INVAL removes them. 
The fuse server can decide to > > This incurs round-trip context-switch costs though, which the bpf prog > approach doesn't incur. I realize that, but I think we're simply headed towards a fast path wherein you can upload a bpf program; and a slow path for big complex filesystems like ext4. ext4 always gonna be slow. :) > > evict whenever it pleases, though admittedly the iext tree doesn't track > > usage information of any kind, so how would the fuse server know? > > > > The static limit is merely the kernel's means to establish a hard limit > > on the memory consumption of the iext tree, since it can't trust > > userspace completely. > > > > > It also allows them to prepopulate the cache upfront (eg when > > > servicing a file open request, if the file is below a certain size or > > > if the server knows what'll be hot, it could put those extents into > > > the map from the get-go). > > > > Hrm. I haven't tried issuing FUSE_NOTIFY_IOMAP_UPSERT during an open > > call, but I suppose it's possible. (It's quite possible; running through QA now) > > > in my opinion, the fuse-iomap layer should try to be as simple/minimal > > > and as generic as possible. I haven't read through iomap_cache.c yet > > > but the header comment suggests it's adapted from the xfs extent tree > > > > Rudely copied, not adapted ;) > > > > I actually wonder if I should make a horrible macro to generate the > > fuse_iext_* structures and functions, and then xfs_iext_tree.c and > > fuse_iomap_cache.c can "share" that hairba^Wcode. > > > > > cache. As I understand it, different filesystem implementations have > > > different caching architectures that are better suited for their use > > > cases > > > > Err. The way this evolved is ... way too long to go into in this email. > > Here's a truncated version; I can tell you the full story next week. > > > > Most filesystems store their file mapping data on disk in whatever > > format the designers specified. 
When the pagecache asks them to read > > or write the cache, they attach buffer heads to the folio, fill out the > > buffer heads with the minimum mapping information needed to map the > > folios to disk addresses. bios are constructed for each folio based on > > what's in the bufferhead. > > > > This was fine for filesystems that map each block individually, such as > > FFS/ext2/ext3/fat... > > > > > (I'm guessing that's the case, otherwise there would just be one > > > general cache inside iomap all the filesystems would use?). It seems a > > > > ...but newer filesystems such as xfs/ext4/btrfs map a bunch of blocks at > > a time. Each of them invented their own private incore mapping > > structures to mirror the ondisk structure. xfs kept using the old > > bufferheads into the early 2010s, ext4 is still using them, and btrfs > > went its own way from the start. > > > > Eventually XFS grew its own internal extent-to-bio mapping code that > > flipped the model -- rather than get a pagecache folio, map the folio to > > blocks, and issue IOs based on the blocks, it would get the file > > mapping, grab folios for the whole mapping, and issue bios for the batch > > of folios. That's more efficient, but at this point we have a legacy > > codebase problem for everything else in fs/. > > > > In 2019, hch and I decided to export the extent-to-bio mapping code from > > xfs so that new filesystems could start with something cleaner than > > bufferheads. In the past 7 years, nobody's added a new filesystem with > > complex mapping requirements; they've only ported existing filesystems > > to it, without further refactoring of their incore data structures. > > That's why there's no generic iomap cache. > > Oh I see, so it actually *is* a generic cache? Just that the other No, quite the opposite. fs/iomap/ handles individual mapping records, but does not itself implement a cache. 
A filesystem /could/ decide that it wants to translate its ondisk mappings directly into a struct iomap and then push that struct iomap object into a cache, but it would have to implement the incore cache itself. "struct iomap" is merely a generic interface; prior to the hoist, the IO mapping code in xfs directly handled struct xfs_bmbt_irec objects. Now xfs translates xfs_bmbt_irec objects from its own cache into a struct iomap and returns that from ->iomap_begin. That said it wouldn't be terrible to provide a generic cache implementation. But the hard part is that caching struct iomap objects would result in more memory use for XFS because xfs_bmbt_irec is bitpacked. > filesystems haven't ported over to it yet? That changes my opinion a > lot on this then. If it's a generic cache that pretty much any modern > filesystem should use, then it seems reasonable to me to have it on > the fuse iomap kernel side. Though in that case, it seems a lot > cleaner imo if the cache could be ported to the iomap layer as a > generic cache (which seems like it'd be useful anyways for other > filesystems to use if/when they port over to it, if I'm understanding > what you're saying correctly), and then fuse just call into that api. <nod> That sounds like a good long term goal. For now let's get it working inside fuse and then we can see if any other filesystems are interested. Also unclear: do filesystems want to hang more data off the incore mapping data than the dataset that iomap itself needs? That would hamper attempts to make a generic iomap cache. > > > > > lot better to me to just let the userspace server define that > > > themselves. And selfishly from the fuse perspective, would be less > > > > Well if I turned the iext code into a template then fuse would only need > > enough glue code to declare a template class and use it. The glue part > > is only ... 230LOC. > > Nice, I think this would be a lot nicer / less of a headache in the > future for fuse to maintain. 
> > > > > > code we would have to maintain. And I guess too if some servers don't > > > need caching (like famfs?), they could avoid that overhead. > > > > Hrm. Right now the struct fuse_iomap_cache is embedded in struct > > fuse_inode, but that could be turned into a dynamic allocation. > > > > > > > > > > At one point I suggested to the famfs maintainer that it might be > > > > > > > > easier/better to implement the interleaved mapping lookups as bpf > > > > > > > > programs instead of being stuck with a fixed format in the fuse > > > > > > > > userspace abi, but I don't know if he ever implemented that. > > > > > > > > > > > > > > This seems like a good use case for it too > > > > > > > > > > > > > > > > > Is this your > > > > > > > > > assessment of it as well or do you think the server-side logic for > > > > > > > > > iomap_begin()/iomap_end() is too complicated to make this realistic? > > > > > > > > > Asking because I'm curious whether this direction makes sense, not > > > > > > > > > because I think it would be a blocker for your series. > > > > > > > > > > > > > > > > For disk-based filesystems I think it would be difficult to model a bpf > > > > > > > > program to do mappings, since they can basically point anywhere and be > > > > > > > > of any size. > > > > > > > > > > > > > > Hmm I'm not familiar enough with disk-based filesystems to know what > > > > > > > the "point anywhere and be of any size" means. For the mapping stuff, > > > > > > > doesn't it just point to a block number? Or are you saying the problem > > > > > > > would be there's too many mappings since a mapping could be any size? > > > > > > > > > > > > The second -- mappings can be any size, and unprivileged userspace can > > > > > > control the mappings. > > > > > > > > > > If I'm understanding what you're saying here, this is the same > > > > > discussion as the one above about the u32 bound, correct? 
> > > > > > > > A different thing -- file data mappings are irregularly sized, can > > > > contain sparse holes, etc. Userspace controls the size and offset of > > > > each mapping record (thanks to magic things like fallocate) so it'd be > > > > very difficult to create a bpf program to generate mappings on the fly. > > > > > > Would the bpf prog have to generate mappings on the fly though? If the > > > userspace does things like fallocate, those operations would still go > > > through to the server as a regular request (eg FUSE_FALLOCATE) and on > > > the server side, it'd add that to the map dynamically from userspace. > > > > That depends on the fuse server design. For simple things like famfs > > where the layout is bog simple and there's no fancy features like > > delayed allocation or unwritten extents, then you could probably get > > away a BPF program to generate the entire mapping set. I suspect an > > object-store type filesystem (aka write a file once, close it, snapshot > > it, and never change it again) might be good at landing all the file > > data in relatively few extent mappings, and it could actually compile a > > custom bpf program for that file and push it to the kernel. > > > > > > Also you could have 2^33 mappings records for a file, so I think you > > > > can't even write a bpf program that large. > > > > > > I think this depends on what map structure gets used. If there is > > > native support added for b+ tree like data structures, I don't see why > > > it wouldn't be able to. > > > > <nod> > > > > > > > > > I was thinking the issue would be more that there might be other logic > > > > > > > inside ->iomap_begin()/->iomap_end() besides the mapping stuff that > > > > > > > would need to be done that would be too out-of-scope for bpf. But I > > > > > > > think I need to read through the fuse4fs stuff to understand more what > > > > > > > it's doing in those functions. > > > > > > > > > > Looking at fuse4fs logic cursorily, it seems doable? 
What I like about > > > > > offloading this to bpf too is it would also then allow John's famfs to > > > > > just go through your iomap plumbing as a use case of it instead of > > > > > being an entirely separate thing. Though maybe there's some other > > > > > reason for that that you guys have discussed prior. In any case, I'll > > > > > ask this on John's main famfs patchset. It kind of seems to me that > > > > > you guys are pretty much doing the exact same thing conceptually. > > > > > > > > Yes, though John's famfs has the nice property that memory controller > > > > interleaving is mathematically regular and likely makes for a compact > > > > bpf program. > > > > > > I tried out integrating the bpf hooks into fuse for iomap_begin() just > > > to see if it was realistic and it seems relatively straightforward so > > > far (though maybe the devil is in the details...). I used the > > > > Ok, now *that's* interesting! I guess I had better push the latest > > fuse-iomap code ... but I cannot share a link, because I cannot get > > through the @!#%%!!! kernel.org anubis bullcrap. > > > > So I generated a pull request and I *think* this munged URL will work > > https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-service-container_2026-01-29 > > > > Or I guess you could just git-pull this: > > https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git tags/fuse-service-container_2026-01-29 > > > > > drivers/hid/bpf/hid_bpf_struct_ops.c program as a model for how to set > > > up the fuse bpf struct ops on the kernel side. calling it from > > > file_iomap.c looks something like > > > > > > static int fuse_iomap_begin(...) > > > { > > > ... > > > struct fuse_bpf_ops *bpf_ops = fuse_get_bpf_ops(); > > > ... > > > err = -EOPNOTSUPP; > > > if (bpf_ops && bpf_ops->iomap_begin) > > > err = bpf_ops->iomap_begin(inode, pos, len, flags, &outarg); > > > if (err) > > > err = fuse_simple_request(fm, &args); > > > ... 
> > > } > > > > I'm curious what the rest of the bpf integration code looks like. > > This is the code I had yesterday (I didn't know how to run the fuse4fs > stuff, so I used passthrough_hp as the server and had the trigger go > through statfs): Yeah, it's nasty ... you have to go build a patched libfuse3, then grab the modified e2fsprogs, modify your ./configure commandline to point itself at the built libfuse3, and then it'll build fuse4fs. > > diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile > index 22ad9538dfc4..10c3939f4cf3 100644 > --- a/fs/fuse/Makefile > +++ b/fs/fuse/Makefile > @@ -13,6 +13,7 @@ obj-$(CONFIG_VIRTIO_FS) += virtiofs.o > fuse-y := trace.o # put trace.o first so we see ftrace errors sooner > fuse-y += dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o > fuse-y += iomode.o > +fuse-y += fuse_bpf.o > fuse-$(CONFIG_FUSE_DAX) += dax.o > fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o backing.o > fuse-$(CONFIG_SYSCTL) += sysctl.o > diff --git a/fs/fuse/fuse_bpf.c b/fs/fuse/fuse_bpf.c > new file mode 100644 > index 000000000000..637cf152e997 > --- /dev/null > +++ b/fs/fuse/fuse_bpf.c > @@ -0,0 +1,123 @@ > +// SPDX-License-Identifier: GPL-2.0 > + > +#include <linux/bpf.h> > + > +#include "fuse_i.h" > +#include "fuse_dev_i.h" > +#include "fuse_bpf.h" > + > +static struct btf *fuse_bpf_ops_btf; > +static struct fuse_bpf_ops *active_bpf_ops; > + > +static int fuse_bpf_ops_init(struct btf *btf) > +{ > + fuse_bpf_ops_btf = btf; > + return 0; > +} > + > +static bool fuse_bpf_ops_is_valid_access(int off, int size, > + enum bpf_access_type type, > + const struct bpf_prog *prog, > + struct bpf_insn_access_aux *info) > +{ > + return bpf_tracing_btf_ctx_access(off, size, type, prog, info); > +} > + > +static int fuse_bpf_ops_check_member(const struct btf_type *t, > + const struct btf_member *member, > + const struct bpf_prog *prog) > +{ > + return 0; > +} > + > +static int fuse_bpf_ops_btf_struct_access(struct bpf_verifier_log *log, > + const 
struct bpf_reg_state *reg, > + int off, int size) > +{ > + return 0; > +} > + > +static const struct bpf_verifier_ops fuse_bpf_verifier_ops = { > + .get_func_proto = bpf_base_func_proto, > + .is_valid_access = fuse_bpf_ops_is_valid_access, > + .btf_struct_access = fuse_bpf_ops_btf_struct_access, > +}; > + > +static int fuse_bpf_ops_init_member(const struct btf_type *t, > + const struct btf_member *member, > + void *kdata, const void *udata) > +{ > + const struct fuse_bpf_ops *u_ops = udata; > + struct fuse_bpf_ops *ops = kdata; > + u32 moff; > + > + moff = __btf_member_bit_offset(t, member) / 8; > + switch (moff) { > + case offsetof(struct fuse_bpf_ops, name): > + if (bpf_obj_name_cpy(ops->name, u_ops->name, > + sizeof(ops->name)) <= 0) > + return -EINVAL; > + return 1; /* Handled */ > + } > + > + /* Not handled, use default */ > + return 0; > +} > + > +static int fuse_bpf_reg(void *kdata, struct bpf_link *link) > +{ > + struct fuse_bpf_ops *ops = kdata; > + > + active_bpf_ops = ops; > + > + printk("fuse_bpf: registered ops '%s'\n", ops->name); > + > + return 0; > +} > + > +static void fuse_bpf_unreg(void *kdata, struct bpf_link *link) > +{ > + struct fuse_bpf_ops *ops = kdata; > + > + if (active_bpf_ops == ops) > + active_bpf_ops = NULL; > + > + printk("fuse_bpf: unregistered ops '%s'\n", ops->name); > +} > + > +static int __iomap_begin(struct inode *inode, loff_t pos, > + loff_t length, unsigned int flags, > + struct fuse_iomap_io *out_io) > +{ > + printk("stub __iomap_begin(). 
should never get called\n"); > + return 0; > +} > + > +static struct fuse_bpf_ops __fuse_bpf_ops = { > + .iomap_begin = __iomap_begin, > +}; > + > +static struct bpf_struct_ops fuse_bpf_struct_ops = { > + .verifier_ops = &fuse_bpf_verifier_ops, > + .init = fuse_bpf_ops_init, > + .check_member = fuse_bpf_ops_check_member, > + .init_member = fuse_bpf_ops_init_member, > + .reg = fuse_bpf_reg, > + .unreg = fuse_bpf_unreg, > + .name = "fuse_bpf_ops", > + .cfi_stubs = &__fuse_bpf_ops, > + .owner = THIS_MODULE, > +}; > + > +struct fuse_bpf_ops *fuse_get_bpf_ops(void) > +{ > + return active_bpf_ops; > +} > + > +int fuse_bpf_init(void) > +{ > + return register_bpf_struct_ops(&fuse_bpf_struct_ops, fuse_bpf_ops); > +} > + > +BTF_ID_LIST_GLOBAL_SINGLE(btf_fuse_bpf_ops_id, struct, fuse_bpf_ops) > +BTF_ID_LIST_GLOBAL_SINGLE(btf_fuse_iomap_io_id, struct, fuse_iomap_io) > diff --git a/fs/fuse/fuse_bpf.h b/fs/fuse/fuse_bpf.h > new file mode 100644 > index 000000000000..d9482b64642b > --- /dev/null > +++ b/fs/fuse/fuse_bpf.h > @@ -0,0 +1,29 @@ > +#ifndef _FS_FUSE_BPF_H > +#define _FS_FUSE_BPF_H > + > +#include "fuse_i.h" > +#include <linux/iomap.h> > + > +/* copied from darrick's iomap patchset */ > +struct fuse_iomap_io { > + uint64_t offset; /* file offset of mapping, bytes */ > + uint64_t length; /* length of mapping, bytes */ > + uint64_t addr; /* disk offset of mapping, bytes */ > + uint16_t type; /* FUSE_IOMAP_TYPE_* */ > + uint16_t flags; /* FUSE_IOMAP_F_* */ > + uint32_t dev; /* device cookie */ > +}; > + > +struct fuse_bpf_ops { > + int (*iomap_begin)(struct inode *inode, loff_t pos, > + loff_t length, unsigned int flags, > + struct fuse_iomap_io *out_io__nullable); > + > + /* Required for bpf struct_ops */ > + char name[16]; > +}; > + > +struct fuse_bpf_ops *fuse_get_bpf_ops(void); > +int fuse_bpf_init(void); > + > +#endif /* _FS_FUSE_BPF_H */ > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c > index 819e50d66622..78ae4425e863 100644 > --- a/fs/fuse/inode.c > +++ 
b/fs/fuse/inode.c > @@ -7,6 +7,7 @@ > */ > > #include "fuse_i.h" > +#include "fuse_bpf.h" > #include "fuse_dev_i.h" > #include "dev_uring_i.h" > > @@ -662,6 +663,21 @@ static int fuse_statfs(struct dentry *dentry, > struct kstatfs *buf) > struct fuse_statfs_out outarg; > int err; > > + printk("fuse_statfs() called!\n"); > + > + struct fuse_bpf_ops *bpf_ops = fuse_get_bpf_ops(); > + struct fuse_iomap_io out_io = {}; > + > + /* call BPF prog if attached */ > + if (bpf_ops && bpf_ops->iomap_begin) { > + err = bpf_ops->iomap_begin(d_inode(dentry), 111, 222, > + 333, &out_io); > + printk("bpf prog returned: err=%d, out_io->offset=%llu\n", > + err, out_io.offset); > + } else { > + printk("did not find a bpf prog\n"); > + } > + > if (!fuse_allow_current_process(fm->fc)) { > buf->f_type = FUSE_SUPER_MAGIC; > return 0; > @@ -2194,6 +2210,12 @@ static int __init fuse_fs_init(void) > if (!fuse_inode_cachep) > goto out; > > + err = fuse_bpf_init(); > + if (err) { > + printk("fuse_bpf_init() failed %d\n", err); > + goto out2; > + } > + > err = register_fuseblk(); > if (err) > goto out2; > > > These are the changes for the libfuse side: > https://github.com/joannekoong/libfuse/commit/1a6198f17dd215c93fd82ec020641c079aae1241 /me sees "SEC("struct_ops/iomap_begin")" ohhhh my. Well at least now I know that C programs can do bpf stuff via libfuse. 
> To run it, run "make clean; make" from libfuse/example, and then sudo > ninja from libfuse/build, and then > sudo ~/libfuse/build/example/passthrough_hp ~/liburing ~/mounts/tmp2 > --nopassthrough --foreground > > Then from ~/mounts/tmp2, run "stat -f filename" and that will show a few things: > on the kernel side it'll print a statement like "bpf prog returned: > err=0, out_io->offset=999" which shows that the prog can return back a > "struct fuse_iomap_io" with all the requisite metadata filled out > on the server-side, if you run " sudo cat > /sys/kernel/debug/tracing/trace_pipe", that should print out " > bpf_trace_printk: fuse iomap_begin: inode=ffff8a75cbe63800 pos=111 > len=222 flags=333" which shows the prog can take in whatever > pos/len/flags values you pass it from the fuse kernel Hehhehe. Will give that a shot. --D > > > > > > and I was able to verify that iomap_begin() is able to return back > > > populated outarg fields from the bpf prog. If we were to actually > > > implement it i'm sure it'd be more complicated (eg we'd need to make > > > the fuse_bpf_ops registered per-connection, etc) but on the whole it > > > > What is a fuse_bpf_ops? I'm assuming that's the attachment point for a > > bpf program that the fuse server would compile? In which case, yes, I > > think that ought to be per-connection. > > > > So the bpf program can examine the struct inode, and the pos/len/opflags > > field; and from that information it has to write the appropriate fields > > in &outarg? That's new, I didn't think bpf was allowed to write to > > kernel memory. But it's been a few years since I last touched the bpf > > internals. > > It's been a few years since I looked at bpf as well but yes > fuse_bpf_ops is basically the kernel-side struct_ops interface for > getting fuse to trigger the attached bpf program's callback > implementations. 
When the bpf program is loaded in, its callback > functions get swapped in and fuse_bpf_ops's function pointers now > point to the bpf's callback functions, so when you invoke > fuse_bpf_ops's callbacks, it calls into the bpf prog's callback. > > > > > Some bpf programs might just know how to fill out outarg on their own > > (e.g. famfs memory interleaving) but other bpf programs might perform a > > range query on some imaginary bpf-interval-tree wherein you can do a > > fast lookup based on (inumber, pos, len)? > > > > I guess that's an interesting question -- would each fuse connection > > have one big bpf-interval-tree? Or would you shard things by inode to > > reduce contention? And if you sharded like that, then would you need a > > fuse_bpf_ops per inode? > > Hmm the cache's tree is per-inode as I understand it, so probably each > inode would have its own tree / map? > > > > > (I'm imagining that the fuse_bpf_ops might be where you'd stash the root > > of the bpf data structure, but I know nothing of bpf internals ;)) > > > > Rolling on: how easy is it for a userspace program to compile and upload > > bpf programs into the kernel? I've played around with bcc enough to > > write some fairly stupid latency tracing tools for XFS, but at the end > > of the day it still python scripts feeding a string full of maybe-C into > > whatever the BPF does under the hood. > > I found it pretty easy with the libbpf library which will generate the > skeletons and provide the api helpers to load it in and other stuff > (the libfuse link I pasted above for the userspace side has the code > for compiling it and loading it). > > > > > I /think/ it calls clang on the provided text, links that against the > > current kernel's header files, and pushes the compiled bpf binary into > > the kernel, right? So fuse4fs would have to learn how to do that; and > > now fuse4fs has a runtime dependency on libllvm. > > I think the libbpf library takes care of a lot of that for you. 
I > think fuse4fs would just need to do the same thing as in that libfuse > link above > > > > > And while I'm on the topic of fuse-bpf uapi: It's ok for us to expose > > primitive-typed variables (pos/len/opflags) and existing fuse uapi > > directly to a bpf program, but I don't think we should expose struct > > inode/fuse_inode. Maybe just fuse_inode::nodeid? If we're careful not > > to allow #include'ing structured types in the fuse bpf code, then > > perhaps the bpf programs could be compiled at the same time as the fuse > > server. > > I agree, I think if we do decide to go further with this approach, > we'll need to define exactly what the interfaces should be that would > be safe to expose. > > > > > > seems doable. My worry is that if we land the iomap cache patchset now > > > then we can't remove it in the future without breaking backwards > > > compatibility for being a performance regression (though maybe we can > > > since the fuse-iomap stuff is experimental?), so imo it'd be great if > > > > I don't think it's a huge problem to remove functionality while the > > EXPERIMENTAL warnings are in place. We'd forever lose the command codes > > for FUSE_NOTIFY_IOMAP_UPSERT and FUSE_NOTIFY_IOMAP_INVAL, but we've only > > used 12 out of INT_MAX so that's not likely to be a concern. > > > > > we figured out what direction we want to go before landing the cache > > > stuff. And I think we need to have this conversation too on the main > > > famfs patchset (eg whether it should go through your general iomap > > > plumbing with bpf helpers vs. being a separate implementation) since > > > once that lands, it'd be irrevocable. > > > > I've of two minds on that -- John got here first, so I don't want to > > delay his patchset whilst I slowly work on this thing. 
OTOH from an > > architecture standpoint we probably ought to push for three ways for a > > fuse server to upload mappings: > > I think if John/Miklos wanted to go in this direction, all that would > be needed from your series is the first one or two patches that define > the basic fuse_iomap_io / fuse_iomap_begin / fuse_iomap_end structs > and init config plumbing. > > Thanks, > Joanne > > > > > 1. Upserting mappings with arbitrary offset and size into a cache > > 2. Self contained bpf program that can generate any mapping > > 3. Sprawling bpf program that can read any other artifacts that another > > bpf program might have set up for it > > > > But yeah, let's involve John. > > > > --D > > > > > > > > Thanks, > > > Joanne > > > > > > > > --D > > > > > > > > > Thanks, > > > > > Joanne > > > > > > > > > > > > > > > > > <nod> > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > > > > Thanks, > > > > > > > Joanne > > > > > > > > > > > > > > > > > > > > > > > OTOH it would be enormously hilarious to me if one could load a file > > > > > > > > mapping predictive model into the kernel as a bpf program and use that > > > > > > > > as a first tier before checking the in-memory btree mapping cache from > > > > > > > > patchset 7. Quite a few years ago now there was a FAST paper > > > > > > > > establishing that even a stupid linear regression model could in theory > > > > > > > > beat a disk btree lookup. > > > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > Thanks, > > > > > > > > > Joanne > > > > > > > > > > > > > > > > > > > > > > > > > > > > > If you're going to start using this code, I strongly recommend pulling > > > > > > > > > > from my git trees, which are linked below. > > > > > > > > > > > > > > > > > > > > This has been running on the djcloud for months with no problems. Enjoy! > > > > > > > > > > Comments and questions are, as always, welcome. 
> > > > > > > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > > > > kernel git tree: > > > > > > > > > > https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-iomap-fileio > > > > > > > > > > --- > > > > > > > > > > Commits in this patchset: > > > > > > > > > > * fuse: implement the basic iomap mechanisms > > > > > > > > > > * fuse_trace: implement the basic iomap mechanisms > > > > > > > > > > * fuse: make debugging configurable at runtime > > > > > > > > > > * fuse: adapt FUSE_DEV_IOC_BACKING_{OPEN,CLOSE} to add new iomap devices > > > > > > > > > > * fuse_trace: adapt FUSE_DEV_IOC_BACKING_{OPEN,CLOSE} to add new iomap devices > > > > > > > > > > * fuse: flush events and send FUSE_SYNCFS and FUSE_DESTROY on unmount > > > > > > > > > > * fuse: create a per-inode flag for toggling iomap > > > > > > > > > > * fuse_trace: create a per-inode flag for toggling iomap > > > > > > > > > > * fuse: isolate the other regular file IO paths from iomap > > > > > > > > > > * fuse: implement basic iomap reporting such as FIEMAP and SEEK_{DATA,HOLE} > > > > > > > > > > * fuse_trace: implement basic iomap reporting such as FIEMAP and SEEK_{DATA,HOLE} > > > > > > > > > > * fuse: implement direct IO with iomap > > > > > > > > > > * fuse_trace: implement direct IO with iomap > > > > > > > > > > * fuse: implement buffered IO with iomap > > > > > > > > > > * fuse_trace: implement buffered IO with iomap > > > > > > > > > > * fuse: implement large folios for iomap pagecache files > > > > > > > > > > * fuse: use an unrestricted backing device with iomap pagecache io > > > > > > > > > > * fuse: advertise support for iomap > > > > > > > > > > * fuse: query filesystem geometry when using iomap > > > > > > > > > > * fuse_trace: query filesystem geometry when using iomap > > > > > > > > > > * fuse: implement fadvise for iomap files > > > > > > > > > > * fuse: invalidate ranges of block devices being used for iomap > > > > > > > > > > * fuse_trace: 
invalidate ranges of block devices being used for iomap > > > > > > > > > > * fuse: implement inline data file IO via iomap > > > > > > > > > > * fuse_trace: implement inline data file IO via iomap > > > > > > > > > > * fuse: allow more statx fields > > > > > > > > > > * fuse: support atomic writes with iomap > > > > > > > > > > * fuse_trace: support atomic writes with iomap > > > > > > > > > > * fuse: disable direct reclaim for any fuse server that uses iomap > > > > > > > > > > * fuse: enable swapfile activation on iomap > > > > > > > > > > * fuse: implement freeze and shutdowns for iomap filesystems > > > > > > > > > > --- > > > > > > > > > > fs/fuse/fuse_i.h | 161 +++ > > > > > > > > > > fs/fuse/fuse_trace.h | 939 +++++++++++++++++++ > > > > > > > > > > fs/fuse/iomap_i.h | 52 + > > > > > > > > > > include/uapi/linux/fuse.h | 219 ++++ > > > > > > > > > > fs/fuse/Kconfig | 48 + > > > > > > > > > > fs/fuse/Makefile | 1 > > > > > > > > > > fs/fuse/backing.c | 12 > > > > > > > > > > fs/fuse/dev.c | 30 + > > > > > > > > > > fs/fuse/dir.c | 120 ++ > > > > > > > > > > fs/fuse/file.c | 133 ++- > > > > > > > > > > fs/fuse/file_iomap.c | 2230 +++++++++++++++++++++++++++++++++++++++++++++ > > > > > > > > > > fs/fuse/inode.c | 162 +++ > > > > > > > > > > fs/fuse/iomode.c | 2 > > > > > > > > > > fs/fuse/trace.c | 2 > > > > > > > > > > 14 files changed, 4056 insertions(+), 55 deletions(-) > > > > > > > > > > create mode 100644 fs/fuse/iomap_i.h > > > > > > > > > > create mode 100644 fs/fuse/file_iomap.c > > > > > > > > > > > > > > > > > > > > > > > > > > > > ^ permalink raw reply [flat|nested] 52+ messages in thread
[parent not found: <176169810980.1424854.10557015500766654898.stgit@frogsfrogsfrogs>]
* Re: [PATCH 29/31] fuse: disable direct reclaim for any fuse server that uses iomap [not found] ` <176169810980.1424854.10557015500766654898.stgit@frogsfrogsfrogs> @ 2026-02-05 18:57 ` Chris Mason 2026-02-06 4:25 ` Darrick J. Wong 0 siblings, 1 reply; 52+ messages in thread From: Chris Mason @ 2026-02-05 18:57 UTC (permalink / raw) To: Darrick J. Wong Cc: miklos, joannelkoong, bernd, neal, linux-ext4, linux-fsdevel "Darrick J. Wong" <djwong@kernel.org> wrote: > From: Darrick J. Wong <djwong@kernel.org> > > Any fuse server that uses iomap can create a substantial amount of dirty > pages in the pagecache because we don't write dirty stuff until reclaim > or fsync. Therefore, memory reclaim on any fuse iomap server mustn't > ever recurse back into the same filesystem. We must also never throttle > the fuse server writes to a bdi because that will just slow down > metadata operations. Hi everyone, I'm trying out my AI review prompts on a few more trees, and I ran it on the fuse-iomap-cache branch: https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-iomap-cache This was the latest email lore could find, apologies if I grabbed the wrong one. Hopefully the review below is useful, please let me know if there are places it has gone wildly wrong. I kept the header to make it obvious exactly which commit I'd run through: commit 3bdd586d6fa00e067dbc3676642ac75e54ec3031 Author: Darrick J. Wong <djwong@kernel.org> fuse: disable direct reclaim for any fuse server that uses iomap Any fuse server that uses iomap can create a substantial amount of dirty pages in the pagecache because we don't write dirty stuff until reclaim or fsync. Therefore, memory reclaim on any fuse iomap server must avoid recursing back into the same filesystem and throttling the fuse server.
> diff --git a/fs/fuse/fuse_iomap.c b/fs/fuse/fuse_iomap.c > --- a/fs/fuse/fuse_iomap.c > +++ b/fs/fuse/fuse_iomap.c > @@ -1019,6 +1019,12 @@ static void fuse_iomap_config_reply(struct fuse_mount *fm, > */ > fc->sync_fs = true; > > + /* > + * We could be on the hook for a substantial amount of writeback, so > + * prohibit reclaim from recursing into fuse or the kernel from > + * throttling any bdis that the fuse server might write to. > + */ > + current->flags |= PF_MEMALLOC_NOFS | PF_LOCAL_THROTTLE; Are these process flags being set on the correct process? Looking at the call chain: Sync path: fuse_fill_super() -> fuse_iomap_mount() -> fuse_iomap_config_reply() Async path: fuse_request_end() -> args->end() -> fuse_iomap_config_reply() In the sync path, current points to the mount(2) process which exits after mounting completes. In the async path, current points to whichever process writes the reply to /dev/fuse (the fuse daemon thread) or a kernel worker thread for virtio-fs. The commit message states the goal is to prohibit reclaim from recursing into the fuse server, but the fuse server is a separate userspace process. Setting flags on the mount process (which exits) or on a random fuse daemon thread (only once during init) doesn't protect the fuse server threads that actually perform I/O operations. Should the flags be set on the correct process context? Or if they need to be set here, should they follow the pattern used in fs/nfsd/vfs.c and fs/nfs/localio.c which save and restore the flags? Looking at fs/nfsd/vfs.c:1438-1509: unsigned long pflags = current->flags; current->flags |= PF_LOCAL_THROTTLE; ... current_restore_flags(pflags, PF_LOCAL_THROTTLE); and fs/nfs/localio.c:824-828: unsigned long old_flags = current->flags; current->flags |= PF_LOCAL_THROTTLE; ... current_restore_flags(old_flags, PF_LOCAL_THROTTLE); Without saving and restoring, the flags permanently alter the process state.
For the mount process this is wasted when it exits, and for fuse daemon threads this causes permanent behavior changes. ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCH 29/31] fuse: disable direct reclaim for any fuse server that uses iomap 2026-02-05 18:57 ` [PATCH 29/31] fuse: disable direct reclaim for any fuse server that uses iomap Chris Mason @ 2026-02-06 4:25 ` Darrick J. Wong 0 siblings, 0 replies; 52+ messages in thread From: Darrick J. Wong @ 2026-02-06 4:25 UTC (permalink / raw) To: Chris Mason; +Cc: miklos, joannelkoong, bernd, neal, linux-ext4, linux-fsdevel On Thu, Feb 05, 2026 at 10:57:15AM -0800, Chris Mason wrote: > "Darrick J. Wong" <djwong@kernel.org> wrote: > > From: Darrick J. Wong <djwong@kernel.org> > > > > Any fuse server that uses iomap can create a substantial amount of dirty > > pages in the pagecache because we don't write dirty stuff until reclaim > > or fsync. Therefore, memory reclaim on any fuse iomap server musn't > > ever recurse back into the same filesystem. We must also never throttle > > the fuse server writes to a bdi because that will just slow down > > metadata operations. > > Hi everyone, > > I'm trying out my AI review prompts on a few more trees, and I ran it > on the fuse-iomap-cache branch: > > https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-iomap-cache > > This was the latest email lore could find, apologies if I grabbed the wrong one. > > Hopefully the review below is useful, please let me know if there are places it > has gone wildly wrong. I kept the header to make it obvious exactly which > commit I'd run through: > > commit 3bdd586d6fa00e067dbc3676642ac75e54ec3031 > Author: Darrick J. Wong <djwong@kernel.org> > > fuse: disable direct reclaim for any fuse server that uses iomap > > Any fuse server that uses iomap can create a substantial amount of dirty > pages in the pagecache because we don't write dirty stuff until reclaim > or fsync. Therefore, memory reclaim on any fuse iomap server must avoid > recursing back into the same filesystem and throttling the fuse server. 
> > > diff --git a/fs/fuse/fuse_iomap.c b/fs/fuse/fuse_iomap.c > > --- a/fs/fuse/fuse_iomap.c > > +++ b/fs/fuse/fuse_iomap.c > > @@ -1019,6 +1019,12 @@ static void fuse_iomap_config_reply(struct fuse_mount *fm, > > */ > > fc->sync_fs = true; > > > > + /* > > + * We could be on the hook for a substantial amount of writeback, so > > + * prohibit reclaim from recursing into fuse or the kernel from > > + * throttling any bdis that the fuse server might write to. > > + */ > > + current->flags |= PF_MEMALLOC_NOFS | PF_LOCAL_THROTTLE; > > Are these process flags being set on the correct process? Looking at the > call chain: > > Sync path: > fuse_fill_super() -> fuse_iomap_mount() -> fuse_iomap_config_reply() > > Async path: > fuse_request_end() -> args->end() -> fuse_iomap_config_reply() > > In the sync path, current points to the mount(2) process which exits > after mounting completes. In the async path, current points to whichever > process writes the reply to /dev/fuse (the fuse daemon thread) or a > kernel worker thread for virtio-fs. > > The commit message states the goal is to prohibit reclaim from recursing > into the fuse server, but the fuse server is a separate userspace process. > Setting flags on the mount process (which exits) or on a random fuse > daemon thread (only once during init) doesn't protect the fuse server > threads that actually perform I/O operations. > > Should the flags be set on the correct process context? Or if they need > to be set here, should they follow the pattern used in fs/nfsd/vfs.c and > fs/nfs/localio.c which save and restore the flags? > > Looking at fs/nfsd/vfs.c:1438-1509: > > unsigned long old_flags = current->flags; > current->flags |= PF_LOCAL_THROTTLE; > ... > current_restore_flags(pflags, PF_LOCAL_THROTTLE); > > and fs/nfs/localio.c:824-828: > > unsigned long old_flags = current->flags; > current->flags |= PF_LOCAL_THROTTLE; > ... 
> current_restore_flags(old_flags, PF_LOCAL_THROTTLE); > > Without saving and restoring, the flags permanently alter the process > state. For the mount process this is wasted when it exits, and for fuse > daemon threads this causes permanent behavior changes. Hrmm. Normally the fuse server itself calls mount(2) via libfuse to set up the mount point, so this does (AFAICT) set PF_ flags on the fuse server itself. However, any fuse setup that doesn't handle things this way would indeed set the PF_ flags on the wrong process. There are (a) other fuse libraries out there, and (b) the fuse service architecture mentioned downthread has a mount helper that starts up a fuse server on the other end of a unix socket, passes it resources including /dev/fuse to start up, and then calls mount(2) itself instead of the fuse server doing that. That part's broken, and I'll have to think about how to solve that. Or maybe someone else will tell me this is all undesirable and I'll just drop this patch. :) --D ^ permalink raw reply [flat|nested] 52+ messages in thread
[parent not found: <176169810874.1424854.5037707950055785011.stgit@frogsfrogsfrogs>]
* Re: [PATCH 24/31] fuse: implement inline data file IO via iomap [not found] ` <176169810874.1424854.5037707950055785011.stgit@frogsfrogsfrogs> @ 2026-02-05 19:01 ` Chris Mason 2026-02-06 2:27 ` Darrick J. Wong 0 siblings, 1 reply; 52+ messages in thread From: Chris Mason @ 2026-02-05 19:01 UTC (permalink / raw) To: Darrick J. Wong Cc: miklos, joannelkoong, bernd, neal, linux-ext4, linux-fsdevel "Darrick J. Wong" <djwong@kernel.org> wrote: > From: Darrick J. Wong <djwong@kernel.org> > > Implement inline data file IO by issuing FUSE_READ/FUSE_WRITE commands > in response to an inline data mapping. Hi everyone, I'm trying out my AI review prompts on a few more trees, and I ran it on the fuse-iomap-cache branch: https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-iomap-cache This was the latest email lore could find, apologies if I grabbed the wrong one. Hopefully the review below is useful, please let me know if there are places it has gone wildly wrong. I kept the header to make it obvious exactly which commit I'd run through: commit 1263eae0852acb1359182377ad94d570701e2274 Author: Darrick J. Wong <djwong@kernel.org> fuse: implement inline data file IO via iomap Implement inline data file IO by issuing FUSE_READ/FUSE_WRITE commands in response to an inline data mapping. > diff --git a/fs/fuse/fuse_iomap.c b/fs/fuse/fuse_iomap.c > --- a/fs/fuse/fuse_iomap.c > +++ b/fs/fuse/fuse_iomap.c > @@ -398,6 +398,150 @@ fuse_iomap_find_dev(struct fuse_conn *fc, const struct fuse_iomap_io *map) > return ret; > } > > +/* > + * Use the FUSE_READ command to read inline file data from the fuse server. > + * Note that there's no file handle attached, so the fuse server must be able > + * to reconnect to the inode via the nodeid. 
> + */ > +static int fuse_iomap_inline_read(struct inode *inode, loff_t pos, > + loff_t count, struct iomap *iomap) > +{ > + struct fuse_read_in in = { > + .offset = pos, > + .size = count, > + }; > + struct fuse_inode *fi = get_fuse_inode(inode); > + struct fuse_mount *fm = get_fuse_mount(inode); > + FUSE_ARGS(args); > + ssize_t ret; > + > + if (BAD_DATA(!iomap_inline_data_valid(iomap))) > + return -EFSCORRUPTED; Does this code leak memory when the BAD_DATA check fails? Looking at the call chain: fuse_iomap_set_inline()->fuse_iomap_inline_alloc() allocates iomap->inline_data via kvzalloc. If the iomap_inline_data_valid check fails (i.e., iomap->length > PAGE_SIZE - offset_in_page(iomap->inline_data)), the function returns -EFSCORRUPTED without freeing iomap->inline_data. Other error paths in this function (around the fuse_simple_request call) properly call fuse_iomap_inline_free(iomap) before returning errors. The caller fuse_iomap_set_inline doesn't clean up iomap->inline_data on error in the read path (line 538: 'if (err) return err;'), so the allocation is leaked when a FUSE server returns an IOMAP_INLINE mapping with length > PAGE_SIZE. > + > + args.opcode = FUSE_READ; > + args.nodeid = fi->nodeid; > + args.in_numargs = 1; > + args.in_args[0].size = sizeof(in); > + args.in_args[0].value = &in; > + args.out_argvar = true; > + args.out_numargs = 1; > + args.out_args[0].size = count; > + args.out_args[0].value = iomap_inline_data(iomap, pos); > + > + ret = fuse_simple_request(fm, &args); > + if (ret < 0) { > + fuse_iomap_inline_free(iomap); > + return ret; > + } > + /* no readahead means something bad happened */ > + if (ret == 0) { > + fuse_iomap_inline_free(iomap); > + return -EIO; > + } > + > + return 0; > +} [ ... 
] > @@ -511,8 +663,28 @@ static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t count, > { > struct fuse_inode *fi = get_fuse_inode(inode); > struct fuse_mount *fm = get_fuse_mount(inode); > + struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); > + struct iomap *srcmap = &iter->srcmap; Can this container_of usage corrupt memory? Looking at fuse_iomap_writeback_range, it calls fuse_iomap_end with &write_iomap, which is a stack-local struct iomap variable (not embedded in an iomap_iter): fuse_iomap_writeback_range() struct iomap write_iomap = { }; ... fuse_iomap_end(inode, offset, len, 0, FUSE_IOMAP_OP_WRITEBACK, &write_iomap); The container_of macro computes an invalid pointer by subtracting the offset of the iomap member from the address, resulting in iter pointing to garbage memory on the stack. Subsequently, accessing iter->srcmap reads from invalid memory, potentially causing undefined behavior or crashes. The iomap core calls fuse_iomap_end via fuse_iomap_ops where iomap IS properly embedded in iomap_iter, but the direct call from fuse_iomap_writeback_range violates this assumption. ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCH 24/31] fuse: implement inline data file IO via iomap 2026-02-05 19:01 ` [PATCH 24/31] fuse: implement inline data file IO via iomap Chris Mason @ 2026-02-06 2:27 ` Darrick J. Wong 0 siblings, 0 replies; 52+ messages in thread From: Darrick J. Wong @ 2026-02-06 2:27 UTC (permalink / raw) To: Chris Mason; +Cc: miklos, joannelkoong, bernd, neal, linux-ext4, linux-fsdevel On Thu, Feb 05, 2026 at 11:01:07AM -0800, Chris Mason wrote: > "Darrick J. Wong" <djwong@kernel.org> wrote: > > From: Darrick J. Wong <djwong@kernel.org> > > > > Implement inline data file IO by issuing FUSE_READ/FUSE_WRITE commands > > in response to an inline data mapping. > > Hi everyone, > > I'm trying out my AI review prompts on a few more trees, and I ran it > on the fuse-iomap-cache branch: > > https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-iomap-cache > > This was the latest email lore could find, apologies if I grabbed the wrong one. > > Hopefully the review below is useful, please let me know if there are places it > has gone wildly wrong. I kept the header to make it obvious exactly which > commit I'd run through: > > commit 1263eae0852acb1359182377ad94d570701e2274 > Author: Darrick J. Wong <djwong@kernel.org> > > fuse: implement inline data file IO via iomap > > Implement inline data file IO by issuing FUSE_READ/FUSE_WRITE commands > in response to an inline data mapping. > > > diff --git a/fs/fuse/fuse_iomap.c b/fs/fuse/fuse_iomap.c > > --- a/fs/fuse/fuse_iomap.c > > +++ b/fs/fuse/fuse_iomap.c > > @@ -398,6 +398,150 @@ fuse_iomap_find_dev(struct fuse_conn *fc, const struct fuse_iomap_io *map) > > return ret; > > } > > > > +/* > > + * Use the FUSE_READ command to read inline file data from the fuse server. > > + * Note that there's no file handle attached, so the fuse server must be able > > + * to reconnect to the inode via the nodeid. 
> > + */ > > +static int fuse_iomap_inline_read(struct inode *inode, loff_t pos, > > + loff_t count, struct iomap *iomap) > > +{ > > + struct fuse_read_in in = { > > + .offset = pos, > > + .size = count, > > + }; > > + struct fuse_inode *fi = get_fuse_inode(inode); > > + struct fuse_mount *fm = get_fuse_mount(inode); > > + FUSE_ARGS(args); > > + ssize_t ret; > > + > > + if (BAD_DATA(!iomap_inline_data_valid(iomap))) > > + return -EFSCORRUPTED; > > Does this code leak memory when the BAD_DATA check fails? Looking at > the call chain: > > fuse_iomap_set_inline()->fuse_iomap_inline_alloc() allocates > iomap->inline_data via kvzalloc. If the iomap_inline_data_valid check > fails (i.e., iomap->length > PAGE_SIZE - offset_in_page(iomap->inline_data)), > the function returns -EFSCORRUPTED without freeing iomap->inline_data. > > Other error paths in this function (around the fuse_simple_request call) > properly call fuse_iomap_inline_free(iomap) before returning errors. > > The caller fuse_iomap_set_inline doesn't clean up iomap->inline_data on > error in the read path (line 538: 'if (err) return err;'), so the > allocation is leaked when a FUSE server returns an IOMAP_INLINE mapping > with length > PAGE_SIZE. Yep, it does leak the ioend and the bio. I'll amend this to call fuse_iomap_inline_free on its way out. > > + > > + args.opcode = FUSE_READ; > > + args.nodeid = fi->nodeid; > > + args.in_numargs = 1; > > + args.in_args[0].size = sizeof(in); > > + args.in_args[0].value = &in; > > + args.out_argvar = true; > > + args.out_numargs = 1; > > + args.out_args[0].size = count; > > + args.out_args[0].value = iomap_inline_data(iomap, pos); > > + > > + ret = fuse_simple_request(fm, &args); > > + if (ret < 0) { > > + fuse_iomap_inline_free(iomap); > > + return ret; > > + } > > + /* no readahead means something bad happened */ > > + if (ret == 0) { > > + fuse_iomap_inline_free(iomap); > > + return -EIO; > > + } > > + > > + return 0; > > +} > > [ ... 
] > > > @@ -511,8 +663,28 @@ static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t count, > > { > > struct fuse_inode *fi = get_fuse_inode(inode); > > struct fuse_mount *fm = get_fuse_mount(inode); > > + struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); > > + struct iomap *srcmap = &iter->srcmap; > > Can this container_of usage corrupt memory? Looking at > fuse_iomap_writeback_range, it calls fuse_iomap_end with &write_iomap, > which is a stack-local struct iomap variable (not embedded in an > iomap_iter): > > fuse_iomap_writeback_range() > struct iomap write_iomap = { }; > ... > fuse_iomap_end(inode, offset, len, 0, > FUSE_IOMAP_OP_WRITEBACK, &write_iomap); > > The container_of macro computes an invalid pointer by subtracting the > offset of the iomap member from the address, resulting in iter pointing > to garbage memory on the stack. Subsequently, accessing iter->srcmap > reads from invalid memory, potentially causing undefined behavior or > crashes. > > The iomap core calls fuse_iomap_end via fuse_iomap_ops where iomap IS > properly embedded in iomap_iter, but the direct call from > fuse_iomap_writeback_range violates this assumption. Oops, that's a severe bug. fuse_iomap_writeback_range should indeed define a whole iomap_iter instead of just the iomap. I'll fix that, thanks for pointing out these bugs. --D ^ permalink raw reply [flat|nested] 52+ messages in thread
[parent not found: <176169810765.1424854.10969346031644824992.stgit@frogsfrogsfrogs>]
* Re: [PATCH 19/31] fuse: query filesystem geometry when using iomap [not found] ` <176169810765.1424854.10969346031644824992.stgit@frogsfrogsfrogs> @ 2026-02-05 19:07 ` Chris Mason 2026-02-06 2:17 ` Darrick J. Wong 0 siblings, 1 reply; 52+ messages in thread From: Chris Mason @ 2026-02-05 19:07 UTC (permalink / raw) To: Darrick J. Wong Cc: miklos, joannelkoong, bernd, neal, linux-ext4, linux-fsdevel "Darrick J. Wong" <djwong@kernel.org> wrote: > From: Darrick J. Wong <djwong@kernel.org> > > Add a new upcall to the fuse server so that the kernel can request > filesystem geometry bits when iomap mode is in use. > Hi everyone, I'm trying out my AI review prompts on a few more trees, and I ran it on the fuse-iomap-cache branch: https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-iomap-cache This was the latest email lore could find, apologies if I grabbed the wrong one. Hopefully the review below is useful, please let me know if there are places it has gone wildly wrong. I kept the header to make it obvious exactly which commit I'd run through: commit d529d13100b2e00c33a507c371b6c1b19f2206ae Author: Darrick J. Wong <djwong@kernel.org> fuse_trace: query filesystem geometry when using iomap This commit adds tracepoints for the FUSE_IOMAP_CONFIG operation, enabling tracing of filesystem geometry queries when using iomap with FUSE. The tracepoint captures configuration flags, block size, link limits, time granularity and ranges, maximum file size, and UUID length. > diff --git a/fs/fuse/fuse_iomap.c b/fs/fuse/fuse_iomap.c > --- a/fs/fuse/fuse_iomap.c > +++ b/fs/fuse/fuse_iomap.c > @@ -743,6 +743,8 @@ static int fuse_iomap_process_config(struct fuse_mount *fm, int error, > return error; > } > > + trace_fuse_iomap_config(fm, outarg); > + > if (outarg->flags & ~FUSE_IOMAP_CONFIG_ALL) > return -EINVAL; > [ ... ] > diff --git a/fs/fuse/fuse_trace.h b/fs/fuse/fuse_trace.h > --- a/fs/fuse/fuse_trace.h > +++ b/fs/fuse/fuse_trace.h [ ... 
] > @@ -1005,6 +1014,45 @@ > __entry->mode, > __entry->newsize) > ); > + > +TRACE_EVENT(fuse_iomap_config, > + TP_PROTO(const struct fuse_mount *fm, > + const struct fuse_iomap_config_out *outarg), > + TP_ARGS(fm, outarg), > + > + TP_STRUCT__entry( > + __field(dev_t, connection) > + > + __field(uint32_t, flags) > + __field(uint32_t, blocksize) > + __field(uint32_t, max_links) > + __field(uint32_t, time_gran) > + > + __field(int64_t, time_min) > + __field(int64_t, time_max) > + __field(int64_t, maxbytes) > + __field(uint8_t, uuid_len) > + ), > + > + TP_fast_assign( > + __entry->connection = fm->fc->dev; > + __entry->flags = outarg->flags; ^^^^ Does this truncate the upper 32 bits of the flags field? The flags field in struct fuse_iomap_config_out (include/uapi/linux/fuse.h:1471) is defined as uint64_t: struct fuse_iomap_config_out { uint64_t flags; /* FUSE_IOMAP_CONFIG_* */ ... } But the trace event declares it as uint32_t in TP_STRUCT__entry. While current flag definitions (FUSE_IOMAP_CONFIG_SID through FUSE_IOMAP_CONFIG_MAXBYTES) only use bits 0-5, using uint32_t will lose the upper 32 bits if future flags use them. Should the trace event field be declared as uint64_t to match the structure definition? ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCH 19/31] fuse: query filesystem geometry when using iomap 2026-02-05 19:07 ` [PATCH 19/31] fuse: query filesystem geometry when using iomap Chris Mason @ 2026-02-06 2:17 ` Darrick J. Wong 0 siblings, 0 replies; 52+ messages in thread From: Darrick J. Wong @ 2026-02-06 2:17 UTC (permalink / raw) To: Chris Mason; +Cc: miklos, joannelkoong, bernd, neal, linux-ext4, linux-fsdevel On Thu, Feb 05, 2026 at 11:07:55AM -0800, Chris Mason wrote: > "Darrick J. Wong" <djwong@kernel.org> wrote: > > From: Darrick J. Wong <djwong@kernel.org> > > > > Add a new upcall to the fuse server so that the kernel can request > > filesystem geometry bits when iomap mode is in use. > > > > Hi everyone, > > I'm trying out my AI review prompts on a few more trees, and I ran it > on the fuse-iomap-cache branch: > > https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-iomap-cache > > This was the latest email lore could find, apologies if I grabbed the wrong one. > > Hopefully the review below is useful, please let me know if there are places it > has gone wildly wrong. I kept the header to make it obvious exactly which > commit I'd run through: > > commit d529d13100b2e00c33a507c371b6c1b19f2206ae > Author: Darrick J. Wong <djwong@kernel.org> > > fuse_trace: query filesystem geometry when using iomap > > This commit adds tracepoints for the FUSE_IOMAP_CONFIG operation, > enabling tracing of filesystem geometry queries when using iomap with > FUSE. The tracepoint captures configuration flags, block size, link > limits, time granularity and ranges, maximum file size, and UUID length. > > > diff --git a/fs/fuse/fuse_iomap.c b/fs/fuse/fuse_iomap.c > > --- a/fs/fuse/fuse_iomap.c > > +++ b/fs/fuse/fuse_iomap.c > > @@ -743,6 +743,8 @@ static int fuse_iomap_process_config(struct fuse_mount *fm, int error, > > return error; > > } > > > > + trace_fuse_iomap_config(fm, outarg); > > + > > if (outarg->flags & ~FUSE_IOMAP_CONFIG_ALL) > > return -EINVAL; > > > > [ ... 
] > > > diff --git a/fs/fuse/fuse_trace.h b/fs/fuse/fuse_trace.h > > --- a/fs/fuse/fuse_trace.h > > +++ b/fs/fuse/fuse_trace.h > > [ ... ] > > > @@ -1005,6 +1014,45 @@ > > __entry->mode, > > __entry->newsize) > > ); > > + > > +TRACE_EVENT(fuse_iomap_config, > > + TP_PROTO(const struct fuse_mount *fm, > > + const struct fuse_iomap_config_out *outarg), > > + TP_ARGS(fm, outarg), > > + > > + TP_STRUCT__entry( > > + __field(dev_t, connection) > > + > > + __field(uint32_t, flags) > > + __field(uint32_t, blocksize) > > + __field(uint32_t, max_links) > > + __field(uint32_t, time_gran) > > + > > + __field(int64_t, time_min) > > + __field(int64_t, time_max) > > + __field(int64_t, maxbytes) > > + __field(uint8_t, uuid_len) > > + ), > > + > > + TP_fast_assign( > > + __entry->connection = fm->fc->dev; > > + __entry->flags = outarg->flags; > ^^^^ > > Does this truncate the upper 32 bits of the flags field? The flags field > in struct fuse_iomap_config_out (include/uapi/linux/fuse.h:1471) is > defined as uint64_t: > > struct fuse_iomap_config_out { > uint64_t flags; /* FUSE_IOMAP_CONFIG_* */ > ... > } > > But the trace event declares it as uint32_t in TP_STRUCT__entry. While > current flag definitions (FUSE_IOMAP_CONFIG_SID through > FUSE_IOMAP_CONFIG_MAXBYTES) only use bits 0-5, using uint32_t will lose > the upper 32 bits if future flags use them. > > Should the trace event field be declared as uint64_t to match the > structure definition? Yes. Fixed, thanks! --D ^ permalink raw reply [flat|nested] 52+ messages in thread
[parent not found: <176169810656.1424854.15239592653019383193.stgit@frogsfrogsfrogs>]
* Re: [PATCH 14/31] fuse: implement buffered IO with iomap [not found] ` <176169810656.1424854.15239592653019383193.stgit@frogsfrogsfrogs> @ 2026-02-05 19:12 ` Chris Mason 2026-02-06 2:14 ` Darrick J. Wong 0 siblings, 1 reply; 52+ messages in thread From: Chris Mason @ 2026-02-05 19:12 UTC (permalink / raw) To: Darrick J. Wong Cc: miklos, joannelkoong, bernd, neal, linux-ext4, linux-fsdevel "Darrick J. Wong" <djwong@kernel.org> wrote: > From: Darrick J. Wong <djwong@kernel.org> > > Implement pagecache IO with iomap, complete with hooks into truncate and > fallocate so that the fuse server needn't implement disk block zeroing > of post-EOF and unaligned punch/zero regions. > Hi everyone, I'm trying out my AI review prompts on a few more trees, and I ran it on the fuse-iomap-cache branch: https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-iomap-cache This was the latest email lore could find, apologies if I grabbed the wrong one. Hopefully the review below is useful, please let me know if there are places it has gone wildly wrong. I kept the header to make it obvious exactly which commit I'd run through: commit 05d561a4dd38273cb93745e8ddf913f490f81ad5 Author: Darrick J. Wong <djwong@kernel.org> fuse: implement buffered IO with iomap Implement pagecache IO with iomap, complete with hooks into truncate and fallocate so that the fuse server needn't implement disk block zeroing of post-EOF and unaligned punch/zero regions. > diff --git a/fs/fuse/fuse_iomap.c b/fs/fuse/fuse_iomap.c [ ... 
] > @@ -1056,6 +1144,365 @@ void fuse_iomap_open_truncate(struct inode *inode) > fi->i_disk_size = 0; > } > > +struct fuse_writepage_ctx { > + struct iomap_writepage_ctx ctx; > +}; > + > +static void fuse_iomap_end_ioend(struct iomap_ioend *ioend) > +{ > + struct inode *inode = ioend->io_inode; > + unsigned int ioendflags = FUSE_IOMAP_IOEND_WRITEBACK; > + unsigned int nofs_flag; > + int error = blk_status_to_errno(ioend->io_bio.bi_status); > + > + ASSERT(fuse_inode_has_iomap(inode)); > + > + if (fuse_is_bad(inode)) > + return; > ^^^^ Does this early return leak the ioend? When fuse_is_bad(inode) returns true, this function returns without calling iomap_finish_ioends(). Looking at the iomap code, iomap_finish_ioends() is responsible for: 1. Calling bio_put() to free the bio structure embedded in the ioend 2. Calling iomap_finish_folio_write() to end writeback on the folios Without this cleanup, we get two problems: - Memory leak: the bio structure is never freed - Folios stuck in writeback state forever, which can cause system hangs as pages cannot be reclaimed or written again The cleanup at line 1176 appears mandatory regardless of inode state. [ ... ] ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCH 14/31] fuse: implement buffered IO with iomap 2026-02-05 19:12 ` [PATCH 14/31] fuse: implement buffered IO with iomap Chris Mason @ 2026-02-06 2:14 ` Darrick J. Wong 0 siblings, 0 replies; 52+ messages in thread From: Darrick J. Wong @ 2026-02-06 2:14 UTC (permalink / raw) To: Chris Mason; +Cc: miklos, joannelkoong, bernd, neal, linux-ext4, linux-fsdevel On Thu, Feb 05, 2026 at 11:12:09AM -0800, Chris Mason wrote: > "Darrick J. Wong" <djwong@kernel.org> wrote: > > From: Darrick J. Wong <djwong@kernel.org> > > > > Implement pagecache IO with iomap, complete with hooks into truncate and > > fallocate so that the fuse server needn't implement disk block zeroing > > of post-EOF and unaligned punch/zero regions. > > > > Hi everyone, > > I'm trying out my AI review prompts on a few more trees, and I ran it > on the fuse-iomap-cache branch: > > https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-iomap-cache > > This was the latest email lore could find, apologies if I grabbed the wrong one. > > Hopefully the review below is useful, please let me know if there are places it > has gone wildly wrong. I kept the header to make it obvious exactly which > commit I'd run through: > > commit 05d561a4dd38273cb93745e8ddf913f490f81ad5 > Author: Darrick J. Wong <djwong@kernel.org> > > fuse: implement buffered IO with iomap > > Implement pagecache IO with iomap, complete with hooks into truncate and > fallocate so that the fuse server needn't implement disk block zeroing > of post-EOF and unaligned punch/zero regions. > > > diff --git a/fs/fuse/fuse_iomap.c b/fs/fuse/fuse_iomap.c > > [ ... 
] > > > @@ -1056,6 +1144,365 @@ void fuse_iomap_open_truncate(struct inode *inode) > > fi->i_disk_size = 0; > > } > > > > +struct fuse_writepage_ctx { > > + struct iomap_writepage_ctx ctx; > > +}; > > + > > +static void fuse_iomap_end_ioend(struct iomap_ioend *ioend) > > +{ > > + struct inode *inode = ioend->io_inode; > > + unsigned int ioendflags = FUSE_IOMAP_IOEND_WRITEBACK; > > + unsigned int nofs_flag; > > + int error = blk_status_to_errno(ioend->io_bio.bi_status); > > + > > + ASSERT(fuse_inode_has_iomap(inode)); > > + > > + if (fuse_is_bad(inode)) > > + return; > > ^^^^ > > Does this early return leak the ioend? When fuse_is_bad(inode) returns > true, this function returns without calling iomap_finish_ioends(). > > Looking at the iomap code, iomap_finish_ioends() is responsible for: > > 1. Calling bio_put() to free the bio structure embedded in the ioend > 2. Calling iomap_finish_folio_write() to end writeback on the folios > > Without this cleanup, we get two problems: > > - Memory leak: the bio structure is never freed > - Folios stuck in writeback state forever, which can cause system > hangs as pages cannot be reclaimed or written again > > The cleanup at line 1176 appears mandatory regardless of inode state. Correct, will fix. --D > > [ ... ] > > ^ permalink raw reply [flat|nested] 52+ messages in thread
[parent not found: <176169810634.1424854.13084435884326863405.stgit@frogsfrogsfrogs>]
* Re: [PATCH 13/31] fuse_trace: implement direct IO with iomap [not found] ` <176169810634.1424854.13084435884326863405.stgit@frogsfrogsfrogs> @ 2026-02-05 19:16 ` Chris Mason 2026-02-06 2:12 ` Darrick J. Wong 0 siblings, 1 reply; 52+ messages in thread From: Chris Mason @ 2026-02-05 19:16 UTC (permalink / raw) To: Darrick J. Wong Cc: miklos, joannelkoong, bernd, neal, linux-ext4, linux-fsdevel "Darrick J. Wong" <djwong@kernel.org> wrote: > From: Darrick J. Wong <djwong@kernel.org> > > Add tracepoints for the previous patch. > commit 3fad15ebe8d51187a2b6820ed6db4774a8b065b0 Author: Darrick J. Wong <djwong@kernel.org> fuse_trace: implement direct IO with iomap Add tracepoints for the previous patch. > diff --git a/fs/fuse/fuse_trace.h b/fs/fuse/fuse_trace.h > index 730ab8bce4445..efba6ecbbf670 100644 > --- a/fs/fuse/fuse_trace.h > +++ b/fs/fuse/fuse_trace.h [ ... ] > @@ -484,6 +499,75 @@ TRACE_EVENT(fuse_iomap_end_error, > +TRACE_EVENT(fuse_iomap_ioend, > + TP_PROTO(const struct inode *inode, > + const struct fuse_iomap_ioend_in *inarg), > + > + TP_ARGS(inode, inarg), > + > + TP_STRUCT__entry( > + FUSE_IO_RANGE_FIELDS() > + __field(unsigned, ioendflags) > + __field(int, error) > + __field(uint32_t, dev) > + __field(uint64_t, new_addr) > + ), > + > + TP_fast_assign( > + FUSE_INODE_ASSIGN(inode, fi, fm); > + __entry->offset = inarg->pos; > + __entry->length = inarg->written; > + __entry->ioendflags = inarg->flags; > + __entry->error = inarg->error; > + __entry->dev = inarg->dev; > + __entry->new_addr = inarg->new_addr; > + ), > + > + TP_printk(FUSE_IO_RANGE_FMT() " ioendflags (%s) error %d dev %u new_addr 0x%llx", > + FUSE_IO_RANGE_PRINTK_ARGS(), > + __print_flags(__entry->ioendflags, "|", FUSE_IOMAP_IOEND_STRINGS), > + __entry->dev, > + __entry->error, > + __entry->new_addr) ^^^^ Are the error and dev fields swapped in the argument list? The format string expects 'error %d dev %u' but the arguments are provided as __entry->dev, __entry->error. 
This would cause the trace output to display the dev value as error and the error value as dev. > +); > + > +TRACE_EVENT(fuse_iomap_ioend_error, > + TP_PROTO(const struct inode *inode, > + const struct fuse_iomap_ioend_in *inarg, > + const struct fuse_iomap_ioend_out *outarg, > + int error), > + > + TP_ARGS(inode, inarg, outarg, error), > + > + TP_STRUCT__entry( > + FUSE_IO_RANGE_FIELDS() > + __field(unsigned, ioendflags) > + __field(int, error) > + __field(uint32_t, dev) > + __field(uint64_t, new_addr) > + __field(uint64_t, new_size) > + ), > + > + TP_fast_assign( > + FUSE_INODE_ASSIGN(inode, fi, fm); > + __entry->offset = inarg->pos; > + __entry->length = inarg->written; > + __entry->ioendflags = inarg->flags; > + __entry->error = error; > + __entry->dev = inarg->dev; > + __entry->new_addr = inarg->new_addr; > + __entry->new_size = outarg->newsize; > + ), > + > + TP_printk(FUSE_IO_RANGE_FMT() " ioendflags (%s) error %d dev %u new_addr 0x%llx new_size 0x%llx", > + FUSE_IO_RANGE_PRINTK_ARGS(), > + __print_flags(__entry->ioendflags, "|", FUSE_IOMAP_IOEND_STRINGS), > + __entry->dev, > + __entry->error, > + __entry->new_addr, > + __entry->new_size) ^^^^ Same issue here. The format string expects 'error %d dev %u' but the arguments are provided as __entry->dev, __entry->error, swapping the values in the trace output. > +); > + [ ... ] ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCH 13/31] fuse_trace: implement direct IO with iomap 2026-02-05 19:16 ` [PATCH 13/31] fuse_trace: implement direct " Chris Mason @ 2026-02-06 2:12 ` Darrick J. Wong 0 siblings, 0 replies; 52+ messages in thread From: Darrick J. Wong @ 2026-02-06 2:12 UTC (permalink / raw) To: Chris Mason; +Cc: miklos, joannelkoong, bernd, neal, linux-ext4, linux-fsdevel On Thu, Feb 05, 2026 at 11:16:54AM -0800, Chris Mason wrote: > "Darrick J. Wong" <djwong@kernel.org> wrote: > > From: Darrick J. Wong <djwong@kernel.org> > > > > Add tracepoints for the previous patch. > > > > commit 3fad15ebe8d51187a2b6820ed6db4774a8b065b0 > Author: Darrick J. Wong <djwong@kernel.org> > > fuse_trace: implement direct IO with iomap > > Add tracepoints for the previous patch. > > > diff --git a/fs/fuse/fuse_trace.h b/fs/fuse/fuse_trace.h > > index 730ab8bce4445..efba6ecbbf670 100644 > > --- a/fs/fuse/fuse_trace.h > > +++ b/fs/fuse/fuse_trace.h > > [ ... ] > > > @@ -484,6 +499,75 @@ TRACE_EVENT(fuse_iomap_end_error, > > +TRACE_EVENT(fuse_iomap_ioend, > > + TP_PROTO(const struct inode *inode, > > + const struct fuse_iomap_ioend_in *inarg), > > + > > + TP_ARGS(inode, inarg), > > + > > + TP_STRUCT__entry( > > + FUSE_IO_RANGE_FIELDS() > > + __field(unsigned, ioendflags) > > + __field(int, error) > > + __field(uint32_t, dev) > > + __field(uint64_t, new_addr) > > + ), > > + > > + TP_fast_assign( > > + FUSE_INODE_ASSIGN(inode, fi, fm); > > + __entry->offset = inarg->pos; > > + __entry->length = inarg->written; > > + __entry->ioendflags = inarg->flags; > > + __entry->error = inarg->error; > > + __entry->dev = inarg->dev; > > + __entry->new_addr = inarg->new_addr; > > + ), > > + > > + TP_printk(FUSE_IO_RANGE_FMT() " ioendflags (%s) error %d dev %u new_addr 0x%llx", > > + FUSE_IO_RANGE_PRINTK_ARGS(), > > + __print_flags(__entry->ioendflags, "|", FUSE_IOMAP_IOEND_STRINGS), > > + __entry->dev, > > + __entry->error, > > + __entry->new_addr) > ^^^^ > > Are the error and dev fields swapped in the argument 
list? > > The format string expects 'error %d dev %u' but the arguments are > provided as __entry->dev, __entry->error. This would cause the trace > output to display the dev value as error and the error value as dev. > > > +); > > + > > +TRACE_EVENT(fuse_iomap_ioend_error, > > + TP_PROTO(const struct inode *inode, > > + const struct fuse_iomap_ioend_in *inarg, > > + const struct fuse_iomap_ioend_out *outarg, > > + int error), > > + > > + TP_ARGS(inode, inarg, outarg, error), > > + > > + TP_STRUCT__entry( > > + FUSE_IO_RANGE_FIELDS() > > + __field(unsigned, ioendflags) > > + __field(int, error) > > + __field(uint32_t, dev) > > + __field(uint64_t, new_addr) > > + __field(uint64_t, new_size) > > + ), > > + > > + TP_fast_assign( > > + FUSE_INODE_ASSIGN(inode, fi, fm); > > + __entry->offset = inarg->pos; > > + __entry->length = inarg->written; > > + __entry->ioendflags = inarg->flags; > > + __entry->error = error; > > + __entry->dev = inarg->dev; > > + __entry->new_addr = inarg->new_addr; > > + __entry->new_size = outarg->newsize; > > + ), > > + > > + TP_printk(FUSE_IO_RANGE_FMT() " ioendflags (%s) error %d dev %u new_addr 0x%llx new_size 0x%llx", > > + FUSE_IO_RANGE_PRINTK_ARGS(), > > + __print_flags(__entry->ioendflags, "|", FUSE_IOMAP_IOEND_STRINGS), > > + __entry->dev, > > + __entry->error, > > + __entry->new_addr, > > + __entry->new_size) > ^^^^ > > Same issue here. The format string expects 'error %d dev %u' but the > arguments are provided as __entry->dev, __entry->error, swapping the > values in the trace output. Yep, both wrong, both fixed. Thanks! --D > > > +); > > + > > [ ... ] > > ^ permalink raw reply [flat|nested] 52+ messages in thread
[parent not found: <176169810612.1424854.16053093294573829123.stgit@frogsfrogsfrogs>]
* Re: [PATCH 12/31] fuse: implement direct IO with iomap [not found] ` <176169810612.1424854.16053093294573829123.stgit@frogsfrogsfrogs> @ 2026-01-23 18:56 ` Joanne Koong 2026-01-26 23:46 ` Darrick J. Wong 2026-02-05 19:19 ` Chris Mason 1 sibling, 1 reply; 52+ messages in thread From: Joanne Koong @ 2026-01-23 18:56 UTC (permalink / raw) To: Darrick J. Wong; +Cc: miklos, bernd, neal, linux-ext4, linux-fsdevel On Tue, Oct 28, 2025 at 5:48 PM Darrick J. Wong <djwong@kernel.org> wrote: > > From: Darrick J. Wong <djwong@kernel.org> > > Start implementing the fuse-iomap file I/O paths by adding direct I/O > support and all the signalling flags that come with it. Buffered I/O > is much more complicated, so we leave that to a subsequent patch. Overall, this makes sense to me. Left a few comments below. > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org> > --- > fs/fuse/fuse_i.h | 30 +++++ > include/uapi/linux/fuse.h | 22 ++++ > fs/fuse/dir.c | 7 + > fs/fuse/file.c | 16 +++ > fs/fuse/file_iomap.c | 249 +++++++++++++++++++++++++++++++++++++++++++++ > fs/fuse/trace.c | 1 > 6 files changed, 323 insertions(+), 2 deletions(-) > > > diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h > index e949bfe022c3b0..be0e95924a24af 100644 > --- a/include/uapi/linux/fuse.h > +++ b/include/uapi/linux/fuse.h > @@ -672,6 +672,7 @@ enum fuse_opcode { > FUSE_STATX = 52, > FUSE_COPY_FILE_RANGE_64 = 53, > > + FUSE_IOMAP_IOEND = 4093, > FUSE_IOMAP_BEGIN = 4094, > FUSE_IOMAP_END = 4095, > > @@ -1406,4 +1407,25 @@ struct fuse_iomap_end_in { > struct fuse_iomap_io map; > }; > > +/* out of place write extent */ > +#define FUSE_IOMAP_IOEND_SHARED (1U << 0) > +/* unwritten extent */ > +#define FUSE_IOMAP_IOEND_UNWRITTEN (1U << 1) > +/* don't merge into previous ioend */ > +#define FUSE_IOMAP_IOEND_BOUNDARY (1U << 2) > +/* is direct I/O */ > +#define FUSE_IOMAP_IOEND_DIRECT (1U << 3) > +/* is append ioend */ > +#define FUSE_IOMAP_IOEND_APPEND (1U << 4) > + > +struct 
fuse_iomap_ioend_in { > + uint32_t ioendflags; /* FUSE_IOMAP_IOEND_* */ Hmm, maybe just "flags" is descriptive enough? Or if not, then "ioend_flags"? > + int32_t error; /* negative errno or 0 */ > + uint64_t attr_ino; /* matches fuse_attr:ino */ > + uint64_t pos; /* file position, in bytes */ > + uint64_t new_addr; /* disk offset of new mapping, in bytes */ > + uint32_t written; /* bytes processed */ Is uint32_t enough here or does it need to be bigger? Asking mostly because I see in fuse_iomap_ioend() that the written passed in is size_t. > + uint32_t reserved1; /* zero */ > +}; > + > #endif /* _LINUX_FUSE_H */ > diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c > index bafc386f2f4d3a..171f38ba734d16 100644 > --- a/fs/fuse/dir.c > +++ b/fs/fuse/dir.c > @@ -712,6 +712,10 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir, > if (err) > goto out_acl_release; > fuse_dir_changed(dir); > + > + if (fuse_inode_has_iomap(inode)) > + fuse_iomap_open(inode, file); > + > err = generic_file_open(inode, file); > if (!err) { > file->private_data = ff; > @@ -1743,6 +1747,9 @@ static int fuse_dir_open(struct inode *inode, struct file *file) > if (fuse_is_bad(inode)) > return -EIO; > > + if (fuse_inode_has_iomap(inode)) > + fuse_iomap_open(inode, file); > + > err = generic_file_open(inode, file); > if (err) > return err; > diff --git a/fs/fuse/file.c b/fs/fuse/file.c > index 8a981f41b1dbd0..43007cea550ae7 100644 > --- a/fs/fuse/file.c > +++ b/fs/fuse/file.c > @@ -246,6 +246,9 @@ static int fuse_open(struct inode *inode, struct file *file) > if (fuse_is_bad(inode)) > return -EIO; > > + if (is_iomap) > + fuse_iomap_open(inode, file); > + AFAICT, there aren't any calls to generic_file_open() where we don't also do this "if (is_iomap) ..." check, so maybe we should just put this logic inside generic_file_open()? 
> err = generic_file_open(inode, file); > if (err) > return err; > @@ -1751,10 +1754,17 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) > struct file *file = iocb->ki_filp; > struct fuse_file *ff = file->private_data; > struct inode *inode = file_inode(file); > + ssize_t ret; > > if (fuse_is_bad(inode)) > return -EIO; > > + if (fuse_want_iomap_directio(iocb)) { In fuse, directio is also done if the server sets FOPEN_DIRECT_IO as part of the struct fuse_open_out open_flags arg, even if iocb->ki_flags doesn't have IOCB_DIRECT set. > + ret = fuse_iomap_direct_read(iocb, to); > + if (ret != -ENOSYS) Hmm, where does fuse_iomap_direct_read() return -ENOSYS? afaict, neither fuse_iomap_ilock_iocb() nor iomap_dio_rw() do? > + return ret; > + } I see that later on, in the patch that adds the implementation for buffered IO with iomap, this logic later becomes something like if (fuse_want_iomap_directio(iocb)) { ... } if (fuse_want_iomap_buffered_io(iocb)) return fuse_iomap_buffered_read(iocb, to); imo (if -ENOSYS is indeed not possible) something like this is maybe cleaner: if (fuse_inode_has_iomap(inode)) fuse_iomap_read_iter(iocb, to); to move as much iomap-specific logic away from generic fuse files? 
And then I think this would also let us get rid of the fuse_want_iomap_directio()/fuse_want_iomap_buffered_io() helpers, eg: ssize_t fuse_iomap_read_iter(struct kiocb *iocb, struct iov_iter *to) { if (iocb->ki_flags & IOCB_DIRECT) return fuse_iomap_direct_read(iocb, to); return fuse_iomap_buffered_read(iocb, to); } > + > if (FUSE_IS_DAX(inode)) > return fuse_dax_read_iter(iocb, to); > > @@ -1776,6 +1786,12 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) > if (fuse_is_bad(inode)) > return -EIO; > > + if (fuse_want_iomap_directio(iocb)) { > + ssize_t ret = fuse_iomap_direct_write(iocb, from); > + if (ret != -ENOSYS) > + return ret; > + } Same questions as above about -ENOSYS > + > if (FUSE_IS_DAX(inode)) > return fuse_dax_write_iter(iocb, from); > > diff --git a/fs/fuse/file_iomap.c b/fs/fuse/file_iomap.c > index c63527cec0448b..4db2acd8bc9925 100644 > --- a/fs/fuse/file_iomap.c > +++ b/fs/fuse/file_iomap.c > @@ -495,10 +495,15 @@ static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t count, > } > > /* Decide if we send FUSE_IOMAP_END to the fuse server */ > -static bool fuse_should_send_iomap_end(const struct iomap *iomap, > +static bool fuse_should_send_iomap_end(const struct fuse_mount *fm, > + const struct iomap *iomap, > unsigned int opflags, loff_t count, > ssize_t written) > { > + /* Not implemented on fuse server */ > + if (fm->fc->iomap_conn.no_end) > + return false; > + > /* fuse server demanded an iomap_end call. 
*/ > if (iomap->flags & FUSE_IOMAP_F_WANT_IOMAP_END) > return true; > @@ -523,7 +528,7 @@ static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t count, > struct fuse_mount *fm = get_fuse_mount(inode); > int err = 0; > > - if (fuse_should_send_iomap_end(iomap, opflags, count, written)) { > + if (fuse_should_send_iomap_end(fm, iomap, opflags, count, written)) { > struct fuse_iomap_end_in inarg = { > .opflags = fuse_iomap_op_to_server(opflags), > .attr_ino = fi->orig_ino, > @@ -549,6 +554,7 @@ static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t count, > * libfuse returns ENOSYS for servers that don't > * implement iomap_end > */ > + fm->fc->iomap_conn.no_end = 1; > err = 0; > break; > case 0: > @@ -567,6 +573,95 @@ static const struct iomap_ops fuse_iomap_ops = { > .iomap_end = fuse_iomap_end, > }; > > +static inline bool > +fuse_should_send_iomap_ioend(const struct fuse_mount *fm, > + const struct fuse_iomap_ioend_in *inarg) > +{ > + /* Not implemented on fuse server */ > + if (fm->fc->iomap_conn.no_ioend) > + return false; > + > + /* Always send an ioend for errors. */ > + if (inarg->error) > + return true; > + > + /* Send an ioend if we performed an IO involving metadata changes. */ > + return inarg->written > 0 && > + (inarg->ioendflags & (FUSE_IOMAP_IOEND_SHARED | > + FUSE_IOMAP_IOEND_UNWRITTEN | > + FUSE_IOMAP_IOEND_APPEND)); > +} > + > +/* > + * Fast and loose check if this write could update the on-disk inode size. 
> + */ > +static inline bool fuse_ioend_is_append(const struct fuse_inode *fi, > + loff_t pos, size_t written) > +{ > + return pos + written > i_size_read(&fi->inode); > +} > + > +static int fuse_iomap_ioend(struct inode *inode, loff_t pos, size_t written, > + int error, unsigned ioendflags, sector_t new_addr) > +{ > + struct fuse_inode *fi = get_fuse_inode(inode); > + struct fuse_mount *fm = get_fuse_mount(inode); > + struct fuse_iomap_ioend_in inarg = { > + .ioendflags = ioendflags, > + .error = error, > + .attr_ino = fi->orig_ino, > + .pos = pos, > + .written = written, > + .new_addr = new_addr, > + }; > + > + if (fuse_ioend_is_append(fi, pos, written)) > + inarg.ioendflags |= FUSE_IOMAP_IOEND_APPEND; > + > + if (fuse_should_send_iomap_ioend(fm, &inarg)) { > + FUSE_ARGS(args); > + int err; > + > + args.opcode = FUSE_IOMAP_IOEND; > + args.nodeid = get_node_id(inode); > + args.in_numargs = 1; > + args.in_args[0].size = sizeof(inarg); > + args.in_args[0].value = &inarg; > + err = fuse_simple_request(fm, &args); > + switch (err) { > + case -ENOSYS: > + /* > + * fuse servers can return ENOSYS if ioend processing > + * is never needed for this filesystem. > + */ > + fm->fc->iomap_conn.no_ioend = 1; > + err = 0; It doesn't look like we need to set err here or maybe I'm missing something > + break; > + case 0: > + break; > + default: > + /* > + * If the write IO failed, return the failure code to > + * the caller no matter what happens with the ioend. > + * If the write IO succeeded but the ioend did not, > + * pass the new error up to the caller. > + */ > + if (!error) > + error = err; > + break; > + } > + } > + if (error) > + return error; > + > + /* > + * If there weren't any ioend errors, update the incore isize, which Not sure if incore is a standard term, but it had me confused for a bit. I think incore just means kernel-internal? > + * confusingly takes the new i_size as "pos". 
> + */ > + fuse_write_update_attr(inode, pos + written, written); > + return 0; > +} > + > static int fuse_iomap_may_admin(struct fuse_conn *fc, unsigned int flags) > { > if (!fc->iomap) > @@ -618,6 +713,8 @@ void fuse_iomap_mount(struct fuse_mount *fm) > * freeze/thaw properly. > */ > fc->sync_fs = true; > + fc->iomap_conn.no_end = 0; > + fc->iomap_conn.no_ioend = 0; fc after it's first allocated has all its fields memset to 0 > } > > void fuse_iomap_unmount(struct fuse_mount *fm) > @@ -760,3 +857,151 @@ loff_t fuse_iomap_lseek(struct file *file, loff_t offset, int whence) > return offset; > return vfs_setpos(file, offset, inode->i_sb->s_maxbytes); > } > + > +void fuse_iomap_open(struct inode *inode, struct file *file) > +{ > + ASSERT(fuse_inode_has_iomap(inode)); > + > + file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; > +} > + > +enum fuse_ilock_type { > + SHARED, > + EXCL, > +}; > + > +static int fuse_iomap_ilock_iocb(const struct kiocb *iocb, > + enum fuse_ilock_type type) > +{ > + struct inode *inode = file_inode(iocb->ki_filp); > + > + if (iocb->ki_flags & IOCB_NOWAIT) { > + switch (type) { > + case SHARED: > + return inode_trylock_shared(inode) ? 0 : -EAGAIN; > + case EXCL: > + return inode_trylock(inode) ? 0 : -EAGAIN; > + default: > + ASSERT(0); > + return -EIO; > + } > + } else { nit: the else {} scoping doesn't seem needed here > + switch (type) { > + case SHARED: > + inode_lock_shared(inode); > + break; > + case EXCL: > + inode_lock(inode); > + break; > + default: > + ASSERT(0); > + return -EIO; > + } > + } > + > + return 0; > +} > + > +ssize_t fuse_iomap_direct_read(struct kiocb *iocb, struct iov_iter *to) > +{ > + struct inode *inode = file_inode(iocb->ki_filp); > + ssize_t ret; > + > + ASSERT(fuse_inode_has_iomap(inode)); > + > + if (!iov_iter_count(to)) > + return 0; /* skip atime */ > + > + file_accessed(iocb->ki_filp); Does it make sense for this to be moved below so it's called only if fuse_iomap_ilock_iocb() succeeded? 
> + > + ret = fuse_iomap_ilock_iocb(iocb, SHARED); > + if (ret) > + return ret; > + ret = iomap_dio_rw(iocb, to, &fuse_iomap_ops, NULL, 0, NULL, 0); > + inode_unlock_shared(inode); > + > + return ret; > +} > + > +static int fuse_iomap_dio_write_end_io(struct kiocb *iocb, ssize_t written, > + int error, unsigned dioflags) > +{ > + struct inode *inode = file_inode(iocb->ki_filp); > + unsigned int nofs_flag; > + unsigned int ioendflags = FUSE_IOMAP_IOEND_DIRECT; > + int ret; > + > + if (fuse_is_bad(inode)) > + return -EIO; > + > + ASSERT(fuse_inode_has_iomap(inode)); > + > + if (dioflags & IOMAP_DIO_COW) > + ioendflags |= FUSE_IOMAP_IOEND_SHARED; > + if (dioflags & IOMAP_DIO_UNWRITTEN) > + ioendflags |= FUSE_IOMAP_IOEND_UNWRITTEN; > + > + /* > + * We can allocate memory here while doing writeback on behalf of > + * memory reclaim. To avoid memory allocation deadlocks set the > + * task-wide nofs context for the following operations. > + */ > + nofs_flag = memalloc_nofs_save(); I'm a bit confused by this part. Could you explain how it's invoked while doing writeback for memory reclaim? As I understand it, writeback goes through buffered io and not direct io? > + ret = fuse_iomap_ioend(inode, iocb->ki_pos, written, error, ioendflags, > + FUSE_IOMAP_NULL_ADDR); > + memalloc_nofs_restore(nofs_flag); > + return ret; > +} > + > +static const struct iomap_dio_ops fuse_iomap_dio_write_ops = { > + .end_io = fuse_iomap_dio_write_end_io, > +}; > + > +ssize_t fuse_iomap_direct_write(struct kiocb *iocb, struct iov_iter *from) > +{ > + struct inode *inode = file_inode(iocb->ki_filp); > + loff_t blockmask = i_blocksize(inode) - 1; > + size_t count = iov_iter_count(from); > + unsigned int flags = 0; > + ssize_t ret; > + > + ASSERT(fuse_inode_has_iomap(inode)); > + > + if (!count) > + return 0; > + > + /* > + * Unaligned direct writes require zeroing of unwritten head and tail > + * blocks. Extending writes require zeroing of post-EOF tail blocks. 
> + * The zeroing writes must complete before we return the direct write > + * to userspace. Don't even bother trying the fast path. > + */ > + if ((iocb->ki_pos | count) & blockmask) > + flags = IOMAP_DIO_FORCE_WAIT; > + > + ret = fuse_iomap_ilock_iocb(iocb, EXCL); > + if (ret) > + goto out_dsync; I wonder if we need the out_dsync goto at all. Maybe just return ret here directly? > + ret = generic_write_checks(iocb, from); > + if (ret <= 0) > + goto out_unlock; > + > + /* > + * If we are doing exclusive unaligned I/O, this must be the only I/O > + * in-flight. Otherwise we risk data corruption due to unwritten > + * extent conversions from the AIO end_io handler. Wait for all other > + * I/O to drain first. > + */ > + if (flags & IOMAP_DIO_FORCE_WAIT) > + inode_dio_wait(inode); > + Should we add a file_modified() call here? > + ret = iomap_dio_rw(iocb, from, &fuse_iomap_ops, > + &fuse_iomap_dio_write_ops, flags, NULL, 0); > + if (ret) > + goto out_unlock; I think we could get rid of this if (ret) check > + > +out_unlock: > + inode_unlock(inode); > +out_dsync: > + return ret; > +} > diff --git a/fs/fuse/trace.c b/fs/fuse/trace.c > index 68d2eecb8559a5..300985d62a2f9b 100644 > --- a/fs/fuse/trace.c > +++ b/fs/fuse/trace.c > @@ -9,6 +9,7 @@ > #include "iomap_i.h" > > #include <linux/pagemap.h> > +#include <linux/iomap.h> Was this meant to be part of the subsequent trace.h patch? I haven't tried compiling this though so maybe I'm missing something but I'm not seeing which part of the logic above needs this. Thanks, Joanne > > #define CREATE_TRACE_POINTS > #include "fuse_trace.h" > ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCH 12/31] fuse: implement direct IO with iomap 2026-01-23 18:56 ` [PATCH 12/31] fuse: " Joanne Koong @ 2026-01-26 23:46 ` Darrick J. Wong 0 siblings, 0 replies; 52+ messages in thread From: Darrick J. Wong @ 2026-01-26 23:46 UTC (permalink / raw) To: Joanne Koong; +Cc: miklos, bernd, neal, linux-ext4, linux-fsdevel On Fri, Jan 23, 2026 at 10:56:14AM -0800, Joanne Koong wrote: > On Tue, Oct 28, 2025 at 5:48 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > From: Darrick J. Wong <djwong@kernel.org> > > > > Start implementing the fuse-iomap file I/O paths by adding direct I/O > > support and all the signalling flags that come with it. Buffered I/O > > is much more complicated, so we leave that to a subsequent patch. > > Overall, this makes sense to me. Left a few comments below. <nod> > > > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org> > > --- > > fs/fuse/fuse_i.h | 30 +++++ > > include/uapi/linux/fuse.h | 22 ++++ > > fs/fuse/dir.c | 7 + > > fs/fuse/file.c | 16 +++ > > fs/fuse/file_iomap.c | 249 +++++++++++++++++++++++++++++++++++++++++++++ > > fs/fuse/trace.c | 1 > > 6 files changed, 323 insertions(+), 2 deletions(-) > > > > > > diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h > > index e949bfe022c3b0..be0e95924a24af 100644 > > --- a/include/uapi/linux/fuse.h > > +++ b/include/uapi/linux/fuse.h > > @@ -672,6 +672,7 @@ enum fuse_opcode { > > FUSE_STATX = 52, > > FUSE_COPY_FILE_RANGE_64 = 53, > > > > + FUSE_IOMAP_IOEND = 4093, > > FUSE_IOMAP_BEGIN = 4094, > > FUSE_IOMAP_END = 4095, > > > > @@ -1406,4 +1407,25 @@ struct fuse_iomap_end_in { > > struct fuse_iomap_io map; > > }; > > > > +/* out of place write extent */ > > +#define FUSE_IOMAP_IOEND_SHARED (1U << 0) > > +/* unwritten extent */ > > +#define FUSE_IOMAP_IOEND_UNWRITTEN (1U << 1) > > +/* don't merge into previous ioend */ > > +#define FUSE_IOMAP_IOEND_BOUNDARY (1U << 2) > > +/* is direct I/O */ > > +#define FUSE_IOMAP_IOEND_DIRECT (1U << 3) > > +/* is append ioend */ > > 
+#define FUSE_IOMAP_IOEND_APPEND (1U << 4) > > + > > +struct fuse_iomap_ioend_in { > > + uint32_t ioendflags; /* FUSE_IOMAP_IOEND_* */ > > Hmm, maybe just "flags" is descriptive enough? Or if not, then "ioend_flags"? flags is fine, will change. > > + int32_t error; /* negative errno or 0 */ > > + uint64_t attr_ino; /* matches fuse_attr:ino */ > > + uint64_t pos; /* file position, in bytes */ > > + uint64_t new_addr; /* disk offset of new mapping, in bytes */ > > + uint32_t written; /* bytes processed */ > > Is uint32_t enough here or does it need to be bigger? Asking mostly > because I see in fuse_iomap_ioend() that the written passed in is > size_t. Hrmm. A directio write cannot exceed MAX_RW_COUNT, which is slightly less than 2GB. On the other hand, the iomap ioend allows us to chain together up to (IOEND_BATCH_SIZE * folio_size) bytes of writeback completions. Even on x86 that could be 8GB, so you're right, this ought to be a u64. > > + uint32_t reserved1; /* zero */ > > +}; > > + > > #endif /* _LINUX_FUSE_H */ > > diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c > > index bafc386f2f4d3a..171f38ba734d16 100644 > > --- a/fs/fuse/dir.c > > +++ b/fs/fuse/dir.c > > @@ -712,6 +712,10 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir, > > if (err) > > goto out_acl_release; > > fuse_dir_changed(dir); > > + > > + if (fuse_inode_has_iomap(inode)) > > + fuse_iomap_open(inode, file); > > + > > err = generic_file_open(inode, file); > > if (!err) { > > file->private_data = ff; > > @@ -1743,6 +1747,9 @@ static int fuse_dir_open(struct inode *inode, struct file *file) > > if (fuse_is_bad(inode)) > > return -EIO; > > > > + if (fuse_inode_has_iomap(inode)) > > + fuse_iomap_open(inode, file); > > + > > err = generic_file_open(inode, file); > > if (err) > > return err; > > diff --git a/fs/fuse/file.c b/fs/fuse/file.c > > index 8a981f41b1dbd0..43007cea550ae7 100644 > > --- a/fs/fuse/file.c > > +++ b/fs/fuse/file.c > > @@ -246,6 +246,9 @@ static int 
fuse_open(struct inode *inode, struct file *file) > > if (fuse_is_bad(inode)) > > return -EIO; > > > > + if (is_iomap) > > + fuse_iomap_open(inode, file); > > + > > AFAICT, there aren't any calls to generic_file_open() where we don't > also do this "if (is_iomap) ..." check, so maybe we should just put > this logic inside generic_file_open()? I'm confused by your question; there are many users of generic_file_open...? $ git grep generic_file_open fs/9p/vfs_inode.c:800: err = finish_open(file, dentry, generic_file_open); fs/9p/vfs_inode_dotl.c:317: err = finish_open(file, dentry, generic_file_open); fs/btrfs/file.c:3821: return generic_file_open(inode, filp); fs/fuse/dir.c:916: err = generic_file_open(inode, file); fs/fuse/dir.c:2007: err = generic_file_open(inode, file); fs/fuse/file.c:270: err = generic_file_open(inode, file); fs/gfs2/file.c:632: ret = generic_file_open(inode, file); fs/jffs2/file.c:55: .open = generic_file_open, fs/nilfs2/file.c:148: .open = generic_file_open, fs/ntfs3/file.c:1364: return generic_file_open(inode, file); fs/open.c:1607:int generic_file_open(struct inode * inode, struct file * filp) fs/open.c:1614:EXPORT_SYMBOL(generic_file_open); fs/orangefs/file.c:580: .open = generic_file_open, fs/quota/dquot.c:2219: error = generic_file_open(inode, file); fs/smb/client/dir.c:532: rc = finish_open(file, direntry, generic_file_open); fs/udf/file.c:203: .open = generic_file_open, fs/ufs/file.c:42: .open = generic_file_open, fs/vboxsf/dir.c:341: err = finish_open(file, dentry, generic_file_open); fs/xfs/xfs_file.c:1612: return generic_file_open(inode, file); fs/xfs/xfs_file.c:1626: error = generic_file_open(inode, file); ... 
> > err = generic_file_open(inode, file); > > if (err) > > return err; > > @@ -1751,10 +1754,17 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) > > struct file *file = iocb->ki_filp; > > struct fuse_file *ff = file->private_data; > > struct inode *inode = file_inode(file); > > + ssize_t ret; > > > > if (fuse_is_bad(inode)) > > return -EIO; > > > > + if (fuse_want_iomap_directio(iocb)) { > > In fuse, directio is also done if the server sets FOPEN_DIRECT_IO as > part of the struct fuse_open_out open_flags arg, even if > iocb->ki_flags doesn't have IOCB_DIRECT set. Yikes. Let me look at the other fuse FOPEN_ flags: * FOPEN_DIRECT_IO: bypass page cache for this open file Hrm. This wouldn't be hard to implement, but what happens if the read or write range aren't aligned to the block size? Is it ok to return EINVAL here? * FOPEN_KEEP_CACHE: don't invalidate the data cache on open fuse_open takes care of this too. Evidently I need to fix fuse4fs to set this flag. * FOPEN_NONSEEKABLE: the file is not seekable * FOPEN_STREAM: the file is stream-like (no file position at all) Not sure why you'd have a non-seekable regular iomap file but I think fuse_finish_open takes care of this. * FOPEN_CACHE_DIR: allow caching this directory Oh wow building up a dirent cache when someone does a readdir. fuse4fs should set this too. * FOPEN_NOFLUSH: don't flush data cache on close (unless FUSE_WRITEBACK_CACHE) Hrm. In theory fuse4fs could set this one too, but I think given the higher chance of failure (or the usb stick being yanked out) I might not set this one. * FOPEN_PARALLEL_DIRECT_WRITES: Allow concurrent direct writes on the same inode iomap supports this, but fuse-iomap does not. Are we supposed to return EOPNOTSUPP/EINVAL or can we quietly ignore this flag until we support it? * FOPEN_PASSTHROUGH: passthrough read/write io for this open file I need to screen this out too. 
> > + ret = fuse_iomap_direct_read(iocb, to); > > + if (ret != -ENOSYS) > > Hmm, where does fuse_iomap_direct_read() return -ENOSYS? afaict, > neither fuse_iomap_ilock_iocb() nor iomap_dio_rw() do? I /think/ the fuse server can reply with ENOSYS to fuse_iomap_begin(), in which case it'll percolate upwards to iomap_iter into dio->error where it can then be returned through iomap_dio_complete into iomap_dio_rw into fuse_iomap_direct_read. That will demote a directio... > > + return ret; > > + } > > I see that later on, in the patch that adds the implementation for > buffered IO with iomap, this logic later becomes something like > > if (fuse_want_iomap_directio(iocb)) { > ... > } > if (fuse_want_iomap_buffered_io(iocb)) > return fuse_iomap_buffered_read(iocb, to); ...into a buffered io. Maybe I should special-case ENOTBLK since that's what the other iomap users do. Originally directio on XFS would never fall back to buffered io. Then we introduced out of place writes for reflink, which forced us to implement the fallback for sub-allocation-unit directio writes that were aligned to LBA size (since XFS has always allowed that). Later on ext4 (which always allowed fallbacks) entered the chat and now we're stuck with it. :) > imo (if -ENOSYS is indeed not possible) something like this is maybe cleaner: > > if (fuse_inode_has_iomap(inode)) > fuse_iomap_read_iter(iocb, to); > > to move as much iomap-specific logic away from generic fuse files? That would be cleaner. It might even be cleaner to give iomap files their own file_operations structure completely. 
> fuse_want_iomap_directio()/fuse_want_iomap_buffered_io() helpers, eg: > > ssize_t fuse_iomap_read_iter(struct kiocb *iocb, struct iov_iter *to) { > if (iocb->ki_flags & IOCB_DIRECT) > return fuse_iomap_direct_read(iocb, to); > return fuse_iomap_buffered_read(iocb, to); > } > > > + > > if (FUSE_IS_DAX(inode)) > > return fuse_dax_read_iter(iocb, to); > > > > @@ -1776,6 +1786,12 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) > > if (fuse_is_bad(inode)) > > return -EIO; > > > > + if (fuse_want_iomap_directio(iocb)) { > > + ssize_t ret = fuse_iomap_direct_write(iocb, from); > > + if (ret != -ENOSYS) > > + return ret; > > + } > > Same questions as above about -ENOSYS Same weird answer, too. :P > > + > > if (FUSE_IS_DAX(inode)) > > return fuse_dax_write_iter(iocb, from); > > > > diff --git a/fs/fuse/file_iomap.c b/fs/fuse/file_iomap.c > > index c63527cec0448b..4db2acd8bc9925 100644 > > --- a/fs/fuse/file_iomap.c > > +++ b/fs/fuse/file_iomap.c > > @@ -495,10 +495,15 @@ static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t count, > > } > > > > /* Decide if we send FUSE_IOMAP_END to the fuse server */ > > -static bool fuse_should_send_iomap_end(const struct iomap *iomap, > > +static bool fuse_should_send_iomap_end(const struct fuse_mount *fm, > > + const struct iomap *iomap, > > unsigned int opflags, loff_t count, > > ssize_t written) > > { > > + /* Not implemented on fuse server */ > > + if (fm->fc->iomap_conn.no_end) > > + return false; > > + > > /* fuse server demanded an iomap_end call. 
*/ > > if (iomap->flags & FUSE_IOMAP_F_WANT_IOMAP_END) > > return true; > > @@ -523,7 +528,7 @@ static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t count, > > struct fuse_mount *fm = get_fuse_mount(inode); > > int err = 0; > > > > - if (fuse_should_send_iomap_end(iomap, opflags, count, written)) { > > + if (fuse_should_send_iomap_end(fm, iomap, opflags, count, written)) { > > struct fuse_iomap_end_in inarg = { > > .opflags = fuse_iomap_op_to_server(opflags), > > .attr_ino = fi->orig_ino, > > @@ -549,6 +554,7 @@ static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t count, > > * libfuse returns ENOSYS for servers that don't > > * implement iomap_end > > */ > > + fm->fc->iomap_conn.no_end = 1; > > err = 0; > > break; > > case 0: > > @@ -567,6 +573,95 @@ static const struct iomap_ops fuse_iomap_ops = { > > .iomap_end = fuse_iomap_end, > > }; > > > > +static inline bool > > +fuse_should_send_iomap_ioend(const struct fuse_mount *fm, > > + const struct fuse_iomap_ioend_in *inarg) > > +{ > > + /* Not implemented on fuse server */ > > + if (fm->fc->iomap_conn.no_ioend) > > + return false; > > + > > + /* Always send an ioend for errors. */ > > + if (inarg->error) > > + return true; > > + > > + /* Send an ioend if we performed an IO involving metadata changes. */ > > + return inarg->written > 0 && > > + (inarg->ioendflags & (FUSE_IOMAP_IOEND_SHARED | > > + FUSE_IOMAP_IOEND_UNWRITTEN | > > + FUSE_IOMAP_IOEND_APPEND)); > > +} > > + > > +/* > > + * Fast and loose check if this write could update the on-disk inode size. 
> > + */ > > +static inline bool fuse_ioend_is_append(const struct fuse_inode *fi, > > + loff_t pos, size_t written) > > +{ > > + return pos + written > i_size_read(&fi->inode); > > +} > > + > > +static int fuse_iomap_ioend(struct inode *inode, loff_t pos, size_t written, > > + int error, unsigned ioendflags, sector_t new_addr) > > +{ > > + struct fuse_inode *fi = get_fuse_inode(inode); > > + struct fuse_mount *fm = get_fuse_mount(inode); > > + struct fuse_iomap_ioend_in inarg = { > > + .ioendflags = ioendflags, > > + .error = error, > > + .attr_ino = fi->orig_ino, > > + .pos = pos, > > + .written = written, > > + .new_addr = new_addr, > > + }; > > + > > + if (fuse_ioend_is_append(fi, pos, written)) > > + inarg.ioendflags |= FUSE_IOMAP_IOEND_APPEND; > > + > > + if (fuse_should_send_iomap_ioend(fm, &inarg)) { > > + FUSE_ARGS(args); > > + int err; > > + > > + args.opcode = FUSE_IOMAP_IOEND; > > + args.nodeid = get_node_id(inode); > > + args.in_numargs = 1; > > + args.in_args[0].size = sizeof(inarg); > > + args.in_args[0].value = &inarg; > > + err = fuse_simple_request(fm, &args); > > + switch (err) { > > + case -ENOSYS: > > + /* > > + * fuse servers can return ENOSYS if ioend processing > > + * is never needed for this filesystem. > > + */ > > + fm->fc->iomap_conn.no_ioend = 1; > > + err = 0; > > It doesn't look like we need to set err here or maybe I'm missing something Maybe we both are? :D There shouldn't be variables named @error _and_ @err in the same function; and the err assignment here is indeed pointless as you state. But that's a very confusing thing for me to have done. fuse_iomap_ioend should return the error passed into it; or if no error was passed into it, then it can convey an error that occurred during processing of the ioend itself (e.g. remapping after a write failed). > > + break; > > + case 0: > > + break; > > + default: > > + /* > > + * If the write IO failed, return the failure code to > > + * the caller no matter what happens with the ioend.
> > + * If the write IO succeeded but the ioend did not, > > + * pass the new error up to the caller. > > + */ > > + if (!error) > > + error = err; > > + break; > > + } > > + } > > + if (error) > > + return error; > > + > > + /* > > + * If there weren't any ioend errors, update the incore isize, which > > Not sure if incore is a standard term, but it had me confused for a > bit. I think incore just means kernel-internal? Yes. Later on in the pagecache path we'll introduce the notion of the "ondisk" isize. Both of these names are iomap/xfs anachronisms. The incore isize is of course the same kernel-internal file size. The "ondisk" isize is the file size according to the fuse server. Because iomap supports things like delalloc, this means that the in-memory file can become quite a bit larger than what the kernel has pushed to disk (and the fuse server) via writeback. If userspace asks the kernel to do something that requires immediate metadata changes such as fallocate, it's critical to preflush anything dirty between the "ondisk" EOF and the affected range. Example: Let's say you write a 100K file and do not fsync it. The incore isize is 100K, but nobody's told the fuse server anything so it thinks the file size is still empty. Next, you pwrite another 22K but this time at offset 20,000K. The incore isize is now 22,022K, but pagecache writeback hasn't triggered yet, so the fuse server thinks the file is still empty. Now you ask to fallocate 4K at offset 10,000K. This requires a trip to the fuse server to fill that 4k hole, but before you can do that, you have to flush any dirty data between the ondisk size (0) and the start of the fallocate range (10000K) because the fuse command will change the file size to 10,004K, and as the write came before the fallocate, the effects of that write must be persisted before the effects of the fallocate. But at this point we're only doing direct writes so this is a confusing long way to update isize on an extending write. 
> > + * confusingly takes the new i_size as "pos". > > + */ > > + fuse_write_update_attr(inode, pos + written, written); > > + return 0; > > +} > > + > > static int fuse_iomap_may_admin(struct fuse_conn *fc, unsigned int flags) > > { > > if (!fc->iomap) > > @@ -618,6 +713,8 @@ void fuse_iomap_mount(struct fuse_mount *fm) > > * freeze/thaw properly. > > */ > > fc->sync_fs = true; > > + fc->iomap_conn.no_end = 0; > > + fc->iomap_conn.no_ioend = 0; > > fc after it's first allocated has all its fields memset to 0 Ok, will fix. > > } > > > > void fuse_iomap_unmount(struct fuse_mount *fm) > > @@ -760,3 +857,151 @@ loff_t fuse_iomap_lseek(struct file *file, loff_t offset, int whence) > > return offset; > > return vfs_setpos(file, offset, inode->i_sb->s_maxbytes); > > } > > + > > +void fuse_iomap_open(struct inode *inode, struct file *file) > > +{ > > + ASSERT(fuse_inode_has_iomap(inode)); > > + > > + file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; > > +} > > + > > +enum fuse_ilock_type { > > + SHARED, > > + EXCL, > > +}; > > + > > +static int fuse_iomap_ilock_iocb(const struct kiocb *iocb, > > + enum fuse_ilock_type type) > > +{ > > + struct inode *inode = file_inode(iocb->ki_filp); > > + > > + if (iocb->ki_flags & IOCB_NOWAIT) { > > + switch (type) { > > + case SHARED: > > + return inode_trylock_shared(inode) ? 0 : -EAGAIN; > > + case EXCL: > > + return inode_trylock(inode) ? 
0 : -EAGAIN; > > + default: > > + ASSERT(0); > > + return -EIO; > > + } > > + } else { > > nit: the else {} scoping doesn't seem needed here <nod> > > + switch (type) { > > + case SHARED: > > + inode_lock_shared(inode); > > + break; > > + case EXCL: > > + inode_lock(inode); > > + break; > > + default: > > + ASSERT(0); > > + return -EIO; > > + } > > + } > > + > > + return 0; > > +} > > + > > +ssize_t fuse_iomap_direct_read(struct kiocb *iocb, struct iov_iter *to) > > +{ > > + struct inode *inode = file_inode(iocb->ki_filp); > > + ssize_t ret; > > + > > + ASSERT(fuse_inode_has_iomap(inode)); > > + > > + if (!iov_iter_count(to)) > > + return 0; /* skip atime */ > > + > > + file_accessed(iocb->ki_filp); > > Does it make sense for this to be moved below so it's called only if > fuse_iomap_ilock_iocb() succeeded? Ideally we'd do it on successful return from iomap_dio_read. Curiously, XFS does it this way (bump atime, take i_rwsem, do read), whereas ext4 relies on filemap_read, which does it at the end. Weird. > > + > > + ret = fuse_iomap_ilock_iocb(iocb, SHARED); > > + if (ret) > > + return ret; > > + ret = iomap_dio_rw(iocb, to, &fuse_iomap_ops, NULL, 0, NULL, 0); > > + inode_unlock_shared(inode); > > + > > + return ret; > > +} > > + > > +static int fuse_iomap_dio_write_end_io(struct kiocb *iocb, ssize_t written, > > + int error, unsigned dioflags) > > +{ > > + struct inode *inode = file_inode(iocb->ki_filp); > > + unsigned int nofs_flag; > > + unsigned int ioendflags = FUSE_IOMAP_IOEND_DIRECT; > > + int ret; > > + > > + if (fuse_is_bad(inode)) > > + return -EIO; > > + > > + ASSERT(fuse_inode_has_iomap(inode)); > > + > > + if (dioflags & IOMAP_DIO_COW) > > + ioendflags |= FUSE_IOMAP_IOEND_SHARED; > > + if (dioflags & IOMAP_DIO_UNWRITTEN) > > + ioendflags |= FUSE_IOMAP_IOEND_UNWRITTEN; > > + > > + /* > > + * We can allocate memory here while doing writeback on behalf of > > + * memory reclaim. 
To avoid memory allocation deadlocks set the > > + * task-wide nofs context for the following operations. > > + */ > > + nofs_flag = memalloc_nofs_save(); > > I'm a bit confused by this part. Could you explain how it's invoked > while doing writeback for memory reclaim? As I understand it, > writeback goes through buffered io and not direct io? I think this is a throwback to earlier versions of the iomap patchset where I tried using the actual directio write machinery to perform writebacks. They're separate now, so I think this can go away. > > + ret = fuse_iomap_ioend(inode, iocb->ki_pos, written, error, ioendflags, > > + FUSE_IOMAP_NULL_ADDR); > > + memalloc_nofs_restore(nofs_flag); > > + return ret; > > +} > > + > > +static const struct iomap_dio_ops fuse_iomap_dio_write_ops = { > > + .end_io = fuse_iomap_dio_write_end_io, > > +}; > > + > > +ssize_t fuse_iomap_direct_write(struct kiocb *iocb, struct iov_iter *from) > > +{ > > + struct inode *inode = file_inode(iocb->ki_filp); > > + loff_t blockmask = i_blocksize(inode) - 1; > > + size_t count = iov_iter_count(from); > > + unsigned int flags = 0; > > + ssize_t ret; > > + > > + ASSERT(fuse_inode_has_iomap(inode)); > > + > > + if (!count) > > + return 0; > > + > > + /* > > + * Unaligned direct writes require zeroing of unwritten head and tail > > + * blocks. Extending writes require zeroing of post-EOF tail blocks. > > + * The zeroing writes must complete before we return the direct write > > + * to userspace. Don't even bother trying the fast path. > > + */ > > + if ((iocb->ki_pos | count) & blockmask) > > + flags = IOMAP_DIO_FORCE_WAIT; > > + > > + ret = fuse_iomap_ilock_iocb(iocb, EXCL); > > + if (ret) > > + goto out_dsync; > > I wonder if we need the out_dsync goto at all. Maybe just return ret > here directly? Ok. > > + ret = generic_write_checks(iocb, from); > > + if (ret <= 0) > > + goto out_unlock; > > + > > + /* > > + * If we are doing exclusive unaligned I/O, this must be the only I/O > > + * in-flight. 
Otherwise we risk data corruption due to unwritten > > + * extent conversions from the AIO end_io handler. Wait for all other > > + * I/O to drain first. > > + */ > > + if (flags & IOMAP_DIO_FORCE_WAIT) > > + inode_dio_wait(inode); > > + > > Should we add a file_modified() call here? Urk. That will get fixed when I implement fuse_iomap_write_checks in the next patch that does buffered IO, but yes, it's needed here too. > > > + ret = iomap_dio_rw(iocb, from, &fuse_iomap_ops, > > + &fuse_iomap_dio_write_ops, flags, NULL, 0); > > + if (ret) > > + goto out_unlock; > > I think we could get rid of this if (ret) check Will do. > > + > > +out_unlock: > > + inode_unlock(inode); > > +out_dsync: > > + return ret; > > +} > > diff --git a/fs/fuse/trace.c b/fs/fuse/trace.c > > index 68d2eecb8559a5..300985d62a2f9b 100644 > > --- a/fs/fuse/trace.c > > +++ b/fs/fuse/trace.c > > @@ -9,6 +9,7 @@ > > #include "iomap_i.h" > > > > #include <linux/pagemap.h> > > +#include <linux/iomap.h> > > Was this meant to be part of the subsequent trace.h patch? I haven't > tried compiling this though so maybe I'm missing something but I'm not > seeing which part of the logic above needs this. Yes. Originally the tracepoints were not broken out, but Miklos asked for things to be this way. --D > Thanks, > Joanne > > > > #define CREATE_TRACE_POINTS > > #include "fuse_trace.h" > > > ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCH 12/31] fuse: implement direct IO with iomap [not found] ` <176169810612.1424854.16053093294573829123.stgit@frogsfrogsfrogs> 2026-01-23 18:56 ` [PATCH 12/31] fuse: " Joanne Koong @ 2026-02-05 19:19 ` Chris Mason 2026-02-06 2:08 ` Darrick J. Wong 1 sibling, 1 reply; 52+ messages in thread From: Chris Mason @ 2026-02-05 19:19 UTC (permalink / raw) To: Darrick J. Wong Cc: miklos, joannelkoong, bernd, neal, linux-ext4, linux-fsdevel "Darrick J. Wong" <djwong@kernel.org> wrote: > From: Darrick J. Wong <djwong@kernel.org> > > Start implementing the fuse-iomap file I/O paths by adding direct I/O > support and all the signalling flags that come with it. Buffered I/O > is much more complicated, so we leave that to a subsequent patch. > Hi everyone, I'm trying out my AI review prompts on a few more trees, and I ran it on the fuse-iomap-cache branch: https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-iomap-cache This was the latest email lore could find, apologies if I grabbed the wrong one. Hopefully the review below is useful, please let me know if there are places it has gone wildly wrong. I kept the header to make it obvious exactly which commit I'd run through: commit fab88aeb077db04c5f4fdbaff31a58f4dd9450c8 Author: Darrick J. Wong <djwong@kernel.org> fuse: implement direct IO with iomap Start implementing the fuse-iomap file I/O paths by adding direct I/O support and all the signalling flags that come with it. Buffered I/O is much more complicated, so we leave that to a subsequent patch. > diff --git a/fs/fuse/file.c b/fs/fuse/file.c > index bec299865d8fe..07be70d314eff 100644 > --- a/fs/fuse/file.c > +++ b/fs/fuse/file.c [ ... 
] > @@ -408,7 +416,7 @@ void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, > { > WARN_ON(refcount_read(&ff->count) > 1); > fuse_prepare_release(fi, ff, flags, FUSE_RELEASE, true); > - fuse_file_put(ff, true); > + fuse_file_put(ff, &fi->inode, true); > } > EXPORT_SYMBOL_GPL(fuse_sync_release); Can this code dereference a NULL pointer when fi is NULL? Looking at fuse_create_open in dir.c, when fuse_iget() fails, it calls: fuse_sync_release(NULL, ff, flags); The fuse_prepare_release function explicitly handles fi == NULL with the comment "Inode is NULL on error path of fuse_create_open()". However, the new call to fuse_file_put passes &fi->inode, which evaluates to &NULL->inode. While the inode parameter is unused in the current commit's implementation of fuse_file_put, the API contract is broken. In commit 5f4e9030b26b (fuse: enable iomap), fuse_file_put uses the inode parameter to call fuse_inode_has_iomap(inode). Since struct inode is the first member of struct fuse_inode (offsetof == 0), &fi->inode evaluates to NULL, and fuse_inode_has_iomap(NULL) will dereference NULL. > diff --git a/fs/fuse/fuse_iomap.c b/fs/fuse/fuse_iomap.c > index bc55e87751215..9796e3b994565 100644 > --- a/fs/fuse/fuse_iomap.c > +++ b/fs/fuse/fuse_iomap.c [ ... ] > @@ -545,6 +551,120 @@ static const struct iomap_ops fuse_iomap_ops = { > .iomap_end = fuse_iomap_end, > }; > > +static inline bool > +fuse_should_send_iomap_ioend(const struct fuse_mount *fm, > + const struct fuse_iomap_ioend_in *inarg) > +{ > + /* Not implemented on fuse server */ > + if (fm->fc->iomap_conn.no_ioend) > + return false; > + > + /* Always send an ioend for errors. */ > + if (inarg->error) > + return true; > + > + /* Send an ioend if we performed an IO involving metadata changes. */ > + return inarg->written > 0 && > + (inarg->flags & (FUSE_IOMAP_IOEND_SHARED | > + FUSE_IOMAP_IOEND_UNWRITTEN | > + FUSE_IOMAP_IOEND_APPEND)); > +} [ ... 
] > +static int fuse_iomap_ioend(struct inode *inode, loff_t pos, size_t written, > + int error, unsigned ioendflags, > + struct block_device *bdev, sector_t new_addr) > +{ > + struct fuse_inode *fi = get_fuse_inode(inode); > + struct fuse_mount *fm = get_fuse_mount(inode); > + struct fuse_iomap_ioend_in inarg = { > + .flags = ioendflags, > + .error = error, > + .attr_ino = fi->orig_ino, > + .pos = pos, > + .written = written, > + .dev = fuse_iomap_find_backing_id(fm->fc, bdev), > + .new_addr = new_addr, > + }; > + struct fuse_iomap_ioend_out outarg = { }; > + > + if (fuse_ioend_is_append(fi, pos, written)) > + inarg.flags |= FUSE_IOMAP_IOEND_APPEND; > + > + if (fuse_should_send_iomap_ioend(fm, &inarg)) { > + FUSE_ARGS(args); > + int iomap_error; > + > + args.opcode = FUSE_IOMAP_IOEND; > + args.nodeid = get_node_id(inode); > + args.in_numargs = 1; > + args.in_args[0].size = sizeof(inarg); > + args.in_args[0].value = &inarg; > + args.out_numargs = 1; > + args.out_args[0].size = sizeof(outarg); > + args.out_args[0].value = &outarg; > + iomap_error = fuse_simple_request(fm, &args); > + switch (iomap_error) { > + case -ENOSYS: > + /* > + * fuse servers can return ENOSYS if ioend processing > + * is never needed for this filesystem. Don't pass > + * that up to iomap. > + */ > + fm->fc->iomap_conn.no_ioend = 1; > + break; > + case 0: > + break; > + default: > + /* > + * If the write IO failed, return the failure code to > + * the caller no matter what happens with the ioend. > + * If the write IO succeeded but the ioend did not, > + * pass the new error up to the caller. > + */ > + if (!error) > + error = iomap_error; > + break; > + } > + } > + > + /* > + * Pass whatever error iomap gave us (or any new errors since then) > + * back to iomap. > + */ > + if (error) > + return error; > + > + /* > + * If there weren't any ioend errors, update the incore isize, which > + * confusingly takes the new i_size as "pos". 
> + */ > + fi->i_disk_size = outarg.newsize; > + fuse_write_update_attr(inode, pos + written, written); > + return 0; > +} When the fuse server returns -ENOSYS (indicating it doesn't implement FUSE_IOMAP_IOEND), the code sets fm->fc->iomap_conn.no_ioend = 1 but does NOT return early. It falls through to the update code where 'outarg' was zero-initialized and never filled by the server. This corrupts i_disk_size by setting it to 0: fi->i_disk_size = outarg.newsize; // outarg.newsize == 0 On subsequent writes, fuse_should_send_iomap_ioend() returns false due to no_ioend being set, so the code skips the server call and again uses the zero-initialized outarg.newsize. The i_disk_size field is used by fuse_ioend_is_append() to detect append writes, so corrupting it to 0 will cause incorrect append detection for all subsequent writes. Should the code return early when the request is not sent or returns -ENOSYS, or skip the i_disk_size update in this case? > +static int fuse_iomap_ilock_iocb(const struct kiocb *iocb, > + enum fuse_ilock_type type) > +{ > + struct inode *inode = file_inode(iocb->ki_filp); > + > + if (iocb->ki_flags & IOCB_NOWAIT) { > + switch (type) { > + case SHARED: > + return inode_trylock_shared(inode) ? 0 : -EAGAIN; > + case EXCL: > + return inode_trylock(inode) ? 0 : -EAGAIN; > + default: > + ASSERT(0); > + return -EIO; > + } > + > + return 0; > + } This isn't a bug, but the 'return 0;' after the switch statement inside the IOCB_NOWAIT block is unreachable since every switch case returns. ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCH 12/31] fuse: implement direct IO with iomap 2026-02-05 19:19 ` Chris Mason @ 2026-02-06 2:08 ` Darrick J. Wong 2026-02-06 2:52 ` Chris Mason 0 siblings, 1 reply; 52+ messages in thread From: Darrick J. Wong @ 2026-02-06 2:08 UTC (permalink / raw) To: Chris Mason; +Cc: miklos, joannelkoong, bernd, neal, linux-ext4, linux-fsdevel On Thu, Feb 05, 2026 at 11:19:11AM -0800, Chris Mason wrote: > "Darrick J. Wong" <djwong@kernel.org> wrote: > > From: Darrick J. Wong <djwong@kernel.org> > > > > Start implementing the fuse-iomap file I/O paths by adding direct I/O > > support and all the signalling flags that come with it. Buffered I/O > > is much more complicated, so we leave that to a subsequent patch. > > > > Hi everyone, > > I'm trying out my AI review prompts on a few more trees, and I ran it > on the fuse-iomap-cache branch: > > https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-iomap-cache I should mention: I appreciate you rolling out the AI reviews by slowly expanding the number of trees you scan for bugs. > This was the latest email lore could find, apologies if I grabbed the wrong one. > > Hopefully the review below is useful, please let me know if there are places it > has gone wildly wrong. I kept the header to make it obvious exactly which > commit I'd run through: > > commit fab88aeb077db04c5f4fdbaff31a58f4dd9450c8 > Author: Darrick J. Wong <djwong@kernel.org> > > fuse: implement direct IO with iomap > > Start implementing the fuse-iomap file I/O paths by adding direct I/O > support and all the signalling flags that come with it. Buffered I/O is > much more complicated, so we leave that to a subsequent patch. > > > diff --git a/fs/fuse/file.c b/fs/fuse/file.c > > index bec299865d8fe..07be70d314eff 100644 > > --- a/fs/fuse/file.c > > +++ b/fs/fuse/file.c > > [ ... 
] > > > @@ -408,7 +416,7 @@ void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, > > { > > WARN_ON(refcount_read(&ff->count) > 1); > > fuse_prepare_release(fi, ff, flags, FUSE_RELEASE, true); > > - fuse_file_put(ff, true); > > + fuse_file_put(ff, &fi->inode, true); > > } > > EXPORT_SYMBOL_GPL(fuse_sync_release); > > Can this code dereference a NULL pointer when fi is NULL? Yikes, yes it can... > Looking at fuse_create_open in dir.c, when fuse_iget() fails, it calls: > > fuse_sync_release(NULL, ff, flags); > > The fuse_prepare_release function explicitly handles fi == NULL with > the comment "Inode is NULL on error path of fuse_create_open()". > > However, the new call to fuse_file_put passes &fi->inode, which > evaluates to &NULL->inode. While the inode parameter is unused in the > current commit's implementation of fuse_file_put, the API contract is > broken. ...so this is a very good catch! Though as it turns out we don't need this change to fuse_file_put until we actually add fuse_iomap_release() several patchsets from now, so I think I'll revert this part of the patch. > In commit 5f4e9030b26b (fuse: enable iomap), fuse_file_put uses the > inode parameter to call fuse_inode_has_iomap(inode). Since struct inode > is the first member of struct fuse_inode (offsetof == 0), &fi->inode > evaluates to NULL, and fuse_inode_has_iomap(NULL) will dereference NULL. > > > diff --git a/fs/fuse/fuse_iomap.c b/fs/fuse/fuse_iomap.c > > index bc55e87751215..9796e3b994565 100644 > > --- a/fs/fuse/fuse_iomap.c > > +++ b/fs/fuse/fuse_iomap.c > > [ ... ] > > > @@ -545,6 +551,120 @@ static const struct iomap_ops fuse_iomap_ops = { > > .iomap_end = fuse_iomap_end, > > }; > > > > +static inline bool > > +fuse_should_send_iomap_ioend(const struct fuse_mount *fm, > > + const struct fuse_iomap_ioend_in *inarg) > > +{ > > + /* Not implemented on fuse server */ > > + if (fm->fc->iomap_conn.no_ioend) > > + return false; > > + > > + /* Always send an ioend for errors. 
*/ > > + if (inarg->error) > > + return true; > > + > > + /* Send an ioend if we performed an IO involving metadata changes. */ > > + return inarg->written > 0 && > > + (inarg->flags & (FUSE_IOMAP_IOEND_SHARED | > > + FUSE_IOMAP_IOEND_UNWRITTEN | > > + FUSE_IOMAP_IOEND_APPEND)); > > +} > > [ ... ] > > > +static int fuse_iomap_ioend(struct inode *inode, loff_t pos, size_t written, > > + int error, unsigned ioendflags, > > + struct block_device *bdev, sector_t new_addr) > > +{ > > + struct fuse_inode *fi = get_fuse_inode(inode); > > + struct fuse_mount *fm = get_fuse_mount(inode); > > + struct fuse_iomap_ioend_in inarg = { > > + .flags = ioendflags, > > + .error = error, > > + .attr_ino = fi->orig_ino, > > + .pos = pos, > > + .written = written, > > + .dev = fuse_iomap_find_backing_id(fm->fc, bdev), > > + .new_addr = new_addr, > > + }; > > + struct fuse_iomap_ioend_out outarg = { }; > > + > > + if (fuse_ioend_is_append(fi, pos, written)) > > + inarg.flags |= FUSE_IOMAP_IOEND_APPEND; > > + > > + if (fuse_should_send_iomap_ioend(fm, &inarg)) { > > + FUSE_ARGS(args); > > + int iomap_error; > > + > > + args.opcode = FUSE_IOMAP_IOEND; > > + args.nodeid = get_node_id(inode); > > + args.in_numargs = 1; > > + args.in_args[0].size = sizeof(inarg); > > + args.in_args[0].value = &inarg; > > + args.out_numargs = 1; > > + args.out_args[0].size = sizeof(outarg); > > + args.out_args[0].value = &outarg; > > + iomap_error = fuse_simple_request(fm, &args); > > + switch (iomap_error) { > > + case -ENOSYS: > > + /* > > + * fuse servers can return ENOSYS if ioend processing > > + * is never needed for this filesystem. Don't pass > > + * that up to iomap. > > + */ > > + fm->fc->iomap_conn.no_ioend = 1; > > + break; > > + case 0: > > + break; > > + default: > > + /* > > + * If the write IO failed, return the failure code to > > + * the caller no matter what happens with the ioend. > > + * If the write IO succeeded but the ioend did not, > > + * pass the new error up to the caller. 
> > + */ > > + if (!error) > > + error = iomap_error; > > + break; > > + } > > + } > > + > > + /* > > + * Pass whatever error iomap gave us (or any new errors since then) > > + * back to iomap. > > + */ > > + if (error) > > + return error; > > + > > + /* > > + * If there weren't any ioend errors, update the incore isize, which > > + * confusingly takes the new i_size as "pos". > > + */ > > + fi->i_disk_size = outarg.newsize; > > + fuse_write_update_attr(inode, pos + written, written); > > + return 0; > > +} > > When the fuse server returns -ENOSYS (indicating it doesn't implement > FUSE_IOMAP_IOEND), the code sets fm->fc->iomap_conn.no_ioend = 1 but > does NOT return early. It falls through to the update code where > 'outarg' was zero-initialized and never filled by the server. > > This corrupts i_disk_size by setting it to 0: > > fi->i_disk_size = outarg.newsize; // outarg.newsize == 0 > > On subsequent writes, fuse_should_send_iomap_ioend() returns false due > to no_ioend being set, so the code skips the server call and again uses > the zero-initialized outarg.newsize. Yeah. outarg should be initialized with fi->i_disk_size, though I think this bug is benign because no_ioend being set to 1 in that ENOSYS case means that this function doesn't do much of anything after that. > The i_disk_size field is used by fuse_ioend_is_append() to detect > append writes, so corrupting it to 0 will cause incorrect append > detection for all subsequent writes. > > Should the code return early when the request is not sent or returns > -ENOSYS, or skip the i_disk_size update in this case? > > > +static int fuse_iomap_ilock_iocb(const struct kiocb *iocb, > > + enum fuse_ilock_type type) > > +{ > > + struct inode *inode = file_inode(iocb->ki_filp); > > + > > + if (iocb->ki_flags & IOCB_NOWAIT) { > > + switch (type) { > > + case SHARED: > > + return inode_trylock_shared(inode) ? 0 : -EAGAIN; > > + case EXCL: > > + return inode_trylock(inode) ? 
0 : -EAGAIN; > > + default: > > + ASSERT(0); > > + return -EIO; > > + } > > + > > + return 0; > > + } > > This isn't a bug, but the 'return 0;' after the switch statement inside > the IOCB_NOWAIT block is unreachable since every switch case returns. gcc is too stupid to detect that it's impossible to reach this case and whines about the lack of a return. --D ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCH 12/31] fuse: implement direct IO with iomap 2026-02-06 2:08 ` Darrick J. Wong @ 2026-02-06 2:52 ` Chris Mason 2026-02-06 5:08 ` Darrick J. Wong 0 siblings, 1 reply; 52+ messages in thread From: Chris Mason @ 2026-02-06 2:52 UTC (permalink / raw) To: Darrick J. Wong Cc: miklos, joannelkoong, bernd, neal, linux-ext4, linux-fsdevel On 2/5/26 9:08 PM, Darrick J. Wong wrote: > On Thu, Feb 05, 2026 at 11:19:11AM -0800, Chris Mason wrote: >> "Darrick J. Wong" <djwong@kernel.org> wrote: >>> From: Darrick J. Wong <djwong@kernel.org> >>> >>> Start implementing the fuse-iomap file I/O paths by adding direct I/O >>> support and all the signalling flags that come with it. Buffered I/O >>> is much more complicated, so we leave that to a subsequent patch. >>> >> >> Hi everyone, >> >> I'm trying out my AI review prompts on a few more trees, and I ran it >> on the fuse-iomap-cache branch: >> >> https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-iomap-cache > > I should mention: I appreciate you rolling out the AI reviews by slowly > expanding the number of trees you scan for bugs. Thanks, I'm trying not to send unsolicited AI unless it feels like it's mostly right. I did discard one false positive, which looked accurate but also looked like intended behavior. Now that the false positive rate is pretty reasonable, I'll try to collect some fs/* Fixes: tagged patches and see if I can teach claude how to spot the bugs. In past kernel-wide scans, it gets ~35%, which is better than 0, but not as good as I was hoping for. [ ... ] >>> +static int fuse_iomap_ilock_iocb(const struct kiocb *iocb, >>> + enum fuse_ilock_type type) >>> +{ >>> + struct inode *inode = file_inode(iocb->ki_filp); >>> + >>> + if (iocb->ki_flags & IOCB_NOWAIT) { >>> + switch (type) { >>> + case SHARED: >>> + return inode_trylock_shared(inode) ? 0 : -EAGAIN; >>> + case EXCL: >>> + return inode_trylock(inode) ? 
0 : -EAGAIN; >>> + default: >>> + ASSERT(0); >>> + return -EIO; >>> + } >>> + >>> + return 0; >>> + } >> >> This isn't a bug, but the 'return 0;' after the switch statement inside >> the IOCB_NOWAIT block is unreachable since every switch case returns. > > gcc is too stupid to detect that it's impossible to reach this case and > whines about the lack of a return. Oh that's great. -chris ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCH 12/31] fuse: implement direct IO with iomap 2026-02-06 2:52 ` Chris Mason @ 2026-02-06 5:08 ` Darrick J. Wong 2026-02-06 14:27 ` Chris Mason 0 siblings, 1 reply; 52+ messages in thread From: Darrick J. Wong @ 2026-02-06 5:08 UTC (permalink / raw) To: Chris Mason; +Cc: miklos, joannelkoong, bernd, neal, linux-ext4, linux-fsdevel On Thu, Feb 05, 2026 at 09:52:29PM -0500, Chris Mason wrote: > On 2/5/26 9:08 PM, Darrick J. Wong wrote: > > On Thu, Feb 05, 2026 at 11:19:11AM -0800, Chris Mason wrote: > >> "Darrick J. Wong" <djwong@kernel.org> wrote: > >>> From: Darrick J. Wong <djwong@kernel.org> > >>> > >>> Start implementing the fuse-iomap file I/O paths by adding direct I/O > >>> support and all the signalling flags that come with it. Buffered I/O > >>> is much more complicated, so we leave that to a subsequent patch. > >>> > >> > >> Hi everyone, > >> > >> I'm trying out my AI review prompts on a few more trees, and I ran it > >> on the fuse-iomap-cache branch: > >> > >> https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-iomap-cache > > > > I should mention: I appreciate you rolling out the AI reviews by slowly > > expanding the number of trees you scan for bugs. > > Thanks, I'm trying not to send unsolicited AI unless it feels like it's > mostly right. I did discard one false positive, which looked accurate > but also looked like intended behavior. > > Now that the false positive rate is pretty reasonable, I'll try to > collect some fs/* Fixes: tagged patches and see if I can teach claude > how to spot the bugs. In past kernel-wide scans, it gets ~35%, which is > better than 0, but not as good as I was hoping for. <nod> You've found some very good bugs, especially in the fuse-iomap branch! At some point I'm going to have to figure out how to run these tools myself, but until then you're quite welcome to keep scanning my dev trees. :) I wonder, have you tried it on non-kernel repos like e2fsprogs (ha!) or fstests? > [ ... 
] > > >>> +static int fuse_iomap_ilock_iocb(const struct kiocb *iocb, > >>> + enum fuse_ilock_type type) > >>> +{ > >>> + struct inode *inode = file_inode(iocb->ki_filp); > >>> + > >>> + if (iocb->ki_flags & IOCB_NOWAIT) { > >>> + switch (type) { > >>> + case SHARED: > >>> + return inode_trylock_shared(inode) ? 0 : -EAGAIN; > >>> + case EXCL: > >>> + return inode_trylock(inode) ? 0 : -EAGAIN; > >>> + default: > >>> + ASSERT(0); > >>> + return -EIO; > >>> + } > >>> + > >>> + return 0; > >>> + } > >> > >> This isn't a bug, but the 'return 0;' after the switch statement inside > >> the IOCB_NOWAIT block is unreachable since every switch case returns. > > > > gcc is too stupid to detect that it's impossible to reach this case and > > whines about the lack of a return. > > Oh that's great. I know, right? :( --D > -chris > ^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [PATCH 12/31] fuse: implement direct IO with iomap 2026-02-06 5:08 ` Darrick J. Wong @ 2026-02-06 14:27 ` Chris Mason 0 siblings, 0 replies; 52+ messages in thread From: Chris Mason @ 2026-02-06 14:27 UTC (permalink / raw) To: Darrick J. Wong Cc: miklos, joannelkoong, bernd, neal, linux-ext4, linux-fsdevel On 2/6/26 12:08 AM, Darrick J. Wong wrote: > On Thu, Feb 05, 2026 at 09:52:29PM -0500, Chris Mason wrote: >> On 2/5/26 9:08 PM, Darrick J. Wong wrote: >>> On Thu, Feb 05, 2026 at 11:19:11AM -0800, Chris Mason wrote: >>>> "Darrick J. Wong" <djwong@kernel.org> wrote: >>>>> From: Darrick J. Wong <djwong@kernel.org> >>>>> >>>>> Start implementing the fuse-iomap file I/O paths by adding direct I/O >>>>> support and all the signalling flags that come with it. Buffered I/O >>>>> is much more complicated, so we leave that to a subsequent patch. >>>>> >>>> >>>> Hi everyone, >>>> >>>> I'm trying out my AI review prompts on a few more trees, and I ran it >>>> on the fuse-iomap-cache branch: >>>> >>>> https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-iomap-cache >>> >>> I should mention: I appreciate you rolling out the AI reviews by slowly >>> expanding the number of trees you scan for bugs. >> >> Thanks, I'm trying not to send unsolicited AI unless it feels like it's >> mostly right. I did discard one false positive, which looked accurate >> but also looked like intended behavior. >> >> Now that the false positive rate is pretty reasonable, I'll try to >> collect some fs/* Fixes: tagged patches and see if I can teach claude >> how to spot the bugs. In past kernel-wide scans, it gets ~35%, which is >> better than 0, but not as good as I was hoping for. > > <nod> You've found some very good bugs, especially in the fuse-iomap > branch! At some point I'm going to have to figure out how to run these > tools myself, but until then you're quite welcome to keep scanning my > dev trees. 
:) > > I wonder, have you tried it on non-kernel repos like e2fsprogs (ha!) or > fstests? The prompts are here: https://github.com/masoncl/review-prompts And thanks to Christian they now have both kernel and systemd specific directories. The original versions of the prompts had a lot of details about exactly how to review code, but recent models don't seem to need (or follow) that level of detail. Instead it's really just forcing larger chunks of the call graph into the AI context window, and adding some kernel specific knowledge about locking, rcu, gfp masks, sleepable vs irq context etc. Basically the weird stuff that we've forgotten is weird. So non-kernel projects would mostly work but would need a few fixups depending on how far they stray from kernel semantics. It's easy enough to add branches into the prompts, with the asterisk that from time to time AI ignores all the instructions and does what it wants. -chris ^ permalink raw reply [flat|nested] 52+ messages in thread
end of thread, other threads:[~2026-02-06 14:28 UTC | newest]
Thread overview: 52+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
[not found] <20251029002755.GK6174@frogsfrogsfrogs>
[not found] ` <176169810144.1424854.11439355400009006946.stgit@frogsfrogsfrogs>
[not found] ` <176169810371.1424854.3010195280915622081.stgit@frogsfrogsfrogs>
2026-01-21 19:34 ` [PATCH 01/31] fuse: implement the basic iomap mechanisms Joanne Koong
2026-01-21 22:45 ` Darrick J. Wong
2026-01-22 0:06 ` Joanne Koong
2026-01-22 0:34 ` Darrick J. Wong
2026-02-05 19:22 ` Chris Mason
2026-02-05 23:31 ` Darrick J. Wong
[not found] ` <176169810415.1424854.10373764649459618752.stgit@frogsfrogsfrogs>
2026-01-21 23:42 ` [PATCH 03/31] fuse: make debugging configurable at runtime Joanne Koong
2026-01-22 0:02 ` Darrick J. Wong
2026-01-22 0:23 ` Joanne Koong
2026-01-22 0:40 ` Darrick J. Wong
[not found] ` <176169810502.1424854.13869957103489591272.stgit@frogsfrogsfrogs>
2026-01-22 1:13 ` [PATCH 07/31] fuse: create a per-inode flag for toggling iomap Joanne Koong
2026-01-22 22:22 ` Darrick J. Wong
2026-01-23 18:05 ` Joanne Koong
2026-01-24 16:54 ` Darrick J. Wong
2026-01-27 23:33 ` Darrick J. Wong
[not found] ` <176169810568.1424854.4073875923015322741.stgit@frogsfrogsfrogs>
2026-01-22 2:07 ` [PATCH 10/31] fuse: implement basic iomap reporting such as FIEMAP and SEEK_{DATA,HOLE} Joanne Koong
2026-01-22 22:31 ` Darrick J. Wong
[not found] ` <176169810700.1424854.5753715202341698632.stgit@frogsfrogsfrogs>
2026-01-23 21:50 ` [PATCH 16/31] fuse: implement large folios for iomap pagecache files Joanne Koong
[not found] ` <176169810721.1424854.6150447623894591900.stgit@frogsfrogsfrogs>
2026-01-26 22:03 ` [PATCH 17/31] fuse: use an unrestricted backing device with iomap pagecache io Joanne Koong
2026-01-26 23:55 ` Darrick J. Wong
2026-01-27 1:35 ` Joanne Koong
2026-01-27 2:09 ` Darrick J. Wong
2026-01-27 18:04 ` Joanne Koong
2026-01-27 23:37 ` Darrick J. Wong
2026-01-27 0:59 ` [PATCHSET v6 4/8] fuse: allow servers to use iomap for better file IO performance Joanne Koong
2026-01-27 2:22 ` Darrick J. Wong
2026-01-27 19:47 ` Joanne Koong
2026-01-27 23:21 ` Darrick J. Wong
2026-01-28 0:10 ` Joanne Koong
2026-01-28 0:34 ` Darrick J. Wong
2026-01-29 1:12 ` Joanne Koong
2026-01-29 20:02 ` Darrick J. Wong
2026-01-29 22:41 ` Darrick J. Wong
2026-01-29 22:50 ` Joanne Koong
2026-01-29 23:12 ` Darrick J. Wong
[not found] ` <176169810980.1424854.10557015500766654898.stgit@frogsfrogsfrogs>
2026-02-05 18:57 ` [PATCH 29/31] fuse: disable direct reclaim for any fuse server that uses iomap Chris Mason
2026-02-06 4:25 ` Darrick J. Wong
[not found] ` <176169810874.1424854.5037707950055785011.stgit@frogsfrogsfrogs>
2026-02-05 19:01 ` [PATCH 24/31] fuse: implement inline data file IO via iomap Chris Mason
2026-02-06 2:27 ` Darrick J. Wong
[not found] ` <176169810765.1424854.10969346031644824992.stgit@frogsfrogsfrogs>
2026-02-05 19:07 ` [PATCH 19/31] fuse: query filesystem geometry when using iomap Chris Mason
2026-02-06 2:17 ` Darrick J. Wong
[not found] ` <176169810656.1424854.15239592653019383193.stgit@frogsfrogsfrogs>
2026-02-05 19:12 ` [PATCH 14/31] fuse: implement buffered IO with iomap Chris Mason
2026-02-06 2:14 ` Darrick J. Wong
[not found] ` <176169810634.1424854.13084435884326863405.stgit@frogsfrogsfrogs>
2026-02-05 19:16 ` [PATCH 13/31] fuse_trace: implement direct " Chris Mason
2026-02-06 2:12 ` Darrick J. Wong
[not found] ` <176169810612.1424854.16053093294573829123.stgit@frogsfrogsfrogs>
2026-01-23 18:56 ` [PATCH 12/31] fuse: " Joanne Koong
2026-01-26 23:46 ` Darrick J. Wong
2026-02-05 19:19 ` Chris Mason
2026-02-06 2:08 ` Darrick J. Wong
2026-02-06 2:52 ` Chris Mason
2026-02-06 5:08 ` Darrick J. Wong
2026-02-06 14:27 ` Chris Mason
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox