From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 0D3AD34D4C9; Thu, 14 May 2026 22:59:58 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1778799599; cv=none; b=jD0NAJnGogIA39g8fTIRYI+iXbrZWS5qsYRUrgqSfqXM9u3aFYDIS5kd+Id3UbYZYWSkELJKo9ylBNBjfpO8xMk2seZxwtYNHStrHv2EVQOyRrZvNYDFKHJ8ZgrC8edpqpQu1xIXDv9bTt3/sguJKOqYOlMrPfHepiEWC7E41zQ= ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1778799599; c=relaxed/simple; bh=Zh141Lr4vuoHEfF62NuY2/Yua8vjCwevSvo4M6Cb7Ds=; h=Date:From:To:Cc:Subject:Message-ID:References:MIME-Version: Content-Type:Content-Disposition:In-Reply-To; b=QREBUanoqNSRrRUVQYqIWAO7C/ro/PkXRFiHl4zdH9GLqhcWFlVymjr8YcK7FQRwvR7++P6IHjQ4X0Aps9AV+jAs/Tkck2Rj9nWXDWC6bgP9zAa1bT40hGriYhKcdO/AI9cGODbkOUFOazb8Eb5lNhmLpQMvfQ9UgVpa+SMErBQ= ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=LTsVjLzs; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="LTsVjLzs" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 9BB55C2BCB3; Thu, 14 May 2026 22:59:58 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1778799598; bh=Zh141Lr4vuoHEfF62NuY2/Yua8vjCwevSvo4M6Cb7Ds=; h=Date:From:To:Cc:Subject:References:In-Reply-To:From; b=LTsVjLzsBEDGTXfZqom4Rh7kEInTnPPz5rtEItqjxPl6ukgpkEZCOIsqiI5ZlIHaW 0VXLLSR1q8PNT10QNllOoMeMSK3vABImhJAZ86kISS9yvtn3sSVFqTMdvEILD46NqU wXHeyNdWhykaviRjk7+bH+dmbQCcjy4hvzW8VFZKk7+k0eb0xKyI5BLtVmGQ1NzLFo 
uh6nRDwkIYDlwDCkXHC6J9rdIJz0+bdFP1XxUDldy//6geblb4nH6A8HUnyGtJgWQ3 GNsfyqL9tP7DN4sY4hyLgeWB/M7kDr1ke8ndkbc0PvkEFXBM4PlXDuyljgdPGw1GCK hE//5c0pXUcLg== Date: Thu, 14 May 2026 15:59:57 -0700 From: "Darrick J. Wong" To: miklos@szeredi.hu Cc: joannelkoong@gmail.com, neal@gompa.dev, linux-fsdevel@vger.kernel.org, bernd@bsbernd.com, fuse-devel@lists.linux.dev Subject: Re: [PATCH 07/12] fuse: enable iomap cache management Message-ID: <20260514225957.GQ9544@frogsfrogsfrogs> References: <177747206929.4103699.18256961856283014867.stgit@frogsfrogsfrogs> <177747207149.4103699.15223683178584774669.stgit@frogsfrogsfrogs> Precedence: bulk X-Mailing-List: linux-fsdevel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <177747207149.4103699.15223683178584774669.stgit@frogsfrogsfrogs> On Wed, Apr 29, 2026 at 07:37:15AM -0700, Darrick J. Wong wrote: > From: Darrick J. Wong > > Provide a means for the fuse server to upload iomappings to the kernel > and invalidate them. This is how we enable iomap caching for better > performance. This is also required for correct synchronization between > pagecache writes and writeback. > > Signed-off-by: "Darrick J. 
Wong" > --- > fs/fuse/fuse_iomap.h | 7 + > include/uapi/linux/fuse.h | 29 +++++ > fs/fuse/dev.c | 46 ++++++++ > fs/fuse/fuse_iomap.c | 264 ++++++++++++++++++++++++++++++++++++++++++++- > 4 files changed, 343 insertions(+), 3 deletions(-) > > > diff --git a/fs/fuse/fuse_iomap.h b/fs/fuse/fuse_iomap.h > index 5cdf7b311dba42..79625897dded50 100644 > --- a/fs/fuse/fuse_iomap.h > +++ b/fs/fuse/fuse_iomap.h > @@ -79,6 +79,11 @@ int fuse_iomap_dev_inval(struct fuse_conn *fc, > > int fuse_iomap_fadvise(struct file *file, loff_t start, loff_t end, int advice); > int fuse_dev_ioctl_iomap_set_nofs(struct file *file, uint32_t __user *argp); > + > +int fuse_iomap_upsert_mappings(struct fuse_conn *fc, > + const struct fuse_iomap_upsert_mappings_out *outarg); > +int fuse_iomap_inval_mappings(struct fuse_conn *fc, > + const struct fuse_iomap_inval_mappings_out *outarg); > #else > # define fuse_iomap_enabled(...) (false) > # define fuse_has_iomap(...) (false) > @@ -108,6 +113,8 @@ int fuse_dev_ioctl_iomap_set_nofs(struct file *file, uint32_t __user *argp); > # define fuse_iomap_dev_inval(...) (-ENOSYS) > # define fuse_iomap_fadvise NULL > # define fuse_dev_ioctl_iomap_set_nofs(...) (-EOPNOTSUPP) > +# define fuse_iomap_upsert_mappings(...) (-ENOSYS) > +# define fuse_iomap_inval_mappings(...) 
(-ENOSYS) > #endif /* CONFIG_FUSE_IOMAP */ > > #endif /* _FS_FUSE_IOMAP_H */ > diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h > index a273838bc20f2f..8c5e67731b21b8 100644 > --- a/include/uapi/linux/fuse.h > +++ b/include/uapi/linux/fuse.h > @@ -251,6 +251,8 @@ > * - add FUSE_ATTR_ATOMIC for single-fsblock atomic write support > * - add FUSE_ATTR_{SYNC,IMMUTABLE,APPEND} for VFS enforcement of file > * attributes > + * - add FUSE_NOTIFY_IOMAP_{UPSERT,INVAL}_MAPPINGS so fuse servers can cache > + * file range mappings in the kernel for iomap > */ > > #ifndef _LINUX_FUSE_H > @@ -731,6 +733,8 @@ enum fuse_notify_code { > FUSE_NOTIFY_INC_EPOCH = 8, > FUSE_NOTIFY_PRUNE = 9, > FUSE_NOTIFY_IOMAP_DEV_INVAL = 99, > + FUSE_NOTIFY_IOMAP_UPSERT_MAPPINGS = 100, > + FUSE_NOTIFY_IOMAP_INVAL_MAPPINGS = 101, > FUSE_NOTIFY_CODE_MAX, > }; > > @@ -1396,6 +1400,8 @@ struct fuse_uring_cmd_req { > #define FUSE_IOMAP_TYPE_PURE_OVERWRITE (255) > /* fuse-specific mapping type saying the server has populated the cache */ > #define FUSE_IOMAP_TYPE_RETRY_CACHE (254) > +/* do not upsert this mapping */ > +#define FUSE_IOMAP_TYPE_NOCACHE (253) > > #define FUSE_IOMAP_DEV_NULL (0U) /* null device cookie */ > > @@ -1556,4 +1562,27 @@ struct fuse_iomap_dev_inval_out { > /* invalidate all cached iomap mappings up to EOF */ > #define FUSE_IOMAP_INVAL_TO_EOF (~0ULL) > > +struct fuse_iomap_inval_mappings_out { > + uint64_t nodeid; /* Inode ID */ > + uint64_t attr_ino; /* matches fuse_attr:ino */ > + > + /* > + * Range of read and write mappings to invalidate. Zero length means ignore > + * the range; and FUSE_IOMAP_INVAL_TO_EOF can be used for length. 
> + */ > + struct fuse_range read; > + struct fuse_range write; > +}; > + > +struct fuse_iomap_upsert_mappings_out { > + uint64_t nodeid; /* Inode ID */ > + uint64_t attr_ino; /* matches fuse_attr:ino */ > + > + /* read file data from here */ > + struct fuse_iomap_io read; > + > + /* write file data to here, if applicable */ > + struct fuse_iomap_io write; > +}; > + > #endif /* _LINUX_FUSE_H */ > diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c > index cf4bad6ffc287b..fcee1a23375cee 100644 > --- a/fs/fuse/dev.c > +++ b/fs/fuse/dev.c > @@ -1872,6 +1872,48 @@ static int fuse_notify_iomap_dev_inval(struct fuse_conn *fc, unsigned int size, > return err; > } > > +static int fuse_notify_iomap_upsert_mappings(struct fuse_conn *fc, > + unsigned int size, > + struct fuse_copy_state *cs) > +{ > + struct fuse_iomap_upsert_mappings_out outarg; > + int err = -EINVAL; > + > + if (size != sizeof(outarg)) > + goto err; > + > + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); > + if (err) > + goto err; > + fuse_copy_finish(cs); > + > + return fuse_iomap_upsert_mappings(fc, &outarg); > +err: > + fuse_copy_finish(cs); > + return err; > +} > + > +static int fuse_notify_iomap_inval_mappings(struct fuse_conn *fc, > + unsigned int size, > + struct fuse_copy_state *cs) > +{ > + struct fuse_iomap_inval_mappings_out outarg; > + int err = -EINVAL; > + > + if (size != sizeof(outarg)) > + goto err; > + > + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); > + if (err) > + goto err; > + fuse_copy_finish(cs); > + > + return fuse_iomap_inval_mappings(fc, &outarg); > +err: > + fuse_copy_finish(cs); > + return err; > +} > + > struct fuse_retrieve_args { > struct fuse_args_pages ap; > struct fuse_notify_retrieve_in inarg; > @@ -2164,6 +2206,10 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, > > case FUSE_NOTIFY_IOMAP_DEV_INVAL: > return fuse_notify_iomap_dev_inval(fc, size, cs); > + case FUSE_NOTIFY_IOMAP_UPSERT_MAPPINGS: > + return fuse_notify_iomap_upsert_mappings(fc, size, 
cs); > + case FUSE_NOTIFY_IOMAP_INVAL_MAPPINGS: > + return fuse_notify_iomap_inval_mappings(fc, size, cs); > > default: > return -EINVAL; > diff --git a/fs/fuse/fuse_iomap.c b/fs/fuse/fuse_iomap.c > index 6d6c42fdaaac5b..8e747084d81b28 100644 > --- a/fs/fuse/fuse_iomap.c > +++ b/fs/fuse/fuse_iomap.c > @@ -132,6 +132,7 @@ static inline bool fuse_iomap_check_type(uint16_t fuse_type) > case FUSE_IOMAP_TYPE_INLINE: > case FUSE_IOMAP_TYPE_PURE_OVERWRITE: > case FUSE_IOMAP_TYPE_RETRY_CACHE: > + case FUSE_IOMAP_TYPE_NOCACHE: > return true; > } > > @@ -241,8 +242,8 @@ static inline bool fuse_iomap_check_mapping(const struct inode *inode, > uint64_t end; > > /* > - * Type and flags must be known. Mapping type "retry cache" doesn't > - * use any of the other fields. > + * Type and flags must be known. Mapping types "retry cache" and "do > + * not insert in cache" don't use any of the other fields. > */ > if (BAD_DATA(!fuse_iomap_check_type(map->type))) > return false; > @@ -255,6 +256,8 @@ static inline bool fuse_iomap_check_mapping(const struct inode *inode, > return false; > return true; > } > + if (map->type == FUSE_IOMAP_TYPE_NOCACHE) > + return true; > if (BAD_DATA(!fuse_iomap_check_flags(map->flags))) > return false; > > @@ -299,6 +302,7 @@ static inline bool fuse_iomap_check_mapping(const struct inode *inode, > if (BAD_DATA(iodir != WRITE_MAPPING)) > return false; > break; > + case FUSE_IOMAP_TYPE_NOCACHE: > case FUSE_IOMAP_TYPE_RETRY_CACHE: > default: > /* should have been caught already */ > @@ -373,6 +377,15 @@ fuse_iomap_begin_validate(const struct inode *inode, > if (!fuse_iomap_check_mapping(inode, &outarg->write, WRITE_MAPPING)) > return -EFSCORRUPTED; > > + /* > + * ->iomap_begin requires real mappings or "retry from cache"; "do not > + * add to cache" does not apply here. 
> + */ > + if (BAD_DATA(outarg->read.type == FUSE_IOMAP_TYPE_NOCACHE)) > + return -EFSCORRUPTED; > + if (BAD_DATA(outarg->write.type == FUSE_IOMAP_TYPE_NOCACHE)) > + return -EFSCORRUPTED; > + > /* > * Must have returned a mapping for at least the first byte in the > * range. The main mapping check already validated that the length > @@ -606,9 +619,11 @@ fuse_iomap_cached_validate(const struct inode *inode, > if (!fuse_iomap_check_mapping(inode, &lmap->map, dir)) > return -EFSCORRUPTED; > > - /* The cache should not be storing "retry cache" mappings */ > + /* The cache should not be storing cache management mappings */ > if (BAD_DATA(lmap->map.type == FUSE_IOMAP_TYPE_RETRY_CACHE)) > return -EFSCORRUPTED; > + if (BAD_DATA(lmap->map.type == FUSE_IOMAP_TYPE_NOCACHE)) > + return -EFSCORRUPTED; > > return 0; > } > @@ -2639,3 +2654,246 @@ void fuse_iomap_copied_file_range(struct inode *inode, loff_t offset, > > fuse_iomap_cache_invalidate_range(inode, offset, written); > } > + > +static inline int > +fuse_iomap_upsert_validate_dev( > + const struct fuse_backing *fb, > + const struct fuse_iomap_io *map) > +{ > + uint64_t map_end; > + sector_t device_bytes; > + > + if (!fb) { > + if (BAD_DATA(map->addr != FUSE_IOMAP_NULL_ADDR)) > + return -EFSCORRUPTED; > + > + return 0; > + } > + > + if (BAD_DATA(map->addr == FUSE_IOMAP_NULL_ADDR)) > + return -EFSCORRUPTED; > + > + if (BAD_DATA(check_add_overflow(map->addr, map->length, &map_end))) > + return -EFSCORRUPTED; > + > + /* > + * bdev_nr_sectors() == 0 usually means the device has gone away from > + * underneath us. We won't cache this mapping, but we'll return > + * -EINVAL to signal a softer error to the fuse server than "your fs > + * metadata are corrupt". If the fuse server persists anyway, then > + * the worst that happens is that the IO will fail. 
> + */ > + device_bytes = bdev_nr_sectors(fb->bdev) << SECTOR_SHIFT; > + if (!device_bytes) > + return -EINVAL; > + > + if (BAD_DATA(map_end > device_bytes)) > + return -EFSCORRUPTED; > + > + return 0; > +} > + > +/* Validate one of the incoming upsert mappings */ > +static inline int > +fuse_iomap_upsert_validate_mapping(struct inode *inode, > + enum fuse_iomap_iodir iodir, > + const struct fuse_iomap_io *map) > +{ > + struct fuse_conn *fc = get_fuse_conn(inode); > + struct fuse_backing *fb; > + int ret; > + > + if (!fuse_iomap_check_mapping(inode, map, iodir)) > + return -EFSCORRUPTED; > + > + /* > + * A "retry cache" instruction makes no sense when we're adding to > + * the mapping cache. > + */ > + if (BAD_DATA(map->type == FUSE_IOMAP_TYPE_RETRY_CACHE)) > + return -EFSCORRUPTED; > + > + /* nocache is allowed, because we ignore it later */ > + if (map->type == FUSE_IOMAP_TYPE_NOCACHE) > + return 0; > + > + /* Make sure we can find the device */ > + fb = fuse_iomap_find_dev(fc, map); > + if (BAD_DATA(IS_ERR(fb))) > + return -EFSCORRUPTED; > + > + ret = fuse_iomap_upsert_validate_dev(fb, map); > + fuse_backing_put(fb); > + return ret; > +} > + > +/* Check the incoming upsert mappings to make sure they're not nonsense */ > +static inline int > +fuse_iomap_upsert_validate_mappings(struct inode *inode, > + const struct fuse_iomap_upsert_mappings_out *outarg) > +{ > + int ret = fuse_iomap_upsert_validate_mapping(inode, READ_MAPPING, > + &outarg->read); > + if (ret) > + return ret; > + > + return fuse_iomap_upsert_validate_mapping(inode, WRITE_MAPPING, > + &outarg->write); > +} > + > +static int fuse_iomap_upsert_inode(struct inode *inode, > + const struct fuse_iomap_upsert_mappings_out *outarg) > +{ > + int ret = fuse_iomap_upsert_validate_mappings(inode, outarg); > + if (ret) > + return ret; > + > + if (!fuse_inode_caches_iomaps(inode)) { > + ret = fuse_iomap_cache_alloc(inode); > + if (ret) > + return ret; > + } > + > + fuse_iomap_cache_lock(inode); > + > + if 
(outarg->read.type != FUSE_IOMAP_TYPE_NOCACHE) { > + ret = fuse_iomap_cache_upsert(inode, READ_MAPPING, > + &outarg->read); > + if (ret) > + goto out_unlock; > + } > + > + if (outarg->write.type != FUSE_IOMAP_TYPE_NOCACHE) { > + ret = fuse_iomap_cache_upsert(inode, WRITE_MAPPING, > + &outarg->write); > + if (ret) > + goto out_unlock; > + } > + > +out_unlock: > + fuse_iomap_cache_unlock(inode); > + return ret; > +} > + > +int fuse_iomap_upsert_mappings(struct fuse_conn *fc, > + const struct fuse_iomap_upsert_mappings_out *outarg) > +{ > + struct inode *inode; > + struct fuse_inode *fi; > + int ret; > + > + if (!fc->iomap) > + return -EINVAL; > + > + down_read(&fc->killsb); > + inode = fuse_ilookup(fc, outarg->nodeid, NULL); > + if (!inode) { > + ret = -ESTALE; > + goto out_sb; > + } > + > + fi = get_fuse_inode(inode); > + if (BAD_DATA(fi->orig_ino != outarg->attr_ino)) { > + ret = -EINVAL; > + goto out_inode; > + } > + > + if (fuse_is_bad(inode)) { > + ret = -EIO; > + goto out_inode; > + } > + > + ret = fuse_iomap_upsert_inode(inode, outarg); > +out_inode: > + iput(inode); > +out_sb: > + up_read(&fc->killsb); > + return ret; > +} > + > +static inline bool > +fuse_iomap_inval_validate_range(const struct inode *inode, > + const struct fuse_range *range) > +{ > + const unsigned int blocksize = i_blocksize(inode); > + > + if (range->length == 0) > + return true; > + > + /* Range can't start beyond maxbytes */ > + if (BAD_DATA(range->offset >= inode->i_sb->s_maxbytes)) > + return false; > + > + /* File range must be aligned to blocksize */ > + if (BAD_DATA(!IS_ALIGNED(range->offset, blocksize))) > + return false; > + if (range->length != FUSE_IOMAP_INVAL_TO_EOF && > + BAD_DATA(!IS_ALIGNED(range->length, blocksize))) Codex points out that this needs to use check_add_overflow() to verify that offset + length does not wrap when the length isn't FUSE_IOMAP_INVAL_TO_EOF. 
--D > + return false; > + > + return true; > +} > + > +static int fuse_iomap_inval_inode(struct inode *inode, > + const struct fuse_iomap_inval_mappings_out *outarg) > +{ > + int ret = 0, ret2 = 0; > + > + if (!fuse_iomap_inval_validate_range(inode, &outarg->write)) > + return -EFSCORRUPTED; > + > + if (!fuse_iomap_inval_validate_range(inode, &outarg->read)) > + return -EFSCORRUPTED; > + > + if (!fuse_inode_caches_iomaps(inode)) > + return 0; > + > + fuse_iomap_cache_lock(inode); > + if (outarg->read.length) > + ret2 = fuse_iomap_cache_remove(inode, READ_MAPPING, > + outarg->read.offset, > + outarg->read.length); > + if (outarg->write.length) > + ret = fuse_iomap_cache_remove(inode, WRITE_MAPPING, > + outarg->write.offset, > + outarg->write.length); > + fuse_iomap_cache_unlock(inode); > + > + return ret ? ret : ret2; > +} > + > +int fuse_iomap_inval_mappings(struct fuse_conn *fc, > + const struct fuse_iomap_inval_mappings_out *outarg) > +{ > + struct inode *inode; > + struct fuse_inode *fi; > + int ret; > + > + if (!fc->iomap) > + return -EINVAL; > + > + down_read(&fc->killsb); > + inode = fuse_ilookup(fc, outarg->nodeid, NULL); > + if (!inode) { > + ret = -ESTALE; > + goto out_sb; > + } > + > + fi = get_fuse_inode(inode); > + if (BAD_DATA(fi->orig_ino != outarg->attr_ino)) { > + ret = -EINVAL; > + goto out_inode; > + } > + > + if (fuse_is_bad(inode)) { > + ret = -EIO; > + goto out_inode; > + } > + > + ret = fuse_iomap_inval_inode(inode, outarg); > +out_inode: > + iput(inode); > +out_sb: > + up_read(&fc->killsb); > + return ret; > +} > >