From: "Darrick J. Wong" <djwong@kernel.org>
To: John Groves <John@groves.net>
Cc: Dan Williams <dan.j.williams@intel.com>,
Miklos Szeredi <miklos@szeredb.hu>,
Bernd Schubert <bschubert@ddn.com>,
John Groves <jgroves@micron.com>,
Jonathan Corbet <corbet@lwn.net>,
Vishal Verma <vishal.l.verma@intel.com>,
Dave Jiang <dave.jiang@intel.com>,
Matthew Wilcox <willy@infradead.org>, Jan Kara <jack@suse.cz>,
Alexander Viro <viro@zeniv.linux.org.uk>,
Christian Brauner <brauner@kernel.org>,
Randy Dunlap <rdunlap@infradead.org>,
Jeff Layton <jlayton@kernel.org>,
Kent Overstreet <kent.overstreet@linux.dev>,
linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
nvdimm@lists.linux.dev, linux-cxl@vger.kernel.org,
linux-fsdevel@vger.kernel.org,
Amir Goldstein <amir73il@gmail.com>,
Jonathan Cameron <Jonathan.Cameron@huawei.com>,
Stefan Hajnoczi <shajnocz@redhat.com>,
Joanne Koong <joannelkoong@gmail.com>,
Josef Bacik <josef@toxicpanda.com>,
Aravind Ramesh <arramesh@micron.com>,
Ajay Joshi <ajayjoshi@micron.com>
Subject: Re: [RFC V2 12/18] famfs_fuse: Plumb the GET_FMAP message/response
Date: Tue, 8 Jul 2025 21:27:13 -0700 [thread overview]
Message-ID: <20250709042713.GF2672029@frogsfrogsfrogs> (raw)
In-Reply-To: <20250703185032.46568-13-john@groves.net>
On Thu, Jul 03, 2025 at 01:50:26PM -0500, John Groves wrote:
> Upon completion of an OPEN, if we're in famfs-mode we do a GET_FMAP to
> retrieve and cache up the file-to-dax map in the kernel. If this
> succeeds, read/write/mmap are resolved direct-to-dax with no upcalls.
>
> GET_FMAP has a variable-size response payload, and the allocated size
> is sent in the in_args[0].size field. If the fmap would overflow the
> message, the fuse server sends a reply of size 'sizeof(uint32_t)' which
> specifies the size of the fmap message. Then the kernel can realloc a
> large enough buffer and try again.
>
> Signed-off-by: John Groves <john@groves.net>
> ---
> fs/fuse/file.c | 84 +++++++++++++++++++++++++++++++++++++++
> fs/fuse/fuse_i.h | 36 ++++++++++++++++-
> fs/fuse/inode.c | 19 +++++++--
> fs/fuse/iomode.c | 2 +-
> include/uapi/linux/fuse.h | 18 +++++++++
> 5 files changed, 154 insertions(+), 5 deletions(-)
>
> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> index 93b82660f0c8..8616fb0a6d61 100644
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -230,6 +230,77 @@ static void fuse_truncate_update_attr(struct inode *inode, struct file *file)
> fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
> }
>
> +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
> +
> +#define FMAP_BUFSIZE 4096
PAGE_SIZE ?
> +
> +static int
> +fuse_get_fmap(struct fuse_mount *fm, struct inode *inode, u64 nodeid)
> +{
> + struct fuse_get_fmap_in inarg = { 0 };
> + size_t fmap_bufsize = FMAP_BUFSIZE;
> + ssize_t fmap_size;
> + int retries = 1;
> + void *fmap_buf;
> + int rc;
> +
> + FUSE_ARGS(args);
> +
> + fmap_buf = kcalloc(1, FMAP_BUFSIZE, GFP_KERNEL);
> + if (!fmap_buf)
> + return -EIO;
> +
> + retry_once:
> + inarg.size = fmap_bufsize;
> +
> + args.opcode = FUSE_GET_FMAP;
> + args.nodeid = nodeid;
> +
> + args.in_numargs = 1;
> + args.in_args[0].size = sizeof(inarg);
> + args.in_args[0].value = &inarg;
> +
> + /* Variable-sized output buffer
> + * this causes fuse_simple_request() to return the size of the
> + * output payload
> + */
> + args.out_argvar = true;
> + args.out_numargs = 1;
> + args.out_args[0].size = fmap_bufsize;
> + args.out_args[0].value = fmap_buf;
> +
> + /* Send GET_FMAP command */
> + rc = fuse_simple_request(fm, &args);
> + if (rc < 0) {
> + pr_err("%s: err=%d from fuse_simple_request()\n",
> + __func__, rc);
> + return rc;
> + }
> + fmap_size = rc;
> +
> + if (retries && fmap_size == sizeof(uint32_t)) {
> + /* fmap size exceeded fmap_bufsize;
> + * actual fmap size returned in fmap_buf;
> + * realloc and retry once
> + */
> + fmap_bufsize = *((uint32_t *)fmap_buf);
> +
> + --retries;
> + kfree(fmap_buf);
> + fmap_buf = kcalloc(1, fmap_bufsize, GFP_KERNEL);
> + if (!fmap_buf)
> + return -EIO;
> +
> + goto retry_once;
> + }
> +
> + /* Will call famfs_file_init_dax() when that gets added */
Hard to say what this does without looking further down in the patchset.
:)
> + kfree(fmap_buf);
> + return 0;
> +}
> +#endif
> +
> static int fuse_open(struct inode *inode, struct file *file)
> {
> struct fuse_mount *fm = get_fuse_mount(inode);
> @@ -263,6 +334,19 @@ static int fuse_open(struct inode *inode, struct file *file)
>
> err = fuse_do_open(fm, get_node_id(inode), file, false);
> if (!err) {
> +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
> + if (fm->fc->famfs_iomap) {
> + if (S_ISREG(inode->i_mode)) {
/me wonders if you want to turn this into a dumb helper to reduce the
indenting levels?
#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
static inline bool fuse_is_famfs_file(struct inode *inode)
{
return fm->fc->famfs_iomap && S_ISREG(inode->i_mode);
}
#else
# define fuse_is_famfs_file(...) (false)
#endif
if (!err) {
if (fuse_is_famfs_file(inode)) {
rc = fuse_get_fmap(fm, inode);
...
}
}
> + int rc;
> + /* Get the famfs fmap */
> + rc = fuse_get_fmap(fm, inode,
> + get_node_id(inode));
Just call get_node_id() inside fuse_get_fmap() to reduce the parameter count.
> + if (rc)
> + pr_err("%s: fuse_get_fmap err=%d\n",
> + __func__, rc);
> + }
> + }
> +#endif
> ff = file->private_data;
> err = fuse_finish_open(inode, file);
> if (err)
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index f4ee61046578..e01d6e5c6e93 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -193,6 +193,10 @@ struct fuse_inode {
> /** Reference to backing file in passthrough mode */
> struct fuse_backing *fb;
> #endif
> +
> +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
> + void *famfs_meta;
> +#endif
What gets stored in here?
> };
>
> /** FUSE inode state bits */
> @@ -945,6 +949,8 @@ struct fuse_conn {
> #endif
>
> #if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
> + struct rw_semaphore famfs_devlist_sem;
> + struct famfs_dax_devlist *dax_devlist;
> char *shadow;
> #endif
> };
> @@ -1435,11 +1441,14 @@ void fuse_free_conn(struct fuse_conn *fc);
>
> /* dax.c */
>
> +static inline int fuse_file_famfs(struct fuse_inode *fi); /* forward */
> +
> /* This macro is used by virtio_fs, but now it also needs to filter for
> * "not famfs"
> */
> #define FUSE_IS_VIRTIO_DAX(fuse_inode) (IS_ENABLED(CONFIG_FUSE_DAX) \
> - && IS_DAX(&fuse_inode->inode))
> + && IS_DAX(&fuse_inode->inode) \
> + && !fuse_file_famfs(fuse_inode))
>
> ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to);
> ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from);
> @@ -1550,4 +1559,29 @@ extern void fuse_sysctl_unregister(void);
> #define fuse_sysctl_unregister() do { } while (0)
> #endif /* CONFIG_SYSCTL */
>
> +/* famfs.c */
> +static inline struct fuse_backing *famfs_meta_set(struct fuse_inode *fi,
> + void *meta)
> +{
> +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
> + return xchg(&fi->famfs_meta, meta);
> +#else
> + return NULL;
> +#endif
> +}
> +
> +static inline void famfs_meta_free(struct fuse_inode *fi)
> +{
> + /* Stub will be connected in a subsequent commit */
> +}
> +
> +static inline int fuse_file_famfs(struct fuse_inode *fi)
> +{
> +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
> + return (READ_ONCE(fi->famfs_meta) != NULL);
> +#else
> + return 0;
> +#endif
> +}
...or maybe this is the predicate you want, to check whether you really
need to do the fmap-related stuff?
> +
> #endif /* _FS_FUSE_I_H */
> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> index a7e1cf8257b0..b071d16f7d04 100644
> --- a/fs/fuse/inode.c
> +++ b/fs/fuse/inode.c
> @@ -117,6 +117,9 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
> if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
> fuse_inode_backing_set(fi, NULL);
>
> + if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX))
> + famfs_meta_set(fi, NULL);
> +
> return &fi->inode;
>
> out_free_forget:
> @@ -138,6 +141,13 @@ static void fuse_free_inode(struct inode *inode)
> if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
> fuse_backing_put(fuse_inode_backing(fi));
>
> +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
> + if (S_ISREG(inode->i_mode) && fi->famfs_meta) {
> + famfs_meta_free(fi);
> + famfs_meta_set(fi, NULL);
_free should null out the pointer, no?
--D
> + }
> +#endif
> +
> kmem_cache_free(fuse_inode_cachep, fi);
> }
>
> @@ -1002,6 +1012,9 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
> if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
> fuse_backing_files_init(fc);
>
> + if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX))
> + pr_notice("%s: Kernel is FUSE_FAMFS_DAX capable\n", __func__);
> +
> INIT_LIST_HEAD(&fc->mounts);
> list_add(&fm->fc_entry, &fc->mounts);
> fm->fc = fc;
> @@ -1036,9 +1049,8 @@ void fuse_conn_put(struct fuse_conn *fc)
> }
> if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
> fuse_backing_files_free(fc);
> -#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
> - kfree(fc->shadow);
> -#endif
> + if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX))
> + kfree(fc->shadow);
> call_rcu(&fc->rcu, delayed_release);
> }
> }
> @@ -1425,6 +1437,7 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
> * those capabilities, they are held here).
> */
> fc->famfs_iomap = 1;
> + init_rwsem(&fc->famfs_devlist_sem);
> }
> } else {
> ra_pages = fc->max_read / PAGE_SIZE;
> diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c
> index aec4aecb5d79..443b337b0c05 100644
> --- a/fs/fuse/iomode.c
> +++ b/fs/fuse/iomode.c
> @@ -204,7 +204,7 @@ int fuse_file_io_open(struct file *file, struct inode *inode)
> * io modes are not relevant with DAX and with server that does not
> * implement open.
> */
> - if (FUSE_IS_VIRTIO_DAX(fi) || !ff->args)
> + if (FUSE_IS_VIRTIO_DAX(fi) || fuse_file_famfs(fi) || !ff->args)
> return 0;
>
> /*
> diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> index 6c384640c79b..dff5aa62543e 100644
> --- a/include/uapi/linux/fuse.h
> +++ b/include/uapi/linux/fuse.h
> @@ -654,6 +654,10 @@ enum fuse_opcode {
> FUSE_TMPFILE = 51,
> FUSE_STATX = 52,
>
> + /* Famfs / devdax opcodes */
> + FUSE_GET_FMAP = 53,
> + FUSE_GET_DAXDEV = 54,
> +
> /* CUSE specific operations */
> CUSE_INIT = 4096,
>
> @@ -888,6 +892,16 @@ struct fuse_access_in {
> uint32_t padding;
> };
>
> +struct fuse_get_fmap_in {
> + uint32_t size;
> + uint32_t padding;
> +};
> +
> +struct fuse_get_fmap_out {
> + uint32_t size;
> + uint32_t padding;
> +};
> +
> struct fuse_init_in {
> uint32_t major;
> uint32_t minor;
> @@ -1284,4 +1298,8 @@ struct fuse_uring_cmd_req {
> uint8_t padding[6];
> };
>
> +/* Famfs fmap message components */
> +
> +#define FAMFS_FMAP_MAX 32768 /* Largest supported fmap message */
> +
> #endif /* _LINUX_FUSE_H */
> --
> 2.49.0
>
>
next prev parent reply other threads:[~2025-07-09 4:27 UTC|newest]
Thread overview: 91+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-07-03 18:50 [RFC V2 00/18] famfs: port into fuse John Groves
2025-07-03 18:50 ` [RFC V2 01/18] dev_dax_iomap: Move dax_pgoff_to_phys() from device.c to bus.c John Groves
2025-07-03 18:50 ` [RFC V2 02/18] dev_dax_iomap: Add fs_dax_get() func to prepare dax for fs-dax usage John Groves
2025-07-04 10:39 ` Jonathan Cameron
2025-07-04 12:54 ` John Groves
2025-07-03 18:50 ` [RFC V2 03/18] dev_dax_iomap: Save the kva from memremap John Groves
2025-07-04 11:11 ` Jonathan Cameron
2025-07-03 18:50 ` [RFC V2 04/18] dev_dax_iomap: Add dax_operations for use by fs-dax on devdax John Groves
2025-07-04 12:47 ` Jonathan Cameron
2025-07-05 22:56 ` John Groves
2025-07-03 18:50 ` [RFC V2 05/18] dev_dax_iomap: export dax_dev_get() John Groves
2025-07-03 18:50 ` [RFC V2 06/18] dev_dax_iomap: (ignore!) Drop poisoned page warning in fs/dax.c John Groves
2025-07-03 18:50 ` [RFC V2 07/18] famfs_fuse: magic.h: Add famfs magic numbers John Groves
2025-07-03 18:50 ` [RFC V2 08/18] famfs_fuse: Kconfig John Groves
2025-07-03 18:50 ` [RFC V2 09/18] famfs_fuse: Update macro s/FUSE_IS_DAX/FUSE_IS_VIRTIO_DAX/ John Groves
2025-07-04 8:44 ` Amir Goldstein
2025-07-03 18:50 ` [RFC V2 10/18] famfs_fuse: Basic fuse kernel ABI enablement for famfs John Groves
2025-07-03 22:45 ` John Groves
2025-07-07 17:32 ` Darrick J. Wong
2025-07-04 7:54 ` Amir Goldstein
2025-07-04 13:39 ` John Groves
2025-07-07 17:39 ` Darrick J. Wong
2025-07-08 12:02 ` John Groves
2025-07-09 1:53 ` Darrick J. Wong
2025-07-11 1:32 ` John Groves
2025-07-12 4:49 ` Darrick J. Wong
2025-08-11 18:30 ` John Groves
2025-08-12 16:37 ` Darrick J. Wong
2025-08-13 13:07 ` John Groves
2025-08-14 17:16 ` Darrick J. Wong
2025-07-03 18:50 ` [RFC V2 11/18] famfs_fuse: Basic famfs mount opts John Groves
2025-07-09 3:59 ` Darrick J. Wong
2025-07-11 15:28 ` John Groves
2025-07-12 5:54 ` Darrick J. Wong
2025-08-14 10:37 ` Miklos Szeredi
2025-08-14 14:39 ` John Groves
2025-08-14 15:19 ` Miklos Szeredi
2025-08-14 23:52 ` John Groves
2025-07-03 18:50 ` [RFC V2 12/18] famfs_fuse: Plumb the GET_FMAP message/response John Groves
2025-07-04 8:54 ` Amir Goldstein
2025-07-04 20:30 ` John Groves
2025-07-05 0:06 ` John Groves
2025-07-05 7:58 ` Amir Goldstein
2025-07-05 19:17 ` John Groves
2025-07-09 4:27 ` Darrick J. Wong [this message]
2025-07-11 13:46 ` John Groves
2025-08-14 13:36 ` Miklos Szeredi
2025-08-14 14:36 ` Miklos Szeredi
2025-08-14 18:20 ` Darrick J. Wong
2025-08-15 15:06 ` John Groves
2025-08-19 21:55 ` Darrick J. Wong
2025-08-15 16:53 ` John Groves
2025-08-19 22:13 ` Darrick J. Wong
2025-08-14 18:05 ` Darrick J. Wong
2025-08-16 15:00 ` John Groves
2025-08-19 22:17 ` Darrick J. Wong
2025-08-15 0:38 ` John Groves
2025-07-03 18:50 ` [RFC V2 13/18] famfs_fuse: Create files with famfs fmaps John Groves
2025-07-04 9:01 ` Amir Goldstein
2025-07-05 19:27 ` John Groves
2025-07-03 18:50 ` [RFC V2 14/18] famfs_fuse: GET_DAXDEV message and daxdev_table John Groves
2025-07-04 13:20 ` Jonathan Cameron
2025-07-06 17:07 ` John Groves
2025-08-14 13:58 ` Miklos Szeredi
2025-08-14 17:19 ` Darrick J. Wong
2025-08-14 18:25 ` Miklos Szeredi
2025-08-14 18:55 ` Darrick J. Wong
2025-08-14 19:19 ` Miklos Szeredi
2025-08-16 16:22 ` John Groves
2025-08-19 22:32 ` Darrick J. Wong
2025-08-15 16:38 ` John Groves
2025-08-19 22:34 ` Darrick J. Wong
2025-07-03 18:50 ` [RFC V2 15/18] famfs_fuse: Plumb dax iomap and fuse read/write/mmap John Groves
2025-07-04 9:13 ` Amir Goldstein
2025-07-05 19:44 ` John Groves
2025-07-03 18:50 ` [RFC V2 16/18] famfs_fuse: Add holder_operations for dax notify_failure() John Groves
2025-07-03 18:50 ` [RFC V2 17/18] famfs_fuse: Add famfs metadata documentation John Groves
2025-07-03 18:50 ` [RFC V2 18/18] famfs_fuse: Add documentation John Groves
2025-07-04 0:27 ` Bagas Sanjaya
2025-07-04 2:22 ` Jonathan Corbet
2025-07-04 3:53 ` Bagas Sanjaya
2025-07-04 18:58 ` Matthew Wilcox
2025-07-04 23:29 ` Bagas Sanjaya
2025-07-04 23:43 ` Matthew Wilcox
2025-07-05 1:11 ` Bagas Sanjaya
2025-07-04 6:09 ` Randy Dunlap
2025-07-04 8:27 ` Amir Goldstein
2025-07-04 23:36 ` Bagas Sanjaya
2025-07-03 18:56 ` [RFC V2 00/18] famfs: port into fuse John Groves
2025-07-09 3:26 ` Miklos Szeredi
2025-07-11 1:18 ` John Groves
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250709042713.GF2672029@frogsfrogsfrogs \
--to=djwong@kernel.org \
--cc=John@groves.net \
--cc=Jonathan.Cameron@huawei.com \
--cc=ajayjoshi@micron.com \
--cc=amir73il@gmail.com \
--cc=arramesh@micron.com \
--cc=brauner@kernel.org \
--cc=bschubert@ddn.com \
--cc=corbet@lwn.net \
--cc=dan.j.williams@intel.com \
--cc=dave.jiang@intel.com \
--cc=jack@suse.cz \
--cc=jgroves@micron.com \
--cc=jlayton@kernel.org \
--cc=joannelkoong@gmail.com \
--cc=josef@toxicpanda.com \
--cc=kent.overstreet@linux.dev \
--cc=linux-cxl@vger.kernel.org \
--cc=linux-doc@vger.kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=miklos@szeredb.hu \
--cc=nvdimm@lists.linux.dev \
--cc=rdunlap@infradead.org \
--cc=shajnocz@redhat.com \
--cc=viro@zeniv.linux.org.uk \
--cc=vishal.l.verma@intel.com \
--cc=willy@infradead.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.