* [PATCH V10 01/10] famfs_fuse: Update macro s/FUSE_IS_DAX/FUSE_IS_VIRTIO_DAX/
2026-03-31 12:37 ` [PATCH V10 00/10] famfs: port into fuse John Groves
@ 2026-03-31 12:38 ` John Groves
2026-03-31 12:38 ` [PATCH V10 02/10] famfs_fuse: Basic fuse kernel ABI enablement for famfs John Groves
` (9 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: John Groves @ 2026-03-31 12:38 UTC (permalink / raw)
To: John Groves, Miklos Szeredi, Dan Williams, Bernd Schubert,
Alison Schofield
Cc: John Groves, Jonathan Corbet, Shuah Khan, Vishal Verma,
Dave Jiang, Matthew Wilcox, Jan Kara, Alexander Viro,
David Hildenbrand, Christian Brauner, Darrick J . Wong,
Randy Dunlap, Jeff Layton, Amir Goldstein, Jonathan Cameron,
Stefan Hajnoczi, Joanne Koong, Josef Bacik, Bagas Sanjaya,
Chen Linxuan, James Morse, Fuad Tabba, Sean Christopherson,
Shivank Garg, Ackerley Tng, Gregory Price, Aravind Ramesh,
Ajay Joshi, venkataravis@micron.com, linux-doc@vger.kernel.org,
linux-kernel@vger.kernel.org, nvdimm@lists.linux.dev,
linux-cxl@vger.kernel.org, linux-fsdevel@vger.kernel.org,
John Groves
From: John Groves <john@groves.net>
Virtio_fs now needs to determine if an inode is DAX && not famfs.
This replaces the FUSE_IS_DAX() macro with FUSE_IS_VIRTIO_DAX(),
in preparation for famfs in later commits. The dummy
fuse_file_famfs() helper will be replaced with a working
function.
Reviewed-by: Joanne Koong <joannelkoong@gmail.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: John Groves <john@groves.net>
---
fs/fuse/dir.c | 2 +-
fs/fuse/file.c | 13 ++++++++-----
fs/fuse/fuse_i.h | 9 ++++++++-
fs/fuse/inode.c | 4 ++--
fs/fuse/iomode.c | 2 +-
5 files changed, 20 insertions(+), 10 deletions(-)
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 7ac6b232ef12..c63f097bc697 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -2161,7 +2161,7 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
is_truncate = true;
}
- if (FUSE_IS_DAX(inode) && is_truncate) {
+ if (FUSE_IS_VIRTIO_DAX(fi) && is_truncate) {
filemap_invalidate_lock(mapping);
fault_blocked = true;
err = fuse_dax_break_layouts(inode, 0, -1);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 676fd9856bfb..150f2e1d6c2f 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -252,7 +252,7 @@ static int fuse_open(struct inode *inode, struct file *file)
int err;
bool is_truncate = (file->f_flags & O_TRUNC) && fc->atomic_o_trunc;
bool is_wb_truncate = is_truncate && fc->writeback_cache;
- bool dax_truncate = is_truncate && FUSE_IS_DAX(inode);
+ bool dax_truncate = is_truncate && FUSE_IS_VIRTIO_DAX(fi);
if (fuse_is_bad(inode))
return -EIO;
@@ -1812,11 +1812,12 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
struct file *file = iocb->ki_filp;
struct fuse_file *ff = file->private_data;
struct inode *inode = file_inode(file);
+ struct fuse_inode *fi = get_fuse_inode(inode);
if (fuse_is_bad(inode))
return -EIO;
- if (FUSE_IS_DAX(inode))
+ if (FUSE_IS_VIRTIO_DAX(fi))
return fuse_dax_read_iter(iocb, to);
/* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
@@ -1833,11 +1834,12 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct file *file = iocb->ki_filp;
struct fuse_file *ff = file->private_data;
struct inode *inode = file_inode(file);
+ struct fuse_inode *fi = get_fuse_inode(inode);
if (fuse_is_bad(inode))
return -EIO;
- if (FUSE_IS_DAX(inode))
+ if (FUSE_IS_VIRTIO_DAX(fi))
return fuse_dax_write_iter(iocb, from);
/* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
@@ -2370,10 +2372,11 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = ff->fm->fc;
struct inode *inode = file_inode(file);
+ struct fuse_inode *fi = get_fuse_inode(inode);
int rc;
/* DAX mmap is superior to direct_io mmap */
- if (FUSE_IS_DAX(inode))
+ if (FUSE_IS_VIRTIO_DAX(fi))
return fuse_dax_mmap(file, vma);
/*
@@ -2934,7 +2937,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
.mode = mode
};
int err;
- bool block_faults = FUSE_IS_DAX(inode) &&
+ bool block_faults = FUSE_IS_VIRTIO_DAX(fi) &&
(!(mode & FALLOC_FL_KEEP_SIZE) ||
(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)));
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 7f16049387d1..80bf4438c436 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1508,7 +1508,14 @@ void fuse_free_conn(struct fuse_conn *fc);
/* dax.c */
-#define FUSE_IS_DAX(inode) (IS_ENABLED(CONFIG_FUSE_DAX) && IS_DAX(inode))
+static inline bool fuse_file_famfs(struct fuse_inode *fuse_inode) /* Will be superseded */
+{
+ (void)fuse_inode;
+ return false;
+}
+#define FUSE_IS_VIRTIO_DAX(fuse_inode) (IS_ENABLED(CONFIG_FUSE_DAX) \
+ && IS_DAX(&(fuse_inode)->inode) \
+ && !fuse_file_famfs(fuse_inode))
ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to);
ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index c795abe47a4f..f688c31f7eef 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -162,7 +162,7 @@ static void fuse_evict_inode(struct inode *inode)
/* Will write inode on close/munmap and in all other dirtiers */
WARN_ON(inode_state_read_once(inode) & I_DIRTY_INODE);
- if (FUSE_IS_DAX(inode))
+ if (FUSE_IS_VIRTIO_DAX(fi))
dax_break_layout_final(inode);
truncate_inode_pages_final(&inode->i_data);
@@ -170,7 +170,7 @@ static void fuse_evict_inode(struct inode *inode)
if (inode->i_sb->s_flags & SB_ACTIVE) {
struct fuse_conn *fc = get_fuse_conn(inode);
- if (FUSE_IS_DAX(inode))
+ if (FUSE_IS_VIRTIO_DAX(fi))
fuse_dax_inode_cleanup(inode);
if (fi->nlookup) {
fuse_queue_forget(fc, fi->forget, fi->nodeid,
diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c
index 3728933188f3..31ee7f3304c6 100644
--- a/fs/fuse/iomode.c
+++ b/fs/fuse/iomode.c
@@ -203,7 +203,7 @@ int fuse_file_io_open(struct file *file, struct inode *inode)
* io modes are not relevant with DAX and with server that does not
* implement open.
*/
- if (FUSE_IS_DAX(inode) || !ff->args)
+ if (FUSE_IS_VIRTIO_DAX(fi) || !ff->args)
return 0;
/*
--
2.53.0
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH V10 02/10] famfs_fuse: Basic fuse kernel ABI enablement for famfs
2026-03-31 12:37 ` [PATCH V10 00/10] famfs: port into fuse John Groves
2026-03-31 12:38 ` [PATCH V10 01/10] famfs_fuse: Update macro s/FUSE_IS_DAX/FUSE_IS_VIRTIO_DAX/ John Groves
@ 2026-03-31 12:38 ` John Groves
2026-03-31 12:38 ` [PATCH V10 03/10] famfs_fuse: Plumb the GET_FMAP message/response John Groves
` (8 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: John Groves @ 2026-03-31 12:38 UTC (permalink / raw)
To: John Groves, Miklos Szeredi, Dan Williams, Bernd Schubert,
Alison Schofield
Cc: John Groves, Jonathan Corbet, Shuah Khan, Vishal Verma,
Dave Jiang, Matthew Wilcox, Jan Kara, Alexander Viro,
David Hildenbrand, Christian Brauner, Darrick J . Wong,
Randy Dunlap, Jeff Layton, Amir Goldstein, Jonathan Cameron,
Stefan Hajnoczi, Joanne Koong, Josef Bacik, Bagas Sanjaya,
Chen Linxuan, James Morse, Fuad Tabba, Sean Christopherson,
Shivank Garg, Ackerley Tng, Gregory Price, Aravind Ramesh,
Ajay Joshi, venkataravis@micron.com, linux-doc@vger.kernel.org,
linux-kernel@vger.kernel.org, nvdimm@lists.linux.dev,
linux-cxl@vger.kernel.org, linux-fsdevel@vger.kernel.org,
John Groves
From: John Groves <john@groves.net>
This patch starts the kernel ABI enablement of famfs in fuse.
- Kconfig: Add FUSE_FAMFS_DAX config parameter, to control
compilation of famfs within fuse.
- FUSE_DAX_FMAP flag in INIT request/reply
- fuse_conn->famfs_iomap (enable famfs-mapped files) to denote a
famfs-enabled connection
Reviewed-by: Joanne Koong <joannelkoong@gmail.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: John Groves <john@groves.net>
---
fs/fuse/Kconfig | 13 +++++++++++++
fs/fuse/fuse_i.h | 3 +++
fs/fuse/inode.c | 6 ++++++
include/uapi/linux/fuse.h | 5 +++++
4 files changed, 27 insertions(+)
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
index 3a4ae632c94a..17fe1f490cbd 100644
--- a/fs/fuse/Kconfig
+++ b/fs/fuse/Kconfig
@@ -76,3 +76,16 @@ config FUSE_IO_URING
If you want to allow fuse server/client communication through io-uring,
answer Y
+
+config FUSE_FAMFS_DAX
+ bool "FUSE support for fs-dax filesystems backed by devdax"
+ depends on FUSE_FS
+ depends on DEV_DAX_FSDEV
+ default FUSE_FS
+ help
+ This enables the fabric-attached memory file system (famfs),
+ which enables formatting devdax memory as a file system. Famfs
+ is primarily intended for scale-out shared access to
+ disaggregated memory.
+
+ To enable famfs or other fuse/fs-dax file systems, answer Y
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 80bf4438c436..712038a554d9 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -921,6 +921,9 @@ struct fuse_conn {
/* Is synchronous FUSE_INIT allowed? */
unsigned int sync_init:1;
+ /* dev_dax_iomap support for famfs */
+ unsigned int famfs_iomap:1;
+
/* Use io_uring for communication */
unsigned int io_uring;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index f688c31f7eef..f4a265734270 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1456,6 +1456,10 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
if (flags & FUSE_REQUEST_TIMEOUT)
timeout = arg->request_timeout;
+
+ if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) &&
+ flags & FUSE_DAX_FMAP)
+ fc->famfs_iomap = 1;
} else {
ra_pages = fc->max_read / PAGE_SIZE;
fc->no_lock = 1;
@@ -1517,6 +1521,8 @@ static struct fuse_init_args *fuse_new_init(struct fuse_mount *fm)
flags |= FUSE_SUBMOUNTS;
if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
flags |= FUSE_PASSTHROUGH;
+ if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX))
+ flags |= FUSE_DAX_FMAP;
/*
* This is just an information flag for fuse server. No need to check
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index c13e1f9a2f12..25686f088e6a 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -240,6 +240,9 @@
* - add FUSE_COPY_FILE_RANGE_64
* - add struct fuse_copy_file_range_out
* - add FUSE_NOTIFY_PRUNE
+ *
+ * 7.46
+ * - Add FUSE_DAX_FMAP capability - ability to handle in-kernel fsdax maps
*/
#ifndef _LINUX_FUSE_H
@@ -448,6 +451,7 @@ struct fuse_file_lock {
* FUSE_OVER_IO_URING: Indicate that client supports io-uring
* FUSE_REQUEST_TIMEOUT: kernel supports timing out requests.
* init_out.request_timeout contains the timeout (in secs)
+ * FUSE_DAX_FMAP: kernel supports dev_dax_iomap (aka famfs) fmaps
*/
#define FUSE_ASYNC_READ (1 << 0)
#define FUSE_POSIX_LOCKS (1 << 1)
@@ -495,6 +499,7 @@ struct fuse_file_lock {
#define FUSE_ALLOW_IDMAP (1ULL << 40)
#define FUSE_OVER_IO_URING (1ULL << 41)
#define FUSE_REQUEST_TIMEOUT (1ULL << 42)
+#define FUSE_DAX_FMAP (1ULL << 43)
/**
* CUSE INIT request/reply flags
--
2.53.0
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH V10 03/10] famfs_fuse: Plumb the GET_FMAP message/response
2026-03-31 12:37 ` [PATCH V10 00/10] famfs: port into fuse John Groves
2026-03-31 12:38 ` [PATCH V10 01/10] famfs_fuse: Update macro s/FUSE_IS_DAX/FUSE_IS_VIRTIO_DAX/ John Groves
2026-03-31 12:38 ` [PATCH V10 02/10] famfs_fuse: Basic fuse kernel ABI enablement for famfs John Groves
@ 2026-03-31 12:38 ` John Groves
2026-03-31 12:38 ` [PATCH V10 04/10] famfs_fuse: Create files with famfs fmaps John Groves
` (7 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: John Groves @ 2026-03-31 12:38 UTC (permalink / raw)
To: John Groves, Miklos Szeredi, Dan Williams, Bernd Schubert,
Alison Schofield
Cc: John Groves, Jonathan Corbet, Shuah Khan, Vishal Verma,
Dave Jiang, Matthew Wilcox, Jan Kara, Alexander Viro,
David Hildenbrand, Christian Brauner, Darrick J . Wong,
Randy Dunlap, Jeff Layton, Amir Goldstein, Jonathan Cameron,
Stefan Hajnoczi, Joanne Koong, Josef Bacik, Bagas Sanjaya,
Chen Linxuan, James Morse, Fuad Tabba, Sean Christopherson,
Shivank Garg, Ackerley Tng, Gregory Price, Aravind Ramesh,
Ajay Joshi, venkataravis@micron.com, linux-doc@vger.kernel.org,
linux-kernel@vger.kernel.org, nvdimm@lists.linux.dev,
linux-cxl@vger.kernel.org, linux-fsdevel@vger.kernel.org,
John Groves
From: John Groves <john@groves.net>
Upon completion of an OPEN, if we're in famfs-mode we do a GET_FMAP to
retrieve and cache up the file-to-dax map in the kernel. If this
succeeds, read/write/mmap are resolved direct-to-dax with no upcalls.
Signed-off-by: John Groves <john@groves.net>
---
MAINTAINERS | 8 +++++
fs/fuse/Makefile | 1 +
fs/fuse/famfs.c | 73 +++++++++++++++++++++++++++++++++++++++
fs/fuse/file.c | 14 +++++++-
fs/fuse/fuse_i.h | 70 ++++++++++++++++++++++++++++++++++---
fs/fuse/inode.c | 8 ++++-
fs/fuse/iomode.c | 2 +-
include/uapi/linux/fuse.h | 7 ++++
8 files changed, 175 insertions(+), 8 deletions(-)
create mode 100644 fs/fuse/famfs.c
diff --git a/MAINTAINERS b/MAINTAINERS
index ac49067c64ee..a789394552a2 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10523,6 +10523,14 @@ F: fs/fuse/
F: include/uapi/linux/fuse.h
F: tools/testing/selftests/filesystems/fuse/
+FUSE [FAMFS Fabric-Attached Memory File System]
+M: John Groves <jgroves@micron.com>
+M: John Groves <John@Groves.net>
+L: linux-cxl@vger.kernel.org
+L: linux-fsdevel@vger.kernel.org
+S: Supported
+F: fs/fuse/famfs.c
+
FUTEX SUBSYSTEM
M: Thomas Gleixner <tglx@kernel.org>
M: Ingo Molnar <mingo@redhat.com>
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 22ad9538dfc4..3f8dcc8cbbd0 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -17,5 +17,6 @@ fuse-$(CONFIG_FUSE_DAX) += dax.o
fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o backing.o
fuse-$(CONFIG_SYSCTL) += sysctl.o
fuse-$(CONFIG_FUSE_IO_URING) += dev_uring.o
+fuse-$(CONFIG_FUSE_FAMFS_DAX) += famfs.o
virtiofs-y := virtio_fs.o
diff --git a/fs/fuse/famfs.c b/fs/fuse/famfs.c
new file mode 100644
index 000000000000..d238d853afa8
--- /dev/null
+++ b/fs/fuse/famfs.c
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * famfs - dax file system for shared fabric-attached memory
+ *
+ * Copyright 2023-2026 Micron Technology, Inc.
+ *
+ * This file system, originally based on ramfs and the dax support from xfs,
+ * is intended to allow multiple host systems to mount a common file system
+ * view of dax files that map to shared memory.
+ */
+
+#include <linux/cleanup.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/dax.h>
+#include <linux/iomap.h>
+#include <linux/path.h>
+#include <linux/namei.h>
+#include <linux/string.h>
+
+#include "fuse_i.h"
+
+
+#define FMAP_BUFSIZE PAGE_SIZE
+
+int fuse_get_fmap(struct fuse_mount *fm, struct inode *inode)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ size_t fmap_bufsize = FMAP_BUFSIZE;
+ u64 nodeid = get_node_id(inode);
+ ssize_t fmap_size;
+ int rc;
+
+ FUSE_ARGS(args);
+
+ /* Don't retrieve if we already have the famfs metadata */
+ if (fi->famfs_meta)
+ return 0;
+
+ void *fmap_buf __free(kfree) = kzalloc(FMAP_BUFSIZE, GFP_KERNEL);
+
+ if (!fmap_buf)
+ return -ENOMEM;
+
+ args.opcode = FUSE_GET_FMAP;
+ args.nodeid = nodeid;
+
+ /* Variable-sized output buffer
+ * this causes fuse_simple_request() to return the size of the
+ * output payload
+ */
+ args.out_argvar = true;
+ args.out_numargs = 1;
+ args.out_args[0].size = fmap_bufsize;
+ args.out_args[0].value = fmap_buf;
+
+ /* Send GET_FMAP command */
+ rc = fuse_simple_request(fm, &args);
+ if (rc < 0) {
+ pr_err("%s: err=%d from fuse_simple_request()\n",
+ __func__, rc);
+ return rc;
+ }
+ fmap_size = rc;
+
+ /* We retrieved the "fmap" (the file's map to memory), but
+ * we haven't used it yet. A call to famfs_file_init_dax() will be added
+ * here in a subsequent patch, when we add the ability to attach
+ * fmaps to files.
+ */
+
+ return 0;
+}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 150f2e1d6c2f..605f1c6cc10e 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -277,6 +277,16 @@ static int fuse_open(struct inode *inode, struct file *file)
err = fuse_do_open(fm, get_node_id(inode), file, false);
if (!err) {
ff = file->private_data;
+
+ if ((fm->fc->famfs_iomap) && (S_ISREG(inode->i_mode))) {
+ /* Get the famfs fmap - failure is fatal */
+ err = fuse_get_fmap(fm, inode);
+ if (err) {
+ fuse_sync_release(fi, ff, file->f_flags);
+ goto out_nowrite;
+ }
+ }
+
err = fuse_finish_open(inode, file);
if (err)
fuse_sync_release(fi, ff, file->f_flags);
@@ -284,12 +294,14 @@ static int fuse_open(struct inode *inode, struct file *file)
fuse_truncate_update_attr(inode, file);
}
+out_nowrite:
if (is_wb_truncate || dax_truncate)
fuse_release_nowrite(inode);
if (!err) {
if (is_truncate)
truncate_pagecache(inode, 0);
- else if (!(ff->open_flags & FOPEN_KEEP_CACHE))
+ else if (!(ff->open_flags & FOPEN_KEEP_CACHE) &&
+ !fuse_file_famfs(fi))
invalidate_inode_pages2(inode->i_mapping);
}
if (dax_truncate)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 712038a554d9..b5466743c13f 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -223,6 +223,14 @@ struct fuse_inode {
* so preserve the blocksize specified by the server.
*/
u8 cached_i_blkbits;
+
+#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
+ /* Pointer to the file's famfs metadata. Primary content is the
+ * in-memory version of the fmap - the map from file's offset range
+ * to DAX memory
+ */
+ void *famfs_meta;
+#endif
};
/** FUSE inode state bits */
@@ -1511,11 +1519,8 @@ void fuse_free_conn(struct fuse_conn *fc);
/* dax.c */
-static inline bool fuse_file_famfs(struct fuse_inode *fuse_inode) /* Will be superseded */
-{
- (void)fuse_inode;
- return false;
-}
+static inline int fuse_file_famfs(struct fuse_inode *fi); /* forward */
+
#define FUSE_IS_VIRTIO_DAX(fuse_inode) (IS_ENABLED(CONFIG_FUSE_DAX) \
&& IS_DAX(&(fuse_inode)->inode) \
&& !fuse_file_famfs(fuse_inode))
@@ -1634,4 +1639,59 @@ extern void fuse_sysctl_unregister(void);
#define fuse_sysctl_unregister() do { } while (0)
#endif /* CONFIG_SYSCTL */
+/* famfs.c */
+
+#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
+void __famfs_meta_free(void *map);
+
+/* Set fi->famfs_meta = NULL regardless of prior value */
+static inline void famfs_meta_init(struct fuse_inode *fi)
+{
+ fi->famfs_meta = NULL;
+}
+
+/* Set fi->famfs_meta iff the current value is NULL */
+static inline struct fuse_backing *famfs_meta_set(struct fuse_inode *fi,
+ void *meta)
+{
+ return cmpxchg(&fi->famfs_meta, NULL, meta);
+}
+
+static inline void famfs_meta_free(struct fuse_inode *fi)
+{
+ famfs_meta_set(fi, NULL);
+}
+
+static inline int fuse_file_famfs(struct fuse_inode *fi)
+{
+ return (READ_ONCE(fi->famfs_meta) != NULL);
+}
+
+int fuse_get_fmap(struct fuse_mount *fm, struct inode *inode);
+
+#else /* !CONFIG_FUSE_FAMFS_DAX */
+
+static inline struct fuse_backing *famfs_meta_set(struct fuse_inode *fi,
+ void *meta)
+{
+ return NULL;
+}
+
+static inline void famfs_meta_free(struct fuse_inode *fi)
+{
+}
+
+static inline int fuse_file_famfs(struct fuse_inode *fi)
+{
+ return 0;
+}
+
+static inline int
+fuse_get_fmap(struct fuse_mount *fm, struct inode *inode)
+{
+ return 0;
+}
+
+#endif /* CONFIG_FUSE_FAMFS_DAX */
+
#endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index f4a265734270..862f4e61a5fb 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -120,6 +120,9 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
fuse_inode_backing_set(fi, NULL);
+ if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX))
+ famfs_meta_set(fi, NULL);
+
return &fi->inode;
out_free_forget:
@@ -141,6 +144,9 @@ static void fuse_free_inode(struct inode *inode)
if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
fuse_backing_put(fuse_inode_backing(fi));
+ if (S_ISREG(inode->i_mode) && fuse_file_famfs(fi))
+ famfs_meta_free(fi);
+
kmem_cache_free(fuse_inode_cachep, fi);
}
@@ -162,7 +168,7 @@ static void fuse_evict_inode(struct inode *inode)
/* Will write inode on close/munmap and in all other dirtiers */
WARN_ON(inode_state_read_once(inode) & I_DIRTY_INODE);
- if (FUSE_IS_VIRTIO_DAX(fi))
+ if (FUSE_IS_VIRTIO_DAX(fi) || fuse_file_famfs(fi))
dax_break_layout_final(inode);
truncate_inode_pages_final(&inode->i_data);
diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c
index 31ee7f3304c6..948148316ef0 100644
--- a/fs/fuse/iomode.c
+++ b/fs/fuse/iomode.c
@@ -203,7 +203,7 @@ int fuse_file_io_open(struct file *file, struct inode *inode)
* io modes are not relevant with DAX and with server that does not
* implement open.
*/
- if (FUSE_IS_VIRTIO_DAX(fi) || !ff->args)
+ if (FUSE_IS_VIRTIO_DAX(fi) || fuse_file_famfs(fi) || !ff->args)
return 0;
/*
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 25686f088e6a..9eff9083d3b5 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -669,6 +669,9 @@ enum fuse_opcode {
FUSE_STATX = 52,
FUSE_COPY_FILE_RANGE_64 = 53,
+ /* Famfs / devdax opcodes */
+ FUSE_GET_FMAP = 54,
+
/* CUSE specific operations */
CUSE_INIT = 4096,
@@ -1313,4 +1316,8 @@ struct fuse_uring_cmd_req {
uint8_t padding[6];
};
+/* Famfs fmap message components */
+
+#define FAMFS_FMAP_MAX 32768 /* Largest supported fmap message */
+
#endif /* _LINUX_FUSE_H */
--
2.53.0
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH V10 04/10] famfs_fuse: Create files with famfs fmaps
2026-03-31 12:37 ` [PATCH V10 00/10] famfs: port into fuse John Groves
` (2 preceding siblings ...)
2026-03-31 12:38 ` [PATCH V10 03/10] famfs_fuse: Plumb the GET_FMAP message/response John Groves
@ 2026-03-31 12:38 ` John Groves
2026-03-31 12:38 ` [PATCH V10 05/10] famfs_fuse: GET_DAXDEV message and daxdev_table John Groves
` (6 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: John Groves @ 2026-03-31 12:38 UTC (permalink / raw)
To: John Groves, Miklos Szeredi, Dan Williams, Bernd Schubert,
Alison Schofield
Cc: John Groves, Jonathan Corbet, Shuah Khan, Vishal Verma,
Dave Jiang, Matthew Wilcox, Jan Kara, Alexander Viro,
David Hildenbrand, Christian Brauner, Darrick J . Wong,
Randy Dunlap, Jeff Layton, Amir Goldstein, Jonathan Cameron,
Stefan Hajnoczi, Joanne Koong, Josef Bacik, Bagas Sanjaya,
Chen Linxuan, James Morse, Fuad Tabba, Sean Christopherson,
Shivank Garg, Ackerley Tng, Gregory Price, Aravind Ramesh,
Ajay Joshi, venkataravis@micron.com, linux-doc@vger.kernel.org,
linux-kernel@vger.kernel.org, nvdimm@lists.linux.dev,
linux-cxl@vger.kernel.org, linux-fsdevel@vger.kernel.org,
John Groves
From: John Groves <john@groves.net>
On completion of the GET_FMAP message/response, set up the full famfs
metadata such that it's possible to handle read/write/mmap directly to
dax. Note that the devdax_iomap plumbing is not in yet...
* Add famfs_kfmap.h: in-memory structures for resolving famfs file maps
(fmaps) to dax.
* famfs.c: allocate, initialize and free fmaps
* inode.c: only allow famfs mode if the fuse server has CAP_SYS_RAWIO
* Update MAINTAINERS for the new file.
Signed-off-by: John Groves <john@groves.net>
---
MAINTAINERS | 1 +
fs/fuse/famfs.c | 339 +++++++++++++++++++++++++++++++++++++-
fs/fuse/famfs_kfmap.h | 67 ++++++++
fs/fuse/fuse_i.h | 8 +-
fs/fuse/inode.c | 20 ++-
include/uapi/linux/fuse.h | 56 +++++++
6 files changed, 481 insertions(+), 10 deletions(-)
create mode 100644 fs/fuse/famfs_kfmap.h
diff --git a/MAINTAINERS b/MAINTAINERS
index a789394552a2..4edb56afb947 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10530,6 +10530,7 @@ L: linux-cxl@vger.kernel.org
L: linux-fsdevel@vger.kernel.org
S: Supported
F: fs/fuse/famfs.c
+F: fs/fuse/famfs_kfmap.h
FUTEX SUBSYSTEM
M: Thomas Gleixner <tglx@kernel.org>
diff --git a/fs/fuse/famfs.c b/fs/fuse/famfs.c
index d238d853afa8..ac52e54e2cb5 100644
--- a/fs/fuse/famfs.c
+++ b/fs/fuse/famfs.c
@@ -18,9 +18,339 @@
#include <linux/namei.h>
#include <linux/string.h>
+#include "famfs_kfmap.h"
#include "fuse_i.h"
+/***************************************************************************/
+
+void __famfs_meta_free(void *famfs_meta)
+{
+ struct famfs_file_meta *fmap = famfs_meta;
+
+ if (!fmap)
+ return;
+
+ switch (fmap->fm_extent_type) {
+ case SIMPLE_DAX_EXTENT:
+ kfree(fmap->se);
+ break;
+ case INTERLEAVED_EXTENT:
+ if (fmap->ie) {
+ for (int i = 0; i < fmap->fm_niext; i++)
+ kfree(fmap->ie[i].ie_strips);
+ }
+ kfree(fmap->ie);
+ break;
+ default:
+ pr_err("%s: invalid fmap type\n", __func__);
+ break;
+ }
+
+ kfree(fmap);
+}
+DEFINE_FREE(__famfs_meta_free, void *, if (_T) __famfs_meta_free(_T))
+
+static int
+famfs_check_ext_alignment(struct famfs_meta_simple_ext *se)
+{
+ int errs = 0;
+
+ if (se->dev_index != 0)
+ errs++;
+
+ /* TODO: pass in alignment so we can support the other page sizes */
+ if (!IS_ALIGNED(se->ext_offset, PMD_SIZE))
+ errs++;
+
+ if (!IS_ALIGNED(se->ext_len, PMD_SIZE))
+ errs++;
+
+ return errs;
+}
+
+/**
+ * famfs_fuse_meta_alloc() - Allocate famfs file metadata
+ * @fmap_buf: fmap buffer from fuse server
+ * @fmap_buf_size: size of fmap buffer
+ * @metap: pointer where 'struct famfs_file_meta' is returned
+ *
+ * Returns: 0=success
+ * -errno=failure
+ */
+static int
+famfs_fuse_meta_alloc(
+ void *fmap_buf,
+ size_t fmap_buf_size,
+ struct famfs_file_meta **metap)
+{
+ struct fuse_famfs_fmap_header *fmh;
+ size_t extent_total = 0;
+ size_t next_offset = 0;
+ int errs = 0;
+ int i, j;
+
+ fmh = fmap_buf;
+
+ /* Move past fmh in fmap_buf */
+ next_offset += sizeof(*fmh);
+ if (next_offset > fmap_buf_size) {
+ pr_err("%s:%d: fmap_buf underflow offset/size %ld/%ld\n",
+ __func__, __LINE__, next_offset, fmap_buf_size);
+ return -EINVAL;
+ }
+
+ if (fmh->nextents < 1) {
+ pr_err("%s: nextents %d < 1\n", __func__, fmh->nextents);
+ return -ERANGE;
+ }
+
+ if (fmh->nextents > FUSE_FAMFS_MAX_EXTENTS) {
+ pr_err("%s: nextents %d > max (%d) 1\n",
+ __func__, fmh->nextents, FUSE_FAMFS_MAX_EXTENTS);
+ return -ERANGE;
+ }
+
+ struct famfs_file_meta *meta __free(__famfs_meta_free) = kzalloc(sizeof(*meta), GFP_KERNEL);
+
+ if (!meta)
+ return -ENOMEM;
+
+ meta->error = false;
+ meta->file_type = fmh->file_type;
+ meta->file_size = fmh->file_size;
+ meta->fm_extent_type = fmh->ext_type;
+
+ switch (fmh->ext_type) {
+ case FUSE_FAMFS_EXT_SIMPLE: {
+ struct fuse_famfs_simple_ext *se_in;
+
+ se_in = fmap_buf + next_offset;
+
+ /* Move past simple extents */
+ next_offset += fmh->nextents * sizeof(*se_in);
+ if (next_offset > fmap_buf_size) {
+ pr_err("%s:%d: fmap_buf underflow offset/size %ld/%ld\n",
+ __func__, __LINE__, next_offset, fmap_buf_size);
+ return -EINVAL;
+ }
+
+ meta->fm_nextents = fmh->nextents;
+
+ meta->se = kcalloc(meta->fm_nextents, sizeof(*(meta->se)),
+ GFP_KERNEL);
+ if (!meta->se)
+ return -ENOMEM;
+
+ if ((meta->fm_nextents > FUSE_FAMFS_MAX_EXTENTS) ||
+ (meta->fm_nextents < 1))
+ return -EINVAL;
+
+ for (i = 0; i < fmh->nextents; i++) {
+ meta->se[i].dev_index = se_in[i].se_devindex;
+ meta->se[i].ext_offset = se_in[i].se_offset;
+ meta->se[i].ext_len = se_in[i].se_len;
+
+ /* Record bitmap of referenced daxdev indices */
+ meta->dev_bitmap |= (1 << meta->se[i].dev_index);
+
+ errs += famfs_check_ext_alignment(&meta->se[i]);
+
+ extent_total += meta->se[i].ext_len;
+ }
+ break;
+ }
+
+ case FUSE_FAMFS_EXT_INTERLEAVE: {
+ s64 size_remainder = meta->file_size;
+ struct fuse_famfs_iext *ie_in;
+ int niext = fmh->nextents;
+
+ meta->fm_niext = niext;
+
+ /* Allocate interleaved extent */
+ meta->ie = kcalloc(niext, sizeof(*(meta->ie)), GFP_KERNEL);
+ if (!meta->ie)
+ return -ENOMEM;
+
+ /*
+ * Each interleaved extent has a simple extent list of strips.
+ * Outer loop is over separate interleaved extents
+ */
+ for (i = 0; i < niext; i++) {
+ u64 nstrips;
+ struct fuse_famfs_simple_ext *sie_in;
+
+ /* ie_in = one interleaved extent in fmap_buf */
+ ie_in = fmap_buf + next_offset;
+
+ /* Move past one interleaved extent header in fmap_buf */
+ next_offset += sizeof(*ie_in);
+ if (next_offset > fmap_buf_size) {
+ pr_err("%s:%d: fmap_buf underflow offset/size %ld/%ld\n",
+ __func__, __LINE__, next_offset,
+ fmap_buf_size);
+ return -EINVAL;
+ }
+
+ if (!IS_ALIGNED(ie_in->ie_chunk_size, PMD_SIZE)) {
+ pr_err("%s: chunk_size %lld not PMD-aligned\n",
+ __func__, meta->ie[i].fie_chunk_size);
+ return -EINVAL;
+ }
+
+ if (ie_in->ie_nbytes == 0) {
+ pr_err("%s: zero-length interleave!\n",
+ __func__);
+ return -EINVAL;
+ }
+
+ nstrips = ie_in->ie_nstrips;
+ meta->ie[i].fie_chunk_size = ie_in->ie_chunk_size;
+ meta->ie[i].fie_nstrips = ie_in->ie_nstrips;
+ meta->ie[i].fie_nbytes = ie_in->ie_nbytes;
+
+ /* sie_in = the strip extents in fmap_buf */
+ sie_in = fmap_buf + next_offset;
+
+ /* Move past strip extents in fmap_buf */
+ next_offset += nstrips * sizeof(*sie_in);
+ if (next_offset > fmap_buf_size) {
+ pr_err("%s:%d: fmap_buf underflow offset/size %ld/%ld\n",
+ __func__, __LINE__, next_offset,
+ fmap_buf_size);
+ return -EINVAL;
+ }
+
+ if ((nstrips > FUSE_FAMFS_MAX_STRIPS) || (nstrips < 1)) {
+ pr_err("%s: invalid nstrips=%lld (max=%d)\n",
+ __func__, nstrips,
+ FUSE_FAMFS_MAX_STRIPS);
+ errs++;
+ }
+
+ /* Allocate strip extent array */
+ meta->ie[i].ie_strips =
+ kcalloc(ie_in->ie_nstrips,
+ sizeof(meta->ie[i].ie_strips[0]),
+ GFP_KERNEL);
+ if (!meta->ie[i].ie_strips)
+ return -ENOMEM;
+
+ /* Inner loop is over strips */
+ for (j = 0; j < nstrips; j++) {
+ struct famfs_meta_simple_ext *strips_out;
+ u64 devindex = sie_in[j].se_devindex;
+ u64 offset = sie_in[j].se_offset;
+ u64 len = sie_in[j].se_len;
+
+ strips_out = meta->ie[i].ie_strips;
+ strips_out[j].dev_index = devindex;
+ strips_out[j].ext_offset = offset;
+ strips_out[j].ext_len = len;
+
+ /* Record bitmap of referenced daxdev indices */
+ meta->dev_bitmap |= (1 << devindex);
+
+ extent_total += len;
+ errs += famfs_check_ext_alignment(&strips_out[j]);
+ size_remainder -= len;
+ }
+ }
+
+ if (size_remainder > 0) {
+ /* Sum of interleaved extent sizes is less than file size! */
+ pr_err("%s: size_remainder %lld (0x%llx)\n",
+ __func__, size_remainder, size_remainder);
+ return -EINVAL;
+ }
+ break;
+ }
+
+ default:
+ pr_err("%s: invalid ext_type %d\n", __func__, fmh->ext_type);
+ return -EINVAL;
+ }
+
+ if (errs > 0) {
+ pr_err("%s: %d alignment errors found\n", __func__, errs);
+ return -EINVAL;
+ }
+
+ /* More sanity checks */
+ if (extent_total < meta->file_size) {
+ pr_err("%s: file size %ld larger than map size %ld\n",
+ __func__, meta->file_size, extent_total);
+ return -EINVAL;
+ }
+
+ if (cmpxchg(metap, NULL, meta) != NULL) {
+ pr_debug("%s: fmap race detected\n", __func__);
+ return 0; /* fmap already installed */
+ }
+ retain_and_null_ptr(meta);
+
+ return 0;
+}
+
+/**
+ * famfs_file_init_dax() - init famfs dax file metadata
+ *
+ * @fm: fuse_mount
+ * @inode: the inode
+ * @fmap_buf: fmap response message
+ * @fmap_size: Size of the fmap message
+ *
+ * Initialize famfs metadata for a file, based on the contents of the GET_FMAP
+ * response
+ *
+ * Return: 0=success
+ * -errno=failure
+ */
+int
+famfs_file_init_dax(
+ struct fuse_mount *fm,
+ struct inode *inode,
+ void *fmap_buf,
+ size_t fmap_size)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct famfs_file_meta *meta = NULL;
+ int rc;
+
+ if (fi->famfs_meta) {
+ pr_notice("%s: i_no=%ld fmap_size=%ld ALREADY INITIALIZED\n",
+ __func__,
+ inode->i_ino, fmap_size);
+ return 0;
+ }
+
+ rc = famfs_fuse_meta_alloc(fmap_buf, fmap_size, &meta);
+ if (rc)
+ goto errout;
+
+ /* Publish the famfs metadata on fi->famfs_meta */
+ inode_lock(inode);
+
+ if (famfs_meta_set(fi, meta) == NULL) {
+ i_size_write(inode, meta->file_size);
+ inode->i_flags |= S_DAX;
+ } else {
+ pr_debug("%s: file already had metadata\n", __func__);
+ __famfs_meta_free(meta);
+ /* rc is 0 - the file is valid */
+ }
+
+ inode_unlock(inode);
+ return 0;
+
+errout:
+ if (rc)
+ __famfs_meta_free(meta);
+
+ return rc;
+}
+
#define FMAP_BUFSIZE PAGE_SIZE
int fuse_get_fmap(struct fuse_mount *fm, struct inode *inode)
@@ -63,11 +393,8 @@ int fuse_get_fmap(struct fuse_mount *fm, struct inode *inode)
}
fmap_size = rc;
- /* We retrieved the "fmap" (the file's map to memory), but
- * we haven't used it yet. A call to famfs_file_init_dax() will be added
- * here in a subsequent patch, when we add the ability to attach
- * fmaps to files.
- */
+ /* Convert fmap into in-memory format and hang from inode */
+ rc = famfs_file_init_dax(fm, inode, fmap_buf, fmap_size);
- return 0;
+ return rc;
}
diff --git a/fs/fuse/famfs_kfmap.h b/fs/fuse/famfs_kfmap.h
new file mode 100644
index 000000000000..18ab22bcc5a1
--- /dev/null
+++ b/fs/fuse/famfs_kfmap.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * famfs - dax file system for shared fabric-attached memory
+ *
+ * Copyright 2023-2026 Micron Technology, Inc.
+ */
+#ifndef FAMFS_KFMAP_H
+#define FAMFS_KFMAP_H
+
+/*
+ * The structures below are the in-memory metadata format for famfs files.
+ * Metadata retrieved via the GET_FMAP response is converted to this format
+ * for use in resolving file mapping faults.
+ *
+ * The GET_FMAP response contains the same information, but in a more
+ * message-and-versioning-friendly format. Those structs can be found in the
+ * famfs section of include/uapi/linux/fuse.h (aka fuse_kernel.h in libfuse)
+ */
+
+enum famfs_file_type {
+ FAMFS_REG,
+ FAMFS_SUPERBLOCK,
+ FAMFS_LOG,
+};
+
+/* We anticipate the possibility of supporting additional types of extents */
+enum famfs_extent_type {
+ SIMPLE_DAX_EXTENT,
+ INTERLEAVED_EXTENT,
+ INVALID_EXTENT_TYPE,
+};
+
+struct famfs_meta_simple_ext {
+ u64 dev_index;
+ u64 ext_offset;
+ u64 ext_len;
+};
+
+struct famfs_meta_interleaved_ext {
+ u64 fie_nstrips;
+ u64 fie_chunk_size;
+ u64 fie_nbytes;
+ struct famfs_meta_simple_ext *ie_strips;
+};
+
+/*
+ * Each famfs dax file has this hanging from its fuse_inode->famfs_meta
+ *
+ * fm_extent_type selects which arm of the anonymous union is in use:
+ * the interleaved arm (fm_niext/ie) for INTERLEAVED_EXTENT, and the
+ * simple arm (fm_nextents/se) otherwise.
+ */
+struct famfs_file_meta {
+ bool error; /* set once the file is found to be in a bad state */
+ enum famfs_file_type file_type;
+ size_t file_size; /* file size from the fmap; i_size is set from it */
+ enum famfs_extent_type fm_extent_type;
+ u64 dev_bitmap; /* bitmap of referenced daxdevs by index */
+ union {
+ /* Simple extent list */
+ struct {
+ size_t fm_nextents;
+ struct famfs_meta_simple_ext *se;
+ };
+ /* Interleaved (striped) extent list */
+ struct {
+ size_t fm_niext;
+ struct famfs_meta_interleaved_ext *ie;
+ };
+ };
+};
+
+#endif /* FAMFS_KFMAP_H */
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index b5466743c13f..df4e9c9f80bf 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1642,6 +1642,9 @@ extern void fuse_sysctl_unregister(void);
/* famfs.c */
#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
+int famfs_file_init_dax(struct fuse_mount *fm,
+ struct inode *inode, void *fmap_buf,
+ size_t fmap_size);
void __famfs_meta_free(void *map);
/* Set fi->famfs_meta = NULL regardless of prior value */
@@ -1659,7 +1662,10 @@ static inline struct fuse_backing *famfs_meta_set(struct fuse_inode *fi,
static inline void famfs_meta_free(struct fuse_inode *fi)
{
- famfs_meta_set(fi, NULL);
+ if (fi->famfs_meta != NULL) {
+ __famfs_meta_free(fi->famfs_meta);
+ famfs_meta_set(fi, NULL);
+ }
}
static inline int fuse_file_famfs(struct fuse_inode *fi)
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 862f4e61a5fb..5e692fc84297 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -10,6 +10,7 @@
#include "fuse_dev_i.h"
#include "dev_uring_i.h"
+#include <linux/bitfield.h>
#include <linux/dax.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
@@ -1464,8 +1465,21 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
timeout = arg->request_timeout;
if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) &&
- flags & FUSE_DAX_FMAP)
- fc->famfs_iomap = 1;
+ flags & FUSE_DAX_FMAP) {
+ /* famfs_iomap is only allowed if the fuse
+ * server has CAP_SYS_RAWIO. This was checked
+ * in fuse_new_init, and FUSE_DAX_FMAP was
+ * set in in_flags if so. Only allow enablement
+ * if we find it there. This function is
+ * normally not running in fuse server context,
+ * so we can't do the capability check here...
+ */
+ u64 in_flags = FIELD_PREP(GENMASK_ULL(63, 32), ia->in.flags2)
+ | ia->in.flags;
+
+ if (in_flags & FUSE_DAX_FMAP)
+ fc->famfs_iomap = 1;
+ }
} else {
ra_pages = fc->max_read / PAGE_SIZE;
fc->no_lock = 1;
@@ -1527,7 +1541,7 @@ static struct fuse_init_args *fuse_new_init(struct fuse_mount *fm)
flags |= FUSE_SUBMOUNTS;
if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
flags |= FUSE_PASSTHROUGH;
- if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX))
+ if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) && capable(CAP_SYS_RAWIO))
flags |= FUSE_DAX_FMAP;
/*
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 9eff9083d3b5..cf678bebbfe0 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -243,6 +243,13 @@
*
* 7.46
* - Add FUSE_DAX_FMAP capability - ability to handle in-kernel fsdax maps
+ * - Add the following structures for the GET_FMAP message reply components:
+ * - struct fuse_famfs_simple_ext
+ * - struct fuse_famfs_iext
+ * - struct fuse_famfs_fmap_header
+ * - Add the following enumerated types
+ * - enum fuse_famfs_file_type
+ * - enum famfs_ext_type
*/
#ifndef _LINUX_FUSE_H
@@ -1318,6 +1325,55 @@ struct fuse_uring_cmd_req {
/* Famfs fmap message components */
+#define FAMFS_FMAP_VERSION 1
+
#define FAMFS_FMAP_MAX 32768 /* Largest supported fmap message */
+#define FUSE_FAMFS_MAX_EXTENTS 32
+#define FUSE_FAMFS_MAX_STRIPS 32
+
+enum fuse_famfs_file_type {
+ FUSE_FAMFS_FILE_REG,
+ FUSE_FAMFS_FILE_SUPERBLOCK,
+ FUSE_FAMFS_FILE_LOG,
+};
+
+enum famfs_ext_type {
+ FUSE_FAMFS_EXT_SIMPLE = 0,
+ FUSE_FAMFS_EXT_INTERLEAVE = 1,
+};
+
+struct fuse_famfs_simple_ext {
+ uint32_t se_devindex;
+ uint32_t reserved;
+ uint64_t se_offset;
+ uint64_t se_len;
+};
+
+struct fuse_famfs_iext { /* Interleaved extent */
+ uint32_t ie_nstrips;
+ uint32_t ie_chunk_size;
+ uint64_t ie_nbytes; /* Total bytes for this interleaved_ext;
+ * sum of strips may be more
+ */
+ uint64_t reserved;
+};
+
+struct fuse_famfs_fmap_header {
+ uint8_t file_type; /* enum fuse_famfs_file_type */
+ uint8_t reserved;
+ uint16_t fmap_version;
+ uint32_t ext_type; /* enum famfs_ext_type */
+ uint32_t nextents;
+ uint32_t reserved0;
+ uint64_t file_size;
+ uint64_t reserved1;
+};
+
+static inline int32_t fmap_msg_min_size(void)
+{
+ /* Smallest fmap message is a header plus one simple extent */
+ return (sizeof(struct fuse_famfs_fmap_header)
+ + sizeof(struct fuse_famfs_simple_ext));
+}
#endif /* _LINUX_FUSE_H */
--
2.53.0
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH V10 05/10] famfs_fuse: GET_DAXDEV message and daxdev_table
2026-03-31 12:37 ` [PATCH V10 00/10] famfs: port into fuse John Groves
` (3 preceding siblings ...)
2026-03-31 12:38 ` [PATCH V10 04/10] famfs_fuse: Create files with famfs fmaps John Groves
@ 2026-03-31 12:38 ` John Groves
2026-03-31 12:39 ` [PATCH V10 06/10] famfs_fuse: Plumb dax iomap and fuse read/write/mmap John Groves
` (5 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: John Groves @ 2026-03-31 12:38 UTC (permalink / raw)
To: John Groves, Miklos Szeredi, Dan Williams, Bernd Schubert,
Alison Schofield
Cc: John Groves, Jonathan Corbet, Shuah Khan, Vishal Verma,
Dave Jiang, Matthew Wilcox, Jan Kara, Alexander Viro,
David Hildenbrand, Christian Brauner, Darrick J . Wong,
Randy Dunlap, Jeff Layton, Amir Goldstein, Jonathan Cameron,
Stefan Hajnoczi, Joanne Koong, Josef Bacik, Bagas Sanjaya,
Chen Linxuan, James Morse, Fuad Tabba, Sean Christopherson,
Shivank Garg, Ackerley Tng, Gregory Price, Aravind Ramesh,
Ajay Joshi, venkataravis@micron.com, linux-doc@vger.kernel.org,
linux-kernel@vger.kernel.org, nvdimm@lists.linux.dev,
linux-cxl@vger.kernel.org, linux-fsdevel@vger.kernel.org,
John Groves
From: John Groves <john@groves.net>
- The new GET_DAXDEV message/response is added
- The famfs.c:famfs_teardown() function is added as a primary teardown
function for famfs.
- The command is triggered by the update_daxdev_table() call, if there
are any daxdevs in the subject fmap that are not represented in the
daxdev_table yet.
- fs/namei.c: export may_open_dev()
Signed-off-by: John Groves <john@groves.net>
---
fs/fuse/famfs.c | 227 +++++++++++++++++++++++++++++++++++++-
fs/fuse/famfs_kfmap.h | 26 +++++
fs/fuse/fuse_i.h | 19 ++++
fs/fuse/inode.c | 7 +-
fs/namei.c | 1 +
include/uapi/linux/fuse.h | 20 ++++
6 files changed, 298 insertions(+), 2 deletions(-)
diff --git a/fs/fuse/famfs.c b/fs/fuse/famfs.c
index ac52e54e2cb5..0e9415aa6339 100644
--- a/fs/fuse/famfs.c
+++ b/fs/fuse/famfs.c
@@ -21,6 +21,228 @@
#include "famfs_kfmap.h"
#include "fuse_i.h"
+/*
+ * famfs_teardown()
+ *
+ * Release the daxdev table hanging from a fuse_conn: for every valid slot,
+ * drop the dax device reference and free the name; then free the slot array
+ * and the table itself.
+ */
+void
+famfs_teardown(struct fuse_conn *fc)
+{
+	/* The table struct is kfree'd automatically when it goes out of scope */
+	struct famfs_dax_devlist *table __free(kfree) = fc->dax_devlist;
+	struct famfs_daxdev *slots;
+
+	fc->dax_devlist = NULL;
+
+	if (!table || !table->devlist)
+		return;
+
+	slots = table->devlist;
+
+	/* Close & release all the daxdevs in our table */
+	for (int n = 0; n < table->nslots; n++) {
+		if (slots[n].valid) {
+			/* Release reference from dax_dev_get() */
+			if (slots[n].devp)
+				put_dax(slots[n].devp);
+
+			kfree(slots[n].name);
+		}
+	}
+	kfree(slots);
+}
+
+/*
+ * famfs_verify_daxdev() - look up a device node and return its device number
+ *
+ * Resolves @pathname (following symlinks), requires it to be a character
+ * device node that may_open_dev() accepts, and stores its i_rdev in @devno.
+ *
+ * Returns 0 on success; -EINVAL for an empty path or a non-chardev node,
+ * -EACCES if may_open_dev() refuses, or the error from kern_path().
+ */
+static int
+famfs_verify_daxdev(const char *pathname, dev_t *devno)
+{
+	struct path devpath;
+	struct inode *node;
+	int rc;
+
+	if (!pathname || pathname[0] == '\0')
+		return -EINVAL;
+
+	rc = kern_path(pathname, LOOKUP_FOLLOW, &devpath);
+	if (rc)
+		return rc;
+
+	node = d_backing_inode(devpath.dentry);
+
+	if (!S_ISCHR(node->i_mode))
+		rc = -EINVAL;
+	else if (!may_open_dev(&devpath)) /* had to export this */
+		rc = -EACCES;
+	else
+		*devno = node->i_rdev;
+
+	path_put(&devpath);
+	return rc;
+}
+
+/**
+ * famfs_fuse_get_daxdev() - Retrieve info for a DAX device from fuse server
+ *
+ * Send a GET_DAXDEV message to the fuse server to retrieve info on a
+ * dax device, verify the device path it names, and record the device in
+ * slot @index of fc->dax_devlist. The slot is marked valid only after all
+ * of its other fields have been filled in.
+ *
+ * @fm:    fuse_mount
+ * @index: the index of the dax device; daxdevs are referred to by index
+ *         in fmaps, and the server resolves the index to a particular daxdev
+ *
+ * Returns: 0=success (also when the slot was already valid)
+ *          -errno=failure
+ */
+static int
+famfs_fuse_get_daxdev(struct fuse_mount *fm, const u64 index)
+{
+	struct fuse_daxdev_out daxdev_out = { 0 };
+	struct fuse_conn *fc = fm->fc;
+	struct famfs_daxdev *daxdev;
+	int rc;
+
+	FUSE_ARGS(args);
+
+	/* The reply is stored in slot @index, so it must be within the table */
+	if (index >= fc->dax_devlist->nslots) {
+		pr_err("%s: index(%llu) >= nslots(%d)\n",
+		       __func__, index, fc->dax_devlist->nslots);
+		return -EINVAL;
+	}
+
+	args.opcode = FUSE_GET_DAXDEV;
+	args.nodeid = index;
+
+	args.in_numargs = 0;
+
+	args.out_numargs = 1;
+	args.out_args[0].size = sizeof(daxdev_out);
+	args.out_args[0].value = &daxdev_out;
+
+	/* Send GET_DAXDEV command */
+	rc = fuse_simple_request(fm, &args);
+	if (rc) {
+		pr_err("%s: rc=%d from fuse_simple_request()\n",
+		       __func__, rc);
+		return rc;
+	}
+
+	/*
+	 * The name comes from the fuse server and is used below as a C string
+	 * (path lookup, kstrdup(), "%s"); reject a reply that fills the whole
+	 * buffer without a terminating NUL.
+	 */
+	if (strnlen(daxdev_out.name, sizeof(daxdev_out.name)) ==
+	    sizeof(daxdev_out.name)) {
+		pr_err("%s: daxdev name is not NUL-terminated\n", __func__);
+		return -EINVAL;
+	}
+
+	scoped_guard(rwsem_write, &fc->famfs_devlist_sem) {
+		daxdev = &fc->dax_devlist->devlist[index];
+
+		/* Abort if daxdev is now valid (races are possible here) */
+		if (daxdev->valid) {
+			pr_debug("%s: daxdev already known\n", __func__);
+			return 0;
+		}
+
+		/* Verify dev is valid and can be opened and gets the devno */
+		rc = famfs_verify_daxdev(daxdev_out.name, &daxdev->devno);
+		if (rc) {
+			pr_err("%s: rc=%d from famfs_verify_daxdev()\n",
+			       __func__, rc);
+			return rc;
+		}
+
+		daxdev->name = kstrdup(daxdev_out.name, GFP_KERNEL);
+		if (!daxdev->name)
+			return -ENOMEM;
+
+		/* This will fail if it's not a dax device */
+		daxdev->devp = dax_dev_get(daxdev->devno);
+		if (!daxdev->devp) {
+			pr_warn("%s: device %s not found or not dax\n",
+				__func__, daxdev_out.name);
+			/* Leave the slot clean so a later attempt can retry */
+			kfree(daxdev->name);
+			daxdev->name = NULL;
+			return -ENODEV;
+		}
+
+		wmb(); /* All other fields must be visible before valid */
+		daxdev->valid = 1;
+	}
+
+	return 0;
+}
+
+/**
+ * famfs_update_daxdev_table() - Update the daxdev table
+ * @fm:   fuse_mount
+ * @meta: famfs_file_meta, in-memory format, built from a GET_FMAP response
+ *
+ * This function is called for each new file fmap, to verify whether all
+ * referenced daxdevs are already known (i.e. in the table). Any daxdev
+ * indices referenced in @meta but not in the table will be retrieved via
+ * famfs_fuse_get_daxdev() and added to the table.
+ *
+ * Fetching is best-effort: a failure to retrieve a daxdev is logged but not
+ * returned, and the slot simply stays invalid.
+ *
+ * Return: 0=success (including when individual daxdev fetches failed)
+ *         -ENOMEM=the daxdev table could not be allocated
+ */
+static int
+famfs_update_daxdev_table(
+	struct fuse_mount *fm,
+	const struct famfs_file_meta *meta)
+{
+	struct famfs_dax_devlist *local_devlist;
+	struct fuse_conn *fc = fm->fc;
+	int indices_to_fetch[MAX_DAXDEVS];
+	int n_to_fetch = 0;
+	int err;
+
+	/* First time through we will need to allocate the dax_devlist */
+	if (!fc->dax_devlist) {
+		local_devlist = kzalloc(sizeof(*local_devlist), GFP_KERNEL);
+		if (!local_devlist)
+			return -ENOMEM;
+
+		local_devlist->nslots = MAX_DAXDEVS;
+
+		local_devlist->devlist = kcalloc(MAX_DAXDEVS,
+						 sizeof(struct famfs_daxdev),
+						 GFP_KERNEL);
+		if (!local_devlist->devlist) {
+			kfree(local_devlist);
+			return -ENOMEM;
+		}
+
+		/* We don't need famfs_devlist_sem here because we use cmpxchg */
+		if (cmpxchg(&fc->dax_devlist, NULL, local_devlist) != NULL) {
+			kfree(local_devlist->devlist);
+			kfree(local_devlist); /* another thread beat us to it */
+		}
+	}
+
+	/* Collect indices that need fetching while holding read lock */
+	scoped_guard(rwsem_read, &fc->famfs_devlist_sem) {
+		/*
+		 * Test the u64 bitmap bit by bit instead of casting it to an
+		 * unsigned long array: the cast drops const and is not
+		 * portable to 32-bit big-endian.
+		 */
+		for (int i = 0; i < MAX_DAXDEVS; i++) {
+			if ((meta->dev_bitmap & BIT_ULL(i)) &&
+			    !fc->dax_devlist->devlist[i].valid)
+				indices_to_fetch[n_to_fetch++] = i;
+		}
+	}
+
+	/* Fetch needed daxdevs outside the read lock */
+	for (int j = 0; j < n_to_fetch; j++) {
+		err = famfs_fuse_get_daxdev(fm, indices_to_fetch[j]);
+		if (err)
+			pr_err("%s: failed to get daxdev=%d\n",
+			       __func__, indices_to_fetch[j]);
+	}
+
+	return 0;
+}
/***************************************************************************/
@@ -184,7 +406,7 @@ famfs_fuse_meta_alloc(
/* ie_in = one interleaved extent in fmap_buf */
ie_in = fmap_buf + next_offset;
- /* Move past one interleaved extent header in fmap_buf */
+ /* Move past 1 interleaved extent header in fmap_buf */
next_offset += sizeof(*ie_in);
if (next_offset > fmap_buf_size) {
pr_err("%s:%d: fmap_buf underflow offset/size %ld/%ld\n",
@@ -329,6 +551,9 @@ famfs_file_init_dax(
if (rc)
goto errout;
+ /* Make sure this fmap doesn't reference any unknown daxdevs */
+ famfs_update_daxdev_table(fm, meta);
+
/* Publish the famfs metadata on fi->famfs_meta */
inode_lock(inode);
diff --git a/fs/fuse/famfs_kfmap.h b/fs/fuse/famfs_kfmap.h
index 18ab22bcc5a1..eb9f70b5cb81 100644
--- a/fs/fuse/famfs_kfmap.h
+++ b/fs/fuse/famfs_kfmap.h
@@ -64,4 +64,30 @@ struct famfs_file_meta {
};
};
+/*
+ * famfs_daxdev - tracking struct for a daxdev within a famfs file system
+ *
+ * This is the in-memory daxdev metadata that is populated by parsing
+ * the responses to GET_DAXDEV messages
+ */
+struct famfs_daxdev {
+ /* Include dev uuid? */
+ bool valid; /* set last, once the other fields are filled in */
+ bool error; /* NOTE(review): no writer visible here - confirm who sets it */
+ dev_t devno; /* device number of the device node named by @name */
+ struct dax_device *devp; /* reference from dax_dev_get() */
+ char *name; /* kstrdup'd copy of the device path from the server */
+};
+
+#define MAX_DAXDEVS 24
+
+/*
+ * famfs_dax_devlist - list of famfs_daxdev's
+ */
+struct famfs_dax_devlist {
+ int nslots;
+ int ndevs;
+ struct famfs_daxdev *devlist;
+};
+
#endif /* FAMFS_KFMAP_H */
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index df4e9c9f80bf..8170266cbb02 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1006,6 +1006,11 @@ struct fuse_conn {
/* Request timeout (in jiffies). 0 = no timeout */
unsigned int req_timeout;
} timeout;
+
+#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
+ struct rw_semaphore famfs_devlist_sem;
+ struct famfs_dax_devlist *dax_devlist;
+#endif
};
/*
@@ -1647,6 +1652,8 @@ int famfs_file_init_dax(struct fuse_mount *fm,
size_t fmap_size);
void __famfs_meta_free(void *map);
+void famfs_teardown(struct fuse_conn *fc);
+
/* Set fi->famfs_meta = NULL regardless of prior value */
static inline void famfs_meta_init(struct fuse_inode *fi)
{
@@ -1668,6 +1675,11 @@ static inline void famfs_meta_free(struct fuse_inode *fi)
}
}
+static inline void famfs_init_devlist_sem(struct fuse_conn *fc)
+{
+ init_rwsem(&fc->famfs_devlist_sem);
+}
+
static inline int fuse_file_famfs(struct fuse_inode *fi)
{
return (READ_ONCE(fi->famfs_meta) != NULL);
@@ -1677,6 +1689,9 @@ int fuse_get_fmap(struct fuse_mount *fm, struct inode *inode);
#else /* !CONFIG_FUSE_FAMFS_DAX */
+static inline void famfs_teardown(struct fuse_conn *fc)
+{
+}
static inline struct fuse_backing *famfs_meta_set(struct fuse_inode *fi,
void *meta)
{
@@ -1687,6 +1702,10 @@ static inline void famfs_meta_free(struct fuse_inode *fi)
{
}
+static inline void famfs_init_devlist_sem(struct fuse_conn *fc)
+{
+}
+
static inline int fuse_file_famfs(struct fuse_inode *fi)
{
return 0;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 5e692fc84297..40e7ea5b6437 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1048,6 +1048,9 @@ void fuse_conn_put(struct fuse_conn *fc)
WARN_ON(atomic_read(&bucket->count) != 1);
kfree(bucket);
}
+ if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX))
+ famfs_teardown(fc);
+
if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
fuse_backing_files_free(fc);
call_rcu(&fc->rcu, delayed_release);
@@ -1477,8 +1480,10 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
u64 in_flags = FIELD_PREP(GENMASK_ULL(63, 32), ia->in.flags2)
| ia->in.flags;
- if (in_flags & FUSE_DAX_FMAP)
+ if (in_flags & FUSE_DAX_FMAP) {
+ famfs_init_devlist_sem(fc);
fc->famfs_iomap = 1;
+ }
}
} else {
ra_pages = fc->max_read / PAGE_SIZE;
diff --git a/fs/namei.c b/fs/namei.c
index 9e5500dad14f..38e6e4be089d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4212,6 +4212,7 @@ bool may_open_dev(const struct path *path)
return !(path->mnt->mnt_flags & MNT_NODEV) &&
!(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
}
+EXPORT_SYMBOL(may_open_dev);
static int may_open(struct mnt_idmap *idmap, const struct path *path,
int acc_mode, int flag)
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index cf678bebbfe0..1b82895108be 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -247,6 +247,9 @@
* - struct fuse_famfs_simple_ext
* - struct fuse_famfs_iext
* - struct fuse_famfs_fmap_header
+ * - Add the following structs for the GET_DAXDEV message and reply
+ * - struct fuse_get_daxdev_in
+ * - struct fuse_daxdev_out
* - Add the following enumerated types
* - enum fuse_famfs_file_type
* - enum famfs_ext_type
@@ -678,6 +681,7 @@ enum fuse_opcode {
/* Famfs / devdax opcodes */
FUSE_GET_FMAP = 54,
+ FUSE_GET_DAXDEV = 55,
/* CUSE specific operations */
CUSE_INIT = 4096,
@@ -1369,6 +1373,22 @@ struct fuse_famfs_fmap_header {
uint64_t reserved1;
};
+struct fuse_get_daxdev_in {
+ uint32_t daxdev_num;
+};
+
+#define DAXDEV_NAME_MAX 256
+
+/* fuse_daxdev_out has enough space for a uuid if we need it */
+struct fuse_daxdev_out {
+ uint16_t index;
+ uint16_t reserved;
+ uint32_t reserved2;
+ uint64_t reserved3;
+ uint64_t reserved4;
+ char name[DAXDEV_NAME_MAX];
+};
+
static inline int32_t fmap_msg_min_size(void)
{
/* Smallest fmap message is a header plus one simple extent */
--
2.53.0
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH V10 06/10] famfs_fuse: Plumb dax iomap and fuse read/write/mmap
2026-03-31 12:37 ` [PATCH V10 00/10] famfs: port into fuse John Groves
` (4 preceding siblings ...)
2026-03-31 12:38 ` [PATCH V10 05/10] famfs_fuse: GET_DAXDEV message and daxdev_table John Groves
@ 2026-03-31 12:39 ` John Groves
2026-03-31 12:39 ` [PATCH V10 07/10] famfs_fuse: Add holder_operations for dax notify_failure() John Groves
` (4 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: John Groves @ 2026-03-31 12:39 UTC (permalink / raw)
To: John Groves, Miklos Szeredi, Dan Williams, Bernd Schubert,
Alison Schofield
Cc: John Groves, Jonathan Corbet, Shuah Khan, Vishal Verma,
Dave Jiang, Matthew Wilcox, Jan Kara, Alexander Viro,
David Hildenbrand, Christian Brauner, Darrick J . Wong,
Randy Dunlap, Jeff Layton, Amir Goldstein, Jonathan Cameron,
Stefan Hajnoczi, Joanne Koong, Josef Bacik, Bagas Sanjaya,
Chen Linxuan, James Morse, Fuad Tabba, Sean Christopherson,
Shivank Garg, Ackerley Tng, Gregory Price, Aravind Ramesh,
Ajay Joshi, venkataravis@micron.com, linux-doc@vger.kernel.org,
linux-kernel@vger.kernel.org, nvdimm@lists.linux.dev,
linux-cxl@vger.kernel.org, linux-fsdevel@vger.kernel.org,
John Groves
From: John Groves <john@groves.net>
This commit fills in read/write/mmap handling for famfs files. The
dev_dax_iomap interface is used - just like xfs in fs-dax mode.
- Read/write are handled by famfs_fuse_[read|write]_iter() via
dax_iomap_rw() to fsdev_dax.
- Mmap is handled by famfs_fuse_mmap()
- Faults are handled by famfs_filemap_fault(), using dax_iomap_fault()
to fsdev_dax.
- File offset to dax offset resolution is handled via
famfs_fuse_iomap_begin(), which uses famfs "fmaps" to resolve the
requested (file, offset) to an offset on a dax device (by way of
famfs_fileofs_to_daxofs() and famfs_interleave_fileofs_to_daxofs())
Signed-off-by: John Groves <john@groves.net>
---
fs/fuse/famfs.c | 448 +++++++++++++++++++++++++++++++++++++++++++++++
fs/fuse/file.c | 18 +-
fs/fuse/fuse_i.h | 19 ++
3 files changed, 483 insertions(+), 2 deletions(-)
diff --git a/fs/fuse/famfs.c b/fs/fuse/famfs.c
index 0e9415aa6339..6f935032eb17 100644
--- a/fs/fuse/famfs.c
+++ b/fs/fuse/famfs.c
@@ -576,6 +576,454 @@ famfs_file_init_dax(
return rc;
}
+/*********************************************************************
+ * iomap_operations
+ *
+ * This stuff uses the iomap (dax-related) helpers to resolve file offsets to
+ * offsets within a dax device.
+ */
+
+static int famfs_file_bad(struct inode *inode);
+
+/*
+ * famfs_interleave_fileofs_to_daxofs() - Resolve (file, offset, len) to
+ * (daxdev, offset, len) for a file with interleaved (striped) extents
+ *
+ * @inode:       The file being accessed
+ * @iomap:       To be filled in with the dax device and offset
+ * @file_offset: Offset within the file
+ * @len:         Requested length; the mapping returned in @iomap never
+ *               extends past the end of the chunk containing @file_offset
+ * @flags:       flags sent back via struct iomap
+ *
+ * Return: 0 on success (info is returned in @iomap); -EINVAL if a chunk size
+ * is not PMD-aligned; -EIO for any other failure, with @iomap set to a
+ * zero-length mapping.
+ */
+static int
+famfs_interleave_fileofs_to_daxofs(struct inode *inode, struct iomap *iomap,
+			 loff_t file_offset, off_t len, unsigned int flags)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct famfs_file_meta *meta = fi->famfs_meta;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	loff_t local_offset = file_offset;
+
+	/* This function is only for extent_type INTERLEAVED_EXTENT */
+	if (meta->fm_extent_type != INTERLEAVED_EXTENT) {
+		pr_err("%s: bad extent type\n", __func__);
+		goto err_out;
+	}
+
+	if (famfs_file_bad(inode))
+		goto err_out;
+
+	iomap->offset = file_offset;
+
+	for (int i = 0; i < meta->fm_niext; i++) {
+		struct famfs_meta_interleaved_ext *fei = &meta->ie[i];
+		u64 chunk_size = fei->fie_chunk_size;
+		u64 nstrips = fei->fie_nstrips;
+		u64 ext_size = min(fei->fie_nbytes, meta->file_size);
+
+		if (!IS_ALIGNED(chunk_size, PMD_SIZE)) {
+			pr_err("%s: chunk_size %llu not PMD-aligned\n",
+			       __func__, meta->ie[i].fie_chunk_size);
+			return -EINVAL;
+		}
+		if (ext_size == 0) {
+			pr_err("%s: ext_size=%llu file_size=%zu\n",
+			       __func__, fei->fie_nbytes, meta->file_size);
+			goto err_out;
+		}
+
+		/* Is the data in this striped extent? */
+		if (local_offset < ext_size) {
+			/*
+			 * Chunks are laid out round-robin across the strips:
+			 * chunk_num picks the chunk, strip_num the strip that
+			 * holds it, and stripe_num how many chunks precede it
+			 * on that strip.
+			 */
+			u64 chunk_num = local_offset / chunk_size;
+			u64 chunk_offset = local_offset % chunk_size;
+			u64 chunk_remainder = chunk_size - chunk_offset;
+			u64 stripe_num = chunk_num / nstrips;
+			u64 strip_num = chunk_num % nstrips;
+			u64 strip_offset = chunk_offset + (stripe_num * chunk_size);
+			u64 strip_dax_ofs = fei->ie_strips[strip_num].ext_offset;
+			u64 strip_devidx = fei->ie_strips[strip_num].dev_index;
+			struct famfs_daxdev *dd;
+
+			if (strip_devidx >= fc->dax_devlist->nslots) {
+				pr_err("%s: strip_devidx %llu >= nslots %d\n",
+				       __func__, strip_devidx,
+				       fc->dax_devlist->nslots);
+				goto err_out;
+			}
+
+			dd = &fc->dax_devlist->devlist[strip_devidx];
+
+			/* Same validity test as the simple-extent path */
+			if (!dd->valid || dd->error) {
+				pr_err("%s: daxdev=%llu %s\n", __func__,
+				       strip_devidx,
+				       dd->valid ? "error" : "invalid");
+				goto err_out;
+			}
+
+			iomap->addr = strip_dax_ofs + strip_offset;
+			iomap->offset = file_offset;
+			/* A mapping never crosses a chunk boundary */
+			iomap->length = min_t(loff_t, len, chunk_remainder);
+
+			iomap->dax_dev = dd->devp;
+
+			iomap->type = IOMAP_MAPPED;
+			iomap->flags = flags;
+
+			return 0;
+		}
+		local_offset -= ext_size; /* offset is beyond this striped extent */
+	}
+
+ err_out:
+	pr_err("%s: err_out\n", __func__);
+
+	/* Resolution failed: the file or a daxdev is in a bad state, or we
+	 * fell out the end of the extent list (the offset is beyond the
+	 * mapped extents). Set iomap to zero length and return -EIO.
+	 */
+	iomap->addr = 0; /* there is no valid dax device offset */
+	iomap->offset = file_offset; /* file offset */
+	iomap->length = 0; /* this had better result in no access to dax mem */
+	iomap->dax_dev = NULL;
+	iomap->type = IOMAP_MAPPED;
+	iomap->flags = flags;
+
+	return -EIO;
+}
+
+/**
+ * famfs_fileofs_to_daxofs() - Resolve (file, offset, len) to (daxdev, offset, len)
+ *
+ * This function is called by famfs_fuse_iomap_begin() to resolve an offset in a
+ * file to an offset in a dax device. This is upcalled from dax from calls to
+ * both dax_iomap_fault() and dax_iomap_rw(). Dax finishes the job resolving
+ * a fault to a specific physical page (the fault case) or doing a memcpy
+ * variant (the rw case)
+ *
+ * Pages can be PTE (4k), PMD (2MiB) or (theoretically) PuD (1GiB)
+ * (these sizes are for X86; may vary on other cpu architectures)
+ *
+ * @inode: The file where the fault occurred
+ * @iomap: To be filled in to indicate where to find the right memory,
+ * relative to a dax device.
+ * @file_offset: Within the file where the fault occurred (will be page boundary)
+ * @len: The length of the faulted mapping (will be a page multiple)
+ * (will be trimmed in *iomap if it's disjoint in the extent list)
+ * @flags: flags passed to famfs_fuse_iomap_begin(), and sent back via
+ * struct iomap
+ *
+ * Return values: 0 on success (info is returned in a modified @iomap struct);
+ * -EIO on failure, with @iomap set to a zero-length mapping. For interleaved
+ * files the result of famfs_interleave_fileofs_to_daxofs() is returned.
+ */
+static int
+famfs_fileofs_to_daxofs(struct inode *inode, struct iomap *iomap,
+ loff_t file_offset, off_t len, unsigned int flags)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct famfs_file_meta *meta = fi->famfs_meta;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ loff_t local_offset = file_offset;
+
+ if (!fc->dax_devlist) {
+ pr_err("%s: null dax_devlist\n", __func__);
+ goto err_out;
+ }
+
+ if (famfs_file_bad(inode))
+ goto err_out;
+
+ if (meta->fm_extent_type == INTERLEAVED_EXTENT)
+ return famfs_interleave_fileofs_to_daxofs(inode, iomap,
+ file_offset,
+ len, flags);
+
+ iomap->offset = file_offset;
+
+ for (int i = 0; i < meta->fm_nextents; i++) {
+ /* TODO: check devindex too */
+ loff_t dax_ext_offset = meta->se[i].ext_offset;
+ loff_t dax_ext_len = meta->se[i].ext_len;
+ u64 daxdev_idx = meta->se[i].dev_index;
+
+
+ /* TODO: test that superblock and log offsets only happen
+ * with superblock and log files. Requires instrumentation
+ * from user space...
+ */
+
+ /* local_offset is the offset minus the size of extents skipped
+ * so far; If local_offset < dax_ext_len, the data of interest
+ * starts in this extent
+ */
+ if (local_offset < dax_ext_len) {
+ loff_t ext_len_remainder = dax_ext_len - local_offset;
+ struct famfs_daxdev *dd;
+
+ if (daxdev_idx >= fc->dax_devlist->nslots) {
+ pr_err("%s: daxdev_idx %llu >= nslots %d\n",
+ __func__, daxdev_idx,
+ fc->dax_devlist->nslots);
+ goto err_out;
+ }
+
+ dd = &fc->dax_devlist->devlist[daxdev_idx];
+
+ if (!dd->valid || dd->error) {
+ pr_err("%s: daxdev=%lld %s\n", __func__,
+ daxdev_idx,
+ dd->valid ? "error" : "invalid");
+ goto err_out;
+ }
+
+ /*
+ * OK, we found the file metadata extent where this
+ * data begins
+ * @local_offset - The offset within the current
+ * extent
+ * @ext_len_remainder - Remaining length of ext after
+ * skipping local_offset
+ * Outputs:
+ * iomap->addr: the offset within the dax device where
+ * the data starts
+ * iomap->offset: the file offset
+ * iomap->length: the valid length resolved here
+ */
+ iomap->addr = dax_ext_offset + local_offset;
+ iomap->offset = file_offset;
+ iomap->length = min_t(loff_t, len, ext_len_remainder);
+
+ iomap->dax_dev = fc->dax_devlist->devlist[daxdev_idx].devp;
+
+ iomap->type = IOMAP_MAPPED;
+ iomap->flags = flags;
+ return 0;
+ }
+ local_offset -= dax_ext_len; /* Get ready for the next extent */
+ }
+
+ err_out:
+ pr_err("%s: err_out\n", __func__);
+
+ /* Resolution failed: the file or a daxdev is in a bad state, or we
+ * fell out the end of the extent list (the offset is beyond the
+ * mapped extents). Set iomap to zero length and return -EIO.
+ */
+ iomap->addr = 0; /* there is no valid dax device offset */
+ iomap->offset = file_offset; /* file offset */
+ iomap->length = 0; /* this had better result in no access to dax mem */
+ iomap->dax_dev = NULL;
+ iomap->type = IOMAP_MAPPED;
+ iomap->flags = flags;
+
+ return -EIO;
+}
+
+/**
+ * famfs_fuse_iomap_begin() - Handler for iomap_begin upcall from dax
+ *
+ * There is little to do here because famfs files are
+ * * never partially allocated
+ * * never sparse (no holes)
+ * * never "allocate on write"
+ *
+ * The current i_size is sanity-checked against the fmap's file size (a
+ * mismatch only triggers a warning), and resolution is delegated to
+ * famfs_fileofs_to_daxofs().
+ *
+ * @inode:  inode for the file being accessed
+ * @offset: offset within the file
+ * @length: Length being accessed at offset
+ * @flags:  flags to be returned via struct iomap
+ * @iomap:  iomap struct to be filled in, resolving (offset, length) to
+ *          (daxdev, offset, len)
+ * @srcmap: source mapping if it is a COW operation (which it is not here)
+ */
+static int
+famfs_fuse_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+		  unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
+{
+	struct famfs_file_meta *meta = get_fuse_inode(inode)->famfs_meta;
+	size_t cur_size = i_size_read(inode);
+
+	WARN_ON(cur_size != meta->file_size);
+
+	return famfs_fileofs_to_daxofs(inode, iomap, offset, length, flags);
+}
+
+/* Note: We never need a special set of write_iomap_ops because famfs never
+ * performs allocation on write.
+ */
+const struct iomap_ops famfs_iomap_ops = {
+ .iomap_begin = famfs_fuse_iomap_begin,
+};
+
+/*********************************************************************
+ * vm_operations
+ */
+/*
+ * Common fault path: hand the fault to dax_iomap_fault() with the famfs iomap
+ * ops, finishing a synchronous fault when dax asks for it. Write faults are
+ * bracketed by sb_start_pagefault()/sb_end_pagefault() and update the file
+ * times. A file that is not IS_DAX gets SIGBUS.
+ */
+static vm_fault_t
+__famfs_fuse_filemap_fault(struct vm_fault *vmf, unsigned int pe_size,
+			   bool write_fault)
+{
+	struct file *filp = vmf->vma->vm_file;
+	struct inode *inode = file_inode(filp);
+	unsigned long pfn;
+	vm_fault_t result;
+
+	if (!IS_DAX(inode)) {
+		pr_err("%s: file not marked IS_DAX!!\n", __func__);
+		return VM_FAULT_SIGBUS;
+	}
+
+	if (write_fault) {
+		sb_start_pagefault(inode->i_sb);
+		file_update_time(filp);
+	}
+
+	result = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &famfs_iomap_ops);
+	if (result & VM_FAULT_NEEDDSYNC)
+		result = dax_finish_sync_fault(vmf, pe_size, pfn);
+
+	if (write_fault)
+		sb_end_pagefault(inode->i_sb);
+
+	return result;
+}
+
+/* A fault counts as a write fault only if it writes to a shared mapping */
+static inline bool
+famfs_is_write_fault(struct vm_fault *vmf)
+{
+	if (!(vmf->flags & FAULT_FLAG_WRITE))
+		return false;
+
+	return vmf->vma->vm_flags & VM_SHARED;
+}
+
+/* .fault handler: take the common fault path with pe_size 0 */
+static vm_fault_t
+famfs_filemap_fault(struct vm_fault *vmf)
+{
+ return __famfs_fuse_filemap_fault(vmf, 0, famfs_is_write_fault(vmf));
+}
+
+/* .huge_fault handler: take the common fault path with the caller's pe_size */
+static vm_fault_t
+famfs_filemap_huge_fault(struct vm_fault *vmf, unsigned int pe_size)
+{
+ return __famfs_fuse_filemap_fault(vmf, pe_size,
+ famfs_is_write_fault(vmf));
+}
+
+/*
+ * Used for both .page_mkwrite and .pfn_mkwrite: always treated as a write
+ * fault, with pe_size 0
+ */
+static vm_fault_t
+famfs_filemap_mkwrite(struct vm_fault *vmf)
+{
+ return __famfs_fuse_filemap_fault(vmf, 0, true);
+}
+
+const struct vm_operations_struct famfs_file_vm_ops = {
+ .fault = famfs_filemap_fault,
+ .huge_fault = famfs_filemap_huge_fault,
+ .map_pages = filemap_map_pages,
+ .page_mkwrite = famfs_filemap_mkwrite,
+ .pfn_mkwrite = famfs_filemap_mkwrite,
+};
+
+/*********************************************************************
+ * file_operations
+ */
+
+/**
+ * famfs_file_bad() - Check for files that aren't in a valid state
+ *
+ * @inode: inode
+ *
+ * A file is bad if it has no famfs metadata, if a metadata error was
+ * detected earlier, if i_size no longer matches the size recorded in the
+ * metadata (this also marks the metadata as in error), or if the inode is
+ * not marked DAX.
+ *
+ * Returns: 0=success
+ *          -EIO=no metadata, or previously detected metadata error
+ *          -ENXIO=i_size mismatch, or inode not marked DAX
+ */
+static int
+famfs_file_bad(struct inode *inode)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct famfs_file_meta *meta = fi->famfs_meta;
+	size_t i_size = i_size_read(inode);
+
+	if (!meta) {
+		pr_err("%s: un-initialized famfs file\n", __func__);
+		return -EIO;
+	}
+	if (meta->error) {
+		pr_debug("%s: previously detected metadata errors\n", __func__);
+		return -EIO;
+	}
+	if (i_size != meta->file_size) {
+		pr_warn("%s: i_size overwritten from %zu to %zu\n",
+			__func__, meta->file_size, i_size);
+		/* Sticky: every later check on this file fails with -EIO */
+		meta->error = true;
+		return -ENXIO;
+	}
+	if (!IS_DAX(inode)) {
+		/* %p rather than a raw integer cast: don't leak the address */
+		pr_debug("%s: inode %p IS_DAX is false\n",
+			 __func__, inode);
+		return -ENXIO;
+	}
+	return 0;
+}
+
+/*
+ * Common read/write preparation: fail if the file is in a bad state,
+ * otherwise trim @ubuf so the transfer ends at EOF. Returns the error from
+ * famfs_file_bad(), or 0 (the iterator may be empty on return).
+ */
+static ssize_t
+famfs_fuse_rw_prep(struct kiocb *iocb, struct iov_iter *ubuf)
+{
+	struct inode *inode = iocb->ki_filp->f_mapping->host;
+	size_t eof = i_size_read(inode);
+	size_t avail = 0;
+	ssize_t rc;
+
+	rc = famfs_file_bad(inode);
+	if (rc)
+		return rc;
+
+	/* avail stays 0 at or past EOF, which also avoids unsigned underflow */
+	if (iocb->ki_pos < eof)
+		avail = eof - iocb->ki_pos;
+
+	if (iov_iter_count(ubuf) > avail)
+		iov_iter_truncate(ubuf, avail);
+
+	return 0;
+}
+
+/*
+ * Read from a famfs file via dax_iomap_rw(). Returns the error from the
+ * prep step, 0 if nothing is left to read after trimming to EOF, or the
+ * result of dax_iomap_rw().
+ */
+ssize_t
+famfs_fuse_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	ssize_t nread = famfs_fuse_rw_prep(iocb, to);
+
+	/* Prep error, or nothing left to transfer (nread is 0 then) */
+	if (nread || !iov_iter_count(to))
+		return nread;
+
+	nread = dax_iomap_rw(iocb, to, &famfs_iomap_ops);
+	file_accessed(iocb->ki_filp);
+
+	return nread;
+}
+
+/*
+ * Write to a famfs file via dax_iomap_rw(). Returns the error from the
+ * prep step, 0 if nothing is left to write after trimming to EOF, or the
+ * result of dax_iomap_rw().
+ */
+ssize_t
+famfs_fuse_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	ssize_t nwritten = famfs_fuse_rw_prep(iocb, from);
+
+	/* Prep error, or nothing left to transfer (nwritten is 0 then) */
+	if (nwritten || !iov_iter_count(from))
+		return nwritten;
+
+	return dax_iomap_rw(iocb, from, &famfs_iomap_ops);
+}
+
+/*
+ * mmap a famfs file: refuse files in a bad state, otherwise install the
+ * famfs vm_ops and set VM_HUGEPAGE on the vma.
+ */
+int
+famfs_fuse_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	int err = famfs_file_bad(file_inode(file));
+
+	if (err)
+		return err;
+
+	file_accessed(file);
+	vma->vm_ops = &famfs_file_vm_ops;
+	vm_flags_set(vma, VM_HUGEPAGE);
+
+	return 0;
+}
+
#define FMAP_BUFSIZE PAGE_SIZE
int fuse_get_fmap(struct fuse_mount *fm, struct inode *inode)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 605f1c6cc10e..5d8dcb7639be 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1831,6 +1831,8 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (FUSE_IS_VIRTIO_DAX(fi))
return fuse_dax_read_iter(iocb, to);
+ if (fuse_file_famfs(fi))
+ return famfs_fuse_read_iter(iocb, to);
/* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
if (ff->open_flags & FOPEN_DIRECT_IO)
@@ -1853,6 +1855,8 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (FUSE_IS_VIRTIO_DAX(fi))
return fuse_dax_write_iter(iocb, from);
+ if (fuse_file_famfs(fi))
+ return famfs_fuse_write_iter(iocb, from);
/* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
if (ff->open_flags & FOPEN_DIRECT_IO)
@@ -1868,9 +1872,13 @@ static ssize_t fuse_splice_read(struct file *in, loff_t *ppos,
unsigned int flags)
{
struct fuse_file *ff = in->private_data;
+ struct inode *inode = file_inode(in);
+ struct fuse_inode *fi = get_fuse_inode(inode);
/* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
- if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO))
+ if (fuse_file_famfs(fi))
+ return -EIO; /* famfs does not use the page cache... */
+ else if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO))
return fuse_passthrough_splice_read(in, ppos, pipe, len, flags);
else
return filemap_splice_read(in, ppos, pipe, len, flags);
@@ -1880,9 +1888,13 @@ static ssize_t fuse_splice_write(struct pipe_inode_info *pipe, struct file *out,
loff_t *ppos, size_t len, unsigned int flags)
{
struct fuse_file *ff = out->private_data;
+ struct inode *inode = file_inode(out);
+ struct fuse_inode *fi = get_fuse_inode(inode);
/* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
- if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO))
+ if (fuse_file_famfs(fi))
+ return -EIO; /* famfs does not use the page cache... */
+ else if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO))
return fuse_passthrough_splice_write(pipe, out, ppos, len, flags);
else
return iter_file_splice_write(pipe, out, ppos, len, flags);
@@ -2390,6 +2402,8 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
/* DAX mmap is superior to direct_io mmap */
if (FUSE_IS_VIRTIO_DAX(fi))
return fuse_dax_mmap(file, vma);
+ if (fuse_file_famfs(fi))
+ return famfs_fuse_mmap(file, vma);
/*
* If inode is in passthrough io mode, because it has some file open
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 8170266cbb02..dcbeaceda918 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1650,6 +1650,9 @@ extern void fuse_sysctl_unregister(void);
int famfs_file_init_dax(struct fuse_mount *fm,
struct inode *inode, void *fmap_buf,
size_t fmap_size);
+ssize_t famfs_fuse_write_iter(struct kiocb *iocb, struct iov_iter *from);
+ssize_t famfs_fuse_read_iter(struct kiocb *iocb, struct iov_iter *to);
+int famfs_fuse_mmap(struct file *file, struct vm_area_struct *vma);
void __famfs_meta_free(void *map);
void famfs_teardown(struct fuse_conn *fc);
@@ -1692,6 +1695,22 @@ int fuse_get_fmap(struct fuse_mount *fm, struct inode *inode);
static inline void famfs_teardown(struct fuse_conn *fc)
{
}
+static inline ssize_t famfs_fuse_write_iter(struct kiocb *iocb,
+					    struct iov_iter *from)
+{
+	return -ENODEV;
+}
+static inline ssize_t famfs_fuse_read_iter(struct kiocb *iocb,
+ struct iov_iter *to)
+{
+ return -ENODEV;
+}
+static inline int famfs_fuse_mmap(struct file *file,
+ struct vm_area_struct *vma)
+{
+ return -ENODEV;
+}
+
static inline struct fuse_backing *famfs_meta_set(struct fuse_inode *fi,
void *meta)
{
--
2.53.0
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH V10 07/10] famfs_fuse: Add holder_operations for dax notify_failure()
2026-03-31 12:37 ` [PATCH V10 00/10] famfs: port into fuse John Groves
` (5 preceding siblings ...)
2026-03-31 12:39 ` [PATCH V10 06/10] famfs_fuse: Plumb dax iomap and fuse read/write/mmap John Groves
@ 2026-03-31 12:39 ` John Groves
2026-03-31 12:39 ` [PATCH V10 08/10] famfs_fuse: Add DAX address_space_operations with noop_dirty_folio John Groves
` (3 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: John Groves @ 2026-03-31 12:39 UTC (permalink / raw)
To: John Groves, Miklos Szeredi, Dan Williams, Bernd Schubert,
Alison Schofield
Cc: John Groves, Jonathan Corbet, Shuah Khan, Vishal Verma,
Dave Jiang, Matthew Wilcox, Jan Kara, Alexander Viro,
David Hildenbrand, Christian Brauner, Darrick J . Wong,
Randy Dunlap, Jeff Layton, Amir Goldstein, Jonathan Cameron,
Stefan Hajnoczi, Joanne Koong, Josef Bacik, Bagas Sanjaya,
Chen Linxuan, James Morse, Fuad Tabba, Sean Christopherson,
Shivank Garg, Ackerley Tng, Gregory Price, Aravind Ramesh,
Ajay Joshi, venkataravis@micron.com, linux-doc@vger.kernel.org,
linux-kernel@vger.kernel.org, nvdimm@lists.linux.dev,
linux-cxl@vger.kernel.org, linux-fsdevel@vger.kernel.org,
John Groves
From: John Groves <john@groves.net>
Memory errors are at least somewhat more likely on disaggregated memory
than on-board memory. This commit registers to be notified by fsdev_dax
in the event that a memory failure is detected.
When a file access resolves to a daxdev with memory errors, it will fail
with an appropriate error.
If a daxdev failed fs_dax_get(), we set dd->dax_err. If a daxdev called
our notify_failure(), set dd->error. When any of the above happens, set
(file)->error and stop allowing access.
In general, the recovery from memory errors is to unmount the file
system and re-initialize the memory, but there may be usable degraded
modes of operation - particularly in the future when famfs supports
file systems backed by more than one daxdev. In those cases,
accessing data that is on a working daxdev can still work.
For now, return errors for any file that has encountered a memory or dax
error.
Signed-off-by: John Groves <john@groves.net>
---
fs/fuse/famfs.c | 110 +++++++++++++++++++++++++++++++++++++++---
fs/fuse/famfs_kfmap.h | 3 +-
2 files changed, 105 insertions(+), 8 deletions(-)
diff --git a/fs/fuse/famfs.c b/fs/fuse/famfs.c
index 6f935032eb17..87012df537eb 100644
--- a/fs/fuse/famfs.c
+++ b/fs/fuse/famfs.c
@@ -21,6 +21,26 @@
#include "famfs_kfmap.h"
#include "fuse_i.h"
+static void famfs_set_daxdev_err(
+ struct fuse_conn *fc, struct dax_device *dax_devp);
+
+static int
+famfs_dax_notify_failure(struct dax_device *dax_devp, u64 offset,
+ u64 len, int mf_flags)
+{
+ struct fuse_conn *fc = dax_holder(dax_devp);
+
+ famfs_set_daxdev_err(fc, dax_devp);
+
+ return 0;
+}
+
+static const struct dax_holder_operations famfs_fuse_dax_holder_ops = {
+ .notify_failure = famfs_dax_notify_failure,
+};
+
+/*****************************************************************************/
+
/*
* famfs_teardown()
*
@@ -47,9 +67,12 @@ famfs_teardown(struct fuse_conn *fc)
if (!dd->valid)
continue;
- /* Release reference from dax_dev_get() */
- if (dd->devp)
+ /* Only call fs_put_dax if fs_dax_get succeeded */
+ if (dd->devp) {
+ if (!dd->dax_err)
+ fs_put_dax(dd->devp, fc);
put_dax(dd->devp);
+ }
kfree(dd->name);
}
@@ -169,6 +192,17 @@ famfs_fuse_get_daxdev(struct fuse_mount *fm, const u64 index)
return -ENODEV;
}
+ rc = fs_dax_get(daxdev->devp, fc, &famfs_fuse_dax_holder_ops);
+ if (rc) {
+ /* Mark as valid with dax_err to prevent retry loop.
+ * famfs_dax_err() will return -EIO on access attempts.
+ * Teardown handles this case: skips fs_put_dax, calls put_dax.
+ */
+ daxdev->dax_err = 1;
+ pr_err("%s: fs_dax_get(%lld) failed\n",
+ __func__, (u64)daxdev->devno);
+ }
+
wmb(); /* All other fields must be visible before valid */
daxdev->valid = 1;
}
@@ -244,6 +278,36 @@ famfs_update_daxdev_table(
return 0;
}
+static void
+famfs_set_daxdev_err(
+	struct fuse_conn *fc,
+	struct dax_device *dax_devp)
+{
+	int i;
+
+	/* Flag the valid daxdev entry matching dax_devp as having an error;
+	 * read lock because we're not adding or removing daxdev entries
+	 */
+	scoped_guard(rwsem_read, &fc->famfs_devlist_sem) {
+		for (i = 0; i < fc->dax_devlist->nslots; i++) {
+			if (fc->dax_devlist->devlist[i].valid) {
+				struct famfs_daxdev *dd;
+
+				dd = &fc->dax_devlist->devlist[i];
+				if (dd->devp != dax_devp)
+					continue;
+
+				dd->error = true;
+
+				pr_err("%s: memory error on daxdev %s (%d)\n",
+				       __func__, dd->name, i);
+				return;
+			}
+		}
+	}
+	pr_err("%s: memory err on unrecognized daxdev\n", __func__);
+}
+
/***************************************************************************/
void __famfs_meta_free(void *famfs_meta)
@@ -585,6 +649,26 @@ famfs_file_init_dax(
static int famfs_file_bad(struct inode *inode);
+static int famfs_dax_err(struct famfs_daxdev *dd)
+{
+ if (!dd->valid) {
+ pr_err("%s: daxdev=%s invalid\n",
+ __func__, dd->name);
+ return -EIO;
+ }
+ if (dd->dax_err) {
+ pr_err("%s: daxdev=%s dax_err\n",
+ __func__, dd->name);
+ return -EIO;
+ }
+ if (dd->error) {
+ pr_err("%s: daxdev=%s memory error\n",
+ __func__, dd->name);
+ return -EHWPOISON;
+ }
+ return 0;
+}
+
static int
famfs_interleave_fileofs_to_daxofs(struct inode *inode, struct iomap *iomap,
loff_t file_offset, off_t len, unsigned int flags)
@@ -624,6 +708,7 @@ famfs_interleave_fileofs_to_daxofs(struct inode *inode, struct iomap *iomap,
/* Is the data is in this striped extent? */
if (local_offset < ext_size) {
+ struct famfs_daxdev *dd;
u64 chunk_num = local_offset / chunk_size;
u64 chunk_offset = local_offset % chunk_size;
u64 chunk_remainder = chunk_size - chunk_offset;
@@ -632,6 +717,7 @@ famfs_interleave_fileofs_to_daxofs(struct inode *inode, struct iomap *iomap,
u64 strip_offset = chunk_offset + (stripe_num * chunk_size);
u64 strip_dax_ofs = fei->ie_strips[strip_num].ext_offset;
u64 strip_devidx = fei->ie_strips[strip_num].dev_index;
+ int rc;
if (strip_devidx >= fc->dax_devlist->nslots) {
pr_err("%s: strip_devidx %llu >= nslots %d\n",
@@ -646,6 +732,15 @@ famfs_interleave_fileofs_to_daxofs(struct inode *inode, struct iomap *iomap,
goto err_out;
}
+ dd = &fc->dax_devlist->devlist[strip_devidx];
+
+ rc = famfs_dax_err(dd);
+ if (rc) {
+ /* Shut down access to this file */
+ meta->error = true;
+ return rc;
+ }
+
iomap->addr = strip_dax_ofs + strip_offset;
iomap->offset = file_offset;
iomap->length = min_t(loff_t, len, chunk_remainder);
@@ -743,6 +838,7 @@ famfs_fileofs_to_daxofs(struct inode *inode, struct iomap *iomap,
if (local_offset < dax_ext_len) {
loff_t ext_len_remainder = dax_ext_len - local_offset;
struct famfs_daxdev *dd;
+ int rc;
if (daxdev_idx >= fc->dax_devlist->nslots) {
pr_err("%s: daxdev_idx %llu >= nslots %d\n",
@@ -753,11 +849,11 @@ famfs_fileofs_to_daxofs(struct inode *inode, struct iomap *iomap,
dd = &fc->dax_devlist->devlist[daxdev_idx];
- if (!dd->valid || dd->error) {
- pr_err("%s: daxdev=%lld %s\n", __func__,
- daxdev_idx,
- dd->valid ? "error" : "invalid");
- goto err_out;
+ rc = famfs_dax_err(dd);
+ if (rc) {
+ /* Shut down access to this file */
+ meta->error = true;
+ return rc;
}
/*
diff --git a/fs/fuse/famfs_kfmap.h b/fs/fuse/famfs_kfmap.h
index eb9f70b5cb81..0fff841f5a9e 100644
--- a/fs/fuse/famfs_kfmap.h
+++ b/fs/fuse/famfs_kfmap.h
@@ -73,7 +73,8 @@ struct famfs_file_meta {
struct famfs_daxdev {
/* Include dev uuid? */
bool valid;
- bool error;
+ bool error; /* Dax has reported a memory error (probably poison) */
+ bool dax_err; /* fs_dax_get() failed */
dev_t devno;
struct dax_device *devp;
char *name;
--
2.53.0
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH V10 08/10] famfs_fuse: Add DAX address_space_operations with noop_dirty_folio
2026-03-31 12:37 ` [PATCH V10 00/10] famfs: port into fuse John Groves
` (6 preceding siblings ...)
2026-03-31 12:39 ` [PATCH V10 07/10] famfs_fuse: Add holder_operations for dax notify_failure() John Groves
@ 2026-03-31 12:39 ` John Groves
2026-03-31 12:39 ` [PATCH V10 09/10] famfs_fuse: Add famfs fmap metadata documentation John Groves
` (2 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: John Groves @ 2026-03-31 12:39 UTC (permalink / raw)
To: John Groves, Miklos Szeredi, Dan Williams, Bernd Schubert,
Alison Schofield
Cc: John Groves, Jonathan Corbet, Shuah Khan, Vishal Verma,
Dave Jiang, Matthew Wilcox, Jan Kara, Alexander Viro,
David Hildenbrand, Christian Brauner, Darrick J . Wong,
Randy Dunlap, Jeff Layton, Amir Goldstein, Jonathan Cameron,
Stefan Hajnoczi, Joanne Koong, Josef Bacik, Bagas Sanjaya,
Chen Linxuan, James Morse, Fuad Tabba, Sean Christopherson,
Shivank Garg, Ackerley Tng, Gregory Price, Aravind Ramesh,
Ajay Joshi, venkataravis@micron.com, linux-doc@vger.kernel.org,
linux-kernel@vger.kernel.org, nvdimm@lists.linux.dev,
linux-cxl@vger.kernel.org, linux-fsdevel@vger.kernel.org,
John Groves
From: John Groves <John@Groves.net>
Famfs is memory-backed; there is no place to write back to, and no
reason to mark pages dirty at all.
Signed-off-by: John Groves <john@groves.net>
---
fs/fuse/famfs.c | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/fs/fuse/famfs.c b/fs/fuse/famfs.c
index 87012df537eb..121ed74e9727 100644
--- a/fs/fuse/famfs.c
+++ b/fs/fuse/famfs.c
@@ -14,6 +14,7 @@
#include <linux/mm.h>
#include <linux/dax.h>
#include <linux/iomap.h>
+#include <linux/pagemap.h>
#include <linux/path.h>
#include <linux/namei.h>
#include <linux/string.h>
@@ -39,6 +40,15 @@ static const struct dax_holder_operations famfs_fuse_dax_holder_ops = {
.notify_failure = famfs_dax_notify_failure,
};
+/*
+ * DAX address_space_operations for famfs.
+ * famfs doesn't need dirty tracking - writes go directly to
+ * memory with no writeback required.
+ */
+static const struct address_space_operations famfs_dax_aops = {
+ .dirty_folio = noop_dirty_folio,
+};
+
/*****************************************************************************/
/*
@@ -624,6 +634,7 @@ famfs_file_init_dax(
if (famfs_meta_set(fi, meta) == NULL) {
i_size_write(inode, meta->file_size);
inode->i_flags |= S_DAX;
+ inode->i_data.a_ops = &famfs_dax_aops;
} else {
pr_debug("%s: file already had metadata\n", __func__);
__famfs_meta_free(meta);
--
2.53.0
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH V10 09/10] famfs_fuse: Add famfs fmap metadata documentation
2026-03-31 12:37 ` [PATCH V10 00/10] famfs: port into fuse John Groves
` (7 preceding siblings ...)
2026-03-31 12:39 ` [PATCH V10 08/10] famfs_fuse: Add DAX address_space_operations with noop_dirty_folio John Groves
@ 2026-03-31 12:39 ` John Groves
2026-03-31 12:39 ` [PATCH V10 10/10] famfs_fuse: Add documentation John Groves
2026-04-01 15:15 ` [PATCH V10 00/10] famfs: port into fuse John Groves
10 siblings, 0 replies; 12+ messages in thread
From: John Groves @ 2026-03-31 12:39 UTC (permalink / raw)
To: John Groves, Miklos Szeredi, Dan Williams, Bernd Schubert,
Alison Schofield
Cc: John Groves, Jonathan Corbet, Shuah Khan, Vishal Verma,
Dave Jiang, Matthew Wilcox, Jan Kara, Alexander Viro,
David Hildenbrand, Christian Brauner, Darrick J . Wong,
Randy Dunlap, Jeff Layton, Amir Goldstein, Jonathan Cameron,
Stefan Hajnoczi, Joanne Koong, Josef Bacik, Bagas Sanjaya,
Chen Linxuan, James Morse, Fuad Tabba, Sean Christopherson,
Shivank Garg, Ackerley Tng, Gregory Price, Aravind Ramesh,
Ajay Joshi, venkataravis@micron.com, linux-doc@vger.kernel.org,
linux-kernel@vger.kernel.org, nvdimm@lists.linux.dev,
linux-cxl@vger.kernel.org, linux-fsdevel@vger.kernel.org,
John Groves
From: John Groves <John@Groves.net>
This describes the fmap metadata - both simple and interleaved
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: John Groves <john@groves.net>
---
fs/fuse/famfs_kfmap.h | 73 +++++++++++++++++++++++++++++++++++++++++++
1 file changed, 73 insertions(+)
diff --git a/fs/fuse/famfs_kfmap.h b/fs/fuse/famfs_kfmap.h
index 0fff841f5a9e..970ad802b492 100644
--- a/fs/fuse/famfs_kfmap.h
+++ b/fs/fuse/famfs_kfmap.h
@@ -7,6 +7,79 @@
#ifndef FAMFS_KFMAP_H
#define FAMFS_KFMAP_H
+/* KABI version 43 (aka v2) fmap structures
+ *
+ * The location of the memory backing for a famfs file is described by
+ * the response to the GET_FMAP fuse message (defined in
+ * include/uapi/linux/fuse.h).
+ *
+ * There are currently two extent formats: Simple and Interleaved.
+ *
+ * Simple extents are just (devindex, offset, length) tuples, where devindex
+ * references a devdax device that must be retrievable via the GET_DAXDEV
+ * message/response.
+ *
+ * The extent list size must be >= file_size.
+ *
+ * Interleaved extents merit some additional explanation. Interleaved
+ * extents stripe data across a collection of strips. Each strip is a
+ * contiguous allocation from a single devdax device - and is described by
+ * a simple_extent structure.
+ *
+ * Interleaved_extent example:
+ * ie_nstrips = 4
+ * ie_chunk_size = 2MiB
+ * ie_nbytes = 24MiB
+ *
+ * ┌────────────┐────────────┐────────────┐────────────┐
+ * │Chunk = 0 │Chunk = 1 │Chunk = 2 │Chunk = 3 │
+ * │Strip = 0 │Strip = 1 │Strip = 2 │Strip = 3 │
+ * │Stripe = 0 │Stripe = 0 │Stripe = 0 │Stripe = 0 │
+ * │ │ │ │ │
+ * └────────────┘────────────┘────────────┘────────────┘
+ * │Chunk = 4 │Chunk = 5 │Chunk = 6 │Chunk = 7 │
+ * │Strip = 0 │Strip = 1 │Strip = 2 │Strip = 3 │
+ * │Stripe = 1 │Stripe = 1 │Stripe = 1 │Stripe = 1 │
+ * │ │ │ │ │
+ * └────────────┘────────────┘────────────┘────────────┘
+ * │Chunk = 8 │Chunk = 9 │Chunk = 10 │Chunk = 11 │
+ * │Strip = 0 │Strip = 1 │Strip = 2 │Strip = 3 │
+ * │Stripe = 2 │Stripe = 2 │Stripe = 2 │Stripe = 2 │
+ * │ │ │ │ │
+ * └────────────┘────────────┘────────────┘────────────┘
+ *
+ * * Data is laid out across chunks in chunk # order
+ * * Columns are strips
+ * * Strips are contiguous devdax extents, normally each coming from a
+ * different memory device
+ * * Rows are stripes
+ * * The number of chunks is (int)((file_size + chunk_size - 1) / chunk_size)
+ * (and obviously the last chunk could be partial)
+ * * The stripe_size = (nstrips * chunk_size)
+ * * chunk_num(offset) = offset / chunk_size //integer division
+ * * strip_num(offset) = chunk_num(offset) % nstrips
+ * * stripe_num(offset) = offset / stripe_size //integer division
+ * * ...You get the idea - see the code for more details...
+ *
+ * Some concrete examples from the layout above:
+ * * Offset 0 in the file is offset 0 in chunk 0, which is offset 0 in
+ * strip 0
+ * * Offset 4MiB in the file is offset 0 in chunk 2, which is offset 0 in
+ * strip 2
+ * * Offset 15MiB in the file is offset 1MiB in chunk 7, which is offset
+ * 3MiB in strip 3
+ *
+ * Notes about this metadata format:
+ *
+ * * For various reasons, chunk_size must be a multiple of the applicable
+ * PAGE_SIZE
+ * * Since chunk_size and nstrips are constant within an interleaved_extent,
+ * resolving a file offset to a strip offset within a single
+ * interleaved_ext is order 1.
+ * * If nstrips==1, a list of interleaved_ext structures degenerates to a
+ * regular extent list (albeit with some wasted struct space).
+ */
+
/*
* The structures below are the in-memory metadata format for famfs files.
* Metadata retrieved via the GET_FMAP response is converted to this format
--
2.53.0
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH V10 10/10] famfs_fuse: Add documentation
2026-03-31 12:37 ` [PATCH V10 00/10] famfs: port into fuse John Groves
` (8 preceding siblings ...)
2026-03-31 12:39 ` [PATCH V10 09/10] famfs_fuse: Add famfs fmap metadata documentation John Groves
@ 2026-03-31 12:39 ` John Groves
2026-04-01 15:15 ` [PATCH V10 00/10] famfs: port into fuse John Groves
10 siblings, 0 replies; 12+ messages in thread
From: John Groves @ 2026-03-31 12:39 UTC (permalink / raw)
To: John Groves, Miklos Szeredi, Dan Williams, Bernd Schubert,
Alison Schofield
Cc: John Groves, Jonathan Corbet, Shuah Khan, Vishal Verma,
Dave Jiang, Matthew Wilcox, Jan Kara, Alexander Viro,
David Hildenbrand, Christian Brauner, Darrick J . Wong,
Randy Dunlap, Jeff Layton, Amir Goldstein, Jonathan Cameron,
Stefan Hajnoczi, Joanne Koong, Josef Bacik, Bagas Sanjaya,
Chen Linxuan, James Morse, Fuad Tabba, Sean Christopherson,
Shivank Garg, Ackerley Tng, Gregory Price, Aravind Ramesh,
Ajay Joshi, venkataravis@micron.com, linux-doc@vger.kernel.org,
linux-kernel@vger.kernel.org, nvdimm@lists.linux.dev,
linux-cxl@vger.kernel.org, linux-fsdevel@vger.kernel.org,
John Groves, Jonathan Cameron
From: John Groves <john@groves.net>
Add Documentation/filesystems/famfs.rst and update MAINTAINERS
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Signed-off-by: John Groves <john@groves.net>
---
Documentation/filesystems/famfs.rst | 142 ++++++++++++++++++++++++++++
Documentation/filesystems/index.rst | 1 +
MAINTAINERS | 1 +
3 files changed, 144 insertions(+)
create mode 100644 Documentation/filesystems/famfs.rst
diff --git a/Documentation/filesystems/famfs.rst b/Documentation/filesystems/famfs.rst
new file mode 100644
index 000000000000..d90ce96d6fda
--- /dev/null
+++ b/Documentation/filesystems/famfs.rst
@@ -0,0 +1,142 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+.. _famfs_index:
+
+==================================================================
+famfs: The fabric-attached memory file system
+==================================================================
+
+- Copyright (C) 2024-2026 Micron Technology, Inc.
+
+Introduction
+============
+Compute Express Link (CXL) provides a mechanism for disaggregated or
+fabric-attached memory (FAM). This creates opportunities for data sharing;
+clustered apps that would otherwise have to shard or replicate data can
+share one copy in disaggregated memory.
+
+Famfs, which is not CXL-specific in any way, provides a mechanism for
+multiple hosts to concurrently access data in shared memory, by giving it
+a file system interface. With famfs, any app that understands files can
+access data sets in shared memory. Although famfs supports read and write,
+the real point is to support mmap, which provides direct (dax) access to
+the memory - either writable or read-only.
+
+Shared memory can pose complex coherency and synchronization issues, but
+there are also simple cases. Two simple and eminently useful patterns that
+occur frequently in data analytics and AI are:
+
+* Serial Sharing - Only one host or process at a time has access to a file
+* Read-only Sharing - Multiple hosts or processes share read-only access
+ to a file
+
+The famfs fuse file system is part of the famfs framework; user space
+components [1] handle metadata allocation and distribution, and provide a
+low-level fuse server to expose files that map directly to [presumably
+shared] memory.
+
+The famfs framework manages coherency of its own metadata and structures,
+but does not attempt to manage coherency for applications.
+
+Famfs also provides data isolation between files. That is, even though
+the host has access to an entire memory "device" (as a devdax device), apps
+cannot write to memory for which the file is read-only, and mapping one
+file provides isolation from the memory of all other files. This is pretty
+basic, but some experimental shared memory usage patterns provide no such
+isolation.
+
+Principles of Operation
+=======================
+
+Famfs is a file system with one or more devdax devices as first-class
+backing devices. Metadata maintenance and query operations happen
+entirely in user space.
+
+The famfs low-level fuse server daemon provides file maps (fmaps) and
+devdax device info to the fuse/famfs kernel component so that
+read/write/mapping faults can be handled without up-calls for all active
+files.
+
+The famfs user space is responsible for maintaining and distributing
+consistent metadata. This is currently handled via an append-only
+metadata log within the memory, but this is orthogonal to the fuse/famfs
+kernel code.
+
+Once instantiated, "the same file" on each host points to the same shared
+memory, but in-memory metadata (inodes, etc.) is ephemeral on each host
+that has a famfs instance mounted. Use cases are free to allow or not
+allow mutations to data on a file-by-file basis.
+
+When an app accesses a data object in a famfs file, there is no page cache
+involvement. The CPU cache is loaded directly from the shared memory. In
+some use cases, this is an enormous reduction in read amplification
+compared to loading an entire page into the page cache.
+
+
+Famfs is Not a Conventional File System
+---------------------------------------
+
+Famfs files can be accessed by conventional means, but there are
+limitations. The kernel component of fuse/famfs is not involved in the
+allocation of backing memory for files at all; the famfs user space
+creates files and responds as a low-level fuse server with fmaps and
+devdax device info upon request.
+
+Famfs differs in some important ways from conventional file systems:
+
+* Files must be pre-allocated by the famfs framework; allocation is never
+ performed on (or after) write.
+* Any operation that changes a file's size is considered to put the file
+ in an invalid state, disabling access to the data. It may be possible to
+ revisit this in the future. (Typically the famfs user space can restore
+ files to a valid state by replaying the famfs metadata log.)
+
+Famfs exists to apply the existing file system abstractions to shared
+memory so applications and workflows can more easily adapt to an
+environment with disaggregated shared memory.
+
+Memory Error Handling
+=====================
+
+Possible memory errors include timeouts, poison, and unexpected
+reconfiguration of an underlying dax device. In all of these cases, famfs
+receives a call from the devdax layer via its dax_holder_operations
+notify_failure() handler. If any memory errors have been detected, access
+to the affected daxdev is disabled to avoid further errors or corruption.
+
+In all known cases, famfs can be unmounted cleanly. In most cases errors
+can be cleared by re-initializing the memory - at which point a new famfs
+file system can be created.
+
+Key Requirements
+================
+
+The primary requirements for famfs are:
+
+1. Must support a file system abstraction backed by sharable devdax memory
+2. Files must efficiently handle VMA faults
+3. Must support metadata distribution in a sharable way
+4. Must handle clients with a stale copy of metadata
+
+The famfs kernel component takes care of 1-2 above by caching each file's
+mapping metadata in the kernel.
+
+Requirements 3 and 4 are handled by the user space components, and are
+largely orthogonal to the functionality of the famfs kernel module.
+
+Requirements 3 and 4 cannot be met by conventional fs-dax file systems
+(e.g. xfs) because they use write-back metadata; it is not valid to mount
+such a file system on two hosts from the same in-memory image.
+
+
+Famfs Usage
+===========
+
+Famfs usage is documented at [1].
+
+
+References
+==========
+
+- [1] Famfs user space repository and documentation
+ https://github.com/cxl-micron-reskit/famfs
diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst
index f4873197587d..e6fb467c1680 100644
--- a/Documentation/filesystems/index.rst
+++ b/Documentation/filesystems/index.rst
@@ -89,6 +89,7 @@ Documentation for filesystem implementations.
ext3
ext4/index
f2fs
+ famfs
gfs2/index
hfs
hfsplus
diff --git a/MAINTAINERS b/MAINTAINERS
index 4edb56afb947..739388d290c1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10529,6 +10529,7 @@ M: John Groves <John@Groves.net>
L: linux-cxl@vger.kernel.org
L: linux-fsdevel@vger.kernel.org
S: Supported
+F: Documentation/filesystems/famfs.rst
F: fs/fuse/famfs.c
F: fs/fuse/famfs_kfmap.h
--
2.53.0
^ permalink raw reply related [flat|nested] 12+ messages in thread* Re: [PATCH V10 00/10] famfs: port into fuse
2026-03-31 12:37 ` [PATCH V10 00/10] famfs: port into fuse John Groves
` (9 preceding siblings ...)
2026-03-31 12:39 ` [PATCH V10 10/10] famfs_fuse: Add documentation John Groves
@ 2026-04-01 15:15 ` John Groves
10 siblings, 0 replies; 12+ messages in thread
From: John Groves @ 2026-04-01 15:15 UTC (permalink / raw)
To: John Groves
Cc: Miklos Szeredi, Dan Williams, Bernd Schubert, Alison Schofield,
John Groves, Jonathan Corbet, Shuah Khan, Vishal Verma,
Dave Jiang, Matthew Wilcox, Jan Kara, Alexander Viro,
David Hildenbrand, Christian Brauner, Darrick J . Wong,
Randy Dunlap, Jeff Layton, Amir Goldstein, Jonathan Cameron,
Stefan Hajnoczi, Joanne Koong, Josef Bacik, Bagas Sanjaya,
Chen Linxuan, James Morse, Fuad Tabba, Sean Christopherson,
Shivank Garg, Ackerley Tng, Gregory Price, Aravind Ramesh,
Ajay Joshi, venkataravis@micron.com, linux-doc@vger.kernel.org,
linux-kernel@vger.kernel.org, nvdimm@lists.linux.dev,
linux-cxl@vger.kernel.org, linux-fsdevel@vger.kernel.org
On 26/03/31 12:37PM, John Groves wrote:
> From: John Groves <john@groves.net>
>
> NOTE: this series depends on the famfs dax series in Ira's for-7.1/dax-famfs
> branch [0]
>
> Changes v9 -> v10
> - Rebased to Ira's for-7.1/dax-famfs branch [0], which contains the required
> dax patches
> - Add parentheses to FUSE_IS_VIRTIO_DAX() macro, in case something bad is
> passed in as fuse_inode (thanks Jonathan's AI)
>
> Description:
>
> This patch series introduces famfs into the fuse file system framework.
> Famfs depends on the bundled dax patch set.
>
> The famfs user space code can be found at [1].
>
> Fuse Overview:
>
> Famfs started as a standalone file system, but this series is intended to
> permanently supersede that implementation. At a high level, famfs adds
> two new fuse server messages:
>
> GET_FMAP - Retrieves a famfs fmap (the file-to-dax map for a famfs
> file)
> GET_DAXDEV - Retrieves the details of a particular daxdev that was
> referenced by an fmap
>
> Famfs Overview
>
> Famfs exposes shared memory as a file system. Famfs consumes shared
> memory from dax devices, and provides memory-mappable files that map
> directly to the memory - no page cache involvement. Famfs differs from
> conventional file systems in fs-dax mode, in that it handles in-memory
> metadata in a sharable way (which begins with never caching dirty shared
> metadata).
>
> Famfs started as a standalone file system [2,3], but the consensus at
> LSFMM was that it should be ported into fuse [4,5].
>
> The key performance requirement is that famfs must resolve mapping faults
> without upcalls. This is achieved by fully caching the file-to-devdax
> metadata for all active files. This is done via two fuse client/server
> message/response pairs: GET_FMAP and GET_DAXDEV.
>
> Famfs remains the first fs-dax file system that is backed by devdax
> rather than pmem in fs-dax mode (hence the need for the new dax mode).
>
> Notes
>
> - When a file is opened in a famfs mount, the OPEN is followed by a
> GET_FMAP message and response. The "fmap" is the full file-to-dax
> mapping, allowing the fuse/famfs kernel code to handle
> read/write/fault without any upcalls.
>
> - After each GET_FMAP, the fmap is checked for extents that reference
> previously-unknown daxdevs. Each such occurrence is handled with a
> GET_DAXDEV message and response.
>
> - Daxdevs are stored in a table (which might become an xarray at some
> point). When entries are added to the table, we acquire exclusive
> access to the daxdev via the fs_dax_get() call (modeled after how
> fs-dax handles this with pmem devices). Famfs provides
> holder_operations to devdax, providing a notification path in the
> event of memory errors or forced reconfiguration.
>
> - If devdax notifies famfs of memory errors on a dax device, famfs
> currently blocks all subsequent accesses to data on that device. The
> recovery is to re-initialize the memory and file system. Famfs is
> memory, not storage...
>
> - Because famfs uses backing (devdax) devices, only privileged mounts are
> supported (i.e. the fuse server requires CAP_SYS_RAWIO).
>
> - The famfs kernel code never accesses the memory directly - it only
> facilitates read, write and mmap on behalf of user processes, using
> fmap metadata provided by its privileged fuse server. As such, the
> RAS of the shared memory affects applications, but not the kernel.
>
> - Famfs has backing device(s), but they are devdax (char) rather than
> block. Right now there is no way to tell the vfs layer that famfs has a
> char backing device (unless we say it's block, but it's not). Currently
> we use the standard anonymous fuse fs_type - but I'm not sure that's
> ultimately optimal (thoughts?)
>
> Changes v8 -> v9
> - Kconfig: fs/fuse/Kconfig:CONFIG_FUSE_FAMFS_DAX now depends on the
> new CONFIG_DEV_DAX_FSDEV (from drivers/dax/Kconfig) rather than
> just CONFIG_DEV_DAX and CONFIG_FS_DAX. (CONFIG_FUSE_FAMFS_DAX
> depends on those...)
>
> Changes v7 -> v8
> - Moved to inline __free declaration in fuse_get_fmap() and
> famfs_fuse_meta_alloc(), famfs_teardown()
> - Adopted FIELD_PREP() macro rather than manual bitfield manipulation
> - Minor doc edits
> - I dropped adding magic numbers to include/uapi/linux/magic.h. That
> can be done later if appropriate
>
> Changes v6 -> v7
> - Fixed a regression in famfs_interleave_fileofs_to_daxofs() that
> was reported by Intel's kernel test robot
> - Added a check in __fsdev_dax_direct_access() for negative return
> from pgoff_to_phys(), which would indicate an out-of-range offset
> - Fixed a bug in __famfs_meta_free(), where not all interleaved
> extents were freed
> - Added chunksize alignment checks in famfs_fuse_meta_alloc() and
> famfs_interleave_fileofs_to_daxofs() as interleaved chunks must
> be PTE or PMD aligned
> - Simplified famfs_file_init_dax() a bit
> - Re-ran CM's kernel code review prompts on the entire series and
> fixed several minor issues
>
> Changes v4 -> v5 -> v6
> - None. Re-sending due to technical difficulties
>
> Changes v3 [9] -> v4
> - The patch "dax: prevent driver unbind while filesystem holds device"
> has been dropped. Dan Williams indicated that the favored behavior is
> for a file system to stop working if an underlying driver is unbound,
> rather than preventing the unbind.
> - The patch "famfs_fuse: Famfs mount opt: -o shadow=<shadowpath>" has
> been dropped. Found a way for the famfs user space to do without the
> -o opt (via getxattr).
> - Squashed the fs/fuse/Kconfig patch into the first subsequent patch
> that needed the change
> ("famfs_fuse: Basic fuse kernel ABI enablement for famfs")
> - Many review comments addressed.
> - Addressed minor kerneldoc infractions reported by test robot.
>
> Changes v2 [7] -> v3
> - Dax: Completely new fsdev driver (drivers/dax/fsdev.c) replaces the
> dev_dax_iomap modifications to bus.c/device.c. Devdax devices can now
> be switched among 'devdax', 'famfs' and 'system-ram' modes via daxctl
> or sysfs.
> - Dax: fsdev uses MEMORY_DEVICE_FS_DAX type and leaves folios at order-0
> (no vmemmap_shift), allowing fs-dax to manage folio lifecycles
> dynamically like pmem does.
> - Dax: The "poisoned page" problem is properly fixed via
> fsdev_clear_folio_state(), which clears stale mapping/compound state
> when fsdev binds. The temporary WARN_ON_ONCE workaround in fs/dax.c
> has been removed.
> - Dax: Added dax_set_ops() so fsdev can set dax_operations at bind time
> (and clear them on unbind), since the dax_device is created before we
> know which driver will bind.
> - Dax: Added custom bind/unbind sysfs handlers; unbind returns -EBUSY if a
> filesystem holds the device, preventing unbind while famfs is mounted.
> - Fuse: Famfs mounts now require that the fuse server/daemon has
> CAP_SYS_RAWIO because they expose raw memory devices.
> - Fuse: Added DAX address_space_operations with noop_dirty_folio since
> famfs is memory-backed with no writeback required.
> - Rebased to latest kernels, fully compatible with Alistair Popple
> et al.'s recent dax refactoring.
> - Ran this series through Chris Mason's code review AI prompts to check
> for issues - several subtle problems found and fixed.
> - Dropped RFC status - this version is intended to be mergeable.
>
> Changes v1 [8] -> v2:
>
> - The GET_FMAP message/response has been moved from LOOKUP to OPEN, as
> was the pretty much unanimous consensus.
> - Made the response payload to GET_FMAP variable sized (patch 12)
> - Dodgy kerneldoc comments cleaned up or removed.
> - Fixed memory leak of fc->shadow in patch 11 (thanks Joanne)
> - Dropped many pr_debug and pr_notice calls
>
>
> References
>
> [0] - https://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm.git/
> [1] - https://famfs.org (famfs user space)
> [2] - https://lore.kernel.org/linux-cxl/cover.1708709155.git.john@groves.net/
> [3] - https://lore.kernel.org/linux-cxl/cover.1714409084.git.john@groves.net/
> [4] - https://lwn.net/Articles/983105/ (lsfmm 2024)
> [5] - https://lwn.net/Articles/1020170/ (lsfmm 2025)
> [6] - https://lore.kernel.org/linux-cxl/cover.8068ad144a7eea4a813670301f4d2a86a8e68ec4.1740713401.git-series.apopple@nvidia.com/
> [7] - https://lore.kernel.org/linux-fsdevel/20250703185032.46568-1-john@groves.net/ (famfs fuse v2)
> [8] - https://lore.kernel.org/linux-fsdevel/20250421013346.32530-1-john@groves.net/ (famfs fuse v1)
> [9] - https://lore.kernel.org/linux-fsdevel/20260107153244.64703-1-john@groves.net/T/#mb2c868801be16eca82dab239a1d201628534aea7 (famfs fuse v3)
>
>
> John Groves (10):
> famfs_fuse: Update macro s/FUSE_IS_DAX/FUSE_IS_VIRTIO_DAX/
> famfs_fuse: Basic fuse kernel ABI enablement for famfs
> famfs_fuse: Plumb the GET_FMAP message/response
> famfs_fuse: Create files with famfs fmaps
> famfs_fuse: GET_DAXDEV message and daxdev_table
> famfs_fuse: Plumb dax iomap and fuse read/write/mmap
> famfs_fuse: Add holder_operations for dax notify_failure()
> famfs_fuse: Add DAX address_space_operations with noop_dirty_folio
> famfs_fuse: Add famfs fmap metadata documentation
> famfs_fuse: Add documentation
>
> Documentation/filesystems/famfs.rst | 142 ++++
> Documentation/filesystems/index.rst | 1 +
> MAINTAINERS | 10 +
> fs/fuse/Kconfig | 13 +
> fs/fuse/Makefile | 1 +
> fs/fuse/dir.c | 2 +-
> fs/fuse/famfs.c | 1180 +++++++++++++++++++++++++++
> fs/fuse/famfs_kfmap.h | 167 ++++
> fs/fuse/file.c | 45 +-
> fs/fuse/fuse_i.h | 116 ++-
> fs/fuse/inode.c | 35 +-
> fs/fuse/iomode.c | 2 +-
> fs/namei.c | 1 +
> include/uapi/linux/fuse.h | 88 ++
> 14 files changed, 1790 insertions(+), 13 deletions(-)
> create mode 100644 Documentation/filesystems/famfs.rst
> create mode 100644 fs/fuse/famfs.c
> create mode 100644 fs/fuse/famfs_kfmap.h
>
>
> base-commit: 2ae624d5a555d47a735fb3f4d850402859a4db77
> --
> 2.53.0
>
>
Miklos,
I would appreciate a read on what you're thinking WRT merging famfs. The
dax patches are ready; this series should be applied on top of Ira's
for-7.1/dax-famfs branch, which is at [1].
I saw that you had the famfs series in your for-next branch briefly a
couple of weeks ago, but it didn't build because it depends on the dax
series. It will build and run cleanly if you put it on Ira's branch above.
Famfs has been in use for a long time, though availability of sharable cxl
memory is still limited; that is changing with early availability (now) of
sharable JBOMs up to 100TB.
The presence of famfs won't affect anybody who doesn't use it though...
What are your thoughts?
Thanks,
John
[1] - https://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm.git/
^ permalink raw reply [flat|nested] 12+ messages in thread