Linux EXT4 FS development
 help / color / mirror / Atom feed
* [PATCH v9 03/17] fat: Implement fileattr_get for case sensitivity
From: Chuck Lever @ 2026-04-22 23:29 UTC (permalink / raw)
  To: Al Viro, Christian Brauner, Jan Kara
  Cc: linux-fsdevel, linux-ext4, linux-xfs, linux-cifs, linux-nfs,
	linux-api, linux-f2fs-devel, hirofumi, linkinjeon, sj1557.seo,
	yuezhang.mo, almaz.alexandrovich, slava, glaubitz, frank.li,
	tytso, adilger.kernel, cem, sfrench, pc, ronniesahlberg, sprasad,
	trondmy, anna, jaegeuk, chao, hansg, senozhatsky, Chuck Lever
In-Reply-To: <20260422-case-sensitivity-v9-0-be023cc070e2@oracle.com>

From: Chuck Lever <chuck.lever@oracle.com>

Report FAT's case sensitivity behavior via the FS_XFLAG_CASEFOLD
and FS_XFLAG_CASENONPRESERVING flags. FAT filesystems are
case-insensitive by default.

MSDOS supports a 'nocase' mount option that enables case-sensitive
behavior; check this option when reporting case sensitivity.

VFAT long filename entries preserve case; without VFAT, only
uppercased 8.3 short names are stored. MSDOS with 'nocase' also
preserves case since the name-formatting code skips upcasing when
'nocase' is set. Check both options when reporting case preservation.

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/fat/fat.h         |  3 +++
 fs/fat/file.c        | 23 +++++++++++++++++++++++
 fs/fat/namei_msdos.c |  1 +
 fs/fat/namei_vfat.c  |  1 +
 4 files changed, 28 insertions(+)

diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 5a58f0bf8ce8..99ed9228a677 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -10,6 +10,8 @@
 #include <linux/fs_context.h>
 #include <linux/fs_parser.h>
 
+struct file_kattr;
+
 /*
  * vfat shortname flags
  */
@@ -408,6 +410,7 @@ extern void fat_truncate_blocks(struct inode *inode, loff_t offset);
 extern int fat_getattr(struct mnt_idmap *idmap,
 		       const struct path *path, struct kstat *stat,
 		       u32 request_mask, unsigned int flags);
+int fat_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
 extern int fat_file_fsync(struct file *file, loff_t start, loff_t end,
 			  int datasync);
 
diff --git a/fs/fat/file.c b/fs/fat/file.c
index becccdd2e501..34d8588fcd3f 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -17,6 +17,7 @@
 #include <linux/fsnotify.h>
 #include <linux/security.h>
 #include <linux/falloc.h>
+#include <linux/fileattr.h>
 #include "fat.h"
 
 static long fat_fallocate(struct file *file, int mode,
@@ -398,6 +399,27 @@ void fat_truncate_blocks(struct inode *inode, loff_t offset)
 	fat_flush_inodes(inode->i_sb, inode, NULL);
 }
 
+int fat_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb);
+
+	/*
+	 * FAT filesystems are case-insensitive by default. MSDOS
+	 * supports a 'nocase' mount option for case-sensitive behavior.
+	 *
+	 * VFAT long filename entries preserve case. Without VFAT, only
+	 * uppercased 8.3 short names are stored. MSDOS with 'nocase'
+	 * also preserves case.
+	 */
+	if (!sbi->options.nocase) {
+		fa->fsx_xflags |= FS_XFLAG_CASEFOLD;
+		if (!sbi->options.isvfat)
+			fa->fsx_xflags |= FS_XFLAG_CASENONPRESERVING;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(fat_fileattr_get);
+
 int fat_getattr(struct mnt_idmap *idmap, const struct path *path,
 		struct kstat *stat, u32 request_mask, unsigned int flags)
 {
@@ -575,5 +597,6 @@ EXPORT_SYMBOL_GPL(fat_setattr);
 const struct inode_operations fat_file_inode_operations = {
 	.setattr	= fat_setattr,
 	.getattr	= fat_getattr,
+	.fileattr_get	= fat_fileattr_get,
 	.update_time	= fat_update_time,
 };
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 4cc65f330fb7..0fd2971ad4b1 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -644,6 +644,7 @@ static const struct inode_operations msdos_dir_inode_operations = {
 	.rename		= msdos_rename,
 	.setattr	= fat_setattr,
 	.getattr	= fat_getattr,
+	.fileattr_get	= fat_fileattr_get,
 	.update_time	= fat_update_time,
 };
 
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 918b3756674c..e909447873e3 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -1185,6 +1185,7 @@ static const struct inode_operations vfat_dir_inode_operations = {
 	.rename		= vfat_rename2,
 	.setattr	= fat_setattr,
 	.getattr	= fat_getattr,
+	.fileattr_get	= fat_fileattr_get,
 	.update_time	= fat_update_time,
 };
 

-- 
2.53.0


^ permalink raw reply related

* [PATCH v9 02/17] fs: Add case sensitivity flags to file_kattr
From: Chuck Lever @ 2026-04-22 23:29 UTC (permalink / raw)
  To: Al Viro, Christian Brauner, Jan Kara
  Cc: linux-fsdevel, linux-ext4, linux-xfs, linux-cifs, linux-nfs,
	linux-api, linux-f2fs-devel, hirofumi, linkinjeon, sj1557.seo,
	yuezhang.mo, almaz.alexandrovich, slava, glaubitz, frank.li,
	tytso, adilger.kernel, cem, sfrench, pc, ronniesahlberg, sprasad,
	trondmy, anna, jaegeuk, chao, hansg, senozhatsky, Chuck Lever,
	Darrick J. Wong
In-Reply-To: <20260422-case-sensitivity-v9-0-be023cc070e2@oracle.com>

From: Chuck Lever <chuck.lever@oracle.com>

Enable upper layers such as NFSD to retrieve case sensitivity
information from file systems by adding FS_XFLAG_CASEFOLD and
FS_XFLAG_CASENONPRESERVING flags.

Filesystems report case-insensitive or case-nonpreserving behavior
by setting these flags directly in fa->fsx_xflags. The default
(flags unset) indicates POSIX semantics: case-sensitive and
case-preserving. These flags are read-only; userspace cannot set
them via ioctl.

Case sensitivity information is exported to userspace via the
fsx_xflags field of the FS_IOC_FSGETXATTR ioctl and the fa_xflags
field of the file_getattr() system call.

Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/file_attr.c           | 4 ++++
 include/linux/fileattr.h | 3 ++-
 include/uapi/linux/fs.h  | 7 +++++++
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/fs/file_attr.c b/fs/file_attr.c
index f429da66a317..bfb00d256dd5 100644
--- a/fs/file_attr.c
+++ b/fs/file_attr.c
@@ -37,6 +37,8 @@ void fileattr_fill_xflags(struct file_kattr *fa, u32 xflags)
 		fa->flags |= FS_PROJINHERIT_FL;
 	if (fa->fsx_xflags & FS_XFLAG_VERITY)
 		fa->flags |= FS_VERITY_FL;
+	if (fa->fsx_xflags & FS_XFLAG_CASEFOLD)
+		fa->flags |= FS_CASEFOLD_FL;
 }
 EXPORT_SYMBOL(fileattr_fill_xflags);
 
@@ -67,6 +69,8 @@ void fileattr_fill_flags(struct file_kattr *fa, u32 flags)
 		fa->fsx_xflags |= FS_XFLAG_PROJINHERIT;
 	if (fa->flags & FS_VERITY_FL)
 		fa->fsx_xflags |= FS_XFLAG_VERITY;
+	if (fa->flags & FS_CASEFOLD_FL)
+		fa->fsx_xflags |= FS_XFLAG_CASEFOLD;
 }
 EXPORT_SYMBOL(fileattr_fill_flags);
 
diff --git a/include/linux/fileattr.h b/include/linux/fileattr.h
index 3780904a63a6..58044b598016 100644
--- a/include/linux/fileattr.h
+++ b/include/linux/fileattr.h
@@ -16,7 +16,8 @@
 
 /* Read-only inode flags */
 #define FS_XFLAG_RDONLY_MASK \
-	(FS_XFLAG_PREALLOC | FS_XFLAG_HASATTR | FS_XFLAG_VERITY)
+	(FS_XFLAG_PREALLOC | FS_XFLAG_HASATTR | FS_XFLAG_VERITY | \
+	 FS_XFLAG_CASEFOLD | FS_XFLAG_CASENONPRESERVING)
 
 /* Flags to indicate valid value of fsx_ fields */
 #define FS_XFLAG_VALUES_MASK \
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 13f71202845e..2ea4c81df08f 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -254,6 +254,13 @@ struct file_attr {
 #define FS_XFLAG_DAX		0x00008000	/* use DAX for IO */
 #define FS_XFLAG_COWEXTSIZE	0x00010000	/* CoW extent size allocator hint */
 #define FS_XFLAG_VERITY		0x00020000	/* fs-verity enabled */
+/*
+ * Case handling flags (read-only, cannot be set via ioctl).
+ * Default (neither set) indicates POSIX semantics: case-sensitive
+ * lookups and case-preserving storage.
+ */
+#define FS_XFLAG_CASEFOLD	0x00040000	/* case-insensitive lookups */
+#define FS_XFLAG_CASENONPRESERVING 0x00080000	/* case not preserved */
 #define FS_XFLAG_HASATTR	0x80000000	/* no DIFLAG for this	*/
 
 /* the read-only stuff doesn't really belong here, but any other place is

-- 
2.53.0


^ permalink raw reply related

* [PATCH v9 01/17] fs: Move file_kattr initialization to callers
From: Chuck Lever @ 2026-04-22 23:29 UTC (permalink / raw)
  To: Al Viro, Christian Brauner, Jan Kara
  Cc: linux-fsdevel, linux-ext4, linux-xfs, linux-cifs, linux-nfs,
	linux-api, linux-f2fs-devel, hirofumi, linkinjeon, sj1557.seo,
	yuezhang.mo, almaz.alexandrovich, slava, glaubitz, frank.li,
	tytso, adilger.kernel, cem, sfrench, pc, ronniesahlberg, sprasad,
	trondmy, anna, jaegeuk, chao, hansg, senozhatsky, Chuck Lever,
	Darrick J. Wong
In-Reply-To: <20260422-case-sensitivity-v9-0-be023cc070e2@oracle.com>

From: Chuck Lever <chuck.lever@oracle.com>

fileattr_fill_xflags() and fileattr_fill_flags() memset the
entire file_kattr struct before populating select fields, so
callers cannot pre-set fields in fa->fsx_xflags without having
their values clobbered. Darrick Wong noted that a function
named "fill_xflags" touching more than xflags forces callers
to know implementation details beyond its apparent scope.

Drop the memset from both fill functions and initialize at the
entry points instead: ioctl_setflags(), ioctl_fssetxattr(),
the file_setattr() syscall, and xfs_ioc_fsgetxattra() now
declare fa with an aggregate initializer. ioctl_getflags(),
ioctl_fsgetxattr(), and the file_getattr() syscall already
aggregate-initialize fa to pass flags_valid/fsx_valid hints
into vfs_fileattr_get().

Subsequent patches rely on this so that ->fileattr_get()
handlers can set case-sensitivity flags (FS_XFLAG_CASEFOLD,
FS_XFLAG_CASENONPRESERVING) in fa->fsx_xflags before the fill
functions run.

Suggested-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/file_attr.c     | 12 ++++--------
 fs/xfs/xfs_ioctl.c |  2 +-
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/fs/file_attr.c b/fs/file_attr.c
index da983e105d70..f429da66a317 100644
--- a/fs/file_attr.c
+++ b/fs/file_attr.c
@@ -15,12 +15,10 @@
  * @fa:		fileattr pointer
  * @xflags:	FS_XFLAG_* flags
  *
- * Set ->fsx_xflags, ->fsx_valid and ->flags (translated xflags).  All
- * other fields are zeroed.
+ * Set ->fsx_xflags, ->fsx_valid and ->flags (translated xflags).
  */
 void fileattr_fill_xflags(struct file_kattr *fa, u32 xflags)
 {
-	memset(fa, 0, sizeof(*fa));
 	fa->fsx_valid = true;
 	fa->fsx_xflags = xflags;
 	if (fa->fsx_xflags & FS_XFLAG_IMMUTABLE)
@@ -48,11 +46,9 @@ EXPORT_SYMBOL(fileattr_fill_xflags);
  * @flags:	FS_*_FL flags
  *
  * Set ->flags, ->flags_valid and ->fsx_xflags (translated flags).
- * All other fields are zeroed.
  */
 void fileattr_fill_flags(struct file_kattr *fa, u32 flags)
 {
-	memset(fa, 0, sizeof(*fa));
 	fa->flags_valid = true;
 	fa->flags = flags;
 	if (fa->flags & FS_SYNC_FL)
@@ -325,7 +321,7 @@ int ioctl_setflags(struct file *file, unsigned int __user *argp)
 {
 	struct mnt_idmap *idmap = file_mnt_idmap(file);
 	struct dentry *dentry = file->f_path.dentry;
-	struct file_kattr fa;
+	struct file_kattr fa = {};
 	unsigned int flags;
 	int err;
 
@@ -357,7 +353,7 @@ int ioctl_fssetxattr(struct file *file, void __user *argp)
 {
 	struct mnt_idmap *idmap = file_mnt_idmap(file);
 	struct dentry *dentry = file->f_path.dentry;
-	struct file_kattr fa;
+	struct file_kattr fa = {};
 	int err;
 
 	err = copy_fsxattr_from_user(&fa, argp);
@@ -431,7 +427,7 @@ SYSCALL_DEFINE5(file_setattr, int, dfd, const char __user *, filename,
 	struct path filepath __free(path_put) = {};
 	unsigned int lookup_flags = 0;
 	struct file_attr fattr;
-	struct file_kattr fa;
+	struct file_kattr fa = {};
 	int error;
 
 	BUILD_BUG_ON(sizeof(struct file_attr) < FILE_ATTR_SIZE_VER0);
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 46e234863644..ed9b4846c05f 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -517,7 +517,7 @@ xfs_ioc_fsgetxattra(
 	xfs_inode_t		*ip,
 	void			__user *arg)
 {
-	struct file_kattr	fa;
+	struct file_kattr	fa = {};
 
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
 	xfs_fill_fsxattr(ip, XFS_ATTR_FORK, &fa);

-- 
2.53.0


^ permalink raw reply related

* [PATCH v9 00/17] Exposing case folding behavior
From: Chuck Lever @ 2026-04-22 23:29 UTC (permalink / raw)
  To: Al Viro, Christian Brauner, Jan Kara
  Cc: linux-fsdevel, linux-ext4, linux-xfs, linux-cifs, linux-nfs,
	linux-api, linux-f2fs-devel, hirofumi, linkinjeon, sj1557.seo,
	yuezhang.mo, almaz.alexandrovich, slava, glaubitz, frank.li,
	tytso, adilger.kernel, cem, sfrench, pc, ronniesahlberg, sprasad,
	trondmy, anna, jaegeuk, chao, hansg, senozhatsky, Chuck Lever,
	Darrick J. Wong

Following on from

https://lore.kernel.org/linux-nfs/20251021-zypressen-bazillus-545a44af57fd@brauner/T/#m0ba197d75b7921d994cf284f3cef3a62abb11aaa

I'm attempting to implement enough support in the Linux VFS to
enable file services like NFSD and ksmbd (and user space
equivalents) to provide the actual status of case folding support
in local file systems. The default behavior for local file systems
not explicitly supported in this series is to reflect the usual
POSIX behaviors:

  case-insensitive = false
  case-nonpreserving = false

The case-insensitivity and case-nonpreserving booleans can be
consumed immediately by NFSD. These two attributes have been part of
the NFSv3 and NFSv4 protocols for decades, in order to support NFS
client implementations on non-POSIX systems.

Support for user space file servers is why this series exposes case
folding information via a user-space API. I don't know of any other
category of user-space application that requires access to case
folding info.

The Linux NFS community has a growing interest in supporting NFS
clients on Windows and MacOS platforms, where file name behavior does
not align with traditional POSIX semantics.

One example of a Windows-based NFS client is [1]. This client
implementation explicitly requires servers to report
FATTR4_WORD0_CASE_INSENSITIVE = TRUE for proper operation, a hard
requirement for Windows client interoperability because Windows
applications expect case-insensitive behavior. When an NFS client
knows the server is case-insensitive, it can avoid issuing multiple
LOOKUP/READDIR requests to search for case variants, and applications
like Win32 programs work correctly without manual workarounds or
code changes.

Even the Linux client can take advantage of this information. Trond
merged patches 4 years ago [2] that introduce support for case
insensitivity, in support of the Hammerspace NFS server. In
particular, when a client detects a case-insensitive NFS share,
negative dentry caching must be disabled (a lookup for "FILE.TXT"
failing shouldn't cache a negative entry when "file.txt" exists)
and directory change invalidation must clear all cached case-folded
file name variants.

Hammerspace servers and several other NFS server implementations
operate in multi-protocol environments, where a single file service
instance caters to both NFS and SMB clients. In those cases, things
work more smoothly for everyone when the NFS client can see and adapt
to the case folding behavior that SMB users rely on and expect. NFSD
needs to support the case-insensitivity and case-nonpreserving
booleans properly in order to participate as a first-class citizen
in such environments.

[1] https://github.com/kofemann/ms-nfs41-client

[2] https://patchwork.kernel.org/project/linux-nfs/cover/20211217203658.439352-1-trondmy@kernel.org/

---
Changes since v8:
- Rebase on v7.0-rc1

Changes since v7:
- Split file_attr initialization changes into a separate patch

Changes since v6:
- Remove the memset from vfs_fileattr_get

Changes since v5:
- Finish the conversion to FS_XFLAGs
- NFSv4 GETATTR now clears the attr mask bit if nfsd_get_case_info()
  fails

Changes since v4:
- Observe the MSDOS "nocase" mount option
- Define new FS_XFLAGs for the user API

Changes since v3:
- Change fa->case_preserving to fa->case_nonpreserving
- VFAT is case preserving
- Make new fields available to user space

Changes since v2:
- Remove unicode labels
- Replace vfs_get_case_info
- Add support for several more local file system implementations
- Add support for in-kernel SMB server

Changes since RFC:
- Use file_getattr instead of statx
- Postpone exposing Unicode version until later
- Support NTFS and ext4 in addition to FAT
- Support NFSv4 fattr4 in addition to NFSv3 PATHCONF

---
Chuck Lever (17):
      fs: Move file_kattr initialization to callers
      fs: Add case sensitivity flags to file_kattr
      fat: Implement fileattr_get for case sensitivity
      exfat: Implement fileattr_get for case sensitivity
      ntfs3: Implement fileattr_get for case sensitivity
      hfs: Implement fileattr_get for case sensitivity
      hfsplus: Report case sensitivity in fileattr_get
      ext4: Report case sensitivity in fileattr_get
      xfs: Report case sensitivity in fileattr_get
      cifs: Implement fileattr_get for case sensitivity
      nfs: Implement fileattr_get for case sensitivity
      f2fs: Add case sensitivity reporting to fileattr_get
      vboxsf: Implement fileattr_get for case sensitivity
      isofs: Implement fileattr_get for case sensitivity
      nfsd: Report export case-folding via NFSv3 PATHCONF
      nfsd: Implement NFSv4 FATTR4_CASE_INSENSITIVE and FATTR4_CASE_PRESERVING
      ksmbd: Report filesystem case sensitivity via FS_ATTRIBUTE_INFORMATION

 fs/exfat/exfat_fs.h      |  2 ++
 fs/exfat/file.c          | 17 +++++++++++++++--
 fs/exfat/namei.c         |  1 +
 fs/ext4/ioctl.c          |  7 +++++++
 fs/f2fs/file.c           |  8 ++++++++
 fs/fat/fat.h             |  3 +++
 fs/fat/file.c            | 23 +++++++++++++++++++++++
 fs/fat/namei_msdos.c     |  1 +
 fs/fat/namei_vfat.c      |  1 +
 fs/file_attr.c           | 16 ++++++++--------
 fs/hfs/dir.c             |  1 +
 fs/hfs/hfs_fs.h          |  2 ++
 fs/hfs/inode.c           | 13 +++++++++++++
 fs/hfsplus/inode.c       | 10 ++++++++++
 fs/isofs/dir.c           | 11 +++++++++++
 fs/nfs/client.c          |  9 +++++++--
 fs/nfs/inode.c           | 21 +++++++++++++++++++++
 fs/nfs/internal.h        |  3 +++
 fs/nfs/nfs3proc.c        |  2 ++
 fs/nfs/nfs3xdr.c         |  7 +++++--
 fs/nfs/nfs4proc.c        |  2 ++
 fs/nfs/proc.c            |  3 +++
 fs/nfs/symlink.c         |  3 +++
 fs/nfsd/nfs3proc.c       | 18 ++++++++++--------
 fs/nfsd/nfs4xdr.c        | 25 +++++++++++++++++++++++--
 fs/nfsd/vfs.c            | 29 +++++++++++++++++++++++++++++
 fs/nfsd/vfs.h            |  3 +++
 fs/ntfs3/file.c          | 23 +++++++++++++++++++++++
 fs/ntfs3/inode.c         |  1 +
 fs/ntfs3/namei.c         |  2 ++
 fs/ntfs3/ntfs_fs.h       |  1 +
 fs/smb/client/cifsfs.c   | 20 ++++++++++++++++++++
 fs/smb/server/smb2pdu.c  | 25 +++++++++++++++++++------
 fs/vboxsf/dir.c          |  1 +
 fs/vboxsf/file.c         |  6 ++++--
 fs/vboxsf/super.c        |  7 +++++++
 fs/vboxsf/utils.c        | 26 ++++++++++++++++++++++++++
 fs/vboxsf/vfsmod.h       |  6 ++++++
 fs/xfs/xfs_ioctl.c       |  9 ++++++++-
 include/linux/fileattr.h |  3 ++-
 include/linux/nfs_xdr.h  |  2 ++
 include/uapi/linux/fs.h  |  7 +++++++
 42 files changed, 346 insertions(+), 34 deletions(-)
---
base-commit: 6596a02b207886e9e00bb0161c7fd59fea53c081
change-id: 20260422-case-sensitivity-5cbffc8f1558

Best regards,
--  
Chuck Lever


^ permalink raw reply

* [RFC PATCH 1/4] fusefatfs: enable fuse systemd service mode
From: Darrick J. Wong @ 2026-04-22 23:29 UTC (permalink / raw)
  To: linux-fsdevel, linux-ext4, fuse-devel
  Cc: Miklos Szeredi, Bernd Schubert, Joanne Koong, Theodore Ts'o,
	Neal Gompa, Amir Goldstein, Christian Brauner, demiobenour
In-Reply-To: <20260422231518.GA7717@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

Enable use of fusefatfs as a contained systemd service.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 CMakeLists.txt        |   18 +++++++
 config.h.in           |    2 +
 diskio.c              |    3 +
 fusefatfs.c           |  124 ++++++++++++++++++++++++++++++++++++++++++++++---
 fusefatfs.socket.in   |   17 +++++++
 fusefatfs@.service.in |  102 ++++++++++++++++++++++++++++++++++++++++
 6 files changed, 259 insertions(+), 7 deletions(-)
 create mode 100644 fusefatfs.socket.in
 create mode 100644 fusefatfs@.service.in

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5e7d70ec85b748..473d1c451d0810 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -14,6 +14,24 @@ pkg_check_modules(FUSE fuse3)
 if(NOT FUSE_FOUND)
 	pkg_check_modules(FUSE REQUIRED fuse)
 endif()
+pkg_get_variable(FUSE3_SERVICE_SOCKET_DIR fuse3 service_socket_dir)
+pkg_get_variable(FUSE3_SERVICE_SOCKET_PERMS fuse3 service_socket_perms)
+pkg_check_modules(SYSTEMD systemd)
+if (SYSTEMD_FOUND)
+	pkg_get_variable(SYSTEMD_SYSTEM_UNIT_DIR systemd systemd_system_unit_dir)
+endif()
+IF ( (NOT "${FUSE3_SERVICE_SOCKET_DIR}" STREQUAL "") AND (NOT "${SYSTEMD_SYSTEM_UNIT_DIR}" STREQUAL "") )
+	message(STATUS "Found libfuse3 service socket dir: ${FUSE3_SERVICE_SOCKET_DIR}")
+	message(STATUS "Found libfuse3 service socket perms: ${FUSE3_SERVICE_SOCKET_PERMS}")
+	set(DEFINE_HAVE_FUSE_SERVICE "#define HAVE_FUSE_SERVICE")
+	configure_file(fusefatfs.socket.in ${CMAKE_BINARY_DIR}/fusefatfs.socket @ONLY)
+	configure_file(fusefatfs@.service.in ${CMAKE_BINARY_DIR}/fusefatfs@.service @ONLY)
+	message(STATUS "Found systemd system unit dir: ${SYSTEMD_SYSTEM_UNIT_DIR}")
+	install(FILES ${CMAKE_BINARY_DIR}/fusefatfs.socket
+		DESTINATION ${CMAKE_INSTALL_PREFIX}${SYSTEMD_SYSTEM_UNIT_DIR})
+	install(FILES ${CMAKE_BINARY_DIR}/fusefatfs@.service
+		DESTINATION ${CMAKE_INSTALL_PREFIX}${SYSTEMD_SYSTEM_UNIT_DIR})
+endif()
 string(REGEX REPLACE "\\..*" "" FUSE_VERSION ${FUSE_VERSION})
 
 set(CMAKE_REQUIRED_DEFINITIONS -D_FILE_OFFSET_BITS=64)
diff --git a/config.h.in b/config.h.in
index 7f916d685c1e42..e6d2e8c82b7d0c 100644
--- a/config.h.in
+++ b/config.h.in
@@ -4,5 +4,7 @@
 #define PROGNAME "@CMAKE_PROJECT_NAME@"
 #define VERSION "@CMAKE_PROJECT_VERSION@"
 
+@DEFINE_HAVE_FUSE_SERVICE@
+
 #endif
 
diff --git a/diskio.c b/diskio.c
index 122f93f3316e66..ca83a102e49b78 100644
--- a/diskio.c
+++ b/diskio.c
@@ -41,6 +41,9 @@ DSTATUS disk_initialize (
 	struct fftab *drv = fftab_get(pdrv);
 	if (!drv) return STA_NOINIT;
 
+	if (drv->fd >= 0)
+		return RES_OK;
+
 	if (drv->flags & FFFF_RDONLY)
 		drv->fd = open(drv->path, O_RDONLY);
 	else
diff --git a/fusefatfs.c b/fusefatfs.c
index 248f5c3a8a37c8..376f6cd1c338dd 100644
--- a/fusefatfs.c
+++ b/fusefatfs.c
@@ -20,7 +20,7 @@
 #define FUSE_USE_VERSION 29
 #define FUSE3_ONLY(...)
 #else
-#define FUSE_USE_VERSION FUSE_MAKE_VERSION(3, 14)
+#define FUSE_USE_VERSION FUSE_MAKE_VERSION(3, 19)
 #define FUSE3_ONLY(...) __VA_ARGS__
 #endif
 
@@ -37,6 +37,11 @@
 #include <fftable.h>
 #include <config.h>
 
+#ifdef HAVE_FUSE_SERVICE
+# include <sys/mount.h>
+# include <fuse_service.h>
+#endif
+
 int fuse_reentrant_tag = 0;
 
 #if FF_DEFINED == 80286
@@ -52,6 +57,96 @@ static pthread_mutex_t fff_mutex = PTHREAD_MUTEX_INITIALIZER;
 #define mutex_out() pthread_mutex_unlock(&fff_mutex)
 #define mutex_out_return(RETVAL) do {mutex_out(); return(RETVAL); } while (0)
 
+#ifdef HAVE_FUSE_SERVICE
+static struct fuse_service *service;
+static int bdev_fd = -1;
+
+static inline bool fff_is_service(void)
+{
+	return fuse_service_accepted(service);
+}
+
+static int fff_service_connect(struct fuse_args *args)
+{
+	int ret;
+
+	ret = fuse_service_accept(&service);
+	if (ret)
+		return ret;
+
+	if (fuse_service_accepted(service))
+		return fuse_service_append_args(service, args);
+
+	return 0;
+}
+
+static int fff_service_get_config(const char *device, bool ro,
+				  struct stat *sbuf)
+{
+	int open_flags = O_EXCL;
+	int fd;
+	int ret;
+
+	if (ro)
+		open_flags |= O_RDONLY;
+	else
+		open_flags |= O_SYNC | O_RDWR;
+
+	ret = fuse_service_request_file(service, device, open_flags, 0, 0);
+	if (ret)
+		return ret;
+
+	ret = fuse_service_receive_file(service, device, &fd);
+	if (ret)
+		return ret;
+
+	if (fd < 0) {
+		fprintf(stderr, "%s opening device: %s.\n", device,
+			   strerror(-fd));
+		return -1;
+	}
+	bdev_fd = fd;
+
+	ret = fuse_service_finish_file_requests(service);
+	if (ret)
+		return ret;
+
+	return fstat(bdev_fd, sbuf);
+}
+
+static void fff_service_assign_bdev(struct fftab *ffentry)
+{
+	if (fff_is_service())
+		ffentry->fd = bdev_fd;
+}
+
+static int fff_service_main(struct fuse_args *args,
+			    const struct fuse_operations *ops,
+			    struct fftab *data)
+{
+	fuse_service_expect_mount_format(service, S_IFDIR);
+	return fuse_service_main(service, args, ops, data);
+}
+
+static int fff_service_finish(int exitcode)
+{
+	if (!fff_is_service())
+		return exitcode;
+
+	fuse_service_send_goodbye(service, exitcode);
+	fuse_service_destroy(&service);
+
+	return fuse_service_exit(exitcode);
+}
+#else
+# define fff_is_service(...)			(false)
+# define fff_service_connect(...)		(0)
+# define fff_service_get_config(...)		(EOPNOTSUPP)
+# define fff_service_assign_bdev(...)		((void)0)
+# define fff_service_main(...)			(1)
+# define fff_service_finish(ret)		(ret)
+#endif /* HAVE_FUSE_SERVICE */
+
 #define fffpath(index, path) \
   *fffpath; \
   ssize_t __fffpathlen = (index == 0) ? 0 : strlen(path) + 3; \
@@ -423,6 +518,9 @@ static struct fftab *fff_init(const char *source, int codepage, int flags) {
 	if (index >= 0) {
 		struct fftab *ffentry = fftab_get(index);
 		char sdrv[12];
+
+		fff_service_assign_bdev(ffentry);
+
 		snprintf(sdrv, 12, "%d:", index);
 		FRESULT fres = f_mount(&ffentry->fs, sdrv, 1);
 		if (fres != FR_OK) {
@@ -531,10 +629,10 @@ fff_opt_proc(void *data, const char *arg, int key, struct fuse_args *outargs)
 			return 1;
 		case FUSE_OPT_KEY_NONOPT:
 			if (!options->source) {
-				options->source = arg;
+				options->source = strdup(arg);
 				return 0;
 			} else if(!options->mountpoint) {
-				options->mountpoint = arg;
+				options->mountpoint = strdup(arg);
 				return 1;
 			} else
 				return -1;
@@ -565,6 +663,11 @@ int main(int argc, char *argv[])
 	int flags = 0;
 	struct stat sbuf;
 	putenv("TZ=UTC0");
+
+	err = fff_service_connect(&args);
+	if (err)
+		exit(1);
+
 	if (fuse_opt_parse(&args, &options, fff_opts, fff_opt_proc) == -1) {
 		fuse_opt_free_args(&args);
 		return -1;
@@ -585,7 +688,11 @@ int main(int argc, char *argv[])
 		goto returnerr;
 	}
 
-	if (stat(options.source, &sbuf) < 0) {
+	if (fff_is_service())
+		err = fff_service_get_config(options.source, options.ro, &sbuf);
+	else
+		err = stat(options.source, &sbuf);
+	if (err < 0) {
 		fprintf(stderr, "%s: %s\n", options.source, strerror(errno));
 		goto returnerr;
 	}
@@ -600,12 +707,15 @@ int main(int argc, char *argv[])
 		fprintf(stderr, "Fuse init error\n");
 		goto returnerr;
 	}
-	err = fuse_main(args.argc, args.argv, &fusefat_ops, ffentry);
+	if (fff_is_service())
+		err = fff_service_main(&args, &fusefat_ops, ffentry);
+	else
+		err = fuse_main(args.argc, args.argv, &fusefat_ops, ffentry);
 	fff_destroy(ffentry);
 	fuse_opt_free_args(&args);
 	if (err) fprintf(stderr, "Fuse error %d\n", err);
-	return err;
+	return fff_service_finish(err);
 returnerr:
 	fuse_opt_free_args(&args);
-	return -1;
+	return fff_service_finish(-1);
 }
diff --git a/fusefatfs.socket.in b/fusefatfs.socket.in
new file mode 100644
index 00000000000000..3512b3570d6178
--- /dev/null
+++ b/fusefatfs.socket.in
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# Copyright (C) 2026 Oracle.  All Rights Reserved.
+# Author: Darrick J. Wong <djwong@kernel.org>
+[Unit]
+Description=Socket for fusefatfs Service
+
+[Socket]
+ListenSequentialPacket=@FUSE3_SERVICE_SOCKET_DIR@/vfat
+ListenSequentialPacket=@FUSE3_SERVICE_SOCKET_DIR@/msdos
+ListenSequentialPacket=@FUSE3_SERVICE_SOCKET_DIR@/fat
+Accept=yes
+SocketMode=@FUSE3_SERVICE_SOCKET_PERMS@
+RemoveOnStop=yes
+
+[Install]
+WantedBy=sockets.target
diff --git a/fusefatfs@.service.in b/fusefatfs@.service.in
new file mode 100644
index 00000000000000..ac7c4d6cdad93a
--- /dev/null
+++ b/fusefatfs@.service.in
@@ -0,0 +1,102 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# Copyright (C) 2026 Oracle.  All Rights Reserved.
+# Author: Darrick J. Wong <djwong@kernel.org>
+[Unit]
+Description=fusefatfs Service
+
+# Don't leave failed units behind, systemd does not clean them up!
+CollectMode=inactive-or-failed
+
+[Service]
+Type=exec
+ExecStart=/@CMAKE_INSTALL_BINDIR@/fusefatfs
+
+# Try to capture core dumps
+LimitCORE=infinity
+
+SyslogIdentifier=%N
+
+# No realtime CPU scheduling
+RestrictRealtime=true
+
+# Don't let us see anything in the regular system, and don't run as root
+DynamicUser=true
+ProtectSystem=strict
+ProtectHome=true
+PrivateTmp=true
+PrivateDevices=true
+PrivateUsers=true
+
+# No network access
+PrivateNetwork=true
+ProtectHostname=true
+RestrictAddressFamilies=none
+IPAddressDeny=any
+
+# Don't let the program mess with the kernel configuration at all
+ProtectKernelLogs=true
+ProtectKernelModules=true
+ProtectKernelTunables=true
+ProtectControlGroups=true
+ProtectProc=invisible
+RestrictNamespaces=true
+RestrictFileSystems=
+
+# Hide everything in /proc, even /proc/mounts
+ProcSubset=pid
+
+# Only allow the default personality Linux
+LockPersonality=true
+
+# No writable memory pages
+MemoryDenyWriteExecute=true
+
+# Don't let our mounts leak out to the host
+PrivateMounts=true
+
+# Restrict system calls to the native arch and only enough to get things going
+SystemCallArchitectures=native
+SystemCallFilter=@system-service
+SystemCallFilter=~@privileged
+SystemCallFilter=~@resources
+
+SystemCallFilter=~@clock
+SystemCallFilter=~@cpu-emulation
+SystemCallFilter=~@debug
+SystemCallFilter=~@module
+SystemCallFilter=~@reboot
+SystemCallFilter=~@swap
+
+SystemCallFilter=~@mount
+
+# libfuse io_uring wants to pin cores and memory
+SystemCallFilter=mbind
+SystemCallFilter=sched_setaffinity
+
+# Leave a breadcrumb if we get whacked by the system call filter
+SystemCallErrorNumber=EL3RST
+
+# Log to the kernel dmesg, just like an in-kernel filesystem driver
+StandardOutput=append:/dev/ttyprintk
+StandardError=append:/dev/ttyprintk
+
+# Run with no capabilities at all
+CapabilityBoundingSet=
+AmbientCapabilities=
+NoNewPrivileges=true
+
+# fusefatfs doesn't create files
+UMask=7777
+
+# No access to hardware /dev files at all
+ProtectClock=true
+DevicePolicy=closed
+
+# Don't mess with set[ug]id anything.
+RestrictSUIDSGID=true
+
+# Don't let OOM kills of processes in this containment group kill the whole
+# service, because we don't want filesystem drivers to go down.
+OOMPolicy=continue
+OOMScoreAdjust=-1000

^ permalink raw reply related

* Re: [PATCH] fscrypt: add software key support for filesystem-managed data
From: Eric Biggers @ 2026-04-22 23:27 UTC (permalink / raw)
  To: LiaoYuanhong-vivo
  Cc: tytso, jaegeuk, linux-fscrypt, linux-kernel, linux-ext4,
	linux-f2fs-devel
In-Reply-To: <20260421075717.170840-1-liaoyuanhong@vivo.com>

On Tue, Apr 21, 2026 at 03:57:17PM +0800, LiaoYuanhong-vivo wrote:
> Some filesystems store small file contents in filesystem-managed regions
> rather than in regular data blocks submitted through bios. One example is
> F2FS inline_data, where the payload is stored inside the inode node block.
> Such regions still need to follow the inode's fscrypt contents encryption
> semantics, but they cannot rely on blk-crypto because they are not
> submitted as standalone file data bios.
> 
> As a result, when blk-crypto is enabled, mechanisms such as inline_data are
> typically disabled outright. However, it is desirable to re-enable such
> space-saving features while still preserving the required encryption
> semantics.
> 
> To support this, add fscrypt_crypt_fs_layer_page_inplace(), a helper that
> encrypts or decrypts a caller-provided page region in place using
> filesystem-layer software crypto and the inode's contents encryption
> policy.
> 
> This support is limited to v2 encryption policies. v1 policies do not
> provide the key setup model used here, so this path returns -EOPNOTSUPP for
> v1. Hardware-wrapped keys are not supported either, since deriving a
> software skcipher key requires software-accessible key material, which
> conflicts with the hardware-wrapped key model.
> 
> When the inode's normal contents path uses blk-crypto, fscrypt may not have
> a software skcipher key prepared for the inode contents key. Add an
> optional filesystem-layer prepared key to fscrypt_inode_info. This key is
> derived using the same v2 contents-encryption KDF as the normal contents
> key, but is prepared as a software skcipher key and is used only by the new
> filesystem-layer helper.
> 
> Signed-off-by: LiaoYuanhong-vivo <liaoyuanhong@vivo.com>

I don't have time for a super detailed review at the moment, but here
are my initial thoughts:

- This needs to be sent along with the code that actually uses it in
  ext4 and f2fs.  Please also Cc the mailing lists for those
  filesystems.

- This is going to require an "incompat" filesystem feature flag.  After
  all, once a filesystem contains files that use this scheme, older
  kernels won't understand it.

- UBIFS and CephFS already use fs/crypto/ but don't support blk-crypto
  (inline encryption).  This new code feels duplicative of that.  It
  should be possible to reuse the existing code instead.  That would
  include, for example, reusing the existing en/decryption functions and
  the existing struct ci_enc_key field.  This would keep the changes
  limited mainly to how the key is being set up.

- Supporting all the different IV generation methods doesn't make sense
  when a per-file key is always used.

- The fact that this is incompatible with hardware-wrapped keys greatly
  limits the usefulness of this.  (Note that technically, it could be
  supported in combination with them anyway.  But the security models
  would be inconsistent, which I assume is what you have in mind.)

Hope this is helpful,

- Eric

^ permalink raw reply

* [PATCH 10/10] debian: update packaging for fuse4fs service
From: Darrick J. Wong @ 2026-04-22 23:26 UTC (permalink / raw)
  To: tytso
  Cc: linux-fsdevel, fuse-devel, linux-ext4, neal, joannelkoong, miklos,
	bernd
In-Reply-To: <177689989498.3821326.15497525132012299039.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

Update the Debian packaging code so that we can create fuse4fs service
containers.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 debian/e2fsprogs.install |    7 ++++++-
 debian/fuse4fs.install   |    3 +++
 debian/rules             |    3 +++
 3 files changed, 12 insertions(+), 1 deletion(-)
 mode change 100644 => 100755 debian/fuse4fs.install


diff --git a/debian/e2fsprogs.install b/debian/e2fsprogs.install
index 17a80e3922dcee..808474bcab1717 100755
--- a/debian/e2fsprogs.install
+++ b/debian/e2fsprogs.install
@@ -50,4 +50,9 @@ usr/share/man/man8/resize2fs.8
 usr/share/man/man8/tune2fs.8
 etc
 [linux-any] ${deb_udevudevdir}/rules.d
-[linux-any] ${deb_systemdsystemunitdir}
+[linux-any] ${deb_systemdsystemunitdir}/e2scrub@.service
+[linux-any] ${deb_systemdsystemunitdir}/e2scrub@.service
+[linux-any] ${deb_systemdsystemunitdir}/e2scrub_all.service
+[linux-any] ${deb_systemdsystemunitdir}/e2scrub_all.timer
+[linux-any] ${deb_systemdsystemunitdir}/e2scrub_fail@.service
+[linux-any] ${deb_systemdsystemunitdir}/e2scrub_reap.service
diff --git a/debian/fuse4fs.install b/debian/fuse4fs.install
old mode 100644
new mode 100755
index 17bdc90e33cb67..56048136c2b28b
--- a/debian/fuse4fs.install
+++ b/debian/fuse4fs.install
@@ -1,2 +1,5 @@
+#!/usr/bin/dh-exec
 usr/bin/fuse4fs
 usr/share/man/man1/fuse4fs.1
+[linux-any] ${deb_systemdsystemunitdir}/fuse4fs.socket
+[linux-any] ${deb_systemdsystemunitdir}/fuse4fs@.service
diff --git a/debian/rules b/debian/rules
index b680eb33ceac9e..d629e9d6915cfe 100755
--- a/debian/rules
+++ b/debian/rules
@@ -173,6 +173,9 @@ override_dh_installinfo:
 ifneq ($(DEB_HOST_ARCH_OS), hurd)
 override_dh_installsystemd:
 	dh_installsystemd -p e2fsprogs --no-restart-after-upgrade --no-stop-on-upgrade e2scrub_all.timer e2scrub_reap.service
+ifeq ($(SKIP_FUSE4FS),)
+	dh_installsystemd -p fuse4fs fuse4fs.socket
+endif
 endif
 
 override_dh_makeshlibs:


^ permalink raw reply related

* [PATCH 09/10] fuse4fs: make MMP work correctly in safe service mode
From: Darrick J. Wong @ 2026-04-22 23:25 UTC (permalink / raw)
  To: tytso
  Cc: linux-fsdevel, fuse-devel, linux-ext4, neal, joannelkoong, miklos,
	bernd
In-Reply-To: <177689989498.3821326.15497525132012299039.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

Normally, the libext2fs MMP code open()s a completely separate file
descriptor to read and write the MMP block so that it can have its own
private open file with its own access mode and file position.  However,
if the unixfd IO manager is in use, it will reuse the io channel, which
means that MMP and the unixfd share the same open file and hence the
access mode and file position.

MMP requires directio access to block devices so that changes are
immediately visible on other nodes.  Therefore, we need the IO channel
(and thus the filesystem) to be running in directio mode if MMP is in
use.

To make this work correctly with the sole unixfd IO manager user
(fuse4fs in unprivileged service mode), we must set O_DIRECT on the
bdev fd and mount the filesystem in directio mode.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 fuse4fs/fuse4fs.c |   51 ++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 48 insertions(+), 3 deletions(-)


diff --git a/fuse4fs/fuse4fs.c b/fuse4fs/fuse4fs.c
index bf7c8ef9102a7a..dc5a0ede9f5072 100644
--- a/fuse4fs/fuse4fs.c
+++ b/fuse4fs/fuse4fs.c
@@ -1371,12 +1371,57 @@ static int fuse4fs_service_get_config(struct fuse4fs *ff)
 }
 
 static errcode_t fuse4fs_service_openfs(struct fuse4fs *ff, char *options,
-					int flags)
+					int *flags)
 {
+	struct stat statbuf;
 	char path[64];
+	errcode_t retval;
+	int ret;
 
+	ret = fstat(ff->bdev_fd, &statbuf);
+	if (ret)
+		return errno;
+
+	/*
+	 * Open the filesystem with SKIP_MMP so that we can find out if the
+	 * filesystem actually has MMP.
+	 */
 	snprintf(path, sizeof(path), "/dev/fd/%d", ff->bdev_fd);
-	return ext2fs_open2(path, options, flags, 0, 0, unixfd_io_manager,
+	retval = ext2fs_open2(path, options, *flags | EXT2_FLAG_SKIP_MMP, 0, 0,
+			      unixfd_io_manager, &ff->fs);
+	if (retval)
+		return retval;
+
+	/*
+	 * If the fs doesn't have MMP then we're good to go.  Otherwise close
+	 * the filesystem so that we can reopen it with MMP enabled.
+	 */
+	if (!ext2fs_has_feature_mmp(ff->fs->super))
+		return 0;
+
+	retval = ext2fs_close_free(&ff->fs);
+	if (retval)
+		return retval;
+
+	/*
+	 * If the filesystem is not on a regular file, MMP will share the same
+	 * fd as the unixfd IO channel.  We need to set O_DIRECT on the bdev_fd
+	 * and open the filesystem in directio mode.
+	 */
+	if (!S_ISREG(statbuf.st_mode)) {
+		int fflags = fcntl(ff->bdev_fd, F_GETFL);
+
+		if (!(fflags & O_DIRECT)) {
+			ret = fcntl(ff->bdev_fd, F_SETFL, fflags | O_DIRECT);
+			if (ret)
+				return EXT2_ET_MMP_OPEN_DIRECT;
+		}
+
+		ff->directio = 1;
+		*flags |= EXT2_FLAG_DIRECT_IO;
+	}
+
+	return ext2fs_open2(path, options, *flags, 0, 0, unixfd_io_manager,
 			    &ff->fs);
 }
 #else
@@ -1516,7 +1561,7 @@ static errcode_t fuse4fs_open(struct fuse4fs *ff)
 	deadline = init_deadline(FUSE4FS_OPEN_TIMEOUT);
 	do {
 		if (fuse4fs_is_service(ff))
-			err = fuse4fs_service_openfs(ff, options, flags);
+			err = fuse4fs_service_openfs(ff, options, &flags);
 		else
 			err = ext2fs_open2(ff->device, options, flags, 0, 0,
 					   unix_io_manager, &ff->fs);


^ permalink raw reply related

* [PATCH 08/10] fuse4fs: set proc title when in fuse service mode
From: Darrick J. Wong @ 2026-04-22 23:25 UTC (permalink / raw)
  To: tytso
  Cc: linux-fsdevel, fuse-devel, linux-ext4, neal, joannelkoong, miklos,
	bernd
In-Reply-To: <177689989498.3821326.15497525132012299039.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

When in fuse service mode, set the process title so that we can identify
fuse servers by mount arguments.  When the service ends, amend the title
again to say that we're cleaning up.  This is done to make ps aux a bit
more communicative as to what is going on.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 configure           |  109 +++++++++++++++++++++++++++++++++++++++++++++++++++
 configure.ac        |   13 ++++++
 fuse4fs/Makefile.in |    2 -
 fuse4fs/fuse4fs.c   |   47 ++++++++++++++++++++++
 lib/config.h.in     |    6 +++
 5 files changed, 176 insertions(+), 1 deletion(-)


diff --git a/configure b/configure
index 0d49ec854a92cf..80aad505da550c 100755
--- a/configure
+++ b/configure
@@ -696,6 +696,7 @@ gcc_ranlib
 gcc_ar
 UNI_DIFF_OPTS
 SEM_INIT_LIB
+LIBBSD_LIB
 FUSE4FS_CMT
 FUSE2FS_CMT
 fuse_service_socket_perms
@@ -15014,6 +15015,114 @@ printf "%s\n" "#define HAVE_FUSE_CACHE_READDIR 1" >>confdefs.h
 
 fi
 
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for setproctitle in -lbsd" >&5
+printf %s "checking for setproctitle in -lbsd... " >&6; }
+if test ${ac_cv_lib_bsd_setproctitle+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) ac_check_lib_save_LIBS=$LIBS
+LIBS="-lbsd  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.
+   The 'extern "C"' is for builds by C++ compilers;
+   although this is not generally supported in C code supporting it here
+   has little cost and some practical benefit (sr 110532).  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char setproctitle (void);
+int
+main (void)
+{
+return setproctitle ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"
+then :
+  ac_cv_lib_bsd_setproctitle=yes
+else case e in #(
+  e) ac_cv_lib_bsd_setproctitle=no ;;
+esac
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS ;;
+esac
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_bsd_setproctitle" >&5
+printf "%s\n" "$ac_cv_lib_bsd_setproctitle" >&6; }
+if test "x$ac_cv_lib_bsd_setproctitle" = xyes
+then :
+  LIBBSD_LIB=-lbsd
+fi
+
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for setproctitle_init in -lbsd" >&5
+printf %s "checking for setproctitle_init in -lbsd... " >&6; }
+if test ${ac_cv_lib_bsd_setproctitle_init+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) ac_check_lib_save_LIBS=$LIBS
+LIBS="-lbsd  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.
+   The 'extern "C"' is for builds by C++ compilers;
+   although this is not generally supported in C code supporting it here
+   has little cost and some practical benefit (sr 110532).  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char setproctitle_init (void);
+int
+main (void)
+{
+return setproctitle_init ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"
+then :
+  ac_cv_lib_bsd_setproctitle_init=yes
+else case e in #(
+  e) ac_cv_lib_bsd_setproctitle_init=no ;;
+esac
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS ;;
+esac
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_bsd_setproctitle_init" >&5
+printf "%s\n" "$ac_cv_lib_bsd_setproctitle_init" >&6; }
+if test "x$ac_cv_lib_bsd_setproctitle_init" = xyes
+then :
+  LIBBSD_LIB=-lbsd
+fi
+
+
+if test "$ac_cv_lib_bsd_setproctitle" = yes ; then
+
+printf "%s\n" "#define HAVE_SETPROCTITLE 1" >>confdefs.h
+
+fi
+if test "$ac_cv_lib_bsd_setproctitle_init" = yes ; then
+
+printf "%s\n" "#define HAVE_SETPROCTITLE_INIT 1" >>confdefs.h
+
+fi
+
 { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for PR_SET_IO_FLUSHER" >&5
 printf %s "checking for PR_SET_IO_FLUSHER... " >&6; }
 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
diff --git a/configure.ac b/configure.ac
index 4b66296764ec86..63a5cd697a6dde 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1619,6 +1619,19 @@ then
 		  [Define to 1 if fuse supports cache_readdir])
 fi
 
+dnl
+dnl see if setproctitle exists
+dnl
+AC_CHECK_LIB(bsd, setproctitle, [LIBBSD_LIB=-lbsd])
+AC_CHECK_LIB(bsd, setproctitle_init, [LIBBSD_LIB=-lbsd])
+AC_SUBST(LIBBSD_LIB)
+if test "$ac_cv_lib_bsd_setproctitle" = yes ; then
+	AC_DEFINE(HAVE_SETPROCTITLE, 1, [Define to 1 if setproctitle present in libbsd])
+fi
+if test "$ac_cv_lib_bsd_setproctitle_init" = yes ; then
+	AC_DEFINE(HAVE_SETPROCTITLE_INIT, 1, [Define to 1 if setproctitle_init present in libbsd])
+fi
+
 dnl
 dnl see if PR_SET_IO_FLUSHER exists
 dnl
diff --git a/fuse4fs/Makefile.in b/fuse4fs/Makefile.in
index 8de3fff338584c..97b2da3af9bc0b 100644
--- a/fuse4fs/Makefile.in
+++ b/fuse4fs/Makefile.in
@@ -76,7 +76,7 @@ fuse4fs: $(FUSE4FS_OBJS) $(DEPLIBS) $(DEPLIBBLKID) $(DEPLIBUUID) \
 	$(E) "	LD $@"
 	$(Q) $(CC) $(ALL_LDFLAGS) -o fuse4fs $(FUSE4FS_OBJS) $(LIBS) \
 		$(LIBFUSE) $(LIBBLKID) $(LIBUUID) $(LIBEXT2FS) $(LIBINTL) \
-		$(CLOCK_GETTIME_LIB) $(SYSLIBS) $(LIBS_E2P)
+		$(CLOCK_GETTIME_LIB) $(SYSLIBS) $(LIBS_E2P) @LIBBSD_LIB@
 
 %.socket: %.socket.in $(DEP_SUBSTITUTE)
 	$(E) "	SUBST $@"
diff --git a/fuse4fs/fuse4fs.c b/fuse4fs/fuse4fs.c
index 47cac88c46cea9..bf7c8ef9102a7a 100644
--- a/fuse4fs/fuse4fs.c
+++ b/fuse4fs/fuse4fs.c
@@ -45,6 +45,9 @@
 #ifdef HAVE_FUSE4FS_SERVICE
 # include <sys/mount.h>
 # include <fuse_service.h>
+# ifdef HAVE_SETPROCTITLE
+#  include <bsd/unistd.h>
+# endif
 #endif
 #ifdef __SET_FOB_FOR_FUSE
 # undef _FILE_OFFSET_BITS
@@ -277,6 +280,9 @@ struct fuse4fs {
 	struct cache inodes;
 #ifdef HAVE_FUSE4FS_SERVICE
 	struct fuse_service *service;
+# ifdef HAVE_SETPROCTITLE
+	char *svc_cmdline;
+# endif
 	int bdev_fd;
 #endif
 };
@@ -1252,6 +1258,35 @@ static errcode_t fuse4fs_check_support(struct fuse4fs *ff)
 	return 0;
 }
 
+#if defined(HAVE_FUSE4FS_SERVICE) && defined(HAVE_SETPROCTITLE)
+static void fuse4fs_service_set_proc_cmdline(struct fuse4fs *ff, int argc,
+					     char *argv[],
+					     struct fuse_args *args)
+{
+#ifdef HAVE_SETPROCTITLE_INIT
+	setproctitle_init(argc, argv, environ);
+#endif
+
+	ff->svc_cmdline = fuse_service_cmdline(argc, argv, args);
+	if (!ff->svc_cmdline)
+		return;
+
+	setproctitle("-%s", ff->svc_cmdline);
+}
+
+static void fuse4fs_service_finish_proc_cmdline(struct fuse4fs *ff)
+{
+	if (!ff->svc_cmdline)
+		return;
+
+	setproctitle("-%s [cleaning up]", ff->svc_cmdline);
+	free(ff->svc_cmdline);
+}
+#else
+# define fuse4fs_service_set_proc_cmdline(...)		((void)0)
+# define fuse4fs_service_finish_proc_cmdline(...)	((void)0)
+#endif
+
 #ifdef HAVE_FUSE4FS_SERVICE
 static int fuse4fs_service_connect(struct fuse4fs *ff, struct fuse_args *args)
 {
@@ -1285,6 +1320,8 @@ static int fuse4fs_service_exit(struct fuse4fs *ff, int exitcode)
 	if (!fuse4fs_is_service(ff))
 		return exitcode;
 
+	fuse4fs_service_finish_proc_cmdline(ff);
+
 	fuse_service_send_goodbye(ff->service, exitcode);
 	fuse_service_release(ff->service);
 	close(ff->bdev_fd);
@@ -6353,6 +6390,16 @@ int main(int argc, char *argv[])
 		goto out_exit;
 	}
 
+	/*
+	 * For fuse services, make the /proc title include the arguments that
+	 * we got from the mount helper.  Do this after parsing argc/argv
+	 * because that may overwrite the argv area.  Note that the procfs
+	 * listing might not reflect the options that actually get enabled,
+	 * just like regular fuse4fs.
+	 */
+	if (fuse4fs_is_service(&fctx))
+		fuse4fs_service_set_proc_cmdline(&fctx, argc, argv, &args);
+
 	/* /dev/sda -> sda for reporting */
 	fctx.shortdev = strrchr(fctx.device, '/');
 	if (fctx.shortdev)
diff --git a/lib/config.h.in b/lib/config.h.in
index 3aa0511a329b17..2c25632188e4f3 100644
--- a/lib/config.h.in
+++ b/lib/config.h.in
@@ -376,6 +376,12 @@
 /* Define to 1 if you have the 'setmntent' function. */
 #undef HAVE_SETMNTENT
 
+/* Define to 1 if setproctitle present in libbsd */
+#undef HAVE_SETPROCTITLE
+
+/* Define to 1 if setproctitle_init present in libbsd */
+#undef HAVE_SETPROCTITLE_INIT
+
 /* Define to 1 if you have the 'setresgid' function. */
 #undef HAVE_SETRESGID
 


^ permalink raw reply related

* [PATCH 07/10] fuse4fs: enable safe service mode
From: Darrick J. Wong @ 2026-04-22 23:25 UTC (permalink / raw)
  To: tytso
  Cc: linux-fsdevel, fuse-devel, linux-ext4, neal, joannelkoong, miklos,
	bernd
In-Reply-To: <177689989498.3821326.15497525132012299039.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

Make it possible to run fuse4fs as a safe systemd service, wherein the
fuse server only has access to the fds that we pass in.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 MCONFIG.in                  |    2 
 configure                   |  186 +++++++++++++++++++++++++++++++++
 configure.ac                |  108 +++++++++++++++++++
 fuse4fs/Makefile.in         |   40 ++++++-
 fuse4fs/fuse4fs.c           |  241 +++++++++++++++++++++++++++++++++++++++++--
 fuse4fs/fuse4fs.socket.in   |   17 +++
 fuse4fs/fuse4fs@.service.in |  102 ++++++++++++++++++
 lib/config.h.in             |    6 +
 util/subst.conf.in          |    3 +
 9 files changed, 690 insertions(+), 15 deletions(-)
 create mode 100644 fuse4fs/fuse4fs.socket.in
 create mode 100644 fuse4fs/fuse4fs@.service.in


diff --git a/MCONFIG.in b/MCONFIG.in
index d66e2f3bc1d552..7a17778b6da67f 100644
--- a/MCONFIG.in
+++ b/MCONFIG.in
@@ -42,6 +42,8 @@ HAVE_CROND = @have_crond@
 CROND_DIR = @crond_dir@
 HAVE_SYSTEMD = @have_systemd@
 SYSTEMD_SYSTEM_UNIT_DIR = @systemd_system_unit_dir@
+HAVE_FUSE_SERVICE = @have_fuse_service@
+HAVE_FUSE4FS_SERVICE = @have_fuse4fs_service@
 
 @SET_MAKE@
 
diff --git a/configure b/configure
index 59413d5fc32e83..0d49ec854a92cf 100755
--- a/configure
+++ b/configure
@@ -645,6 +645,7 @@ enable_year2038=no
 ac_subst_vars='LTLIBOBJS
 LIBOBJS
 OS_IO_FILE
+have_fuse4fs_service
 systemd_system_unit_dir
 have_systemd
 systemd_LIBS
@@ -697,6 +698,9 @@ UNI_DIFF_OPTS
 SEM_INIT_LIB
 FUSE4FS_CMT
 FUSE2FS_CMT
+fuse_service_socket_perms
+fuse_service_socket_dir
+have_fuse_service
 FUSE_LIB
 fuse3_LIBS
 fuse3_CFLAGS
@@ -929,6 +933,8 @@ with_libiconv_prefix
 with_libintl_prefix
 enable_largefile
 with_libarchive
+with_fuse_service_socket_dir
+with_fuse_service_socket_perms
 enable_fuse2fs
 enable_fuse4fs
 enable_lto
@@ -1652,6 +1658,11 @@ Optional Packages:
   --with-libintl-prefix[=DIR]  search for libintl in DIR/include and DIR/lib
   --without-libintl-prefix     don't search for libintl in includedir and libdir
   --without-libarchive    disable use of libarchive
+  --with-fuse-service-socket-dir[=DIR]
+                          Create fuse3 filesystem service sockets in DIR.
+  --with-fuse-service-socket-perms[=MODE]
+                          Create fuse3 filesystem service socket with these
+                          permissions.
   --with-multiarch=ARCH   specify the multiarch triplet
   --with-udev-rules-dir[=DIR]
                           Install udev rules into DIR.
@@ -14592,7 +14603,7 @@ else
         fuse3_LIBS=$pkg_cv_fuse3_LIBS
         { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 printf "%s\n" "yes" >&6; }
-        FUSE_LIB=-lfuse3
+        FUSE_LIB=-lfuse3 ; have_fuse3_pkg=yes
 fi
 
 
@@ -14674,6 +14685,155 @@ printf "%s\n" "#define HAVE_FUSE_LOWLEVEL 1" >>confdefs.h
 
 fi
 
+have_fuse_service=
+fuse_service_socket_dir=
+if test -n "$have_fuse_lowlevel"
+then
+
+# Check whether --with-fuse_service_socket_dir was given.
+if test ${with_fuse_service_socket_dir+y}
+then :
+  withval=$with_fuse_service_socket_dir;
+else case e in #(
+  e) with_fuse_service_socket_dir=yes ;;
+esac
+fi
+
+	if test "x${with_fuse_service_socket_dir}" != "xno"
+then :
+
+		if test "x${with_fuse_service_socket_dir}" = "xyes"
+then :
+
+			if test "x$have_fuse3_pkg" = "xyes"
+then :
+
+				with_fuse_service_socket_dir="$($PKG_CONFIG --variable=service_socket_dir fuse3)"
+
+else case e in #(
+  e)
+				with_fuse_service_socket_dir=""
+			   ;;
+esac
+fi
+
+fi
+		{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for fuse3 service socket dir" >&5
+printf %s "checking for fuse3 service socket dir... " >&6; }
+		fuse_service_socket_dir="${with_fuse_service_socket_dir}"
+		if test -n "${fuse_service_socket_dir}"
+then :
+
+			{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: ${fuse_service_socket_dir}" >&5
+printf "%s\n" "${fuse_service_socket_dir}" >&6; }
+
+else case e in #(
+  e)
+			{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
+		   ;;
+esac
+fi
+
+fi
+
+# Check whether --with-fuse_service_socket_perms was given.
+if test ${with_fuse_service_socket_perms+y}
+then :
+  withval=$with_fuse_service_socket_perms;
+else case e in #(
+  e) with_fuse_service_socket_perms=yes ;;
+esac
+fi
+
+	if test "x${with_fuse_service_socket_perms}" != "xno"
+then :
+
+		if test "x${with_fuse_service_socket_perms}" = "xyes"
+then :
+
+			if test "x$have_fuse3_pkg" = "xyes"
+then :
+
+				with_fuse_service_socket_perms="$($PKG_CONFIG --variable=service_socket_perms fuse3)"
+
+else case e in #(
+  e)
+				with_fuse_service_socket_perms=""
+			   ;;
+esac
+fi
+
+fi
+		fuse_service_socket_perms="${with_fuse_service_socket_perms}"
+
+fi
+fi
+if test -n "$FUSE_USE_VERSION"
+then
+	{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for fuse_service_accept in libfuse" >&5
+printf %s "checking for fuse_service_accept in libfuse... " >&6; }
+	cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+	#define _GNU_SOURCE
+	#define _FILE_OFFSET_BITS	64
+	#define FUSE_USE_VERSION	319
+	#include <fuse_lowlevel.h>
+	#include <fuse_service.h>
+
+int
+main (void)
+{
+
+	struct fuse_service *moo;
+	fuse_service_accepted(moo);
+
+  ;
+  return 0;
+}
+
+_ACEOF
+if ac_fn_c_try_link "$LINENO"
+then :
+  have_fuse_service_accept=yes
+	   { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+printf "%s\n" "yes" >&6; }
+else case e in #(
+  e) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; } ;;
+esac
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
+    conftest$ac_exeext conftest.$ac_ext
+
+	{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for fuse3 service support" >&5
+printf %s "checking for fuse3 service support... " >&6; }
+	if test -n "${fuse_service_socket_dir}" && test "${have_fuse_service_accept}" = "yes"
+then :
+
+		{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+printf "%s\n" "yes" >&6; }
+		have_fuse_service="yes"
+
+else case e in #(
+  e)
+		{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
+	   ;;
+esac
+fi
+fi
+
+
+
+if test "$have_fuse_service" = yes
+then
+
+printf "%s\n" "#define HAVE_FUSE_SERVICE 1" >>confdefs.h
+
+fi
+
 FUSE2FS_CMT=
 # Check whether --enable-fuse2fs was given.
 if test ${enable_fuse2fs+y}
@@ -16587,6 +16747,30 @@ esac
 fi
 
 
+
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for fuse4fs service support and systemd" >&5
+printf %s "checking for fuse4fs service support and systemd... " >&6; }
+if test "${FUSE4FS_CMT}${have_fuse_service}${have_systemd}" = "yesyes"
+then :
+
+           { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+printf "%s\n" "yes" >&6; }
+
+printf "%s\n" "#define HAVE_FUSE4FS_SERVICE 1" >>confdefs.h
+
+           have_fuse4fs_service=yes
+
+else case e in #(
+  e)
+           { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
+           have_fuse4fs_service=no
+
+ ;;
+esac
+fi
+
+
 OS_IO_FILE=""
 case "$host_os" in
   mingw*)
diff --git a/configure.ac b/configure.ac
index c7d18dd9988db4..4b66296764ec86 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1375,7 +1375,7 @@ dnl Check to see if the FUSE library is -lfuse3 or -losxfuse
 dnl
 FUSE_LIB=
 dnl osxfuse.dylib supersedes fuselib.dylib
-PKG_CHECK_MODULES([fuse3], [fuse3], [FUSE_LIB=-lfuse3],
+PKG_CHECK_MODULES([fuse3], [fuse3], [FUSE_LIB=-lfuse3 ; have_fuse3_pkg=yes],
 [
 	AC_CHECK_LIB(osxfuse, fuse_main, [FUSE_LIB=-losxfuse])
 ])
@@ -1427,6 +1427,96 @@ then
 		  [Define to 1 if fuse supports lowlevel API])
 fi
 
+dnl
+dnl Check if the FUSE library tells us where to put fs service sockets
+dnl
+have_fuse_service=
+fuse_service_socket_dir=
+if test -n "$have_fuse_lowlevel"
+then
+	AC_ARG_WITH([fuse_service_socket_dir],
+	  [AS_HELP_STRING([--with-fuse-service-socket-dir@<:@=DIR@:>@],
+		  [Create fuse3 filesystem service sockets in DIR.])],
+	  [],
+	  [with_fuse_service_socket_dir=yes])
+	AS_IF([test "x${with_fuse_service_socket_dir}" != "xno"],
+	  [
+		AS_IF([test "x${with_fuse_service_socket_dir}" = "xyes"],
+		  [
+			AS_IF([test "x$have_fuse3_pkg" = "xyes" ],
+			  [
+				with_fuse_service_socket_dir="$($PKG_CONFIG --variable=service_socket_dir fuse3)"
+			  ], [
+				with_fuse_service_socket_dir=""
+			  ])
+		  ])
+		AC_MSG_CHECKING([for fuse3 service socket dir])
+		fuse_service_socket_dir="${with_fuse_service_socket_dir}"
+		AS_IF([test -n "${fuse_service_socket_dir}"],
+		  [
+			AC_MSG_RESULT(${fuse_service_socket_dir})
+		  ],
+		  [
+			AC_MSG_RESULT(no)
+		  ])
+	  ],
+	  [])
+	AC_ARG_WITH([fuse_service_socket_perms],
+	  [AS_HELP_STRING([--with-fuse-service-socket-perms@<:@=MODE@:>@],
+		  [Create fuse3 filesystem service socket with these permissions.])],
+	  [],
+	  [with_fuse_service_socket_perms=yes])
+	AS_IF([test "x${with_fuse_service_socket_perms}" != "xno"],
+	  [
+		AS_IF([test "x${with_fuse_service_socket_perms}" = "xyes"],
+		  [
+			AS_IF([test "x$have_fuse3_pkg" = "xyes" ],
+			  [
+				with_fuse_service_socket_perms="$($PKG_CONFIG --variable=service_socket_perms fuse3)"
+			  ], [
+				with_fuse_service_socket_perms=""
+			  ])
+		  ])
+		fuse_service_socket_perms="${with_fuse_service_socket_perms}"
+	  ],
+	  [])
+fi
+if test -n "$FUSE_USE_VERSION"
+then
+	AC_MSG_CHECKING(for fuse_service_accept in libfuse)
+	AC_LINK_IFELSE(
+	[	AC_LANG_PROGRAM([[
+	#define _GNU_SOURCE
+	#define _FILE_OFFSET_BITS	64
+	#define FUSE_USE_VERSION	319
+	#include <fuse_lowlevel.h>
+	#include <fuse_service.h>
+		]], [[
+	struct fuse_service *moo;
+	fuse_service_accepted(moo);
+		]])
+	], have_fuse_service_accept=yes
+	   AC_MSG_RESULT(yes),
+	   AC_MSG_RESULT(no))
+
+	AC_MSG_CHECKING([for fuse3 service support])
+	AS_IF([test -n "${fuse_service_socket_dir}" && test "${have_fuse_service_accept}" = "yes"],
+	  [
+		AC_MSG_RESULT(yes)
+		have_fuse_service="yes"
+	  ],
+	  [
+		AC_MSG_RESULT(no)
+	  ])
+fi
+AC_SUBST(have_fuse_service)
+AC_SUBST(fuse_service_socket_dir)
+AC_SUBST(fuse_service_socket_perms)
+if test "$have_fuse_service" = yes
+then
+	AC_DEFINE(HAVE_FUSE_SERVICE, 1, [Define to 1 if fuse supports service])
+fi
+
 dnl
 dnl Check if fuse2fs is actually built.
 dnl
@@ -2098,6 +2188,22 @@ AS_IF([test "x${with_systemd_unit_dir}" != "xno"],
   ])
 AC_SUBST(have_systemd)
 AC_SUBST(systemd_system_unit_dir)
+
+AC_MSG_CHECKING([for fuse4fs service support and systemd])
+AS_IF([test "${FUSE4FS_CMT}${have_fuse_service}${have_systemd}" = "yesyes"],
+      [
+           AC_MSG_RESULT(yes)
+           AC_DEFINE(HAVE_FUSE4FS_SERVICE, 1,
+                     [Define to 1 if fuse4fs should be built with fuse service support])
+           have_fuse4fs_service=yes
+      ],
+      [
+           AC_MSG_RESULT(no)
+           have_fuse4fs_service=no
+      ]
+)
+AC_SUBST(have_fuse4fs_service)
+
 dnl Adjust the compiled files if we are on windows vs everywhere else
 dnl
 OS_IO_FILE=""
diff --git a/fuse4fs/Makefile.in b/fuse4fs/Makefile.in
index 9f3547c271638f..8de3fff338584c 100644
--- a/fuse4fs/Makefile.in
+++ b/fuse4fs/Makefile.in
@@ -17,6 +17,13 @@ UMANPAGES=
 @FUSE4FS_CMT@UPROGS+=fuse4fs
 @FUSE4FS_CMT@UMANPAGES+=fuse4fs.1
 
+ifeq ($(HAVE_FUSE4FS_SERVICE),yes)
+SERVICE_FILES	+= fuse4fs.socket fuse4fs@.service
+INSTALLDIRS_TGT	+= installdirs-systemd
+INSTALL_TGT	+= install-systemd
+UNINSTALL_TGT	+= uninstall-systemd
+endif
+
 FUSE4FS_OBJS=	fuse4fs.o journal.o recovery.o revoke.o
 
 PROFILED_FUSE4FS_OJBS=	profiled/fuse4fs.o profiled/journal.o \
@@ -54,7 +61,7 @@ DEPEND_CFLAGS = -I$(top_srcdir)/e2fsck
 @PROFILE_CMT@	$(Q) $(CC) $(ALL_CFLAGS) -g -pg -o profiled/$*.o -c $<
 
 all:: profiled $(SPROGS) $(UPROGS) $(USPROGS) $(SMANPAGES) $(UMANPAGES) \
-	$(FMANPAGES) $(LPROGS)
+	$(FMANPAGES) $(LPROGS) $(SERVICE_FILES)
 
 all-static::
 
@@ -71,6 +78,14 @@ fuse4fs: $(FUSE4FS_OBJS) $(DEPLIBS) $(DEPLIBBLKID) $(DEPLIBUUID) \
 		$(LIBFUSE) $(LIBBLKID) $(LIBUUID) $(LIBEXT2FS) $(LIBINTL) \
 		$(CLOCK_GETTIME_LIB) $(SYSLIBS) $(LIBS_E2P)
 
+%.socket: %.socket.in $(DEP_SUBSTITUTE)
+	$(E) "	SUBST $@"
+	$(Q) $(SUBSTITUTE_UPTIME) $< $@
+
+%.service: %.service.in $(DEP_SUBSTITUTE)
+	$(E) "	SUBST $@"
+	$(Q) $(SUBSTITUTE_UPTIME) $< $@
+
 journal.o: $(srcdir)/../debugfs/journal.c
 	$(E) "	CC $<"
 	$(Q) $(CC) -c $(JOURNAL_CFLAGS) -I$(srcdir) \
@@ -93,11 +108,15 @@ fuse4fs.1: $(DEP_SUBSTITUTE) $(srcdir)/fuse4fs.1.in
 	$(E) "	SUBST $@"
 	$(Q) $(SUBSTITUTE_UPTIME) $(srcdir)/fuse4fs.1.in fuse4fs.1
 
-installdirs:
+installdirs: $(INSTALLDIRS_TGT)
 	$(E) "	MKDIR_P $(bindir) $(man1dir)"
 	$(Q) $(MKDIR_P) $(DESTDIR)$(bindir) $(DESTDIR)$(man1dir)
 
-install: all $(UMANPAGES) installdirs
+installdirs-systemd:
+	$(E) "	MKDIR_P $(SYSTEMD_SYSTEM_UNIT_DIR)"
+	$(Q) $(MKDIR_P) $(DESTDIR)$(SYSTEMD_SYSTEM_UNIT_DIR)
+
+install: all $(UMANPAGES) installdirs $(INSTALL_TGT)
 	$(Q) for i in $(UPROGS); do \
 		$(ES) "	INSTALL $(bindir)/$$i"; \
 		$(INSTALL_PROGRAM) $$i $(DESTDIR)$(bindir)/$$i; \
@@ -110,13 +129,19 @@ install: all $(UMANPAGES) installdirs
 		$(INSTALL_DATA) $$i $(DESTDIR)$(man1dir)/$$i; \
 	done
 
+install-systemd: $(SERVICE_FILES) installdirs-systemd
+	$(Q) for i in $(SERVICE_FILES); do \
+		$(ES) "	INSTALL_DATA $(SYSTEMD_SYSTEM_UNIT_DIR)/$$i"; \
+		$(INSTALL_DATA) $$i $(DESTDIR)$(SYSTEMD_SYSTEM_UNIT_DIR)/$$i; \
+	done
+
 install-strip: install
 	$(Q) for i in $(UPROGS); do \
 		$(E) "	STRIP $(bindir)/$$i"; \
 		$(STRIP) $(DESTDIR)$(bindir)/$$i; \
 	done
 
-uninstall:
+uninstall: $(UNINSTALL_TGT)
 	for i in $(UPROGS); do \
 		$(RM) -f $(DESTDIR)$(bindir)/$$i; \
 	done
@@ -124,9 +149,16 @@ uninstall:
 		$(RM) -f $(DESTDIR)$(man1dir)/$$i; \
 	done
 
+uninstall-systemd:
+	for i in $(SERVICE_FILES); do \
+		$(RM) -f $(DESTDIR)$(SYSTEMD_SYSTEM_UNIT_DIR)/$$i; \
+	done
+
 clean::
 	$(RM) -f $(UPROGS) $(UMANPAGES) profile.h \
 		fuse4fs.profiled \
+		$(SERVICE_FILES) \
+		fuse4fs.socket \
 		profiled/*.o \#* *.s *.o *.a *~ core gmon.out
 
 mostlyclean: clean
diff --git a/fuse4fs/fuse4fs.c b/fuse4fs/fuse4fs.c
index 1203f074ac29b6..47cac88c46cea9 100644
--- a/fuse4fs/fuse4fs.c
+++ b/fuse4fs/fuse4fs.c
@@ -42,6 +42,10 @@
 # define _FILE_OFFSET_BITS 64
 #endif /* _FILE_OFFSET_BITS */
 #include <fuse_lowlevel.h>
+#ifdef HAVE_FUSE4FS_SERVICE
+# include <sys/mount.h>
+# include <fuse_service.h>
+#endif
 #ifdef __SET_FOB_FOR_FUSE
 # undef _FILE_OFFSET_BITS
 #endif /* __SET_FOB_FOR_FUSE */
@@ -122,6 +126,10 @@
 
 #define FUSE4FS_ATTR_TIMEOUT	(0.0)
 
+#ifndef O_DIRECT
+# define O_DIRECT	(0)
+#endif
+
 static inline uint64_t round_up(uint64_t b, unsigned int align)
 {
 	unsigned int m;
@@ -267,8 +275,21 @@ struct fuse4fs {
 #endif
 	struct fuse_session *fuse;
 	struct cache inodes;
+#ifdef HAVE_FUSE4FS_SERVICE
+	struct fuse_service *service;
+	int bdev_fd;
+#endif
 };
 
+#ifdef HAVE_FUSE4FS_SERVICE
+static inline bool fuse4fs_is_service(const struct fuse4fs *ff)
+{
+	return fuse_service_accepted(ff->service);
+}
+#else
+# define fuse4fs_is_service(...)		(false)
+#endif
+
 #define FUSE4FS_CHECK_HANDLE(req, fh) \
 	do { \
 		if ((fh) == NULL || (fh)->magic != FUSE4FS_FILE_MAGIC) { \
@@ -1231,6 +1252,105 @@ static errcode_t fuse4fs_check_support(struct fuse4fs *ff)
 	return 0;
 }
 
+#ifdef HAVE_FUSE4FS_SERVICE
+static int fuse4fs_service_connect(struct fuse4fs *ff, struct fuse_args *args)
+{
+	int ret;
+
+	ret = fuse_service_accept(&ff->service);
+	if (ret)
+		return ret;
+
+	if (!fuse4fs_is_service(ff))
+		return 0;
+
+	return fuse_service_append_args(ff->service, args);
+}
+
+static bool fuse4fs_service_should_drop_kernel_mode(const struct fuse4fs *ff)
+{
+	return ff->kernel && fuse4fs_is_service(ff) &&
+	       !fuse_service_can_allow_other(ff->service);
+}
+
+static void fuse4fs_service_close_bdev(struct fuse4fs *ff)
+{
+	if (ff->bdev_fd >= 0)
+		close(ff->bdev_fd);
+	ff->bdev_fd = -1;
+}
+
+static int fuse4fs_service_exit(struct fuse4fs *ff, int exitcode)
+{
+	if (!fuse4fs_is_service(ff))
+		return exitcode;
+
+	fuse_service_send_goodbye(ff->service, exitcode);
+	fuse_service_release(ff->service);
+	close(ff->bdev_fd);
+	ff->bdev_fd = -1;
+
+	return fuse_service_exit(exitcode);
+}
+
+static int fuse4fs_service_get_config(struct fuse4fs *ff)
+{
+	double deadline = init_deadline(FUSE4FS_OPEN_TIMEOUT);
+	const int open_flags = O_EXCL | (ff->directio ? O_DIRECT : 0);
+	int open_mode = O_RDWR;
+	int fd;
+	int ret;
+
+	do {
+		ret = fuse_service_request_file(ff->service, ff->device,
+						open_mode | open_flags, 0, 0);
+		if (ret)
+			return ret;
+
+		ret = fuse_service_receive_file(ff->service, ff->device, &fd);
+		if (ret)
+			return ret;
+
+		if ((fd == -EPERM || fd == -EACCES || fd == -EROFS) &&
+		    open_mode == O_RDWR) {
+			/* Try readonly, but force the loop to run once more */
+			open_mode = O_RDONLY;
+			ret = 1;
+		}
+	} while (ret == 1 || (fd == -EBUSY && retry_before_deadline(deadline)));
+
+	if (fd < 0) {
+		err_printf(ff, "%s %s: %s.\n", _("opening device"), ff->device,
+			   strerror(-fd));
+		return -1;
+	}
+
+	if (!ff->ro && open_mode == O_RDONLY)
+		ff->ro = 1;
+
+	ff->bdev_fd = fd;
+
+	return fuse_service_finish_file_requests(ff->service);
+}
+
+static errcode_t fuse4fs_service_openfs(struct fuse4fs *ff, char *options,
+					int flags)
+{
+	char path[64];
+
+	snprintf(path, sizeof(path), "/dev/fd/%d", ff->bdev_fd);
+	return ext2fs_open2(path, options, flags, 0, 0, unixfd_io_manager,
+			    &ff->fs);
+}
+#else
+# define fuse4fs_service_connect(...)		(0)
+# define fuse4fs_service_should_drop_kernel_mode(...)	(false)
+# define fuse4fs_service_close_bdev(...)	((void)0)
+# define fuse4fs_service_exit(fctx, ret)	(ret)
+# define fuse4fs_service_get_config(...)	(EOPNOTSUPP)
+# define fuse4fs_service_openfs(...)		(EOPNOTSUPP)
+#endif
+
 static errcode_t fuse4fs_acquire_lockfile(struct fuse4fs *ff)
 {
 	char *resolved;
@@ -1301,6 +1421,8 @@ static void fuse4fs_unmount(struct fuse4fs *ff)
 				   uuid);
 	}
 
+	fuse4fs_service_close_bdev(ff);
+
 	if (ff->lockfile)
 		fuse4fs_release_lockfile(ff);
 }
@@ -1356,8 +1478,11 @@ static errcode_t fuse4fs_open(struct fuse4fs *ff)
 	 */
 	deadline = init_deadline(FUSE4FS_OPEN_TIMEOUT);
 	do {
-		err = ext2fs_open2(ff->device, options, flags, 0, 0,
-				   unix_io_manager, &ff->fs);
+		if (fuse4fs_is_service(ff))
+			err = fuse4fs_service_openfs(ff, options, flags);
+		else
+			err = ext2fs_open2(ff->device, options, flags, 0, 0,
+					   unix_io_manager, &ff->fs);
 		if ((err == EPERM || err == EACCES) &&
 		    (!ff->ro || (flags & EXT2_FLAG_RW))) {
 			/*
@@ -1702,6 +1827,10 @@ static int fuse4fs_setup_logging(struct fuse4fs *ff)
 	if (logfile)
 		return fuse4fs_capture_output(ff, logfile);
 
+	/* systemd already hooked us up to /dev/ttyprintk */
+	if (fuse4fs_is_service(ff))
+		return 0;
+
 	/* in kernel mode, try to log errors to the kernel log */
 	if (ff->kernel)
 		fuse4fs_capture_output(ff, "/dev/ttyprintk");
@@ -5923,14 +6052,13 @@ static const char *get_subtype(const char *argv0)
 }
 
 static void fuse4fs_compute_libfuse_args(struct fuse4fs *ff,
-					 struct fuse_args *args,
-					 const char *argv0)
+					 struct fuse_args *args)
 {
 	char extra_args[BUFSIZ];
 
 	/* Set up default fuse parameters */
 	snprintf(extra_args, BUFSIZ, "-osubtype=%s,fsname=%s",
-		 get_subtype(argv0),
+		 get_subtype(args->argv[0]),
 		 ff->device);
 	if (ff->no_default_opts == 0)
 		fuse_opt_add_arg(args, extra_args);
@@ -5947,6 +6075,15 @@ static void fuse4fs_compute_libfuse_args(struct fuse4fs *ff,
 #endif
 	}
 
+	/*
+	 * If we're mounting as a systemd service but the mount helper told us
+	 * that allow_other isn't allowed, then disable -okernel.  This mount
+	 * option gets special consideration because it's hardcoded in the
+	 * service unit file.
+	 */
+	if (fuse4fs_service_should_drop_kernel_mode(ff))
+		ff->kernel = 0;
+
 	if (ff->kernel) {
 		/*
 		 * ACLs are always enforced when kernel mode is enabled, to
@@ -6058,6 +6195,69 @@ static int fuse4fs_event_loop(struct fuse4fs *ff,
 	return fuse_session_loop_mt(ff->fuse, loop_config) == 0 ? 0 : 8;
 }
 
+#ifdef HAVE_FUSE4FS_SERVICE
+static int fuse4fs_service_main(struct fuse_args *args, struct fuse4fs *ff)
+{
+	struct fuse_cmdline_opts opts;
+	struct fuse_loop_config *loop_config = NULL;
+	int ret;
+
+	/*
+	 * Service initialization doesn't fork or change stdout/stderr so we
+	 * can drop the extra logfd right now.
+	 */
+	if (ff->logfd >= 0)
+		close(ff->logfd);
+	ff->logfd = -1;
+
+	ret = fuse_service_parse_cmdline_opts(args, &opts);
+	if (ret != 0) {
+		ret = 1;
+		goto out;
+	}
+
+	ret = fuse4fs_create_session(ff, args, &opts);
+	if (ret || !ff->fuse)
+		goto out_free_opts;
+
+	loop_config = fuse_loop_cfg_create();
+	if (loop_config == NULL) {
+		ret = 7;
+		goto out_destroy_session;
+	}
+
+	if (fuse_set_signal_handlers(ff->fuse) != 0) {
+		ret = 6;
+		goto out_loopcfg;
+	}
+
+	ret = fuse_service_session_mount(ff->service, ff->fuse, S_IFDIR, &opts);
+	if (ret) {
+		ret = 4;
+		goto out_signals;
+	}
+
+	fuse_service_send_goodbye(ff->service, 0);
+	fuse_service_release(ff->service);
+
+	ret = fuse4fs_event_loop(ff, loop_config, &opts);
+
+out_signals:
+	fuse_remove_signal_handlers(ff->fuse);
+out_loopcfg:
+	fuse_loop_cfg_destroy(loop_config);
+out_destroy_session:
+	fuse_session_destroy(ff->fuse);
+	ff->fuse = NULL;
+out_free_opts:
+	free(opts.mountpoint);
+out:
+	return ret;
+}
+#else
+# define fuse4fs_service_main(...)		(8)
+#endif
+
 static int fuse4fs_main(struct fuse_args *args, struct fuse4fs *ff)
 {
 	struct fuse_cmdline_opts opts;
@@ -6129,18 +6329,28 @@ int main(int argc, char *argv[])
 		.bfl = (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER,
 		.oom_score_adj = -500,
 		.opstate = F4OP_WRITABLE,
+#ifdef HAVE_FUSE4FS_SERVICE
+		.bdev_fd = -1,
+#endif
 	};
 	errcode_t err;
 	FILE *orig_stderr = stderr;
 	int ret;
 
+	ret = fuse4fs_service_connect(&fctx, &args);
+	if (ret) {
+		ret = 1;
+		goto out_exit;
+	}
+
 	ret = fuse_opt_parse(&args, &fctx, fuse4fs_opts, fuse4fs_opt_proc);
 	if (ret)
-		exit(1);
+		goto out_exit;
 	if (fctx.device == NULL) {
 		fprintf(stderr, "Missing ext4 device/image\n");
 		fprintf(stderr, "See '%s -h' for usage\n", argv[0]);
-		exit(1);
+		ret = 1;
+		goto out_exit;
 	}
 
 	/* /dev/sda -> sda for reporting */
@@ -6170,6 +6380,14 @@ int main(int argc, char *argv[])
 		goto out;
 	}
 
+	if (fuse4fs_is_service(&fctx)) {
+		ret = fuse4fs_service_get_config(&fctx);
+		if (ret) {
+			ret = 2;
+			goto out;
+		}
+	}
+
 	try_set_io_flusher(&fctx);
 	try_adjust_oom_score(&fctx);
 
@@ -6225,9 +6443,12 @@ int main(int argc, char *argv[])
 	/* Initialize generation counter */
 	get_random_bytes(&fctx.next_generation, sizeof(unsigned int));
 
-	fuse4fs_compute_libfuse_args(&fctx, &args, argv[0]);
+	fuse4fs_compute_libfuse_args(&fctx, &args);
 
-	ret = fuse4fs_main(&args, &fctx);
+	if (fuse4fs_is_service(&fctx))
+		ret = fuse4fs_service_main(&args, &fctx);
+	else
+		ret = fuse4fs_main(&args, &fctx);
 	switch(ret) {
 	case 0:
 		/* success */
@@ -6269,6 +6490,8 @@ int main(int argc, char *argv[])
 	if (fctx.device)
 		free(fctx.device);
 	pthread_mutex_destroy(&fctx.bfl);
+out_exit:
+	ret = fuse4fs_service_exit(&fctx, ret);
 	fuse_opt_free_args(&args);
 	return ret;
 }
diff --git a/fuse4fs/fuse4fs.socket.in b/fuse4fs/fuse4fs.socket.in
new file mode 100644
index 00000000000000..99e391bcc6787e
--- /dev/null
+++ b/fuse4fs/fuse4fs.socket.in
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# Copyright (C) 2025-2026 Oracle.  All Rights Reserved.
+# Author: Darrick J. Wong <djwong@kernel.org>
+[Unit]
+Description=Socket for ext4 Service
+
+[Socket]
+ListenSequentialPacket=@fuse_service_socket_dir@/ext2
+ListenSequentialPacket=@fuse_service_socket_dir@/ext3
+ListenSequentialPacket=@fuse_service_socket_dir@/ext4
+Accept=yes
+SocketMode=@fuse_service_socket_perms@
+RemoveOnStop=yes
+
+[Install]
+WantedBy=sockets.target
diff --git a/fuse4fs/fuse4fs@.service.in b/fuse4fs/fuse4fs@.service.in
new file mode 100644
index 00000000000000..38434c383c7be3
--- /dev/null
+++ b/fuse4fs/fuse4fs@.service.in
@@ -0,0 +1,102 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# Copyright (C) 2025-2026 Oracle.  All Rights Reserved.
+# Author: Darrick J. Wong <djwong@kernel.org>
+[Unit]
+Description=ext4 Service
+
+# Don't leave failed units behind, systemd does not clean them up!
+CollectMode=inactive-or-failed
+
+[Service]
+Type=exec
+ExecStart=@bindir@/fuse4fs -o kernel
+
+# Try to capture core dumps
+LimitCORE=infinity
+
+SyslogIdentifier=%N
+
+# No realtime CPU scheduling
+RestrictRealtime=true
+
+# Don't let us see anything in the regular system, and don't run as root
+DynamicUser=true
+ProtectSystem=strict
+ProtectHome=true
+PrivateTmp=true
+PrivateDevices=true
+PrivateUsers=true
+
+# No network access
+PrivateNetwork=true
+ProtectHostname=true
+RestrictAddressFamilies=none
+IPAddressDeny=any
+
+# Don't let the program mess with the kernel configuration at all
+ProtectKernelLogs=true
+ProtectKernelModules=true
+ProtectKernelTunables=true
+ProtectControlGroups=true
+ProtectProc=invisible
+RestrictNamespaces=true
+RestrictFileSystems=
+
+# Hide everything in /proc, even /proc/mounts
+ProcSubset=pid
+
+# Only allow the default Linux personality
+LockPersonality=true
+
+# No memory pages that are both writable and executable
+MemoryDenyWriteExecute=true
+
+# Don't let our mounts leak out to the host
+PrivateMounts=true
+
+# Restrict system calls to the native arch and only enough to get things going
+SystemCallArchitectures=native
+SystemCallFilter=@system-service
+SystemCallFilter=~@privileged
+SystemCallFilter=~@resources
+
+SystemCallFilter=~@clock
+SystemCallFilter=~@cpu-emulation
+SystemCallFilter=~@debug
+SystemCallFilter=~@module
+SystemCallFilter=~@reboot
+SystemCallFilter=~@swap
+
+SystemCallFilter=~@mount
+
+# libfuse io_uring wants to pin cores and memory
+SystemCallFilter=mbind
+SystemCallFilter=sched_setaffinity
+
+# Leave a breadcrumb if we get whacked by the system call filter
+SystemCallErrorNumber=EL3RST
+
+# Log to the kernel dmesg, just like an in-kernel ext4 driver
+StandardOutput=append:/dev/ttyprintk
+StandardError=append:/dev/ttyprintk
+
+# Run with no capabilities at all
+CapabilityBoundingSet=
+AmbientCapabilities=
+NoNewPrivileges=true
+
+# fuse4fs doesn't create files
+UMask=7777
+
+# No access to hardware /dev files at all
+ProtectClock=true
+DevicePolicy=closed
+
+# Don't mess with set[ug]id anything.
+RestrictSUIDSGID=true
+
+# Don't let OOM kills of processes in this containment group kill the whole
+# service, because we don't want filesystem drivers to go down.
+OOMPolicy=continue
+OOMScoreAdjust=-1000
diff --git a/lib/config.h.in b/lib/config.h.in
index fd2520396712e8..3aa0511a329b17 100644
--- a/lib/config.h.in
+++ b/lib/config.h.in
@@ -139,6 +139,9 @@
 /* Define to 1 if you have the 'ftruncate64' function. */
 #undef HAVE_FTRUNCATE64
 
+/* Define to 1 if fuse4fs should be built with fuse service support */
+#undef HAVE_FUSE4FS_SERVICE
+
 /* Define to 1 if fuse supports cache_readdir */
 #undef HAVE_FUSE_CACHE_READDIR
 
@@ -148,6 +151,9 @@
 /* Define to 1 if fuse supports lowlevel API */
 #undef HAVE_FUSE_LOWLEVEL
 
+/* Define to 1 if fuse supports service */
+#undef HAVE_FUSE_SERVICE
+
 /* Define to 1 if you have the 'futimes' function. */
 #undef HAVE_FUTIMES
 
diff --git a/util/subst.conf.in b/util/subst.conf.in
index 5af5e356d46ac7..3d0ec5cc39eabd 100644
--- a/util/subst.conf.in
+++ b/util/subst.conf.in
@@ -24,3 +24,6 @@ root_bindir		@root_bindir@
 libdir			@libdir@
 $exec_prefix		@exec_prefix@
 pkglibexecdir		@libexecdir@/e2fsprogs
+bindir			@bindir@
+fuse_service_socket_dir	@fuse_service_socket_dir@
+fuse_service_socket_perms	@fuse_service_socket_perms@


^ permalink raw reply related

* [PATCH 06/10] fuse4fs: hoist some code out of fuse4fs_main
From: Darrick J. Wong @ 2026-04-22 23:25 UTC (permalink / raw)
  To: tytso
  Cc: linux-fsdevel, fuse-devel, linux-ext4, neal, joannelkoong, miklos,
	bernd
In-Reply-To: <177689989498.3821326.15497525132012299039.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

In the next patch, we're going to create a separate fuse4fs_main
function when we're running in service mode.  Hoist into separate
helpers the code that will be shared between the two functions.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 fuse4fs/fuse4fs.c |   95 +++++++++++++++++++++++++++--------------------------
 1 file changed, 49 insertions(+), 46 deletions(-)


diff --git a/fuse4fs/fuse4fs.c b/fuse4fs/fuse4fs.c
index 13e1aae4b5d2e5..1203f074ac29b6 100644
--- a/fuse4fs/fuse4fs.c
+++ b/fuse4fs/fuse4fs.c
@@ -6016,47 +6016,64 @@ static void fuse4fs_com_err_proc(const char *whoami, errcode_t code,
 	fflush(stderr);
 }
 
-static int fuse4fs_main(struct fuse_args *args, struct fuse4fs *ff)
+static int fuse4fs_create_session(struct fuse4fs *ff, struct fuse_args *args,
+				  struct fuse_cmdline_opts *opts)
 {
-	struct fuse_cmdline_opts opts;
-	struct fuse_session *se;
-	struct fuse_loop_config *loop_config = NULL;
-	int ret;
-
-	if (fuse_parse_cmdline(args, &opts) != 0) {
-		ret = 1;
-		goto out;
-	}
-
 	if (ff->debug)
-		opts.debug = true;
+		opts->debug = true;
 
-	if (opts.show_help) {
+	if (opts->show_help) {
 		fuse_cmdline_help();
-		ret = 0;
-		goto out_free_opts;
+		return 0;
 	}
 
-	if (opts.show_version) {
+	if (opts->show_version) {
 		printf("FUSE library version %s\n", fuse_pkgversion());
-		ret = 0;
-		goto out_free_opts;
+		return 0;
 	}
 
-	if (!opts.mountpoint) {
+	if (!opts->mountpoint) {
 		fprintf(stderr, "error: no mountpoint specified\n");
-		ret = 2;
-		goto out_free_opts;
+		return 2;
 	}
 
-	se = fuse_session_new(args, &fs_ops, sizeof(fs_ops), ff);
-	if (se == NULL) {
-		ret = 3;
-		goto out_free_opts;
+	ff->fuse = fuse_session_new(args, &fs_ops, sizeof(fs_ops), ff);
+	return ff->fuse ? 0 : 3;
+}
+
+static int fuse4fs_event_loop(struct fuse4fs *ff,
+			      struct fuse_loop_config *loop_config,
+			      const struct fuse_cmdline_opts *opts)
+{
+	/*
+	 * Since there's a Big Kernel Lock around all the libext2fs code, we
+	 * only need to start four threads -- one to decode a request, another
+	 * to do the filesystem work, a third to transmit the reply, and a
+	 * fourth to handle fuse notifications.
+	 */
+	fuse_loop_cfg_set_clone_fd(loop_config, opts->clone_fd);
+	fuse_loop_cfg_set_idle_threads(loop_config, opts->max_idle_threads);
+	fuse_loop_cfg_set_max_threads(loop_config, 4);
+
+	return fuse_session_loop_mt(ff->fuse, loop_config) == 0 ? 0 : 8;
+}
+
+static int fuse4fs_main(struct fuse_args *args, struct fuse4fs *ff)
+{
+	struct fuse_cmdline_opts opts;
+	struct fuse_loop_config *loop_config = NULL;
+	int ret;
+
+	if (fuse_parse_cmdline(args, &opts) != 0) {
+		ret = 1;
+		goto out;
 	}
-	ff->fuse = se;
 
-	if (fuse_session_mount(se, opts.mountpoint) != 0) {
+	ret = fuse4fs_create_session(ff, args, &opts);
+	if (ret || !ff->fuse)
+		goto out_free_opts;
+
+	if (fuse_session_mount(ff->fuse, opts.mountpoint) != 0) {
 		ret = 4;
 		goto out_destroy_session;
 	}
@@ -6076,7 +6093,7 @@ static int fuse4fs_main(struct fuse_args *args, struct fuse4fs *ff)
 		close(ff->logfd);
 	ff->logfd = -1;
 
-	if (fuse_set_signal_handlers(se) != 0) {
+	if (fuse_set_signal_handlers(ff->fuse) != 0) {
 		ret = 6;
 		goto out_unmount;
 	}
@@ -6087,30 +6104,16 @@ static int fuse4fs_main(struct fuse_args *args, struct fuse4fs *ff)
 		goto out_remove_signal_handlers;
 	}
 
-	/*
-	 * Since there's a Big Kernel Lock around all the libext2fs code, we
-	 * only need to start four threads -- one to decode a request, another
-	 * to do the filesystem work, a third to transmit the reply, and a
-	 * fourth to handle fuse notifications.
-	 */
-	fuse_loop_cfg_set_clone_fd(loop_config, opts.clone_fd);
-	fuse_loop_cfg_set_idle_threads(loop_config, opts.max_idle_threads);
-	fuse_loop_cfg_set_max_threads(loop_config, 4);
+	ret = fuse4fs_event_loop(ff, loop_config, &opts);
 
-	if (fuse_session_loop_mt(se, loop_config) != 0) {
-		ret = 8;
-		goto out_loopcfg;
-	}
-
-out_loopcfg:
 	fuse_loop_cfg_destroy(loop_config);
 out_remove_signal_handlers:
-	fuse_remove_signal_handlers(se);
+	fuse_remove_signal_handlers(ff->fuse);
 out_unmount:
-	fuse_session_unmount(se);
+	fuse_session_unmount(ff->fuse);
 out_destroy_session:
+	fuse_session_destroy(ff->fuse);
 	ff->fuse = NULL;
-	fuse_session_destroy(se);
 out_free_opts:
 	free(opts.mountpoint);
 out:


^ permalink raw reply related

* [PATCH 05/10] libext2fs: bump libfuse API version to 3.19
From: Darrick J. Wong @ 2026-04-22 23:24 UTC (permalink / raw)
  To: tytso
  Cc: linux-fsdevel, fuse-devel, linux-ext4, neal, joannelkoong, miklos,
	bernd
In-Reply-To: <177689989498.3821326.15497525132012299039.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

The fuse service container API is only available in 3.19, so we need to
bump FUSE_USE_VERSION up from 3.14 to 3.19.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 configure    |    8 ++++----
 configure.ac |   10 +++++-----
 2 files changed, 9 insertions(+), 9 deletions(-)


diff --git a/configure b/configure
index a9779412174b34..59413d5fc32e83 100755
--- a/configure
+++ b/configure
@@ -14598,14 +14598,14 @@ fi
 
 if test -n "$FUSE_LIB"
 then
-	FUSE_USE_VERSION=314
+	FUSE_USE_VERSION=319
 	CFLAGS="$fuse3_CFLAGS $CFLAGS"
 	FUSE_LIB="$fuse3_LIBS"
 	       for ac_header in pthread.h fuse.h
 do :
   as_ac_Header=`printf "%s\n" "ac_cv_header_$ac_header" | sed "$as_sed_sh"`
 ac_fn_c_check_header_compile "$LINENO" "$ac_header" "$as_ac_Header" "#define _FILE_OFFSET_BITS	64
-#define FUSE_USE_VERSION	314
+#define FUSE_USE_VERSION	319
 "
 if eval test \"x\$"$as_ac_Header"\" = x"yes"
 then :
@@ -14640,7 +14640,7 @@ printf %s "checking for lowlevel interface in libfuse... " >&6; }
 
 	#define _GNU_SOURCE
 	#define _FILE_OFFSET_BITS	64
-	#define FUSE_USE_VERSION	314
+	#define FUSE_USE_VERSION	319
 	#include <fuse_lowlevel.h>
 
 int
@@ -14818,7 +14818,7 @@ printf %s "checking for cache_readdir support in libfuse... " >&6; }
 
 	#define _GNU_SOURCE
 	#define _FILE_OFFSET_BITS	64
-	#define FUSE_USE_VERSION	314
+	#define FUSE_USE_VERSION	319
 	#include <fuse.h>
 
 int
diff --git a/configure.ac b/configure.ac
index b62553e3d6b946..c7d18dd9988db4 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1383,17 +1383,17 @@ AC_SUBST(FUSE_LIB)
 
 dnl
 dnl Set FUSE_USE_VERSION, which is how fuse servers build against a particular
-dnl libfuse ABI.  Currently we link against the libfuse 3.14 ABI (hence 314)
+dnl libfuse ABI.  Currently we link against the libfuse 3.19 ABI (hence 319)
 dnl
 if test -n "$FUSE_LIB"
 then
-	FUSE_USE_VERSION=314
+	FUSE_USE_VERSION=319
 	CFLAGS="$fuse3_CFLAGS $CFLAGS"
 	FUSE_LIB="$fuse3_LIBS"
 	AC_CHECK_HEADERS([pthread.h fuse.h], [],
 		[AC_MSG_FAILURE([Cannot build against fuse3 headers])],
 [#define _FILE_OFFSET_BITS	64
-#define FUSE_USE_VERSION	314])
+#define FUSE_USE_VERSION	319])
 fi
 if test -n "$FUSE_USE_VERSION"
 then
@@ -1412,7 +1412,7 @@ then
 	[	AC_LANG_PROGRAM([[
 	#define _GNU_SOURCE
 	#define _FILE_OFFSET_BITS	64
-	#define FUSE_USE_VERSION	314
+	#define FUSE_USE_VERSION	319
 	#include <fuse_lowlevel.h>
 		]], [[
 	struct fuse_lowlevel_ops fs_ops = { };
@@ -1512,7 +1512,7 @@ then
 	[	AC_LANG_PROGRAM([[
 	#define _GNU_SOURCE
 	#define _FILE_OFFSET_BITS	64
-	#define FUSE_USE_VERSION	314
+	#define FUSE_USE_VERSION	319
 	#include <fuse.h>
 		]], [[
 	struct fuse_file_info fs_ops = {


^ permalink raw reply related

* [PATCH 04/10] libext2fs: fix MMP code to work with unixfd IO manager
From: Darrick J. Wong @ 2026-04-22 23:24 UTC (permalink / raw)
  To: tytso
  Cc: linux-fsdevel, fuse-devel, linux-ext4, neal, joannelkoong, miklos,
	bernd
In-Reply-To: <177689989498.3821326.15497525132012299039.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

The MMP code wants to be able to read and write the MMP block directly
to storage so that the pagecache does not get in the way.  This is
critical for correct operation of MMP, because it is guarding against
two cluster nodes trying to change the filesystem at the same time.

Unfortunately there's no convenient way to tell an IO manager to perform
a particular IO in directio mode, so the MMP code open()s the filesystem
source device a second time so that it can set O_DIRECT and maintain its
own file position independently of the IO channel.  This is a gross
layering violation.

For unprivileged containerized fuse4fs, we're going to have a privileged
mount helper pass us the fd to the block device, so we'll be using the
unixfd IO manager.  The enhanced security posture provided by the
service definition file (minimal /dev) means that we cannot reopen the
source device.  In this case, MMP can only duplicate the fd and use the
IO channel carefully.

Fix this (sort of) by detecting the unixfd IO manager and duplicating
the open fd if it's in use.  This adds a requirement that the unixfd
originally be opened in O_DIRECT mode if the filesystem is on a block
device, but that's the best we can do here.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 lib/ext2fs/ext2fs.h  |    1 +
 lib/ext2fs/ext2fsP.h |    4 ++
 lib/ext2fs/mmp.c     |   95 +++++++++++++++++++++++++++++++++++++++++++++++++-
 lib/ext2fs/unix_io.c |    2 +
 4 files changed, 100 insertions(+), 2 deletions(-)


diff --git a/lib/ext2fs/ext2fs.h b/lib/ext2fs/ext2fs.h
index c4fcb10bea0fb9..02c3cbcea92482 100644
--- a/lib/ext2fs/ext2fs.h
+++ b/lib/ext2fs/ext2fs.h
@@ -225,6 +225,7 @@ typedef struct ext2_file *ext2_file_t;
  * Internal flags for use by the ext2fs library only
  */
 #define EXT2_FLAG2_USE_FAKE_TIME	0x000000001
+#define EXT2_FLAG2_MMP_USE_IOCHANNEL	0x000000002
 
 /*
  * Special flag in the ext2 inode i_flag field that means that this is
diff --git a/lib/ext2fs/ext2fsP.h b/lib/ext2fs/ext2fsP.h
index 428081c9e2ff38..bdc92991e7dda0 100644
--- a/lib/ext2fs/ext2fsP.h
+++ b/lib/ext2fs/ext2fsP.h
@@ -218,3 +218,7 @@ errcode_t ext2fs_remove_exit_fn(ext2_exit_fn fn, void *data);
         (sizeof(array) / sizeof(array[0]))
 
 #define EXT2FS_BUILD_BUG_ON(cond) ((void)sizeof(char[1 - 2*!!(cond)]))
+
+#ifndef _WIN32
+int possible_unixfd_pathname(const char *path);
+#endif
diff --git a/lib/ext2fs/mmp.c b/lib/ext2fs/mmp.c
index cb15a18fce5547..188cdb68900e97 100644
--- a/lib/ext2fs/mmp.c
+++ b/lib/ext2fs/mmp.c
@@ -26,9 +26,11 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>
+#include <limits.h>
 
 #include "ext2fs/ext2_fs.h"
 #include "ext2fs/ext2fs.h"
+#include "ext2fs/ext2fsP.h"
 
 #ifndef O_DIRECT
 #define O_DIRECT 0
@@ -48,6 +50,86 @@ errcode_t ext2fs_mmp_get_mem(ext2_filsys fs, void **ptr)
 	return ext2fs_get_memalign(fs->blocksize, align, ptr);
 }
 
+#ifdef _WIN32
+static int ext2fs_mmp_open_device(ext2_filsys fs, int flags)
+{
+	return open(fs->device_name, flags);
+}
+#else
+static int ext2fs_mmp_open_device(ext2_filsys fs, int flags)
+{
+	struct stat stbuf;
+	char path[64];
+	int maybe_fd = -1;
+	int new_fd;
+	int ret;
+	errcode_t retval = 0;
+
+	/*
+	 * If we can't possibly be using the unixfd IO manager, open the device
+	 * a second time, which is the historical behavior.  This is a huge
+	 * and historic layering violation!
+	 *
+	 * It's also broken if the unixfd IO manager was passed a string with a
+	 * file descriptor number instead of a /dev/fd/XX path, but the
+	 * internet thinks there are no users of the manager outside of Google.
+	 */
+	if (!possible_unixfd_pathname(fs->device_name))
+		return open(fs->device_name, flags);
+
+	/*
+	 * Try to get the fd of the open block device.  If this fails for any
+	 * reason, fall back to the classic open path.
+	 */
+	retval = io_channel_get_fd(fs->io, &maybe_fd);
+	if (retval || maybe_fd < 0)
+		return open(fs->device_name, flags);
+
+	/*
+	 * We extracted the fd from the IO manager.
+	 *
+	 * Skip directio if this is a regular file, just as ext2fs_mmp_read does.
+	 * Note that the O_DIRECT-clearing logic in the caller might not have
+	 * cleared the bit because it is path based.
+	 */
+	if (fstat(maybe_fd, &stbuf) == 0 && S_ISREG(stbuf.st_mode))
+		flags &= ~O_DIRECT;
+
+	/*
+	 * Try to reopen the same file descriptor, but with the new mode flags.
+	 * If that works then we're done.  Note that these magic symlinks do
+	 * not have to resolve anywhere.
+	 */
+	snprintf(path, sizeof(path), "/dev/fd/%d", maybe_fd);
+	new_fd = open(path, flags);
+	if (new_fd >= 0)
+		return new_fd;
+
+	/*
+	 * Reopening didn't work.  Instead, duplicate the file descriptor and
+	 * check that we actually got directio if that's required.  Note that
+	 * we can't change the mode on the IO channel's fd because we already
+	 * set it up for buffered IO.
+	 */
+	new_fd = dup(maybe_fd);
+	if (flags & O_DIRECT) {
+		ret = fcntl(new_fd, F_GETFL);
+		if (ret < 0 || !(ret & O_DIRECT)) {
+			close(new_fd);
+			return -1;
+		}
+	}
+
+	/*
+	 * The MMP fd shadows the io channel fd, so we must go through the io
+	 * channel for all MMP block accesses because the two fds share the same
+	 * file position and O_DIRECT state, and the iochannel must know about that.
+	 */
+	fs->flags2 |= EXT2_FLAG2_MMP_USE_IOCHANNEL;
+	return new_fd;
+}
+#endif
+
 errcode_t ext2fs_mmp_read(ext2_filsys fs, blk64_t mmp_blk, void *buf)
 {
 #ifdef CONFIG_MMP
@@ -77,7 +159,7 @@ errcode_t ext2fs_mmp_read(ext2_filsys fs, blk64_t mmp_blk, void *buf)
 		    S_ISREG(st.st_mode))
 			flags &= ~O_DIRECT;
 
-		fs->mmp_fd = open(fs->device_name, flags);
+		fs->mmp_fd = ext2fs_mmp_open_device(fs, flags);
 		if (fs->mmp_fd < 0) {
 			retval = EXT2_ET_MMP_OPEN_DIRECT;
 			goto out;
@@ -90,6 +172,15 @@ errcode_t ext2fs_mmp_read(ext2_filsys fs, blk64_t mmp_blk, void *buf)
 			return retval;
 	}
 
+	if (fs->flags2 & EXT2_FLAG2_MMP_USE_IOCHANNEL) {
+		retval = io_channel_read_blk64(fs->io, mmp_blk, -fs->blocksize,
+					       fs->mmp_cmp);
+		if (retval)
+			return retval;
+
+		goto read_compare;
+	}
+
 	if ((blk64_t) ext2fs_llseek(fs->mmp_fd, mmp_blk * fs->blocksize,
 				    SEEK_SET) !=
 	    mmp_blk * fs->blocksize) {
@@ -102,6 +193,7 @@ errcode_t ext2fs_mmp_read(ext2_filsys fs, blk64_t mmp_blk, void *buf)
 		goto out;
 	}
 
+read_compare:
 	mmp_cmp = fs->mmp_cmp;
 
 	if (!(fs->flags & EXT2_FLAG_IGNORE_CSUM_ERRORS) &&
@@ -428,6 +520,7 @@ errcode_t ext2fs_mmp_stop(ext2_filsys fs)
 
 mmp_error:
 	if (fs->mmp_fd >= 0) {
+		fs->flags2 &= ~EXT2_FLAG2_MMP_USE_IOCHANNEL;
 		close(fs->mmp_fd);
 		fs->mmp_fd = -1;
 	}
diff --git a/lib/ext2fs/unix_io.c b/lib/ext2fs/unix_io.c
index a9b1fac62a0250..567bbd9493f7f1 100644
--- a/lib/ext2fs/unix_io.c
+++ b/lib/ext2fs/unix_io.c
@@ -1152,7 +1152,7 @@ static errcode_t unix_open_channel(const char *name, int fd,
 #define DEV_FD_PATH	"/dev/fd/"
 #define DEV_FD_PATHLEN	(sizeof(DEV_FD_PATH) - 1)
 
-static int possible_unixfd_pathname(const char *path)
+int possible_unixfd_pathname(const char *path)
 {
 	return strncmp(DEV_FD_PATH, path, DEV_FD_PATHLEN) == 0;
 }


^ permalink raw reply related

* [PATCH 03/10] unix_io: allow passing /dev/fd/XXX paths to the unixfd IO manager
From: Darrick J. Wong @ 2026-04-22 23:24 UTC (permalink / raw)
  To: tytso
  Cc: linux-ext4, linux-fsdevel, fuse-devel, linux-ext4, neal,
	joannelkoong, miklos, bernd
In-Reply-To: <177689989498.3821326.15497525132012299039.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

Commit 4ccf9e4fe165cf created a "unixfd" IO manager that allows someone
to choose the unixfd IO manager and then mount a filesystem from an
existing file descriptor by passing a string with the fd number as the
"device" name to ext2fs_open().

That was an unfortunate choice of naming, however, because that could
be mistaken for a relative path to a file whose name is an integer
number.  Let's improve this by allowing callers to pass /dev/fd/XX
as the filesystem device name.  The upcoming fuse4fs service patches
will employ this method to open a filesystem on a block device fd passed
into the secure container from a mount helper.

Cc: <linux-ext4@vger.kernel.org> # v1.43.2
Fixes: 4ccf9e4fe165cf ("libext2fs: add unixfd_io_manager")
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 lib/ext2fs/unix_io.c |   30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)


diff --git a/lib/ext2fs/unix_io.c b/lib/ext2fs/unix_io.c
index 79bc9219f9515b..a9b1fac62a0250 100644
--- a/lib/ext2fs/unix_io.c
+++ b/lib/ext2fs/unix_io.c
@@ -67,6 +67,7 @@
 #ifdef HAVE_SYS_FILE_H
 #include <sys/file.h>
 #endif
+#include <limits.h>
 
 #if defined(__linux__) && defined(_IO) && !defined(BLKROGET)
 #define BLKROGET   _IO(0x12, 94) /* Get read-only status (0 = read_write).  */
@@ -1148,13 +1149,40 @@ static errcode_t unix_open_channel(const char *name, int fd,
 	return retval;
 }
 
+#define DEV_FD_PATH	"/dev/fd/"
+#define DEV_FD_PATHLEN	(sizeof(DEV_FD_PATH) - 1)
+
+static int possible_unixfd_pathname(const char *path)
+{
+	return strncmp(DEV_FD_PATH, path, DEV_FD_PATHLEN) == 0;
+}
+
 static errcode_t unixfd_open(const char *str_fd, int flags,
 			     io_channel *channel)
 {
 	int fd;
 	int fd_flags;
 
-	fd = atoi(str_fd);
+	/*
+	 * The caller should provide a path in the form "/dev/fd/XX",
+	 * but the shorthand form "XX" is allowed for legacy reasons.
+	 */
+	if (possible_unixfd_pathname(str_fd)) {
+		char *endptr;
+		long maybe_fd;
+
+		errno = 0;
+		maybe_fd = strtol(str_fd + DEV_FD_PATHLEN, &endptr, 10);
+		if (errno)
+			return errno;
+		if (*endptr != 0)
+			return EINVAL;
+		if (maybe_fd < 0 || maybe_fd > INT_MAX)
+			return EINVAL;
+		fd = maybe_fd;
+	} else {
+		fd = atoi(str_fd);
+	}
 #if defined(HAVE_FCNTL)
 	fd_flags = fcntl(fd, F_GETFL);
 	if (fd_flags == -1)


^ permalink raw reply related

* [PATCH 02/10] libext2fs: fix checking for valid fds in mmp.c
From: Darrick J. Wong @ 2026-04-22 23:24 UTC (permalink / raw)
  To: tytso
  Cc: linux-ext4, linux-fsdevel, fuse-devel, linux-ext4, neal,
	joannelkoong, miklos, bernd
In-Reply-To: <177689989498.3821326.15497525132012299039.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

File descriptors are non-negative numbers, which means that 0 is a valid
fd.  Fix the code to be consistent with Unix behaviors.

Cc: <linux-ext4@vger.kernel.org> # v1.42
Fixes: 0f5eba7501f467 ("ext2fs: add multi-mount protection (INCOMPAT_MMP)")
Fixes: 76a6c8788c79e4 ("mmp: do not use O_DIRECT when working with regular file")
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 lib/ext2fs/mmp.c    |    6 +++---
 lib/ext2fs/openfs.c |    1 +
 2 files changed, 4 insertions(+), 3 deletions(-)


diff --git a/lib/ext2fs/mmp.c b/lib/ext2fs/mmp.c
index e2823732e2b6a2..cb15a18fce5547 100644
--- a/lib/ext2fs/mmp.c
+++ b/lib/ext2fs/mmp.c
@@ -59,11 +59,11 @@ errcode_t ext2fs_mmp_read(ext2_filsys fs, blk64_t mmp_blk, void *buf)
 		return EXT2_ET_MMP_BAD_BLOCK;
 
 	/* ext2fs_open() reserves fd0,1,2 to avoid stdio collision, so checking
-	 * mmp_fd <= 0 is OK to validate that the fd is valid.  This opens its
+	 * mmp_fd < 0 is OK to validate that the fd is valid.  This opens its
 	 * own fd to read the MMP block to ensure that it is using O_DIRECT,
 	 * regardless of how the io_manager is doing reads, to avoid caching of
 	 * the MMP block by the io_manager or the VM.  It needs to be fresh. */
-	if (fs->mmp_fd <= 0) {
+	if (fs->mmp_fd < 0) {
 		struct stat st;
 		int flags = O_RDONLY | O_DIRECT;
 
@@ -427,7 +427,7 @@ errcode_t ext2fs_mmp_stop(ext2_filsys fs)
 	retval = ext2fs_mmp_write(fs, fs->super->s_mmp_block, fs->mmp_cmp);
 
 mmp_error:
-	if (fs->mmp_fd > 0) {
+	if (fs->mmp_fd >= 0) {
 		close(fs->mmp_fd);
 		fs->mmp_fd = -1;
 	}
diff --git a/lib/ext2fs/openfs.c b/lib/ext2fs/openfs.c
index 2b8e0e753c46e8..41359d15740881 100644
--- a/lib/ext2fs/openfs.c
+++ b/lib/ext2fs/openfs.c
@@ -148,6 +148,7 @@ errcode_t ext2fs_open2(const char *name, const char *io_options,
 	/* don't overwrite sb backups unless flag is explicitly cleared */
 	fs->flags |= EXT2_FLAG_MASTER_SB_ONLY;
 	fs->umask = 022;
+	fs->mmp_fd = -1;
 
 	time_env = ext2fs_safe_getenv("SOURCE_DATE_EPOCH");
 	if (time_env) {


^ permalink raw reply related

* [PATCH 01/10] libext2fs: make it possible to extract the fd from an IO manager
From: Darrick J. Wong @ 2026-04-22 23:23 UTC (permalink / raw)
  To: tytso
  Cc: linux-fsdevel, fuse-devel, linux-ext4, neal, joannelkoong, miklos,
	bernd
In-Reply-To: <177689989498.3821326.15497525132012299039.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

Make it so that we can extract the fd from an open IO manager.  This
will be used in subsequent patches to register the open block device
with the fuse iomap kernel driver.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 lib/ext2fs/ext2_io.h         |    4 +++-
 debian/libext2fs2t64.symbols |    1 +
 lib/ext2fs/io_manager.c      |    8 ++++++++
 lib/ext2fs/unix_io.c         |   20 ++++++++++++++++++++
 4 files changed, 32 insertions(+), 1 deletion(-)


diff --git a/lib/ext2fs/ext2_io.h b/lib/ext2fs/ext2_io.h
index 61865d54d82490..c880ea2524f248 100644
--- a/lib/ext2fs/ext2_io.h
+++ b/lib/ext2fs/ext2_io.h
@@ -103,7 +103,8 @@ struct struct_io_manager {
 	errcode_t (*zeroout)(io_channel channel, unsigned long long block,
 			     unsigned long long count);
 	errcode_t (*flock)(io_channel channel, unsigned int flock_flags);
-	long	reserved[13];
+	errcode_t (*get_fd)(io_channel channel, int *fd);
+	long	reserved[12];
 };
 
 #define IO_FLAG_RW		0x0001
@@ -155,6 +156,7 @@ extern errcode_t io_channel_cache_readahead(io_channel io,
 					    unsigned long long count);
 extern errcode_t io_channel_flock(io_channel io, unsigned int flock_flags);
 extern errcode_t io_channel_funlock(io_channel io);
+extern errcode_t io_channel_get_fd(io_channel io, int *fd);
 
 #ifdef _WIN32
 /* windows_io.c */
diff --git a/debian/libext2fs2t64.symbols b/debian/libext2fs2t64.symbols
index affe4c27d4e791..555fbbb0c98878 100644
--- a/debian/libext2fs2t64.symbols
+++ b/debian/libext2fs2t64.symbols
@@ -701,6 +701,7 @@ libext2fs.so.2 libext2fs2t64 #MINVER#
  io_channel_discard@Base 1.42
  io_channel_flock@Base 1.47.99
  io_channel_funlock@Base 1.47.99
+ io_channel_get_fd@Base 1.47.99
  io_channel_read_blk64@Base 1.41.1
  io_channel_set_options@Base 1.37
  io_channel_write_blk64@Base 1.41.1
diff --git a/lib/ext2fs/io_manager.c b/lib/ext2fs/io_manager.c
index 791ec7d14adbba..dff3d73552827f 100644
--- a/lib/ext2fs/io_manager.c
+++ b/lib/ext2fs/io_manager.c
@@ -166,3 +166,11 @@ errcode_t io_channel_funlock(io_channel io)
 
 	return io->manager->flock(io, 0);
 }
+
+errcode_t io_channel_get_fd(io_channel io, int *fd)
+{
+	if (!io->manager->get_fd)
+		return EXT2_ET_OP_NOT_SUPPORTED;
+
+	return io->manager->get_fd(io, fd);
+}
diff --git a/lib/ext2fs/unix_io.c b/lib/ext2fs/unix_io.c
index f4307db0fb2b05..79bc9219f9515b 100644
--- a/lib/ext2fs/unix_io.c
+++ b/lib/ext2fs/unix_io.c
@@ -1786,6 +1786,24 @@ static errcode_t unix_zeroout(io_channel channel, unsigned long long block,
 unimplemented:
 	return EXT2_ET_UNIMPLEMENTED;
 }
+
+static errcode_t unix_get_fd(io_channel channel, int *fd)
+{
+	struct unix_private_data *data;
+
+	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+	data = (struct unix_private_data *) channel->private_data;
+	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
+
+	if (data->offset) {
+		*fd = -1;
+		return EINVAL;
+	}
+
+	*fd = data->dev;
+	return 0;
+}
+
 #if __GNUC_PREREQ (4, 6)
 #pragma GCC diagnostic pop
 #endif
@@ -1808,6 +1826,7 @@ static struct struct_io_manager struct_unix_manager = {
 	.cache_readahead	= unix_cache_readahead,
 	.zeroout	= unix_zeroout,
 	.flock		= unix_flock,
+	.get_fd		= unix_get_fd,
 };
 
 io_manager unix_io_manager = &struct_unix_manager;
@@ -1830,6 +1849,7 @@ static struct struct_io_manager struct_unixfd_manager = {
 	.cache_readahead	= unix_cache_readahead,
 	.zeroout	= unix_zeroout,
 	.flock		= unix_flock,
+	.get_fd		= unix_get_fd,
 };
 
 io_manager unixfd_io_manager = &struct_unixfd_manager;


^ permalink raw reply related

* [PATCH 3/3] libext2fs: only fsync the unix fd if we wrote to the device
From: Darrick J. Wong @ 2026-04-22 23:23 UTC (permalink / raw)
  To: tytso; +Cc: linux-ext4
In-Reply-To: <177689989303.3821152.12873703999139555043.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

As an optimization, only fsync the block device fd if we tried to write
to the io channel.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 lib/ext2fs/unix_io.c |   86 +++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 67 insertions(+), 19 deletions(-)


diff --git a/lib/ext2fs/unix_io.c b/lib/ext2fs/unix_io.c
index 15d6d55ff7fdd4..f4307db0fb2b05 100644
--- a/lib/ext2fs/unix_io.c
+++ b/lib/ext2fs/unix_io.c
@@ -132,10 +132,13 @@ struct unix_cache {
 #define WRITE_DIRECT_SIZE 4	/* Must be smaller than CACHE_SIZE */
 #define READ_DIRECT_SIZE 4	/* Should be smaller than CACHE_SIZE */
 
+#define UNIX_STATE_DIRTY	(1U << 0) /* device needs fsyncing */
+
 struct unix_private_data {
 	int	magic;
 	int	dev;
 	int	flags;
+	unsigned int	state; /* UNIX_STATE_* */
 	int	align;
 	int	access_time;
 	int	unix_flock_flags;
@@ -1198,10 +1201,65 @@ static errcode_t unix_open(const char *name, int flags,
 	return unix_open_channel(name, fd, flags, channel, unix_io_manager);
 }
 
+#ifdef HAVE_FSYNC
+static void mark_dirty(io_channel channel)
+{
+	struct unix_private_data *data =
+		(struct unix_private_data *) channel->private_data;
+
+	mutex_lock(data, CACHE_MTX);
+	data->state |= UNIX_STATE_DIRTY;
+	mutex_unlock(data, CACHE_MTX);
+}
+
+static errcode_t maybe_fsync(io_channel channel, int force_fsync)
+{
+	struct unix_private_data *data =
+		(struct unix_private_data *) channel->private_data;
+	int need_fsync;
+	errcode_t retval = 0;
+
+#ifndef NO_IO_CACHE
+	retval = flush_cached_blocks(channel, data, 0);
+#endif
+
+	mutex_lock(data, CACHE_MTX);
+	need_fsync = force_fsync || (data->state & UNIX_STATE_DIRTY);
+	data->state &= ~UNIX_STATE_DIRTY;
+	mutex_unlock(data, CACHE_MTX);
+
+	if (need_fsync && fsync(data->dev) != 0) {
+		if (!retval)
+			retval = errno;
+	}
+	if (retval) {
+		/* redirty because writeback failed */
+		mark_dirty(channel);
+		return retval;
+	}
+
+	return 0;
+}
+#else
+# define mark_dirty(...)		((void)0)
+
+static errcode_t maybe_fsync(io_channel channel, int force_fsync)
+{
+	struct unix_private_data *data =
+		(struct unix_private_data *) channel->private_data;
+	errcode_t retval = 0;
+
+#ifndef NO_IO_CACHE
+	retval = flush_cached_blocks(channel, data, 0);
+#endif
+	return retval;
+}
+#endif
+
 static errcode_t unix_close(io_channel channel)
 {
 	struct unix_private_data *data;
-	errcode_t	retval = 0;
+	errcode_t	retval;
 
 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
 	data = (struct unix_private_data *) channel->private_data;
@@ -1210,14 +1268,7 @@ static errcode_t unix_close(io_channel channel)
 	if (--channel->refcount > 0)
 		return 0;
 
-#ifndef NO_IO_CACHE
-	retval = flush_cached_blocks(channel, data, 0);
-#endif
-#ifdef HAVE_FSYNC
-	/* always fsync the device, even if flushing our own cache failed */
-	if (fsync(data->dev) != 0 && !retval)
-		retval = errno;
-#endif
+	retval = maybe_fsync(channel, 1);
 
 	unix_funlock(channel);
 
@@ -1388,6 +1439,8 @@ static errcode_t unix_write_blk64(io_channel channel, unsigned long long block,
 	data = (struct unix_private_data *) channel->private_data;
 	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
 
+	mark_dirty(channel);
+
 #ifdef NO_IO_CACHE
 	return raw_write_blk(channel, data, block, count, buf, 0);
 #else
@@ -1512,6 +1565,8 @@ static errcode_t unix_write_byte(io_channel channel, unsigned long offset,
 	if (lseek(data->dev, offset + data->offset, SEEK_SET) < 0)
 		return errno;
 
+	mark_dirty(channel);
+
 	actual = write(data->dev, buf, size);
 	if (actual < 0)
 		return errno;
@@ -1527,21 +1582,12 @@ static errcode_t unix_write_byte(io_channel channel, unsigned long offset,
 static errcode_t unix_flush(io_channel channel)
 {
 	struct unix_private_data *data;
-	errcode_t retval = 0;
 
 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
 	data = (struct unix_private_data *) channel->private_data;
 	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
 
-#ifndef NO_IO_CACHE
-	retval = flush_cached_blocks(channel, data, 0);
-#endif
-#ifdef HAVE_FSYNC
-	/* always fsync the device, even if flushing our own cache failed */
-	if (fsync(data->dev) != 0 && !retval)
-		return errno;
-#endif
-	return retval;
+	return maybe_fsync(channel, 0);
 }
 
 static errcode_t unix_set_option(io_channel channel, const char *option,
@@ -1653,6 +1699,7 @@ static errcode_t unix_discard(io_channel channel, unsigned long long block,
 		}
 		return errno;
 	}
+	mark_dirty(channel);
 	return 0;
 unimplemented:
 	return EXT2_ET_UNIMPLEMENTED;
@@ -1734,6 +1781,7 @@ static errcode_t unix_zeroout(io_channel channel, unsigned long long block,
 		}
 		return errno;
 	}
+	mark_dirty(channel);
 	return 0;
 unimplemented:
 	return EXT2_ET_UNIMPLEMENTED;


^ permalink raw reply related

* [PATCH 2/3] libext2fs: always fsync the device when closing the unix IO manager
From: Darrick J. Wong @ 2026-04-22 23:23 UTC (permalink / raw)
  To: tytso; +Cc: linux-ext4
In-Reply-To: <177689989303.3821152.12873703999139555043.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

unix_close is the last chance that libext2fs has to report write
failures to users.  Although it's likely that ext2fs_close already
called ext2fs_flush and told the IO manager to flush, we could do one
more sync before we close the file descriptor.  Also don't override the
fsync's errno with the close's errno.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 lib/ext2fs/unix_io.c |    8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)


diff --git a/lib/ext2fs/unix_io.c b/lib/ext2fs/unix_io.c
index b6feebef93fa5b..15d6d55ff7fdd4 100644
--- a/lib/ext2fs/unix_io.c
+++ b/lib/ext2fs/unix_io.c
@@ -1213,10 +1213,16 @@ static errcode_t unix_close(io_channel channel)
 #ifndef NO_IO_CACHE
 	retval = flush_cached_blocks(channel, data, 0);
 #endif
+#ifdef HAVE_FSYNC
+	/* always fsync the device, even if flushing our own cache failed */
+	if (fsync(data->dev) != 0 && !retval)
+		retval = errno;
+#endif
 
 	unix_funlock(channel);
 
-	if (channel->manager != unixfd_io_manager && close(data->dev) < 0)
+	if (channel->manager != unixfd_io_manager && close(data->dev) < 0 &&
+	    !retval)
 		retval = errno;
 	free_cache(data);
 	free(data->cache);


^ permalink raw reply related

* [PATCH 1/3] libext2fs: always fsync the device when flushing the cache
From: Darrick J. Wong @ 2026-04-22 23:23 UTC (permalink / raw)
  To: tytso; +Cc: linux-ext4
In-Reply-To: <177689989303.3821152.12873703999139555043.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

When we're flushing the unix IO manager's buffer cache, we should always
fsync the block device, because something could have written to the
block device -- either the buffer cache itself, or a direct write.
Regardless, the callers all want all dirtied regions to be persisted to
stable media.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 lib/ext2fs/unix_io.c |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)


diff --git a/lib/ext2fs/unix_io.c b/lib/ext2fs/unix_io.c
index abd33ba839f7e9..b6feebef93fa5b 100644
--- a/lib/ext2fs/unix_io.c
+++ b/lib/ext2fs/unix_io.c
@@ -1531,7 +1531,8 @@ static errcode_t unix_flush(io_channel channel)
 	retval = flush_cached_blocks(channel, data, 0);
 #endif
 #ifdef HAVE_FSYNC
-	if (!retval && fsync(data->dev) != 0)
+	/* always fsync the device, even if flushing our own cache failed */
+	if (fsync(data->dev) != 0 && !retval)
 		return errno;
 #endif
 	return retval;


^ permalink raw reply related

* [PATCHSET v5 2/2] fuse4fs: run servers as a contained service
From: Darrick J. Wong @ 2026-04-22 23:19 UTC (permalink / raw)
  To: tytso
  Cc: linux-ext4, linux-fsdevel, fuse-devel, linux-ext4, neal,
	joannelkoong, miklos, bernd
In-Reply-To: <20260422231518.GA7717@frogsfrogsfrogs>

Hi all,

This series packages the newly created fuse4fs server into a systemd
socket service.  This service can be used by the "mount.service" helper
in libfuse to implement untrusted unprivileged mounts.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

Comments and questions are, as always, welcome.

e2fsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/e2fsprogs.git/log/?h=fuse4fs-service-container
---
Commits in this patchset:
 * libext2fs: make it possible to extract the fd from an IO manager
 * libext2fs: fix checking for valid fds in mmp.c
 * unix_io: allow passing /dev/fd/XXX paths to the unixfd IO manager
 * libext2fs: fix MMP code to work with unixfd IO manager
 * libext2fs: bump libfuse API version to 3.19
 * fuse4fs: hoist some code out of fuse4fs_main
 * fuse4fs: enable safe service mode
 * fuse4fs: set proc title when in fuse service mode
 * fuse4fs: make MMP work correctly in safe service mode
 * debian: update packaging for fuse4fs service
---
 lib/ext2fs/ext2_io.h         |    4 
 lib/ext2fs/ext2fs.h          |    1 
 lib/ext2fs/ext2fsP.h         |    4 
 MCONFIG.in                   |    2 
 configure                    |  303 +++++++++++++++++++++++++++
 configure.ac                 |  131 +++++++++++-
 debian/e2fsprogs.install     |    7 +
 debian/fuse4fs.install       |    3 
 debian/libext2fs2t64.symbols |    1 
 debian/rules                 |    3 
 fuse4fs/Makefile.in          |   42 +++-
 fuse4fs/fuse4fs.c            |  466 +++++++++++++++++++++++++++++++++++-------
 fuse4fs/fuse4fs.socket.in    |   17 ++
 fuse4fs/fuse4fs@.service.in  |  102 +++++++++
 lib/config.h.in              |   12 +
 lib/ext2fs/io_manager.c      |    8 +
 lib/ext2fs/mmp.c             |  101 +++++++++
 lib/ext2fs/openfs.c          |    1 
 lib/ext2fs/unix_io.c         |   50 ++++-
 util/subst.conf.in           |    3 
 20 files changed, 1164 insertions(+), 97 deletions(-)
 mode change 100644 => 100755 debian/fuse4fs.install
 create mode 100644 fuse4fs/fuse4fs.socket.in
 create mode 100644 fuse4fs/fuse4fs@.service.in


^ permalink raw reply

* [PATCHSET 1/2] libext2fs: fix some missed fsync calls
From: Darrick J. Wong @ 2026-04-22 23:19 UTC (permalink / raw)
  To: tytso; +Cc: linux-ext4
In-Reply-To: <20260422231518.GA7717@frogsfrogsfrogs>

Hi all,

Fix a few places (like device closing) where we really ought to tell the
block device to flush whatever's dirty to disk, even if we've failed to
flush all our cached buffers out to disk.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

Comments and questions are, as always, welcome.

e2fsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/e2fsprogs.git/log/?h=libext2fs-flushing-fixes
---
Commits in this patchset:
 * libext2fs: always fsync the device when flushing the cache
 * libext2fs: always fsync the device when closing the unix IO manager
 * libext2fs: only fsync the unix fd if we wrote to the device
---
 lib/ext2fs/unix_io.c |   83 ++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 69 insertions(+), 14 deletions(-)


^ permalink raw reply

* [PATCHBOMB v5] fuse/libfuse/e2fsprogs/etc: containerize ext4 for safer operation
From: Darrick J. Wong @ 2026-04-22 23:15 UTC (permalink / raw)
  To: linux-fsdevel, linux-ext4, fuse-devel
  Cc: Miklos Szeredi, Bernd Schubert, Joanne Koong, Theodore Ts'o,
	Neal Gompa, Amir Goldstein, Christian Brauner, demiobenour

Hi everyone,

This *would have been* the eighth public draft of the gigantic patchset
to connect the Linux fuse driver to fs-iomap for regular file IO
operations to and from files whose contents persist to locally attached
storage devices.

However, the previous submission was too large, and I didn't even send
half the patches!  I have therefore split the work into two sections.
This first section covers setting up fuse servers to run as contained
systemd services; I previously sent only the libfuse changes, without
any of the surrounding pieces.  Now I'm ready to send them all.

To summarize this patchbomb: fuse servers can now run as non-root users,
with no privilege, no access to the network or hardware, etc.  The only
connection to the outside is an ephemeral AF_UNIX socket.  The process
on the other end is a helper program that acquires resources and calls
fsmount().

Why would you want to do that?  Most filesystem drivers are seriously
vulnerable to metadata parsing attacks, as syzbot has shown repeatedly
over almost a decade of its existence.  Faulty code can lead to total
kernel compromise, and I think there's a very strong incentive to move
all that parsing out to userspace where we can containerize the fuse
server process.  Runtime filesystem metadata parsing is no longer a
privileged (== risky) operation.

The consequence of a crashed driver is a dead mount, instead of a
crashed or corrupt OS kernel.

Note that contained fuse filesystem servers are no faster than regular
fuse.  The redesign of the fuse IO path via iomap will be the subject of
the second patchbomb.  The containerization code only requires changes
to libfuse and is ready to go today.

Since the seventh submission, I have made the following changes:

1) Added a couple of simple fuse service drivers to the example code

2) Adapted fuservicemount to be runnable as a setuid program so that
unprivileged users can start up a containerized filesystem driver

3) Fixed some endianness handling errors in the socket protocol between
the new mount helper and the fuse server

4) Added a high level fuse_main function so that fuse servers that use
the high level api can containerize without a total rewrite

5) Adapted mount.fuse to call the new mount helper code so that mount -t
fuse.XXX can try to start up a contained server

6) Cleaned up a lot of cppcheck complaints and refactored a bunch of
repetitious code

7) Started using codex to try to find bugs and security problems with
the new mount helper

There are a few unanswered questions:

a. How to integrate with the SYNC_INIT patches that Bernd is working on
merging into libfuse

b. If /any/ of the new fsopen/fsconfig/fsmount/move_mount calls fail,
do we fall back to the old mount syscall?  Even after printing errors?

c. Are there any Linux systems where some inetd implementation can
actually handle AF_UNIX sockets?  Does it make sense to try to do the
service isolation without the convenience of systemd directives?

d. meson/autoconf/cmake are a pain to deal with; hopefully the changes I
made are correct

I have also converted a handful more fuse servers (fat, exfat, iso,
http) to the new service architecture so that I can run a (virtual)
Debian system with EFI completely off of containerized fuse servers.
These will be sent at the end.

libfuse:
https://git.kernel.org/pub/scm/linux/kernel/git/djwong/libfuse.git/log/?h=fuse-service-container_2026-04-22

e2fsprogs:
https://git.kernel.org/pub/scm/linux/kernel/git/djwong/e2fsprogs.git/log/?h=fuse4fs-service-container_2026-04-22

fstests:
https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfstests-dev.git/log/?h=fuse2fs_2026-04-22

--Darrick

Unreviewed patches in this patchbomb:

[PATCHSET v5] libfuse: run fuse servers as a contained service
  [PATCH 02/13] mount_service: add systemd socket service mounting
  [PATCH 03/13] mount_service: create high level fuse helpers
  [PATCH 04/13] mount_service: use the new mount api for the mount
  [PATCH 05/13] mount_service: update mtab after a successful mount
  [PATCH 06/13] util: hoist the fuse.conf parsing and setuid mode
  [PATCH 07/13] util: fix checkpatch complaints in fuser_conf.[ch]
  [PATCH 08/13] mount_service: enable unprivileged users in a similar
  [PATCH 09/13] mount.fuse3: integrate systemd service startup
  [PATCH 10/13] mount_service: allow installation as a setuid program
  [PATCH 11/13] example/service_ll: create a sample systemd service
  [PATCH 12/13] example/service: create a sample systemd service for a
  [PATCH 13/13] nullfs: support fuse systemd service mode
[PATCHSET 1/2] libext2fs: fix some missed fsync calls
  [PATCH 1/3] libext2fs: always fsync the device when flushing the
  [PATCH 2/3] libext2fs: always fsync the device when closing the unix
  [PATCH 3/3] libext2fs: only fsync the unix fd if we wrote to the
[PATCHSET v5 2/2] fuse4fs: run servers as a contained service
  [PATCH 01/10] libext2fs: make it possible to extract the fd from an
  [PATCH 02/10] libext2fs: fix checking for valid fds in mmp.c
  [PATCH 03/10] unix_io: allow passing /dev/fd/XXX paths to the unixfd
  [PATCH 04/10] libext2fs: fix MMP code to work with unixfd IO manager
  [PATCH 05/10] libext2fs: bump libfuse API version to 3.19
  [PATCH 06/10] fuse4fs: hoist some code out of fuse4fs_main
  [PATCH 07/10] fuse4fs: enable safe service mode
  [PATCH 08/10] fuse4fs: set proc title when in fuse service mode
  [PATCH 09/10] fuse4fs: make MMP work correctly in safe service mode
  [PATCH 10/10] debian: update packaging for fuse4fs service

^ permalink raw reply

* Re: [PATCH v8 03/22] ovl: use core fsverity ensure info interface
From: Eric Biggers @ 2026-04-22 22:46 UTC (permalink / raw)
  To: Andrey Albershteyn
  Cc: Andrey Albershteyn, linux-xfs, fsverity, linux-fsdevel, hch,
	linux-ext4, linux-f2fs-devel, linux-btrfs, linux-unionfs, djwong,
	Amir Goldstein
In-Reply-To: <gpmgtg2wkoo4vozzaaouhdp2df6zlifwi6gy4jvq7xc22zo7om@t3f2bl374nlr>

On Wed, Apr 22, 2026 at 11:59:11AM +0200, Andrey Albershteyn wrote:
> > The 'if (!fsverity_active(inode) && IS_VERITY(inode)) {' condition
> > should stay
> 
> Why? With recent changes, the fsverity_active() now checks for
> IS_VERITY() instead of verity_descriptor.
> 

Okay, I forgot that that had changed.  Looks like the kerneldoc never
got updated, though.  It still says "This checks whether the inode's
verity info has been set."

- Eric

^ permalink raw reply

* [syzbot] [ext4?] WARNING in jbd2_journal_dirty_metadata (3)
From: syzbot @ 2026-04-22 22:44 UTC (permalink / raw)
  To: jack, linux-ext4, linux-kernel, syzkaller-bugs, tytso

Hello,

syzbot found the following issue on:

HEAD commit:    eb5249b12507 Merge tag 'parisc-for-7.1-rc1' of git://git.k..
git tree:       upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=16ed0836580000
kernel config:  https://syzkaller.appspot.com/x/.config?x=d120b114be21f79
dashboard link: https://syzkaller.appspot.com/bug?extid=a5f824f1c49dd97fcff0
compiler:       Debian clang version 21.1.8 (++20251221033036+2078da43e25a-1~exp1~20251221153213.50), Debian LLD 21.1.8

Unfortunately, I don't have any reproducer for this issue yet.

Downloadable assets:
disk image (non-bootable): https://storage.googleapis.com/syzbot-assets/d900f083ada3/non_bootable_disk-eb5249b1.raw.xz
vmlinux: https://storage.googleapis.com/syzbot-assets/cb4e868ca1f9/vmlinux-eb5249b1.xz
kernel image: https://storage.googleapis.com/syzbot-assets/fb064cd651ff/bzImage-eb5249b1.xz

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+a5f824f1c49dd97fcff0@syzkaller.appspotmail.com

loop0: detected capacity change from 0 to 32768
JBD2: Ignoring recovery information on journal
ocfs2: Mounting device (7,0) on (node local, slot 0) with ordered data mode.
------------[ cut here ]------------
jbd2_handle_buffer_credits(handle) <= 0
WARNING: fs/jbd2/transaction.c:1593 at jbd2_journal_dirty_metadata+0x9c8/0xd30 fs/jbd2/transaction.c:1593, CPU#0: syz.0.0/5322
Modules linked in:
CPU: 0 UID: 0 PID: 5322 Comm: syz.0.0 Not tainted syzkaller #0 PREEMPT(full) 
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
RIP: 0010:jbd2_journal_dirty_metadata+0x9c8/0xd30 fs/jbd2/transaction.c:1593
Code: 26 03 00 00 45 8b 06 48 c7 c7 40 07 e6 8b 89 de 89 ea 4c 89 f9 e8 18 25 8a fe b8 ea ff ff ff e9 11 fa ff ff e8 f9 42 28 ff 90 <0f> 0b 90 b8 e4 ff ff ff e9 fe f9 ff ff e8 e6 42 28 ff 90 0f 0b 90
RSP: 0018:ffffc9000f85e640 EFLAGS: 00010287
RAX: ffffffff829d9bb7 RBX: 0000000000000000 RCX: 0000000000100000
RDX: ffffc9000ef42000 RSI: 000000000009fb83 RDI: 000000000009fb84
RBP: 1ffff1100ac32d99 R08: 0000000000000003 R09: 0000000000000004
R10: dffffc0000000000 R11: fffff52001f0bcb8 R12: ffff8880483456f8
R13: ffff888048345690 R14: 1ffff11009068adc R15: 0000000000000000
FS:  00007f04675f56c0(0000) GS:ffff88808c81a000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 000055a18f887168 CR3: 000000001f4fd000 CR4: 0000000000352ef0
Call Trace:
 <TASK>
 ocfs2_journal_dirty+0x130/0x700 fs/ocfs2/journal.c:831
 ocfs2_alloc_dinode_update_counts+0x16e/0x2d0 fs/ocfs2/suballoc.c:1746
 ocfs2_search_chain+0xe39/0x1e10 fs/ocfs2/suballoc.c:2002
 ocfs2_claim_suballoc_bits+0x901/0x1f40 fs/ocfs2/suballoc.c:2113
 __ocfs2_claim_clusters+0x31d/0x970 fs/ocfs2/suballoc.c:2540
 ocfs2_make_clusters_writable fs/ocfs2/refcounttree.c:3243 [inline]
 ocfs2_replace_cow+0x984/0x1c90 fs/ocfs2/refcounttree.c:3346
 ocfs2_refcount_cow_hunk fs/ocfs2/refcounttree.c:3424 [inline]
 ocfs2_refcount_cow+0x790/0xd40 fs/ocfs2/refcounttree.c:3467
 ocfs2_prepare_inode_for_write fs/ocfs2/file.c:2347 [inline]
 ocfs2_file_write_iter+0xee2/0x1e70 fs/ocfs2/file.c:2458
 iter_file_splice_write+0x9a1/0x10f0 fs/splice.c:736
 do_splice_from fs/splice.c:936 [inline]
 direct_splice_actor+0x101/0x160 fs/splice.c:1159
 splice_direct_to_actor+0x53a/0xc70 fs/splice.c:1103
 do_splice_direct_actor fs/splice.c:1202 [inline]
 do_splice_direct+0x195/0x290 fs/splice.c:1228
 do_sendfile+0x535/0x7d0 fs/read_write.c:1372
 __do_sys_sendfile64 fs/read_write.c:1433 [inline]
 __se_sys_sendfile64+0x144/0x1a0 fs/read_write.c:1419
 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
 do_syscall_64+0x15f/0xf80 arch/x86/entry/syscall_64.c:94
 entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7f046b19c819
Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 e8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007f04675f4fe8 EFLAGS: 00000246 ORIG_RAX: 0000000000000028
RAX: ffffffffffffffda RBX: 00007f046b415fa0 RCX: 00007f046b19c819
RDX: 0000000000000000 RSI: 000000000000000b RDI: 000000000000000a
RBP: 00007f046b232c91 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000020fffe82 R11: 0000000000000246 R12: 0000000000000000
R13: 00007f046b416038 R14: 00007f046b415fa0 R15: 00007ffd208882e8
 </TASK>


---
This report is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this issue. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.

If the report is already addressed, let syzbot know by replying with:
#syz fix: exact-commit-title

If you want to overwrite report's subsystems, reply with:
#syz set subsystems: new-subsystem
(See the list of subsystem names on the web dashboard)

If the report is a duplicate of another one, reply with:
#syz dup: exact-subject-of-another-report

If you want to undo deduplication, reply with:
#syz undup

^ permalink raw reply

* Re: [PATCH] generic/790: test post-EOF gap zeroing persistence
From: Brian Foster @ 2026-04-22 13:22 UTC (permalink / raw)
  To: Zhang Yi
  Cc: fstests, zlang, linux-ext4, linux-fsdevel, jack, yi.zhang,
	yizhang089, yangerkun
In-Reply-To: <20260422015246.4132376-1-yi.zhang@huaweicloud.com>

On Wed, Apr 22, 2026 at 09:52:46AM +0800, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
> 
> Test that extending a file past a non-block-aligned EOF correctly
> zero-fills the gap [old_EOF, block_boundary), and that this zeroing
> persists through a filesystem shutdown+remount cycle.
> 
> Stale data beyond EOF can persist on disk when append write data blocks
> are flushed before the i_size metadata update, or when concurrent append
> writeback and mmap writes persist non-zero data past EOF. Subsequent
> post-EOF operations (append write, fallocate, truncate up) must
> zero-fill and persist the gap to prevent exposing stale data.
> 
> The test pollutes the file's last physical block (via FIEMAP + raw
> device write) with a sentinel pattern beyond i_size, then performs each
> extend operation and verifies the gap is zeroed both in memory and on
> disk.
> 
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> ---
> This is the case Jan Kara pointed out during my work on the ext4
> buffered I/O to iomap conversion. This case is similar to generic/363,
> but generic/363 doesn't provide persistent testing. For details:
> 
>  https://lore.kernel.org/linux-ext4/jgotl7vzzuzm6dvz5zfgk6haodxvunb4hq556pzh4hqqwvnhxq@lr3jiedhqh7c/
> 
>  tests/generic/790     | 155 ++++++++++++++++++++++++++++++++++++++++++
>  tests/generic/790.out |   4 ++
>  2 files changed, 159 insertions(+)
>  create mode 100755 tests/generic/790
>  create mode 100644 tests/generic/790.out
> 
> diff --git a/tests/generic/790 b/tests/generic/790
> new file mode 100755
> index 00000000..5d8f61f9
> --- /dev/null
> +++ b/tests/generic/790
> @@ -0,0 +1,155 @@
> +#! /bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +# Copyright (c) 2026 Huawei.  All Rights Reserved.
> +#
> +# FS QA Test No. 790
> +#
> +# Test that extending a file past a non-block-aligned EOF correctly zero-fills
> +# the gap [old_EOF, block_boundary), and that this zeroing persists through a
> +# filesystem shutdown+remount cycle.
> +#

Nice test! This is a great idea.

> +# Stale data beyond EOF can persist on disk when:
> +# 1) append write data blocks are flushed before the i_size metadata update,
> +#    and the system crashes in this window.

Maybe it's wording or I'm missing something, but how would "append write
data blocks" be flushed before i_size updates? Wouldn't writeback toss
them or zero the post-eof range of a folio? Do you mean to refer to
"on-disk size update" specifically (where I'm reading it as
inode->i_size)?

> +# 2) concurrent append writeback and mmap writes persist non-zero data past EOF.
> +#
> +# Subsequent post-EOF operations (append write, fallocate, truncate up) must
> +# zero-fill and persist the gap to prevent exposing stale data.
> +#
> +# The test pollutes the file's last physical block (via FIEMAP + raw device
> +# write) with a sentinel pattern beyond i_size, then performs each extend
> +# operation and verifies the gap is zeroed both in memory and on disk.
> +#
...
> +_test_eof_zeroing()
> +{
> +	local test_name="$1"
> +	local extend_cmd="$2"
> +	local file=$SCRATCH_MNT/testfile_${test_name}
> +
> +	echo "$test_name" | tee -a $seqres.full
> +
> +	# Compute non-block-aligned EOF offset
> +	local gap_bytes=16
> +	local eof_offset=$((blksz - gap_bytes))
> +
> +	# Step 1: Write one full block to ensure the filesystem allocates a
> +	#         physical block for the file instead of using inline data.
> +	$XFS_IO_PROG -f -c "pwrite -S 0x5a 0 $blksz" -c fsync \
> +		"$file" >> $seqres.full 2>&1
> +
> +	# Step 2: Get physical block offset on device via FIEMAP
> +	local phys_offset
> +	phys_offset=$(_get_phys_offset "$file")
> +	if [ -z "$phys_offset" ]; then
> +		_fail "$test_name: failed to get physical block offset via fiemap"
> +	fi
> +
> +	# Step 3: Truncate file to non-block-aligned size and fsync.
> +	#         The on-disk region [eof_offset, blksz) may or may not be
> +	#         zeroed by the filesystem at this point.
> +	$XFS_IO_PROG -c "truncate $eof_offset" -c fsync \
> +		"$file" >> $seqres.full 2>&1
> +
> +	# Step 4: Unmount and restore the physical block to all-0x5a on disk.
> +	#         This bypasses the kernel's pagecache EOF-zeroing to ensure
> +	#         the stale pattern is present on disk. Then remount.
> +	_scratch_unmount
> +	$XFS_IO_PROG -d -c "pwrite -S 0x5a $phys_offset $blksz" \
> +		$SCRATCH_DEV >> $seqres.full 2>&1
> +	_scratch_mount >> $seqres.full 2>&1
> +
> +	# Verify file size is still eof_offset after remount
> +	local sz
> +	sz=$(stat -c %s "$file")
> +	if [ "$sz" -ne "$eof_offset" ]; then
> +		_fail "$test_name: file size wrong after remount: $sz != $eof_offset"
> +	fi

I was initially curious why we'd want to do this, but after further
thought I wonder if it might make more sense to check file size against
the extended size after the shutdown/mount cycle below (but before
checking the gap range). That way we know the size update was
logged/recovered correctly and we're about to read from a file range
within eof. Hm?

Those couple of nits aside, this all looks pretty good to me.

Brian

> +
> +	# Step 5: Execute the extend operation.
> +	$XFS_IO_PROG -c "$extend_cmd" "$file" >> $seqres.full 2>&1
> +
> +	# Step 6: Verify gap [eof_offset, blksz) is zeroed BEFORE shutdown
> +	_check_gap_zero "$file" $eof_offset $gap_bytes "before shutdown" || return 1
> +
> +	# Step 7: Sync the extended range and shutdown the filesystem with
> +	#         journal flush. This persists the file size extending, and
> +	#         the filesystem should persist the zeroed data in the gap
> +	#         range as well.
> +	if [ "$extend_cmd" != "${extend_cmd#pwrite}" ]; then
> +		$XFS_IO_PROG -c "sync_range -w $blksz $blksz" \
> +			"$file" >> $seqres.full 2>&1
> +	fi
> +	_scratch_shutdown -f
> +
> +	# Step 8: Remount and verify gap is still zeroed
> +	_scratch_cycle_mount
> +	_check_gap_zero "$file" $eof_offset $gap_bytes "after shutdown+remount" || return 1
> +}
> +
> +_scratch_mkfs >> $seqres.full 2>&1
> +_scratch_mount
> +
> +blksz=$(_get_block_size $SCRATCH_MNT)
> +
> +# Test three variants of EOF-extending operations
> +_test_eof_zeroing "append_write" "pwrite -S 0x42 $blksz $blksz"
> +_test_eof_zeroing "truncate_up" "truncate $((blksz * 2))"
> +_test_eof_zeroing "fallocate" "falloc $blksz $blksz"
> +
> +# success, all done
> +status=0
> +exit
> diff --git a/tests/generic/790.out b/tests/generic/790.out
> new file mode 100644
> index 00000000..e5e2cc09
> --- /dev/null
> +++ b/tests/generic/790.out
> @@ -0,0 +1,4 @@
> +QA output created by 790
> +append_write
> +truncate_up
> +fallocate
> -- 
> 2.52.0
> 
> 


^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox