Linux EXT4 FS development
 help / color / mirror / Atom feed
* [PATCH 08/13] mount_service: enable unprivileged users in a similar manner as fusermount
From: Darrick J. Wong @ 2026-04-30 21:17 UTC (permalink / raw)
  To: bernd, djwong
  Cc: linux-fsdevel, fuse-devel, linux-ext4, miklos, neal, joannelkoong
In-Reply-To: <177758363484.1314717.11777978893472254088.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

Some Linux distributions allow unprivileged users to mount fuse
filesystems through the use of the setuid fusermount helper program.  It
would be useful to provide similar functionality when mounting a
filesystem that runs as a systemd service.

Therefore, read the fuse config file and implement the same checks as
fusermount.  The only new requirement is that the unprivileged user must
be able to open the mountpoint for write access if it's a regular file,
or must have write access to it if it's a directory.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 util/mount_service.c |  232 +++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 227 insertions(+), 5 deletions(-)


diff --git a/util/mount_service.c b/util/mount_service.c
index 915a0c4b610792..95de56f2b625fe 100644
--- a/util/mount_service.c
+++ b/util/mount_service.c
@@ -38,6 +38,7 @@
 #include "fuse_i.h"
 #include "fuse_service_priv.h"
 #include "mount_service.h"
+#include "fuser_conf.h"
 
 struct mount_service {
 	/* prefix for printing error messages */
@@ -313,8 +314,10 @@ static int mount_service_connect(struct mount_service *mo)
 	if (ret)
 		return ret;
 
+	drop_privs();
 	ret = connect(sockfd, (const struct sockaddr *)&name, sizeof(name));
 	if (ret && (errno == ENOENT || errno == ECONNREFUSED)) {
+		restore_privs();
 		fprintf(stderr, "%s: no safe filesystem driver for %s available.\n",
 			mo->msgtag, mo->subtype);
 		close(sockfd);
@@ -323,10 +326,12 @@ static int mount_service_connect(struct mount_service *mo)
 	if (ret) {
 		int error = errno;
 
+		restore_privs();
 		fprintf(stderr, "%s: %s: %s\n",
 			mo->msgtag, name.sun_path, strerror(error));
 		goto out;
 	}
+	restore_privs();
 
 	ret = try_drop_passrights(mo, sockfd);
 	if (ret)
@@ -349,7 +354,7 @@ static int mount_service_send_hello(struct mount_service *mo)
 	struct fuse_service_hello_reply reply = { };
 	ssize_t size;
 
-	if (getuid() == 0)
+	if (getuid() == 0 || user_allow_other)
 		hello.flags |= htonl(FUSE_SERVICE_FLAG_ALLOW_OTHER);
 
 	size = __send_packet(mo, &hello, sizeof(hello));
@@ -586,14 +591,17 @@ static int mount_service_send_required_files(struct mount_service *mo,
 {
 	int ret;
 
+	drop_privs();
 	mo->fusedevfd = open(fusedev, O_RDWR | O_CLOEXEC);
 	if (mo->fusedevfd < 0) {
 		int error = errno;
 
+		restore_privs();
 		fprintf(stderr, "%s: %s: %s\n",
 			mo->msgtag, fusedev, strerror(error));
 		return -1;
 	}
+	restore_privs();
 
 	ret = mount_service_send_file(mo, FUSE_SERVICE_ARGV, mo->argvfd);
 	if (ret)
@@ -710,14 +718,17 @@ static int prepare_bdev(struct mount_service *mo,
 	if (oc->block_size) {
 		int block_size = ntohl(oc->block_size);
 
+		drop_privs();
 		ret = ioctl(fd, BLKBSZSET, &block_size);
 		if (ret) {
 			int error = errno;
 
+			restore_privs();
 			fprintf(stderr, "%s: %s: %s\n",
 				mo->msgtag, oc->path, strerror(error));
 			return -error;
 		}
+		restore_privs();
 	}
 
 	return 0;
@@ -754,6 +765,7 @@ static int mount_service_open_path(struct mount_service *mo,
 	}
 
 	open_flags = ntohl(oc->open_flags) | O_CLOEXEC;
+	drop_privs();
 	fd = open(oc->path, open_flags, ntohl(oc->create_mode));
 	if (fd < 0) {
 		int error = errno;
@@ -762,11 +774,13 @@ static int mount_service_open_path(struct mount_service *mo,
 		 * Don't print a busy device error report because the
 		 * filesystem might decide to retry.
 		 */
+		restore_privs();
 		if (error != EBUSY && !(request_flags & FUSE_SERVICE_OPEN_QUIET))
 			fprintf(stderr, "%s: %s: %s\n",
 				mo->msgtag, oc->path, strerror(error));
 		return mount_service_send_file_error(mo, error, oc->path);
 	}
+	restore_privs();
 
 	if (S_ISBLK(expected_fmt)) {
 		ret = prepare_bdev(mo, oc, fd);
@@ -994,6 +1008,15 @@ static int mount_service_handle_mntopts_cmd(struct mount_service *mo,
 			*equals = 0;
 		}
 
+		if (getuid() != 0 && !user_allow_other &&
+		    (!strcmp(tok, "allow_other") ||
+		     !strcmp(tok, "allow_root"))) {
+			fprintf(stderr,
+"%s: option %s only allowed if 'user_allow_other' is set in %s\n",
+				mo->msgtag, tok, FUSE_CONF);
+			return mount_service_send_reply(mo, EPERM);
+		}
+
 #ifdef HAVE_NEW_MOUNT_API
 		if (mo->fsopenfd >= 0) {
 			int ret;
@@ -1077,19 +1100,64 @@ static int mount_service_handle_mtabopts_cmd(struct mount_service *mo,
 	return mount_service_send_reply(mo, 0);
 }
 
+static int open_mountpoint(const char *mntpt, bool *require_dir)
+{
+	int ret;
+
+	*require_dir = false;
+
+	if (getuid() == 0) {
+		/*
+		 * Open the alleged mountpoint.  We're root, so we only bother
+		 * checking for readability.
+		 */
+		return open(mntpt, O_RDONLY | O_CLOEXEC);
+	}
+
+	/*
+	 * Open the alleged mountpoint.  For unprivileged callers, we only
+	 * allow mounting on paths that the user can write to.
+	 */
+	ret = open(mntpt, O_WRONLY | O_CLOEXEC);
+	if (ret >= 0 || errno != EISDIR)
+		return ret;
+
+	/*
+	 * However, we can't open directories with write access.  Try again in
+	 * readonly mode, but require the caller to verify that we actually got
+	 * a directory.
+	 */
+	*require_dir = true;
+	ret = open(mntpt, O_RDONLY | O_CLOEXEC);
+	if (ret >= 0 || (errno != EACCES && errno != EPERM))
+		return ret;
+
+#ifdef O_PATH
+	/*
+	 * If we can't open at all, let's try opening this directory with
+	 * O_PATH.
+	 */
+	return open(mntpt, O_PATH | O_CLOEXEC);
+#else
+	/* No idea what to do now */
+	errno = EACCES;
+	return -1;
+#endif
+}
+
 static int attach_to_mountpoint(struct mount_service *mo, mode_t expected_fmt,
 				char *mntpt)
 {
 	struct stat stbuf;
 	char *res_mntpt;
+	bool require_dir;
 	int mountfd = -1;
 	int error;
 	int ret;
 
-	/*
-	 * Open the alleged mountpoint, make sure it's a dir or a file.
-	 */
-	mountfd = open(mntpt, O_RDONLY | O_CLOEXEC);
+	drop_privs();
+
+	mountfd = open_mountpoint(mntpt, &require_dir);
 	if (mountfd < 0) {
 		error = errno;
 		fprintf(stderr, "%s: %s: %s\n", mo->msgtag, mntpt,
@@ -1117,6 +1185,13 @@ static int attach_to_mountpoint(struct mount_service *mo, mode_t expected_fmt,
 		goto out_mountfd;
 	}
 
+	if (require_dir && !S_ISDIR(stbuf.st_mode)) {
+		error = EACCES;
+		fprintf(stderr, "%s: %s: Mount point must be directory.\n",
+			mo->msgtag, mntpt);
+		goto out_mountfd;
+	}
+
 	/*
 	 * Resolve the (possibly relative) mountpoint path before chdir'ing
 	 * onto it.
@@ -1193,6 +1268,7 @@ static int attach_to_mountpoint(struct mount_service *mo, mode_t expected_fmt,
 	mo->mountfd = mountfd;
 	mo->resv_mountpoint = res_mntpt;
 
+	restore_privs();
 	return mount_service_send_reply(mo, 0);
 
 out_res_mntpt:
@@ -1201,6 +1277,7 @@ static int attach_to_mountpoint(struct mount_service *mo, mode_t expected_fmt,
 	close(mountfd);
 out_error:
 	free(mntpt);
+	restore_privs();
 	return mount_service_send_reply(mo, error);
 }
 
@@ -1580,6 +1657,141 @@ static int mount_service_fsopen_mount(struct mount_service *mo,
 # define mount_service_fsopen_mount(...)	(FUSE_MOUNT_FALLBACK_NEEDED)
 #endif
 
+static int check_nonroot_file_access(struct mount_service *mo)
+{
+	struct stat sb1, sb2;
+	int fd;
+	int ret;
+
+	/*
+	 * If we already succeeded in opening the file with write access, then
+	 * we're good.
+	 */
+	ret = fcntl(mo->mountfd, F_GETFL);
+	if (ret < 0) {
+		int error = errno;
+
+		fprintf(stderr, "%s: %s: %s\n", mo->msgtag, mo->mountpoint,
+			strerror(error));
+		return -1;
+	}
+
+	if ((ret & O_ACCMODE) != O_RDONLY)
+		return 0;
+
+	ret = fstat(mo->mountfd, &sb1);
+	if (ret) {
+		int error = errno;
+
+		fprintf(stderr, "%s: %s: %s\n",
+			mo->msgtag, mo->mountpoint, strerror(error));
+		return -1;
+	}
+
+	/* Try to reopen the file with write access this time. */
+	fd = open(mo->real_mountpoint, O_WRONLY | O_CLOEXEC);
+	if (fd < 0) {
+		int error = errno;
+
+		fprintf(stderr, "%s: %s: %s\n",
+			mo->msgtag, mo->mountpoint, strerror(error));
+		return -1;
+	}
+
+	/* Is this the same file? */
+	ret = fstat(fd, &sb2);
+	if (ret) {
+		int error = errno;
+
+		fprintf(stderr, "%s: %s: %s\n",
+			mo->msgtag, mo->mountpoint, strerror(error));
+		goto out_fd;
+	}
+
+	if (sb1.st_dev != sb2.st_dev || sb1.st_ino != sb2.st_ino) {
+		fprintf(stderr, "%s: %s: Mount point moved during fuse startup.\n",
+			mo->msgtag, mo->mountpoint);
+		ret = -1;
+		goto out_fd;
+	}
+
+	/*
+	 * We reopened the same file with write access, everything is ok.  Swap
+	 * the two file descriptors so that we retain our write access.
+	 */
+	ret = mo->mountfd;
+	mo->mountfd = fd;
+	fd = ret;
+	ret = 0;
+out_fd:
+	close(fd);
+	return ret;
+}
+
+static void adjust_nonroot_mount_flags(struct mount_service *mo,
+				       struct fuse_service_mount_command *oc)
+{
+	const struct mount_flags *mf;
+	uint32_t ms_flags = ntohl(oc->ms_flags);
+
+	/* only care that the unsafe flags are set to the value of @on */
+	for (mf = mount_flags; mf->opt != NULL; mf++) {
+		if (mf->safe)
+			continue;
+		if (!!(ms_flags & mf->flag) == !!mf->on) {
+			ms_flags = (ms_flags & ~mf->flag) |
+				   (mf->on ? 0 : mf->flag);
+
+			fprintf(stderr, "%s: unsafe option %s ignored\n",
+				mo->msgtag, mf->opt);
+		}
+	}
+
+	oc->ms_flags = htonl(ms_flags);
+}
+
+/*
+ * fuse.conf can limit the number of unprivileged fuse mounts.  For
+ * unprivileged mounts (via setuid) we also require write access to the
+ * mountpoint, and we'll only accept certain underlying filesystems.
+ */
+static int check_nonroot_access(struct mount_service *mo,
+				struct fuse_service_mount_command *oc,
+				const struct stat *stbuf)
+{
+	struct statfs fs_buf;
+	int ret;
+
+	ret = check_nonroot_mount_count(mo->msgtag);
+	if (ret)
+		return -EUSERS;
+
+	ret = fstatfs(mo->mountfd, &fs_buf);
+	if (ret) {
+		int error = errno;
+
+		fprintf(stderr, "%s: %s: %s\n",
+			mo->msgtag, mo->mountpoint, strerror(error));
+		return -error;
+	}
+
+	adjust_nonroot_mount_flags(mo, oc);
+
+	drop_privs();
+	if (S_ISDIR(stbuf->st_mode))
+		ret = check_nonroot_dir_access(mo->msgtag,
+					       mo->mountpoint,
+					       mo->real_mountpoint,
+					       stbuf);
+	else
+		ret = check_nonroot_file_access(mo);
+	if (!ret)
+		ret = check_nonroot_fstype(mo->msgtag, &fs_buf);
+	restore_privs();
+
+	return ret ? -EPERM : 0;
+}
+
 static int mount_service_handle_mount_cmd(struct mount_service *mo,
 					  struct fuse_service_packet *p,
 					  size_t psz)
@@ -1621,6 +1833,12 @@ static int mount_service_handle_mount_cmd(struct mount_service *mo,
 		return mount_service_send_reply(mo, error);
 	}
 
+	if (getuid() != 0) {
+		ret = check_nonroot_access(mo, oc, &stbuf);
+		if (ret)
+			return mount_service_send_reply(mo, -ret);
+	}
+
 	if (mo->fsopenfd >= 0) {
 		ret = mount_service_fsopen_mount(mo, oc, &stbuf);
 		if (ret != FUSE_MOUNT_FALLBACK_NEEDED)
@@ -1752,6 +1970,10 @@ int mount_service_main(int argc, char *argv[])
 	else
 		mo.msgtag = "mount.service";
 
+	drop_privs();
+	read_conf(mo.msgtag);
+	restore_privs();
+
 	ret = mount_service_init(&mo, argc, argv);
 	if (ret)
 		return EXIT_FAILURE;


^ permalink raw reply related

* [PATCH 07/13] util: fix checkpatch complaints in fuser_conf.[ch]
From: Darrick J. Wong @ 2026-04-30 21:16 UTC (permalink / raw)
  To: bernd, djwong
  Cc: linux-fsdevel, fuse-devel, linux-ext4, miklos, neal, joannelkoong
In-Reply-To: <177758363484.1314717.11777978893472254088.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

Fix the checkpatch complaints in fuser_conf.[ch], since the previous
patch moved this code.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 util/fuser_conf.h |    3 ++-
 util/fuser_conf.c |   51 +++++++++++++++++++++++++++++++++------------------
 2 files changed, 35 insertions(+), 19 deletions(-)


diff --git a/util/fuser_conf.h b/util/fuser_conf.h
index 5afe70709c5152..55ed745c964ed8 100644
--- a/util/fuser_conf.h
+++ b/util/fuser_conf.h
@@ -23,7 +23,8 @@ void unescape(char *buf);
 static inline struct mntent *GETMNTENT(FILE *stream)
 {
 	struct mntent *entp = getmntent(stream);
-	if(entp != NULL) {
+
+	if (entp != NULL) {
 		unescape(entp->mnt_fsname);
 		unescape(entp->mnt_dir);
 		unescape(entp->mnt_type);
diff --git a/util/fuser_conf.c b/util/fuser_conf.c
index ad933de9cf7cbf..6553c7f6f929a5 100644
--- a/util/fuser_conf.c
+++ b/util/fuser_conf.c
@@ -30,7 +30,7 @@
 #include <stdint.h>
 #endif
 
-int user_allow_other = 0;
+int user_allow_other;
 int mount_max = 1000;
 static uid_t oldfsuid;
 static gid_t oldfsgid;
@@ -43,25 +43,27 @@ void unescape(char *buf)
 {
 	char *src = buf;
 	char *dest = buf;
+
 	while (1) {
 		char *next_src = strchrnul(src, '\\');
 		int offset = next_src - src;
+
 		memmove(dest, src, offset);
 		src = next_src;
 		dest += offset;
 
-		if(*src == '\0') {
+		if (*src == '\0') {
 			*dest = *src;
 			return;
 		}
 		src++;
 
-		if('0' <= src[0] && src[0] < '2' &&
-		   '0' <= src[1] && src[1] < '8' &&
-		   '0' <= src[2] && src[2] < '8') {
+		if ('0' <= src[0] && src[0] < '2' &&
+		    '0' <= src[1] && src[1] < '8' &&
+		    '0' <= src[2] && src[2] < '8') {
 			*dest++ = (src[0] - '0') << 6
-			        | (src[1] - '0') << 3
-			        | (src[2] - '0') << 0;
+				| (src[1] - '0') << 3
+				| (src[2] - '0') << 0;
 			src += 3;
 		} else if (src[0] == '\\') {
 			*dest++ = '\\';
@@ -79,6 +81,7 @@ static int count_fuse_fs_mtab(const char *progname)
 	int count = 0;
 	const char *mtab = _PATH_MOUNTED;
 	FILE *fp = setmntent(mtab, "r");
+
 	if (fp == NULL) {
 		fprintf(stderr, "%s: failed to open %s: %s\n", progname, mtab,
 			strerror(errno));
@@ -87,7 +90,7 @@ static int count_fuse_fs_mtab(const char *progname)
 	while ((entp = GETMNTENT(fp)) != NULL) {
 		if (strcmp(entp->mnt_type, "fuse") == 0 ||
 		    strncmp(entp->mnt_type, "fuse.", 5) == 0)
-			count ++;
+			count++;
 	}
 	endmntent(fp);
 	return count;
@@ -169,12 +172,15 @@ int count_fuse_fs(const char *progname)
 static void strip_line(char *line)
 {
 	char *s = strchr(line, '#');
+
 	if (s != NULL)
 		s[0] = '\0';
 	for (s = line + strlen(line) - 1;
-	     s >= line && isspace((unsigned char) *s); s--);
+	     s >= line && isspace((unsigned char) *s); s--) {
+	}
 	s[1] = '\0';
-	for (s = line; isspace((unsigned char) *s); s++);
+	for (s = line; isspace((unsigned char) *s); s++)
+		; /* empty */
 	if (s != line)
 		memmove(line, s, strlen(s)+1);
 }
@@ -182,11 +188,12 @@ static void strip_line(char *line)
 static void parse_line(const char *line, int linenum, const char *progname)
 {
 	int tmp;
+
 	if (strcmp(line, "user_allow_other") == 0)
 		user_allow_other = 1;
 	else if (sscanf(line, "mount_max = %i", &tmp) == 1)
 		mount_max = tmp;
-	else if(line[0])
+	else if (line[0])
 		fprintf(stderr,
 			"%s: unknown parameter in %s at line %i: '%s'\n",
 			progname, FUSE_CONF, linenum, line);
@@ -195,10 +202,12 @@ static void parse_line(const char *line, int linenum, const char *progname)
 void read_conf(const char *progname)
 {
 	FILE *fp = fopen(FUSE_CONF, "r");
+
 	if (fp != NULL) {
 		int linenum = 1;
 		char line[256];
 		int isnewline = 1;
+
 		while (fgets(line, sizeof(line), fp) != NULL) {
 			if (isnewline) {
 				if (line[strlen(line)-1] == '\n') {
@@ -207,16 +216,18 @@ void read_conf(const char *progname)
 				} else {
 					isnewline = 0;
 				}
-			} else if(line[strlen(line)-1] == '\n') {
-				fprintf(stderr, "%s: reading %s: line %i too long\n", progname, FUSE_CONF, linenum);
+			} else if (line[strlen(line)-1] == '\n') {
+				fprintf(stderr, "%s: reading %s: line %i too long\n",
+					progname, FUSE_CONF, linenum);
 
 				isnewline = 1;
 			}
 			if (isnewline)
-				linenum ++;
+				linenum++;
 		}
 		if (!isnewline) {
-			fprintf(stderr, "%s: reading %s: missing newline at end of file\n", progname, FUSE_CONF);
+			fprintf(stderr, "%s: reading %s: missing newline at end of file\n",
+				progname, FUSE_CONF);
 
 		}
 		if (ferror(fp)) {
@@ -289,6 +300,8 @@ int check_nonroot_dir_access(const char *progname, const char *origmnt,
 	return 0;
 }
 
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+
 int check_nonroot_fstype(const char *progname, const struct statfs *fs_buf)
 {
 	size_t i;
@@ -298,13 +311,15 @@ int check_nonroot_fstype(const char *progname, const struct statfs *fs_buf)
 	 * able to just put anything we want there.
 	 * Luckily, without allow_other, we can't get other users to actually
 	 * use any fake information we try to put there anyway.
-	 * Use a whitelist to be safe. */
+	 * Use a whitelist to be safe.
+	 */
 
 	/* Define permitted filesystems for the mount target. This was
 	 * originally the same list as used by the ecryptfs mount helper
 	 * (https://bazaar.launchpad.net/~ecryptfs/ecryptfs/trunk/view/head:/src/utils/mount.ecryptfs_private.c#L225)
 	 * but got expanded as we found more filesystems that needed to be
-	 * overlaid. */
+	 * overlaid.
+	 */
 	typeof(fs_buf->f_type) f_type_whitelist[] = {
 		0x61756673 /* AUFS_SUPER_MAGIC */,
 		0x00000187 /* AUTOFS_SUPER_MAGIC */,
@@ -345,7 +360,7 @@ int check_nonroot_fstype(const char *progname, const struct statfs *fs_buf)
 		0x2FC12FC1 /* ZFS_SUPER_MAGIC */,
 		0x858458f6 /* RAMFS_MAGIC */,
 	};
-	for (i = 0; i < sizeof(f_type_whitelist)/sizeof(f_type_whitelist[0]); i++) {
+	for (i = 0; i < ARRAY_SIZE(f_type_whitelist); i++) {
 		if (f_type_whitelist[i] == fs_buf->f_type)
 			return 0;
 	}


^ permalink raw reply related

* [PATCH 06/13] util: hoist the fuse.conf parsing and setuid mode enforcement code
From: Darrick J. Wong @ 2026-04-30 21:16 UTC (permalink / raw)
  To: bernd, djwong
  Cc: linux-fsdevel, fuse-devel, linux-ext4, miklos, neal, joannelkoong
In-Reply-To: <177758363484.1314717.11777978893472254088.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

Move all the code that parses fuse.conf into a separate file in util/ so
that fuservicemount can read the same file, then hoist the security checks
that fusermount performs when it is trying to start up a filesystem but is
not running as root.  We'll want those checks for fuservicemount in a
moment.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 util/fuser_conf.h |   61 ++++++++
 util/fuser_conf.c |  383 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 util/fusermount.c |  358 +-------------------------------------------------
 util/meson.build  |    6 -
 4 files changed, 455 insertions(+), 353 deletions(-)
 create mode 100644 util/fuser_conf.h
 create mode 100644 util/fuser_conf.c


diff --git a/util/fuser_conf.h b/util/fuser_conf.h
new file mode 100644
index 00000000000000..5afe70709c5152
--- /dev/null
+++ b/util/fuser_conf.h
@@ -0,0 +1,61 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2001-2007  Miklos Szeredi <miklos@szeredi.hu>
+ *
+ * This program can be distributed under the terms of the GNU LGPLv2.
+ * See the file LGPL2.txt.
+ */
+#ifndef FUSER_CONF_H_
+#define FUSER_CONF_H_
+
+#include <sys/vfs.h>
+#include <sys/stat.h>
+
+extern int user_allow_other;
+extern int mount_max;
+
+void unescape(char *buf);
+
+#ifdef GETMNTENT_NEEDS_UNESCAPING
+#include <stdio.h>
+#include <mntent.h>
+
+static inline struct mntent *GETMNTENT(FILE *stream)
+{
+	struct mntent *entp = getmntent(stream);
+	if(entp != NULL) {
+		unescape(entp->mnt_fsname);
+		unescape(entp->mnt_dir);
+		unescape(entp->mnt_type);
+		unescape(entp->mnt_opts);
+	}
+	return entp;
+}
+#else
+#define GETMNTENT getmntent
+#endif // GETMNTENT_NEEDS_UNESCAPING
+
+int count_fuse_fs(const char *progname);
+
+void read_conf(const char *progname);
+
+void drop_privs(void);
+void restore_privs(void);
+
+int check_nonroot_mount_count(const char *progname);
+
+int check_nonroot_dir_access(const char *progname, const char *origmnt,
+			     const char *mnt, const struct stat *stbuf);
+
+int check_nonroot_fstype(const char *progname, const struct statfs *fs_buf);
+
+struct mount_flags {
+	const char *opt;
+	unsigned long flag;
+	int on;
+	int safe;
+};
+
+extern const struct mount_flags mount_flags[];
+
+#endif /* FUSER_CONF_H_ */
diff --git a/util/fuser_conf.c b/util/fuser_conf.c
new file mode 100644
index 00000000000000..ad933de9cf7cbf
--- /dev/null
+++ b/util/fuser_conf.c
@@ -0,0 +1,383 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2001-2007  Miklos Szeredi <miklos@szeredi.hu>
+ *
+ * This program can be distributed under the terms of the GNU GPLv2.
+ * See the file GPL2.txt.
+ */
+/* This program parses fuse.conf */
+#define _GNU_SOURCE
+#include "fuse_config.h"
+#include "mount_util.h"
+#include "util.h"
+#include "fuser_conf.h"
+
+#include <string.h>
+#include <stddef.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <mntent.h>
+#include <unistd.h>
+#include <sys/fsuid.h>
+
+#include "fuse_mount_compat.h"
+
+#if defined HAVE_LISTMOUNT
+#include <linux/mount.h>
+#include <syscall.h>
+#include <stdint.h>
+#endif
+
+int user_allow_other = 0;
+int mount_max = 1000;
+static uid_t oldfsuid;
+static gid_t oldfsgid;
+
+// Older versions of musl libc don't unescape entries in /etc/mtab
+
+// unescapes octal sequences like \040 in-place
+// That's ok, because unescaping can not extend the length of the string.
+void unescape(char *buf)
+{
+	char *src = buf;
+	char *dest = buf;
+	while (1) {
+		char *next_src = strchrnul(src, '\\');
+		int offset = next_src - src;
+		memmove(dest, src, offset);
+		src = next_src;
+		dest += offset;
+
+		if(*src == '\0') {
+			*dest = *src;
+			return;
+		}
+		src++;
+
+		if('0' <= src[0] && src[0] < '2' &&
+		   '0' <= src[1] && src[1] < '8' &&
+		   '0' <= src[2] && src[2] < '8') {
+			*dest++ = (src[0] - '0') << 6
+			        | (src[1] - '0') << 3
+			        | (src[2] - '0') << 0;
+			src += 3;
+		} else if (src[0] == '\\') {
+			*dest++ = '\\';
+			src += 1;
+		} else {
+			*dest++ = '\\';
+		}
+	}
+}
+
+#ifndef IGNORE_MTAB
+static int count_fuse_fs_mtab(const char *progname)
+{
+	const struct mntent *entp;
+	int count = 0;
+	const char *mtab = _PATH_MOUNTED;
+	FILE *fp = setmntent(mtab, "r");
+	if (fp == NULL) {
+		fprintf(stderr, "%s: failed to open %s: %s\n", progname, mtab,
+			strerror(errno));
+		return -1;
+	}
+	while ((entp = GETMNTENT(fp)) != NULL) {
+		if (strcmp(entp->mnt_type, "fuse") == 0 ||
+		    strncmp(entp->mnt_type, "fuse.", 5) == 0)
+			count ++;
+	}
+	endmntent(fp);
+	return count;
+}
+
+#ifdef HAVE_LISTMOUNT
+static int count_fuse_fs_ls_mnt(const char *progname)
+{
+	#define SMBUF_SIZE 1024
+	#define MNT_ID_LEN 128
+
+	int fuse_count = 0;
+	int n_mounts = 0;
+	int ret = 0;
+	uint64_t mnt_ids[MNT_ID_LEN];
+	unsigned char smbuf[SMBUF_SIZE];
+	struct mnt_id_req req = {
+		.size = sizeof(struct mnt_id_req),
+	};
+	struct statmount *sm;
+
+	for (;;) {
+		req.mnt_id = LSMT_ROOT;
+
+		n_mounts = syscall(SYS_listmount, &req, &mnt_ids, MNT_ID_LEN, 0);
+		if (n_mounts == -1) {
+			if (errno != ENOSYS) {
+				fprintf(stderr, "%s: failed to list mounts: %s\n", progname,
+					strerror(errno));
+			}
+			return -1;
+		}
+
+		for (int i = 0; i < n_mounts; i++) {
+			req.mnt_id = mnt_ids[i];
+			req.param = STATMOUNT_FS_TYPE;
+			ret = syscall(SYS_statmount, &req, &smbuf, SMBUF_SIZE, 0);
+			if (ret) {
+				if (errno == ENOENT)
+					continue;
+
+				fprintf(stderr, "%s: failed to stat mount %lld: %s\n", progname,
+					req.mnt_id, strerror(errno));
+				return -1;
+			}
+
+			sm = (struct statmount *)smbuf;
+			if (sm->mask & STATMOUNT_FS_TYPE &&
+			    strcmp(&sm->str[sm->fs_type], "fuse") == 0)
+				fuse_count++;
+		}
+
+		if (n_mounts < MNT_ID_LEN)
+			break;
+		req.param = mnt_ids[MNT_ID_LEN - 1];
+	}
+	return fuse_count;
+}
+
+int count_fuse_fs(const char *progname)
+{
+	int count = count_fuse_fs_ls_mnt(progname);
+
+	return count >= 0 ? count : count_fuse_fs_mtab(progname);
+}
+#else
+int count_fuse_fs(const char *progname)
+{
+	return count_fuse_fs_mtab(progname);
+}
+#endif /* HAVE_LISTMOUNT */
+#else
+int count_fuse_fs(const char *progname)
+{
+	return 0;
+}
+#endif /* !IGNORE_MTAB */
+
+static void strip_line(char *line)
+{
+	char *s = strchr(line, '#');
+	if (s != NULL)
+		s[0] = '\0';
+	for (s = line + strlen(line) - 1;
+	     s >= line && isspace((unsigned char) *s); s--);
+	s[1] = '\0';
+	for (s = line; isspace((unsigned char) *s); s++);
+	if (s != line)
+		memmove(line, s, strlen(s)+1);
+}
+
+static void parse_line(const char *line, int linenum, const char *progname)
+{
+	int tmp;
+	if (strcmp(line, "user_allow_other") == 0)
+		user_allow_other = 1;
+	else if (sscanf(line, "mount_max = %i", &tmp) == 1)
+		mount_max = tmp;
+	else if(line[0])
+		fprintf(stderr,
+			"%s: unknown parameter in %s at line %i: '%s'\n",
+			progname, FUSE_CONF, linenum, line);
+}
+
+void read_conf(const char *progname)
+{
+	FILE *fp = fopen(FUSE_CONF, "r");
+	if (fp != NULL) {
+		int linenum = 1;
+		char line[256];
+		int isnewline = 1;
+		while (fgets(line, sizeof(line), fp) != NULL) {
+			if (isnewline) {
+				if (line[strlen(line)-1] == '\n') {
+					strip_line(line);
+					parse_line(line, linenum, progname);
+				} else {
+					isnewline = 0;
+				}
+			} else if(line[strlen(line)-1] == '\n') {
+				fprintf(stderr, "%s: reading %s: line %i too long\n", progname, FUSE_CONF, linenum);
+
+				isnewline = 1;
+			}
+			if (isnewline)
+				linenum ++;
+		}
+		if (!isnewline) {
+			fprintf(stderr, "%s: reading %s: missing newline at end of file\n", progname, FUSE_CONF);
+
+		}
+		if (ferror(fp)) {
+			fprintf(stderr, "%s: reading %s: read failed\n", progname, FUSE_CONF);
+			exit(1);
+		}
+		fclose(fp);
+	} else if (errno != ENOENT) {
+		bool fatal = (errno != EACCES && errno != ELOOP &&
+			      errno != ENAMETOOLONG && errno != ENOTDIR &&
+			      errno != EOVERFLOW);
+		fprintf(stderr, "%s: failed to open %s: %s\n",
+			progname, FUSE_CONF, strerror(errno));
+		if (fatal)
+			exit(1);
+	}
+}
+
+void drop_privs(void)
+{
+	if (getuid() != 0) {
+		oldfsuid = setfsuid(getuid());
+		oldfsgid = setfsgid(getgid());
+	}
+}
+
+void restore_privs(void)
+{
+	if (getuid() != 0) {
+		setfsuid(oldfsuid);
+		setfsgid(oldfsgid);
+	}
+}
+
+int check_nonroot_mount_count(const char *progname)
+{
+	if (mount_max == -1)
+		return 0;
+
+	int mount_count = count_fuse_fs(progname);
+
+	if (mount_count >= mount_max) {
+		fprintf(stderr,
+"%s: too many FUSE filesystems mounted; mount_max=N can be set in %s\n",
+			progname, FUSE_CONF);
+		return -1;
+	}
+
+	return 0;
+}
+
+int check_nonroot_dir_access(const char *progname, const char *origmnt,
+			     const char *mnt, const struct stat *stbuf)
+{
+	int res;
+
+	if ((stbuf->st_mode & S_ISVTX) && stbuf->st_uid != getuid()) {
+		fprintf(stderr, "%s: mountpoint %s not owned by user\n",
+			progname, origmnt);
+		return -1;
+	}
+
+	res = access(mnt, W_OK);
+	if (res == -1) {
+		fprintf(stderr, "%s: user has no write access to mountpoint %s\n",
+			progname, origmnt);
+		return -1;
+	}
+
+	return 0;
+}
+
+int check_nonroot_fstype(const char *progname, const struct statfs *fs_buf)
+{
+	size_t i;
+
+	/* Do not permit mounting over anything in procfs - it has a couple
+	 * places to which we have "write access" without being supposed to be
+	 * able to just put anything we want there.
+	 * Luckily, without allow_other, we can't get other users to actually
+	 * use any fake information we try to put there anyway.
+	 * Use a whitelist to be safe. */
+
+	/* Define permitted filesystems for the mount target. This was
+	 * originally the same list as used by the ecryptfs mount helper
+	 * (https://bazaar.launchpad.net/~ecryptfs/ecryptfs/trunk/view/head:/src/utils/mount.ecryptfs_private.c#L225)
+	 * but got expanded as we found more filesystems that needed to be
+	 * overlaid. */
+	typeof(fs_buf->f_type) f_type_whitelist[] = {
+		0x61756673 /* AUFS_SUPER_MAGIC */,
+		0x00000187 /* AUTOFS_SUPER_MAGIC */,
+		0xCA451A4E /* BCACHEFS_STATFS_MAGIC */,
+		0x9123683E /* BTRFS_SUPER_MAGIC */,
+		0x00C36400 /* CEPH_SUPER_MAGIC */,
+		0xFF534D42 /* CIFS_MAGIC_NUMBER */,
+		0x0000F15F /* ECRYPTFS_SUPER_MAGIC */,
+		0X2011BAB0 /* EXFAT_SUPER_MAGIC */,
+		0x0000EF53 /* EXT[234]_SUPER_MAGIC */,
+		0xF2F52010 /* F2FS_SUPER_MAGIC */,
+		0x65735546 /* FUSE_SUPER_MAGIC */,
+		0x01161970 /* GFS2_MAGIC */,
+		0x47504653 /* GPFS_SUPER_MAGIC */,
+		0x0000482b /* HFSPLUS_SUPER_MAGIC */,
+		0x000072B6 /* JFFS2_SUPER_MAGIC */,
+		0x3153464A /* JFS_SUPER_MAGIC */,
+		0x0BD00BD0 /* LL_SUPER_MAGIC */,
+		0X00004D44 /* MSDOS_SUPER_MAGIC */,
+		0x0000564C /* NCP_SUPER_MAGIC */,
+		0x00006969 /* NFS_SUPER_MAGIC */,
+		0x00003434 /* NILFS_SUPER_MAGIC */,
+		0x5346544E /* NTFS_SB_MAGIC */,
+		0x7366746E /* NTFS3_SUPER_MAGIC */,
+		0x5346414f /* OPENAFS_SUPER_MAGIC */,
+		0x794C7630 /* OVERLAYFS_SUPER_MAGIC */,
+		0xAAD7AAEA /* PANFS_SUPER_MAGIC */,
+		0x52654973 /* REISERFS_SUPER_MAGIC */,
+		0xFE534D42 /* SMB2_SUPER_MAGIC */,
+		0x73717368 /* SQUASHFS_MAGIC */,
+		0x01021994 /* TMPFS_MAGIC */,
+		0x24051905 /* UBIFS_SUPER_MAGIC */,
+		0x18031977 /* WEKAFS_SUPER_MAGIC */,
+#if __SIZEOF_LONG__ > 4
+		0x736675005346544e /* UFSD */,
+#endif
+		0x58465342 /* XFS_SB_MAGIC */,
+		0x2FC12FC1 /* ZFS_SUPER_MAGIC */,
+		0x858458f6 /* RAMFS_MAGIC */,
+	};
+	for (i = 0; i < sizeof(f_type_whitelist)/sizeof(f_type_whitelist[0]); i++) {
+		if (f_type_whitelist[i] == fs_buf->f_type)
+			return 0;
+	}
+
+	fprintf(stderr, "%s: mounting over filesystem type %#010lx is forbidden\n",
+		progname, (unsigned long)fs_buf->f_type);
+	return -1;
+}
+
+const struct mount_flags mount_flags[] = {
+	{"rw",	    MS_RDONLY,	    0, 1},
+	{"ro",	    MS_RDONLY,	    1, 1},
+	{"suid",    MS_NOSUID,	    0, 0},
+	{"nosuid",  MS_NOSUID,	    1, 1},
+	{"dev",	    MS_NODEV,	    0, 0},
+	{"nodev",   MS_NODEV,	    1, 1},
+	{"exec",    MS_NOEXEC,	    0, 1},
+	{"noexec",  MS_NOEXEC,	    1, 1},
+	{"async",   MS_SYNCHRONOUS, 0, 1},
+	{"sync",    MS_SYNCHRONOUS, 1, 1},
+	{"atime",   MS_NOATIME,	    0, 1},
+	{"noatime", MS_NOATIME,	    1, 1},
+	{"diratime",        MS_NODIRATIME,  0, 1},
+	{"nodiratime",      MS_NODIRATIME,  1, 1},
+	{"lazytime",        MS_LAZYTIME,    1, 1},
+	{"nolazytime",      MS_LAZYTIME,    0, 1},
+	{"relatime",        MS_RELATIME,    1, 1},
+	{"norelatime",      MS_RELATIME,    0, 1},
+	{"strictatime",     MS_STRICTATIME, 1, 1},
+	{"nostrictatime",   MS_STRICTATIME, 0, 1},
+	{"dirsync", MS_DIRSYNC,	    1, 1},
+	{"symfollow",       MS_NOSYMFOLLOW, 0, 1},
+	{"nosymfollow",     MS_NOSYMFOLLOW, 1, 1},
+	{NULL,	    0,		    0, 0}
+};
diff --git a/util/fusermount.c b/util/fusermount.c
index 68370468140a59..c7905d58a85e32 100644
--- a/util/fusermount.c
+++ b/util/fusermount.c
@@ -11,6 +11,7 @@
 #include "fuse_config.h"
 #include "mount_util.h"
 #include "util.h"
+#include "fuser_conf.h"
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -50,63 +51,8 @@
 
 static const char *progname;
 
-static int user_allow_other = 0;
-static int mount_max = 1000;
-
 static int auto_unmount = 0;
 
-#ifdef GETMNTENT_NEEDS_UNESCAPING
-// Older versions of musl libc don't unescape entries in /etc/mtab
-
-// unescapes octal sequences like \040 in-place
-// That's ok, because unescaping can not extend the length of the string.
-static void unescape(char *buf) {
-	char *src = buf;
-	char *dest = buf;
-	while (1) {
-		char *next_src = strchrnul(src, '\\');
-		int offset = next_src - src;
-		memmove(dest, src, offset);
-		src = next_src;
-		dest += offset;
-
-		if(*src == '\0') {
-			*dest = *src;
-			return;
-		}
-		src++;
-
-		if('0' <= src[0] && src[0] < '2' &&
-		   '0' <= src[1] && src[1] < '8' &&
-		   '0' <= src[2] && src[2] < '8') {
-			*dest++ = (src[0] - '0') << 6
-			        | (src[1] - '0') << 3
-			        | (src[2] - '0') << 0;
-			src += 3;
-		} else if (src[0] == '\\') {
-			*dest++ = '\\';
-			src += 1;
-		} else {
-			*dest++ = '\\';
-		}
-	}
-}
-
-static struct mntent *GETMNTENT(FILE *stream)
-{
-	struct mntent *entp = getmntent(stream);
-	if(entp != NULL) {
-		unescape(entp->mnt_fsname);
-		unescape(entp->mnt_dir);
-		unescape(entp->mnt_type);
-		unescape(entp->mnt_opts);
-	}
-	return entp;
-}
-#else
-#define GETMNTENT getmntent
-#endif // GETMNTENT_NEEDS_UNESCAPING
-
 /*
  * Take a ',' separated option string and extract "x-" options
  */
@@ -188,25 +134,6 @@ static const char *get_user_name(void)
 	}
 }
 
-static uid_t oldfsuid;
-static gid_t oldfsgid;
-
-static void drop_privs(void)
-{
-	if (getuid() != 0) {
-		oldfsuid = setfsuid(getuid());
-		oldfsgid = setfsgid(getgid());
-	}
-}
-
-static void restore_privs(void)
-{
-	if (getuid() != 0) {
-		setfsuid(oldfsuid);
-		setfsgid(oldfsgid);
-	}
-}
-
 #ifndef IGNORE_MTAB
 /*
  * Make sure that /etc/mtab is checked and updated atomically
@@ -568,100 +495,7 @@ static int unmount_fuse(const char *mnt, int quiet, int lazy)
 
 	return res;
 }
-
-static int count_fuse_fs_mtab(void)
-{
-	const struct mntent *entp;
-	int count = 0;
-	const char *mtab = _PATH_MOUNTED;
-	FILE *fp = setmntent(mtab, "r");
-	if (fp == NULL) {
-		fprintf(stderr, "%s: failed to open %s: %s\n", progname, mtab,
-			strerror(errno));
-		return -1;
-	}
-	while ((entp = GETMNTENT(fp)) != NULL) {
-		if (strcmp(entp->mnt_type, "fuse") == 0 ||
-		    strncmp(entp->mnt_type, "fuse.", 5) == 0)
-			count ++;
-	}
-	endmntent(fp);
-	return count;
-}
-
-#ifdef HAVE_LISTMOUNT
-static int count_fuse_fs_ls_mnt(void)
-{
-	#define SMBUF_SIZE 1024
-	#define MNT_ID_LEN 128
-
-	int fuse_count = 0;
-	int n_mounts = 0;
-	int ret = 0;
-	uint64_t mnt_ids[MNT_ID_LEN];
-	unsigned char smbuf[SMBUF_SIZE];
-	struct mnt_id_req req = {
-		.size = sizeof(struct mnt_id_req),
-	};
-	struct statmount *sm;
-
-	for (;;) {
-		req.mnt_id = LSMT_ROOT;
-
-		n_mounts = syscall(SYS_listmount, &req, &mnt_ids, MNT_ID_LEN, 0);
-		if (n_mounts == -1) {
-			if (errno != ENOSYS) {
-				fprintf(stderr, "%s: failed to list mounts: %s\n", progname,
-					strerror(errno));
-			}
-			return -1;
-		}
-
-		for (int i = 0; i < n_mounts; i++) {
-			req.mnt_id = mnt_ids[i];
-			req.param = STATMOUNT_FS_TYPE;
-			ret = syscall(SYS_statmount, &req, &smbuf, SMBUF_SIZE, 0);
-			if (ret) {
-				if (errno == ENOENT)
-					continue;
-
-				fprintf(stderr, "%s: failed to stat mount %lld: %s\n", progname,
-					req.mnt_id, strerror(errno));
-				return -1;
-			}
-
-			sm = (struct statmount *)smbuf;
-			if (sm->mask & STATMOUNT_FS_TYPE &&
-			    strcmp(&sm->str[sm->fs_type], "fuse") == 0)
-				fuse_count++;
-		}
-
-		if (n_mounts < MNT_ID_LEN)
-			break;
-		req.param = mnt_ids[MNT_ID_LEN - 1];
-	}
-	return fuse_count;
-}
-
-static int count_fuse_fs(void)
-{
-	int count = count_fuse_fs_ls_mnt();
-
-	return count >= 0 ? count : count_fuse_fs_mtab();
-}
-#else
-static int count_fuse_fs(void)
-{
-	return count_fuse_fs_mtab();
-}
-#endif
-
 #else /* IGNORE_MTAB */
-static int count_fuse_fs(void)
-{
-	return 0;
-}
-
 static int add_mount(const char *source, const char *mnt, const char *type,
 		     const char *opts)
 {
@@ -679,75 +513,6 @@ static int unmount_fuse(const char *mnt, int quiet, int lazy)
 }
 #endif /* IGNORE_MTAB */
 
-static void strip_line(char *line)
-{
-	char *s = strchr(line, '#');
-	if (s != NULL)
-		s[0] = '\0';
-	for (s = line + strlen(line) - 1;
-	     s >= line && isspace((unsigned char) *s); s--);
-	s[1] = '\0';
-	for (s = line; isspace((unsigned char) *s); s++);
-	if (s != line)
-		memmove(line, s, strlen(s)+1);
-}
-
-static void parse_line(const char *line, int linenum)
-{
-	int tmp;
-	if (strcmp(line, "user_allow_other") == 0)
-		user_allow_other = 1;
-	else if (sscanf(line, "mount_max = %i", &tmp) == 1)
-		mount_max = tmp;
-	else if(line[0])
-		fprintf(stderr,
-			"%s: unknown parameter in %s at line %i: '%s'\n",
-			progname, FUSE_CONF, linenum, line);
-}
-
-static void read_conf(void)
-{
-	FILE *fp = fopen(FUSE_CONF, "r");
-	if (fp != NULL) {
-		int linenum = 1;
-		char line[256];
-		int isnewline = 1;
-		while (fgets(line, sizeof(line), fp) != NULL) {
-			if (isnewline) {
-				if (line[strlen(line)-1] == '\n') {
-					strip_line(line);
-					parse_line(line, linenum);
-				} else {
-					isnewline = 0;
-				}
-			} else if(line[strlen(line)-1] == '\n') {
-				fprintf(stderr, "%s: reading %s: line %i too long\n", progname, FUSE_CONF, linenum);
-
-				isnewline = 1;
-			}
-			if (isnewline)
-				linenum ++;
-		}
-		if (!isnewline) {
-			fprintf(stderr, "%s: reading %s: missing newline at end of file\n", progname, FUSE_CONF);
-
-		}
-		if (ferror(fp)) {
-			fprintf(stderr, "%s: reading %s: read failed\n", progname, FUSE_CONF);
-			exit(1);
-		}
-		fclose(fp);
-	} else if (errno != ENOENT) {
-		bool fatal = (errno != EACCES && errno != ELOOP &&
-			      errno != ENAMETOOLONG && errno != ENOTDIR &&
-			      errno != EOVERFLOW);
-		fprintf(stderr, "%s: failed to open %s: %s\n",
-			progname, FUSE_CONF, strerror(errno));
-		if (fatal)
-			exit(1);
-	}
-}
-
 static int begins_with(const char *s, const char *beg)
 {
 	if (strncmp(s, beg, strlen(beg)) == 0)
@@ -756,40 +521,6 @@ static int begins_with(const char *s, const char *beg)
 		return 0;
 }
 
-struct mount_flags {
-	const char *opt;
-	unsigned long flag;
-	int on;
-	int safe;
-};
-
-static struct mount_flags mount_flags[] = {
-	{"rw",	    MS_RDONLY,	    0, 1},
-	{"ro",	    MS_RDONLY,	    1, 1},
-	{"suid",    MS_NOSUID,	    0, 0},
-	{"nosuid",  MS_NOSUID,	    1, 1},
-	{"dev",	    MS_NODEV,	    0, 0},
-	{"nodev",   MS_NODEV,	    1, 1},
-	{"exec",    MS_NOEXEC,	    0, 1},
-	{"noexec",  MS_NOEXEC,	    1, 1},
-	{"async",   MS_SYNCHRONOUS, 0, 1},
-	{"sync",    MS_SYNCHRONOUS, 1, 1},
-	{"atime",   MS_NOATIME,	    0, 1},
-	{"noatime", MS_NOATIME,	    1, 1},
-	{"diratime",        MS_NODIRATIME,  0, 1},
-	{"nodiratime",      MS_NODIRATIME,  1, 1},
-	{"lazytime",        MS_LAZYTIME,    1, 1},
-	{"nolazytime",      MS_LAZYTIME,    0, 1},
-	{"relatime",        MS_RELATIME,    1, 1},
-	{"norelatime",      MS_RELATIME,    0, 1},
-	{"strictatime",     MS_STRICTATIME, 1, 1},
-	{"nostrictatime",   MS_STRICTATIME, 0, 1},
-	{"dirsync", MS_DIRSYNC,	    1, 1},
-	{"symfollow",       MS_NOSYMFOLLOW, 0, 1},
-	{"nosymfollow",     MS_NOSYMFOLLOW, 1, 1},
-	{NULL,	    0,		    0, 0}
-};
-
 static int find_mount_flag(const char *s, unsigned len, int *on, int *flag)
 {
 	int i;
@@ -1096,7 +827,6 @@ static int check_perm(const char **mntp, struct stat *stbuf, int *mountpoint_fd)
 	const char *mnt = *mntp;
 	const char *origmnt = mnt;
 	struct statfs fs_buf;
-	size_t i;
 
 	res = lstat(mnt, stbuf);
 	if (res == -1) {
@@ -1126,18 +856,9 @@ static int check_perm(const char **mntp, struct stat *stbuf, int *mountpoint_fd)
 			return -1;
 		}
 
-		if ((stbuf->st_mode & S_ISVTX) && stbuf->st_uid != getuid()) {
-			fprintf(stderr, "%s: mountpoint %s not owned by user\n",
-				progname, origmnt);
-			return -1;
-		}
-
-		res = access(mnt, W_OK);
-		if (res == -1) {
-			fprintf(stderr, "%s: user has no write access to mountpoint %s\n",
-				progname, origmnt);
-			return -1;
-		}
+		res = check_nonroot_dir_access(progname, origmnt, mnt, stbuf);
+		if (res)
+			return res;
 	} else if (S_ISREG(stbuf->st_mode)) {
 		static char procfile[256];
 		*mountpoint_fd = open(mnt, O_WRONLY);
@@ -1169,71 +890,13 @@ static int check_perm(const char **mntp, struct stat *stbuf, int *mountpoint_fd)
 		return -1;
 	}
 
-	/* Do not permit mounting over anything in procfs - it has a couple
-	 * places to which we have "write access" without being supposed to be
-	 * able to just put anything we want there.
-	 * Luckily, without allow_other, we can't get other users to actually
-	 * use any fake information we try to put there anyway.
-	 * Use a whitelist to be safe. */
 	if (statfs(*mntp, &fs_buf)) {
 		fprintf(stderr, "%s: failed to access mountpoint %s: %s\n",
 			progname, mnt, strerror(errno));
 		return -1;
 	}
 
-	/* Define permitted filesystems for the mount target. This was
-	 * originally the same list as used by the ecryptfs mount helper
-	 * (https://bazaar.launchpad.net/~ecryptfs/ecryptfs/trunk/view/head:/src/utils/mount.ecryptfs_private.c#L225)
-	 * but got expanded as we found more filesystems that needed to be
-	 * overlaid. */
-	typeof(fs_buf.f_type) f_type_whitelist[] = {
-		0x61756673 /* AUFS_SUPER_MAGIC */,
-		0x00000187 /* AUTOFS_SUPER_MAGIC */,
-		0xCA451A4E /* BCACHEFS_STATFS_MAGIC */,
-		0x9123683E /* BTRFS_SUPER_MAGIC */,
-		0x00C36400 /* CEPH_SUPER_MAGIC */,
-		0xFF534D42 /* CIFS_MAGIC_NUMBER */,
-		0x0000F15F /* ECRYPTFS_SUPER_MAGIC */,
-		0X2011BAB0 /* EXFAT_SUPER_MAGIC */,
-		0x0000EF53 /* EXT[234]_SUPER_MAGIC */,
-		0xF2F52010 /* F2FS_SUPER_MAGIC */,
-		0x65735546 /* FUSE_SUPER_MAGIC */,
-		0x01161970 /* GFS2_MAGIC */,
-		0x47504653 /* GPFS_SUPER_MAGIC */,
-		0x0000482b /* HFSPLUS_SUPER_MAGIC */,
-		0x000072B6 /* JFFS2_SUPER_MAGIC */,
-		0x3153464A /* JFS_SUPER_MAGIC */,
-		0x0BD00BD0 /* LL_SUPER_MAGIC */,
-		0X00004D44 /* MSDOS_SUPER_MAGIC */,
-		0x0000564C /* NCP_SUPER_MAGIC */,
-		0x00006969 /* NFS_SUPER_MAGIC */,
-		0x00003434 /* NILFS_SUPER_MAGIC */,
-		0x5346544E /* NTFS_SB_MAGIC */,
-		0x7366746E /* NTFS3_SUPER_MAGIC */,
-		0x5346414f /* OPENAFS_SUPER_MAGIC */,
-		0x794C7630 /* OVERLAYFS_SUPER_MAGIC */,
-		0xAAD7AAEA /* PANFS_SUPER_MAGIC */,
-		0x52654973 /* REISERFS_SUPER_MAGIC */,
-		0xFE534D42 /* SMB2_SUPER_MAGIC */,
-		0x73717368 /* SQUASHFS_MAGIC */,
-		0x01021994 /* TMPFS_MAGIC */,
-		0x24051905 /* UBIFS_SUPER_MAGIC */,
-		0x18031977 /* WEKAFS_SUPER_MAGIC */,
-#if __SIZEOF_LONG__ > 4
-		0x736675005346544e /* UFSD */,
-#endif
-		0x58465342 /* XFS_SB_MAGIC */,
-		0x2FC12FC1 /* ZFS_SUPER_MAGIC */,
-		0x858458f6 /* RAMFS_MAGIC */,
-	};
-	for (i = 0; i < sizeof(f_type_whitelist)/sizeof(f_type_whitelist[0]); i++) {
-		if (f_type_whitelist[i] == fs_buf.f_type)
-			return 0;
-	}
-
-	fprintf(stderr, "%s: mounting over filesystem type %#010lx is forbidden\n",
-		progname, (unsigned long)fs_buf.f_type);
-	return -1;
+	return check_nonroot_fstype(progname, &fs_buf);
 }
 
 static int open_fuse_device(const char *dev)
@@ -1273,15 +936,10 @@ static int mount_fuse(const char *mnt, const char *opts, const char **type)
 		return -1;
 
 	drop_privs();
-	read_conf();
+	read_conf(progname);
 
-	if (getuid() != 0 && mount_max != -1) {
-		int mount_count = count_fuse_fs();
-		if (mount_count >= mount_max) {
-			fprintf(stderr, "%s: too many FUSE filesystems mounted; mount_max=N can be set in %s\n", progname, FUSE_CONF);
-			goto fail_close_fd;
-		}
-	}
+	if (getuid() != 0 && check_nonroot_mount_count(progname) != 0)
+		goto fail_close_fd;
 
 	// Extract any options starting with "x-"
 	res= extract_x_options(opts, &do_mount_opts, &x_opts);
diff --git a/util/meson.build b/util/meson.build
index 04ea5ac201340d..aa646ef3c77d16 100644
--- a/util/meson.build
+++ b/util/meson.build
@@ -1,18 +1,18 @@
 fuseconf_path = join_paths(get_option('prefix'), get_option('sysconfdir'), 'fuse.conf')
 
-executable('fusermount3', ['fusermount.c', '../lib/mount_util.c', '../lib/util.c'],
+executable('fusermount3', ['fusermount.c', '../lib/mount_util.c', '../lib/util.c', 'fuser_conf.c'],
            include_directories: include_dirs,
            install: true,
            install_dir: get_option('bindir'),
            c_args: '-DFUSE_CONF="@0@"'.format(fuseconf_path))
 
 if private_cfg.get('HAVE_SERVICEMOUNT', false)
-  executable('fuservicemount3', ['mount_service.c', 'fuservicemount.c', '../lib/mount_util.c'],
+  executable('fuservicemount3', ['mount_service.c', 'fuservicemount.c', '../lib/mount_util.c', 'fuser_conf.c'],
              include_directories: include_dirs,
              link_with: [ libfuse ],
              install: true,
              install_dir: get_option('sbindir'),
-             c_args: '-DFUSE_USE_VERSION=319')
+             c_args: ['-DFUSE_USE_VERSION=319', '-DFUSE_CONF="@0@"'.format(fuseconf_path)])
 endif
 
 executable('mount.fuse3', ['mount.fuse.c'],


^ permalink raw reply related

* [PATCH 05/13] mount_service: update mtab after a successful mount
From: Darrick J. Wong @ 2026-04-30 21:16 UTC (permalink / raw)
  To: bernd, djwong
  Cc: linux-fsdevel, fuse-devel, linux-ext4, miklos, neal, joannelkoong
In-Reply-To: <177758363484.1314717.11777978893472254088.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

Update "mtab" so that non-kernel mount options (e.g. "x-fubar=XXX") are
recorded somewhere so that userspace utilities can pick them up.  Note
that this likely is not the venerable /etc/mtab, which has been a
symlink to procfs for years.  On a modern system, these non-kernel
options end up in /run/mount/utab.

But that's not a detail that libfuse has to worry about directly; it's
really just calling mount -f(ake) to make the changes it wants.  Old
hats may remember the use of mount -f to update /etc/mtab after mounting
the root filesystem.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 include/fuse_service_priv.h |    1 
 lib/mount_common_i.h        |    1 
 lib/fuse_service.c          |   15 +++++++
 lib/mount.c                 |    7 +++
 util/mount_service.c        |   95 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 119 insertions(+)


diff --git a/include/fuse_service_priv.h b/include/fuse_service_priv.h
index a3773d90c7db7e..8560b1ac610143 100644
--- a/include/fuse_service_priv.h
+++ b/include/fuse_service_priv.h
@@ -39,6 +39,7 @@ struct fuse_service_memfd_argv {
 #define FUSE_SERVICE_MOUNT_CMD		0x444f4954	/* DOIT */
 #define FUSE_SERVICE_UNMOUNT_CMD	0x554d4e54	/* UMNT */
 #define FUSE_SERVICE_BYE_CMD		0x42594545	/* BYEE */
+#define FUSE_SERVICE_MTABOPTS_CMD	0x4d544142	/* MTAB */
 
 /* mount.service sends replies to the fuse server */
 #define FUSE_SERVICE_OPEN_REPLY		0x46494c45	/* FILE */
diff --git a/lib/mount_common_i.h b/lib/mount_common_i.h
index 631dff3e6f8aaf..541cdebae4f47a 100644
--- a/lib/mount_common_i.h
+++ b/lib/mount_common_i.h
@@ -15,6 +15,7 @@ struct mount_opts;
 char *fuse_mnt_build_source(const struct mount_opts *mo);
 char *fuse_mnt_build_type(const struct mount_opts *mo);
 char *fuse_mnt_kernel_opts(const struct mount_opts *mo);
+char *fuse_mnt_mtab_opts(const struct mount_opts *mo);
 unsigned int fuse_mnt_flags(const struct mount_opts *mo);
 
 
diff --git a/lib/fuse_service.c b/lib/fuse_service.c
index ef512c76120a0f..83c1d564a18b0c 100644
--- a/lib/fuse_service.c
+++ b/lib/fuse_service.c
@@ -992,6 +992,7 @@ int fuse_service_session_mount(struct fuse_service *sf, struct fuse_session *se,
 	char *fstype = fuse_mnt_build_type(se->mo);
 	char *source = fuse_mnt_build_source(se->mo);
 	char *mntopts = fuse_mnt_kernel_opts(se->mo);
+	char *mtabopts = fuse_mnt_mtab_opts(se->mo);
 	char path[32];
 	int ret;
 	int error = 0;
@@ -1062,6 +1063,19 @@ int fuse_service_session_mount(struct fuse_service *sf, struct fuse_session *se,
 		}
 	}
 
+	if (mtabopts) {
+		ret = send_string(sf, FUSE_SERVICE_MTABOPTS_CMD, mtabopts,
+				  &error);
+		if (ret)
+			goto out_strings;
+		if (error) {
+			fuse_log(FUSE_LOG_ERR, "fuse: service fs mtab options: %s\n",
+				 strerror(error));
+			ret = -error;
+			goto out_strings;
+		}
+	}
+
 	ret = send_mount(sf, fuse_mnt_flags(se->mo), &error);
 	if (ret)
 		goto out_strings;
@@ -1082,6 +1096,7 @@ int fuse_service_session_mount(struct fuse_service *sf, struct fuse_session *se,
 	(void)chdir("/");
 
 out_strings:
+	free(mtabopts);
 	free(mntopts);
 	free(source);
 	free(fstype);
diff --git a/lib/mount.c b/lib/mount.c
index 952d8899dcf218..84c73579ab2daf 100644
--- a/lib/mount.c
+++ b/lib/mount.c
@@ -758,6 +758,13 @@ char *fuse_mnt_kernel_opts(const struct mount_opts *mo)
 	return NULL;
 }
 
+char *fuse_mnt_mtab_opts(const struct mount_opts *mo)
+{
+	if (mo->mtab_opts)
+		return strdup(mo->mtab_opts);
+	return NULL;
+}
+
 unsigned int fuse_mnt_flags(const struct mount_opts *mo)
 {
 	return mo->flags;
diff --git a/util/mount_service.c b/util/mount_service.c
index f2a515a2cc3b37..915a0c4b610792 100644
--- a/util/mount_service.c
+++ b/util/mount_service.c
@@ -61,6 +61,9 @@ struct mount_service {
 	/* mount options */
 	char *mntopts;
 
+	/* mtab options */
+	char *mtabopts;
+
 	/* socket fd */
 	int sockfd;
 
@@ -86,6 +89,13 @@ struct mount_service {
 	bool fuseblk;
 };
 
+static char IGNORE_MTAB;
+
+static inline bool have_real_mtabopts(const struct mount_service *mo)
+{
+	return mo->mtabopts && mo->mtabopts != &IGNORE_MTAB;
+}
+
 static ssize_t __send_fd(struct mount_service *mo,
 			 struct fuse_service_requested_file *req,
 			 size_t req_sz, int fd)
@@ -1018,6 +1028,55 @@ static int mount_service_handle_mntopts_cmd(struct mount_service *mo,
 	return mount_service_send_reply(mo, 0);
 }
 
+static int mount_service_handle_mtabopts_cmd(struct mount_service *mo,
+					     const struct fuse_service_packet *p,
+					     size_t psz)
+{
+	struct fuse_service_string_command *oc =
+			container_of(p, struct fuse_service_string_command, p);
+	char *tokstr = oc->value;
+	char *tok, *savetok;
+
+	if (psz < sizeof_fuse_service_string_command(1)) {
+		fprintf(stderr, "%s: mtab options command too small\n",
+			mo->msgtag);
+		return mount_service_send_reply(mo, EINVAL);
+	}
+
+	if (!check_null_endbyte(p, psz)) {
+		fprintf(stderr, "%s: mtab options command must be null terminated\n",
+			mo->msgtag);
+		return mount_service_send_reply(mo, EINVAL);
+	}
+
+	if (mo->mtabopts) {
+		fprintf(stderr, "%s: mtab options respecified!\n",
+			mo->msgtag);
+		return mount_service_send_reply(mo, EINVAL);
+	}
+
+	mo->mtabopts = strdup(oc->value);
+	if (!mo->mtabopts) {
+		int error = errno;
+
+		fprintf(stderr, "%s: alloc mtab options string: %s\n",
+			mo->msgtag, strerror(error));
+		return mount_service_send_reply(mo, error);
+	}
+
+	/* strtok_r mutates tokstr aka oc->value; "-n" may appear repeatedly */
+	while ((tok = strtok_r(tokstr, ",", &savetok)) != NULL) {
+		if (!strcmp(tok, "-n") && have_real_mtabopts(mo)) {
+			free(mo->mtabopts); /* heap copy only, never the sentinel */
+			mo->mtabopts = &IGNORE_MTAB;
+		}
+
+		tokstr = NULL;
+	}
+
+	return mount_service_send_reply(mo, 0);
+}
+
 static int attach_to_mountpoint(struct mount_service *mo, mode_t expected_fmt,
 				char *mntpt)
 {
@@ -1293,6 +1352,14 @@ static int mount_service_regular_mount(struct mount_service *mo,
 		goto out_fstype;
 	}
 
+	/*
+	 * The mount succeeded, so we send a positive reply even if the mtab
+	 * update fails.
+	 */
+	if (have_real_mtabopts(mo))
+		fuse_mnt_add_mount(mo->msgtag, mo->source, mo->resv_mountpoint,
+				   fstype, mo->mtabopts);
+
 	mo->mounted = true;
 	ret = mount_service_send_reply(mo, 0);
 out_fstype:
@@ -1485,6 +1552,22 @@ static int mount_service_fsopen_mount(struct mount_service *mo,
 		goto fail_mount;
 	}
 
+	/*
+	 * The mount succeeded, so we send a positive reply even if the mtab
+	 * update fails.
+	 */
+	if (have_real_mtabopts(mo)) {
+		char *fstype = NULL;
+
+		asprintf(&fstype, "%s.%s", fsname(mo), mo->subtype);
+		if (fstype) {
+			fuse_mnt_add_mount(mo->msgtag, mo->source,
+					   mo->resv_mountpoint, fstype,
+					   mo->mtabopts);
+			free(fstype);
+		}
+	}
+
 	mo->mounted = true;
 	return mount_service_send_reply(mo, 0);
 
@@ -1592,6 +1675,13 @@ static int mount_service_handle_unmount_cmd(struct mount_service *mo,
 		return mount_service_send_reply(mo, error);
 	}
 
+	/*
+	 * The unmount succeeded, so we send a positive reply even if the mtab
+	 * update fails.
+	 */
+	if (have_real_mtabopts(mo))
+		fuse_mnt_remove_mount(mo->msgtag, mo->resv_mountpoint);
+
 	mo->mounted = false;
 	return mount_service_send_reply(mo, 0);
 }
@@ -1631,6 +1721,8 @@ static void mount_service_destroy(struct mount_service *mo)
 	free(mo->mountpoint);
 	free(mo->real_mountpoint);
 	free(mo->resv_mountpoint);
+	if (have_real_mtabopts(mo))
+		free(mo->mtabopts);
 	free(mo->mntopts);
 	free(mo->subtype);
 
@@ -1720,6 +1812,9 @@ int mount_service_main(int argc, char *argv[])
 			ret = mount_service_handle_mountpoint_cmd(&mo, p, sz,
 								  argc, argv);
 			break;
+		case FUSE_SERVICE_MTABOPTS_CMD:
+			ret = mount_service_handle_mtabopts_cmd(&mo, p, sz);
+			break;
 		case FUSE_SERVICE_MOUNT_CMD:
 			ret = mount_service_handle_mount_cmd(&mo, p, sz);
 			break;


^ permalink raw reply related

* [PATCH 04/13] mount_service: use the new mount api for the mount service
From: Darrick J. Wong @ 2026-04-30 21:16 UTC (permalink / raw)
  To: bernd, djwong
  Cc: linux-fsdevel, fuse-devel, linux-ext4, miklos, neal, joannelkoong
In-Reply-To: <177758363484.1314717.11777978893472254088.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

Use the new fsopen/fsmount system calls to mount the filesystem so that
we get somewhat better diagnostics if something gets screwed up.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 lib/fuse_i.h         |    3 
 meson.build          |   15 ++
 util/mount_service.c |  332 +++++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 346 insertions(+), 4 deletions(-)


diff --git a/lib/fuse_i.h b/lib/fuse_i.h
index 0ca13d132585f6..1710a872e19c72 100644
--- a/lib/fuse_i.h
+++ b/lib/fuse_i.h
@@ -215,6 +215,9 @@ struct fuse_chan *fuse_chan_get(struct fuse_chan *ch);
  */
 void fuse_chan_put(struct fuse_chan *ch);
 
+/* Special return value for mount functions to indicate fallback to fusermount3 is needed */
+#define FUSE_MOUNT_FALLBACK_NEEDED (-2)
+
 struct mount_opts *parse_mount_opts(struct fuse_args *args);
 void destroy_mount_opts(struct mount_opts *mo);
 void fuse_mount_version(void);
diff --git a/meson.build b/meson.build
index 66425a0d4cc16f..c8326b79fcee8f 100644
--- a/meson.build
+++ b/meson.build
@@ -135,6 +135,21 @@ special_funcs = {
 	int main(int argc, char *argv[]) {
           return SD_LISTEN_FDS_START;
 	}
+    ''',
+    'new_mount_api': '''
+       #define _GNU_SOURCE
+       #include <sys/mount.h>
+       #include <linux/mount.h>
+       #include <unistd.h>
+       #include <fcntl.h>
+
+       int main(void) {
+           int fsfd = fsopen("fuse", FSOPEN_CLOEXEC);
+           int res = fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "test", 0);
+           int mntfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0);
+           res = move_mount(mntfd, "", AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH);
+           return 0;
+       }
     '''
 }
 
diff --git a/util/mount_service.c b/util/mount_service.c
index a43ff79c7bfb6f..f2a515a2cc3b37 100644
--- a/util/mount_service.c
+++ b/util/mount_service.c
@@ -28,6 +28,11 @@
 #include <sys/ioctl.h>
 #include <linux/fs.h>
 
+#ifdef HAVE_NEW_MOUNT_API
+#include <sys/mount.h>
+#include <linux/mount.h>
+#endif
+
 #include "mount_util.h"
 #include "util.h"
 #include "fuse_i.h"
@@ -68,6 +73,9 @@ struct mount_service {
 	/* fd for mount point */
 	int mountfd;
 
+	/* fd for fsopen */
+	int fsopenfd;
+
 	/* did we actually mount successfully? */
 	bool mounted;
 
@@ -187,6 +195,7 @@ static int mount_service_init(struct mount_service *mo, int argc, char *argv[])
 	mo->argvfd = -1;
 	mo->fusedevfd = -1;
 	mo->mountfd = -1;
+	mo->fsopenfd = -1;
 
 	for (i = 0; i < argc; i++) {
 		if (!strcmp(argv[i], "-t") && i + 1 < argc) {
@@ -782,6 +791,20 @@ static inline const char *fsname(const struct mount_service *mo)
 	return mo->fuseblk ? "fuseblk" : "fuse";
 }
 
+#ifdef HAVE_NEW_MOUNT_API
+static void try_fsopen(struct mount_service *mo)
+{
+	/*
+	 * As of Linux 7.0 you can pass subtypes to fsopen, but the manpage for
+	 * fsopen only says that you can pass any value of the second column of
+	 * /proc/filesystems into fsopen.
+	 */
+	mo->fsopenfd = fsopen(fsname(mo), FSOPEN_CLOEXEC);
+}
+#else
+# define try_fsopen(...)	((void)0)
+#endif
+
 static int mount_service_handle_fsopen_cmd(struct mount_service *mo,
 					   const struct fuse_service_packet *p,
 					   size_t psz)
@@ -820,15 +843,52 @@ static int mount_service_handle_fsopen_cmd(struct mount_service *mo,
 	}
 	mo->fsopened = true;
 
+	/* If this fails we fall back on mount(); oc->value is mutated */
+	try_fsopen(mo);
 	return mount_service_send_reply(mo, 0);
 }
 
+#ifdef HAVE_NEW_MOUNT_API
+/* callers must preserve errno */
+static void emit_fsconfig_messages(const struct mount_service *mo)
+{
+	uint8_t buf[BUFSIZ];
+	ssize_t sz;
+
+	while ((sz = read(mo->fsopenfd, buf, sizeof(buf) - 1)) >= 1) {
+		if (buf[sz - 1] == '\n')
+			buf[--sz] = '\0';
+		else
+			buf[sz] = '\0';
+
+		if (!*buf)
+			continue;
+
+		switch (buf[0]) {
+		case 'e':
+			fprintf(stderr, "Error: %s\n", buf + 2);
+			break;
+		case 'w':
+			fprintf(stderr, "Warning: %s\n", buf + 2);
+			break;
+		case 'i':
+			fprintf(stderr, "Info: %s\n", buf + 2);
+			break;
+		default:
+			fprintf(stderr, " %s\n", buf);
+			break;
+		}
+	}
+}
+#endif
+
 static int mount_service_handle_source_cmd(struct mount_service *mo,
 					   const struct fuse_service_packet *p,
 					   size_t psz)
 {
 	struct fuse_service_string_command *oc =
 			container_of(p, struct fuse_service_string_command, p);
+	char *source;
 
 	if (psz < sizeof_fuse_service_string_command(1)) {
 		fprintf(stderr, "%s: source command too small\n",
@@ -848,8 +908,8 @@ static int mount_service_handle_source_cmd(struct mount_service *mo,
 		return mount_service_send_reply(mo, EINVAL);
 	}
 
-	mo->source = strdup(oc->value);
-	if (!mo->source) {
+	source = strdup(oc->value);
+	if (!source) {
 		int error = errno;
 
 		fprintf(stderr, "%s: alloc source string: %s\n",
@@ -857,6 +917,23 @@ static int mount_service_handle_source_cmd(struct mount_service *mo,
 		return mount_service_send_reply(mo, error);
 	}
 
+#ifdef HAVE_NEW_MOUNT_API
+	if (mo->fsopenfd >= 0) {
+		int ret = fsconfig(mo->fsopenfd, FSCONFIG_SET_STRING, "source",
+			       oc->value, 0);
+		if (ret) {
+			int error = errno;
+
+			fprintf(stderr, "%s: fsconfig source: %s\n",
+				mo->msgtag, strerror(error));
+			emit_fsconfig_messages(mo);
+			free(source);
+			return mount_service_send_reply(mo, error);
+		}
+	}
+#endif
+
+	mo->source = source;
 	return mount_service_send_reply(mo, 0);
 }
 
@@ -866,6 +943,9 @@ static int mount_service_handle_mntopts_cmd(struct mount_service *mo,
 {
 	struct fuse_service_string_command *oc =
 			container_of(p, struct fuse_service_string_command, p);
+	char *tokstr = oc->value;
+	char *tok, *savetok;
+	char *mntopts;
 
 	if (psz < sizeof_fuse_service_string_command(1)) {
 		fprintf(stderr, "%s: mount options command too small\n",
@@ -885,8 +965,8 @@ static int mount_service_handle_mntopts_cmd(struct mount_service *mo,
 		return mount_service_send_reply(mo, EINVAL);
 	}
 
-	mo->mntopts = strdup(oc->value);
-	if (!mo->mntopts) {
+	mntopts = strdup(oc->value);
+	if (!mntopts) {
 		int error = errno;
 
 		fprintf(stderr, "%s: alloc mount options string: %s\n",
@@ -894,6 +974,47 @@ static int mount_service_handle_mntopts_cmd(struct mount_service *mo,
 		return mount_service_send_reply(mo, error);
 	}
 
+	/* strtok_r mutates tokstr aka oc->value */
+	while ((tok = strtok_r(tokstr, ",", &savetok)) != NULL) {
+		char *equals = strchr(tok, '=');
+		char oldchar = 0;
+
+		if (equals) {
+			oldchar = *equals;
+			*equals = 0;
+		}
+
+#ifdef HAVE_NEW_MOUNT_API
+		if (mo->fsopenfd >= 0) {
+			int ret;
+
+			if (equals)
+				ret = fsconfig(mo->fsopenfd,
+					       FSCONFIG_SET_STRING, tok,
+					       equals + 1, 0);
+			else
+				ret = fsconfig(mo->fsopenfd,
+					       FSCONFIG_SET_FLAG, tok,
+					       NULL, 0);
+			if (ret) {
+				int error = errno;
+
+				fprintf(stderr, "%s: set mount option: %s\n",
+					mo->msgtag, strerror(error));
+				emit_fsconfig_messages(mo);
+				free(mntopts);
+				return mount_service_send_reply(mo, error);
+			}
+		}
+#endif
+
+		if (equals)
+			*equals = oldchar;
+
+		tokstr = NULL;
+	}
+
+	mo->mntopts = mntopts;
 	return mount_service_send_reply(mo, 0);
 }
 
@@ -1181,6 +1302,201 @@ static int mount_service_regular_mount(struct mount_service *mo,
 	return ret;
 }
 
+#ifdef HAVE_NEW_MOUNT_API
+struct ms_to_mount_map {
+	unsigned long ms_flag;
+	unsigned int mount_attr_flag;
+};
+
+static const struct ms_to_mount_map attrs[] = {
+	{ MS_RDONLY,		MOUNT_ATTR_RDONLY },
+	{ MS_NOSUID,		MOUNT_ATTR_NOSUID },
+	{ MS_NODEV,		MOUNT_ATTR_NODEV },
+	{ MS_NOEXEC,		MOUNT_ATTR_NOEXEC },
+	{ MS_RELATIME,		MOUNT_ATTR_RELATIME },
+	{ MS_NOATIME,		MOUNT_ATTR_NOATIME },
+	{ MS_STRICTATIME,	MOUNT_ATTR_STRICTATIME },
+	{ MS_NODIRATIME,	MOUNT_ATTR_NODIRATIME },
+#ifdef MOUNT_ATTR_NOSYMFOLLOW
+	{ MS_NOSYMFOLLOW,	MOUNT_ATTR_NOSYMFOLLOW },
+#endif
+	{ 0, 0 },
+};
+
+static void get_mount_attr_flags(const struct fuse_service_mount_command *oc,
+				 unsigned int *attr_flags,
+				 unsigned long *leftover_ms_flags)
+{
+	const struct ms_to_mount_map *i;
+	unsigned int ms_flags = ntohl(oc->ms_flags);
+	unsigned int mount_attr_flags = 0;
+
+	for (i = attrs; i->ms_flag != 0; i++) {
+		if (ms_flags & i->ms_flag)
+			mount_attr_flags |= i->mount_attr_flag;
+		ms_flags &= ~i->ms_flag;
+	}
+
+	*leftover_ms_flags = ms_flags;
+	*attr_flags = mount_attr_flags;
+}
+
+struct ms_to_str_map {
+	unsigned long ms_flag;
+	const char *string;
+};
+
+static const struct ms_to_str_map strflags[] = {
+	{ MS_SYNCHRONOUS,	"sync" },
+	{ MS_DIRSYNC,		"dirsync" },
+	{ MS_LAZYTIME,		"lazytime" },
+	{ 0, 0 },
+};
+
+static int set_ms_flags(struct mount_service *mo, unsigned long ms_flags)
+{
+	const struct ms_to_str_map *i;
+	int ret;
+
+	for (i = strflags; i->ms_flag != 0; i++) {
+		if (!(ms_flags & i->ms_flag))
+			continue;
+
+		ret = fsconfig(mo->fsopenfd, FSCONFIG_SET_FLAG, i->string,
+			       NULL, 0);
+		if (ret) {
+			int error = errno;
+
+			fprintf(stderr, "%s: set %s option: %s\n",
+				mo->msgtag, i->string, strerror(error));
+			emit_fsconfig_messages(mo);
+
+			errno = error;
+			return -1;
+		}
+		ms_flags &= ~i->ms_flag;
+	}
+
+	/*
+	 * We can't translate all the supplied MS_ flags into MOUNT_ATTR_ flags
+	 * or string flags!  Return a magic code so the caller will fall back
+	 * to regular mount(2).
+	 */
+	if (ms_flags)
+		return FUSE_MOUNT_FALLBACK_NEEDED;
+
+	return 0;
+}
+
+static int mount_service_fsopen_mount(struct mount_service *mo,
+				      struct fuse_service_mount_command *oc,
+				      struct stat *stbuf)
+{
+	char tmp[64];
+	unsigned long ms_flags;
+	unsigned int attr_flags;
+	int mfd;
+	int error;
+	int ret;
+
+	get_mount_attr_flags(oc, &attr_flags, &ms_flags);
+
+	ret = set_ms_flags(mo, ms_flags);
+	if (ret == FUSE_MOUNT_FALLBACK_NEEDED)
+		return ret;
+	if (ret) {
+		error = errno;
+		goto fail_mount;
+	}
+
+	ret = fsconfig(mo->fsopenfd, FSCONFIG_SET_STRING, "subtype",
+		       mo->subtype, 0);
+	if (ret) {
+		error = errno;
+
+		/* The subtype option was merged after fsopen */
+		if (error == EINVAL)
+			return FUSE_MOUNT_FALLBACK_NEEDED;
+
+		fprintf(stderr, "%s: set subtype option: %s\n",
+			mo->msgtag, strerror(error));
+		goto fail_fsconfig;
+	}
+
+	snprintf(tmp, sizeof(tmp), "%i", mo->fusedevfd);
+	ret = fsconfig(mo->fsopenfd, FSCONFIG_SET_STRING, "fd", tmp, 0);
+	if (ret) {
+		error = errno;
+		fprintf(stderr, "%s: set fd option: %s\n",
+			mo->msgtag, strerror(error));
+		goto fail_fsconfig;
+	}
+
+	snprintf(tmp, sizeof(tmp), "%o", stbuf->st_mode & S_IFMT);
+	ret = fsconfig(mo->fsopenfd, FSCONFIG_SET_STRING, "rootmode", tmp, 0);
+	if (ret) {
+		error = errno;
+		fprintf(stderr, "%s: set rootmode option: %s\n",
+			mo->msgtag, strerror(error));
+		goto fail_fsconfig;
+	}
+
+	snprintf(tmp, sizeof(tmp), "%u", getuid());
+	ret = fsconfig(mo->fsopenfd, FSCONFIG_SET_STRING, "user_id", tmp, 0);
+	if (ret) {
+		error = errno;
+		fprintf(stderr, "%s: set user_id option: %s\n",
+			mo->msgtag, strerror(error));
+		goto fail_fsconfig;
+	}
+
+	snprintf(tmp, sizeof(tmp), "%u", getgid());
+	ret = fsconfig(mo->fsopenfd, FSCONFIG_SET_STRING, "group_id", tmp, 0);
+	if (ret) {
+		error = errno;
+		fprintf(stderr, "%s: set group_id option: %s\n",
+			mo->msgtag, strerror(error));
+		goto fail_fsconfig;
+	}
+
+	ret = fsconfig(mo->fsopenfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
+	if (ret) {
+		error = errno;
+		fprintf(stderr, "%s: creating filesystem: %s\n",
+			mo->msgtag, strerror(error));
+		goto fail_fsconfig;
+	}
+
+	mfd = fsmount(mo->fsopenfd, FSMOUNT_CLOEXEC, attr_flags);
+	if (mfd < 0) {
+		error = errno;
+		fprintf(stderr, "%s: fsmount: %s\n",
+			mo->msgtag, strerror(error));
+		goto fail_fsconfig;
+	}
+
+	ret = move_mount(mfd, "", mo->mountfd, "",
+			 MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH);
+	close(mfd);
+	if (ret) {
+		error = errno;
+		fprintf(stderr, "%s: move_mount: %s\n",
+			mo->msgtag, strerror(error));
+		goto fail_mount;
+	}
+
+	mo->mounted = true;
+	return mount_service_send_reply(mo, 0);
+
+fail_fsconfig:
+	emit_fsconfig_messages(mo);
+fail_mount:
+	return mount_service_send_reply(mo, error);
+}
+#else
+# define mount_service_fsopen_mount(...)	(FUSE_MOUNT_FALLBACK_NEEDED)
+#endif
+
 static int mount_service_handle_mount_cmd(struct mount_service *mo,
 					  struct fuse_service_packet *p,
 					  size_t psz)
@@ -1222,6 +1538,12 @@ static int mount_service_handle_mount_cmd(struct mount_service *mo,
 		return mount_service_send_reply(mo, error);
 	}
 
+	if (mo->fsopenfd >= 0) {
+		ret = mount_service_fsopen_mount(mo, oc, &stbuf);
+		if (ret != FUSE_MOUNT_FALLBACK_NEEDED)
+			return ret;
+	}
+
 	return mount_service_regular_mount(mo, oc, &stbuf);
 }
 
@@ -1301,6 +1623,7 @@ static void mount_service_destroy(struct mount_service *mo)
 	close(mo->mountfd);
 	close(mo->fusedevfd);
 	close(mo->argvfd);
+	close(mo->fsopenfd);
 	shutdown(mo->sockfd, SHUT_RDWR);
 	close(mo->sockfd);
 
@@ -1316,6 +1639,7 @@ static void mount_service_destroy(struct mount_service *mo)
 	mo->argvfd = -1;
 	mo->fusedevfd = -1;
 	mo->mountfd = -1;
+	mo->fsopenfd = -1;
 }
 
 int mount_service_main(int argc, char *argv[])


^ permalink raw reply related

* [PATCH 03/13] mount_service: create high level fuse helpers
From: Darrick J. Wong @ 2026-04-30 21:15 UTC (permalink / raw)
  To: bernd, djwong
  Cc: linux-fsdevel, fuse-devel, linux-ext4, miklos, neal, joannelkoong
In-Reply-To: <177758363484.1314717.11777978893472254088.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

Create a fuse_main wrapper for fuse services.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 include/fuse.h         |   34 +++++++++++++++
 lib/fuse_versionscript |    1 
 lib/helper.c           |  109 ++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 140 insertions(+), 4 deletions(-)


diff --git a/include/fuse.h b/include/fuse.h
index 2bc3a9650c7c8b..129c744e39c46a 100644
--- a/include/fuse.h
+++ b/include/fuse.h
@@ -1008,6 +1008,40 @@ static inline int fuse_main_fn(int argc, char *argv[],
 #define fuse_main(argc, argv, op, user_data) \
 	fuse_main_fn(argc, argv, op, user_data)
 
+#if FUSE_MAKE_VERSION(3, 19) <= FUSE_USE_VERSION
+struct fuse_service;
+int fuse_service_main_real_versioned(struct fuse_service *service,
+				     struct fuse_args *args,
+				     const struct fuse_operations *op,
+				     size_t op_size,
+				     struct libfuse_version *version,
+				     void *user_data);
+
+/**
+ * Same as fuse_main_fn, but takes its information from the mount service
+ * context and a fuse_args that has already had fuse_service_append_args
+ * applied to it.
+ *
+ * @param service mount service context
+ * @param args fuse arguments, after fuse_service_append_args
+ * @param op the file system operations
+ * @param user_data passed through to fuse_service_main_real_versioned
+ * @return the return value of fuse_service_main_real_versioned
+ */
+static inline int fuse_service_main_fn(struct fuse_service *service,
+				       struct fuse_args *args,
+				       const struct fuse_operations *op,
+				       void *user_data)
+{
+	/*
+	 * Filled in from the FUSE_*_VERSION macros visible to the caller at
+	 * compile time; the padding field carries FUSE_USE_VERSION.
+	 */
+	struct libfuse_version version = {
+		.major  = FUSE_MAJOR_VERSION,
+		.minor  = FUSE_MINOR_VERSION,
+		.hotfix = FUSE_HOTFIX_VERSION,
+		.padding = FUSE_USE_VERSION,
+	};
+
+	return fuse_service_main_real_versioned(service, args, op,
+						sizeof(*(op)), &version,
+						user_data);
+}
+#define fuse_service_main(s, args, op, user_data) \
+	fuse_service_main_fn(s, args, op, user_data)
+#endif /* FUSE_USE_VERSION >= FUSE_MAKE_VERSION(3, 19) */
+
 /* ----------------------------------------------------------- *
  * More detailed API					       *
  * ----------------------------------------------------------- */
diff --git a/lib/fuse_versionscript b/lib/fuse_versionscript
index f34dc959a1d1e1..acd1d28907c614 100644
--- a/lib/fuse_versionscript
+++ b/lib/fuse_versionscript
@@ -236,6 +236,7 @@ FUSE_3.19 {
 		fuse_service_exit;
 		fuse_service_expect_mount_format;
 		fuse_service_finish_file_requests;
+		fuse_service_main_real_versioned;
 		fuse_service_parse_cmdline_opts;
 		fuse_service_receive_file;
 		fuse_service_release;
diff --git a/lib/helper.c b/lib/helper.c
index 819b9a6e4d243c..04d03dc5a805f1 100644
--- a/lib/helper.c
+++ b/lib/helper.c
@@ -15,6 +15,7 @@
 #include "fuse_misc.h"
 #include "fuse_opt.h"
 #include "fuse_lowlevel.h"
+#include "fuse_service.h"
 #include "mount_util.h"
 
 #include <stdio.h>
@@ -365,6 +366,110 @@ int fuse_daemonize(int foreground)
 	return 0;
 }
 
+struct fuse *_fuse_new_31(struct fuse_args *args,
+		       const struct fuse_operations *op, size_t op_size,
+		       struct libfuse_version *version,
+		       void *user_data);
+
+/*
+ * High-level main loop for a fuse server running as a mount service.
+ *
+ * Parses the command line in @args, creates the fuse handle, asks the mount
+ * service helper to mount the filesystem, says goodbye to the helper and then
+ * runs the (single- or multi-threaded) request loop.
+ *
+ * Return values: 0 on success (including --help/--version), 1 if the command
+ * line could not be parsed, 2 if no mountpoint was given, 3 if the fuse handle
+ * could not be created, 4 if the mount failed, 6 if signal handlers could not
+ * be installed, 7 if the loop configuration could not be allocated, and 8 if
+ * the request loop returned nonzero.
+ */
+int fuse_service_main_real_versioned(struct fuse_service *service,
+				     struct fuse_args *args,
+				     const struct fuse_operations *op,
+				     size_t op_size,
+				     struct libfuse_version *version,
+				     void *user_data)
+{
+	struct fuse_cmdline_opts opts;
+	struct fuse_loop_config *loop_config = NULL;
+	struct fuse_session *se;
+	struct fuse *fuse;
+	int res = 0;
+
+	if (fuse_service_parse_cmdline_opts(args, &opts) != 0) {
+		res = 1;
+		goto out0;
+	}
+
+	/* Informational requests exit successfully without a session. */
+	if (opts.show_version) {
+		printf("FUSE library version %s\n", PACKAGE_VERSION);
+		fuse_lowlevel_version();
+		goto out1;
+	}
+
+	if (opts.show_help) {
+		if (args->argv[0][0] != '\0')
+			printf("usage: %s [options] <mountpoint>\n\n",
+			       args->argv[0]);
+		printf("FUSE options:\n");
+		fuse_cmdline_help();
+		fuse_lib_help(args);
+		goto out1;
+	}
+
+	/* show_help was handled above, so only the mountpoint matters here */
+	if (!opts.mountpoint) {
+		fuse_log(FUSE_LOG_ERR, "error: no mountpoint specified\n");
+		res = 2;
+		goto out1;
+	}
+
+	fuse = _fuse_new_31(args, op, op_size, version, user_data);
+	if (!fuse) {
+		res = 3;
+		goto out1;
+	}
+	se = fuse_get_session(fuse);
+
+	/* Only the multithreaded loop needs a loop configuration. */
+	if (!opts.singlethread) {
+		loop_config = fuse_loop_cfg_create();
+		if (!loop_config) {
+			res = 7;
+			goto out2;
+		}
+	}
+
+	if (fuse_set_signal_handlers(se) != 0) {
+		res = 6;
+		goto out3;
+	}
+
+	if (fuse_service_session_mount(service, se, 0, &opts) != 0) {
+		res = 4;
+		goto out4;
+	}
+
+	if (!opts.singlethread) {
+		fuse_loop_cfg_set_clone_fd(loop_config, opts.clone_fd);
+		fuse_loop_cfg_set_idle_threads(loop_config, opts.max_idle_threads);
+		fuse_loop_cfg_set_max_threads(loop_config, opts.max_threads);
+	}
+
+	/* Say goodbye and drop the service context before serving requests. */
+	fuse_service_send_goodbye(service, 0);
+	fuse_service_release(service);
+
+	if (opts.singlethread)
+		res = fuse_loop(fuse);
+	else
+		res = fuse_loop_mt(fuse, loop_config);
+	res = res ? 8 : 0;
+
+out4:
+	fuse_remove_signal_handlers(se);
+out3:
+	fuse_loop_cfg_destroy(loop_config);
+out2:
+	fuse_destroy(fuse);
+out1:
+	free(opts.mountpoint);
+out0:
+	fuse_service_send_goodbye(service, res);
+	fuse_service_release(service);
+	return res;
+}
+
 int fuse_main_real_versioned(int argc, char *argv[],
 			     const struct fuse_operations *op, size_t op_size,
 			     struct libfuse_version *version, void *user_data)
@@ -403,10 +508,6 @@ int fuse_main_real_versioned(int argc, char *argv[],
 		goto out1;
 	}
 
-	struct fuse *_fuse_new_31(struct fuse_args *args,
-			       const struct fuse_operations *op, size_t op_size,
-			       struct libfuse_version *version,
-			       void *user_data);
 	fuse = _fuse_new_31(&args, op, op_size, version, user_data);
 	if (fuse == NULL) {
 		res = 3;


^ permalink raw reply related

* [PATCH 02/13] mount_service: add systemd socket service mounting helper
From: Darrick J. Wong @ 2026-04-30 21:15 UTC (permalink / raw)
  To: bernd, djwong
  Cc: linux-fsdevel, fuse-devel, linux-ext4, miklos, neal, joannelkoong
In-Reply-To: <177758363484.1314717.11777978893472254088.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

Create a mount helper program that can start a fuse server that runs as
a socket-based systemd service, and a new libfuse module to wrap all the
details of communicating between the mount helper and the containerized
fuse server.

This enables untrusted ext4 mounts via systemd service containers, which
avoids the problem of malicious filesystems compromising the integrity
of the running kernel through memory corruption.

In theory this could also be supported via inetd and clones, though the
author hasn't found one that supports AF_UNIX sockets.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 include/fuse_service.h                           |  243 ++++
 include/fuse_service_priv.h                      |  160 ++
 lib/mount_common_i.h                             |    3 
 util/mount_service.h                             |   40 +
 .github/workflows/install-ubuntu-dependencies.sh |    4 
 doc/fuservicemount3.8                            |   24 
 doc/meson.build                                  |    3 
 include/meson.build                              |    4 
 lib/fuse_service.c                               | 1233 +++++++++++++++++++
 lib/fuse_service_stub.c                          |  106 ++
 lib/fuse_versionscript                           |   17 
 lib/helper.c                                     |   51 +
 lib/meson.build                                  |   17 
 lib/mount.c                                      |   12 
 meson.build                                      |   34 +
 meson_options.txt                                |    9 
 util/fuservicemount.c                            |   18 
 util/meson.build                                 |    9 
 util/mount_service.c                             | 1427 ++++++++++++++++++++++
 19 files changed, 3412 insertions(+), 2 deletions(-)
 create mode 100644 include/fuse_service.h
 create mode 100644 include/fuse_service_priv.h
 create mode 100644 util/mount_service.h
 create mode 100644 doc/fuservicemount3.8
 create mode 100644 lib/fuse_service.c
 create mode 100644 lib/fuse_service_stub.c
 create mode 100644 util/fuservicemount.c
 create mode 100644 util/mount_service.c


diff --git a/include/fuse_service.h b/include/fuse_service.h
new file mode 100644
index 00000000000000..7e4c204e7a70bf
--- /dev/null
+++ b/include/fuse_service.h
@@ -0,0 +1,243 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2025-2026 Oracle.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ *
+ * This program can be distributed under the terms of the GNU LGPLv2.
+ * See the file LGPL2.txt.
+ */
+#ifndef FUSE_SERVICE_H_
+#define FUSE_SERVICE_H_
+
+/** @file
+ *
+ * Low level API
+ *
+ * IMPORTANT: you should define FUSE_USE_VERSION before including this
+ * header.  To use the newest API define it to 319 (recommended for any
+ * new application).
+ */
+
+#ifndef FUSE_USE_VERSION
+#error FUSE_USE_VERSION not defined
+#endif
+
+#include "fuse_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if FUSE_MAKE_VERSION(3, 19) <= FUSE_USE_VERSION
+
+struct fuse_service;
+
+/**
+ * Accept a socket created by mount.service for information exchange.
+ *
+ * @param sfp pointer to pointer to a service context.  The pointer will always
+ *            be initialized by this function; use fuse_service_accepted to
+ *            find out if the fuse server is actually running as a service.
+ * @return 0 on success, or negative errno on failure
+ */
+int fuse_service_accept(struct fuse_service **sfp);
+
+/**
+ * Report whether a service context is present.
+ *
+ * @param sf service context (may be NULL)
+ * @return true if sf is not NULL, false otherwise
+ */
+static inline bool fuse_service_accepted(struct fuse_service *sf)
+{
+	return sf ? true : false;
+}
+
+/**
+ * Will the mount service helper accept the allow_other option?
+ *
+ * @param sf service context
+ * @return true if it will, false if not
+ */
+bool fuse_service_can_allow_other(struct fuse_service *sf);
+
+/**
+ * Release all resources associated with the service context.
+ *
+ * @param sf service context
+ */
+void fuse_service_release(struct fuse_service *sf);
+
+/**
+ * Destroy a service context and release all resources
+ *
+ * @param sfp pointer to pointer to a service context
+ */
+void fuse_service_destroy(struct fuse_service **sfp);
+
+/**
+ * Append the command line arguments from the mount service helper to an
+ * existing fuse_args structure.  The fuse_args should have been initialized
+ * with the argc and argv passed to main().
+ *
+ * @param sf service context
+ * @param args arguments to modify (input+output)
+ * @return 0 on success, or negative errno on failure
+ */
+int fuse_service_append_args(struct fuse_service *sf, struct fuse_args *args);
+
+/**
+ * Generate the effective fuse server command line from the args structure.
+ * The args structure should be the outcome from fuse_service_append_args.
+ * The resulting string is suitable for setproctitle and must be freed by the
+ * caller.
+ *
+ * @param argc argument count passed to main()
+ * @param argv argument vector passed to main()
+ * @param args fuse args structure
+ * @return effective command line string, or NULL
+ */
+char *fuse_service_cmdline(int argc, char *argv[], struct fuse_args *args);
+
+struct fuse_cmdline_opts;
+
+/**
+ * Utility function to parse common options for simple file systems
+ * using the low-level API. A help text that describes the available
+ * options can be printed with `fuse_cmdline_help`. A single
+ * non-option argument is treated as the mountpoint. Multiple
+ * non-option arguments will result in an error.
+ *
+ * If neither -o subtype= or -o fsname= options are given, a new
+ * subtype option will be added and set to the basename of the program
+ * (the fsname will remain unset, and then defaults to "fuse").
+ *
+ * Known options will be removed from *args*, unknown options will
+ * remain. The mountpoint will not be checked here; that is the job of
+ * mount.service.
+ *
+ * @param args argument vector (input+output)
+ * @param opts output argument for parsed options
+ * @return 0 on success, -1 on failure
+ */
+int fuse_service_parse_cmdline_opts(struct fuse_args *args,
+				    struct fuse_cmdline_opts *opts);
+
+/**
+ * Don't complain if this file cannot be opened.
+ */
+#define FUSE_SERVICE_REQUEST_FILE_QUIET		(1U << 0)
+
+/**
+ * Ask the mount.service helper to open a file on behalf of the fuse server.
+ *
+ * @param sf service context
+ * @param path the path to file
+ * @param open_flags O_ flags
+ * @param create_mode mode with which to create the file
+ * @param request_flags set of FUSE_SERVICE_REQUEST_* flags
+ * @return 0 on success, or negative errno on failure
+ */
+int fuse_service_request_file(struct fuse_service *sf, const char *path,
+			      int open_flags, mode_t create_mode,
+			      unsigned int request_flags);
+
+/**
+ * Ask the mount.service helper to open a block device on behalf of the fuse
+ * server.
+ *
+ * @param sf service context
+ * @param path the path to file
+ * @param open_flags O_ flags
+ * @param create_mode mode with which to create the file
+ * @param request_flags set of FUSE_SERVICE_REQUEST_* flags
+ * @param block_size set the block device block size to this value
+ * @return 0 on success, or negative errno on failure
+ */
+int fuse_service_request_blockdev(struct fuse_service *sf, const char *path,
+				  int open_flags, mode_t create_mode,
+				  unsigned int request_flags,
+				  unsigned int block_size);
+
+/**
+ * Receive a file previously requested.
+ *
+ * @param sf service context
+ * @param path to file
+ * @param fdp pointer to file descriptor, which will be set to a non-negative
+ *      file descriptor value on success, or negative errno on failure
+ * @return 0 on success, or negative errno on socket communication failure
+ */
+int fuse_service_receive_file(struct fuse_service *sf,
+			      const char *path, int *fdp);
+
+/**
+ * Prevent the mount.service server from sending us any more open files.
+ *
+ * @param sf service context
+ * @return 0 on success, or negative errno on failure
+ */
+int fuse_service_finish_file_requests(struct fuse_service *sf);
+
+/**
+ * Require that the filesystem mount point have the expected file format
+ * (S_IFDIR/S_IFREG).  Can be overridden when calling
+ * fuse_service_session_mount.
+ *
+ * @param sf service context
+ * @param expected_fmt expected mode (S_IFDIR/S_IFREG) for mount point, or 0
+ *                     to skip checks
+ */
+void fuse_service_expect_mount_format(struct fuse_service *sf,
+				      mode_t expected_fmt);
+
+/**
+ * Bind a FUSE file system to the fuse session inside a fuse service process,
+ * then ask the mount.service helper to mount the filesystem for us.  The fuse
+ * client will begin sending requests to the fuse server immediately after
+ * this.  Do not call fuse_daemonize() when running as a fuse service.
+ *
+ * @param sf service context
+ * @param se fuse session
+ * @param expected_fmt expected mode (S_IFDIR/S_IFREG) for mount point, or 0
+ *                     to skip checks
+ * @param opts command line options
+ * @return 0 on success, or negative errno on failure
+ */
+int fuse_service_session_mount(struct fuse_service *sf, struct fuse_session *se,
+			       mode_t expected_fmt,
+			       struct fuse_cmdline_opts *opts);
+
+/**
+ * Ask the mount helper to unmount the filesystem.
+ *
+ * @param sf service context
+ * @return 0 on success, or negative errno on failure
+ */
+int fuse_service_session_unmount(struct fuse_service *sf);
+
+/**
+ * Bid farewell to the mount.service helper.  It is still necessary to call
+ * fuse_service_destroy after this.
+ *
+ * @param sf service context
+ * @param exitcode fuse server process exit status
+ * @return 0 on success, or negative errno on failure
+ */
+int fuse_service_send_goodbye(struct fuse_service *sf, int exitcode);
+
+/**
+ * Exit routine for a fuse server running as a systemd service.
+ *
+ * @param ret 0 for success, nonzero for service failure.
+ * @return a value to be passed to exit() or returned from main
+ */
+int fuse_service_exit(int ret);
+
+#endif /* FUSE_USE_VERSION >= FUSE_MAKE_VERSION(3, 19) */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* FUSE_SERVICE_H_ */
diff --git a/include/fuse_service_priv.h b/include/fuse_service_priv.h
new file mode 100644
index 00000000000000..a3773d90c7db7e
--- /dev/null
+++ b/include/fuse_service_priv.h
@@ -0,0 +1,160 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2025-2026 Oracle.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ *
+ * This program can be distributed under the terms of the GNU LGPLv2.
+ * See the file LGPL2.txt.
+ */
+#ifndef FUSE_SERVICE_PRIV_H_
+#define FUSE_SERVICE_PRIV_H_
+
+/* All numeric fields are network order (big-endian) when going across the socket */
+
+struct fuse_service_memfd_arg {
+	uint32_t pos;
+	uint32_t len;
+};
+
+struct fuse_service_memfd_argv {
+	uint32_t magic;
+	uint32_t argc;
+};
+
+#define FUSE_SERVICE_MAX_CMD_SIZE	(65536)
+
+#define FUSE_SERVICE_ARGS_MAGIC		0x41524753	/* ARGS */
+
+/* mount.service sends a hello to the server and it replies */
+#define FUSE_SERVICE_HELLO_CMD		0x53414654	/* SAFT */
+#define FUSE_SERVICE_HELLO_REPLY	0x4c415354	/* LAST */
+
+/* fuse servers send commands to mount.service */
+#define FUSE_SERVICE_OPEN_CMD		0x4f50454e	/* OPEN */
+#define FUSE_SERVICE_OPEN_BDEV_CMD	0x42444556	/* BDEV */
+#define FUSE_SERVICE_FSOPEN_CMD		0x54595045	/* TYPE */
+#define FUSE_SERVICE_SOURCE_CMD		0x4e414d45	/* NAME */
+#define FUSE_SERVICE_MNTOPTS_CMD	0x4f505453	/* OPTS */
+#define FUSE_SERVICE_MNTPT_CMD		0x4d4e5450	/* MNTP */
+#define FUSE_SERVICE_MOUNT_CMD		0x444f4954	/* DOIT */
+#define FUSE_SERVICE_UNMOUNT_CMD	0x554d4e54	/* UMNT */
+#define FUSE_SERVICE_BYE_CMD		0x42594545	/* BYEE */
+
+/* mount.service sends replies to the fuse server */
+#define FUSE_SERVICE_OPEN_REPLY		0x46494c45	/* FILE */
+#define FUSE_SERVICE_SIMPLE_REPLY	0x5245504c	/* REPL */
+
+struct fuse_service_packet {
+	uint32_t magic;			/* FUSE_SERVICE_*_{CMD,REPLY} */
+};
+
+#define FUSE_SERVICE_PROTO	(1)
+#define FUSE_SERVICE_MIN_PROTO	(1)
+#define FUSE_SERVICE_MAX_PROTO	(1)
+
+#define FUSE_SERVICE_FLAG_ALLOW_OTHER	(1U << 0)
+
+#define FUSE_SERVICE_FLAGS		(FUSE_SERVICE_FLAG_ALLOW_OTHER)
+
+struct fuse_service_hello {
+	struct fuse_service_packet p;
+	uint16_t min_version;
+	uint16_t max_version;
+	uint32_t flags;
+};
+
+/*
+ * Does the psz-byte buffer at p end with a NUL byte?
+ *
+ * Returns false for an empty buffer: there is no last byte to inspect, and
+ * looking at p[psz - 1] would read before the start of the buffer.
+ */
+static inline bool check_null_endbyte(const void *p, size_t psz)
+{
+	const char *bytes = p;
+
+	if (psz == 0)
+		return false;
+
+	return bytes[psz - 1] == '\0';
+}
+
+struct fuse_service_hello_reply {
+	struct fuse_service_packet p;
+	uint16_t version;
+	uint16_t padding;
+};
+
+struct fuse_service_simple_reply {
+	struct fuse_service_packet p;
+	uint32_t error;			/* positive errno */
+};
+
+struct fuse_service_requested_file {
+	struct fuse_service_packet p;
+	uint32_t error;			/* positive errno */
+	char path[];
+};
+
+/* Bytes needed for a requested-file reply whose path is pathlen bytes long. */
+static inline size_t sizeof_fuse_service_requested_file(size_t pathlen)
+{
+	/* fixed header, then the path bytes, then the terminating NUL */
+	const size_t hdr = sizeof(struct fuse_service_requested_file);
+
+	return hdr + pathlen + 1;
+}
+
+#define FUSE_SERVICE_FSOPEN_FUSEBLK	(1U << 0)
+#define FUSE_SERVICE_FSOPEN_FLAGS	(FUSE_SERVICE_FSOPEN_FUSEBLK)
+
+struct fuse_service_fsopen_command {
+	struct fuse_service_packet p;
+	uint32_t fsopen_flags;
+};
+
+#define FUSE_SERVICE_OPEN_QUIET		(1U << 0)
+#define FUSE_SERVICE_OPEN_FLAGS		(FUSE_SERVICE_OPEN_QUIET)
+
+struct fuse_service_open_command {
+	struct fuse_service_packet p;
+	uint32_t open_flags;
+	uint32_t create_mode;
+	uint32_t request_flags;
+	uint32_t block_size;
+	char path[];
+};
+
+/* Bytes needed for an open command whose path is pathlen bytes long. */
+static inline size_t sizeof_fuse_service_open_command(size_t pathlen)
+{
+	/* fixed header, then the path bytes, then the terminating NUL */
+	const size_t hdr = sizeof(struct fuse_service_open_command);
+
+	return hdr + pathlen + 1;
+}
+
+struct fuse_service_string_command {
+	struct fuse_service_packet p;
+	char value[];
+};
+
+/* Bytes needed for a string command whose value is len bytes long. */
+static inline size_t sizeof_fuse_service_string_command(size_t len)
+{
+	/* fixed header, then the value bytes, then the terminating NUL */
+	const size_t hdr = sizeof(struct fuse_service_string_command);
+
+	return hdr + len + 1;
+}
+
+struct fuse_service_mountpoint_command {
+	struct fuse_service_packet p;
+	uint16_t expected_fmt;
+	uint16_t padding;
+	char value[];
+};
+
+/* Bytes needed for a mountpoint command whose value is len bytes long. */
+static inline size_t sizeof_fuse_service_mountpoint_command(size_t len)
+{
+	/* fixed header, then the value bytes, then the terminating NUL */
+	const size_t hdr = sizeof(struct fuse_service_mountpoint_command);
+
+	return hdr + len + 1;
+}
+
+struct fuse_service_bye_command {
+	struct fuse_service_packet p;
+	uint32_t exitcode;
+};
+
+struct fuse_service_mount_command {
+	struct fuse_service_packet p;
+	uint32_t ms_flags;
+};
+
+struct fuse_service_unmount_command {
+	struct fuse_service_packet p;
+};
+
+int fuse_parse_cmdline_service(struct fuse_args *args,
+				 struct fuse_cmdline_opts *opts);
+
+#define FUSE_SERVICE_ARGV	"argv"
+#define FUSE_SERVICE_FUSEDEV	"fusedev"
+
+#endif /* FUSE_SERVICE_PRIV_H_ */
diff --git a/lib/mount_common_i.h b/lib/mount_common_i.h
index 6bcb055ff1c23f..631dff3e6f8aaf 100644
--- a/lib/mount_common_i.h
+++ b/lib/mount_common_i.h
@@ -14,5 +14,8 @@ struct mount_opts;
 
 char *fuse_mnt_build_source(const struct mount_opts *mo);
 char *fuse_mnt_build_type(const struct mount_opts *mo);
+char *fuse_mnt_kernel_opts(const struct mount_opts *mo);
+unsigned int fuse_mnt_flags(const struct mount_opts *mo);
+
 
 #endif /* FUSE_MOUNT_COMMON_I_H_ */
diff --git a/util/mount_service.h b/util/mount_service.h
new file mode 100644
index 00000000000000..a0b952a15dacf3
--- /dev/null
+++ b/util/mount_service.h
@@ -0,0 +1,40 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2025-2026 Oracle.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ *
+ * This program can be distributed under the terms of the GNU GPLv2.
+ * See the file GPL2.txt.
+ */
+#ifndef MOUNT_SERVICE_H_
+#define MOUNT_SERVICE_H_
+
+/**
+ * Magic value that means that we couldn't connect to the mount service,
+ * so the caller should try to fall back to traditional means.
+ */
+#define MOUNT_SERVICE_FALLBACK_NEEDED	(2)
+
+/**
+ * Connect to a fuse service socket and try to mount the filesystem as
+ * specified with the CLI arguments.
+ *
+ * @argc argument count
+ * @argv vector of argument strings
+ * @return EXIT_SUCCESS for success, EXIT_FAILURE if mount fails, or
+ *         MOUNT_SERVICE_FALLBACK_NEEDED if no service is available.
+ */
+int mount_service_main(int argc, char *argv[]);
+
+/**
+ * Return the fuse filesystem subtype from a full fuse filesystem type
+ * specification.  IOWs, fuse.Y -> Y; fuseblk.Z -> Z; or A -> A.  The returned
+ * pointer is within the caller's string.  The subtype must not contain a path
+ * separator.
+ *
+ * @param fstype full fuse filesystem type
+ * @return fuse subtype
+ */
+const char *mount_service_subtype(const char *fstype);
+
+#endif /* MOUNT_SERVICE_H_ */
diff --git a/.github/workflows/install-ubuntu-dependencies.sh b/.github/workflows/install-ubuntu-dependencies.sh
index 0eb7e610729b7c..9f6e69701438f3 100755
--- a/.github/workflows/install-ubuntu-dependencies.sh
+++ b/.github/workflows/install-ubuntu-dependencies.sh
@@ -15,6 +15,8 @@ PACKAGES_CORE=(
     pkg-config
     python3
     python3-pip
+    libsystemd-dev
+    systemd-dev
 )
 
 PACKAGES_FULL=(
@@ -31,6 +33,8 @@ PACKAGES_FULL=(
     libudev-dev:i386
     pkg-config:i386
     python3-pytest
+    libsystemd-dev
+    systemd-dev
 )
 
 PACKAGES_CODECHECKER=(
diff --git a/doc/fuservicemount3.8 b/doc/fuservicemount3.8
new file mode 100644
index 00000000000000..e45d6a89c8b81a
--- /dev/null
+++ b/doc/fuservicemount3.8
@@ -0,0 +1,24 @@
+.TH fuservicemount3 "8"
+.SH NAME
+fuservicemount3 \- mount a FUSE filesystem that runs as a system socket service
+.SH SYNOPSIS
+.B fuservicemount3
+.B source
+.B mountpoint
+.BI -t " fstype"
+[
+.I options
+]
+.SH DESCRIPTION
+Mount a filesystem using a FUSE server that runs as a socket service.
+These servers can be contained using the platform's service management
+framework.
+.SH "AUTHORS"
+.LP
+The author of the fuse socket service code is
+Darrick J. Wong <djwong@kernel.org>.
+.SH SEE ALSO
+.BR fusermount3 (1)
+.BR fusermount (1)
+.BR mount (8)
+.BR fuse (4)
diff --git a/doc/meson.build b/doc/meson.build
index db3e0b26f71975..c105cf3471fdf4 100644
--- a/doc/meson.build
+++ b/doc/meson.build
@@ -2,3 +2,6 @@ if not platform.endswith('bsd') and platform != 'dragonfly'
   install_man('fusermount3.1', 'mount.fuse3.8')
 endif
 
+if private_cfg.get('HAVE_SERVICEMOUNT', false)
+  install_man('fuservicemount3.8')
+endif
diff --git a/include/meson.build b/include/meson.build
index bf671977a5a6a9..da51180f87eea2 100644
--- a/include/meson.build
+++ b/include/meson.build
@@ -1,4 +1,8 @@
 libfuse_headers = [ 'fuse.h', 'fuse_common.h', 'fuse_lowlevel.h',
 	            'fuse_opt.h', 'cuse_lowlevel.h', 'fuse_log.h' ]
 
+if private_cfg.get('HAVE_SERVICEMOUNT', false)
+  libfuse_headers += [ 'fuse_service.h' ]
+endif
+
 install_headers(libfuse_headers, subdir: 'fuse3')
diff --git a/lib/fuse_service.c b/lib/fuse_service.c
new file mode 100644
index 00000000000000..ef512c76120a0f
--- /dev/null
+++ b/lib/fuse_service.c
@@ -0,0 +1,1233 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2025-2026 Oracle.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ *
+ * Library functions to support fuse servers that can be run as "safe" systemd
+ * containers.
+ *
+ * This program can be distributed under the terms of the GNU LGPLv2.
+ * See the file LGPL2.txt
+ */
+
+#define _GNU_SOURCE
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <errno.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <systemd/sd-daemon.h>
+#include <arpa/inet.h>
+#include <limits.h>
+
+#include "fuse_config.h"
+#include "fuse_i.h"
+#include "fuse_service_priv.h"
+#include "fuse_service.h"
+#include "mount_common_i.h"
+
+struct fuse_service {
+	/* expected file format of the mount point */
+	mode_t expected_fmt;
+
+	/* socket fd */
+	int sockfd;
+
+	/* /dev/fuse device */
+	int fusedevfd;
+
+	/* memfd for cli arguments */
+	int argvfd;
+
+	/* do we own fusedevfd? */
+	bool owns_fusedevfd;
+
+	/* can we use allow_other? */
+	bool allow_other;
+};
+
+/*
+ * Receive one requested-file reply from the service socket, along with at
+ * most one file descriptor passed via SCM_RIGHTS.
+ *
+ * @param sf service context
+ * @param buf buffer for the reply packet
+ * @param bufsize number of bytes of buf that may be filled
+ * @param fdp set to the received fd if a control message was attached;
+ *            left untouched if the reply carried no control message
+ * @return 0 if a well-formed reply was received (with or without an fd), or
+ *         negative errno on failure
+ */
+static int __recv_fd(struct fuse_service *sf,
+		     struct fuse_service_requested_file *buf,
+		     ssize_t bufsize, int *fdp)
+{
+	struct iovec iov = {
+		.iov_base = buf,
+		.iov_len = bufsize,
+	};
+	union {
+		struct cmsghdr cmsghdr;
+		char control[CMSG_SPACE(sizeof(int))];
+	} cmsgu = { };
+	struct msghdr msg = {
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+		.msg_control = cmsgu.control,
+
+		/*
+		 * Do not include padding at the end of the control buffer,
+		 * because we don't want to receive fds that we weren't
+		 * expecting.
+		 */
+		.msg_controllen = CMSG_LEN(sizeof(int)),
+	};
+	struct cmsghdr *cmsg;
+	ssize_t size;
+
+	/*
+	 * A kernel LSM could decide to deny the fd transfer by writing a
+	 * negative number (== invalid fd) into the cmsg buffer instead of
+	 * installing the fd.  Set the initial fd value to -1 to signal an
+	 * invalid fd in case the kernel doesn't even set the cmsg buffer.
+	 * It shouldn't do that, but we absolutely don't want a zero here.
+	 */
+	memset(cmsgu.control, -1, sizeof(cmsgu.control));
+
+	/* MSG_TRUNC makes recvmsg report the real packet length */
+	size = recvmsg(sf->sockfd, &msg, MSG_TRUNC | MSG_CMSG_CLOEXEC);
+	if (size < 0) {
+		int error = errno;
+
+		fuse_log(FUSE_LOG_ERR, "fuse: service file reply: %s\n",
+			 strerror(error));
+		return -error;
+	}
+	/* size is known to be non-negative here, so the cast is safe */
+	if (size > bufsize ||
+	    (size_t)size < offsetof(struct fuse_service_requested_file, path)) {
+		fuse_log(FUSE_LOG_ERR, "fuse: wrong service file reply size %zd, expected %zd\n",
+			 size, bufsize);
+		return -EBADMSG;
+	}
+
+	if (msg.msg_flags & MSG_CTRUNC) {
+		/* SMACK does this */
+		fuse_log(FUSE_LOG_ERR,
+"fuse: service file reply control data truncated; did an LSM deny SCM_RIGHTS?\n");
+		return -EBADMSG;
+	}
+
+	cmsg = CMSG_FIRSTHDR(&msg);
+	if (!cmsg) {
+		/* no control message means mount.service sent us an error */
+		return 0;
+	}
+	if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) {
+		/*
+		 * cmsg_len is an unsigned type whose width varies between C
+		 * libraries, so convert explicitly and print it with %zu.
+		 */
+		fuse_log(FUSE_LOG_ERR,
+			 "fuse: wrong service file reply control data size %zu, expected %zu\n",
+			 (size_t)cmsg->cmsg_len, (size_t)CMSG_LEN(sizeof(int)));
+		return -EBADMSG;
+	}
+	if (cmsg->cmsg_level != SOL_SOCKET || cmsg->cmsg_type != SCM_RIGHTS) {
+		fuse_log(FUSE_LOG_ERR,
+"fuse: wrong service file reply control data level %d type %d, expected %d and %d\n",
+			 cmsg->cmsg_level, cmsg->cmsg_type, SOL_SOCKET,
+			 SCM_RIGHTS);
+		return -EBADMSG;
+	}
+
+	/* copy out via memcpy; CMSG_DATA is not guaranteed to be int-aligned */
+	memcpy(fdp, CMSG_DATA(cmsg), sizeof(int));
+	return 0;
+}
+
+/*
+ * Send len bytes at ptr over the service socket as a single record.
+ * Returns whatever sendmsg() returns.
+ */
+static ssize_t __send_packet(struct fuse_service *sf, void *ptr, size_t len)
+{
+	struct iovec payload = { .iov_base = ptr, .iov_len = len };
+	struct msghdr hdr = { .msg_iov = &payload, .msg_iovlen = 1 };
+	const int send_flags = MSG_EOR | MSG_NOSIGNAL;
+
+	return sendmsg(sf->sockfd, &hdr, send_flags);
+}
+
+/*
+ * Receive one packet from the service socket into the len-byte buffer at ptr.
+ * Returns whatever recvmsg() returns; MSG_TRUNC is passed so an oversized
+ * packet is reported by its real length.
+ */
+static ssize_t __recv_packet(struct fuse_service *sf, void *ptr, size_t len)
+{
+	struct iovec payload = { .iov_base = ptr, .iov_len = len };
+	struct msghdr hdr = { .msg_iov = &payload, .msg_iovlen = 1 };
+	const int recv_flags = MSG_TRUNC;
+
+	return recvmsg(sf->sockfd, &hdr, recv_flags);
+}
+
+/*
+ * Receive the reply to a previous file request.
+ *
+ * @param sf service context
+ * @param path path that was requested; the reply must name the same path
+ * @param fdp set to the received file descriptor on success, or to a
+ *            negative errno if the file could not be provided
+ * @return 0 if a valid reply was received (check *fdp for the outcome), or
+ *         negative errno if the reply was malformed or the fd transfer failed
+ */
+int fuse_service_receive_file(struct fuse_service *sf, const char *path,
+			      int *fdp)
+{
+	struct fuse_service_requested_file *req;
+	const size_t req_sz = sizeof_fuse_service_requested_file(strlen(path));
+	int fd = -ENOENT;	/* sentinel: no fd was attached to the reply */
+	int ret;
+
+	*fdp = -ENOENT;
+
+	/*
+	 * Allocate one zeroed byte more than we allow the socket to fill so
+	 * that req->path is always NUL-terminated.
+	 */
+	req = calloc(1, req_sz + 1);
+	if (!req) {
+		int error = errno;
+
+		fuse_log(FUSE_LOG_ERR, "fuse: alloc service file reply: %s\n",
+			 strerror(error));
+		return -error;
+	}
+
+	ret = __recv_fd(sf, req, req_sz, &fd);
+	if (ret)
+		goto out_req;
+
+	if (ntohl(req->p.magic) != FUSE_SERVICE_OPEN_REPLY) {
+		fuse_log(FUSE_LOG_ERR, "fuse: service file reply contains wrong magic!\n");
+		ret = -EBADMSG;
+		goto out_close;
+	}
+	if (strcmp(req->path, path)) {
+		fuse_log(FUSE_LOG_ERR, "fuse: `%s': not the requested service file, got `%s'\n",
+			 path, req->path);
+		ret = -EBADMSG;
+		goto out_close;
+	}
+
+	/*
+	 * An error reply carries no fd, so fd still holds the sentinel here.
+	 * Look at the reported error before judging the fd; otherwise every
+	 * open failure is misreported as a failed fd transfer and the real
+	 * errno is lost.
+	 */
+	if (req->error) {
+		*fdp = -ntohl(req->error);
+		goto out_close;
+	}
+
+	if (fd == -ENOENT) {
+		fuse_log(FUSE_LOG_ERR, "fuse: did not receive `%s' but no error?\n",
+			 path);
+	} else if (fd < 0) {
+		/* The kernel might have given us an errno instead of an fd */
+		fuse_log(FUSE_LOG_ERR, "fuse: service fd transfer failed: %s\n",
+			 strerror(-fd));
+		ret = fd;
+		goto out_req;
+	}
+
+	*fdp = fd;
+	goto out_req;
+
+out_close:
+	/* only close a real descriptor, never the negative sentinel */
+	if (fd >= 0)
+		close(fd);
+out_req:
+	free(req);
+	return ret;
+}
+
+#define FUSE_SERVICE_REQUEST_FILE_FLAGS	(FUSE_SERVICE_REQUEST_FILE_QUIET)
+
+/*
+ * Build an open command for path and send it to the mount service helper.
+ * A block device request (S_ISBLK(expected_fmt)) uses the BDEV command and
+ * carries block_size; everything else uses the plain OPEN command.
+ *
+ * Returns 0 on success, -EINVAL for unknown request flags, or negative errno
+ * if allocation or sending fails.
+ */
+static int fuse_service_request_path(struct fuse_service *sf, const char *path,
+				     mode_t expected_fmt, int open_flags,
+				     mode_t create_mode,
+				     unsigned int request_flags,
+				     unsigned int block_size)
+{
+	const size_t cmdsz = sizeof_fuse_service_open_command(strlen(path));
+	const bool want_bdev = S_ISBLK(expected_fmt);
+	struct fuse_service_open_command *cmd;
+	unsigned int rqflags = 0;
+	int ret = 0;
+
+	if (request_flags & ~FUSE_SERVICE_REQUEST_FILE_FLAGS) {
+		fuse_log(FUSE_LOG_ERR, "fuse: invalid fuse service file request flags 0x%x\n",
+			 request_flags);
+		return -EINVAL;
+	}
+
+	/* translate the public API flag into its wire-protocol counterpart */
+	if (request_flags & FUSE_SERVICE_REQUEST_FILE_QUIET)
+		rqflags |= FUSE_SERVICE_OPEN_QUIET;
+
+	cmd = calloc(1, cmdsz);
+	if (!cmd) {
+		int error = errno;
+
+		fuse_log(FUSE_LOG_ERR, "fuse: alloc service file request: %s\n",
+			 strerror(error));
+		return -error;
+	}
+
+	/* all numeric fields go over the socket in network byte order */
+	cmd->p.magic = htonl(want_bdev ? FUSE_SERVICE_OPEN_BDEV_CMD :
+					 FUSE_SERVICE_OPEN_CMD);
+	if (want_bdev)
+		cmd->block_size = htonl(block_size);
+	cmd->open_flags = htonl(open_flags);
+	cmd->create_mode = htonl(create_mode);
+	cmd->request_flags = htonl(rqflags);
+	/* cmdsz was computed from strlen(path) + 1, so this fits */
+	strcpy(cmd->path, path);
+
+	if (__send_packet(sf, cmd, cmdsz) < 0) {
+		int error = errno;
+
+		fuse_log(FUSE_LOG_ERR, "fuse: request service file: %s\n",
+			 strerror(error));
+		ret = -error;
+	}
+
+	free(cmd);
+	return ret;
+}
+
+/*
+ * Send a request to open @path with the plain open command: forwards to
+ * fuse_service_request_path() with S_IFREG as the expected format and a block
+ * size of zero.  Returns whatever that helper returns.
+ */
+int fuse_service_request_file(struct fuse_service *sf, const char *path,
+			      int open_flags, mode_t create_mode,
+			      unsigned int request_flags)
+{
+	return fuse_service_request_path(sf, path, S_IFREG, open_flags,
+					 create_mode, request_flags, 0);
+}
+
+/*
+ * Send a request to open @path as a block device: forwards to
+ * fuse_service_request_path() with S_IFBLK as the expected format so that the
+ * bdev open command, which carries @block_size, is used.  Returns whatever
+ * that helper returns.
+ */
+int fuse_service_request_blockdev(struct fuse_service *sf, const char *path,
+				  int open_flags, mode_t create_mode,
+				  unsigned int request_flags,
+				  unsigned int block_size)
+{
+	return fuse_service_request_path(sf, path, S_IFBLK, open_flags,
+					 create_mode, request_flags,
+					 block_size);
+}
+
+/*
+ * Send the bye command carrying @exitcode, then shut down and close the
+ * service socket.  Does nothing if the socket is already closed.  Returns 0 on
+ * success or a negative errno if the command could not be sent; the socket is
+ * left open in the failure case.
+ */
+int fuse_service_send_goodbye(struct fuse_service *sf, int exitcode)
+{
+	struct fuse_service_bye_command bye = {
+		.p.magic = htonl(FUSE_SERVICE_BYE_CMD),
+		.exitcode = htonl(exitcode),
+	};
+
+	/* nothing to say goodbye on */
+	if (sf->sockfd < 0)
+		return 0;
+
+	if (__send_packet(sf, &bye, sizeof(bye)) < 0) {
+		const int err = errno;
+
+		fuse_log(FUSE_LOG_ERR, "fuse: send service goodbye: %s\n",
+			 strerror(err));
+		return -err;
+	}
+
+	shutdown(sf->sockfd, SHUT_RDWR);
+	close(sf->sockfd);
+	sf->sockfd = -1;
+
+	return 0;
+}
+
+/*
+ * Work out how many socket fds were passed to this process through the
+ * LISTEN_PID / LISTEN_FDS environment variables.
+ *
+ * Returns 0, without logging, if either variable is missing or malformed, or
+ * if LISTEN_PID does not name this process.
+ */
+static int count_listen_fds(void)
+{
+	const char *pid_str = getenv("LISTEN_PID");
+	const char *fds_str = getenv("LISTEN_FDS");
+	char *end;
+	long val;
+
+	/* not started as a socket service at all */
+	if (!fds_str || !pid_str)
+		return 0;
+
+	/* LISTEN_PID must be a clean decimal number naming this process */
+	errno = 0;
+	val = strtol(pid_str, &end, 10);
+	if (errno != 0 || *end != '\0' || val != getpid())
+		return 0;
+
+	/* LISTEN_FDS must be a clean decimal number that fits in an int */
+	errno = 0;
+	val = strtol(fds_str, &end, 10);
+	if (errno != 0 || *end != '\0')
+		return 0;
+	if (val < 0 || val > INT_MAX)
+		return 0;
+
+	return val;
+}
+
+/*
+ * Make sure the socket send buffer can hold an open command with a PATH_MAX
+ * sized path.  Returns 0 if it can (or if the size cannot be queried) and
+ * -ENOBUFS if the reported buffer size is too small.
+ */
+static int check_sendbuf_size(int sockfd)
+{
+	const size_t needed = sizeof_fuse_service_open_command(PATH_MAX);
+	int cur = -1;
+	socklen_t len = sizeof(cur);
+
+	/*
+	 * A failed query is not fatal: huge open commands are unlikely, and
+	 * sending one would fail on its own anyway.
+	 */
+	if (getsockopt(sockfd, SOL_SOCKET, SO_SNDBUF, &cur, &len) != 0 ||
+	    cur < 0)
+		return 0;
+
+	if (cur < needed) {
+		fuse_log(FUSE_LOG_ERR, "max socket send buffer is %d, need at least %zu.\n",
+			 cur, needed);
+		return -ENOBUFS;
+	}
+
+	return 0;
+}
+
+/*
+ * Validate the single fd that socket activation handed us and return it.
+ *
+ * @nr_fds is the number of passed fds; anything other than exactly one is
+ * rejected with -E2BIG.  The fd at SD_LISTEN_FDS_START must be an AF_UNIX
+ * socket whose send buffer passes check_sendbuf_size().  Returns
+ * SD_LISTEN_FDS_START on success or a negative errno.
+ */
+static int find_socket_fd(int nr_fds)
+{
+	struct stat stbuf;
+	struct sockaddr_un urk = { };
+	socklen_t urklen = sizeof(urk);
+	int ret;
+
+	if (nr_fds != 1) {
+		fuse_log(FUSE_LOG_ERR, "fuse: can only handle 1 service socket, got %d.\n",
+			 nr_fds);
+		return -E2BIG;
+	}
+
+	ret = fstat(SD_LISTEN_FDS_START, &stbuf);
+	if (ret) {
+		int error = errno;
+
+		fuse_log(FUSE_LOG_ERR, "fuse: service socket: %s\n",
+			 strerror(error));
+		return -error;
+	}
+
+	if (!S_ISSOCK(stbuf.st_mode)) {
+		fuse_log(FUSE_LOG_ERR, "fuse: expected service fd %d to be a socket\n",
+				SD_LISTEN_FDS_START);
+		return -ENOTSOCK;
+	}
+
+	ret = getsockname(SD_LISTEN_FDS_START, (struct sockaddr *)&urk,
+			  &urklen);
+	if (ret < 0) {
+		int error = errno;
+
+		fuse_log(FUSE_LOG_ERR, "fuse: service socket family: %s\n",
+			 strerror(error));
+		return -error;
+	}
+
+	/*
+	 * getsockname() only ever returns 0 or -1; an address that did not fit
+	 * is reported through urklen instead, and truncation leaves the
+	 * leading family field intact.  So the family is only trusted if the
+	 * kernel wrote at least that much, and it must be AF_UNIX.
+	 */
+	if (urklen < sizeof(urk.sun_family) || urk.sun_family != AF_UNIX) {
+		fuse_log(FUSE_LOG_ERR, "fuse: service socket is not AF_UNIX\n");
+		return -EAFNOSUPPORT;
+	}
+
+	ret = check_sendbuf_size(SD_LISTEN_FDS_START);
+	if (ret)
+		return ret;
+
+	return SD_LISTEN_FDS_START;
+}
+
+/*
+ * Perform the hello handshake on the service socket.
+ *
+ * Reads the hello command, validates its size, magic, advertised protocol
+ * version range and flags, records whether allow_other was granted, and
+ * answers with a hello reply carrying FUSE_SERVICE_PROTO.  Returns 0 on
+ * success or a negative errno.
+ */
+static int negotiate_hello(struct fuse_service *sf)
+{
+	struct fuse_service_hello hello = { };
+	struct fuse_service_hello_reply reply = {
+		.p.magic = htonl(FUSE_SERVICE_HELLO_REPLY),
+		.version = htons(FUSE_SERVICE_PROTO),
+	};
+	uint32_t flags;
+	ssize_t size;
+
+	size = __recv_packet(sf, &hello, sizeof(hello));
+	if (size < 0) {
+		int error = errno;
+
+		fuse_log(FUSE_LOG_ERR, "fuse: receive service hello: %s\n",
+			 strerror(error));
+		return -error;
+	}
+	if (size != sizeof(hello)) {
+		fuse_log(FUSE_LOG_ERR, "fuse: wrong service hello size %zd, expected %zd\n",
+			 size, sizeof(hello));
+		return -EBADMSG;
+	}
+
+	if (ntohl(hello.p.magic) != FUSE_SERVICE_HELLO_CMD) {
+		fuse_log(FUSE_LOG_ERR, "fuse: service server did not send hello command\n");
+		return -EBADMSG;
+	}
+
+	if (ntohs(hello.min_version) < FUSE_SERVICE_MIN_PROTO) {
+		fuse_log(FUSE_LOG_ERR, "fuse: unsupported min service protocol version %u\n",
+			ntohs(hello.min_version));
+		return -EOPNOTSUPP;
+	}
+
+	if (ntohs(hello.max_version) > FUSE_SERVICE_MAX_PROTO) {
+		/* report the field that was actually rejected */
+		fuse_log(FUSE_LOG_ERR, "fuse: unsupported max service protocol version %u\n",
+			ntohs(hello.max_version));
+		return -EOPNOTSUPP;
+	}
+
+	flags = ntohl(hello.flags);
+	if (flags & ~FUSE_SERVICE_FLAGS) {
+		/* log through fuse_log like every other error in this file */
+		fuse_log(FUSE_LOG_ERR, "fuse: invalid hello flags: 0x%x\n",
+			 flags & ~FUSE_SERVICE_FLAGS);
+		return -EINVAL;
+	}
+
+	if (flags & FUSE_SERVICE_FLAG_ALLOW_OTHER)
+		sf->allow_other = true;
+
+	size = __send_packet(sf, &reply, sizeof(reply));
+	if (size < 0) {
+		int error = errno;
+
+		fuse_log(FUSE_LOG_ERR, "fuse: service hello reply: %s\n",
+			 strerror(error));
+		return -error;
+	}
+
+	return 0;
+}
+
+/*
+ * Set up a fuse_service object if this process was started with a passed
+ * service socket.
+ *
+ * Returns 0 with *sfp == NULL when count_listen_fds() reports no passed fds.
+ * Otherwise the socket is validated, marked close-on-exec, the hello handshake
+ * is run, and the argv file and fuse device fds are received.  Returns 0 with
+ * *sfp set on success, or a negative errno with *sfp == NULL on failure.
+ */
+int fuse_service_accept(struct fuse_service **sfp)
+{
+	struct fuse_service *sf;
+	int nr_fds;
+	int sockfd;
+	int flags;
+	int ret = 0;
+
+	*sfp = NULL;
+
+	nr_fds = count_listen_fds();
+	if (nr_fds == 0)
+		return 0;
+
+	/* Find the socket that connects us to mount.service */
+	sockfd = find_socket_fd(nr_fds);
+	if (sockfd < 0)
+		return sockfd;
+
+	/* Make sure the socket has FD_CLOEXEC set, adding it if needed */
+	flags = fcntl(sockfd, F_GETFD);
+	if (flags < 0) {
+		int error = errno;
+
+		fuse_log(FUSE_LOG_ERR, "fuse: service socket getfd: %s\n",
+			 strerror(error));
+		return -error;
+	}
+
+	if (!(flags & FD_CLOEXEC)) {
+		ret = fcntl(sockfd, F_SETFD, flags | FD_CLOEXEC);
+		if (ret) {
+			int error = errno;
+
+			fuse_log(FUSE_LOG_ERR, "fuse: service socket set cloexec: %s\n",
+				 strerror(error));
+			return -error;
+		}
+	}
+
+	sf = calloc(1, sizeof(struct fuse_service));
+	if (!sf) {
+		int error = errno;
+
+		fuse_log(FUSE_LOG_ERR, "fuse: service alloc: %s\n",
+			 strerror(error));
+		return -error;
+	}
+	sf->sockfd = sockfd;
+
+	/*
+	 * NOTE(review): a failed handshake only frees sf; unlike the later
+	 * error paths the socket is neither shut down nor closed here.
+	 * Confirm that this is intended.
+	 */
+	ret = negotiate_hello(sf);
+	if (ret)
+		goto out_sf;
+
+	/* Receive the two critical files: the argv file and the fuse device */
+	ret = fuse_service_receive_file(sf, FUSE_SERVICE_ARGV, &sf->argvfd);
+	if (ret < 0)
+		goto out_sockfd;
+	if (sf->argvfd < 0) {
+		/* the transfer worked but the reply carried an error code */
+		fuse_log(FUSE_LOG_ERR, "fuse: service mount options file: %s\n",
+			 strerror(-sf->argvfd));
+		ret = sf->argvfd;
+		goto out_sockfd;
+	}
+
+	ret = fuse_service_receive_file(sf, FUSE_SERVICE_FUSEDEV,
+					&sf->fusedevfd);
+	if (ret < 0)
+		goto out_argvfd;
+	if (sf->fusedevfd < 0) {
+		/* the transfer worked but the reply carried an error code */
+		fuse_log(FUSE_LOG_ERR, "fuse: service fuse device: %s\n",
+			 strerror(-sf->fusedevfd));
+		ret = sf->fusedevfd;
+		goto out_argvfd;
+	}
+
+	/* we close fusedevfd ourselves until a session mount takes it over */
+	sf->owns_fusedevfd = true;
+	*sfp = sf;
+	return 0;
+
+out_argvfd:
+	close(sf->argvfd);
+out_sockfd:
+	shutdown(sf->sockfd, SHUT_RDWR);
+	close(sf->sockfd);
+out_sf:
+	free(sf);
+	return ret;
+}
+
+/*
+ * Report the allow_other state of @sf, which negotiate_hello() sets when the
+ * hello command carries FUSE_SERVICE_FLAG_ALLOW_OTHER.
+ */
+bool fuse_service_can_allow_other(struct fuse_service *sf)
+{
+	return sf->allow_other;
+}
+
+/*
+ * Merge the arguments stored in the argv file with @existing_args.
+ *
+ * The resulting vector is: the first argument from the argv file (the fs
+ * type) as argv[0], then @existing_args->argv[1..], then the remaining
+ * arguments from the argv file.  On success @existing_args is replaced with
+ * the merged vector and the argv fd is closed.  Returns 0 on success or a
+ * negative errno.
+ *
+ * If @existing_args->allocated is set, its strings are moved rather than
+ * copied; a failure after that point frees the moved strings together with
+ * the new vector and leaves NULL entries behind in @existing_args.
+ */
+int fuse_service_append_args(struct fuse_service *sf,
+			     struct fuse_args *existing_args)
+{
+	struct fuse_service_memfd_argv memfd_args = { };
+	struct fuse_args new_args = {
+		.allocated = 1,
+	};
+	char *str = NULL;
+	off_t memfd_pos = 0;
+	ssize_t received;
+	size_t nr_args;
+	unsigned int i;
+	int ret;
+
+	/* Figure out how many arguments we're getting from the mount helper. */
+	received = pread(sf->argvfd, &memfd_args, sizeof(memfd_args), 0);
+	if (received < 0) {
+		int error = errno;
+
+		fuse_log(FUSE_LOG_ERR, "fuse: service args file: %s\n",
+			 strerror(error));
+		return -error;
+	}
+	if (received < sizeof(memfd_args)) {
+		fuse_log(FUSE_LOG_ERR, "fuse: service args file length unreadable\n");
+		return -EBADMSG;
+	}
+	if (ntohl(memfd_args.magic) != FUSE_SERVICE_ARGS_MAGIC) {
+		fuse_log(FUSE_LOG_ERR, "fuse: service args file corrupt\n");
+		return -EBADMSG;
+	}
+	/* convert the header from network to host byte order */
+	memfd_args.magic = ntohl(memfd_args.magic);
+	memfd_args.argc = ntohl(memfd_args.argc);
+	memfd_pos += sizeof(memfd_args);
+
+	/* argv[0] is taken from the file, so it must hold at least one arg */
+	if (memfd_args.argc == 0) {
+		fuse_log(FUSE_LOG_ERR, "fuse: service args file corrupt\n");
+		return -EBADMSG;
+	}
+
+	/*
+	 * Allocate a new array of argv string pointers.  The count is computed
+	 * in size_t so that a large argc cannot wrap the sum, and one spare
+	 * slot keeps the vector NULL-terminated.
+	 */
+	nr_args = (size_t)memfd_args.argc + (size_t)existing_args->argc + 1;
+	new_args.argv = calloc(nr_args, sizeof(char *));
+	if (!new_args.argv) {
+		int error = errno;
+
+		fuse_log(FUSE_LOG_ERR, "fuse: service new args: %s\n",
+			 strerror(error));
+		return -error;
+	}
+
+	/*
+	 * Copy the fuse server's CLI arguments.  We'll leave new_args.argv[0]
+	 * unset for now, because we'll set it in the next step with the fstype
+	 * that the mount helper sent us.
+	 */
+	new_args.argc++;
+	for (i = 1; i < existing_args->argc; i++) {
+		if (existing_args->allocated) {
+			/* steal the string instead of copying it */
+			new_args.argv[new_args.argc] = existing_args->argv[i];
+			existing_args->argv[i] = NULL;
+		} else {
+			char *dup = strdup(existing_args->argv[i]);
+
+			if (!dup) {
+				int error = errno;
+
+				fuse_log(FUSE_LOG_ERR,
+					 "fuse: service duplicate existing args: %s\n",
+					 strerror(error));
+				ret = -error;
+				goto out_new_args;
+			}
+
+			new_args.argv[new_args.argc] = dup;
+		}
+
+		new_args.argc++;
+	}
+
+	/* Copy the rest of the arguments from the helper */
+	for (i = 0; i < memfd_args.argc; i++) {
+		struct fuse_service_memfd_arg memfd_arg = { };
+
+		/* Read argv iovec */
+		received = pread(sf->argvfd, &memfd_arg, sizeof(memfd_arg),
+				 memfd_pos);
+		if (received < 0) {
+			int error = errno;
+
+			fuse_log(FUSE_LOG_ERR, "fuse: service args file iovec read: %s\n",
+				 strerror(error));
+			ret = -error;
+			goto out_new_args;
+		}
+		if (received < sizeof(struct fuse_service_memfd_arg)) {
+			fuse_log(FUSE_LOG_ERR,
+				 "fuse: service args file argv[%u] iovec short read %zd\n",
+				 i, received);
+			ret = -EBADMSG;
+			goto out_new_args;
+		}
+		memfd_arg.pos = ntohl(memfd_arg.pos);
+		memfd_arg.len = ntohl(memfd_arg.len);
+		memfd_pos += sizeof(memfd_arg);
+
+		/*
+		 * read arg string from file; the extra byte is the terminator
+		 * and the size is widened first so that len + 1 cannot wrap
+		 */
+		str = calloc(1, (size_t)memfd_arg.len + 1);
+		if (!str) {
+			int error = errno;
+
+			fuse_log(FUSE_LOG_ERR, "fuse: service arg alloc: %s\n",
+				 strerror(error));
+			ret = -error;
+			goto out_new_args;
+		}
+
+		received = pread(sf->argvfd, str, memfd_arg.len, memfd_arg.pos);
+		if (received < 0) {
+			int error = errno;
+
+			fuse_log(FUSE_LOG_ERR, "fuse: service args file read: %s\n",
+				 strerror(error));
+			ret = -error;
+			goto out_str;
+		}
+		if (received < memfd_arg.len) {
+			fuse_log(FUSE_LOG_ERR, "fuse: service args file argv[%u] short read %zd\n",
+				 i, received);
+			ret = -EBADMSG;
+			goto out_str;
+		}
+
+		/* move string into the args structure */
+		if (i == 0) {
+			/* the first argument is the fs type */
+			new_args.argv[0] = str;
+		} else {
+			new_args.argv[new_args.argc] = str;
+			new_args.argc++;
+		}
+		str = NULL;
+	}
+
+	/* drop existing args, move new args to existing args */
+	fuse_opt_free_args(existing_args);
+	memcpy(existing_args, &new_args, sizeof(*existing_args));
+
+	close(sf->argvfd);
+	sf->argvfd = -1;
+
+	return 0;
+
+out_str:
+	free(str);
+out_new_args:
+	fuse_opt_free_args(&new_args);
+	return ret;
+}
+
+/*
+ * Tell the kernel to stop accepting passed fds on the service socket.
+ *
+ * When SO_PASSRIGHTS is not defined at build time this is a no-op that
+ * returns 0.  Otherwise returns 0 on success, or if the kernel does not
+ * recognize the option, and a negative errno for any other failure.
+ */
+int fuse_service_finish_file_requests(struct fuse_service *sf)
+{
+#ifdef SO_PASSRIGHTS
+	int off = 0;
+
+	/*
+	 * Don't let a malicious mount helper send us more fds.  If the kernel
+	 * doesn't know about this new(ish) option that's ok, we'll trust the
+	 * servicemount helper.
+	 */
+	if (setsockopt(sf->sockfd, SOL_SOCKET, SO_PASSRIGHTS, &off,
+		       sizeof(off)) && errno != ENOPROTOOPT) {
+		const int err = errno;
+
+		fuse_log(FUSE_LOG_ERR, "fuse: disabling fd passing: %s\n",
+			 strerror(err));
+		return -err;
+	}
+#else
+	(void)sf;
+#endif
+
+	return 0;
+}
+
+/*
+ * Send the fsopen command for @fstype and wait for the simple reply.  The
+ * error code carried in the reply is stored in *errorp.  Returns 0 if the
+ * exchange completed, or a negative errno if it did not.
+ */
+static int send_fsopen(struct fuse_service *sf, const char *fstype,
+		       int *errorp)
+{
+	struct fuse_service_fsopen_command cmd = {
+		.p.magic = htonl(FUSE_SERVICE_FSOPEN_CMD),
+	};
+	struct fuse_service_simple_reply resp = { };
+	ssize_t nbytes;
+
+	/* any fstype that starts with "fuseblk" selects the fuseblk flag */
+	if (strncmp(fstype, "fuseblk", 7) == 0)
+		cmd.fsopen_flags |= htonl(FUSE_SERVICE_FSOPEN_FUSEBLK);
+
+	nbytes = __send_packet(sf, &cmd, sizeof(cmd));
+	if (nbytes < 0) {
+		const int err = errno;
+
+		fuse_log(FUSE_LOG_ERR, "fuse: send service fsopen command: %s\n",
+			 strerror(err));
+		return -err;
+	}
+
+	nbytes = __recv_packet(sf, &resp, sizeof(resp));
+	if (nbytes < 0) {
+		const int err = errno;
+
+		fuse_log(FUSE_LOG_ERR, "fuse: service fsopen reply: %s\n",
+			 strerror(err));
+		return -err;
+	}
+
+	/* the reply must be exactly one simple reply with the right magic */
+	if (nbytes != sizeof(resp)) {
+		fuse_log(FUSE_LOG_ERR, "fuse: wrong service fsopen reply size %zd, expected %zd\n",
+			nbytes, sizeof(resp));
+		return -EBADMSG;
+	}
+	if (ntohl(resp.p.magic) != FUSE_SERVICE_SIMPLE_REPLY) {
+		fuse_log(FUSE_LOG_ERR, "fuse: service fsopen reply contains wrong magic!\n");
+		return -EBADMSG;
+	}
+
+	*errorp = ntohl(resp.error);
+	return 0;
+}
+
+/*
+ * Send a string command (@command is the magic, @value the payload) and wait
+ * for the simple reply.  The error code carried in the reply is stored in
+ * *errorp.  Returns 0 if the exchange completed, or a negative errno if it
+ * did not.
+ */
+static int send_string(struct fuse_service *sf, uint32_t command,
+		       const char *value, int *errorp)
+{
+	struct fuse_service_simple_reply reply = { };
+	struct fuse_service_string_command *cmd;
+	const size_t cmdsz = sizeof_fuse_service_string_command(strlen(value));
+	ssize_t size;
+
+	cmd = calloc(1, cmdsz);
+	if (!cmd) {
+		int error = errno;
+
+		fuse_log(FUSE_LOG_ERR, "fuse: alloc service string send: %s\n",
+			 strerror(error));
+		return -error;
+	}
+	cmd->p.magic = htonl(command);
+	strcpy(cmd->value, value);
+
+	size = __send_packet(sf, cmd, cmdsz);
+	if (size < 0) {
+		int error = errno;
+
+		fuse_log(FUSE_LOG_ERR, "fuse: send service string: %s\n",
+			 strerror(error));
+		/* the command buffer is ours to free on failure too */
+		free(cmd);
+		return -error;
+	}
+	free(cmd);
+
+	size = __recv_packet(sf, &reply, sizeof(reply));
+	if (size < 0) {
+		int error = errno;
+
+		fuse_log(FUSE_LOG_ERR, "fuse: service string reply: %s\n",
+			 strerror(error));
+		return -error;
+	}
+	if (size != sizeof(reply)) {
+		fuse_log(FUSE_LOG_ERR, "fuse: wrong service string reply size %zd, expected %zd\n",
+			size, sizeof(reply));
+		return -EBADMSG;
+	}
+
+	if (ntohl(reply.p.magic) != FUSE_SERVICE_SIMPLE_REPLY) {
+		fuse_log(FUSE_LOG_ERR, "fuse: service string reply contains wrong magic!\n");
+		return -EBADMSG;
+	}
+
+	*errorp = ntohl(reply.error);
+	return 0;
+}
+
+/*
+ * Send the mountpoint command with path @value and the expected file format
+ * @expected_fmt, then wait for the simple reply.  The error code carried in
+ * the reply is stored in *errorp.  Returns 0 if the exchange completed, or a
+ * negative errno if it did not.
+ */
+static int send_mountpoint(struct fuse_service *sf, mode_t expected_fmt,
+			   const char *value, int *errorp)
+{
+	struct fuse_service_simple_reply reply = { };
+	struct fuse_service_mountpoint_command *cmd;
+	const size_t cmdsz =
+			sizeof_fuse_service_mountpoint_command(strlen(value));
+	ssize_t size;
+
+	cmd = calloc(1, cmdsz);
+	if (!cmd) {
+		int error = errno;
+
+		fuse_log(FUSE_LOG_ERR, "fuse: alloc service mountpoint send: %s\n",
+			 strerror(error));
+		return -error;
+	}
+	cmd->p.magic = htonl(FUSE_SERVICE_MNTPT_CMD);
+	cmd->expected_fmt = htons(expected_fmt);
+	strcpy(cmd->value, value);
+
+	size = __send_packet(sf, cmd, cmdsz);
+	if (size < 0) {
+		int error = errno;
+
+		fuse_log(FUSE_LOG_ERR, "fuse: send service mountpoint: %s\n",
+			 strerror(error));
+		/* the command buffer is ours to free on failure too */
+		free(cmd);
+		return -error;
+	}
+	free(cmd);
+
+	size = __recv_packet(sf, &reply, sizeof(reply));
+	if (size < 0) {
+		int error = errno;
+
+		fuse_log(FUSE_LOG_ERR, "fuse: service mountpoint reply: %s\n",
+			 strerror(error));
+		return -error;
+	}
+	if (size != sizeof(reply)) {
+		fuse_log(FUSE_LOG_ERR,
+			 "fuse: wrong service mountpoint reply size %zd, expected %zd\n",
+			 size, sizeof(reply));
+		return -EBADMSG;
+	}
+
+	if (ntohl(reply.p.magic) != FUSE_SERVICE_SIMPLE_REPLY) {
+		fuse_log(FUSE_LOG_ERR, "fuse: service mountpoint reply contains wrong magic!\n");
+		return -EBADMSG;
+	}
+
+	*errorp = ntohl(reply.error);
+	return 0;
+}
+
+/*
+ * Send the mount command with @ms_flags and wait for the simple reply.  The
+ * error code carried in the reply is stored in *errorp.  Returns 0 if the
+ * exchange completed, or a negative errno if it did not.
+ */
+static int send_mount(struct fuse_service *sf, unsigned int ms_flags,
+		      int *errorp)
+{
+	struct fuse_service_mount_command cmd = {
+		.p.magic = htonl(FUSE_SERVICE_MOUNT_CMD),
+		.ms_flags = htonl(ms_flags),
+	};
+	struct fuse_service_simple_reply resp = { };
+	ssize_t nbytes;
+
+	nbytes = __send_packet(sf, &cmd, sizeof(cmd));
+	if (nbytes < 0) {
+		const int err = errno;
+
+		fuse_log(FUSE_LOG_ERR, "fuse: send service mount command: %s\n",
+			 strerror(err));
+		return -err;
+	}
+
+	nbytes = __recv_packet(sf, &resp, sizeof(resp));
+	if (nbytes < 0) {
+		const int err = errno;
+
+		fuse_log(FUSE_LOG_ERR, "fuse: service mount reply: %s\n",
+			 strerror(err));
+		return -err;
+	}
+
+	/* the reply must be exactly one simple reply with the right magic */
+	if (nbytes != sizeof(resp)) {
+		fuse_log(FUSE_LOG_ERR, "fuse: wrong service mount reply size %zd, expected %zd\n",
+			nbytes, sizeof(resp));
+		return -EBADMSG;
+	}
+	if (ntohl(resp.p.magic) != FUSE_SERVICE_SIMPLE_REPLY) {
+		fuse_log(FUSE_LOG_ERR, "fuse: service mount reply contains wrong magic!\n");
+		return -EBADMSG;
+	}
+
+	*errorp = ntohl(resp.error);
+	return 0;
+}
+
+/*
+ * Remember @expected_fmt in @sf.  fuse_service_session_mount() falls back to
+ * this value when it is called with an expected_fmt of zero.
+ */
+void fuse_service_expect_mount_format(struct fuse_service *sf,
+				      mode_t expected_fmt)
+{
+	sf->expected_fmt = expected_fmt;
+}
+
+/*
+ * Mount the fuse session @se through the service socket.
+ *
+ * The session is first attached to the fuse device fd that was received
+ * earlier; then the fsopen, source, mountpoint, mount options (if any) and
+ * mount commands are sent in that order, stopping at the first failure.  If
+ * @expected_fmt is zero, the value stored with
+ * fuse_service_expect_mount_format() is used instead.  On success
+ * @opts->foreground is set and the working directory is changed to "/".
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int fuse_service_session_mount(struct fuse_service *sf, struct fuse_session *se,
+			       mode_t expected_fmt,
+			       struct fuse_cmdline_opts *opts)
+{
+	char *fstype = fuse_mnt_build_type(se->mo);
+	char *source = fuse_mnt_build_source(se->mo);
+	char *mntopts = fuse_mnt_kernel_opts(se->mo);
+	char path[32];
+	int ret;
+	int error = 0;
+
+	if (!fstype || !source) {
+		fuse_log(FUSE_LOG_ERR, "fuse: cannot allocate service strings\n");
+		ret = -ENOMEM;
+		goto out_strings;
+	}
+
+	/*
+	 * send_mountpoint() measures the mountpoint string, so refuse a
+	 * missing one up front, before any state has been changed.
+	 */
+	if (!opts->mountpoint) {
+		fuse_log(FUSE_LOG_ERR, "fuse: no mountpoint specified\n");
+		ret = -EINVAL;
+		goto out_strings;
+	}
+
+	if (!expected_fmt)
+		expected_fmt = sf->expected_fmt;
+
+	/*
+	 * The fuse session takes the fusedev fd if this succeeds.  It is
+	 * required to use the "/dev/fd/XX" format.
+	 */
+	snprintf(path, sizeof(path), "/dev/fd/%d", sf->fusedevfd);
+	errno = 0;
+	ret = fuse_session_mount(se, path);
+	if (ret) {
+		/* Try to return richer errors than fuse_session_mount's -1 */
+		ret = errno ? -errno : -EINVAL;
+		goto out_strings;
+	}
+	sf->owns_fusedevfd = false;
+
+	ret = send_fsopen(sf, fstype, &error);
+	if (ret)
+		goto out_strings;
+	if (error) {
+		fuse_log(FUSE_LOG_ERR, "fuse: service fsopen: %s\n",
+			 strerror(error));
+		ret = -error;
+		goto out_strings;
+	}
+
+	ret = send_string(sf, FUSE_SERVICE_SOURCE_CMD, source, &error);
+	if (ret)
+		goto out_strings;
+	if (error) {
+		fuse_log(FUSE_LOG_ERR, "fuse: service fs source: %s\n",
+			 strerror(error));
+		ret = -error;
+		goto out_strings;
+	}
+
+	ret = send_mountpoint(sf, expected_fmt, opts->mountpoint, &error);
+	if (ret)
+		goto out_strings;
+	if (error) {
+		fuse_log(FUSE_LOG_ERR, "fuse: service fs mountpoint: %s\n",
+			 strerror(error));
+		ret = -error;
+		goto out_strings;
+	}
+
+	/* mount options are optional; skip the command if there are none */
+	if (mntopts) {
+		ret = send_string(sf, FUSE_SERVICE_MNTOPTS_CMD, mntopts,
+				  &error);
+		if (ret)
+			goto out_strings;
+		if (error) {
+			fuse_log(FUSE_LOG_ERR, "fuse: service fs mount options: %s\n",
+				 strerror(error));
+			ret = -error;
+			goto out_strings;
+		}
+	}
+
+	ret = send_mount(sf, fuse_mnt_flags(se->mo), &error);
+	if (ret)
+		goto out_strings;
+	if (error) {
+		fuse_log(FUSE_LOG_ERR, "fuse: service mount: %s\n",
+			 strerror(error));
+		ret = -error;
+		goto out_strings;
+	}
+
+	/*
+	 * foreground mode is needed so that systemd actually tracks the
+	 * service correctly and doesn't try to kill it; and so that
+	 * stdout/stderr don't get zapped.  Change to the root directory so
+	 * that the caller needn't call fuse_daemonize().
+	 */
+	opts->foreground = 1;
+	(void)chdir("/");
+
+out_strings:
+	free(mntopts);
+	free(source);
+	free(fstype);
+	return ret;
+}
+
+/*
+ * Send the unmount command and wait for the simple reply.  Does nothing if
+ * the service socket is already closed.  Returns 0 on success, or a negative
+ * errno if the exchange failed or the reply carried an error.
+ */
+int fuse_service_session_unmount(struct fuse_service *sf)
+{
+	struct fuse_service_unmount_command cmd = {
+		.p.magic = htonl(FUSE_SERVICE_UNMOUNT_CMD),
+	};
+	struct fuse_service_simple_reply resp = { };
+	ssize_t nbytes;
+	int remote_error;
+
+	/* nobody left to talk to */
+	if (sf->sockfd < 0)
+		return 0;
+
+	nbytes = __send_packet(sf, &cmd, sizeof(cmd));
+	if (nbytes < 0) {
+		const int err = errno;
+
+		fuse_log(FUSE_LOG_ERR, "fuse: send service unmount: %s\n",
+			 strerror(err));
+		return -err;
+	}
+
+	nbytes = __recv_packet(sf, &resp, sizeof(resp));
+	if (nbytes < 0) {
+		const int err = errno;
+
+		fuse_log(FUSE_LOG_ERR, "fuse: service unmount reply: %s\n",
+			 strerror(err));
+		return -err;
+	}
+
+	/* the reply must be exactly one simple reply with the right magic */
+	if (nbytes != sizeof(resp)) {
+		fuse_log(FUSE_LOG_ERR, "fuse: wrong service unmount reply size %zd, expected %zd\n",
+			nbytes, sizeof(resp));
+		return -EBADMSG;
+	}
+	if (ntohl(resp.p.magic) != FUSE_SERVICE_SIMPLE_REPLY) {
+		fuse_log(FUSE_LOG_ERR, "fuse: service unmount reply contains wrong magic!\n");
+		return -EBADMSG;
+	}
+
+	remote_error = ntohl(resp.error);
+	if (remote_error != 0) {
+		fuse_log(FUSE_LOG_ERR, "fuse: service unmount: %s\n",
+			 strerror(remote_error));
+		return -remote_error;
+	}
+
+	return 0;
+}
+
+/*
+ * Close every fd held by @sf and mark them as gone (-1).  The fuse device fd
+ * is only closed while owns_fusedevfd is set.  @sf itself is not freed.
+ */
+void fuse_service_release(struct fuse_service *sf)
+{
+	/* only close the fuse device if nobody else took it over */
+	if (sf->owns_fusedevfd)
+		close(sf->fusedevfd);
+	sf->owns_fusedevfd = false;
+	sf->fusedevfd = -1;
+	close(sf->argvfd);
+	sf->argvfd = -1;
+	shutdown(sf->sockfd, SHUT_RDWR);
+	close(sf->sockfd);
+	sf->sockfd = -1;
+}
+
+/*
+ * Release and free the service object that *sfp points to, if there is one,
+ * and reset *sfp to NULL.
+ */
+void fuse_service_destroy(struct fuse_service **sfp)
+{
+	struct fuse_service *victim = *sfp;
+
+	*sfp = NULL;
+	if (!victim)
+		return;
+
+	fuse_service_release(victim);
+	free(victim);
+}
+
+/*
+ * Build a single command line string of the form
+ * "argv0 -t <args->argv[0]> <args->argv[1]> ..." in newly allocated memory.
+ *
+ * argv0 is taken from @argv if @argc is positive, otherwise from @args.
+ * Returns NULL if no argv0 can be found, if @args is empty, or if the
+ * allocation fails.  The caller frees the result.
+ */
+char *fuse_service_cmdline(int argc, char *argv[], struct fuse_args *args)
+{
+	const char *prog;
+	char *buf, *cursor;
+	size_t need;
+	unsigned int k;
+
+	/* Try to preserve argv[0] */
+	if (argc > 0)
+		prog = argv[0];
+	else if (args->argc > 0)
+		prog = args->argv[0];
+	else
+		return NULL;
+
+	/* The alleged fstype comes from args->argv[0], so it must exist */
+	if (args->argc == 0)
+		return NULL;
+
+	/* terminator + argv0 and a spare byte + " -t" + " arg" for each arg */
+	need = 1 + strlen(prog) + 1 + 3;
+	for (k = 0; k < args->argc; k++)
+		need += strlen(args->argv[k]) + 1;
+
+	buf = calloc(1, need);
+	if (!buf)
+		return NULL;
+
+	/* Format: argv0 -t alleged_fstype [all other options...] */
+	cursor = buf + sprintf(buf, "%s -t", prog);
+	for (k = 0; k < args->argc; k++)
+		cursor += sprintf(cursor, " %s", args->argv[k]);
+
+	return buf;
+}
+
+/*
+ * Public entry point for parsing the command line of a fuse server running
+ * as a service; forwards to fuse_parse_cmdline_service() and returns its
+ * result.
+ */
+int fuse_service_parse_cmdline_opts(struct fuse_args *args,
+				    struct fuse_cmdline_opts *opts)
+{
+	return fuse_parse_cmdline_service(args, opts);
+}
+
+/*
+ * Turn @ret into a process exit code suitable for a service: wait two
+ * seconds, then return EXIT_SUCCESS for zero and EXIT_FAILURE for anything
+ * else.
+ */
+int fuse_service_exit(int ret)
+{
+	/*
+	 * journald uses the pid to connect our log messages to the systemd
+	 * service, so we have to stay alive for 2 more seconds here.  This is
+	 * critical for capturing all the log messages if the service fails,
+	 * because failure analysis tools use the service name to gather log
+	 * messages for reporting.
+	 */
+	sleep(2);
+
+	/*
+	 * The return code of a service must fit the LSB init script action
+	 * error guidelines, so every error is compressed to 1 ("generic or
+	 * unspecified error", LSB 5.0 section 22.2) and we hope the admin will
+	 * scan the log for what actually happened.
+	 */
+	if (ret == 0)
+		return EXIT_SUCCESS;
+
+	return EXIT_FAILURE;
+}
diff --git a/lib/fuse_service_stub.c b/lib/fuse_service_stub.c
new file mode 100644
index 00000000000000..d34df3891a6e31
--- /dev/null
+++ b/lib/fuse_service_stub.c
@@ -0,0 +1,106 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2025-2026 Oracle.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ *
+ * Stub functions for platforms where we cannot have fuse servers run as "safe"
+ * systemd containers.
+ *
+ * This program can be distributed under the terms of the GNU LGPLv2.
+ * See the file LGPL2.txt
+ */
+
+/* we don't use any parameters at all */
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <stdbool.h>
+
+#include "fuse_config.h"
+#include "fuse_i.h"
+#include "fuse_service.h"
+
+/*
+ * Every stub below either reports -EOPNOTSUPP or behaves as if no service
+ * socket was ever passed to this process.
+ */
+
+/* File transfer is not available without service support. */
+int fuse_service_receive_file(struct fuse_service *sf, const char *path,
+			      int *fdp)
+{
+	return -EOPNOTSUPP;
+}
+
+int fuse_service_request_file(struct fuse_service *sf, const char *path,
+			      int open_flags, mode_t create_mode,
+			      unsigned int request_flags)
+{
+	return -EOPNOTSUPP;
+}
+
+int fuse_service_request_blockdev(struct fuse_service *sf, const char *path,
+				  int open_flags, mode_t create_mode,
+				  unsigned int request_flags,
+				  unsigned int block_size)
+{
+	return -EOPNOTSUPP;
+}
+
+int fuse_service_send_goodbye(struct fuse_service *sf, int error)
+{
+	return -EOPNOTSUPP;
+}
+
+/* Succeeds with *sfp == NULL, i.e. "not running as a service". */
+int fuse_service_accept(struct fuse_service **sfp)
+{
+	*sfp = NULL;
+	return 0;
+}
+
+/*
+ * The version script exports this symbol, so a stub has to exist here as
+ * well.  There is never a service connection in a stub build, so the answer
+ * is always false.
+ */
+bool fuse_service_can_allow_other(struct fuse_service *sf)
+{
+	return false;
+}
+
+int fuse_service_append_args(struct fuse_service *sf,
+			     struct fuse_args *existing_args)
+{
+	return -EOPNOTSUPP;
+}
+
+char *fuse_service_cmdline(int argc, char *argv[], struct fuse_args *args)
+{
+	return NULL;
+}
+
+int fuse_service_finish_file_requests(struct fuse_service *sf)
+{
+	return -EOPNOTSUPP;
+}
+
+void fuse_service_expect_mount_format(struct fuse_service *sf,
+				      mode_t expected_fmt)
+{
+}
+
+int fuse_service_session_mount(struct fuse_service *sf, struct fuse_session *se,
+			       mode_t expected_fmt,
+			       struct fuse_cmdline_opts *opts)
+{
+	return -EOPNOTSUPP;
+}
+
+int fuse_service_session_unmount(struct fuse_service *sf)
+{
+	return -EOPNOTSUPP;
+}
+
+void fuse_service_release(struct fuse_service *sf)
+{
+}
+
+/* Nothing was ever allocated, so only the caller's pointer is reset. */
+void fuse_service_destroy(struct fuse_service **sfp)
+{
+	*sfp = NULL;
+}
+
+int fuse_service_parse_cmdline_opts(struct fuse_args *args,
+				    struct fuse_cmdline_opts *opts)
+{
+	return -1;
+}
+
+/* Returns @ret unchanged, without the delay the real implementation has. */
+int fuse_service_exit(int ret)
+{
+	return ret;
+}
diff --git a/lib/fuse_versionscript b/lib/fuse_versionscript
index cce09610316f4b..f34dc959a1d1e1 100644
--- a/lib/fuse_versionscript
+++ b/lib/fuse_versionscript
@@ -227,6 +227,23 @@ FUSE_3.19 {
 		fuse_session_start_teardown_watchdog;
 		fuse_session_stop_teardown_watchdog;
 		fuse_lowlevel_notify_prune;
+
+		fuse_service_accept;
+		fuse_service_append_args;
+		fuse_service_can_allow_other;
+		fuse_service_cmdline;
+		fuse_service_destroy;
+		fuse_service_exit;
+		fuse_service_expect_mount_format;
+		fuse_service_finish_file_requests;
+		fuse_service_parse_cmdline_opts;
+		fuse_service_receive_file;
+		fuse_service_release;
+		fuse_service_request_file;
+		fuse_service_request_blockdev;
+		fuse_service_send_goodbye;
+		fuse_service_session_mount;
+		fuse_service_session_unmount;
 } FUSE_3.18;
 
 # Local Variables:
diff --git a/lib/helper.c b/lib/helper.c
index 74906fdcbd76d9..819b9a6e4d243c 100644
--- a/lib/helper.c
+++ b/lib/helper.c
@@ -26,6 +26,11 @@
 #include <errno.h>
 #include <sys/param.h>
 
+#ifdef HAVE_SERVICEMOUNT
+# include <linux/types.h>
+# include "fuse_service_priv.h"
+#endif
+
 #define FUSE_HELPER_OPT(t, p) \
 	{ t, offsetof(struct fuse_cmdline_opts, p), 1 }
 
@@ -228,6 +233,52 @@ int fuse_parse_cmdline_312(struct fuse_args *args,
 	return 0;
 }
 
+#ifdef HAVE_SERVICEMOUNT
+/*
+ * Option processing callback for fuse servers running as a service.  The
+ * first non-option argument becomes the mountpoint; any further non-option
+ * argument is an error.  All other keys are passed through (return 1).
+ */
+static int fuse_helper_opt_proc_service(void *data, const char *arg, int key,
+					struct fuse_args *outargs)
+{
+	struct fuse_cmdline_opts *opts = data;
+
+	(void) outargs;
+
+	/* Pass through unknown options */
+	if (key != FUSE_OPT_KEY_NONOPT)
+		return 1;
+
+	if (opts->mountpoint) {
+		fuse_log(FUSE_LOG_ERR, "fuse: invalid argument `%s'\n", arg);
+		return -1;
+	}
+
+	return fuse_opt_add_opt(&opts->mountpoint, arg);
+}
+
+/*
+ * Parse @args into @opts for a fuse server running as a service.  @opts is
+ * cleared and given default thread limits before parsing.  Returns 0 on
+ * success and -1 on failure.
+ */
+int fuse_parse_cmdline_service(struct fuse_args *args,
+			       struct fuse_cmdline_opts *opts)
+{
+	int ret;
+
+	memset(opts, 0, sizeof(*opts));
+	opts->max_threads = 10;
+	opts->max_idle_threads = UINT_MAX; /* new default in fuse version 3.12 */
+
+	ret = fuse_opt_parse(args, opts, fuse_helper_opts,
+			     fuse_helper_opt_proc_service);
+	if (ret == -1)
+		return -1;
+
+	/* the caller asked for no default subtype, so we are done */
+	if (opts->nodefault_subtype)
+		return 0;
+
+	/*
+	 * *Linux*: if neither -o subtype nor -o fsname are specified,
+	 * set subtype to program's basename.
+	 * *FreeBSD*: if fsname is not specified, set to program's
+	 * basename.
+	 */
+	if (add_default_subtype(args->argv[0], args) == -1)
+		return -1;
+
+	return 0;
+}
+#endif
+
 /**
  * struct fuse_cmdline_opts got extended in libfuse-3.12
  */
diff --git a/lib/meson.build b/lib/meson.build
index fcd95741c9d374..d9a902f74b558f 100644
--- a/lib/meson.build
+++ b/lib/meson.build
@@ -10,6 +10,12 @@ else
    libfuse_sources += [ 'mount_bsd.c' ]
 endif
 
+if private_cfg.get('HAVE_SERVICEMOUNT', false)
+  libfuse_sources += [ 'fuse_service.c' ]
+else
+  libfuse_sources += [ 'fuse_service_stub.c' ]
+endif
+
 deps = [ thread_dep ]
 if private_cfg.get('HAVE_ICONV')
    libfuse_sources += [ 'modules/iconv.c' ]
@@ -49,18 +55,25 @@ libfuse = library('fuse3',
                   dependencies: deps,
                   install: true,
                   link_depends: 'fuse_versionscript',
-                  c_args: [ '-DFUSE_USE_VERSION=317',
+                  c_args: [ '-DFUSE_USE_VERSION=319',
                             '-DFUSERMOUNT_DIR="@0@"'.format(fusermount_path) ],
                   link_args: ['-Wl,--version-script,' + meson.current_source_dir()
                               + '/fuse_versionscript' ])
 
+vars = []
+if private_cfg.get('HAVE_SERVICEMOUNT', false)
+  service_socket_dir = private_cfg.get_unquoted('FUSE_SERVICE_SOCKET_DIR', '')
+  vars += ['service_socket_dir=' + service_socket_dir]
+  vars += ['service_socket_perms=' + service_socket_perms]
+endif
 pkg = import('pkgconfig')
 pkg.generate(libraries: [ libfuse, '-lpthread' ],
              libraries_private: '-ldl',
              version: meson.project_version(),
              name: 'fuse3',
              description: 'Filesystem in Userspace',
-             subdirs: 'fuse3')
+             subdirs: 'fuse3',
+             variables: vars)
 
 libfuse_dep = declare_dependency(include_directories: include_dirs,
                                  link_with: libfuse, dependencies: deps)
diff --git a/lib/mount.c b/lib/mount.c
index 2397c3fb2aa26b..952d8899dcf218 100644
--- a/lib/mount.c
+++ b/lib/mount.c
@@ -750,3 +750,15 @@ char *fuse_mnt_build_type(const struct mount_opts *mo)
 
 	return type;
 }
+
+char *fuse_mnt_kernel_opts(const struct mount_opts *mo)
+{
+	if (mo->kernel_opts)
+		return strdup(mo->kernel_opts);
+	return NULL;
+}
+
+unsigned int fuse_mnt_flags(const struct mount_opts *mo)
+{
+	return mo->flags;
+}
diff --git a/meson.build b/meson.build
index 80c5f1dc0bd356..66425a0d4cc16f 100644
--- a/meson.build
+++ b/meson.build
@@ -69,6 +69,16 @@ args_default = [ '-D_GNU_SOURCE' ]
 #
 private_cfg = configuration_data()
 private_cfg.set_quoted('PACKAGE_VERSION', meson.project_version())
+service_socket_dir = get_option('service-socket-dir')
+service_socket_perms = get_option('service-socket-perms')
+if service_socket_dir == ''
+  service_socket_dir = '/run/filesystems'
+endif
+if service_socket_perms == ''
+  service_socket_perms = '0220'
+endif
+private_cfg.set_quoted('FUSE_SERVICE_SOCKET_DIR', service_socket_dir)
+private_cfg.set('FUSE_SERVICE_SOCKET_PERMS', service_socket_perms)
 
 # Test for presence of some functions
 test_funcs = [ 'fork', 'fstatat', 'openat', 'readlinkat', 'pipe2',
@@ -118,6 +128,13 @@ special_funcs = {
 	    return -1;
 	  }
 	}
+    ''',
+    'systemd_headers': '''
+	#include <systemd/sd-daemon.h>
+
+	int main(int argc, char *argv[]) {
+          return SD_LISTEN_FDS_START;
+	}
     '''
 }
 
@@ -180,6 +197,23 @@ if get_option('enable-io-uring') and liburing.found() and libnuma.found()
    endif
 endif
 
+# Check for systemd support
+# An explicit systemd-system-unit-dir option wins; otherwise ask the systemd
+# pkg-config file, if one is installed.
+systemd_system_unit_dir = get_option('systemd-system-unit-dir')
+if systemd_system_unit_dir == ''
+  systemd = dependency('systemd', required: false)
+  if systemd.found()
+     systemd_system_unit_dir = systemd.get_variable(pkgconfig: 'systemd_system_unit_dir')
+  endif
+endif
+
+# Service mounts need both a unit directory and HAVE_SYSTEMD_HEADERS.
+if systemd_system_unit_dir == '' or private_cfg.get('HAVE_SYSTEMD_HEADERS', false) == false
+  warning('systemd service support will not be built')
+else
+  private_cfg.set_quoted('SYSTEMD_SYSTEM_UNIT_DIR', systemd_system_unit_dir)
+  private_cfg.set('HAVE_SYSTEMD', true)
+  private_cfg.set('HAVE_SERVICEMOUNT', true)
+endif
+
 #
 # Compiler configuration
 #
diff --git a/meson_options.txt b/meson_options.txt
index c1f8fe69467184..193a74c96d0676 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -27,3 +27,12 @@ option('enable-usdt', type : 'boolean', value : false,
 
 option('enable-io-uring', type: 'boolean', value: true,
        description: 'Enable fuse-over-io-uring support')
+
+# Fuse service socket and systemd unit settings.  Each option defaults to
+# the empty string; the fallback for an empty value is named in its
+# description.
+option('service-socket-dir', type : 'string', value : '',
+       description: 'Where to install fuse server sockets (if empty, /run/filesystems)')
+
+option('service-socket-perms', type : 'string', value : '',
+       description: 'Default fuse server socket permissions (if empty, 0220)')
+
+option('systemd-system-unit-dir', type : 'string', value : '',
+       description: 'Where to install systemd unit files (if empty, query pkg-config(1))')
diff --git a/util/fuservicemount.c b/util/fuservicemount.c
new file mode 100644
index 00000000000000..9c694a4290f94e
--- /dev/null
+++ b/util/fuservicemount.c
@@ -0,0 +1,18 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2025-2026 Oracle.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ *
+ * This program can be distributed under the terms of the GNU GPLv2.
+ * See the file GPL2.txt.
+ *
+ * This program wraps the mounting of FUSE filesystems that run in systemd
+ */
+#define _GNU_SOURCE
+#include "fuse_config.h"
+#include "mount_service.h"
+
+/* Entry point: pass the command line straight to mount_service_main(). */
+int main(int argc, char *argv[])
+{
+	const int status = mount_service_main(argc, argv);
+
+	return status;
+}
diff --git a/util/meson.build b/util/meson.build
index 0e4b1cce95377e..04ea5ac201340d 100644
--- a/util/meson.build
+++ b/util/meson.build
@@ -6,6 +6,15 @@ executable('fusermount3', ['fusermount.c', '../lib/mount_util.c', '../lib/util.c
            install_dir: get_option('bindir'),
            c_args: '-DFUSE_CONF="@0@"'.format(fuseconf_path))
 
+# The service mount helper is only built when HAVE_SERVICEMOUNT is set; it is
+# installed into sbindir.
+if private_cfg.get('HAVE_SERVICEMOUNT', false)
+  executable('fuservicemount3', ['mount_service.c', 'fuservicemount.c', '../lib/mount_util.c'],
+             include_directories: include_dirs,
+             link_with: [ libfuse ],
+             install: true,
+             install_dir: get_option('sbindir'),
+             c_args: '-DFUSE_USE_VERSION=319')
+endif
+
 executable('mount.fuse3', ['mount.fuse.c'],
            include_directories: include_dirs,
            link_with: [ libfuse ],
diff --git a/util/mount_service.c b/util/mount_service.c
new file mode 100644
index 00000000000000..a43ff79c7bfb6f
--- /dev/null
+++ b/util/mount_service.c
@@ -0,0 +1,1427 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2025-2026 Oracle.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ *
+ * This program can be distributed under the terms of the GNU GPLv2.
+ * See the file GPL2.txt.
+ *
+ * This program does the mounting of FUSE filesystems that run in systemd
+ */
+#define _GNU_SOURCE
+#include "fuse_config.h"
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <limits.h>
+#include <arpa/inet.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <sys/mman.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+
+#include "mount_util.h"
+#include "util.h"
+#include "fuse_i.h"
+#include "fuse_service_priv.h"
+#include "mount_service.h"
+
+/*
+ * State shared by the mount_service_* functions in this file.  The four file
+ * descriptor fields are set to -1 by mount_service_init().
+ */
+struct mount_service {
+	/* prefix for printing error messages */
+	const char *msgtag;
+
+	/* fuse subtype based on -t cli argument */
+	char *subtype;
+
+	/* source argument to mount() */
+	char *source;
+
+	/* target argument (aka mountpoint) to mount() */
+	char *mountpoint;
+
+	/* mountpoint that we pass to mount() */
+	char *real_mountpoint;
+
+	/* resolved path to mountpoint that we use for mtab updates */
+	char *resv_mountpoint;
+
+	/* mount options */
+	char *mntopts;
+
+	/* socket fd */
+	int sockfd;
+
+	/* /dev/fuse device */
+	int fusedevfd;
+
+	/* memfd for cli arguments */
+	int argvfd;
+
+	/* fd for mount point */
+	int mountfd;
+
+	/* did we actually mount successfully? */
+	bool mounted;
+
+	/* has the fsopen command already been submitted? */
+	bool fsopened;
+
+	/* is this a fuseblk mount? */
+	bool fuseblk;
+};
+
+/*
+ * Send @req (@req_sz bytes) over the service socket as a single record with
+ * @fd attached as SCM_RIGHTS ancillary data.  Returns the sendmsg() result,
+ * or -1 with errno set to EINVAL if no control message header is available.
+ */
+static ssize_t __send_fd(struct mount_service *mo,
+			 struct fuse_service_requested_file *req,
+			 size_t req_sz, int fd)
+{
+	union {
+		struct cmsghdr cmsghdr;
+		char control[CMSG_SPACE(sizeof(int))];
+	} cmsgu;
+	struct iovec iov = {
+		.iov_base = req,
+		.iov_len = req_sz,
+	};
+	struct msghdr msg = {
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+		.msg_control = cmsgu.control,
+		.msg_controllen = sizeof(cmsgu.control),
+	};
+	struct cmsghdr *cmsg;
+
+	/* Zero the control buffer before deriving any pointers into it. */
+	memset(&cmsgu, 0, sizeof(cmsgu));
+
+	cmsg = CMSG_FIRSTHDR(&msg);
+	if (!cmsg) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_RIGHTS;
+
+	/*
+	 * The payload returned by CMSG_DATA() is not guaranteed to be suitably
+	 * aligned for an int, so copy the descriptor in byte-wise instead of
+	 * storing through a cast pointer.
+	 */
+	memcpy(CMSG_DATA(cmsg), &fd, sizeof(fd));
+
+	return sendmsg(mo->sockfd, &msg, MSG_EOR | MSG_NOSIGNAL);
+}
+
+/*
+ * Send @len bytes starting at @ptr as one record on the service socket.
+ * Returns the sendmsg() result.
+ */
+static ssize_t __send_packet(struct mount_service *mo, void *ptr, size_t len)
+{
+	struct iovec vec;
+	struct msghdr hdr;
+
+	memset(&hdr, 0, sizeof(hdr));
+	vec.iov_base = ptr;
+	vec.iov_len = len;
+	hdr.msg_iov = &vec;
+	hdr.msg_iovlen = 1;
+
+	return sendmsg(mo->sockfd, &hdr, MSG_EOR | MSG_NOSIGNAL);
+}
+
+/*
+ * Peek at the next packet on the service socket without consuming it.  The
+ * iovec is empty and MSG_TRUNC is passed, so the caller gets back whatever
+ * recvmsg() reports for the pending packet.
+ */
+static ssize_t __recv_packet_size(struct mount_service *mo)
+{
+	struct iovec nothing = { };
+	struct msghdr hdr = { };
+
+	hdr.msg_iov = &nothing;
+	hdr.msg_iovlen = 1;
+
+	return recvmsg(mo->sockfd, &hdr, MSG_PEEK | MSG_TRUNC);
+}
+
+/*
+ * Receive one packet from the service socket into @ptr, which has room for
+ * @len bytes.  MSG_TRUNC is passed to recvmsg(); its result is returned.
+ */
+static ssize_t __recv_packet(struct mount_service *mo, void *ptr, size_t len)
+{
+	struct iovec vec;
+	struct msghdr hdr;
+
+	memset(&hdr, 0, sizeof(hdr));
+	vec.iov_base = ptr;
+	vec.iov_len = len;
+	hdr.msg_iov = &vec;
+	hdr.msg_iovlen = 1;
+
+	return recvmsg(mo->sockfd, &hdr, MSG_TRUNC);
+}
+
+/*
+ * Strip a leading "fuse." or "fuseblk." from the filesystem type, leaving the
+ * subtype (e.g. fuse.Y[.Z] -> Y[.Z]).  The fuse server determines if it's
+ * appropriate to set the "blockdev" mount option (aka fuseblk).  Returns a
+ * pointer into @fstype, or NULL if the subtype contains a path separator.
+ */
+const char *mount_service_subtype(const char *fstype)
+{
+	static const char fuse_prefix[] = "fuse.";
+	static const char fuseblk_prefix[] = "fuseblk.";
+	const char *name = fstype;
+
+	if (strncmp(fstype, fuse_prefix, sizeof(fuse_prefix) - 1) == 0)
+		name += sizeof(fuse_prefix) - 1;
+	else if (strncmp(fstype, fuseblk_prefix,
+			 sizeof(fuseblk_prefix) - 1) == 0)
+		name += sizeof(fuseblk_prefix) - 1;
+
+	if (!strchr(name, '/'))
+		return name;
+
+	fprintf(stderr,
+		"%s: fs subtype cannot contain path separators\n",
+		fstype);
+	return NULL;
+}
+
+/*
+ * Mark all file descriptors in @mo as unopened and derive mo->subtype from
+ * the argument following the first "-t" in @argv.  Returns 0 on success or
+ * -1 if no type was given, the subtype is rejected, or allocation fails.
+ */
+static int mount_service_init(struct mount_service *mo, int argc, char *argv[])
+{
+	const char *fstype = NULL;
+	const char *subtype;
+	int idx;
+
+	mo->sockfd = -1;
+	mo->argvfd = -1;
+	mo->fusedevfd = -1;
+	mo->mountfd = -1;
+
+	/* A "-t" only counts if something follows it. */
+	for (idx = 0; idx + 1 < argc && !fstype; idx++) {
+		if (strcmp(argv[idx], "-t") == 0)
+			fstype = argv[idx + 1];
+	}
+	if (!fstype) {
+		fprintf(stderr, "%s: cannot determine filesystem type.\n",
+			mo->msgtag);
+		return -1;
+	}
+
+	subtype = mount_service_subtype(fstype);
+	if (!subtype)
+		return -1;
+
+	mo->subtype = strdup(subtype);
+	if (mo->subtype)
+		return 0;
+
+	fprintf(stderr, "%s: cannot alloc memory for fs subtype: %s\n",
+		mo->msgtag, strerror(errno));
+	return -1;
+}
+
+/*
+ * Turn off SO_PASSRIGHTS on @sockfd.  Returns 0 on success, or -1 after
+ * printing a message if setsockopt() fails.  When SO_PASSRIGHTS is not defined
+ * at build time, this collapses to a macro that evaluates to 0.
+ */
+#ifdef SO_PASSRIGHTS
+static int try_drop_passrights(struct mount_service *mo, int sockfd)
+{
+	int zero = 0;
+	int ret;
+
+	/*
+	 * Don't let the other end of the socket send us any fds.  We don't
+	 * trust the fuse server not to pollute our fd namespace, so we'll end
+	 * now.
+	 */
+	ret = setsockopt(sockfd, SOL_SOCKET, SO_PASSRIGHTS, &zero,
+			 sizeof(zero));
+	if (ret) {
+		fprintf(stderr, "%s: disabling fd passing: %s\n",
+			mo->msgtag, strerror(errno));
+		return -1;
+	}
+
+	return 0;
+}
+#else
+# define try_drop_passrights(...)	(0)
+#endif
+
+/*
+ * Check that the socket send buffer can hold an open command carrying a
+ * PATH_MAX-sized path.  Returns 0 if it can (or if the size cannot be
+ * queried), MOUNT_SERVICE_FALLBACK_NEEDED otherwise.
+ */
+static int check_sendbuf_size(struct mount_service *mo, int sockfd)
+{
+	const size_t needed = sizeof_fuse_service_open_command(PATH_MAX);
+	int sndbuf = -1;
+	socklen_t len = sizeof(sndbuf);
+
+	/*
+	 * If we can't query the maximum send buffer length, just keep going.
+	 * Most likely we won't be sending huge open commands, and if we do,
+	 * the sendmsg will fail there too.
+	 */
+	if (getsockopt(sockfd, SOL_SOCKET, SO_SNDBUF, &sndbuf, &len) != 0)
+		return 0;
+	if (sndbuf < 0)
+		return 0;
+
+	if (sndbuf < needed) {
+		fprintf(stderr, "%s: max socket send buffer is %d, need at least %zu.\n",
+			mo->msgtag, sndbuf, needed);
+		return MOUNT_SERVICE_FALLBACK_NEEDED;
+	}
+
+	return 0;
+}
+
+/*
+ * Connect to the service socket named after mo->subtype underneath
+ * FUSE_SERVICE_SOCKET_DIR.
+ *
+ * Returns 0 with mo->sockfd set on success.  Returns
+ * MOUNT_SERVICE_FALLBACK_NEEDED if the send buffer is too small or nothing is
+ * listening on the socket (ENOENT/ECONNREFUSED), and -1 for any other
+ * failure.  The socket is closed on every failure path.
+ */
+static int mount_service_connect(struct mount_service *mo)
+{
+	struct sockaddr_un name = {
+		.sun_family = AF_UNIX,
+	};
+	int sockfd;
+	int written;
+	int ret;
+
+	/* A negative snprintf result is treated the same as truncation. */
+	written = snprintf(name.sun_path, sizeof(name.sun_path),
+			FUSE_SERVICE_SOCKET_DIR "/%s", mo->subtype);
+	if (written < 0 || (size_t)written >= sizeof(name.sun_path)) {
+		fprintf(stderr, "%s: filesystem type name `%s' is too long.\n",
+			mo->msgtag, mo->subtype);
+		return -1;
+	}
+
+	sockfd = socket(AF_UNIX, SOCK_SEQPACKET | SOCK_CLOEXEC, 0);
+	if (sockfd < 0) {
+		int error = errno;
+
+		fprintf(stderr, "%s: opening %s service socket: %s\n",
+			mo->msgtag, mo->subtype, strerror(error));
+		return -1;
+	}
+
+	/* Don't leak the socket if the send buffer turns out to be too small. */
+	ret = check_sendbuf_size(mo, sockfd);
+	if (ret) {
+		close(sockfd);
+		return ret;
+	}
+
+	ret = connect(sockfd, (const struct sockaddr *)&name, sizeof(name));
+	if (ret && (errno == ENOENT || errno == ECONNREFUSED)) {
+		fprintf(stderr, "%s: no safe filesystem driver for %s available.\n",
+			mo->msgtag, mo->subtype);
+		close(sockfd);
+		return MOUNT_SERVICE_FALLBACK_NEEDED;
+	}
+	if (ret) {
+		int error = errno;
+
+		fprintf(stderr, "%s: %s: %s\n",
+			mo->msgtag, name.sun_path, strerror(error));
+		goto out;
+	}
+
+	ret = try_drop_passrights(mo, sockfd);
+	if (ret)
+		goto out;
+
+	mo->sockfd = sockfd;
+	return 0;
+out:
+	close(sockfd);
+	return -1;
+}
+
+/*
+ * Exchange hello packets with the fuse server: advertise the protocol range
+ * we speak (plus the ALLOW_OTHER flag when the real uid is 0), then validate
+ * the reply's size, magic, version and padding.  Returns 0 if the reply is
+ * acceptable, -1 otherwise.
+ */
+static int mount_service_send_hello(struct mount_service *mo)
+{
+	struct fuse_service_hello_reply reply = { };
+	struct fuse_service_hello hello = {
+		.p.magic = htonl(FUSE_SERVICE_HELLO_CMD),
+		.min_version = htons(FUSE_SERVICE_MIN_PROTO),
+		.max_version = htons(FUSE_SERVICE_MAX_PROTO),
+	};
+	uint16_t version;
+	ssize_t nr;
+
+	if (getuid() == 0)
+		hello.flags |= htonl(FUSE_SERVICE_FLAG_ALLOW_OTHER);
+
+	nr = __send_packet(mo, &hello, sizeof(hello));
+	if (nr < 0) {
+		fprintf(stderr, "%s: send hello: %s\n",
+			mo->msgtag, strerror(errno));
+		return -1;
+	}
+
+	nr = __recv_packet(mo, &reply, sizeof(reply));
+	if (nr < 0) {
+		fprintf(stderr, "%s: hello reply: %s\n",
+			mo->msgtag, strerror(errno));
+		return -1;
+	}
+	if (nr != sizeof(reply)) {
+		fprintf(stderr, "%s: wrong hello reply size %zd, expected %zu\n",
+			mo->msgtag, nr, sizeof(reply));
+		return -1;
+	}
+
+	if (ntohl(reply.p.magic) != FUSE_SERVICE_HELLO_REPLY) {
+		fprintf(stderr, "%s: %s service server did not reply to hello\n",
+			mo->msgtag, mo->subtype);
+		return -1;
+	}
+
+	/* The server must pick a version inside the range we offered. */
+	version = ntohs(reply.version);
+	if (version < FUSE_SERVICE_MIN_PROTO ||
+	    version > FUSE_SERVICE_MAX_PROTO) {
+		fprintf(stderr, "%s: unsupported protocol version %u\n",
+			mo->msgtag, version);
+		return -1;
+	}
+
+	if (reply.padding) {
+		fprintf(stderr, "%s: nonzero value in padding field\n",
+			mo->msgtag);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Store one argument in the argv memfd: the NUL-terminated @string goes at
+ * *@string_pos and a descriptor recording its position and length goes at
+ * *@array_pos.  On success both offsets are advanced and args->argc is
+ * bumped.  Returns 0 on success, -1 on a failed or short write.
+ */
+static int mount_service_capture_arg(struct mount_service *mo,
+				     struct fuse_service_memfd_argv *args,
+				     const char *string, off_t *array_pos,
+				     off_t *string_pos)
+{
+	/* Length includes the terminating NUL. */
+	const size_t nbytes = strlen(string) + 1;
+	struct fuse_service_memfd_arg entry = {
+		.pos = htonl(*string_pos),
+		.len = htonl(nbytes),
+	};
+	ssize_t ret;
+
+	/* String body first... */
+	ret = pwrite(mo->argvfd, string, nbytes, *string_pos);
+	if (ret < 0) {
+		fprintf(stderr, "%s: memfd argv write: %s\n",
+			mo->msgtag, strerror(errno));
+		return -1;
+	}
+	if (ret < nbytes) {
+		fprintf(stderr, "%s: memfd argv[%u] wrote %zd, expected %zu\n",
+			mo->msgtag, args->argc, ret, nbytes);
+		return -1;
+	}
+
+	/* ...then the descriptor that points at it. */
+	ret = pwrite(mo->argvfd, &entry, sizeof(entry), *array_pos);
+	if (ret < 0) {
+		fprintf(stderr, "%s: memfd arg write: %s\n",
+			mo->msgtag, strerror(errno));
+		return -1;
+	}
+	if (ret < sizeof(entry)) {
+		fprintf(stderr, "%s: memfd arg[%u] wrote %zd, expected %zu\n",
+			mo->msgtag, args->argc, ret, sizeof(entry));
+		return -1;
+	}
+
+	*array_pos += sizeof(entry);
+	*string_pos += nbytes;
+	args->argc++;
+
+	return 0;
+}
+
+/*
+ * Create the argv memfd (mo->argvfd) and fill it: a header, then an array of
+ * argument descriptors, then the argument strings.  mo->subtype stands in for
+ * argv[0], and a "-t" option together with its value is left out.  Returns 0
+ * on success or nonzero on failure.
+ */
+static int mount_service_capture_args(struct mount_service *mo, int argc,
+				      char *argv[])
+{
+	struct fuse_service_memfd_argv header = {
+		.magic = htonl(FUSE_SERVICE_ARGS_MAGIC),
+	};
+	off_t array_pos;
+	off_t string_pos;
+	ssize_t written;
+	int ret;
+	int i;
+
+	if (argc < 0) {
+		fprintf(stderr, "%s: argc cannot be negative\n",
+			mo->msgtag);
+		return -1;
+	}
+
+	/*
+	 * The descriptor array follows the header; strings start after room
+	 * for argc descriptors.
+	 */
+	array_pos = sizeof(struct fuse_service_memfd_argv);
+	string_pos = array_pos +
+			(argc * sizeof(struct fuse_service_memfd_arg));
+
+	/* Create the memfd in which we'll stash arguments. */
+	mo->argvfd = memfd_create("fuse service argv", MFD_CLOEXEC);
+	if (mo->argvfd < 0) {
+		fprintf(stderr, "%s: argvfd create: %s\n",
+			mo->msgtag, strerror(errno));
+		return -1;
+	}
+
+	/* The alleged subtype goes in as if it were argv[0]. */
+	ret = mount_service_capture_arg(mo, &header, mo->subtype, &array_pos,
+					&string_pos);
+	if (ret)
+		return ret;
+
+	/* Then the remaining arguments, minus the -t(ype) pair. */
+	i = 1;
+	while (i < argc) {
+		if (!strcmp(argv[i], "-t") && i + 1 < argc) {
+			i += 2;
+			continue;
+		}
+
+		ret = mount_service_capture_arg(mo, &header, argv[i],
+						&array_pos, &string_pos);
+		if (ret)
+			return ret;
+		i++;
+	}
+
+	/* Now write the header */
+	header.argc = htonl(header.argc);
+	written = pwrite(mo->argvfd, &header, sizeof(header), 0);
+	if (written < 0) {
+		fprintf(stderr, "%s: memfd argv write: %s\n",
+			mo->msgtag, strerror(errno));
+		return -1;
+	}
+	if (written < sizeof(header)) {
+		fprintf(stderr, "%s: memfd argv wrote %zd, expected %zu\n",
+			mo->msgtag, written, sizeof(header));
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Send an open reply for @path to the fuse server with @fd attached.  The
+ * reply carries a zero error code.  Returns 0 on success, -1 on allocation
+ * failure or a failed/short send.
+ */
+static int mount_service_send_file(struct mount_service *mo,
+				   const char *path, int fd)
+{
+	const size_t reply_sz =
+			sizeof_fuse_service_requested_file(strlen(path));
+	struct fuse_service_requested_file *reply;
+	ssize_t sent;
+	int ret = 0;
+
+	reply = calloc(1, reply_sz);
+	if (!reply) {
+		fprintf(stderr, "%s: alloc send file reply: %s\n",
+			mo->msgtag, strerror(errno));
+		return -1;
+	}
+	reply->p.magic = htonl(FUSE_SERVICE_OPEN_REPLY);
+	reply->error = 0;
+	strcpy(reply->path, path);
+
+	sent = __send_fd(mo, reply, reply_sz, fd);
+	if (sent < 0) {
+		fprintf(stderr, "%s: send file reply: %s\n",
+			mo->msgtag, strerror(errno));
+		ret = -1;
+	} else if (sent < reply_sz) {
+		fprintf(stderr, "%s: send file reply wrote %zd, expected %zu\n",
+			mo->msgtag, sent, reply_sz);
+		ret = -1;
+	}
+
+	free(reply);
+	return ret;
+}
+
+/*
+ * Send an open reply for @path that carries @error and no file descriptor.
+ * Returns 0 on success, -1 on allocation failure or a failed/short send.
+ */
+static int mount_service_send_file_error(struct mount_service *mo, int error,
+					 const char *path)
+{
+	const size_t reply_sz =
+			sizeof_fuse_service_requested_file(strlen(path));
+	struct fuse_service_requested_file *reply;
+	ssize_t sent;
+	int ret = 0;
+
+	reply = calloc(1, reply_sz);
+	if (!reply) {
+		fprintf(stderr, "%s: alloc send file error: %s\n",
+			mo->msgtag, strerror(errno));
+		return -1;
+	}
+	reply->p.magic = htonl(FUSE_SERVICE_OPEN_REPLY);
+	reply->error = htonl(error);
+	strcpy(reply->path, path);
+
+	sent = __send_packet(mo, reply, reply_sz);
+	if (sent < 0) {
+		fprintf(stderr, "%s: send file error: %s\n",
+			mo->msgtag, strerror(errno));
+		ret = -1;
+	} else if (sent < reply_sz) {
+		fprintf(stderr, "%s: send file error wrote %zd, expected %zu\n",
+			mo->msgtag, sent, reply_sz);
+		ret = -1;
+	}
+
+	free(reply);
+	return ret;
+}
+
+/*
+ * Open @fusedev and hand the fuse server the two files it always gets: the
+ * argv memfd, then the fuse device.  The argv memfd is closed once it has
+ * been sent.  If sending the argv memfd fails, the fuse device is closed
+ * again.  Returns 0 on success, nonzero on failure.
+ */
+static int mount_service_send_required_files(struct mount_service *mo,
+					     const char *fusedev)
+{
+	int ret;
+
+	mo->fusedevfd = open(fusedev, O_RDWR | O_CLOEXEC);
+	if (mo->fusedevfd < 0) {
+		fprintf(stderr, "%s: %s: %s\n",
+			mo->msgtag, fusedev, strerror(errno));
+		return -1;
+	}
+
+	ret = mount_service_send_file(mo, FUSE_SERVICE_ARGV, mo->argvfd);
+	if (ret) {
+		close(mo->fusedevfd);
+		mo->fusedevfd = -1;
+		return ret;
+	}
+
+	/* The server has its own copy of the arguments now. */
+	close(mo->argvfd);
+	mo->argvfd = -1;
+
+	return mount_service_send_file(mo, FUSE_SERVICE_FUSEDEV,
+				       mo->fusedevfd);
+}
+
+/*
+ * Receive the next command packet from the fuse server.  The packet size is
+ * peeked first and must lie between sizeof(struct fuse_service_packet) and
+ * FUSE_SERVICE_MAX_CMD_SIZE.  On success, *@commandp points to a heap buffer
+ * holding the packet (allocated one byte larger than the packet and zeroed
+ * beforehand) that the caller must free, and *@commandsz is the packet size.
+ * Returns 0 on success, -1 on failure.
+ */
+static int mount_service_receive_command(struct mount_service *mo,
+					 struct fuse_service_packet **commandp,
+					 size_t *commandsz)
+{
+	struct fuse_service_packet *pkt;
+	ssize_t peeked;
+	ssize_t got;
+
+	peeked = __recv_packet_size(mo);
+	if (peeked < 0) {
+		fprintf(stderr, "%s: peek service command: %s\n",
+			mo->msgtag, strerror(errno));
+		return -1;
+	}
+	if (peeked == 0) {
+		/* fuse server probably exited early */
+		fprintf(stderr, "%s: fuse server exited without saying goodbye!\n",
+			mo->msgtag);
+		return -1;
+	}
+	if (peeked < sizeof(struct fuse_service_packet)) {
+		fprintf(stderr, "%s: wrong command packet size %zd, expected at least %zu\n",
+			mo->msgtag, peeked,
+			sizeof(struct fuse_service_packet));
+		return -1;
+	}
+	if (peeked > FUSE_SERVICE_MAX_CMD_SIZE) {
+		fprintf(stderr, "%s: wrong command packet size %zd, expected less than %d\n",
+			mo->msgtag, peeked, FUSE_SERVICE_MAX_CMD_SIZE);
+		return -1;
+	}
+
+	pkt = calloc(1, peeked + 1);
+	if (!pkt) {
+		fprintf(stderr, "%s: alloc service command: %s\n",
+			mo->msgtag, strerror(errno));
+		return -1;
+	}
+
+	got = __recv_packet(mo, pkt, peeked);
+	if (got < 0) {
+		fprintf(stderr, "%s: receive service command: %s\n",
+			mo->msgtag, strerror(errno));
+		free(pkt);
+		return -1;
+	}
+	if (got != peeked) {
+		fprintf(stderr, "%s: wrong service command size %zd, expected %zd\n",
+			mo->msgtag, got, peeked);
+		free(pkt);
+		return -1;
+	}
+
+	*commandsz = got;
+	*commandp = pkt;
+	return 0;
+}
+
+/*
+ * Send a simple reply carrying @error (converted with htonl) to the fuse
+ * server.  Returns 0 if the packet was sent, -1 if sending failed.
+ */
+static int mount_service_send_reply(struct mount_service *mo, int error)
+{
+	struct fuse_service_simple_reply msg = {
+		.p.magic = htonl(FUSE_SERVICE_SIMPLE_REPLY),
+		.error = htonl(error),
+	};
+
+	if (__send_packet(mo, &msg, sizeof(msg)) >= 0)
+		return 0;
+
+	fprintf(stderr, "%s: send service reply: %s\n",
+		mo->msgtag, strerror(errno));
+	return -1;
+}
+
+/*
+ * Check that @fd refers to a block device and, if the open command asks for
+ * a block size, apply it with BLKBSZSET.  Returns 0 on success or a negative
+ * errno value on failure.
+ */
+static int prepare_bdev(struct mount_service *mo,
+			struct fuse_service_open_command *oc, int fd)
+{
+	struct stat sb;
+	int block_size;
+	int error;
+
+	if (fstat(fd, &sb)) {
+		error = errno;
+		fprintf(stderr, "%s: %s: %s\n",
+			mo->msgtag, oc->path, strerror(error));
+		return -error;
+	}
+
+	if (!S_ISBLK(sb.st_mode)) {
+		fprintf(stderr, "%s: %s: %s\n",
+			mo->msgtag, oc->path, strerror(ENOTBLK));
+		return -ENOTBLK;
+	}
+
+	/* Zero means the server did not ask for a particular block size. */
+	if (!oc->block_size)
+		return 0;
+
+	block_size = ntohl(oc->block_size);
+	if (ioctl(fd, BLKBSZSET, &block_size)) {
+		error = errno;
+		fprintf(stderr, "%s: %s: %s\n",
+			mo->msgtag, oc->path, strerror(error));
+		return -error;
+	}
+
+	return 0;
+}
+
+/*
+ * Open the path named in an open command on behalf of the fuse server and
+ * send back either the resulting fd or an error reply.
+ *
+ * @expected_fmt: pass S_IFBLK to additionally require a block device (and
+ *		  apply the requested block size); any non-block value skips
+ *		  that check.
+ * @p, @psz:	  the received packet and its size.
+ *
+ * Returns the result of sending the reply; a failed open is reported to the
+ * server rather than returned to the caller.
+ */
+static int mount_service_open_path(struct mount_service *mo,
+				   mode_t expected_fmt,
+				   struct fuse_service_packet *p, size_t psz)
+{
+	struct fuse_service_open_command *oc =
+			container_of(p, struct fuse_service_open_command, p);
+	uint32_t request_flags;
+	int open_flags;
+	int ret;
+	int fd;
+
+	/* Validate the packet before touching oc->path at all. */
+	if (psz < sizeof_fuse_service_open_command(1)) {
+		fprintf(stderr, "%s: open command too small\n",
+			mo->msgtag);
+		return mount_service_send_file_error(mo, EINVAL, "?");
+	}
+
+	if (!check_null_endbyte(p, psz)) {
+		fprintf(stderr, "%s: open command must be null terminated\n",
+			mo->msgtag);
+		return mount_service_send_file_error(mo, EINVAL, "?");
+	}
+
+	request_flags = ntohl(oc->request_flags);
+	if (request_flags & ~FUSE_SERVICE_OPEN_FLAGS) {
+		fprintf(stderr, "%s: open flags 0x%x not recognized\n",
+			mo->msgtag, request_flags & ~FUSE_SERVICE_OPEN_FLAGS);
+		return mount_service_send_file_error(mo, EINVAL, oc->path);
+	}
+
+	/* O_CLOEXEC is always added to the flags the server asked for. */
+	open_flags = ntohl(oc->open_flags) | O_CLOEXEC;
+	fd = open(oc->path, open_flags, ntohl(oc->create_mode));
+	if (fd < 0) {
+		int error = errno;
+
+		/*
+		 * Don't print a busy device error report because the
+		 * filesystem might decide to retry.
+		 */
+		if (error != EBUSY && !(request_flags & FUSE_SERVICE_OPEN_QUIET))
+			fprintf(stderr, "%s: %s: %s\n",
+				mo->msgtag, oc->path, strerror(error));
+		return mount_service_send_file_error(mo, error, oc->path);
+	}
+
+	if (S_ISBLK(expected_fmt)) {
+		ret = prepare_bdev(mo, oc, fd);
+		if (ret < 0) {
+			close(fd);
+			return mount_service_send_file_error(mo, -ret,
+							     oc->path);
+		}
+	}
+
+	/* Our copy of the fd is closed once the reply has been attempted. */
+	ret = mount_service_send_file(mo, oc->path, fd);
+	close(fd);
+	return ret;
+}
+
+/* Handle an open command with no requirement on the file type. */
+static int mount_service_handle_open_cmd(struct mount_service *mo,
+					 struct fuse_service_packet *p,
+					 size_t psz)
+{
+	const mode_t any_fmt = 0;
+
+	return mount_service_open_path(mo, any_fmt, p, psz);
+}
+
+/* Handle an open command whose target must be a block device. */
+static int mount_service_handle_open_bdev_cmd(struct mount_service *mo,
+					      struct fuse_service_packet *p,
+					      size_t psz)
+{
+	const mode_t bdev_fmt = S_IFBLK;
+
+	return mount_service_open_path(mo, bdev_fmt, p, psz);
+}
+
+/* Kernel filesystem type for this mount: "fuseblk" or plain "fuse". */
+static inline const char *fsname(const struct mount_service *mo)
+{
+	if (mo->fuseblk)
+		return "fuseblk";
+	return "fuse";
+}
+
+/*
+ * Handle the fsopen command, which may be given only once.  A request for
+ * fuseblk is refused with EPERM unless the real uid is 0.  The outcome is
+ * sent to the fuse server; the return value is that of sending the reply.
+ */
+static int mount_service_handle_fsopen_cmd(struct mount_service *mo,
+					   const struct fuse_service_packet *p,
+					   size_t psz)
+{
+	struct fuse_service_fsopen_command *cmd =
+			container_of(p, struct fuse_service_fsopen_command, p);
+	uint32_t flags;
+	uint32_t unknown;
+
+	if (psz != sizeof(struct fuse_service_fsopen_command)) {
+		fprintf(stderr, "%s: fsopen command wrong size %zu, expected %zu\n",
+			mo->msgtag, psz, sizeof(*cmd));
+		return mount_service_send_reply(mo, EINVAL);
+	}
+
+	if (mo->fsopened) {
+		fprintf(stderr, "%s: fsopen command respecified\n",
+			mo->msgtag);
+		return mount_service_send_reply(mo, EINVAL);
+	}
+
+	flags = ntohl(cmd->fsopen_flags);
+	unknown = flags & ~FUSE_SERVICE_FSOPEN_FLAGS;
+	if (unknown) {
+		fprintf(stderr, "%s: unknown fsopen flags, 0x%x\n",
+			mo->msgtag, unknown);
+		return mount_service_send_reply(mo, EINVAL);
+	}
+
+	if (flags & FUSE_SERVICE_FSOPEN_FUSEBLK) {
+		if (getuid() != 0) {
+			fprintf(stderr, "%s: fuseblk requires root privilege\n",
+				mo->msgtag);
+			return mount_service_send_reply(mo, EPERM);
+		}
+
+		mo->fuseblk = true;
+	}
+
+	mo->fsopened = true;
+	return mount_service_send_reply(mo, 0);
+}
+
+/*
+ * Handle the command carrying the mount source string.  The string is copied
+ * into mo->source; a second source command is rejected.  The outcome is sent
+ * to the fuse server; the return value is that of sending the reply.
+ */
+static int mount_service_handle_source_cmd(struct mount_service *mo,
+					   const struct fuse_service_packet *p,
+					   size_t psz)
+{
+	struct fuse_service_string_command *cmd =
+			container_of(p, struct fuse_service_string_command, p);
+	int error = 0;
+
+	if (psz < sizeof_fuse_service_string_command(1)) {
+		fprintf(stderr, "%s: source command too small\n",
+			mo->msgtag);
+		error = EINVAL;
+	} else if (!check_null_endbyte(p, psz)) {
+		fprintf(stderr, "%s: source command must be null terminated\n",
+			mo->msgtag);
+		error = EINVAL;
+	} else if (mo->source) {
+		fprintf(stderr, "%s: source respecified!\n",
+			mo->msgtag);
+		error = EINVAL;
+	} else {
+		mo->source = strdup(cmd->value);
+		if (!mo->source) {
+			error = errno;
+			fprintf(stderr, "%s: alloc source string: %s\n",
+				mo->msgtag, strerror(error));
+		}
+	}
+
+	return mount_service_send_reply(mo, error);
+}
+
+/*
+ * Handle the command carrying the mount options string.  The string is
+ * copied into mo->mntopts; a second mount options command is rejected.  The
+ * outcome is sent to the fuse server; the return value is that of sending the
+ * reply.
+ */
+static int mount_service_handle_mntopts_cmd(struct mount_service *mo,
+					    const struct fuse_service_packet *p,
+					    size_t psz)
+{
+	struct fuse_service_string_command *cmd =
+			container_of(p, struct fuse_service_string_command, p);
+	int error = 0;
+
+	if (psz < sizeof_fuse_service_string_command(1)) {
+		fprintf(stderr, "%s: mount options command too small\n",
+			mo->msgtag);
+		error = EINVAL;
+	} else if (!check_null_endbyte(p, psz)) {
+		fprintf(stderr, "%s: mount options command must be null terminated\n",
+			mo->msgtag);
+		error = EINVAL;
+	} else if (mo->mntopts) {
+		fprintf(stderr, "%s: mount options respecified!\n",
+			mo->msgtag);
+		error = EINVAL;
+	} else {
+		mo->mntopts = strdup(cmd->value);
+		if (!mo->mntopts) {
+			error = errno;
+			fprintf(stderr, "%s: alloc mount options string: %s\n",
+				mo->msgtag, strerror(error));
+		}
+	}
+
+	return mount_service_send_reply(mo, error);
+}
+
+/*
+ * Open and validate the mountpoint @mntpt, then record it in @mo.
+ *
+ * @expected_fmt: S_IFDIR or S_IFREG to require that file type; any other
+ *		  value accepts either a directory or a regular file.
+ * @mntpt:	  heap-allocated path.  Ownership passes to this function: it
+ *		  is stored in mo->mountpoint on success and freed on failure.
+ *
+ * For a directory we fchdir() onto it and mount on "."; for a regular file we
+ * mount on the /dev/fd path of the open descriptor.  The outcome is sent to
+ * the fuse server; the return value is that of sending the reply.
+ */
+static int attach_to_mountpoint(struct mount_service *mo, mode_t expected_fmt,
+				char *mntpt)
+{
+	struct stat stbuf;
+	char *res_mntpt;
+	int mountfd = -1;
+	int error;
+	int ret;
+
+	/*
+	 * Open the alleged mountpoint, make sure it's a dir or a file.
+	 */
+	mountfd = open(mntpt, O_RDONLY | O_CLOEXEC);
+	if (mountfd < 0) {
+		error = errno;
+		fprintf(stderr, "%s: %s: %s\n", mo->msgtag, mntpt,
+			strerror(error));
+		goto out_error;
+	}
+
+	/*
+	 * Make sure we can access the mountpoint and that it's either a
+	 * directory or a regular file.  Linux can handle mounting atop special
+	 * files, but we don't care to do such crazy things.
+	 */
+	ret = fstat(mountfd, &stbuf);
+	if (ret) {
+		error = errno;
+		fprintf(stderr, "%s: %s: %s\n", mo->msgtag, mntpt,
+			strerror(error));
+		goto out_mountfd;
+	}
+
+	if (!S_ISDIR(stbuf.st_mode) && !S_ISREG(stbuf.st_mode)) {
+		error = EACCES;
+		fprintf(stderr, "%s: %s: Mount point must be directory or regular file.\n",
+			mo->msgtag, mntpt);
+		goto out_mountfd;
+	}
+
+	/*
+	 * Resolve the (possibly relative) mountpoint path before chdir'ing
+	 * onto it.
+	 */
+	res_mntpt = fuse_mnt_resolve_path(mo->msgtag, mntpt);
+	if (!res_mntpt) {
+		error = EACCES;
+		fprintf(stderr, "%s: %s: Could not resolve path to mount point.\n",
+			mo->msgtag, mntpt);
+		goto out_mountfd;
+	}
+
+	/* Make sure the mountpoint type matches what the caller wanted */
+	switch (expected_fmt) {
+	case S_IFDIR:
+		if (!S_ISDIR(stbuf.st_mode)) {
+			error = ENOTDIR;
+			fprintf(stderr, "%s: %s: %s\n",
+				mo->msgtag, mntpt, strerror(error));
+			goto out_res_mntpt;
+		}
+		break;
+	case S_IFREG:
+		if (!S_ISREG(stbuf.st_mode)) {
+			error = EISDIR;
+			fprintf(stderr, "%s: %s: %s\n",
+				mo->msgtag, mntpt, strerror(error));
+			goto out_res_mntpt;
+		}
+		break;
+	}
+
+	switch (stbuf.st_mode & S_IFMT) {
+	case S_IFREG:
+		/*
+		 * This is a regular file, so we point mount() at the open file
+		 * descriptor.  If asprintf() fails the pointer it was given is
+		 * left in an unspecified state, so reset it to NULL for the
+		 * allocation check below.
+		 */
+		if (asprintf(&mo->real_mountpoint, "/dev/fd/%d", mountfd) < 0)
+			mo->real_mountpoint = NULL;
+		break;
+	case S_IFDIR:
+		/*
+		 * Pin the mount so it can't go anywhere.  This only works for
+		 * directories, which is fortunately the common case.
+		 */
+		ret = fchdir(mountfd);
+		if (ret) {
+			error = errno;
+			fprintf(stderr, "%s: %s: %s\n", mo->msgtag, mntpt,
+				strerror(error));
+			goto out_res_mntpt;
+		}
+
+		/*
+		 * Now that we're sitting on the mountpoint directory, we can
+		 * pass "." to mount() and avoid races with directory tree
+		 * mutations.
+		 */
+		mo->real_mountpoint = strdup(".");
+		break;
+	default:
+		/* Should never get here */
+		error = EINVAL;
+		goto out_res_mntpt;
+	}
+	if (!mo->real_mountpoint) {
+		error = ENOMEM;
+		fprintf(stderr, "%s: %s: %s\n", mo->msgtag, mntpt,
+			strerror(error));
+		goto out_res_mntpt;
+	}
+
+	mo->mountpoint = mntpt;
+	mo->mountfd = mountfd;
+	mo->resv_mountpoint = res_mntpt;
+
+	return mount_service_send_reply(mo, 0);
+
+out_res_mntpt:
+	free(res_mntpt);
+out_mountfd:
+	close(mountfd);
+out_error:
+	free(mntpt);
+	return mount_service_send_reply(mo, error);
+}
+
+/*
+ * Handle the command that names the mountpoint.
+ *
+ * The packet is validated (size, NUL termination, zero padding), the
+ * mountpoint may be given only once, the expected file format must be
+ * S_IFDIR, S_IFREG or 0, and the path must match one of the strings in @argv
+ * exactly.  A copy of the path is then handed to attach_to_mountpoint(),
+ * which takes ownership of it.
+ *
+ * Every outcome is sent to the fuse server; the return value is that of
+ * sending the reply.
+ */
+static int mount_service_handle_mountpoint_cmd(struct mount_service *mo,
+					       const struct fuse_service_packet *p,
+					       size_t psz, int argc, char *argv[])
+{
+	struct fuse_service_mountpoint_command *oc =
+			container_of(p, struct fuse_service_mountpoint_command, p);
+	char *mntpt;
+	mode_t expected_fmt;
+	bool foundit = false;
+	int i;
+
+	if (psz < sizeof_fuse_service_mountpoint_command(1)) {
+		fprintf(stderr, "%s: mount point command too small\n",
+			mo->msgtag);
+		return mount_service_send_reply(mo, EINVAL);
+	}
+
+	if (!check_null_endbyte(p, psz)) {
+		fprintf(stderr, "%s: mount point command must be null terminated\n",
+			mo->msgtag);
+		return mount_service_send_reply(mo, EINVAL);
+	}
+
+	if (oc->padding) {
+		fprintf(stderr, "%s: nonzero value in padding field\n",
+			mo->msgtag);
+		return mount_service_send_reply(mo, EINVAL);
+	}
+
+	if (mo->mountpoint) {
+		fprintf(stderr, "%s: mount point respecified!\n",
+			mo->msgtag);
+		return mount_service_send_reply(mo, EINVAL);
+	}
+
+	/* Make sure the mountpoint file format matches what the caller wanted */
+	expected_fmt = ntohs(oc->expected_fmt);
+	switch (expected_fmt) {
+	case S_IFDIR:
+	case S_IFREG:
+	case 0:
+		break;
+	default:
+		fprintf(stderr, "%s: %s: weird expected format 0%o\n",
+			mo->msgtag, oc->value, expected_fmt);
+		return mount_service_send_reply(mo, EINVAL);
+	}
+
+	/* Mountpoint must be mentioned in the caller's argument list */
+	for (i = 0; i < argc; i++) {
+		if (!strcmp(argv[i], oc->value)) {
+			foundit = true;
+			break;
+		}
+	}
+	if (!foundit) {
+		fprintf(stderr, "%s: mount point must be in command line arguments\n",
+			mo->msgtag);
+		return mount_service_send_reply(mo, EINVAL);
+	}
+
+	/* attach_to_mountpoint() owns this copy from here on. */
+	mntpt = strdup(oc->value);
+	if (!mntpt) {
+		int error = errno;
+
+		fprintf(stderr, "%s: alloc mount point string: %s\n",
+			mo->msgtag, strerror(error));
+		return mount_service_send_reply(mo, error);
+	}
+
+	return attach_to_mountpoint(mo, expected_fmt, mntpt);
+}
+
+/*
+ * Format the option string handed to mount(): the server-supplied options
+ * (if any) followed by fd=, rootmode=, user_id= and group_id=.  Follows
+ * snprintf() conventions, so calling it with a NULL @buf and zero @bufsz
+ * yields the length needed.
+ */
+static inline int format_libfuse_mntopts(char *buf, size_t bufsz,
+					 const struct mount_service *mo,
+					 const struct stat *stbuf)
+{
+	if (!mo->mntopts)
+		return snprintf(buf, bufsz,
+				"fd=%i,rootmode=%o,user_id=%u,group_id=%u",
+				mo->fusedevfd, stbuf->st_mode & S_IFMT,
+				getuid(), getgid());
+
+	return snprintf(buf, bufsz,
+			"%s,fd=%i,rootmode=%o,user_id=%u,group_id=%u",
+			mo->mntopts, mo->fusedevfd,
+			stbuf->st_mode & S_IFMT,
+			getuid(), getgid());
+}
+
+/*
+ * Perform the mount with the classic mount() system call.
+ *
+ * Builds the option string from mo->mntopts plus the fd/rootmode/uid/gid
+ * options, builds the "<fsname>.<subtype>" filesystem type, and mounts
+ * mo->source on mo->real_mountpoint with the flags from @oc.  Sets
+ * mo->mounted on success.  The outcome is sent to the fuse server; the return
+ * value is that of sending the reply.
+ */
+static int mount_service_regular_mount(struct mount_service *mo,
+				       struct fuse_service_mount_command *oc,
+				       struct stat *stbuf)
+{
+	char *fstype = NULL;
+	char *realmopts;
+	int ret;
+
+	/* Compute the amount of buffer space needed for the mount options */
+	ret = format_libfuse_mntopts(NULL, 0, mo, stbuf);
+	if (ret < 0) {
+		int error = errno;
+
+		fprintf(stderr, "%s: mount option preformatting: %s\n",
+			mo->msgtag, strerror(error));
+		return mount_service_send_reply(mo, error);
+	}
+
+	realmopts = calloc(1, ret + 1);
+	if (!realmopts) {
+		int error = errno;
+
+		fprintf(stderr, "%s: alloc real mount options string: %s\n",
+			mo->msgtag, strerror(error));
+		return mount_service_send_reply(mo, error);
+	}
+
+	ret = format_libfuse_mntopts(realmopts, ret + 1, mo, stbuf);
+	if (ret < 0) {
+		int error = errno;
+
+		fprintf(stderr, "%s: mount options formatting: %s\n",
+			mo->msgtag, strerror(error));
+		ret = mount_service_send_reply(mo, error);
+		goto out_realmopts;
+	}
+
+	/*
+	 * Check the return value: after a failed asprintf() the output pointer
+	 * is in an unspecified state, so it cannot be tested for NULL.  Never
+	 * report a zero error, because the server would take that as success.
+	 */
+	if (asprintf(&fstype, "%s.%s", fsname(mo), mo->subtype) < 0) {
+		int error = errno ? errno : ENOMEM;
+
+		fstype = NULL;
+		fprintf(stderr, "%s: mount fstype formatting: %s\n",
+			mo->msgtag, strerror(error));
+		ret = mount_service_send_reply(mo, error);
+		goto out_realmopts;
+	}
+
+	ret = mount(mo->source, mo->real_mountpoint, fstype,
+		    ntohl(oc->ms_flags), realmopts);
+	if (ret) {
+		int error = errno;
+
+		fprintf(stderr, "%s: mount: %s\n",
+			mo->msgtag, strerror(error));
+		ret = mount_service_send_reply(mo, error);
+		goto out_fstype;
+	}
+
+	mo->mounted = true;
+	ret = mount_service_send_reply(mo, 0);
+out_fstype:
+	free(fstype);
+out_realmopts:
+	free(realmopts);
+	return ret;
+}
+
+/*
+ * Handle a mount command packet.
+ *
+ * Checks the packet size, requires that both a mount source and a mount
+ * point have been recorded in mo, stats the mountpoint fd, and then hands
+ * off to mount_service_regular_mount().  Any check that fails is logged and
+ * answered with an error reply instead.
+ */
+static int mount_service_handle_mount_cmd(struct mount_service *mo,
+					  struct fuse_service_packet *p,
+					  size_t psz)
+{
+	struct fuse_service_mount_command *cmd =
+			container_of(p, struct fuse_service_mount_command, p);
+	struct stat mnt_stat;
+	int error;
+
+	if (psz != sizeof(*cmd)) {
+		fprintf(stderr, "%s: mount command wrong size %zu, expected %zu\n",
+			mo->msgtag, psz, sizeof(*cmd));
+		error = EINVAL;
+		goto fail;
+	}
+
+	if (!mo->source) {
+		fprintf(stderr, "%s: missing mount source parameter\n",
+			mo->msgtag);
+		error = EINVAL;
+		goto fail;
+	}
+
+	if (!mo->mountpoint) {
+		fprintf(stderr, "%s: missing mount point parameter\n",
+			mo->msgtag);
+		error = EINVAL;
+		goto fail;
+	}
+
+	/*
+	 * Call fstat again because access modes might have changed since we
+	 * validated the file type.  This is still racy with mount since we
+	 * don't lock the path target.
+	 */
+	if (fstat(mo->mountfd, &mnt_stat) < 0) {
+		error = errno;
+		fprintf(stderr, "%s: %s: %s\n",
+			mo->msgtag, mo->mountpoint, strerror(error));
+		goto fail;
+	}
+
+	return mount_service_regular_mount(mo, cmd, &mnt_stat);
+
+fail:
+	return mount_service_send_reply(mo, error);
+}
+
+/*
+ * Handle an unmount command packet.
+ *
+ * Refuses to act unless a mount has succeeded.  Otherwise it changes
+ * directory to "/", closes the mountpoint fd, and lazily detaches
+ * mo->resv_mountpoint.  The status sent in the reply is zero on success or
+ * the errno/EINVAL of the step that failed.
+ */
+static int mount_service_handle_unmount_cmd(struct mount_service *mo,
+					    struct fuse_service_packet *p,
+					    size_t psz)
+{
+	int error = 0;
+
+	(void)p;
+
+	if (psz != sizeof(struct fuse_service_unmount_command)) {
+		fprintf(stderr, "%s: unmount command wrong size %zu, expected %zu\n",
+			mo->msgtag, psz, sizeof(struct fuse_service_unmount_command));
+		error = EINVAL;
+		goto reply;
+	}
+
+	if (!mo->mounted) {
+		fprintf(stderr, "%s: will not umount before successful mount!\n",
+			mo->msgtag);
+		error = EINVAL;
+		goto reply;
+	}
+
+	if (chdir("/")) {
+		error = errno;
+		fprintf(stderr, "%s: fuse server failed chdir: %s\n",
+			mo->msgtag, strerror(error));
+		goto reply;
+	}
+
+	/* The mountpoint fd is dropped before the unmount is attempted */
+	close(mo->mountfd);
+	mo->mountfd = -1;
+
+	/*
+	 * Try to unmount the resolved mountpoint, and hope that we're not the
+	 * victim of a race.
+	 */
+	if (umount2(mo->resv_mountpoint, MNT_DETACH)) {
+		error = errno;
+		fprintf(stderr, "%s: fuse server failed unmount: %s\n",
+			mo->msgtag, strerror(error));
+		goto reply;
+	}
+
+	mo->mounted = false;
+reply:
+	return mount_service_send_reply(mo, error);
+}
+
+/*
+ * Handle a bye command packet.
+ *
+ * A packet of the wrong size is answered with an EINVAL reply.  Otherwise
+ * the exit code carried in the packet is converted from network byte order
+ * and returned; a nonzero code also logs a failure message.
+ */
+static int mount_service_handle_bye_cmd(struct mount_service *mo,
+					struct fuse_service_packet *p,
+					size_t psz)
+{
+	struct fuse_service_bye_command *bye =
+			container_of(p, struct fuse_service_bye_command, p);
+	int exitcode;
+
+	if (psz != sizeof(*bye)) {
+		fprintf(stderr, "%s: bye command wrong size %zu, expected %zu\n",
+			mo->msgtag, psz, sizeof(*bye));
+		return mount_service_send_reply(mo, EINVAL);
+	}
+
+	exitcode = ntohl(bye->exitcode);
+	if (exitcode != 0) {
+		fprintf(stderr, "%s: fuse server failed mount, check dmesg/logs for details.\n",
+			mo->msgtag);
+	}
+
+	return exitcode;
+}
+
+/*
+ * Release everything held by a mount_service: close the mountpoint, fuse
+ * device and argv fds, shut down and close the socket, free the strings it
+ * holds, then zero the structure and mark all four fds as -1.
+ */
+static void mount_service_destroy(struct mount_service *mo)
+{
+	const int plain_fds[] = { mo->mountfd, mo->fusedevfd, mo->argvfd };
+	void *const owned[] = {
+		mo->source, mo->mountpoint, mo->real_mountpoint,
+		mo->resv_mountpoint, mo->mntopts, mo->subtype,
+	};
+	size_t i;
+
+	for (i = 0; i < sizeof(plain_fds) / sizeof(plain_fds[0]); i++)
+		close(plain_fds[i]);
+
+	/* The socket is shut down in both directions before it is closed */
+	shutdown(mo->sockfd, SHUT_RDWR);
+	close(mo->sockfd);
+
+	for (i = 0; i < sizeof(owned) / sizeof(owned[0]); i++)
+		free(owned[i]);
+
+	/* Zeroing would leave the fds at 0, so reset them to -1 afterwards */
+	memset(mo, 0, sizeof(*mo));
+	mo->mountfd = -1;
+	mo->fusedevfd = -1;
+	mo->argvfd = -1;
+	mo->sockfd = -1;
+}
+
+/*
+ * Entry point of the mount service helper.
+ *
+ * After argument checking and mount_service_init(), this connects to the
+ * service, runs the hello / argument capture / required-files steps, and
+ * then dispatches received command packets by their magic value until a bye
+ * command arrives or a step fails.
+ *
+ * Returns EXIT_FAILURE on any error, MOUNT_SERVICE_FALLBACK_NEEDED when
+ * mount_service_connect() returns that value, and otherwise whatever the bye
+ * command handler returns.
+ */
+int mount_service_main(int argc, char *argv[])
+{
+	const char *fusedev = fuse_mnt_get_devname();
+	struct mount_service mo = { };
+	int ret;
+
+	/*
+	 * Pick the message tag before anything is printed: argv[0] can be
+	 * NULL when argc is zero, and passing NULL for %s is undefined.
+	 */
+	if (argc > 0 && argv[0])
+		mo.msgtag = argv[0];
+	else
+		mo.msgtag = "mount.service";
+
+	if (argc < 3 || !strcmp(argv[1], "--help")) {
+		printf("Usage: %s source mountpoint -t type [-o options]\n",
+				mo.msgtag);
+		return EXIT_FAILURE;
+	}
+
+	ret = mount_service_init(&mo, argc, argv);
+	if (ret)
+		return EXIT_FAILURE;
+
+	ret = mount_service_connect(&mo);
+	if (ret == MOUNT_SERVICE_FALLBACK_NEEDED)
+		goto out;
+	if (ret) {
+		ret = EXIT_FAILURE;
+		goto out;
+	}
+
+	ret = mount_service_send_hello(&mo);
+	if (ret) {
+		ret = EXIT_FAILURE;
+		goto out;
+	}
+
+	ret = mount_service_capture_args(&mo, argc, argv);
+	if (ret) {
+		ret = EXIT_FAILURE;
+		goto out;
+	}
+
+	ret = mount_service_send_required_files(&mo, fusedev);
+	if (ret) {
+		ret = EXIT_FAILURE;
+		goto out;
+	}
+
+	/* The loop only ends by jumping to out: bye command or a failure */
+	for (;;) {
+		struct fuse_service_packet *p = NULL;
+		size_t sz;
+
+		ret = mount_service_receive_command(&mo, &p, &sz);
+		if (ret) {
+			ret = EXIT_FAILURE;
+			goto out;
+		}
+
+		switch (ntohl(p->magic)) {
+		case FUSE_SERVICE_OPEN_CMD:
+			ret = mount_service_handle_open_cmd(&mo, p, sz);
+			break;
+		case FUSE_SERVICE_OPEN_BDEV_CMD:
+			ret = mount_service_handle_open_bdev_cmd(&mo, p, sz);
+			break;
+		case FUSE_SERVICE_FSOPEN_CMD:
+			ret = mount_service_handle_fsopen_cmd(&mo, p, sz);
+			break;
+		case FUSE_SERVICE_SOURCE_CMD:
+			ret = mount_service_handle_source_cmd(&mo, p, sz);
+			break;
+		case FUSE_SERVICE_MNTOPTS_CMD:
+			ret = mount_service_handle_mntopts_cmd(&mo, p, sz);
+			break;
+		case FUSE_SERVICE_MNTPT_CMD:
+			ret = mount_service_handle_mountpoint_cmd(&mo, p, sz,
+								  argc, argv);
+			break;
+		case FUSE_SERVICE_MOUNT_CMD:
+			ret = mount_service_handle_mount_cmd(&mo, p, sz);
+			break;
+		case FUSE_SERVICE_UNMOUNT_CMD:
+			ret = mount_service_handle_unmount_cmd(&mo, p, sz);
+			break;
+		case FUSE_SERVICE_BYE_CMD:
+			/* The bye handler's return value is passed through as is */
+			ret = mount_service_handle_bye_cmd(&mo, p, sz);
+			free(p);
+			goto out;
+		default:
+			fprintf(stderr, "%s: unrecognized packet 0x%x\n",
+				mo.msgtag, ntohl(p->magic));
+			ret = EXIT_FAILURE;
+			break;
+		}
+		free(p);
+
+		/* Any nonzero handler result ends the session as a failure */
+		if (ret) {
+			ret = EXIT_FAILURE;
+			goto out;
+		}
+	}
+
+out:
+	mount_service_destroy(&mo);
+	return ret;
+}


^ permalink raw reply related

* [PATCH 01/13] Refactor mount code / move common functions to mount_util.c
From: Darrick J. Wong @ 2026-04-30 21:15 UTC (permalink / raw)
  To: bernd, djwong
  Cc: linux-fsdevel, fuse-devel, linux-ext4, miklos, neal, joannelkoong
In-Reply-To: <177758363484.1314717.11777978893472254088.stgit@frogsfrogsfrogs>

From: Bernd Schubert <bernd@bsbernd.com>

Also create the new "mount_i.h", which is independent of the
rest of libfuse.

This is preparation for the new mount API, which goes into a new file.

This is to allow fusermount to use the code from mount_fsmount.c,
i.e. avoid code duplication and just re-use the new Linux mount API
functions from that file for fusermount.

Signed-off-by: Bernd Schubert <bernd@bsbernd.com>
[djwong: extract only the parts we need for mount services]
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 lib/mount_common_i.h |   18 +++++++++++++++++
 lib/mount_util.h     |    8 ++++++++
 lib/mount.c          |   53 ++++++++++++++++++++++++++++++++++++--------------
 lib/mount_util.c     |    9 ++++++++
 util/fusermount.c    |    5 +----
 5 files changed, 74 insertions(+), 19 deletions(-)
 create mode 100644 lib/mount_common_i.h


diff --git a/lib/mount_common_i.h b/lib/mount_common_i.h
new file mode 100644
index 00000000000000..6bcb055ff1c23f
--- /dev/null
+++ b/lib/mount_common_i.h
@@ -0,0 +1,18 @@
+/*
+ *  FUSE: Filesystem in Userspace
+ *  Copyright (C) 2001-2007  Miklos Szeredi <miklos@szeredi.hu>
+ *                2026 Bernd Schubert <bernd@bsbernd.com>
+ *
+ *  This program can be distributed under the terms of the GNU LGPLv2.
+ *  See the file LGPL2.txt
+ */
+
+#ifndef FUSE_MOUNT_COMMON_I_H_
+#define FUSE_MOUNT_COMMON_I_H_
+
+struct mount_opts;
+
+char *fuse_mnt_build_source(const struct mount_opts *mo);
+char *fuse_mnt_build_type(const struct mount_opts *mo);
+
+#endif /* FUSE_MOUNT_COMMON_I_H_ */
diff --git a/lib/mount_util.h b/lib/mount_util.h
index 9cb9077dd17738..b54392abb8b07d 100644
--- a/lib/mount_util.h
+++ b/lib/mount_util.h
@@ -6,6 +6,9 @@
   See the file LGPL2.txt.
 */
 
+#ifndef FUSE_MOUNT_UTIL_H_
+#define FUSE_MOUNT_UTIL_H_
+
 #include <sys/types.h>
 
 int fuse_mnt_add_mount(const char *progname, const char *fsname,
@@ -16,3 +19,8 @@ int fuse_mnt_umount(const char *progname, const char *abs_mnt,
 char *fuse_mnt_resolve_path(const char *progname, const char *orig);
 int fuse_mnt_check_fuseblk(void);
 int fuse_mnt_parse_fuse_fd(const char *mountpoint);
+
+/* Helper functions for mount operations */
+const char *fuse_mnt_get_devname(void);
+
+#endif /* FUSE_MOUNT_UTIL_H_ */
diff --git a/lib/mount.c b/lib/mount.c
index c56a9da1fe8014..2397c3fb2aa26b 100644
--- a/lib/mount.c
+++ b/lib/mount.c
@@ -31,6 +31,7 @@
 #include <sys/wait.h>
 
 #include "fuse_mount_compat.h"
+#include "mount_common_i.h"
 
 #ifdef __NetBSD__
 #include <perfuse.h>
@@ -49,7 +50,6 @@
 #define FUSERMOUNT_PROG		"fusermount3"
 #define FUSE_COMMFD_ENV		"_FUSE_COMMFD"
 #define FUSE_COMMFD2_ENV	"_FUSE_COMMFD2"
-#define FUSE_KERN_DEVICE_ENV	"FUSE_KERN_DEVICE"
 
 #ifndef MS_DIRSYNC
 #define MS_DIRSYNC 128
@@ -510,7 +510,7 @@ static int fuse_mount_sys(const char *mnt, struct mount_opts *mo,
 			  const char *mnt_opts)
 {
 	char tmp[128];
-	const char *devname = getenv(FUSE_KERN_DEVICE_ENV) ?: "/dev/fuse";
+	const char *devname = fuse_mnt_get_devname();
 	char *source = NULL;
 	char *type = NULL;
 	struct stat stbuf;
@@ -550,24 +550,13 @@ static int fuse_mount_sys(const char *mnt, struct mount_opts *mo,
 	if (res == -1)
 		goto out_close;
 
-	source = malloc((mo->fsname ? strlen(mo->fsname) : 0) +
-			(mo->subtype ? strlen(mo->subtype) : 0) +
-			strlen(devname) + 32);
-
-	type = malloc((mo->subtype ? strlen(mo->subtype) : 0) + 32);
+	source = fuse_mnt_build_source(mo);
+	type = fuse_mnt_build_type(mo);
 	if (!type || !source) {
 		fuse_log(FUSE_LOG_ERR, "fuse: failed to allocate memory\n");
 		goto out_close;
 	}
 
-	strcpy(type, mo->blkdev ? "fuseblk" : "fuse");
-	if (mo->subtype) {
-		strcat(type, ".");
-		strcat(type, mo->subtype);
-	}
-	strcpy(source,
-	       mo->fsname ? mo->fsname : (mo->subtype ? mo->subtype : devname));
-
 	res = mount(source, mnt, type, mo->flags, mo->kernel_opts);
 	if (res == -1 && errno == ENODEV && mo->subtype) {
 		/* Probably missing subtype support */
@@ -727,3 +716,37 @@ int fuse_kern_mount(const char *mountpoint, struct mount_opts *mo)
 	free(mnt_opts);
 	return res;
 }
+
+/*
+ * Build the mount source string: mo->fsname if set, else mo->subtype if set,
+ * else the fuse device name.  Returns a malloc'd string, or NULL if the
+ * allocation fails.
+ */
+char *fuse_mnt_build_source(const struct mount_opts *mo)
+{
+	const char *dev = fuse_mnt_get_devname();
+	const char *pick = dev;
+	size_t need = strlen(dev) + 32;
+	char *buf;
+
+	/* The buffer is sized for all three candidates plus slack */
+	need += mo->fsname ? strlen(mo->fsname) : 0;
+	need += mo->subtype ? strlen(mo->subtype) : 0;
+
+	if (mo->fsname)
+		pick = mo->fsname;
+	else if (mo->subtype)
+		pick = mo->subtype;
+
+	buf = malloc(need);
+	if (buf)
+		strcpy(buf, pick);
+
+	return buf;
+}
+
+/*
+ * Build the filesystem type string: "fuseblk" when mo->blkdev is set and
+ * "fuse" otherwise, followed by ".<subtype>" when mo->subtype is set.
+ * Returns a malloc'd string, or NULL if the allocation fails.
+ */
+char *fuse_mnt_build_type(const struct mount_opts *mo)
+{
+	const char *base = mo->blkdev ? "fuseblk" : "fuse";
+	size_t sublen = mo->subtype ? strlen(mo->subtype) : 0;
+	char *fstype = malloc(sublen + 32);
+
+	if (!fstype)
+		return NULL;
+
+	strcpy(fstype, base);
+	if (mo->subtype) {
+		strcat(fstype, ".");
+		strcat(fstype, mo->subtype);
+	}
+
+	return fstype;
+}
diff --git a/lib/mount_util.c b/lib/mount_util.c
index 5746e8ed06b736..bdafeda7567fbd 100644
--- a/lib/mount_util.c
+++ b/lib/mount_util.c
@@ -377,3 +377,12 @@ int fuse_mnt_parse_fuse_fd(const char *mountpoint)
 
 	return -1;
 }
+
+#define FUSE_KERN_DEVICE_ENV	"FUSE_KERN_DEVICE"
+
+/*
+ * Return the fuse device path: the value of the FUSE_KERN_DEVICE
+ * environment variable when it is set, "/dev/fuse" otherwise.
+ */
+const char *fuse_mnt_get_devname(void)
+{
+	const char *from_env = getenv(FUSE_KERN_DEVICE_ENV);
+
+	if (from_env)
+		return from_env;
+
+	return "/dev/fuse";
+}
diff --git a/util/fusermount.c b/util/fusermount.c
index a5ed4d26dd4d27..68370468140a59 100644
--- a/util/fusermount.c
+++ b/util/fusermount.c
@@ -47,9 +47,6 @@
 #endif
 
 #define FUSE_COMMFD_ENV		"_FUSE_COMMFD"
-#define FUSE_KERN_DEVICE_ENV	"FUSE_KERN_DEVICE"
-
-#define FUSE_DEV "/dev/fuse"
 
 static const char *progname;
 
@@ -1262,7 +1259,7 @@ static int mount_fuse(const char *mnt, const char *opts, const char **type)
 {
 	int res;
 	int fd;
-	const char *dev = getenv(FUSE_KERN_DEVICE_ENV) ?: FUSE_DEV;
+	const char *dev = fuse_mnt_get_devname();
 	struct stat stbuf;
 	char *source = NULL;
 	char *mnt_opts = NULL;


^ permalink raw reply related

* [PATCHSET v5.1] libfuse: run fuse servers as a contained service
From: Darrick J. Wong @ 2026-04-30 21:15 UTC (permalink / raw)
  To: bernd, djwong
  Cc: linux-fsdevel, fuse-devel, linux-ext4, miklos, neal, joannelkoong

Hi all,

This patchset defines the necessary communication protocols and library
code so that users can mount fuse servers that run in unprivileged
systemd service containers.  That in turn allows unprivileged untrusted
mounts, because the worst that can happen is that a malicious image
crashes the fuse server and the mount dies, instead of corrupting the
kernel's memory.

v5.1: fix some of the SCM_RIGHTS handling code, fix header inclusion
      errors, improve documentation of example code, improve statx
      flags handling, improve phony timestamp handling
v5: Refactor socket IO into helpers, tighten the security checks in
    mount_service.c, always set nosuid/nodev for unprivileged mounts,
    use posix_spawnp in mount.fuse, restructure sample programs and hl
    library code to avoid the need for unmounting during startup
v4.1: fix various cppcheck/codecheck complaints
v4: fix a large number of security problems that only matter when the
    mount helper is being run as a setuid program; fix protocol
    byteswapping problems; add CLOEXEC to all files being traded
    back and forth; add an umount command; and strengthen mount socket
    protocol checks.
v3: refactor the sample code to reduce duplication; fix all the
    checkpatch complaints; examples actually build standalone;
    fuservicemount handles utab now; cleaned up meson feature detection;
    handle MS_ flags that don't translate to MOUNT_ATTR_*
v2: cleaned up error code handling and logging; add some example fuse
    service; fuservicemount3 can now be a setuid program to allow
    unprivileged userspace to fire up a contained filesystem driver.
    This could be opening Pandora's box...
v1: detach from fuse-iomap series

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

With a bit of luck, this should all go splendidly.
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-service-container
---
Commits in this patchset:
 * Refactor mount code / move common functions to mount_util.c
 * mount_service: add systemd socket service mounting helper
 * mount_service: create high level fuse helpers
 * mount_service: use the new mount api for the mount service
 * mount_service: update mtab after a successful mount
 * util: hoist the fuse.conf parsing and setuid mode enforcement code
 * util: fix checkpatch complaints in fuser_conf.[ch]
 * mount_service: enable unprivileged users in a similar manner as fusermount
 * mount.fuse3: integrate systemd service startup
 * mount_service: allow installation as a setuid program
 * example/service_ll: create a sample systemd service fuse server
 * example/service: create a sample systemd service for a high-level fuse server
 * nullfs: support fuse systemd service mode
---
 example/single_file.h                            |  195 ++
 include/fuse.h                                   |   34 
 include/fuse_service.h                           |  243 +++
 include/fuse_service_priv.h                      |  161 ++
 lib/fuse_i.h                                     |    3 
 lib/mount_common_i.h                             |   22 
 lib/mount_util.h                                 |    8 
 lib/util.h                                       |   35 
 util/fuser_conf.h                                |   62 +
 util/mount_service.h                             |   49 +
 .github/workflows/install-ubuntu-dependencies.sh |    4 
 README.md                                        |    3 
 doc/fuservicemount3.8                            |   32 
 doc/meson.build                                  |    3 
 example/meson.build                              |   26 
 example/null.c                                   |   51 +
 example/null.socket.in                           |   15 
 example/null@.service                            |  102 +
 example/service_hl.c                             |  240 +++
 example/service_hl.socket.in                     |   15 
 example/service_hl@.service                      |  102 +
 example/service_ll.c                             |  329 +++
 example/service_ll.socket.in                     |   15 
 example/service_ll@.service                      |  102 +
 example/single_file.c                            |  992 ++++++++++
 include/meson.build                              |    4 
 lib/fuse_service.c                               | 1248 +++++++++++++
 lib/fuse_service_stub.c                          |  106 +
 lib/fuse_versionscript                           |   18 
 lib/helper.c                                     |  160 ++
 lib/meson.build                                  |   17 
 lib/mount.c                                      |   72 +
 lib/mount_util.c                                 |    9 
 meson.build                                      |   53 +
 meson_options.txt                                |    9 
 test/ci-build.sh                                 |   14 
 util/fuser_conf.c                                |  398 ++++
 util/fusermount.c                                |  363 ----
 util/fuservicemount.c                            |   65 +
 util/install_helper.sh                           |    6 
 util/meson.build                                 |   24 
 util/mount.fuse.c                                |  171 ++
 util/mount_service.c                             | 2111 ++++++++++++++++++++++
 43 files changed, 7287 insertions(+), 404 deletions(-)
 create mode 100644 example/single_file.h
 create mode 100644 include/fuse_service.h
 create mode 100644 include/fuse_service_priv.h
 create mode 100644 lib/mount_common_i.h
 create mode 100644 util/fuser_conf.h
 create mode 100644 util/mount_service.h
 create mode 100644 doc/fuservicemount3.8
 create mode 100644 example/null.socket.in
 create mode 100644 example/null@.service
 create mode 100644 example/service_hl.c
 create mode 100644 example/service_hl.socket.in
 create mode 100644 example/service_hl@.service
 create mode 100644 example/service_ll.c
 create mode 100644 example/service_ll.socket.in
 create mode 100644 example/service_ll@.service
 create mode 100644 example/single_file.c
 create mode 100644 lib/fuse_service.c
 create mode 100644 lib/fuse_service_stub.c
 create mode 100644 util/fuser_conf.c
 create mode 100644 util/fuservicemount.c
 create mode 100644 util/mount_service.c


^ permalink raw reply

* Re: [PATCH v3 04/22] ext4: add iomap address space operations for buffered I/O
From: Jan Kara @ 2026-04-30 13:23 UTC (permalink / raw)
  To: Zhang Yi
  Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
	libaokun, jack, ojaswin, ritesh.list, djwong, hch, yi.zhang,
	yizhang089, yangerkun, yukuai
In-Reply-To: <20260422021042.4157510-5-yi.zhang@huaweicloud.com>

On Wed 22-04-26 10:10:24, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
> 
> Introduce initial support for iomap in the buffered I/O path for regular
> files on ext4.
> 
>   - Add a new inode state flag EXT4_STATE_BUFFERED_IOMAP to indicate the
>     inode uses iomap instead of buffer_head for buffered I/O
>   - Add helper ext4_inode_buffered_iomap() to check the flag
>   - Add new address space operations ext4_iomap_aops with callbacks that
>     will use generic iomap implementations
>   - Add ext4_iomap_aops to ext4_set_aops() when the flag is set
> 
> The following callbacks(read_folio(), readahead(), writepages()) are
> provided as placeholders and will be implemented in later patches.
> 
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>

Looks good. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  fs/ext4/ext4.h  |  7 +++++++
>  fs/ext4/inode.c | 32 ++++++++++++++++++++++++++++++++
>  2 files changed, 39 insertions(+)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 9e4353432325..fe3491ad2129 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1972,6 +1972,7 @@ enum {
>  	EXT4_STATE_FC_COMMITTING,	/* Fast commit ongoing */
>  	EXT4_STATE_FC_FLUSHING_DATA,	/* Fast commit flushing data */
>  	EXT4_STATE_ORPHAN_FILE,		/* Inode orphaned in orphan file */
> +	EXT4_STATE_BUFFERED_IOMAP,	/* Inode use iomap for buffered IO */
>  };
>  
>  #define EXT4_INODE_BIT_FNS(name, field, offset)				\
> @@ -2040,6 +2041,12 @@ static inline bool ext4_inode_orphan_tracked(struct inode *inode)
>  		!list_empty(&EXT4_I(inode)->i_orphan);
>  }
>  
> +/* Whether the inode pass through the iomap infrastructure for buffered I/O */
> +static inline bool ext4_inode_buffered_iomap(struct inode *inode)
> +{
> +	return ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP);
> +}
> +
>  /*
>   * Codes for operating systems
>   */
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 59405a95ecfc..9e9f421888ed 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -3908,6 +3908,22 @@ const struct iomap_ops ext4_iomap_report_ops = {
>  	.iomap_begin = ext4_iomap_begin_report,
>  };
>  
> +static int ext4_iomap_read_folio(struct file *file, struct folio *folio)
> +{
> +	return 0;
> +}
> +
> +static void ext4_iomap_readahead(struct readahead_control *rac)
> +{
> +
> +}
> +
> +static int ext4_iomap_writepages(struct address_space *mapping,
> +				 struct writeback_control *wbc)
> +{
> +	return 0;
> +}
> +
>  /*
>   * For data=journal mode, folio should be marked dirty only when it was
>   * writeably mapped. When that happens, it was already attached to the
> @@ -3994,6 +4010,20 @@ static const struct address_space_operations ext4_da_aops = {
>  	.swap_activate		= ext4_iomap_swap_activate,
>  };
>  
> +static const struct address_space_operations ext4_iomap_aops = {
> +	.read_folio		= ext4_iomap_read_folio,
> +	.readahead		= ext4_iomap_readahead,
> +	.writepages		= ext4_iomap_writepages,
> +	.dirty_folio		= iomap_dirty_folio,
> +	.bmap			= ext4_bmap,
> +	.invalidate_folio	= iomap_invalidate_folio,
> +	.release_folio		= iomap_release_folio,
> +	.migrate_folio		= filemap_migrate_folio,
> +	.is_partially_uptodate  = iomap_is_partially_uptodate,
> +	.error_remove_folio	= generic_error_remove_folio,
> +	.swap_activate		= ext4_iomap_swap_activate,
> +};
> +
>  static const struct address_space_operations ext4_dax_aops = {
>  	.writepages		= ext4_dax_writepages,
>  	.dirty_folio		= noop_dirty_folio,
> @@ -4015,6 +4045,8 @@ void ext4_set_aops(struct inode *inode)
>  	}
>  	if (IS_DAX(inode))
>  		inode->i_mapping->a_ops = &ext4_dax_aops;
> +	else if (ext4_inode_buffered_iomap(inode))
> +		inode->i_mapping->a_ops = &ext4_iomap_aops;
>  	else if (test_opt(inode->i_sb, DELALLOC))
>  		inode->i_mapping->a_ops = &ext4_da_aops;
>  	else
> -- 
> 2.52.0
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [PATCH v3 05/22] ext4: implement buffered read path using iomap
From: Jan Kara @ 2026-04-30 13:23 UTC (permalink / raw)
  To: Zhang Yi
  Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
	libaokun, jack, ojaswin, ritesh.list, djwong, hch, yi.zhang,
	yizhang089, yangerkun, yukuai
In-Reply-To: <20260422021042.4157510-6-yi.zhang@huaweicloud.com>

On Wed 22-04-26 10:10:25, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
> 
> Implement the iomap read path for ext4 by introducing a new
> ext4_iomap_buffered_read_ops instance. This provides the read_folio()
> and readahead() callbacks for ext4_iomap_aops. The implementation
> introduces:
> 
>  - ext4_iomap_map_blocks(): Helper function to query extent mappings for
>    a given read range using ext4_map_blocks() and convert the mapping
>    information to iomap type
>  - ext4_iomap_buffered_read_begin(): The iomap_begin callback that maps
>    blocks, validates filesystem state, and populates the iomap. It
>    returns -ERANGE for inline data which is not yet supported.
> 
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>

Looks good. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  fs/ext4/inode.c | 45 ++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 44 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 9e9f421888ed..df21f6870ec4 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -3908,14 +3908,57 @@ const struct iomap_ops ext4_iomap_report_ops = {
>  	.iomap_begin = ext4_iomap_begin_report,
>  };
>  
> +static int ext4_iomap_map_blocks(struct inode *inode, loff_t offset,
> +		loff_t length, struct ext4_map_blocks *map)
> +{
> +	u8 blkbits = inode->i_blkbits;
> +
> +	if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
> +		return -EINVAL;
> +
> +	/* Calculate the first and last logical blocks respectively. */
> +	map->m_lblk = offset >> blkbits;
> +	map->m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
> +			   EXT4_MAX_LOGICAL_BLOCK) - map->m_lblk + 1;
> +
> +	return ext4_map_blocks(NULL, inode, map, 0);
> +}
> +
> +static int ext4_iomap_buffered_read_begin(struct inode *inode, loff_t offset,
> +		loff_t length, unsigned int flags, struct iomap *iomap,
> +		struct iomap *srcmap)
> +{
> +	struct ext4_map_blocks map;
> +	int ret;
> +
> +	if (unlikely(ext4_forced_shutdown(inode->i_sb)))
> +		return -EIO;
> +
> +	/* Inline data support is not yet available. */
> +	if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
> +		return -ERANGE;
> +
> +	ret = ext4_iomap_map_blocks(inode, offset, length, &map);
> +	if (ret < 0)
> +		return ret;
> +
> +	ext4_set_iomap(inode, iomap, &map, offset, length, flags);
> +	return 0;
> +}
> +
> +const struct iomap_ops ext4_iomap_buffered_read_ops = {
> +	.iomap_begin = ext4_iomap_buffered_read_begin,
> +};
> +
>  static int ext4_iomap_read_folio(struct file *file, struct folio *folio)
>  {
> +	iomap_bio_read_folio(folio, &ext4_iomap_buffered_read_ops);
>  	return 0;
>  }
>  
>  static void ext4_iomap_readahead(struct readahead_control *rac)
>  {
> -
> +	iomap_bio_readahead(rac, &ext4_iomap_buffered_read_ops);
>  }
>  
>  static int ext4_iomap_writepages(struct address_space *mapping,
> -- 
> 2.52.0
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [PATCH v3 06/22] ext4: pass out extent seq counter when mapping da blocks
From: Jan Kara @ 2026-04-30 13:23 UTC (permalink / raw)
  To: Zhang Yi
  Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
	libaokun, jack, ojaswin, ritesh.list, djwong, hch, yi.zhang,
	yizhang089, yangerkun, yukuai
In-Reply-To: <20260422021042.4157510-7-yi.zhang@huaweicloud.com>

On Wed 22-04-26 10:10:26, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
> 
> The iomap buffered write path does not hold any locks between querying
> inode extent mapping information and performing buffered writes. It
> relies on the sequence counter saved in the inode to detect stale
> mappings.
> 
> Commit 07c440e8da8f ("ext4: pass out extent seq counter when mapping
> blocks") added the m_seq field to ext4_map_blocks to pass out extent
> sequence numbers, but it missed two callsites within
> ext4_da_map_blocks(). These callsites are on the delayed allocation
> path, which is also used in the iomap buffered write path. Pass out the
> sequence counter to ensure stale mappings can be detected.
> 
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>

Looks good. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  fs/ext4/inode.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index df21f6870ec4..5ffd6aeb3485 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1929,7 +1929,7 @@ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map)
>  	ext4_check_map_extents_env(inode);
>  
>  	/* Lookup extent status tree firstly */
> -	if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, NULL)) {
> +	if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, &map->m_seq)) {
>  		map->m_len = min_t(unsigned int, map->m_len,
>  				   es.es_len - (map->m_lblk - es.es_lblk));
>  
> @@ -1982,7 +1982,7 @@ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map)
>  	 * is held in write mode, before inserting a new da entry in
>  	 * the extent status tree.
>  	 */
> -	if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, NULL)) {
> +	if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, &map->m_seq)) {
>  		map->m_len = min_t(unsigned int, map->m_len,
>  				   es.es_len - (map->m_lblk - es.es_lblk));
>  
> -- 
> 2.52.0
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [PATCH v3 03/22] ext4: simplify error handling in ext4_setattr()
From: Jan Kara @ 2026-04-30 13:03 UTC (permalink / raw)
  To: Zhang Yi
  Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
	libaokun, jack, ojaswin, ritesh.list, djwong, hch, yi.zhang,
	yizhang089, yangerkun, yukuai
In-Reply-To: <20260422021042.4157510-4-yi.zhang@huaweicloud.com>

On Wed 22-04-26 10:10:23, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
> 
> Remove the redundant rc variable and consolidate error handling.
> 
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>

One comment below. Otherwise the changes look good.

> @@ -6073,8 +6073,8 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
>  
>  		filemap_invalidate_lock(inode->i_mapping);
>  
> -		rc = ext4_break_layouts(inode);
> -		if (rc) {
> +		error = ext4_break_layouts(inode);
> +		if (error) {

This is wrong. Errors from ext4_break_layouts() just need to be returned
but they shouldn't be logged with ext4_std_error().

>  			filemap_invalidate_unlock(inode->i_mapping);
>  			goto err_out;
>  		}

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [PATCH v3 02/22] ext4: factor out ext4_truncate_[up|down]()
From: Jan Kara @ 2026-04-30 12:55 UTC (permalink / raw)
  To: Zhang Yi
  Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
	libaokun, jack, ojaswin, ritesh.list, djwong, hch, yi.zhang,
	yizhang089, yangerkun, yukuai
In-Reply-To: <20260422021042.4157510-3-yi.zhang@huaweicloud.com>

On Wed 22-04-26 10:10:22, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
> 
> Refactor ext4_setattr() by introducing two helper functions,
> ext4_truncate_up() and ext4_truncate_down(), to handle size changes. The
> current ATTR_SIZE processing consolidates checks for both shrinking and
> non-shrinking cases, leading to cluttered code. Separating the
> truncation paths improves readability.
> 
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>

Looks good. Just a few nits below.

> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 94283a991e5c..9e4353432325 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -3501,6 +3501,23 @@ static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize)
>  	return changed;
>  }
>  
> +/*
> + * Set i_size and i_disksize to 'newsize'.
> + *
> + * Both i_rwsem and i_data_sem are required here to avoid races between
> + * generic append writeback and concurrent truncate that also modify
> + * i_size and i_disksize.
> + */
> +static inline void ext4_set_inode_size(struct inode *inode, loff_t newsize)
> +{
> +	WARN_ON_ONCE(S_ISREG(inode->i_mode) && !inode_is_locked(inode));
> +
> +	down_write(&EXT4_I(inode)->i_data_sem);
> +	i_size_write(inode, newsize);
> +	EXT4_I(inode)->i_disksize = newsize;
> +	up_write(&EXT4_I(inode)->i_data_sem);
> +}
> +

Do we need this in the header later or can we keep it local to inode.c?

> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 0751dc55e94f..5e913aca6499 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -5855,6 +5855,83 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
>  	}
>  }
>  
> +static int ext4_truncate_up(struct inode *inode, loff_t oldsize, loff_t newsize)
> +{
> +	ext4_lblk_t old_lblk, new_lblk;
> +	handle_t *handle;
> +	int ret;
> +
> +	if (!IS_ALIGNED(oldsize | newsize, i_blocksize(inode))) {
> +		ret = ext4_inode_attach_jinode(inode);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
> +	if (oldsize & (i_blocksize(inode) - 1)) {

When you transitioned to IS_ALIGNED above, use it here as well?

> +		ret = ext4_block_zero_eof(inode, oldsize, LLONG_MAX);
> +		if (ret)
> +			return ret;
> +	}

...

> +		if (attr->ia_size > oldsize)
> +			error = ext4_truncate_up(inode, oldsize, attr->ia_size);
> +		else if (shrink)
> +			error = ext4_truncate_down(inode, oldsize,
> +						   attr->ia_size, &orphan);
> +		if (error)
> +			goto out_mmap_sem;
>  
>  		/*
>  		 * Truncate pagecache after we've waited for commit

Hum, why not move the truncate_pagecache() call and ext4_truncate() call
into ext4_truncate_down()? They are not needed in the truncate up case...

								Honza

-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [PATCH v3 01/22] ext4: simplify size updating in ext4_setattr()
From: Jan Kara @ 2026-04-30 12:43 UTC (permalink / raw)
  To: Zhang Yi
  Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
	libaokun, jack, ojaswin, ritesh.list, djwong, hch, yi.zhang,
	yizhang089, yangerkun, yukuai
In-Reply-To: <20260422021042.4157510-2-yi.zhang@huaweicloud.com>

On Wed 22-04-26 10:10:21, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
> 
> The logic for updating the file size in ext4_setattr() is currently
> somewhat messy. By directly entering the error-handling path after
> failing to add an orphan inode, the unnecessary recovery process
> involving old_disksize and the file size can be avoided.
> 
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>

Indeed, that code looks confusing for no good reason. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  fs/ext4/inode.c | 22 +++++++++-------------
>  1 file changed, 9 insertions(+), 13 deletions(-)
> 
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index c2c2d6ac7f3d..0751dc55e94f 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -5953,7 +5953,6 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
>  	if (attr->ia_valid & ATTR_SIZE) {
>  		handle_t *handle;
>  		loff_t oldsize = inode->i_size;
> -		loff_t old_disksize;
>  		int shrink = (attr->ia_size < inode->i_size);
>  
>  		if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
> @@ -6037,6 +6036,8 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
>  			if (ext4_handle_valid(handle) && shrink) {
>  				error = ext4_orphan_add(handle, inode);
>  				orphan = 1;
> +				if (error)
> +					goto out_handle;
>  			}
>  
>  			if (shrink)
> @@ -6052,23 +6053,18 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
>  					(attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
>  					inode->i_sb->s_blocksize_bits);
>  
> -			down_write(&EXT4_I(inode)->i_data_sem);
> -			old_disksize = EXT4_I(inode)->i_disksize;
> -			EXT4_I(inode)->i_disksize = attr->ia_size;
> -
>  			/*
>  			 * We have to update i_size under i_data_sem together
>  			 * with i_disksize to avoid races with writeback code
> -			 * running ext4_wb_update_i_disksize().
> +			 * updating disksize in mpage_map_and_submit_extent().
>  			 */
> -			if (!error)
> -				i_size_write(inode, attr->ia_size);
> -			else
> -				EXT4_I(inode)->i_disksize = old_disksize;
> +			down_write(&EXT4_I(inode)->i_data_sem);
> +			i_size_write(inode, attr->ia_size);
> +			EXT4_I(inode)->i_disksize = attr->ia_size;
>  			up_write(&EXT4_I(inode)->i_data_sem);
> -			rc = ext4_mark_inode_dirty(handle, inode);
> -			if (!error)
> -				error = rc;
> +
> +			error = ext4_mark_inode_dirty(handle, inode);
> +out_handle:
>  			ext4_journal_stop(handle);
>  			if (error)
>  				goto out_mmap_sem;
> -- 
> 2.52.0
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* [linus:master] [ext4]  1ad0f42823:  fio.write_iops 23.8% improvement
From: kernel test robot @ 2026-04-30  5:38 UTC (permalink / raw)
  To: Zhang Yi
  Cc: oe-lkp, lkp, linux-kernel, Theodore Ts'o, Jan Kara,
	linux-ext4, oliver.sang



Hello,

kernel test robot noticed a 23.8% improvement of fio.write_iops on:


commit: 1ad0f42823291bcac371dafd37533f5e8d92acc3 ("ext4: move pagecache_isize_extended() out of active handle")
https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git master


testcase: fio-basic
config: x86_64-rhel-9.4
compiler: gcc-14
test machine: 64 threads 2 sockets Intel(R) Xeon(R) Gold 6346 CPU @ 3.10GHz (Ice Lake) with 256G memory
parameters:

	runtime: 300s
	disk: 1HDD
	fs: ext4
	nr_task: 1
	test_size: 128G
	rw: write
	bs: 4k
	ioengine: falloc
	cpufreq_governor: performance



Details are as below:
-------------------------------------------------------------------------------------------------->


The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20260430/202604301021.faab1f8a-lkp@intel.com

=========================================================================================
bs/compiler/cpufreq_governor/disk/fs/ioengine/kconfig/nr_task/rootfs/runtime/rw/tbox_group/test_size/testcase:
  4k/gcc-14/performance/1HDD/ext4/falloc/x86_64-rhel-9.4/1/debian-13-x86_64-20250902.cgz/300s/write/lkp-icl-2sp9/128G/fio-basic

commit: 
  116c0bdac2 ("ext4: remove ctime/mtime update from ext4_alloc_file_blocks()")
  1ad0f42823 ("ext4: move pagecache_isize_extended() out of active handle")

116c0bdac2ec059d 1ad0f42823291bcac371dafd375 
---------------- --------------------------- 
         %stddev     %change         %stddev
             \          |                \  
      0.11 ±  5%      -0.0        0.08 ±  7%  fio.latency_2us%
      0.07 ±  3%      -0.0        0.05 ±  6%  fio.latency_4us%
     34.79           -19.1%      28.14        fio.time.elapsed_time
     34.79           -19.1%      28.14        fio.time.elapsed_time.max
     24.17           -26.0%      17.90        fio.time.system_time
      3736           -18.7%       3037        fio.time.voluntary_context_switches
      3812           +23.8%       4720        fio.write_bw_MBps
    850.67           -22.7%     657.33        fio.write_clat_90%_ns
    857.33           -22.4%     665.33        fio.write_clat_95%_ns
    872.00           -22.5%     676.00        fio.write_clat_99%_ns
    835.45           -22.7%     645.60        fio.write_clat_mean_ns
    192.67 ± 28%    +122.3%     428.28 ± 20%  fio.write_clat_stddev
    975889           +23.8%    1208502        fio.write_iops
      1.13            -7.7%       1.05        iostat.cpu.system
    146867           -17.0%     121930        turbostat.IRQ
 2.317e+09           -17.6%  1.908e+09        cpuidle..time
    121665 ±  9%     -11.5%     107660 ±  5%  cpuidle..usage
      1.13            -0.1        1.04        mpstat.cpu.all.sys%
      0.54 ±  2%      +0.1        0.62 ±  2%  mpstat.cpu.all.usr%
    124.09 ±  2%      -6.1%     116.50        uptime.boot
      7577 ±  2%      -6.3%       7099        uptime.idle
      1507 ±  4%     +13.8%       1714        vmstat.io.bo
      1.58 ±  3%      +9.7%       1.73 ±  2%  vmstat.procs.r
    256506            -3.5%     247467        proc-vmstat.numa_hit
    281823            -4.4%     269466        proc-vmstat.pgalloc_normal
    166349            -7.9%     153190        proc-vmstat.pgfault
    159495            -7.7%     147154        proc-vmstat.pgfree
      7811 ± 20%     -45.1%       4288        proc-vmstat.pgpgout
      7649            -8.2%       7019 ±  2%  proc-vmstat.pgreuse
      4.74 ± 63%      -4.7        0.00        perf-profile.calltrace.cycles-pp.do_mprotect_pkey.__x64_sys_mprotect.do_syscall_64.entry_SYSCALL_64_after_hwframe
      3.64 ±104%      -3.1        0.56 ±223%  perf-profile.calltrace.cycles-pp.acpi_idle_do_entry.acpi_idle_enter.cpuidle_enter_state.cpuidle_enter.cpuidle_idle_call
      3.64 ±104%      -3.1        0.56 ±223%  perf-profile.calltrace.cycles-pp.acpi_idle_enter.cpuidle_enter_state.cpuidle_enter.cpuidle_idle_call.do_idle
      3.64 ±104%      -3.1        0.56 ±223%  perf-profile.calltrace.cycles-pp.acpi_safe_halt.acpi_idle_do_entry.acpi_idle_enter.cpuidle_enter_state.cpuidle_enter
      3.64 ±104%      -3.1        0.56 ±223%  perf-profile.calltrace.cycles-pp.cpuidle_enter.cpuidle_idle_call.do_idle.cpu_startup_entry.start_secondary
      3.64 ±104%      -3.1        0.56 ±223%  perf-profile.calltrace.cycles-pp.cpuidle_enter_state.cpuidle_enter.cpuidle_idle_call.do_idle.cpu_startup_entry
      3.64 ±104%      -3.1        0.56 ±223%  perf-profile.calltrace.cycles-pp.cpuidle_idle_call.do_idle.cpu_startup_entry.start_secondary.common_startup_64
      4.95 ± 81%      -3.4        1.54 ±149%  perf-profile.children.cycles-pp.__split_vma
      3.64 ±104%      -3.1        0.56 ±223%  perf-profile.children.cycles-pp.acpi_idle_do_entry
      3.64 ±104%      -3.1        0.56 ±223%  perf-profile.children.cycles-pp.acpi_idle_enter
      3.64 ±104%      -3.1        0.56 ±223%  perf-profile.children.cycles-pp.acpi_safe_halt
      3.64 ±104%      -3.1        0.56 ±223%  perf-profile.children.cycles-pp.cpuidle_enter
      3.64 ±104%      -3.1        0.56 ±223%  perf-profile.children.cycles-pp.cpuidle_enter_state
      3.64 ±104%      -3.1        0.56 ±223%  perf-profile.children.cycles-pp.cpuidle_idle_call
      3.64 ±104%      -3.1        0.56 ±223%  perf-profile.children.cycles-pp.pv_native_safe_halt
      1.82 ±148%      +5.8        7.59 ± 39%  perf-profile.children.cycles-pp.__mmap_region
      1.82 ±148%      +6.5        8.34 ± 44%  perf-profile.children.cycles-pp.do_mmap
      1.82 ±148%      +6.5        8.34 ± 44%  perf-profile.children.cycles-pp.vm_mmap_pgoff
      0.03 ±  7%     +21.8%       0.04 ±  4%  perf-stat.i.MPKI
      0.26            +0.1        0.32        perf-stat.i.branch-miss-rate%
   4548711           +21.7%    5537175        perf-stat.i.branch-misses
    274432 ±  6%     +20.4%     330549 ±  4%  perf-stat.i.cache-misses
   1860596            +8.1%    2011597        perf-stat.i.cache-references
      0.47            +2.1%       0.48        perf-stat.i.cpi
      2398            +3.0%       2470        perf-stat.i.minor-faults
      2398            +3.0%       2470        perf-stat.i.page-faults
      0.03 ±  6%     +22.3%       0.04 ±  4%  perf-stat.overall.MPKI
      0.26            +0.1        0.32        perf-stat.overall.branch-miss-rate%
     14.77 ±  6%      +1.7       16.46 ±  4%  perf-stat.overall.cache-miss-rate%
      0.47            +1.8%       0.48        perf-stat.overall.cpi
     14177 ±  6%     -16.9%      11783 ±  5%  perf-stat.overall.cycles-between-cache-misses
      2.12            -1.8%       2.08        perf-stat.overall.ipc
      8603           -19.7%       6911        perf-stat.overall.path-length
  1.69e+09            -1.7%  1.662e+09        perf-stat.ps.branch-instructions
   4424559           +21.0%    5353882        perf-stat.ps.branch-misses
    267376 ±  6%     +19.9%     320480 ±  4%  perf-stat.ps.cache-misses
   1810896            +7.5%    1947007        perf-stat.ps.cache-references
 7.998e+09            -2.0%  7.838e+09        perf-stat.ps.instructions
      2334            +2.5%       2393        perf-stat.ps.minor-faults
      2334            +2.5%       2393        perf-stat.ps.page-faults
 2.887e+11           -19.7%  2.319e+11        perf-stat.total.instructions




Disclaimer:
Results have been estimated based on internal Intel analysis and are provided
for informational purposes only. Any difference in system hardware or software
design or configuration may affect actual performance.


-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki


^ permalink raw reply

* Re: [PATCH v12 04/15] exfat: Implement fileattr_get for case sensitivity
From: David Timber @ 2026-04-29 22:00 UTC (permalink / raw)
  To: Chuck Lever, Al Viro, Christian Brauner, Jan Kara
  Cc: linux-fsdevel, linux-ext4, linux-xfs, linux-cifs, linux-nfs,
	linux-api, linux-f2fs-devel, hirofumi, linkinjeon, sj1557.seo,
	yuezhang.mo, almaz.alexandrovich, slava, glaubitz, frank.li,
	tytso, adilger.kernel, cem, sfrench, pc, ronniesahlberg, sprasad,
	trondmy, anna, jaegeuk, chao, hansg, senozhatsky, Chuck Lever,
	Roland Mainz
In-Reply-To: <20260429-case-sensitivity-v12-4-8057123bebe0@oracle.com>

On 4/30/26 03:07, Chuck Lever wrote:
> From: Chuck Lever <chuck.lever@oracle.com>
>
> Report exFAT's case sensitivity behavior via the FS_XFLAG_CASEFOLD
> flag. exFAT is always case-insensitive (using an upcase table for
> comparison) and always preserves case at rest.
Not necessarily "always".

Link: https://github.com/exfatprogs/exfatprogs/issues/313

The specs (SD spec part 2 and MS spec) leave it up to the formatter
implementation how the volume should behave. The observed behaviour
is that it is quite flexible: you can pretty much use any arbitrary
up-case table to make an exFAT volume behave completely differently, and
major implementations including the Linux and Windows kernels honour the
table no matter what. So exFAT is not so "binary" (folding vs. not
folding) when it comes to case folding behaviour.

NTFS also has a similar up-case table feature. Although it's usually
unused, if an up-case table exists in the volume, the implementation
probably has to honour it (although this is not written down in any spec,
this should be the expectation).

At the end of the day, it wouldn't matter much because no sane formatter
would produce a volume with some weird version of the up-case table. But if
that attribute plays an important role in some system that has some level
of impact, I suggest considering another attribute, say "unknown" or "it
depends".

Davo

^ permalink raw reply

* [PATCH v12 15/15] ksmbd: Report filesystem case sensitivity via FS_ATTRIBUTE_INFORMATION
From: Chuck Lever @ 2026-04-29 18:07 UTC (permalink / raw)
  To: Al Viro, Christian Brauner, Jan Kara
  Cc: linux-fsdevel, linux-ext4, linux-xfs, linux-cifs, linux-nfs,
	linux-api, linux-f2fs-devel, hirofumi, linkinjeon, sj1557.seo,
	yuezhang.mo, almaz.alexandrovich, slava, glaubitz, frank.li,
	tytso, adilger.kernel, cem, sfrench, pc, ronniesahlberg, sprasad,
	trondmy, anna, jaegeuk, chao, hansg, senozhatsky, Chuck Lever,
	Roland Mainz
In-Reply-To: <20260429-case-sensitivity-v12-0-8057123bebe0@oracle.com>

From: Chuck Lever <chuck.lever@oracle.com>

FS_ATTRIBUTE_INFORMATION responses have always reported
FILE_CASE_SENSITIVE_SEARCH and FILE_CASE_PRESERVED_NAMES
unconditionally. Case-insensitive filesystems like exFAT, and
casefolded directories on ext4 or f2fs, have no way to signal
their actual semantics to SMB clients.

Now that filesystems expose case behavior through ->fileattr_get,
query it via vfs_fileattr_get() and translate the FS_XFLAG_CASEFOLD
and FS_XFLAG_CASENONPRESERVING flags into the corresponding SMB
attributes. Filesystems without ->fileattr_get continue reporting
default POSIX behavior (case-sensitive, case-preserving).

SMB's FS_ATTRIBUTE_INFORMATION reports per-share attributes from
the share root, not per-file. Shares mixing casefold and
non-casefold directories report the root directory's behavior.

Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Reviewed-by: Roland Mainz <roland.mainz@nrubsig.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/smb/server/smb2pdu.c | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index ee32e61b6d3c..cf0bc453a036 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -14,6 +14,7 @@
 #include <linux/falloc.h>
 #include <linux/mount.h>
 #include <linux/filelock.h>
+#include <linux/fileattr.h>
 
 #include "glob.h"
 #include "smbfsctl.h"
@@ -5541,16 +5542,33 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work,
 	case FS_ATTRIBUTE_INFORMATION:
 	{
 		FILE_SYSTEM_ATTRIBUTE_INFO *info;
+		struct file_kattr fa = {};
 		size_t sz;
+		u32 attrs;
+		int err;
 
 		info = (FILE_SYSTEM_ATTRIBUTE_INFO *)rsp->Buffer;
-		info->Attributes = cpu_to_le32(FILE_SUPPORTS_OBJECT_IDS |
-					       FILE_PERSISTENT_ACLS |
-					       FILE_UNICODE_ON_DISK |
-					       FILE_CASE_PRESERVED_NAMES |
-					       FILE_CASE_SENSITIVE_SEARCH |
-					       FILE_SUPPORTS_BLOCK_REFCOUNTING);
+		attrs = FILE_SUPPORTS_OBJECT_IDS |
+			FILE_PERSISTENT_ACLS |
+			FILE_UNICODE_ON_DISK |
+			FILE_SUPPORTS_BLOCK_REFCOUNTING;
 
+		err = vfs_fileattr_get(path.dentry, &fa);
+		/*
+		 * -EINVAL, -EOPNOTSUPP: ntfs-3g and other FUSE
+		 * filesystems that lack FS_IOC_FSGETXATTR support.
+		 */
+		if (err && err != -ENOIOCTLCMD && err != -ENOTTY &&
+		    err != -EINVAL && err != -EOPNOTSUPP) {
+			path_put(&path);
+			return err;
+		}
+		if (!(fa.fsx_xflags & FS_XFLAG_CASEFOLD))
+			attrs |= FILE_CASE_SENSITIVE_SEARCH;
+		if (!(fa.fsx_xflags & FS_XFLAG_CASENONPRESERVING))
+			attrs |= FILE_CASE_PRESERVED_NAMES;
+
+		info->Attributes = cpu_to_le32(attrs);
 		info->Attributes |= cpu_to_le32(server_conf.share_fake_fscaps);
 
 		if (test_share_config_flag(work->tcon->share_conf,

-- 
2.53.0


^ permalink raw reply related

* [PATCH v12 14/15] nfsd: Implement NFSv4 FATTR4_CASE_INSENSITIVE and FATTR4_CASE_PRESERVING
From: Chuck Lever @ 2026-04-29 18:07 UTC (permalink / raw)
  To: Al Viro, Christian Brauner, Jan Kara
  Cc: linux-fsdevel, linux-ext4, linux-xfs, linux-cifs, linux-nfs,
	linux-api, linux-f2fs-devel, hirofumi, linkinjeon, sj1557.seo,
	yuezhang.mo, almaz.alexandrovich, slava, glaubitz, frank.li,
	tytso, adilger.kernel, cem, sfrench, pc, ronniesahlberg, sprasad,
	trondmy, anna, jaegeuk, chao, hansg, senozhatsky, Chuck Lever,
	Roland Mainz
In-Reply-To: <20260429-case-sensitivity-v12-0-8057123bebe0@oracle.com>

From: Chuck Lever <chuck.lever@oracle.com>

NFSD currently provides NFSv4 clients with hard-coded responses
indicating all exported filesystems are case-sensitive and
case-preserving. This is incorrect for case-insensitive filesystems
and ext4 directories with casefold enabled.

Query the underlying filesystem's actual case sensitivity via
nfsd_get_case_info() and return accurate values to clients. This
supports per-directory settings for filesystems that allow mixing
case-sensitive and case-insensitive directories within an export.

Reviewed-by: Roland Mainz <roland.mainz@nrubsig.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4xdr.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 49 insertions(+), 3 deletions(-)

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 2a0946c630e1..d77304692e11 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3158,6 +3158,8 @@ struct nfsd4_fattr_args {
 	u32			rdattr_err;
 	bool			contextsupport;
 	bool			ignore_crossmnt;
+	bool			case_insensitive;
+	bool			case_preserving;
 };
 
 typedef __be32(*nfsd4_enc_attr)(struct xdr_stream *xdr,
@@ -3356,6 +3358,33 @@ static __be32 nfsd4_encode_fattr4_acl(struct xdr_stream *xdr,
 	return nfs_ok;
 }
 
+static __be32 nfsd4_encode_fattr4_case_insensitive(struct xdr_stream *xdr,
+					const struct nfsd4_fattr_args *args)
+{
+	return nfsd4_encode_bool(xdr, args->case_insensitive);
+}
+
+static __be32 nfsd4_encode_fattr4_case_preserving(struct xdr_stream *xdr,
+					const struct nfsd4_fattr_args *args)
+{
+	return nfsd4_encode_bool(xdr, args->case_preserving);
+}
+
+static __be32 nfsd4_encode_fattr4_homogeneous(struct xdr_stream *xdr,
+					const struct nfsd4_fattr_args *args)
+{
+	/*
+	 * Casefold-capable filesystems (e.g. ext4 or f2fs with the
+	 * casefold feature) attach a Unicode encoding at mount time
+	 * but apply case folding per directory.  The per-file-system
+	 * case_insensitive and case_preserving values can therefore
+	 * legitimately differ across objects that share the same fsid.
+	 * Report FATTR4_HOMOGENEOUS = FALSE on such filesystems to
+	 * keep that variation consistent with RFC 8881 Section 5.8.2.16.
+	 */
+	return nfsd4_encode_bool(xdr, !sb_has_encoding(args->dentry->d_sb));
+}
+
 static __be32 nfsd4_encode_fattr4_filehandle(struct xdr_stream *xdr,
 					     const struct nfsd4_fattr_args *args)
 {
@@ -3748,8 +3777,8 @@ static const nfsd4_enc_attr nfsd4_enc_fattr4_encode_ops[] = {
 	[FATTR4_ACLSUPPORT]		= nfsd4_encode_fattr4_aclsupport,
 	[FATTR4_ARCHIVE]		= nfsd4_encode_fattr4__noop,
 	[FATTR4_CANSETTIME]		= nfsd4_encode_fattr4__true,
-	[FATTR4_CASE_INSENSITIVE]	= nfsd4_encode_fattr4__false,
-	[FATTR4_CASE_PRESERVING]	= nfsd4_encode_fattr4__true,
+	[FATTR4_CASE_INSENSITIVE]	= nfsd4_encode_fattr4_case_insensitive,
+	[FATTR4_CASE_PRESERVING]	= nfsd4_encode_fattr4_case_preserving,
 	[FATTR4_CHOWN_RESTRICTED]	= nfsd4_encode_fattr4__true,
 	[FATTR4_FILEHANDLE]		= nfsd4_encode_fattr4_filehandle,
 	[FATTR4_FILEID]			= nfsd4_encode_fattr4_fileid,
@@ -3758,7 +3787,7 @@ static const nfsd4_enc_attr nfsd4_enc_fattr4_encode_ops[] = {
 	[FATTR4_FILES_TOTAL]		= nfsd4_encode_fattr4_files_total,
 	[FATTR4_FS_LOCATIONS]		= nfsd4_encode_fattr4_fs_locations,
 	[FATTR4_HIDDEN]			= nfsd4_encode_fattr4__noop,
-	[FATTR4_HOMOGENEOUS]		= nfsd4_encode_fattr4__true,
+	[FATTR4_HOMOGENEOUS]		= nfsd4_encode_fattr4_homogeneous,
 	[FATTR4_MAXFILESIZE]		= nfsd4_encode_fattr4_maxfilesize,
 	[FATTR4_MAXLINK]		= nfsd4_encode_fattr4_maxlink,
 	[FATTR4_MAXNAME]		= nfsd4_encode_fattr4_maxname,
@@ -3968,6 +3997,23 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
 		args.fhp = tempfh;
 	} else
 		args.fhp = fhp;
+	if (attrmask[0] & (FATTR4_WORD0_CASE_INSENSITIVE |
+			   FATTR4_WORD0_CASE_PRESERVING)) {
+		err = nfsd_get_case_info(dentry, &args.case_insensitive,
+					 &args.case_preserving);
+		/*
+		 * SUPPORTED_ATTRS unconditionally advertises both
+		 * bits, and the Linux client treats an absent
+		 * CASE_PRESERVING in a GETATTR reply as false. When
+		 * the filesystem does not expose case state,
+		 * nfsd_get_case_info() fills POSIX defaults
+		 * (case-sensitive, case-preserving) and returns
+		 * -EOPNOTSUPP; encode those defaults so the reply
+		 * agrees with what the server claims to support.
+		 */
+		if (err && err != -EOPNOTSUPP)
+			goto out_nfserr;
+	}
 
 	if (attrmask[0] & FATTR4_WORD0_ACL) {
 		err = nfsd4_get_nfs4_acl(rqstp, dentry, &args.acl);

-- 
2.53.0


^ permalink raw reply related

* [PATCH v12 13/15] nfsd: Report export case-folding via NFSv3 PATHCONF
From: Chuck Lever @ 2026-04-29 18:07 UTC (permalink / raw)
  To: Al Viro, Christian Brauner, Jan Kara
  Cc: linux-fsdevel, linux-ext4, linux-xfs, linux-cifs, linux-nfs,
	linux-api, linux-f2fs-devel, hirofumi, linkinjeon, sj1557.seo,
	yuezhang.mo, almaz.alexandrovich, slava, glaubitz, frank.li,
	tytso, adilger.kernel, cem, sfrench, pc, ronniesahlberg, sprasad,
	trondmy, anna, jaegeuk, chao, hansg, senozhatsky, Chuck Lever,
	Roland Mainz
In-Reply-To: <20260429-case-sensitivity-v12-0-8057123bebe0@oracle.com>

From: Chuck Lever <chuck.lever@oracle.com>

The hard-coded MSDOS_SUPER_MAGIC check in nfsd3_proc_pathconf()
only recognizes FAT filesystems as case-insensitive. Modern
filesystems like F2FS, exFAT, and CIFS support case-insensitive
directories, but NFSv3 clients cannot discover this capability.

Query the export's actual case behavior through ->fileattr_get
instead. This allows NFSv3 clients to correctly handle case
sensitivity for any filesystem that implements the fileattr
interface. Filesystems without ->fileattr_get continue to report
the default POSIX behavior (case-sensitive, case-preserving).

This change depends on the earlier "fat: Implement fileattr_get
for case sensitivity" patch in this series, which ensures FAT
filesystems report their case behavior correctly via the
fileattr interface.

Reviewed-by: Roland Mainz <roland.mainz@nrubsig.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs3proc.c | 36 +++++++++++++++++++++------
 fs/nfsd/vfs.c      | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfsd/vfs.h      |  3 +++
 fs/nfsd/xdr3.h     |  4 +--
 4 files changed, 105 insertions(+), 10 deletions(-)

diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 42adc5461db0..62ebc65b8af2 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -710,23 +710,43 @@ nfsd3_proc_pathconf(struct svc_rqst *rqstp)
 	resp->p_name_max = 255;		/* at least */
 	resp->p_no_trunc = 0;
 	resp->p_chown_restricted = 1;
-	resp->p_case_insensitive = 0;
-	resp->p_case_preserving = 1;
+	resp->p_case_insensitive = false;
+	resp->p_case_preserving = true;
 
 	resp->status = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP);
 
 	if (resp->status == nfs_ok) {
 		struct super_block *sb = argp->fh.fh_dentry->d_sb;
+		int err;
 
-		/* Note that we don't care for remote fs's here */
-		switch (sb->s_magic) {
-		case EXT2_SUPER_MAGIC:
+		if (sb->s_magic == EXT2_SUPER_MAGIC) {
 			resp->p_link_max = EXT2_LINK_MAX;
 			resp->p_name_max = EXT2_NAME_LEN;
+		}
+
+		err = nfsd_get_case_info(argp->fh.fh_dentry,
+					 &resp->p_case_insensitive,
+					 &resp->p_case_preserving);
+		/*  
+		 * RFC 1813 lists NFS3ERR_STALE, NFS3ERR_BADHANDLE, and
+		 * NFS3ERR_SERVERFAULT as the only PATHCONF errors.
+		 */
+		switch (err) {
+		case 0:
+		case -EOPNOTSUPP:
+			/* Both arms leave the output booleans valid. */
 			break;
-		case MSDOS_SUPER_MAGIC:
-			resp->p_case_insensitive = 1;
-			resp->p_case_preserving  = 0;
+		case -EACCES:
+		case -EPERM:
+			/*
+			 * Policy denied the query. Report STALE so the
+			 * handle is unusable without implying a server
+			 * malfunction.
+			 */
+			resp->status = nfserr_stale;
+			break;
+		default:
+			resp->status = nfserr_serverfault;
 			break;
 		}
 	}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index eafdf7b7890f..4bd63d8efbf7 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -32,6 +32,7 @@
 #include <linux/writeback.h>
 #include <linux/security.h>
 #include <linux/sunrpc/xdr.h>
+#include <linux/fileattr.h>
 
 #include "xdr3.h"
 
@@ -2891,3 +2892,74 @@ nfsd_permission(struct svc_cred *cred, struct svc_export *exp,
 
 	return err? nfserrno(err) : 0;
 }
+
+/**
+ * nfsd_get_case_info - get case sensitivity info for a dentry
+ * @dentry: dentry to query
+ * @case_insensitive: set to true if the filesystem is case-insensitive
+ * @case_preserving: set to true if the filesystem preserves case
+ *
+ * On casefold-capable filesystems the flag lives on the directory,
+ * not on its entries, so for a non-directory @dentry the parent is
+ * queried instead. A directory (including an export root, whose
+ * parent lies outside the export) is queried as-is so its own
+ * contents' lookup behavior is reported.
+ *
+ * When the filesystem does not expose case-folding state (no
+ * ->fileattr_get, or the callback returns -EOPNOTSUPP /
+ * -ENOIOCTLCMD / -ENOTTY / -EINVAL), the outputs are filled with
+ * POSIX defaults (case-sensitive, case-preserving) on the premise
+ * that a filesystem with case-folding support wires up
+ * fileattr_get.
+ *
+ * Other errors propagate unmodified (-EACCES, -EPERM from LSM
+ * hooks; -EIO, -ESTALE, ... from the filesystem). Case-folding
+ * behavior is a property of the exported filesystem, not of the
+ * caller's credentials, so silently substituting defaults would
+ * let the same dentry report POSIX while LSM denies and report
+ * casefolding once LSM allows -- a client could race against
+ * silent name collisions on a case-insensitive export.
+ *
+ * Return: 0 with outputs filled, -EOPNOTSUPP with outputs filled
+ *         to POSIX defaults, or a negative errno with outputs
+ *         unmodified.
+ */
+int
+nfsd_get_case_info(struct dentry *dentry, bool *case_insensitive,
+		   bool *case_preserving)
+{
+	struct file_kattr fa = {};
+	struct dentry *cd;
+	bool put = false;
+	int err;
+
+	if (d_is_dir(dentry)) {
+		cd = dentry;
+	} else {
+		cd = dget_parent(dentry);
+		put = true;
+	}
+	err = vfs_fileattr_get(cd, &fa);
+	if (put)
+		dput(cd);
+	switch (err) {
+	case 0:
+		*case_insensitive = fa.fsx_xflags & FS_XFLAG_CASEFOLD;
+		*case_preserving =
+			!(fa.fsx_xflags & FS_XFLAG_CASENONPRESERVING);
+		return 0;
+	case -EINVAL:
+	case -ENOTTY:
+	case -ENOIOCTLCMD:
+	case -EOPNOTSUPP:
+		/*
+		 * Filesystem does not expose case state.
+		 * Report POSIX defaults.
+		 */
+		*case_insensitive = false;
+		*case_preserving = true;
+		return -EOPNOTSUPP;
+	default:
+		return err;
+	}
+}
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 702a844f2106..e09ea04a51b9 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -156,6 +156,9 @@ __be32		nfsd_readdir(struct svc_rqst *, struct svc_fh *,
 			     loff_t *, struct readdir_cd *, nfsd_filldir_t);
 __be32		nfsd_statfs(struct svc_rqst *, struct svc_fh *,
 				struct kstatfs *, int access);
+int		nfsd_get_case_info(struct dentry *dentry,
+				   bool *case_insensitive,
+				   bool *case_preserving);
 
 __be32		nfsd_permission(struct svc_cred *cred, struct svc_export *exp,
 				struct dentry *dentry, int acc);
diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h
index 522067b7fd75..a7c9714b0b0e 100644
--- a/fs/nfsd/xdr3.h
+++ b/fs/nfsd/xdr3.h
@@ -209,8 +209,8 @@ struct nfsd3_pathconfres {
 	__u32			p_name_max;
 	__u32			p_no_trunc;
 	__u32			p_chown_restricted;
-	__u32			p_case_insensitive;
-	__u32			p_case_preserving;
+	bool			p_case_insensitive;
+	bool			p_case_preserving;
 };
 
 struct nfsd3_commitres {

-- 
2.53.0


^ permalink raw reply related

* [PATCH v12 12/15] isofs: Implement fileattr_get for case sensitivity
From: Chuck Lever @ 2026-04-29 18:07 UTC (permalink / raw)
  To: Al Viro, Christian Brauner, Jan Kara
  Cc: linux-fsdevel, linux-ext4, linux-xfs, linux-cifs, linux-nfs,
	linux-api, linux-f2fs-devel, hirofumi, linkinjeon, sj1557.seo,
	yuezhang.mo, almaz.alexandrovich, slava, glaubitz, frank.li,
	tytso, adilger.kernel, cem, sfrench, pc, ronniesahlberg, sprasad,
	trondmy, anna, jaegeuk, chao, hansg, senozhatsky, Chuck Lever,
	Roland Mainz
In-Reply-To: <20260429-case-sensitivity-v12-0-8057123bebe0@oracle.com>

From: Chuck Lever <chuck.lever@oracle.com>

Upper layers such as NFSD need a way to query whether a
filesystem handles filenames in a case-sensitive manner so
they can provide correct semantics to remote clients. Without
this information, NFS exports of ISO 9660 filesystems cannot
advertise their filename case behavior.

Implement isofs_fileattr_get() to report ISO 9660 case handling
behavior. The 'check=r' (relaxed) mount option enables
case-insensitive lookups and is reported via FS_XFLAG_CASEFOLD.
By default, Joliet extensions operate in relaxed mode while
plain ISO 9660 uses strict (case-sensitive) mode.

Plain ISO 9660 names on the medium are uppercase. When neither
Rock Ridge nor Joliet is in effect, the default 'map=n' option
(and 'map=a') routes lookup and readdir through
isofs_name_translate(), which forces A-Z to a-z. The names
visible to userspace then differ in case from the on-disc form,
so report FS_XFLAG_CASENONPRESERVING in that configuration. Rock
Ridge and Joliet both deliver names as authored, and 'map=o'
emits the raw on-disc name unchanged, so those configurations
remain case-preserving.

Casefolding is a directory property, and the in-tree consumers
(NFSD, ksmbd) issue the query against a directory: NFSD walks
to the parent for non-directory dentries before calling
vfs_fileattr_get(), and ksmbd reports per-share attributes from
the share root. Wire .fileattr_get only on
isofs_dir_inode_operations. The CASEFOLD flag is set in both
fa->fsx_xflags and fa->flags so FS_IOC_FSGETXATTR and
FS_IOC_GETFLAGS agree.

Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Roland Mainz <roland.mainz@nrubsig.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/isofs/dir.c   | 16 ++++++++++++++++
 fs/isofs/isofs.h |  3 +++
 2 files changed, 19 insertions(+)

diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index 2fd9948d606e..55385a72a4ce 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -14,6 +14,7 @@
 #include <linux/gfp.h>
 #include <linux/filelock.h>
 #include "isofs.h"
+#include <linux/fileattr.h>
 
 int isofs_name_translate(struct iso_directory_record *de, char *new, struct inode *inode)
 {
@@ -267,6 +268,20 @@ static int isofs_readdir(struct file *file, struct dir_context *ctx)
 	return result;
 }
 
+int isofs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
+{
+	struct isofs_sb_info *sbi = ISOFS_SB(dentry->d_sb);
+
+	if (sbi->s_check == 'r') {
+		fa->fsx_xflags |= FS_XFLAG_CASEFOLD;
+		fa->flags |= FS_CASEFOLD_FL;
+	}
+	if (!sbi->s_joliet_level && !sbi->s_rock &&
+	    (sbi->s_mapping == 'n' || sbi->s_mapping == 'a'))
+		fa->fsx_xflags |= FS_XFLAG_CASENONPRESERVING;
+	return 0;
+}
+
 const struct file_operations isofs_dir_operations =
 {
 	.llseek = generic_file_llseek,
@@ -281,6 +296,7 @@ const struct file_operations isofs_dir_operations =
 const struct inode_operations isofs_dir_inode_operations =
 {
 	.lookup = isofs_lookup,
+	.fileattr_get = isofs_fileattr_get,
 };
 
 
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index 506555837533..0ec8b24a42ed 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -197,6 +197,9 @@ isofs_normalize_block_and_offset(struct iso_directory_record* de,
 	}
 }
 
+struct file_kattr;
+int isofs_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
+
 extern const struct inode_operations isofs_dir_inode_operations;
 extern const struct file_operations isofs_dir_operations;
 extern const struct address_space_operations isofs_symlink_aops;

-- 
2.53.0


^ permalink raw reply related

* [PATCH v12 11/15] vboxsf: Implement fileattr_get for case sensitivity
From: Chuck Lever @ 2026-04-29 18:07 UTC (permalink / raw)
  To: Al Viro, Christian Brauner, Jan Kara
  Cc: linux-fsdevel, linux-ext4, linux-xfs, linux-cifs, linux-nfs,
	linux-api, linux-f2fs-devel, hirofumi, linkinjeon, sj1557.seo,
	yuezhang.mo, almaz.alexandrovich, slava, glaubitz, frank.li,
	tytso, adilger.kernel, cem, sfrench, pc, ronniesahlberg, sprasad,
	trondmy, anna, jaegeuk, chao, hansg, senozhatsky, Chuck Lever,
	Roland Mainz
In-Reply-To: <20260429-case-sensitivity-v12-0-8057123bebe0@oracle.com>

From: Chuck Lever <chuck.lever@oracle.com>

Upper layers such as NFSD need a way to query whether a
filesystem handles filenames in a case-sensitive manner. Report
VirtualBox shared folder case handling behavior via the
FS_XFLAG_CASEFOLD flag.

The case sensitivity property is queried from the VirtualBox host
service at mount time and cached in struct vboxsf_sbi. The host
determines case sensitivity based on the underlying host filesystem
(for example, Windows NTFS is case-insensitive while Linux ext4 is
case-sensitive).

VirtualBox shared folders always preserve filename case exactly
as provided by the guest. The host interface does not expose a
separate case-preserving property; leaving
FS_XFLAG_CASENONPRESERVING unset reports the POSIX-default
case-preserving behavior, which matches vboxsf semantics.

The callback is registered in all three inode_operations
structures (directory, file, and symlink) to ensure consistent
reporting across all inode types.

Reviewed-by: Roland Mainz <roland.mainz@nrubsig.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/vboxsf/dir.c    |  1 +
 fs/vboxsf/file.c   |  6 ++++--
 fs/vboxsf/super.c  |  7 +++++++
 fs/vboxsf/utils.c  | 30 ++++++++++++++++++++++++++++++
 fs/vboxsf/vfsmod.h |  6 ++++++
 5 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/fs/vboxsf/dir.c b/fs/vboxsf/dir.c
index 42bedc4ec7af..c5bd3271aa96 100644
--- a/fs/vboxsf/dir.c
+++ b/fs/vboxsf/dir.c
@@ -477,4 +477,5 @@ const struct inode_operations vboxsf_dir_iops = {
 	.symlink = vboxsf_dir_symlink,
 	.getattr = vboxsf_getattr,
 	.setattr = vboxsf_setattr,
+	.fileattr_get = vboxsf_fileattr_get,
 };
diff --git a/fs/vboxsf/file.c b/fs/vboxsf/file.c
index 7a7a3fbb2651..943953867e18 100644
--- a/fs/vboxsf/file.c
+++ b/fs/vboxsf/file.c
@@ -222,7 +222,8 @@ const struct file_operations vboxsf_reg_fops = {
 
 const struct inode_operations vboxsf_reg_iops = {
 	.getattr = vboxsf_getattr,
-	.setattr = vboxsf_setattr
+	.setattr = vboxsf_setattr,
+	.fileattr_get = vboxsf_fileattr_get,
 };
 
 static int vboxsf_read_folio(struct file *file, struct folio *folio)
@@ -389,5 +390,6 @@ static const char *vboxsf_get_link(struct dentry *dentry, struct inode *inode,
 }
 
 const struct inode_operations vboxsf_lnk_iops = {
-	.get_link = vboxsf_get_link
+	.get_link = vboxsf_get_link,
+	.fileattr_get = vboxsf_fileattr_get,
 };
diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c
index a618cb093e00..a61fbab51d37 100644
--- a/fs/vboxsf/super.c
+++ b/fs/vboxsf/super.c
@@ -185,6 +185,13 @@ static int vboxsf_fill_super(struct super_block *sb, struct fs_context *fc)
 	if (err)
 		goto fail_unmap;
 
+	/*
+	 * A failed query leaves sbi->case_insensitive false, so the
+	 * mount defaults to reporting case-sensitive behavior. Do not
+	 * fail the mount over an advisory attribute.
+	 */
+	vboxsf_query_case_sensitive(sbi);
+
 	sb->s_magic = VBOXSF_SUPER_MAGIC;
 	sb->s_blocksize = 1024;
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c
index 440e8c50629d..298bfc93255c 100644
--- a/fs/vboxsf/utils.c
+++ b/fs/vboxsf/utils.c
@@ -11,6 +11,7 @@
 #include <linux/sizes.h>
 #include <linux/pagemap.h>
 #include <linux/vfs.h>
+#include <linux/fileattr.h>
 #include "vfsmod.h"
 
 struct inode *vboxsf_new_inode(struct super_block *sb)
@@ -567,3 +568,32 @@ int vboxsf_dir_read_all(struct vboxsf_sbi *sbi, struct vboxsf_dir_info *sf_d,
 
 	return err;
 }
+
+int vboxsf_query_case_sensitive(struct vboxsf_sbi *sbi)
+{
+	struct shfl_volinfo volinfo = {};
+	u32 buf_len;
+	int err;
+
+	buf_len = sizeof(volinfo);
+	err = vboxsf_fsinfo(sbi->root, 0, SHFL_INFO_GET | SHFL_INFO_VOLUME,
+			    &buf_len, &volinfo);
+	if (err)
+		return err;
+	if (buf_len < sizeof(volinfo))
+		return 0;
+
+	sbi->case_insensitive = !volinfo.properties.case_sensitive;
+	return 0;
+}
+
+int vboxsf_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
+{
+	struct vboxsf_sbi *sbi = VBOXSF_SBI(dentry->d_sb);
+
+	if (sbi->case_insensitive) {
+		fa->fsx_xflags |= FS_XFLAG_CASEFOLD;
+		fa->flags |= FS_CASEFOLD_FL;
+	}
+	return 0;
+}
diff --git a/fs/vboxsf/vfsmod.h b/fs/vboxsf/vfsmod.h
index 05973eb89d52..b61afd0ce842 100644
--- a/fs/vboxsf/vfsmod.h
+++ b/fs/vboxsf/vfsmod.h
@@ -47,6 +47,7 @@ struct vboxsf_sbi {
 	u32 next_generation;
 	u32 root;
 	int bdi_id;
+	bool case_insensitive;
 };
 
 /* per-inode information */
@@ -111,6 +112,11 @@ void vboxsf_dir_info_free(struct vboxsf_dir_info *p);
 int vboxsf_dir_read_all(struct vboxsf_sbi *sbi, struct vboxsf_dir_info *sf_d,
 			u64 handle);
 
+int vboxsf_query_case_sensitive(struct vboxsf_sbi *sbi);
+
+struct file_kattr;
+int vboxsf_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
+
 /* from vboxsf_wrappers.c */
 int vboxsf_connect(void);
 void vboxsf_disconnect(void);

-- 
2.53.0


^ permalink raw reply related

* [PATCH v12 10/15] nfs: Implement fileattr_get for case sensitivity
From: Chuck Lever @ 2026-04-29 18:07 UTC (permalink / raw)
  To: Al Viro, Christian Brauner, Jan Kara
  Cc: linux-fsdevel, linux-ext4, linux-xfs, linux-cifs, linux-nfs,
	linux-api, linux-f2fs-devel, hirofumi, linkinjeon, sj1557.seo,
	yuezhang.mo, almaz.alexandrovich, slava, glaubitz, frank.li,
	tytso, adilger.kernel, cem, sfrench, pc, ronniesahlberg, sprasad,
	trondmy, anna, jaegeuk, chao, hansg, senozhatsky, Chuck Lever,
	Roland Mainz
In-Reply-To: <20260429-case-sensitivity-v12-0-8057123bebe0@oracle.com>

From: Chuck Lever <chuck.lever@oracle.com>

An NFS server re-exporting an NFS mount point needs to report
the case sensitivity behavior of the underlying filesystem to
its clients. NFSD's attribute encoder obtains that information
by calling vfs_fileattr_get() on the lower filesystem, so the
NFS client must implement fileattr_get to surface what it
learned from its own server.

The NFS client already retrieves case sensitivity information
from servers during mount via PATHCONF (NFSv3) or the
FATTR4_CASE_INSENSITIVE/FATTR4_CASE_PRESERVING attributes
(NFSv4). Expose this information through fileattr_get by
reporting the FS_XFLAG_CASEFOLD and FS_XFLAG_CASENONPRESERVING
flags. NFSv2 lacks PATHCONF support, so mounts using that protocol
version default to standard POSIX behavior: case-sensitive and
case-preserving.

The pathconf operation is now invoked unconditionally for NFSv2 and
NFSv3 mounts (for NFSv2 it is a local stub that fills in fixed
defaults rather than issuing an RPC) so the case-sensitivity
capabilities are established even when the user pins server->namelen
with the namlen= mount option. That option is orthogonal to case
handling, and skipping PATHCONF because namelen was already known
would leave the caps unset.

The two capability bits carry opposite polarity relative to the
protocol attributes they derive from, because the POSIX defaults of
those attributes differ. Most servers are case-sensitive and case-
preserving, matching "neither xflag set." NFS_CAP_CASE_INSENSITIVE
is set only when the server affirms case insensitivity, so "server
said no" and "server did not answer" both collapse to the case-
sensitive default. NFS_CAP_CASE_NONPRESERVING follows the same
pattern in the opposite direction: set only when the server affirms
that it does not preserve case, so that silence or a missing
attribute lands on the case-preserving default. The NFSv4 probe
checks res.attr_bitmask[0] to distinguish "server said false" from
"server omitted the attribute" before setting the bit.

Both capability bits are cleared at the start of each successful
probe so a remount or NFSv4 transparent state migration to a server
with different case semantics does not retain stale capabilities
from the prior probe.

Reviewed-by: Roland Mainz <roland.mainz@nrubsig.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfs/client.c           | 25 ++++++++++++++++++-------
 fs/nfs/inode.c            | 15 +++++++++++++++
 fs/nfs/internal.h         |  3 +++
 fs/nfs/namespace.c        |  2 ++
 fs/nfs/nfs3proc.c         |  2 ++
 fs/nfs/nfs3xdr.c          |  7 +++++--
 fs/nfs/nfs4proc.c         | 10 +++++++---
 fs/nfs/proc.c             |  3 +++
 fs/nfs/symlink.c          |  3 +++
 include/linux/nfs_fs_sb.h |  2 +-
 include/linux/nfs_xdr.h   |  2 ++
 11 files changed, 61 insertions(+), 13 deletions(-)

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index be02bb227741..7ca16fc72689 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -914,6 +914,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
  */
 static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fattr *fattr)
 {
+	struct nfs_pathconf pathinfo = { };
 	struct nfs_fsinfo fsinfo;
 	struct nfs_client *clp = server->nfs_client;
 	int error;
@@ -933,15 +934,25 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str
 
 	nfs_server_set_fsinfo(server, &fsinfo);
 
-	/* Get some general file system info */
-	if (server->namelen == 0) {
-		struct nfs_pathconf pathinfo;
+	pathinfo.fattr = fattr;
+	nfs_fattr_init(fattr);
 
-		pathinfo.fattr = fattr;
-		nfs_fattr_init(fattr);
-
-		if (clp->rpc_ops->pathconf(server, mntfh, &pathinfo) >= 0)
+	if (clp->rpc_ops->pathconf(server, mntfh, &pathinfo) >= 0) {
+		if (server->namelen == 0)
 			server->namelen = pathinfo.max_namelen;
+		/*
+		 * Clear the bits before re-OR'ing so a remount
+		 * against a server with different case semantics
+		 * does not retain stale caps.
+		 */
+		if (clp->rpc_ops->version < 4) {
+			server->caps &= ~(NFS_CAP_CASE_INSENSITIVE |
+					  NFS_CAP_CASE_NONPRESERVING);
+			if (pathinfo.case_insensitive)
+				server->caps |= NFS_CAP_CASE_INSENSITIVE;
+			if (!pathinfo.case_preserving)
+				server->caps |= NFS_CAP_CASE_NONPRESERVING;
+		}
 	}
 
 	if (clp->rpc_ops->discover_trunking != NULL &&
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 98a8f0de1199..fdcbe6f2052c 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -41,6 +41,7 @@
 #include <linux/freezer.h>
 #include <linux/uaccess.h>
 #include <linux/iversion.h>
+#include <linux/fileattr.h>
 
 #include "nfs4_fs.h"
 #include "callback.h"
@@ -1101,6 +1102,20 @@ int nfs_getattr(struct mnt_idmap *idmap, const struct path *path,
 }
 EXPORT_SYMBOL_GPL(nfs_getattr);
 
+int nfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
+{
+	struct inode *inode = d_inode(dentry);
+
+	if (nfs_server_capable(inode, NFS_CAP_CASE_INSENSITIVE)) {
+		fa->fsx_xflags |= FS_XFLAG_CASEFOLD;
+		fa->flags |= FS_CASEFOLD_FL;
+	}
+	if (nfs_server_capable(inode, NFS_CAP_CASE_NONPRESERVING))
+		fa->fsx_xflags |= FS_XFLAG_CASENONPRESERVING;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_fileattr_get);
+
 static void nfs_init_lock_context(struct nfs_lock_context *l_ctx)
 {
 	refcount_set(&l_ctx->count, 1);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index fc5456377160..309d3f679bb3 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -449,6 +449,9 @@ extern void nfs_set_cache_invalid(struct inode *inode, unsigned long flags);
 extern bool nfs_check_cache_invalid(struct inode *, unsigned long);
 extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode);
 
+struct file_kattr;
+int nfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
+
 #if IS_ENABLED(CONFIG_NFS_LOCALIO)
 /* localio.c */
 struct nfs_local_dio {
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index af9be0c5f516..6d0073c24771 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -246,11 +246,13 @@ nfs_namespace_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
 const struct inode_operations nfs_mountpoint_inode_operations = {
 	.getattr	= nfs_getattr,
 	.setattr	= nfs_setattr,
+	.fileattr_get	= nfs_fileattr_get,
 };
 
 const struct inode_operations nfs_referral_inode_operations = {
 	.getattr	= nfs_namespace_getattr,
 	.setattr	= nfs_namespace_setattr,
+	.fileattr_get	= nfs_fileattr_get,
 };
 
 static void nfs_expire_automounts(struct work_struct *work)
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 95d7cd564b74..b80d0c5efc27 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -1053,6 +1053,7 @@ static const struct inode_operations nfs3_dir_inode_operations = {
 	.permission	= nfs_permission,
 	.getattr	= nfs_getattr,
 	.setattr	= nfs_setattr,
+	.fileattr_get	= nfs_fileattr_get,
 #ifdef CONFIG_NFS_V3_ACL
 	.listxattr	= nfs3_listxattr,
 	.get_inode_acl	= nfs3_get_acl,
@@ -1064,6 +1065,7 @@ static const struct inode_operations nfs3_file_inode_operations = {
 	.permission	= nfs_permission,
 	.getattr	= nfs_getattr,
 	.setattr	= nfs_setattr,
+	.fileattr_get	= nfs_fileattr_get,
 #ifdef CONFIG_NFS_V3_ACL
 	.listxattr	= nfs3_listxattr,
 	.get_inode_acl	= nfs3_get_acl,
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index e17d72908412..e745e78faab0 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -2276,8 +2276,11 @@ static int decode_pathconf3resok(struct xdr_stream *xdr,
 	if (unlikely(!p))
 		return -EIO;
 	result->max_link = be32_to_cpup(p++);
-	result->max_namelen = be32_to_cpup(p);
-	/* ignore remaining fields */
+	result->max_namelen = be32_to_cpup(p++);
+	p++;	/* ignore no_trunc */
+	p++;	/* ignore chown_restricted */
+	result->case_insensitive = be32_to_cpup(p++) != 0;
+	result->case_preserving = be32_to_cpup(p) != 0;
 	return 0;
 }
 
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d839a97df822..62f66684fbc8 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3933,7 +3933,8 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
 		server->caps &=
 			~(NFS_CAP_ACLS | NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS |
 			  NFS_CAP_SECURITY_LABEL | NFS_CAP_FS_LOCATIONS |
-			  NFS_CAP_OPEN_XOR | NFS_CAP_DELEGTIME);
+			  NFS_CAP_OPEN_XOR | NFS_CAP_DELEGTIME |
+			  NFS_CAP_CASE_INSENSITIVE | NFS_CAP_CASE_NONPRESERVING);
 		server->fattr_valid = NFS_ATTR_FATTR_V4;
 		if (res.attr_bitmask[0] & FATTR4_WORD0_ACL &&
 				res.acl_bitmask & ACL4_SUPPORT_ALLOW_ACL)
@@ -3944,8 +3945,9 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
 			server->caps |= NFS_CAP_SYMLINKS;
 		if (res.case_insensitive)
 			server->caps |= NFS_CAP_CASE_INSENSITIVE;
-		if (res.case_preserving)
-			server->caps |= NFS_CAP_CASE_PRESERVING;
+		if ((res.attr_bitmask[0] & FATTR4_WORD0_CASE_PRESERVING) &&
+		    !res.case_preserving)
+			server->caps |= NFS_CAP_CASE_NONPRESERVING;
 #ifdef CONFIG_NFS_V4_SECURITY_LABEL
 		if (res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL)
 			server->caps |= NFS_CAP_SECURITY_LABEL;
@@ -10598,6 +10600,7 @@ static const struct inode_operations nfs4_dir_inode_operations = {
 	.getattr	= nfs_getattr,
 	.setattr	= nfs_setattr,
 	.listxattr	= nfs4_listxattr,
+	.fileattr_get	= nfs_fileattr_get,
 };
 
 static const struct inode_operations nfs4_file_inode_operations = {
@@ -10605,6 +10608,7 @@ static const struct inode_operations nfs4_file_inode_operations = {
 	.getattr	= nfs_getattr,
 	.setattr	= nfs_setattr,
 	.listxattr	= nfs4_listxattr,
+	.fileattr_get	= nfs_fileattr_get,
 };
 
 static struct nfs_server *nfs4_clone_server(struct nfs_server *source,
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 70795684b8e8..03c2c1f31be9 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -598,6 +598,7 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
 {
 	info->max_link = 0;
 	info->max_namelen = NFS2_MAXNAMLEN;
+	info->case_preserving = true;
 	return 0;
 }
 
@@ -718,12 +719,14 @@ static const struct inode_operations nfs_dir_inode_operations = {
 	.permission	= nfs_permission,
 	.getattr	= nfs_getattr,
 	.setattr	= nfs_setattr,
+	.fileattr_get	= nfs_fileattr_get,
 };
 
 static const struct inode_operations nfs_file_inode_operations = {
 	.permission	= nfs_permission,
 	.getattr	= nfs_getattr,
 	.setattr	= nfs_setattr,
+	.fileattr_get	= nfs_fileattr_get,
 };
 
 const struct nfs_rpc_ops nfs_v2_clientops = {
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 58146e935402..74a072896f8d 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -22,6 +22,8 @@
 #include <linux/mm.h>
 #include <linux/string.h>
 
+#include "internal.h"
+
 /* Symlink caching in the page cache is even more simplistic
  * and straight-forward than readdir caching.
  */
@@ -74,4 +76,5 @@ const struct inode_operations nfs_symlink_inode_operations = {
 	.get_link	= nfs_get_link,
 	.getattr	= nfs_getattr,
 	.setattr	= nfs_setattr,
+	.fileattr_get	= nfs_fileattr_get,
 };
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 4daee27fa5eb..34d294774f8c 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -306,7 +306,7 @@ struct nfs_server {
 #define NFS_CAP_ATOMIC_OPEN	(1U << 4)
 #define NFS_CAP_LGOPEN		(1U << 5)
 #define NFS_CAP_CASE_INSENSITIVE	(1U << 6)
-#define NFS_CAP_CASE_PRESERVING	(1U << 7)
+#define NFS_CAP_CASE_NONPRESERVING	(1U << 7)
 #define NFS_CAP_REBOOT_LAYOUTRETURN	(1U << 8)
 #define NFS_CAP_OFFLOAD_STATUS	(1U << 9)
 #define NFS_CAP_ZERO_RANGE	(1U << 10)
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index ff1f12aa73d2..7c2057e40f99 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -182,6 +182,8 @@ struct nfs_pathconf {
 	struct nfs_fattr	*fattr; /* Post-op attributes */
 	__u32			max_link; /* max # of hard links */
 	__u32			max_namelen; /* max name length */
+	bool			case_insensitive;
+	bool			case_preserving;
 };
 
 struct nfs4_change_info {

-- 
2.53.0


^ permalink raw reply related

* [PATCH v12 09/15] cifs: Implement fileattr_get for case sensitivity
From: Chuck Lever @ 2026-04-29 18:07 UTC (permalink / raw)
  To: Al Viro, Christian Brauner, Jan Kara
  Cc: linux-fsdevel, linux-ext4, linux-xfs, linux-cifs, linux-nfs,
	linux-api, linux-f2fs-devel, hirofumi, linkinjeon, sj1557.seo,
	yuezhang.mo, almaz.alexandrovich, slava, glaubitz, frank.li,
	tytso, adilger.kernel, cem, sfrench, pc, ronniesahlberg, sprasad,
	trondmy, anna, jaegeuk, chao, hansg, senozhatsky, Chuck Lever,
	Steve French, Roland Mainz
In-Reply-To: <20260429-case-sensitivity-v12-0-8057123bebe0@oracle.com>

From: Chuck Lever <chuck.lever@oracle.com>

Upper layers such as NFSD need a way to query whether a filesystem
handles filenames in a case-sensitive manner. Report CIFS/SMB case
handling behavior via FS_XFLAG_CASEFOLD and
FS_XFLAG_CASENONPRESERVING.

The authoritative source is the server itself: at mount time CIFS
issues QueryFSInfo(FS_ATTRIBUTE_INFORMATION) and caches the reply
on the tcon. That reply carries FILE_CASE_SENSITIVE_SEARCH and
FILE_CASE_PRESERVED_NAMES, which reflect whatever case handling
the share actually implements after SMB3.1.1 POSIX extensions
negotiation. Translating those two bits into the VFS flags lets
cifs_fileattr_get report what the server advertises rather than
what the client was asked to pretend.

QueryFSInfo is best-effort; the mount completes even if the server
does not answer. MaxPathNameComponentLength is zero in that case
and is used as the "no reply received" sentinel. When no reply is
available, fall back to the nocase mount option so that the reported
behavior agrees with the dentry comparison operations installed on
the superblock.

The callback is registered on cifs_dir_inode_ops so that NFSD,
ksmbd, and other consumers querying case handling against a
directory get a definitive answer, and on cifs_file_inode_ops to
preserve FS_COMPR_FL reporting on regular files. cifs_set_ops()
also installs cifs_namespace_inode_operations on DFS referral
directories that carry IS_AUTOMOUNT; register the same callback
there so the answer does not depend on whether the directory is
a referral point.

Registering fileattr_get routes FS_IOC_GETFLAGS through
vfs_fileattr_get() and short-circuits the syscall's fallback to
cifs_ioctl(). That fallback invoked CIFSGetExtAttr() under
CONFIG_CIFS_POSIX and CONFIG_CIFS_ALLOW_INSECURE_LEGACY on servers
advertising CIFS_UNIX_EXTATTR_CAP, surfacing the SMB1 Unix-extension
immutable, append, and nodump bits. cifs_fileattr_get carries over
only FS_COMPR_FL from cached cifsAttrs; the SMB1 extattr fetch is
not reproduced. SMB1 is deprecated, and acquiring a netfid from
within a dentry-only callback is not worth preserving a path tied
to an insecure legacy dialect.

Acked-by: Steve French <stfrench@microsoft.com>
Reviewed-by: Roland Mainz <roland.mainz@nrubsig.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/smb/client/cifsfs.c    | 41 +++++++++++++++++++++++++++++++++++++++++
 fs/smb/client/cifsfs.h    |  3 +++
 fs/smb/client/namespace.c |  1 +
 3 files changed, 45 insertions(+)

diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index 2025739f070a..0912d74e32de 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -30,6 +30,7 @@
 #include <linux/xattr.h>
 #include <linux/mm.h>
 #include <linux/key-type.h>
+#include <linux/fileattr.h>
 #include <uapi/linux/magic.h>
 #include <net/ipv6.h>
 #include "cifsfs.h"
@@ -1199,6 +1200,44 @@ struct file_system_type smb3_fs_type = {
 MODULE_ALIAS_FS("smb3");
 MODULE_ALIAS("smb3");
 
+int cifs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
+{
+	struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb);
+	struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+	u32 attrs = le32_to_cpu(tcon->fsAttrInfo.Attributes);
+
+	/* Preserve FS_COMPR_FL previously reported by cifs_ioctl(). */
+	if (CIFS_I(d_inode(dentry))->cifsAttrs & ATTR_COMPRESSED)
+		fa->flags |= FS_COMPR_FL;
+
+	/*
+	 * The server's FS_ATTRIBUTE_INFORMATION response, cached on
+	 * the tcon at mount, reflects the share's case-handling
+	 * semantics after any POSIX extensions negotiation. Prefer
+	 * it over the client-local nocase mount option, which only
+	 * governs dentry comparison on this superblock.
+	 *
+	 * QueryFSInfo is best-effort at mount; when it did not
+	 * populate fsAttrInfo, MaxPathNameComponentLength remains
+	 * zero. In that case fall back to nocase so the reporting
+	 * matches the comparison behavior installed on the sb.
+	 */
+	if (le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength) == 0) {
+		if (tcon->nocase) {
+			fa->fsx_xflags |= FS_XFLAG_CASEFOLD;
+			fa->flags |= FS_CASEFOLD_FL;
+		}
+		return 0;
+	}
+	if (!(attrs & FILE_CASE_SENSITIVE_SEARCH)) {
+		fa->fsx_xflags |= FS_XFLAG_CASEFOLD;
+		fa->flags |= FS_CASEFOLD_FL;
+	}
+	if (!(attrs & FILE_CASE_PRESERVED_NAMES))
+		fa->fsx_xflags |= FS_XFLAG_CASENONPRESERVING;
+	return 0;
+}
+
 const struct inode_operations cifs_dir_inode_ops = {
 	.create = cifs_create,
 	.atomic_open = cifs_atomic_open,
@@ -1217,6 +1256,7 @@ const struct inode_operations cifs_dir_inode_ops = {
 	.listxattr = cifs_listxattr,
 	.get_acl = cifs_get_acl,
 	.set_acl = cifs_set_acl,
+	.fileattr_get = cifs_fileattr_get,
 };
 
 const struct inode_operations cifs_file_inode_ops = {
@@ -1227,6 +1267,7 @@ const struct inode_operations cifs_file_inode_ops = {
 	.fiemap = cifs_fiemap,
 	.get_acl = cifs_get_acl,
 	.set_acl = cifs_set_acl,
+	.fileattr_get = cifs_fileattr_get,
 };
 
 const char *cifs_get_link(struct dentry *dentry, struct inode *inode,
diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h
index 7370b38da938..5f0d459d1a89 100644
--- a/fs/smb/client/cifsfs.h
+++ b/fs/smb/client/cifsfs.h
@@ -89,6 +89,9 @@ extern const struct inode_operations cifs_file_inode_ops;
 extern const struct inode_operations cifs_symlink_inode_ops;
 extern const struct inode_operations cifs_namespace_inode_operations;
 
+struct file_kattr;
+int cifs_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
+
 
 /* Functions related to files and directories */
 extern const struct netfs_request_ops cifs_req_ops;
diff --git a/fs/smb/client/namespace.c b/fs/smb/client/namespace.c
index 52a520349cb7..52a51b032fae 100644
--- a/fs/smb/client/namespace.c
+++ b/fs/smb/client/namespace.c
@@ -294,4 +294,5 @@ struct vfsmount *cifs_d_automount(struct path *path)
 }
 
 const struct inode_operations cifs_namespace_inode_operations = {
+	.fileattr_get	= cifs_fileattr_get,
 };

-- 
2.53.0


^ permalink raw reply related

* [PATCH v12 08/15] xfs: Report case sensitivity in fileattr_get
From: Chuck Lever @ 2026-04-29 18:07 UTC (permalink / raw)
  To: Al Viro, Christian Brauner, Jan Kara
  Cc: linux-fsdevel, linux-ext4, linux-xfs, linux-cifs, linux-nfs,
	linux-api, linux-f2fs-devel, hirofumi, linkinjeon, sj1557.seo,
	yuezhang.mo, almaz.alexandrovich, slava, glaubitz, frank.li,
	tytso, adilger.kernel, cem, sfrench, pc, ronniesahlberg, sprasad,
	trondmy, anna, jaegeuk, chao, hansg, senozhatsky, Chuck Lever,
	Roland Mainz
In-Reply-To: <20260429-case-sensitivity-v12-0-8057123bebe0@oracle.com>

From: Chuck Lever <chuck.lever@oracle.com>

Upper layers such as NFSD need to query whether a filesystem
is case-sensitive. Add FS_XFLAG_CASEFOLD to xfs_ip2xflags()
when the filesystem is formatted with the ASCIICI feature
flag. This serves both FS_IOC_FSGETXATTR (via xfs_fill_fsxattr()
in xfs_fileattr_get()) and XFS_IOC_BULKSTAT (which populates
bs_xflags directly from xfs_ip2xflags()), so bulkstat consumers
and per-inode queries see a consistent view of the filesystem's
case-folding behavior.

FS_XFLAG_CASEFOLD is read-only: FS_XFLAG_RDONLY_MASK ensures
FS_IOC_FSSETXATTR strips it, and xfs_flags2diflags() has no
clause for CASEFOLD so the on-disk diflags are unaffected.
The legacy FS_IOC_SETFLAGS path in xfs_fileattr_set() also
allows FS_CASEFOLD_FL through its allowlist on ASCIICI
filesystems so that a chattr read-modify-write cycle does
not fail with EOPNOTSUPP.

XFS always preserves case. XFS is case-sensitive by default,
but supports ASCII case-insensitive lookups when formatted
with the ASCIICI feature flag.

Reviewed-by: Roland Mainz <roland.mainz@nrubsig.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/xfs/libxfs/xfs_inode_util.c |  2 ++
 fs/xfs/xfs_ioctl.c             | 20 +++++++++++++++++---
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c
index 551fa51befb6..82be54b6f8d3 100644
--- a/fs/xfs/libxfs/xfs_inode_util.c
+++ b/fs/xfs/libxfs/xfs_inode_util.c
@@ -130,6 +130,8 @@ xfs_ip2xflags(
 
 	if (xfs_inode_has_attr_fork(ip))
 		flags |= FS_XFLAG_HASATTR;
+	if (xfs_has_asciici(ip->i_mount))
+		flags |= FS_XFLAG_CASEFOLD;
 	return flags;
 }
 
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index ed9b4846c05f..f8216f74679f 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -755,9 +755,23 @@ xfs_fileattr_set(
 	trace_xfs_ioctl_setattr(ip);
 
 	if (!fa->fsx_valid) {
-		if (fa->flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL |
-				  FS_NOATIME_FL | FS_NODUMP_FL |
-				  FS_SYNC_FL | FS_DAX_FL | FS_PROJINHERIT_FL))
+		unsigned int allowed = FS_IMMUTABLE_FL | FS_APPEND_FL |
+				       FS_NOATIME_FL | FS_NODUMP_FL |
+				       FS_SYNC_FL | FS_DAX_FL |
+				       FS_PROJINHERIT_FL;
+
+		/*
+		 * FS_CASEFOLD_FL reflects the ASCIICI superblock feature,
+		 * a read-only property. Accept it as a no-op so chattr's
+		 * RMW round-trip succeeds; reject any attempt to enable
+		 * it on a non-ASCIICI filesystem. xfs_flags2diflags()
+		 * has no clause for CASEFOLD, so the bit is dropped from
+		 * the on-disk diflags regardless.
+		 */
+		if (xfs_has_asciici(mp))
+			allowed |= FS_CASEFOLD_FL;
+
+		if (fa->flags & ~allowed)
 			return -EOPNOTSUPP;
 	}
 

-- 
2.53.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox