Linux EXT4 FS development
 help / color / mirror / Atom feed
* [PATCH 2/6] iocache: add the actual buffer cache
From: Darrick J. Wong @ 2026-06-25 19:39 UTC (permalink / raw)
  To: tytso; +Cc: linux-ext4
In-Reply-To: <178241607127.1810839.16661954075518963408.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

Wire up buffer caching into our new caching IO manager.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 lib/support/iocache.c |  482 +++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 460 insertions(+), 22 deletions(-)


diff --git a/lib/support/iocache.c b/lib/support/iocache.c
index 2148a9d93a4285..59b71306f4dd41 100644
--- a/lib/support/iocache.c
+++ b/lib/support/iocache.c
@@ -9,46 +9,287 @@
  * %End-Header%
  */
 #include "config.h"
+#include <assert.h>
+#include <stdbool.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <limits.h>
 #include "ext2fs/ext2_fs.h"
 #include "ext2fs/ext2fs.h"
 #include "ext2fs/ext2fsP.h"
 #include "support/iocache.h"
+#include "support/list.h"
+#include "support/cache.h"
 
 #define IOCACHE_IO_CHANNEL_MAGIC	0x424F5254	/* BORT */
 
 static io_manager iocache_backing_manager;
 
+static inline uint64_t B_TO_FSBT(io_channel channel, uint64_t number) {
+	return number / channel->block_size;
+}
+
+static inline uint64_t B_TO_FSB(io_channel channel, uint64_t number) {
+	return (number + channel->block_size - 1) / channel->block_size;
+}
+
 struct iocache_private_data {
 	int			magic;
-	io_channel		real;
+	io_channel		real;		/* lower level io channel */
+	io_channel		channel;	/* cache channel */
+	struct cache		cache;
+	pthread_mutex_t		stats_lock;
+	struct struct_io_stats	io_stats;
+	unsigned long long	write_errors;
 };
 
+#define IOCACHEDATA(cache) \
+	(container_of(cache, struct iocache_private_data, cache))
+
 static struct iocache_private_data *IOCACHE(io_channel channel)
 {
 	return (struct iocache_private_data *)channel->private_data;
 }
 
-static errcode_t iocache_read_error(io_channel channel, unsigned long block,
-				    int count, void *data, size_t size,
-				    int actual_bytes_read, errcode_t error)
+struct iocache_buf {
+	struct cache_node	node;
+	struct list_head	list;
+	blk64_t			block;
+	void			*buf;
+	errcode_t		write_error;
+	unsigned int		uptodate:1;
+	unsigned int		dirty:1;
+};
+
+static inline void iocache_buf_lock(struct iocache_buf *ubuf)
+{
+	pthread_mutex_lock(&ubuf->node.cn_mutex);
+}
+
+static inline void iocache_buf_unlock(struct iocache_buf *ubuf)
+{
+	pthread_mutex_unlock(&ubuf->node.cn_mutex);
+}
+
+struct iocache_key {
+	blk64_t			block;
+};
+
+#define IOKEY(key)	((struct iocache_key *)(key))
+#define IOBUF(node)	(container_of((node), struct iocache_buf, node))
+
+static unsigned int
+iocache_hash(cache_key_t key, unsigned int hashsize, unsigned int hashshift)
+{
+	uint64_t	hashval = IOKEY(key)->block;
+	uint64_t	tmp;
+
+	tmp = hashval ^ (GOLDEN_RATIO_PRIME + hashval) / CACHE_LINE_SIZE;
+	tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> hashshift);
+	return tmp % hashsize;
+}
+
+static int iocache_compare(struct cache_node *node, cache_key_t key)
+{
+	struct iocache_buf *ubuf = IOBUF(node);
+	struct iocache_key *ukey = IOKEY(key);
+
+	if (ubuf->block == ukey->block)
+		return CACHE_HIT;
+
+	return CACHE_MISS;
+}
+
+static struct cache_node *iocache_alloc_node(struct cache *cache,
+					     cache_key_t key)
+{
+	struct iocache_private_data *data = IOCACHEDATA(cache);
+	struct iocache_key *ukey = IOKEY(key);
+	struct iocache_buf *ubuf;
+	errcode_t retval;
+
+	retval = ext2fs_get_mem(sizeof(struct iocache_buf), &ubuf);
+	if (retval)
+		return NULL;
+	memset(ubuf, 0, sizeof(*ubuf));
+
+	retval = io_channel_alloc_buf(data->channel, 0, &ubuf->buf);
+	if (retval) {
+		free(ubuf);
+		return NULL;
+	}
+	memset(ubuf->buf, 0, data->channel->block_size);
+
+	INIT_LIST_HEAD(&ubuf->list);
+	ubuf->block = ukey->block;
+	return &ubuf->node;
+}
+
+static bool iocache_flush_node(struct cache *cache, struct cache_node *node)
+{
+	struct iocache_private_data *data = IOCACHEDATA(cache);
+	struct iocache_buf *ubuf = IOBUF(node);
+	errcode_t retval;
+
+	if (ubuf->dirty) {
+		retval = io_channel_write_blk64(data->real, ubuf->block, 1,
+						ubuf->buf);
+		if (retval) {
+			ubuf->write_error = retval;
+			data->write_errors++;
+		} else {
+			ubuf->dirty = 0;
+			ubuf->write_error = 0;
+		}
+	}
+
+	return ubuf->dirty;
+}
+
+static void iocache_relse(struct cache *cache, struct cache_node *node)
+{
+	struct iocache_buf *ubuf = IOBUF(node);
+
+	ext2fs_free_mem(&ubuf->buf);
+	ext2fs_free_mem(&ubuf);
+}
+
+static unsigned int iocache_bulkrelse(struct cache *cache,
+				      struct list_head *list)
+{
+	struct cache_node *cn, *n;
+	int count = 0;
+
+	if (list_empty(list))
+		return 0;
+
+	list_for_each_entry_safe(cn, n, list, cn_mru) {
+		iocache_relse(cache, cn);
+		count++;
+	}
+
+	return count;
+}
+
+/* Flush all dirty buffers in the cache to disk. */
+static errcode_t iocache_flush_cache(struct iocache_private_data *data)
+{
+	return cache_flush(&data->cache) ? 0 : EIO;
+}
+
+/* Flush dirty buffers in [block, block + count); EIO if any stay dirty. */
+static errcode_t iocache_flush_range(struct iocache_private_data *data,
+				     blk64_t block, uint64_t count)
+{
+	uint64_t i;
+	bool still_dirty = false;
+
+	for (i = 0; i < count; i++) {
+		struct iocache_key ukey = {
+			.block = block + i,
+		};
+		struct cache_node *node;
+
+		cache_node_get(&data->cache, &ukey, CACHE_GET_INCORE,
+			       &node);
+		if (!node)
+			continue;
+
+		/* hold cn_mutex across the node flush, as cache_flush does */
+		pthread_mutex_lock(&node->cn_mutex);
+		still_dirty |= iocache_flush_node(&data->cache, node);
+		pthread_mutex_unlock(&node->cn_mutex);
+
+		cache_node_put(&data->cache, node);
+	}
+
+	return still_dirty ? EIO : 0;
+}
+
+static void iocache_add_list(struct cache *cache, struct cache_node *node,
+			     void *data)
+{
+	struct iocache_buf *ubuf = IOBUF(node);
+	struct list_head *list = data;
+
+	assert(node->cn_count == 0 || node->cn_count == 1);
+
+	iocache_buf_lock(ubuf);
+	cache_node_grab(cache, node);
+	list_add_tail(&ubuf->list, list);
+	iocache_buf_unlock(ubuf);
+}
+
+static void iocache_invalidate_bufs(struct iocache_private_data *data,
+				    struct list_head *list)
+{
+	struct iocache_buf *ubuf, *n;
+
+	list_for_each_entry_safe(ubuf, n, list, list) {
+		struct iocache_key ukey = {
+			.block = ubuf->block,
+		};
+
+		assert(ubuf->node.cn_count == 1);
+
+		iocache_buf_lock(ubuf);
+		ubuf->dirty = 0;
+		list_del_init(&ubuf->list);
+		iocache_buf_unlock(ubuf);
+
+		cache_node_put(&data->cache, &ubuf->node);
+		cache_node_purge(&data->cache, &ukey, &ubuf->node);
+	}
+}
+
+/*
+ * Remove all blocks from the cache.  Dirty contents are discarded.  Buffer
+ * refcounts must be zero!
+ */
+static void iocache_invalidate_cache(struct iocache_private_data *data)
 {
-	io_channel iocache_channel = channel->app_data;
+	LIST_HEAD(list);
 
-	return iocache_channel->read_error(iocache_channel, block, count, data,
-					   size, actual_bytes_read, error);
+	cache_walk(&data->cache, iocache_add_list, &list);
+	iocache_invalidate_bufs(data, &list);
 }
 
-static errcode_t iocache_write_error(io_channel channel, unsigned long block,
-				     int count, const void *data, size_t size,
-				     int actual_bytes_written,
-				     errcode_t error)
+/*
+ * Remove a range of blocks from the cache.  Dirty contents are discarded.
+ * Buffer refcounts must be zero!
+ */
+static void iocache_invalidate_range(struct iocache_private_data *data,
+				     blk64_t block, uint64_t count)
 {
-	io_channel iocache_channel = channel->app_data;
+	LIST_HEAD(list);
+	uint64_t i;
 
-	return iocache_channel->write_error(iocache_channel, block, count, data,
-					    size, actual_bytes_written, error);
+	for (i = 0; i < count; i++) {
+		struct iocache_key ukey = {
+			.block = block + i,
+		};
+		struct cache_node *node;
+
+		cache_node_get(&data->cache, &ukey, CACHE_GET_INCORE,
+			       &node);
+		if (node) {
+			iocache_add_list(&data->cache, node, &list);
+			cache_node_put(&data->cache, node);
+		}
+	}
+	iocache_invalidate_bufs(data, &list);
 }
 
+static const struct cache_operations iocache_ops = {
+	.hash		= iocache_hash,
+	.alloc		= iocache_alloc_node,
+	.flush		= iocache_flush_node,
+	.relse		= iocache_relse,
+	.compare	= iocache_compare,
+	.bulkrelse	= iocache_bulkrelse,
+	.resize		= cache_gradual_resize,
+};
+
 static errcode_t iocache_open(const char *name, int flags, io_channel *channel)
 {
 	io_channel	io = NULL;
@@ -65,6 +306,9 @@ static errcode_t iocache_open(const char *name, int flags, io_channel *channel)
 	if (retval)
 		return retval;
 
+	/* disable any static cache in the lower io manager */
+	io_channel_set_options(real, "cache=off");
+
 	retval = ext2fs_get_mem(sizeof(struct struct_io_channel), &io);
 	if (retval)
 		goto out_backing;
@@ -76,12 +320,19 @@ static errcode_t iocache_open(const char *name, int flags, io_channel *channel)
 		goto out_channel;
 	memset(data, 0, sizeof(struct iocache_private_data));
 	data->magic = IOCACHE_IO_CHANNEL_MAGIC;
+	data->io_stats.num_fields = 4;
+	data->channel = io;
 
 	io->manager = iocache_io_manager;
 	retval = ext2fs_get_mem(strlen(name) + 1, &io->name);
 	if (retval)
 		goto out_data;
 
+	retval = cache_init(CACHE_AUTO_SHRINK, 1U << 10, &iocache_ops,
+			    &data->cache);
+	if (retval)
+		goto out_name;
+
 	strcpy(io->name, name);
 	io->private_data = data;
 	io->block_size = real->block_size;
@@ -91,12 +342,14 @@ static errcode_t iocache_open(const char *name, int flags, io_channel *channel)
 	io->flags = real->flags;
 	data->real = real;
 	real->app_data = io;
-	real->read_error = iocache_read_error;
-	real->write_error = iocache_write_error;
+
+	pthread_mutex_init(&data->stats_lock, NULL);
 
 	*channel = io;
 	return 0;
 
+out_name:
+	ext2fs_free_mem(&io->name);
 out_data:
 	ext2fs_free_mem(&data);
 out_channel:
@@ -116,6 +369,10 @@ static errcode_t iocache_close(io_channel channel)
 
 	if (--channel->refcount > 0)
 		return 0;
+	pthread_mutex_destroy(&data->stats_lock);
+	cache_flush(&data->cache);
+	cache_purge(&data->cache);
+	cache_destroy(&data->cache);
 	if (data->real)
 		retval = io_channel_close(data->real);
 	ext2fs_free_mem(&channel->private_data);
@@ -134,6 +391,11 @@ static errcode_t iocache_set_blksize(io_channel channel, int blksize)
 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
 	EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
 
+	retval = iocache_flush_cache(data);
+	if (retval)
+		return retval;
+	iocache_invalidate_cache(data);
+
 	retval = io_channel_set_blksize(data->real, blksize);
 	if (retval)
 		return retval;
@@ -145,21 +407,34 @@ static errcode_t iocache_set_blksize(io_channel channel, int blksize)
 static errcode_t iocache_flush(io_channel channel)
 {
 	struct iocache_private_data *data = IOCACHE(channel);
+	errcode_t retval = 0;
+	errcode_t retval2;
 
 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
 	EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
 
-	return io_channel_flush(data->real);
+	retval = iocache_flush_cache(data);
+	retval2 = io_channel_flush(data->real);
+	if (retval)
+		return retval;
+	return retval2;
 }
 
 static errcode_t iocache_write_byte(io_channel channel, unsigned long offset,
 				    int count, const void *buf)
 {
 	struct iocache_private_data *data = IOCACHE(channel);
+	blk64_t bno = B_TO_FSBT(channel, offset);
+	blk64_t next_bno = B_TO_FSB(channel, offset + count);
+	errcode_t retval;
 
 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
 	EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
 
+	retval = iocache_flush_range(data, bno, next_bno - bno);
+	if (retval)
+		return retval;
+	iocache_invalidate_range(data, bno, next_bno - bno);
 	return io_channel_write_byte(data->real, offset, count, buf);
 }
 
@@ -170,6 +445,31 @@ static errcode_t iocache_set_option(io_channel channel, const char *option,
 
 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
 	EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
+	errcode_t retval;
+
+	/* don't let unix io cache= options leak through */
+	if (!strcmp(option, "cache"))
+		return 0;
+
+	if (!strcmp(option, "cache_blocks")) {
+		long long size;
+
+		if (!arg)
+			return EXT2_ET_INVALID_ARGUMENT;
+		/* valid sizes are 1..UINT_MAX; reject everything else */
+		errno = 0;
+		size = strtoll(arg, NULL, 0);
+		if (errno || size <= 0 || size > UINT_MAX)
+			return EXT2_ET_INVALID_ARGUMENT;
+
+		cache_set_maxcount(&data->cache, size);
+		return 0;
+	}
+
+	retval = iocache_flush_cache(data);
+	if (retval)
+		return retval;
+	iocache_invalidate_cache(data);
 
 	return data->real->manager->set_option(data->real, option, arg);
 }
@@ -181,31 +481,157 @@ static errcode_t iocache_get_stats(io_channel channel, io_stats *io_stats)
 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
 	EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
 
-	return data->real->manager->get_stats(data->real, io_stats);
+	/*
+	 * Yes, io_stats is a double-pointer, and we let the caller scribble on
+	 * our stats struct WITHOUT LOCKING!
+	 */
+	if (io_stats)
+		*io_stats = &data->io_stats;
+	return 0;
+}
+
+static void iocache_update_stats(struct iocache_private_data *data,
+				 unsigned long long bytes_read,
+				 unsigned long long bytes_written,
+				 int cache_op)
+{
+	pthread_mutex_lock(&data->stats_lock);
+	data->io_stats.bytes_read += bytes_read;
+	data->io_stats.bytes_written += bytes_written;
+	if (cache_op == CACHE_HIT)
+		data->io_stats.cache_hits++;
+	else
+		data->io_stats.cache_misses++;
+	pthread_mutex_unlock(&data->stats_lock);
 }
 
 static errcode_t iocache_read_blk64(io_channel channel,
 				    unsigned long long block, int count,
 				    void *buf)
 {
+	struct iocache_key ukey = {
+		.block = block,
+	};
 	struct iocache_private_data *data = IOCACHE(channel);
+	unsigned long long i;
+	errcode_t retval = 0;	/* a cache hit never assigns this */
 
 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
 	EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
 
-	return io_channel_read_blk64(data->real, block, count, buf);
+	/*
+	 * If we're doing an odd-sized read, flush out the cache and then do a
+	 * direct read.
+	 */
+	if (count < 0) {
+		uint64_t fsbcount = B_TO_FSB(channel, -count);
+
+		retval = iocache_flush_range(data, block, fsbcount);
+		if (retval)
+			return retval;
+		iocache_invalidate_range(data, block, fsbcount);
+		iocache_update_stats(data, 0, 0, CACHE_MISS);
+		return io_channel_read_blk64(data->real, block, count, buf);
+	}
+
+	for (i = 0; i < count; i++, ukey.block++, buf += channel->block_size) {
+		struct cache_node *node;
+		struct iocache_buf *ubuf;
+
+		cache_node_get(&data->cache, &ukey, 0, &node);
+		if (!node) {
+			/* cannot instantiate cache, just do a direct read */
+			retval = io_channel_read_blk64(data->real, ukey.block,
+						       1, buf);
+			if (retval)
+				return retval;
+			iocache_update_stats(data, channel->block_size, 0,
+					     CACHE_MISS);
+			continue;
+		}
+
+		ubuf = IOBUF(node);
+		iocache_buf_lock(ubuf);
+		if (!ubuf->uptodate) {
+			retval = io_channel_read_blk64(data->real, ukey.block,
+						       1, ubuf->buf);
+			if (!retval) {
+				ubuf->uptodate = 1;
+				iocache_update_stats(data, channel->block_size,
+						     0, CACHE_MISS);
+			}
+		} else {
+			iocache_update_stats(data, channel->block_size, 0,
+					     CACHE_HIT);
+		}
+		if (ubuf->uptodate)
+			memcpy(buf, ubuf->buf, channel->block_size);
+		iocache_buf_unlock(ubuf);
+		cache_node_put(&data->cache, node);
+		if (retval)
+			return retval;
+	}
+
+	return 0;
 }
 
 static errcode_t iocache_write_blk64(io_channel channel,
 				     unsigned long long block, int count,
 				     const void *buf)
 {
+	struct iocache_key ukey = {
+		.block = block,
+	};
 	struct iocache_private_data *data = IOCACHE(channel);
+	unsigned long long i;
+	errcode_t retval;
 
 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
 	EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
 
-	return io_channel_write_blk64(data->real, block, count, buf);
+	/*
+	 * If we're doing an odd-sized write, flush out the cache and then do a
+	 * direct write.
+	 */
+	if (count < 0) {
+		uint64_t fsbcount = B_TO_FSB(channel, -count);
+
+		retval = iocache_flush_range(data, block, fsbcount);
+		if (retval)
+			return retval;
+		iocache_invalidate_range(data, block, fsbcount);
+		iocache_update_stats(data, 0, 0, CACHE_MISS);
+		return io_channel_write_blk64(data->real, block, count, buf);
+	}
+
+	for (i = 0; i < count; i++, ukey.block++, buf += channel->block_size) {
+		struct cache_node *node;
+		struct iocache_buf *ubuf;
+
+		cache_node_get(&data->cache, &ukey, 0, &node);
+		if (!node) {
+			/* cannot instantiate cache, do a direct write */
+			retval = io_channel_write_blk64(data->real, ukey.block,
+							1, buf);
+			if (retval)
+				return retval;
+			iocache_update_stats(data, 0, channel->block_size,
+					     CACHE_MISS);
+			continue;
+		}
+
+		ubuf = IOBUF(node);
+		iocache_buf_lock(ubuf);
+		memcpy(ubuf->buf, buf, channel->block_size);
+		iocache_update_stats(data, 0, channel->block_size,
+				     ubuf->uptodate ? CACHE_HIT : CACHE_MISS);
+		ubuf->dirty = 1;
+		ubuf->uptodate = 1;
+		iocache_buf_unlock(ubuf);
+		cache_node_put(&data->cache, node);
+	}
+
+	return 0;
 }
 
 static errcode_t iocache_read_blk(io_channel channel, unsigned long block,
@@ -224,11 +650,17 @@ static errcode_t iocache_discard(io_channel channel, unsigned long long block,
 				 unsigned long long count)
 {
 	struct iocache_private_data *data = IOCACHE(channel);
+	errcode_t retval;
 
 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
 	EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
 
-	return io_channel_discard(data->real, block, count);
+	retval = io_channel_discard(data->real, block, count);
+	if (retval)
+		return retval;
+
+	iocache_invalidate_range(data, block, count);
+	return 0;
 }
 
 static errcode_t iocache_cache_readahead(io_channel channel,
@@ -247,11 +679,17 @@ static errcode_t iocache_zeroout(io_channel channel, unsigned long long block,
 				 unsigned long long count)
 {
 	struct iocache_private_data *data = IOCACHE(channel);
+	errcode_t retval;
 
 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
 	EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
 
-	return io_channel_zeroout(data->real, block, count);
+	retval = io_channel_zeroout(data->real, block, count);
+	if (retval)
+		return retval;
+
+	iocache_invalidate_range(data, block, count);
+	return 0;
 }
 
 static errcode_t iocache_get_fd(io_channel channel, int *fd)


^ permalink raw reply related

* [PATCH 1/6] libsupport: add caching IO manager
From: Darrick J. Wong @ 2026-06-25 19:39 UTC (permalink / raw)
  To: tytso; +Cc: linux-ext4
In-Reply-To: <178241607127.1810839.16661954075518963408.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

Start creating a caching IO manager so that we can have better caching
of metadata blocks in fuse2fs.  For now it's just a passthrough cache.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 lib/support/iocache.h   |   17 +++
 lib/ext2fs/io_manager.c |    3 
 lib/support/Makefile.in |    6 +
 lib/support/iocache.c   |  304 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 329 insertions(+), 1 deletion(-)
 create mode 100644 lib/support/iocache.h
 create mode 100644 lib/support/iocache.c


diff --git a/lib/support/iocache.h b/lib/support/iocache.h
new file mode 100644
index 00000000000000..502eede08aadc5
--- /dev/null
+++ b/lib/support/iocache.h
@@ -0,0 +1,17 @@
+/*
+ * iocache.h - IO cache
+ *
+ * Copyright (C) 2025-2026 Oracle.
+ *
+ * %Begin-Header%
+ * This file may be redistributed under the terms of the GNU Public
+ * License.
+ * %End-Header%
+ */
+#ifndef __IOCACHE_H__
+#define __IOCACHE_H__
+
+errcode_t iocache_set_backing_manager(io_manager manager);
+extern io_manager iocache_io_manager;
+
+#endif /* __IOCACHE_H__ */
diff --git a/lib/ext2fs/io_manager.c b/lib/ext2fs/io_manager.c
index dff3d73552827f..57beb0368c2a8d 100644
--- a/lib/ext2fs/io_manager.c
+++ b/lib/ext2fs/io_manager.c
@@ -16,9 +16,12 @@
 #if HAVE_SYS_TYPES_H
 #include <sys/types.h>
 #endif
+#include <stdbool.h>
 
 #include "ext2_fs.h"
 #include "ext2fs.h"
+#include "support/list.h"
+#include "support/cache.h"
 
 errcode_t io_channel_set_options(io_channel channel, const char *opts)
 {
diff --git a/lib/support/Makefile.in b/lib/support/Makefile.in
index d20d6a984b7679..22242758b4e618 100644
--- a/lib/support/Makefile.in
+++ b/lib/support/Makefile.in
@@ -15,6 +15,7 @@ all::
 
 OBJS=		bthread.o \
 		cstring.o \
+		iocache.o \
 		mkquota.o \
 		plausible.o \
 		profile.o \
@@ -46,7 +47,8 @@ SRCS=		$(srcdir)/argv_parse.c \
 		$(srcdir)/thread.c \
 		$(srcdir)/dict.c \
 		$(srcdir)/devname.c \
-		$(srcdir)/cache.c
+		$(srcdir)/cache.c \
+		$(srcdir)/iocache.c
 
 LIBRARY= libsupport
 LIBDIR= support
@@ -200,3 +202,5 @@ devname.o: $(srcdir)/devname.c $(top_builddir)/lib/config.h \
  $(top_builddir)/lib/dirpaths.h $(srcdir)/devname.h $(srcdir)/nls-enable.h
 cache.o: $(srcdir)/cache.c $(top_builddir)/lib/config.h \
  $(top_builddir)/lib/dirpaths.h $(srcdir)/list.h $(srcdir)/cache.h
+iocache.o: $(srcdir)/iocache.c $(top_builddir)/lib/config.h \
+ $(srcdir)/iocache.h $(srcdir)/cache.h $(srcdir)/list.h
diff --git a/lib/support/iocache.c b/lib/support/iocache.c
new file mode 100644
index 00000000000000..2148a9d93a4285
--- /dev/null
+++ b/lib/support/iocache.c
@@ -0,0 +1,304 @@
+/*
+ * iocache.c - caching IO manager
+ *
+ * Copyright (C) 2025-2026 Oracle.
+ *
+ * %Begin-Header%
+ * This file may be redistributed under the terms of the GNU Public
+ * License.
+ * %End-Header%
+ */
+#include "config.h"
+#include "ext2fs/ext2_fs.h"
+#include "ext2fs/ext2fs.h"
+#include "ext2fs/ext2fsP.h"
+#include "support/iocache.h"
+
+#define IOCACHE_IO_CHANNEL_MAGIC	0x424F5254	/* BORT */
+
+static io_manager iocache_backing_manager;
+
+struct iocache_private_data {
+	int			magic;
+	io_channel		real;
+};
+
+static struct iocache_private_data *IOCACHE(io_channel channel)
+{
+	return (struct iocache_private_data *)channel->private_data;
+}
+
+static errcode_t iocache_read_error(io_channel channel, unsigned long block,
+				    int count, void *data, size_t size,
+				    int actual_bytes_read, errcode_t error)
+{
+	io_channel iocache_channel = channel->app_data;
+
+	return iocache_channel->read_error(iocache_channel, block, count, data,
+					   size, actual_bytes_read, error);
+}
+
+static errcode_t iocache_write_error(io_channel channel, unsigned long block,
+				     int count, const void *data, size_t size,
+				     int actual_bytes_written,
+				     errcode_t error)
+{
+	io_channel iocache_channel = channel->app_data;
+
+	return iocache_channel->write_error(iocache_channel, block, count, data,
+					    size, actual_bytes_written, error);
+}
+
+static errcode_t iocache_open(const char *name, int flags, io_channel *channel)
+{
+	io_channel	io = NULL;
+	io_channel	real;
+	struct iocache_private_data *data = NULL;
+	errcode_t	retval;
+
+	if (!name)
+		return EXT2_ET_BAD_DEVICE_NAME;
+	if (!iocache_backing_manager)
+		return EXT2_ET_INVALID_ARGUMENT;
+
+	retval = iocache_backing_manager->open(name, flags, &real);
+	if (retval)
+		return retval;
+
+	retval = ext2fs_get_mem(sizeof(struct struct_io_channel), &io);
+	if (retval)
+		goto out_backing;
+	memset(io, 0, sizeof(struct struct_io_channel));
+	io->magic = EXT2_ET_MAGIC_IO_CHANNEL;
+
+	retval = ext2fs_get_mem(sizeof(struct iocache_private_data), &data);
+	if (retval)
+		goto out_channel;
+	memset(data, 0, sizeof(struct iocache_private_data));
+	data->magic = IOCACHE_IO_CHANNEL_MAGIC;
+
+	io->manager = iocache_io_manager;
+	retval = ext2fs_get_mem(strlen(name) + 1, &io->name);
+	if (retval)
+		goto out_data;
+
+	strcpy(io->name, name);
+	io->private_data = data;
+	io->block_size = real->block_size;
+	io->read_error = 0;
+	io->write_error = 0;
+	io->refcount = 1;
+	io->flags = real->flags;
+	data->real = real;
+	real->app_data = io;
+	real->read_error = iocache_read_error;
+	real->write_error = iocache_write_error;
+
+	*channel = io;
+	return 0;
+
+out_data:
+	ext2fs_free_mem(&data);
+out_channel:
+	ext2fs_free_mem(&io);
+out_backing:
+	io_channel_close(real);
+	return retval;
+}
+
+static errcode_t iocache_close(io_channel channel)
+{
+	struct iocache_private_data *data = IOCACHE(channel);
+	errcode_t	retval = 0;
+
+	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+	EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
+
+	if (--channel->refcount > 0)
+		return 0;
+	if (data->real)
+		retval = io_channel_close(data->real);
+	ext2fs_free_mem(&channel->private_data);
+	if (channel->name)
+		ext2fs_free_mem(&channel->name);
+	ext2fs_free_mem(&channel);
+
+	return retval;
+}
+
+static errcode_t iocache_set_blksize(io_channel channel, int blksize)
+{
+	struct iocache_private_data *data = IOCACHE(channel);
+	errcode_t retval;
+
+	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+	EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
+
+	retval = io_channel_set_blksize(data->real, blksize);
+	if (retval)
+		return retval;
+
+	channel->block_size = data->real->block_size;
+	return 0;
+}
+
+static errcode_t iocache_flush(io_channel channel)
+{
+	struct iocache_private_data *data = IOCACHE(channel);
+
+	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+	EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
+
+	return io_channel_flush(data->real);
+}
+
+static errcode_t iocache_write_byte(io_channel channel, unsigned long offset,
+				    int count, const void *buf)
+{
+	struct iocache_private_data *data = IOCACHE(channel);
+
+	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+	EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
+
+	return io_channel_write_byte(data->real, offset, count, buf);
+}
+
+static errcode_t iocache_set_option(io_channel channel, const char *option,
+				    const char *arg)
+{
+	struct iocache_private_data *data = IOCACHE(channel);
+
+	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+	EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
+
+	return data->real->manager->set_option(data->real, option, arg);
+}
+
+static errcode_t iocache_get_stats(io_channel channel, io_stats *io_stats)
+{
+	struct iocache_private_data *data = IOCACHE(channel);
+
+	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+	EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
+
+	return data->real->manager->get_stats(data->real, io_stats);
+}
+
+static errcode_t iocache_read_blk64(io_channel channel,
+				    unsigned long long block, int count,
+				    void *buf)
+{
+	struct iocache_private_data *data = IOCACHE(channel);
+
+	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+	EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
+
+	return io_channel_read_blk64(data->real, block, count, buf);
+}
+
+static errcode_t iocache_write_blk64(io_channel channel,
+				     unsigned long long block, int count,
+				     const void *buf)
+{
+	struct iocache_private_data *data = IOCACHE(channel);
+
+	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+	EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
+
+	return io_channel_write_blk64(data->real, block, count, buf);
+}
+
+static errcode_t iocache_read_blk(io_channel channel, unsigned long block,
+				  int count, void *buf)
+{
+	return iocache_read_blk64(channel, block, count, buf);
+}
+
+static errcode_t iocache_write_blk(io_channel channel, unsigned long block,
+				   int count, const void *buf)
+{
+	return iocache_write_blk64(channel, block, count, buf);
+}
+
+static errcode_t iocache_discard(io_channel channel, unsigned long long block,
+				 unsigned long long count)
+{
+	struct iocache_private_data *data = IOCACHE(channel);
+
+	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+	EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
+
+	return io_channel_discard(data->real, block, count);
+}
+
+static errcode_t iocache_cache_readahead(io_channel channel,
+					 unsigned long long block,
+					 unsigned long long count)
+{
+	struct iocache_private_data *data = IOCACHE(channel);
+
+	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+	EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
+
+	return io_channel_cache_readahead(data->real, block, count);
+}
+
+static errcode_t iocache_zeroout(io_channel channel, unsigned long long block,
+				 unsigned long long count)
+{
+	struct iocache_private_data *data = IOCACHE(channel);
+
+	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+	EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
+
+	return io_channel_zeroout(data->real, block, count);
+}
+
+static errcode_t iocache_get_fd(io_channel channel, int *fd)
+{
+	struct iocache_private_data *data = IOCACHE(channel);
+
+	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+	EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
+
+	return io_channel_get_fd(data->real, fd);
+}
+
+static errcode_t iocache_flock(io_channel channel, unsigned int flock_flags)
+{
+	struct iocache_private_data *data = IOCACHE(channel);
+
+	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+	EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
+
+	return io_channel_flock(data->real, flock_flags);
+}
+
+static struct struct_io_manager struct_iocache_manager = {
+	.magic			= EXT2_ET_MAGIC_IO_MANAGER,
+	.name			= "iocache I/O manager",
+	.open			= iocache_open,
+	.close			= iocache_close,
+	.set_blksize		= iocache_set_blksize,
+	.read_blk		= iocache_read_blk,
+	.write_blk		= iocache_write_blk,
+	.flush			= iocache_flush,
+	.write_byte		= iocache_write_byte,
+	.set_option		= iocache_set_option,
+	.get_stats		= iocache_get_stats,
+	.read_blk64		= iocache_read_blk64,
+	.write_blk64		= iocache_write_blk64,
+	.discard		= iocache_discard,
+	.cache_readahead	= iocache_cache_readahead,
+	.zeroout		= iocache_zeroout,
+	.get_fd			= iocache_get_fd,
+	.flock			= iocache_flock,
+};
+
+io_manager iocache_io_manager = &struct_iocache_manager;
+
+errcode_t iocache_set_backing_manager(io_manager manager)
+{
+	iocache_backing_manager = manager;
+	return 0;
+}


^ permalink raw reply related

* [PATCH 10/10] debian: update packaging for fuse4fs service
From: Darrick J. Wong @ 2026-06-25 19:39 UTC (permalink / raw)
  To: tytso; +Cc: linux-ext4
In-Reply-To: <178241606773.1810414.1910863348345107939.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

Update the Debian packaging code so that we can create fuse4fs service
containers.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 debian/e2fsprogs.install |    7 ++++++-
 debian/fuse4fs.install   |    3 +++
 debian/rules             |    3 +++
 3 files changed, 12 insertions(+), 1 deletion(-)
 mode change 100644 => 100755 debian/fuse4fs.install


diff --git a/debian/e2fsprogs.install b/debian/e2fsprogs.install
index 17a80e3922dcee..808474bcab1717 100755
--- a/debian/e2fsprogs.install
+++ b/debian/e2fsprogs.install
@@ -50,4 +50,9 @@ usr/share/man/man8/resize2fs.8
 usr/share/man/man8/tune2fs.8
 etc
 [linux-any] ${deb_udevudevdir}/rules.d
-[linux-any] ${deb_systemdsystemunitdir}
+[linux-any] ${deb_systemdsystemunitdir}/e2scrub@.service
+[linux-any] ${deb_systemdsystemunitdir}/e2scrub@.service
+[linux-any] ${deb_systemdsystemunitdir}/e2scrub_all.service
+[linux-any] ${deb_systemdsystemunitdir}/e2scrub_all.timer
+[linux-any] ${deb_systemdsystemunitdir}/e2scrub_fail@.service
+[linux-any] ${deb_systemdsystemunitdir}/e2scrub_reap.service
diff --git a/debian/fuse4fs.install b/debian/fuse4fs.install
old mode 100644
new mode 100755
index 17bdc90e33cb67..56048136c2b28b
--- a/debian/fuse4fs.install
+++ b/debian/fuse4fs.install
@@ -1,2 +1,5 @@
+#!/usr/bin/dh-exec
 usr/bin/fuse4fs
 usr/share/man/man1/fuse4fs.1
+[linux-any] ${deb_systemdsystemunitdir}/fuse4fs.socket
+[linux-any] ${deb_systemdsystemunitdir}/fuse4fs@.service
diff --git a/debian/rules b/debian/rules
index b680eb33ceac9e..d629e9d6915cfe 100755
--- a/debian/rules
+++ b/debian/rules
@@ -173,6 +173,9 @@ override_dh_installinfo:
 ifneq ($(DEB_HOST_ARCH_OS), hurd)
 override_dh_installsystemd:
 	dh_installsystemd -p e2fsprogs --no-restart-after-upgrade --no-stop-on-upgrade e2scrub_all.timer e2scrub_reap.service
+ifeq ($(SKIP_FUSE4FS),)
+	dh_installsystemd -p fuse4fs fuse4fs.socket
+endif
 endif
 
 override_dh_makeshlibs:


^ permalink raw reply related

* [PATCH 09/10] fuse4fs: make MMP work correctly in safe service mode
From: Darrick J. Wong @ 2026-06-25 19:39 UTC (permalink / raw)
  To: tytso; +Cc: linux-ext4
In-Reply-To: <178241606773.1810414.1910863348345107939.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

Normally, the libext2fs MMP code open()s a completely separate file
descriptor to read and write the MMP block so that it can have its own
private open file with its own access mode and file position.  However,
if the unixfd IO manager is in use, it will reuse the io channel, which
means that MMP and the unixfd IO channel share the same open file and
hence the same access mode and file position.

MMP requires directio access to block devices so that changes are
immediately visible on other nodes.  Therefore, we need the IO channel
(and thus the filesystem) to be running in directio mode if MMP is in
use.

To make this work correctly with the sole unixfd IO manager user
(fuse4fs in unprivileged service mode), we must set O_DIRECT on the
bdev fd and mount the filesystem in directio mode.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 fuse4fs/fuse4fs.c |   51 ++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 48 insertions(+), 3 deletions(-)


diff --git a/fuse4fs/fuse4fs.c b/fuse4fs/fuse4fs.c
index 5fa51569a1167f..fdd4327a4c0907 100644
--- a/fuse4fs/fuse4fs.c
+++ b/fuse4fs/fuse4fs.c
@@ -1423,12 +1423,57 @@ static int fuse4fs_service_get_config(struct fuse4fs *ff)
 }
 
 static errcode_t fuse4fs_service_openfs(struct fuse4fs *ff, char *options,
-					int flags)
+					int *flags)
 {
+	struct stat statbuf;
 	char path[64];
+	errcode_t retval;
+	int ret;
 
+	ret = fstat(ff->bdev_fd, &statbuf);
+	if (ret)
+		return errno;
+
+	/*
+	 * Open the filesystem with SKIP_MMP so that we can find out if the
+	 * filesystem actually has MMP.
+	 */
 	snprintf(path, sizeof(path), "/dev/fd/%d", ff->bdev_fd);
-	return ext2fs_open2(path, options, flags, 0, 0, unixfd_io_manager,
+	retval = ext2fs_open2(path, options, *flags | EXT2_FLAG_SKIP_MMP, 0, 0,
+			      unixfd_io_manager, &ff->fs);
+	if (retval)
+		return retval;
+
+	/*
+	 * If the fs doesn't have MMP then we're good to go.  Otherwise close
+	 * the filesystem so that we can reopen it with MMP enabled.
+	 */
+	if (!ext2fs_has_feature_mmp(ff->fs->super))
+		return 0;
+
+	retval = ext2fs_close_free(&ff->fs);
+	if (retval)
+		return retval;
+
+	/*
+	 * If the filesystem is not on a regular file, MMP will share the same
+	 * fd as the unixfd IO channel.  We need to set O_DIRECT on the bdev_fd
+	 * and open the filesystem in directio mode.
+	 */
+	if (!S_ISREG(statbuf.st_mode)) {
+		int fflags = fcntl(ff->bdev_fd, F_GETFL);
+
+		if (!(fflags & O_DIRECT)) {
+			ret = fcntl(ff->bdev_fd, F_SETFL, fflags | O_DIRECT);
+			if (ret)
+				return EXT2_ET_MMP_OPEN_DIRECT;
+		}
+
+		ff->directio = 1;
+		*flags |= EXT2_FLAG_DIRECT_IO;
+	}
+
+	return ext2fs_open2(path, options, *flags, 0, 0, unixfd_io_manager,
 			    &ff->fs);
 }
 #else
@@ -1568,7 +1613,7 @@ static errcode_t fuse4fs_open(struct fuse4fs *ff)
 	deadline = init_deadline(FUSE4FS_OPEN_TIMEOUT);
 	do {
 		if (fuse4fs_is_service(ff))
-			err = fuse4fs_service_openfs(ff, options, flags);
+			err = fuse4fs_service_openfs(ff, options, &flags);
 		else
 			err = ext2fs_open2(ff->device, options, flags, 0, 0,
 					   unix_io_manager, &ff->fs);


^ permalink raw reply related

* [PATCH 08/10] fuse4fs: set proc title when in fuse service mode
From: Darrick J. Wong @ 2026-06-25 19:38 UTC (permalink / raw)
  To: tytso; +Cc: linux-ext4
In-Reply-To: <178241606773.1810414.1910863348345107939.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

When in fuse service mode, set the process title so that we can identify
fuse servers by mount arguments.  When the service ends, amend the title
again to say that we're cleaning up.  This is done to make ps aux a bit
more communicative as to what is going on.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 configure           |  109 +++++++++++++++++++++++++++++++++++++++++++++++++++
 configure.ac        |   13 ++++++
 fuse4fs/Makefile.in |    2 -
 fuse4fs/fuse4fs.c   |   47 ++++++++++++++++++++++
 lib/config.h.in     |    6 +++
 5 files changed, 176 insertions(+), 1 deletion(-)


diff --git a/configure b/configure
index 87960ad2cae3c3..b0531eb58b2b64 100755
--- a/configure
+++ b/configure
@@ -696,6 +696,7 @@ gcc_ranlib
 gcc_ar
 UNI_DIFF_OPTS
 SEM_INIT_LIB
+LIBBSD_LIB
 FUSE4FS_CMT
 FUSE2FS_CMT
 fuse_service_socket_perms
@@ -15022,6 +15023,114 @@ printf "%s\n" "#define HAVE_FUSE_CACHE_READDIR 1" >>confdefs.h
 
 fi
 
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for setproctitle in -lbsd" >&5
+printf %s "checking for setproctitle in -lbsd... " >&6; }
+if test ${ac_cv_lib_bsd_setproctitle+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) ac_check_lib_save_LIBS=$LIBS
+LIBS="-lbsd  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.
+   The 'extern "C"' is for builds by C++ compilers;
+   although this is not generally supported in C code supporting it here
+   has little cost and some practical benefit (sr 110532).  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char setproctitle (void);
+int
+main (void)
+{
+return setproctitle ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"
+then :
+  ac_cv_lib_bsd_setproctitle=yes
+else case e in #(
+  e) ac_cv_lib_bsd_setproctitle=no ;;
+esac
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS ;;
+esac
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_bsd_setproctitle" >&5
+printf "%s\n" "$ac_cv_lib_bsd_setproctitle" >&6; }
+if test "x$ac_cv_lib_bsd_setproctitle" = xyes
+then :
+  LIBBSD_LIB=-lbsd
+fi
+
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for setproctitle_init in -lbsd" >&5
+printf %s "checking for setproctitle_init in -lbsd... " >&6; }
+if test ${ac_cv_lib_bsd_setproctitle_init+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) ac_check_lib_save_LIBS=$LIBS
+LIBS="-lbsd  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.
+   The 'extern "C"' is for builds by C++ compilers;
+   although this is not generally supported in C code supporting it here
+   has little cost and some practical benefit (sr 110532).  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char setproctitle_init (void);
+int
+main (void)
+{
+return setproctitle_init ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"
+then :
+  ac_cv_lib_bsd_setproctitle_init=yes
+else case e in #(
+  e) ac_cv_lib_bsd_setproctitle_init=no ;;
+esac
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS ;;
+esac
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_bsd_setproctitle_init" >&5
+printf "%s\n" "$ac_cv_lib_bsd_setproctitle_init" >&6; }
+if test "x$ac_cv_lib_bsd_setproctitle_init" = xyes
+then :
+  LIBBSD_LIB=-lbsd
+fi
+
+
+if test "$ac_cv_lib_bsd_setproctitle" = yes ; then
+
+printf "%s\n" "#define HAVE_SETPROCTITLE 1" >>confdefs.h
+
+fi
+if test "$ac_cv_lib_bsd_setproctitle_init" = yes ; then
+
+printf "%s\n" "#define HAVE_SETPROCTITLE_INIT 1" >>confdefs.h
+
+fi
+
 { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for PR_SET_IO_FLUSHER" >&5
 printf %s "checking for PR_SET_IO_FLUSHER... " >&6; }
 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
diff --git a/configure.ac b/configure.ac
index 381bb15d920a0f..8a5e95cd4eb866 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1622,6 +1622,19 @@ then
 		  [Define to 1 if fuse supports cache_readdir])
 fi
 
+dnl
+dnl see if setproctitle exists
+dnl
+AC_CHECK_LIB(bsd, setproctitle, [LIBBSD_LIB=-lbsd])
+AC_CHECK_LIB(bsd, setproctitle_init, [LIBBSD_LIB=-lbsd])
+AC_SUBST(LIBBSD_LIB)
+if test "$ac_cv_lib_bsd_setproctitle" = yes ; then
+	AC_DEFINE(HAVE_SETPROCTITLE, 1, [Define to 1 if setproctitle present in libbsd])
+fi
+if test "$ac_cv_lib_bsd_setproctitle_init" = yes ; then
+	AC_DEFINE(HAVE_SETPROCTITLE_INIT, 1, [Define to 1 if setproctitle_init present in libbsd])
+fi
+
 dnl
 dnl see if PR_SET_IO_FLUSHER exists
 dnl
diff --git a/fuse4fs/Makefile.in b/fuse4fs/Makefile.in
index 67b8afd54493b0..bb859369914a36 100644
--- a/fuse4fs/Makefile.in
+++ b/fuse4fs/Makefile.in
@@ -76,7 +76,7 @@ fuse4fs: $(FUSE4FS_OBJS) $(DEPLIBS) $(DEPLIBBLKID) $(DEPLIBUUID) \
 	$(E) "	LD $@"
 	$(Q) $(CC) $(ALL_LDFLAGS) -o fuse4fs $(FUSE4FS_OBJS) $(LIBS) \
 		$(LIBFUSE) $(LIBBLKID) $(LIBUUID) $(LIBEXT2FS) $(LIBINTL) \
-		$(CLOCK_GETTIME_LIB) $(SYSLIBS) $(LIBS_E2P)
+		$(CLOCK_GETTIME_LIB) $(SYSLIBS) $(LIBS_E2P) @LIBBSD_LIB@
 
 %.socket: %.socket.in $(DEP_SUBSTITUTE)
 	$(E) "	SUBST $@"
diff --git a/fuse4fs/fuse4fs.c b/fuse4fs/fuse4fs.c
index 97e668fadc2398..5fa51569a1167f 100644
--- a/fuse4fs/fuse4fs.c
+++ b/fuse4fs/fuse4fs.c
@@ -45,6 +45,9 @@
 #ifdef HAVE_FUSE4FS_SERVICE
 # include <sys/mount.h>
 # include <fuse_service.h>
+# ifdef HAVE_SETPROCTITLE
+#  include <bsd/unistd.h>
+# endif
 #endif
 #ifdef __SET_FOB_FOR_FUSE
 # undef _FILE_OFFSET_BITS
@@ -295,6 +298,9 @@ struct fuse4fs {
 	struct cache inodes;
 #ifdef HAVE_FUSE4FS_SERVICE
 	struct fuse_service *service;
+# ifdef HAVE_SETPROCTITLE
+	char *svc_cmdline;
+# endif
 	int bdev_fd;
 #endif
 };
@@ -1291,6 +1297,35 @@ static errcode_t fuse4fs_check_support(struct fuse4fs *ff)
 	return 0;
 }
 
+#if defined(HAVE_FUSE4FS_SERVICE) && defined(HAVE_SETPROCTITLE)
+static void fuse4fs_service_set_proc_cmdline(struct fuse4fs *ff, int argc,
+					     char *argv[],
+					     struct fuse_args *args)
+{
+#ifdef HAVE_SETPROCTITLE_INIT
+	setproctitle_init(argc, argv, environ);
+#endif
+
+	ff->svc_cmdline = fuse_service_cmdline(argc, (const char * const *)argv, args);
+	if (!ff->svc_cmdline)
+		return;
+
+	setproctitle("-%s", ff->svc_cmdline);
+}
+
+static void fuse4fs_service_finish_proc_cmdline(struct fuse4fs *ff)
+{
+	if (!ff->svc_cmdline)
+		return;
+
+	setproctitle("-%s [cleaning up]", ff->svc_cmdline);
+	free(ff->svc_cmdline);
+}
+#else
+# define fuse4fs_service_set_proc_cmdline(...)		((void)0)
+# define fuse4fs_service_finish_proc_cmdline(...)	((void)0)
+#endif
+
 #ifdef HAVE_FUSE4FS_SERVICE
 static int fuse4fs_service_connect(struct fuse4fs *ff, struct fuse_args *args)
 {
@@ -1324,6 +1359,8 @@ static int fuse4fs_service_exit(struct fuse4fs *ff, int exitcode)
 	if (!fuse4fs_is_service(ff))
 		return exitcode;
 
+	fuse4fs_service_finish_proc_cmdline(ff);
+
 	fuse_service_send_goodbye(ff->service, exitcode);
 	fuse_service_release(ff->service);
 	close(ff->bdev_fd);
@@ -6395,6 +6432,16 @@ int main(int argc, char *argv[])
 		goto out_exit;
 	}
 
+	/*
+	 * For fuse services, make the /proc title include the arguments that
+	 * we got from the mount helper.  Do this before parsing argc/argv
+	 * because that may overwrite the argv area.  Note that the procfs
+	 * listing might not reflect the options that actually get enabled,
+	 * just like regular fuse4fs.
+	 */
+	if (fuse4fs_is_service(&fctx))
+		fuse4fs_service_set_proc_cmdline(&fctx, argc, argv, &args);
+
 	ret = fuse_opt_parse(&args, &fctx, fuse4fs_opts, fuse4fs_opt_proc);
 	if (ret)
 		goto out_exit;
diff --git a/lib/config.h.in b/lib/config.h.in
index 15b99c6d28c59e..0973413b5c11e2 100644
--- a/lib/config.h.in
+++ b/lib/config.h.in
@@ -379,6 +379,12 @@
 /* Define to 1 if you have the 'setmntent' function. */
 #undef HAVE_SETMNTENT
 
+/* Define to 1 if setproctitle present in libbsd */
+#undef HAVE_SETPROCTITLE
+
+/* Define to 1 if setproctitle_init present in libbsd */
+#undef HAVE_SETPROCTITLE_INIT
+
 /* Define to 1 if you have the 'setresgid' function. */
 #undef HAVE_SETRESGID
 


^ permalink raw reply related

* [PATCH 07/10] fuse4fs: enable safe service mode
From: Darrick J. Wong @ 2026-06-25 19:38 UTC (permalink / raw)
  To: tytso; +Cc: linux-ext4
In-Reply-To: <178241606773.1810414.1910863348345107939.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

Make it possible to run fuse4fs as a safe systemd service, wherein the
fuse server only has access to the fds that we pass in.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 MCONFIG.in                  |    2 
 configure                   |  186 +++++++++++++++++++++++++++++++
 configure.ac                |  108 ++++++++++++++++++
 fuse4fs/Makefile.in         |   40 ++++++-
 fuse4fs/fuse4fs.c           |  254 +++++++++++++++++++++++++++++++++++++++++--
 fuse4fs/fuse4fs.socket.in   |   17 +++
 fuse4fs/fuse4fs@.service.in |  102 +++++++++++++++++
 lib/config.h.in             |    6 +
 util/subst.conf.in          |    3 +
 9 files changed, 703 insertions(+), 15 deletions(-)
 create mode 100644 fuse4fs/fuse4fs.socket.in
 create mode 100644 fuse4fs/fuse4fs@.service.in


diff --git a/MCONFIG.in b/MCONFIG.in
index d66e2f3bc1d552..7a17778b6da67f 100644
--- a/MCONFIG.in
+++ b/MCONFIG.in
@@ -42,6 +42,8 @@ HAVE_CROND = @have_crond@
 CROND_DIR = @crond_dir@
 HAVE_SYSTEMD = @have_systemd@
 SYSTEMD_SYSTEM_UNIT_DIR = @systemd_system_unit_dir@
+HAVE_FUSE_SERVICE = @have_fuse_service@
+HAVE_FUSE4FS_SERVICE = @have_fuse4fs_service@
 
 @SET_MAKE@
 
diff --git a/configure b/configure
index f24897fcdd4949..87960ad2cae3c3 100755
--- a/configure
+++ b/configure
@@ -645,6 +645,7 @@ enable_year2038=no
 ac_subst_vars='LTLIBOBJS
 LIBOBJS
 OS_IO_FILE
+have_fuse4fs_service
 systemd_system_unit_dir
 have_systemd
 systemd_LIBS
@@ -697,6 +698,9 @@ UNI_DIFF_OPTS
 SEM_INIT_LIB
 FUSE4FS_CMT
 FUSE2FS_CMT
+fuse_service_socket_perms
+fuse_service_socket_dir
+have_fuse_service
 FUSE_LIB
 fuse3_LIBS
 fuse3_CFLAGS
@@ -929,6 +933,8 @@ with_libiconv_prefix
 with_libintl_prefix
 enable_largefile
 with_libarchive
+with_fuse_service_socket_dir
+with_fuse_service_socket_perms
 enable_fuse2fs
 enable_fuse4fs
 enable_lto
@@ -1652,6 +1658,11 @@ Optional Packages:
   --with-libintl-prefix[=DIR]  search for libintl in DIR/include and DIR/lib
   --without-libintl-prefix     don't search for libintl in includedir and libdir
   --without-libarchive    disable use of libarchive
+  --with-fuse-service-socket-dir[=DIR]
+                          Create fuse3 filesystem service sockets in DIR.
+  --with-fuse-service-socket-perms[=MODE]
+                          Create fuse3 filesystem service socket with these
+                          permissions.
   --with-multiarch=ARCH   specify the multiarch triplet
   --with-udev-rules-dir[=DIR]
                           Install udev rules into DIR.
@@ -14598,7 +14609,7 @@ else
         fuse3_LIBS=$pkg_cv_fuse3_LIBS
         { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 printf "%s\n" "yes" >&6; }
-        FUSE_LIB=-lfuse3
+        FUSE_LIB=-lfuse3 ; have_fuse3_pkg=yes
 fi
 
 
@@ -14680,6 +14691,155 @@ printf "%s\n" "#define HAVE_FUSE_LOWLEVEL 1" >>confdefs.h
 
 fi
 
+have_fuse_service=
+fuse_service_socket_dir=
+if test -n "$have_fuse_lowlevel"
+then
+
+# Check whether --with-fuse_service_socket_dir was given.
+if test ${with_fuse_service_socket_dir+y}
+then :
+  withval=$with_fuse_service_socket_dir;
+else case e in #(
+  e) with_fuse_service_socket_dir=yes ;;
+esac
+fi
+
+	if test "x${with_fuse_service_socket_dir}" != "xno"
+then :
+
+		if test "x${with_fuse_service_socket_dir}" = "xyes"
+then :
+
+			if test "x$have_fuse3_pkg" = "xyes"
+then :
+
+				with_fuse_service_socket_dir="$($PKG_CONFIG --variable=service_socket_dir fuse3)"
+
+else case e in #(
+  e)
+				with_fuse_service_socket_dir=""
+			   ;;
+esac
+fi
+
+fi
+		{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for fuse3 service socket dir" >&5
+printf %s "checking for fuse3 service socket dir... " >&6; }
+		fuse_service_socket_dir="${with_fuse_service_socket_dir}"
+		if test -n "${fuse_service_socket_dir}"
+then :
+
+			{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: ${fuse_service_socket_dir}" >&5
+printf "%s\n" "${fuse_service_socket_dir}" >&6; }
+
+else case e in #(
+  e)
+			{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
+		   ;;
+esac
+fi
+
+fi
+
+# Check whether --with-fuse_service_socket_perms was given.
+if test ${with_fuse_service_socket_perms+y}
+then :
+  withval=$with_fuse_service_socket_perms;
+else case e in #(
+  e) with_fuse_service_socket_perms=yes ;;
+esac
+fi
+
+	if test "x${with_fuse_service_socket_perms}" != "xno"
+then :
+
+		if test "x${with_fuse_service_socket_perms}" = "xyes"
+then :
+
+			if test "x$have_fuse3_pkg" = "xyes"
+then :
+
+				with_fuse_service_socket_perms="$($PKG_CONFIG --variable=service_socket_perms fuse3)"
+
+else case e in #(
+  e)
+				with_fuse_service_socket_perms=""
+			   ;;
+esac
+fi
+
+fi
+		fuse_service_socket_perms="${with_fuse_service_socket_perms}"
+
+fi
+fi
+if test -n "$FUSE_USE_VERSION"
+then
+	{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for fuse_service_accept in libfuse" >&5
+printf %s "checking for fuse_service_accept in libfuse... " >&6; }
+	cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+	#define _GNU_SOURCE
+	#define _FILE_OFFSET_BITS	64
+	#define FUSE_USE_VERSION	319
+	#include <fuse_lowlevel.h>
+	#include <fuse_service.h>
+
+int
+main (void)
+{
+
+	struct fuse_service *moo;
+	fuse_service_accepted(moo);
+
+  ;
+  return 0;
+}
+
+_ACEOF
+if ac_fn_c_try_link "$LINENO"
+then :
+  have_fuse_service_accept=yes
+	   { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+printf "%s\n" "yes" >&6; }
+else case e in #(
+  e) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; } ;;
+esac
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
+    conftest$ac_exeext conftest.$ac_ext
+
+	{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for fuse3 service support" >&5
+printf %s "checking for fuse3 service support... " >&6; }
+	if test -n "${fuse_service_socket_dir}" && test "${have_fuse_service_accept}" = "yes"
+then :
+
+		{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+printf "%s\n" "yes" >&6; }
+		have_fuse_service="yes"
+
+else case e in #(
+  e)
+		{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
+	   ;;
+esac
+fi
+fi
+
+
+
+if test "$have_fuse_service" = yes
+then
+
+printf "%s\n" "#define HAVE_FUSE_SERVICE 1" >>confdefs.h
+
+fi
+
 FUSE2FS_CMT=
 # Check whether --enable-fuse2fs was given.
 if test ${enable_fuse2fs+y}
@@ -16595,6 +16755,30 @@ esac
 fi
 
 
+
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for fuse4fs service support and systemd" >&5
+printf %s "checking for fuse4fs service support and systemd... " >&6; }
+if test "${FUSE4FS_CMT}${have_fuse_service}${have_systemd}" = "yesyes"
+then :
+
+           { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+printf "%s\n" "yes" >&6; }
+
+printf "%s\n" "#define HAVE_FUSE4FS_SERVICE 1" >>confdefs.h
+
+           have_fuse4fs_service=yes
+
+else case e in #(
+  e)
+           { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
+           have_fuse4fs_service=no
+
+ ;;
+esac
+fi
+
+
 OS_IO_FILE=""
 case "$host_os" in
   mingw*)
diff --git a/configure.ac b/configure.ac
index 38a18de0b67283..381bb15d920a0f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1376,7 +1376,7 @@ dnl Check to see if the FUSE library is -lfuse3 or -losxfuse
 dnl
 FUSE_LIB=
 dnl osxfuse.dylib supersedes fuselib.dylib
-PKG_CHECK_MODULES([fuse3], [fuse3], [FUSE_LIB=-lfuse3],
+PKG_CHECK_MODULES([fuse3], [fuse3], [FUSE_LIB=-lfuse3 ; have_fuse3_pkg=yes],
 [
 	AC_CHECK_LIB(osxfuse, fuse_main, [FUSE_LIB=-losxfuse])
 ])
@@ -1428,6 +1428,96 @@ then
 		  [Define to 1 if fuse supports lowlevel API])
 fi
 
+dnl
+dnl Check if the FUSE library tells us where to put fs service sockets
+dnl
+have_fuse_service=
+fuse_service_socket_dir=
+if test -n "$have_fuse_lowlevel"
+then
+	AC_ARG_WITH([fuse_service_socket_dir],
+	  [AS_HELP_STRING([--with-fuse-service-socket-dir@<:@=DIR@:>@],
+		  [Create fuse3 filesystem service sockets in DIR.])],
+	  [],
+	  [with_fuse_service_socket_dir=yes])
+	AS_IF([test "x${with_fuse_service_socket_dir}" != "xno"],
+	  [
+		AS_IF([test "x${with_fuse_service_socket_dir}" = "xyes"],
+		  [
+			AS_IF([test "x$have_fuse3_pkg" = "xyes" ],
+			  [
+				with_fuse_service_socket_dir="$($PKG_CONFIG --variable=service_socket_dir fuse3)"
+			  ], [
+				with_fuse_service_socket_dir=""
+			  ])
+		  ])
+		AC_MSG_CHECKING([for fuse3 service socket dir])
+		fuse_service_socket_dir="${with_fuse_service_socket_dir}"
+		AS_IF([test -n "${fuse_service_socket_dir}"],
+		  [
+			AC_MSG_RESULT(${fuse_service_socket_dir})
+		  ],
+		  [
+			AC_MSG_RESULT(no)
+		  ])
+	  ],
+	  [])
+	AC_ARG_WITH([fuse_service_socket_perms],
+	  [AS_HELP_STRING([--with-fuse-service-socket-perms@<:@=MODE@:>@],
+		  [Create fuse3 filesystem service socket with these permissions.])],
+	  [],
+	  [with_fuse_service_socket_perms=yes])
+	AS_IF([test "x${with_fuse_service_socket_perms}" != "xno"],
+	  [
+		AS_IF([test "x${with_fuse_service_socket_perms}" = "xyes"],
+		  [
+			AS_IF([test "x$have_fuse3_pkg" = "xyes" ],
+			  [
+				with_fuse_service_socket_perms="$($PKG_CONFIG --variable=service_socket_perms fuse3)"
+			  ], [
+				with_fuse_service_socket_perms=""
+			  ])
+		  ])
+		fuse_service_socket_perms="${with_fuse_service_socket_perms}"
+	  ],
+	  [])
+fi
+if test -n "$FUSE_USE_VERSION"
+then
+	AC_MSG_CHECKING(for fuse_service_accept in libfuse)
+	AC_LINK_IFELSE(
+	[	AC_LANG_PROGRAM([[
+	#define _GNU_SOURCE
+	#define _FILE_OFFSET_BITS	64
+	#define FUSE_USE_VERSION	319
+	#include <fuse_lowlevel.h>
+	#include <fuse_service.h>
+		]], [[
+	struct fuse_service *moo;
+	fuse_service_accepted(moo);
+		]])
+	], have_fuse_service_accept=yes
+	   AC_MSG_RESULT(yes),
+	   AC_MSG_RESULT(no))
+
+	AC_MSG_CHECKING([for fuse3 service support])
+	AS_IF([test -n "${fuse_service_socket_dir}" && test "${have_fuse_service_accept}" = "yes"],
+	  [
+		AC_MSG_RESULT(yes)
+		have_fuse_service="yes"
+	  ],
+	  [
+		AC_MSG_RESULT(no)
+	  ])
+fi
+AC_SUBST(have_fuse_service)
+AC_SUBST(fuse_service_socket_dir)
+AC_SUBST(fuse_service_socket_perms)
+if test "$have_fuse_service" = yes
+then
+	AC_DEFINE(HAVE_FUSE_SERVICE, 1, [Define to 1 if fuse supports service])
+fi
+
 dnl
 dnl Check if fuse2fs is actually built.
 dnl
@@ -2101,6 +2191,22 @@ AS_IF([test "x${with_systemd_unit_dir}" != "xno"],
   ])
 AC_SUBST(have_systemd)
 AC_SUBST(systemd_system_unit_dir)
+
+AC_MSG_CHECKING([for fuse4fs service support and systemd])
+AS_IF([test "${FUSE4FS_CMT}${have_fuse_service}${have_systemd}" = "yesyes"],
+      [
+           AC_MSG_RESULT(yes)
+           AC_DEFINE(HAVE_FUSE4FS_SERVICE, 1,
+                     [Define to 1 if fuse4fs should be built with fuse service support])
+           have_fuse4fs_service=yes
+      ],
+      [
+           AC_MSG_RESULT(no)
+           have_fuse4fs_service=no
+      ]
+)
+AC_SUBST(have_fuse4fs_service)
+
 dnl Adjust the compiled files if we are on windows vs everywhere else
 dnl
 OS_IO_FILE=""
diff --git a/fuse4fs/Makefile.in b/fuse4fs/Makefile.in
index cecee2b2554f82..67b8afd54493b0 100644
--- a/fuse4fs/Makefile.in
+++ b/fuse4fs/Makefile.in
@@ -17,6 +17,13 @@ UMANPAGES=
 @FUSE4FS_CMT@UPROGS+=fuse4fs
 @FUSE4FS_CMT@UMANPAGES+=fuse4fs.1
 
+ifeq ($(HAVE_FUSE4FS_SERVICE),yes)
+SERVICE_FILES	+= fuse4fs.socket fuse4fs@.service
+INSTALLDIRS_TGT	+= installdirs-systemd
+INSTALL_TGT	+= install-systemd
+UNINSTALL_TGT	+= uninstall-systemd
+endif
+
 FUSE4FS_OBJS=	fuse4fs.o journal.o recovery.o revoke.o
 
 PROFILED_FUSE4FS_OJBS=	profiled/fuse4fs.o profiled/journal.o \
@@ -54,7 +61,7 @@ DEPEND_CFLAGS = -I$(top_srcdir)/e2fsck
 @PROFILE_CMT@	$(Q) $(CC) $(ALL_CFLAGS) -g -pg -o profiled/$*.o -c $<
 
 all:: profiled $(SPROGS) $(UPROGS) $(USPROGS) $(SMANPAGES) $(UMANPAGES) \
-	$(FMANPAGES) $(LPROGS)
+	$(FMANPAGES) $(LPROGS) $(SERVICE_FILES)
 
 all-static::
 
@@ -71,6 +78,14 @@ fuse4fs: $(FUSE4FS_OBJS) $(DEPLIBS) $(DEPLIBBLKID) $(DEPLIBUUID) \
 		$(LIBFUSE) $(LIBBLKID) $(LIBUUID) $(LIBEXT2FS) $(LIBINTL) \
 		$(CLOCK_GETTIME_LIB) $(SYSLIBS) $(LIBS_E2P)
 
+%.socket: %.socket.in $(DEP_SUBSTITUTE)
+	$(E) "	SUBST $@"
+	$(Q) $(SUBSTITUTE_UPTIME) $< $@
+
+%.service: %.service.in $(DEP_SUBSTITUTE)
+	$(E) "	SUBST $@"
+	$(Q) $(SUBSTITUTE_UPTIME) $< $@
+
 journal.o: $(srcdir)/../debugfs/journal.c
 	$(E) "	CC $<"
 	$(Q) $(CC) -c $(JOURNAL_CFLAGS) -I$(srcdir) \
@@ -93,11 +108,15 @@ fuse4fs.1: $(DEP_SUBSTITUTE) $(srcdir)/fuse4fs.1.in
 	$(E) "	SUBST $@"
 	$(Q) $(SUBSTITUTE_UPTIME) $(srcdir)/fuse4fs.1.in fuse4fs.1
 
-installdirs:
+installdirs: $(INSTALLDIRS_TGT)
 	$(E) "	MKDIR_P $(bindir) $(man1dir)"
 	$(Q) $(MKDIR_P) $(DESTDIR)$(bindir) $(DESTDIR)$(man1dir)
 
-install: all $(UMANPAGES) installdirs
+installdirs-systemd:
+	$(E) "	MKDIR_P $(SYSTEMD_SYSTEM_UNIT_DIR)"
+	$(Q) $(MKDIR_P) $(DESTDIR)$(SYSTEMD_SYSTEM_UNIT_DIR)
+
+install: all $(UMANPAGES) installdirs $(INSTALL_TGT)
 	$(Q) for i in $(UPROGS); do \
 		$(ES) "	INSTALL $(bindir)/$$i"; \
 		$(INSTALL_PROGRAM) $$i $(DESTDIR)$(bindir)/$$i; \
@@ -110,13 +129,19 @@ install: all $(UMANPAGES) installdirs
 		$(INSTALL_DATA) $$i $(DESTDIR)$(man1dir)/$$i; \
 	done
 
+install-systemd: $(SERVICE_FILES) installdirs-systemd
+	$(Q) for i in $(SERVICE_FILES); do \
+		$(ES) "	INSTALL_DATA $(SYSTEMD_SYSTEM_UNIT_DIR)/$$i"; \
+		$(INSTALL_DATA) $$i $(DESTDIR)$(SYSTEMD_SYSTEM_UNIT_DIR)/$$i; \
+	done
+
 install-strip: install
 	$(Q) for i in $(UPROGS); do \
 		$(E) "	STRIP $(bindir)/$$i"; \
 		$(STRIP) $(DESTDIR)$(bindir)/$$i; \
 	done
 
-uninstall:
+uninstall: $(UNINSTALL_TGT)
 	for i in $(UPROGS); do \
 		$(RM) -f $(DESTDIR)$(bindir)/$$i; \
 	done
@@ -124,9 +149,16 @@ uninstall:
 		$(RM) -f $(DESTDIR)$(man1dir)/$$i; \
 	done
 
+uninstall-systemd:
+	for i in $(SERVICE_FILES); do \
+		$(RM) -f $(DESTDIR)$(SYSTEMD_SYSTEM_UNIT_DIR)/$$i; \
+	done
+
 clean::
 	$(RM) -f $(UPROGS) $(UMANPAGES) profile.h \
 		fuse4fs.profiled \
+		$(SERVICE_FILES) \
+		fuse4fs.socket \
 		profiled/*.o \#* *.s *.o *.a *~ core gmon.out
 
 mostlyclean: clean
diff --git a/fuse4fs/fuse4fs.c b/fuse4fs/fuse4fs.c
index ebf42609c1a739..97e668fadc2398 100644
--- a/fuse4fs/fuse4fs.c
+++ b/fuse4fs/fuse4fs.c
@@ -42,6 +42,10 @@
 # define _FILE_OFFSET_BITS 64
 #endif /* _FILE_OFFSET_BITS */
 #include <fuse_lowlevel.h>
+#ifdef HAVE_FUSE4FS_SERVICE
+# include <sys/mount.h>
+# include <fuse_service.h>
+#endif
 #ifdef __SET_FOB_FOR_FUSE
 # undef _FILE_OFFSET_BITS
 #endif /* __SET_FOB_FOR_FUSE */
@@ -140,6 +144,10 @@
 
 #define FUSE4FS_ATTR_TIMEOUT	(0.0)
 
+#ifndef O_DIRECT
+# define O_DIRECT	(0)
+#endif
+
 static inline uint64_t round_up(uint64_t b, unsigned int align)
 {
 	unsigned int m;
@@ -285,8 +293,21 @@ struct fuse4fs {
 #endif
 	struct fuse_session *fuse;
 	struct cache inodes;
+#ifdef HAVE_FUSE4FS_SERVICE
+	struct fuse_service *service;
+	int bdev_fd;
+#endif
 };
 
+#ifdef HAVE_FUSE4FS_SERVICE
+static inline bool fuse4fs_is_service(const struct fuse4fs *ff)
+{
+	return fuse_service_accepted(ff->service);
+}
+#else
+# define fuse4fs_is_service(...)		(false)
+#endif
+
 #define FUSE4FS_CHECK_HANDLE(req, fh) \
 	do { \
 		if ((fh) == NULL || (fh)->magic != FUSE4FS_FILE_MAGIC) { \
@@ -1270,6 +1291,118 @@ static errcode_t fuse4fs_check_support(struct fuse4fs *ff)
 	return 0;
 }
 
+#ifdef HAVE_FUSE4FS_SERVICE
+static int fuse4fs_service_connect(struct fuse4fs *ff, struct fuse_args *args)
+{
+	int ret;
+
+	ret = fuse_service_accept(&ff->service);
+	if (ret)
+		return ret;
+
+	if (!fuse4fs_is_service(ff))
+		return 0;
+
+	return fuse_service_append_args(ff->service, args);
+}
+
+static bool fuse4fs_service_should_drop_kernel_mode(const struct fuse4fs *ff)
+{
+	return ff->kernel && fuse4fs_is_service(ff) &&
+	       !fuse_service_can_allow_other(ff->service);
+}
+
+static void fuse4fs_service_close_bdev(struct fuse4fs *ff)
+{
+	if (ff->bdev_fd >= 0)
+		close(ff->bdev_fd);
+	ff->bdev_fd = -1;
+}
+
+static int fuse4fs_service_exit(struct fuse4fs *ff, int exitcode)
+{
+	if (!fuse4fs_is_service(ff))
+		return exitcode;
+
+	fuse_service_send_goodbye(ff->service, exitcode);
+	fuse_service_release(ff->service);
+	close(ff->bdev_fd);
+	ff->bdev_fd = -1;
+
+	return fuse_service_exit(exitcode);
+}
+
+static int fuse4fs_service_open_bdev(struct fuse4fs *ff)
+{
+	double deadline = init_deadline(FUSE4FS_OPEN_TIMEOUT);
+	const int open_flags = O_EXCL | (ff->directio ? O_DIRECT : 0);
+	int open_mode = O_RDWR;
+	int fd;
+	int ret;
+
+	do {
+		ret = fuse_service_request_file(ff->service, ff->device,
+						open_mode | open_flags, 0, 0);
+		if (ret)
+			return ret;
+
+		ret = fuse_service_receive_file(ff->service, ff->device, &fd);
+		if (ret)
+			return ret;
+
+		if ((fd == -EPERM || fd == -EACCES || fd == -EROFS) &&
+		    open_mode == O_RDWR) {
+			/* Try readonly, but force the loop to run once more */
+			open_mode = O_RDONLY;
+			ret = 1;
+		}
+	} while (ret == 1 || (fd == -EBUSY && retry_before_deadline(deadline)));
+
+	if (fd < 0) {
+		err_printf(ff, "%s %s: %s.\n", _("opening device"), ff->device,
+			   strerror(-fd));
+		return -1;
+	}
+
+	if (!ff->ro && open_mode == O_RDONLY)
+		ff->ro = 1;
+
+	ff->bdev_fd = fd;
+	return 0;
+}
+
+static int fuse4fs_service_get_config(struct fuse4fs *ff)
+{
+	int ret, ret2;
+
+	ret = fuse4fs_service_open_bdev(ff);
+
+	/* Always prevent further fds from being added to our file table */
+	ret2 = fuse_service_finish_file_requests(ff->service);
+	if (ret2 && !ret)
+		ret = ret2;
+
+	return ret;
+}
+
+static errcode_t fuse4fs_service_openfs(struct fuse4fs *ff, char *options,
+					int flags)
+{
+	char path[64];
+
+	snprintf(path, sizeof(path), "/dev/fd/%d", ff->bdev_fd);
+	return ext2fs_open2(path, options, flags, 0, 0, unixfd_io_manager,
+			    &ff->fs);
+}
+#else
+# define fuse4fs_service_connect(...)		(0)
+# define fuse4fs_service_should_drop_kernel_mode(...)	(false)
+# define fuse4fs_service_close_bdev(...)	((void)0)
+# define fuse4fs_service_exit(fctx, ret)	(ret)
+# define fuse4fs_service_get_config(...)	(EOPNOTSUPP)
+# define fuse4fs_service_openfs(...)		(EOPNOTSUPP)
+#endif
+
 static errcode_t fuse4fs_acquire_lockfile(struct fuse4fs *ff)
 {
 	char *resolved;
@@ -1340,6 +1473,8 @@ static void fuse4fs_unmount(struct fuse4fs *ff)
 				   uuid);
 	}
 
+	fuse4fs_service_close_bdev(ff);
+
 	if (ff->lockfile)
 		fuse4fs_release_lockfile(ff);
 }
@@ -1395,8 +1530,11 @@ static errcode_t fuse4fs_open(struct fuse4fs *ff)
 	 */
 	deadline = init_deadline(FUSE4FS_OPEN_TIMEOUT);
 	do {
-		err = ext2fs_open2(ff->device, options, flags, 0, 0,
-				   unix_io_manager, &ff->fs);
+		if (fuse4fs_is_service(ff))
+			err = fuse4fs_service_openfs(ff, options, flags);
+		else
+			err = ext2fs_open2(ff->device, options, flags, 0, 0,
+					   unix_io_manager, &ff->fs);
 		if ((err == EPERM || err == EACCES) &&
 		    (!ff->ro || (flags & EXT2_FLAG_RW))) {
 			/*
@@ -1741,6 +1879,10 @@ static int fuse4fs_setup_logging(struct fuse4fs *ff)
 	if (logfile)
 		return fuse4fs_capture_output(ff, logfile);
 
+	/* systemd already hooked us up to /dev/ttyprintk */
+	if (fuse4fs_is_service(ff))
+		return 0;
+
 	/* in kernel mode, try to log errors to the kernel log */
 	if (ff->kernel)
 		fuse4fs_capture_output(ff, "/dev/ttyprintk");
@@ -5962,14 +6104,13 @@ static const char *get_subtype(const char *argv0)
 }
 
 static void fuse4fs_compute_libfuse_args(struct fuse4fs *ff,
-					 struct fuse_args *args,
-					 const char *argv0)
+					 struct fuse_args *args)
 {
 	char extra_args[BUFSIZ];
 
 	/* Set up default fuse parameters */
 	snprintf(extra_args, BUFSIZ, "-osubtype=%s,fsname=%s",
-		 get_subtype(argv0),
+		 get_subtype(args->argv[0]),
 		 ff->device);
 	if (ff->no_default_opts == 0)
 		fuse_opt_add_arg(args, extra_args);
@@ -5986,6 +6127,15 @@ static void fuse4fs_compute_libfuse_args(struct fuse4fs *ff,
 #endif
 	}
 
+	/*
+	 * If we're mounting as a systemd service but the mount helper told us
+	 * that allow_other isn't allowed, then disable -okernel.  This mount
+	 * option gets special consideration because it's hardcoded in the
+	 * service unit file.
+	 */
+	if (fuse4fs_service_should_drop_kernel_mode(ff))
+		ff->kernel = 0;
+
 	if (ff->kernel) {
 		/*
 		 * ACLs are always enforced when kernel mode is enabled, to
@@ -6097,6 +6247,69 @@ static int fuse4fs_event_loop(struct fuse4fs *ff,
 	return fuse_session_loop_mt(ff->fuse, loop_config) == 0 ? 0 : 8;
 }
 
+#ifdef HAVE_FUSE4FS_SERVICE
+static int fuse4fs_service_main(struct fuse_args *args, struct fuse4fs *ff)
+{
+	struct fuse_cmdline_opts opts;
+	struct fuse_loop_config *loop_config = NULL;
+	int ret;
+
+	/*
+	 * Service initialization doesn't fork or change stdout/stderr so we
+	 * can drop the extra logfd right now.
+	 */
+	if (ff->logfd >= 0)
+		close(ff->logfd);
+	ff->logfd = -1;
+
+	ret = fuse_service_parse_cmdline_opts(args, &opts);
+	if (ret != 0) {
+		ret = 1;
+		goto out;
+	}
+
+	ret = fuse4fs_create_session(ff, args, &opts);
+	if (ret || !ff->fuse)
+		goto out_free_opts;
+
+	loop_config = fuse_loop_cfg_create();
+	if (loop_config == NULL) {
+		ret = 7;
+		goto out_destroy_session;
+	}
+
+	if (fuse_set_signal_handlers(ff->fuse) != 0) {
+		ret = 6;
+		goto out_loopcfg;
+	}
+
+	ret = fuse_service_session_mount(ff->service, ff->fuse, S_IFDIR, &opts);
+	if (ret) {
+		ret = 4;
+		goto out_signals;
+	}
+
+	fuse_service_send_goodbye(ff->service, 0);
+	fuse_service_release(ff->service);
+
+	ret = fuse4fs_event_loop(ff, loop_config, &opts);
+
+out_signals:
+	fuse_remove_signal_handlers(ff->fuse);
+out_loopcfg:
+	fuse_loop_cfg_destroy(loop_config);
+out_destroy_session:
+	fuse_session_destroy(ff->fuse);
+	ff->fuse = NULL;
+out_free_opts:
+	free(opts.mountpoint);
+out:
+	return ret;
+}
+#else
+# define fuse4fs_service_main(...)		(8)
+#endif
+
 static int fuse4fs_main(struct fuse_args *args, struct fuse4fs *ff)
 {
 	struct fuse_cmdline_opts opts;
@@ -6168,18 +6381,28 @@ int main(int argc, char *argv[])
 		.bfl = (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER,
 		.oom_score_adj = -500,
 		.opstate = F4OP_WRITABLE,
+#ifdef HAVE_FUSE4FS_SERVICE
+		.bdev_fd = -1,
+#endif
 	};
 	errcode_t err;
 	FILE *orig_stderr = stderr;
 	int ret;
 
+	ret = fuse4fs_service_connect(&fctx, &args);
+	if (ret) {
+		ret = 1;
+		goto out_exit;
+	}
+
 	ret = fuse_opt_parse(&args, &fctx, fuse4fs_opts, fuse4fs_opt_proc);
 	if (ret)
-		exit(1);
+		goto out_exit;
 	if (fctx.device == NULL) {
 		fprintf(stderr, "Missing ext4 device/image\n");
 		fprintf(stderr, "See '%s -h' for usage\n", argv[0]);
-		exit(1);
+		ret = 1;
+		goto out_exit;
 	}
 
 	/* /dev/sda -> sda for reporting */
@@ -6209,6 +6432,14 @@ int main(int argc, char *argv[])
 		goto out;
 	}
 
+	if (fuse4fs_is_service(&fctx)) {
+		ret = fuse4fs_service_get_config(&fctx);
+		if (ret) {
+			ret = 2;
+			goto out;
+		}
+	}
+
 	try_set_io_flusher(&fctx);
 	try_adjust_oom_score(&fctx);
 
@@ -6264,9 +6495,12 @@ int main(int argc, char *argv[])
 	/* Initialize generation counter */
 	get_random_bytes(&fctx.next_generation, sizeof(unsigned int));
 
-	fuse4fs_compute_libfuse_args(&fctx, &args, argv[0]);
+	fuse4fs_compute_libfuse_args(&fctx, &args);
 
-	ret = fuse4fs_main(&args, &fctx);
+	if (fuse4fs_is_service(&fctx))
+		ret = fuse4fs_service_main(&args, &fctx);
+	else
+		ret = fuse4fs_main(&args, &fctx);
 	switch(ret) {
 	case 0:
 		/* success */
@@ -6308,6 +6542,8 @@ int main(int argc, char *argv[])
 	if (fctx.device)
 		free(fctx.device);
 	pthread_mutex_destroy(&fctx.bfl);
+out_exit:
+	ret = fuse4fs_service_exit(&fctx, ret);
 	fuse_opt_free_args(&args);
 	return ret;
 }
diff --git a/fuse4fs/fuse4fs.socket.in b/fuse4fs/fuse4fs.socket.in
new file mode 100644
index 00000000000000..99e391bcc6787e
--- /dev/null
+++ b/fuse4fs/fuse4fs.socket.in
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# Copyright (C) 2025-2026 Oracle.  All Rights Reserved.
+# Author: Darrick J. Wong <djwong@kernel.org>
+[Unit]
+Description=Socket for ext4 Service
+
+[Socket]
+ListenSequentialPacket=@fuse_service_socket_dir@/ext2
+ListenSequentialPacket=@fuse_service_socket_dir@/ext3
+ListenSequentialPacket=@fuse_service_socket_dir@/ext4
+Accept=yes
+SocketMode=@fuse_service_socket_perms@
+RemoveOnStop=yes
+
+[Install]
+WantedBy=sockets.target
diff --git a/fuse4fs/fuse4fs@.service.in b/fuse4fs/fuse4fs@.service.in
new file mode 100644
index 00000000000000..38434c383c7be3
--- /dev/null
+++ b/fuse4fs/fuse4fs@.service.in
@@ -0,0 +1,102 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# Copyright (C) 2025-2026 Oracle.  All Rights Reserved.
+# Author: Darrick J. Wong <djwong@kernel.org>
+[Unit]
+Description=ext4 Service
+
+# Don't leave failed units behind, systemd does not clean them up!
+CollectMode=inactive-or-failed
+
+[Service]
+Type=exec
+ExecStart=@bindir@/fuse4fs -o kernel
+
+# Try to capture core dumps
+LimitCORE=infinity
+
+SyslogIdentifier=%N
+
+# No realtime CPU scheduling
+RestrictRealtime=true
+
+# Don't let us see anything in the regular system, and don't run as root
+DynamicUser=true
+ProtectSystem=strict
+ProtectHome=true
+PrivateTmp=true
+PrivateDevices=true
+PrivateUsers=true
+
+# No network access
+PrivateNetwork=true
+ProtectHostname=true
+RestrictAddressFamilies=none
+IPAddressDeny=any
+
+# Don't let the program mess with the kernel configuration at all
+ProtectKernelLogs=true
+ProtectKernelModules=true
+ProtectKernelTunables=true
+ProtectControlGroups=true
+ProtectProc=invisible
+RestrictNamespaces=true
+RestrictFileSystems=
+
+# Hide everything in /proc, even /proc/mounts
+ProcSubset=pid
+
+# Only allow the default Linux personality
+LockPersonality=true
+
+# No memory pages that are both writable and executable
+MemoryDenyWriteExecute=true
+
+# Don't let our mounts leak out to the host
+PrivateMounts=true
+
+# Restrict system calls to the native arch and only enough to get things going
+SystemCallArchitectures=native
+SystemCallFilter=@system-service
+SystemCallFilter=~@privileged
+SystemCallFilter=~@resources
+
+SystemCallFilter=~@clock
+SystemCallFilter=~@cpu-emulation
+SystemCallFilter=~@debug
+SystemCallFilter=~@module
+SystemCallFilter=~@reboot
+SystemCallFilter=~@swap
+
+SystemCallFilter=~@mount
+
+# libfuse io_uring wants to pin cores and memory
+SystemCallFilter=mbind
+SystemCallFilter=sched_setaffinity
+
+# Leave a breadcrumb if we get whacked by the system call filter
+SystemCallErrorNumber=EL3RST
+
+# Log to the kernel dmesg, just like an in-kernel ext4 driver
+StandardOutput=append:/dev/ttyprintk
+StandardError=append:/dev/ttyprintk
+
+# Run with no capabilities at all
+CapabilityBoundingSet=
+AmbientCapabilities=
+NoNewPrivileges=true
+
+# fuse4fs doesn't create files
+UMask=7777
+
+# No access to hardware /dev files at all
+ProtectClock=true
+DevicePolicy=closed
+
+# Don't mess with set[ug]id anything.
+RestrictSUIDSGID=true
+
+# Don't let OOM kills of processes in this containment group kill the whole
+# service, because we don't want filesystem drivers to go down.
+OOMPolicy=continue
+OOMScoreAdjust=-1000
diff --git a/lib/config.h.in b/lib/config.h.in
index abba5e2c625b24..15b99c6d28c59e 100644
--- a/lib/config.h.in
+++ b/lib/config.h.in
@@ -142,6 +142,9 @@
 /* Define to 1 if you have the 'ftruncate64' function. */
 #undef HAVE_FTRUNCATE64
 
+/* Define to 1 if fuse4fs should be built with fuse service support */
+#undef HAVE_FUSE4FS_SERVICE
+
 /* Define to 1 if fuse supports cache_readdir */
 #undef HAVE_FUSE_CACHE_READDIR
 
@@ -151,6 +154,9 @@
 /* Define to 1 if fuse supports lowlevel API */
 #undef HAVE_FUSE_LOWLEVEL
 
+/* Define to 1 if fuse supports service */
+#undef HAVE_FUSE_SERVICE
+
 /* Define to 1 if you have the 'futimes' function. */
 #undef HAVE_FUTIMES
 
diff --git a/util/subst.conf.in b/util/subst.conf.in
index 5af5e356d46ac7..3d0ec5cc39eabd 100644
--- a/util/subst.conf.in
+++ b/util/subst.conf.in
@@ -24,3 +24,6 @@ root_bindir		@root_bindir@
 libdir			@libdir@
 $exec_prefix		@exec_prefix@
 pkglibexecdir		@libexecdir@/e2fsprogs
+bindir			@bindir@
+fuse_service_socket_dir	@fuse_service_socket_dir@
+fuse_service_socket_perms	@fuse_service_socket_perms@


^ permalink raw reply related

* [PATCH 06/10] fuse4fs: hoist some code out of fuse4fs_main
From: Darrick J. Wong @ 2026-06-25 19:38 UTC (permalink / raw)
  To: tytso; +Cc: linux-ext4
In-Reply-To: <178241606773.1810414.1910863348345107939.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

In the next patch, we're going to create a separate variant of
fuse4fs_main for use when we're running in service mode.  Hoist the code
that will be shared between the two functions into separate helpers.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 fuse4fs/fuse4fs.c |   95 +++++++++++++++++++++++++++--------------------------
 1 file changed, 49 insertions(+), 46 deletions(-)


diff --git a/fuse4fs/fuse4fs.c b/fuse4fs/fuse4fs.c
index 155cb7332a9b3f..ebf42609c1a739 100644
--- a/fuse4fs/fuse4fs.c
+++ b/fuse4fs/fuse4fs.c
@@ -6055,47 +6055,64 @@ static void fuse4fs_com_err_proc(const char *whoami, errcode_t code,
 	fflush(stderr);
 }
 
-static int fuse4fs_main(struct fuse_args *args, struct fuse4fs *ff)
+static int fuse4fs_create_session(struct fuse4fs *ff, struct fuse_args *args,
+				  struct fuse_cmdline_opts *opts)
 {
-	struct fuse_cmdline_opts opts;
-	struct fuse_session *se;
-	struct fuse_loop_config *loop_config = NULL;
-	int ret = 0;
-
-	if (fuse_parse_cmdline(args, &opts) != 0) {
-		ret = 1;
-		goto out;
-	}
-
 	if (ff->debug)
-		opts.debug = true;
+		opts->debug = true;
 
-	if (opts.show_help) {
+	if (opts->show_help) {
 		fuse_cmdline_help();
-		ret = 0;
-		goto out_free_opts;
+		return 0;
 	}
 
-	if (opts.show_version) {
+	if (opts->show_version) {
 		printf("FUSE library version %s\n", fuse_pkgversion());
-		ret = 0;
-		goto out_free_opts;
+		return 0;
 	}
 
-	if (!opts.mountpoint) {
+	if (!opts->mountpoint) {
 		fprintf(stderr, "error: no mountpoint specified\n");
-		ret = 2;
-		goto out_free_opts;
+		return 2;
 	}
 
-	se = fuse_session_new(args, &fs_ops, sizeof(fs_ops), ff);
-	if (se == NULL) {
-		ret = 3;
-		goto out_free_opts;
+	ff->fuse = fuse_session_new(args, &fs_ops, sizeof(fs_ops), ff);
+	return ff->fuse ? 0 : 3;
+}
+
+static int fuse4fs_event_loop(struct fuse4fs *ff,
+			      struct fuse_loop_config *loop_config,
+			      const struct fuse_cmdline_opts *opts)
+{
+	/*
+	 * Since there's a Big Kernel Lock around all the libext2fs code, we
+	 * only need to start four threads -- one to decode a request, another
+	 * to do the filesystem work, a third to transmit the reply, and a
+	 * fourth to handle fuse notifications.
+	 */
+	fuse_loop_cfg_set_clone_fd(loop_config, opts->clone_fd);
+	fuse_loop_cfg_set_idle_threads(loop_config, opts->max_idle_threads);
+	fuse_loop_cfg_set_max_threads(loop_config, 4);
+
+	return fuse_session_loop_mt(ff->fuse, loop_config) == 0 ? 0 : 8;
+}
+
+static int fuse4fs_main(struct fuse_args *args, struct fuse4fs *ff)
+{
+	struct fuse_cmdline_opts opts;
+	struct fuse_loop_config *loop_config = NULL;
+	int ret;
+
+	if (fuse_parse_cmdline(args, &opts) != 0) {
+		ret = 1;
+		goto out;
 	}
-	ff->fuse = se;
 
-	if (fuse_session_mount(se, opts.mountpoint) != 0) {
+	ret = fuse4fs_create_session(ff, args, &opts);
+	if (ret || !ff->fuse)
+		goto out_free_opts;
+
+	if (fuse_session_mount(ff->fuse, opts.mountpoint) != 0) {
 		ret = 4;
 		goto out_destroy_session;
 	}
@@ -6115,7 +6132,7 @@ static int fuse4fs_main(struct fuse_args *args, struct fuse4fs *ff)
 		close(ff->logfd);
 	ff->logfd = -1;
 
-	if (fuse_set_signal_handlers(se) != 0) {
+	if (fuse_set_signal_handlers(ff->fuse) != 0) {
 		ret = 6;
 		goto out_unmount;
 	}
@@ -6126,30 +6143,16 @@ static int fuse4fs_main(struct fuse_args *args, struct fuse4fs *ff)
 		goto out_remove_signal_handlers;
 	}
 
-	/*
-	 * Since there's a Big Kernel Lock around all the libext2fs code, we
-	 * only need to start four threads -- one to decode a request, another
-	 * to do the filesystem work, a third to transmit the reply, and a
-	 * fourth to handle fuse notifications.
-	 */
-	fuse_loop_cfg_set_clone_fd(loop_config, opts.clone_fd);
-	fuse_loop_cfg_set_idle_threads(loop_config, opts.max_idle_threads);
-	fuse_loop_cfg_set_max_threads(loop_config, 4);
+	ret = fuse4fs_event_loop(ff, loop_config, &opts);
 
-	if (fuse_session_loop_mt(se, loop_config) != 0) {
-		ret = 8;
-		goto out_loopcfg;
-	}
-
-out_loopcfg:
 	fuse_loop_cfg_destroy(loop_config);
 out_remove_signal_handlers:
-	fuse_remove_signal_handlers(se);
+	fuse_remove_signal_handlers(ff->fuse);
 out_unmount:
-	fuse_session_unmount(se);
+	fuse_session_unmount(ff->fuse);
 out_destroy_session:
+	fuse_session_destroy(ff->fuse);
 	ff->fuse = NULL;
-	fuse_session_destroy(se);
 out_free_opts:
 	free(opts.mountpoint);
 out:


^ permalink raw reply related

* [PATCH 05/10] libext2fs: bump libfuse API version to 3.19
From: Darrick J. Wong @ 2026-06-25 19:38 UTC (permalink / raw)
  To: tytso; +Cc: linux-ext4
In-Reply-To: <178241606773.1810414.1910863348345107939.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

The fuse service container API is only available in 3.19, so we need to
bump FUSE_USE_VERSION up from 3.14 to 3.19.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 configure    |    8 ++++----
 configure.ac |   10 +++++-----
 2 files changed, 9 insertions(+), 9 deletions(-)


diff --git a/configure b/configure
index d941ff1f1ad900..f24897fcdd4949 100755
--- a/configure
+++ b/configure
@@ -14604,14 +14604,14 @@ fi
 
 if test -n "$FUSE_LIB"
 then
-	FUSE_USE_VERSION=314
+	FUSE_USE_VERSION=319
 	CFLAGS="$fuse3_CFLAGS $CFLAGS"
 	FUSE_LIB="$fuse3_LIBS"
 	       for ac_header in pthread.h fuse.h
 do :
   as_ac_Header=`printf "%s\n" "ac_cv_header_$ac_header" | sed "$as_sed_sh"`
 ac_fn_c_check_header_compile "$LINENO" "$ac_header" "$as_ac_Header" "#define _FILE_OFFSET_BITS	64
-#define FUSE_USE_VERSION	314
+#define FUSE_USE_VERSION	319
 "
 if eval test \"x\$"$as_ac_Header"\" = x"yes"
 then :
@@ -14646,7 +14646,7 @@ printf %s "checking for lowlevel interface in libfuse... " >&6; }
 
 	#define _GNU_SOURCE
 	#define _FILE_OFFSET_BITS	64
-	#define FUSE_USE_VERSION	314
+	#define FUSE_USE_VERSION	319
 	#include <fuse_lowlevel.h>
 
 int
@@ -14826,7 +14826,7 @@ printf %s "checking for cache_readdir support in libfuse... " >&6; }
 
 	#define _GNU_SOURCE
 	#define _FILE_OFFSET_BITS	64
-	#define FUSE_USE_VERSION	314
+	#define FUSE_USE_VERSION	319
 	#include <fuse.h>
 
 int
diff --git a/configure.ac b/configure.ac
index d8f40f5df0946b..38a18de0b67283 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1384,17 +1384,17 @@ AC_SUBST(FUSE_LIB)
 
 dnl
 dnl Set FUSE_USE_VERSION, which is how fuse servers build against a particular
-dnl libfuse ABI.  Currently we link against the libfuse 3.14 ABI (hence 314)
+dnl libfuse ABI.  Currently we link against the libfuse 3.19 ABI (hence 319)
 dnl
 if test -n "$FUSE_LIB"
 then
-	FUSE_USE_VERSION=314
+	FUSE_USE_VERSION=319
 	CFLAGS="$fuse3_CFLAGS $CFLAGS"
 	FUSE_LIB="$fuse3_LIBS"
 	AC_CHECK_HEADERS([pthread.h fuse.h], [],
 		[AC_MSG_FAILURE([Cannot build against fuse3 headers])],
 [#define _FILE_OFFSET_BITS	64
-#define FUSE_USE_VERSION	314])
+#define FUSE_USE_VERSION	319])
 fi
 if test -n "$FUSE_USE_VERSION"
 then
@@ -1413,7 +1413,7 @@ then
 	[	AC_LANG_PROGRAM([[
 	#define _GNU_SOURCE
 	#define _FILE_OFFSET_BITS	64
-	#define FUSE_USE_VERSION	314
+	#define FUSE_USE_VERSION	319
 	#include <fuse_lowlevel.h>
 		]], [[
 	struct fuse_lowlevel_ops fs_ops = { };
@@ -1515,7 +1515,7 @@ then
 	[	AC_LANG_PROGRAM([[
 	#define _GNU_SOURCE
 	#define _FILE_OFFSET_BITS	64
-	#define FUSE_USE_VERSION	314
+	#define FUSE_USE_VERSION	319
 	#include <fuse.h>
 		]], [[
 	struct fuse_file_info fs_ops = {


^ permalink raw reply related

* [PATCH 04/10] libext2fs: fix MMP code to work with unixfd IO manager
From: Darrick J. Wong @ 2026-06-25 19:37 UTC (permalink / raw)
  To: tytso; +Cc: linux-ext4
In-Reply-To: <178241606773.1810414.1910863348345107939.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

The MMP code wants to be able to read and write the MMP block directly
to storage so that the pagecache does not get in the way.  This is
critical for correct operation of MMP, because it is guarding against
two cluster nodes trying to change the filesystem at the same time.

Unfortunately there's no convenient way to tell an IO manager to perform
a particular IO in directio mode, so the MMP code open()s the filesystem
source device a second time so that it can set O_DIRECT and maintain its
own file position independently of the IO channel.  This is a gross
layering violation.

For unprivileged containerized fuse4fs, we're going to have a privileged
mount helper pass us the fd to the block device, so we'll be using the
unixfd IO manager.  The enhanced security posture provided by the
service definition file (minimal /dev) means that we cannot reopen the
source device.  In this case, MMP can only duplicate the fd and use the
IO channel carefully.

Fix this (sort of) by detecting the unixfd IO manager and duplicating
the open fd if it's in use.  This adds a requirement that the unixfd
originally be opened in O_DIRECT mode if the filesystem is on a block
device, but that's the best we can do here.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 lib/ext2fs/ext2fs.h  |    1 +
 lib/ext2fs/ext2fsP.h |    4 ++
 lib/ext2fs/mmp.c     |   95 +++++++++++++++++++++++++++++++++++++++++++++++++-
 lib/ext2fs/unix_io.c |    2 +
 4 files changed, 100 insertions(+), 2 deletions(-)


diff --git a/lib/ext2fs/ext2fs.h b/lib/ext2fs/ext2fs.h
index c4fcb10bea0fb9..02c3cbcea92482 100644
--- a/lib/ext2fs/ext2fs.h
+++ b/lib/ext2fs/ext2fs.h
@@ -225,6 +225,7 @@ typedef struct ext2_file *ext2_file_t;
  * Internal flags for use by the ext2fs library only
  */
 #define EXT2_FLAG2_USE_FAKE_TIME	0x000000001
+#define EXT2_FLAG2_MMP_USE_IOCHANNEL	0x000000002
 
 /*
  * Special flag in the ext2 inode i_flag field that means that this is
diff --git a/lib/ext2fs/ext2fsP.h b/lib/ext2fs/ext2fsP.h
index 428081c9e2ff38..bdc92991e7dda0 100644
--- a/lib/ext2fs/ext2fsP.h
+++ b/lib/ext2fs/ext2fsP.h
@@ -218,3 +218,7 @@ errcode_t ext2fs_remove_exit_fn(ext2_exit_fn fn, void *data);
         (sizeof(array) / sizeof(array[0]))
 
 #define EXT2FS_BUILD_BUG_ON(cond) ((void)sizeof(char[1 - 2*!!(cond)]))
+
+#ifndef _WIN32
+int possible_unixfd_pathname(const char *path);
+#endif
diff --git a/lib/ext2fs/mmp.c b/lib/ext2fs/mmp.c
index cb15a18fce5547..188cdb68900e97 100644
--- a/lib/ext2fs/mmp.c
+++ b/lib/ext2fs/mmp.c
@@ -26,9 +26,11 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>
+#include <limits.h>
 
 #include "ext2fs/ext2_fs.h"
 #include "ext2fs/ext2fs.h"
+#include "ext2fs/ext2fsP.h"
 
 #ifndef O_DIRECT
 #define O_DIRECT 0
@@ -48,6 +50,86 @@ errcode_t ext2fs_mmp_get_mem(ext2_filsys fs, void **ptr)
 	return ext2fs_get_memalign(fs->blocksize, align, ptr);
 }
 
+#ifdef _WIN32
+static int ext2fs_mmp_open_device(ext2_filsys fs, int flags)
+{
+	return open(fs->device_name, flags);
+}
+#else
+static int ext2fs_mmp_open_device(ext2_filsys fs, int flags)
+{
+	struct stat stbuf;
+	char path[64];
+	int maybe_fd = -1;
+	int new_fd;
+	int ret;
+	errcode_t retval = 0;
+
+	/*
+	 * If we can't possibly be using the unixfd IO manager, open the device
+	 * a second time, which is the historical behavior.  This is a huge
+	 * and historic layering violation!
+	 *
+	 * It's also broken if the unixfd IO manager was passed a string with a
+	 * file descriptor number instead of a /dev/fd/XX path, but the
+	 * internet thinks there are no users of the manager outside of Google.
+	 */
+	if (!possible_unixfd_pathname(fs->device_name))
+		return open(fs->device_name, flags);
+
+	/*
+	 * Try to get the fd of the open block device.  If this fails for any
+	 * reason, fall back to the classic open path.
+	 */
+	retval = io_channel_get_fd(fs->io, &maybe_fd);
+	if (retval || maybe_fd < 0)
+		return open(fs->device_name, flags);
+
+	/*
+	 * We extracted the fd from the IO manager.
+	 *
+	 * Skip directio if this is a regular file, as ext2fs_mmp_read does.
+	 * Note that the O_DIRECT-clearing logic in the caller might not have
+	 * cleared the bit because it is path based.
+	 */
+	if (fstat(maybe_fd, &stbuf) == 0 && S_ISREG(stbuf.st_mode))
+		flags &= ~O_DIRECT;
+
+	/*
+	 * Try to reopen the same file descriptor, but with the new mode flags.
+	 * If that works then we're done.  Note that these magic symlinks do
+	 * not have to resolve anywhere.
+	 */
+	snprintf(path, sizeof(path), "/dev/fd/%d", maybe_fd);
+	new_fd = open(path, flags);
+	if (new_fd >= 0)
+		return new_fd;
+
+	/*
+	 * Reopening didn't work.  Instead, duplicate the file descriptor and
+	 * check that we actually got directio if that's required.  Note that
+	 * we can't change the mode on the IO channel's fd because we already
+	 * set it up for buffered IO.
+	 */
+	new_fd = dup(maybe_fd);
+	if (flags & O_DIRECT) {
+		ret = fcntl(new_fd, F_GETFL);
+		if (ret < 0 || !(ret & O_DIRECT)) {
+			close(new_fd);
+			return -1;
+		}
+	}
+
+	/*
+	 * The MMP fd shadows the io channel fd, so we must use the io channel
+	 * for all MMP block accesses because the two fds share the same file
+	 * position and O_DIRECT state, and the iochannel must know about that.
+	 */
+	fs->flags2 |= EXT2_FLAG2_MMP_USE_IOCHANNEL;
+	return new_fd;
+}
+#endif
+
 errcode_t ext2fs_mmp_read(ext2_filsys fs, blk64_t mmp_blk, void *buf)
 {
 #ifdef CONFIG_MMP
@@ -77,7 +159,7 @@ errcode_t ext2fs_mmp_read(ext2_filsys fs, blk64_t mmp_blk, void *buf)
 		    S_ISREG(st.st_mode))
 			flags &= ~O_DIRECT;
 
-		fs->mmp_fd = open(fs->device_name, flags);
+		fs->mmp_fd = ext2fs_mmp_open_device(fs, flags);
 		if (fs->mmp_fd < 0) {
 			retval = EXT2_ET_MMP_OPEN_DIRECT;
 			goto out;
@@ -90,6 +172,15 @@ errcode_t ext2fs_mmp_read(ext2_filsys fs, blk64_t mmp_blk, void *buf)
 			return retval;
 	}
 
+	if (fs->flags2 & EXT2_FLAG2_MMP_USE_IOCHANNEL) {
+		retval = io_channel_read_blk64(fs->io, mmp_blk, -fs->blocksize,
+					       fs->mmp_cmp);
+		if (retval)
+			return retval;
+
+		goto read_compare;
+	}
+
 	if ((blk64_t) ext2fs_llseek(fs->mmp_fd, mmp_blk * fs->blocksize,
 				    SEEK_SET) !=
 	    mmp_blk * fs->blocksize) {
@@ -102,6 +193,7 @@ errcode_t ext2fs_mmp_read(ext2_filsys fs, blk64_t mmp_blk, void *buf)
 		goto out;
 	}
 
+read_compare:
 	mmp_cmp = fs->mmp_cmp;
 
 	if (!(fs->flags & EXT2_FLAG_IGNORE_CSUM_ERRORS) &&
@@ -428,6 +520,7 @@ errcode_t ext2fs_mmp_stop(ext2_filsys fs)
 
 mmp_error:
 	if (fs->mmp_fd >= 0) {
+		fs->flags2 &= ~EXT2_FLAG2_MMP_USE_IOCHANNEL;
 		close(fs->mmp_fd);
 		fs->mmp_fd = -1;
 	}
diff --git a/lib/ext2fs/unix_io.c b/lib/ext2fs/unix_io.c
index a9b1fac62a0250..567bbd9493f7f1 100644
--- a/lib/ext2fs/unix_io.c
+++ b/lib/ext2fs/unix_io.c
@@ -1152,7 +1152,7 @@ static errcode_t unix_open_channel(const char *name, int fd,
 #define DEV_FD_PATH	"/dev/fd/"
 #define DEV_FD_PATHLEN	(sizeof(DEV_FD_PATH) - 1)
 
-static int possible_unixfd_pathname(const char *path)
+int possible_unixfd_pathname(const char *path)
 {
 	return strncmp(DEV_FD_PATH, path, DEV_FD_PATHLEN) == 0;
 }


^ permalink raw reply related

* [PATCH 03/10] unix_io: allow passing /dev/fd/XXX paths to the unixfd IO manager
From: Darrick J. Wong @ 2026-06-25 19:37 UTC (permalink / raw)
  To: tytso; +Cc: linux-ext4, linux-ext4
In-Reply-To: <178241606773.1810414.1910863348345107939.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

Commit 4ccf9e4fe165cf created a "unixfd" IO manager that allows someone
to choose the unixfd IO manager and then mount a filesystem from an
existing file descriptor by passing a string with the fd number as the
"device" name to ext2fs_open().

That was an unfortunate choice of naming, however, because that could
be mistaken for a relative path to a file whose name is an integer
number.  Let's improve this by allowing callers to pass /dev/fd/XX
as the filesystem device name.  The upcoming fuse4fs service patches
will employ this method to open a filesystem on a block device fd passed
into the secure container from a mount helper.

Cc: <linux-ext4@vger.kernel.org> # v1.43.2
Fixes: 4ccf9e4fe165cf ("libext2fs: add unixfd_io_manager")
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 lib/ext2fs/unix_io.c |   30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)


diff --git a/lib/ext2fs/unix_io.c b/lib/ext2fs/unix_io.c
index 79bc9219f9515b..a9b1fac62a0250 100644
--- a/lib/ext2fs/unix_io.c
+++ b/lib/ext2fs/unix_io.c
@@ -67,6 +67,7 @@
 #ifdef HAVE_SYS_FILE_H
 #include <sys/file.h>
 #endif
+#include <limits.h>
 
 #if defined(__linux__) && defined(_IO) && !defined(BLKROGET)
 #define BLKROGET   _IO(0x12, 94) /* Get read-only status (0 = read_write).  */
@@ -1148,13 +1149,40 @@ static errcode_t unix_open_channel(const char *name, int fd,
 	return retval;
 }
 
+#define DEV_FD_PATH	"/dev/fd/"
+#define DEV_FD_PATHLEN	(sizeof(DEV_FD_PATH) - 1)
+
+static int possible_unixfd_pathname(const char *path)
+{
+	return strncmp(DEV_FD_PATH, path, DEV_FD_PATHLEN) == 0;
+}
+
 static errcode_t unixfd_open(const char *str_fd, int flags,
 			     io_channel *channel)
 {
 	int fd;
 	int fd_flags;
 
-	fd = atoi(str_fd);
+	/*
+	 * The caller should provide a path in the form "/dev/fd/XX",
+	 * but the shorthand form "XX" is allowed for legacy reasons.
+	 */
+	if (possible_unixfd_pathname(str_fd)) {
+		char *endptr;
+		long maybe_fd;
+
+		errno = 0;
+		maybe_fd = strtol(str_fd + DEV_FD_PATHLEN, &endptr, 10);
+		if (errno)
+			return errno;
+		if (*endptr != 0)
+			return EINVAL;
+		if (maybe_fd < 0 || maybe_fd > INT_MAX)
+			return EINVAL;
+		fd = maybe_fd;
+	} else {
+		fd = atoi(str_fd);
+	}
 #if defined(HAVE_FCNTL)
 	fd_flags = fcntl(fd, F_GETFL);
 	if (fd_flags == -1)


^ permalink raw reply related

* [PATCH 02/10] libext2fs: fix checking for valid fds in mmp.c
From: Darrick J. Wong @ 2026-06-25 19:37 UTC (permalink / raw)
  To: tytso; +Cc: linux-ext4, linux-ext4
In-Reply-To: <178241606773.1810414.1910863348345107939.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

File descriptors are non-negative numbers, which means that 0 is a valid
fd.  Fix the code to be consistent with Unix behaviors.

Cc: <linux-ext4@vger.kernel.org> # v1.42
Fixes: 0f5eba7501f467 ("ext2fs: add multi-mount protection (INCOMPAT_MMP)")
Fixes: 76a6c8788c79e4 ("mmp: do not use O_DIRECT when working with regular file")
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 lib/ext2fs/mmp.c    |    6 +++---
 lib/ext2fs/openfs.c |    1 +
 2 files changed, 4 insertions(+), 3 deletions(-)


diff --git a/lib/ext2fs/mmp.c b/lib/ext2fs/mmp.c
index e2823732e2b6a2..cb15a18fce5547 100644
--- a/lib/ext2fs/mmp.c
+++ b/lib/ext2fs/mmp.c
@@ -59,11 +59,11 @@ errcode_t ext2fs_mmp_read(ext2_filsys fs, blk64_t mmp_blk, void *buf)
 		return EXT2_ET_MMP_BAD_BLOCK;
 
 	/* ext2fs_open() reserves fd0,1,2 to avoid stdio collision, so checking
-	 * mmp_fd <= 0 is OK to validate that the fd is valid.  This opens its
+	 * mmp_fd < 0 is OK to validate that the fd is valid.  This opens its
 	 * own fd to read the MMP block to ensure that it is using O_DIRECT,
 	 * regardless of how the io_manager is doing reads, to avoid caching of
 	 * the MMP block by the io_manager or the VM.  It needs to be fresh. */
-	if (fs->mmp_fd <= 0) {
+	if (fs->mmp_fd < 0) {
 		struct stat st;
 		int flags = O_RDONLY | O_DIRECT;
 
@@ -427,7 +427,7 @@ errcode_t ext2fs_mmp_stop(ext2_filsys fs)
 	retval = ext2fs_mmp_write(fs, fs->super->s_mmp_block, fs->mmp_cmp);
 
 mmp_error:
-	if (fs->mmp_fd > 0) {
+	if (fs->mmp_fd >= 0) {
 		close(fs->mmp_fd);
 		fs->mmp_fd = -1;
 	}
diff --git a/lib/ext2fs/openfs.c b/lib/ext2fs/openfs.c
index 2b8e0e753c46e8..41359d15740881 100644
--- a/lib/ext2fs/openfs.c
+++ b/lib/ext2fs/openfs.c
@@ -148,6 +148,7 @@ errcode_t ext2fs_open2(const char *name, const char *io_options,
 	/* don't overwrite sb backups unless flag is explicitly cleared */
 	fs->flags |= EXT2_FLAG_MASTER_SB_ONLY;
 	fs->umask = 022;
+	fs->mmp_fd = -1;
 
 	time_env = ext2fs_safe_getenv("SOURCE_DATE_EPOCH");
 	if (time_env) {


^ permalink raw reply related

* [PATCH 01/10] libext2fs: make it possible to extract the fd from an IO manager
From: Darrick J. Wong @ 2026-06-25 19:37 UTC (permalink / raw)
  To: tytso; +Cc: linux-ext4
In-Reply-To: <178241606773.1810414.1910863348345107939.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

Make it so that we can extract the fd from an open IO manager.  This
will be used in subsequent patches to register the open block device
with the fuse iomap kernel driver.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 lib/ext2fs/ext2_io.h         |    4 +++-
 debian/libext2fs2t64.symbols |    1 +
 lib/ext2fs/io_manager.c      |    8 ++++++++
 lib/ext2fs/unix_io.c         |   20 ++++++++++++++++++++
 4 files changed, 32 insertions(+), 1 deletion(-)


diff --git a/lib/ext2fs/ext2_io.h b/lib/ext2fs/ext2_io.h
index 61865d54d82490..c880ea2524f248 100644
--- a/lib/ext2fs/ext2_io.h
+++ b/lib/ext2fs/ext2_io.h
@@ -103,7 +103,8 @@ struct struct_io_manager {
 	errcode_t (*zeroout)(io_channel channel, unsigned long long block,
 			     unsigned long long count);
 	errcode_t (*flock)(io_channel channel, unsigned int flock_flags);
-	long	reserved[13];
+	errcode_t (*get_fd)(io_channel channel, int *fd);
+	long	reserved[12];
 };
 
 #define IO_FLAG_RW		0x0001
@@ -155,6 +156,7 @@ extern errcode_t io_channel_cache_readahead(io_channel io,
 					    unsigned long long count);
 extern errcode_t io_channel_flock(io_channel io, unsigned int flock_flags);
 extern errcode_t io_channel_funlock(io_channel io);
+extern errcode_t io_channel_get_fd(io_channel io, int *fd);
 
 #ifdef _WIN32
 /* windows_io.c */
diff --git a/debian/libext2fs2t64.symbols b/debian/libext2fs2t64.symbols
index affe4c27d4e791..555fbbb0c98878 100644
--- a/debian/libext2fs2t64.symbols
+++ b/debian/libext2fs2t64.symbols
@@ -701,6 +701,7 @@ libext2fs.so.2 libext2fs2t64 #MINVER#
  io_channel_discard@Base 1.42
  io_channel_flock@Base 1.47.99
  io_channel_funlock@Base 1.47.99
+ io_channel_get_fd@Base 1.47.99
  io_channel_read_blk64@Base 1.41.1
  io_channel_set_options@Base 1.37
  io_channel_write_blk64@Base 1.41.1
diff --git a/lib/ext2fs/io_manager.c b/lib/ext2fs/io_manager.c
index 791ec7d14adbba..dff3d73552827f 100644
--- a/lib/ext2fs/io_manager.c
+++ b/lib/ext2fs/io_manager.c
@@ -166,3 +166,11 @@ errcode_t io_channel_funlock(io_channel io)
 
 	return io->manager->flock(io, 0);
 }
+
+errcode_t io_channel_get_fd(io_channel io, int *fd)
+{
+	if (!io->manager->get_fd)
+		return EXT2_ET_OP_NOT_SUPPORTED;
+
+	return io->manager->get_fd(io, fd);
+}
diff --git a/lib/ext2fs/unix_io.c b/lib/ext2fs/unix_io.c
index f4307db0fb2b05..79bc9219f9515b 100644
--- a/lib/ext2fs/unix_io.c
+++ b/lib/ext2fs/unix_io.c
@@ -1786,6 +1786,24 @@ static errcode_t unix_zeroout(io_channel channel, unsigned long long block,
 unimplemented:
 	return EXT2_ET_UNIMPLEMENTED;
 }
+
+static errcode_t unix_get_fd(io_channel channel, int *fd)
+{
+	struct unix_private_data *data;
+
+	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+	data = (struct unix_private_data *) channel->private_data;
+	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
+
+	if (data->offset) {
+		*fd = -1;
+		return EINVAL;
+	}
+
+	*fd = data->dev;
+	return 0;
+}
+
 #if __GNUC_PREREQ (4, 6)
 #pragma GCC diagnostic pop
 #endif
@@ -1808,6 +1826,7 @@ static struct struct_io_manager struct_unix_manager = {
 	.cache_readahead	= unix_cache_readahead,
 	.zeroout	= unix_zeroout,
 	.flock		= unix_flock,
+	.get_fd		= unix_get_fd,
 };
 
 io_manager unix_io_manager = &struct_unix_manager;
@@ -1830,6 +1849,7 @@ static struct struct_io_manager struct_unixfd_manager = {
 	.cache_readahead	= unix_cache_readahead,
 	.zeroout	= unix_zeroout,
 	.flock		= unix_flock,
+	.get_fd		= unix_get_fd,
 };
 
 io_manager unixfd_io_manager = &struct_unixfd_manager;


^ permalink raw reply related

* [PATCH 3/3] libext2fs: only fsync the unix fd if we wrote to the device
From: Darrick J. Wong @ 2026-06-25 19:36 UTC (permalink / raw)
  To: tytso; +Cc: linux-ext4
In-Reply-To: <178241606573.1810243.8333361812351601871.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

As an optimization, only fsync the block device fd if we tried to write
to the io channel.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 lib/ext2fs/unix_io.c |   86 +++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 67 insertions(+), 19 deletions(-)


diff --git a/lib/ext2fs/unix_io.c b/lib/ext2fs/unix_io.c
index 15d6d55ff7fdd4..f4307db0fb2b05 100644
--- a/lib/ext2fs/unix_io.c
+++ b/lib/ext2fs/unix_io.c
@@ -132,10 +132,13 @@ struct unix_cache {
 #define WRITE_DIRECT_SIZE 4	/* Must be smaller than CACHE_SIZE */
 #define READ_DIRECT_SIZE 4	/* Should be smaller than CACHE_SIZE */
 
+#define UNIX_STATE_DIRTY	(1U << 0) /* device needs fsyncing */
+
 struct unix_private_data {
 	int	magic;
 	int	dev;
 	int	flags;
+	unsigned int	state; /* UNIX_STATE_* */
 	int	align;
 	int	access_time;
 	int	unix_flock_flags;
@@ -1198,10 +1201,65 @@ static errcode_t unix_open(const char *name, int flags,
 	return unix_open_channel(name, fd, flags, channel, unix_io_manager);
 }
 
+#ifdef HAVE_FSYNC
+static void mark_dirty(io_channel channel)
+{
+	struct unix_private_data *data =
+		(struct unix_private_data *) channel->private_data;
+
+	mutex_lock(data, CACHE_MTX);
+	data->state |= UNIX_STATE_DIRTY;
+	mutex_unlock(data, CACHE_MTX);
+}
+
+static errcode_t maybe_fsync(io_channel channel, int force_fsync)
+{
+	struct unix_private_data *data =
+		(struct unix_private_data *) channel->private_data;
+	int need_fsync;
+	errcode_t retval = 0;
+
+#ifndef NO_IO_CACHE
+	retval = flush_cached_blocks(channel, data, 0);
+#endif
+
+	mutex_lock(data, CACHE_MTX);
+	need_fsync = force_fsync || (data->state & UNIX_STATE_DIRTY);
+	data->state &= ~UNIX_STATE_DIRTY;
+	mutex_unlock(data, CACHE_MTX);
+
+	if (need_fsync && fsync(data->dev) != 0) {
+		if (!retval)
+			retval = errno;
+	}
+	if (retval) {
+		/* redirty because writeback failed */
+		mark_dirty(channel);
+		return retval;
+	}
+
+	return 0;
+}
+#else
+# define mark_dirty(...)		((void)0)
+
+static errcode_t maybe_fsync(io_channel channel, int force_fsync)
+{
+	struct unix_private_data *data =
+		(struct unix_private_data *) channel->private_data;
+	errcode_t retval = 0;
+
+#ifndef NO_IO_CACHE
+	retval = flush_cached_blocks(channel, data, 0);
+#endif
+	return retval;
+}
+#endif
+
 static errcode_t unix_close(io_channel channel)
 {
 	struct unix_private_data *data;
-	errcode_t	retval = 0;
+	errcode_t	retval;
 
 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
 	data = (struct unix_private_data *) channel->private_data;
@@ -1210,14 +1268,7 @@ static errcode_t unix_close(io_channel channel)
 	if (--channel->refcount > 0)
 		return 0;
 
-#ifndef NO_IO_CACHE
-	retval = flush_cached_blocks(channel, data, 0);
-#endif
-#ifdef HAVE_FSYNC
-	/* always fsync the device, even if flushing our own cache failed */
-	if (fsync(data->dev) != 0 && !retval)
-		retval = errno;
-#endif
+	retval = maybe_fsync(channel, 1);
 
 	unix_funlock(channel);
 
@@ -1388,6 +1439,8 @@ static errcode_t unix_write_blk64(io_channel channel, unsigned long long block,
 	data = (struct unix_private_data *) channel->private_data;
 	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
 
+	mark_dirty(channel);
+
 #ifdef NO_IO_CACHE
 	return raw_write_blk(channel, data, block, count, buf, 0);
 #else
@@ -1512,6 +1565,8 @@ static errcode_t unix_write_byte(io_channel channel, unsigned long offset,
 	if (lseek(data->dev, offset + data->offset, SEEK_SET) < 0)
 		return errno;
 
+	mark_dirty(channel);
+
 	actual = write(data->dev, buf, size);
 	if (actual < 0)
 		return errno;
@@ -1527,21 +1582,12 @@ static errcode_t unix_write_byte(io_channel channel, unsigned long offset,
 static errcode_t unix_flush(io_channel channel)
 {
 	struct unix_private_data *data;
-	errcode_t retval = 0;
 
 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
 	data = (struct unix_private_data *) channel->private_data;
 	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
 
-#ifndef NO_IO_CACHE
-	retval = flush_cached_blocks(channel, data, 0);
-#endif
-#ifdef HAVE_FSYNC
-	/* always fsync the device, even if flushing our own cache failed */
-	if (fsync(data->dev) != 0 && !retval)
-		return errno;
-#endif
-	return retval;
+	return maybe_fsync(channel, 0);
 }
 
 static errcode_t unix_set_option(io_channel channel, const char *option,
@@ -1653,6 +1699,7 @@ static errcode_t unix_discard(io_channel channel, unsigned long long block,
 		}
 		return errno;
 	}
+	mark_dirty(channel);
 	return 0;
 unimplemented:
 	return EXT2_ET_UNIMPLEMENTED;
@@ -1734,6 +1781,7 @@ static errcode_t unix_zeroout(io_channel channel, unsigned long long block,
 		}
 		return errno;
 	}
+	mark_dirty(channel);
 	return 0;
 unimplemented:
 	return EXT2_ET_UNIMPLEMENTED;


^ permalink raw reply related

* [PATCH 2/3] libext2fs: always fsync the device when closing the unix IO manager
From: Darrick J. Wong @ 2026-06-25 19:36 UTC (permalink / raw)
  To: tytso; +Cc: linux-ext4
In-Reply-To: <178241606573.1810243.8333361812351601871.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

unix_close is the last chance that libext2fs has to report write
failures to users.  Although it's likely that ext2fs_close already
called ext2fs_flush and told the IO manager to flush, we could do one
more sync before we close the file descriptor.  Also don't override the
fsync's errno with the close's errno.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 lib/ext2fs/unix_io.c |    8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)


diff --git a/lib/ext2fs/unix_io.c b/lib/ext2fs/unix_io.c
index b6feebef93fa5b..15d6d55ff7fdd4 100644
--- a/lib/ext2fs/unix_io.c
+++ b/lib/ext2fs/unix_io.c
@@ -1213,10 +1213,16 @@ static errcode_t unix_close(io_channel channel)
 #ifndef NO_IO_CACHE
 	retval = flush_cached_blocks(channel, data, 0);
 #endif
+#ifdef HAVE_FSYNC
+	/* always fsync the device, even if flushing our own cache failed */
+	if (fsync(data->dev) != 0 && !retval)
+		retval = errno;
+#endif
 
 	unix_funlock(channel);
 
-	if (channel->manager != unixfd_io_manager && close(data->dev) < 0)
+	if (channel->manager != unixfd_io_manager && close(data->dev) < 0 &&
+	    !retval)
 		retval = errno;
 	free_cache(data);
 	free(data->cache);


^ permalink raw reply related

* [PATCH 1/3] libext2fs: always fsync the device when flushing the cache
From: Darrick J. Wong @ 2026-06-25 19:36 UTC (permalink / raw)
  To: tytso; +Cc: linux-ext4
In-Reply-To: <178241606573.1810243.8333361812351601871.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

When we're flushing the unix IO manager's buffer cache, we should always
fsync the block device, because something could have written to the
block device -- either the buffer cache itself, or a direct write.
Regardless, the callers all want all dirtied regions to be persisted to
stable media.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 lib/ext2fs/unix_io.c |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)


diff --git a/lib/ext2fs/unix_io.c b/lib/ext2fs/unix_io.c
index abd33ba839f7e9..b6feebef93fa5b 100644
--- a/lib/ext2fs/unix_io.c
+++ b/lib/ext2fs/unix_io.c
@@ -1531,7 +1531,8 @@ static errcode_t unix_flush(io_channel channel)
 	retval = flush_cached_blocks(channel, data, 0);
 #endif
 #ifdef HAVE_FSYNC
-	if (!retval && fsync(data->dev) != 0)
+	/* always fsync the device, even if flushing our own cache failed */
+	if (fsync(data->dev) != 0 && !retval)
 		return errno;
 #endif
 	return retval;


^ permalink raw reply related

* [PATCHSET v6 4/4] fuse4fs: reclaim buffer cache under memory pressure
From: Darrick J. Wong @ 2026-06-25 19:35 UTC (permalink / raw)
  To: tytso; +Cc: linux-ext4
In-Reply-To: <20260625193302.GC6054@frogsfrogsfrogs>

Hi all,

Having a static buffer cache limit of 32MB is very conservative.  When
there's plenty of free memory, evicting metadata from the cache isn't
actually a good idea, so we'd like to let it grow to handle large
working sets.  However, we also don't want to OOM the kernel or (in the
future) the fuse4fs container cgroup, so we need to listen for memory
reclamation events in the kernel.

The solution to this is to open the kernel memory pressure stall
indicator files, configure an event when too much time is spent waiting
for reclamation, and to trim the buffer cache when the events happen.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

Comments and questions are, as always, welcome.

e2fsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/e2fsprogs.git/log/?h=fuse4fs-memory-reclaim
---
Commits in this patchset:
 * libsupport: add pressure stall monitor
 * fuse2fs: only reclaim buffer cache when there is memory pressure
 * fuse4fs: enable memory pressure monitoring with service containers
 * fuse2fs: flush dirty metadata periodically
---
 lib/support/list.h      |    6 +
 lib/support/psi.h       |   66 ++++++
 fuse4fs/Makefile.in     |    3 
 fuse4fs/fuse4fs.c       |  258 +++++++++++++++++++++-
 lib/support/Makefile.in |    4 
 lib/support/iocache.c   |   19 ++
 lib/support/psi.c       |  557 +++++++++++++++++++++++++++++++++++++++++++++++
 misc/Makefile.in        |    3 
 misc/fuse2fs.c          |  189 +++++++++++++++-
 9 files changed, 1091 insertions(+), 14 deletions(-)
 create mode 100644 lib/support/psi.h
 create mode 100644 lib/support/psi.c


^ permalink raw reply

* [PATCHSET v6 3/4] fuse2fs: improve block and inode caching
From: Darrick J. Wong @ 2026-06-25 19:35 UTC (permalink / raw)
  To: tytso; +Cc: linux-ext4
In-Reply-To: <20260625193302.GC6054@frogsfrogsfrogs>

Hi all,

This series ports the libext2fs inode cache to the new cache.c hashtable
code that was added for fuse4fs unlinked file support and improves on
the UNIX I/O manager's block cache by adding a new I/O manager that does
its own caching.  Now we no longer have statically sized buffer caching
for the two fuse servers.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

Comments and questions are, as always, welcome.

e2fsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/e2fsprogs.git/log/?h=fuse2fs-caching
---
Commits in this patchset:
 * libsupport: add caching IO manager
 * iocache: add the actual buffer cache
 * iocache: bump buffer mru priority every 50 accesses
 * fuse2fs: enable caching IO manager
 * fuse2fs: increase inode cache size
 * libext2fs: improve caching for inodes
---
 lib/ext2fs/ext2fsP.h     |   13 +
 lib/support/cache.h      |    1 
 lib/support/iocache.h    |   17 +
 debugfs/Makefile.in      |    8 
 e2fsck/Makefile.in       |   12 -
 fuse4fs/Makefile.in      |   10 -
 fuse4fs/fuse4fs.c        |   15 +
 lib/ext2fs/Makefile.in   |   69 ++--
 lib/ext2fs/inline_data.c |    4 
 lib/ext2fs/inode.c       |  215 ++++++++++---
 lib/ext2fs/io_manager.c  |    3 
 lib/support/Makefile.in  |    6 
 lib/support/cache.c      |   16 +
 lib/support/iocache.c    |  751 ++++++++++++++++++++++++++++++++++++++++++++++
 misc/Makefile.in         |   11 -
 misc/fuse2fs.c           |   11 +
 resize/Makefile.in       |   11 -
 tests/fuzz/Makefile.in   |    4 
 tests/progs/Makefile.in  |    4 
 19 files changed, 1057 insertions(+), 124 deletions(-)
 create mode 100644 lib/support/iocache.h
 create mode 100644 lib/support/iocache.c


^ permalink raw reply

* [PATCHSET v6 2/4] fuse4fs: run servers as a contained service
From: Darrick J. Wong @ 2026-06-25 19:35 UTC (permalink / raw)
  To: tytso; +Cc: linux-ext4, linux-ext4
In-Reply-To: <20260625193302.GC6054@frogsfrogsfrogs>

Hi all,

This series packages the newly created fuse4fs server into a systemd
socket service.  This service can be used by the "mount.service" helper
in libfuse to implement untrusted unprivileged mounts.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

Comments and questions are, as always, welcome.

e2fsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/e2fsprogs.git/log/?h=fuse4fs-service-container
---
Commits in this patchset:
 * libext2fs: make it possible to extract the fd from an IO manager
 * libext2fs: fix checking for valid fds in mmp.c
 * unix_io: allow passing /dev/fd/XXX paths to the unixfd IO manager
 * libext2fs: fix MMP code to work with unixfd IO manager
 * libext2fs: bump libfuse API version to 3.19
 * fuse4fs: hoist some code out of fuse4fs_main
 * fuse4fs: enable safe service mode
 * fuse4fs: set proc title when in fuse service mode
 * fuse4fs: make MMP work correctly in safe service mode
 * debian: update packaging for fuse4fs service
---
 lib/ext2fs/ext2_io.h         |    4 
 lib/ext2fs/ext2fs.h          |    1 
 lib/ext2fs/ext2fsP.h         |    4 
 MCONFIG.in                   |    2 
 configure                    |  303 ++++++++++++++++++++++++++-
 configure.ac                 |  131 +++++++++++
 debian/e2fsprogs.install     |    7 +
 debian/fuse4fs.install       |    3 
 debian/libext2fs2t64.symbols |    1 
 debian/rules                 |    3 
 fuse4fs/Makefile.in          |   42 +++-
 fuse4fs/fuse4fs.c            |  479 ++++++++++++++++++++++++++++++++++++------
 fuse4fs/fuse4fs.socket.in    |   17 +
 fuse4fs/fuse4fs@.service.in  |  102 +++++++++
 lib/config.h.in              |   12 +
 lib/ext2fs/io_manager.c      |    8 +
 lib/ext2fs/mmp.c             |  101 +++++++++
 lib/ext2fs/openfs.c          |    1 
 lib/ext2fs/unix_io.c         |   50 ++++
 util/subst.conf.in           |    3 
 20 files changed, 1177 insertions(+), 97 deletions(-)
 mode change 100644 => 100755 debian/fuse4fs.install
 create mode 100644 fuse4fs/fuse4fs.socket.in
 create mode 100644 fuse4fs/fuse4fs@.service.in


^ permalink raw reply

* [PATCHSET 1/4] libext2fs: fix some missed fsync calls
From: Darrick J. Wong @ 2026-06-25 19:35 UTC (permalink / raw)
  To: tytso; +Cc: linux-ext4
In-Reply-To: <20260625193302.GC6054@frogsfrogsfrogs>

Hi all,

Fix a few places (like device closing) where we really ought to tell the
block device to flush whatever's dirty to disk, even if we've failed to
flush all our cached buffers out to disk.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

Comments and questions are, as always, welcome.

e2fsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/e2fsprogs.git/log/?h=libext2fs-flushing-fixes
---
Commits in this patchset:
 * libext2fs: always fsync the device when flushing the cache
 * libext2fs: always fsync the device when closing the unix IO manager
 * libext2fs: only fsync the unix fd if we wrote to the device
---
 lib/ext2fs/unix_io.c |   83 ++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 69 insertions(+), 14 deletions(-)


^ permalink raw reply

* [PATCHBOMB v6] e2fsprogs: containerize ext4 for safer operation
From: Darrick J. Wong @ 2026-06-25 19:33 UTC (permalink / raw)
  To: linux-ext4; +Cc: Theodore Ts'o, Neal Gompa

Hi everyone,

This is the sole remaining part of the gigantic patchset to enable
mounting ext4 filesystems as a systemd-contained fuse server instead of
in the kernel.  The libfuse parts have now been merged upstream, which
means that fuse4fs can now run as a non-root user, with no privilege,
and no access to the network or hardware, etc.  The only connection to
the outside is an ephemeral AF_UNIX socket.  The helper program on
the other end is a mount helper that acquires resources and calls
fsmount().

Why would you want to do that?  Most filesystem drivers are seriously
vulnerable to metadata parsing attacks, as syzbot has shown repeatedly
over almost a decade of its existence.  Faulty code can lead to total
kernel compromise, and I think there's a very strong incentive to move
all that parsing out to userspace where we can containerize the fuse
server process.  Runtime filesystem metadata parsing is no longer a
privileged (== risky) operation.

The consequence of a crashed driver is a dead mount, instead of a
crashed or corrupt OS kernel.

Note that contained fuse filesystem servers are no faster than regular
fuse.  The containerization code only requires changes to libfuse and is
ready to go today.

e2fsprogs:
https://git.kernel.org/pub/scm/linux/kernel/git/djwong/e2fsprogs.git/log/?h=fuse4fs-service-container_2026-06-25

Note that I threw in a couple more patchsets to improve the caching
behavior of libext2fs for better performance; and the ability to watch
for memory pressure complaints from the kernel so that we can drop our
own cache in response to memory pressure.

e2fsprogs:
https://git.kernel.org/pub/scm/linux/kernel/git/djwong/e2fsprogs.git/log/?h=fuse4fs-memory-reclaim_2026-06-25

--Darrick

Unreviewed patches in this patchbomb:

[PATCHSET 1/4] libext2fs: fix some missed fsync calls
  [PATCH 1/3] libext2fs: always fsync the device when flushing the
  [PATCH 2/3] libext2fs: always fsync the device when closing the unix
  [PATCH 3/3] libext2fs: only fsync the unix fd if we wrote to the
[PATCHSET v6 2/4] fuse4fs: run servers as a contained service
  [PATCH 01/10] libext2fs: make it possible to extract the fd from an
  [PATCH 02/10] libext2fs: fix checking for valid fds in mmp.c
  [PATCH 03/10] unix_io: allow passing /dev/fd/XXX paths to the unixfd
  [PATCH 04/10] libext2fs: fix MMP code to work with unixfd IO manager
  [PATCH 05/10] libext2fs: bump libfuse API version to 3.19
  [PATCH 06/10] fuse4fs: hoist some code out of fuse4fs_main
  [PATCH 07/10] fuse4fs: enable safe service mode
  [PATCH 08/10] fuse4fs: set proc title when in fuse service mode
  [PATCH 09/10] fuse4fs: make MMP work correctly in safe service mode
  [PATCH 10/10] debian: update packaging for fuse4fs service
[PATCHSET v6 3/4] fuse2fs: improve block and inode caching
  [PATCH 1/6] libsupport: add caching IO manager
  [PATCH 2/6] iocache: add the actual buffer cache
  [PATCH 3/6] iocache: bump buffer mru priority every 50 accesses
  [PATCH 4/6] fuse2fs: enable caching IO manager
  [PATCH 5/6] fuse2fs: increase inode cache size
  [PATCH 6/6] libext2fs: improve caching for inodes
[PATCHSET v6 4/4] fuse4fs: reclaim buffer cache under memory pressure
  [PATCH 1/4] libsupport: add pressure stall monitor
  [PATCH 2/4] fuse2fs: only reclaim buffer cache when there is memory
  [PATCH 3/4] fuse4fs: enable memory pressure monitoring with service
  [PATCH 4/4] fuse2fs: flush dirty metadata periodically

^ permalink raw reply

* [PATCH RFC] ext4: enable scoped NOFS when starting a handle in nojournal mode
From: Theodore Ts'o @ 2026-06-25 17:22 UTC (permalink / raw)
  To: Ext4 Developers List, Matthew Wilcox; +Cc: Theodore Ts'o

The jbd2 layer enables NOFS mode using memalloc_nofs_{save,restore}()
while a handle is active.  We need to do the same in nojournal mode so
that it is safe to remove GFP_NOFS flags while a jbd2 handle is
active.

This will require that we actually allocate a real handle, but with an
h_invalid flag set, so there is a place to put the saved memalloc
context.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4_jbd2.c  | 35 ++++++++++++++++++++++-------------
 fs/ext4/ext4_jbd2.h  |  6 +-----
 include/linux/jbd2.h |  1 +
 3 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 9a8c225f2753..c2f09cf4b506 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -35,12 +35,21 @@ static handle_t *ext4_get_nojournal(void)
 	handle_t *handle = current->journal_info;
 	unsigned long ref_cnt = (unsigned long)handle;
 
-	BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT);
-
-	ref_cnt++;
-	handle = (handle_t *)ref_cnt;
-
-	current->journal_info = handle;
+	BUG_ON(handle && !handle->h_invalid);
+
+	if (!handle) {
+		handle = jbd2_alloc_handle(GFP_NOFS);
+		if (!handle)
+			return ERR_PTR(-ENOMEM);
+		handle->h_invalid = 1;
+		/*
+		 * This is done by start_this_handle() if journalling
+		 * is enabled.
+		 */
+		handle->saved_alloc_context = memalloc_nofs_save();
+		current->journal_info = handle;
+	}
+	handle->h_ref++;
 	return handle;
 }
 
@@ -48,14 +57,14 @@ static handle_t *ext4_get_nojournal(void)
 /* Decrement the non-pointer handle value */
 static void ext4_put_nojournal(handle_t *handle)
 {
-	unsigned long ref_cnt = (unsigned long)handle;
+	BUG_ON(handle->h_ref == 0);
 
-	BUG_ON(ref_cnt == 0);
-
-	ref_cnt--;
-	handle = (handle_t *)ref_cnt;
-
-	current->journal_info = handle;
+	handle->h_ref--;
+	if (handle->h_ref == 0) {
+		memalloc_nofs_restore(handle->saved_alloc_context);
+		jbd2_free_handle(handle);
+		current->journal_info = NULL;
+	}
 }
 
 /*
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 63d17c5201b5..75d4670d389c 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -182,15 +182,11 @@ handle_t *__ext4_journal_start_sb(struct inode *inode, struct super_block *sb,
 				  int rsv_blocks, int revoke_creds);
 int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
 
-#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
-
 /* Note:  Do not use this for NULL handles.  This is only to determine if
  * a properly allocated handle is using a journal or not. */
 static inline int ext4_handle_valid(handle_t *handle)
 {
-	if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT)
-		return 0;
-	return 1;
+	return !handle->h_invalid;
 }
 
 static inline void ext4_handle_sync(handle_t *handle)
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index b68561187e90..7348fdadc810 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -513,6 +513,7 @@ struct jbd2_journal_handle
 	unsigned int	h_sync:		1;
 	unsigned int	h_reserved:	1;
 	unsigned int	h_aborted:	1;
+	unsigned int	h_invalid:	1;
 	unsigned int	h_type:		8;
 	unsigned int	h_line_no:	16;
 
-- 
2.53.0


^ permalink raw reply related

* [PATCH v2] ext4: clear stale xarray tags on folios skipped during writeback
From: Gerald Yang @ 2026-06-25 16:01 UTC (permalink / raw)
  To: tytso, jack; +Cc: linux-ext4, gerald.yang.tw

In data=journal mode, the writeback thread can hit the
WARN_ON_ONCE(sb_rdonly(sb)) in ext4_journal_check_start() while the
superblock is being remounted read-only during reboot:

Workqueue: writeback wb_workfn (flush-253:0)
RIP: 0010:ext4_journal_check_start+0x8b/0xd0
Call Trace:
  __ext4_journal_start_sb+0x3c/0x1e0
  mpage_prepare_extent_to_map+0x4af/0x580
  ext4_do_writepages+0x3c0/0x1080
  ext4_writepages+0xc8/0x1a0
  do_writepages+0xc4/0x180
  __writeback_single_inode+0x45/0x2f0
  writeback_sb_inodes+0x26b/0x5d0
  __writeback_inodes_wb+0x54/0x100
  wb_writeback+0x1ac/0x320
  wb_workfn+0x394/0x470

And followed by the warning:
EXT4-fs warning (device vda1): ext4_evict_inode:195: inode #6263:
comm (sd-umount): data will be lost

This issue does not reproduce every time, but it does so frequently.
To reproduce it, create a VM with 8 CPUs and 16G of memory, and
set up data=journal:
sudo tune2fs -o journal_data /dev/vda1
Run fio:
rm -f fiotest
fio --name=fiotest --rw=randwrite --bs=4k --runtime=6 --ioengine=libaio
--iodepth=256 --numjobs=8 --filename=fiotest --filesize=30G
--group_reporting
Reboot the VM, and check the console output from:
virsh console testvm

But there is no dirty inode: folio_clear_dirty_for_io clears PG_dirty
but leaves the PAGECACHE_TAG_DIRTY and PAGECACHE_TAG_TOWRITE tags set,
and those are only cleared by __folio_start_writeback.
In data=journal mode, jbd2 checkpoints the journalled data to its final
location and clears its own dirty flag without touching folio PG_dirty
or xarray dirty flags.
Commit f4a2b42e7891 ("ext4: fix stale xarray tags after writeback")
fixes the case where PG_dirty is still set but there is no dirty page.
Another case is when PG_dirty is cleared, but PAGECACHE_TAG_DIRTY and
PAGECACHE_TAG_TOWRITE are still set. In this case, the writeback thread
sees a clean folio and skips it in mpage_prepare_extent_to_map:
if (!folio_test_dirty(folio) ||
    ...
        folio_unlock(folio);
	continue

And never reaches ext4_bio_write_folio where the commit f4a2b42e7891
clears the stale xarray tags. Print debug logs after the filesystem
is remounted read-only:
writepages RDONLY nrpages=2048 dirtytag=1 wbtag=0 towrite=1 sync=0
And all folios are actually clean:
folio idx=3 dirty=0 wb=0 checked=0 dirtybuf=0 jbddirty=0 mapped=1
...

We need to clear the xarray stale tags for such clean folios by
cycling them through writeback in the skip path, the same way
f4a2b42e7891 does in ext4_bio_write_folio.

Fixes: dff4ac75eeee ("ext4: move keep_towrite handling to ext4_bio_write_page()")
Signed-off-by: Gerald Yang <gerald.yang@canonical.com>
---
Changes in v2:
Split the top level condition based on Jan's suggestion

 fs/ext4/inode.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index ce99807c5f5b..150f8789f0aa 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2694,13 +2694,25 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
 			 * page is already under writeback and we are not doing
 			 * a data integrity writeback, skip the page
 			 */
-			if (!folio_test_dirty(folio) ||
-			    (folio_test_writeback(folio) &&
-			     (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
+			if ((folio_test_writeback(folio) &&
+			    mpd->wbc->sync_mode == WB_SYNC_NONE) ||
 			    unlikely(folio->mapping != mapping)) {
 				folio_unlock(folio);
 				continue;
 			}
+			/*
+			 * If the folio is clean, skip writing it back.
+			 * Cycle the folio through the writeback state
+			 * though, to clear stale xarray tags.
+			 */
+			if (!folio_test_dirty(folio)) {
+				if (!folio_test_writeback(folio)) {
+					__folio_start_writeback(folio, false);
+					folio_end_writeback(folio);
+				}
+				folio_unlock(folio);
+				continue;
+			}
 
 			folio_wait_writeback(folio);
 			BUG_ON(folio_test_writeback(folio));
-- 
2.43.0


^ permalink raw reply related

* [PATCH v10 4/5] ext4: remove ea_inode_array mechanism in favor of ext4_put_ea_inode()
From: Yun Zhou @ 2026-06-25 15:29 UTC (permalink / raw)
  To: tytso, adilger.kernel, libaokun, jack, ojaswin, ritesh.list,
	yi.zhang, viro, brauner
  Cc: linux-ext4, linux-kernel, yun.zhou, linux-fsdevel
In-Reply-To: <20260625152941.24788-1-yun.zhou@windriver.com>

Now that ext4_put_ea_inode() handles deferred iput safely for all cases
(using iput_if_not_last + embedded llist_node), the ea_inode_array
mechanism for batching deferred iputs is redundant.

Remove:
- ext4_expand_inode_array() and ext4_xattr_inode_array_free()
- ext4_xattr_inode_array_free_deferred()
- struct ext4_xattr_inode_array and EIA_INCR/EIA_MASK defines
- ea_inode_array parameter from ext4_xattr_inode_dec_ref_all(),
  ext4_xattr_release_block(), and ext4_xattr_delete_inode()
- ea_inode_array variable from ext4_evict_inode()

Instead, ext4_xattr_inode_dec_ref_all() now calls ext4_put_ea_inode()
directly after processing each EA inode.  This simplifies the code
by eliminating multi-layer parameter threading and removes the need
for callers to manage array lifetime.

Signed-off-by: Yun Zhou <yun.zhou@windriver.com>
Suggested-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/inode.c |  6 +---
 fs/ext4/xattr.c | 95 +++----------------------------------------------
 fs/ext4/xattr.h |  7 ----
 3 files changed, 6 insertions(+), 102 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0d131371ad3d..6f1b84e46a2e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -176,7 +176,6 @@ void ext4_evict_inode(struct inode *inode)
 	 * (xattr block freeing), bitmap, group descriptor (inode freeing)
 	 */
 	int extra_credits = 6;
-	struct ext4_xattr_inode_array *ea_inode_array = NULL;
 	bool freeze_protected = false;
 
 	trace_ext4_evict_inode(inode);
@@ -282,8 +281,7 @@ void ext4_evict_inode(struct inode *inode)
 	}
 
 	/* Remove xattr references. */
-	err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array,
-				      extra_credits);
+	err = ext4_xattr_delete_inode(handle, inode, extra_credits);
 	if (err) {
 		ext4_warning(inode->i_sb, "xattr delete (err %d)", err);
 stop_handle:
@@ -291,7 +289,6 @@ void ext4_evict_inode(struct inode *inode)
 		ext4_orphan_del(NULL, inode);
 		if (freeze_protected)
 			sb_end_intwrite(inode->i_sb);
-		ext4_xattr_inode_array_free(ea_inode_array);
 		goto no_delete;
 	}
 
@@ -321,7 +318,6 @@ void ext4_evict_inode(struct inode *inode)
 	ext4_journal_stop(handle);
 	if (freeze_protected)
 		sb_end_intwrite(inode->i_sb);
-	ext4_xattr_inode_array_free(ea_inode_array);
 	return;
 no_delete:
 	/*
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 90b693b78a45..7f334349bd4f 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -114,12 +114,6 @@ const struct xattr_handler * const ext4_xattr_handlers[] = {
 #define EA_INODE_CACHE(inode)	(((struct ext4_sb_info *) \
 				inode->i_sb->s_fs_info)->s_ea_inode_cache)
 
-static int
-ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
-			struct inode *inode);
-static void ext4_xattr_inode_array_free_deferred(struct super_block *sb,
-				struct ext4_xattr_inode_array *array);
-
 #ifdef CONFIG_LOCKDEP
 void ext4_xattr_inode_set_class(struct inode *ea_inode)
 {
@@ -1162,7 +1156,6 @@ static void
 ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
 			     struct buffer_head *bh,
 			     struct ext4_xattr_entry *first, bool block_csum,
-			     struct ext4_xattr_inode_array **ea_inode_array,
 			     int extra_credits, bool skip_quota)
 {
 	struct inode *ea_inode;
@@ -1199,14 +1192,6 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
 		if (err)
 			continue;
 
-		err = ext4_expand_inode_array(ea_inode_array, ea_inode);
-		if (err) {
-			ext4_warning_inode(ea_inode,
-					   "Expand inode array err=%d", err);
-			ext4_put_ea_inode(parent->i_sb, ea_inode);
-			continue;
-		}
-
 		err = ext4_journal_ensure_credits_fn(handle, credits, credits,
 			ext4_free_metadata_revoke_credits(parent->i_sb, 1),
 			ext4_xattr_restart_fn(handle, parent, bh, block_csum,
@@ -1214,6 +1199,7 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
 		if (err < 0) {
 			ext4_warning_inode(ea_inode, "Ensure credits err=%d",
 					   err);
+			ext4_put_ea_inode(parent->i_sb, ea_inode);
 			continue;
 		}
 		if (err > 0) {
@@ -1223,6 +1209,7 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
 				ext4_warning_inode(ea_inode,
 						"Re-get write access err=%d",
 						err);
+				ext4_put_ea_inode(parent->i_sb, ea_inode);
 				continue;
 			}
 		}
@@ -1231,6 +1218,7 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
 		if (err) {
 			ext4_warning_inode(ea_inode, "ea_inode dec ref err=%d",
 					   err);
+			ext4_put_ea_inode(parent->i_sb, ea_inode);
 			continue;
 		}
 
@@ -1247,6 +1235,7 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
 		entry->e_value_inum = 0;
 		entry->e_value_size = 0;
 
+		ext4_put_ea_inode(parent->i_sb, ea_inode);
 		dirty = true;
 	}
 
@@ -1273,7 +1262,6 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
 static void
 ext4_xattr_release_block(handle_t *handle, struct inode *inode,
 			 struct buffer_head *bh,
-			 struct ext4_xattr_inode_array **ea_inode_array,
 			 int extra_credits)
 {
 	struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode);
@@ -1315,7 +1303,6 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
 			ext4_xattr_inode_dec_ref_all(handle, inode, bh,
 						     BFIRST(bh),
 						     true /* block_csum */,
-						     ea_inode_array,
 						     extra_credits,
 						     true /* skip_quota */);
 		ext4_free_blocks(handle, inode, bh, 0, 1,
@@ -2184,13 +2171,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 
 	/* Drop the previous xattr block. */
 	if (bs->bh && bs->bh != new_bh) {
-		struct ext4_xattr_inode_array *ea_inode_array = NULL;
-
 		ext4_xattr_release_block(handle, inode, bs->bh,
-					 &ea_inode_array,
 					 0 /* extra_credits */);
-		ext4_xattr_inode_array_free_deferred(inode->i_sb,
-						     ea_inode_array);
 	}
 	error = 0;
 
@@ -2866,46 +2848,6 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 	return error;
 }
 
-#define EIA_INCR 16 /* must be 2^n */
-#define EIA_MASK (EIA_INCR - 1)
-
-/* Add the large xattr @inode into @ea_inode_array for deferred iput().
- * If @ea_inode_array is new or full it will be grown and the old
- * contents copied over.
- */
-static int
-ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
-			struct inode *inode)
-{
-	if (*ea_inode_array == NULL) {
-		/*
-		 * Start with 15 inodes, so it fits into a power-of-two size.
-		 */
-		(*ea_inode_array) = kmalloc_flex(**ea_inode_array, inodes,
-						 EIA_MASK, GFP_NOFS);
-		if (*ea_inode_array == NULL)
-			return -ENOMEM;
-		(*ea_inode_array)->count = 0;
-	} else if (((*ea_inode_array)->count & EIA_MASK) == EIA_MASK) {
-		/* expand the array once all 15 + n * 16 slots are full */
-		struct ext4_xattr_inode_array *new_array = NULL;
-
-		new_array = kmalloc_flex(**ea_inode_array, inodes,
-					 (*ea_inode_array)->count + EIA_INCR,
-					 GFP_NOFS);
-		if (new_array == NULL)
-			return -ENOMEM;
-		memcpy(new_array, *ea_inode_array,
-		       struct_size(*ea_inode_array, inodes,
-				   (*ea_inode_array)->count));
-		kfree(*ea_inode_array);
-		*ea_inode_array = new_array;
-	}
-	(*ea_inode_array)->count++;
-	(*ea_inode_array)->inodes[(*ea_inode_array)->count - 1] = inode;
-	return 0;
-}
-
 /*
  * ext4_xattr_delete_inode()
  *
@@ -2916,7 +2858,6 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
  * references on xattr block and xattr inodes.
  */
 int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
-			    struct ext4_xattr_inode_array **ea_inode_array,
 			    int extra_credits)
 {
 	struct buffer_head *bh = NULL;
@@ -2955,7 +2896,6 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 			ext4_xattr_inode_dec_ref_all(handle, inode, iloc.bh,
 						     IFIRST(header),
 						     false /* block_csum */,
-						     ea_inode_array,
 						     extra_credits,
 						     false /* skip_quota */);
 	}
@@ -2994,7 +2934,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 
 		}
 
-		ext4_xattr_release_block(handle, inode, bh, ea_inode_array,
+		ext4_xattr_release_block(handle, inode, bh,
 					 extra_credits);
 		/*
 		 * Update i_file_acl value in the same transaction that releases
@@ -3016,31 +2956,6 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 	return error;
 }
 
-void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array)
-{
-	int idx;
-
-	if (ea_inode_array == NULL)
-		return;
-
-	for (idx = 0; idx < ea_inode_array->count; ++idx)
-		iput(ea_inode_array->inodes[idx]);
-	kfree(ea_inode_array);
-}
-
-static void ext4_xattr_inode_array_free_deferred(struct super_block *sb,
-				struct ext4_xattr_inode_array *array)
-{
-	int idx;
-
-	if (array == NULL)
-		return;
-
-	for (idx = 0; idx < array->count; ++idx)
-		ext4_put_ea_inode(sb, array->inodes[idx]);
-	kfree(array);
-}
-
 /*
  * Worker function for deferred EA inode iput.  Processes all inodes queued
  * on s_ea_inode_to_free in a context free of xattr_sem/jbd2 handle locks.
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 9883ba5569a1..8214a31fe001 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -131,11 +131,6 @@ struct ext4_xattr_ibody_find {
 	struct ext4_iloc iloc;
 };
 
-struct ext4_xattr_inode_array {
-	unsigned int count;
-	struct inode *inodes[] __counted_by(count);
-};
-
 extern const struct xattr_handler ext4_xattr_user_handler;
 extern const struct xattr_handler ext4_xattr_trusted_handler;
 extern const struct xattr_handler ext4_xattr_security_handler;
@@ -187,9 +182,7 @@ extern int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode,
 				bool is_create);
 
 extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
-				   struct ext4_xattr_inode_array **array,
 				   int extra_credits);
-extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array);
 extern void ext4_init_ea_inode_work(struct ext4_sb_info *sbi);
 extern void ext4_put_ea_inode(struct super_block *sb, struct inode *inode);
 
-- 
2.43.0


^ permalink raw reply related

* [PATCH v10 5/5] ext4: prevent deadlock from duplicate EA inode references on corrupted fs
From: Yun Zhou @ 2026-06-25 15:29 UTC (permalink / raw)
  To: tytso, adilger.kernel, libaokun, jack, ojaswin, ritesh.list,
	yi.zhang, viro, brauner
  Cc: linux-ext4, linux-kernel, yun.zhou, linux-fsdevel
In-Reply-To: <20260625152941.24788-1-yun.zhou@windriver.com>

On a corrupted filesystem, multiple xattr entries may reference the same
EA inode.  When ext4_xattr_inode_dec_ref_all() processes such entries, it
can dec_ref the EA inode (setting nlink=0) and queue it for deferred iput.
If the deferred worker runs before the loop reaches the duplicate entry,
the second iget() may block on I_FREEING while the worker's eviction waits
for the caller's transaction to commit -- a classic ABBA deadlock.

Fix by tracking successfully processed EA inodes on a per-call llist
(reusing i_ea_iput_node) and skipping any ea_ino already in the list.
This covers both intra-block duplicates and cross ibody/block duplicates
in ext4_xattr_delete_inode().

The actual ext4_put_ea_inode() call is deferred until after the processing
loop completes (done by ext4_put_ea_inode_llist()), ensuring that no EA
inode is queued for eviction while the loop is still iterating.

Signed-off-by: Yun Zhou <yun.zhou@windriver.com>
---
 fs/ext4/xattr.c | 68 ++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 61 insertions(+), 7 deletions(-)

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 7f334349bd4f..5c929043e44a 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1152,11 +1152,41 @@ static int ext4_xattr_restart_fn(handle_t *handle, struct inode *inode,
 	return 0;
 }
 
+/* Check if an EA inode number is already in the processed llist. */
+static bool ext4_ea_ino_in_llist(unsigned int ea_ino,
+				 struct llist_head *processed)
+{
+	struct ext4_inode_info *ei;
+
+	llist_for_each_entry(ei, processed->first, i_ea_iput_node) {
+		if (ei->vfs_inode.i_ino == ea_ino)
+			return true;
+	}
+	return false;
+}
+
+/* Put all EA inodes on a processed llist via ext4_put_ea_inode. */
+static void ext4_put_ea_inode_llist(struct super_block *sb,
+				    struct llist_head *processed)
+{
+	struct llist_node *node = llist_del_all(processed);
+	struct llist_node *next;
+
+	while (node) {
+		struct ext4_inode_info *ei = container_of(node,
+				struct ext4_inode_info, i_ea_iput_node);
+		next = node->next;
+		ext4_put_ea_inode(sb, &ei->vfs_inode);
+		node = next;
+	}
+}
+
 static void
 ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
 			     struct buffer_head *bh,
 			     struct ext4_xattr_entry *first, bool block_csum,
-			     int extra_credits, bool skip_quota)
+			     int extra_credits, bool skip_quota,
+			     struct llist_head *processed)
 {
 	struct inode *ea_inode;
 	struct ext4_xattr_entry *entry;
@@ -1186,6 +1216,11 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
 		if (!entry->e_value_inum)
 			continue;
 		ea_ino = le32_to_cpu(entry->e_value_inum);
+
+		/* Skip if already processed (duplicate on corrupted fs) */
+		if (ext4_ea_ino_in_llist(ea_ino, processed))
+			continue;
+
 		err = ext4_xattr_inode_iget(parent, ea_ino,
 					    le32_to_cpu(entry->e_hash),
 					    &ea_inode);
@@ -1235,7 +1270,12 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
 		entry->e_value_inum = 0;
 		entry->e_value_size = 0;
 
-		ext4_put_ea_inode(parent->i_sb, ea_inode);
+		/*
+		 * Collect processed EA inodes for dedup and deferred iput.
+		 * ext4_put_ea_inode_llist() handles the actual release
+		 * after the loop, preventing iget deadlocks on duplicates.
+		 */
+		llist_add(&EXT4_I(ea_inode)->i_ea_iput_node, processed);
 		dirty = true;
 	}
 
@@ -1262,7 +1302,8 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
 static void
 ext4_xattr_release_block(handle_t *handle, struct inode *inode,
 			 struct buffer_head *bh,
-			 int extra_credits)
+			 int extra_credits,
+			 struct llist_head *processed)
 {
 	struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode);
 	u32 hash, ref;
@@ -1304,7 +1345,8 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
 						     BFIRST(bh),
 						     true /* block_csum */,
 						     extra_credits,
-						     true /* skip_quota */);
+						     true /* skip_quota */,
+						     processed);
 		ext4_free_blocks(handle, inode, bh, 0, 1,
 				 EXT4_FREE_BLOCKS_METADATA |
 				 EXT4_FREE_BLOCKS_FORGET);
@@ -2171,8 +2213,12 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 
 	/* Drop the previous xattr block. */
 	if (bs->bh && bs->bh != new_bh) {
+		LLIST_HEAD(processed);
+
 		ext4_xattr_release_block(handle, inode, bs->bh,
-					 0 /* extra_credits */);
+					 0 /* extra_credits */,
+					 &processed);
+		ext4_put_ea_inode_llist(inode->i_sb, &processed);
 	}
 	error = 0;
 
@@ -2866,6 +2912,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 	struct ext4_xattr_entry *entry;
 	struct inode *ea_inode;
 	int error;
+	LLIST_HEAD(processed);
 
 	error = ext4_journal_ensure_credits(handle, extra_credits,
 			ext4_free_metadata_revoke_credits(inode->i_sb, 1));
@@ -2897,7 +2944,8 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 						     IFIRST(header),
 						     false /* block_csum */,
 						     extra_credits,
-						     false /* skip_quota */);
+						     false /* skip_quota */,
+						     &processed);
 	}
 
 	if (EXT4_I(inode)->i_file_acl) {
@@ -2921,6 +2969,11 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 			     entry = EXT4_XATTR_NEXT(entry)) {
 				if (!entry->e_value_inum)
 					continue;
+				/* Skip EA inodes already dec_ref'd from ibody */
+				if (ext4_ea_ino_in_llist(
+					    le32_to_cpu(entry->e_value_inum),
+					    &processed))
+					continue;
 				error = ext4_xattr_inode_iget(inode,
 					      le32_to_cpu(entry->e_value_inum),
 					      le32_to_cpu(entry->e_hash),
@@ -2935,7 +2988,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 		}
 
 		ext4_xattr_release_block(handle, inode, bh,
-					 extra_credits);
+					 extra_credits, &processed);
 		/*
 		 * Update i_file_acl value in the same transaction that releases
 		 * block.
@@ -2951,6 +3004,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 	}
 	error = 0;
 cleanup:
+	ext4_put_ea_inode_llist(inode->i_sb, &processed);
 	brelse(iloc.bh);
 	brelse(bh);
 	return error;
-- 
2.43.0


^ permalink raw reply related

* [PATCH v10 1/5] fs: add iput_if_not_last() helper
From: Yun Zhou @ 2026-06-25 15:29 UTC (permalink / raw)
  To: tytso, adilger.kernel, libaokun, jack, ojaswin, ritesh.list,
	yi.zhang, viro, brauner
  Cc: linux-ext4, linux-kernel, yun.zhou, linux-fsdevel
In-Reply-To: <20260625152941.24788-1-yun.zhou@windriver.com>

Add a helper that drops an inode reference only if the caller does not
hold the last one.  Returns true if the reference was dropped, false
otherwise.

This is useful for filesystems that need to release inode references
in contexts where triggering final iput (and thus eviction) would be
unsafe due to lock ordering constraints.  The caller can check the
return value and defer the final iput to a safe context.

Unlike iput_not_last(), which triggers a BUG_ON() if called with the last
reference, this variant is designed to be called speculatively.

Signed-off-by: Yun Zhou <yun.zhou@windriver.com>
Suggested-by: Jan Kara <jack@suse.cz>
---
 include/linux/fs.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6da44573ce45..4916a9d54347 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2418,6 +2418,19 @@ static inline void super_set_sysfs_name_generic(struct super_block *sb, const ch
 extern void ihold(struct inode * inode);
 extern void iput(struct inode *);
 void iput_not_last(struct inode *);
+
+/**
+ * iput_if_not_last - drop an inode reference only if it is not the last one
+ * @inode: inode to put
+ *
+ * Returns true if the reference was dropped, false if this was the last
+ * reference and the caller must arrange for final iput() in a safe context.
+ */
+static inline bool iput_if_not_last(struct inode *inode)
+{
+	return atomic_add_unless(&inode->i_count, -1, 1);
+}
+
 int inode_update_time(struct inode *inode, enum fs_update_time type,
 		unsigned int flags);
 int generic_update_time(struct inode *inode, enum fs_update_time type,
-- 
2.43.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox