* [PATCH 2/3] libsupport: add a portable get_thread_id() function
From: Theodore Ts'o @ 2026-04-03 4:03 UTC (permalink / raw)
To: Ext4 Developers List; +Cc: Darrick J. Wong, Theodore Ts'o
In-Reply-To: <20260403040328.2385083-1-tytso@mit.edu>
The gettid() system call is only available on Linux. So create a new
function, get_thread_id() which implements a number of different ways
of providing a thread id as an integer.
Use get_thread_id() instead of gettid() in fuse2fs.
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
configure | 12 ++++++++++++
configure.ac | 2 ++
lib/config.h.in | 6 ++++++
lib/support/Makefile.in | 13 +++++++++++--
lib/support/thread.c | 36 ++++++++++++++++++++++++++++++++++++
lib/support/thread.h | 5 +++++
misc/fuse2fs.c | 3 ++-
7 files changed, 74 insertions(+), 3 deletions(-)
create mode 100644 lib/support/thread.c
create mode 100644 lib/support/thread.h
diff --git a/configure b/configure
index b9a82dcec..b04b31aff 100755
--- a/configure
+++ b/configure
@@ -13749,6 +13749,12 @@ if test "x$ac_cv_func_getrusage" = xyes
then :
printf "%s\n" "#define HAVE_GETRUSAGE 1" >>confdefs.h
+fi
+ac_fn_c_check_func "$LINENO" "gettid" "ac_cv_func_gettid"
+if test "x$ac_cv_func_gettid" = xyes
+then :
+ printf "%s\n" "#define HAVE_GETTID 1" >>confdefs.h
+
fi
ac_fn_c_check_func "$LINENO" "jrand48" "ac_cv_func_jrand48"
if test "x$ac_cv_func_jrand48" = xyes
@@ -13893,6 +13899,12 @@ if test "x$ac_cv_func_pthread_setname_np" = xyes
then :
printf "%s\n" "#define HAVE_PTHREAD_SETNAME_NP 1" >>confdefs.h
+fi
+ac_fn_c_check_func "$LINENO" "pthread_threadid_np" "ac_cv_func_pthread_threadid_np"
+if test "x$ac_cv_func_pthread_threadid_np" = xyes
+then :
+ printf "%s\n" "#define HAVE_PTHREAD_THREADID_NP 1" >>confdefs.h
+
fi
ac_fn_c_check_func "$LINENO" "qsort_r" "ac_cv_func_qsort_r"
if test "x$ac_cv_func_qsort_r" = xyes
diff --git a/configure.ac b/configure.ac
index 2473879fd..4921f81f7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1246,6 +1246,7 @@ AC_CHECK_FUNCS(m4_flatten([
getrandom
getrlimit
getrusage
+ gettid
jrand48
keyctl
llistxattr
@@ -1270,6 +1271,7 @@ AC_CHECK_FUNCS(m4_flatten([
pread64
pwrite64
pthread_setname_np
+ pthread_threadid_np
qsort_r
secure_getenv
setmntent
diff --git a/lib/config.h.in b/lib/config.h.in
index c6cbced5f..f129abfe7 100644
--- a/lib/config.h.in
+++ b/lib/config.h.in
@@ -181,6 +181,9 @@
/* Define if the GNU gettext() function is already present or preinstalled. */
#undef HAVE_GETTEXT
+/* Define to 1 if you have the 'gettid' function. */
+#undef HAVE_GETTID
+
/* Define to 1 if you have the GNU-style 'qsort_r' function. */
#undef HAVE_GNU_QSORT_R
@@ -331,6 +334,9 @@
/* Define to 1 if you have the 'pthread_setname_np' function. */
#undef HAVE_PTHREAD_SETNAME_NP
+/* Define to 1 if you have the 'pthread_threadid_np' function. */
+#undef HAVE_PTHREAD_THREADID_NP
+
/* Define to 1 if you have the 'pwrite' function. */
#undef HAVE_PWRITE
diff --git a/lib/support/Makefile.in b/lib/support/Makefile.in
index 6383816fd..9aac9cf00 100644
--- a/lib/support/Makefile.in
+++ b/lib/support/Makefile.in
@@ -25,6 +25,7 @@ OBJS= bthread.o \
quotaio.o \
quotaio_v2.o \
quotaio_tree.o \
+ thread.o \
dict.o \
devname.o
@@ -41,6 +42,7 @@ SRCS= $(srcdir)/argv_parse.c \
$(srcdir)/quotaio.c \
$(srcdir)/quotaio_tree.c \
$(srcdir)/quotaio_v2.c \
+ $(srcdir)/thread.c \
$(srcdir)/dict.c \
$(srcdir)/devname.c
@@ -81,10 +83,15 @@ test_cstring: $(srcdir)/cstring.c
$(Q) $(CC) -o test_cstring -DDEBUG_PROGRAM $(srcdir)/cstring.c \
$(ALL_CFLAGS)
+test_thread: $(srcdir)/thread.c
+ $(E) " CC $@"
+ $(Q) $(CC) -o test_thread -DDEBUG_PROGRAM $(srcdir)/thread.c \
+ $(ALL_CFLAGS)
+
clean::
$(RM) -f \#* *.s *.o *.a *~ *.bak core profiled/* \
../libsupport.a ../libsupport_p.a $(SMANPAGES) \
- prof_err.c prof_err.h test_profile test_cstring
+ prof_err.c prof_err.h test_profile test_cstring test_thread
#fullcheck check:: tst_uuid
# LD_LIBRARY_PATH=$(LIB) DYLD_LIBRARY_PATH=$(LIB) ./tst_uuid
@@ -111,7 +118,7 @@ $(OBJS):
argv_parse.o: $(srcdir)/argv_parse.c $(top_builddir)/lib/config.h \
$(top_builddir)/lib/dirpaths.h $(srcdir)/argv_parse.h
bthread.o: $(srcdir)/bthread.c $(top_builddir)/lib/config.h \
- $(srcdir)/bthread.h
+ $(top_builddir)/lib/dirpaths.h $(srcdir)/bthread.h
cstring.o: $(srcdir)/cstring.c $(top_builddir)/lib/config.h \
$(top_builddir)/lib/dirpaths.h $(srcdir)/cstring.h
mkquota.o: $(srcdir)/mkquota.c $(top_builddir)/lib/config.h \
@@ -183,6 +190,8 @@ quotaio_v2.o: $(srcdir)/quotaio_v2.c $(top_builddir)/lib/config.h \
$(top_srcdir)/lib/ext2fs/ext2_ext_attr.h $(top_srcdir)/lib/ext2fs/hashmap.h \
$(top_srcdir)/lib/ext2fs/bitops.h $(srcdir)/dqblk_v2.h \
$(srcdir)/quotaio_tree.h
+thread.o: $(srcdir)/thread.c $(top_builddir)/lib/config.h \
+ $(top_builddir)/lib/dirpaths.h $(srcdir)/thread.h
dict.o: $(srcdir)/dict.c $(top_builddir)/lib/config.h \
$(top_builddir)/lib/dirpaths.h $(srcdir)/dict.h
devname.o: $(srcdir)/devname.c $(top_builddir)/lib/config.h \
diff --git a/lib/support/thread.c b/lib/support/thread.c
new file mode 100644
index 000000000..a9a10940c
--- /dev/null
+++ b/lib/support/thread.c
@@ -0,0 +1,36 @@
+/*
+ * thread.c - utility functions for Posix threads
+ */
+
+#include "config.h"
+#ifdef HAVE_PTHREAD
+#include <pthread.h>
+#endif
+#include <stdint.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include "support/thread.h"
+
+uint64_t get_thread_id(void) /* portable integer id for the calling thread */
+{
+#if defined(HAVE_GETTID)
+ return gettid();
+#elif defined(HAVE_PTHREAD_THREADID_NP)
+ uint64_t tid;
+
+ if (pthread_threadid_np(NULL, &tid) == 0) /* 0 means success; tid is only valid then */
+ return tid;
+#elif defined(HAVE_PTHREAD)
+ return (uint64_t)(uintptr_t) pthread_self(); /* uint64_t: __u64 is not declared by the headers included here */
+#endif
+ return getpid(); /* last resort: no per-thread id source, or the lookup above failed */
+}
+
+#ifdef DEBUG_PROGRAM /* standalone driver; the test_thread make target builds with -DDEBUG_PROGRAM */
+int main(int argc, char **argv)
+{
+ printf("Thread id: %llu\n", (unsigned long long) get_thread_id()); /* cast: uint64_t need not be unsigned long long */
+ return 0;
+}
+#endif
diff --git a/lib/support/thread.h b/lib/support/thread.h
new file mode 100644
index 000000000..9a7f5c9db
--- /dev/null
+++ b/lib/support/thread.h
@@ -0,0 +1,5 @@
+/*
+ * thread.h -- header file for thread utilities
+ */
+
+uint64_t get_thread_id(void); /* integer id for the calling thread; NOTE(review): this header does not include <stdint.h> itself */
diff --git a/misc/fuse2fs.c b/misc/fuse2fs.c
index 0b43ec0fb..dfbc98636 100644
--- a/misc/fuse2fs.c
+++ b/misc/fuse2fs.c
@@ -48,6 +48,7 @@
#include "ext2fs/ext2_fs.h"
#include "ext2fs/ext2fsP.h"
#include "support/bthread.h"
+#include "support/thread.h"
#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0)
# define FUSE_PLATFORM_OPTS ""
#else
@@ -148,7 +149,7 @@ static inline uint64_t round_down(uint64_t b, unsigned int align)
#define dbg_printf(fuse2fs, format, ...) \
while ((fuse2fs)->debug) { \
- printf("FUSE2FS (%s): tid=%d " format, (fuse2fs)->shortdev, gettid(), ##__VA_ARGS__); \
+ printf("FUSE2FS (%s): tid=%llu " format, (fuse2fs)->shortdev, (unsigned long long) get_thread_id(), ##__VA_ARGS__); \
fflush(stdout); \
break; \
}
--
2.51.0
^ permalink raw reply related
* [PATCH -e2fsprogs 0/3] Fix portability issues on MacOS
From: Theodore Ts'o @ 2026-04-03 4:03 UTC (permalink / raw)
To: Ext4 Developers List; +Cc: Darrick J. Wong, Theodore Ts'o
The recent fuse2fs changes introduced some portability issues; fix
them so that e2fsprogs can build on MacOS and create a fuse2fs binary
that works on MacOS 26.3.1 using MacFuse.
Note: the f_opt_extent test is failing on Github Actions when testing
on MacOS. All of the tests are passing cleanly on my MacOS laptop. So
I'm not sure where the github action failure is coming from.
Theodore Ts'o (3):
libsupport: fix portability issues with the bthread.c
libsupport: add a portable get_thread_id() function
fuse2fs: fix build failure on systems which don't define EUCLEAN
configure | 18 ++++++++++++++++++
configure.ac | 3 +++
lib/config.h.in | 30 +++++++++++++++++++++---------
lib/support/Makefile.in | 13 +++++++++++--
lib/support/bthread.c | 12 ++++++++++--
lib/support/thread.c | 36 ++++++++++++++++++++++++++++++++++++
lib/support/thread.h | 5 +++++
misc/fuse2fs.c | 5 ++++-
8 files changed, 108 insertions(+), 14 deletions(-)
create mode 100644 lib/support/thread.c
create mode 100644 lib/support/thread.h
--
2.51.0
^ permalink raw reply
* [PATCH 3/3] fuse2fs: fix build failure on systems which don't define EUCLEAN
From: Theodore Ts'o @ 2026-04-03 4:03 UTC (permalink / raw)
To: Ext4 Developers List; +Cc: Darrick J. Wong, Theodore Ts'o
In-Reply-To: <20260403040328.2385083-1-tytso@mit.edu>
MacOS doesn't have EUCLEAN, so we use EIO as the closest error code.
But then we need to avoid a compile error caused by duplicate case
labels for EUCLEAN and EIO.
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
misc/fuse2fs.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/misc/fuse2fs.c b/misc/fuse2fs.c
index dfbc98636..94e289fab 100644
--- a/misc/fuse2fs.c
+++ b/misc/fuse2fs.c
@@ -5870,7 +5870,9 @@ static int __translate_error(ext2_filsys fs, ext2_ino_t ino, errcode_t err,
#ifdef EILSEQ
case EILSEQ:
#endif
+#if EUCLEAN != EIO
case EUCLEAN:
+#endif
/* these errnos usually denote corruption or persistence fail */
is_err = 1;
ret = -err;
--
2.51.0
^ permalink raw reply related
* Re: [PATCH 8/8] arch: use rest_of_page() macro where appropriate
From: Paul Walmsley @ 2026-04-02 21:58 UTC (permalink / raw)
To: Yury Norov
Cc: Andrew Morton, David S. Miller, Michael S. Tsirkin,
Theodore Ts'o, Albert Ou, Alexander Duyck, Alexander Gordeev,
Alexander Viro, Alexandra Winter, Andreas Dilger, Andrew Lunn,
Anna Schumaker, Anton Yakovlev, Arnaldo Carvalho de Melo,
Aswin Karuvally, Borislav Petkov, Carlos Maiolino,
Catalin Marinas, Chao Yu, Christian Borntraeger,
Christian Brauner, Claudio Imbrenda, Dave Hansen, David Airlie,
Dominique Martinet, Dongsheng Yang, Eric Dumazet,
Eric Van Hensbergen, Heiko Carstens, Herbert Xu, Ingo Molnar,
Jaegeuk Kim, Jakub Kicinski, Jani Nikula, Janosch Frank,
Jaroslav Kysela, Jens Axboe, Joonas Lahtinen, Latchesar Ionkov,
Linus Walleij, Madhavan Srinivasan, Mark Brown, Michael Ellerman,
Miklos Szeredi, Namhyung Kim, Palmer Dabbelt, Paolo Abeni,
Paolo Bonzini, Paul Walmsley, Peter Zijlstra, Rodrigo Vivi,
Sean Christopherson, Simona Vetter, Takashi Iwai, Thomas Gleixner,
Trond Myklebust, Tvrtko Ursulin, Vasily Gorbik, Will Deacon,
Yury Norov, Zheng Gu, linux-kernel, x86, linux-arm-kernel,
linuxppc-dev, linux-riscv, kvm, linux-s390, linux-block,
intel-gfx, dri-devel, dm-devel, netdev, linux-spi, linux-ext4,
linux-f2fs-devel, linux-fsdevel, linux-xfs, linux-nfs,
linux-crypto, linux-mm, linux-perf-users, v9fs, virtualization,
linux-sound
In-Reply-To: <20260304012717.201797-9-ynorov@nvidia.com>
On Tue, 3 Mar 2026, Yury Norov wrote:
> Switch arch code to using the macro. No functional changes intended.
>
> Signed-off-by: Yury Norov <ynorov@nvidia.com>
Acked-by: Paul Walmsley <pjw@kernel.org> # arch/riscv
- Paul
^ permalink raw reply
* Re: [PATCH v6 03/22] fsverity: generate and store zero-block hash
From: Andrey Albershteyn @ 2026-04-02 14:47 UTC (permalink / raw)
To: Eric Biggers
Cc: Andrey Albershteyn, linux-xfs, fsverity, linux-fsdevel, hch,
linux-ext4, linux-f2fs-devel, linux-btrfs, djwong
In-Reply-To: <20260401222717.GH2466@quark>
On 2026-04-01 15:27:17, Eric Biggers wrote:
> On Tue, Mar 31, 2026 at 11:28:04PM +0200, Andrey Albershteyn wrote:
> > Compute the hash of one filesystem block's worth of zeros. A filesystem
> > implementation can decide to elide merkle tree blocks containing only
> > this hash and synthesize the contents at read time.
> >
> > Let's pretend that there's a file containing six data blocks and whose
> > merkle tree looks roughly like this:
> >
> > root
> > +--leaf0
> > | +--data0
> > | +--data1
> > | `--data2
> > `--leaf1
> > +--data3
> > +--data4
> > `--data5
> >
> > If data[0-2] are sparse holes, then leaf0 will contain a repeating
> > sequence of @zero_digest. Therefore, leaf0 need not be written to disk
> > because its contents can be synthesized.
> >
> > A subsequent xfs patch will use this to reduce the size of the merkle
> > tree when dealing with sparse gold master disk images and the like.
> >
> > Add a helper to pre-fill folio with hashes of empty blocks. This will be
> > used by iomap to synthesize blocks full of zero hashes on the fly.
> >
> > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > Signed-off-by: Andrey Albershteyn <aalbersh@kernel.org>
> > ---
> > fs/verity/fsverity_private.h | 3 +++
> > fs/verity/open.c | 3 +++
> > fs/verity/pagecache.c | 22 ++++++++++++++++++++++
> > include/linux/fsverity.h | 8 ++++++++
> > 4 files changed, 36 insertions(+)
>
> Acked-by: Eric Biggers <ebiggers@kernel.org>
>
> The example given in the commit message is a bit misleading, though.
> Usually there are actually 128 hashes per block, and a block of hashes
> covers 512 KiB. So this optimization applies only where there is a hole
> in the file's data of size (at least) 512 KiB, aligned to the same
> amount.
>
> It's also worth noting that this optimization is being done only for the
> first level. The levels above that are still being stored. So, this
> doesn't really enable e.g. exabyte sized sparse regions, as a block will
> still be stored for each 64 MiB (instead of every 512 KiB).
>
> I'm okay with this if you want to do this, but I just want to make sure
> its limitations are well-understood.
Sure, I will fix the example and add a note that this is only for
the data level.
I think for higher levels it could be added transparently later.
Old tree blocks will be read by iomap if they exist, and for the new
format the blocks will be generated.
--
- Andrey
>
> > + /* the hash of a merkle block-sized buffer of zeroes */
> > + u8 zero_digest[FS_VERITY_MAX_DIGEST_SIZE];
>
> "the hash of an all-zeroes block" would be clearer. This is the hash
> from fsverity_hash_block() which includes the optional salt, not the
> hash from fsverity_hash_buffer() which does not include the salt.
>
> > +/**
> > + * fsverity_fill_zerohash() - fill folio with hashes of zero data block
> > + * @folio: folio to fill
> > + * @poff: offset in the folio to start
> > + * @plen: length of the range to fill with hashes
>
> Maybe go with (len, offset) for consistency with
> fsverity_verify_blocks(). (I assume the "p" prefix stands for "page",
> which is misleading since this works with a folio.)
>
> > +void fsverity_fill_zerohash(struct folio *folio, size_t poff, size_t plen,
> > + struct fsverity_info *vi)
> > +{
> > + size_t offset = poff;
> > +
> > + WARN_ON_ONCE(!IS_ALIGNED(poff, vi->tree_params.digest_size));
> > + WARN_ON_ONCE(!IS_ALIGNED(plen, vi->tree_params.digest_size));
> > +
> > + for (; offset < (poff + plen); offset += vi->tree_params.digest_size)
> > + memcpy_to_folio(folio, offset, vi->tree_params.zero_digest,
> > + vi->tree_params.digest_size);
>
> This could be done more efficiently, especially on HIGHMEM. Probably
> fine for now though, especially since the intersection of anyone wanting
> XFS && fsverity && HIGHMEM is likely to be extremely small.
>
> - Eric
>
^ permalink raw reply
* Re: [PATCH v6 02/22] fsverity: expose ensure_fsverity_info()
From: Andrey Albershteyn @ 2026-04-02 14:02 UTC (permalink / raw)
To: Eric Biggers
Cc: Andrey Albershteyn, linux-xfs, fsverity, linux-fsdevel, hch,
linux-ext4, linux-f2fs-devel, linux-btrfs, djwong
In-Reply-To: <20260401220241.GG2466@quark>
On 2026-04-01 15:02:41, Eric Biggers wrote:
> On Tue, Mar 31, 2026 at 11:28:03PM +0200, Andrey Albershteyn wrote:
> > This function will be used by XFS's scrub to force fsverity activation,
> > therefore, to read fsverity context.
> >
> > Signed-off-by: Andrey Albershteyn <aalbersh@kernel.org>
> > Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
> > ---
>
> Acked-by: Eric Biggers <ebiggers@kernel.org>
>
> > +/**
> > + * fsverity_ensure_verity_info() - create verity info if it's not in memory yet
> > + * @inode: the inode for which verity info should be created
> > + *
> > + * Ensure this inode has verity info attached to it. Read fsverity descriptor
> > + * and creates verity based on that. Inodes opened outside of
> > + * file_operations->open will not have any verity info attached. This
> > + * info is required for any fsverity related operations.
> > + *
> > + * Return: 0 on success, -errno on failure
> > + */
> > +int fsverity_ensure_verity_info(struct inode *inode);
>
> As Christoph mentioned, fs/verity/ uses the convention of the kerneldoc
> for functions being above the function definition.
>
> I think the comment could also be clearer:
>
> > create verity info if it's not in memory yet
>
> Maybe "cache verity info if it's not already cached", to avoid potential
> confusion with enabling fsverity on the file.
>
> > Ensure this inode has verity info attached to it.
>
> Maybe add: "It's assumed the inode already has fsverity enabled."
>
> > Inodes opened outside of file_operations->open will not have any
> > verity info attached. This info is required for any fsverity
> > related operations.
>
> The first sentence could be misinterpreted as saying that this function
> won't do anything in that case. The second sentence isn't clear what
> counts as "any fsverity related operation". Also "opened" doesn't seem
> like the right word to use when talking about a filesystem-internal read
> that occurs without a file descriptor having been opened.
>
> Maybe replace with:
>
> * This needs to be called at least once before any of the inode's data
> * can be verified (and thus read at all) or the inode's fsverity digest
> * retrieved. fsverity_file_open() calls this already, which handles
> * normal file accesses. If a filesystem does any internal (i.e. not
> * associated with a file descriptor) reads of the file's data or
> * fsverity digest, it must call this explicitly before doing so.
>
> By the way, should there be a patch that converts
> ovl_ensure_verity_loaded() to use this?
sure, I will add a patch on top replacing ovl's function
--
- Andrey
^ permalink raw reply
* Re: [GIT PULL 7/9] fuse2fs: better tracking of writable state
From: Theodore Tso @ 2026-04-02 14:00 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: linux-ext4
In-Reply-To: <177334916629.3254574.9721669086227705727.stg-ugh@frogsfrogsfrogs>
On Thu, Mar 12, 2026 at 02:01:13PM -0700, Darrick J. Wong wrote:
> Hi Ted,
>
> Please pull this branch with changes for ext4.
>
> As usual, I did a test-merge with the main upstream branch as of a few
> minutes ago, and didn't see any conflicts. Please let me know if you
> encounter any problems.
>
> The following changes since commit fe8281316fd37133eb78474040affabb3f060e24:
>
> fuse2fs: record thread id in debug trace data (2026-03-08 19:14:06 -0700)
Thanks, merged into the next branch.
- Ted
^ permalink raw reply
* Re: [GIT PULL 6/9] fuse2fs: improve operation tracing
From: Theodore Tso @ 2026-04-02 13:59 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: linux-ext4
In-Reply-To: <177334916604.3254574.11968947765826043346.stg-ugh@frogsfrogsfrogs>
On Thu, Mar 12, 2026 at 02:00:58PM -0700, Darrick J. Wong wrote:
> Hi Ted,
>
> Please pull this branch with changes for ext4.
>
> As usual, I did a test-merge with the main upstream branch as of a few
> minutes ago, and didn't see any conflicts. Please let me know if you
> encounter any problems.
>
> The following changes since commit 9476b1377d887872f96d7144a81d2107cff3df1f:
>
> fuse2fs: adjust OOM killer score if possible (2026-03-08 19:14:05 -0700)
Thanks, merged into the next branch.
- Ted
^ permalink raw reply
* Re: [GIT PULL 5/9] fuse2fs: refactor mount code
From: Theodore Tso @ 2026-04-02 13:55 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: linux-ext4
In-Reply-To: <177334916576.3254574.12995451033800002505.stg-ugh@frogsfrogsfrogs>
On Thu, Mar 12, 2026 at 02:00:42PM -0700, Darrick J. Wong wrote:
> Hi Ted,
>
> Please pull this branch with changes for ext4.
>
> As usual, I did a test-merge with the main upstream branch as of a few
> minutes ago, and didn't see any conflicts. Please let me know if you
> encounter any problems.
>
> The following changes since commit 5d3394ff8d6f2e120be1343c1504831d9602a3b4:
>
> fuse2fs: hoist unmount code from main (2026-03-08 19:14:05 -0700)
Thanks, merged into the next branch.
- Ted
^ permalink raw reply
* Re: [GIT PULL 4/9] fuse2fs: refactor unmount code
From: Theodore Tso @ 2026-04-02 13:51 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: linux-ext4
In-Reply-To: <177334916548.3254574.4332031441968487356.stg-ugh@frogsfrogsfrogs>
On Thu, Mar 12, 2026 at 02:00:26PM -0700, Darrick J. Wong wrote:
> Hi Ted,
>
> Please pull this branch with changes for ext4.
>
> As usual, I did a test-merge with the main upstream branch as of a few
> minutes ago, and didn't see any conflicts. Please let me know if you
> encounter any problems.
>
> The following changes since commit f5c46155c79cccd46e047d732063c7f21db35c8e:
>
> fuse2fs: collect runtime of various operations (2026-03-08 19:14:05 -0700)
Thanks, merged.
- Ted
^ permalink raw reply
* Re: [GIT PULL 3/9] fuse2fs: clean up operation startup
From: Theodore Tso @ 2026-04-02 13:48 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: linux-ext4
In-Reply-To: <177334916519.3254574.10658825847200071665.stg-ugh@frogsfrogsfrogs>
On Thu, Mar 12, 2026 at 02:00:11PM -0700, Darrick J. Wong wrote:
>
> As usual, I did a test-merge with the main upstream branch as of a few
> minutes ago, and didn't see any conflicts. Please let me know if you
> encounter any problems.
>
> The following changes since commit a4fc4817e3072cb19f481da24d4520671a174fff:
>
> fuse2fs: implement MMP updates (2026-03-08 19:14:04 -0700)
Thanks, merged. (BTW, I noticed that the previous pull request
resulted in a large number of regression test failures triggered by
improving libext2fs's dir_index support. I should have noticed it
earlier, but I had skipped running "make -j24 check" after merging the
previous pull request, which is my bad.)
- Ted
^ permalink raw reply
* Re: [PATCH v2 3/3] ext4: derive f_fsid from block device to avoid collisions
From: Anand Jain @ 2026-04-02 7:33 UTC (permalink / raw)
To: Theodore Tso
Cc: Andreas Dilger, Darrick J. Wong, Anand Jain, linux-ext4,
linux-btrfs, linux-xfs, hch
In-Reply-To: <20260325125952.GB2107@macsyma.local>
On 25/3/26 20:59, Theodore Tso wrote:
> On Wed, Mar 25, 2026 at 06:59:32PM +0800, Anand Jain wrote:
>>
>> IMO, sb->s_uuid (as used by overlayfs)
>> Represents a filesystem UUID that is persistent.
>> It is derived from on-disk metadata.
>>
>> statfs()->f_fsid is..
>> A kind of runtime filesystem identifier used to distinguish mounted
>> filesystems within a running system.
>> It may be stable across reboots or device removal and reinsertion,
>> but this is not guaranteed. It may change if the device dev_t changes.
>
> I always worry about "it might be stable, but it might not; ¯\_(ツ)_/¯"
>
> The problem with that is that people might start using this
> kind-of-guarantee-but-maybe-not in scripts or in programs, and then
> when people try to run that script or program on a different system,
> or on a different file system, things go *boom*.
>
> So if we want to say that it is stable so long as dev_t and the file
> system are the same, that's a well defined semantic.
>
Yeah, agreed. Avoid misuse. Document that f_fsid is stable as long
as dev_t and the underlying filesystem identity don't change.
> If it's that it has no guarantees whatsoever; could change across
> reboots; could change across remounts, then maybe it should just be a
> global mount sequence number that starts with a random number at boot.
> So you can use it to distinguish between different mounted file
> systems, but that's *all* you can do with the thing. That would also
> be a well defined semantic.
>
Per-mount random value (or global mount sequence) is also a
well-defined semantic, but it comes with trade-offs: we lose
consistency across mount cycles and need to carry per-mount
state.
IMHO, it's better to stick with a deterministic id:
f_fsid = f(dev_t, fsid)
predictable and aligned with XFS/btrfs and avoids additional state.
Bottom line, it fixes the cloned filesystem case without regressing
the existing semantics.
^ permalink raw reply
* Re: [PATCH v4 09/13] ext4: ensure zeroed partial blocks are persisted in SYNC mode
From: Zhang Yi @ 2026-04-02 1:21 UTC (permalink / raw)
To: Jan Kara
Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
ojaswin, ritesh.list, libaokun, yi.zhang, yizhang089, yangerkun,
yukuai
In-Reply-To: <42toomq4fa64kbice5yl4spjsw4em2qcq7kvkkas6254iij3u2@r6ekulbdog3i>
On 4/2/2026 1:06 AM, Jan Kara wrote:
> On Fri 27-03-26 18:29:35, Zhang Yi wrote:
>> From: Zhang Yi <yi.zhang@huawei.com>
>>
>> In ext4_zero_range() and ext4_punch_hole(), when operating in SYNC mode
>> and zeroing a partial block, only data=journal modes guarantee that the
>> zeroed data is synchronously persisted after the operation completes.
>> For data=ordered/writeback mode and non-journal modes, this guarantee is
>> missing.
>>
>> Introduce a partial_zero parameter to explicitly trigger writeback for
>> all scenarios where a partial block is zeroed, ensuring the zeroed data
>> is durably persisted.
>>
>> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
>> ---
>> fs/ext4/ext4.h | 2 +-
>> fs/ext4/extents.c | 9 ++++++++-
>> fs/ext4/inode.c | 19 ++++++++++++++-----
>> 3 files changed, 23 insertions(+), 7 deletions(-)
>>
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index 859ae05339ad..bfe86479a83c 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -3101,7 +3101,7 @@ extern int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
>> int pextents);
>> extern int ext4_block_zero_eof(struct inode *inode, loff_t from, loff_t end);
>> extern int ext4_zero_partial_blocks(struct inode *inode, loff_t lstart,
>> - loff_t length);
>> + loff_t length, bool *did_zero);
>> extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf);
>> extern qsize_t *ext4_get_reserved_space(struct inode *inode);
>> extern int ext4_get_projid(struct inode *inode, kprojid_t *projid);
>> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
>> index 002b1ec8cee2..16386f499138 100644
>> --- a/fs/ext4/extents.c
>> +++ b/fs/ext4/extents.c
>> @@ -4673,6 +4673,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
>> loff_t align_start, align_end, new_size = 0;
>> loff_t end = offset + len;
>> unsigned int blocksize = i_blocksize(inode);
>> + bool partial_zeroed = false;
>> int ret, flags;
>>
>> trace_ext4_zero_range(inode, offset, len, mode);
>> @@ -4728,9 +4729,15 @@ static long ext4_zero_range(struct file *file, loff_t offset,
>> return ret;
>>
>> /* Zero out partial block at the edges of the range */
>> - ret = ext4_zero_partial_blocks(inode, offset, len);
>> + ret = ext4_zero_partial_blocks(inode, offset, len, &partial_zeroed);
>> if (ret)
>> return ret;
>> + if (((file->f_flags & O_SYNC) || IS_SYNC(inode)) && partial_zeroed) {
>> + ret = filemap_write_and_wait_range(inode->i_mapping, offset,
>> + end - 1);
>> + if (ret)
>> + return ret;
>> + }
>
> Shouldn't we handle this somewhat below, where we do:
>
> if (file->f_flags & O_SYNC)
> ext4_handle_sync(handle);
>
> It would be logical to keep these together... Similarly for
> ext4_punch_hole() below. And yes, the check when calling ext4_handle_sync()
> should be extended with an IS_SYNC() check, which is a preexisting bug.
>
> Honza
Yeah, right, I've done this in the next patch. Thank you for your review!
Cheers,
Yi.
>
>>
>> handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
>> if (IS_ERR(handle)) {
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index 8aa4369e3150..b934ad86a96d 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -4227,7 +4227,8 @@ int ext4_block_zero_eof(struct inode *inode, loff_t from, loff_t end)
>> return 0;
>> }
>>
>> -int ext4_zero_partial_blocks(struct inode *inode, loff_t lstart, loff_t length)
>> +int ext4_zero_partial_blocks(struct inode *inode, loff_t lstart, loff_t length,
>> + bool *did_zero)
>> {
>> struct super_block *sb = inode->i_sb;
>> unsigned partial_start, partial_end;
>> @@ -4244,20 +4245,21 @@ int ext4_zero_partial_blocks(struct inode *inode, loff_t lstart, loff_t length)
>> /* Handle partial zero within the single block */
>> if (start == end &&
>> (partial_start || (partial_end != sb->s_blocksize - 1))) {
>> - err = ext4_block_zero_range(inode, lstart, length, NULL, NULL);
>> + err = ext4_block_zero_range(inode, lstart, length, did_zero,
>> + NULL);
>> return err;
>> }
>> /* Handle partial zero out on the start of the range */
>> if (partial_start) {
>> err = ext4_block_zero_range(inode, lstart, sb->s_blocksize,
>> - NULL, NULL);
>> + did_zero, NULL);
>> if (err)
>> return err;
>> }
>> /* Handle partial zero out on the end of the range */
>> if (partial_end != sb->s_blocksize - 1)
>> err = ext4_block_zero_range(inode, byte_end - partial_end,
>> - partial_end + 1, NULL, NULL);
>> + partial_end + 1, did_zero, NULL);
>> return err;
>> }
>>
>> @@ -4406,6 +4408,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
>> loff_t end = offset + length;
>> handle_t *handle;
>> unsigned int credits;
>> + bool partial_zeroed = false;
>> int ret;
>>
>> trace_ext4_punch_hole(inode, offset, length, 0);
>> @@ -4441,9 +4444,15 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
>> if (ret)
>> return ret;
>>
>> - ret = ext4_zero_partial_blocks(inode, offset, length);
>> + ret = ext4_zero_partial_blocks(inode, offset, length, &partial_zeroed);
>> if (ret)
>> return ret;
>> + if (((file->f_flags & O_SYNC) || IS_SYNC(inode)) && partial_zeroed) {
>> + ret = filemap_write_and_wait_range(inode->i_mapping, offset,
>> + end - 1);
>> + if (ret)
>> + return ret;
>> + }
>>
>> if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
>> credits = ext4_chunk_trans_extent(inode, 0);
>> --
>> 2.52.0
>>
^ permalink raw reply
* Re: [PATCH] ext4: fix missing brelse() in ext4_xattr_inode_dec_ref_all()
From: Ritesh Harjani @ 2026-04-02 1:04 UTC (permalink / raw)
To: skoyama.kernel, linux-ext4; +Cc: Sohei Koyama
In-Reply-To: <20260330174248.71268-1-skoyama@ddn.com>
skoyama.kernel@gmail.com writes:
> From: Sohei Koyama <skoyama@ddn.com>
>
> The commit c8e008b60492 ("ext4: ignore xattrs past end")
> introduced a refcount leak when block_csum is false.
>
> ext4_xattr_inode_dec_ref_all() calls ext4_get_inode_loc() to
> get iloc.bh, but never releases it with brelse().
>
> Fixes: c8e008b60492 ("ext4: ignore xattrs past end")
Nice catch, indeed we need to decrement the refcount for bh there.
Since this patch made it into stable releases, I guess it's good to
Cc: stable too.
Feel free to add:
Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
^ permalink raw reply
* Re: [PATCH v5 3/5] ext4: fix the error handling process in extents_kunit_init).
From: Ritesh Harjani @ 2026-04-02 0:21 UTC (permalink / raw)
To: Ye Bin, tytso, adilger.kernel, linux-ext4; +Cc: jack
In-Reply-To: <20260330133035.287842-4-yebin@huaweicloud.com>
Ye Bin <yebin@huaweicloud.com> writes:
> From: Ye Bin <yebin10@huawei.com>
>
> The error processing in extents_kunit_init() is improper, causing
> resource leakage.
> Reconstruct the error handling process to prevent potential resource
> leaks
>
> Fixes: cb1e0c1d1fad ("ext4: kunit tests for extent splitting and conversion")
> Signed-off-by: Ye Bin <yebin10@huawei.com>
> ---
> fs/ext4/extents-test.c | 50 +++++++++++++++++++++++++++++-------------
> 1 file changed, 35 insertions(+), 15 deletions(-)
Looks better now. Thanks for taking care of it.
Feel free to add:
Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
^ permalink raw reply
* Re: [PATCH v6 15/22] xfs: add fs-verity support
From: Eric Biggers @ 2026-04-01 23:57 UTC (permalink / raw)
To: Andrey Albershteyn
Cc: linux-xfs, fsverity, linux-fsdevel, hch, linux-ext4,
linux-f2fs-devel, linux-btrfs, djwong
In-Reply-To: <20260331212827.2631020-16-aalbersh@kernel.org>
On Tue, Mar 31, 2026 at 11:28:16PM +0200, Andrey Albershteyn wrote:
> + /*
> + * If this is a block full of hashes of zeroed blocks, don't bother
> + * storing the block. We can synthesize them later.
> + *
> + * However, do this only in case Merkle tree block == fs block size.
> + * Iomap synthesizes these blocks based on holes in the merkle tree. We
> + * won't be able to tell if something need to be synthesizes for the
> + * range in the fs block. For example, for 4k filesystem block
> + *
> + * [ 1k | zero hashes | zero hashes | 1k ]
> + *
> + * Iomap won't know about these empty blocks.
> + */
> + for (i = 0, p = buf; i < size; i += digest_size, p += digest_size)
> + if (memcmp(p, zero_digest, digest_size))
> + break;
> + if (i == size && size == ip->i_mount->m_sb.sb_blocksize)
> + return 0;
Might be too subtle, but this could be done more efficiently with just
two calls to memcmp():
if (size == ip->i_mount->m_sb.sb_blocksize &&
/* first digest is zero_digest */
memcmp(buf, zero_digest, digest_size) == 0 &&
/* every digest is same as previous, thus all are zero_digest */
memcmp(buf + digest_size, buf, size - digest_size) == 0)
return 0;
- Eric
^ permalink raw reply
* Re: [PATCH v6 05/22] fsverity: hoist pagecache_read from f2fs/ext4 to fsverity
From: Eric Biggers @ 2026-04-01 23:44 UTC (permalink / raw)
To: Andrey Albershteyn
Cc: linux-xfs, fsverity, linux-fsdevel, hch, linux-ext4,
linux-f2fs-devel, linux-btrfs, djwong
In-Reply-To: <20260331212827.2631020-6-aalbersh@kernel.org>
On Tue, Mar 31, 2026 at 11:28:06PM +0200, Andrey Albershteyn wrote:
> This is the same function to read from pagecache. XFS will also need
> this, so move this to core fsverity.
>
> Reviewed-by: Darrick J. Wong <djwong@kernel.org>
> Reviewed-by: Christoph Hellwig <hch@lst.de>
> Signed-off-by: Andrey Albershteyn <aalbersh@kernel.org>
Acked-by: Eric Biggers <ebiggers@kernel.org>
Would be worth mentioning that the ext4 and f2fs copies of this function
had diverged slightly, though. This patch takes ext4's implementation
of doing it folio-by-folio, which seems like the right choice.
- Eric
^ permalink raw reply
* Re: [PATCH v6 04/22] fsverity: pass digest size and hash of the empty block to ->write
From: Eric Biggers @ 2026-04-01 23:36 UTC (permalink / raw)
To: Andrey Albershteyn
Cc: linux-xfs, fsverity, linux-fsdevel, hch, linux-ext4,
linux-f2fs-devel, linux-btrfs, djwong
In-Reply-To: <20260331212827.2631020-5-aalbersh@kernel.org>
On Tue, Mar 31, 2026 at 11:28:05PM +0200, Andrey Albershteyn wrote:
> Let filesystem iterate over hashes in the block and check if these are
> hashes of zeroed data blocks. XFS will use this to decide if it wants to
> store tree block full of these hashes.
>
> Signed-off-by: Andrey Albershteyn <aalbersh@kernel.org>
> Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
Acked-by: Eric Biggers <ebiggers@kernel.org>
> pass digest size and hash of the empty block to ->write
"empty block" => "all-zeroes block"
(it's not empty, it's all-zeroes)
> + * @zero_digest: the hash of a merkle block-sized buffer of zeroes
"a merkle block-sized buffer of zeroes" => "the all-zeroes block"
(to hopefully make it clear, as in patch 3, that it's the salted block
hash, not the value that fsverity_hash_buffer() gives)
> + * @zero_digest: the hash of a merkle block-sized buffer of zeroes
Likewise.
- Eric
^ permalink raw reply
* Re: [PATCH v6 03/22] fsverity: generate and store zero-block hash
From: Eric Biggers @ 2026-04-01 22:27 UTC (permalink / raw)
To: Andrey Albershteyn
Cc: linux-xfs, fsverity, linux-fsdevel, hch, linux-ext4,
linux-f2fs-devel, linux-btrfs, djwong
In-Reply-To: <20260331212827.2631020-4-aalbersh@kernel.org>
On Tue, Mar 31, 2026 at 11:28:04PM +0200, Andrey Albershteyn wrote:
> Compute the hash of one filesystem block's worth of zeros. A filesystem
> implementation can decide to elide merkle tree blocks containing only
> this hash and synthesize the contents at read time.
>
> Let's pretend that there's a file containing six data blocks and whose
> merkle tree looks roughly like this:
>
> root
> +--leaf0
> | +--data0
> | +--data1
> | `--data2
> `--leaf1
> +--data3
> +--data4
> `--data5
>
> If data[0-2] are sparse holes, then leaf0 will contain a repeating
> sequence of @zero_digest. Therefore, leaf0 need not be written to disk
> because its contents can be synthesized.
>
> A subsequent xfs patch will use this to reduce the size of the merkle
> tree when dealing with sparse gold master disk images and the like.
>
> Add a helper to pre-fill folio with hashes of empty blocks. This will be
> used by iomap to synthesize blocks full of zero hashes on the fly.
>
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> Signed-off-by: Andrey Albershteyn <aalbersh@kernel.org>
> ---
> fs/verity/fsverity_private.h | 3 +++
> fs/verity/open.c | 3 +++
> fs/verity/pagecache.c | 22 ++++++++++++++++++++++
> include/linux/fsverity.h | 8 ++++++++
> 4 files changed, 36 insertions(+)
Acked-by: Eric Biggers <ebiggers@kernel.org>
The example given in the commit message is a bit misleading, though.
Usually there are actually 128 hashes per block, and a block of hashes
covers 512 KiB. So this optimization applies only where there is a hole
in the file's data of size (at least) 512 KiB, aligned to the same
amount.
It's also worth noting that this optimization is being done only for the
first level. The levels above that are still being stored. So, this
doesn't really enable e.g. exabyte sized sparse regions, as a block will
still be stored for each 64 MiB (instead of every 512 KiB).
I'm okay with this if you want to do this, but I just want to make sure
its limitations are well-understood.
> + /* the hash of a merkle block-sized buffer of zeroes */
> + u8 zero_digest[FS_VERITY_MAX_DIGEST_SIZE];
"the hash of an all-zeroes block" would be clearer. This is the hash
from fsverity_hash_block() which includes the optional salt, not the
hash from fsverity_hash_buffer() which does not include the salt.
> +/**
> + * fsverity_fill_zerohash() - fill folio with hashes of zero data block
> + * @folio: folio to fill
> + * @poff: offset in the folio to start
> + * @plen: length of the range to fill with hashes
Maybe go with (len, offset) for consistency with
fsverity_verify_blocks(). (I assume the "p" prefix stands for "page",
which is misleading since this works with a folio.)
> +void fsverity_fill_zerohash(struct folio *folio, size_t poff, size_t plen,
> + struct fsverity_info *vi)
> +{
> + size_t offset = poff;
> +
> + WARN_ON_ONCE(!IS_ALIGNED(poff, vi->tree_params.digest_size));
> + WARN_ON_ONCE(!IS_ALIGNED(plen, vi->tree_params.digest_size));
> +
> + for (; offset < (poff + plen); offset += vi->tree_params.digest_size)
> + memcpy_to_folio(folio, offset, vi->tree_params.zero_digest,
> + vi->tree_params.digest_size);
This could be done more efficiently, especially on HIGHMEM. Probably
fine for now though, especially since the intersection of anyone wanting
XFS && fsverity && HIGHMEM is likely to be extremely small.
- Eric
^ permalink raw reply
* [PATCH 1/2] ext2: validate i_nlink before decrement in ext2_unlink()
From: Vasiliy Kovalev @ 2026-04-01 22:08 UTC (permalink / raw)
To: Jan Kara, Andrew Morton, Alexey Dobriyan, linux-ext4
Cc: linux-kernel, lvc-project, kovalev
In-Reply-To: <20260401220837.2424925-1-kovalev@altlinux.org>
A crafted ext2 image can provide a directory entry pointing to an inode
with i_links_count == 0 on disk. Calling unlink() on such an entry
triggers WARN_ON inside drop_nlink():
WARNING: CPU: 3 PID: 609 at fs/inode.c:336 drop_nlink+0xad/0xd0 fs/inode.c:336
CPU: 3 UID: 0 PID: 609 Comm: syz-executor Not tainted 6.12.77+ #1
Call Trace:
<TASK>
inode_dec_link_count include/linux/fs.h:2518 [inline]
ext2_unlink+0x26c/0x300 fs/ext2/namei.c:295
vfs_unlink+0x2fc/0x9b0 fs/namei.c:4477
do_unlinkat+0x53e/0x730 fs/namei.c:4541
__do_sys_unlink fs/namei.c:4589 [inline]
__se_sys_unlink fs/namei.c:4587 [inline]
__x64_sys_unlink+0xc6/0x110 fs/namei.c:4587
do_syscall_x64 arch/x86/entry/common.c:47 [inline]
do_syscall_64+0xf5/0x220 arch/x86/entry/common.c:78
entry_SYSCALL_64_after_hwframe+0x77/0x7f
</TASK>
At the point of the crash, ext2_delete_entry() has already committed
the removal of the directory entry to disk, so returning an error is
not an option. Instead, skip the decrement and report the corruption
via ext2_error(), which marks the superblock as having errors. The
inode will be reclaimed when its last reference is dropped.
Found by Linux Verification Center (linuxtesting.org) with Syzkaller.
Cc: stable@vger.kernel.org
Fixes: a513b035eadf ("[PATCH] ext2: switch to inode_inc_count, inode_dec_count")
Signed-off-by: Vasiliy Kovalev <kovalev@altlinux.org>
---
fs/ext2/namei.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index bde617a66cec..ea49e8f2b292 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -293,7 +293,12 @@ static int ext2_unlink(struct inode *dir, struct dentry *dentry)
goto out;
inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
- inode_dec_link_count(inode);
+ if (!inode->i_nlink)
+ ext2_error(inode->i_sb, __func__,
+ "inode %lu has zero i_nlink on unlink, fs may be corrupt",
+ inode->i_ino);
+ else
+ inode_dec_link_count(inode);
err = 0;
out:
return err;
--
2.50.1
^ permalink raw reply related
* [PATCH 0/2] ext2: fix WARN_ON in drop_nlink() triggered by corrupt images
From: Vasiliy Kovalev @ 2026-04-01 22:08 UTC (permalink / raw)
To: Jan Kara, Andrew Morton, Alexey Dobriyan, linux-ext4
Cc: linux-kernel, lvc-project, kovalev
A crafted ext2 image can contain a directory entry pointing to an inode
whose on-disk i_links_count is zero. ext2 mounts such an image without
error. Any subsequent syscall that decrements i_nlink on that inode
triggers WARN_ON inside drop_nlink() in fs/inode.c.
These patches prevent the warning by validating i_nlink before decrementing
it in ext2_unlink() and ext2_rename(), reporting the corruption via
ext2_error() instead.
The issues were found by Linux Verification Center (linuxtesting.org)
with Syzkaller.
Vasiliy Kovalev (2):
ext2: validate i_nlink before decrement in ext2_unlink()
ext2: guard against zero i_nlink on new_inode in ext2_rename()
fs/ext2/namei.c | 14 +++++++++++++-
1 file changed, 13 insertions(+), 1 deletion(-)
--- [Reproducer for PATCH 1/2: ext2_unlink] ---
#define _GNU_SOURCE
#include <dirent.h>
#include <endian.h>
#include <errno.h>
#include <fcntl.h>
#include <sched.h>
#include <setjmp.h>
#include <signal.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/prctl.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>
#include <linux/loop.h>
#ifndef __NR_memfd_create
#define __NR_memfd_create 319
#endif
static unsigned long long procid;
/*
 * Suspend the calling thread for approximately `ms` milliseconds.
 *
 * nanosleep() is used rather than usleep(): usleep() takes a useconds_t, so
 * `ms * 1000` would be silently narrowed for large values, and POSIX only
 * defines usleep() for arguments below 1,000,000 microseconds.
 *
 * The sleep may end early if a signal is delivered; the return value is
 * deliberately ignored because callers only need a best-effort delay.
 */
static void sleep_ms(uint64_t ms)
{
	struct timespec req;

	req.tv_sec = (time_t)(ms / 1000);
	req.tv_nsec = (long)(ms % 1000) * 1000000L;
	nanosleep(&req, NULL);
}
/*
 * Return the current CLOCK_MONOTONIC reading converted to milliseconds.
 * Terminates the process with exit status 1 if the clock cannot be read.
 */
static uint64_t current_time_ms(void)
{
	struct timespec now;
	int rc = clock_gettime(CLOCK_MONOTONIC, &now);

	if (rc != 0)
		exit(1);

	uint64_t from_sec = (uint64_t)now.tv_sec * 1000;
	uint64_t from_nsec = (uint64_t)now.tv_nsec / 1000000;
	return from_sec + from_nsec;
}
/*
 * Create a fresh "./syzkaller.XXXXXX" directory with mkdtemp(), set its mode
 * to 0777 and chdir() into it.  Any failure terminates the process with exit
 * status 1.
 */
static void use_temporary_dir(void)
{
	char name_template[] = "./syzkaller.XXXXXX";
	char* workdir = mkdtemp(name_template);

	/* short-circuit evaluation keeps the mkdtemp -> chmod -> chdir order */
	if (workdir == NULL || chmod(workdir, 0777) != 0 || chdir(workdir) != 0)
		exit(1);
}
/*
 * Format a printf-style message (truncated to fit a 1024-byte buffer) and
 * write it to an existing file opened write-only with O_CLOEXEC.
 *
 * Returns true only if the whole message went out in a single write().
 * Returns false if the file cannot be opened or the write is short/failed;
 * in the latter case errno is preserved across the close().
 */
static bool write_file(const char* file, const char* what, ...)
{
	char text[1024];
	va_list ap;

	va_start(ap, what);
	vsnprintf(text, sizeof(text), what, ap);
	va_end(ap);
	text[sizeof(text) - 1] = 0;

	int text_len = strlen(text);
	int fd = open(file, O_WRONLY | O_CLOEXEC);
	if (fd == -1)
		return false;

	bool complete = write(fd, text, text_len) == text_len;
	if (!complete) {
		int saved_errno = errno;
		close(fd);
		errno = saved_errno;
		return false;
	}

	close(fd);
	return true;
}
#define MAXBITS 15
#define MAXLCODES 286
#define MAXDCODES 30
#define MAXCODES (MAXLCODES + MAXDCODES)
#define FIXLCODES 288
/*
 * Decoder state shared by the puff_* routines: the output and input buffers
 * with their cursors, the bit accumulator, and the longjmp target used when
 * the input is exhausted in the middle of a read.
 */
struct puff_state {
/* output buffer, its capacity (outlen) and the number of bytes produced (outcnt) */
unsigned char* out;
unsigned long outlen;
unsigned long outcnt;
/* input buffer, its length (inlen) and the number of bytes consumed (incnt) */
const unsigned char* in;
unsigned long inlen;
unsigned long incnt;
/* bits already fetched from the input but not yet consumed, and how many */
int bitbuf;
int bitcnt;
/* jump target for longjmp() when a read runs past the end of the input */
jmp_buf env;
};
/*
 * Return the next `need` bits from the input, least-significant bit first.
 *
 * Bits left over from the last fetched byte stay in s->bitbuf/s->bitcnt for
 * the next call.  If the input runs out before `need` bits are available,
 * control is transferred to s->env via longjmp(s->env, 1).
 */
static int puff_bits(struct puff_state* s, int need)
{
	long acc = s->bitbuf;

	/* pull in whole bytes until the accumulator holds enough bits */
	for (; s->bitcnt < need; s->bitcnt += 8) {
		if (s->incnt == s->inlen)
			longjmp(s->env, 1);
		long next_byte = s->in[s->incnt++];
		acc |= next_byte << s->bitcnt;
	}

	long low_mask = (1L << need) - 1;
	s->bitbuf = (int)(acc >> need);
	s->bitcnt -= need;
	return (int)(acc & low_mask);
}
/*
 * Process a stored (uncompressed) block.
 *
 * Returns 0 on success, 2 if the input is too short for the header or the
 * payload, -2 if the length and its one's complement do not match, and 1 if
 * the payload does not fit in the output buffer.
 */
static int puff_stored(struct puff_state* s)
{
/* discard any bits left over from the current input byte */
s->bitbuf = 0;
s->bitcnt = 0;
if (s->incnt + 4 > s->inlen)
return 2;
/* 16-bit little-endian length followed by its one's complement */
unsigned len = s->in[s->incnt++];
len |= s->in[s->incnt++] << 8;
if (s->in[s->incnt++] != (~len & 0xff) ||
s->in[s->incnt++] != ((~len >> 8) & 0xff))
return -2;
if (s->incnt + len > s->inlen)
return 2;
if (s->outcnt + len > s->outlen)
return 1;
/* only non-zero bytes are stored; zero bytes just advance both cursors */
for (; len--; s->outcnt++, s->incnt++) {
if (s->in[s->incnt])
s->out[s->outcnt] = s->in[s->incnt];
}
return 0;
}
/*
 * Huffman decoding table in the form filled in by puff_construct() and read
 * by puff_decode().
 */
struct puff_huffman {
/* count[len]: number of symbols whose code is len bits long */
short* count;
/* symbols ordered by code length, and by symbol value within one length */
short* symbol;
};
/*
 * Decode one symbol from the input using Huffman table h.
 *
 * The code is built up one bit at a time.  After each bit the candidate is
 * compared against the codes of the current length: `first` is the first
 * code of that length, `count` how many there are, and `index` the position
 * of that length's first symbol in h->symbol.
 *
 * Returns the decoded symbol, or -10 if no code matched within MAXBITS bits.
 * Does a longjmp(s->env, 1) if the input is exhausted.
 */
static int puff_decode(struct puff_state* s, const struct puff_huffman* h)
{
int first = 0;
int index = 0;
int bitbuf = s->bitbuf;
int left = s->bitcnt;
int code = first = index = 0;
int len = 1;
short* next = h->count + 1;
while (1) {
/* consume the bits currently held in bitbuf */
while (left--) {
code |= bitbuf & 1;
bitbuf >>= 1;
int count = *next++;
if (code - count < first) {
/* match: hand the unconsumed bits back to the shared state */
s->bitbuf = bitbuf;
s->bitcnt = (s->bitcnt - len) & 7;
return h->symbol[index + (code - first)];
}
index += count;
first += count;
first <<= 1;
code <<= 1;
len++;
}
/* refill from the next input byte, limited to the bits still allowed */
left = (MAXBITS + 1) - len;
if (left == 0)
break;
if (s->incnt == s->inlen)
longjmp(s->env, 1);
bitbuf = s->in[s->incnt++];
if (left > 8)
left = 8;
}
return -10;
}
/*
 * Build decoding table h from the n code lengths in length[].
 *
 * Returns 0 when every length is zero or when the set of codes is complete,
 * a negative value as soon as some length is over-subscribed, and a positive
 * value when codes are left unused (incomplete set).
 */
static int puff_construct(struct puff_huffman* h, const short* length, int n)
{
/* count the number of codes of each length */
int len;
for (len = 0; len <= MAXBITS; len++)
h->count[len] = 0;
int symbol;
for (symbol = 0; symbol < n; symbol++)
(h->count[length[symbol]])++;
if (h->count[0] == n)
return 0;
/* `left` tracks how many codes of the current length remain available */
int left = 1;
for (len = 1; len <= MAXBITS; len++) {
left <<= 1;
left -= h->count[len];
if (left < 0)
return left;
}
/* offs[len]: position in h->symbol of the first symbol with that length */
short offs[MAXBITS + 1];
offs[1] = 0;
for (len = 1; len < MAXBITS; len++)
offs[len + 1] = offs[len] + h->count[len];
/* place the symbols with a non-zero length, in symbol order per length */
for (symbol = 0; symbol < n; symbol++)
if (length[symbol] != 0)
h->symbol[offs[length[symbol]]++] = symbol;
return left;
}
/*
 * Decode literal/length and distance codes until the end-of-block symbol
 * (256) is seen.
 *
 * Returns 0 at end of block, 1 if the output buffer is full, -10 for an
 * invalid length symbol, -11 if a distance reaches before the start of the
 * output, or the negative value returned by puff_decode().
 */
static int puff_codes(struct puff_state* s,
const struct puff_huffman* lencode,
const struct puff_huffman* distcode)
{
/* base length and number of extra bits for length symbols 257..285 */
static const short lens[29] = {
3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31,
35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258};
static const short lext[29] = {
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,
3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0};
/* base distance and number of extra bits for distance symbols 0..29 */
static const short dists[30] = {
1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193,
257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145,
8193, 12289, 16385, 24577};
static const short dext[30] = {
0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,
7, 7, 8, 8, 9, 9, 10, 10, 11, 11,
12, 12, 13, 13};
int symbol;
do {
symbol = puff_decode(s, lencode);
if (symbol < 0)
return symbol;
if (symbol < 256) {
/* literal byte; a zero literal only advances the output cursor */
if (s->outcnt == s->outlen)
return 1;
if (symbol)
s->out[s->outcnt] = symbol;
s->outcnt++;
} else if (symbol > 256) {
/* length/distance pair: copy `len` bytes from `dist` bytes back */
symbol -= 257;
if (symbol >= 29)
return -10;
int len = lens[symbol] + puff_bits(s, lext[symbol]);
symbol = puff_decode(s, distcode);
if (symbol < 0)
return symbol;
unsigned dist = dists[symbol] + puff_bits(s, dext[symbol]);
if (dist > s->outcnt)
return -11;
if (s->outcnt + len > s->outlen)
return 1;
while (len--) {
/* zero source bytes are skipped rather than stored */
if (dist <= s->outcnt && s->out[s->outcnt - dist])
s->out[s->outcnt] = s->out[s->outcnt - dist];
s->outcnt++;
}
}
} while (symbol != 256);
return 0;
}
static int puff_fixed(struct puff_state* s)
{
static int virgin = 1;
static short lencnt[MAXBITS + 1], lensym[FIXLCODES];
static short distcnt[MAXBITS + 1], distsym[MAXDCODES];
static struct puff_huffman lencode, distcode;
if (virgin) {
lencode.count = lencnt;
lencode.symbol = lensym;
distcode.count = distcnt;
distcode.symbol = distsym;
short lengths[FIXLCODES];
int symbol;
for (symbol = 0; symbol < 144; symbol++)
lengths[symbol] = 8;
for (; symbol < 256; symbol++)
lengths[symbol] = 9;
for (; symbol < 280; symbol++)
lengths[symbol] = 7;
for (; symbol < FIXLCODES; symbol++)
lengths[symbol] = 8;
puff_construct(&lencode, lengths, FIXLCODES);
for (symbol = 0; symbol < MAXDCODES; symbol++)
lengths[symbol] = 5;
puff_construct(&distcode, lengths, MAXDCODES);
virgin = 0;
}
return puff_codes(s, &lencode, &distcode);
}
/*
 * Decode a block that carries its own (dynamic) Huffman code descriptions.
 *
 * Returns the result of puff_codes() on success.  Negative results identify
 * a malformed header: -3 too many length/distance codes, -4 bad code-length
 * code, -5 repeat with no previous length, -6 repeat running past the end,
 * -7 bad literal/length code, -8 bad distance code, -9 no end-of-block code.
 * A negative value from puff_decode() is passed through unchanged.
 */
static int puff_dynamic(struct puff_state* s)
{
/* order in which the code-length code lengths appear in the stream */
static const short order[19] =
{16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15};
int nlen = puff_bits(s, 5) + 257;
int ndist = puff_bits(s, 5) + 1;
int ncode = puff_bits(s, 4) + 4;
if (nlen > MAXLCODES || ndist > MAXDCODES)
return -3;
/* read the code-length code lengths; the ones not sent are zero */
short lengths[MAXCODES];
int index;
for (index = 0; index < ncode; index++)
lengths[order[index]] = puff_bits(s, 3);
for (; index < 19; index++)
lengths[order[index]] = 0;
/* lencode is first used for the code-length code, then rebuilt below */
short lencnt[MAXBITS + 1], lensym[MAXLCODES];
struct puff_huffman lencode = {lencnt, lensym};
int err = puff_construct(&lencode, lengths, 19);
if (err != 0)
return -4;
/* read the literal/length and distance code lengths into one array */
index = 0;
while (index < nlen + ndist) {
int symbol;
int len;
symbol = puff_decode(s, &lencode);
if (symbol < 0)
return symbol;
if (symbol < 16)
lengths[index++] = symbol;
else {
/* 16 repeats the previous length; 17 and 18 repeat zero */
len = 0;
if (symbol == 16) {
if (index == 0)
return -5;
len = lengths[index - 1];
symbol = 3 + puff_bits(s, 2);
} else if (symbol == 17)
symbol = 3 + puff_bits(s, 3);
else
symbol = 11 + puff_bits(s, 7);
if (index + symbol > nlen + ndist)
return -6;
while (symbol--)
lengths[index++] = len;
}
}
/* the end-of-block symbol must have a code */
if (lengths[256] == 0)
return -9;
/* an incomplete set is accepted only when it consists of a single 1-bit code */
err = puff_construct(&lencode, lengths, nlen);
if (err && (err < 0 || nlen != lencode.count[0] + lencode.count[1]))
return -7;
short distcnt[MAXBITS + 1], distsym[MAXDCODES];
struct puff_huffman distcode = {distcnt, distsym};
err = puff_construct(&distcode, lengths + nlen, ndist);
if (err && (err < 0 || ndist != distcode.count[0] + distcode.count[1]))
return -8;
return puff_codes(s, &lencode, &distcode);
}
/*
 * Inflate the raw deflate stream source[0..sourcelen) into dest.
 *
 * On entry *destlen is the capacity of dest; on return it holds the number
 * of output bytes accounted for.  Returns 0 on success, 2 if the input ran
 * out (reported via longjmp from the bit readers), -1 for a reserved block
 * type, or the non-zero code of the block decoder that failed.
 */
static int puff(
    unsigned char* dest,
    unsigned long* destlen,
    const unsigned char* source,
    unsigned long sourcelen)
{
	struct puff_state s = {
	    .out = dest,
	    .outlen = *destlen,
	    .outcnt = 0,
	    .in = source,
	    .inlen = sourcelen,
	    .incnt = 0,
	    .bitbuf = 0,
	    .bitcnt = 0,
	};
	int err;

	if (setjmp(s.env) != 0) {
		/* a reader hit the end of the input */
		err = 2;
	} else {
		for (;;) {
			int last = puff_bits(&s, 1);
			int type = puff_bits(&s, 2);

			switch (type) {
			case 0:
				err = puff_stored(&s);
				break;
			case 1:
				err = puff_fixed(&s);
				break;
			case 2:
				err = puff_dynamic(&s);
				break;
			default:
				err = -1;
				break;
			}
			if (err != 0 || last)
				break;
		}
	}

	*destlen = s.outcnt;
	return err;
}
#define ZLIB_HEADER_WIDTH 2
/*
 * Inflate a zlib-wrapped stream and write the result to dest_fd.
 *
 * The first ZLIB_HEADER_WIDTH bytes are skipped and the rest is handed to
 * puff(), decoding into an anonymous private mapping of 132 MiB.  Returns 0
 * without doing anything if the input is shorter than the header.  Returns -1
 * if the mapping, the decoding or the write fails; on a decoding failure
 * errno is set to the negated puff() result.  Otherwise returns the result
 * of the final munmap().
 */
static int puff_zlib_to_file(const unsigned char* source, unsigned long sourcelen, int dest_fd)
{
	if (sourcelen < ZLIB_HEADER_WIDTH)
		return 0;

	const unsigned char* deflate_data = source + ZLIB_HEADER_WIDTH;
	unsigned long deflate_len = sourcelen - ZLIB_HEADER_WIDTH;
	const unsigned long capacity = 132 << 20;

	void* mapping = mmap(0, capacity, PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);
	if (mapping == MAP_FAILED)
		return -1;

	unsigned char* image = (unsigned char*)mapping;
	unsigned long image_len = capacity;
	int puff_err = puff(image, &image_len, deflate_data, deflate_len);
	if (puff_err != 0) {
		munmap(image, capacity);
		errno = -puff_err;
		return -1;
	}

	ssize_t written = write(dest_fd, image, image_len);
	if (written == (ssize_t)image_len)
		return munmap(image, capacity);

	munmap(image, capacity);
	return -1;
}
/*
 * Decompress the image in data[0..size) into a memfd and attach that memfd
 * to the loop device loopname.
 *
 * If the first LOOP_SET_FD fails with EBUSY, the device is cleared with
 * LOOP_CLR_FD and the attach is retried once after a 1 ms pause.
 *
 * On success returns 0 and stores the open loop device descriptor in
 * *loopfd_p; the memfd is closed.  On failure returns -1 with errno set to
 * the error of the step that failed, and no descriptor is left open.
 */
static int setup_loop_device(unsigned char* data, unsigned long size, const char* loopname, int* loopfd_p)
{
	int saved_errno = 0;
	int loopfd = -1;
	int memfd = syscall(__NR_memfd_create, "syzkaller", 0);

	if (memfd == -1) {
		saved_errno = errno;
		goto fail;
	}
	if (puff_zlib_to_file(data, size, memfd) != 0) {
		saved_errno = errno;
		goto fail_close_memfd;
	}

	loopfd = open(loopname, O_RDWR);
	if (loopfd == -1) {
		saved_errno = errno;
		goto fail_close_memfd;
	}

	bool attached = ioctl(loopfd, LOOP_SET_FD, memfd) == 0;
	if (!attached && errno == EBUSY) {
		/* device still bound to something else: detach and try once more */
		ioctl(loopfd, LOOP_CLR_FD, 0);
		usleep(1000);
		attached = ioctl(loopfd, LOOP_SET_FD, memfd) == 0;
	}
	if (!attached) {
		saved_errno = errno;
		goto fail_close_loop;
	}

	close(memfd);
	*loopfd_p = loopfd;
	return 0;

fail_close_loop:
	close(loopfd);
fail_close_memfd:
	close(memfd);
fail:
	errno = saved_errno;
	return -1;
}
/*
 * Best-effort detach of whatever backs the loop device loopname: open it,
 * issue LOOP_CLR_FD and close it.  Failures at any step are ignored.
 */
static void reset_loop_device(const char* loopname)
{
	int fd = open(loopname, O_RDWR);

	if (fd != -1) {
		(void)ioctl(fd, LOOP_CLR_FD, 0);
		close(fd);
	}
}
/*
 * Mount a filesystem image.
 *
 * The arguments are raw machine words: fsarg, dir and optsarg are cast to
 * C strings (filesystem type, mount point, mount options) and image to the
 * compressed image bytes of length size.  When size is non-zero the image is
 * attached to /dev/loop<procid> and that device is used as the mount source;
 * otherwise the source is NULL.
 *
 * Returns -1 with errno set if the loop setup, mount() or open() fails.
 * Otherwise returns the descriptor from open(target) when change_dir is
 * zero, or the result of chdir(target) when change_dir is non-zero.
 */
static long syz_mount_image(
volatile long fsarg,
volatile long dir,
volatile long flags,
volatile long optsarg,
volatile long change_dir,
volatile unsigned long size,
volatile long image)
{
unsigned char* data = (unsigned char*)image;
int res = -1, err = 0, need_loop_device = !!size;
char* mount_opts = (char*)optsarg;
char* target = (char*)dir;
char* fs = (char*)fsarg;
char* source = NULL;
char loopname[64];
if (need_loop_device) {
int loopfd;
memset(loopname, 0, sizeof(loopname));
snprintf(loopname, sizeof(loopname), "/dev/loop%llu", procid);
if (setup_loop_device(data, size, loopname, &loopfd) == -1)
return -1;
close(loopfd);
source = loopname;
}
/* result ignored: the mount point may already exist */
mkdir(target, 0777);
char opts[256];
memset(opts, 0, sizeof(opts));
if (strlen(mount_opts) > (sizeof(opts) - 32)) {
}
/*
 * opts was zero-filled and at most sizeof(opts) - 32 bytes are copied, so it
 * stays NUL-terminated with 32 bytes spare for the option appended below.
 */
strncpy(opts, mount_opts, sizeof(opts) - 32);
if (strcmp(fs, "iso9660") == 0) {
flags |= MS_RDONLY;
} else if (strncmp(fs, "ext", 3) == 0) {
/* look for "errors=remount-ro" as a whole comma-delimited option */
bool has_remount_ro = false;
char* remount_ro_start = strstr(opts, "errors=remount-ro");
if (remount_ro_start != NULL) {
char after = *(remount_ro_start + strlen("errors=remount-ro"));
char before = remount_ro_start == opts ? '\0' : *(remount_ro_start - 1);
has_remount_ro = ((before == '\0' || before == ',') && (after == '\0' || after == ','));
}
if (strstr(opts, "errors=panic") || !has_remount_ro)
strcat(opts, ",errors=continue");
} else if (strcmp(fs, "xfs") == 0) {
strcat(opts, ",nouuid");
} else if (strncmp(fs, "gfs2", 4) == 0 && (strstr(opts, "errors=panic") || strstr(opts, "debug"))) {
strcat(opts, ",errors=withdraw");
}
res = mount(source, target, fs, flags, opts);
if (res == -1) {
err = errno;
goto error_clear_loop;
}
res = open(target, O_RDONLY | O_DIRECTORY);
if (res == -1) {
err = errno;
goto error_clear_loop;
}
if (change_dir) {
/* res now carries the chdir() result instead of the descriptor */
res = chdir(target);
if (res == -1) {
err = errno;
}
}
/* reached on success as well: the loop device is always reset here */
error_clear_loop:
if (need_loop_device)
reset_loop_device(loopname);
errno = err;
return res;
}
#define FS_IOC_SETFLAGS _IOW('f', 2, long)
/*
 * Recursively delete directory dir, force-unmounting anything mounted on it
 * or on its entries along the way.
 *
 * Entries that fail to unlink with EPERM get their inode flags cleared via
 * FS_IOC_SETFLAGS and are retried; EBUSY triggers an unmount and a retry.
 * EROFS is tolerated (the entry or directory is left in place).  Any other
 * failure, or running out of retries, terminates the process with exit
 * status 1.
 */
static void remove_dir(const char* dir)
{
int iter = 0;
DIR* dp = 0;
const int umount_flags = MNT_FORCE | UMOUNT_NOFOLLOW;
retry:
/* peel off every mount stacked on dir */
while (umount2(dir, umount_flags) == 0) {
}
dp = opendir(dir);
if (dp == NULL) {
if (errno == EMFILE) {
exit(1);
}
exit(1);
}
struct dirent* ep = 0;
while ((ep = readdir(dp))) {
if (strcmp(ep->d_name, ".") == 0 || strcmp(ep->d_name, "..") == 0)
continue;
char filename[FILENAME_MAX];
snprintf(filename, sizeof(filename), "%s/%s", dir, ep->d_name);
while (umount2(filename, umount_flags) == 0) {
}
struct stat st;
if (lstat(filename, &st))
exit(1);
if (S_ISDIR(st.st_mode)) {
remove_dir(filename);
continue;
}
int i;
for (i = 0;; i++) {
if (unlink(filename) == 0)
break;
if (errno == EPERM) {
/* clear the inode flags, then retry the unlink */
int fd = open(filename, O_RDONLY);
if (fd != -1) {
long flags = 0;
if (ioctl(fd, FS_IOC_SETFLAGS, &flags) == 0) {
}
close(fd);
continue;
}
}
if (errno == EROFS) {
break;
}
if (errno != EBUSY || i > 100)
exit(1);
if (umount2(filename, umount_flags))
exit(1);
}
}
closedir(dp);
/* remove the directory itself, with up to 100 recovery attempts */
for (int i = 0;; i++) {
if (rmdir(dir) == 0)
break;
if (i < 100) {
if (errno == EPERM) {
int fd = open(dir, O_RDONLY);
if (fd != -1) {
long flags = 0;
if (ioctl(fd, FS_IOC_SETFLAGS, &flags) == 0) {
}
close(fd);
continue;
}
}
if (errno == EROFS) {
break;
}
if (errno == EBUSY) {
if (umount2(dir, umount_flags))
exit(1);
continue;
}
if (errno == ENOTEMPTY) {
/* new entries appeared: rescan the directory, at most 100 times */
if (iter < 100) {
iter++;
goto retry;
}
}
}
exit(1);
}
}
/*
 * SIGKILL the process group of pid and pid itself, then reap pid.
 *
 * The child is first polled for up to 100 iterations, 1 ms apart.  If it has
 * not been reaped by then, one byte is written to the "abort" file of every
 * entry under /sys/fs/fuse/connections (best effort), after which the
 * function blocks in waitpid() until pid is reaped.  *status receives the
 * wait status.
 */
static void kill_and_wait(int pid, int* status)
{
	kill(-pid, SIGKILL);
	kill(pid, SIGKILL);

	int polls_left = 100;
	while (polls_left-- > 0) {
		if (waitpid(-1, status, WNOHANG | __WALL) == pid)
			return;
		usleep(1000);
	}

	DIR* conns = opendir("/sys/fs/fuse/connections");
	if (conns != NULL) {
		struct dirent* ent;

		while ((ent = readdir(conns)) != NULL) {
			if (!strcmp(ent->d_name, ".") || !strcmp(ent->d_name, ".."))
				continue;

			char abort_path[300];
			snprintf(abort_path, sizeof(abort_path), "/sys/fs/fuse/connections/%s/abort", ent->d_name);

			int fd = open(abort_path, O_WRONLY);
			if (fd != -1) {
				/* result intentionally ignored */
				if (write(fd, abort_path, 1) < 0) {
				}
				close(fd);
			}
		}
		closedir(conns);
	}

	for (;;) {
		if (waitpid(-1, status, __WALL) == pid)
			break;
	}
}
/*
 * Best-effort LOOP_CLR_FD on /dev/loop<procid>.  Nothing happens if the
 * device cannot be opened; the ioctl result is ignored.
 */
static void reset_loop()
{
	char loop_path[64];

	snprintf(loop_path, sizeof(loop_path), "/dev/loop%llu", procid);

	int fd = open(loop_path, O_RDWR);
	if (fd == -1)
		return;

	ioctl(fd, LOOP_CLR_FD, 0);
	close(fd);
}
/*
 * Per-test child setup: request SIGKILL when the parent dies, start a new
 * process group, write "1000" to /proc/self/oom_score_adj, and create a
 * ./binderfs symlink to /dev/binderfs.  Every step is best effort; no result
 * is checked.
 */
static void setup_test()
{
	prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
	setpgrp();
	write_file("/proc/self/oom_score_adj", "1000");

	/* symlink failure is tolerated */
	int link_rc = symlink("/dev/binderfs", "./binderfs");
	(void)link_rc;
}
#define USLEEP_FORKED_CHILD (3 * 50 *1000)
/*
 * Post-process the result of a clone-style call.
 *
 * A non-zero result is returned unchanged.  A zero result (the child side)
 * sleeps for USLEEP_FORKED_CHILD microseconds, issues the exit syscall with
 * status 0 and then spins in case that ever returns.
 */
static long handle_clone_ret(long ret)
{
	if (ret == 0) {
		usleep(USLEEP_FORKED_CHILD);
		syscall(__NR_exit, 0);
		for (;;) {
		}
	}
	return ret;
}
/*
 * Issue a raw clone syscall with CLONE_VM masked out of flags and the stack
 * pointer set to the 16-byte-aligned top of [stack, stack + stack_len), then
 * hand the result to handle_clone_ret().
 */
static long syz_clone(volatile long flags, volatile long stack, volatile long stack_len,
		      volatile long ptid, volatile long ctid, volatile long tls)
{
	long stack_top = (stack + stack_len) & ~15;
	long clone_ret = (long)syscall(__NR_clone, flags & ~CLONE_VM, stack_top, ptid, ctid, tls);

	return handle_clone_ret(clone_ret);
}
static void execute_one(void);
#define WAIT_FLAGS __WALL
/*
 * Run the test program forever, one forked child per iteration.
 *
 * Each iteration creates the working directory "./<iter>", resets the loop
 * device, forks a child that runs execute_one() inside that directory, and
 * polls every 10 ms for the child to exit.  A child still running after
 * 5000 ms is killed with kill_and_wait().  The working directory is removed
 * afterwards.  A failing mkdir(), fork() or chdir() exits with status 1.
 */
static void loop(void)
{
	for (int iter = 0;; iter++) {
		char workdir[32];

		sprintf(workdir, "./%d", iter);
		if (mkdir(workdir, 0777))
			exit(1);
		reset_loop();

		int pid = fork();
		if (pid < 0)
			exit(1);
		if (pid == 0) {
			/* child: run one test and leave */
			if (chdir(workdir))
				exit(1);
			setup_test();
			execute_one();
			exit(0);
		}

		int status = 0;
		uint64_t started = current_time_ms();
		bool reaped = false;
		while (!reaped) {
			sleep_ms(10);
			if (waitpid(-1, &status, WNOHANG | WAIT_FLAGS) == pid) {
				reaped = true;
			} else if (current_time_ms() - started >= 5000) {
				kill_and_wait(pid, &status);
				reaped = true;
			}
		}

		remove_dir(workdir);
	}
}
void execute_one(void)
{
if (write(1, "executing program\n", sizeof("executing program\n") - 1)) {}
memcpy((void*)0x200000000240, "ext2\000", 5);
memcpy((void*)0x200000000440, "./file1\000", 8);
*(uint8_t*)0x200000000480 = 0;
memcpy((void*)0x2000000008c0, "\x78\x9c\xec\xdc\x4f\x68\x1c\x55\x18\x00\xf0\x6f\x66\xbb\xd5\xb6\x89\xa9\x6d\xfd\x57\x95\x16\x14\x2a\x88\xf9\xab\x98\x9b\x54\xef\x0a\x56\xc1\x6b\x30\x9b\xb6\x74\xdb\x48\x12\xa9\x09\xb4\xd8\xb3\xe2\x21\x08\x0a\x9e\xf4\xec\xa1\x37\xf1\xa6\x37\x2f\x1e\x04\x0f\x82\x17\x0f\x05\x41\x28\xf6\xe0\xc1\xff\x2b\xb3\x33\xb3\xa6\xe9\x6e\x68\x93\xdd\xae\x76\x7e\x3f\x98\xdd\xf7\x66\x66\xf9\xde\xcb\xf2\x4d\xde\x7c\x64\x12\x40\x65\x1d\xcd\x5e\x92\x88\x91\x88\xf8\x3e\x22\xc6\xf2\xee\xf5\x27\x1c\xcd\xdf\xd6\x67\x2e\x9f\xca\xb6\x24\x5a\xad\x13\x3f\x27\xed\xf3\x2e\xce\x5c\x3e\x55\x9e\x5a\x7e\x6e\x5f\x44\xbc\x1d\x11\x77\x47\xc4\x81\x88\xa8\x75\x89\xbb\xbc\xba\x76\x66\xae\xd9\x6c\x2c\x15\xfd\x89\x95\xb3\x6f\x4c\x2c\xaf\xae\x3d\x75\xfa\xec\xdc\xc9\xc6\xc9\xc6\xb9\xa9\x99\x67\x9f\x9e\x9c\x7d\x66\x72\x76\xb6\x6f\x73\xbd\xfa\xe5\xbb\xaf\xc5\x2b\x2f\x4d\xbf\x77\xe4\x87\x4f\x5e\xf8\xfc\xe0\x85\x6c\xbc\x23\xc5\xb1\x8d\xf3\xe8\xa7\xa4\xc7\xfe\x07\x07\x11\x6c\x88\xf6\x0c\x7b\x00\x6c\x4b\x96\x9b\xbb\x22\xa2\xde\xce\xff\xb1\xa8\xb5\x7b\x40\x15\xb4\x5a\xad\x23\x5b\x1f\x06\xee\x5c\x89\x24\x87\x8a\x2a\x7f\xd1\x67\xf7\xbf\xe5\x76\x7b\x56\x1e\xb9\xab\xc7\xf3\x1b\x90\x8b\x45\x6d\x61\xbd\x13\x7f\x57\xa4\xc5\x39\x75\xf7\x97\xc0\x00\x7c\x91\x5d\x7f\x26\xbb\x5d\xff\xd2\xeb\x6a\x74\x7b\x8b\xba\xe6\x48\x44\x8c\x46\xc4\x3d\x45\xad\x74\x7f\x44\xdc\x5b\xd4\x39\x0f\x46\xc4\xa1\x88\xb8\xef\x16\xe2\x97\xd7\xbf\xf5\x1b\xae\x7f\x69\xe7\xfa\x57\x8b\x88\xfb\x77\x30\xc7\x77\x9a\xd7\xae\xf4\x8c\x7f\x69\x34\x0e\x77\x8d\x9f\x74\x2a\x41\x49\x76\x23\x18\x11\x0f\x6c\x33\xfe\x67\xcf\x5d\xf8\xaa\xd7\xb1\xd6\xc7\x11\xc7\xa2\x7b\xfc\xd8\x10\x7f\x8b\xfa\xf0\xc4\xc2\xe9\x66\x63\x32\x7f\xed\x1a\xe3\xf9\x57\x17\xbe\xee\x3d\xff\xfc\xbb\xbd\x21\xfe\x86\xa2\x75\x6d\x87\xf5\xda\x2b\x7b\x5e\x8e\xbf\xb6\x88\xff\xc4\x63\xdd\xbf\xff\x03\xc5\x39\xd9\xfc\xff\x8e\x88\xdf\x22\xe2\xf7\x88\xf8\x23\x22\xfe\x8c\x88\x87\x22\xe2\x70\x44\x3c\x1c\x11\x8f\x6c\x11\xff\xf8\x37\xdf\x2e\xf6\x3a\x96\xc5\x9f\xef\xf1\xf3\x4f\x37\xc4\
x7f\x74\x3b\x13\x2f\x9c\xff\xf1\xd3\x4b\x3b\xf8\x38\x00\xd0\x67\x69\x7b\x4d\x9b\xa4\xe3\x9d\x76\x9a\x8e\x8f\xe7\x6b\xdd\x43\xb1\x37\x6d\x2e\x2e\xaf\x3c\xb9\xb0\xf8\xe6\xb9\xf9\x7c\xed\xbb\x3f\xea\x69\xb9\xd2\xca\xd7\xbf\xf5\x24\xeb\x4f\x15\x6b\xe1\xb2\x3f\xbd\xa9\x3f\x53\xac\x93\xdf\xaf\xed\x69\xf7\xc7\x5f\x5f\x6c\xce\x0f\x7b\xf2\x50\x71\xfb\x36\xe5\xff\x2f\xb5\x3c\xff\x81\x8a\xf0\x27\x3f\x50\x5d\xf2\x1f\xaa\x4b\xfe\x43\x75\xc9\x7f\xa8\x2e\xf9\x0f\xd5\x25\xff\xa1\xba\xe4\x3f\x54\x97\xfc\x87\xea\x92\xff\x50\x5d\xf2\x1f\xaa\xeb\xe6\xf2\x7f\xf7\xc0\xc7\x01\xdc\x56\x23\x3d\x9e\xff\x19\xdd\xf0\xec\xce\x64\xf1\xbc\xfb\x77\xb5\xfa\x5d\xe5\xb3\x3e\xc0\xff\x5f\x63\xa9\xf3\x9f\x78\x37\x35\x86\x3d\x32\x60\xd0\xfe\x4d\xfa\x61\x8f\x04\x00\x00\x00\x00\xe8\xb7\x5e\xd5\xff\x7e\x36\x86\x3d\x47\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x80\x5b\x93\xfe\x94\x44\x44\xb6\x1d\x1b\x7b\x7c\x64\xf3\xd1\xdd\xc9\xaf\xb5\xf6\x7b\x44\x9c\xff\xe8\xc4\x07\x6f\xcd\xad\xac\x2c\x4d\x65\xfb\xaf\x75\xf6\xaf\x7c\x58\xec\x9f\x1e\xc6\xf8\x81\x9b\x55\xe6\x69\x99\xc7\x40\x75\x2d\xaf\xae\x9d\x99\x6b\x36\x1b\x4b\x1a\xfd\x68\xbc\xf8\xdf\x18\x86\xc6\x9d\xd2\xa8\xc7\x60\x43\xfc\x13\x00\
x00\xff\xff\x52\x90\x78\x22", 999);
syz_mount_image(/*fs=*/0x200000000240, /*dir=*/0x200000000440, /*flags=*/0, /*opts=*/0x200000000480, /*chdir=*/1, /*size=*/0x3e7, /*img=*/0x2000000008c0);
syz_clone(/*flags=*/0, /*stack=*/0, /*stack_len=*/0, /*parentid=*/0, /*childtid=*/0, /*tls=*/0);
}
/*
 * Entry point of the reproducer.
 *
 * Maps the fixed address range that execute_one() writes its arguments into:
 * a 16 MiB read/write/execute region at 0x200000000000, with one inaccessible
 * (prot 0) page mapped directly below and directly above it.  Then switches
 * to a temporary working directory and enters the test loop.
 *
 * The unused local `reason` was dropped: it was never initialised, and the
 * `(void)reason;` statement read it anyway.
 */
int main(void)
{
	syscall(__NR_mmap, /*addr=*/0x1ffffffff000ul, /*len=*/0x1000ul, /*prot=*/0ul, /*flags=MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE*/0x32ul, /*fd=*/(intptr_t)-1, /*offset=*/0ul);
	syscall(__NR_mmap, /*addr=*/0x200000000000ul, /*len=*/0x1000000ul, /*prot=PROT_WRITE|PROT_READ|PROT_EXEC*/7ul, /*flags=MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE*/0x32ul, /*fd=*/(intptr_t)-1, /*offset=*/0ul);
	syscall(__NR_mmap, /*addr=*/0x200001000000ul, /*len=*/0x1000ul, /*prot=*/0ul, /*flags=MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE*/0x32ul, /*fd=*/(intptr_t)-1, /*offset=*/0ul);
	use_temporary_dir();
	loop();
	return 0;
}
--- [Reproducer for PATCH 2/2: ext2_rename] ---
#define _GNU_SOURCE
#include <dirent.h>
#include <endian.h>
#include <errno.h>
#include <fcntl.h>
#include <setjmp.h>
#include <signal.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/prctl.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>
#include <linux/loop.h>
#ifndef __NR_memfd_create
#define __NR_memfd_create 319
#endif
static unsigned long long procid;
/*
 * Suspend the calling thread for approximately `ms` milliseconds.
 *
 * nanosleep() is used rather than usleep(): usleep() takes a useconds_t, so
 * `ms * 1000` would be silently narrowed for large values, and POSIX only
 * defines usleep() for arguments below 1,000,000 microseconds.
 *
 * The sleep may end early if a signal is delivered; the return value is
 * deliberately ignored because callers only need a best-effort delay.
 */
static void sleep_ms(uint64_t ms)
{
	struct timespec req;

	req.tv_sec = (time_t)(ms / 1000);
	req.tv_nsec = (long)(ms % 1000) * 1000000L;
	nanosleep(&req, NULL);
}
/*
 * Return the current CLOCK_MONOTONIC reading converted to milliseconds.
 * Terminates the process with exit status 1 if the clock cannot be read.
 */
static uint64_t current_time_ms(void)
{
	struct timespec now;
	int rc = clock_gettime(CLOCK_MONOTONIC, &now);

	if (rc != 0)
		exit(1);

	uint64_t from_sec = (uint64_t)now.tv_sec * 1000;
	uint64_t from_nsec = (uint64_t)now.tv_nsec / 1000000;
	return from_sec + from_nsec;
}
/*
 * Format a printf-style message (truncated to fit a 1024-byte buffer) and
 * write it to an existing file opened write-only with O_CLOEXEC.
 *
 * Returns true only if the whole message went out in a single write().
 * Returns false if the file cannot be opened or the write is short/failed;
 * in the latter case errno is preserved across the close().
 */
static bool write_file(const char* file, const char* what, ...)
{
	char text[1024];
	va_list ap;

	va_start(ap, what);
	vsnprintf(text, sizeof(text), what, ap);
	va_end(ap);
	text[sizeof(text) - 1] = 0;

	int text_len = strlen(text);
	int fd = open(file, O_WRONLY | O_CLOEXEC);
	if (fd == -1)
		return false;

	bool complete = write(fd, text, text_len) == text_len;
	if (!complete) {
		int saved_errno = errno;
		close(fd);
		errno = saved_errno;
		return false;
	}

	close(fd);
	return true;
}
#define MAXBITS 15
#define MAXLCODES 286
#define MAXDCODES 30
#define MAXCODES (MAXLCODES + MAXDCODES)
#define FIXLCODES 288
/*
 * Decoder state shared by the puff_* routines: the output and input buffers
 * with their cursors, the bit accumulator, and the longjmp target used when
 * the input is exhausted in the middle of a read.
 */
struct puff_state {
/* output buffer, its capacity (outlen) and the number of bytes produced (outcnt) */
unsigned char* out;
unsigned long outlen;
unsigned long outcnt;
/* input buffer, its length (inlen) and the number of bytes consumed (incnt) */
const unsigned char* in;
unsigned long inlen;
unsigned long incnt;
/* bits already fetched from the input but not yet consumed, and how many */
int bitbuf;
int bitcnt;
/* jump target for longjmp() when a read runs past the end of the input */
jmp_buf env;
};
/*
 * Return the next `need` bits of the input, least-significant bit first.
 *
 * Bits left over from the last byte fetched stay in s->bitbuf/s->bitcnt.
 * If the input is exhausted before `need` bits are available, control
 * leaves through longjmp(s->env, 1) and this function does not return.
 */
static int puff_bits(struct puff_state* s, int need)
{
	long acc = s->bitbuf;

	/* Pull in whole bytes until enough bits are buffered. */
	for (; s->bitcnt < need; s->bitcnt += 8) {
		if (s->incnt == s->inlen)
			longjmp(s->env, 1);
		long byte = (long)(s->in[s->incnt++]);
		acc |= byte << s->bitcnt;
	}

	const long mask = (1L << need) - 1;
	s->bitbuf = (int)(acc >> need);
	s->bitcnt -= need;
	return (int)(acc & mask);
}
/*
 * Copy one stored (uncompressed) block from the input to the output.
 *
 * Returns 0 on success, 2 if the input is too short, 1 if the output buffer
 * is too small, and -2 if the length field does not match its complement.
 * Zero bytes are skipped rather than stored, so the destination is expected
 * to be zero-filled already.
 */
static int puff_stored(struct puff_state* s)
{
	/* Drop any buffered bits; the length header is read byte-wise. */
	s->bitbuf = 0;
	s->bitcnt = 0;

	if (s->inlen < s->incnt + 4)
		return 2;

	/* 16-bit little-endian length followed by its one's complement. */
	unsigned len = s->in[s->incnt++];
	len |= s->in[s->incnt++] << 8;
	unsigned check_lo = s->in[s->incnt++];
	if (check_lo != (~len & 0xff))
		return -2;
	unsigned check_hi = s->in[s->incnt++];
	if (check_hi != ((~len >> 8) & 0xff))
		return -2;

	if (s->incnt + len > s->inlen)
		return 2;
	if (s->outcnt + len > s->outlen)
		return 1;

	while (len != 0) {
		unsigned char byte = s->in[s->incnt];
		if (byte != 0)
			s->out[s->outcnt] = byte;
		s->outcnt++;
		s->incnt++;
		len--;
	}
	return 0;
}
/* Huffman decoding table as filled in by puff_construct(). */
struct puff_huffman {
/* count[len] is the number of symbols whose code is len bits long. */
short* count;
/* Symbols with a non-zero code length, grouped by increasing code length. */
short* symbol;
};
/*
 * Decode one symbol from the input using the Huffman table `h`.
 *
 * Bits are consumed one at a time, first from the bits already buffered in
 * `s` and then from further input bytes. Returns the decoded symbol, or -10
 * if no code matched after MAXBITS bits. If the input runs out, control
 * leaves through longjmp(s->env, 1).
 */
static int puff_decode(struct puff_state* s, const struct puff_huffman* h)
{
int first = 0;
int index = 0;
int bitbuf = s->bitbuf;
int left = s->bitcnt;
int code = first = index = 0;
int len = 1;
/* Walk the per-length counts starting at length 1. */
short* next = h->count + 1;
while (1) {
while (left--) {
code |= bitbuf & 1;
bitbuf >>= 1;
int count = *next++;
/* The code read so far falls inside the range of codes of this length. */
if (code - count < first) {
s->bitbuf = bitbuf;
s->bitcnt = (s->bitcnt - len) & 7;
return h->symbol[index + (code - first)];
}
/* Not a code of this length: move on to the next longer length. */
index += count;
first += count;
first <<= 1;
code <<= 1;
len++;
}
/* Buffered bits are used up; stop once MAXBITS bits have been tried. */
left = (MAXBITS + 1) - len;
if (left == 0)
break;
if (s->incnt == s->inlen)
longjmp(s->env, 1);
bitbuf = s->in[s->incnt++];
if (left > 8)
left = 8;
}
return -10;
}
/*
 * Fill the decoding table `h` from the `n` code lengths in `length`.
 *
 * Returns 0 when every length is zero or the code set is complete, a
 * negative value when the lengths are over-subscribed, and a positive value
 * (the number of unused codes) when the set is incomplete. On the
 * over-subscribed path h->symbol is left untouched.
 */
static int puff_construct(struct puff_huffman* h, const short* length, int n)
{
	/* Histogram of code lengths. */
	memset(h->count, 0, (MAXBITS + 1) * sizeof(h->count[0]));
	for (int sym = 0; sym < n; sym++)
		h->count[length[sym]] += 1;
	if (h->count[0] == n)
		return 0;

	/* Track how many codes remain unassigned at each length. */
	int left = 1;
	for (int bits = 1; bits <= MAXBITS; bits++) {
		left = (left << 1) - h->count[bits];
		if (left < 0)
			return left;
	}

	/* Start offset in symbol[] of each code length. */
	short offs[MAXBITS + 1];
	offs[1] = 0;
	for (int bits = 2; bits <= MAXBITS; bits++)
		offs[bits] = offs[bits - 1] + h->count[bits - 1];

	/* Place every used symbol in the slot group of its code length. */
	for (int sym = 0; sym < n; sym++) {
		short bits = length[sym];
		if (bits == 0)
			continue;
		h->symbol[offs[bits]] = sym;
		offs[bits]++;
	}
	return left;
}
/*
 * Decode literal/length and distance codes until the end-of-block symbol
 * (256) is seen.
 *
 * Returns 0 at end of block, 1 when the output buffer is full, -10 for an
 * invalid length symbol, -11 when a distance reaches before the start of the
 * output, or the negative value returned by puff_decode(). Zero bytes are
 * never stored: the output position is advanced without writing.
 */
static int puff_codes(struct puff_state* s,
const struct puff_huffman* lencode,
const struct puff_huffman* distcode)
{
/* Base match length for length symbols 257..285. */
static const short lens[29] = {
3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31,
35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258};
/* Number of extra bits to read for each length symbol. */
static const short lext[29] = {
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,
3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0};
/* Base distance for distance symbols 0..29. */
static const short dists[30] = {
1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193,
257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145,
8193, 12289, 16385, 24577};
/* Number of extra bits to read for each distance symbol. */
static const short dext[30] = {
0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,
7, 7, 8, 8, 9, 9, 10, 10, 11, 11,
12, 12, 13, 13};
int symbol;
do {
symbol = puff_decode(s, lencode);
if (symbol < 0)
return symbol;
if (symbol < 256) {
/* Literal byte. */
if (s->outcnt == s->outlen)
return 1;
if (symbol)
s->out[s->outcnt] = symbol;
s->outcnt++;
} else if (symbol > 256) {
/* Length/distance pair: copy `len` bytes from `dist` bytes back. */
symbol -= 257;
if (symbol >= 29)
return -10;
int len = lens[symbol] + puff_bits(s, lext[symbol]);
symbol = puff_decode(s, distcode);
if (symbol < 0)
return symbol;
unsigned dist = dists[symbol] + puff_bits(s, dext[symbol]);
if (dist > s->outcnt)
return -11;
if (s->outcnt + len > s->outlen)
return 1;
while (len--) {
/* Only non-zero source bytes are copied. */
if (dist <= s->outcnt && s->out[s->outcnt - dist])
s->out[s->outcnt] = s->out[s->outcnt - dist];
s->outcnt++;
}
}
} while (symbol != 256);
return 0;
}
/*
 * Decode one block that uses the fixed Huffman tables.
 *
 * The tables are built on the first call and kept in static storage for
 * later calls. Returns whatever puff_codes() returns.
 */
static int puff_fixed(struct puff_state* s)
{
	static short lencnt[MAXBITS + 1], lensym[FIXLCODES];
	static short distcnt[MAXBITS + 1], distsym[MAXDCODES];
	static struct puff_huffman lencode, distcode;
	static int tables_pending = 1;

	if (tables_pending) {
		short lengths[FIXLCODES];
		int sym = 0;

		lencode.count = lencnt;
		lencode.symbol = lensym;
		distcode.count = distcnt;
		distcode.symbol = distsym;

		/* Literal/length code lengths: 8, 9, 7, 8 bits by symbol range. */
		while (sym < 144)
			lengths[sym++] = 8;
		while (sym < 256)
			lengths[sym++] = 9;
		while (sym < 280)
			lengths[sym++] = 7;
		while (sym < FIXLCODES)
			lengths[sym++] = 8;
		puff_construct(&lencode, lengths, FIXLCODES);

		/* Every distance code is 5 bits long. */
		for (sym = 0; sym < MAXDCODES; sym++)
			lengths[sym] = 5;
		puff_construct(&distcode, lengths, MAXDCODES);

		tables_pending = 0;
	}
	return puff_codes(s, &lencode, &distcode);
}
/*
 * Decode one block that carries its own (dynamic) Huffman tables.
 *
 * Reads the table description from the input, builds the literal/length and
 * distance tables, then hands off to puff_codes().
 *
 * Returns the puff_codes() result, a negative puff_decode() result, or:
 *   -3 too many length or distance codes
 *   -4 code-length code is not complete
 *   -5 repeat-previous code with no previous length
 *   -6 repeat count runs past the expected number of lengths
 *   -7 invalid literal/length code lengths
 *   -8 invalid distance code lengths
 *   -9 no code assigned to the end-of-block symbol (256)
 */
static int puff_dynamic(struct puff_state* s)
{
/* Order in which the code-length code lengths appear in the input. */
static const short order[19] =
{16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15};
int nlen = puff_bits(s, 5) + 257;
int ndist = puff_bits(s, 5) + 1;
int ncode = puff_bits(s, 4) + 4;
if (nlen > MAXLCODES || ndist > MAXDCODES)
return -3;
short lengths[MAXCODES];
int index;
/* Read the code-length code lengths; the ones not sent are zero. */
for (index = 0; index < ncode; index++)
lengths[order[index]] = puff_bits(s, 3);
for (; index < 19; index++)
lengths[order[index]] = 0;
short lencnt[MAXBITS + 1], lensym[MAXLCODES];
struct puff_huffman lencode = {lencnt, lensym};
int err = puff_construct(&lencode, lengths, 19);
if (err != 0)
return -4;
/* Decode nlen + ndist code lengths using the code-length code. */
index = 0;
while (index < nlen + ndist) {
int symbol;
int len;
symbol = puff_decode(s, &lencode);
if (symbol < 0)
return symbol;
if (symbol < 16)
lengths[index++] = symbol;
else {
/* 16 repeats the previous length; 17 and 18 repeat zero. */
len = 0;
if (symbol == 16) {
if (index == 0)
return -5;
len = lengths[index - 1];
symbol = 3 + puff_bits(s, 2);
} else if (symbol == 17)
symbol = 3 + puff_bits(s, 3);
else
symbol = 11 + puff_bits(s, 7);
if (index + symbol > nlen + ndist)
return -6;
while (symbol--)
lengths[index++] = len;
}
}
if (lengths[256] == 0)
return -9;
/* An incomplete code is accepted only when all used codes are one bit long. */
err = puff_construct(&lencode, lengths, nlen);
if (err && (err < 0 || nlen != lencode.count[0] + lencode.count[1]))
return -7;
short distcnt[MAXBITS + 1], distsym[MAXDCODES];
struct puff_huffman distcode = {distcnt, distsym};
err = puff_construct(&distcode, lengths + nlen, ndist);
if (err && (err < 0 || ndist != distcode.count[0] + distcode.count[1]))
return -8;
return puff_codes(s, &lencode, &distcode);
}
/*
 * Inflate the raw deflate stream `source` (`sourcelen` bytes) into `dest`.
 *
 * On entry *destlen is the capacity of `dest`; on return it holds the number
 * of output bytes produced, whether or not decoding succeeded.
 *
 * Returns 0 on success, 2 if the input ended early, -1 for an invalid block
 * type, or the non-zero code of the block decoder that failed.
 */
static int puff(
    unsigned char* dest,
    unsigned long* destlen,
    const unsigned char* source,
    unsigned long sourcelen)
{
	/* Members not named here (counters, bit buffer, env) start out zeroed. */
	struct puff_state s = {
	    .out = dest,
	    .outlen = *destlen,
	    .in = source,
	    .inlen = sourcelen,
	};
	int err;

	if (setjmp(s.env) == 0) {
		for (;;) {
			/* Block header: final-block flag, then the 2-bit block type. */
			int last = puff_bits(&s, 1);
			int type = puff_bits(&s, 2);

			switch (type) {
			case 0:
				err = puff_stored(&s);
				break;
			case 1:
				err = puff_fixed(&s);
				break;
			case 2:
				err = puff_dynamic(&s);
				break;
			default:
				err = -1;
				break;
			}
			if (err != 0 || last)
				break;
		}
	} else {
		/* Reached through longjmp(): a reader ran out of input. */
		err = 2;
	}

	*destlen = s.outcnt;
	return err;
}
/* Number of leading zlib header bytes skipped before the deflate data. */
#define ZLIB_HEADER_WIDTH 2
/*
 * Inflate a zlib-wrapped buffer and write the result to `dest_fd`.
 *
 * The first ZLIB_HEADER_WIDTH bytes are skipped without validation and the
 * rest is decoded with puff() into a temporary anonymous mapping of 132 MiB.
 *
 * Returns 0 on success (also when `sourcelen` is shorter than the header, in
 * which case nothing is written) and -1 on failure. After a decode failure
 * errno is set to the negated puff() code; after a failed write() errno is
 * the value write() left behind.
 */
static int puff_zlib_to_file(const unsigned char* source, unsigned long sourcelen, int dest_fd)
{
	if (sourcelen < ZLIB_HEADER_WIDTH)
		return 0;
	source += ZLIB_HEADER_WIDTH;
	sourcelen -= ZLIB_HEADER_WIDTH;

	const unsigned long max_destlen = 132 << 20;
	void* ret = mmap(0, max_destlen, PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);
	if (ret == MAP_FAILED)
		return -1;

	unsigned char* dest = (unsigned char*)ret;
	unsigned long destlen = max_destlen;
	int err = puff(dest, &destlen, source, sourcelen);
	if (err) {
		munmap(dest, max_destlen);
		errno = -err;
		return -1;
	}

	if (write(dest_fd, dest, destlen) != (ssize_t)destlen) {
		/*
		 * The caller reads errno after a -1 return, so keep the value from
		 * write() from being overwritten by the cleanup call.
		 */
		int write_errno = errno;
		munmap(dest, max_destlen);
		errno = write_errno;
		return -1;
	}
	return munmap(dest, max_destlen);
}
/*
 * Inflate the image in `data` into a memfd and attach it to the loop device
 * named by `loopname`.
 *
 * If the device is busy it is cleared once and the attach is retried after a
 * short sleep. On success the open loop descriptor is stored in *loopfd_p and
 * 0 is returned; on failure every descriptor opened here is closed, errno
 * holds the cause, and -1 is returned.
 */
static int setup_loop_device(unsigned char* data, unsigned long size, const char* loopname, int* loopfd_p)
{
	int saved_errno = 0;
	int loopfd = -1;
	int attach;

	int memfd = syscall(__NR_memfd_create, "syzkaller", 0);
	if (memfd == -1) {
		saved_errno = errno;
		goto fail;
	}
	if (puff_zlib_to_file(data, size, memfd) != 0) {
		saved_errno = errno;
		goto fail_memfd;
	}
	loopfd = open(loopname, O_RDWR);
	if (loopfd == -1) {
		saved_errno = errno;
		goto fail_memfd;
	}

	attach = ioctl(loopfd, LOOP_SET_FD, memfd);
	if (attach != 0 && errno == EBUSY) {
		/* Device still has a backing file: detach it and try once more. */
		ioctl(loopfd, LOOP_CLR_FD, 0);
		usleep(1000);
		attach = ioctl(loopfd, LOOP_SET_FD, memfd);
	}
	if (attach != 0) {
		saved_errno = errno;
		goto fail_loop;
	}

	close(memfd);
	*loopfd_p = loopfd;
	return 0;

fail_loop:
	close(loopfd);
fail_memfd:
	close(memfd);
fail:
	errno = saved_errno;
	return -1;
}
/*
 * Best-effort detach of whatever is bound to the loop device `loopname`.
 * Failures to open the device or to clear it are ignored.
 */
static void reset_loop_device(const char* loopname)
{
	int fd = open(loopname, O_RDWR);

	if (fd != -1) {
		(void)ioctl(fd, LOOP_CLR_FD, 0);
		close(fd);
	}
}
/*
 * Mount a filesystem image on `dir`.
 *
 * All arguments arrive as integers: `fsarg`, `dir`, `optsarg` and `image`
 * are addresses of the filesystem type string, the mount point path, the
 * mount option string and the compressed image; `size` is the image length
 * in bytes and `flags` is passed to mount().
 *
 * When `size` is non-zero the image is attached to /dev/loop<procid> through
 * setup_loop_device() and that device becomes the mount source; otherwise the
 * source is NULL. The loop device is cleared again before returning, on
 * success as well as on failure.
 *
 * Returns -1 on failure. On success returns the descriptor of the opened
 * mount point, or the chdir() result when `change_dir` is non-zero. errno is
 * set to the saved error code, which is 0 when nothing failed after the loop
 * setup.
 */
static long syz_mount_image(
volatile long fsarg,
volatile long dir,
volatile long flags,
volatile long optsarg,
volatile long change_dir,
volatile unsigned long size,
volatile long image)
{
unsigned char* data = (unsigned char*)image;
int res = -1, err = 0, need_loop_device = !!size;
char* mount_opts = (char*)optsarg;
char* target = (char*)dir;
char* fs = (char*)fsarg;
char* source = NULL;
char loopname[64];
if (need_loop_device) {
int loopfd;
memset(loopname, 0, sizeof(loopname));
snprintf(loopname, sizeof(loopname), "/dev/loop%llu", procid);
if (setup_loop_device(data, size, loopname, &loopfd) == -1)
return -1;
/* Only the device path is needed from here on. */
close(loopfd);
source = loopname;
}
/* The mkdir() result is not checked. */
mkdir(target, 0777);
char opts[256];
memset(opts, 0, sizeof(opts));
/* Empty body: over-long option strings are simply truncated by the copy below. */
if (strlen(mount_opts) > (sizeof(opts) - 32)) {
}
/* Keep 32 bytes free for the option suffixes appended below. */
strncpy(opts, mount_opts, sizeof(opts) - 32);
if (strcmp(fs, "iso9660") == 0) {
flags |= MS_RDONLY;
} else if (strncmp(fs, "ext", 3) == 0) {
/* Look for "errors=remount-ro" as a whole comma-separated option. */
bool has_remount_ro = false;
char* remount_ro_start = strstr(opts, "errors=remount-ro");
if (remount_ro_start != NULL) {
char after = *(remount_ro_start + strlen("errors=remount-ro"));
char before = remount_ro_start == opts ? '\0' : *(remount_ro_start - 1);
has_remount_ro = ((before == '\0' || before == ',') && (after == '\0' || after == ','));
}
/* Append errors=continue when panic is requested or remount-ro is absent. */
if (strstr(opts, "errors=panic") || !has_remount_ro)
strcat(opts, ",errors=continue");
} else if (strcmp(fs, "xfs") == 0) {
strcat(opts, ",nouuid");
} else if (strncmp(fs, "gfs2", 4) == 0 && (strstr(opts, "errors=panic") || strstr(opts, "debug"))) {
strcat(opts, ",errors=withdraw");
}
res = mount(source, target, fs, flags, opts);
if (res == -1) {
err = errno;
goto error_clear_loop;
}
res = open(target, O_RDONLY | O_DIRECTORY);
if (res == -1) {
err = errno;
goto error_clear_loop;
}
if (change_dir) {
/* res is overwritten here; the descriptor opened above is not closed. */
res = chdir(target);
if (res == -1) {
err = errno;
}
}
/* Reached on the success path as well. */
error_clear_loop:
if (need_loop_device)
reset_loop_device(loopname);
errno = err;
return res;
}
/*
 * Kill the process `pid` and its process group, then reap `pid`.
 *
 * The child is polled for up to 100 rounds of 1 ms. If it has not been
 * reaped by then, one byte is written to the "abort" file of every entry
 * under /sys/fs/fuse/connections and the function blocks in waitpid() until
 * `pid` is collected. The wait status is stored in *status.
 */
static void kill_and_wait(int pid, int* status)
{
	kill(-pid, SIGKILL);
	kill(pid, SIGKILL);

	int polls = 0;
	while (polls < 100) {
		if (waitpid(-1, status, WNOHANG | __WALL) == pid)
			return;
		usleep(1000);
		polls++;
	}

	DIR* conns = opendir("/sys/fs/fuse/connections");
	if (conns != NULL) {
		struct dirent* ent;

		while ((ent = readdir(conns)) != NULL) {
			const char* name = ent->d_name;
			if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0)
				continue;

			char path[300];
			snprintf(path, sizeof(path), "/sys/fs/fuse/connections/%s/abort", name);
			int fd = open(path, O_WRONLY);
			if (fd == -1)
				continue;
			/* Any one byte will do; the result is deliberately ignored. */
			ssize_t ignored = write(fd, path, 1);
			(void)ignored;
			close(fd);
		}
		closedir(conns);
	}

	for (;;) {
		if (waitpid(-1, status, __WALL) == pid)
			break;
	}
}
/*
 * Detach any backing file from this process's loop device
 * (/dev/loop<procid>). Does nothing if the device cannot be opened.
 */
static void reset_loop()
{
	char devpath[64];

	snprintf(devpath, sizeof(devpath), "/dev/loop%llu", procid);
	int fd = open(devpath, O_RDWR);
	if (fd == -1)
		return;
	ioctl(fd, LOOP_CLR_FD, 0);
	close(fd);
}
/*
 * Per-child setup run right after fork(), before the test body.
 * None of the three calls is checked for failure.
 */
static void setup_test()
{
	/* Have the kernel send SIGKILL to this child when its parent dies. */
	(void)prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
	/* Own process group, so kill(-pid, ...) in kill_and_wait() reaches it. */
	(void)setpgrp();
	/* Presumably marks the child as the preferred OOM-killer victim. */
	(void)write_file("/proc/self/oom_score_adj", "1000");
}
/* Forward declaration: the test body is defined after loop(), which calls it. */
static void execute_one(void);
/* waitpid() option flags used by loop() when polling for the child. */
#define WAIT_FLAGS __WALL
/*
 * Run the test body forever, one forked child per iteration.
 *
 * Each round clears the loop device, forks, and lets the child run
 * setup_test() and execute_one(). The parent polls every 10 ms and, if the
 * child has not been reaped within 5000 ms, kills it with kill_and_wait().
 * Exits with status 1 if fork() fails; otherwise never returns.
 */
static void loop(void)
{
	for (int iter = 0;; iter++) {
		reset_loop();

		int pid = fork();
		if (pid < 0)
			exit(1);
		if (pid == 0) {
			setup_test();
			execute_one();
			exit(0);
		}

		int status = 0;
		const uint64_t started = current_time_ms();
		bool finished = false;
		while (!finished) {
			sleep_ms(10);
			if (waitpid(-1, &status, WNOHANG | WAIT_FLAGS) == pid) {
				finished = true;
			} else if (current_time_ms() - started >= 5000) {
				kill_and_wait(pid, &status);
				finished = true;
			}
		}
	}
}
/*
 * Test body run in each forked child.
 *
 * Stages its arguments at fixed addresses, mounts the embedded image as
 * "ext2" on ./file0 through syz_mount_image() (with chdir requested), and
 * then issues rename("./file0/file0", "./file1"). No return value is checked.
 */
void execute_one(void)
{
if (write(1, "executing program\n", sizeof("executing program\n") - 1)) {}
/* Filesystem type, mount point and an empty option string for syz_mount_image(). */
memcpy((void*)0x200000000400, "ext2\000", 5);
memcpy((void*)0x200000000440, "./file0\000", 8);
*(uint8_t*)0x200000000480 = 0;
/* Image blob of 1002 (0x3ea) bytes, passed to syz_mount_image() as img/size. */
memcpy((void*)0x2000000004c0, "\x78\x9c\xec\xdc\xcd\x6b\x1c\x65\x1c\x07\xf0\xef\xce\x36\xda\x97\xd4\xd6\xb6\xbe\x55\xa5\x05\x85\x0a\x62\x9a\x36\x8a\xbd\x49\xf5\x0f\x10\xac\x82\xd7\x60\xd3\x17\xba\x6d\x24\x89\xd4\x16\x5a\xec\x5d\x3c\x14\x41\xc1\x93\x9e\x3d\xf4\x26\xde\xf4\xe6\xc5\x83\xe0\x41\xf0\xe2\xa1\x20\x08\xc5\x1e\x3c\xf8\xbe\x32\xbb\xb3\xeb\x36\xee\x06\xdb\x26\x6e\xec\x7c\x3e\x30\xbb\xcf\x33\x33\xcb\xef\x79\x92\xfc\x26\xcf\xfc\xc8\x24\x40\x6d\xed\x2d\x5f\x1a\xc9\x64\x92\x6f\x93\x6c\xeb\x76\x6f\x3c\x61\x6f\xf7\xed\xf2\xcc\x95\x13\xe5\xd6\x48\xbb\x7d\xe4\xc7\x46\xe7\xbc\x8b\x33\x57\x4e\xf4\x4e\xed\x7d\x6e\x4b\x92\xb7\x92\x6c\x4c\xb2\x23\x49\x73\x48\xdc\xc5\x73\xe7\x4f\xcd\xb6\x5a\x73\x0b\x55\x7f\xff\xd2\xe9\xd7\xf7\x2f\x9e\x3b\xff\xd4\xc9\xd3\xb3\xc7\xe7\x8e\xcf\x9d\x39\x30\xf3\xec\xd3\xd3\x87\x9e\x99\x3e\x74\x68\xd5\xe6\x7a\xed\xf3\xb7\x5f\xcd\xcb\x2f\x1e\x7c\x67\xcf\x77\x1f\xbd\xf0\xe9\xce\x0b\xe5\x78\x27\xab\x63\x83\xf3\x58\x4d\x8d\x11\xfb\x1f\x5c\x8b\x60\x63\xb4\x69\xdc\x03\xe0\x96\x94\xb9\xb9\x21\xc9\x44\x27\xff\xb7\xa5\xd9\xe9\x01\x75\xd0\x6e\xb7\xf7\xac\x7c\x18\xb8\x73\x35\x24\x39\xd4\x54\xef\x17\x7d\x79\xff\xdb\xdb\xfe\x9b\x95\x47\xd7\xb5\xc3\xdd\x1b\x90\x8b\x55\x6d\xe1\x72\x3f\xfe\x86\x14\xd5\x39\x13\xee\x2f\x81\x35\xf0\x59\x79\xfd\x99\x1e\x76\xfd\x2b\x6e\xa8\xd1\x6d\xae\xea\x9a\x93\x49\xb6\x26\xb9\xa7\xaa\x95\x6e\x4f\x72\x6f\x55\xe7\xdc\x99\x64\x57\x92\xfb\x6e\x22\x7e\xef\xfa\x77\xf9\x1f\xd7\xbf\xa2\x7f\xfd\x6b\x26\xb9\xff\x36\xe6\x38\xd1\xba\x7e\x75\xb0\x3f\x58\x8f\xbd\x76\x29\xd9\x3d\x34\x7e\xa3\x5f\x09\x6a\x94\x37\x82\x49\x1e\xb8\xc5\xf8\x9f\x3c\x77\xe1\x8b\x51\xc7\xda\x1f\x26\xfb\x32\x3c\x7e\x06\xe2\xaf\x50\x1f\xde\x7f\xec\x64\x6b\x6e\xba\xfb\x3a\x34\xc6\xf3\xaf\x1c\xfb\x72\x54\xfc\x72\xfe\x9b\x87\xc5\x2f\xd2\x9f\x7f\xf3\x36\xeb\xb5\x57\x37\xbd\x94\x3f\x56\x88\xff\xc4\x63\xc3\xbf\xff\x3b\x06\xe6\xff\x67\x92\x5f\x92\xfc\x9a\xe4\xb7\x24\xbf\x27\x79\x28\xc9\xee\x24\x0f\x27\x79\x64\x85\xf8\x87\xbf\xfa\x7a\x7e\xd4\xb1\x32\xfe\xd1\x11\x5f\xff\x62\x20\xfe\
xa3\xb7\x32\xf1\xca\xd9\xef\x3f\xbe\x74\x1b\x1f\x07\x00\x56\x59\xd1\x59\xd3\x36\x8a\xa9\x7e\xbb\x28\xa6\xa6\xba\x6b\xdd\x5d\xd9\x5c\xb4\xe6\x17\x97\x9e\x3c\x36\xff\xc6\x99\xa3\xdd\xb5\xef\xf6\x4c\x14\xbd\x95\x56\x77\xfd\x3b\xd1\x28\xfb\x07\xaa\xb5\x70\xaf\x7f\x70\x59\x7f\xa6\x5a\x27\xbf\xdb\xdc\xd4\xe9\x4f\xbd\x36\xdf\x3a\x3a\xee\xc9\x43\xcd\x6d\x59\x96\xff\x3f\x35\xbb\xf9\x0f\xd4\x84\x3f\xf9\x81\xfa\x92\xff\x50\x5f\xf2\x1f\xea\x4b\xfe\x43\x7d\xc9\x7f\xa8\x2f\xf9\x0f\xf5\x25\xff\xa1\xbe\xe4\x3f\xd4\x97\xfc\x87\xfa\x92\xff\x50\x5f\xf2\x1f\x6a\x69\x72\xc4\xf3\x3f\x5b\x07\x9e\xdd\x99\xae\x9e\x77\xff\xa6\x39\x71\x77\xef\x59\x1f\xe0\xff\x6f\x6e\xa1\xff\x9f\x78\x97\x35\xc6\x3d\x32\x60\xad\xfd\x9d\xf4\xe3\x1e\x09\x00\x00\x00\x00\xb0\xda\x46\x55\xff\x57\xb3\x31\xee\x39\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xdc\x9c\xe2\x87\x46\x92\x72\xdb\xb7\xed\xf1\xc9\xe5\x47\xef\x6a\xfc\xdc\xec\xbc\x27\x39\xfb\xc1\x91\xf7\xde\x9c\x5d\x5a\x5a\x38\x50\xee\xbf\xde\xdf\xbf\xf4\x7e\xb5\xff\xe0\x38\xc6\x0f\xfc\x5b\xbd\x3c\xed\xe5\x31\x50\x5f\x8b\xe7\xce\x9f\x9a\x6d\xb5\xe6\x16\xee\x9c\xc6\xc6\x24\xeb\x60\x18\x1a\xeb\xa9\xb1\xbd\xfa\x79\x5f\x2f\xe3\x59\x2f\x8d\
xbf\x02\x00\x00\xff\xff\x87\xb5\x73\xae", 1002);
syz_mount_image(/*fs=*/0x200000000400, /*dir=*/0x200000000440, /*flags=*/0, /*opts=*/0x200000000480, /*chdir=*/1, /*size=*/0x3ea, /*img=*/0x2000000004c0);
memcpy((void*)0x200000000000, "./file0/file0\000", 14);
memcpy((void*)0x2000000000c0, "./file1\000", 8);
syscall(__NR_rename, /*old=*/0x200000000000ul, /*new=*/0x2000000000c0ul);
}
/*
 * Entry point: create the fixed-address mappings the test body writes its
 * arguments into, then hand over to loop(), which does not return normally.
 */
int main(void)
{
	/*
	 * A read/write/exec region at 0x200000000000 with an inaccessible
	 * (prot 0) page directly before and after it.
	 */
	static const struct {
		unsigned long addr;
		unsigned long len;
		unsigned long prot;
	} regions[] = {
	    {0x1ffffffff000ul, 0x1000ul, 0ul},
	    {0x200000000000ul, 0x1000000ul, /*PROT_WRITE|PROT_READ|PROT_EXEC*/ 7ul},
	    {0x200001000000ul, 0x1000ul, 0ul},
	};

	for (size_t i = 0; i < sizeof(regions) / sizeof(regions[0]); i++)
		syscall(__NR_mmap, regions[i].addr, regions[i].len, regions[i].prot,
			/*flags=MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE*/ 0x32ul, /*fd=*/(intptr_t)-1, /*offset=*/0ul);

	/* Not used anywhere in this function. */
	const char* reason;
	(void)reason;

	loop();
	return 0;
}
--
2.50.1
^ permalink raw reply
* [PATCH 2/2] ext2: guard against zero i_nlink on new_inode in ext2_rename()
From: Vasiliy Kovalev @ 2026-04-01 22:08 UTC (permalink / raw)
To: Jan Kara, Andrew Morton, Alexey Dobriyan, linux-ext4
Cc: linux-kernel, lvc-project, kovalev
In-Reply-To: <20260401220837.2424925-1-kovalev@altlinux.org>
A crafted ext2 image can provide a target inode with i_links_count == 0
on disk. When rename() resolves to an existing target, ext2_rename()
calls drop_nlink(new_inode) for the directory case and
inode_dec_link_count(new_inode) unconditionally. Both reach
drop_nlink(), which triggers WARN_ON:
WARNING: CPU: 0 PID: 646 at fs/inode.c:336 drop_nlink+0xad/0xd0 fs/inode.c:336
CPU: 0 UID: 0 PID: 646 Comm: syz.0.17 Not tainted 6.12.77+ #1
Call Trace:
<TASK>
inode_dec_link_count include/linux/fs.h:2518 [inline]
ext2_rename+0x35e/0x850 fs/ext2/namei.c:374
vfs_rename+0xf2f/0x2060 fs/namei.c:5021
do_renameat2+0xbe2/0xd50 fs/namei.c:5178
__do_sys_rename fs/namei.c:5225 [inline]
__se_sys_rename fs/namei.c:5223 [inline]
__x64_sys_rename+0x7e/0xa0 fs/namei.c:5223
do_syscall_x64 arch/x86/entry/common.c:47 [inline]
do_syscall_64+0xf5/0x220 arch/x86/entry/common.c:78
entry_SYSCALL_64_after_hwframe+0x77/0x7f
</TASK>
No disk state has been modified at this point in the function, so
return -EFSCORRUPTED after reporting the corruption via ext2_error().
Found by Linux Verification Center (linuxtesting.org) with Syzkaller.
Cc: stable@vger.kernel.org
Fixes: 9a53c3a783c2 ("[PATCH] r/o bind mounts: unlink: monitor i_nlink")
Signed-off-by: Vasiliy Kovalev <kovalev@altlinux.org>
---
fs/ext2/namei.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index ea49e8f2b292..419e844f2e54 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -334,6 +334,13 @@ static int ext2_rename (struct mnt_idmap * idmap,
bool old_is_dir = S_ISDIR(old_inode->i_mode);
int err;
+ if (new_inode && new_inode->i_nlink == 0) {
+ ext2_error(old_dir->i_sb, __func__,
+ "target inode %lu has zero i_nlink, filesystem may be corrupt",
+ new_inode->i_ino);
+ return -EFSCORRUPTED;
+ }
+
if (flags & ~RENAME_NOREPLACE)
return -EINVAL;
--
2.50.1
^ permalink raw reply related
* Re: [PATCH v6 02/22] fsverity: expose ensure_fsverity_info()
From: Eric Biggers @ 2026-04-01 22:02 UTC (permalink / raw)
To: Andrey Albershteyn
Cc: linux-xfs, fsverity, linux-fsdevel, hch, linux-ext4,
linux-f2fs-devel, linux-btrfs, djwong
In-Reply-To: <20260331212827.2631020-3-aalbersh@kernel.org>
On Tue, Mar 31, 2026 at 11:28:03PM +0200, Andrey Albershteyn wrote:
> This function will be used by XFS's scrub to force fsverity activation
> and thereby read the fsverity context.
>
> Signed-off-by: Andrey Albershteyn <aalbersh@kernel.org>
> Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
> ---
Acked-by: Eric Biggers <ebiggers@kernel.org>
> +/**
> + * fsverity_ensure_verity_info() - create verity info if it's not in memory yet
> + * @inode: the inode for which verity info should be created
> + *
> + * Ensure this inode has verity info attached to it. Reads the fsverity
> + * descriptor and creates verity info based on that. Inodes opened outside of
> + * file_operations->open will not have any verity info attached. This
> + * info is required for any fsverity related operations.
> + *
> + * Return: 0 on success, -errno on failure
> + */
> +int fsverity_ensure_verity_info(struct inode *inode);
As Christoph mentioned, fs/verity/ uses the convention of the kerneldoc
for functions being above the function definition.
I think the comment could also be clearer:
> create verity info if it's not in memory yet
Maybe "cache verity info if it's not already cached", to avoid potential
confusion with enabling fsverity on the file.
> Ensure this inode has verity info attached to it.
Maybe add: "It's assumed the inode already has fsverity enabled."
> Inodes opened outside of file_operations->open will not have any
> verity info attached. This info is required for any fsverity
> related operations.
The first sentence could be misinterpreted as saying that this function
won't do anything in that case. The second sentence isn't clear about what
counts as "any fsverity related operation". Also "opened" doesn't seem
like the right word to use when talking about a filesystem-internal read
that occurs without a file descriptor having been opened.
Maybe replace with:
* This needs to be called at least once before any of the inode's data
* can be verified (and thus read at all) or the inode's fsverity digest
* retrieved. fsverity_file_open() calls this already, which handles
* normal file accesses. If a filesystem does any internal (i.e. not
* associated with a file descriptor) reads of the file's data or
* fsverity digest, it must call this explicitly before doing so.
By the way, should there be a patch that converts
ovl_ensure_verity_loaded() to use this?
- Eric
^ permalink raw reply
* Re: [PATCH v6 01/22] fsverity: report validation errors through fserror to fsnotify
From: Eric Biggers @ 2026-04-01 21:19 UTC (permalink / raw)
To: Andrey Albershteyn
Cc: linux-xfs, fsverity, linux-fsdevel, hch, linux-ext4,
linux-f2fs-devel, linux-btrfs, djwong
In-Reply-To: <20260331212827.2631020-2-aalbersh@kernel.org>
On Tue, Mar 31, 2026 at 11:28:02PM +0200, Andrey Albershteyn wrote:
> Report verification errors to fsnotify through the recently added fserror
> interface.
>
> Signed-off-by: Andrey Albershteyn <aalbersh@kernel.org>
> Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
> Reviewed-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/verity/verify.c | 4 ++++
> 1 file changed, 4 insertions(+)
Acked-by: Eric Biggers <ebiggers@kernel.org>
> if (memchr_inv(dblock->data, 0, params->block_size)) {
> + fserror_report_data_lost(inode, data_pos,
> + params->block_size, GFP_NOFS);
> fsverity_err(inode,
> "FILE CORRUPTED! Data past EOF is not zeroed");
> return false;
> @@ -312,6 +315,7 @@ static bool verify_data_block(struct fsverity_info *vi,
> data_pos, level - 1, params->hash_alg->name, hsize, want_hash,
> params->hash_alg->name, hsize,
> level == 0 ? dblock->real_hash : real_hash);
> + fserror_report_data_lost(inode, data_pos, params->block_size, GFP_NOFS);
The first case does the fserror call first and the log message second,
and the second case does them in the opposite order. I don't think it
matters what the order is, but they should be consistent.
- Eric
^ permalink raw reply
* Re: [PATCH v4 12/13] ext4: move pagecache_isize_extended() out of active handle
From: Jan Kara @ 2026-04-01 17:21 UTC (permalink / raw)
To: Zhang Yi
Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
jack, ojaswin, ritesh.list, libaokun, yi.zhang, yizhang089,
yangerkun, yukuai
In-Reply-To: <20260327102939.1095257-13-yi.zhang@huaweicloud.com>
On Fri 27-03-26 18:29:38, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
>
> In ext4_alloc_file_blocks(), pagecache_isize_extended() is called under
> an active handle and may also hold folio lock if the block size is
> smaller than the folio size. This also breaks the "folio lock ->
> transaction start" lock ordering for the upcoming iomap buffered I/O
> path.
>
> Therefore, move pagecache_isize_extended() outside of an active handle.
> Additionally, it is unnecessary to update the file length during each
> iteration of the allocation loop. Instead, update the file length only
> to the position where the allocation is successful. Postpone updating
> the inode size until after the allocation loop completes or is
> interrupted due to an error.
>
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Looks good! Feel free to add:
Reviewed-by: Jan Kara <jack@suse.cz>
Honza
> ---
> fs/ext4/extents.c | 62 +++++++++++++++++++++++++++++------------------
> 1 file changed, 39 insertions(+), 23 deletions(-)
>
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 7abe47f923c0..f13f604b1f67 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -4553,7 +4553,7 @@ static int ext4_alloc_file_blocks(struct file *file, loff_t offset, loff_t len,
> ext4_lblk_t len_lblk;
> struct ext4_map_blocks map;
> unsigned int credits;
> - loff_t epos, old_size = i_size_read(inode);
> + loff_t epos = 0, old_size = i_size_read(inode);
> unsigned int blkbits = inode->i_blkbits;
> bool alloc_zero = false;
>
> @@ -4618,44 +4618,60 @@ static int ext4_alloc_file_blocks(struct file *file, loff_t offset, loff_t len,
> ext4_journal_stop(handle);
> break;
> }
> + ext4_update_inode_fsync_trans(handle, inode, 1);
> + ret = ext4_journal_stop(handle);
> + if (unlikely(ret))
> + break;
> +
> /*
> * allow a full retry cycle for any remaining allocations
> */
> retries = 0;
> - epos = EXT4_LBLK_TO_B(inode, map.m_lblk + ret);
> - if (new_size) {
> - if (epos > new_size)
> - epos = new_size;
> - ext4_update_inode_size(inode, epos);
> - if (epos > old_size)
> - pagecache_isize_extended(inode, old_size, epos);
> - }
> - ret2 = ext4_mark_inode_dirty(handle, inode);
> - ext4_update_inode_fsync_trans(handle, inode, 1);
> - ret3 = ext4_journal_stop(handle);
> - ret2 = ret3 ? ret3 : ret2;
> - if (unlikely(ret2))
> - break;
>
> if (alloc_zero &&
> (map.m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN))) {
> - ret2 = ext4_issue_zeroout(inode, map.m_lblk, map.m_pblk,
> - map.m_len);
> - if (likely(!ret2))
> - ret2 = ext4_convert_unwritten_extents(NULL,
> + ret = ext4_issue_zeroout(inode, map.m_lblk, map.m_pblk,
> + map.m_len);
> + if (likely(!ret))
> + ret = ext4_convert_unwritten_extents(NULL,
> inode, (loff_t)map.m_lblk << blkbits,
> (loff_t)map.m_len << blkbits);
> - if (ret2)
> + if (ret)
> break;
> }
>
> - map.m_lblk += ret;
> - map.m_len = len_lblk = len_lblk - ret;
> + map.m_lblk += map.m_len;
> + map.m_len = len_lblk = len_lblk - map.m_len;
> + epos = EXT4_LBLK_TO_B(inode, map.m_lblk);
> }
> +
> if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
> goto retry;
>
> - return ret > 0 ? ret2 : ret;
> + if (!epos || !new_size)
> + return ret;
> +
> + /*
> + * Allocate blocks, update the file size to match the size of the
> + * already successfully allocated blocks.
> + */
> + if (epos > new_size)
> + epos = new_size;
> +
> + handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
> + if (IS_ERR(handle))
> + return ret ? ret : PTR_ERR(handle);
> +
> + ext4_update_inode_size(inode, epos);
> + ret2 = ext4_mark_inode_dirty(handle, inode);
> + ext4_update_inode_fsync_trans(handle, inode, 1);
> + ret3 = ext4_journal_stop(handle);
> + ret2 = ret3 ? ret3 : ret2;
> +
> + if (epos > old_size)
> + pagecache_isize_extended(inode, old_size, epos);
> +
> + return ret ? ret : ret2;
> }
>
> static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len);
> --
> 2.52.0
>
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox