* [PATCH 01/13] xfs: test health monitoring code
2026-03-03 0:33 ` [PATCHSET v8 2/2] fstests: autonomous self healing of filesystems Darrick J. Wong
@ 2026-03-03 0:41 ` Darrick J. Wong
2026-03-09 17:21 ` Zorro Lang
2026-03-03 0:41 ` [PATCH 02/13] xfs: test for metadata corruption error reporting via healthmon Darrick J. Wong
` (12 subsequent siblings)
13 siblings, 1 reply; 30+ messages in thread
From: Darrick J. Wong @ 2026-03-03 0:41 UTC (permalink / raw)
To: zlang, djwong; +Cc: hch, fstests, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Add some functionality tests for the new health monitoring code.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
doc/group-names.txt | 1 +
tests/xfs/1885 | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++
tests/xfs/1885.out | 5 +++++
3 files changed, 59 insertions(+)
create mode 100755 tests/xfs/1885
create mode 100644 tests/xfs/1885.out
diff --git a/doc/group-names.txt b/doc/group-names.txt
index 10b49e50517797..158f84d36d3154 100644
--- a/doc/group-names.txt
+++ b/doc/group-names.txt
@@ -117,6 +117,7 @@ samefs overlayfs when all layers are on the same fs
scrub filesystem metadata scrubbers
seed btrfs seeded filesystems
seek llseek functionality
+selfhealing self healing filesystem code
selftest tests with fixed results, used to validate testing setup
send btrfs send/receive
shrinkfs decreasing the size of a filesystem
diff --git a/tests/xfs/1885 b/tests/xfs/1885
new file mode 100755
index 00000000000000..1d75ef19c7c9d9
--- /dev/null
+++ b/tests/xfs/1885
@@ -0,0 +1,53 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
+#
+# FS QA Test 1885
+#
+# Make sure that healthmon handles module refcount correctly.
+#
+. ./common/preamble
+_begin_fstest auto selfhealing
+
+. ./common/filter
+. ./common/module
+
+refcount_file="/sys/module/xfs/refcnt"
+test -e "$refcount_file" || _notrun "cannot find xfs module refcount"
+
+_require_test
+_require_xfs_io_command healthmon
+
+# Capture mod refcount without the test fs mounted
+_test_unmount
+init_refcount="$(cat "$refcount_file")"
+
+# Capture mod refcount with the test fs mounted
+_test_mount
+nomon_mount_refcount="$(cat "$refcount_file")"
+
+# Capture mod refcount with test fs mounted and the healthmon fd open.
+# Pause the xfs_io process so that it doesn't actually respond to events.
+$XFS_IO_PROG -c 'healthmon -c -v' $TEST_DIR >> $seqres.full &
+sleep 0.5
+kill -STOP %1
+mon_mount_refcount="$(cat "$refcount_file")"
+
+# Capture mod refcount with only the healthmon fd open.
+_test_unmount
+mon_nomount_refcount="$(cat "$refcount_file")"
+
+# Capture mod refcount after continuing healthmon (which should exit due to the
+# unmount) and killing it.
+kill -CONT %1
+kill %1
+wait
+nomon_nomount_refcount="$(cat "$refcount_file")"
+
+_within_tolerance "mount refcount" "$nomon_mount_refcount" "$((init_refcount + 1))" 0 -v
+_within_tolerance "mount + healthmon refcount" "$mon_mount_refcount" "$((init_refcount + 2))" 0 -v
+_within_tolerance "healthmon refcount" "$mon_nomount_refcount" "$((init_refcount + 1))" 0 -v
+_within_tolerance "end refcount" "$nomon_nomount_refcount" "$init_refcount" 0 -v
+
+status=0
+exit
diff --git a/tests/xfs/1885.out b/tests/xfs/1885.out
new file mode 100644
index 00000000000000..f152cef0525609
--- /dev/null
+++ b/tests/xfs/1885.out
@@ -0,0 +1,5 @@
+QA output created by 1885
+mount refcount is in range
+mount + healthmon refcount is in range
+healthmon refcount is in range
+end refcount is in range
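The four _within_tolerance checks above encode a simple refcount ladder. As a plain-shell sketch of the expected arithmetic (the baseline value here is made up; the test reads the real one from /sys/module/xfs/refcnt):

```shell
# Sketch of the refcount ladder xfs/1885 asserts: a mounted fs holds one
# module reference, and an open healthmon fd holds another.
init=3                        # hypothetical baseline refcount
mounted=$((init + 1))         # after _test_mount
monitored=$((mounted + 1))    # after opening the healthmon fd
unmounted=$((monitored - 1))  # after _test_unmount (fd still open)
final=$((unmounted - 1))      # after the healthmon process exits
test "$final" -eq "$init" && echo "refcount balanced"
```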
^ permalink raw reply related [flat|nested] 30+ messages in thread

* Re: [PATCH 01/13] xfs: test health monitoring code
2026-03-03 0:41 ` [PATCH 01/13] xfs: test health monitoring code Darrick J. Wong
@ 2026-03-09 17:21 ` Zorro Lang
2026-03-09 18:03 ` Darrick J. Wong
0 siblings, 1 reply; 30+ messages in thread
From: Zorro Lang @ 2026-03-09 17:21 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: hch, fstests, linux-xfs
On Mon, Mar 02, 2026 at 04:41:07PM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
>
> Add some functionality tests for the new health monitoring code.
>
> Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> ---
> doc/group-names.txt | 1 +
> tests/xfs/1885 | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++
> tests/xfs/1885.out | 5 +++++
> 3 files changed, 59 insertions(+)
> create mode 100755 tests/xfs/1885
> create mode 100644 tests/xfs/1885.out
>
>
> diff --git a/doc/group-names.txt b/doc/group-names.txt
> index 10b49e50517797..158f84d36d3154 100644
> --- a/doc/group-names.txt
> +++ b/doc/group-names.txt
> @@ -117,6 +117,7 @@ samefs overlayfs when all layers are on the same fs
> scrub filesystem metadata scrubbers
> seed btrfs seeded filesystems
> seek llseek functionality
> +selfhealing self healing filesystem code
> selftest tests with fixed results, used to validate testing setup
> send btrfs send/receive
> shrinkfs decreasing the size of a filesystem
> diff --git a/tests/xfs/1885 b/tests/xfs/1885
> new file mode 100755
> index 00000000000000..1d75ef19c7c9d9
> --- /dev/null
> +++ b/tests/xfs/1885
> @@ -0,0 +1,53 @@
> +#! /bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
> +#
> +# FS QA Test 1885
> +#
> +# Make sure that healthmon handles module refcount correctly.
> +#
> +. ./common/preamble
> +_begin_fstest auto selfhealing
I found this test is quick enough; how about adding it to the "quick" group?
> +
> +. ./common/filter
> +. ./common/module
Which helper is this "module" file being included for?
> +
> +refcount_file="/sys/module/xfs/refcnt"
> +test -e "$refcount_file" || _notrun "cannot find xfs module refcount"
Or did you intend to add this part as a helper into common/module?
> +
> +_require_test
> +_require_xfs_io_command healthmon
> +
> +# Capture mod refcount without the test fs mounted
> +_test_unmount
> +init_refcount="$(cat "$refcount_file")"
> +
> +# Capture mod refcount with the test fs mounted
> +_test_mount
> +nomon_mount_refcount="$(cat "$refcount_file")"
> +
> +# Capture mod refcount with test fs mounted and the healthmon fd open.
> +# Pause the xfs_io process so that it doesn't actually respond to events.
> +$XFS_IO_PROG -c 'healthmon -c -v' $TEST_DIR >> $seqres.full &
> +sleep 0.5
> +kill -STOP %1
> +mon_mount_refcount="$(cat "$refcount_file")"
> +
> +# Capture mod refcount with only the healthmon fd open.
> +_test_unmount
> +mon_nomount_refcount="$(cat "$refcount_file")"
> +
> +# Capture mod refcount after continuing healthmon (which should exit due to the
> +# unmount) and killing it.
> +kill -CONT %1
> +kill %1
> +wait
We typically ensure that background processes are handled within the _cleanup function.
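As a minimal, self-contained sketch of that convention (the healer_pid variable and the trap wiring are illustrative, not fstests API, and a sleep stands in for the backgrounded xfs_io):

```shell
healer_pid=""

# fstests-style cleanup: reap the background monitor even if the test
# exits early, instead of relying on job specs like %1.
_cleanup() {
    if [ -n "$healer_pid" ]; then
        # a STOPped process ignores TERM until continued, so CONT first
        kill -CONT "$healer_pid" 2>/dev/null || true
        kill "$healer_pid" 2>/dev/null || true
        wait "$healer_pid" 2>/dev/null || true
    fi
}
trap _cleanup EXIT

sleep 30 &                 # stands in for: $XFS_IO_PROG -c 'healthmon -c -v' ... &
healer_pid=$!
kill -STOP "$healer_pid"   # freeze it, as the test does
_cleanup
kill -0 "$healer_pid" 2>/dev/null || echo "monitor reaped"
```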
> +nomon_nomount_refcount="$(cat "$refcount_file")"
> +
> +_within_tolerance "mount refcount" "$nomon_mount_refcount" "$((init_refcount + 1))" 0 -v
> +_within_tolerance "mount + healthmon refcount" "$mon_mount_refcount" "$((init_refcount + 2))" 0 -v
> +_within_tolerance "healthmon refcount" "$mon_nomount_refcount" "$((init_refcount + 1))" 0 -v
> +_within_tolerance "end refcount" "$nomon_nomount_refcount" "$init_refcount" 0 -v
> +
> +status=0
> +exit
_exit 0
* Re: [PATCH 01/13] xfs: test health monitoring code
2026-03-09 17:21 ` Zorro Lang
@ 2026-03-09 18:03 ` Darrick J. Wong
0 siblings, 0 replies; 30+ messages in thread
From: Darrick J. Wong @ 2026-03-09 18:03 UTC (permalink / raw)
To: Zorro Lang; +Cc: hch, fstests, linux-xfs
On Tue, Mar 10, 2026 at 01:21:14AM +0800, Zorro Lang wrote:
> On Mon, Mar 02, 2026 at 04:41:07PM -0800, Darrick J. Wong wrote:
> > From: Darrick J. Wong <djwong@kernel.org>
> >
> > Add some functionality tests for the new health monitoring code.
> >
> > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> > ---
> > doc/group-names.txt | 1 +
> > tests/xfs/1885 | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++
> > tests/xfs/1885.out | 5 +++++
> > 3 files changed, 59 insertions(+)
> > create mode 100755 tests/xfs/1885
> > create mode 100644 tests/xfs/1885.out
> >
> >
> > diff --git a/doc/group-names.txt b/doc/group-names.txt
> > index 10b49e50517797..158f84d36d3154 100644
> > --- a/doc/group-names.txt
> > +++ b/doc/group-names.txt
> > @@ -117,6 +117,7 @@ samefs overlayfs when all layers are on the same fs
> > scrub filesystem metadata scrubbers
> > seed btrfs seeded filesystems
> > seek llseek functionality
> > +selfhealing self healing filesystem code
> > selftest tests with fixed results, used to validate testing setup
> > send btrfs send/receive
> > shrinkfs decreasing the size of a filesystem
> > diff --git a/tests/xfs/1885 b/tests/xfs/1885
> > new file mode 100755
> > index 00000000000000..1d75ef19c7c9d9
> > --- /dev/null
> > +++ b/tests/xfs/1885
> > @@ -0,0 +1,53 @@
> > +#! /bin/bash
> > +# SPDX-License-Identifier: GPL-2.0
> > +# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
> > +#
> > +# FS QA Test 1885
> > +#
> > +# Make sure that healthmon handles module refcount correctly.
> > +#
> > +. ./common/preamble
> > +_begin_fstest auto selfhealing
>
> I found this test is quick enough; how about adding it to the "quick" group?
OK.
> > +
> > +. ./common/filter
> > +. ./common/module
>
> Which helper is this "module" file being included for?
I think at one point I would rmmod/modprobe the module to force the
refcount leak issue, but discovered there's a sysfs knob for that...
> > +
> > +refcount_file="/sys/module/xfs/refcnt"
> > +test -e "$refcount_file" || _notrun "cannot find xfs module refcount"
>
> Or did you intend to add this part as a helper into common/module?
...so this probably should get refactored into a new helper.
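One possible shape for such a helper, purely as a sketch (the function name is hypothetical and not an existing common/module helper; the optional root argument exists only so the sketch can be exercised without a loaded xfs module):

```shell
# Hypothetical common/module helper: read a module's refcount from sysfs,
# failing if the module is not loaded or exposes no refcnt attribute.
_get_module_refcount() {
    local mod="$1"
    local root="${2:-/sys/module}"   # overridable for demonstration only
    local f="$root/$mod/refcnt"
    test -e "$f" || return 1
    cat "$f"
}

# Exercise the helper against a throwaway stand-in for /sys/module
d=$(mktemp -d)
mkdir -p "$d/xfs"
echo 3 > "$d/xfs/refcnt"
_get_module_refcount xfs "$d"    # prints 3
rm -r "$d"
```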
> > +
> > +_require_test
> > +_require_xfs_io_command healthmon
> > +
> > +# Capture mod refcount without the test fs mounted
> > +_test_unmount
> > +init_refcount="$(cat "$refcount_file")"
> > +
> > +# Capture mod refcount with the test fs mounted
> > +_test_mount
> > +nomon_mount_refcount="$(cat "$refcount_file")"
> > +
> > +# Capture mod refcount with test fs mounted and the healthmon fd open.
> > +# Pause the xfs_io process so that it doesn't actually respond to events.
> > +$XFS_IO_PROG -c 'healthmon -c -v' $TEST_DIR >> $seqres.full &
> > +sleep 0.5
> > +kill -STOP %1
> > +mon_mount_refcount="$(cat "$refcount_file")"
> > +
> > +# Capture mod refcount with only the healthmon fd open.
> > +_test_unmount
> > +mon_nomount_refcount="$(cat "$refcount_file")"
> > +
> > +# Capture mod refcount after continuing healthmon (which should exit due to the
> > +# unmount) and killing it.
> > +kill -CONT %1
> > +kill %1
> > +wait
>
> We typically ensure that background processes are handled within the _cleanup function.
oops, will clean that up.
$XFS_IO_PROG -c 'healthmon -c -v' $TEST_DIR >> $seqres.full &
healer_pid=$!
...
kill $healer_pid
etc. Thanks for pointing that out.
--D
* [PATCH 02/13] xfs: test for metadata corruption error reporting via healthmon
2026-03-03 0:33 ` [PATCHSET v8 2/2] fstests: autonomous self healing of filesystems Darrick J. Wong
2026-03-03 0:41 ` [PATCH 01/13] xfs: test health monitoring code Darrick J. Wong
@ 2026-03-03 0:41 ` Darrick J. Wong
2026-03-03 0:41 ` [PATCH 03/13] xfs: test io " Darrick J. Wong
` (11 subsequent siblings)
13 siblings, 0 replies; 30+ messages in thread
From: Darrick J. Wong @ 2026-03-03 0:41 UTC (permalink / raw)
To: zlang, djwong; +Cc: hch, fstests, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Check if we can detect runtime metadata corruptions via the health
monitor.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
common/rc | 10 ++++++
tests/xfs/1879 | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++++
tests/xfs/1879.out | 8 ++++
3 files changed, 111 insertions(+)
create mode 100755 tests/xfs/1879
create mode 100644 tests/xfs/1879.out
diff --git a/common/rc b/common/rc
index fd4ca9641822cf..38d4b500b3b51f 100644
--- a/common/rc
+++ b/common/rc
@@ -3013,6 +3013,16 @@ _require_xfs_io_command()
echo $testio | grep -q "Inappropriate ioctl" && \
_notrun "xfs_io $command support is missing"
;;
+ "healthmon")
+ testio=`$XFS_IO_PROG -c "$command -p $param" $TEST_DIR 2>&1`
+ echo $testio | grep -q "bad argument count" && \
+ _notrun "xfs_io $command $param support is missing"
+ echo $testio | grep -q "Inappropriate ioctl" && \
+ _notrun "xfs_io $command $param ioctl support is missing"
+ echo $testio | grep -q "Operation not supported" && \
+ _notrun "xfs_io $command $param kernel support is missing"
+ param_checked="$param"
+ ;;
"label")
testio=`$XFS_IO_PROG -c "label" $TEST_DIR 2>&1`
;;
diff --git a/tests/xfs/1879 b/tests/xfs/1879
new file mode 100755
index 00000000000000..75bc8e3b5f4316
--- /dev/null
+++ b/tests/xfs/1879
@@ -0,0 +1,93 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
+#
+# FS QA Test No. 1879
+#
+# Corrupt some metadata and try to access it with the health monitoring program
+# running. Check that healthmon observes a metadata error.
+#
+. ./common/preamble
+_begin_fstest auto quick eio selfhealing
+
+_cleanup()
+{
+ cd /
+ rm -rf $tmp.* $testdir
+}
+
+. ./common/filter
+
+_require_scratch_nocheck
+_require_scratch_xfs_crc # can't detect minor corruption w/o crc
+_require_xfs_io_command healthmon
+
+# Disable the scratch rt device to avoid test failures relating to the rt
+# bitmap consuming all the free space in our small data device.
+unset SCRATCH_RTDEV
+
+echo "Format and mount"
+_scratch_mkfs -d agcount=1 | _filter_mkfs 2> $tmp.mkfs >> $seqres.full
+. $tmp.mkfs
+_scratch_mount
+mkdir $SCRATCH_MNT/a/
+# Enough entries to get to a single block directory
+for ((i = 0; i < ( (isize + 255) / 256); i++)); do
+ path="$(printf "%s/a/%0255d" "$SCRATCH_MNT" "$i")"
+ touch "$path"
+done
+inum="$(stat -c %i "$SCRATCH_MNT/a")"
+_scratch_unmount
+
+# Fuzz the directory block so that the touch below will be guaranteed to trip
+# a runtime sickness report in exactly the manner we desire.
+_scratch_xfs_db -x -c "inode $inum" -c "dblock 0" -c 'fuzz bhdr.hdr.owner add' -c print &>> $seqres.full
+
+# Try to allocate space to trigger a metadata corruption event
+echo "Runtime corruption detection"
+_scratch_mount
+$XFS_IO_PROG -c 'healthmon -c -v' $SCRATCH_MNT > $tmp.healthmon &
+sleep 1 # wait for program to start up
+touch $SCRATCH_MNT/a/farts &>> $seqres.full
+_scratch_unmount
+
+wait # for healthmon to finish
+
+# Did we get errors?
+check_healthmon()
+{
+ cat $tmp.healthmon >> $seqres.full
+ _filter_scratch < $tmp.healthmon | \
+ grep -E '(sick|corrupt)' | \
+ sed -e 's|SCRATCH_MNT/a|VICTIM|g' \
+ -e 's|SCRATCH_MNT ino [0-9]* gen 0x[0-9a-f]*|VICTIM|g' | \
+ sort | \
+ uniq
+}
+check_healthmon
+
+# Run scrub to trigger a health event from there too.
+echo "Scrub corruption detection"
+_scratch_mount
+if _supports_xfs_scrub $SCRATCH_MNT $SCRATCH_DEV; then
+ $XFS_IO_PROG -c 'healthmon -c -v' $SCRATCH_MNT > $tmp.healthmon &
+ sleep 1 # wait for program to start up
+ $XFS_SCRUB_PROG -n $SCRATCH_MNT &>> $seqres.full
+ _scratch_unmount
+
+ wait # for healthmon to finish
+
+ # Did we get errors?
+ check_healthmon
+else
+ # mock the output since we don't support scrub
+ _scratch_unmount
+ cat << ENDL
+VICTIM directory: corrupt
+VICTIM directory: sick
+VICTIM parent: corrupt
+ENDL
+fi
+
+status=0
+exit
diff --git a/tests/xfs/1879.out b/tests/xfs/1879.out
new file mode 100644
index 00000000000000..2f6acbe1c4fb22
--- /dev/null
+++ b/tests/xfs/1879.out
@@ -0,0 +1,8 @@
+QA output created by 1879
+Format and mount
+Runtime corruption detection
+VICTIM directory: sick
+Scrub corruption detection
+VICTIM directory: corrupt
+VICTIM directory: sick
+VICTIM parent: corrupt
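The normalization that check_healthmon performs can be exercised against canned healthmon-style output; the sample lines below are invented, while the grep/sed pipeline matches the test:

```shell
# Run the test's sick/corrupt filter over fabricated healthmon output to
# show how paths and ino/gen identifiers collapse to the VICTIM token and
# how sort|uniq folds repeated reports.
printf '%s\n' \
    'SCRATCH_MNT/a directory: sick' \
    'SCRATCH_MNT ino 131 gen 0x1234abcd parent: corrupt' \
    'SCRATCH_MNT/a directory: sick' \
    'some unrelated healthmon chatter' |
    grep -E '(sick|corrupt)' |
    sed -e 's|SCRATCH_MNT/a|VICTIM|g' \
        -e 's|SCRATCH_MNT ino [0-9]* gen 0x[0-9a-f]*|VICTIM|g' |
    sort | uniq
# Emits:
#   VICTIM directory: sick
#   VICTIM parent: corrupt
```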
* [PATCH 03/13] xfs: test io error reporting via healthmon
2026-03-03 0:33 ` [PATCHSET v8 2/2] fstests: autonomous self healing of filesystems Darrick J. Wong
2026-03-03 0:41 ` [PATCH 01/13] xfs: test health monitoring code Darrick J. Wong
2026-03-03 0:41 ` [PATCH 02/13] xfs: test for metadata corruption error reporting via healthmon Darrick J. Wong
@ 2026-03-03 0:41 ` Darrick J. Wong
2026-03-03 0:41 ` [PATCH 04/13] xfs: set up common code for testing xfs_healer Darrick J. Wong
` (10 subsequent siblings)
13 siblings, 0 replies; 30+ messages in thread
From: Darrick J. Wong @ 2026-03-03 0:41 UTC (permalink / raw)
To: zlang, djwong; +Cc: hch, fstests, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Create a new test to make sure the kernel can report IO errors via
health monitoring.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
tests/xfs/1878 | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++++
tests/xfs/1878.out | 10 ++++++
2 files changed, 103 insertions(+)
create mode 100755 tests/xfs/1878
create mode 100644 tests/xfs/1878.out
diff --git a/tests/xfs/1878 b/tests/xfs/1878
new file mode 100755
index 00000000000000..1ff6ae040fb193
--- /dev/null
+++ b/tests/xfs/1878
@@ -0,0 +1,93 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
+#
+# FS QA Test No. 1878
+#
+# Attempt to read and write a file in buffered and directio mode with the
+# health monitoring program running. Check that healthmon observes all four
+# types of IO errors.
+#
+. ./common/preamble
+_begin_fstest auto quick eio selfhealing
+
+_cleanup()
+{
+ cd /
+ rm -rf $tmp.* $testdir
+ _dmerror_cleanup
+}
+
+. ./common/filter
+. ./common/dmerror
+
+_require_scratch_nocheck
+_require_xfs_io_command healthmon
+_require_dm_target error
+
+filter_healer_errors() {
+ _filter_scratch | \
+ grep -E '(buffered|directio)' | \
+ sed \
+ -e 's/ino [0-9]*/ino NUM/g' \
+ -e 's/gen 0x[0-9a-f]*/gen NUM/g' \
+ -e 's/pos [0-9]*/pos NUM/g' \
+ -e 's/len [0-9]*/len NUM/g' \
+ -e 's|SCRATCH_MNT/a|VICTIM|g' \
+ -e 's|SCRATCH_MNT ino NUM gen NUM|VICTIM|g' | \
+ uniq
+}
+
+# Disable the scratch rt device to avoid test failures relating to the rt
+# bitmap consuming all the free space in our small data device.
+unset SCRATCH_RTDEV
+
+echo "Format and mount"
+_scratch_mkfs > $seqres.full 2>&1
+_dmerror_init no_log
+_dmerror_mount
+
+_require_fs_space $SCRATCH_MNT 65536
+
+# Create a file with written regions far enough apart that the pagecache can't
+# possibly be caching the regions with a single folio.
+testfile=$SCRATCH_MNT/fsync-err-test
+$XFS_IO_PROG -f \
+ -c 'pwrite -b 1m 0 1m' \
+ -c 'pwrite -b 1m 10g 1m' \
+ -c 'pwrite -b 1m 20g 1m' \
+ -c fsync $testfile >> $seqres.full
+
+# First we check if directio errors get reported
+$XFS_IO_PROG -c 'healthmon -c -v' $SCRATCH_MNT >> $tmp.healthmon &
+sleep 1 # wait for program to start up
+_dmerror_load_error_table
+$XFS_IO_PROG -d -c 'pwrite -b 256k 12k 16k' $testfile >> $seqres.full
+$XFS_IO_PROG -d -c 'pread -b 256k 10g 16k' $testfile >> $seqres.full
+_dmerror_load_working_table
+
+_dmerror_unmount
+wait # for healthmon to finish
+_dmerror_mount
+
+# Next we check if buffered io errors get reported. We have to write something
+# before loading the error table to ensure the dquots get loaded.
+$XFS_IO_PROG -c 'pwrite -b 256k 20g 1k' -c fsync $testfile >> $seqres.full
+$XFS_IO_PROG -c 'healthmon -c -v' $SCRATCH_MNT >> $tmp.healthmon &
+sleep 1 # wait for program to start up
+_dmerror_load_error_table
+$XFS_IO_PROG -c 'pread -b 256k 12k 16k' $testfile >> $seqres.full
+$XFS_IO_PROG -c 'pwrite -b 256k 20g 16k' -c fsync $testfile >> $seqres.full
+_dmerror_load_working_table
+
+_dmerror_unmount
+wait # for healthmon to finish
+
+# Did we get errors?
+cat $tmp.healthmon >> $seqres.full
+filter_healer_errors < $tmp.healthmon
+
+_dmerror_cleanup
+
+status=0
+exit
diff --git a/tests/xfs/1878.out b/tests/xfs/1878.out
new file mode 100644
index 00000000000000..f64c440b1a6ed1
--- /dev/null
+++ b/tests/xfs/1878.out
@@ -0,0 +1,10 @@
+QA output created by 1878
+Format and mount
+pwrite: Input/output error
+pread: Input/output error
+pread: Input/output error
+fsync: Input/output error
+VICTIM pos NUM len NUM: directio_write: Input/output error
+VICTIM pos NUM len NUM: directio_read: Input/output error
+VICTIM pos NUM len NUM: buffered_read: Input/output error
+VICTIM pos NUM len NUM: buffered_write: Input/output error
* [PATCH 04/13] xfs: set up common code for testing xfs_healer
2026-03-03 0:33 ` [PATCHSET v8 2/2] fstests: autonomous self healing of filesystems Darrick J. Wong
` (2 preceding siblings ...)
2026-03-03 0:41 ` [PATCH 03/13] xfs: test io " Darrick J. Wong
@ 2026-03-03 0:41 ` Darrick J. Wong
2026-03-03 0:42 ` [PATCH 05/13] xfs: test xfs_healer's event handling Darrick J. Wong
` (9 subsequent siblings)
13 siblings, 0 replies; 30+ messages in thread
From: Darrick J. Wong @ 2026-03-03 0:41 UTC (permalink / raw)
To: zlang, djwong; +Cc: hch, fstests, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Add a bunch of common code so that we can test the xfs_healer daemon.
Most of the changes here are to make it easier to manage the systemd
service units for xfs_healer and xfs_scrub.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
common/config | 14 +++++++
common/rc | 5 ++
common/systemd | 32 ++++++++++++++++
common/xfs | 114 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
tests/xfs/802 | 4 +-
5 files changed, 167 insertions(+), 2 deletions(-)
diff --git a/common/config b/common/config
index 1420e35ddfee42..8468a60081f50c 100644
--- a/common/config
+++ b/common/config
@@ -161,6 +161,20 @@ export XFS_ADMIN_PROG="$(type -P xfs_admin)"
export XFS_GROWFS_PROG=$(type -P xfs_growfs)
export XFS_SPACEMAN_PROG="$(type -P xfs_spaceman)"
export XFS_SCRUB_PROG="$(type -P xfs_scrub)"
+
+XFS_HEALER_PROG="$(type -P xfs_healer)"
+XFS_HEALER_START_PROG="$(type -P xfs_healer_start)"
+
+# If not found, try the ones installed in libexec
+if [ ! -x "$XFS_HEALER_PROG" ] && [ -e /usr/libexec/xfsprogs/xfs_healer ]; then
+ XFS_HEALER_PROG=/usr/libexec/xfsprogs/xfs_healer
+fi
+if [ ! -x "$XFS_HEALER_START_PROG" ] && [ -e /usr/libexec/xfsprogs/xfs_healer_start ]; then
+ XFS_HEALER_START_PROG=/usr/libexec/xfsprogs/xfs_healer_start
+fi
+export XFS_HEALER_PROG
+export XFS_HEALER_START_PROG
+
export XFS_PARALLEL_REPAIR_PROG="$(type -P xfs_prepair)"
export XFS_PARALLEL_REPAIR64_PROG="$(type -P xfs_prepair64)"
export __XFSDUMP_PROG="$(type -P xfsdump)"
diff --git a/common/rc b/common/rc
index 38d4b500b3b51f..91db0fb09da891 100644
--- a/common/rc
+++ b/common/rc
@@ -3026,6 +3026,11 @@ _require_xfs_io_command()
"label")
testio=`$XFS_IO_PROG -c "label" $TEST_DIR 2>&1`
;;
+ "verifymedia")
+ testio=`$XFS_IO_PROG -x -c "verifymedia $* 0 0" 2>&1`
+ echo $testio | grep -q "invalid option" && \
+ _notrun "xfs_io $command support is missing"
+ ;;
"open")
# -c "open $f" is broken in xfs_io <= 4.8. Along with the fix,
# a new -C flag was introduced to execute one shot commands.
diff --git a/common/systemd b/common/systemd
index b2e24f267b2d93..b4c77c78a8da44 100644
--- a/common/systemd
+++ b/common/systemd
@@ -44,6 +44,18 @@ _systemd_unit_active() {
test "$(systemctl is-active "$1")" = "active"
}
+# Wait for up to a certain number of seconds for a service to reach inactive
+# state.
+_systemd_unit_wait() {
+ local svcname="$1"
+ local timeout="${2:-30}"
+
+ for ((i = 0; i < (timeout * 2); i++)); do
+ test "$(systemctl is-active "$svcname")" = "inactive" && break
+ sleep 0.5
+ done
+}
+
_require_systemd_unit_active() {
_require_systemd_unit_defined "$1"
_systemd_unit_active "$1" || \
@@ -71,3 +83,23 @@ _systemd_unit_status() {
_systemd_installed || return 1
systemctl status "$1"
}
+
+# Start a running systemd unit
+_systemd_unit_start() {
+ systemctl start "$1"
+}
+# Stop a running systemd unit
+_systemd_unit_stop() {
+ systemctl stop "$1"
+}
+
+# Mask or unmask a running systemd unit
+_systemd_unit_mask() {
+ systemctl mask "$1"
+}
+_systemd_unit_unmask() {
+ systemctl unmask "$1"
+}
+_systemd_unit_masked() {
+ systemctl status "$1" 2>/dev/null | grep -q 'Loaded: masked'
+}
diff --git a/common/xfs b/common/xfs
index 7fa0db2e26b4c9..a4a538fde3f173 100644
--- a/common/xfs
+++ b/common/xfs
@@ -2301,3 +2301,117 @@ _filter_bmap_gno()
if ($ag =~ /\d+/) {print "$ag "} ;
'
}
+
+# Compute the systemd service instance name for a background service and path
+_xfs_systemd_svcname()
+{
+ local arg
+ local template
+ local out
+ local svc="$1"
+ shift
+
+ case "$svc" in
+ --scrub) arg="-s"; template="xfs_scrub@.service";;
+ --healer) arg="-h"; template="xfs_healer@.service";;
+ *) arg="-t $svc"; template="$svc";;
+ esac
+
+ # xfs_io should be able to do all the magic to make this work...
+ out="$($XFS_IO_PROG -c "svcname ${arg} $*" / 2>/dev/null)"
+ if [ -n "$out" ]; then
+ echo "$out"
+ return
+ fi
+
+ # ...but if not, we can fall back to brute force systemd invocations.
+ systemd-escape --template "$template" --path "$*"
+}
+
+# Compute the xfs_healer systemd service instance name for a given path
+_xfs_healer_svcname()
+{
+ _xfs_systemd_svcname --healer "$@"
+}
+
+# Compute the xfs_scrub systemd service instance name for a given path
+_xfs_scrub_svcname()
+{
+ _xfs_systemd_svcname --scrub "$@"
+}
+
+# Run the xfs_healer program on some filesystem
+_xfs_healer() {
+ $XFS_HEALER_PROG "$@"
+}
+
+# Run the xfs_healer program on the scratch fs
+_scratch_xfs_healer() {
+ _xfs_healer "$@" "$SCRATCH_MNT"
+}
+
+# Turn off the background xfs_healer service if any so that it doesn't fix
+# injected metadata errors; then start a background copy of xfs_healer to
+# capture that.
+_invoke_xfs_healer() {
+ local mount="$1"
+ local logfile="$2"
+ shift; shift
+
+ if _systemd_is_running; then
+ local svc="$(_xfs_healer_svcname "$mount")"
+ _systemd_unit_stop "$svc" &>> $seqres.full
+ fi
+
+ $XFS_HEALER_PROG "$mount" "$@" &> "$logfile" &
+ XFS_HEALER_PID=$!
+
+ # Wait 30s for the healer program to really start up
+ for ((i = 0; i < 60; i++)); do
+ test -e "$logfile" && \
+ grep -q 'monitoring started' "$logfile" && \
+ break
+ sleep 0.5
+ done
+}
+
+# Run our own copy of xfs_healer against the scratch device. Note that
+# unmounting the scratch fs causes the healer daemon to exit, so we don't need
+# to kill it explicitly from _cleanup.
+_scratch_invoke_xfs_healer() {
+ _invoke_xfs_healer "$SCRATCH_MNT" "$@"
+}
+
+# Unmount the filesystem to kill the xfs_healer instance started by
+# _invoke_xfs_healer, and wait up to a certain amount of time for it to exit.
+_kill_xfs_healer() {
+ local unmount="$1"
+ local timeout="${2:-30}"
+ local i
+
+ # Unmount fs to kill healer, then wait for it to finish
+ for ((i = 0; i < (timeout * 2); i++)); do
+ $unmount &>> $seqres.full && break
+ sleep 0.5
+ done
+
+ test -n "$XFS_HEALER_PID" && \
+ kill $XFS_HEALER_PID &>> $seqres.full
+ wait
+ unset XFS_HEALER_PID
+}
+
+# Unmount the scratch fs to kill a _scratch_invoke_xfs_healer instance.
+_scratch_kill_xfs_healer() {
+ local unmount="${1:-_scratch_unmount}"
+ shift
+
+ _kill_xfs_healer "$unmount" "$@"
+}
+
+# Does this mounted filesystem support xfs_healer?
+_require_xfs_healer()
+{
+ _xfs_healer --supported "$@" &>/dev/null || \
+ _notrun "health monitoring not supported on this kernel"
+}
diff --git a/tests/xfs/802 b/tests/xfs/802
index fc4767acb66a55..18312b15b645bd 100755
--- a/tests/xfs/802
+++ b/tests/xfs/802
@@ -105,8 +105,8 @@ run_scrub_service() {
}
echo "Scrub Scratch FS"
-scratch_path=$(systemd-escape --path "$SCRATCH_MNT")
-run_scrub_service xfs_scrub@$scratch_path
+svc="$(_xfs_scrub_svcname "$SCRATCH_MNT")"
+run_scrub_service "$svc"
find_scrub_trace "$SCRATCH_MNT"
# Remove the xfs_scrub_all media scan stamp directory (if specified) because we
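When xfs_io lacks the svcname command, _xfs_systemd_svcname in this patch falls back to systemd-escape. As a rough pure-shell approximation of that escaping (real systemd-escape also hex-escapes '-', '.', and other special characters, which this sketch ignores, and the function name is invented):

```shell
# Approximate systemd path escaping: drop the leading '/', turn the
# remaining separators into '-', and splice the result into the
# "name@.service" template.
path_to_instance() {
    local template="$1" path="$2"
    local esc="${path#/}"        # /mnt/scratch -> mnt/scratch
    esc="${esc//\//-}"           # mnt/scratch  -> mnt-scratch
    echo "${template/@./@${esc}.}"
}

path_to_instance "xfs_scrub@.service" "/mnt/scratch"    # xfs_scrub@mnt-scratch.service
path_to_instance "xfs_healer@.service" "/mnt/test"      # xfs_healer@mnt-test.service
```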
* [PATCH 05/13] xfs: test xfs_healer's event handling
2026-03-03 0:33 ` [PATCHSET v8 2/2] fstests: autonomous self healing of filesystems Darrick J. Wong
` (3 preceding siblings ...)
2026-03-03 0:41 ` [PATCH 04/13] xfs: set up common code for testing xfs_healer Darrick J. Wong
@ 2026-03-03 0:42 ` Darrick J. Wong
2026-03-03 0:42 ` [PATCH 06/13] xfs: test xfs_healer can fix a filesystem Darrick J. Wong
` (8 subsequent siblings)
13 siblings, 0 replies; 30+ messages in thread
From: Darrick J. Wong @ 2026-03-03 0:42 UTC (permalink / raw)
To: zlang, djwong; +Cc: hch, fstests, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Make sure that xfs_healer can handle every type of event that the kernel
can throw at it by initiating a full scrub of a test filesystem.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
tests/xfs/1882 | 44 ++++++++++++++++++++++++++++++++++++++++++++
tests/xfs/1882.out | 2 ++
2 files changed, 46 insertions(+)
create mode 100755 tests/xfs/1882
create mode 100644 tests/xfs/1882.out
diff --git a/tests/xfs/1882 b/tests/xfs/1882
new file mode 100755
index 00000000000000..2fb4589418401e
--- /dev/null
+++ b/tests/xfs/1882
@@ -0,0 +1,44 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
+#
+# FS QA Test 1882
+#
+# Make sure that xfs_healer correctly handles all the reports that it gets
+# from the kernel. We simulate this by using the --everything mode so we get
+# all the events, not just the sickness reports.
+#
+. ./common/preamble
+_begin_fstest auto selfhealing
+
+. ./common/filter
+. ./common/fuzzy
+. ./common/systemd
+. ./common/populate
+
+_require_scrub
+_require_xfs_io_command "scrub" # online check support
+_require_command "$XFS_HEALER_PROG" "xfs_healer"
+_require_scratch
+
+# Does this fs support health monitoring?
+_scratch_mkfs >> $seqres.full
+_scratch_mount
+_require_xfs_healer $SCRATCH_MNT
+_scratch_unmount
+
+# Create a sample fs with all the goodies
+_scratch_populate_cached nofill &>> $seqres.full
+_scratch_mount
+
+_scratch_invoke_xfs_healer "$tmp.healer" --everything
+
+# Run scrub to make some noise
+_scratch_scrub -b -n >> $seqres.full
+
+_scratch_kill_xfs_healer
+cat $tmp.healer >> $seqres.full
+
+echo Silence is golden
+status=0
+exit
diff --git a/tests/xfs/1882.out b/tests/xfs/1882.out
new file mode 100644
index 00000000000000..9b31ccb735cabd
--- /dev/null
+++ b/tests/xfs/1882.out
@@ -0,0 +1,2 @@
+QA output created by 1882
+Silence is golden
* [PATCH 06/13] xfs: test xfs_healer can fix a filesystem
2026-03-03 0:33 ` [PATCHSET v8 2/2] fstests: autonomous self healing of filesystems Darrick J. Wong
` (4 preceding siblings ...)
2026-03-03 0:42 ` [PATCH 05/13] xfs: test xfs_healer's event handling Darrick J. Wong
@ 2026-03-03 0:42 ` Darrick J. Wong
2026-03-03 0:42 ` [PATCH 07/13] xfs: test xfs_healer can report file I/O errors Darrick J. Wong
` (7 subsequent siblings)
13 siblings, 0 replies; 30+ messages in thread
From: Darrick J. Wong @ 2026-03-03 0:42 UTC (permalink / raw)
To: zlang, djwong; +Cc: hch, fstests, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Make sure that xfs_healer can actually fix an injected metadata corruption.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
tests/xfs/1884 | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++++
tests/xfs/1884.out | 2 +
2 files changed, 91 insertions(+)
create mode 100755 tests/xfs/1884
create mode 100644 tests/xfs/1884.out
diff --git a/tests/xfs/1884 b/tests/xfs/1884
new file mode 100755
index 00000000000000..1fa6457ad25203
--- /dev/null
+++ b/tests/xfs/1884
@@ -0,0 +1,89 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
+#
+# FS QA Test 1884
+#
+# Ensure that autonomous self healing fixes the filesystem correctly.
+#
+. ./common/preamble
+_begin_fstest auto selfhealing
+
+. ./common/filter
+. ./common/fuzzy
+. ./common/systemd
+
+_require_scrub
+_require_xfs_io_command "repair" # online repair support
+_require_xfs_db_command "blocktrash"
+_require_command "$XFS_HEALER_PROG" "xfs_healer"
+_require_command "$XFS_PROPERTY_PROG" "xfs_property"
+_require_scratch
+
+_scratch_mkfs >> $seqres.full
+_scratch_mount
+
+_xfs_has_feature $SCRATCH_MNT rmapbt || \
+ _notrun "reverse mapping required to test directory auto-repair"
+_xfs_has_feature $SCRATCH_MNT parent || \
+ _notrun "parent pointers required to test directory auto-repair"
+_require_xfs_healer $SCRATCH_MNT --repair
+
+# Configure the filesystem for automatic repair of the filesystem.
+$XFS_PROPERTY_PROG $SCRATCH_MNT set autofsck=repair >> $seqres.full
+
+# Create a largeish directory
+dblksz=$(_xfs_get_dir_blocksize "$SCRATCH_MNT")
+echo testdata > $SCRATCH_MNT/a
+mkdir -p "$SCRATCH_MNT/some/victimdir"
+for ((i = 0; i < (dblksz / 255); i++)); do
+ fname="$(printf "%0255d" "$i")"
+ ln $SCRATCH_MNT/a $SCRATCH_MNT/some/victimdir/$fname
+done
+
+# Did we get at least two dir blocks?
+dirsize=$(stat -c '%s' $SCRATCH_MNT/some/victimdir)
+test "$dirsize" -gt "$dblksz" || echo "failed to create two-block directory"
+
+# Break the directory, remount filesystem
+_scratch_unmount
+_scratch_xfs_db -x \
+ -c 'path /some/victimdir' \
+ -c 'bmap' \
+ -c 'dblock 1' \
+ -c 'blocktrash -z -0 -o 0 -x 2048 -y 2048 -n 2048' >> $seqres.full
+_scratch_mount
+
+_scratch_invoke_xfs_healer "$tmp.healer" --repair
+
+# Access the broken directory to trigger a repair, then poll the directory
+# for 5 seconds to see if it gets fixed without us needing to intervene.
+ls $SCRATCH_MNT/some/victimdir > /dev/null 2> $tmp.err
+_filter_scratch < $tmp.err
+try=0
+while [ $try -lt 50 ] && grep -q 'Structure needs cleaning' $tmp.err; do
+ echo "try $try saw corruption" >> $seqres.full
+ sleep 0.1
+ ls $SCRATCH_MNT/some/victimdir > /dev/null 2> $tmp.err
+ try=$((try + 1))
+done
+echo "try $try no longer saw corruption or gave up" >> $seqres.full
+_filter_scratch < $tmp.err
+
+# List the dirents of /some/victimdir to see if it stops reporting corruption
+ls $SCRATCH_MNT/some/victimdir > /dev/null 2> $tmp.err
+try=0
+while [ $try -lt 50 ] && grep -q 'Structure needs cleaning' $tmp.err; do
+ echo "retry $try still saw corruption" >> $seqres.full
+ sleep 0.1
+ ls $SCRATCH_MNT/some/victimdir > /dev/null 2> $tmp.err
+ try=$((try + 1))
+done
+echo "retry $try no longer saw corruption or gave up" >> $seqres.full
+
+# Unmount to kill the healer
+_scratch_kill_xfs_healer
+cat $tmp.healer >> $seqres.full
+
+status=0
+exit
diff --git a/tests/xfs/1884.out b/tests/xfs/1884.out
new file mode 100644
index 00000000000000..929e33da01f92c
--- /dev/null
+++ b/tests/xfs/1884.out
@@ -0,0 +1,2 @@
+QA output created by 1884
+ls: reading directory 'SCRATCH_MNT/some/victimdir': Structure needs cleaning
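[Editor's note] The repair-polling loops in the test above (retry the directory listing every 0.1s, up to 50 tries, until the EUCLEAN error clears) can be sketched as a standalone helper. This is an illustrative sketch only; `wait_for_repair` is a made-up name, not an fstests helper, and fractional `sleep` assumes GNU coreutils.

```shell
#!/bin/bash
# Hypothetical helper mirroring the retry loops above: list a directory
# repeatedly until "Structure needs cleaning" (EUCLEAN) stops appearing
# on stderr, or we run out of tries.
wait_for_repair() {
	local dir="$1" max_tries="${2:-50}" try=0 errfile
	errfile="$(mktemp)"
	ls "$dir" > /dev/null 2> "$errfile"
	while [ "$try" -lt "$max_tries" ] && \
		grep -q 'Structure needs cleaning' "$errfile"; do
		sleep 0.1	# give the healer a chance to run the repair
		ls "$dir" > /dev/null 2> "$errfile"
		try=$((try + 1))
	done
	# succeed only if the EUCLEAN report is gone
	! grep -q 'Structure needs cleaning' "$errfile"
	local ok=$?
	rm -f "$errfile"
	return $ok
}

# healthy directories pass immediately
wait_for_repair /tmp 3 && echo "no corruption reported"
```

The 50 x 0.1s bound matches the "poll the directory for 5 seconds" comment in the test; a healer that has not repaired the directory by then is treated as a failure.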
* [PATCH 07/13] xfs: test xfs_healer can report file I/O errors
2026-03-03 0:33 ` [PATCHSET v8 2/2] fstests: autonomous self healing of filesystems Darrick J. Wong
` (5 preceding siblings ...)
2026-03-03 0:42 ` [PATCH 06/13] xfs: test xfs_healer can fix a filesystem Darrick J. Wong
@ 2026-03-03 0:42 ` Darrick J. Wong
2026-03-03 0:42 ` [PATCH 08/13] xfs: test xfs_healer can report file media errors Darrick J. Wong
` (6 subsequent siblings)
13 siblings, 0 replies; 30+ messages in thread
From: Darrick J. Wong @ 2026-03-03 0:42 UTC (permalink / raw)
To: zlang, djwong; +Cc: hch, fstests, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Make sure that xfs_healer can actually report file I/O errors.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
tests/xfs/1896 | 210 ++++++++++++++++++++++++++++++++++++++++++++++++++++
tests/xfs/1896.out | 21 +++++
2 files changed, 231 insertions(+)
create mode 100755 tests/xfs/1896
create mode 100644 tests/xfs/1896.out
diff --git a/tests/xfs/1896 b/tests/xfs/1896
new file mode 100755
index 00000000000000..911e1d5ee8a576
--- /dev/null
+++ b/tests/xfs/1896
@@ -0,0 +1,210 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
+#
+# FS QA Test No. 1896
+#
+# Check that xfs_healer can report file IO errors.
+
+. ./common/preamble
+_begin_fstest auto quick scrub eio selfhealing
+
+# Override the default cleanup function.
+_cleanup()
+{
+ cd /
+ rm -f $tmp.*
+ _dmerror_cleanup
+}
+
+# Import common functions.
+. ./common/fuzzy
+. ./common/filter
+. ./common/dmerror
+. ./common/systemd
+
+_require_scratch
+_require_scrub
+_require_command "$XFS_HEALER_PROG" "xfs_healer"
+_require_dm_target error
+_require_no_xfs_always_cow # no out of place writes
+
+# Ignore everything from the healer except for the four IO error log messages.
+# Strip out file handle and range information because the blocksize can vary.
+# Writeback and readahead can trigger multiple error messages due to retries,
+# hence the uniq.
+filter_healer_errors() {
+ _filter_scratch | \
+ grep -E '(buffered|directio)' | \
+ sed \
+ -e 's/ino [0-9]*/ino NUM/g' \
+ -e 's/gen 0x[0-9a-f]*/gen NUM/g' \
+ -e 's/pos [0-9]*/pos NUM/g' \
+ -e 's/len [0-9]*/len NUM/g' \
+ -e 's|SCRATCH_MNT/a|VICTIM|g' \
+ -e 's|SCRATCH_MNT ino NUM gen NUM|VICTIM|g' | \
+ sort | \
+ uniq
+}
+
+_scratch_mkfs >> $seqres.full
+
+#
+# The dm-error map added by this test doesn't work on zoned devices because
+# table sizes need to be aligned to the zone size, and even for zoned on
+# conventional this test will get confused because of the internal RT device.
+#
+# That check requires a mounted file system, so do a dummy mount before setting
+# up DM.
+#
+_scratch_mount
+_require_xfs_scratch_non_zoned
+_require_xfs_healer $SCRATCH_MNT
+_scratch_unmount
+
+_dmerror_init
+_dmerror_mount >> $seqres.full 2>&1
+
+# Write a file with 4 file blocks worth of data, figure out the LBA to target
+victim=$SCRATCH_MNT/a
+file_blksz=$(_get_file_block_size $SCRATCH_MNT)
+$XFS_IO_PROG -f -c "pwrite -S 0x58 0 $((4 * file_blksz))" -c "fsync" $victim >> $seqres.full
+unset errordev
+
+awk_len_prog='{print $6}'
+if _xfs_is_realtime_file $victim; then
+ if ! _xfs_has_feature $SCRATCH_MNT rtgroups; then
+ awk_len_prog='{print $4}'
+ fi
+ errordev="RT"
+fi
+bmap_str="$($XFS_IO_PROG -c "bmap -elpv" $victim | grep "^[[:space:]]*0:")"
+echo "$errordev:$bmap_str" >> $seqres.full
+
+phys="$(echo "$bmap_str" | $AWK_PROG '{print $3}')"
+len="$(echo "$bmap_str" | $AWK_PROG "$awk_len_prog")"
+
+fs_blksz=$(_get_block_size $SCRATCH_MNT)
+echo "file_blksz:$file_blksz:fs_blksz:$fs_blksz" >> $seqres.full
+kernel_sectors_per_fs_block=$((fs_blksz / 512))
+
+# Did we get at least 4 fs blocks worth of extent?
+min_len_sectors=$(( 4 * kernel_sectors_per_fs_block ))
+test "$len" -lt $min_len_sectors && \
+ _fail "could not format a long enough extent on an empty fs??"
+
+phys_start=$(echo "$phys" | sed -e 's/\.\..*//g')
+
+echo "$errordev:$phys:$len:$fs_blksz:$phys_start" >> $seqres.full
+echo "victim file:" >> $seqres.full
+od -tx1 -Ad -c $victim >> $seqres.full
+
+# Set the dmerror table so that all IO will pass through.
+_dmerror_reset_table
+
+cat >> $seqres.full << ENDL
+dmerror before:
+$DMERROR_TABLE
+$DMERROR_RTTABLE
+<end table>
+ENDL
+
+# All sector numbers that we feed to the kernel must be in units of 512b, but
+# they also must be aligned to the device's logical block size.
+logical_block_size=`$here/src/min_dio_alignment $SCRATCH_MNT $SCRATCH_DEV`
+kernel_sectors_per_device_lba=$((logical_block_size / 512))
+
+# Mark one of the device LBAs in the middle of the extent as bad. Target the
+# second LBA of the third block of the four-block file extent that we allocated
+# earlier, but without overflowing into the fourth file block.
+bad_sector=$(( phys_start + (2 * kernel_sectors_per_fs_block) ))
+bad_len=$kernel_sectors_per_device_lba
+if (( kernel_sectors_per_device_lba < kernel_sectors_per_fs_block )); then
+ bad_sector=$((bad_sector + kernel_sectors_per_device_lba))
+fi
+if (( (bad_sector % kernel_sectors_per_device_lba) != 0)); then
+ echo "bad_sector $bad_sector not congruent with device logical block size $logical_block_size"
+fi
+
+# Remount to flush the page cache, start the healer, and make the LBA bad
+_dmerror_unmount
+_dmerror_mount
+
+_scratch_invoke_xfs_healer "$tmp.healer"
+
+_dmerror_mark_range_bad $bad_sector $bad_len $errordev
+
+cat >> $seqres.full << ENDL
+dmerror after marking bad:
+$DMERROR_TABLE
+$DMERROR_RTTABLE
+<end table>
+ENDL
+
+_dmerror_load_error_table
+
+# See if buffered reads pick it up
+echo "Try buffered read"
+$XFS_IO_PROG -c "pread 0 $((4 * file_blksz))" $victim >> $seqres.full
+
+# See if directio reads pick it up
+echo "Try directio read"
+$XFS_IO_PROG -d -c "pread 0 $((4 * file_blksz))" $victim >> $seqres.full
+
+# See if directio writes pick it up
+echo "Try directio write"
+$XFS_IO_PROG -d -c "pwrite -S 0x58 0 $((4 * file_blksz))" -c fsync $victim >> $seqres.full
+
+# See if buffered writes pick it up
+echo "Try buffered write"
+$XFS_IO_PROG -c "pwrite -S 0x58 0 $((4 * file_blksz))" -c fsync $victim >> $seqres.full
+
+# Now mark the bad range good so that unmount won't fail due to IO errors.
+echo "Fix device"
+_dmerror_mark_range_good $bad_sector $bad_len $errordev
+_dmerror_load_error_table
+
+cat >> $seqres.full << ENDL
+dmerror after marking good:
+$DMERROR_TABLE
+$DMERROR_RTTABLE
+<end table>
+ENDL
+
+# Unmount filesystem to start fresh
+echo "Kill healer"
+_scratch_kill_xfs_healer _dmerror_unmount
+cat $tmp.healer >> $seqres.full
+cat $tmp.healer | filter_healer_errors
+
+# Start the healer again so that we can verify the errors don't persist after
+# we flip back to the good dm table.
+echo "Remount and restart healer"
+_dmerror_mount
+_scratch_invoke_xfs_healer "$tmp.healer"
+
+# See if buffered reads pick it up
+echo "Try buffered read again"
+$XFS_IO_PROG -c "pread 0 $((4 * file_blksz))" $victim >> $seqres.full
+
+# See if directio reads pick it up
+echo "Try directio read again"
+$XFS_IO_PROG -d -c "pread 0 $((4 * file_blksz))" $victim >> $seqres.full
+
+# See if directio writes pick it up
+echo "Try directio write again"
+$XFS_IO_PROG -d -c "pwrite -S 0x58 0 $((4 * file_blksz))" -c fsync $victim >> $seqres.full
+
+# See if buffered writes pick it up
+echo "Try buffered write again"
+$XFS_IO_PROG -c "pwrite -S 0x58 0 $((4 * file_blksz))" -c fsync $victim >> $seqres.full
+
+# Unmount fs to kill healer, then wait for it to finish
+echo "Kill healer again"
+_scratch_kill_xfs_healer _dmerror_unmount
+cat $tmp.healer >> $seqres.full
+cat $tmp.healer | filter_healer_errors
+
+# success, all done
+status=0
+exit
diff --git a/tests/xfs/1896.out b/tests/xfs/1896.out
new file mode 100644
index 00000000000000..1378d4fad44522
--- /dev/null
+++ b/tests/xfs/1896.out
@@ -0,0 +1,21 @@
+QA output created by 1896
+Try buffered read
+pread: Input/output error
+Try directio read
+pread: Input/output error
+Try directio write
+pwrite: Input/output error
+Try buffered write
+fsync: Input/output error
+Fix device
+Kill healer
+VICTIM pos NUM len NUM: buffered_read: Input/output error
+VICTIM pos NUM len NUM: buffered_write: Input/output error
+VICTIM pos NUM len NUM: directio_read: Input/output error
+VICTIM pos NUM len NUM: directio_write: Input/output error
+Remount and restart healer
+Try buffered read again
+Try directio read again
+Try directio write again
+Try buffered write again
+Kill healer again
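[Editor's note] The bad-sector arithmetic in the test above can be checked in isolation. A minimal worked example with made-up numbers (4096-byte fs blocks, 512-byte device logical blocks, an extent starting at 512b sector 10000); none of these values come from a real device.

```shell
#!/bin/bash
# Illustrative inputs (assumptions, not measured values)
fs_blksz=4096
logical_block_size=512
phys_start=10000	# first 512b sector of the 4-block extent

# All sector numbers fed to the kernel are in 512b units, but must also
# be aligned to the device's logical block size.
kernel_sectors_per_fs_block=$((fs_blksz / 512))
kernel_sectors_per_device_lba=$((logical_block_size / 512))

# Start of the third fs block of the four-block extent...
bad_sector=$(( phys_start + (2 * kernel_sectors_per_fs_block) ))
bad_len=$kernel_sectors_per_device_lba
# ...then step one device LBA in, if an LBA is smaller than an fs block,
# so the bad range sits strictly inside the third block.
if (( kernel_sectors_per_device_lba < kernel_sectors_per_fs_block )); then
	bad_sector=$((bad_sector + kernel_sectors_per_device_lba))
fi
echo "$bad_sector $bad_len"
# → 10017 1
```

With these inputs the third block spans sectors 10016..10023, so sector 10017 with length 1 is inside it and trivially aligned to the 512b LBA, which is exactly what the alignment check in the test verifies.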
* [PATCH 08/13] xfs: test xfs_healer can report file media errors
2026-03-03 0:33 ` [PATCHSET v8 2/2] fstests: autonomous self healing of filesystems Darrick J. Wong
` (6 preceding siblings ...)
2026-03-03 0:42 ` [PATCH 07/13] xfs: test xfs_healer can report file I/O errors Darrick J. Wong
@ 2026-03-03 0:42 ` Darrick J. Wong
2026-03-03 0:43 ` [PATCH 09/13] xfs: test xfs_healer can report filesystem shutdowns Darrick J. Wong
` (5 subsequent siblings)
13 siblings, 0 replies; 30+ messages in thread
From: Darrick J. Wong @ 2026-03-03 0:42 UTC (permalink / raw)
To: zlang, djwong; +Cc: hch, fstests, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Make sure that xfs_healer can actually report media errors as found by the
kernel.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
tests/xfs/1897 | 172 ++++++++++++++++++++++++++++++++++++++++++++++++++++
tests/xfs/1897.out | 7 ++
2 files changed, 179 insertions(+)
create mode 100755 tests/xfs/1897
create mode 100644 tests/xfs/1897.out
diff --git a/tests/xfs/1897 b/tests/xfs/1897
new file mode 100755
index 00000000000000..4670c333a2d82c
--- /dev/null
+++ b/tests/xfs/1897
@@ -0,0 +1,172 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
+#
+# FS QA Test No. 1897
+#
+# Check that xfs_healer can report media errors.
+
+. ./common/preamble
+_begin_fstest auto quick scrub eio selfhealing
+
+_cleanup()
+{
+ cd /
+ rm -f $tmp.*
+ _dmerror_cleanup
+}
+
+. ./common/fuzzy
+. ./common/filter
+. ./common/dmerror
+. ./common/systemd
+
+_require_scratch
+_require_scrub
+_require_dm_target error
+_require_command "$XFS_HEALER_PROG" "xfs_healer"
+_require_xfs_io_command verifymedia
+
+filter_healer() {
+ _filter_scratch | \
+ grep -E '(media failed|media error)' | \
+ sed \
+ -e 's/datadev/DEVICE/g' \
+ -e 's/rtdev/DEVICE/g' \
+ -e 's/ino [0-9]*/ino NUM/g' \
+ -e 's/gen 0x[0-9a-f]*/gen NUM/g' \
+ -e 's/pos [0-9]*/pos NUM/g' \
+ -e 's/len [0-9]*/len NUM/g' \
+ -e 's/0x[0-9a-f]*/NUM/g' \
+ -e 's|SCRATCH_MNT/a|VICTIM|g' \
+ -e 's|SCRATCH_MNT ino NUM gen NUM|VICTIM|g'
+}
+
+filter_verify() {
+ sed -e 's/\([a-z]*dev\): verify error at offset \([0-9]*\) length \([0-9]*\)/DEVICE: verify error at offset XXX length XXX/g'
+}
+
+_scratch_mkfs >> $seqres.full
+
+# The dm-error map added by this test doesn't work on zoned devices because
+# table sizes need to be aligned to the zone size, and even for zoned on
+# conventional this test will get confused because of the internal RT device.
+#
+# That check requires a mounted file system, so do a dummy mount before setting
+# up DM.
+_scratch_mount
+_require_xfs_scratch_non_zoned
+_require_xfs_healer $SCRATCH_MNT
+_scratch_unmount
+
+_dmerror_init
+_dmerror_mount
+
+# Write a file with 4 file blocks worth of data, figure out the LBA to target
+victim=$SCRATCH_MNT/a
+file_blksz=$(_get_file_block_size $SCRATCH_MNT)
+$XFS_IO_PROG -f -c "pwrite -S 0x58 0 $((4 * file_blksz))" -c "fsync" $victim >> $seqres.full
+unset errordev
+verifymediadev="-d"
+
+awk_len_prog='{print $6}'
+if _xfs_is_realtime_file $victim; then
+ if ! _xfs_has_feature $SCRATCH_MNT rtgroups; then
+ awk_len_prog='{print $4}'
+ fi
+ errordev="RT"
+ verifymediadev="-r"
+fi
+bmap_str="$($XFS_IO_PROG -c "bmap -elpv" $victim | grep "^[[:space:]]*0:")"
+echo "$errordev:$bmap_str" >> $seqres.full
+
+phys="$(echo "$bmap_str" | $AWK_PROG '{print $3}')"
+len="$(echo "$bmap_str" | $AWK_PROG "$awk_len_prog")"
+
+fs_blksz=$(_get_block_size $SCRATCH_MNT)
+echo "file_blksz:$file_blksz:fs_blksz:$fs_blksz" >> $seqres.full
+kernel_sectors_per_fs_block=$((fs_blksz / 512))
+
+# Did we get at least 4 fs blocks worth of extent?
+min_len_sectors=$(( 4 * kernel_sectors_per_fs_block ))
+test "$len" -lt $min_len_sectors && \
+ _fail "could not format a long enough extent on an empty fs??"
+
+phys_start=$(echo "$phys" | sed -e 's/\.\..*//g')
+
+echo "$errordev:$phys:$len:$fs_blksz:$phys_start" >> $seqres.full
+echo "victim file:" >> $seqres.full
+od -tx1 -Ad -c $victim >> $seqres.full
+
+# Set the dmerror table so that all IO will pass through.
+_dmerror_reset_table
+
+cat >> $seqres.full << ENDL
+dmerror before:
+$DMERROR_TABLE
+$DMERROR_RTTABLE
+<end table>
+ENDL
+
+# All sector numbers that we feed to the kernel must be in units of 512b, but
+# they also must be aligned to the device's logical block size.
+logical_block_size=`$here/src/min_dio_alignment $SCRATCH_MNT $SCRATCH_DEV`
+kernel_sectors_per_device_lba=$((logical_block_size / 512))
+
+# Pretend that one of the device LBAs in the middle of the extent is bad.
+# Target the second LBA of the third block of the four-block file extent that
+# we allocated earlier, but without overflowing into the fourth file block.
+bad_sector=$(( phys_start + (2 * kernel_sectors_per_fs_block) ))
+bad_len=$kernel_sectors_per_device_lba
+if (( kernel_sectors_per_device_lba < kernel_sectors_per_fs_block )); then
+ bad_sector=$((bad_sector + kernel_sectors_per_device_lba))
+fi
+if (( (bad_sector % kernel_sectors_per_device_lba) != 0)); then
+ echo "bad_sector $bad_sector not congruent with device logical block size $logical_block_size"
+fi
+_dmerror_mark_range_bad $bad_sector $bad_len $errordev
+
+cat >> $seqres.full << ENDL
+dmerror after marking bad:
+$DMERROR_TABLE
+$DMERROR_RTTABLE
+<end table>
+ENDL
+
+_dmerror_load_error_table
+
+echo "Simulate media error"
+_scratch_invoke_xfs_healer "$tmp.healer"
+echo "verifymedia $verifymediadev -R $((bad_sector * 512)) $(((bad_sector + bad_len) * 512))" >> $seqres.full
+$XFS_IO_PROG -x -c "verifymedia $verifymediadev -R $((bad_sector * 512)) $(((bad_sector + bad_len) * 512))" $SCRATCH_MNT 2>&1 | filter_verify
+
+# Now mark the bad range good so that a retest shows no media failure.
+_dmerror_mark_range_good $bad_sector $bad_len $errordev
+_dmerror_load_error_table
+
+cat >> $seqres.full << ENDL
+dmerror after marking good:
+$DMERROR_TABLE
+$DMERROR_RTTABLE
+<end table>
+ENDL
+
+echo "No more media error"
+echo "verifymedia $verifymediadev -R $((bad_sector * 512)) $(((bad_sector + bad_len) * 512))" >> $seqres.full
+$XFS_IO_PROG -x -c "verifymedia $verifymediadev -R $((bad_sector * 512)) $(((bad_sector + bad_len) * 512))" $SCRATCH_MNT >> $seqres.full
+
+# Unmount filesystem to start fresh
+echo "Kill healer"
+_scratch_kill_xfs_healer _dmerror_unmount
+
+# Filesystems without rmap do not translate media errors to lost file ranges,
+# so fake the output.
+_xfs_has_feature "$SCRATCH_DEV" rmapbt || \
+ echo "VICTIM pos 0 len 0: media failed" >> $tmp.healer
+
+cat $tmp.healer >> $seqres.full
+cat $tmp.healer | filter_healer
+
+# success, all done
+status=0
+exit
diff --git a/tests/xfs/1897.out b/tests/xfs/1897.out
new file mode 100644
index 00000000000000..1bb615c3119dce
--- /dev/null
+++ b/tests/xfs/1897.out
@@ -0,0 +1,7 @@
+QA output created by 1897
+Simulate media error
+DEVICE: verify error at offset XXX length XXX: Input/output error
+No more media error
+Kill healer
+SCRATCH_MNT DEVICE daddr NUM bbcount NUM: media error
+VICTIM pos NUM len NUM: media failed
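[Editor's note] The `filter_healer` function above uses the standard fstests golden-output trick: normalize run-varying fields before comparing against the .out file. A minimal sketch of that idea; the sample log line is made up and the sed expressions are copied from the filter above.

```shell
#!/bin/bash
# Replace fields that vary from run to run (inode number, generation,
# position, length) with fixed tokens so the output compares stably
# against a golden .out file.
normalize() {
	sed -e 's/ino [0-9]*/ino NUM/g' \
	    -e 's/gen 0x[0-9a-f]*/gen NUM/g' \
	    -e 's/pos [0-9]*/pos NUM/g' \
	    -e 's/len [0-9]*/len NUM/g'
}

echo "victim ino 133 gen 0x1a2b pos 8192 len 4096: media failed" | normalize
# → victim ino NUM gen NUM pos NUM len NUM: media failed
```

Sorting and `uniq` (as in `filter_healer_errors` of xfs/1896) are layered on top when writeback or readahead retries can emit the same message more than once.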
* [PATCH 09/13] xfs: test xfs_healer can report filesystem shutdowns
2026-03-03 0:33 ` [PATCHSET v8 2/2] fstests: autonomous self healing of filesystems Darrick J. Wong
` (7 preceding siblings ...)
2026-03-03 0:42 ` [PATCH 08/13] xfs: test xfs_healer can report file media errors Darrick J. Wong
@ 2026-03-03 0:43 ` Darrick J. Wong
2026-03-03 0:43 ` [PATCH 10/13] xfs: test xfs_healer can initiate full filesystem repairs Darrick J. Wong
` (4 subsequent siblings)
13 siblings, 0 replies; 30+ messages in thread
From: Darrick J. Wong @ 2026-03-03 0:43 UTC (permalink / raw)
To: zlang, djwong; +Cc: hch, fstests, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Make sure that xfs_healer can actually report abnormal filesystem shutdowns.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
tests/xfs/1898 | 37 +++++++++++++++++++++++++++++++++++++
tests/xfs/1898.out | 4 ++++
2 files changed, 41 insertions(+)
create mode 100755 tests/xfs/1898
create mode 100644 tests/xfs/1898.out
diff --git a/tests/xfs/1898 b/tests/xfs/1898
new file mode 100755
index 00000000000000..2b6c72093e7021
--- /dev/null
+++ b/tests/xfs/1898
@@ -0,0 +1,37 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
+#
+# FS QA Test No. 1898
+#
+# Check that xfs_healer can report filesystem shutdowns.
+
+. ./common/preamble
+_begin_fstest auto quick scrub eio selfhealing
+
+. ./common/fuzzy
+. ./common/filter
+. ./common/systemd
+
+_require_scratch_nocheck
+_require_scrub
+_require_command "$XFS_HEALER_PROG" "xfs_healer"
+
+_scratch_mkfs >> $seqres.full
+_scratch_mount
+_require_xfs_healer $SCRATCH_MNT
+$XFS_IO_PROG -f -c "pwrite -S 0x58 0 500k" -c "fsync" $SCRATCH_MNT/a >> $seqres.full
+
+echo "Start healer and shut down"
+_scratch_invoke_xfs_healer "$tmp.healer"
+_scratch_shutdown -f
+
+# Unmount filesystem to start fresh
+echo "Kill healer"
+_scratch_kill_xfs_healer
+cat $tmp.healer >> $seqres.full
+cat $tmp.healer | _filter_scratch | grep 'shut down'
+
+# success, all done
+status=0
+exit
diff --git a/tests/xfs/1898.out b/tests/xfs/1898.out
new file mode 100644
index 00000000000000..f71f848da810ce
--- /dev/null
+++ b/tests/xfs/1898.out
@@ -0,0 +1,4 @@
+QA output created by 1898
+Start healer and shut down
+Kill healer
+SCRATCH_MNT: filesystem shut down due to forced unmount
* [PATCH 10/13] xfs: test xfs_healer can initiate full filesystem repairs
2026-03-03 0:33 ` [PATCHSET v8 2/2] fstests: autonomous self healing of filesystems Darrick J. Wong
` (8 preceding siblings ...)
2026-03-03 0:43 ` [PATCH 09/13] xfs: test xfs_healer can report filesystem shutdowns Darrick J. Wong
@ 2026-03-03 0:43 ` Darrick J. Wong
2026-03-03 0:43 ` [PATCH 11/13] xfs: test xfs_healer can follow mount moves Darrick J. Wong
` (3 subsequent siblings)
13 siblings, 0 replies; 30+ messages in thread
From: Darrick J. Wong @ 2026-03-03 0:43 UTC (permalink / raw)
To: zlang, djwong; +Cc: hch, fstests, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Make sure that when xfs_healer can't perform a spot repair, it will actually
start up xfs_scrub to perform a full scan and repair.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
tests/xfs/1899 | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++++
tests/xfs/1899.out | 3 +
2 files changed, 111 insertions(+)
create mode 100755 tests/xfs/1899
create mode 100644 tests/xfs/1899.out
diff --git a/tests/xfs/1899 b/tests/xfs/1899
new file mode 100755
index 00000000000000..5d35ca8265645f
--- /dev/null
+++ b/tests/xfs/1899
@@ -0,0 +1,108 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
+#
+# FS QA Test 1899
+#
+# Ensure that autonomous self healing fixes the filesystem correctly
+# even if the spot repair doesn't work and it falls back to a full fsck.
+#
+. ./common/preamble
+_begin_fstest auto selfhealing
+
+. ./common/filter
+. ./common/fuzzy
+. ./common/systemd
+
+_require_scrub
+_require_xfs_io_command "repair" # online repair support
+_require_xfs_db_command "blocktrash"
+_require_command "$XFS_HEALER_PROG" "xfs_healer"
+_require_command "$XFS_PROPERTY_PROG" "xfs_property"
+_require_scratch
+_require_systemd_unit_defined "xfs_scrub@.service"
+
+_scratch_mkfs >> $seqres.full
+_scratch_mount
+
+_xfs_has_feature $SCRATCH_MNT rmapbt || \
+ _notrun "reverse mapping required to test directory auto-repair"
+_xfs_has_feature $SCRATCH_MNT parent || \
+ _notrun "parent pointers required to test directory auto-repair"
+_require_xfs_healer $SCRATCH_MNT --repair
+
+filter_healer() {
+ _filter_scratch | \
+ grep 'Full repairs in progress' | \
+ uniq
+}
+
+# Configure the filesystem for automatic repair.
+$XFS_PROPERTY_PROG $SCRATCH_MNT set autofsck=repair >> $seqres.full
+
+# Create a largeish directory
+dblksz=$(_xfs_get_dir_blocksize "$SCRATCH_MNT")
+echo testdata > $SCRATCH_MNT/a
+mkdir -p "$SCRATCH_MNT/some/victimdir"
+for ((i = 0; i < (dblksz / 255); i++)); do
+ fname="$(printf "%0255d" "$i")"
+ ln $SCRATCH_MNT/a $SCRATCH_MNT/some/victimdir/$fname
+done
+
+# Did we get at least two dir blocks?
+dirsize=$(stat -c '%s' $SCRATCH_MNT/some/victimdir)
+test "$dirsize" -gt "$dblksz" || echo "failed to create two-block directory"
+
+# Break the directory, remount filesystem
+_scratch_unmount
+_scratch_xfs_db -x \
+ -c 'path /some/victimdir' \
+ -c 'bmap' \
+ -c 'dblock 1' \
+ -c 'blocktrash -z -0 -o 0 -x 2048 -y 2048 -n 2048' \
+ -c 'path /a' \
+ -c 'bmap -a' \
+ -c 'ablock 1' \
+ -c 'blocktrash -z -0 -o 0 -x 2048 -y 2048 -n 2048' \
+ >> $seqres.full
+_scratch_mount
+
+_scratch_invoke_xfs_healer "$tmp.healer" --repair
+
+# Access the broken directory to trigger a repair, then poll the directory
+# for 5 seconds to see if it gets fixed without us needing to intervene.
+ls $SCRATCH_MNT/some/victimdir > /dev/null 2> $tmp.err
+_filter_scratch < $tmp.err
+try=0
+while [ $try -lt 50 ] && grep -q 'Structure needs cleaning' $tmp.err; do
+ echo "try $try saw corruption" >> $seqres.full
+ sleep 0.1
+ ls $SCRATCH_MNT/some/victimdir > /dev/null 2> $tmp.err
+ try=$((try + 1))
+done
+echo "try $try no longer saw corruption or gave up" >> $seqres.full
+_filter_scratch < $tmp.err
+
+# Wait for the background fixer to finish
+svc="$(_xfs_scrub_svcname "$SCRATCH_MNT")"
+_systemd_unit_wait "$svc"
+
+# List the dirents of /some/victimdir and parent pointers of /a to see if they
+# stop reporting corruption
+(ls $SCRATCH_MNT/some/victimdir ; $XFS_IO_PROG -c 'parent' $SCRATCH_MNT/a) > /dev/null 2> $tmp.err
+try=0
+while [ $try -lt 50 ] && grep -q 'Structure needs cleaning' $tmp.err; do
+ echo "retry $try still saw corruption" >> $seqres.full
+ sleep 0.1
+	(ls $SCRATCH_MNT/some/victimdir ; $XFS_IO_PROG -c 'parent' $SCRATCH_MNT/a) > /dev/null 2> $tmp.err
+ try=$((try + 1))
+done
+echo "retry $try no longer saw corruption or gave up" >> $seqres.full
+
+# Unmount to kill the healer
+_scratch_kill_xfs_healer
+cat $tmp.healer >> $seqres.full
+cat $tmp.healer | filter_healer
+
+status=0
+exit
diff --git a/tests/xfs/1899.out b/tests/xfs/1899.out
new file mode 100644
index 00000000000000..5345fd400f3627
--- /dev/null
+++ b/tests/xfs/1899.out
@@ -0,0 +1,3 @@
+QA output created by 1899
+ls: reading directory 'SCRATCH_MNT/some/victimdir': Structure needs cleaning
+SCRATCH_MNT: Full repairs in progress.
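[Editor's note] A quick back-of-the-envelope for the directory-sizing loop used in this test (and in xfs/1884 and xfs/1900): creating `dblksz / 255` hard links with 255-character names nearly fills one directory block with name bytes alone, so per-entry record and index overhead pushes the directory past a single block. Illustrative numbers only, assuming 4096-byte directory blocks; exact per-entry overhead is filesystem-specific.

```shell
#!/bin/bash
# Assumed directory block size; real tests read it via
# _xfs_get_dir_blocksize.
dblksz=4096
namelen=255

nfiles=$((dblksz / namelen))		# number of entries the loop creates
name_bytes=$((nfiles * namelen))	# bytes consumed by names alone
echo "$nfiles entries, $name_bytes name bytes"
# → 16 entries, 4080 name bytes
```

4080 name bytes leave almost no room in a 4096-byte block, so once each entry's fixed record header is added the directory must spill into a second block, which is what the `test "$dirsize" -gt "$dblksz"` check asserts.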
* [PATCH 11/13] xfs: test xfs_healer can follow mount moves
2026-03-03 0:33 ` [PATCHSET v8 2/2] fstests: autonomous self healing of filesystems Darrick J. Wong
` (9 preceding siblings ...)
2026-03-03 0:43 ` [PATCH 10/13] xfs: test xfs_healer can initiate full filesystem repairs Darrick J. Wong
@ 2026-03-03 0:43 ` Darrick J. Wong
2026-03-03 0:43 ` [PATCH 12/13] xfs: test xfs_healer wont repair the wrong filesystem Darrick J. Wong
` (2 subsequent siblings)
13 siblings, 0 replies; 30+ messages in thread
From: Darrick J. Wong @ 2026-03-03 0:43 UTC (permalink / raw)
To: zlang, djwong; +Cc: hch, fstests, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Make sure that when xfs_healer needs to reopen a filesystem to repair it,
it can still find the filesystem even if it has been mount --move'd.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
tests/xfs/1900 | 115 ++++++++++++++++++++++++++++++++++++++++++++++++++++
tests/xfs/1900.out | 2 +
2 files changed, 117 insertions(+)
create mode 100755 tests/xfs/1900
create mode 100644 tests/xfs/1900.out
diff --git a/tests/xfs/1900 b/tests/xfs/1900
new file mode 100755
index 00000000000000..9a8f9fabd124ad
--- /dev/null
+++ b/tests/xfs/1900
@@ -0,0 +1,115 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
+#
+# FS QA Test 1900
+#
+# Ensure that autonomous self healing fixes the filesystem correctly even if
+# the original mount has moved somewhere else.
+#
+. ./common/preamble
+_begin_fstest auto selfhealing
+
+. ./common/filter
+. ./common/fuzzy
+. ./common/systemd
+
+_cleanup()
+{
+ command -v _kill_fsstress &>/dev/null && _kill_fsstress
+ cd /
+ rm -r -f $tmp.*
+ if [ -n "$new_dir" ]; then
+ _unmount "$new_dir" &>/dev/null
+ rm -rf "$new_dir"
+ fi
+}
+
+_require_test
+_require_scrub
+_require_xfs_io_command "repair" # online repair support
+_require_xfs_db_command "blocktrash"
+_require_command "$XFS_HEALER_PROG" "xfs_healer"
+_require_command "$XFS_PROPERTY_PROG" "xfs_property"
+_require_scratch
+
+_scratch_mkfs >> $seqres.full
+_scratch_mount
+
+_xfs_has_feature $SCRATCH_MNT rmapbt || \
+ _notrun "reverse mapping required to test directory auto-repair"
+_xfs_has_feature $SCRATCH_MNT parent || \
+ _notrun "parent pointers required to test directory auto-repair"
+_require_xfs_healer $SCRATCH_MNT --repair
+
+# Configure the filesystem for automatic repair.
+$XFS_PROPERTY_PROG $SCRATCH_MNT set autofsck=repair >> $seqres.full
+
+# Create a largeish directory
+dblksz=$(_xfs_get_dir_blocksize "$SCRATCH_MNT")
+echo testdata > $SCRATCH_MNT/a
+mkdir -p "$SCRATCH_MNT/some/victimdir"
+for ((i = 0; i < (dblksz / 255); i++)); do
+ fname="$(printf "%0255d" "$i")"
+ ln $SCRATCH_MNT/a $SCRATCH_MNT/some/victimdir/$fname
+done
+
+# Did we get at least two dir blocks?
+dirsize=$(stat -c '%s' $SCRATCH_MNT/some/victimdir)
+test "$dirsize" -gt "$dblksz" || echo "failed to create two-block directory"
+
+# Break the directory, remount filesystem
+_scratch_unmount
+_scratch_xfs_db -x \
+ -c 'path /some/victimdir' \
+ -c 'bmap' \
+ -c 'dblock 1' \
+ -c 'blocktrash -z -0 -o 0 -x 2048 -y 2048 -n 2048' >> $seqres.full
+_scratch_mount
+
+_scratch_invoke_xfs_healer "$tmp.healer" --repair
+
+# Move the scratch filesystem to a completely different mountpoint so that
+# we can test if the healer can find it again.
+new_dir=$TEST_DIR/moocow
+mkdir -p $new_dir
+_mount --bind $SCRATCH_MNT $new_dir
+_unmount $SCRATCH_MNT
+
+df -t xfs >> $seqres.full
+
+# Access the broken directory to trigger a repair, then poll the directory
+# for 5 seconds to see if it gets fixed without us needing to intervene.
+ls $new_dir/some/victimdir > /dev/null 2> $tmp.err
+_filter_scratch < $tmp.err | _filter_test_dir
+try=0
+while [ $try -lt 50 ] && grep -q 'Structure needs cleaning' $tmp.err; do
+ echo "try $try saw corruption" >> $seqres.full
+ sleep 0.1
+ ls $new_dir/some/victimdir > /dev/null 2> $tmp.err
+ try=$((try + 1))
+done
+echo "try $try no longer saw corruption or gave up" >> $seqres.full
+_filter_scratch < $tmp.err | _filter_test_dir
+
+# List the dirents of /some/victimdir to see if it stops reporting corruption
+ls $new_dir/some/victimdir > /dev/null 2> $tmp.err
+try=0
+while [ $try -lt 50 ] && grep -q 'Structure needs cleaning' $tmp.err; do
+ echo "retry $try still saw corruption" >> $seqres.full
+ sleep 0.1
+	ls $new_dir/some/victimdir > /dev/null 2> $tmp.err
+ try=$((try + 1))
+done
+echo "retry $try no longer saw corruption or gave up" >> $seqres.full
+
+new_dir_unmount() {
+ _unmount $new_dir
+}
+
+# Unmount to kill the healer
+_scratch_kill_xfs_healer new_dir_unmount
+cat $tmp.healer >> $seqres.full
+
+status=0
+exit
diff --git a/tests/xfs/1900.out b/tests/xfs/1900.out
new file mode 100644
index 00000000000000..604c9eb5eb10f4
--- /dev/null
+++ b/tests/xfs/1900.out
@@ -0,0 +1,2 @@
+QA output created by 1900
+ls: reading directory 'TEST_DIR/moocow/some/victimdir': Structure needs cleaning
* [PATCH 12/13] xfs: test xfs_healer wont repair the wrong filesystem
2026-03-03 0:33 ` [PATCHSET v8 2/2] fstests: autonomous self healing of filesystems Darrick J. Wong
` (10 preceding siblings ...)
2026-03-03 0:43 ` [PATCH 11/13] xfs: test xfs_healer can follow mount moves Darrick J. Wong
@ 2026-03-03 0:43 ` Darrick J. Wong
2026-03-03 0:44 ` [PATCH 13/13] xfs: test xfs_healer background service Darrick J. Wong
2026-03-03 0:47 ` [PATCH 14/13] xfs: test xfs_healer startup service Darrick J. Wong
13 siblings, 0 replies; 30+ messages in thread
From: Darrick J. Wong @ 2026-03-03 0:43 UTC (permalink / raw)
To: zlang, djwong; +Cc: hch, fstests, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Make sure that when xfs_healer needs to reopen a filesystem to repair it, it
won't latch on to another xfs filesystem that has been mounted atop the same
mountpoint.
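The guard being tested here can be sketched in plain shell: before enacting repairs through a reopened path, confirm that the filesystem backing that path is still the one being monitored. This is a hypothetical illustration, not xfs_healer's actual reopen logic; `same_fs` is an invented helper and it assumes GNU stat's `--file-system` mode.

```shell
#!/bin/bash
# Hedged sketch: refuse to "repair" through a mountpoint if a different
# filesystem (e.g. a decoy snapshot) has since been mounted on top of it.
same_fs() {
	local recorded_fsid="$1"
	local path="$2"

	# stat -f %i prints the ID of the filesystem backing $path
	test "$(stat -f -c '%i' "$path")" = "$recorded_fsid"
}

# Record the fs ID when monitoring starts...
recorded="$(stat -f -c '%i' /tmp)"

# ...and re-check it before acting on the reopened path.
if same_fs "$recorded" /tmp; then
	echo "same filesystem, safe to repair"
else
	echo "different filesystem mounted here, refusing to repair"
fi
```

In the test below, the decoy is a `nouuid`-mounted clone overmounted on the same directory, so the healer's reopen attempt is expected to fail with a stale file handle rather than touch the wrong filesystem.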
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
tests/xfs/1901 | 137 ++++++++++++++++++++++++++++++++++++++++++++++++++++
tests/xfs/1901.out | 2 +
2 files changed, 139 insertions(+)
create mode 100755 tests/xfs/1901
create mode 100755 tests/xfs/1901.out
diff --git a/tests/xfs/1901 b/tests/xfs/1901
new file mode 100755
index 00000000000000..c92dcf9a3b3d48
--- /dev/null
+++ b/tests/xfs/1901
@@ -0,0 +1,137 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2025-2026 Oracle. All Rights Reserved.
+#
+# FS QA Test 1901
+#
+# Ensure that autonomous self healing won't fix the wrong filesystem if a
+# snapshot of the original filesystem is now mounted on the same directory as
+# the original.
+#
+. ./common/preamble
+_begin_fstest auto selfhealing
+
+. ./common/filter
+. ./common/fuzzy
+. ./common/systemd
+
+_cleanup()
+{
+ command -v _kill_fsstress &>/dev/null && _kill_fsstress
+ cd /
+ rm -r -f $tmp.*
+ test -e "$mntpt" && _unmount "$mntpt" &>/dev/null
+ test -e "$mntpt" && _unmount "$mntpt" &>/dev/null
+ test -e "$loop1" && _destroy_loop_device "$loop1"
+ test -e "$loop2" && _destroy_loop_device "$loop2"
+ test -e "$testdir" && rm -r -f "$testdir"
+}
+
+_require_test
+_require_scrub
+_require_xfs_io_command "repair" # online repair support
+_require_xfs_db_command "blocktrash"
+_require_command "$XFS_HEALER_PROG" "xfs_healer"
+_require_command "$XFS_PROPERTY_PROG" "xfs_property"
+
+testdir=$TEST_DIR/$seq
+mntpt=$testdir/mount
+disk1=$testdir/disk1
+disk2=$testdir/disk2
+
+mkdir -p "$mntpt"
+$XFS_IO_PROG -f -c "truncate 300m" $disk1
+$XFS_IO_PROG -f -c "truncate 300m" $disk2
+loop1="$(_create_loop_device "$disk1")"
+
+filter_mntpt() {
+ sed -e "s|$mntpt|MNTPT|g"
+}
+
+_mkfs_dev "$loop1" >> $seqres.full
+_mount "$loop1" "$mntpt" || _notrun "cannot mount victim filesystem"
+
+_xfs_has_feature $mntpt rmapbt || \
+ _notrun "reverse mapping required to test directory auto-repair"
+_xfs_has_feature $mntpt parent || \
+ _notrun "parent pointers required to test directory auto-repair"
+_require_xfs_healer $mntpt --repair
+
+# Configure the filesystem for automatic repair of the filesystem.
+$XFS_PROPERTY_PROG $mntpt set autofsck=repair >> $seqres.full
+
+# Create a largeish directory
+dblksz=$(_xfs_get_dir_blocksize "$mntpt")
+echo testdata > $mntpt/a
+mkdir -p "$mntpt/some/victimdir"
+for ((i = 0; i < (dblksz / 255); i++)); do
+ fname="$(printf "%0255d" "$i")"
+ ln $mntpt/a $mntpt/some/victimdir/$fname
+done
+
+# Did we get at least two dir blocks?
+dirsize=$(stat -c '%s' $mntpt/some/victimdir)
+test "$dirsize" -gt "$dblksz" || echo "failed to create two-block directory"
+
+# Clone the fs, break the directory, remount filesystem
+_unmount "$mntpt"
+
+cp --sparse=always "$disk1" "$disk2" || _fail "cannot copy disk1"
+loop2="$(_create_loop_device_like_bdev "$disk2" "$loop1")"
+
+$XFS_DB_PROG "$loop1" -x \
+ -c 'path /some/victimdir' \
+ -c 'bmap' \
+ -c 'dblock 1' \
+ -c 'blocktrash -z -0 -o 0 -x 2048 -y 2048 -n 2048' >> $seqres.full
+_mount "$loop1" "$mntpt" || _fail "cannot mount broken fs"
+
+_invoke_xfs_healer "$mntpt" "$tmp.healer" --repair
+
+# Stop the healer process so that it can't read error events while we do some
+# shenanigans.
+test -n "$XFS_HEALER_PID" || _fail "nobody set XFS_HEALER_PID?"
+kill -STOP $XFS_HEALER_PID
+
+
+echo "LOG $XFS_HEALER_PID SO FAR:" >> $seqres.full
+cat $tmp.healer >> $seqres.full
+
+# Access the broken directory to trigger a repair event, which will not yet be
+# processed.
+ls $mntpt/some/victimdir > /dev/null 2> $tmp.err
+filter_mntpt < $tmp.err
+
+ps auxfww | grep xfs_healer >> $seqres.full
+
+echo "LOG AFTER TRYING TO POKE:" >> $seqres.full
+cat $tmp.healer >> $seqres.full
+
+# Mount the clone filesystem to the same mountpoint so that the healer cannot
+# actually reopen it to perform repairs.
+_mount "$loop2" "$mntpt" -o nouuid || _fail "cannot mount decoy fs"
+
+grep -w xfs /proc/mounts >> $seqres.full
+
+# Continue the healer process so it can handle events now. Wait a few seconds
+# while it fails to reopen disk1's mount point to repair things.
+kill -CONT $XFS_HEALER_PID
+sleep 2
+
+mntpt_unmount() {
+	_unmount "$mntpt"
+	_unmount "$mntpt"
+}
+
+# Unmount to kill the healer
+_kill_xfs_healer mntpt_unmount
+echo "LOG AFTER FAILURE" >> $seqres.full
+cat $tmp.healer >> $seqres.full
+
+# Did the healer log complaints about not being able to reopen the mountpoint
+# to enact repairs?
+grep -q 'Stale file handle' $tmp.healer || \
+ echo "Should have seen stale file handle complaints"
+
+status=0
+exit
diff --git a/tests/xfs/1901.out b/tests/xfs/1901.out
new file mode 100755
index 00000000000000..ff83e03725307a
--- /dev/null
+++ b/tests/xfs/1901.out
@@ -0,0 +1,2 @@
+QA output created by 1901
+ls: reading directory 'MNTPT/some/victimdir': Structure needs cleaning
^ permalink raw reply related [flat|nested] 30+ messages in thread

* [PATCH 13/13] xfs: test xfs_healer background service
2026-03-03 0:33 ` [PATCHSET v8 2/2] fstests: autonomous self healing of filesystems Darrick J. Wong
` (11 preceding siblings ...)
2026-03-03 0:43 ` [PATCH 12/13] xfs: test xfs_healer won't repair the wrong filesystem Darrick J. Wong
@ 2026-03-03 0:44 ` Darrick J. Wong
2026-03-03 0:47 ` [PATCH 14/13] xfs: test xfs_healer startup service Darrick J. Wong
13 siblings, 0 replies; 30+ messages in thread
From: Darrick J. Wong @ 2026-03-03 0:44 UTC (permalink / raw)
To: zlang, djwong; +Cc: hch, fstests, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Make sure that xfs_healer can monitor and repair filesystems when it's
running as a systemd service, which is the intended usage model.
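The test below derives a per-mount unit name from the scratch mountpoint (via the `_xfs_healer_svcname` helper). systemd template instances encode paths in the instance name, which is why a root mount appears as `xfs_healer@-.service` elsewhere in this series; `systemd-escape --path` does the real conversion. The sketch below is a rough, hypothetical approximation that only handles plain alphanumeric paths:

```shell
#!/bin/bash
# Hedged sketch of mountpoint -> template-instance naming.  Real code should
# use "systemd-escape --path", which also hex-escapes special characters.
unit_for_mount() {
	local path="$1"

	# the root directory escapes to a lone dash
	if [ "$path" = "/" ]; then
		echo "xfs_healer@-.service"
		return
	fi
	# trim leading/trailing slashes, turn interior slashes into dashes
	path="${path#/}"
	path="${path%/}"
	echo "xfs_healer@${path//\//-}.service"
}

unit_for_mount /              # -> xfs_healer@-.service
unit_for_mount /mnt/scratch   # -> xfs_healer@mnt-scratch.service
```

Starting the healer for a given mount would then look roughly like `systemctl start "$(unit_for_mount /mnt/scratch)"`, which is the shape of what `_systemd_unit_start` does in the test.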
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
tests/xfs/1902 | 152 ++++++++++++++++++++++++++++++++++++++++++++++++++++
tests/xfs/1902.out | 2 +
2 files changed, 154 insertions(+)
create mode 100755 tests/xfs/1902
create mode 100755 tests/xfs/1902.out
diff --git a/tests/xfs/1902 b/tests/xfs/1902
new file mode 100755
index 00000000000000..d327995df8c5b0
--- /dev/null
+++ b/tests/xfs/1902
@@ -0,0 +1,152 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
+#
+# FS QA Test 1902
+#
+# Ensure that autonomous self healing fixes the filesystem correctly when
+# running in a systemd service
+#
+# unreliable_in_parallel: this test runs the xfs_healer systemd service, which
+# cannot be isolated to a specific testcase with the way check-parallel is
+# implemented.
+#
+. ./common/preamble
+_begin_fstest auto selfhealing unreliable_in_parallel
+
+_cleanup()
+{
+ cd /
+ if [ -n "$new_svcfile" ]; then
+ rm -f "$new_svcfile"
+ systemctl daemon-reload
+ fi
+ rm -r -f $tmp.*
+}
+
+. ./common/filter
+. ./common/fuzzy
+. ./common/systemd
+
+_require_systemd_is_running
+_require_systemd_unit_defined xfs_healer@.service
+_require_scrub
+_require_xfs_io_command "repair" # online repair support
+_require_xfs_db_command "blocktrash"
+_require_command "$XFS_HEALER_PROG" "xfs_healer"
+_require_command "$XFS_PROPERTY_PROG" "xfs_property"
+_require_scratch
+
+_scratch_mkfs >> $seqres.full
+_scratch_mount
+
+_xfs_has_feature $SCRATCH_MNT rmapbt || \
+ _notrun "reverse mapping required to test directory auto-repair"
+_xfs_has_feature $SCRATCH_MNT parent || \
+ _notrun "parent pointers required to test directory auto-repair"
+_require_xfs_healer $SCRATCH_MNT --repair
+
+# Configure the filesystem for automatic repair of the filesystem.
+$XFS_PROPERTY_PROG $SCRATCH_MNT set autofsck=repair >> $seqres.full
+
+# Create a largeish directory
+dblksz=$(_xfs_get_dir_blocksize "$SCRATCH_MNT")
+echo testdata > $SCRATCH_MNT/a
+mkdir -p "$SCRATCH_MNT/some/victimdir"
+for ((i = 0; i < (dblksz / 255); i++)); do
+ fname="$(printf "%0255d" "$i")"
+ ln $SCRATCH_MNT/a $SCRATCH_MNT/some/victimdir/$fname
+done
+
+# Did we get at least two dir blocks?
+dirsize=$(stat -c '%s' $SCRATCH_MNT/some/victimdir)
+test "$dirsize" -gt "$dblksz" || echo "failed to create two-block directory"
+
+# Break the directory
+_scratch_unmount
+_scratch_xfs_db -x \
+ -c 'path /some/victimdir' \
+ -c 'bmap' \
+ -c 'dblock 1' \
+ -c 'blocktrash -z -0 -o 0 -x 2048 -y 2048 -n 2048' >> $seqres.full
+
+# Find the existing xfs_healer@ service definition, figure out where we're
+# going to land our test-specific override
+orig_svcfile="$(_systemd_unit_path "xfs_healer@-.service")"
+test -f "$orig_svcfile" || \
+ _notrun "cannot find xfs_healer@ service file"
+
+new_svcdir="$(_systemd_runtime_dir)"
+test -d "$new_svcdir" || \
+ _notrun "cannot find runtime systemd service dir"
+
+# We need to make some local mods to the xfs_healer@ service definition
+# so we fork it and create a new service just for this test.
+new_healer_template="xfs_healer_fstest@.service"
+new_healer_svc="$(_xfs_systemd_svcname "$new_healer_template" "$SCRATCH_MNT")"
+_systemd_unit_status "$new_healer_svc" 2>&1 | \
+ grep -E -q '(could not be found|Loaded: not-found)' || \
+ _notrun "systemd service \"$new_healer_svc\" found, will not mess with this"
+
+new_svcfile="$new_svcdir/$new_healer_template"
+cp "$orig_svcfile" "$new_svcfile"
+
+# Pick up all the CLI args except for --repair and --no-autofsck because we're
+# going to force it to --autofsck below
+execargs="$(grep '^ExecStart=' $new_svcfile | \
+ sed -e 's/^ExecStart=\S*//g' \
+ -e 's/--no-autofsck//g' \
+ -e 's/--repair//g')"
+sed -e '/ExecStart=/d' -e '/BindPaths=/d' -e '/ExecCondition=/d' -i $new_svcfile
+cat >> "$new_svcfile" << ENDL
+
+[Service]
+ExecCondition=$XFS_HEALER_PROG --supported %f
+ExecStart=$XFS_HEALER_PROG $execargs
+ENDL
+_systemd_reload
+
+# Emit the results of our editing to the full log.
+systemctl cat "$new_healer_svc" >> $seqres.full
+
+# Remount, with service activation
+_scratch_mount
+
+old_healer_svc="$(_xfs_healer_svcname "$SCRATCH_MNT")"
+_systemd_unit_stop "$old_healer_svc" &>> $seqres.full
+_systemd_unit_start "$new_healer_svc" &>> $seqres.full
+
+_systemd_unit_status "$new_healer_svc" 2>&1 | grep -q 'Active: active' || \
+ echo "systemd service \"$new_healer_svc\" not running??"
+
+# Access the broken directory to trigger a repair, then poll the directory
+# for 5 seconds to see if it gets fixed without us needing to intervene.
+ls $SCRATCH_MNT/some/victimdir > /dev/null 2> $tmp.err
+_filter_scratch < $tmp.err
+try=0
+while [ $try -lt 50 ] && grep -q 'Structure needs cleaning' $tmp.err; do
+ echo "try $try saw corruption" >> $seqres.full
+ sleep 0.1
+ ls $SCRATCH_MNT/some/victimdir > /dev/null 2> $tmp.err
+ try=$((try + 1))
+done
+echo "try $try no longer saw corruption or gave up" >> $seqres.full
+_filter_scratch < $tmp.err
+
+# List the dirents of /victimdir to see if it stops reporting corruption
+ls $SCRATCH_MNT/some/victimdir > /dev/null 2> $tmp.err
+try=0
+while [ $try -lt 50 ] && grep -q 'Structure needs cleaning' $tmp.err; do
+ echo "retry $try still saw corruption" >> $seqres.full
+ sleep 0.1
+ ls $SCRATCH_MNT/some/victimdir > /dev/null 2> $tmp.err
+ try=$((try + 1))
+done
+echo "retry $try no longer saw corruption or gave up" >> $seqres.full
+
+# Unmount to kill the healer
+_scratch_kill_xfs_healer
+journalctl -u "$new_healer_svc" >> $seqres.full
+
+status=0
+exit
diff --git a/tests/xfs/1902.out b/tests/xfs/1902.out
new file mode 100755
index 00000000000000..84f9b9e50e1e02
--- /dev/null
+++ b/tests/xfs/1902.out
@@ -0,0 +1,2 @@
+QA output created by 1902
+ls: reading directory 'SCRATCH_MNT/some/victimdir': Structure needs cleaning
^ permalink raw reply related [flat|nested] 30+ messages in thread

* [PATCH 14/13] xfs: test xfs_healer startup service
2026-03-03 0:33 ` [PATCHSET v8 2/2] fstests: autonomous self healing of filesystems Darrick J. Wong
` (12 preceding siblings ...)
2026-03-03 0:44 ` [PATCH 13/13] xfs: test xfs_healer background service Darrick J. Wong
@ 2026-03-03 0:47 ` Darrick J. Wong
13 siblings, 0 replies; 30+ messages in thread
From: Darrick J. Wong @ 2026-03-03 0:47 UTC (permalink / raw)
To: zlang; +Cc: hch, fstests, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Make sure that xfs_healer_start can actually start up xfs_healer service
instances when a filesystem is mounted.
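Conceptually, a startup helper like this walks the mount table and starts one healer template instance per mounted XFS filesystem. The sketch below is hypothetical (it is not xfs_healer_start's actual implementation) and parses a canned mount table on stdin instead of /proc/mounts so its output is deterministic:

```shell
#!/bin/bash
# Hedged sketch: emit one healer unit name per XFS mount in a mount table
# read from stdin (fields: device mountpoint fstype options dump pass).
list_healer_units() {
	awk '$3 == "xfs" { print $2 }' | while read -r mnt; do
		if [ "$mnt" = "/" ]; then
			inst="-"
		else
			inst="${mnt#/}"
			inst="${inst//\//-}"
		fi
		echo "xfs_healer@${inst}.service"
	done
}

list_healer_units << 'ENDL'
/dev/sda1 / xfs rw 0 0
/dev/sdb1 /opt/scratch xfs rw 0 0
/dev/sdc1 /boot ext4 rw 0 0
ENDL
```

Feeding the real mount table through such a filter and running `systemctl start` on each resulting unit is the behavior the test verifies via `find_healer_trace` after starting the forked startup service.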
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
tests/xfs/1903 | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++++
tests/xfs/1903.out | 6 +++
2 files changed, 130 insertions(+)
create mode 100755 tests/xfs/1903
create mode 100644 tests/xfs/1903.out
diff --git a/tests/xfs/1903 b/tests/xfs/1903
new file mode 100755
index 00000000000000..d71d75a6af3f9d
--- /dev/null
+++ b/tests/xfs/1903
@@ -0,0 +1,124 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2026 Oracle. All Rights Reserved.
+#
+# FS QA Test No. 1903
+#
+# Check that the xfs_healer startup service starts the per-mount xfs_healer
+# service for the scratch filesystem. IOWs, this is basic testing for the
+# xfs_healer systemd background services.
+#
+
+# unreliable_in_parallel: this test appears to try to run healer services on
+# all mounted filesystems - that's a problem when there are a hundred other
+# test filesystems mounted for other tests...
+
+. ./common/preamble
+_begin_fstest auto selfhealing unreliable_in_parallel
+
+_cleanup()
+{
+ cd /
+ test -n "$new_healerstart_svc" &&
+ _systemd_unit_stop "$new_healerstart_svc"
+ test -n "$was_masked" && \
+ _systemd_unit_mask "$healer_svc" &>> $seqres.full
+ if [ -n "$new_svcfile" ]; then
+ rm -f "$new_svcfile"
+ systemctl daemon-reload
+ fi
+ rm -r -f $tmp.*
+}
+
+. ./common/filter
+. ./common/populate
+. ./common/fuzzy
+. ./common/systemd
+
+_require_systemd_is_running
+_require_systemd_unit_defined xfs_healer@.service
+_require_systemd_unit_defined xfs_healer_start.service
+_require_scratch
+_require_scrub
+_require_xfs_io_command "scrub"
+_require_xfs_spaceman_command "health"
+_require_populate_commands
+_require_command "$XFS_HEALER_PROG" "xfs_healer"
+_require_command $ATTR_PROG "attr"
+
+_xfs_skip_online_rebuild
+_xfs_skip_offline_rebuild
+
+orig_svcfile="$(_systemd_unit_path "xfs_healer_start.service")"
+test -f "$orig_svcfile" || \
+ _notrun "cannot find xfs_healer_start service file"
+
+new_svcdir="$(_systemd_runtime_dir)"
+test -d "$new_svcdir" || \
+ _notrun "cannot find runtime systemd service dir"
+
+# We need to make some local mods to the xfs_healer_start service definition
+# so we fork it and create a new service just for this test.
+new_healerstart_svc="xfs_healer_start_fstest.service"
+_systemd_unit_status "$new_healerstart_svc" 2>&1 | \
+ grep -E -q '(could not be found|Loaded: not-found)' || \
+ _notrun "systemd service \"$new_healerstart_svc\" found, will not mess with this"
+
+find_healer_trace() {
+ local path="$1"
+
+ sleep 2 # wait for delays in startup
+ $XFS_HEALER_PROG --supported "$path" 2>&1 | grep -q 'already running' || \
+ echo "cannot find evidence that xfs_healer is running for $path"
+}
+
+echo "Format and populate"
+_scratch_mkfs >> $seqres.full
+_scratch_mount
+_require_xfs_healer $SCRATCH_MNT
+
+# Configure the filesystem for background checks of the filesystem.
+$ATTR_PROG -R -s xfs:autofsck -V check $SCRATCH_MNT >> $seqres.full
+
+was_masked=
+healer_svc="$(_xfs_healer_svcname "$SCRATCH_MNT")"
+
+# Preserve the xfs_healer@ mask state -- we don't want this permanently
+# changing global state.
+if _systemd_unit_masked "$healer_svc"; then
+ _systemd_unit_unmask "$healer_svc" &>> $seqres.full
+ was_masked=1
+fi
+
+echo "Start healer on scratch FS"
+_systemd_unit_start "$healer_svc"
+find_healer_trace "$SCRATCH_MNT"
+_systemd_unit_stop "$healer_svc"
+
+new_svcfile="$new_svcdir/$new_healerstart_svc"
+cp "$orig_svcfile" "$new_svcfile"
+
+sed -e '/ExecStart=/d' -e '/BindPaths=/d' -e '/ExecCondition=/d' -i $new_svcfile
+cat >> "$new_svcfile" << ENDL
+[Service]
+ExecCondition=$XFS_HEALER_START_PROG --supported
+ExecStart=$XFS_HEALER_START_PROG
+ENDL
+_systemd_reload
+
+# Emit the results of our editing to the full log.
+systemctl cat "$new_healerstart_svc" >> $seqres.full
+
+echo "Start healer for everything"
+_systemd_unit_start "$new_healerstart_svc"
+find_healer_trace "$SCRATCH_MNT"
+
+echo "Restart healer for scratch FS"
+_scratch_cycle_mount
+find_healer_trace "$SCRATCH_MNT"
+
+echo "Healer testing done" | tee -a $seqres.full
+
+# success, all done
+status=0
+exit
diff --git a/tests/xfs/1903.out b/tests/xfs/1903.out
new file mode 100644
index 00000000000000..07810f60ca10c6
--- /dev/null
+++ b/tests/xfs/1903.out
@@ -0,0 +1,6 @@
+QA output created by 1903
+Format and populate
+Start healer on scratch FS
+Start healer for everything
+Restart healer for scratch FS
+Healer testing done
^ permalink raw reply related [flat|nested] 30+ messages in thread