* [PATCH 01/14] xfs: test health monitoring code
2026-03-10 3:42 ` [PATCHSET v9 2/2] fstests: autonomous self healing of filesystems Darrick J. Wong
@ 2026-03-10 3:50 ` Darrick J. Wong
2026-03-13 18:18 ` Zorro Lang
2026-03-10 3:50 ` [PATCH 02/14] xfs: test for metadata corruption error reporting via healthmon Darrick J. Wong
` (13 subsequent siblings)
14 siblings, 1 reply; 45+ messages in thread
From: Darrick J. Wong @ 2026-03-10 3:50 UTC (permalink / raw)
To: zlang, djwong; +Cc: fstests, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Add some functionality tests for the new health monitoring code.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
common/module | 11 ++++++++++
tests/xfs/1885 | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++
tests/xfs/1885.out | 5 ++++
3 files changed, 75 insertions(+)
create mode 100755 tests/xfs/1885
create mode 100644 tests/xfs/1885.out
diff --git a/common/module b/common/module
index 697d76ba718bbc..c0529b65ad6e2b 100644
--- a/common/module
+++ b/common/module
@@ -225,3 +225,14 @@ _optional_reload_fs_module()
_test_loadable_fs_module "$@" 2>&1 | \
sed -e '/patient module removal/d'
}
+
+_require_module_refcount()
+{
+ local refcount_file="/sys/module/$1/refcnt"
+ test -e "$refcount_file" || _notrun "cannot find $1 module refcount"
+}
+
+_module_refcount()
+{
+ cat "/sys/module/$1/refcnt"
+}
diff --git a/tests/xfs/1885 b/tests/xfs/1885
new file mode 100755
index 00000000000000..d44b29d1c57e06
--- /dev/null
+++ b/tests/xfs/1885
@@ -0,0 +1,59 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
+#
+# FS QA Test 1885
+#
+# Make sure that healthmon handles module refcount correctly.
+#
+. ./common/preamble
+_begin_fstest auto selfhealing quick
+
+. ./common/filter
+. ./common/module
+
+_cleanup()
+{
+ test -n "$healer_pid" && kill $healer_pid &>/dev/null
+ cd /
+ rm -r -f $tmp.*
+}
+
+_require_test
+_require_xfs_io_command healthmon
+_require_module_refcount xfs
+
+# Capture mod refcount without the test fs mounted
+_test_unmount
+init_refcount="$(_module_refcount xfs)"
+
+# Capture mod refcount with the test fs mounted
+_test_mount
+nomon_mount_refcount="$(_module_refcount xfs)"
+
+# Capture mod refcount with test fs mounted and the healthmon fd open.
+# Pause the xfs_io process so that it doesn't actually respond to events.
+$XFS_IO_PROG -c 'healthmon -c -v' $TEST_DIR >> $seqres.full &
+healer_pid=$!
+sleep 0.5
+kill -STOP $healer_pid
+mon_mount_refcount="$(_module_refcount xfs)"
+
+# Capture mod refcount with only the healthmon fd open.
+_test_unmount
+mon_nomount_refcount="$(_module_refcount xfs)"
+
+# Capture mod refcount after continuing healthmon (which should exit due to the
+# unmount) and killing it.
+kill -CONT $healer_pid
+kill $healer_pid
+wait
+nomon_nomount_refcount="$(_module_refcount xfs)"
+
+_within_tolerance "mount refcount" "$nomon_mount_refcount" "$((init_refcount + 1))" 0 -v
+_within_tolerance "mount + healthmon refcount" "$mon_mount_refcount" "$((init_refcount + 2))" 0 -v
+_within_tolerance "healthmon refcount" "$mon_nomount_refcount" "$((init_refcount + 1))" 0 -v
+_within_tolerance "end refcount" "$nomon_nomount_refcount" "$init_refcount" 0 -v
+
+status=0
+exit
diff --git a/tests/xfs/1885.out b/tests/xfs/1885.out
new file mode 100644
index 00000000000000..f152cef0525609
--- /dev/null
+++ b/tests/xfs/1885.out
@@ -0,0 +1,5 @@
+QA output created by 1885
+mount refcount is in range
+mount + healthmon refcount is in range
+healthmon refcount is in range
+end refcount is in range
^ permalink raw reply related [flat|nested] 45+ messages in thread
* Re: [PATCH 01/14] xfs: test health monitoring code
2026-03-10 3:50 ` [PATCH 01/14] xfs: test health monitoring code Darrick J. Wong
@ 2026-03-13 18:18 ` Zorro Lang
0 siblings, 0 replies; 45+ messages in thread
From: Zorro Lang @ 2026-03-13 18:18 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: fstests, linux-xfs
On Mon, Mar 09, 2026 at 08:50:23PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
>
> Add some functionality tests for the new health monitoring code.
>
> Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> ---
> common/module | 11 ++++++++++
> tests/xfs/1885 | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> tests/xfs/1885.out | 5 ++++
> 3 files changed, 75 insertions(+)
> create mode 100755 tests/xfs/1885
> create mode 100644 tests/xfs/1885.out
>
>
> diff --git a/common/module b/common/module
> index 697d76ba718bbc..c0529b65ad6e2b 100644
> --- a/common/module
> +++ b/common/module
> @@ -225,3 +225,14 @@ _optional_reload_fs_module()
> _test_loadable_fs_module "$@" 2>&1 | \
> sed -e '/patient module removal/d'
> }
> +
> +_require_module_refcount()
> +{
> + local refcount_file="/sys/module/$1/refcnt"
> + test -e "$refcount_file" || _notrun "cannot find $1 module refcount"
> +}
> +
> +_module_refcount()
> +{
> + cat "/sys/module/$1/refcnt"
> +}
> diff --git a/tests/xfs/1885 b/tests/xfs/1885
> new file mode 100755
> index 00000000000000..d44b29d1c57e06
> --- /dev/null
> +++ b/tests/xfs/1885
> @@ -0,0 +1,59 @@
> +#! /bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
> +#
> +# FS QA Test 1885
> +#
> +# Make sure that healthmon handles module refcount correctly.
> +#
> +. ./common/preamble
> +_begin_fstest auto selfhealing quick
> +
> +. ./common/filter
> +. ./common/module
> +
> +_cleanup()
> +{
> + test -n "$healer_pid" && kill $healer_pid &>/dev/null
I'll add a "wait" here, and ...
> + cd /
> + rm -r -f $tmp.*
> +}
> +
> +_require_test
> +_require_xfs_io_command healthmon
> +_require_module_refcount xfs
> +
> +# Capture mod refcount without the test fs mounted
> +_test_unmount
> +init_refcount="$(_module_refcount xfs)"
> +
> +# Capture mod refcount with the test fs mounted
> +_test_mount
> +nomon_mount_refcount="$(_module_refcount xfs)"
> +
> +# Capture mod refcount with test fs mounted and the healthmon fd open.
> +# Pause the xfs_io process so that it doesn't actually respond to events.
> +$XFS_IO_PROG -c 'healthmon -c -v' $TEST_DIR >> $seqres.full &
> +healer_pid=$!
> +sleep 0.5
> +kill -STOP $healer_pid
> +mon_mount_refcount="$(_module_refcount xfs)"
> +
> +# Capture mod refcount with only the healthmon fd open.
> +_test_unmount
> +mon_nomount_refcount="$(_module_refcount xfs)"
> +
> +# Capture mod refcount after continuing healthmon (which should exit due to the
> +# unmount) and killing it.
> +kill -CONT $healer_pid
> +kill $healer_pid
> +wait
unset healer_pid
others look good to me,
Reviewed-by: Zorro Lang <zlang@redhat.com>
> +nomon_nomount_refcount="$(_module_refcount xfs)"
> +
> +_within_tolerance "mount refcount" "$nomon_mount_refcount" "$((init_refcount + 1))" 0 -v
> +_within_tolerance "mount + healthmon refcount" "$mon_mount_refcount" "$((init_refcount + 2))" 0 -v
> +_within_tolerance "healthmon refcount" "$mon_nomount_refcount" "$((init_refcount + 1))" 0 -v
> +_within_tolerance "end refcount" "$nomon_nomount_refcount" "$init_refcount" 0 -v
> +
> +status=0
> +exit
> diff --git a/tests/xfs/1885.out b/tests/xfs/1885.out
> new file mode 100644
> index 00000000000000..f152cef0525609
> --- /dev/null
> +++ b/tests/xfs/1885.out
> @@ -0,0 +1,5 @@
> +QA output created by 1885
> +mount refcount is in range
> +mount + healthmon refcount is in range
> +healthmon refcount is in range
> +end refcount is in range
>
^ permalink raw reply [flat|nested] 45+ messages in thread
* [PATCH 02/14] xfs: test for metadata corruption error reporting via healthmon
2026-03-10 3:42 ` [PATCHSET v9 2/2] fstests: autonomous self healing of filesystems Darrick J. Wong
2026-03-10 3:50 ` [PATCH 01/14] xfs: test health monitoring code Darrick J. Wong
@ 2026-03-10 3:50 ` Darrick J. Wong
2026-03-13 18:35 ` Zorro Lang
2026-03-10 3:50 ` [PATCH 03/14] xfs: test io " Darrick J. Wong
` (12 subsequent siblings)
14 siblings, 1 reply; 45+ messages in thread
From: Darrick J. Wong @ 2026-03-10 3:50 UTC (permalink / raw)
To: zlang, djwong; +Cc: fstests, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Check if we can detect runtime metadata corruptions via the health
monitor.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
tests/xfs/1879 | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++++
tests/xfs/1879.out | 8 ++++
2 files changed, 101 insertions(+)
create mode 100755 tests/xfs/1879
create mode 100644 tests/xfs/1879.out
diff --git a/tests/xfs/1879 b/tests/xfs/1879
new file mode 100755
index 00000000000000..75bc8e3b5f4316
--- /dev/null
+++ b/tests/xfs/1879
@@ -0,0 +1,93 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
+#
+# FS QA Test No. 1879
+#
+# Corrupt some metadata and try to access it with the health monitoring program
+# running. Check that healthmon observes a metadata error.
+#
+. ./common/preamble
+_begin_fstest auto quick eio selfhealing
+
+_cleanup()
+{
+ cd /
+ rm -rf $tmp.* $testdir
+}
+
+. ./common/filter
+
+_require_scratch_nocheck
+_require_scratch_xfs_crc # can't detect minor corruption w/o crc
+_require_xfs_io_command healthmon
+
+# Disable the scratch rt device to avoid test failures relating to the rt
+# bitmap consuming all the free space in our small data device.
+unset SCRATCH_RTDEV
+
+echo "Format and mount"
+_scratch_mkfs -d agcount=1 | _filter_mkfs 2> $tmp.mkfs >> $seqres.full
+. $tmp.mkfs
+_scratch_mount
+mkdir $SCRATCH_MNT/a/
+# Enough entries to get to a single block directory
+for ((i = 0; i < ( (isize + 255) / 256); i++)); do
+ path="$(printf "%s/a/%0255d" "$SCRATCH_MNT" "$i")"
+ touch "$path"
+done
+inum="$(stat -c %i "$SCRATCH_MNT/a")"
+_scratch_unmount
+
+# Fuzz the directory block so that the touch below will be guaranteed to trip
+# a runtime sickness report in exactly the manner we desire.
+_scratch_xfs_db -x -c "inode $inum" -c "dblock 0" -c 'fuzz bhdr.hdr.owner add' -c print &>> $seqres.full
+
+# Try to allocate space to trigger a metadata corruption event
+echo "Runtime corruption detection"
+_scratch_mount
+$XFS_IO_PROG -c 'healthmon -c -v' $SCRATCH_MNT > $tmp.healthmon &
+sleep 1 # wait for program to start up
+touch $SCRATCH_MNT/a/farts &>> $seqres.full
+_scratch_unmount
+
+wait # for healthmon to finish
+
+# Did we get errors?
+check_healthmon()
+{
+ cat $tmp.healthmon >> $seqres.full
+ _filter_scratch < $tmp.healthmon | \
+ grep -E '(sick|corrupt)' | \
+ sed -e 's|SCRATCH_MNT/a|VICTIM|g' \
+ -e 's|SCRATCH_MNT ino [0-9]* gen 0x[0-9a-f]*|VICTIM|g' | \
+ sort | \
+ uniq
+}
+check_healthmon
+
+# Run scrub to trigger a health event from there too.
+echo "Scrub corruption detection"
+_scratch_mount
+if _supports_xfs_scrub $SCRATCH_MNT $SCRATCH_DEV; then
+ $XFS_IO_PROG -c 'healthmon -c -v' $SCRATCH_MNT > $tmp.healthmon &
+ sleep 1 # wait for program to start up
+ $XFS_SCRUB_PROG -n $SCRATCH_MNT &>> $seqres.full
+ _scratch_unmount
+
+ wait # for healthmon to finish
+
+ # Did we get errors?
+ check_healthmon
+else
+ # mock the output since we don't support scrub
+ _scratch_unmount
+ cat << ENDL
+VICTIM directory: corrupt
+VICTIM directory: sick
+VICTIM parent: corrupt
+ENDL
+fi
+
+status=0
+exit
diff --git a/tests/xfs/1879.out b/tests/xfs/1879.out
new file mode 100644
index 00000000000000..2f6acbe1c4fb22
--- /dev/null
+++ b/tests/xfs/1879.out
@@ -0,0 +1,8 @@
+QA output created by 1879
+Format and mount
+Runtime corruption detection
+VICTIM directory: sick
+Scrub corruption detection
+VICTIM directory: corrupt
+VICTIM directory: sick
+VICTIM parent: corrupt
^ permalink raw reply related [flat|nested] 45+ messages in thread
* Re: [PATCH 02/14] xfs: test for metadata corruption error reporting via healthmon
2026-03-10 3:50 ` [PATCH 02/14] xfs: test for metadata corruption error reporting via healthmon Darrick J. Wong
@ 2026-03-13 18:35 ` Zorro Lang
0 siblings, 0 replies; 45+ messages in thread
From: Zorro Lang @ 2026-03-13 18:35 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: fstests, linux-xfs
On Mon, Mar 09, 2026 at 08:50:39PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
>
> Check if we can detect runtime metadata corruptions via the health
> monitor.
>
> Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> ---
> tests/xfs/1879 | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> tests/xfs/1879.out | 8 ++++
> 2 files changed, 101 insertions(+)
> create mode 100755 tests/xfs/1879
> create mode 100644 tests/xfs/1879.out
>
>
> diff --git a/tests/xfs/1879 b/tests/xfs/1879
> new file mode 100755
> index 00000000000000..75bc8e3b5f4316
> --- /dev/null
> +++ b/tests/xfs/1879
> @@ -0,0 +1,93 @@
> +#! /bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
> +#
> +# FS QA Test No. 1879
> +#
> +# Corrupt some metadata and try to access it with the health monitoring program
> +# running. Check that healthmon observes a metadata error.
> +#
> +. ./common/preamble
> +_begin_fstest auto quick eio selfhealing
> +
> +_cleanup()
> +{
> + cd /
test -n "$healer_pid" && kill $healer_pid &>/dev/null
wait
> + rm -rf $tmp.* $testdir
> +}
> +
> +. ./common/filter
> +
> +_require_scratch_nocheck
> +_require_scratch_xfs_crc # can't detect minor corruption w/o crc
> +_require_xfs_io_command healthmon
> +
> +# Disable the scratch rt device to avoid test failures relating to the rt
> +# bitmap consuming all the free space in our small data device.
> +unset SCRATCH_RTDEV
> +
> +echo "Format and mount"
> +_scratch_mkfs -d agcount=1 | _filter_mkfs 2> $tmp.mkfs >> $seqres.full
> +. $tmp.mkfs
> +_scratch_mount
> +mkdir $SCRATCH_MNT/a/
> +# Enough entries to get to a single block directory
> +for ((i = 0; i < ( (isize + 255) / 256); i++)); do
> + path="$(printf "%s/a/%0255d" "$SCRATCH_MNT" "$i")"
> + touch "$path"
> +done
> +inum="$(stat -c %i "$SCRATCH_MNT/a")"
> +_scratch_unmount
> +
> +# Fuzz the directory block so that the touch below will be guaranteed to trip
> +# a runtime sickness report in exactly the manner we desire.
> +_scratch_xfs_db -x -c "inode $inum" -c "dblock 0" -c 'fuzz bhdr.hdr.owner add' -c print &>> $seqres.full
> +
> +# Try to allocate space to trigger a metadata corruption event
> +echo "Runtime corruption detection"
> +_scratch_mount
> +$XFS_IO_PROG -c 'healthmon -c -v' $SCRATCH_MNT > $tmp.healthmon &
healer_pid=$!
> +sleep 1 # wait for program to start up
> +touch $SCRATCH_MNT/a/farts &>> $seqres.full
> +_scratch_unmount
> +
> +wait # for healthmon to finish
unset healer_pid
Others look good to me,
Reviewed-by: Zorro Lang <zlang@redhat.com>
> +
> +# Did we get errors?
> +check_healthmon()
> +{
> + cat $tmp.healthmon >> $seqres.full
> + _filter_scratch < $tmp.healthmon | \
> + grep -E '(sick|corrupt)' | \
> + sed -e 's|SCRATCH_MNT/a|VICTIM|g' \
> + -e 's|SCRATCH_MNT ino [0-9]* gen 0x[0-9a-f]*|VICTIM|g' | \
> + sort | \
> + uniq
> +}
> +check_healthmon
> +
> +# Run scrub to trigger a health event from there too.
> +echo "Scrub corruption detection"
> +_scratch_mount
> +if _supports_xfs_scrub $SCRATCH_MNT $SCRATCH_DEV; then
> + $XFS_IO_PROG -c 'healthmon -c -v' $SCRATCH_MNT > $tmp.healthmon &
> + sleep 1 # wait for program to start up
> + $XFS_SCRUB_PROG -n $SCRATCH_MNT &>> $seqres.full
> + _scratch_unmount
> +
> + wait # for healthmon to finish
> +
> + # Did we get errors?
> + check_healthmon
> +else
> + # mock the output since we don't support scrub
> + _scratch_unmount
> + cat << ENDL
> +VICTIM directory: corrupt
> +VICTIM directory: sick
> +VICTIM parent: corrupt
> +ENDL
> +fi
> +
> +status=0
> +exit
> diff --git a/tests/xfs/1879.out b/tests/xfs/1879.out
> new file mode 100644
> index 00000000000000..2f6acbe1c4fb22
> --- /dev/null
> +++ b/tests/xfs/1879.out
> @@ -0,0 +1,8 @@
> +QA output created by 1879
> +Format and mount
> +Runtime corruption detection
> +VICTIM directory: sick
> +Scrub corruption detection
> +VICTIM directory: corrupt
> +VICTIM directory: sick
> +VICTIM parent: corrupt
>
^ permalink raw reply [flat|nested] 45+ messages in thread
* [PATCH 03/14] xfs: test io error reporting via healthmon
2026-03-10 3:42 ` [PATCHSET v9 2/2] fstests: autonomous self healing of filesystems Darrick J. Wong
2026-03-10 3:50 ` [PATCH 01/14] xfs: test health monitoring code Darrick J. Wong
2026-03-10 3:50 ` [PATCH 02/14] xfs: test for metadata corruption error reporting via healthmon Darrick J. Wong
@ 2026-03-10 3:50 ` Darrick J. Wong
2026-03-13 18:53 ` Zorro Lang
2026-03-10 3:51 ` [PATCH 04/14] xfs: set up common code for testing xfs_healer Darrick J. Wong
` (11 subsequent siblings)
14 siblings, 1 reply; 45+ messages in thread
From: Darrick J. Wong @ 2026-03-10 3:50 UTC (permalink / raw)
To: zlang, djwong; +Cc: fstests, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Create a new test to make sure the kernel can report IO errors via
health monitoring.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
tests/xfs/1878 | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++++
tests/xfs/1878.out | 10 ++++++
2 files changed, 103 insertions(+)
create mode 100755 tests/xfs/1878
create mode 100644 tests/xfs/1878.out
diff --git a/tests/xfs/1878 b/tests/xfs/1878
new file mode 100755
index 00000000000000..1ff6ae040fb193
--- /dev/null
+++ b/tests/xfs/1878
@@ -0,0 +1,93 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
+#
+# FS QA Test No. 1878
+#
+# Attempt to read and write a file in buffered and directio mode with the
+# health monitoring program running. Check that healthmon observes all four
+# types of IO errors.
+#
+. ./common/preamble
+_begin_fstest auto quick eio selfhealing
+
+_cleanup()
+{
+ cd /
+ rm -rf $tmp.* $testdir
+ _dmerror_cleanup
+}
+
+. ./common/filter
+. ./common/dmerror
+
+_require_scratch_nocheck
+_require_xfs_io_command healthmon
+_require_dm_target error
+
+filter_healer_errors() {
+ _filter_scratch | \
+ grep -E '(buffered|directio)' | \
+ sed \
+ -e 's/ino [0-9]*/ino NUM/g' \
+ -e 's/gen 0x[0-9a-f]*/gen NUM/g' \
+ -e 's/pos [0-9]*/pos NUM/g' \
+ -e 's/len [0-9]*/len NUM/g' \
+ -e 's|SCRATCH_MNT/a|VICTIM|g' \
+ -e 's|SCRATCH_MNT ino NUM gen NUM|VICTIM|g' | \
+ uniq
+}
+
+# Disable the scratch rt device to avoid test failures relating to the rt
+# bitmap consuming all the free space in our small data device.
+unset SCRATCH_RTDEV
+
+echo "Format and mount"
+_scratch_mkfs > $seqres.full 2>&1
+_dmerror_init no_log
+_dmerror_mount
+
+_require_fs_space $SCRATCH_MNT 65536
+
+# Create a file with written regions far enough apart that the pagecache can't
+# possibly be caching the regions with a single folio.
+testfile=$SCRATCH_MNT/fsync-err-test
+$XFS_IO_PROG -f \
+ -c 'pwrite -b 1m 0 1m' \
+ -c 'pwrite -b 1m 10g 1m' \
+ -c 'pwrite -b 1m 20g 1m' \
+ -c fsync $testfile >> $seqres.full
+
+# First we check if directio errors get reported
+$XFS_IO_PROG -c 'healthmon -c -v' $SCRATCH_MNT >> $tmp.healthmon &
+sleep 1 # wait for program to start up
+_dmerror_load_error_table
+$XFS_IO_PROG -d -c 'pwrite -b 256k 12k 16k' $testfile >> $seqres.full
+$XFS_IO_PROG -d -c 'pread -b 256k 10g 16k' $testfile >> $seqres.full
+_dmerror_load_working_table
+
+_dmerror_unmount
+wait # for healthmon to finish
+_dmerror_mount
+
+# Next we check if buffered io errors get reported. We have to write something
+# before loading the error table to ensure the dquots get loaded.
+$XFS_IO_PROG -c 'pwrite -b 256k 20g 1k' -c fsync $testfile >> $seqres.full
+$XFS_IO_PROG -c 'healthmon -c -v' $SCRATCH_MNT >> $tmp.healthmon &
+sleep 1 # wait for program to start up
+_dmerror_load_error_table
+$XFS_IO_PROG -c 'pread -b 256k 12k 16k' $testfile >> $seqres.full
+$XFS_IO_PROG -c 'pwrite -b 256k 20g 16k' -c fsync $testfile >> $seqres.full
+_dmerror_load_working_table
+
+_dmerror_unmount
+wait # for healthmon to finish
+
+# Did we get errors?
+cat $tmp.healthmon >> $seqres.full
+filter_healer_errors < $tmp.healthmon
+
+_dmerror_cleanup
+
+status=0
+exit
diff --git a/tests/xfs/1878.out b/tests/xfs/1878.out
new file mode 100644
index 00000000000000..f64c440b1a6ed1
--- /dev/null
+++ b/tests/xfs/1878.out
@@ -0,0 +1,10 @@
+QA output created by 1878
+Format and mount
+pwrite: Input/output error
+pread: Input/output error
+pread: Input/output error
+fsync: Input/output error
+VICTIM pos NUM len NUM: directio_write: Input/output error
+VICTIM pos NUM len NUM: directio_read: Input/output error
+VICTIM pos NUM len NUM: buffered_read: Input/output error
+VICTIM pos NUM len NUM: buffered_write: Input/output error
^ permalink raw reply related [flat|nested] 45+ messages in thread
* Re: [PATCH 03/14] xfs: test io error reporting via healthmon
2026-03-10 3:50 ` [PATCH 03/14] xfs: test io " Darrick J. Wong
@ 2026-03-13 18:53 ` Zorro Lang
0 siblings, 0 replies; 45+ messages in thread
From: Zorro Lang @ 2026-03-13 18:53 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: fstests, linux-xfs
On Mon, Mar 09, 2026 at 08:50:55PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
>
> Create a new test to make sure the kernel can report IO errors via
> health monitoring.
>
> Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> ---
> tests/xfs/1878 | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> tests/xfs/1878.out | 10 ++++++
> 2 files changed, 103 insertions(+)
> create mode 100755 tests/xfs/1878
> create mode 100644 tests/xfs/1878.out
>
>
> diff --git a/tests/xfs/1878 b/tests/xfs/1878
> new file mode 100755
> index 00000000000000..1ff6ae040fb193
> --- /dev/null
> +++ b/tests/xfs/1878
> @@ -0,0 +1,93 @@
> +#! /bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
> +#
> +# FS QA Test No. 1878
> +#
> +# Attempt to read and write a file in buffered and directio mode with the
> +# health monitoring program running. Check that healthmon observes all four
> +# types of IO errors.
> +#
> +. ./common/preamble
> +_begin_fstest auto quick eio selfhealing
> +
> +_cleanup()
> +{
> + cd /
test -n "$healer_pid" && kill $healer_pid &>/dev/null
wait
> + rm -rf $tmp.* $testdir
> + _dmerror_cleanup
> +}
> +
> +. ./common/filter
> +. ./common/dmerror
> +
> +_require_scratch_nocheck
> +_require_xfs_io_command healthmon
> +_require_dm_target error
> +
> +filter_healer_errors() {
> + _filter_scratch | \
> + grep -E '(buffered|directio)' | \
> + sed \
> + -e 's/ino [0-9]*/ino NUM/g' \
> + -e 's/gen 0x[0-9a-f]*/gen NUM/g' \
> + -e 's/pos [0-9]*/pos NUM/g' \
> + -e 's/len [0-9]*/len NUM/g' \
> + -e 's|SCRATCH_MNT/a|VICTIM|g' \
> + -e 's|SCRATCH_MNT ino NUM gen NUM|VICTIM|g' | \
> + uniq
> +}
> +
> +# Disable the scratch rt device to avoid test failures relating to the rt
> +# bitmap consuming all the free space in our small data device.
> +unset SCRATCH_RTDEV
> +
> +echo "Format and mount"
> +_scratch_mkfs > $seqres.full 2>&1
> +_dmerror_init no_log
> +_dmerror_mount
> +
> +_require_fs_space $SCRATCH_MNT 65536
> +
> +# Create a file with written regions far enough apart that the pagecache can't
> +# possibly be caching the regions with a single folio.
> +testfile=$SCRATCH_MNT/fsync-err-test
> +$XFS_IO_PROG -f \
> + -c 'pwrite -b 1m 0 1m' \
> + -c 'pwrite -b 1m 10g 1m' \
> + -c 'pwrite -b 1m 20g 1m' \
> + -c fsync $testfile >> $seqres.full
> +
> +# First we check if directio errors get reported
> +$XFS_IO_PROG -c 'healthmon -c -v' $SCRATCH_MNT >> $tmp.healthmon &
healer_pid=$!
> +sleep 1 # wait for program to start up
> +_dmerror_load_error_table
> +$XFS_IO_PROG -d -c 'pwrite -b 256k 12k 16k' $testfile >> $seqres.full
> +$XFS_IO_PROG -d -c 'pread -b 256k 10g 16k' $testfile >> $seqres.full
> +_dmerror_load_working_table
> +
> +_dmerror_unmount
> +wait # for healthmon to finish
unset healer_pid
> +_dmerror_mount
> +
> +# Next we check if buffered io errors get reported. We have to write something
> +# before loading the error table to ensure the dquots get loaded.
> +$XFS_IO_PROG -c 'pwrite -b 256k 20g 1k' -c fsync $testfile >> $seqres.full
> +$XFS_IO_PROG -c 'healthmon -c -v' $SCRATCH_MNT >> $tmp.healthmon &
healer_pid=$!
> +sleep 1 # wait for program to start up
> +_dmerror_load_error_table
> +$XFS_IO_PROG -c 'pread -b 256k 12k 16k' $testfile >> $seqres.full
> +$XFS_IO_PROG -c 'pwrite -b 256k 20g 16k' -c fsync $testfile >> $seqres.full
> +_dmerror_load_working_table
> +
> +_dmerror_unmount
> +wait # for healthmon to finish
unset healer_pid
> +
> +# Did we get errors?
> +cat $tmp.healthmon >> $seqres.full
> +filter_healer_errors < $tmp.healthmon
> +
> +_dmerror_cleanup
> +
> +status=0
> +exit
_exit 0
Others look good to me, I'll change these when I merge this patch.
Reviewed-by: Zorro Lang <zlang@redhat.com>
> diff --git a/tests/xfs/1878.out b/tests/xfs/1878.out
> new file mode 100644
> index 00000000000000..f64c440b1a6ed1
> --- /dev/null
> +++ b/tests/xfs/1878.out
> @@ -0,0 +1,10 @@
> +QA output created by 1878
> +Format and mount
> +pwrite: Input/output error
> +pread: Input/output error
> +pread: Input/output error
> +fsync: Input/output error
> +VICTIM pos NUM len NUM: directio_write: Input/output error
> +VICTIM pos NUM len NUM: directio_read: Input/output error
> +VICTIM pos NUM len NUM: buffered_read: Input/output error
> +VICTIM pos NUM len NUM: buffered_write: Input/output error
>
^ permalink raw reply [flat|nested] 45+ messages in thread
* [PATCH 04/14] xfs: set up common code for testing xfs_healer
2026-03-10 3:42 ` [PATCHSET v9 2/2] fstests: autonomous self healing of filesystems Darrick J. Wong
` (2 preceding siblings ...)
2026-03-10 3:50 ` [PATCH 03/14] xfs: test io " Darrick J. Wong
@ 2026-03-10 3:51 ` Darrick J. Wong
2026-03-13 19:04 ` Zorro Lang
2026-03-14 20:37 ` Zorro Lang
2026-03-10 3:51 ` [PATCH 05/14] xfs: test xfs_healer's event handling Darrick J. Wong
` (10 subsequent siblings)
14 siblings, 2 replies; 45+ messages in thread
From: Darrick J. Wong @ 2026-03-10 3:51 UTC (permalink / raw)
To: zlang, djwong; +Cc: fstests, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Add a bunch of common code so that we can test the xfs_healer daemon.
Most of the changes here are to make it easier to manage the systemd
service units for xfs_healer and xfs_scrub.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
common/config | 14 ++++++++
common/rc | 5 +++
common/systemd | 39 ++++++++++++++++++++++
common/xfs | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
tests/xfs/802 | 4 +-
5 files changed, 158 insertions(+), 2 deletions(-)
diff --git a/common/config b/common/config
index 1420e35ddfee42..8468a60081f50c 100644
--- a/common/config
+++ b/common/config
@@ -161,6 +161,20 @@ export XFS_ADMIN_PROG="$(type -P xfs_admin)"
export XFS_GROWFS_PROG=$(type -P xfs_growfs)
export XFS_SPACEMAN_PROG="$(type -P xfs_spaceman)"
export XFS_SCRUB_PROG="$(type -P xfs_scrub)"
+
+XFS_HEALER_PROG="$(type -P xfs_healer)"
+XFS_HEALER_START_PROG="$(type -P xfs_healer_start)"
+
+# If not found, try the ones installed in libexec
+if [ ! -x "$XFS_HEALER_PROG" ] && [ -e /usr/libexec/xfsprogs/xfs_healer ]; then
+ XFS_HEALER_PROG=/usr/libexec/xfsprogs/xfs_healer
+fi
+if [ ! -x "$XFS_HEALER_START_PROG" ] && [ -e /usr/libexec/xfsprogs/xfs_healer_start ]; then
+ XFS_HEALER_START_PROG=/usr/libexec/xfsprogs/xfs_healer_start
+fi
+export XFS_HEALER_PROG
+export XFS_HEALER_START_PROG
+
export XFS_PARALLEL_REPAIR_PROG="$(type -P xfs_prepair)"
export XFS_PARALLEL_REPAIR64_PROG="$(type -P xfs_prepair64)"
export __XFSDUMP_PROG="$(type -P xfsdump)"
diff --git a/common/rc b/common/rc
index ccb78baf5bd41a..0b740595d231b5 100644
--- a/common/rc
+++ b/common/rc
@@ -3021,6 +3021,11 @@ _require_xfs_io_command()
"label")
testio=`$XFS_IO_PROG -c "label" $TEST_DIR 2>&1`
;;
+ "verifymedia")
+ testio=`$XFS_IO_PROG -x -c "verifymedia $* 0 0" 2>&1`
+ echo $testio | grep -q "invalid option" && \
+ _notrun "xfs_io $command support is missing"
+ ;;
"open")
# -c "open $f" is broken in xfs_io <= 4.8. Along with the fix,
# a new -C flag was introduced to execute one shot commands.
diff --git a/common/systemd b/common/systemd
index b2e24f267b2d93..589aad1bef2637 100644
--- a/common/systemd
+++ b/common/systemd
@@ -44,6 +44,18 @@ _systemd_unit_active() {
test "$(systemctl is-active "$1")" = "active"
}
+# Wait for up to a certain number of seconds for a service to reach inactive
+# state.
+_systemd_unit_wait() {
+ local svcname="$1"
+ local timeout="${2:-30}"
+
+ for ((i = 0; i < (timeout * 2); i++)); do
+ test "$(systemctl is-active "$svcname")" = "inactive" && break
+ sleep 0.5
+ done
+}
+
_require_systemd_unit_active() {
_require_systemd_unit_defined "$1"
_systemd_unit_active "$1" || \
@@ -71,3 +83,30 @@ _systemd_unit_status() {
_systemd_installed || return 1
systemctl status "$1"
}
+
+# Start a running systemd unit
+_systemd_unit_start() {
+ systemctl start "$1"
+}
+# Stop a running systemd unit
+_systemd_unit_stop() {
+ systemctl stop "$1"
+}
+
+# Mask or unmask a running systemd unit
+_systemd_unit_mask() {
+ systemctl mask "$1"
+}
+_systemd_unit_unmask() {
+ systemctl unmask "$1"
+}
+_systemd_unit_masked() {
+ systemctl status "$1" 2>/dev/null | grep -q 'Loaded: masked'
+}
+
+_systemd_service_unit_path() {
+ local template="$1"
+ local path="$2"
+
+ systemd-escape --template "$template" --path "$path"
+}
diff --git a/common/xfs b/common/xfs
index 7fa0db2e26b4c9..f276325df8fbac 100644
--- a/common/xfs
+++ b/common/xfs
@@ -2301,3 +2301,101 @@ _filter_bmap_gno()
if ($ag =~ /\d+/) {print "$ag "} ;
'
}
+
+# Run the xfs_healer program on some filesystem
+_xfs_healer() {
+ $XFS_HEALER_PROG "$@"
+}
+
+# Compute the xfs_healer systemd service instance name for a given path.
+# This is easy because xfs_healer has always supported --svcname.
+_xfs_healer_svcname()
+{
+ _xfs_healer --svcname "$@"
+}
+
+# Compute the xfs_scrub systemd service instance name for a given path. This
+# is tricky because xfs_scrub only gained --svcname when xfs_healer was made.
+_xfs_scrub_svcname()
+{
+ local ret
+
+ if ret="$($XFS_SCRUB_PROG --svcname "$@")"; then
+ echo "$ret"
+ return 0
+ fi
+
+ # ...but if not, we can fall back to brute force systemd invocations.
+ _systemd_service_unit_path "xfs_scrub@.service" "$*"
+}
+
+# Run the xfs_healer program on the scratch fs
+_scratch_xfs_healer() {
+ _xfs_healer "$@" "$SCRATCH_MNT"
+}
+
+# Turn off the background xfs_healer service if any so that it doesn't fix
+# injected metadata errors; then start a background copy of xfs_healer to
+# capture that.
+_invoke_xfs_healer() {
+ local mount="$1"
+ local logfile="$2"
+ shift; shift
+
+ if _systemd_is_running; then
+ local svc="$(_xfs_healer_svcname "$mount")"
+ _systemd_unit_stop "$svc" &>> $seqres.full
+ fi
+
+ $XFS_HEALER_PROG "$mount" "$@" &> "$logfile" &
+ XFS_HEALER_PID=$!
+
+ # Wait 30s for the healer program to really start up
+ for ((i = 0; i < 60; i++)); do
+ test -e "$logfile" && \
+ grep -q 'monitoring started' "$logfile" && \
+ break
+ sleep 0.5
+ done
+}
+
+# Run our own copy of xfs_healer against the scratch device. Note that
+# unmounting the scratch fs causes the healer daemon to exit, so we don't need
+# to kill it explicitly from _cleanup.
+_scratch_invoke_xfs_healer() {
+ _invoke_xfs_healer "$SCRATCH_MNT" "$@"
+}
+
+# Unmount the filesystem to kill the xfs_healer instance started by
+# _invoke_xfs_healer, and wait up to a certain amount of time for it to exit.
+_kill_xfs_healer() {
+ local unmount="$1"
+ local timeout="${2:-30}"
+ local i
+
+ # Unmount fs to kill healer, then wait for it to finish
+ for ((i = 0; i < (timeout * 2); i++)); do
+ $unmount &>> $seqres.full && break
+ sleep 0.5
+ done
+
+ test -n "$XFS_HEALER_PID" && \
+ kill $XFS_HEALER_PID &>> $seqres.full
+ wait
+ unset XFS_HEALER_PID
+}
+
+# Unmount the scratch fs to kill a _scratch_invoke_xfs_healer instance.
+_scratch_kill_xfs_healer() {
+ local unmount="${1:-_scratch_unmount}"
+ shift
+
+ _kill_xfs_healer "$unmount" "$@"
+}
+
+# Does this mounted filesystem support xfs_healer?
+_require_xfs_healer()
+{
+ _xfs_healer --supported "$@" &>/dev/null || \
+ _notrun "health monitoring not supported on this kernel"
+}
diff --git a/tests/xfs/802 b/tests/xfs/802
index fc4767acb66a55..18312b15b645bd 100755
--- a/tests/xfs/802
+++ b/tests/xfs/802
@@ -105,8 +105,8 @@ run_scrub_service() {
}
echo "Scrub Scratch FS"
-scratch_path=$(systemd-escape --path "$SCRATCH_MNT")
-run_scrub_service xfs_scrub@$scratch_path
+svc="$(_xfs_scrub_svcname "$SCRATCH_MNT")"
+run_scrub_service "$svc"
find_scrub_trace "$SCRATCH_MNT"
# Remove the xfs_scrub_all media scan stamp directory (if specified) because we
^ permalink raw reply related [flat|nested] 45+ messages in thread* Re: [PATCH 04/14] xfs: set up common code for testing xfs_healer
2026-03-10 3:51 ` [PATCH 04/14] xfs: set up common code for testing xfs_healer Darrick J. Wong
@ 2026-03-13 19:04 ` Zorro Lang
2026-03-14 20:37 ` Zorro Lang
1 sibling, 0 replies; 45+ messages in thread
From: Zorro Lang @ 2026-03-13 19:04 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: fstests, linux-xfs
On Mon, Mar 09, 2026 at 08:51:10PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
>
> Add a bunch of common code so that we can test the xfs_healer daemon.
> Most of the changes here are to make it easier to manage the systemd
> service units for xfs_healer and xfs_scrub.
>
> Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> ---
Tests and looks good to me,
Reviewed-by: Zorro Lang <zlang@redhat.com>
> common/config | 14 ++++++++
> common/rc | 5 +++
> common/systemd | 39 ++++++++++++++++++++++
> common/xfs | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> tests/xfs/802 | 4 +-
> 5 files changed, 158 insertions(+), 2 deletions(-)
>
>
> diff --git a/common/config b/common/config
> index 1420e35ddfee42..8468a60081f50c 100644
> --- a/common/config
> +++ b/common/config
> @@ -161,6 +161,20 @@ export XFS_ADMIN_PROG="$(type -P xfs_admin)"
> export XFS_GROWFS_PROG=$(type -P xfs_growfs)
> export XFS_SPACEMAN_PROG="$(type -P xfs_spaceman)"
> export XFS_SCRUB_PROG="$(type -P xfs_scrub)"
> +
> +XFS_HEALER_PROG="$(type -P xfs_healer)"
> +XFS_HEALER_START_PROG="$(type -P xfs_healer_start)"
> +
> +# If not found, try the ones installed in libexec
> +if [ ! -x "$XFS_HEALER_PROG" ] && [ -e /usr/libexec/xfsprogs/xfs_healer ]; then
> + XFS_HEALER_PROG=/usr/libexec/xfsprogs/xfs_healer
> +fi
> +if [ ! -x "$XFS_HEALER_START_PROG" ] && [ -e /usr/libexec/xfsprogs/xfs_healer_start ]; then
> + XFS_HEALER_START_PROG=/usr/libexec/xfsprogs/xfs_healer_start
> +fi
> +export XFS_HEALER_PROG
> +export XFS_HEALER_START_PROG
> +
> export XFS_PARALLEL_REPAIR_PROG="$(type -P xfs_prepair)"
> export XFS_PARALLEL_REPAIR64_PROG="$(type -P xfs_prepair64)"
> export __XFSDUMP_PROG="$(type -P xfsdump)"
> diff --git a/common/rc b/common/rc
> index ccb78baf5bd41a..0b740595d231b5 100644
> --- a/common/rc
> +++ b/common/rc
> @@ -3021,6 +3021,11 @@ _require_xfs_io_command()
> "label")
> testio=`$XFS_IO_PROG -c "label" $TEST_DIR 2>&1`
> ;;
> + "verifymedia")
> + testio=`$XFS_IO_PROG -x -c "verifymedia $* 0 0" 2>&1`
> + echo $testio | grep -q "invalid option" && \
> + _notrun "xfs_io $command support is missing"
> + ;;
> "open")
> # -c "open $f" is broken in xfs_io <= 4.8. Along with the fix,
> # a new -C flag was introduced to execute one shot commands.
> diff --git a/common/systemd b/common/systemd
> index b2e24f267b2d93..589aad1bef2637 100644
> --- a/common/systemd
> +++ b/common/systemd
> @@ -44,6 +44,18 @@ _systemd_unit_active() {
> test "$(systemctl is-active "$1")" = "active"
> }
>
> +# Wait for up to a certain number of seconds for a service to reach inactive
> +# state.
> +_systemd_unit_wait() {
> + local svcname="$1"
> + local timeout="${2:-30}"
> +
> + for ((i = 0; i < (timeout * 2); i++)); do
> + test "$(systemctl is-active "$svcname")" = "inactive" && break
> + sleep 0.5
> + done
> +}
> +
> _require_systemd_unit_active() {
> _require_systemd_unit_defined "$1"
> _systemd_unit_active "$1" || \
> @@ -71,3 +83,30 @@ _systemd_unit_status() {
> _systemd_installed || return 1
> systemctl status "$1"
> }
> +
> +# Start a systemd unit
> +_systemd_unit_start() {
> + systemctl start "$1"
> +}
> +# Stop a running systemd unit
> +_systemd_unit_stop() {
> + systemctl stop "$1"
> +}
> +
> +# Mask or unmask a running systemd unit
> +_systemd_unit_mask() {
> + systemctl mask "$1"
> +}
> +_systemd_unit_unmask() {
> + systemctl unmask "$1"
> +}
> +_systemd_unit_masked() {
> + systemctl status "$1" 2>/dev/null | grep -q 'Loaded: masked'
> +}
> +
> +_systemd_service_unit_path() {
> + local template="$1"
> + local path="$2"
> +
> + systemd-escape --template "$template" --path "$path"
> +}
> diff --git a/common/xfs b/common/xfs
> index 7fa0db2e26b4c9..f276325df8fbac 100644
> --- a/common/xfs
> +++ b/common/xfs
> @@ -2301,3 +2301,101 @@ _filter_bmap_gno()
> if ($ag =~ /\d+/) {print "$ag "} ;
> '
> }
> +
> +# Run the xfs_healer program on some filesystem
> +_xfs_healer() {
> + $XFS_HEALER_PROG "$@"
> +}
> +
> +# Compute the xfs_healer systemd service instance name for a given path.
> +# This is easy because xfs_healer has always supported --svcname.
> +_xfs_healer_svcname()
> +{
> + _xfs_healer --svcname "$@"
> +}
> +
> +# Compute the xfs_scrub systemd service instance name for a given path. This
> +# is tricky because xfs_scrub only gained --svcname when xfs_healer was made.
> +_xfs_scrub_svcname()
> +{
> + local ret
> +
> + if ret="$($XFS_SCRUB_PROG --svcname "$@")"; then
> + echo "$ret"
> + return 0
> + fi
> +
> + # ...but if not, we can fall back to brute force systemd invocations.
> + _systemd_service_unit_path "xfs_scrub@.service" "$*"
> +}
> +
> +# Run the xfs_healer program on the scratch fs
> +_scratch_xfs_healer() {
> + _xfs_healer "$@" "$SCRATCH_MNT"
> +}
> +
> +# Turn off the background xfs_healer service, if any, so that it doesn't fix
> +# injected metadata errors; then start a background copy of xfs_healer to
> +# capture those errors ourselves.
> +_invoke_xfs_healer() {
> + local mount="$1"
> + local logfile="$2"
> + shift; shift
> +
> + if _systemd_is_running; then
> + local svc="$(_xfs_healer_svcname "$mount")"
> + _systemd_unit_stop "$svc" &>> $seqres.full
> + fi
> +
> + $XFS_HEALER_PROG "$mount" "$@" &> "$logfile" &
> + XFS_HEALER_PID=$!
> +
> + # Wait 30s for the healer program to really start up
> + for ((i = 0; i < 60; i++)); do
> + test -e "$logfile" && \
> + grep -q 'monitoring started' "$logfile" && \
> + break
> + sleep 0.5
> + done
> +}
> +
> +# Run our own copy of xfs_healer against the scratch device. Note that
> +# unmounting the scratch fs causes the healer daemon to exit, so we don't need
> +# to kill it explicitly from _cleanup.
> +_scratch_invoke_xfs_healer() {
> + _invoke_xfs_healer "$SCRATCH_MNT" "$@"
> +}
> +
> +# Unmount the filesystem to kill the xfs_healer instance started by
> +# _invoke_xfs_healer, and wait up to a certain amount of time for it to exit.
> +_kill_xfs_healer() {
> + local unmount="$1"
> + local timeout="${2:-30}"
> + local i
> +
> + # Unmount fs to kill healer, then wait for it to finish
> + for ((i = 0; i < (timeout * 2); i++)); do
> + $unmount &>> $seqres.full && break
> + sleep 0.5
> + done
> +
> + test -n "$XFS_HEALER_PID" && \
> + kill $XFS_HEALER_PID &>> $seqres.full
> + wait
> + unset XFS_HEALER_PID
> +}
> +
> +# Unmount the scratch fs to kill a _scratch_invoke_xfs_healer instance.
> +_scratch_kill_xfs_healer() {
> + local unmount="${1:-_scratch_unmount}"
> + shift
> +
> + _kill_xfs_healer "$unmount" "$@"
> +}
> +
> +# Does this mounted filesystem support xfs_healer?
> +_require_xfs_healer()
> +{
> + _xfs_healer --supported "$@" &>/dev/null || \
> + _notrun "health monitoring not supported on this kernel"
> +}
> diff --git a/tests/xfs/802 b/tests/xfs/802
> index fc4767acb66a55..18312b15b645bd 100755
> --- a/tests/xfs/802
> +++ b/tests/xfs/802
> @@ -105,8 +105,8 @@ run_scrub_service() {
> }
>
> echo "Scrub Scratch FS"
> -scratch_path=$(systemd-escape --path "$SCRATCH_MNT")
> -run_scrub_service xfs_scrub@$scratch_path
> +svc="$(_xfs_scrub_svcname "$SCRATCH_MNT")"
> +run_scrub_service "$svc"
> find_scrub_trace "$SCRATCH_MNT"
>
> # Remove the xfs_scrub_all media scan stamp directory (if specified) because we
>
^ permalink raw reply [flat|nested] 45+ messages in thread* Re: [PATCH 04/14] xfs: set up common code for testing xfs_healer
2026-03-10 3:51 ` [PATCH 04/14] xfs: set up common code for testing xfs_healer Darrick J. Wong
2026-03-13 19:04 ` Zorro Lang
@ 2026-03-14 20:37 ` Zorro Lang
2026-03-15 4:51 ` Darrick J. Wong
1 sibling, 1 reply; 45+ messages in thread
From: Zorro Lang @ 2026-03-14 20:37 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: fstests, linux-xfs
On Mon, Mar 09, 2026 at 08:51:10PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
>
> Add a bunch of common code so that we can test the xfs_healer daemon.
> Most of the changes here are to make it easier to manage the systemd
> service units for xfs_healer and xfs_scrub.
>
> Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> ---
> common/config | 14 ++++++++
> common/rc | 5 +++
> common/systemd | 39 ++++++++++++++++++++++
> common/xfs | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> tests/xfs/802 | 4 +-
> 5 files changed, 158 insertions(+), 2 deletions(-)
>
[snip]
> +# Compute the xfs_scrub systemd service instance name for a given path. This
> +# is tricky because xfs_scrub only gained --svcname when xfs_healer was made.
> +_xfs_scrub_svcname()
> +{
> + local ret
> +
> + if ret="$($XFS_SCRUB_PROG --svcname "$@")"; then
Better to be:
- if ret="$($XFS_SCRUB_PROG --svcname "$@")"; then
+ if ret="$($XFS_SCRUB_PROG --svcname "$@" 2>/dev/null)"; then
Or below xfs/802 will ...
> + echo "$ret"
> + return 0
> + fi
[snip]
> diff --git a/tests/xfs/802 b/tests/xfs/802
> index fc4767acb66a55..18312b15b645bd 100755
> --- a/tests/xfs/802
> +++ b/tests/xfs/802
> @@ -105,8 +105,8 @@ run_scrub_service() {
> }
>
> echo "Scrub Scratch FS"
> -scratch_path=$(systemd-escape --path "$SCRATCH_MNT")
> -run_scrub_service xfs_scrub@$scratch_path
> +svc="$(_xfs_scrub_svcname "$SCRATCH_MNT")"
... fails on old xfsprogs as:
--- /dev/fd/63 2026-03-13 19:16:15.217899866 -0400
+++ xfs/802.out.bad 2026-03-13 19:16:15.191834546 -0400
@@ -1,5 +1,21 @@
QA output created by 802
Format and populate
Scrub Scratch FS
+/usr/sbin/xfs_scrub: invalid option -- '-'
+Usage: xfs_scrub [OPTIONS] mountpoint
+
+Options:
+ -a count Stop after this many errors are found.
+ -b Background mode.
+ -C fd Print progress information to this fd.
+ -e behavior What to do if errors are found.
+ -k Do not FITRIM the free space.
+ -m path Path to /etc/mtab.
+ -n Dry run. Do not modify anything.
+ -p Only optimize, do not fix corruptions.
+ -T Display timing/usage information.
+ -v Verbose output.
+ -V Print version.
+ -x Scrub file data too.
Scrub Everything
Scrub Done
If you don't have more suggestion, I'll help to change that :)
Thanks,
Zorro
> +run_scrub_service "$svc"
> find_scrub_trace "$SCRATCH_MNT"
>
> # Remove the xfs_scrub_all media scan stamp directory (if specified) because we
>
^ permalink raw reply [flat|nested] 45+ messages in thread* Re: [PATCH 04/14] xfs: set up common code for testing xfs_healer
2026-03-14 20:37 ` Zorro Lang
@ 2026-03-15 4:51 ` Darrick J. Wong
0 siblings, 0 replies; 45+ messages in thread
From: Darrick J. Wong @ 2026-03-15 4:51 UTC (permalink / raw)
To: Zorro Lang; +Cc: fstests, linux-xfs
On Sun, Mar 15, 2026 at 04:37:59AM +0800, Zorro Lang wrote:
> On Mon, Mar 09, 2026 at 08:51:10PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <djwong@kernel.org>
> >
> > Add a bunch of common code so that we can test the xfs_healer daemon.
> > Most of the changes here are to make it easier to manage the systemd
> > service units for xfs_healer and xfs_scrub.
> >
> > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> > ---
> > common/config | 14 ++++++++
> > common/rc | 5 +++
> > common/systemd | 39 ++++++++++++++++++++++
> > common/xfs | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> > tests/xfs/802 | 4 +-
> > 5 files changed, 158 insertions(+), 2 deletions(-)
> >
>
> [snip]
>
> > +# Compute the xfs_scrub systemd service instance name for a given path. This
> > +# is tricky because xfs_scrub only gained --svcname when xfs_healer was made.
> > +_xfs_scrub_svcname()
> > +{
> > + local ret
> > +
> > + if ret="$($XFS_SCRUB_PROG --svcname "$@")"; then
>
> Better to be:
>
> - if ret="$($XFS_SCRUB_PROG --svcname "$@")"; then
> + if ret="$($XFS_SCRUB_PROG --svcname "$@" 2>/dev/null)"; then
>
> Or below xfs/802 will ...
>
> > + echo "$ret"
> > + return 0
> > + fi
>
> [snip]
>
> > diff --git a/tests/xfs/802 b/tests/xfs/802
> > index fc4767acb66a55..18312b15b645bd 100755
> > --- a/tests/xfs/802
> > +++ b/tests/xfs/802
> > @@ -105,8 +105,8 @@ run_scrub_service() {
> > }
> >
> > echo "Scrub Scratch FS"
> > -scratch_path=$(systemd-escape --path "$SCRATCH_MNT")
> > -run_scrub_service xfs_scrub@$scratch_path
> > +svc="$(_xfs_scrub_svcname "$SCRATCH_MNT")"
>
> ... fails on old xfsprogs as:
>
> --- /dev/fd/63 2026-03-13 19:16:15.217899866 -0400
> +++ xfs/802.out.bad 2026-03-13 19:16:15.191834546 -0400
> @@ -1,5 +1,21 @@
> QA output created by 802
> Format and populate
> Scrub Scratch FS
> +/usr/sbin/xfs_scrub: invalid option -- '-'
> +Usage: xfs_scrub [OPTIONS] mountpoint
> +
> +Options:
> + -a count Stop after this many errors are found.
> + -b Background mode.
> + -C fd Print progress information to this fd.
> + -e behavior What to do if errors are found.
> + -k Do not FITRIM the free space.
> + -m path Path to /etc/mtab.
> + -n Dry run. Do not modify anything.
> + -p Only optimize, do not fix corruptions.
> + -T Display timing/usage information.
> + -v Verbose output.
> + -V Print version.
> + -x Scrub file data too.
> Scrub Everything
> Scrub Done
>
> If you don't have more suggestion, I'll help to change that :)
That seems the proper correction to make. Thanks for your help!
--D
> Thanks,
> Zorro
>
> > +run_scrub_service "$svc"
> > find_scrub_trace "$SCRATCH_MNT"
> >
> > # Remove the xfs_scrub_all media scan stamp directory (if specified) because we
> >
>
^ permalink raw reply [flat|nested] 45+ messages in thread
* [PATCH 05/14] xfs: test xfs_healer's event handling
2026-03-10 3:42 ` [PATCHSET v9 2/2] fstests: autonomous self healing of filesystems Darrick J. Wong
` (3 preceding siblings ...)
2026-03-10 3:51 ` [PATCH 04/14] xfs: set up common code for testing xfs_healer Darrick J. Wong
@ 2026-03-10 3:51 ` Darrick J. Wong
2026-03-13 19:19 ` Zorro Lang
2026-03-10 3:51 ` [PATCH 06/14] xfs: test xfs_healer can fix a filesystem Darrick J. Wong
` (9 subsequent siblings)
14 siblings, 1 reply; 45+ messages in thread
From: Darrick J. Wong @ 2026-03-10 3:51 UTC (permalink / raw)
To: zlang, djwong; +Cc: fstests, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Make sure that xfs_healer can handle every type of event that the kernel
can throw at it by initiating a full scrub of a test filesystem.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
tests/xfs/1882 | 44 ++++++++++++++++++++++++++++++++++++++++++++
tests/xfs/1882.out | 2 ++
2 files changed, 46 insertions(+)
create mode 100755 tests/xfs/1882
create mode 100644 tests/xfs/1882.out
diff --git a/tests/xfs/1882 b/tests/xfs/1882
new file mode 100755
index 00000000000000..2fb4589418401e
--- /dev/null
+++ b/tests/xfs/1882
@@ -0,0 +1,44 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
+#
+# FS QA Test 1882
+#
+# Make sure that xfs_healer correctly handles all the reports that it gets
+# from the kernel. We simulate this by using the --everything mode so we get
+# all the events, not just the sickness reports.
+#
+. ./common/preamble
+_begin_fstest auto selfhealing
+
+. ./common/filter
+. ./common/fuzzy
+. ./common/systemd
+. ./common/populate
+
+_require_scrub
+_require_xfs_io_command "scrub" # online check support
+_require_command "$XFS_HEALER_PROG" "xfs_healer"
+_require_scratch
+
+# Does this fs support health monitoring?
+_scratch_mkfs >> $seqres.full
+_scratch_mount
+_require_xfs_healer $SCRATCH_MNT
+_scratch_unmount
+
+# Create a sample fs with all the goodies
+_scratch_populate_cached nofill &>> $seqres.full
+_scratch_mount
+
+_scratch_invoke_xfs_healer "$tmp.healer" --everything
+
+# Run scrub to make some noise
+_scratch_scrub -b -n >> $seqres.full
+
+_scratch_kill_xfs_healer
+cat $tmp.healer >> $seqres.full
+
+echo Silence is golden
+status=0
+exit
diff --git a/tests/xfs/1882.out b/tests/xfs/1882.out
new file mode 100644
index 00000000000000..9b31ccb735cabd
--- /dev/null
+++ b/tests/xfs/1882.out
@@ -0,0 +1,2 @@
+QA output created by 1882
+Silence is golden
^ permalink raw reply related [flat|nested] 45+ messages in thread* Re: [PATCH 05/14] xfs: test xfs_healer's event handling
2026-03-10 3:51 ` [PATCH 05/14] xfs: test xfs_healer's event handling Darrick J. Wong
@ 2026-03-13 19:19 ` Zorro Lang
0 siblings, 0 replies; 45+ messages in thread
From: Zorro Lang @ 2026-03-13 19:19 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: fstests, linux-xfs
On Mon, Mar 09, 2026 at 08:51:26PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
>
> Make sure that xfs_healer can handle every type of event that the kernel
> can throw at it by initiating a full scrub of a test filesystem.
>
> Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> ---
Tests and looks good to me,
Reviewed-by: Zorro Lang <zlang@redhat.com>
> tests/xfs/1882 | 44 ++++++++++++++++++++++++++++++++++++++++++++
> tests/xfs/1882.out | 2 ++
> 2 files changed, 46 insertions(+)
> create mode 100755 tests/xfs/1882
> create mode 100644 tests/xfs/1882.out
>
>
> diff --git a/tests/xfs/1882 b/tests/xfs/1882
> new file mode 100755
> index 00000000000000..2fb4589418401e
> --- /dev/null
> +++ b/tests/xfs/1882
> @@ -0,0 +1,44 @@
> +#! /bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
> +#
> +# FS QA Test 1882
> +#
> +# Make sure that xfs_healer correctly handles all the reports that it gets
> +# from the kernel. We simulate this by using the --everything mode so we get
> +# all the events, not just the sickness reports.
> +#
> +. ./common/preamble
> +_begin_fstest auto selfhealing
> +
> +. ./common/filter
> +. ./common/fuzzy
> +. ./common/systemd
> +. ./common/populate
> +
> +_require_scrub
> +_require_xfs_io_command "scrub" # online check support
> +_require_command "$XFS_HEALER_PROG" "xfs_healer"
> +_require_scratch
> +
> +# Does this fs support health monitoring?
> +_scratch_mkfs >> $seqres.full
> +_scratch_mount
> +_require_xfs_healer $SCRATCH_MNT
> +_scratch_unmount
> +
> +# Create a sample fs with all the goodies
> +_scratch_populate_cached nofill &>> $seqres.full
> +_scratch_mount
> +
> +_scratch_invoke_xfs_healer "$tmp.healer" --everything
> +
> +# Run scrub to make some noise
> +_scratch_scrub -b -n >> $seqres.full
> +
> +_scratch_kill_xfs_healer
> +cat $tmp.healer >> $seqres.full
> +
> +echo Silence is golden
> +status=0
> +exit
> diff --git a/tests/xfs/1882.out b/tests/xfs/1882.out
> new file mode 100644
> index 00000000000000..9b31ccb735cabd
> --- /dev/null
> +++ b/tests/xfs/1882.out
> @@ -0,0 +1,2 @@
> +QA output created by 1882
> +Silence is golden
>
^ permalink raw reply [flat|nested] 45+ messages in thread
* [PATCH 06/14] xfs: test xfs_healer can fix a filesystem
2026-03-10 3:42 ` [PATCHSET v9 2/2] fstests: autonomous self healing of filesystems Darrick J. Wong
` (4 preceding siblings ...)
2026-03-10 3:51 ` [PATCH 05/14] xfs: test xfs_healer's event handling Darrick J. Wong
@ 2026-03-10 3:51 ` Darrick J. Wong
2026-03-13 19:28 ` Zorro Lang
2026-03-10 3:51 ` [PATCH 07/14] xfs: test xfs_healer can report file I/O errors Darrick J. Wong
` (8 subsequent siblings)
14 siblings, 1 reply; 45+ messages in thread
From: Darrick J. Wong @ 2026-03-10 3:51 UTC (permalink / raw)
To: zlang, djwong; +Cc: fstests, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Make sure that xfs_healer can actually fix an injected metadata corruption.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
tests/xfs/1884 | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++++
tests/xfs/1884.out | 2 +
2 files changed, 91 insertions(+)
create mode 100755 tests/xfs/1884
create mode 100644 tests/xfs/1884.out
diff --git a/tests/xfs/1884 b/tests/xfs/1884
new file mode 100755
index 00000000000000..1fa6457ad25203
--- /dev/null
+++ b/tests/xfs/1884
@@ -0,0 +1,89 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
+#
+# FS QA Test 1884
+#
+# Ensure that autonomous self healing fixes the filesystem correctly.
+#
+. ./common/preamble
+_begin_fstest auto selfhealing
+
+. ./common/filter
+. ./common/fuzzy
+. ./common/systemd
+
+_require_scrub
+_require_xfs_io_command "repair" # online repair support
+_require_xfs_db_command "blocktrash"
+_require_command "$XFS_HEALER_PROG" "xfs_healer"
+_require_command "$XFS_PROPERTY_PROG" "xfs_property"
+_require_scratch
+
+_scratch_mkfs >> $seqres.full
+_scratch_mount
+
+_xfs_has_feature $SCRATCH_MNT rmapbt || \
+ _notrun "reverse mapping required to test directory auto-repair"
+_xfs_has_feature $SCRATCH_MNT parent || \
+ _notrun "parent pointers required to test directory auto-repair"
+_require_xfs_healer $SCRATCH_MNT --repair
+
+# Configure the filesystem for automatic repair of the filesystem.
+$XFS_PROPERTY_PROG $SCRATCH_MNT set autofsck=repair >> $seqres.full
+
+# Create a largeish directory
+dblksz=$(_xfs_get_dir_blocksize "$SCRATCH_MNT")
+echo testdata > $SCRATCH_MNT/a
+mkdir -p "$SCRATCH_MNT/some/victimdir"
+for ((i = 0; i < (dblksz / 255); i++)); do
+ fname="$(printf "%0255d" "$i")"
+ ln $SCRATCH_MNT/a $SCRATCH_MNT/some/victimdir/$fname
+done
+
+# Did we get at least two dir blocks?
+dirsize=$(stat -c '%s' $SCRATCH_MNT/some/victimdir)
+test "$dirsize" -gt "$dblksz" || echo "failed to create two-block directory"
+
+# Break the directory, remount filesystem
+_scratch_unmount
+_scratch_xfs_db -x \
+ -c 'path /some/victimdir' \
+ -c 'bmap' \
+ -c 'dblock 1' \
+ -c 'blocktrash -z -0 -o 0 -x 2048 -y 2048 -n 2048' >> $seqres.full
+_scratch_mount
+
+_scratch_invoke_xfs_healer "$tmp.healer" --repair
+
+# Access the broken directory to trigger a repair, then poll the directory
+# for 5 seconds to see if it gets fixed without us needing to intervene.
+ls $SCRATCH_MNT/some/victimdir > /dev/null 2> $tmp.err
+_filter_scratch < $tmp.err
+try=0
+while [ $try -lt 50 ] && grep -q 'Structure needs cleaning' $tmp.err; do
+ echo "try $try saw corruption" >> $seqres.full
+ sleep 0.1
+ ls $SCRATCH_MNT/some/victimdir > /dev/null 2> $tmp.err
+ try=$((try + 1))
+done
+echo "try $try no longer saw corruption or gave up" >> $seqres.full
+_filter_scratch < $tmp.err
+
+# List the dirents of /some/victimdir to see if it stops reporting corruption
+ls $SCRATCH_MNT/some/victimdir > /dev/null 2> $tmp.err
+try=0
+while [ $try -lt 50 ] && grep -q 'Structure needs cleaning' $tmp.err; do
+ echo "retry $try still saw corruption" >> $seqres.full
+ sleep 0.1
+ ls $SCRATCH_MNT/some/victimdir > /dev/null 2> $tmp.err
+ try=$((try + 1))
+done
+echo "retry $try no longer saw corruption or gave up" >> $seqres.full
+
+# Unmount to kill the healer
+_scratch_kill_xfs_healer
+cat $tmp.healer >> $seqres.full
+
+status=0
+exit
diff --git a/tests/xfs/1884.out b/tests/xfs/1884.out
new file mode 100644
index 00000000000000..929e33da01f92c
--- /dev/null
+++ b/tests/xfs/1884.out
@@ -0,0 +1,2 @@
+QA output created by 1884
+ls: reading directory 'SCRATCH_MNT/some/victimdir': Structure needs cleaning
^ permalink raw reply related [flat|nested] 45+ messages in thread* Re: [PATCH 06/14] xfs: test xfs_healer can fix a filesystem
2026-03-10 3:51 ` [PATCH 06/14] xfs: test xfs_healer can fix a filesystem Darrick J. Wong
@ 2026-03-13 19:28 ` Zorro Lang
0 siblings, 0 replies; 45+ messages in thread
From: Zorro Lang @ 2026-03-13 19:28 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: fstests, linux-xfs
On Mon, Mar 09, 2026 at 08:51:41PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
>
> Make sure that xfs_healer can actually fix an injected metadata corruption.
>
> Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> ---
Tests and looks good to me,
Reviewed-by: Zorro Lang <zlang@redhat.com>
> tests/xfs/1884 | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> tests/xfs/1884.out | 2 +
> 2 files changed, 91 insertions(+)
> create mode 100755 tests/xfs/1884
> create mode 100644 tests/xfs/1884.out
>
>
> diff --git a/tests/xfs/1884 b/tests/xfs/1884
> new file mode 100755
> index 00000000000000..1fa6457ad25203
> --- /dev/null
> +++ b/tests/xfs/1884
> @@ -0,0 +1,89 @@
> +#! /bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
> +#
> +# FS QA Test 1884
> +#
> +# Ensure that autonomous self healing fixes the filesystem correctly.
> +#
> +. ./common/preamble
> +_begin_fstest auto selfhealing
> +
> +. ./common/filter
> +. ./common/fuzzy
> +. ./common/systemd
> +
> +_require_scrub
> +_require_xfs_io_command "repair" # online repair support
> +_require_xfs_db_command "blocktrash"
> +_require_command "$XFS_HEALER_PROG" "xfs_healer"
> +_require_command "$XFS_PROPERTY_PROG" "xfs_property"
> +_require_scratch
> +
> +_scratch_mkfs >> $seqres.full
> +_scratch_mount
> +
> +_xfs_has_feature $SCRATCH_MNT rmapbt || \
> + _notrun "reverse mapping required to test directory auto-repair"
> +_xfs_has_feature $SCRATCH_MNT parent || \
> + _notrun "parent pointers required to test directory auto-repair"
> +_require_xfs_healer $SCRATCH_MNT --repair
> +
> +# Configure the filesystem for automatic repair of the filesystem.
> +$XFS_PROPERTY_PROG $SCRATCH_MNT set autofsck=repair >> $seqres.full
> +
> +# Create a largeish directory
> +dblksz=$(_xfs_get_dir_blocksize "$SCRATCH_MNT")
> +echo testdata > $SCRATCH_MNT/a
> +mkdir -p "$SCRATCH_MNT/some/victimdir"
> +for ((i = 0; i < (dblksz / 255); i++)); do
> + fname="$(printf "%0255d" "$i")"
> + ln $SCRATCH_MNT/a $SCRATCH_MNT/some/victimdir/$fname
> +done
> +
> +# Did we get at least two dir blocks?
> +dirsize=$(stat -c '%s' $SCRATCH_MNT/some/victimdir)
> +test "$dirsize" -gt "$dblksz" || echo "failed to create two-block directory"
> +
> +# Break the directory, remount filesystem
> +_scratch_unmount
> +_scratch_xfs_db -x \
> + -c 'path /some/victimdir' \
> + -c 'bmap' \
> + -c 'dblock 1' \
> + -c 'blocktrash -z -0 -o 0 -x 2048 -y 2048 -n 2048' >> $seqres.full
> +_scratch_mount
> +
> +_scratch_invoke_xfs_healer "$tmp.healer" --repair
> +
> +# Access the broken directory to trigger a repair, then poll the directory
> +# for 5 seconds to see if it gets fixed without us needing to intervene.
> +ls $SCRATCH_MNT/some/victimdir > /dev/null 2> $tmp.err
> +_filter_scratch < $tmp.err
> +try=0
> +while [ $try -lt 50 ] && grep -q 'Structure needs cleaning' $tmp.err; do
> + echo "try $try saw corruption" >> $seqres.full
> + sleep 0.1
> + ls $SCRATCH_MNT/some/victimdir > /dev/null 2> $tmp.err
> + try=$((try + 1))
> +done
> +echo "try $try no longer saw corruption or gave up" >> $seqres.full
> +_filter_scratch < $tmp.err
> +
> +# List the dirents of /some/victimdir to see if it stops reporting corruption
> +ls $SCRATCH_MNT/some/victimdir > /dev/null 2> $tmp.err
> +try=0
> +while [ $try -lt 50 ] && grep -q 'Structure needs cleaning' $tmp.err; do
> + echo "retry $try still saw corruption" >> $seqres.full
> + sleep 0.1
> + ls $SCRATCH_MNT/some/victimdir > /dev/null 2> $tmp.err
> + try=$((try + 1))
> +done
> +echo "retry $try no longer saw corruption or gave up" >> $seqres.full
> +
> +# Unmount to kill the healer
> +_scratch_kill_xfs_healer
> +cat $tmp.healer >> $seqres.full
> +
> +status=0
> +exit
> diff --git a/tests/xfs/1884.out b/tests/xfs/1884.out
> new file mode 100644
> index 00000000000000..929e33da01f92c
> --- /dev/null
> +++ b/tests/xfs/1884.out
> @@ -0,0 +1,2 @@
> +QA output created by 1884
> +ls: reading directory 'SCRATCH_MNT/some/victimdir': Structure needs cleaning
>
^ permalink raw reply [flat|nested] 45+ messages in thread
* [PATCH 07/14] xfs: test xfs_healer can report file I/O errors
2026-03-10 3:42 ` [PATCHSET v9 2/2] fstests: autonomous self healing of filesystems Darrick J. Wong
` (5 preceding siblings ...)
2026-03-10 3:51 ` [PATCH 06/14] xfs: test xfs_healer can fix a filesystem Darrick J. Wong
@ 2026-03-10 3:51 ` Darrick J. Wong
2026-03-13 19:32 ` Zorro Lang
2026-03-10 3:52 ` [PATCH 08/14] xfs: test xfs_healer can report file media errors Darrick J. Wong
` (7 subsequent siblings)
14 siblings, 1 reply; 45+ messages in thread
From: Darrick J. Wong @ 2026-03-10 3:51 UTC (permalink / raw)
To: zlang, djwong; +Cc: fstests, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Make sure that xfs_healer can actually report file I/O errors.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
tests/xfs/1896 | 210 ++++++++++++++++++++++++++++++++++++++++++++++++++++
tests/xfs/1896.out | 21 +++++
2 files changed, 231 insertions(+)
create mode 100755 tests/xfs/1896
create mode 100644 tests/xfs/1896.out
diff --git a/tests/xfs/1896 b/tests/xfs/1896
new file mode 100755
index 00000000000000..911e1d5ee8a576
--- /dev/null
+++ b/tests/xfs/1896
@@ -0,0 +1,210 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
+#
+# FS QA Test No. 1896
+#
+# Check that xfs_healer can report file IO errors.
+
+. ./common/preamble
+_begin_fstest auto quick scrub eio selfhealing
+
+# Override the default cleanup function.
+_cleanup()
+{
+ cd /
+ rm -f $tmp.*
+ _dmerror_cleanup
+}
+
+# Import common functions.
+. ./common/fuzzy
+. ./common/filter
+. ./common/dmerror
+. ./common/systemd
+
+_require_scratch
+_require_scrub
+_require_command "$XFS_HEALER_PROG" "xfs_healer"
+_require_dm_target error
+_require_no_xfs_always_cow # no out of place writes
+
+# Ignore everything from the healer except for the four IO error log messages.
+# Strip out file handle and range information because the blocksize can vary.
+# Writeback and readahead can trigger multiple error messages due to retries,
+# hence the uniq.
+filter_healer_errors() {
+ _filter_scratch | \
+ grep -E '(buffered|directio)' | \
+ sed \
+ -e 's/ino [0-9]*/ino NUM/g' \
+ -e 's/gen 0x[0-9a-f]*/gen NUM/g' \
+ -e 's/pos [0-9]*/pos NUM/g' \
+ -e 's/len [0-9]*/len NUM/g' \
+ -e 's|SCRATCH_MNT/a|VICTIM|g' \
+ -e 's|SCRATCH_MNT ino NUM gen NUM|VICTIM|g' | \
+ sort | \
+ uniq
+}
+
+_scratch_mkfs >> $seqres.full
+
+#
+# The dm-error map added by this test doesn't work on zoned devices because
+# table sizes need to be aligned to the zone size, and even for zoned on
+# conventional this test will get confused because of the internal RT device.
+#
+# That check requires a mounted file system, so do a dummy mount before setting
+# up DM.
+#
+_scratch_mount
+_require_xfs_scratch_non_zoned
+_require_xfs_healer $SCRATCH_MNT
+_scratch_unmount
+
+_dmerror_init
+_dmerror_mount >> $seqres.full 2>&1
+
+# Write a file with 4 file blocks worth of data, figure out the LBA to target
+victim=$SCRATCH_MNT/a
+file_blksz=$(_get_file_block_size $SCRATCH_MNT)
+$XFS_IO_PROG -f -c "pwrite -S 0x58 0 $((4 * file_blksz))" -c "fsync" $victim >> $seqres.full
+unset errordev
+
+awk_len_prog='{print $6}'
+if _xfs_is_realtime_file $victim; then
+ if ! _xfs_has_feature $SCRATCH_MNT rtgroups; then
+ awk_len_prog='{print $4}'
+ fi
+ errordev="RT"
+fi
+bmap_str="$($XFS_IO_PROG -c "bmap -elpv" $victim | grep "^[[:space:]]*0:")"
+echo "$errordev:$bmap_str" >> $seqres.full
+
+phys="$(echo "$bmap_str" | $AWK_PROG '{print $3}')"
+len="$(echo "$bmap_str" | $AWK_PROG "$awk_len_prog")"
+
+fs_blksz=$(_get_block_size $SCRATCH_MNT)
+echo "file_blksz:$file_blksz:fs_blksz:$fs_blksz" >> $seqres.full
+kernel_sectors_per_fs_block=$((fs_blksz / 512))
+
+# Did we get at least 4 fs blocks worth of extent?
+min_len_sectors=$(( 4 * kernel_sectors_per_fs_block ))
+test "$len" -lt $min_len_sectors && \
+ _fail "could not format a long enough extent on an empty fs??"
+
+phys_start=$(echo "$phys" | sed -e 's/\.\..*//g')
+
+echo "$errordev:$phys:$len:$fs_blksz:$phys_start" >> $seqres.full
+echo "victim file:" >> $seqres.full
+od -tx1 -Ad -c $victim >> $seqres.full
+
+# Set the dmerror table so that all IO will pass through.
+_dmerror_reset_table
+
+cat >> $seqres.full << ENDL
+dmerror before:
+$DMERROR_TABLE
+$DMERROR_RTTABLE
+<end table>
+ENDL
+
+# All sector numbers that we feed to the kernel must be in units of 512b, but
+# they also must be aligned to the device's logical block size.
+logical_block_size=`$here/src/min_dio_alignment $SCRATCH_MNT $SCRATCH_DEV`
+kernel_sectors_per_device_lba=$((logical_block_size / 512))
+
+# Mark one of the device LBAs in the middle of the extent as bad. Target the
+# second LBA of the third block of the four-block file extent that we allocated
+# earlier, but without overflowing into the fourth file block.
+bad_sector=$(( phys_start + (2 * kernel_sectors_per_fs_block) ))
+bad_len=$kernel_sectors_per_device_lba
+if (( kernel_sectors_per_device_lba < kernel_sectors_per_fs_block )); then
+ bad_sector=$((bad_sector + kernel_sectors_per_device_lba))
+fi
+if (( (bad_sector % kernel_sectors_per_device_lba) != 0)); then
+ echo "bad_sector $bad_sector not congruent with device logical block size $logical_block_size"
+fi
+
+# Remount to flush the page cache, start the healer, and make the LBA bad
+_dmerror_unmount
+_dmerror_mount
+
+_scratch_invoke_xfs_healer "$tmp.healer"
+
+_dmerror_mark_range_bad $bad_sector $bad_len $errordev
+
+cat >> $seqres.full << ENDL
+dmerror after marking bad:
+$DMERROR_TABLE
+$DMERROR_RTTABLE
+<end table>
+ENDL
+
+_dmerror_load_error_table
+
+# See if buffered reads pick it up
+echo "Try buffered read"
+$XFS_IO_PROG -c "pread 0 $((4 * file_blksz))" $victim >> $seqres.full
+
+# See if directio reads pick it up
+echo "Try directio read"
+$XFS_IO_PROG -d -c "pread 0 $((4 * file_blksz))" $victim >> $seqres.full
+
+# See if directio writes pick it up
+echo "Try directio write"
+$XFS_IO_PROG -d -c "pwrite -S 0x58 0 $((4 * file_blksz))" -c fsync $victim >> $seqres.full
+
+# See if buffered writes pick it up
+echo "Try buffered write"
+$XFS_IO_PROG -c "pwrite -S 0x58 0 $((4 * file_blksz))" -c fsync $victim >> $seqres.full
+
+# Now mark the bad range good so that unmount won't fail due to IO errors.
+echo "Fix device"
+_dmerror_mark_range_good $bad_sector $bad_len $errordev
+_dmerror_load_error_table
+
+cat >> $seqres.full << ENDL
+dmerror after marking good:
+$DMERROR_TABLE
+$DMERROR_RTTABLE
+<end table>
+ENDL
+
+# Unmount filesystem to start fresh
+echo "Kill healer"
+_scratch_kill_xfs_healer _dmerror_unmount
+cat $tmp.healer >> $seqres.full
+cat $tmp.healer | filter_healer_errors
+
+# Start the healer again so that we can verify that the errors don't persist after
+# we flip back to the good dm table.
+echo "Remount and restart healer"
+_dmerror_mount
+_scratch_invoke_xfs_healer "$tmp.healer"
+
+# See if buffered reads pick it up
+echo "Try buffered read again"
+$XFS_IO_PROG -c "pread 0 $((4 * file_blksz))" $victim >> $seqres.full
+
+# See if directio reads pick it up
+echo "Try directio read again"
+$XFS_IO_PROG -d -c "pread 0 $((4 * file_blksz))" $victim >> $seqres.full
+
+# See if directio writes pick it up
+echo "Try directio write again"
+$XFS_IO_PROG -d -c "pwrite -S 0x58 0 $((4 * file_blksz))" -c fsync $victim >> $seqres.full
+
+# See if buffered writes pick it up
+echo "Try buffered write again"
+$XFS_IO_PROG -c "pwrite -S 0x58 0 $((4 * file_blksz))" -c fsync $victim >> $seqres.full
+
+# Unmount fs to kill healer, then wait for it to finish
+echo "Kill healer again"
+_scratch_kill_xfs_healer _dmerror_unmount
+cat $tmp.healer >> $seqres.full
+cat $tmp.healer | filter_healer_errors
+
+# success, all done
+status=0
+exit
diff --git a/tests/xfs/1896.out b/tests/xfs/1896.out
new file mode 100644
index 00000000000000..1378d4fad44522
--- /dev/null
+++ b/tests/xfs/1896.out
@@ -0,0 +1,21 @@
+QA output created by 1896
+Try buffered read
+pread: Input/output error
+Try directio read
+pread: Input/output error
+Try directio write
+pwrite: Input/output error
+Try buffered write
+fsync: Input/output error
+Fix device
+Kill healer
+VICTIM pos NUM len NUM: buffered_read: Input/output error
+VICTIM pos NUM len NUM: buffered_write: Input/output error
+VICTIM pos NUM len NUM: directio_read: Input/output error
+VICTIM pos NUM len NUM: directio_write: Input/output error
+Remount and restart healer
+Try buffered read again
+Try directio read again
+Try directio write again
+Try buffered write again
+Kill healer again
^ permalink raw reply related [flat|nested] 45+ messages in thread* Re: [PATCH 07/14] xfs: test xfs_healer can report file I/O errors
2026-03-10 3:51 ` [PATCH 07/14] xfs: test xfs_healer can report file I/O errors Darrick J. Wong
@ 2026-03-13 19:32 ` Zorro Lang
0 siblings, 0 replies; 45+ messages in thread
From: Zorro Lang @ 2026-03-13 19:32 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: fstests, linux-xfs
On Mon, Mar 09, 2026 at 08:51:57PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
>
> Make sure that xfs_healer can actually report file I/O errors.
>
> Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> ---
Tests and looks good to me,
Reviewed-by: Zorro Lang <zlang@redhat.com>
> tests/xfs/1896 | 210 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> tests/xfs/1896.out | 21 +++++
> 2 files changed, 231 insertions(+)
> create mode 100755 tests/xfs/1896
> create mode 100644 tests/xfs/1896.out
>
>
> diff --git a/tests/xfs/1896 b/tests/xfs/1896
> new file mode 100755
> index 00000000000000..911e1d5ee8a576
> --- /dev/null
> +++ b/tests/xfs/1896
> @@ -0,0 +1,210 @@
> +#! /bin/bash
> +# SPDX-License-Identifier: GPL-2.0-or-later
> +# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
> +#
> +# FS QA Test No. 1896
> +#
> +# Check that xfs_healer can report file IO errors.
> +
> +. ./common/preamble
> +_begin_fstest auto quick scrub eio selfhealing
> +
> +# Override the default cleanup function.
> +_cleanup()
> +{
> + cd /
> + rm -f $tmp.*
> + _dmerror_cleanup
> +}
> +
> +# Import common functions.
> +. ./common/fuzzy
> +. ./common/filter
> +. ./common/dmerror
> +. ./common/systemd
> +
> +_require_scratch
> +_require_scrub
> +_require_command "$XFS_HEALER_PROG" "xfs_healer"
> +_require_dm_target error
> +_require_no_xfs_always_cow # no out of place writes
> +
> +# Ignore everything from the healer except for the four IO error log messages.
> +# Strip out file handle and range information because the blocksize can vary.
> +# Writeback and readahead can trigger multiple error messages due to retries,
> +# hence the uniq.
> +filter_healer_errors() {
> + _filter_scratch | \
> + grep -E '(buffered|directio)' | \
> + sed \
> + -e 's/ino [0-9]*/ino NUM/g' \
> + -e 's/gen 0x[0-9a-f]*/gen NUM/g' \
> + -e 's/pos [0-9]*/pos NUM/g' \
> + -e 's/len [0-9]*/len NUM/g' \
> + -e 's|SCRATCH_MNT/a|VICTIM|g' \
> + -e 's|SCRATCH_MNT ino NUM gen NUM|VICTIM|g' | \
> + sort | \
> + uniq
> +}
> +
> +_scratch_mkfs >> $seqres.full
> +
> +#
> +# The dm-error map added by this test doesn't work on zoned devices because
> +# table sizes need to be aligned to the zone size, and even for zoned on
> +# conventional this test will get confused because of the internal RT device.
> +#
> +# That check requires a mounted file system, so do a dummy mount before setting
> +# up DM.
> +#
> +_scratch_mount
> +_require_xfs_scratch_non_zoned
> +_require_xfs_healer $SCRATCH_MNT
> +_scratch_unmount
> +
> +_dmerror_init
> +_dmerror_mount >> $seqres.full 2>&1
> +
> +# Write a file with 4 file blocks worth of data, figure out the LBA to target
> +victim=$SCRATCH_MNT/a
> +file_blksz=$(_get_file_block_size $SCRATCH_MNT)
> +$XFS_IO_PROG -f -c "pwrite -S 0x58 0 $((4 * file_blksz))" -c "fsync" $victim >> $seqres.full
> +unset errordev
> +
> +awk_len_prog='{print $6}'
> +if _xfs_is_realtime_file $victim; then
> + if ! _xfs_has_feature $SCRATCH_MNT rtgroups; then
> + awk_len_prog='{print $4}'
> + fi
> + errordev="RT"
> +fi
> +bmap_str="$($XFS_IO_PROG -c "bmap -elpv" $victim | grep "^[[:space:]]*0:")"
> +echo "$errordev:$bmap_str" >> $seqres.full
> +
> +phys="$(echo "$bmap_str" | $AWK_PROG '{print $3}')"
> +len="$(echo "$bmap_str" | $AWK_PROG "$awk_len_prog")"
> +
> +fs_blksz=$(_get_block_size $SCRATCH_MNT)
> +echo "file_blksz:$file_blksz:fs_blksz:$fs_blksz" >> $seqres.full
> +kernel_sectors_per_fs_block=$((fs_blksz / 512))
> +
> +# Did we get at least 4 fs blocks worth of extent?
> +min_len_sectors=$(( 4 * kernel_sectors_per_fs_block ))
> +test "$len" -lt $min_len_sectors && \
> + _fail "could not format a long enough extent on an empty fs??"
> +
> +phys_start=$(echo "$phys" | sed -e 's/\.\..*//g')
> +
> +echo "$errordev:$phys:$len:$fs_blksz:$phys_start" >> $seqres.full
> +echo "victim file:" >> $seqres.full
> +od -tx1 -Ad -c $victim >> $seqres.full
> +
> +# Set the dmerror table so that all IO will pass through.
> +_dmerror_reset_table
> +
> +cat >> $seqres.full << ENDL
> +dmerror before:
> +$DMERROR_TABLE
> +$DMERROR_RTTABLE
> +<end table>
> +ENDL
> +
> +# All sector numbers that we feed to the kernel must be in units of 512b, but
> +# they also must be aligned to the device's logical block size.
> +logical_block_size=`$here/src/min_dio_alignment $SCRATCH_MNT $SCRATCH_DEV`
> +kernel_sectors_per_device_lba=$((logical_block_size / 512))
> +
> +# Mark one of the device LBAs in the middle of the extent as bad. Target the
> +# second LBA of the third block of the four-block file extent that we allocated
> +# earlier, but without overflowing into the fourth file block.
> +bad_sector=$(( phys_start + (2 * kernel_sectors_per_fs_block) ))
> +bad_len=$kernel_sectors_per_device_lba
> +if (( kernel_sectors_per_device_lba < kernel_sectors_per_fs_block )); then
> + bad_sector=$((bad_sector + kernel_sectors_per_device_lba))
> +fi
> +if (( (bad_sector % kernel_sectors_per_device_lba) != 0)); then
> + echo "bad_sector $bad_sector not congruent with device logical block size $logical_block_size"
> +fi
> +
> +# Remount to flush the page cache, start the healer, and make the LBA bad
> +_dmerror_unmount
> +_dmerror_mount
> +
> +_scratch_invoke_xfs_healer "$tmp.healer"
> +
> +_dmerror_mark_range_bad $bad_sector $bad_len $errordev
> +
> +cat >> $seqres.full << ENDL
> +dmerror after marking bad:
> +$DMERROR_TABLE
> +$DMERROR_RTTABLE
> +<end table>
> +ENDL
> +
> +_dmerror_load_error_table
> +
> +# See if buffered reads pick it up
> +echo "Try buffered read"
> +$XFS_IO_PROG -c "pread 0 $((4 * file_blksz))" $victim >> $seqres.full
> +
> +# See if directio reads pick it up
> +echo "Try directio read"
> +$XFS_IO_PROG -d -c "pread 0 $((4 * file_blksz))" $victim >> $seqres.full
> +
> +# See if directio writes pick it up
> +echo "Try directio write"
> +$XFS_IO_PROG -d -c "pwrite -S 0x58 0 $((4 * file_blksz))" -c fsync $victim >> $seqres.full
> +
> +# See if buffered writes pick it up
> +echo "Try buffered write"
> +$XFS_IO_PROG -c "pwrite -S 0x58 0 $((4 * file_blksz))" -c fsync $victim >> $seqres.full
> +
> +# Now mark the bad range good so that unmount won't fail due to IO errors.
> +echo "Fix device"
> +_dmerror_mark_range_good $bad_sector $bad_len $errordev
> +_dmerror_load_error_table
> +
> +cat >> $seqres.full << ENDL
> +dmerror after marking good:
> +$DMERROR_TABLE
> +$DMERROR_RTTABLE
> +<end table>
> +ENDL
> +
> +# Unmount filesystem to start fresh
> +echo "Kill healer"
> +_scratch_kill_xfs_healer _dmerror_unmount
> +cat $tmp.healer >> $seqres.full
> +cat $tmp.healer | filter_healer_errors
> +
> +# Start the healer again so that we can verify that the errors don't persist after
> +# we flip back to the good dm table.
> +echo "Remount and restart healer"
> +_dmerror_mount
> +_scratch_invoke_xfs_healer "$tmp.healer"
> +
> +# See if buffered reads pick it up
> +echo "Try buffered read again"
> +$XFS_IO_PROG -c "pread 0 $((4 * file_blksz))" $victim >> $seqres.full
> +
> +# See if directio reads pick it up
> +echo "Try directio read again"
> +$XFS_IO_PROG -d -c "pread 0 $((4 * file_blksz))" $victim >> $seqres.full
> +
> +# See if directio writes pick it up
> +echo "Try directio write again"
> +$XFS_IO_PROG -d -c "pwrite -S 0x58 0 $((4 * file_blksz))" -c fsync $victim >> $seqres.full
> +
> +# See if buffered writes pick it up
> +echo "Try buffered write again"
> +$XFS_IO_PROG -c "pwrite -S 0x58 0 $((4 * file_blksz))" -c fsync $victim >> $seqres.full
> +
> +# Unmount fs to kill healer, then wait for it to finish
> +echo "Kill healer again"
> +_scratch_kill_xfs_healer _dmerror_unmount
> +cat $tmp.healer >> $seqres.full
> +cat $tmp.healer | filter_healer_errors
> +
> +# success, all done
> +status=0
> +exit
> diff --git a/tests/xfs/1896.out b/tests/xfs/1896.out
> new file mode 100644
> index 00000000000000..1378d4fad44522
> --- /dev/null
> +++ b/tests/xfs/1896.out
> @@ -0,0 +1,21 @@
> +QA output created by 1896
> +Try buffered read
> +pread: Input/output error
> +Try directio read
> +pread: Input/output error
> +Try directio write
> +pwrite: Input/output error
> +Try buffered write
> +fsync: Input/output error
> +Fix device
> +Kill healer
> +VICTIM pos NUM len NUM: buffered_read: Input/output error
> +VICTIM pos NUM len NUM: buffered_write: Input/output error
> +VICTIM pos NUM len NUM: directio_read: Input/output error
> +VICTIM pos NUM len NUM: directio_write: Input/output error
> +Remount and restart healer
> +Try buffered read again
> +Try directio read again
> +Try directio write again
> +Try buffered write again
> +Kill healer again
>
^ permalink raw reply [flat|nested] 45+ messages in thread
* [PATCH 08/14] xfs: test xfs_healer can report file media errors
2026-03-10 3:42 ` [PATCHSET v9 2/2] fstests: autonomous self healing of filesystems Darrick J. Wong
` (6 preceding siblings ...)
2026-03-10 3:51 ` [PATCH 07/14] xfs: test xfs_healer can report file I/O errors Darrick J. Wong
@ 2026-03-10 3:52 ` Darrick J. Wong
2026-03-13 19:36 ` Zorro Lang
2026-03-10 3:52 ` [PATCH 09/14] xfs: test xfs_healer can report filesystem shutdowns Darrick J. Wong
` (6 subsequent siblings)
14 siblings, 1 reply; 45+ messages in thread
From: Darrick J. Wong @ 2026-03-10 3:52 UTC (permalink / raw)
To: zlang, djwong; +Cc: fstests, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Make sure that xfs_healer can actually report media errors as found by the
kernel.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
tests/xfs/1897 | 172 ++++++++++++++++++++++++++++++++++++++++++++++++++++
tests/xfs/1897.out | 7 ++
2 files changed, 179 insertions(+)
create mode 100755 tests/xfs/1897
create mode 100755 tests/xfs/1897.out
diff --git a/tests/xfs/1897 b/tests/xfs/1897
new file mode 100755
index 00000000000000..4670c333a2d82c
--- /dev/null
+++ b/tests/xfs/1897
@@ -0,0 +1,172 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
+#
+# FS QA Test No. 1897
+#
+# Check that xfs_healer can report media errors.
+
+. ./common/preamble
+_begin_fstest auto quick scrub eio selfhealing
+
+_cleanup()
+{
+ cd /
+ rm -f $tmp.*
+ _dmerror_cleanup
+}
+
+. ./common/fuzzy
+. ./common/filter
+. ./common/dmerror
+. ./common/systemd
+
+_require_scratch
+_require_scrub
+_require_dm_target error
+_require_command "$XFS_HEALER_PROG" "xfs_healer"
+_require_xfs_io_command verifymedia
+
+filter_healer() {
+ _filter_scratch | \
+ grep -E '(media failed|media error)' | \
+ sed \
+ -e 's/datadev/DEVICE/g' \
+ -e 's/rtdev/DEVICE/g' \
+ -e 's/ino [0-9]*/ino NUM/g' \
+ -e 's/gen 0x[0-9a-f]*/gen NUM/g' \
+ -e 's/pos [0-9]*/pos NUM/g' \
+ -e 's/len [0-9]*/len NUM/g' \
+ -e 's/0x[0-9a-f]*/NUM/g' \
+ -e 's|SCRATCH_MNT/a|VICTIM|g' \
+ -e 's|SCRATCH_MNT ino NUM gen NUM|VICTIM|g'
+}
+
+filter_verify() {
+ sed -e 's/\([a-z]*dev\): verify error at offset \([0-9]*\) length \([0-9]*\)/DEVICE: verify error at offset XXX length XXX/g'
+}
+
+_scratch_mkfs >> $seqres.full
+
+# The dm-error map added by this test doesn't work on zoned devices because
+# table sizes need to be aligned to the zone size, and even for zoned on
+# conventional this test will get confused because of the internal RT device.
+#
+# That check requires a mounted file system, so do a dummy mount before setting
+# up DM.
+_scratch_mount
+_require_xfs_scratch_non_zoned
+_require_xfs_healer $SCRATCH_MNT
+_scratch_unmount
+
+_dmerror_init
+_dmerror_mount
+
+# Write a file with 4 file blocks worth of data, figure out the LBA to target
+victim=$SCRATCH_MNT/a
+file_blksz=$(_get_file_block_size $SCRATCH_MNT)
+$XFS_IO_PROG -f -c "pwrite -S 0x58 0 $((4 * file_blksz))" -c "fsync" $victim >> $seqres.full
+unset errordev
+verifymediadev="-d"
+
+awk_len_prog='{print $6}'
+if _xfs_is_realtime_file $victim; then
+ if ! _xfs_has_feature $SCRATCH_MNT rtgroups; then
+ awk_len_prog='{print $4}'
+ fi
+ errordev="RT"
+ verifymediadev="-r"
+fi
+bmap_str="$($XFS_IO_PROG -c "bmap -elpv" $victim | grep "^[[:space:]]*0:")"
+echo "$errordev:$bmap_str" >> $seqres.full
+
+phys="$(echo "$bmap_str" | $AWK_PROG '{print $3}')"
+len="$(echo "$bmap_str" | $AWK_PROG "$awk_len_prog")"
+
+fs_blksz=$(_get_block_size $SCRATCH_MNT)
+echo "file_blksz:$file_blksz:fs_blksz:$fs_blksz" >> $seqres.full
+kernel_sectors_per_fs_block=$((fs_blksz / 512))
+
+# Did we get at least 4 fs blocks worth of extent?
+min_len_sectors=$(( 4 * kernel_sectors_per_fs_block ))
+test "$len" -lt $min_len_sectors && \
+ _fail "could not format a long enough extent on an empty fs??"
+
+phys_start=$(echo "$phys" | sed -e 's/\.\..*//g')
+
+echo "$errordev:$phys:$len:$fs_blksz:$phys_start" >> $seqres.full
+echo "victim file:" >> $seqres.full
+od -tx1 -Ad -c $victim >> $seqres.full
+
+# Set the dmerror table so that all IO will pass through.
+_dmerror_reset_table
+
+cat >> $seqres.full << ENDL
+dmerror before:
+$DMERROR_TABLE
+$DMERROR_RTTABLE
+<end table>
+ENDL
+
+# All sector numbers that we feed to the kernel must be in units of 512b, but
+# they also must be aligned to the device's logical block size.
+logical_block_size=`$here/src/min_dio_alignment $SCRATCH_MNT $SCRATCH_DEV`
+kernel_sectors_per_device_lba=$((logical_block_size / 512))
+
+# Pretend that one of the device LBAs in the middle of the extent is bad. Target
+# the second LBA of the third block of the four-block file extent that we
+# allocated earlier, but without overflowing into the fourth file block.
+bad_sector=$(( phys_start + (2 * kernel_sectors_per_fs_block) ))
+bad_len=$kernel_sectors_per_device_lba
+if (( kernel_sectors_per_device_lba < kernel_sectors_per_fs_block )); then
+ bad_sector=$((bad_sector + kernel_sectors_per_device_lba))
+fi
+if (( (bad_sector % kernel_sectors_per_device_lba) != 0)); then
+ echo "bad_sector $bad_sector not congruent with device logical block size $logical_block_size"
+fi
+_dmerror_mark_range_bad $bad_sector $bad_len $errordev
+
+cat >> $seqres.full << ENDL
+dmerror after marking bad:
+$DMERROR_TABLE
+$DMERROR_RTTABLE
+<end table>
+ENDL
+
+_dmerror_load_error_table
+
+echo "Simulate media error"
+_scratch_invoke_xfs_healer "$tmp.healer"
+echo "verifymedia $verifymediadev -R $((bad_sector * 512)) $(((bad_sector + bad_len) * 512))" >> $seqres.full
+$XFS_IO_PROG -x -c "verifymedia $verifymediadev -R $((bad_sector * 512)) $(((bad_sector + bad_len) * 512))" $SCRATCH_MNT 2>&1 | filter_verify
+
+# Now mark the bad range good so that a retest shows no media failure.
+_dmerror_mark_range_good $bad_sector $bad_len $errordev
+_dmerror_load_error_table
+
+cat >> $seqres.full << ENDL
+dmerror after marking good:
+$DMERROR_TABLE
+$DMERROR_RTTABLE
+<end table>
+ENDL
+
+echo "No more media error"
+echo "verifymedia $verifymediadev -R $((bad_sector * 512)) $(((bad_sector + bad_len) * 512))" >> $seqres.full
+$XFS_IO_PROG -x -c "verifymedia $verifymediadev -R $((bad_sector * 512)) $(((bad_sector + bad_len) * 512))" $SCRATCH_MNT >> $seqres.full
+
+# Unmount filesystem to start fresh
+echo "Kill healer"
+_scratch_kill_xfs_healer _dmerror_unmount
+
+# filesystems without rmap do not translate media errors to lost file ranges
+# so fake the output
+_xfs_has_feature "$SCRATCH_DEV" rmapbt || \
+ echo "VICTIM pos 0 len 0: media failed" >> $tmp.healer
+
+cat $tmp.healer >> $seqres.full
+cat $tmp.healer | filter_healer
+
+# success, all done
+status=0
+exit
diff --git a/tests/xfs/1897.out b/tests/xfs/1897.out
new file mode 100755
index 00000000000000..1bb615c3119dce
--- /dev/null
+++ b/tests/xfs/1897.out
@@ -0,0 +1,7 @@
+QA output created by 1897
+Simulate media error
+DEVICE: verify error at offset XXX length XXX: Input/output error
+No more media error
+Kill healer
+SCRATCH_MNT DEVICE daddr NUM bbcount NUM: media error
+VICTIM pos NUM len NUM: media failed
^ permalink raw reply related [flat|nested] 45+ messages in thread* Re: [PATCH 08/14] xfs: test xfs_healer can report file media errors
2026-03-10 3:52 ` [PATCH 08/14] xfs: test xfs_healer can report file media errors Darrick J. Wong
@ 2026-03-13 19:36 ` Zorro Lang
0 siblings, 0 replies; 45+ messages in thread
From: Zorro Lang @ 2026-03-13 19:36 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: fstests, linux-xfs
On Mon, Mar 09, 2026 at 08:52:13PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
>
> Make sure that xfs_healer can actually report media errors as found by the
> kernel.
>
> Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> ---
Tests and looks good to me,
Reviewed-by: Zorro Lang <zlang@redhat.com>
> tests/xfs/1897 | 172 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> tests/xfs/1897.out | 7 ++
> 2 files changed, 179 insertions(+)
> create mode 100755 tests/xfs/1897
> create mode 100755 tests/xfs/1897.out
>
>
> diff --git a/tests/xfs/1897 b/tests/xfs/1897
> new file mode 100755
> index 00000000000000..4670c333a2d82c
> --- /dev/null
> +++ b/tests/xfs/1897
> @@ -0,0 +1,172 @@
> +#! /bin/bash
> +# SPDX-License-Identifier: GPL-2.0-or-later
> +# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
> +#
> +# FS QA Test No. 1897
> +#
> +# Check that xfs_healer can report media errors.
> +
> +. ./common/preamble
> +_begin_fstest auto quick scrub eio selfhealing
> +
> +_cleanup()
> +{
> + cd /
> + rm -f $tmp.*
> + _dmerror_cleanup
> +}
> +
> +. ./common/fuzzy
> +. ./common/filter
> +. ./common/dmerror
> +. ./common/systemd
> +
> +_require_scratch
> +_require_scrub
> +_require_dm_target error
> +_require_command "$XFS_HEALER_PROG" "xfs_healer"
> +_require_xfs_io_command verifymedia
> +
> +filter_healer() {
> + _filter_scratch | \
> + grep -E '(media failed|media error)' | \
> + sed \
> + -e 's/datadev/DEVICE/g' \
> + -e 's/rtdev/DEVICE/g' \
> + -e 's/ino [0-9]*/ino NUM/g' \
> + -e 's/gen 0x[0-9a-f]*/gen NUM/g' \
> + -e 's/pos [0-9]*/pos NUM/g' \
> + -e 's/len [0-9]*/len NUM/g' \
> + -e 's/0x[0-9a-f]*/NUM/g' \
> + -e 's|SCRATCH_MNT/a|VICTIM|g' \
> + -e 's|SCRATCH_MNT ino NUM gen NUM|VICTIM|g'
> +}
> +
> +filter_verify() {
> + sed -e 's/\([a-z]*dev\): verify error at offset \([0-9]*\) length \([0-9]*\)/DEVICE: verify error at offset XXX length XXX/g'
> +}
> +
> +_scratch_mkfs >> $seqres.full
> +
> +# The dm-error map added by this test doesn't work on zoned devices because
> +# table sizes need to be aligned to the zone size, and even for zoned on
> +# conventional this test will get confused because of the internal RT device.
> +#
> +# That check requires a mounted file system, so do a dummy mount before setting
> +# up DM.
> +_scratch_mount
> +_require_xfs_scratch_non_zoned
> +_require_xfs_healer $SCRATCH_MNT
> +_scratch_unmount
> +
> +_dmerror_init
> +_dmerror_mount
> +
> +# Write a file with 4 file blocks worth of data, figure out the LBA to target
> +victim=$SCRATCH_MNT/a
> +file_blksz=$(_get_file_block_size $SCRATCH_MNT)
> +$XFS_IO_PROG -f -c "pwrite -S 0x58 0 $((4 * file_blksz))" -c "fsync" $victim >> $seqres.full
> +unset errordev
> +verifymediadev="-d"
> +
> +awk_len_prog='{print $6}'
> +if _xfs_is_realtime_file $victim; then
> + if ! _xfs_has_feature $SCRATCH_MNT rtgroups; then
> + awk_len_prog='{print $4}'
> + fi
> + errordev="RT"
> + verifymediadev="-r"
> +fi
> +bmap_str="$($XFS_IO_PROG -c "bmap -elpv" $victim | grep "^[[:space:]]*0:")"
> +echo "$errordev:$bmap_str" >> $seqres.full
> +
> +phys="$(echo "$bmap_str" | $AWK_PROG '{print $3}')"
> +len="$(echo "$bmap_str" | $AWK_PROG "$awk_len_prog")"
> +
> +fs_blksz=$(_get_block_size $SCRATCH_MNT)
> +echo "file_blksz:$file_blksz:fs_blksz:$fs_blksz" >> $seqres.full
> +kernel_sectors_per_fs_block=$((fs_blksz / 512))
> +
> +# Did we get at least 4 fs blocks worth of extent?
> +min_len_sectors=$(( 4 * kernel_sectors_per_fs_block ))
> +test "$len" -lt $min_len_sectors && \
> + _fail "could not format a long enough extent on an empty fs??"
> +
> +phys_start=$(echo "$phys" | sed -e 's/\.\..*//g')
> +
> +echo "$errordev:$phys:$len:$fs_blksz:$phys_start" >> $seqres.full
> +echo "victim file:" >> $seqres.full
> +od -tx1 -Ad -c $victim >> $seqres.full
> +
> +# Set the dmerror table so that all IO will pass through.
> +_dmerror_reset_table
> +
> +cat >> $seqres.full << ENDL
> +dmerror before:
> +$DMERROR_TABLE
> +$DMERROR_RTTABLE
> +<end table>
> +ENDL
> +
> +# All sector numbers that we feed to the kernel must be in units of 512b, but
> +# they also must be aligned to the device's logical block size.
> +logical_block_size=`$here/src/min_dio_alignment $SCRATCH_MNT $SCRATCH_DEV`
> +kernel_sectors_per_device_lba=$((logical_block_size / 512))
> +
> +# Pretend that one of the device LBAs in the middle of the extent is bad. Target
> +# the second LBA of the third block of the four-block file extent that we
> +# allocated earlier, but without overflowing into the fourth file block.
> +bad_sector=$(( phys_start + (2 * kernel_sectors_per_fs_block) ))
> +bad_len=$kernel_sectors_per_device_lba
> +if (( kernel_sectors_per_device_lba < kernel_sectors_per_fs_block )); then
> + bad_sector=$((bad_sector + kernel_sectors_per_device_lba))
> +fi
> +if (( (bad_sector % kernel_sectors_per_device_lba) != 0)); then
> + echo "bad_sector $bad_sector not congruent with device logical block size $logical_block_size"
> +fi
> +_dmerror_mark_range_bad $bad_sector $bad_len $errordev
> +
> +cat >> $seqres.full << ENDL
> +dmerror after marking bad:
> +$DMERROR_TABLE
> +$DMERROR_RTTABLE
> +<end table>
> +ENDL
> +
> +_dmerror_load_error_table
> +
> +echo "Simulate media error"
> +_scratch_invoke_xfs_healer "$tmp.healer"
> +echo "verifymedia $verifymediadev -R $((bad_sector * 512)) $(((bad_sector + bad_len) * 512))" >> $seqres.full
> +$XFS_IO_PROG -x -c "verifymedia $verifymediadev -R $((bad_sector * 512)) $(((bad_sector + bad_len) * 512))" $SCRATCH_MNT 2>&1 | filter_verify
> +
> +# Now mark the bad range good so that a retest shows no media failure.
> +_dmerror_mark_range_good $bad_sector $bad_len $errordev
> +_dmerror_load_error_table
> +
> +cat >> $seqres.full << ENDL
> +dmerror after marking good:
> +$DMERROR_TABLE
> +$DMERROR_RTTABLE
> +<end table>
> +ENDL
> +
> +echo "No more media error"
> +echo "verifymedia $verifymediadev -R $((bad_sector * 512)) $(((bad_sector + bad_len) * 512))" >> $seqres.full
> +$XFS_IO_PROG -x -c "verifymedia $verifymediadev -R $((bad_sector * 512)) $(((bad_sector + bad_len) * 512))" $SCRATCH_MNT >> $seqres.full
> +
> +# Unmount filesystem to start fresh
> +echo "Kill healer"
> +_scratch_kill_xfs_healer _dmerror_unmount
> +
> +# filesystems without rmap do not translate media errors to lost file ranges
> +# so fake the output
> +_xfs_has_feature "$SCRATCH_DEV" rmapbt || \
> + echo "VICTIM pos 0 len 0: media failed" >> $tmp.healer
> +
> +cat $tmp.healer >> $seqres.full
> +cat $tmp.healer | filter_healer
> +
> +# success, all done
> +status=0
> +exit
> diff --git a/tests/xfs/1897.out b/tests/xfs/1897.out
> new file mode 100755
> index 00000000000000..1bb615c3119dce
> --- /dev/null
> +++ b/tests/xfs/1897.out
> @@ -0,0 +1,7 @@
> +QA output created by 1897
> +Simulate media error
> +DEVICE: verify error at offset XXX length XXX: Input/output error
> +No more media error
> +Kill healer
> +SCRATCH_MNT DEVICE daddr NUM bbcount NUM: media error
> +VICTIM pos NUM len NUM: media failed
>
^ permalink raw reply [flat|nested] 45+ messages in thread
* [PATCH 09/14] xfs: test xfs_healer can report filesystem shutdowns
2026-03-10 3:42 ` [PATCHSET v9 2/2] fstests: autonomous self healing of filesystems Darrick J. Wong
` (7 preceding siblings ...)
2026-03-10 3:52 ` [PATCH 08/14] xfs: test xfs_healer can report file media errors Darrick J. Wong
@ 2026-03-10 3:52 ` Darrick J. Wong
2026-03-13 19:45 ` Zorro Lang
2026-03-10 3:52 ` [PATCH 10/14] xfs: test xfs_healer can initiate full filesystem repairs Darrick J. Wong
` (5 subsequent siblings)
14 siblings, 1 reply; 45+ messages in thread
From: Darrick J. Wong @ 2026-03-10 3:52 UTC (permalink / raw)
To: zlang, djwong; +Cc: fstests, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Make sure that xfs_healer can actually report abnormal filesystem shutdowns.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
tests/xfs/1898 | 37 +++++++++++++++++++++++++++++++++++++
tests/xfs/1898.out | 4 ++++
2 files changed, 41 insertions(+)
create mode 100755 tests/xfs/1898
create mode 100755 tests/xfs/1898.out
diff --git a/tests/xfs/1898 b/tests/xfs/1898
new file mode 100755
index 00000000000000..2b6c72093e7021
--- /dev/null
+++ b/tests/xfs/1898
@@ -0,0 +1,37 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
+#
+# FS QA Test No. 1898
+#
+# Check that xfs_healer can report filesystem shutdowns.
+
+. ./common/preamble
+_begin_fstest auto quick scrub eio selfhealing
+
+. ./common/fuzzy
+. ./common/filter
+. ./common/systemd
+
+_require_scratch_nocheck
+_require_scrub
+_require_command "$XFS_HEALER_PROG" "xfs_healer"
+
+_scratch_mkfs >> $seqres.full
+_scratch_mount
+_require_xfs_healer $SCRATCH_MNT
+$XFS_IO_PROG -f -c "pwrite -S 0x58 0 500k" -c "fsync" $victim >> $seqres.full
+
+echo "Start healer and shut down"
+_scratch_invoke_xfs_healer "$tmp.healer"
+_scratch_shutdown -f
+
+# Unmount filesystem to start fresh
+echo "Kill healer"
+_scratch_kill_xfs_healer
+cat $tmp.healer >> $seqres.full
+cat $tmp.healer | _filter_scratch | grep 'shut down'
+
+# success, all done
+status=0
+exit
diff --git a/tests/xfs/1898.out b/tests/xfs/1898.out
new file mode 100755
index 00000000000000..f71f848da810ce
--- /dev/null
+++ b/tests/xfs/1898.out
@@ -0,0 +1,4 @@
+QA output created by 1898
+Start healer and shut down
+Kill healer
+SCRATCH_MNT: filesystem shut down due to forced unmount
^ permalink raw reply related [flat|nested] 45+ messages in thread* Re: [PATCH 09/14] xfs: test xfs_healer can report filesystem shutdowns
2026-03-10 3:52 ` [PATCH 09/14] xfs: test xfs_healer can report filesystem shutdowns Darrick J. Wong
@ 2026-03-13 19:45 ` Zorro Lang
0 siblings, 0 replies; 45+ messages in thread
From: Zorro Lang @ 2026-03-13 19:45 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: fstests, linux-xfs
On Mon, Mar 09, 2026 at 08:52:28PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
>
> Make sure that xfs_healer can actually report abnormal filesystem shutdowns.
>
> Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> ---
Tests and looks good to me,
Reviewed-by: Zorro Lang <zlang@redhat.com>
> tests/xfs/1898 | 37 +++++++++++++++++++++++++++++++++++++
> tests/xfs/1898.out | 4 ++++
> 2 files changed, 41 insertions(+)
> create mode 100755 tests/xfs/1898
> create mode 100755 tests/xfs/1898.out
>
>
> diff --git a/tests/xfs/1898 b/tests/xfs/1898
> new file mode 100755
> index 00000000000000..2b6c72093e7021
> --- /dev/null
> +++ b/tests/xfs/1898
> @@ -0,0 +1,37 @@
> +#! /bin/bash
> +# SPDX-License-Identifier: GPL-2.0-or-later
> +# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
> +#
> +# FS QA Test No. 1898
> +#
> +# Check that xfs_healer can report filesystem shutdowns.
> +
> +. ./common/preamble
> +_begin_fstest auto quick scrub eio selfhealing
> +
> +. ./common/fuzzy
> +. ./common/filter
> +. ./common/systemd
> +
> +_require_scratch_nocheck
> +_require_scrub
> +_require_command "$XFS_HEALER_PROG" "xfs_healer"
> +
> +_scratch_mkfs >> $seqres.full
> +_scratch_mount
> +_require_xfs_healer $SCRATCH_MNT
> +$XFS_IO_PROG -f -c "pwrite -S 0x58 0 500k" -c "fsync" $victim >> $seqres.full
> +
> +echo "Start healer and shut down"
> +_scratch_invoke_xfs_healer "$tmp.healer"
> +_scratch_shutdown -f
> +
> +# Unmount filesystem to start fresh
> +echo "Kill healer"
> +_scratch_kill_xfs_healer
> +cat $tmp.healer >> $seqres.full
> +cat $tmp.healer | _filter_scratch | grep 'shut down'
> +
> +# success, all done
> +status=0
> +exit
> diff --git a/tests/xfs/1898.out b/tests/xfs/1898.out
> new file mode 100755
> index 00000000000000..f71f848da810ce
> --- /dev/null
> +++ b/tests/xfs/1898.out
> @@ -0,0 +1,4 @@
> +QA output created by 1898
> +Start healer and shut down
> +Kill healer
> +SCRATCH_MNT: filesystem shut down due to forced unmount
>
^ permalink raw reply [flat|nested] 45+ messages in thread
* [PATCH 10/14] xfs: test xfs_healer can initiate full filesystem repairs
2026-03-10 3:42 ` [PATCHSET v9 2/2] fstests: autonomous self healing of filesystems Darrick J. Wong
` (8 preceding siblings ...)
2026-03-10 3:52 ` [PATCH 09/14] xfs: test xfs_healer can report filesystem shutdowns Darrick J. Wong
@ 2026-03-10 3:52 ` Darrick J. Wong
2026-03-13 19:48 ` Zorro Lang
2026-03-10 3:52 ` [PATCH 11/14] xfs: test xfs_healer can follow mount moves Darrick J. Wong
` (4 subsequent siblings)
14 siblings, 1 reply; 45+ messages in thread
From: Darrick J. Wong @ 2026-03-10 3:52 UTC (permalink / raw)
To: zlang, djwong; +Cc: fstests, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Make sure that when xfs_healer can't perform a spot repair, it will actually
start up xfs_scrub to perform a full scan and repair.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
tests/xfs/1899 | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++++
tests/xfs/1899.out | 3 +
2 files changed, 111 insertions(+)
create mode 100755 tests/xfs/1899
create mode 100644 tests/xfs/1899.out
diff --git a/tests/xfs/1899 b/tests/xfs/1899
new file mode 100755
index 00000000000000..5d35ca8265645f
--- /dev/null
+++ b/tests/xfs/1899
@@ -0,0 +1,108 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
+#
+# FS QA Test 1899
+#
> +# Ensure that autonomous self healing fixes the filesystem correctly
+# even if the spot repair doesn't work and it falls back to a full fsck.
+#
+. ./common/preamble
+_begin_fstest auto selfhealing
+
+. ./common/filter
+. ./common/fuzzy
+. ./common/systemd
+
+_require_scrub
+_require_xfs_io_command "repair" # online repair support
+_require_xfs_db_command "blocktrash"
+_require_command "$XFS_HEALER_PROG" "xfs_healer"
+_require_command "$XFS_PROPERTY_PROG" "xfs_property"
+_require_scratch
+_require_systemd_unit_defined "xfs_scrub@.service"
+
+_scratch_mkfs >> $seqres.full
+_scratch_mount
+
+_xfs_has_feature $SCRATCH_MNT rmapbt || \
+ _notrun "reverse mapping required to test directory auto-repair"
+_xfs_has_feature $SCRATCH_MNT parent || \
+ _notrun "parent pointers required to test directory auto-repair"
+_require_xfs_healer $SCRATCH_MNT --repair
+
+filter_healer() {
+ _filter_scratch | \
+ grep 'Full repairs in progress' | \
+ uniq
+}
+
+# Configure the filesystem for automatic repair of the filesystem.
+$XFS_PROPERTY_PROG $SCRATCH_MNT set autofsck=repair >> $seqres.full
+
+# Create a largeish directory
+dblksz=$(_xfs_get_dir_blocksize "$SCRATCH_MNT")
+echo testdata > $SCRATCH_MNT/a
+mkdir -p "$SCRATCH_MNT/some/victimdir"
+for ((i = 0; i < (dblksz / 255); i++)); do
+ fname="$(printf "%0255d" "$i")"
+ ln $SCRATCH_MNT/a $SCRATCH_MNT/some/victimdir/$fname
+done
+
+# Did we get at least two dir blocks?
+dirsize=$(stat -c '%s' $SCRATCH_MNT/some/victimdir)
+test "$dirsize" -gt "$dblksz" || echo "failed to create two-block directory"
+
+# Break the directory, remount filesystem
+_scratch_unmount
+_scratch_xfs_db -x \
+ -c 'path /some/victimdir' \
+ -c 'bmap' \
+ -c 'dblock 1' \
+ -c 'blocktrash -z -0 -o 0 -x 2048 -y 2048 -n 2048' \
+ -c 'path /a' \
+ -c 'bmap -a' \
+ -c 'ablock 1' \
+ -c 'blocktrash -z -0 -o 0 -x 2048 -y 2048 -n 2048' \
+ >> $seqres.full
+_scratch_mount
+
+_scratch_invoke_xfs_healer "$tmp.healer" --repair
+
+# Access the broken directory to trigger a repair, then poll the directory
+# for 5 seconds to see if it gets fixed without us needing to intervene.
+ls $SCRATCH_MNT/some/victimdir > /dev/null 2> $tmp.err
+_filter_scratch < $tmp.err
+try=0
+while [ $try -lt 50 ] && grep -q 'Structure needs cleaning' $tmp.err; do
+ echo "try $try saw corruption" >> $seqres.full
+ sleep 0.1
+ ls $SCRATCH_MNT/some/victimdir > /dev/null 2> $tmp.err
+ try=$((try + 1))
+done
+echo "try $try no longer saw corruption or gave up" >> $seqres.full
+_filter_scratch < $tmp.err
+
+# Wait for the background fixer to finish
+svc="$(_xfs_scrub_svcname "$SCRATCH_MNT")"
+_systemd_unit_wait "$svc"
+
> +# List the dirents of /some/victimdir and parent pointers of /a to see if
> +# they both stop reporting corruption
+(ls $SCRATCH_MNT/some/victimdir ; $XFS_IO_PROG -c 'parent') > /dev/null 2> $tmp.err
+try=0
+while [ $try -lt 50 ] && grep -q 'Structure needs cleaning' $tmp.err; do
+ echo "retry $try still saw corruption" >> $seqres.full
+ sleep 0.1
+ (ls $SCRATCH_MNT/some/victimdir ; $XFS_IO_PROG -c 'parent') > /dev/null 2> $tmp.err
+ try=$((try + 1))
+done
+echo "retry $try no longer saw corruption or gave up" >> $seqres.full
+
+# Unmount to kill the healer
+_scratch_kill_xfs_healer
+cat $tmp.healer >> $seqres.full
+cat $tmp.healer | filter_healer
+
+status=0
+exit
diff --git a/tests/xfs/1899.out b/tests/xfs/1899.out
new file mode 100644
index 00000000000000..5345fd400f3627
--- /dev/null
+++ b/tests/xfs/1899.out
@@ -0,0 +1,3 @@
+QA output created by 1899
+ls: reading directory 'SCRATCH_MNT/some/victimdir': Structure needs cleaning
+SCRATCH_MNT: Full repairs in progress.
^ permalink raw reply related [flat|nested] 45+ messages in thread* Re: [PATCH 10/14] xfs: test xfs_healer can initiate full filesystem repairs
2026-03-10 3:52 ` [PATCH 10/14] xfs: test xfs_healer can initiate full filesystem repairs Darrick J. Wong
@ 2026-03-13 19:48 ` Zorro Lang
0 siblings, 0 replies; 45+ messages in thread
From: Zorro Lang @ 2026-03-13 19:48 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: fstests, linux-xfs
On Mon, Mar 09, 2026 at 08:52:44PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
>
> Make sure that when xfs_healer can't perform a spot repair, it will actually
> start up xfs_scrub to perform a full scan and repair.
>
> Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> ---
Tests and looks good to me,
Reviewed-by: Zorro Lang <zlang@redhat.com>
> tests/xfs/1899 | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> tests/xfs/1899.out | 3 +
> 2 files changed, 111 insertions(+)
> create mode 100755 tests/xfs/1899
> create mode 100644 tests/xfs/1899.out
>
>
> diff --git a/tests/xfs/1899 b/tests/xfs/1899
> new file mode 100755
> index 00000000000000..5d35ca8265645f
> --- /dev/null
> +++ b/tests/xfs/1899
> @@ -0,0 +1,108 @@
> +#! /bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
> +#
> +# FS QA Test 1899
> +#
> > +# Ensure that autonomous self healing fixes the filesystem correctly
> +# even if the spot repair doesn't work and it falls back to a full fsck.
> +#
> +. ./common/preamble
> +_begin_fstest auto selfhealing
> +
> +. ./common/filter
> +. ./common/fuzzy
> +. ./common/systemd
> +
> +_require_scrub
> +_require_xfs_io_command "repair" # online repair support
> +_require_xfs_db_command "blocktrash"
> +_require_command "$XFS_HEALER_PROG" "xfs_healer"
> +_require_command "$XFS_PROPERTY_PROG" "xfs_property"
> +_require_scratch
> +_require_systemd_unit_defined "xfs_scrub@.service"
> +
> +_scratch_mkfs >> $seqres.full
> +_scratch_mount
> +
> +_xfs_has_feature $SCRATCH_MNT rmapbt || \
> + _notrun "reverse mapping required to test directory auto-repair"
> +_xfs_has_feature $SCRATCH_MNT parent || \
> + _notrun "parent pointers required to test directory auto-repair"
> +_require_xfs_healer $SCRATCH_MNT --repair
> +
> +filter_healer() {
> + _filter_scratch | \
> + grep 'Full repairs in progress' | \
> + uniq
> +}
> +
> +# Configure the filesystem for automatic repair of the filesystem.
> +$XFS_PROPERTY_PROG $SCRATCH_MNT set autofsck=repair >> $seqres.full
> +
> +# Create a largeish directory
> +dblksz=$(_xfs_get_dir_blocksize "$SCRATCH_MNT")
> +echo testdata > $SCRATCH_MNT/a
> +mkdir -p "$SCRATCH_MNT/some/victimdir"
> +for ((i = 0; i < (dblksz / 255); i++)); do
> + fname="$(printf "%0255d" "$i")"
> + ln $SCRATCH_MNT/a $SCRATCH_MNT/some/victimdir/$fname
> +done
> +
> +# Did we get at least two dir blocks?
> +dirsize=$(stat -c '%s' $SCRATCH_MNT/some/victimdir)
> +test "$dirsize" -gt "$dblksz" || echo "failed to create two-block directory"
> +
> +# Break the directory, remount filesystem
> +_scratch_unmount
> +_scratch_xfs_db -x \
> + -c 'path /some/victimdir' \
> + -c 'bmap' \
> + -c 'dblock 1' \
> + -c 'blocktrash -z -0 -o 0 -x 2048 -y 2048 -n 2048' \
> + -c 'path /a' \
> + -c 'bmap -a' \
> + -c 'ablock 1' \
> + -c 'blocktrash -z -0 -o 0 -x 2048 -y 2048 -n 2048' \
> + >> $seqres.full
> +_scratch_mount
> +
> +_scratch_invoke_xfs_healer "$tmp.healer" --repair
> +
> +# Access the broken directory to trigger a repair, then poll the directory
> +# for 5 seconds to see if it gets fixed without us needing to intervene.
> +ls $SCRATCH_MNT/some/victimdir > /dev/null 2> $tmp.err
> +_filter_scratch < $tmp.err
> +try=0
> +while [ $try -lt 50 ] && grep -q 'Structure needs cleaning' $tmp.err; do
> + echo "try $try saw corruption" >> $seqres.full
> + sleep 0.1
> + ls $SCRATCH_MNT/some/victimdir > /dev/null 2> $tmp.err
> + try=$((try + 1))
> +done
> +echo "try $try no longer saw corruption or gave up" >> $seqres.full
> +_filter_scratch < $tmp.err
> +
> +# Wait for the background fixer to finish
> +svc="$(_xfs_scrub_svcname "$SCRATCH_MNT")"
> +_systemd_unit_wait "$svc"
> +
> +# List the dirents of /victimdir and parent pointers of /a to see if they both
> +# stop reporting corruption
> +(ls $SCRATCH_MNT/some/victimdir ; $XFS_IO_PROG -c 'parent') > /dev/null 2> $tmp.err
> +try=0
> +while [ $try -lt 50 ] && grep -q 'Structure needs cleaning' $tmp.err; do
> + echo "retry $try still saw corruption" >> $seqres.full
> + sleep 0.1
> + (ls $SCRATCH_MNT/some/victimdir ; $XFS_IO_PROG -c 'parent') > /dev/null 2> $tmp.err
> + try=$((try + 1))
> +done
> +echo "retry $try no longer saw corruption or gave up" >> $seqres.full
> +
> +# Unmount to kill the healer
> +_scratch_kill_xfs_healer
> +cat $tmp.healer >> $seqres.full
> +cat $tmp.healer | filter_healer
> +
> +status=0
> +exit
> diff --git a/tests/xfs/1899.out b/tests/xfs/1899.out
> new file mode 100644
> index 00000000000000..5345fd400f3627
> --- /dev/null
> +++ b/tests/xfs/1899.out
> @@ -0,0 +1,3 @@
> +QA output created by 1899
> +ls: reading directory 'SCRATCH_MNT/some/victimdir': Structure needs cleaning
> +SCRATCH_MNT: Full repairs in progress.
>
^ permalink raw reply [flat|nested] 45+ messages in thread
* [PATCH 11/14] xfs: test xfs_healer can follow mount moves
2026-03-10 3:42 ` [PATCHSET v9 2/2] fstests: autonomous self healing of filesystems Darrick J. Wong
` (9 preceding siblings ...)
2026-03-10 3:52 ` [PATCH 10/14] xfs: test xfs_healer can initiate full filesystem repairs Darrick J. Wong
@ 2026-03-10 3:52 ` Darrick J. Wong
2026-03-13 19:39 ` Zorro Lang
2026-03-10 3:53 ` [PATCH 12/14] xfs: test xfs_healer wont repair the wrong filesystem Darrick J. Wong
` (3 subsequent siblings)
14 siblings, 1 reply; 45+ messages in thread
From: Darrick J. Wong @ 2026-03-10 3:52 UTC (permalink / raw)
To: zlang, djwong; +Cc: fstests, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Make sure that when xfs_healer needs to reopen a filesystem to repair it,
it can still find the filesystem even if it has been mount --move'd.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
tests/xfs/1900 | 115 ++++++++++++++++++++++++++++++++++++++++++++++++++++
tests/xfs/1900.out | 2 +
2 files changed, 117 insertions(+)
create mode 100755 tests/xfs/1900
create mode 100755 tests/xfs/1900.out
diff --git a/tests/xfs/1900 b/tests/xfs/1900
new file mode 100755
index 00000000000000..9a8f9fabd124ad
--- /dev/null
+++ b/tests/xfs/1900
@@ -0,0 +1,115 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
+#
+# FS QA Test 1900
+#
+# Ensure that autonomous self healing fixes the filesystem correctly even if
+# the original mount has moved somewhere else.
+#
+. ./common/preamble
+_begin_fstest auto selfhealing
+
+. ./common/filter
+. ./common/fuzzy
+. ./common/systemd
+
+_cleanup()
+{
+ command -v _kill_fsstress &>/dev/null && _kill_fsstress
+ cd /
+ rm -r -f $tmp.*
+ if [ -n "$new_dir" ]; then
+ _unmount "$new_dir" &>/dev/null
+ rm -rf "$new_dir"
+ fi
+}
+
+_require_test
+_require_scrub
+_require_xfs_io_command "repair" # online repair support
+_require_xfs_db_command "blocktrash"
+_require_command "$XFS_HEALER_PROG" "xfs_healer"
+_require_command "$XFS_PROPERTY_PROG" "xfs_property"
+_require_scratch
+
+_scratch_mkfs >> $seqres.full
+_scratch_mount
+
+_xfs_has_feature $SCRATCH_MNT rmapbt || \
+ _notrun "reverse mapping required to test directory auto-repair"
+_xfs_has_feature $SCRATCH_MNT parent || \
+ _notrun "parent pointers required to test directory auto-repair"
+_require_xfs_healer $SCRATCH_MNT --repair
+
+# Configure the filesystem for automatic repair of the filesystem.
+$XFS_PROPERTY_PROG $SCRATCH_MNT set autofsck=repair >> $seqres.full
+
+# Create a largeish directory
+dblksz=$(_xfs_get_dir_blocksize "$SCRATCH_MNT")
+echo testdata > $SCRATCH_MNT/a
+mkdir -p "$SCRATCH_MNT/some/victimdir"
+for ((i = 0; i < (dblksz / 255); i++)); do
+ fname="$(printf "%0255d" "$i")"
+ ln $SCRATCH_MNT/a $SCRATCH_MNT/some/victimdir/$fname
+done
+
+# Did we get at least two dir blocks?
+dirsize=$(stat -c '%s' $SCRATCH_MNT/some/victimdir)
+test "$dirsize" -gt "$dblksz" || echo "failed to create two-block directory"
+
+# Break the directory, remount filesystem
+_scratch_unmount
+_scratch_xfs_db -x \
+ -c 'path /some/victimdir' \
+ -c 'bmap' \
+ -c 'dblock 1' \
+ -c 'blocktrash -z -0 -o 0 -x 2048 -y 2048 -n 2048' >> $seqres.full
+_scratch_mount
+
+_scratch_invoke_xfs_healer "$tmp.healer" --repair
+
+# Move the scratch filesystem to a completely different mountpoint so that
+# we can test if the healer can find it again.
+new_dir=$TEST_DIR/moocow
+mkdir -p $new_dir
+_mount --bind $SCRATCH_MNT $new_dir
+_unmount $SCRATCH_MNT
+
+df -t xfs >> $seqres.full
+
+# Access the broken directory to trigger a repair, then poll the directory
+# for 5 seconds to see if it gets fixed without us needing to intervene.
+ls $new_dir/some/victimdir > /dev/null 2> $tmp.err
+_filter_scratch < $tmp.err | _filter_test_dir
+try=0
+while [ $try -lt 50 ] && grep -q 'Structure needs cleaning' $tmp.err; do
+ echo "try $try saw corruption" >> $seqres.full
+ sleep 0.1
+ ls $new_dir/some/victimdir > /dev/null 2> $tmp.err
+ try=$((try + 1))
+done
+echo "try $try no longer saw corruption or gave up" >> $seqres.full
+_filter_scratch < $tmp.err | _filter_test_dir
+
> +# List the dirents of /some/victimdir to see if it stops reporting corruption
+ls $new_dir/some/victimdir > /dev/null 2> $tmp.err
+try=0
+while [ $try -lt 50 ] && grep -q 'Structure needs cleaning' $tmp.err; do
+ echo "retry $try still saw corruption" >> $seqres.full
+ sleep 0.1
+ ls $SCRATCH_MNT/some/victimdir > /dev/null 2> $tmp.err
+ try=$((try + 1))
+done
+echo "retry $try no longer saw corruption or gave up" >> $seqres.full
+
+new_dir_unmount() {
+ _unmount $new_dir
+}
+
+# Unmount to kill the healer
+_scratch_kill_xfs_healer new_dir_unmount
+cat $tmp.healer >> $seqres.full
+
+status=0
+exit
diff --git a/tests/xfs/1900.out b/tests/xfs/1900.out
new file mode 100755
index 00000000000000..604c9eb5eb10f4
--- /dev/null
+++ b/tests/xfs/1900.out
@@ -0,0 +1,2 @@
+QA output created by 1900
+ls: reading directory 'TEST_DIR/moocow/some/victimdir': Structure needs cleaning
^ permalink raw reply related [flat|nested] 45+ messages in thread* Re: [PATCH 11/14] xfs: test xfs_healer can follow mount moves
2026-03-10 3:52 ` [PATCH 11/14] xfs: test xfs_healer can follow mount moves Darrick J. Wong
@ 2026-03-13 19:39 ` Zorro Lang
0 siblings, 0 replies; 45+ messages in thread
From: Zorro Lang @ 2026-03-13 19:39 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: fstests, linux-xfs
On Mon, Mar 09, 2026 at 08:52:59PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
>
> Make sure that when xfs_healer needs to reopen a filesystem to repair it,
> it can still find the filesystem even if it has been mount --move'd.
>
> Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> ---
Tests and looks good to me,
Reviewed-by: Zorro Lang <zlang@redhat.com>
> tests/xfs/1900 | 115 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> tests/xfs/1900.out | 2 +
> 2 files changed, 117 insertions(+)
> create mode 100755 tests/xfs/1900
> create mode 100755 tests/xfs/1900.out
>
>
> diff --git a/tests/xfs/1900 b/tests/xfs/1900
> new file mode 100755
> index 00000000000000..9a8f9fabd124ad
> --- /dev/null
> +++ b/tests/xfs/1900
> @@ -0,0 +1,115 @@
> +#! /bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
> +#
> +# FS QA Test 1900
> +#
> +# Ensure that autonomous self healing fixes the filesystem correctly even if
> +# the original mount has moved somewhere else.
> +#
> +. ./common/preamble
> +_begin_fstest auto selfhealing
> +
> +. ./common/filter
> +. ./common/fuzzy
> +. ./common/systemd
> +
> +_cleanup()
> +{
> + command -v _kill_fsstress &>/dev/null && _kill_fsstress
> + cd /
> + rm -r -f $tmp.*
> + if [ -n "$new_dir" ]; then
> + _unmount "$new_dir" &>/dev/null
> + rm -rf "$new_dir"
> + fi
> +}
> +
> +_require_test
> +_require_scrub
> +_require_xfs_io_command "repair" # online repair support
> +_require_xfs_db_command "blocktrash"
> +_require_command "$XFS_HEALER_PROG" "xfs_healer"
> +_require_command "$XFS_PROPERTY_PROG" "xfs_property"
> +_require_scratch
> +
> +_scratch_mkfs >> $seqres.full
> +_scratch_mount
> +
> +_xfs_has_feature $SCRATCH_MNT rmapbt || \
> + _notrun "reverse mapping required to test directory auto-repair"
> +_xfs_has_feature $SCRATCH_MNT parent || \
> + _notrun "parent pointers required to test directory auto-repair"
> +_require_xfs_healer $SCRATCH_MNT --repair
> +
> +# Configure the filesystem for automatic repair of the filesystem.
> +$XFS_PROPERTY_PROG $SCRATCH_MNT set autofsck=repair >> $seqres.full
> +
> +# Create a largeish directory
> +dblksz=$(_xfs_get_dir_blocksize "$SCRATCH_MNT")
> +echo testdata > $SCRATCH_MNT/a
> +mkdir -p "$SCRATCH_MNT/some/victimdir"
> +for ((i = 0; i < (dblksz / 255); i++)); do
> + fname="$(printf "%0255d" "$i")"
> + ln $SCRATCH_MNT/a $SCRATCH_MNT/some/victimdir/$fname
> +done
> +
> +# Did we get at least two dir blocks?
> +dirsize=$(stat -c '%s' $SCRATCH_MNT/some/victimdir)
> +test "$dirsize" -gt "$dblksz" || echo "failed to create two-block directory"
> +
> +# Break the directory, remount filesystem
> +_scratch_unmount
> +_scratch_xfs_db -x \
> + -c 'path /some/victimdir' \
> + -c 'bmap' \
> + -c 'dblock 1' \
> + -c 'blocktrash -z -0 -o 0 -x 2048 -y 2048 -n 2048' >> $seqres.full
> +_scratch_mount
> +
> +_scratch_invoke_xfs_healer "$tmp.healer" --repair
> +
> +# Move the scratch filesystem to a completely different mountpoint so that
> +# we can test if the healer can find it again.
> +new_dir=$TEST_DIR/moocow
> +mkdir -p $new_dir
> +_mount --bind $SCRATCH_MNT $new_dir
> +_unmount $SCRATCH_MNT
> +
> +df -t xfs >> $seqres.full
> +
> +# Access the broken directory to trigger a repair, then poll the directory
> +# for 5 seconds to see if it gets fixed without us needing to intervene.
> +ls $new_dir/some/victimdir > /dev/null 2> $tmp.err
> +_filter_scratch < $tmp.err | _filter_test_dir
> +try=0
> +while [ $try -lt 50 ] && grep -q 'Structure needs cleaning' $tmp.err; do
> + echo "try $try saw corruption" >> $seqres.full
> + sleep 0.1
> + ls $new_dir/some/victimdir > /dev/null 2> $tmp.err
> + try=$((try + 1))
> +done
> +echo "try $try no longer saw corruption or gave up" >> $seqres.full
> +_filter_scratch < $tmp.err | _filter_test_dir
> +
> +# List the dirents of /victimdir to see if it stops reporting corruption
> +ls $new_dir/some/victimdir > /dev/null 2> $tmp.err
> +try=0
> +while [ $try -lt 50 ] && grep -q 'Structure needs cleaning' $tmp.err; do
> + echo "retry $try still saw corruption" >> $seqres.full
> + sleep 0.1
> + ls $SCRATCH_MNT/some/victimdir > /dev/null 2> $tmp.err
> + try=$((try + 1))
> +done
> +echo "retry $try no longer saw corruption or gave up" >> $seqres.full
> +
> +new_dir_unmount() {
> + _unmount $new_dir
> +}
> +
> +# Unmount to kill the healer
> +_scratch_kill_xfs_healer new_dir_unmount
> +cat $tmp.healer >> $seqres.full
> +
> +status=0
> +exit
> diff --git a/tests/xfs/1900.out b/tests/xfs/1900.out
> new file mode 100755
> index 00000000000000..604c9eb5eb10f4
> --- /dev/null
> +++ b/tests/xfs/1900.out
> @@ -0,0 +1,2 @@
> +QA output created by 1900
> +ls: reading directory 'TEST_DIR/moocow/some/victimdir': Structure needs cleaning
>
^ permalink raw reply [flat|nested] 45+ messages in thread
* [PATCH 12/14] xfs: test xfs_healer wont repair the wrong filesystem
2026-03-10 3:42 ` [PATCHSET v9 2/2] fstests: autonomous self healing of filesystems Darrick J. Wong
` (10 preceding siblings ...)
2026-03-10 3:52 ` [PATCH 11/14] xfs: test xfs_healer can follow mount moves Darrick J. Wong
@ 2026-03-10 3:53 ` Darrick J. Wong
2026-03-13 19:53 ` Zorro Lang
2026-03-10 3:53 ` [PATCH 13/14] xfs: test xfs_healer background service Darrick J. Wong
` (2 subsequent siblings)
14 siblings, 1 reply; 45+ messages in thread
From: Darrick J. Wong @ 2026-03-10 3:53 UTC (permalink / raw)
To: zlang, djwong; +Cc: fstests, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Make sure that when xfs_healer needs to reopen a filesystem to repair it, it
won't latch on to another xfs filesystem that has been mounted atop the same
mountpoint.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
tests/xfs/1901 | 137 ++++++++++++++++++++++++++++++++++++++++++++++++++++
tests/xfs/1901.out | 2 +
2 files changed, 139 insertions(+)
create mode 100755 tests/xfs/1901
create mode 100755 tests/xfs/1901.out
diff --git a/tests/xfs/1901 b/tests/xfs/1901
new file mode 100755
index 00000000000000..c92dcf9a3b3d48
--- /dev/null
+++ b/tests/xfs/1901
@@ -0,0 +1,137 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2025-2026 Oracle. All Rights Reserved.
+#
+# FS QA Test 1901
+#
+# Ensure that autonomous self healing won't fix the wrong filesystem if a
+# snapshot of the original filesystem is now mounted on the same directory as
+# the original.
+#
+. ./common/preamble
+_begin_fstest auto selfhealing
+
+. ./common/filter
+. ./common/fuzzy
+. ./common/systemd
+
+_cleanup()
+{
+ command -v _kill_fsstress &>/dev/null && _kill_fsstress
+ cd /
+ rm -r -f $tmp.*
+ test -e "$mntpt" && _unmount "$mntpt" &>/dev/null
+ test -e "$mntpt" && _unmount "$mntpt" &>/dev/null
+ test -e "$loop1" && _destroy_loop_device "$loop1"
+ test -e "$loop2" && _destroy_loop_device "$loop2"
+ test -e "$testdir" && rm -r -f "$testdir"
+}
+
+_require_test
+_require_scrub
+_require_xfs_io_command "repair" # online repair support
+_require_xfs_db_command "blocktrash"
+_require_command "$XFS_HEALER_PROG" "xfs_healer"
+_require_command "$XFS_PROPERTY_PROG" "xfs_property"
+
+testdir=$TEST_DIR/$seq
+mntpt=$testdir/mount
+disk1=$testdir/disk1
+disk2=$testdir/disk2
+
+mkdir -p "$mntpt"
+$XFS_IO_PROG -f -c "truncate 300m" $disk1
+$XFS_IO_PROG -f -c "truncate 300m" $disk2
+loop1="$(_create_loop_device "$disk1")"
+
+filter_mntpt() {
+ sed -e "s|$mntpt|MNTPT|g"
+}
+
+_mkfs_dev "$loop1" >> $seqres.full
+_mount "$loop1" "$mntpt" || _notrun "cannot mount victim filesystem"
+
+_xfs_has_feature $mntpt rmapbt || \
+ _notrun "reverse mapping required to test directory auto-repair"
+_xfs_has_feature $mntpt parent || \
+ _notrun "parent pointers required to test directory auto-repair"
+_require_xfs_healer $mntpt --repair
+
+# Configure the filesystem for automatic repair of the filesystem.
+$XFS_PROPERTY_PROG $mntpt set autofsck=repair >> $seqres.full
+
+# Create a largeish directory
+dblksz=$(_xfs_get_dir_blocksize "$mntpt")
+echo testdata > $mntpt/a
+mkdir -p "$mntpt/some/victimdir"
+for ((i = 0; i < (dblksz / 255); i++)); do
+ fname="$(printf "%0255d" "$i")"
+ ln $mntpt/a $mntpt/some/victimdir/$fname
+done
+
+# Did we get at least two dir blocks?
+dirsize=$(stat -c '%s' $mntpt/some/victimdir)
+test "$dirsize" -gt "$dblksz" || echo "failed to create two-block directory"
+
+# Clone the fs, break the directory, remount filesystem
+_unmount "$mntpt"
+
+cp --sparse=always "$disk1" "$disk2" || _fail "cannot copy disk1"
+loop2="$(_create_loop_device_like_bdev "$disk2" "$loop1")"
+
+$XFS_DB_PROG "$loop1" -x \
+ -c 'path /some/victimdir' \
+ -c 'bmap' \
+ -c 'dblock 1' \
+ -c 'blocktrash -z -0 -o 0 -x 2048 -y 2048 -n 2048' >> $seqres.full
+_mount "$loop1" "$mntpt" || _fail "cannot mount broken fs"
+
+_invoke_xfs_healer "$mntpt" "$tmp.healer" --repair
+
+# Stop the healer process so that it can't read error events while we do some
+# shenanigans.
+test -n "$XFS_HEALER_PID" || _fail "nobody set XFS_HEALER_PID?"
+kill -STOP $XFS_HEALER_PID
+
+
+echo "LOG $XFS_HEALER_PID SO FAR:" >> $seqres.full
+cat $tmp.healer >> $seqres.full
+
+# Access the broken directory to trigger a repair event, which will not yet be
+# processed.
+ls $mntpt/some/victimdir > /dev/null 2> $tmp.err
+filter_mntpt < $tmp.err
+
+ps auxfww | grep xfs_healer >> $seqres.full
+
+echo "LOG AFTER TRYING TO POKE:" >> $seqres.full
+cat $tmp.healer >> $seqres.full
+
+# Mount the clone filesystem to the same mountpoint so that the healer cannot
+# actually reopen it to perform repairs.
+_mount "$loop2" "$mntpt" -o nouuid || _fail "cannot mount decoy fs"
+
+grep -w xfs /proc/mounts >> $seqres.full
+
+# Continue the healer process so it can handle events now. Wait a few seconds
+# while it fails to reopen disk1's mount point to repair things.
+kill -CONT $XFS_HEALER_PID
+sleep 2
+
+new_dir_unmount() {
+ _unmount "$mntpt"
+ _unmount "$mntpt"
+}
+
+# Unmount to kill the healer
+_kill_xfs_healer new_dir_unmount
+echo "LOG AFTER FAILURE" >> $seqres.full
+cat $tmp.healer >> $seqres.full
+
+# Did the healer log complaints about not being able to reopen the mountpoint
+# to enact repairs?
+grep -q 'Stale file handle' $tmp.healer || \
+ echo "Should have seen stale file handle complaints"
+
+status=0
+exit
diff --git a/tests/xfs/1901.out b/tests/xfs/1901.out
new file mode 100755
index 00000000000000..ff83e03725307a
--- /dev/null
+++ b/tests/xfs/1901.out
@@ -0,0 +1,2 @@
+QA output created by 1901
+ls: reading directory 'MNTPT/some/victimdir': Structure needs cleaning
^ permalink raw reply related [flat|nested] 45+ messages in thread* Re: [PATCH 12/14] xfs: test xfs_healer wont repair the wrong filesystem
2026-03-10 3:53 ` [PATCH 12/14] xfs: test xfs_healer wont repair the wrong filesystem Darrick J. Wong
@ 2026-03-13 19:53 ` Zorro Lang
0 siblings, 0 replies; 45+ messages in thread
From: Zorro Lang @ 2026-03-13 19:53 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: fstests, linux-xfs
On Mon, Mar 09, 2026 at 08:53:15PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
>
> Make sure that when xfs_healer needs to reopen a filesystem to repair it, it
> won't latch on to another xfs filesystem that has been mounted atop the same
> mountpoint.
>
> Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> ---
Tests and looks good to me,
Reviewed-by: Zorro Lang <zlang@redhat.com>
> tests/xfs/1901 | 137 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> tests/xfs/1901.out | 2 +
> 2 files changed, 139 insertions(+)
> create mode 100755 tests/xfs/1901
> create mode 100755 tests/xfs/1901.out
>
>
> diff --git a/tests/xfs/1901 b/tests/xfs/1901
> new file mode 100755
> index 00000000000000..c92dcf9a3b3d48
> --- /dev/null
> +++ b/tests/xfs/1901
> @@ -0,0 +1,137 @@
> +#! /bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +# Copyright (c) 2025-2026 Oracle. All Rights Reserved.
> +#
> +# FS QA Test 1901
> +#
> +# Ensure that autonomous self healing won't fix the wrong filesystem if a
> +# snapshot of the original filesystem is now mounted on the same directory as
> +# the original.
> +#
> +. ./common/preamble
> +_begin_fstest auto selfhealing
> +
> +. ./common/filter
> +. ./common/fuzzy
> +. ./common/systemd
> +
> +_cleanup()
> +{
> + command -v _kill_fsstress &>/dev/null && _kill_fsstress
> + cd /
> + rm -r -f $tmp.*
> + test -e "$mntpt" && _unmount "$mntpt" &>/dev/null
> + test -e "$mntpt" && _unmount "$mntpt" &>/dev/null
> + test -e "$loop1" && _destroy_loop_device "$loop1"
> + test -e "$loop2" && _destroy_loop_device "$loop2"
> + test -e "$testdir" && rm -r -f "$testdir"
> +}
> +
> +_require_test
> +_require_scrub
> +_require_xfs_io_command "repair" # online repair support
> +_require_xfs_db_command "blocktrash"
> +_require_command "$XFS_HEALER_PROG" "xfs_healer"
> +_require_command "$XFS_PROPERTY_PROG" "xfs_property"
> +
> +testdir=$TEST_DIR/$seq
> +mntpt=$testdir/mount
> +disk1=$testdir/disk1
> +disk2=$testdir/disk2
> +
> +mkdir -p "$mntpt"
> +$XFS_IO_PROG -f -c "truncate 300m" $disk1
> +$XFS_IO_PROG -f -c "truncate 300m" $disk2
> +loop1="$(_create_loop_device "$disk1")"
> +
> +filter_mntpt() {
> + sed -e "s|$mntpt|MNTPT|g"
> +}
> +
> +_mkfs_dev "$loop1" >> $seqres.full
> +_mount "$loop1" "$mntpt" || _notrun "cannot mount victim filesystem"
> +
> +_xfs_has_feature $mntpt rmapbt || \
> + _notrun "reverse mapping required to test directory auto-repair"
> +_xfs_has_feature $mntpt parent || \
> + _notrun "parent pointers required to test directory auto-repair"
> +_require_xfs_healer $mntpt --repair
> +
> +# Configure the filesystem for automatic repair of the filesystem.
> +$XFS_PROPERTY_PROG $mntpt set autofsck=repair >> $seqres.full
> +
> +# Create a largeish directory
> +dblksz=$(_xfs_get_dir_blocksize "$mntpt")
> +echo testdata > $mntpt/a
> +mkdir -p "$mntpt/some/victimdir"
> +for ((i = 0; i < (dblksz / 255); i++)); do
> + fname="$(printf "%0255d" "$i")"
> + ln $mntpt/a $mntpt/some/victimdir/$fname
> +done
> +
> +# Did we get at least two dir blocks?
> +dirsize=$(stat -c '%s' $mntpt/some/victimdir)
> +test "$dirsize" -gt "$dblksz" || echo "failed to create two-block directory"
> +
> +# Clone the fs, break the directory, remount filesystem
> +_unmount "$mntpt"
> +
> +cp --sparse=always "$disk1" "$disk2" || _fail "cannot copy disk1"
> +loop2="$(_create_loop_device_like_bdev "$disk2" "$loop1")"
> +
> +$XFS_DB_PROG "$loop1" -x \
> + -c 'path /some/victimdir' \
> + -c 'bmap' \
> + -c 'dblock 1' \
> + -c 'blocktrash -z -0 -o 0 -x 2048 -y 2048 -n 2048' >> $seqres.full
> +_mount "$loop1" "$mntpt" || _fail "cannot mount broken fs"
> +
> +_invoke_xfs_healer "$mntpt" "$tmp.healer" --repair
> +
> +# Stop the healer process so that it can't read error events while we do some
> +# shenanigans.
> +test -n "$XFS_HEALER_PID" || _fail "nobody set XFS_HEALER_PID?"
> +kill -STOP $XFS_HEALER_PID
> +
> +
> +echo "LOG $XFS_HEALER_PID SO FAR:" >> $seqres.full
> +cat $tmp.healer >> $seqres.full
> +
> +# Access the broken directory to trigger a repair event, which will not yet be
> +# processed.
> +ls $mntpt/some/victimdir > /dev/null 2> $tmp.err
> +filter_mntpt < $tmp.err
> +
> +ps auxfww | grep xfs_healer >> $seqres.full
> +
> +echo "LOG AFTER TRYING TO POKE:" >> $seqres.full
> +cat $tmp.healer >> $seqres.full
> +
> +# Mount the clone filesystem to the same mountpoint so that the healer cannot
> +# actually reopen it to perform repairs.
> +_mount "$loop2" "$mntpt" -o nouuid || _fail "cannot mount decoy fs"
> +
> +grep -w xfs /proc/mounts >> $seqres.full
> +
> +# Continue the healer process so it can handle events now. Wait a few seconds
> +# while it fails to reopen disk1's mount point to repair things.
> +kill -CONT $XFS_HEALER_PID
> +sleep 2
> +
> +new_dir_unmount() {
> + _unmount "$mntpt"
> + _unmount "$mntpt"
> +}
> +
> +# Unmount to kill the healer
> +_kill_xfs_healer new_dir_unmount
> +echo "LOG AFTER FAILURE" >> $seqres.full
> +cat $tmp.healer >> $seqres.full
> +
> +# Did the healer log complaints about not being able to reopen the mountpoint
> +# to enact repairs?
> +grep -q 'Stale file handle' $tmp.healer || \
> + echo "Should have seen stale file handle complaints"
> +
> +status=0
> +exit
> diff --git a/tests/xfs/1901.out b/tests/xfs/1901.out
> new file mode 100755
> index 00000000000000..ff83e03725307a
> --- /dev/null
> +++ b/tests/xfs/1901.out
> @@ -0,0 +1,2 @@
> +QA output created by 1901
> +ls: reading directory 'MNTPT/some/victimdir': Structure needs cleaning
>
^ permalink raw reply [flat|nested] 45+ messages in thread
* [PATCH 13/14] xfs: test xfs_healer background service
2026-03-10 3:42 ` [PATCHSET v9 2/2] fstests: autonomous self healing of filesystems Darrick J. Wong
` (11 preceding siblings ...)
2026-03-10 3:53 ` [PATCH 12/14] xfs: test xfs_healer wont repair the wrong filesystem Darrick J. Wong
@ 2026-03-10 3:53 ` Darrick J. Wong
2026-03-13 19:56 ` Zorro Lang
2026-03-10 3:53 ` [PATCH 14/14] xfs: test xfs_healer startup service Darrick J. Wong
2026-03-12 14:21 ` [PATCH 15/14] xfs: test xfs_healer can follow private mntns mount moves Darrick J. Wong
14 siblings, 1 reply; 45+ messages in thread
From: Darrick J. Wong @ 2026-03-10 3:53 UTC (permalink / raw)
To: zlang, djwong; +Cc: fstests, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Make sure that xfs_healer can monitor and repair filesystems when it's
running as a systemd service, which is the intended usage model.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
tests/xfs/1902 | 152 ++++++++++++++++++++++++++++++++++++++++++++++++++++
tests/xfs/1902.out | 2 +
2 files changed, 154 insertions(+)
create mode 100755 tests/xfs/1902
create mode 100755 tests/xfs/1902.out
diff --git a/tests/xfs/1902 b/tests/xfs/1902
new file mode 100755
index 00000000000000..6de2d602d52cdb
--- /dev/null
+++ b/tests/xfs/1902
@@ -0,0 +1,152 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
+#
+# FS QA Test 1902
+#
+# Ensure that autonomous self healing fixes the filesystem correctly when
+# running in a systemd service
+#
+# unreliable_in_parallel: this test runs the xfs_healer systemd service, which
+# cannot be isolated to a specific testcase with the way check-parallel is
+# implemented.
+#
+. ./common/preamble
+_begin_fstest auto selfhealing unreliable_in_parallel
+
+_cleanup()
+{
+ cd /
+ if [ -n "$new_svcfile" ]; then
+ rm -f "$new_svcfile"
+ systemctl daemon-reload
+ fi
+ rm -r -f $tmp.*
+}
+
+. ./common/filter
+. ./common/fuzzy
+. ./common/systemd
+
+_require_systemd_is_running
+_require_systemd_unit_defined xfs_healer@.service
+_require_scrub
+_require_xfs_io_command "repair" # online repair support
+_require_xfs_db_command "blocktrash"
+_require_command "$XFS_HEALER_PROG" "xfs_healer"
+_require_command "$XFS_PROPERTY_PROG" "xfs_property"
+_require_scratch
+
+_scratch_mkfs >> $seqres.full
+_scratch_mount
+
+_xfs_has_feature $SCRATCH_MNT rmapbt || \
+ _notrun "reverse mapping required to test directory auto-repair"
+_xfs_has_feature $SCRATCH_MNT parent || \
+ _notrun "parent pointers required to test directory auto-repair"
+_require_xfs_healer $SCRATCH_MNT --repair
+
+# Configure the filesystem for automatic repair of the filesystem.
+$XFS_PROPERTY_PROG $SCRATCH_MNT set autofsck=repair >> $seqres.full
+
+# Create a largeish directory
+dblksz=$(_xfs_get_dir_blocksize "$SCRATCH_MNT")
+echo testdata > $SCRATCH_MNT/a
+mkdir -p "$SCRATCH_MNT/some/victimdir"
+for ((i = 0; i < (dblksz / 255); i++)); do
+ fname="$(printf "%0255d" "$i")"
+ ln $SCRATCH_MNT/a $SCRATCH_MNT/some/victimdir/$fname
+done
+
+# Did we get at least two dir blocks?
+dirsize=$(stat -c '%s' $SCRATCH_MNT/some/victimdir)
+test "$dirsize" -gt "$dblksz" || echo "failed to create two-block directory"
+
+# Break the directory
+_scratch_unmount
+_scratch_xfs_db -x \
+ -c 'path /some/victimdir' \
+ -c 'bmap' \
+ -c 'dblock 1' \
+ -c 'blocktrash -z -0 -o 0 -x 2048 -y 2048 -n 2048' >> $seqres.full
+
+# Find the existing xfs_healer@ service definition, figure out where we're
+# going to land our test-specific override
+orig_svcfile="$(_systemd_unit_path "xfs_healer@-.service")"
+test -f "$orig_svcfile" || \
+ _notrun "cannot find xfs_healer@ service file"
+
+new_svcdir="$(_systemd_runtime_dir)"
+test -d "$new_svcdir" || \
+ _notrun "cannot find runtime systemd service dir"
+
+# We need to make some local mods to the xfs_healer@ service definition
+# so we fork it and create a new service just for this test.
+new_healer_template="xfs_healer_fstest@.service"
+new_healer_svc="$(_systemd_service_unit_path "$new_healer_template" "$SCRATCH_MNT")"
+_systemd_unit_status "$new_healer_svc" 2>&1 | \
+ grep -E -q '(could not be found|Loaded: not-found)' || \
+ _notrun "systemd service \"$new_healer_svc\" found, will not mess with this"
+
+new_svcfile="$new_svcdir/$new_healer_template"
+cp "$orig_svcfile" "$new_svcfile"
+
+# Pick up all the CLI args except for --repair and --no-autofsck because we're
+# going to force it to --autofsck below
+execargs="$(grep '^ExecStart=' $new_svcfile | \
+ sed -e 's/^ExecStart=\S*//g' \
+ -e 's/--no-autofsck//g' \
+ -e 's/--repair//g')"
+sed -e '/ExecStart=/d' -e '/BindPaths=/d' -e '/ExecCondition=/d' -i $new_svcfile
+cat >> "$new_svcfile" << ENDL
+
+[Service]
+ExecCondition=$XFS_HEALER_PROG --supported %f
+ExecStart=$XFS_HEALER_PROG $execargs
+ENDL
+_systemd_reload
+
+# Emit the results of our editing to the full log.
+systemctl cat "$new_healer_svc" >> $seqres.full
+
+# Remount, with service activation
+_scratch_mount
+
+old_healer_svc="$(_xfs_healer_svcname "$SCRATCH_MNT")"
+_systemd_unit_stop "$old_healer_svc" &>> $seqres.full
+_systemd_unit_start "$new_healer_svc" &>> $seqres.full
+
+_systemd_unit_status "$new_healer_svc" 2>&1 | grep -q 'Active: active' || \
+ echo "systemd service \"$new_healer_svc\" not running??"
+
+# Access the broken directory to trigger a repair, then poll the directory
+# for 5 seconds to see if it gets fixed without us needing to intervene.
+ls $SCRATCH_MNT/some/victimdir > /dev/null 2> $tmp.err
+_filter_scratch < $tmp.err
+try=0
+while [ $try -lt 50 ] && grep -q 'Structure needs cleaning' $tmp.err; do
+ echo "try $try saw corruption" >> $seqres.full
+ sleep 0.1
+ ls $SCRATCH_MNT/some/victimdir > /dev/null 2> $tmp.err
+ try=$((try + 1))
+done
+echo "try $try no longer saw corruption or gave up" >> $seqres.full
+_filter_scratch < $tmp.err
+
+# List the dirents of /victimdir to see if it stops reporting corruption
+ls $SCRATCH_MNT/some/victimdir > /dev/null 2> $tmp.err
+try=0
+while [ $try -lt 50 ] && grep -q 'Structure needs cleaning' $tmp.err; do
+ echo "retry $try still saw corruption" >> $seqres.full
+ sleep 0.1
+ ls $SCRATCH_MNT/some/victimdir > /dev/null 2> $tmp.err
+ try=$((try + 1))
+done
+echo "retry $try no longer saw corruption or gave up" >> $seqres.full
+
+# Unmount to kill the healer
+_scratch_kill_xfs_healer
+journalctl -u "$new_healer_svc" >> $seqres.full
+
+status=0
+exit
diff --git a/tests/xfs/1902.out b/tests/xfs/1902.out
new file mode 100755
index 00000000000000..84f9b9e50e1e02
--- /dev/null
+++ b/tests/xfs/1902.out
@@ -0,0 +1,2 @@
+QA output created by 1902
+ls: reading directory 'SCRATCH_MNT/some/victimdir': Structure needs cleaning
^ permalink raw reply related [flat|nested] 45+ messages in thread* Re: [PATCH 13/14] xfs: test xfs_healer background service
2026-03-10 3:53 ` [PATCH 13/14] xfs: test xfs_healer background service Darrick J. Wong
@ 2026-03-13 19:56 ` Zorro Lang
0 siblings, 0 replies; 45+ messages in thread
From: Zorro Lang @ 2026-03-13 19:56 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: fstests, linux-xfs
On Mon, Mar 09, 2026 at 08:53:31PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
>
> Make sure that when xfs_healer can monitor and repair filesystems when it's
> running as a systemd service, which is the intended usage model.
>
> Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> ---
Tests and looks good to me,
Reviewed-by: Zorro Lang <zlang@redhat.com>
> tests/xfs/1902 | 152 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> tests/xfs/1902.out | 2 +
> 2 files changed, 154 insertions(+)
> create mode 100755 tests/xfs/1902
> create mode 100755 tests/xfs/1902.out
>
>
> diff --git a/tests/xfs/1902 b/tests/xfs/1902
> new file mode 100755
> index 00000000000000..6de2d602d52cdb
> --- /dev/null
> +++ b/tests/xfs/1902
> @@ -0,0 +1,152 @@
> +#! /bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
> +#
> +# FS QA Test 1902
> +#
> +# Ensure that autonomous self healing fixes the filesystem correctly when
> +# running in a systemd service
> +#
> +# unreliable_in_parallel: this test runs the xfs_healer systemd service, which
> +# cannot be isolated to a specific testcase with the way check-parallel is
> +# implemented.
> +#
> +. ./common/preamble
> +_begin_fstest auto selfhealing unreliable_in_parallel
> +
> +_cleanup()
> +{
> + cd /
> + if [ -n "$new_svcfile" ]; then
> + rm -f "$new_svcfile"
> + systemctl daemon-reload
> + fi
> + rm -r -f $tmp.*
> +}
> +
> +. ./common/filter
> +. ./common/fuzzy
> +. ./common/systemd
> +
> +_require_systemd_is_running
> +_require_systemd_unit_defined xfs_healer@.service
> +_require_scrub
> +_require_xfs_io_command "repair" # online repair support
> +_require_xfs_db_command "blocktrash"
> +_require_command "$XFS_HEALER_PROG" "xfs_healer"
> +_require_command "$XFS_PROPERTY_PROG" "xfs_property"
> +_require_scratch
> +
> +_scratch_mkfs >> $seqres.full
> +_scratch_mount
> +
> +_xfs_has_feature $SCRATCH_MNT rmapbt || \
> + _notrun "reverse mapping required to test directory auto-repair"
> +_xfs_has_feature $SCRATCH_MNT parent || \
> + _notrun "parent pointers required to test directory auto-repair"
> +_require_xfs_healer $SCRATCH_MNT --repair
> +
> +# Configure the filesystem for automatic repair of the filesystem.
> +$XFS_PROPERTY_PROG $SCRATCH_MNT set autofsck=repair >> $seqres.full
> +
> +# Create a largeish directory
> +dblksz=$(_xfs_get_dir_blocksize "$SCRATCH_MNT")
> +echo testdata > $SCRATCH_MNT/a
> +mkdir -p "$SCRATCH_MNT/some/victimdir"
> +for ((i = 0; i < (dblksz / 255); i++)); do
> + fname="$(printf "%0255d" "$i")"
> + ln $SCRATCH_MNT/a $SCRATCH_MNT/some/victimdir/$fname
> +done
> +
> +# Did we get at least two dir blocks?
> +dirsize=$(stat -c '%s' $SCRATCH_MNT/some/victimdir)
> +test "$dirsize" -gt "$dblksz" || echo "failed to create two-block directory"
> +
> +# Break the directory
> +_scratch_unmount
> +_scratch_xfs_db -x \
> + -c 'path /some/victimdir' \
> + -c 'bmap' \
> + -c 'dblock 1' \
> + -c 'blocktrash -z -0 -o 0 -x 2048 -y 2048 -n 2048' >> $seqres.full
> +
> +# Find the existing xfs_healer@ service definition, figure out where we're
> +# going to land our test-specific override
> +orig_svcfile="$(_systemd_unit_path "xfs_healer@-.service")"
> +test -f "$orig_svcfile" || \
> + _notrun "cannot find xfs_healer@ service file"
> +
> +new_svcdir="$(_systemd_runtime_dir)"
> +test -d "$new_svcdir" || \
> + _notrun "cannot find runtime systemd service dir"
> +
> +# We need to make some local mods to the xfs_healer@ service definition
> +# so we fork it and create a new service just for this test.
> +new_healer_template="xfs_healer_fstest@.service"
> +new_healer_svc="$(_systemd_service_unit_path "$new_healer_template" "$SCRATCH_MNT")"
> +_systemd_unit_status "$new_healer_svc" 2>&1 | \
> + grep -E -q '(could not be found|Loaded: not-found)' || \
> + _notrun "systemd service \"$new_healer_svc\" found, will not mess with this"
> +
> +new_svcfile="$new_svcdir/$new_healer_template"
> +cp "$orig_svcfile" "$new_svcfile"
> +
> +# Pick up all the CLI args except for --repair and --no-autofsck because we're
> +# going to force it to --autofsck below
> +execargs="$(grep '^ExecStart=' $new_svcfile | \
> + sed -e 's/^ExecStart=\S*//g' \
> + -e 's/--no-autofsck//g' \
> + -e 's/--repair//g')"
> +sed -e '/ExecStart=/d' -e '/BindPaths=/d' -e '/ExecCondition=/d' -i $new_svcfile
> +cat >> "$new_svcfile" << ENDL
> +
> +[Service]
> +ExecCondition=$XFS_HEALER_PROG --supported %f
> +ExecStart=$XFS_HEALER_PROG $execargs
> +ENDL
> +_systemd_reload
> +
> +# Emit the results of our editing to the full log.
> +systemctl cat "$new_healer_svc" >> $seqres.full
> +
> +# Remount, with service activation
> +_scratch_mount
> +
> +old_healer_svc="$(_xfs_healer_svcname "$SCRATCH_MNT")"
> +_systemd_unit_stop "$old_healer_svc" &>> $seqres.full
> +_systemd_unit_start "$new_healer_svc" &>> $seqres.full
> +
> +_systemd_unit_status "$new_healer_svc" 2>&1 | grep -q 'Active: active' || \
> + echo "systemd service \"$new_healer_svc\" not running??"
> +
> +# Access the broken directory to trigger a repair, then poll the directory
> +# for 5 seconds to see if it gets fixed without us needing to intervene.
> +ls $SCRATCH_MNT/some/victimdir > /dev/null 2> $tmp.err
> +_filter_scratch < $tmp.err
> +try=0
> +while [ $try -lt 50 ] && grep -q 'Structure needs cleaning' $tmp.err; do
> + echo "try $try saw corruption" >> $seqres.full
> + sleep 0.1
> + ls $SCRATCH_MNT/some/victimdir > /dev/null 2> $tmp.err
> + try=$((try + 1))
> +done
> +echo "try $try no longer saw corruption or gave up" >> $seqres.full
> +_filter_scratch < $tmp.err
> +
> +# List the dirents of /victimdir to see if it stops reporting corruption
> +ls $SCRATCH_MNT/some/victimdir > /dev/null 2> $tmp.err
> +try=0
> +while [ $try -lt 50 ] && grep -q 'Structure needs cleaning' $tmp.err; do
> + echo "retry $try still saw corruption" >> $seqres.full
> + sleep 0.1
> + ls $SCRATCH_MNT/some/victimdir > /dev/null 2> $tmp.err
> + try=$((try + 1))
> +done
> +echo "retry $try no longer saw corruption or gave up" >> $seqres.full
> +
> +# Unmount to kill the healer
> +_scratch_kill_xfs_healer
> +journalctl -u "$new_healer_svc" >> $seqres.full
> +
> +status=0
> +exit
> diff --git a/tests/xfs/1902.out b/tests/xfs/1902.out
> new file mode 100755
> index 00000000000000..84f9b9e50e1e02
> --- /dev/null
> +++ b/tests/xfs/1902.out
> @@ -0,0 +1,2 @@
> +QA output created by 1902
> +ls: reading directory 'SCRATCH_MNT/some/victimdir': Structure needs cleaning
>
^ permalink raw reply [flat|nested] 45+ messages in thread
* [PATCH 14/14] xfs: test xfs_healer startup service
2026-03-10 3:42 ` [PATCHSET v9 2/2] fstests: autonomous self healing of filesystems Darrick J. Wong
` (12 preceding siblings ...)
2026-03-10 3:53 ` [PATCH 13/14] xfs: test xfs_healer background service Darrick J. Wong
@ 2026-03-10 3:53 ` Darrick J. Wong
2026-03-13 19:58 ` Zorro Lang
2026-03-12 14:21 ` [PATCH 15/14] xfs: test xfs_healer can follow private mntns mount moves Darrick J. Wong
14 siblings, 1 reply; 45+ messages in thread
From: Darrick J. Wong @ 2026-03-10 3:53 UTC (permalink / raw)
To: zlang, djwong; +Cc: fstests, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Make sure that xfs_healer_start can actually start up xfs_healer service
instances when a filesystem is mounted.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
tests/xfs/1903 | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++++
tests/xfs/1903.out | 6 +++
2 files changed, 130 insertions(+)
create mode 100755 tests/xfs/1903
create mode 100644 tests/xfs/1903.out
diff --git a/tests/xfs/1903 b/tests/xfs/1903
new file mode 100755
index 00000000000000..d71d75a6af3f9d
--- /dev/null
+++ b/tests/xfs/1903
@@ -0,0 +1,124 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2026 Oracle. All Rights Reserved.
+#
+# FS QA Test No. 1903
+#
+# Check that the xfs_healer startup service starts the per-mount xfs_healer
+# service for the scratch filesystem. IOWs, this is basic testing for the
+# xfs_healer systemd background services.
+#
+
+# unreliable_in_parallel: this appears to try to run healer services on all
+# mounted filesystems - that's a problem when there are a hundred other test
+# filesystems mounted running other tests...
+
+. ./common/preamble
+_begin_fstest auto selfhealing unreliable_in_parallel
+
+_cleanup()
+{
+ cd /
+ test -n "$new_healerstart_svc" &&
+ _systemd_unit_stop "$new_healerstart_svc"
+ test -n "$was_masked" && \
+ _systemd_unit_mask "$healer_svc" &>> $seqres.full
+ if [ -n "$new_svcfile" ]; then
+ rm -f "$new_svcfile"
+ systemctl daemon-reload
+ fi
+ rm -r -f $tmp.*
+}
+
+. ./common/filter
+. ./common/populate
+. ./common/fuzzy
+. ./common/systemd
+
+_require_systemd_is_running
+_require_systemd_unit_defined xfs_healer@.service
+_require_systemd_unit_defined xfs_healer_start.service
+_require_scratch
+_require_scrub
+_require_xfs_io_command "scrub"
+_require_xfs_spaceman_command "health"
+_require_populate_commands
+_require_command "$XFS_HEALER_PROG" "xfs_healer"
+_require_command $ATTR_PROG "attr"
+
+_xfs_skip_online_rebuild
+_xfs_skip_offline_rebuild
+
+orig_svcfile="$(_systemd_unit_path "xfs_healer_start.service")"
+test -f "$orig_svcfile" || \
+ _notrun "cannot find xfs_healer_start service file"
+
+new_svcdir="$(_systemd_runtime_dir)"
+test -d "$new_svcdir" || \
+ _notrun "cannot find runtime systemd service dir"
+
+# We need to make some local mods to the xfs_healer_start service definition
+# so we fork it and create a new service just for this test.
+new_healerstart_svc="xfs_healer_start_fstest.service"
+_systemd_unit_status "$new_healerstart_svc" 2>&1 | \
+ grep -E -q '(could not be found|Loaded: not-found)' || \
+ _notrun "systemd service \"$new_healerstart_svc\" found, will not mess with this"
+
+find_healer_trace() {
+ local path="$1"
+
+ sleep 2 # wait for delays in startup
+ $XFS_HEALER_PROG --supported "$path" 2>&1 | grep -q 'already running' || \
+ echo "cannot find evidence that xfs_healer is running for $path"
+}
+
+echo "Format and populate"
+_scratch_mkfs >> $seqres.full
+_scratch_mount
+_require_xfs_healer $SCRATCH_MNT
+
+# Configure the filesystem for background checks of the filesystem.
+$ATTR_PROG -R -s xfs:autofsck -V check $SCRATCH_MNT >> $seqres.full
+
+was_masked=
+healer_svc="$(_xfs_healer_svcname "$SCRATCH_MNT")"
+
+# Preserve the xfs_healer@ mask state -- we don't want this permanently
+# changing global state.
+if _systemd_unit_masked "$healer_svc"; then
+ _systemd_unit_unmask "$healer_svc" &>> $seqres.full
+ was_masked=1
+fi
+
+echo "Start healer on scratch FS"
+_systemd_unit_start "$healer_svc"
+find_healer_trace "$SCRATCH_MNT"
+_systemd_unit_stop "$healer_svc"
+
+new_svcfile="$new_svcdir/$new_healerstart_svc"
+cp "$orig_svcfile" "$new_svcfile"
+
+sed -e '/ExecStart=/d' -e '/BindPaths=/d' -e '/ExecCondition=/d' -i $new_svcfile
+cat >> "$new_svcfile" << ENDL
+[Service]
+ExecCondition=$XFS_HEALER_START_PROG --supported
+ExecStart=$XFS_HEALER_START_PROG
+ENDL
+_systemd_reload
+
+# Emit the results of our editing to the full log.
+systemctl cat "$new_healerstart_svc" >> $seqres.full
+
+echo "Start healer for everything"
+_systemd_unit_start "$new_healerstart_svc"
+find_healer_trace "$SCRATCH_MNT"
+
+echo "Restart healer for scratch FS"
+_scratch_cycle_mount
+find_healer_trace "$SCRATCH_MNT"
+
+echo "Healer testing done" | tee -a $seqres.full
+
+# success, all done
+status=0
+exit
diff --git a/tests/xfs/1903.out b/tests/xfs/1903.out
new file mode 100644
index 00000000000000..07810f60ca10c6
--- /dev/null
+++ b/tests/xfs/1903.out
@@ -0,0 +1,6 @@
+QA output created by 1903
+Format and populate
+Start healer on scratch FS
+Start healer for everything
+Restart healer for scratch FS
+Healer testing done
^ permalink raw reply related [flat|nested] 45+ messages in thread* Re: [PATCH 14/14] xfs: test xfs_healer startup service
2026-03-10 3:53 ` [PATCH 14/14] xfs: test xfs_healer startup service Darrick J. Wong
@ 2026-03-13 19:58 ` Zorro Lang
0 siblings, 0 replies; 45+ messages in thread
From: Zorro Lang @ 2026-03-13 19:58 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: fstests, linux-xfs
On Mon, Mar 09, 2026 at 08:53:46PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
>
> Make sure that xfs_healer_start can actually start up xfs_healer service
> instances when a filesystem is mounted.
>
> Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> ---
Tests and looks good to me,
Reviewed-by: Zorro Lang <zlang@redhat.com>
> tests/xfs/1903 | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> tests/xfs/1903.out | 6 +++
> 2 files changed, 130 insertions(+)
> create mode 100755 tests/xfs/1903
> create mode 100644 tests/xfs/1903.out
>
>
> diff --git a/tests/xfs/1903 b/tests/xfs/1903
> new file mode 100755
> index 00000000000000..d71d75a6af3f9d
> --- /dev/null
> +++ b/tests/xfs/1903
> @@ -0,0 +1,124 @@
> +#! /bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +# Copyright (c) 2026 Oracle. All Rights Reserved.
> +#
> +# FS QA Test No. 1903
> +#
> +# Check that the xfs_healer startup service starts the per-mount xfs_healer
> +# service for the scratch filesystem. IOWs, this is basic testing for the
> +# xfs_healer systemd background services.
> +#
> +
> +# unreliable_in_parallel: this appears to try to run healer services on all
> +# mounted filesystems - that's a problem when there are a hundred other test
> +# filesystems mounted running other tests...
> +
> +. ./common/preamble
> +_begin_fstest auto selfhealing unreliable_in_parallel
> +
> +_cleanup()
> +{
> + cd /
> + test -n "$new_healerstart_svc" &&
> + _systemd_unit_stop "$new_healerstart_svc"
> + test -n "$was_masked" && \
> + _systemd_unit_mask "$healer_svc" &>> $seqres.full
> + if [ -n "$new_svcfile" ]; then
> + rm -f "$new_svcfile"
> + systemctl daemon-reload
> + fi
> + rm -r -f $tmp.*
> +}
> +
> +. ./common/filter
> +. ./common/populate
> +. ./common/fuzzy
> +. ./common/systemd
> +
> +_require_systemd_is_running
> +_require_systemd_unit_defined xfs_healer@.service
> +_require_systemd_unit_defined xfs_healer_start.service
> +_require_scratch
> +_require_scrub
> +_require_xfs_io_command "scrub"
> +_require_xfs_spaceman_command "health"
> +_require_populate_commands
> +_require_command "$XFS_HEALER_PROG" "xfs_healer"
> +_require_command $ATTR_PROG "attr"
> +
> +_xfs_skip_online_rebuild
> +_xfs_skip_offline_rebuild
> +
> +orig_svcfile="$(_systemd_unit_path "xfs_healer_start.service")"
> +test -f "$orig_svcfile" || \
> + _notrun "cannot find xfs_healer_start service file"
> +
> +new_svcdir="$(_systemd_runtime_dir)"
> +test -d "$new_svcdir" || \
> + _notrun "cannot find runtime systemd service dir"
> +
> +# We need to make some local mods to the xfs_healer_start service definition
> +# so we fork it and create a new service just for this test.
> +new_healerstart_svc="xfs_healer_start_fstest.service"
> +_systemd_unit_status "$new_healerstart_svc" 2>&1 | \
> + grep -E -q '(could not be found|Loaded: not-found)' || \
> + _notrun "systemd service \"$new_healerstart_svc\" found, will not mess with this"
> +
> +find_healer_trace() {
> + local path="$1"
> +
> + sleep 2 # wait for delays in startup
> + $XFS_HEALER_PROG --supported "$path" 2>&1 | grep -q 'already running' || \
> + echo "cannot find evidence that xfs_healer is running for $path"
> +}
> +
> +echo "Format and populate"
> +_scratch_mkfs >> $seqres.full
> +_scratch_mount
> +_require_xfs_healer $SCRATCH_MNT
> +
> +# Configure the filesystem for background checks of the filesystem.
> +$ATTR_PROG -R -s xfs:autofsck -V check $SCRATCH_MNT >> $seqres.full
> +
> +was_masked=
> +healer_svc="$(_xfs_healer_svcname "$SCRATCH_MNT")"
> +
> +# Preserve the xfs_healer@ mask state -- we don't want this permanently
> +# changing global state.
> +if _systemd_unit_masked "$healer_svc"; then
> + _systemd_unit_unmask "$healer_svc" &>> $seqres.full
> + was_masked=1
> +fi
> +
> +echo "Start healer on scratch FS"
> +_systemd_unit_start "$healer_svc"
> +find_healer_trace "$SCRATCH_MNT"
> +_systemd_unit_stop "$healer_svc"
> +
> +new_svcfile="$new_svcdir/$new_healerstart_svc"
> +cp "$orig_svcfile" "$new_svcfile"
> +
> +sed -e '/ExecStart=/d' -e '/BindPaths=/d' -e '/ExecCondition=/d' -i $new_svcfile
> +cat >> "$new_svcfile" << ENDL
> +[Service]
> +ExecCondition=$XFS_HEALER_START_PROG --supported
> +ExecStart=$XFS_HEALER_START_PROG
> +ENDL
> +_systemd_reload
> +
> +# Emit the results of our editing to the full log.
> +systemctl cat "$new_healerstart_svc" >> $seqres.full
> +
> +echo "Start healer for everything"
> +_systemd_unit_start "$new_healerstart_svc"
> +find_healer_trace "$SCRATCH_MNT"
> +
> +echo "Restart healer for scratch FS"
> +_scratch_cycle_mount
> +find_healer_trace "$SCRATCH_MNT"
> +
> +echo "Healer testing done" | tee -a $seqres.full
> +
> +# success, all done
> +status=0
> +exit
> diff --git a/tests/xfs/1903.out b/tests/xfs/1903.out
> new file mode 100644
> index 00000000000000..07810f60ca10c6
> --- /dev/null
> +++ b/tests/xfs/1903.out
> @@ -0,0 +1,6 @@
> +QA output created by 1903
> +Format and populate
> +Start healer on scratch FS
> +Start healer for everything
> +Restart healer for scratch FS
> +Healer testing done
>
^ permalink raw reply [flat|nested] 45+ messages in thread
* [PATCH 15/14] xfs: test xfs_healer can follow private mntns mount moves
2026-03-10 3:42 ` [PATCHSET v9 2/2] fstests: autonomous self healing of filesystems Darrick J. Wong
` (13 preceding siblings ...)
2026-03-10 3:53 ` [PATCH 14/14] xfs: test xfs_healer startup service Darrick J. Wong
@ 2026-03-12 14:21 ` Darrick J. Wong
2026-03-13 20:05 ` Zorro Lang
14 siblings, 1 reply; 45+ messages in thread
From: Darrick J. Wong @ 2026-03-12 14:21 UTC (permalink / raw)
To: zlang, Christoph Hellwig; +Cc: fstests, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Make sure that when xfs_healer needs to reopen a filesystem to repair
it, it can still find the filesystem even if it has been mount --move'd.
This requires a bunch of private namespace magic.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
tests/xfs/1904 | 129 ++++++++++++++++++++++++++++++++++++++++++++++++++++
tests/xfs/1904.out | 3 +
2 files changed, 132 insertions(+)
create mode 100755 tests/xfs/1904
create mode 100755 tests/xfs/1904.out
diff --git a/tests/xfs/1904 b/tests/xfs/1904
new file mode 100755
index 00000000000000..78e8f5dcb0e834
--- /dev/null
+++ b/tests/xfs/1904
@@ -0,0 +1,129 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2026 Oracle. All Rights Reserved.
+#
+# FS QA Test 1904
+#
+# Ensure that autonomous self healing fixes the filesystem correctly even if
+# the original mount has moved somewhere else via --move.
+#
+. ./common/preamble
+_begin_fstest auto selfhealing
+
+. ./common/filter
+. ./common/fuzzy
+. ./common/systemd
+
+if [ -n "$IN_MOUNTNS" ]; then
+ _mount --make-rprivate /
+ findmnt -o TARGET,PROPAGATION >> $seqres.full
+
+ _scratch_mount
+ _scratch_invoke_xfs_healer "$tmp.healer" --repair
+
+ # Move the scratch filesystem to a completely different mountpoint so that
+ # we can test if the healer can find it again.
+ new_dir=$TEST_DIR/moocow
+ mkdir -p $new_dir
+ _mount --move $SCRATCH_MNT $new_dir
+
+ df -t xfs >> $seqres.full
+
+ # Access the broken directory to trigger a repair, then poll the directory
+ # for 5 seconds to see if it gets fixed without us needing to intervene.
+ ls $new_dir/some/victimdir > /dev/null 2> $tmp.err
+ _filter_scratch < $tmp.err | _filter_test_dir
+ try=0
+ while [ $try -lt 50 ] && grep -q 'Structure needs cleaning' $tmp.err; do
+ echo "try $try saw corruption" >> $seqres.full
+ sleep 0.1
+ ls $new_dir/some/victimdir > /dev/null 2> $tmp.err
+ try=$((try + 1))
+ done
+ echo "try $try no longer saw corruption or gave up" >> $seqres.full
+ _filter_scratch < $tmp.err | _filter_test_dir
+
+ # List the dirents of /victimdir to see if it stops reporting corruption
+ ls $new_dir/some/victimdir > /dev/null 2> $tmp.err
+ try=0
+ while [ $try -lt 50 ] && grep -q 'Structure needs cleaning' $tmp.err; do
+ echo "retry $try still saw corruption" >> $seqres.full
+ sleep 0.1
+ ls $SCRATCH_MNT/some/victimdir > /dev/null 2> $tmp.err
+ try=$((try + 1))
+ done
+ echo "retry $try no longer saw corruption or gave up" >> $seqres.full
+
+ new_dir_unmount() {
+ _unmount $new_dir
+ }
+
+ # Unmount to kill the healer
+ _scratch_kill_xfs_healer new_dir_unmount
+ cat $tmp.healer >> $seqres.full
+
+ # No need to clean up, the mount ns destructor will detach the
+ # filesystems for us.
+ exit
+fi
+
+_cleanup()
+{
+ command -v _kill_fsstress &>/dev/null && _kill_fsstress
+ cd /
+ rm -r -f $tmp.*
+ if [ -n "$new_dir" ]; then
+ _unmount "$new_dir" &>/dev/null
+ rm -rf "$new_dir"
+ fi
+}
+
+_require_unshare
+_require_test
+_require_scrub
+_require_xfs_io_command "repair" # online repair support
+_require_xfs_db_command "blocktrash"
+_require_command "$XFS_HEALER_PROG" "xfs_healer"
+_require_command "$XFS_PROPERTY_PROG" "xfs_property"
+_require_scratch
+
+_scratch_mkfs >> $seqres.full
+_scratch_mount
+
+_xfs_has_feature $SCRATCH_MNT rmapbt || \
+ _notrun "reverse mapping required to test directory auto-repair"
+_xfs_has_feature $SCRATCH_MNT parent || \
+ _notrun "parent pointers required to test directory auto-repair"
+_require_xfs_healer $SCRATCH_MNT --repair
+
+# Configure the filesystem for automatic repair of the filesystem.
+$XFS_PROPERTY_PROG $SCRATCH_MNT set autofsck=repair >> $seqres.full
+
+# Create a largeish directory
+dblksz=$(_xfs_get_dir_blocksize "$SCRATCH_MNT")
+echo testdata > $SCRATCH_MNT/a
+mkdir -p "$SCRATCH_MNT/some/victimdir"
+for ((i = 0; i < (dblksz / 255); i++)); do
+ fname="$(printf "%0255d" "$i")"
+ ln $SCRATCH_MNT/a $SCRATCH_MNT/some/victimdir/$fname
+done
+
+# Did we get at least two dir blocks?
+dirsize=$(stat -c '%s' $SCRATCH_MNT/some/victimdir)
+test "$dirsize" -gt "$dblksz" || echo "failed to create two-block directory"
+
+# Break the directory, remount filesystem
+_scratch_unmount
+_scratch_xfs_db -x \
+ -c 'path /some/victimdir' \
+ -c 'bmap' \
+ -c 'dblock 1' \
+ -c 'blocktrash -z -0 -o 0 -x 2048 -y 2048 -n 2048' >> $seqres.full
+
+# mount --move only works if mount propagation is disabled, so we have to start
+# a subshell with a separate mount namespace, disable propagation for the
+# entire directory tree, and only then can we run our tests.
+IN_MOUNTNS=1 unshare -m bash "$0"
+
+status=0
+exit
diff --git a/tests/xfs/1904.out b/tests/xfs/1904.out
new file mode 100755
index 00000000000000..34a46298dd439a
--- /dev/null
+++ b/tests/xfs/1904.out
@@ -0,0 +1,3 @@
+QA output created by 1904
+QA output created by 1904
+ls: reading directory 'TEST_DIR/moocow/some/victimdir': Structure needs cleaning
^ permalink raw reply related [flat|nested] 45+ messages in thread* Re: [PATCH 15/14] xfs: test xfs_healer can follow private mntns mount moves
2026-03-12 14:21 ` [PATCH 15/14] xfs: test xfs_healer can follow private mntns mount moves Darrick J. Wong
@ 2026-03-13 20:05 ` Zorro Lang
2026-03-13 23:41 ` Darrick J. Wong
0 siblings, 1 reply; 45+ messages in thread
From: Zorro Lang @ 2026-03-13 20:05 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: Christoph Hellwig, fstests, linux-xfs
On Thu, Mar 12, 2026 at 07:21:30AM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
>
> Make sure that when xfs_healer needs to reopen a filesystem to repair
> it, it can still find the filesystem even if it has been mount --move'd.
> This requires a bunch of private namespace magic.
>
> Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> ---
> tests/xfs/1904 | 129 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> tests/xfs/1904.out | 3 +
> 2 files changed, 132 insertions(+)
> create mode 100755 tests/xfs/1904
> create mode 100755 tests/xfs/1904.out
>
> diff --git a/tests/xfs/1904 b/tests/xfs/1904
> new file mode 100755
> index 00000000000000..78e8f5dcb0e834
> --- /dev/null
> +++ b/tests/xfs/1904
> @@ -0,0 +1,129 @@
> +#! /bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +# Copyright (c) 2026 Oracle. All Rights Reserved.
> +#
> +# FS QA Test 1904
> +#
> +# Ensure that autonomous self healing fixes the filesystem correctly even if
> +# the original mount has moved somewhere else via --move.
> +#
> +. ./common/preamble
> +_begin_fstest auto selfhealing
> +
> +. ./common/filter
> +. ./common/fuzzy
> +. ./common/systemd
> +
> +if [ -n "$IN_MOUNTNS" ]; then
> + _mount --make-rprivate /
I'd like to add this case and the other cases related to mount propagation in this
patchset to the "mount" group. I'll do that when I merge this patchset. The others
look and test good to me,
Reviewed-by: Zorro Lang <zlang@redhat.com>
> + findmnt -o TARGET,PROPAGATION >> $seqres.full
> +
> + _scratch_mount
> + _scratch_invoke_xfs_healer "$tmp.healer" --repair
> +
> + # Move the scratch filesystem to a completely different mountpoint so that
> + # we can test if the healer can find it again.
> + new_dir=$TEST_DIR/moocow
> + mkdir -p $new_dir
> + _mount --move $SCRATCH_MNT $new_dir
> +
> + df -t xfs >> $seqres.full
> +
> + # Access the broken directory to trigger a repair, then poll the directory
> + # for 5 seconds to see if it gets fixed without us needing to intervene.
> + ls $new_dir/some/victimdir > /dev/null 2> $tmp.err
> + _filter_scratch < $tmp.err | _filter_test_dir
> + try=0
> + while [ $try -lt 50 ] && grep -q 'Structure needs cleaning' $tmp.err; do
> + echo "try $try saw corruption" >> $seqres.full
> + sleep 0.1
> + ls $new_dir/some/victimdir > /dev/null 2> $tmp.err
> + try=$((try + 1))
> + done
> + echo "try $try no longer saw corruption or gave up" >> $seqres.full
> + _filter_scratch < $tmp.err | _filter_test_dir
> +
> + # List the dirents of /victimdir to see if it stops reporting corruption
> + ls $new_dir/some/victimdir > /dev/null 2> $tmp.err
> + try=0
> + while [ $try -lt 50 ] && grep -q 'Structure needs cleaning' $tmp.err; do
> + echo "retry $try still saw corruption" >> $seqres.full
> + sleep 0.1
> + ls $SCRATCH_MNT/some/victimdir > /dev/null 2> $tmp.err
> + try=$((try + 1))
> + done
> + echo "retry $try no longer saw corruption or gave up" >> $seqres.full
> +
> + new_dir_unmount() {
> + _unmount $new_dir
> + }
> +
> + # Unmount to kill the healer
> + _scratch_kill_xfs_healer new_dir_unmount
> + cat $tmp.healer >> $seqres.full
> +
> + # No need to clean up, the mount ns destructor will detach the
> + # filesystems for us.
> + exit
> +fi
> +
> +_cleanup()
> +{
> + command -v _kill_fsstress &>/dev/null && _kill_fsstress
> + cd /
> + rm -r -f $tmp.*
> + if [ -n "$new_dir" ]; then
> + _unmount "$new_dir" &>/dev/null
> + rm -rf "$new_dir"
> + fi
> +}
> +
> +_require_unshare
> +_require_test
> +_require_scrub
> +_require_xfs_io_command "repair" # online repair support
> +_require_xfs_db_command "blocktrash"
> +_require_command "$XFS_HEALER_PROG" "xfs_healer"
> +_require_command "$XFS_PROPERTY_PROG" "xfs_property"
> +_require_scratch
> +
> +_scratch_mkfs >> $seqres.full
> +_scratch_mount
> +
> +_xfs_has_feature $SCRATCH_MNT rmapbt || \
> + _notrun "reverse mapping required to test directory auto-repair"
> +_xfs_has_feature $SCRATCH_MNT parent || \
> + _notrun "parent pointers required to test directory auto-repair"
> +_require_xfs_healer $SCRATCH_MNT --repair
> +
> +# Configure the filesystem for automatic repair of the filesystem.
> +$XFS_PROPERTY_PROG $SCRATCH_MNT set autofsck=repair >> $seqres.full
> +
> +# Create a largeish directory
> +dblksz=$(_xfs_get_dir_blocksize "$SCRATCH_MNT")
> +echo testdata > $SCRATCH_MNT/a
> +mkdir -p "$SCRATCH_MNT/some/victimdir"
> +for ((i = 0; i < (dblksz / 255); i++)); do
> + fname="$(printf "%0255d" "$i")"
> + ln $SCRATCH_MNT/a $SCRATCH_MNT/some/victimdir/$fname
> +done
> +
> +# Did we get at least two dir blocks?
> +dirsize=$(stat -c '%s' $SCRATCH_MNT/some/victimdir)
> +test "$dirsize" -gt "$dblksz" || echo "failed to create two-block directory"
> +
> +# Break the directory, remount filesystem
> +_scratch_unmount
> +_scratch_xfs_db -x \
> + -c 'path /some/victimdir' \
> + -c 'bmap' \
> + -c 'dblock 1' \
> + -c 'blocktrash -z -0 -o 0 -x 2048 -y 2048 -n 2048' >> $seqres.full
> +
> +# mount --move only works if mount propagation is disabled, so we have to start
> +# a subshell with a separate mount namespace, disable propagation for the
> +# entire directory tree, and only then can we run our tests.
> +IN_MOUNTNS=1 unshare -m bash "$0"
> +
> +status=0
> +exit
> diff --git a/tests/xfs/1904.out b/tests/xfs/1904.out
> new file mode 100755
> index 00000000000000..34a46298dd439a
> --- /dev/null
> +++ b/tests/xfs/1904.out
> @@ -0,0 +1,3 @@
> +QA output created by 1904
> +QA output created by 1904
> +ls: reading directory 'TEST_DIR/moocow/some/victimdir': Structure needs cleaning
>
^ permalink raw reply [flat|nested] 45+ messages in thread* Re: [PATCH 15/14] xfs: test xfs_healer can follow private mntns mount moves
2026-03-13 20:05 ` Zorro Lang
@ 2026-03-13 23:41 ` Darrick J. Wong
0 siblings, 0 replies; 45+ messages in thread
From: Darrick J. Wong @ 2026-03-13 23:41 UTC (permalink / raw)
To: Zorro Lang; +Cc: Christoph Hellwig, fstests, linux-xfs
On Sat, Mar 14, 2026 at 04:05:53AM +0800, Zorro Lang wrote:
> On Thu, Mar 12, 2026 at 07:21:30AM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <djwong@kernel.org>
> >
> > Make sure that when xfs_healer needs to reopen a filesystem to repair
> > it, it can still find the filesystem even if it has been mount --move'd.
> > This requires a bunch of private namespace magic.
> >
> > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> > ---
> > tests/xfs/1904 | 129 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> > tests/xfs/1904.out | 3 +
> > 2 files changed, 132 insertions(+)
> > create mode 100755 tests/xfs/1904
> > create mode 100755 tests/xfs/1904.out
> >
> > diff --git a/tests/xfs/1904 b/tests/xfs/1904
> > new file mode 100755
> > index 00000000000000..78e8f5dcb0e834
> > --- /dev/null
> > +++ b/tests/xfs/1904
> > @@ -0,0 +1,129 @@
> > +#! /bin/bash
> > +# SPDX-License-Identifier: GPL-2.0
> > +# Copyright (c) 2026 Oracle. All Rights Reserved.
> > +#
> > +# FS QA Test 1904
> > +#
> > +# Ensure that autonomous self healing fixes the filesystem correctly even if
> > +# the original mount has moved somewhere else via --move.
> > +#
> > +. ./common/preamble
> > +_begin_fstest auto selfhealing
> > +
> > +. ./common/filter
> > +. ./common/fuzzy
> > +. ./common/systemd
> > +
> > +if [ -n "$IN_MOUNTNS" ]; then
> > + _mount --make-rprivate /
>
> I'd like to add this case and the other cases related to mount propagation in this
> patchset to the "mount" group. I'll do that when I merge this patchset. The others
> look and test good to me,
<nod> That sounds reasonable. Thanks for all the other minor touch-ups
that you applied before merging into patches-in-queue!
--D
>
> Reviewed-by: Zorro Lang <zlang@redhat.com>
>
> > + findmnt -o TARGET,PROPAGATION >> $seqres.full
> > +
> > + _scratch_mount
> > + _scratch_invoke_xfs_healer "$tmp.healer" --repair
> > +
> > + # Move the scratch filesystem to a completely different mountpoint so that
> > + # we can test if the healer can find it again.
> > + new_dir=$TEST_DIR/moocow
> > + mkdir -p $new_dir
> > + _mount --move $SCRATCH_MNT $new_dir
> > +
> > + df -t xfs >> $seqres.full
> > +
> > + # Access the broken directory to trigger a repair, then poll the directory
> > + # for 5 seconds to see if it gets fixed without us needing to intervene.
> > + ls $new_dir/some/victimdir > /dev/null 2> $tmp.err
> > + _filter_scratch < $tmp.err | _filter_test_dir
> > + try=0
> > + while [ $try -lt 50 ] && grep -q 'Structure needs cleaning' $tmp.err; do
> > + echo "try $try saw corruption" >> $seqres.full
> > + sleep 0.1
> > + ls $new_dir/some/victimdir > /dev/null 2> $tmp.err
> > + try=$((try + 1))
> > + done
> > + echo "try $try no longer saw corruption or gave up" >> $seqres.full
> > + _filter_scratch < $tmp.err | _filter_test_dir
> > +
> > + # List the dirents of /victimdir to see if it stops reporting corruption
> > + ls $new_dir/some/victimdir > /dev/null 2> $tmp.err
> > + try=0
> > + while [ $try -lt 50 ] && grep -q 'Structure needs cleaning' $tmp.err; do
> > + echo "retry $try still saw corruption" >> $seqres.full
> > + sleep 0.1
> > + ls $SCRATCH_MNT/some/victimdir > /dev/null 2> $tmp.err
> > + try=$((try + 1))
> > + done
> > + echo "retry $try no longer saw corruption or gave up" >> $seqres.full
> > +
> > + new_dir_unmount() {
> > + _unmount $new_dir
> > + }
> > +
> > + # Unmount to kill the healer
> > + _scratch_kill_xfs_healer new_dir_unmount
> > + cat $tmp.healer >> $seqres.full
> > +
> > + # No need to clean up, the mount ns destructor will detach the
> > + # filesystems for us.
> > + exit
> > +fi
> > +
> > +_cleanup()
> > +{
> > + command -v _kill_fsstress &>/dev/null && _kill_fsstress
> > + cd /
> > + rm -r -f $tmp.*
> > + if [ -n "$new_dir" ]; then
> > + _unmount "$new_dir" &>/dev/null
> > + rm -rf "$new_dir"
> > + fi
> > +}
> > +
> > +_require_unshare
> > +_require_test
> > +_require_scrub
> > +_require_xfs_io_command "repair" # online repair support
> > +_require_xfs_db_command "blocktrash"
> > +_require_command "$XFS_HEALER_PROG" "xfs_healer"
> > +_require_command "$XFS_PROPERTY_PROG" "xfs_property"
> > +_require_scratch
> > +
> > +_scratch_mkfs >> $seqres.full
> > +_scratch_mount
> > +
> > +_xfs_has_feature $SCRATCH_MNT rmapbt || \
> > + _notrun "reverse mapping required to test directory auto-repair"
> > +_xfs_has_feature $SCRATCH_MNT parent || \
> > + _notrun "parent pointers required to test directory auto-repair"
> > +_require_xfs_healer $SCRATCH_MNT --repair
> > +
> > +# Configure the filesystem for automatic repair of the filesystem.
> > +$XFS_PROPERTY_PROG $SCRATCH_MNT set autofsck=repair >> $seqres.full
> > +
> > +# Create a largeish directory
> > +dblksz=$(_xfs_get_dir_blocksize "$SCRATCH_MNT")
> > +echo testdata > $SCRATCH_MNT/a
> > +mkdir -p "$SCRATCH_MNT/some/victimdir"
> > +for ((i = 0; i < (dblksz / 255); i++)); do
> > + fname="$(printf "%0255d" "$i")"
> > + ln $SCRATCH_MNT/a $SCRATCH_MNT/some/victimdir/$fname
> > +done
> > +
> > +# Did we get at least two dir blocks?
> > +dirsize=$(stat -c '%s' $SCRATCH_MNT/some/victimdir)
> > +test "$dirsize" -gt "$dblksz" || echo "failed to create two-block directory"
> > +
> > +# Break the directory, remount filesystem
> > +_scratch_unmount
> > +_scratch_xfs_db -x \
> > + -c 'path /some/victimdir' \
> > + -c 'bmap' \
> > + -c 'dblock 1' \
> > + -c 'blocktrash -z -0 -o 0 -x 2048 -y 2048 -n 2048' >> $seqres.full
> > +
> > +# mount --move only works if mount propagation is disabled, so we have to start
> > +# a subshell with a separate mount namespace, disable propagation for the
> > +# entire directory tree, and only then can we run our tests.
> > +IN_MOUNTNS=1 unshare -m bash "$0"
> > +
> > +status=0
> > +exit
> > diff --git a/tests/xfs/1904.out b/tests/xfs/1904.out
> > new file mode 100755
> > index 00000000000000..34a46298dd439a
> > --- /dev/null
> > +++ b/tests/xfs/1904.out
> > @@ -0,0 +1,3 @@
> > +QA output created by 1904
> > +QA output created by 1904
> > +ls: reading directory 'TEST_DIR/moocow/some/victimdir': Structure needs cleaning
> >
>
>
^ permalink raw reply [flat|nested] 45+ messages in thread