* [RFC PATCH] xfs: regression test for writes racing with reclaim writeback
[not found] ` <Y4VeuqfVBU4/x9aB@magnolia>
@ 2022-11-29 21:55 ` Darrick J. Wong
0 siblings, 0 replies; 6+ messages in thread
From: Darrick J. Wong @ 2022-11-29 21:55 UTC (permalink / raw)
To: Dave Chinner; +Cc: linux-xfs, linux-fsdevel, fstests
From: Darrick J. Wong <djwong@kernel.org>
This test uses the new write delay debug knobs to set up a slow-moving
write racing with writeback of an unwritten block at the end of the
file range targetted by the slow write. The test succeeds if the file
does not end up corrupt and if the ftrace log captures a message about
the revalidation occurring.
NOTE: I'm not convinced that madvise actually causes the page to be
removed from the pagecache, which means that this is effectively a
functional test for the invalidation, not a corruption reproducer.
On the other hand, the functional test can be done quickly.
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
This test has been adapted to use the tracepoints defined in the final
version of this patch.
---
tests/xfs/925 | 127 +++++++++++++++++++++++++++++++++++++++++++++++++++++
tests/xfs/925.out | 2 +
2 files changed, 129 insertions(+)
create mode 100755 tests/xfs/925
create mode 100644 tests/xfs/925.out
diff --git a/tests/xfs/925 b/tests/xfs/925
new file mode 100755
index 0000000000..6503377c0f
--- /dev/null
+++ b/tests/xfs/925
@@ -0,0 +1,127 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2022 Oracle. All Rights Reserved.
+#
+# FS QA Test 925
+#
+# This is a regression test for a data corruption bug that existed in iomap's
+# buffered write routines.
+#
+. ./common/preamble
+_begin_fstest auto quick rw
+
+# Import common functions.
+. ./common/inject
+. ./common/tracing
+
+# real QA test starts here
+_cleanup()
+{
+ _ftrace_cleanup
+ cd /
+ rm -r -f $tmp.* $sentryfile $tracefile
+}
+
+# Modify as appropriate.
+_supported_fs xfs
+_require_ftrace
+_require_error_injection
+_require_scratch
+
+_scratch_mkfs >> $seqres.full
+_scratch_mount >> $seqres.full
+
+# This is a pagecache test, so try to disable fsdax mode.
+$XFS_IO_PROG -c 'chattr -x' $SCRATCH_MNT &> $seqres.full
+_require_pagecache_access $SCRATCH_MNT
+
+knob="$(_find_xfs_mountdev_errortag_knob $SCRATCH_DEV "write_delay_ms")"
+test -w "$knob" || _notrun "Kernel does not have write_delay_ms error injector"
+
+blocks=10
+blksz=$(get_page_size)
+filesz=$((blocks * blksz))
+dirty_offset=$(( filesz - 1 ))
+write_len=$(( ( (blocks - 1) * blksz) + 1 ))
+
+# Create a large file with a large unwritten range.
+$XFS_IO_PROG -f -c "falloc 0 $filesz" $SCRATCH_MNT/file >> $seqres.full
+
+# Write the same data to file.compare as we're about to do to file. Do this
+# before slowing down writeback to avoid unnecessary delay.
+_pwrite_byte 0x58 $dirty_offset 1 $SCRATCH_MNT/file.compare >> $seqres.full
+_pwrite_byte 0x57 0 $write_len $SCRATCH_MNT/file.compare >> $seqres.full
+
+# Reinitialize the page cache for this file.
+_scratch_cycle_mount
+
+# Dirty the last page in the range and immediately set the write delay so that
+# any subsequent writes have to wait.
+$XFS_IO_PROG -c "pwrite -S 0x58 $dirty_offset 1" $SCRATCH_MNT/file >> $seqres.full
+write_delay=500
+echo $write_delay > $knob
+curval="$(cat $knob)"
+test "$curval" -eq $write_delay || echo "expected write_delay_ms == $write_delay"
+
+_ftrace_record_events 'xfs_iomap_invalid'
+
+# Start a sentry to look for evidence of the XFS_ERRORTAG_REPORT logging. If
+# we see that, we know we've forced writeback to revalidate a mapping. The
+# test has been successful, so turn off the delay.
+sentryfile=$TEST_DIR/$seq.sentry
+tracefile=$TEST_DIR/$seq.ftrace
+wait_for_errortag() {
+ while [ -e "$sentryfile" ]; do
+ _ftrace_dump | grep iomap_invalid >> "$tracefile"
+ if grep -q iomap_invalid "$tracefile"; then
+ echo 0 > $knob
+ _ftrace_ignore_events
+ break;
+ fi
+ sleep 0.5
+ done
+}
+touch $sentryfile
+wait_for_errortag &
+
+# Start thread 1 + writeback above
+($XFS_IO_PROG -c "pwrite -S 0x57 -b $write_len 0 $write_len" \
+ -c 'bmap -celpv' -c 'bmap -elpv' \
+ $SCRATCH_MNT/file >> $seqres.full; rm -f $sentryfile) &
+sleep 1
+
+# Start thread 2 to simulate reclaim writeback via sync_file_range and fadvise
+# to drop the page cache.
+# -c "fadvise -d $dirty_offset 1" \
+dirty_pageoff=$((filesz - blksz))
+$XFS_IO_PROG -c "sync_range -a -w $dirty_pageoff $blksz" \
+ -c "mmap -r 0 $filesz" \
+ -c "madvise -d 0 $filesz" \
+ -c 'bmap -celpv' -c 'bmap -elpv' \
+ $SCRATCH_MNT/file >> $seqres.full
+wait
+rm -f $sentryfile
+
+cat "$tracefile" >> $seqres.full
+grep -q iomap_invalid "$tracefile"
+saw_invalidation=$?
+
+# Flush everything to disk. If the bug manifests, then after the cycle,
+# file should have stale 0x58 in block 0 because we silently dropped a write.
+_scratch_cycle_mount
+
+if ! cmp -s $SCRATCH_MNT/file $SCRATCH_MNT/file.compare; then
+ echo file and file.compare do not match
+ $XFS_IO_PROG -c 'bmap -celpv' -c 'bmap -elpv' $SCRATCH_MNT/file >> $seqres.full
+ echo file.compare
+ od -tx1 -Ad -c $SCRATCH_MNT/file.compare
+ echo file
+ od -tx1 -Ad -c $SCRATCH_MNT/file
+elif [ $saw_invalidation -ne 0 ]; then
+ # The files matched, but nothing got logged about the revalidation?
+ echo "Expected to hear about write iomap invalidation?"
+fi
+
+echo Silence is golden
+status=0
+exit
diff --git a/tests/xfs/925.out b/tests/xfs/925.out
new file mode 100644
index 0000000000..95088ce8a5
--- /dev/null
+++ b/tests/xfs/925.out
@@ -0,0 +1,2 @@
+QA output created by 925
+Silence is golden
^ permalink raw reply related [flat|nested] 6+ messages in thread
* [RFC PATCH] xfs: regression test for writeback corruption bug
[not found] ` <Y4VejsHGU/tZuRYs@magnolia>
@ 2022-11-29 21:57 ` Darrick J. Wong
2022-11-30 17:34 ` Zorro Lang
0 siblings, 1 reply; 6+ messages in thread
From: Darrick J. Wong @ 2022-11-29 21:57 UTC (permalink / raw)
To: Dave Chinner; +Cc: linux-xfs, linux-fsdevel, wen.gang.wang, fstests
From: Darrick J. Wong <djwong@kernel.org>
This is a regression test for a data corruption bug that existed in XFS'
copy on write code between 4.9 and 4.19. The root cause is a
concurrency bug wherein we would drop ILOCK_SHARED after querying the
CoW fork in xfs_map_cow and retake it before querying the data fork in
xfs_map_blocks. See the test description for a lot more details.
Cc: Wengang Wang <wen.gang.wang@oracle.com>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
common/rc | 15 ++++
common/tracing | 69 +++++++++++++++++
tests/xfs/924 | 215 +++++++++++++++++++++++++++++++++++++++++++++++++++++
tests/xfs/924.out | 2
4 files changed, 301 insertions(+)
create mode 100644 common/tracing
create mode 100755 tests/xfs/924
create mode 100644 tests/xfs/924.out
diff --git a/common/rc b/common/rc
index d71fc0603f..b1b7a3e553 100644
--- a/common/rc
+++ b/common/rc
@@ -3625,6 +3625,21 @@ _check_xflag()
fi
}
+# Make sure the given file access mode is set to use the pagecache. If
+# userspace or kernel don't support statx or STATX_ATTR_DAX, we assume that
+# means pagecache. The sole parameter must be a directory.
+_require_pagecache_access() {
+ local testfile="$1/testfile"
+
+ touch "$testfile"
+ if ! _check_s_dax "$testfile" 0 &>> $seqres.full; then
+ rm -f "$testfile"
+ _notrun 'test requires pagecache access'
+ fi
+
+ rm -f "$testfile"
+}
+
# Check if dax mount options are supported
#
# $1 can be either 'dax=always' or 'dax'
diff --git a/common/tracing b/common/tracing
new file mode 100644
index 0000000000..35e5ed41c2
--- /dev/null
+++ b/common/tracing
@@ -0,0 +1,69 @@
+##/bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (c) 2022 Oracle. All Rights Reserved.
+#
+# Routines for dealing with ftrace (or any other tracing).
+
+_require_ftrace() {
+ local ftrace_dir="/sys/kernel/debug/tracing/instances/"
+ test -d "$ftrace_dir" || _notrun "kernel does not support ftrace"
+
+ # Give this fstest its own ftrace buffer so that we don't mess up
+ # any other tracers that might be running.
+ FTRACE_DIR="$ftrace_dir/fstests.$seq"
+ test -d "$FTRACE_DIR" && rmdir "$FTRACE_DIR"
+}
+
+_ftrace_cleanup() {
+ if [ -d "$FTRACE_DIR" ]; then
+ _ftrace_ignore_events
+ # Removing an ftrace buffer requires rmdir, even though the
+ # virtual directory contains children.
+ rmdir "$FTRACE_DIR"
+ fi
+}
+
+# Intercept the given events. Arguments may be regular expressions.
+_ftrace_record_events() {
+ local pwd="$PWD"
+
+ test -n "$FTRACE_DIR" || _fail "_require_ftrace not run?"
+ mkdir "$FTRACE_DIR"
+ cd "$FTRACE_DIR/events/" || _fail "$FTRACE_DIR: ftrace not set up?"
+
+ for arg in "$@"; do
+ for tp in */${arg}; do
+ # Replace slashes with semicolons per ftrace convention
+ echo "${tp////:}" >> ../set_event
+ done
+ done
+ cd "$pwd"
+}
+
+# Stop intercepting the given events. If no arguments, stops all events.
+_ftrace_ignore_events() {
+ local pwd="$PWD"
+
+ test -n "$FTRACE_DIR" || _fail "_require_ftrace not run?"
+ cd "$FTRACE_DIR/events/" || _fail "$FTRACE_DIR: ftrace not set up?"
+
+ if [ "$#" -eq 0 ]; then
+ echo > ../set_event
+ else
+ for arg in "$@"; do
+ for tp in */${arg}; do
+ # Replace slashes with semicolons per ftrace convention
+ echo "!${tp////:}" >> ../set_event
+ done
+ done
+ fi
+
+ cd "$pwd"
+}
+
+# Dump whatever was written to the ftrace buffer since the last time this
+# helper was called.
+_ftrace_dump() {
+ test -n "$FTRACE_DIR" || _fail "_require_ftrace not run?"
+ (cd "$FTRACE_DIR" && cat trace)
+}
diff --git a/tests/xfs/924 b/tests/xfs/924
new file mode 100755
index 0000000000..81f8ba2743
--- /dev/null
+++ b/tests/xfs/924
@@ -0,0 +1,215 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2022 Oracle. All Rights Reserved.
+#
+# FS QA Test 924
+#
+# This is a regression test for a data corruption bug that existed in XFS' copy
+# on write code between 4.9 and 4.19. The root cause is a concurrency bug
+# wherein we would drop ILOCK_SHARED after querying the CoW fork in xfs_map_cow
+# and retake it before querying the data fork in xfs_map_blocks. If a second
+# thread changes the CoW fork mappings between the two calls, it's possible for
+# xfs_map_blocks to return a zero-block mapping, which results in writeback
+# being elided for that block. Elided writeback of dirty data results in
+# silent loss of writes.
+#
+# Worse yet, kernels from that era still used buffer heads, which means that an
+# elided writeback leaves the page clean but the bufferheads dirty. Due to a
+# naïve optimization in mark_buffer_dirty, the SetPageDirty call is elided if
+# the bufferhead is dirty, which means that a subsequent rewrite of the data
+# block will never result in the page being marked dirty, and all subsequent
+# writes are lost.
+#
+# It turns out that Christoph Hellwig unwittingly fixed the race in commit
+# 5c665e5b5af6 ("xfs: remove xfs_map_cow"), and no testcase was ever written.
+# Four years later, we hit it on a production 4.14 kernel. This testcase
+# relies on a debugging knob that introduces artificial delays into writeback.
+#
+# Before the race, the file blocks 0-1 are not shared and blocks 2-5 are
+# shared. There are no extents in CoW fork.
+#
+# Two threads race like this:
+#
+# Thread 1 (writeback block 0) | Thread 2 (write to block 2)
+# ---------------------------------|--------------------------------
+# |
+# 1. Check if block 0 in CoW fork |
+# from xfs_map_cow. |
+# |
+# 2. Block 0 not found in CoW |
+# fork; the block is considered |
+# not shared. |
+# |
+# 3. xfs_map_blocks looks up data |
+# fork to get a map covering |
+# block 0. |
+# |
+# 4. It gets a data fork mapping |
+# for block 0 with length 2. |
+# |
+# | 1. A buffered write to block 2 sees
+# | that it is a shared block and no
+# | extent covers block 2 in CoW fork.
+# |
+# | It creates a new CoW fork mapping.
+# | Due to the cowextsize, the new
+# | extent starts at block 0 with
+# | length 128.
+# |
+# |
+# 5. It lookup CoW fork again to |
+# trim the map (0, 2) to a |
+# shared block boundary. |
+# |
+# 5a. It finds (0, 128) in CoW fork|
+# 5b. It trims the data fork map |
+# from (0, 1) to (0, 0) (!!!) |
+# |
+# 6. The xfs_imap_valid call after |
+# the xfs_map_blocks call checks|
+# if the mapping (0, 0) covers |
+# block 0. The result is "NO". |
+# |
+# 7. Since block 0 has no physical |
+# block mapped, it's not added |
+# to the ioend. This is the |
+# first problem. |
+# |
+# 8. xfs_add_to_ioend usually |
+# clears the bufferhead dirty |
+# flag Because this is skipped,|
+# we leave the page clean with |
+# the associated buffer head(s) |
+# dirty (the second problem). |
+# Now the dirty state is |
+# inconsistent.
+#
+# On newer kernels, this is also a functionality test for the ifork sequence
+# counter because the writeback completions will change the data fork and force
+# revalidations of the wb mapping.
+#
+. ./common/preamble
+_begin_fstest auto quick clone
+
+# Import common functions.
+. ./common/reflink
+. ./common/inject
+. ./common/tracing
+
+# real QA test starts here
+_cleanup()
+{
+ _ftrace_cleanup
+ cd /
+ rm -r -f $tmp.* $sentryfile $tracefile
+}
+
+# Modify as appropriate.
+_supported_fs xfs
+_fixed_by_kernel_commit 5c665e5b5af6 "xfs: remove xfs_map_cow"
+_require_ftrace
+_require_error_injection
+_require_scratch_reflink
+_require_cp_reflink
+
+_scratch_mkfs >> $seqres.full
+_scratch_mount >> $seqres.full
+
+# This is a pagecache test, so try to disable fsdax mode.
+$XFS_IO_PROG -c 'chattr -x' $SCRATCH_MNT &> $seqres.full
+_require_pagecache_access $SCRATCH_MNT
+
+knob="$(_find_xfs_mountdev_errortag_knob $SCRATCH_DEV "wb_delay_ms")"
+test -w "$knob" || _notrun "Kernel does not have wb_delay_ms error injector"
+
+blksz=65536
+_require_congruent_file_oplen $SCRATCH_MNT $blksz
+
+# Make sure we have sufficient extent size to create speculative CoW
+# preallocations.
+$XFS_IO_PROG -c 'cowextsize 1m' $SCRATCH_MNT
+
+# Write out a file with the first two blocks unshared and the rest shared.
+_pwrite_byte 0x59 0 $((160 * blksz)) $SCRATCH_MNT/file >> $seqres.full
+_pwrite_byte 0x59 0 $((160 * blksz)) $SCRATCH_MNT/file.compare >> $seqres.full
+sync
+
+_cp_reflink $SCRATCH_MNT/file $SCRATCH_MNT/file.reflink
+
+_pwrite_byte 0x58 0 $((2 * blksz)) $SCRATCH_MNT/file >> $seqres.full
+_pwrite_byte 0x58 0 $((2 * blksz)) $SCRATCH_MNT/file.compare >> $seqres.full
+sync
+
+# Avoid creation of large folios on newer kernels by cycling the mount and
+# immediately writing to the page cache.
+_scratch_cycle_mount
+
+# Write the same data to file.compare as we're about to do to file. Do this
+# before slowing down writeback to avoid unnecessary delay.
+_pwrite_byte 0x57 0 $((2 * blksz)) $SCRATCH_MNT/file.compare >> $seqres.full
+_pwrite_byte 0x56 $((2 * blksz)) $((2 * blksz)) $SCRATCH_MNT/file.compare >> $seqres.full
+sync
+
+# Introduce a half-second wait to each writeback block mapping call. This
+# gives us a chance to race speculative cow prealloc with writeback.
+wb_delay=500
+echo $wb_delay > $knob
+curval="$(cat $knob)"
+test "$curval" -eq $wb_delay || echo "expected wb_delay_ms == $wb_delay"
+
+_ftrace_record_events 'xfs_wb*iomap_invalid'
+
+# Start thread 1 + writeback above
+$XFS_IO_PROG -c "pwrite -S 0x57 0 $((2 * blksz))" \
+ -c 'bmap -celpv' -c 'bmap -elpv' \
+ -c 'fsync' $SCRATCH_MNT/file >> $seqres.full &
+sleep 1
+
+# Start a sentry to look for evidence of the XFS_ERRORTAG_REPORT logging. If
+# we see that, we know we've forced writeback to revalidate a mapping. The
+# test has been successful, so turn off the delay.
+sentryfile=$TEST_DIR/$seq.sentry
+tracefile=$TEST_DIR/$seq.ftrace
+wait_for_errortag() {
+ while [ -e "$sentryfile" ]; do
+ _ftrace_dump | grep iomap_invalid >> "$tracefile"
+ if grep -q iomap_invalid "$tracefile"; then
+ echo 0 > "$knob"
+ _ftrace_ignore_events
+ break;
+ fi
+ sleep 0.5
+ done
+}
+touch $sentryfile
+wait_for_errortag &
+
+# Start thread 2 to create the cowextsize reservation
+$XFS_IO_PROG -c "pwrite -S 0x56 $((2 * blksz)) $((2 * blksz))" \
+ -c 'bmap -celpv' -c 'bmap -elpv' \
+ -c 'fsync' $SCRATCH_MNT/file >> $seqres.full
+rm -f $sentryfile
+
+cat "$tracefile" >> $seqres.full
+grep -q iomap_invalid "$tracefile"
+saw_invalidation=$?
+
+# Flush everything to disk. If the bug manifests, then after the cycle,
+# file should have stale 0x58 in block 0 because we silently dropped a write.
+_scratch_cycle_mount
+
+if ! cmp -s $SCRATCH_MNT/file $SCRATCH_MNT/file.compare; then
+ echo file and file.compare do not match
+ $XFS_IO_PROG -c 'bmap -celpv' -c 'bmap -elpv' $SCRATCH_MNT/file >> $seqres.full
+ echo file.compare
+ od -tx1 -Ad -c $SCRATCH_MNT/file.compare
+ echo file
+ od -tx1 -Ad -c $SCRATCH_MNT/file
+elif [ $saw_invalidation -ne 0 ]; then
+ # The files matched, but nothing got logged about the revalidation?
+ echo "Expected to hear about writeback iomap invalidations?"
+fi
+
+echo Silence is golden
+status=0
+exit
diff --git a/tests/xfs/924.out b/tests/xfs/924.out
new file mode 100644
index 0000000000..c6655da35a
--- /dev/null
+++ b/tests/xfs/924.out
@@ -0,0 +1,2 @@
+QA output created by 924
+Silence is golden
^ permalink raw reply related [flat|nested] 6+ messages in thread
* Re: [RFC PATCH] xfs: regression test for writeback corruption bug
2022-11-29 21:57 ` [RFC PATCH] xfs: regression test for writeback corruption bug Darrick J. Wong
@ 2022-11-30 17:34 ` Zorro Lang
2022-11-30 19:04 ` Darrick J. Wong
0 siblings, 1 reply; 6+ messages in thread
From: Zorro Lang @ 2022-11-30 17:34 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Dave Chinner, linux-xfs, linux-fsdevel, wen.gang.wang, fstests
On Tue, Nov 29, 2022 at 01:57:14PM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
>
> This is a regression test for a data corruption bug that existed in XFS'
> copy on write code between 4.9 and 4.19. The root cause is a
> concurrency bug wherein we would drop ILOCK_SHARED after querying the
> CoW fork in xfs_map_cow and retake it before querying the data fork in
> xfs_map_blocks. See the test description for a lot more details.
>
> Cc: Wengang Wang <wen.gang.wang@oracle.com>
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> ---
> common/rc | 15 ++++
> common/tracing | 69 +++++++++++++++++
> tests/xfs/924 | 215 +++++++++++++++++++++++++++++++++++++++++++++++++++++
> tests/xfs/924.out | 2
> 4 files changed, 301 insertions(+)
> create mode 100644 common/tracing
> create mode 100755 tests/xfs/924
> create mode 100644 tests/xfs/924.out
>
> diff --git a/common/rc b/common/rc
> index d71fc0603f..b1b7a3e553 100644
> --- a/common/rc
> +++ b/common/rc
> @@ -3625,6 +3625,21 @@ _check_xflag()
> fi
> }
>
> +# Make sure the given file access mode is set to use the pagecache. If
> +# userspace or kernel don't support statx or STATX_ATTR_DAX, we assume that
> +# means pagecache. The sole parameter must be a directory.
> +_require_pagecache_access() {
> + local testfile="$1/testfile"
> +
> + touch "$testfile"
> + if ! _check_s_dax "$testfile" 0 &>> $seqres.full; then
> + rm -f "$testfile"
> + _notrun 'test requires pagecache access'
> + fi
> +
> + rm -f "$testfile"
> +}
> +
> # Check if dax mount options are supported
> #
> # $1 can be either 'dax=always' or 'dax'
> diff --git a/common/tracing b/common/tracing
> new file mode 100644
> index 0000000000..35e5ed41c2
> --- /dev/null
> +++ b/common/tracing
> @@ -0,0 +1,69 @@
> +##/bin/bash
> +# SPDX-License-Identifier: GPL-2.0-or-later
> +# Copyright (c) 2022 Oracle. All Rights Reserved.
> +#
> +# Routines for dealing with ftrace (or any other tracing).
> +
> +_require_ftrace() {
> + local ftrace_dir="/sys/kernel/debug/tracing/instances/"
> + test -d "$ftrace_dir" || _notrun "kernel does not support ftrace"
> +
> + # Give this fstest its own ftrace buffer so that we don't mess up
> + # any other tracers that might be running.
> + FTRACE_DIR="$ftrace_dir/fstests.$seq"
> + test -d "$FTRACE_DIR" && rmdir "$FTRACE_DIR"
> +}
> +
> +_ftrace_cleanup() {
> + if [ -d "$FTRACE_DIR" ]; then
> + _ftrace_ignore_events
> + # Removing an ftrace buffer requires rmdir, even though the
> + # virtual directory contains children.
> + rmdir "$FTRACE_DIR"
> + fi
> +}
> +
> +# Intercept the given events. Arguments may be regular expressions.
> +_ftrace_record_events() {
> + local pwd="$PWD"
> +
> + test -n "$FTRACE_DIR" || _fail "_require_ftrace not run?"
> + mkdir "$FTRACE_DIR"
> + cd "$FTRACE_DIR/events/" || _fail "$FTRACE_DIR: ftrace not set up?"
> +
> + for arg in "$@"; do
> + for tp in */${arg}; do
> + # Replace slashes with semicolons per ftrace convention
> + echo "${tp////:}" >> ../set_event
> + done
> + done
> + cd "$pwd"
Is the relative path necessary, can we use absolute path at here?
> +}
> +
> +# Stop intercepting the given events. If no arguments, stops all events.
> +_ftrace_ignore_events() {
> + local pwd="$PWD"
> +
> + test -n "$FTRACE_DIR" || _fail "_require_ftrace not run?"
> + cd "$FTRACE_DIR/events/" || _fail "$FTRACE_DIR: ftrace not set up?"
> +
> + if [ "$#" -eq 0 ]; then
> + echo > ../set_event
> + else
> + for arg in "$@"; do
> + for tp in */${arg}; do
> + # Replace slashes with semicolons per ftrace convention
> + echo "!${tp////:}" >> ../set_event
> + done
> + done
> + fi
> +
> + cd "$pwd"
Same at here
> +}
> +
> +# Dump whatever was written to the ftrace buffer since the last time this
> +# helper was called.
> +_ftrace_dump() {
> + test -n "$FTRACE_DIR" || _fail "_require_ftrace not run?"
> + (cd "$FTRACE_DIR" && cat trace)
Why not "cat $FTRACE_DIR/trace" ?
> +}
> diff --git a/tests/xfs/924 b/tests/xfs/924
> new file mode 100755
> index 0000000000..81f8ba2743
> --- /dev/null
> +++ b/tests/xfs/924
> @@ -0,0 +1,215 @@
> +#! /bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +# Copyright (c) 2022 Oracle. All Rights Reserved.
> +#
> +# FS QA Test 924
> +#
> +# This is a regression test for a data corruption bug that existed in XFS' copy
> +# on write code between 4.9 and 4.19. The root cause is a concurrency bug
> +# wherein we would drop ILOCK_SHARED after querying the CoW fork in xfs_map_cow
> +# and retake it before querying the data fork in xfs_map_blocks. If a second
> +# thread changes the CoW fork mappings between the two calls, it's possible for
> +# xfs_map_blocks to return a zero-block mapping, which results in writeback
> +# being elided for that block. Elided writeback of dirty data results in
> +# silent loss of writes.
> +#
> +# Worse yet, kernels from that era still used buffer heads, which means that an
> +# elided writeback leaves the page clean but the bufferheads dirty. Due to a
> +# naïve optimization in mark_buffer_dirty, the SetPageDirty call is elided if
> +# the bufferhead is dirty, which means that a subsequent rewrite of the data
> +# block will never result in the page being marked dirty, and all subsequent
> +# writes are lost.
> +#
> +# It turns out that Christoph Hellwig unwittingly fixed the race in commit
> +# 5c665e5b5af6 ("xfs: remove xfs_map_cow"), and no testcase was ever written.
> +# Four years later, we hit it on a production 4.14 kernel. This testcase
> +# relies on a debugging knob that introduces artificial delays into writeback.
> +#
> +# Before the race, the file blocks 0-1 are not shared and blocks 2-5 are
> +# shared. There are no extents in CoW fork.
> +#
> +# Two threads race like this:
> +#
> +# Thread 1 (writeback block 0) | Thread 2 (write to block 2)
> +# ---------------------------------|--------------------------------
> +# |
> +# 1. Check if block 0 in CoW fork |
> +# from xfs_map_cow. |
> +# |
> +# 2. Block 0 not found in CoW |
> +# fork; the block is considered |
> +# not shared. |
> +# |
> +# 3. xfs_map_blocks looks up data |
> +# fork to get a map covering |
> +# block 0. |
> +# |
> +# 4. It gets a data fork mapping |
> +# for block 0 with length 2. |
> +# |
> +# | 1. A buffered write to block 2 sees
> +# | that it is a shared block and no
> +# | extent covers block 2 in CoW fork.
> +# |
> +# | It creates a new CoW fork mapping.
> +# | Due to the cowextsize, the new
> +# | extent starts at block 0 with
> +# | length 128.
> +# |
> +# |
> +# 5. It lookup CoW fork again to |
> +# trim the map (0, 2) to a |
> +# shared block boundary. |
> +# |
> +# 5a. It finds (0, 128) in CoW fork|
> +# 5b. It trims the data fork map |
> +# from (0, 1) to (0, 0) (!!!) |
> +# |
> +# 6. The xfs_imap_valid call after |
> +# the xfs_map_blocks call checks|
> +# if the mapping (0, 0) covers |
> +# block 0. The result is "NO". |
> +# |
> +# 7. Since block 0 has no physical |
> +# block mapped, it's not added |
> +# to the ioend. This is the |
> +# first problem. |
> +# |
> +# 8. xfs_add_to_ioend usually |
> +# clears the bufferhead dirty |
> +# flag Because this is skipped,|
> +# we leave the page clean with |
> +# the associated buffer head(s) |
> +# dirty (the second problem). |
> +# Now the dirty state is |
> +# inconsistent.
> +#
> +# On newer kernels, this is also a functionality test for the ifork sequence
> +# counter because the writeback completions will change the data fork and force
> +# revalidations of the wb mapping.
> +#
> +. ./common/preamble
> +_begin_fstest auto quick clone
> +
> +# Import common functions.
> +. ./common/reflink
> +. ./common/inject
> +. ./common/tracing
> +
> +# real QA test starts here
> +_cleanup()
> +{
> + _ftrace_cleanup
> + cd /
> + rm -r -f $tmp.* $sentryfile $tracefile
> +}
> +
> +# Modify as appropriate.
> +_supported_fs xfs
> +_fixed_by_kernel_commit 5c665e5b5af6 "xfs: remove xfs_map_cow"
> +_require_ftrace
> +_require_error_injection
> +_require_scratch_reflink
> +_require_cp_reflink
> +
> +_scratch_mkfs >> $seqres.full
> +_scratch_mount >> $seqres.full
> +
> +# This is a pagecache test, so try to disable fsdax mode.
> +$XFS_IO_PROG -c 'chattr -x' $SCRATCH_MNT &> $seqres.full
> +_require_pagecache_access $SCRATCH_MNT
> +
> +knob="$(_find_xfs_mountdev_errortag_knob $SCRATCH_DEV "wb_delay_ms")"
> +test -w "$knob" || _notrun "Kernel does not have wb_delay_ms error injector"
Can `_require_xfs_io_error_injection` help that?
> +
> +blksz=65536
> +_require_congruent_file_oplen $SCRATCH_MNT $blksz
> +
> +# Make sure we have sufficient extent size to create speculative CoW
> +# preallocations.
> +$XFS_IO_PROG -c 'cowextsize 1m' $SCRATCH_MNT
> +
> +# Write out a file with the first two blocks unshared and the rest shared.
> +_pwrite_byte 0x59 0 $((160 * blksz)) $SCRATCH_MNT/file >> $seqres.full
> +_pwrite_byte 0x59 0 $((160 * blksz)) $SCRATCH_MNT/file.compare >> $seqres.full
> +sync
> +
> +_cp_reflink $SCRATCH_MNT/file $SCRATCH_MNT/file.reflink
> +
> +_pwrite_byte 0x58 0 $((2 * blksz)) $SCRATCH_MNT/file >> $seqres.full
> +_pwrite_byte 0x58 0 $((2 * blksz)) $SCRATCH_MNT/file.compare >> $seqres.full
> +sync
> +
> +# Avoid creation of large folios on newer kernels by cycling the mount and
> +# immediately writing to the page cache.
> +_scratch_cycle_mount
> +
> +# Write the same data to file.compare as we're about to do to file. Do this
> +# before slowing down writeback to avoid unnecessary delay.
> +_pwrite_byte 0x57 0 $((2 * blksz)) $SCRATCH_MNT/file.compare >> $seqres.full
> +_pwrite_byte 0x56 $((2 * blksz)) $((2 * blksz)) $SCRATCH_MNT/file.compare >> $seqres.full
> +sync
> +
> +# Introduce a half-second wait to each writeback block mapping call. This
> +# gives us a chance to race speculative cow prealloc with writeback.
> +wb_delay=500
> +echo $wb_delay > $knob
Oh, you'd like to avoid depending on xfs_io ?
> +curval="$(cat $knob)"
> +test "$curval" -eq $wb_delay || echo "expected wb_delay_ms == $wb_delay"
> +
> +_ftrace_record_events 'xfs_wb*iomap_invalid'
> +
> +# Start thread 1 + writeback above
> +$XFS_IO_PROG -c "pwrite -S 0x57 0 $((2 * blksz))" \
> + -c 'bmap -celpv' -c 'bmap -elpv' \
I didn't find the "bmap -c" option, is it a new option? Won't it break the
golden image if a system doesn't support it?
> + -c 'fsync' $SCRATCH_MNT/file >> $seqres.full &
> +sleep 1
> +
> +# Start a sentry to look for evidence of the XFS_ERRORTAG_REPORT logging. If
> +# we see that, we know we've forced writeback to revalidate a mapping. The
> +# test has been successful, so turn off the delay.
> +sentryfile=$TEST_DIR/$seq.sentry
> +tracefile=$TEST_DIR/$seq.ftrace
> +wait_for_errortag() {
> + while [ -e "$sentryfile" ]; do
> + _ftrace_dump | grep iomap_invalid >> "$tracefile"
> + if grep -q iomap_invalid "$tracefile"; then
> + echo 0 > "$knob"
> + _ftrace_ignore_events
> + break;
> + fi
> + sleep 0.5
> + done
> +}
> +touch $sentryfile
> +wait_for_errortag &
Should we *wait* background processes in cleanup after removing $sentryfile.
> +
> +# Start thread 2 to create the cowextsize reservation
> +$XFS_IO_PROG -c "pwrite -S 0x56 $((2 * blksz)) $((2 * blksz))" \
> + -c 'bmap -celpv' -c 'bmap -elpv' \
> + -c 'fsync' $SCRATCH_MNT/file >> $seqres.full
> +rm -f $sentryfile
> +
> +cat "$tracefile" >> $seqres.full
> +grep -q iomap_invalid "$tracefile"
> +saw_invalidation=$?
> +
> +# Flush everything to disk. If the bug manifests, then after the cycle,
> +# file should have stale 0x58 in block 0 because we silently dropped a write.
> +_scratch_cycle_mount
> +
> +if ! cmp -s $SCRATCH_MNT/file $SCRATCH_MNT/file.compare; then
> + echo file and file.compare do not match
> + $XFS_IO_PROG -c 'bmap -celpv' -c 'bmap -elpv' $SCRATCH_MNT/file >> $seqres.full
> + echo file.compare
> + od -tx1 -Ad -c $SCRATCH_MNT/file.compare
> + echo file
> + od -tx1 -Ad -c $SCRATCH_MNT/file
> +elif [ $saw_invalidation -ne 0 ]; then
> + # The files matched, but nothing got logged about the revalidation?
> + echo "Expected to hear about writeback iomap invalidations?"
> +fi
> +
> +echo Silence is golden
> +status=0
> +exit
> diff --git a/tests/xfs/924.out b/tests/xfs/924.out
> new file mode 100644
> index 0000000000..c6655da35a
> --- /dev/null
> +++ b/tests/xfs/924.out
> @@ -0,0 +1,2 @@
> +QA output created by 924
> +Silence is golden
>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [RFC PATCH] xfs: regression test for writeback corruption bug
2022-11-30 17:34 ` Zorro Lang
@ 2022-11-30 19:04 ` Darrick J. Wong
2022-12-01 15:21 ` Zorro Lang
0 siblings, 1 reply; 6+ messages in thread
From: Darrick J. Wong @ 2022-11-30 19:04 UTC (permalink / raw)
To: Zorro Lang; +Cc: Dave Chinner, linux-xfs, linux-fsdevel, wen.gang.wang, fstests
On Thu, Dec 01, 2022 at 01:34:47AM +0800, Zorro Lang wrote:
> On Tue, Nov 29, 2022 at 01:57:14PM -0800, Darrick J. Wong wrote:
> > From: Darrick J. Wong <djwong@kernel.org>
> >
> > This is a regression test for a data corruption bug that existed in XFS'
> > copy on write code between 4.9 and 4.19. The root cause is a
> > concurrency bug wherein we would drop ILOCK_SHARED after querying the
> > CoW fork in xfs_map_cow and retake it before querying the data fork in
> > xfs_map_blocks. See the test description for a lot more details.
> >
> > Cc: Wengang Wang <wen.gang.wang@oracle.com>
> > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > ---
> > common/rc | 15 ++++
> > common/tracing | 69 +++++++++++++++++
> > tests/xfs/924 | 215 +++++++++++++++++++++++++++++++++++++++++++++++++++++
> > tests/xfs/924.out | 2
> > 4 files changed, 301 insertions(+)
> > create mode 100644 common/tracing
> > create mode 100755 tests/xfs/924
> > create mode 100644 tests/xfs/924.out
> >
> > diff --git a/common/rc b/common/rc
> > index d71fc0603f..b1b7a3e553 100644
> > --- a/common/rc
> > +++ b/common/rc
> > @@ -3625,6 +3625,21 @@ _check_xflag()
> > fi
> > }
> >
> > +# Make sure the given file access mode is set to use the pagecache. If
> > +# userspace or kernel don't support statx or STATX_ATTR_DAX, we assume that
> > +# means pagecache. The sole parameter must be a directory.
> > +_require_pagecache_access() {
> > + local testfile="$1/testfile"
> > +
> > + touch "$testfile"
> > + if ! _check_s_dax "$testfile" 0 &>> $seqres.full; then
> > + rm -f "$testfile"
> > + _notrun 'test requires pagecache access'
> > + fi
> > +
> > + rm -f "$testfile"
> > +}
> > +
> > # Check if dax mount options are supported
> > #
> > # $1 can be either 'dax=always' or 'dax'
> > diff --git a/common/tracing b/common/tracing
> > new file mode 100644
> > index 0000000000..35e5ed41c2
> > --- /dev/null
> > +++ b/common/tracing
> > @@ -0,0 +1,69 @@
> > +##/bin/bash
> > +# SPDX-License-Identifier: GPL-2.0-or-later
> > +# Copyright (c) 2022 Oracle. All Rights Reserved.
> > +#
> > +# Routines for dealing with ftrace (or any other tracing).
> > +
> > +_require_ftrace() {
> > + local ftrace_dir="/sys/kernel/debug/tracing/instances/"
> > + test -d "$ftrace_dir" || _notrun "kernel does not support ftrace"
> > +
> > + # Give this fstest its own ftrace buffer so that we don't mess up
> > + # any other tracers that might be running.
> > + FTRACE_DIR="$ftrace_dir/fstests.$seq"
> > + test -d "$FTRACE_DIR" && rmdir "$FTRACE_DIR"
> > +}
> > +
> > +_ftrace_cleanup() {
> > + if [ -d "$FTRACE_DIR" ]; then
> > + _ftrace_ignore_events
> > + # Removing an ftrace buffer requires rmdir, even though the
> > + # virtual directory contains children.
> > + rmdir "$FTRACE_DIR"
> > + fi
> > +}
> > +
> > +# Intercept the given events. Arguments may be regular expressions.
> > +_ftrace_record_events() {
> > + local pwd="$PWD"
> > +
> > + test -n "$FTRACE_DIR" || _fail "_require_ftrace not run?"
> > + mkdir "$FTRACE_DIR"
> > + cd "$FTRACE_DIR/events/" || _fail "$FTRACE_DIR: ftrace not set up?"
> > +
> > + for arg in "$@"; do
> > + for tp in */${arg}; do
> > + # Replace slashes with semicolons per ftrace convention
> > + echo "${tp////:}" >> ../set_event
> > + done
> > + done
> > + cd "$pwd"
>
> Is the relative path necessary, can we use absolute path at here?
Hm. I suppose that inner loop could be replaced by:
find "$FTRACE_DIR/events/ -type d -name "$arg" -printf '%P\n' | \
tr '/' ':' >> "$FTRACE_DIR/set_event"
and then we don't need all this cd'ing insanity. I'll try that and
report back.
> > +}
> > +
> > +# Stop intercepting the given events. If no arguments, stops all events.
> > +_ftrace_ignore_events() {
> > + local pwd="$PWD"
> > +
> > + test -n "$FTRACE_DIR" || _fail "_require_ftrace not run?"
> > + cd "$FTRACE_DIR/events/" || _fail "$FTRACE_DIR: ftrace not set up?"
> > +
> > + if [ "$#" -eq 0 ]; then
> > + echo > ../set_event
> > + else
> > + for arg in "$@"; do
> > + for tp in */${arg}; do
> > + # Replace slashes with semicolons per ftrace convention
> > + echo "!${tp////:}" >> ../set_event
> > + done
> > + done
> > + fi
> > +
> > + cd "$pwd"
>
> Same at here
>
> > +}
> > +
> > +# Dump whatever was written to the ftrace buffer since the last time this
> > +# helper was called.
> > +_ftrace_dump() {
> > + test -n "$FTRACE_DIR" || _fail "_require_ftrace not run?"
> > + (cd "$FTRACE_DIR" && cat trace)
>
> Why not "cat $FTRACE_DIR/trace" ?
Fixed.
> > +}
> > diff --git a/tests/xfs/924 b/tests/xfs/924
> > new file mode 100755
> > index 0000000000..81f8ba2743
> > --- /dev/null
> > +++ b/tests/xfs/924
> > @@ -0,0 +1,215 @@
> > +#! /bin/bash
> > +# SPDX-License-Identifier: GPL-2.0
> > +# Copyright (c) 2022 Oracle. All Rights Reserved.
> > +#
> > +# FS QA Test 924
> > +#
> > +# This is a regression test for a data corruption bug that existed in XFS' copy
> > +# on write code between 4.9 and 4.19. The root cause is a concurrency bug
> > +# wherein we would drop ILOCK_SHARED after querying the CoW fork in xfs_map_cow
> > +# and retake it before querying the data fork in xfs_map_blocks. If a second
> > +# thread changes the CoW fork mappings between the two calls, it's possible for
> > +# xfs_map_blocks to return a zero-block mapping, which results in writeback
> > +# being elided for that block. Elided writeback of dirty data results in
> > +# silent loss of writes.
> > +#
> > +# Worse yet, kernels from that era still used buffer heads, which means that an
> > +# elided writeback leaves the page clean but the bufferheads dirty. Due to a
> > +# naïve optimization in mark_buffer_dirty, the SetPageDirty call is elided if
> > +# the bufferhead is dirty, which means that a subsequent rewrite of the data
> > +# block will never result in the page being marked dirty, and all subsequent
> > +# writes are lost.
> > +#
> > +# It turns out that Christoph Hellwig unwittingly fixed the race in commit
> > +# 5c665e5b5af6 ("xfs: remove xfs_map_cow"), and no testcase was ever written.
> > +# Four years later, we hit it on a production 4.14 kernel. This testcase
> > +# relies on a debugging knob that introduces artificial delays into writeback.
> > +#
> > +# Before the race, the file blocks 0-1 are not shared and blocks 2-5 are
> > +# shared. There are no extents in CoW fork.
> > +#
> > +# Two threads race like this:
> > +#
> > +# Thread 1 (writeback block 0) | Thread 2 (write to block 2)
> > +# ---------------------------------|--------------------------------
> > +# |
> > +# 1. Check if block 0 in CoW fork |
> > +# from xfs_map_cow. |
> > +# |
> > +# 2. Block 0 not found in CoW |
> > +# fork; the block is considered |
> > +# not shared. |
> > +# |
> > +# 3. xfs_map_blocks looks up data |
> > +# fork to get a map covering |
> > +# block 0. |
> > +# |
> > +# 4. It gets a data fork mapping |
> > +# for block 0 with length 2. |
> > +# |
> > +# | 1. A buffered write to block 2 sees
> > +# | that it is a shared block and no
> > +# | extent covers block 2 in CoW fork.
> > +# |
> > +# | It creates a new CoW fork mapping.
> > +# | Due to the cowextsize, the new
> > +# | extent starts at block 0 with
> > +# | length 128.
> > +# |
> > +# |
> > +# 5. It lookup CoW fork again to |
> > +# trim the map (0, 2) to a |
> > +# shared block boundary. |
> > +# |
> > +# 5a. It finds (0, 128) in CoW fork|
> > +# 5b. It trims the data fork map |
> > +# from (0, 1) to (0, 0) (!!!) |
> > +# |
> > +# 6. The xfs_imap_valid call after |
> > +# the xfs_map_blocks call checks|
> > +# if the mapping (0, 0) covers |
> > +# block 0. The result is "NO". |
> > +# |
> > +# 7. Since block 0 has no physical |
> > +# block mapped, it's not added |
> > +# to the ioend. This is the |
> > +# first problem. |
> > +# |
> > +# 8. xfs_add_to_ioend usually |
> > +# clears the bufferhead dirty |
> > +# flag Because this is skipped,|
> > +# we leave the page clean with |
> > +# the associated buffer head(s) |
> > +# dirty (the second problem). |
> > +# Now the dirty state is |
> > +# inconsistent.
> > +#
> > +# On newer kernels, this is also a functionality test for the ifork sequence
> > +# counter because the writeback completions will change the data fork and force
> > +# revalidations of the wb mapping.
> > +#
> > +. ./common/preamble
> > +_begin_fstest auto quick clone
> > +
> > +# Import common functions.
> > +. ./common/reflink
> > +. ./common/inject
> > +. ./common/tracing
> > +
> > +# real QA test starts here
> > +_cleanup()
> > +{
> > + _ftrace_cleanup
> > + cd /
> > + rm -r -f $tmp.* $sentryfile $tracefile
> > +}
> > +
> > +# Modify as appropriate.
> > +_supported_fs xfs
> > +_fixed_by_kernel_commit 5c665e5b5af6 "xfs: remove xfs_map_cow"
> > +_require_ftrace
> > +_require_error_injection
> > +_require_scratch_reflink
> > +_require_cp_reflink
> > +
> > +_scratch_mkfs >> $seqres.full
> > +_scratch_mount >> $seqres.full
> > +
> > +# This is a pagecache test, so try to disable fsdax mode.
> > +$XFS_IO_PROG -c 'chattr -x' $SCRATCH_MNT &> $seqres.full
> > +_require_pagecache_access $SCRATCH_MNT
> > +
> > +knob="$(_find_xfs_mountdev_errortag_knob $SCRATCH_DEV "wb_delay_ms")"
> > +test -w "$knob" || _notrun "Kernel does not have wb_delay_ms error injector"
>
> Can `_require_xfs_io_error_injection` help that?
>
> > +
> > +blksz=65536
> > +_require_congruent_file_oplen $SCRATCH_MNT $blksz
> > +
> > +# Make sure we have sufficient extent size to create speculative CoW
> > +# preallocations.
> > +$XFS_IO_PROG -c 'cowextsize 1m' $SCRATCH_MNT
> > +
> > +# Write out a file with the first two blocks unshared and the rest shared.
> > +_pwrite_byte 0x59 0 $((160 * blksz)) $SCRATCH_MNT/file >> $seqres.full
> > +_pwrite_byte 0x59 0 $((160 * blksz)) $SCRATCH_MNT/file.compare >> $seqres.full
> > +sync
> > +
> > +_cp_reflink $SCRATCH_MNT/file $SCRATCH_MNT/file.reflink
> > +
> > +_pwrite_byte 0x58 0 $((2 * blksz)) $SCRATCH_MNT/file >> $seqres.full
> > +_pwrite_byte 0x58 0 $((2 * blksz)) $SCRATCH_MNT/file.compare >> $seqres.full
> > +sync
> > +
> > +# Avoid creation of large folios on newer kernels by cycling the mount and
> > +# immediately writing to the page cache.
> > +_scratch_cycle_mount
> > +
> > +# Write the same data to file.compare as we're about to do to file. Do this
> > +# before slowing down writeback to avoid unnecessary delay.
> > +_pwrite_byte 0x57 0 $((2 * blksz)) $SCRATCH_MNT/file.compare >> $seqres.full
> > +_pwrite_byte 0x56 $((2 * blksz)) $((2 * blksz)) $SCRATCH_MNT/file.compare >> $seqres.full
> > +sync
> > +
> > +# Introduce a half-second wait to each writeback block mapping call. This
> > +# gives us a chance to race speculative cow prealloc with writeback.
> > +wb_delay=500
> > +echo $wb_delay > $knob
>
> Oh, you'd like to avoid depending on xfs_io ?
Oops, this was leftover from before I ported the xfs_errortag.h changes
to xfsprogs.
> > +curval="$(cat $knob)"
> > +test "$curval" -eq $wb_delay || echo "expected wb_delay_ms == $wb_delay"
> > +
> > +_ftrace_record_events 'xfs_wb*iomap_invalid'
> > +
> > +# Start thread 1 + writeback above
> > +$XFS_IO_PROG -c "pwrite -S 0x57 0 $((2 * blksz))" \
> > + -c 'bmap -celpv' -c 'bmap -elpv' \
>
> I didn't find the "bmap -c" option, is it a new option? Won't it break the
> golden image if a system doesn't support it?
-c is a deliberately undocumented option to the bmap command; it's been
there since the introduction of reflink.
> > + -c 'fsync' $SCRATCH_MNT/file >> $seqres.full &
> > +sleep 1
> > +
> > +# Start a sentry to look for evidence of the XFS_ERRORTAG_REPORT logging. If
> > +# we see that, we know we've forced writeback to revalidate a mapping. The
> > +# test has been successful, so turn off the delay.
> > +sentryfile=$TEST_DIR/$seq.sentry
> > +tracefile=$TEST_DIR/$seq.ftrace
> > +wait_for_errortag() {
> > + while [ -e "$sentryfile" ]; do
> > + _ftrace_dump | grep iomap_invalid >> "$tracefile"
> > + if grep -q iomap_invalid "$tracefile"; then
> > + echo 0 > "$knob"
> > + _ftrace_ignore_events
> > + break;
> > + fi
> > + sleep 0.5
> > + done
> > +}
> > +touch $sentryfile
> > +wait_for_errortag &
>
> Should we *wait* background processes in cleanup after removing $sentryfile.
Yes.
> > +
> > +# Start thread 2 to create the cowextsize reservation
> > +$XFS_IO_PROG -c "pwrite -S 0x56 $((2 * blksz)) $((2 * blksz))" \
> > + -c 'bmap -celpv' -c 'bmap -elpv' \
> > + -c 'fsync' $SCRATCH_MNT/file >> $seqres.full
> > +rm -f $sentryfile
> > +
> > +cat "$tracefile" >> $seqres.full
> > +grep -q iomap_invalid "$tracefile"
> > +saw_invalidation=$?
> > +
> > +# Flush everything to disk. If the bug manifests, then after the cycle,
> > +# file should have stale 0x58 in block 0 because we silently dropped a write.
> > +_scratch_cycle_mount
> > +
> > +if ! cmp -s $SCRATCH_MNT/file $SCRATCH_MNT/file.compare; then
> > + echo file and file.compare do not match
> > + $XFS_IO_PROG -c 'bmap -celpv' -c 'bmap -elpv' $SCRATCH_MNT/file >> $seqres.full
> > + echo file.compare
> > + od -tx1 -Ad -c $SCRATCH_MNT/file.compare
> > + echo file
> > + od -tx1 -Ad -c $SCRATCH_MNT/file
> > +elif [ $saw_invalidation -ne 0 ]; then
> > + # The files matched, but nothing got logged about the revalidation?
> > + echo "Expected to hear about writeback iomap invalidations?"
> > +fi
> > +
> > +echo Silence is golden
> > +status=0
> > +exit
> > diff --git a/tests/xfs/924.out b/tests/xfs/924.out
> > new file mode 100644
> > index 0000000000..c6655da35a
> > --- /dev/null
> > +++ b/tests/xfs/924.out
> > @@ -0,0 +1,2 @@
> > +QA output created by 924
> > +Silence is golden
> >
>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [RFC PATCH] xfs: regression test for writeback corruption bug
2022-11-30 19:04 ` Darrick J. Wong
@ 2022-12-01 15:21 ` Zorro Lang
2022-12-01 15:53 ` Darrick J. Wong
0 siblings, 1 reply; 6+ messages in thread
From: Zorro Lang @ 2022-12-01 15:21 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Dave Chinner, linux-xfs, linux-fsdevel, wen.gang.wang, fstests
On Wed, Nov 30, 2022 at 11:04:18AM -0800, Darrick J. Wong wrote:
> On Thu, Dec 01, 2022 at 01:34:47AM +0800, Zorro Lang wrote:
> > On Tue, Nov 29, 2022 at 01:57:14PM -0800, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <djwong@kernel.org>
> > >
> > > This is a regression test for a data corruption bug that existed in XFS'
> > > copy on write code between 4.9 and 4.19. The root cause is a
> > > concurrency bug wherein we would drop ILOCK_SHARED after querying the
> > > CoW fork in xfs_map_cow and retake it before querying the data fork in
> > > xfs_map_blocks. See the test description for a lot more details.
> > >
> > > Cc: Wengang Wang <wen.gang.wang@oracle.com>
> > > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > > ---
> > > common/rc | 15 ++++
> > > common/tracing | 69 +++++++++++++++++
> > > tests/xfs/924 | 215 +++++++++++++++++++++++++++++++++++++++++++++++++++++
> > > tests/xfs/924.out | 2
> > > 4 files changed, 301 insertions(+)
> > > create mode 100644 common/tracing
> > > create mode 100755 tests/xfs/924
> > > create mode 100644 tests/xfs/924.out
> > >
> > > diff --git a/common/rc b/common/rc
> > > index d71fc0603f..b1b7a3e553 100644
> > > --- a/common/rc
> > > +++ b/common/rc
> > > @@ -3625,6 +3625,21 @@ _check_xflag()
> > > fi
> > > }
> > >
> > > +# Make sure the given file access mode is set to use the pagecache. If
> > > +# userspace or kernel don't support statx or STATX_ATTR_DAX, we assume that
> > > +# means pagecache. The sole parameter must be a directory.
> > > +_require_pagecache_access() {
> > > + local testfile="$1/testfile"
> > > +
> > > + touch "$testfile"
> > > + if ! _check_s_dax "$testfile" 0 &>> $seqres.full; then
> > > + rm -f "$testfile"
> > > + _notrun 'test requires pagecache access'
> > > + fi
> > > +
> > > + rm -f "$testfile"
> > > +}
> > > +
> > > # Check if dax mount options are supported
> > > #
> > > # $1 can be either 'dax=always' or 'dax'
> > > diff --git a/common/tracing b/common/tracing
> > > new file mode 100644
> > > index 0000000000..35e5ed41c2
> > > --- /dev/null
> > > +++ b/common/tracing
> > > @@ -0,0 +1,69 @@
> > > +##/bin/bash
> > > +# SPDX-License-Identifier: GPL-2.0-or-later
> > > +# Copyright (c) 2022 Oracle. All Rights Reserved.
> > > +#
> > > +# Routines for dealing with ftrace (or any other tracing).
> > > +
> > > +_require_ftrace() {
> > > + local ftrace_dir="/sys/kernel/debug/tracing/instances/"
> > > + test -d "$ftrace_dir" || _notrun "kernel does not support ftrace"
> > > +
> > > + # Give this fstest its own ftrace buffer so that we don't mess up
> > > + # any other tracers that might be running.
> > > + FTRACE_DIR="$ftrace_dir/fstests.$seq"
> > > + test -d "$FTRACE_DIR" && rmdir "$FTRACE_DIR"
> > > +}
> > > +
> > > +_ftrace_cleanup() {
> > > + if [ -d "$FTRACE_DIR" ]; then
> > > + _ftrace_ignore_events
> > > + # Removing an ftrace buffer requires rmdir, even though the
> > > + # virtual directory contains children.
> > > + rmdir "$FTRACE_DIR"
> > > + fi
> > > +}
> > > +
> > > +# Intercept the given events. Arguments may be regular expressions.
> > > +_ftrace_record_events() {
> > > + local pwd="$PWD"
> > > +
> > > + test -n "$FTRACE_DIR" || _fail "_require_ftrace not run?"
> > > + mkdir "$FTRACE_DIR"
> > > + cd "$FTRACE_DIR/events/" || _fail "$FTRACE_DIR: ftrace not set up?"
> > > +
> > > + for arg in "$@"; do
> > > + for tp in */${arg}; do
> > > + # Replace slashes with semicolons per ftrace convention
> > > + echo "${tp////:}" >> ../set_event
> > > + done
> > > + done
> > > + cd "$pwd"
> >
> > Is the relative path necessary, can we use absolute path at here?
>
> Hm. I suppose that inner loop could be replaced by:
>
> find "$FTRACE_DIR/events/ -type d -name "$arg" -printf '%P\n' | \
> tr '/' ':' >> "$FTRACE_DIR/set_event"
>
> and then we don't need all this cd'ing insanity. I'll try that and
> report back.
>
> > > +}
> > > +
> > > +# Stop intercepting the given events. If no arguments, stops all events.
> > > +_ftrace_ignore_events() {
> > > + local pwd="$PWD"
> > > +
> > > + test -n "$FTRACE_DIR" || _fail "_require_ftrace not run?"
> > > + cd "$FTRACE_DIR/events/" || _fail "$FTRACE_DIR: ftrace not set up?"
> > > +
> > > + if [ "$#" -eq 0 ]; then
> > > + echo > ../set_event
> > > + else
> > > + for arg in "$@"; do
> > > + for tp in */${arg}; do
> > > + # Replace slashes with semicolons per ftrace convention
> > > + echo "!${tp////:}" >> ../set_event
> > > + done
> > > + done
> > > + fi
> > > +
> > > + cd "$pwd"
> >
> > Same at here
> >
> > > +}
> > > +
> > > +# Dump whatever was written to the ftrace buffer since the last time this
> > > +# helper was called.
> > > +_ftrace_dump() {
> > > + test -n "$FTRACE_DIR" || _fail "_require_ftrace not run?"
> > > + (cd "$FTRACE_DIR" && cat trace)
> >
> > Why not "cat $FTRACE_DIR/trace" ?
>
> Fixed.
>
> > > +}
> > > diff --git a/tests/xfs/924 b/tests/xfs/924
> > > new file mode 100755
> > > index 0000000000..81f8ba2743
> > > --- /dev/null
> > > +++ b/tests/xfs/924
> > > @@ -0,0 +1,215 @@
> > > +#! /bin/bash
> > > +# SPDX-License-Identifier: GPL-2.0
> > > +# Copyright (c) 2022 Oracle. All Rights Reserved.
> > > +#
> > > +# FS QA Test 924
> > > +#
> > > +# This is a regression test for a data corruption bug that existed in XFS' copy
> > > +# on write code between 4.9 and 4.19. The root cause is a concurrency bug
> > > +# wherein we would drop ILOCK_SHARED after querying the CoW fork in xfs_map_cow
> > > +# and retake it before querying the data fork in xfs_map_blocks. If a second
> > > +# thread changes the CoW fork mappings between the two calls, it's possible for
> > > +# xfs_map_blocks to return a zero-block mapping, which results in writeback
> > > +# being elided for that block. Elided writeback of dirty data results in
> > > +# silent loss of writes.
> > > +#
> > > +# Worse yet, kernels from that era still used buffer heads, which means that an
> > > +# elided writeback leaves the page clean but the bufferheads dirty. Due to a
> > > +# naïve optimization in mark_buffer_dirty, the SetPageDirty call is elided if
> > > +# the bufferhead is dirty, which means that a subsequent rewrite of the data
> > > +# block will never result in the page being marked dirty, and all subsequent
> > > +# writes are lost.
> > > +#
> > > +# It turns out that Christoph Hellwig unwittingly fixed the race in commit
> > > +# 5c665e5b5af6 ("xfs: remove xfs_map_cow"), and no testcase was ever written.
> > > +# Four years later, we hit it on a production 4.14 kernel. This testcase
> > > +# relies on a debugging knob that introduces artificial delays into writeback.
> > > +#
> > > +# Before the race, the file blocks 0-1 are not shared and blocks 2-5 are
> > > +# shared. There are no extents in CoW fork.
> > > +#
> > > +# Two threads race like this:
> > > +#
> > > +# Thread 1 (writeback block 0) | Thread 2 (write to block 2)
> > > +# ---------------------------------|--------------------------------
> > > +# |
> > > +# 1. Check if block 0 in CoW fork |
> > > +# from xfs_map_cow. |
> > > +# |
> > > +# 2. Block 0 not found in CoW |
> > > +# fork; the block is considered |
> > > +# not shared. |
> > > +# |
> > > +# 3. xfs_map_blocks looks up data |
> > > +# fork to get a map covering |
> > > +# block 0. |
> > > +# |
> > > +# 4. It gets a data fork mapping |
> > > +# for block 0 with length 2. |
> > > +# |
> > > +# | 1. A buffered write to block 2 sees
> > > +# | that it is a shared block and no
> > > +# | extent covers block 2 in CoW fork.
> > > +# |
> > > +# | It creates a new CoW fork mapping.
> > > +# | Due to the cowextsize, the new
> > > +# | extent starts at block 0 with
> > > +# | length 128.
> > > +# |
> > > +# |
> > > +# 5. It lookup CoW fork again to |
> > > +# trim the map (0, 2) to a |
> > > +# shared block boundary. |
> > > +# |
> > > +# 5a. It finds (0, 128) in CoW fork|
> > > +# 5b. It trims the data fork map |
> > > +# from (0, 1) to (0, 0) (!!!) |
> > > +# |
> > > +# 6. The xfs_imap_valid call after |
> > > +# the xfs_map_blocks call checks|
> > > +# if the mapping (0, 0) covers |
> > > +# block 0. The result is "NO". |
> > > +# |
> > > +# 7. Since block 0 has no physical |
> > > +# block mapped, it's not added |
> > > +# to the ioend. This is the |
> > > +# first problem. |
> > > +# |
> > > +# 8. xfs_add_to_ioend usually |
> > > +# clears the bufferhead dirty |
> > > +# flag Because this is skipped,|
> > > +# we leave the page clean with |
> > > +# the associated buffer head(s) |
> > > +# dirty (the second problem). |
> > > +# Now the dirty state is |
> > > +# inconsistent.
> > > +#
> > > +# On newer kernels, this is also a functionality test for the ifork sequence
> > > +# counter because the writeback completions will change the data fork and force
> > > +# revalidations of the wb mapping.
> > > +#
> > > +. ./common/preamble
> > > +_begin_fstest auto quick clone
> > > +
> > > +# Import common functions.
> > > +. ./common/reflink
> > > +. ./common/inject
> > > +. ./common/tracing
> > > +
> > > +# real QA test starts here
> > > +_cleanup()
> > > +{
> > > + _ftrace_cleanup
> > > + cd /
> > > + rm -r -f $tmp.* $sentryfile $tracefile
> > > +}
> > > +
> > > +# Modify as appropriate.
> > > +_supported_fs xfs
> > > +_fixed_by_kernel_commit 5c665e5b5af6 "xfs: remove xfs_map_cow"
> > > +_require_ftrace
> > > +_require_error_injection
> > > +_require_scratch_reflink
> > > +_require_cp_reflink
> > > +
> > > +_scratch_mkfs >> $seqres.full
> > > +_scratch_mount >> $seqres.full
> > > +
> > > +# This is a pagecache test, so try to disable fsdax mode.
> > > +$XFS_IO_PROG -c 'chattr -x' $SCRATCH_MNT &> $seqres.full
> > > +_require_pagecache_access $SCRATCH_MNT
> > > +
> > > +knob="$(_find_xfs_mountdev_errortag_knob $SCRATCH_DEV "wb_delay_ms")"
> > > +test -w "$knob" || _notrun "Kernel does not have wb_delay_ms error injector"
> >
> > Can `_require_xfs_io_error_injection` help that?
> >
> > > +
> > > +blksz=65536
> > > +_require_congruent_file_oplen $SCRATCH_MNT $blksz
> > > +
> > > +# Make sure we have sufficient extent size to create speculative CoW
> > > +# preallocations.
> > > +$XFS_IO_PROG -c 'cowextsize 1m' $SCRATCH_MNT
> > > +
> > > +# Write out a file with the first two blocks unshared and the rest shared.
> > > +_pwrite_byte 0x59 0 $((160 * blksz)) $SCRATCH_MNT/file >> $seqres.full
> > > +_pwrite_byte 0x59 0 $((160 * blksz)) $SCRATCH_MNT/file.compare >> $seqres.full
> > > +sync
> > > +
> > > +_cp_reflink $SCRATCH_MNT/file $SCRATCH_MNT/file.reflink
> > > +
> > > +_pwrite_byte 0x58 0 $((2 * blksz)) $SCRATCH_MNT/file >> $seqres.full
> > > +_pwrite_byte 0x58 0 $((2 * blksz)) $SCRATCH_MNT/file.compare >> $seqres.full
> > > +sync
> > > +
> > > +# Avoid creation of large folios on newer kernels by cycling the mount and
> > > +# immediately writing to the page cache.
> > > +_scratch_cycle_mount
> > > +
> > > +# Write the same data to file.compare as we're about to do to file. Do this
> > > +# before slowing down writeback to avoid unnecessary delay.
> > > +_pwrite_byte 0x57 0 $((2 * blksz)) $SCRATCH_MNT/file.compare >> $seqres.full
> > > +_pwrite_byte 0x56 $((2 * blksz)) $((2 * blksz)) $SCRATCH_MNT/file.compare >> $seqres.full
> > > +sync
> > > +
> > > +# Introduce a half-second wait to each writeback block mapping call. This
> > > +# gives us a chance to race speculative cow prealloc with writeback.
> > > +wb_delay=500
> > > +echo $wb_delay > $knob
> >
> > Oh, you'd like to avoid depending on xfs_io ?
>
> Oops, this was leftover from before I ported the xfs_errortag.h changes
> to xfsprogs.
>
> > > +curval="$(cat $knob)"
> > > +test "$curval" -eq $wb_delay || echo "expected wb_delay_ms == $wb_delay"
> > > +
> > > +_ftrace_record_events 'xfs_wb*iomap_invalid'
> > > +
> > > +# Start thread 1 + writeback above
> > > +$XFS_IO_PROG -c "pwrite -S 0x57 0 $((2 * blksz))" \
> > > + -c 'bmap -celpv' -c 'bmap -elpv' \
> >
> > I didn't find the "bmap -c" option, is it a new option? Won't it break the
> > golden image if a system doesn't support it?
>
> -c is a deliberately undocumented option to the bmap command; it's been
> there since the introduction of reflink.
But when I tried that on rhel8 with a varietal ersion of xfsprogs 5.0.0
(xfsprogs-5.0.0-10.el8), I got below error output:
# xfs_io -c "bmap -celpv" testfile >/dev/null
xfs_io: xfsctl(XFS_IOC_GETBMAPX) iflags=0x28 ["testfile"]: Invalid argument
# xfs_io -c "bmap -c" testfile >/dev/null
xfs_io: xfsctl(XFS_IOC_GETBMAPX) iflags=0x28 ["testfile"]: Invalid argument
# xfs_io -c "bmap -elpv" testfile >/dev/null
(no error output)
So, maybe, if you don't need this output to be golden image, how about filter
out the error output to $seqres.full too? Or any better idea?
Thanks,
Zorro
>
> > > + -c 'fsync' $SCRATCH_MNT/file >> $seqres.full &
> > > +sleep 1
> > > +
> > > +# Start a sentry to look for evidence of the XFS_ERRORTAG_REPORT logging. If
> > > +# we see that, we know we've forced writeback to revalidate a mapping. The
> > > +# test has been successful, so turn off the delay.
> > > +sentryfile=$TEST_DIR/$seq.sentry
> > > +tracefile=$TEST_DIR/$seq.ftrace
> > > +wait_for_errortag() {
> > > + while [ -e "$sentryfile" ]; do
> > > + _ftrace_dump | grep iomap_invalid >> "$tracefile"
> > > + if grep -q iomap_invalid "$tracefile"; then
> > > + echo 0 > "$knob"
> > > + _ftrace_ignore_events
> > > + break;
> > > + fi
> > > + sleep 0.5
> > > + done
> > > +}
> > > +touch $sentryfile
> > > +wait_for_errortag &
> >
> > Should we *wait* background processes in cleanup after removing $sentryfile.
>
> Yes.
>
> > > +
> > > +# Start thread 2 to create the cowextsize reservation
> > > +$XFS_IO_PROG -c "pwrite -S 0x56 $((2 * blksz)) $((2 * blksz))" \
> > > + -c 'bmap -celpv' -c 'bmap -elpv' \
> > > + -c 'fsync' $SCRATCH_MNT/file >> $seqres.full
> > > +rm -f $sentryfile
> > > +
> > > +cat "$tracefile" >> $seqres.full
> > > +grep -q iomap_invalid "$tracefile"
> > > +saw_invalidation=$?
> > > +
> > > +# Flush everything to disk. If the bug manifests, then after the cycle,
> > > +# file should have stale 0x58 in block 0 because we silently dropped a write.
> > > +_scratch_cycle_mount
> > > +
> > > +if ! cmp -s $SCRATCH_MNT/file $SCRATCH_MNT/file.compare; then
> > > + echo file and file.compare do not match
> > > + $XFS_IO_PROG -c 'bmap -celpv' -c 'bmap -elpv' $SCRATCH_MNT/file >> $seqres.full
> > > + echo file.compare
> > > + od -tx1 -Ad -c $SCRATCH_MNT/file.compare
> > > + echo file
> > > + od -tx1 -Ad -c $SCRATCH_MNT/file
> > > +elif [ $saw_invalidation -ne 0 ]; then
> > > + # The files matched, but nothing got logged about the revalidation?
> > > + echo "Expected to hear about writeback iomap invalidations?"
> > > +fi
> > > +
> > > +echo Silence is golden
> > > +status=0
> > > +exit
> > > diff --git a/tests/xfs/924.out b/tests/xfs/924.out
> > > new file mode 100644
> > > index 0000000000..c6655da35a
> > > --- /dev/null
> > > +++ b/tests/xfs/924.out
> > > @@ -0,0 +1,2 @@
> > > +QA output created by 924
> > > +Silence is golden
> > >
> >
>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [RFC PATCH] xfs: regression test for writeback corruption bug
2022-12-01 15:21 ` Zorro Lang
@ 2022-12-01 15:53 ` Darrick J. Wong
0 siblings, 0 replies; 6+ messages in thread
From: Darrick J. Wong @ 2022-12-01 15:53 UTC (permalink / raw)
To: Zorro Lang; +Cc: Dave Chinner, linux-xfs, linux-fsdevel, wen.gang.wang, fstests
On Thu, Dec 01, 2022 at 11:21:45PM +0800, Zorro Lang wrote:
> On Wed, Nov 30, 2022 at 11:04:18AM -0800, Darrick J. Wong wrote:
> > On Thu, Dec 01, 2022 at 01:34:47AM +0800, Zorro Lang wrote:
> > > On Tue, Nov 29, 2022 at 01:57:14PM -0800, Darrick J. Wong wrote:
> > > > From: Darrick J. Wong <djwong@kernel.org>
> > > >
> > > > This is a regression test for a data corruption bug that existed in XFS'
> > > > copy on write code between 4.9 and 4.19. The root cause is a
> > > > concurrency bug wherein we would drop ILOCK_SHARED after querying the
> > > > CoW fork in xfs_map_cow and retake it before querying the data fork in
> > > > xfs_map_blocks. See the test description for a lot more details.
> > > >
> > > > Cc: Wengang Wang <wen.gang.wang@oracle.com>
> > > > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > > > ---
> > > > common/rc | 15 ++++
> > > > common/tracing | 69 +++++++++++++++++
> > > > tests/xfs/924 | 215 +++++++++++++++++++++++++++++++++++++++++++++++++++++
> > > > tests/xfs/924.out | 2
> > > > 4 files changed, 301 insertions(+)
> > > > create mode 100644 common/tracing
> > > > create mode 100755 tests/xfs/924
> > > > create mode 100644 tests/xfs/924.out
> > > >
> > > > diff --git a/common/rc b/common/rc
> > > > index d71fc0603f..b1b7a3e553 100644
> > > > --- a/common/rc
> > > > +++ b/common/rc
> > > > @@ -3625,6 +3625,21 @@ _check_xflag()
> > > > fi
> > > > }
> > > >
> > > > +# Make sure the given file access mode is set to use the pagecache. If
> > > > +# userspace or kernel don't support statx or STATX_ATTR_DAX, we assume that
> > > > +# means pagecache. The sole parameter must be a directory.
> > > > +_require_pagecache_access() {
> > > > + local testfile="$1/testfile"
> > > > +
> > > > + touch "$testfile"
> > > > + if ! _check_s_dax "$testfile" 0 &>> $seqres.full; then
> > > > + rm -f "$testfile"
> > > > + _notrun 'test requires pagecache access'
> > > > + fi
> > > > +
> > > > + rm -f "$testfile"
> > > > +}
> > > > +
> > > > # Check if dax mount options are supported
> > > > #
> > > > # $1 can be either 'dax=always' or 'dax'
> > > > diff --git a/common/tracing b/common/tracing
> > > > new file mode 100644
> > > > index 0000000000..35e5ed41c2
> > > > --- /dev/null
> > > > +++ b/common/tracing
> > > > @@ -0,0 +1,69 @@
> > > > +##/bin/bash
> > > > +# SPDX-License-Identifier: GPL-2.0-or-later
> > > > +# Copyright (c) 2022 Oracle. All Rights Reserved.
> > > > +#
> > > > +# Routines for dealing with ftrace (or any other tracing).
> > > > +
> > > > +_require_ftrace() {
> > > > + local ftrace_dir="/sys/kernel/debug/tracing/instances/"
> > > > + test -d "$ftrace_dir" || _notrun "kernel does not support ftrace"
> > > > +
> > > > + # Give this fstest its own ftrace buffer so that we don't mess up
> > > > + # any other tracers that might be running.
> > > > + FTRACE_DIR="$ftrace_dir/fstests.$seq"
> > > > + test -d "$FTRACE_DIR" && rmdir "$FTRACE_DIR"
> > > > +}
> > > > +
> > > > +_ftrace_cleanup() {
> > > > + if [ -d "$FTRACE_DIR" ]; then
> > > > + _ftrace_ignore_events
> > > > + # Removing an ftrace buffer requires rmdir, even though the
> > > > + # virtual directory contains children.
> > > > + rmdir "$FTRACE_DIR"
> > > > + fi
> > > > +}
> > > > +
> > > > +# Intercept the given events. Arguments may be regular expressions.
> > > > +_ftrace_record_events() {
> > > > + local pwd="$PWD"
> > > > +
> > > > + test -n "$FTRACE_DIR" || _fail "_require_ftrace not run?"
> > > > + mkdir "$FTRACE_DIR"
> > > > + cd "$FTRACE_DIR/events/" || _fail "$FTRACE_DIR: ftrace not set up?"
> > > > +
> > > > + for arg in "$@"; do
> > > > + for tp in */${arg}; do
> > > > + # Replace slashes with semicolons per ftrace convention
> > > > + echo "${tp////:}" >> ../set_event
> > > > + done
> > > > + done
> > > > + cd "$pwd"
> > >
> > > Is the relative path necessary, can we use absolute path at here?
> >
> > Hm. I suppose that inner loop could be replaced by:
> >
> > find "$FTRACE_DIR/events/ -type d -name "$arg" -printf '%P\n' | \
> > tr '/' ':' >> "$FTRACE_DIR/set_event"
> >
> > and then we don't need all this cd'ing insanity. I'll try that and
> > report back.
> >
> > > > +}
> > > > +
> > > > +# Stop intercepting the given events. If no arguments, stops all events.
> > > > +_ftrace_ignore_events() {
> > > > + local pwd="$PWD"
> > > > +
> > > > + test -n "$FTRACE_DIR" || _fail "_require_ftrace not run?"
> > > > + cd "$FTRACE_DIR/events/" || _fail "$FTRACE_DIR: ftrace not set up?"
> > > > +
> > > > + if [ "$#" -eq 0 ]; then
> > > > + echo > ../set_event
> > > > + else
> > > > + for arg in "$@"; do
> > > > + for tp in */${arg}; do
> > > > + # Replace slashes with semicolons per ftrace convention
> > > > + echo "!${tp////:}" >> ../set_event
> > > > + done
> > > > + done
> > > > + fi
> > > > +
> > > > + cd "$pwd"
> > >
> > > Same at here
> > >
> > > > +}
> > > > +
> > > > +# Dump whatever was written to the ftrace buffer since the last time this
> > > > +# helper was called.
> > > > +_ftrace_dump() {
> > > > + test -n "$FTRACE_DIR" || _fail "_require_ftrace not run?"
> > > > + (cd "$FTRACE_DIR" && cat trace)
> > >
> > > Why not "cat $FTRACE_DIR/trace" ?
> >
> > Fixed.
> >
> > > > +}
> > > > diff --git a/tests/xfs/924 b/tests/xfs/924
> > > > new file mode 100755
> > > > index 0000000000..81f8ba2743
> > > > --- /dev/null
> > > > +++ b/tests/xfs/924
> > > > @@ -0,0 +1,215 @@
> > > > +#! /bin/bash
> > > > +# SPDX-License-Identifier: GPL-2.0
> > > > +# Copyright (c) 2022 Oracle. All Rights Reserved.
> > > > +#
> > > > +# FS QA Test 924
> > > > +#
> > > > +# This is a regression test for a data corruption bug that existed in XFS' copy
> > > > +# on write code between 4.9 and 4.19. The root cause is a concurrency bug
> > > > +# wherein we would drop ILOCK_SHARED after querying the CoW fork in xfs_map_cow
> > > > +# and retake it before querying the data fork in xfs_map_blocks. If a second
> > > > +# thread changes the CoW fork mappings between the two calls, it's possible for
> > > > +# xfs_map_blocks to return a zero-block mapping, which results in writeback
> > > > +# being elided for that block. Elided writeback of dirty data results in
> > > > +# silent loss of writes.
> > > > +#
> > > > +# Worse yet, kernels from that era still used buffer heads, which means that an
> > > > +# elided writeback leaves the page clean but the bufferheads dirty. Due to a
> > > > +# naïve optimization in mark_buffer_dirty, the SetPageDirty call is elided if
> > > > +# the bufferhead is dirty, which means that a subsequent rewrite of the data
> > > > +# block will never result in the page being marked dirty, and all subsequent
> > > > +# writes are lost.
> > > > +#
> > > > +# It turns out that Christoph Hellwig unwittingly fixed the race in commit
> > > > +# 5c665e5b5af6 ("xfs: remove xfs_map_cow"), and no testcase was ever written.
> > > > +# Four years later, we hit it on a production 4.14 kernel. This testcase
> > > > +# relies on a debugging knob that introduces artificial delays into writeback.
> > > > +#
> > > > +# Before the race, the file blocks 0-1 are not shared and blocks 2-5 are
> > > > +# shared. There are no extents in CoW fork.
> > > > +#
> > > > +# Two threads race like this:
> > > > +#
> > > > +# Thread 1 (writeback block 0) | Thread 2 (write to block 2)
> > > > +# ---------------------------------|--------------------------------
> > > > +# |
> > > > +# 1. Check if block 0 in CoW fork |
> > > > +# from xfs_map_cow. |
> > > > +# |
> > > > +# 2. Block 0 not found in CoW |
> > > > +# fork; the block is considered |
> > > > +# not shared. |
> > > > +# |
> > > > +# 3. xfs_map_blocks looks up data |
> > > > +# fork to get a map covering |
> > > > +# block 0. |
> > > > +# |
> > > > +# 4. It gets a data fork mapping |
> > > > +# for block 0 with length 2. |
> > > > +# |
> > > > +# | 1. A buffered write to block 2 sees
> > > > +# | that it is a shared block and no
> > > > +# | extent covers block 2 in CoW fork.
> > > > +# |
> > > > +# | It creates a new CoW fork mapping.
> > > > +# | Due to the cowextsize, the new
> > > > +# | extent starts at block 0 with
> > > > +# | length 128.
> > > > +# |
> > > > +# |
> > > > +# 5. It lookup CoW fork again to |
> > > > +# trim the map (0, 2) to a |
> > > > +# shared block boundary. |
> > > > +# |
> > > > +# 5a. It finds (0, 128) in CoW fork|
> > > > +# 5b. It trims the data fork map |
> > > > +# from (0, 1) to (0, 0) (!!!) |
> > > > +# |
> > > > +# 6. The xfs_imap_valid call after |
> > > > +# the xfs_map_blocks call checks|
> > > > +# if the mapping (0, 0) covers |
> > > > +# block 0. The result is "NO". |
> > > > +# |
> > > > +# 7. Since block 0 has no physical |
> > > > +# block mapped, it's not added |
> > > > +# to the ioend. This is the |
> > > > +# first problem. |
> > > > +# |
> > > > +# 8. xfs_add_to_ioend usually |
> > > > +# clears the bufferhead dirty |
> > > > +# flag Because this is skipped,|
> > > > +# we leave the page clean with |
> > > > +# the associated buffer head(s) |
> > > > +# dirty (the second problem). |
> > > > +# Now the dirty state is |
> > > > +# inconsistent.
> > > > +#
> > > > +# On newer kernels, this is also a functionality test for the ifork sequence
> > > > +# counter because the writeback completions will change the data fork and force
> > > > +# revalidations of the wb mapping.
> > > > +#
> > > > +. ./common/preamble
> > > > +_begin_fstest auto quick clone
> > > > +
> > > > +# Import common functions.
> > > > +. ./common/reflink
> > > > +. ./common/inject
> > > > +. ./common/tracing
> > > > +
> > > > +# real QA test starts here
> > > > +_cleanup()
> > > > +{
> > > > + _ftrace_cleanup
> > > > + cd /
> > > > + rm -r -f $tmp.* $sentryfile $tracefile
> > > > +}
> > > > +
> > > > +# Modify as appropriate.
> > > > +_supported_fs xfs
> > > > +_fixed_by_kernel_commit 5c665e5b5af6 "xfs: remove xfs_map_cow"
> > > > +_require_ftrace
> > > > +_require_error_injection
> > > > +_require_scratch_reflink
> > > > +_require_cp_reflink
> > > > +
> > > > +_scratch_mkfs >> $seqres.full
> > > > +_scratch_mount >> $seqres.full
> > > > +
> > > > +# This is a pagecache test, so try to disable fsdax mode.
> > > > +$XFS_IO_PROG -c 'chattr -x' $SCRATCH_MNT &> $seqres.full
> > > > +_require_pagecache_access $SCRATCH_MNT
> > > > +
> > > > +knob="$(_find_xfs_mountdev_errortag_knob $SCRATCH_DEV "wb_delay_ms")"
> > > > +test -w "$knob" || _notrun "Kernel does not have wb_delay_ms error injector"
> > >
> > > Can `_require_xfs_io_error_injection` help that?
> > >
> > > > +
> > > > +blksz=65536
> > > > +_require_congruent_file_oplen $SCRATCH_MNT $blksz
> > > > +
> > > > +# Make sure we have sufficient extent size to create speculative CoW
> > > > +# preallocations.
> > > > +$XFS_IO_PROG -c 'cowextsize 1m' $SCRATCH_MNT
> > > > +
> > > > +# Write out a file with the first two blocks unshared and the rest shared.
> > > > +_pwrite_byte 0x59 0 $((160 * blksz)) $SCRATCH_MNT/file >> $seqres.full
> > > > +_pwrite_byte 0x59 0 $((160 * blksz)) $SCRATCH_MNT/file.compare >> $seqres.full
> > > > +sync
> > > > +
> > > > +_cp_reflink $SCRATCH_MNT/file $SCRATCH_MNT/file.reflink
> > > > +
> > > > +_pwrite_byte 0x58 0 $((2 * blksz)) $SCRATCH_MNT/file >> $seqres.full
> > > > +_pwrite_byte 0x58 0 $((2 * blksz)) $SCRATCH_MNT/file.compare >> $seqres.full
> > > > +sync
> > > > +
> > > > +# Avoid creation of large folios on newer kernels by cycling the mount and
> > > > +# immediately writing to the page cache.
> > > > +_scratch_cycle_mount
> > > > +
> > > > +# Write the same data to file.compare as we're about to do to file. Do this
> > > > +# before slowing down writeback to avoid unnecessary delay.
> > > > +_pwrite_byte 0x57 0 $((2 * blksz)) $SCRATCH_MNT/file.compare >> $seqres.full
> > > > +_pwrite_byte 0x56 $((2 * blksz)) $((2 * blksz)) $SCRATCH_MNT/file.compare >> $seqres.full
> > > > +sync
> > > > +
> > > > +# Introduce a half-second wait to each writeback block mapping call. This
> > > > +# gives us a chance to race speculative cow prealloc with writeback.
> > > > +wb_delay=500
> > > > +echo $wb_delay > $knob
> > >
> > > Oh, you'd like to avoid depending on xfs_io ?
> >
> > Oops, this was leftover from before I ported the xfs_errortag.h changes
> > to xfsprogs.
> >
> > > > +curval="$(cat $knob)"
> > > > +test "$curval" -eq $wb_delay || echo "expected wb_delay_ms == $wb_delay"
> > > > +
> > > > +_ftrace_record_events 'xfs_wb*iomap_invalid'
> > > > +
> > > > +# Start thread 1 + writeback above
> > > > +$XFS_IO_PROG -c "pwrite -S 0x57 0 $((2 * blksz))" \
> > > > + -c 'bmap -celpv' -c 'bmap -elpv' \
> > >
> > > I didn't find the "bmap -c" option, is it a new option? Won't it break the
> > > golden image if a system doesn't support it?
> >
> > -c is a deliberately undocumented option to the bmap command; it's been
> > there since the introduction of reflink.
>
> But when I tried that on rhel8 with a varietal ersion of xfsprogs 5.0.0
> (xfsprogs-5.0.0-10.el8), I got below error output:
>
> # xfs_io -c "bmap -celpv" testfile >/dev/null
> xfs_io: xfsctl(XFS_IOC_GETBMAPX) iflags=0x28 ["testfile"]: Invalid argument
> # xfs_io -c "bmap -c" testfile >/dev/null
> xfs_io: xfsctl(XFS_IOC_GETBMAPX) iflags=0x28 ["testfile"]: Invalid argument
> # xfs_io -c "bmap -elpv" testfile >/dev/null
> (no error output)
>
> So, maybe, if you don't need this output to be golden image, how about filter
> out the error output to $seqres.full too? Or any better idea?
D'oh, forgot that -c only works if CONFFIG_XFS_DEBUG=y. I'll remove the
bmap commands, since the trace data now tells us everything we need.
--D
> Thanks,
> Zorro
>
> >
> > > > + -c 'fsync' $SCRATCH_MNT/file >> $seqres.full &
> > > > +sleep 1
> > > > +
> > > > +# Start a sentry to look for evidence of the XFS_ERRORTAG_REPORT logging. If
> > > > +# we see that, we know we've forced writeback to revalidate a mapping. The
> > > > +# test has been successful, so turn off the delay.
> > > > +sentryfile=$TEST_DIR/$seq.sentry
> > > > +tracefile=$TEST_DIR/$seq.ftrace
> > > > +wait_for_errortag() {
> > > > + while [ -e "$sentryfile" ]; do
> > > > + _ftrace_dump | grep iomap_invalid >> "$tracefile"
> > > > + if grep -q iomap_invalid "$tracefile"; then
> > > > + echo 0 > "$knob"
> > > > + _ftrace_ignore_events
> > > > + break;
> > > > + fi
> > > > + sleep 0.5
> > > > + done
> > > > +}
> > > > +touch $sentryfile
> > > > +wait_for_errortag &
> > >
> > > Should we *wait* background processes in cleanup after removing $sentryfile.
> >
> > Yes.
> >
> > > > +
> > > > +# Start thread 2 to create the cowextsize reservation
> > > > +$XFS_IO_PROG -c "pwrite -S 0x56 $((2 * blksz)) $((2 * blksz))" \
> > > > + -c 'bmap -celpv' -c 'bmap -elpv' \
> > > > + -c 'fsync' $SCRATCH_MNT/file >> $seqres.full
> > > > +rm -f $sentryfile
> > > > +
> > > > +cat "$tracefile" >> $seqres.full
> > > > +grep -q iomap_invalid "$tracefile"
> > > > +saw_invalidation=$?
> > > > +
> > > > +# Flush everything to disk. If the bug manifests, then after the cycle,
> > > > +# file should have stale 0x58 in block 0 because we silently dropped a write.
> > > > +_scratch_cycle_mount
> > > > +
> > > > +if ! cmp -s $SCRATCH_MNT/file $SCRATCH_MNT/file.compare; then
> > > > + echo file and file.compare do not match
> > > > + $XFS_IO_PROG -c 'bmap -celpv' -c 'bmap -elpv' $SCRATCH_MNT/file >> $seqres.full
> > > > + echo file.compare
> > > > + od -tx1 -Ad -c $SCRATCH_MNT/file.compare
> > > > + echo file
> > > > + od -tx1 -Ad -c $SCRATCH_MNT/file
> > > > +elif [ $saw_invalidation -ne 0 ]; then
> > > > + # The files matched, but nothing got logged about the revalidation?
> > > > + echo "Expected to hear about writeback iomap invalidations?"
> > > > +fi
> > > > +
> > > > +echo Silence is golden
> > > > +status=0
> > > > +exit
> > > > diff --git a/tests/xfs/924.out b/tests/xfs/924.out
> > > > new file mode 100644
> > > > index 0000000000..c6655da35a
> > > > --- /dev/null
> > > > +++ b/tests/xfs/924.out
> > > > @@ -0,0 +1,2 @@
> > > > +QA output created by 924
> > > > +Silence is golden
> > > >
> > >
> >
>
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2022-12-01 15:53 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
[not found] <20221123055812.747923-1-david@fromorbit.com>
[not found] ` <Y4U3dj5qvpKSQuNM@magnolia>
[not found] ` <Y4VeuqfVBU4/x9aB@magnolia>
2022-11-29 21:55 ` [RFC PATCH] xfs: regression test for writes racing with reclaim writeback Darrick J. Wong
[not found] ` <Y4U3XWf5j1zVGvV4@magnolia>
[not found] ` <Y4VejsHGU/tZuRYs@magnolia>
2022-11-29 21:57 ` [RFC PATCH] xfs: regression test for writeback corruption bug Darrick J. Wong
2022-11-30 17:34 ` Zorro Lang
2022-11-30 19:04 ` Darrick J. Wong
2022-12-01 15:21 ` Zorro Lang
2022-12-01 15:53 ` Darrick J. Wong
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox