[PATCH v3] md/raid5: fix parity corruption on journal failure
From: Meir Elisha @ 2025-08-19 7:38 UTC
To: Song Liu, Yu Kuai; +Cc: linux-raid, Meir Elisha
When operating in write-through journal mode, a journal device failure
can lead to parity corruption and silent data loss.
This occurs because the current implementation continues to update
parity even when journal writes fail, violating the write-through
consistency guarantee.
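For example, on a five-disk RAID5 that has already lost one member, a write
whose journal I/O fails can still compute and write out new parity; if the
accompanying data writes also fail, parity no longer matches the surviving
data, and a later reconstruction of the missing member silently returns
corrupt blocks.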
Signed-off-by: Meir Elisha <meir.elisha@volumez.com>
---
Changes in v3:
- Remove unnecessary braces in ops_run_io()
- Add a comment on clearing the R5_Want* flags
Changes in v2:
- Drop writes only when s->failed > conf->max_degraded
- Remove changes in handle_stripe()
I wrote a script to showcase the issue:
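In outline: build a five-disk RAID5 with a write-journal on top of dm
targets, record the first byte of each member's first data chunk, fail one
data disk, then fail the journal and a second data disk, attempt another
write, and check whether any member (the parity disk in particular) changed
despite the journal failure.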
#!/bin/bash
set -e
RAID_DEV="/dev/md127"
DATA_OFFSET=32 # in units of the 32 KiB dd block size: 32 x 32 KiB = 1 MiB, the md data offset assumed on each member
# Arrays to store disk states
declare -a BEFORE_BYTES
declare -a DISK_NAMES
cleanup() {
mdadm --stop $RAID_DEV 2>/dev/null || true
dmsetup remove disk0-flakey disk1-flakey journal-flakey 2>/dev/null || true
for i in {10..15}; do
losetup -d /dev/loop$i 2>/dev/null || true
done
rm -f /tmp/disk*.img 2>/dev/null || true
}
# Function to read first byte from device at offset
read_first_byte() {
local device=$1
local offset=$2
dd if=$device bs=32k skip=$offset count=4 iflag=direct status=none | head -c 1 | xxd -p
}
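# Usage example: read_first_byte /dev/loop10 32 prints the hex value of the
# byte sitting 1 MiB into loop10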
# Function to calculate which disk holds parity for a given stripe
# RAID5 left-symmetric algorithm (default)
get_parity_disk() {
local stripe_num=$1
local n_disks=$2
local pd_idx=$((($n_disks - 1) - ($stripe_num % $n_disks)))
echo $pd_idx
}
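# Example: stripe 0 on 5 disks gives pd_idx = (5-1) - (0 % 5) = 4, i.e. the
# 5th member (loop14 in this setup) holds the parity for the first stripe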
cleanup
echo "=== RAID5 Parity Bug Test ==="
echo
# Create backing files
for i in {0..5}; do
dd if=/dev/zero of=/tmp/disk$i.img bs=1M count=100 status=none
losetup /dev/loop$((10+i)) /tmp/disk$i.img
done
SIZE=$(blockdev --getsz /dev/loop10)
# Create normal linear targets first
dmsetup create disk0-flakey --table "0 $SIZE linear /dev/loop10 0"
dmsetup create disk1-flakey --table "0 $SIZE linear /dev/loop11 0"
dmsetup create journal-flakey --table "0 $SIZE linear /dev/loop15 0"
# Create RAID5 using the dm devices
echo "1. Creating RAID5 array..."
mdadm --create $RAID_DEV --chunk=32K --level=5 --raid-devices=5 \
/dev/mapper/disk0-flakey \
/dev/mapper/disk1-flakey \
/dev/loop12 /dev/loop13 /dev/loop14 \
--write-journal /dev/mapper/journal-flakey \
--assume-clean --force
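# Force write-through journaling (the journal write must complete before
# data/parity reach the member disks) and write 0 to safe_mode_delay, which
# disables md's safe-mode timer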
echo "write-through" > /sys/block/md127/md/journal_mode
echo 0 > /sys/block/md127/md/safe_mode_delay
# Write test pattern
echo "2. Writing test pattern..."
for i in 0 1 2 3; do
VAL=$((1 << i))
echo "VAL:$VAL"
perl -e "print chr($VAL) x 32768" | dd of=$RAID_DEV bs=32k count=1 seek=$i oflag=direct status=none
done
sync
sleep 1 # Give time for writes to settle
echo "3. Reading disk states before failure..."
# Calculate parity disk for stripe 0 (first 32k chunk)
STRIPE_NUM=0
N_DISKS=5
PARITY_INDEX=$(get_parity_disk $STRIPE_NUM $N_DISKS)
echo "Calculated parity disk index for stripe $STRIPE_NUM: $PARITY_INDEX"
# Map RAID device index to loop device
PARITY_DISK=$((10 + $PARITY_INDEX))
echo "Parity is on loop$PARITY_DISK"
echo
for i in {10..14}; do
# Read first byte from device
BYTE=$(read_first_byte /dev/loop$i $DATA_OFFSET)
BEFORE_BYTES[$i]=$BYTE
DISK_NAMES[$i]="loop$i"
echo -n "loop$i: 0x$BYTE"
if [ "$i" = "$PARITY_DISK" ]; then
echo " <-- PARITY disk"
else
echo
fi
done
echo
echo "4. Fail the first disk..."
dmsetup suspend disk0-flakey
dmsetup reload disk0-flakey --table "0 $SIZE flakey /dev/loop10 0 0 4294967295 2 error_reads error_writes"
dmsetup resume disk0-flakey
perl -e "print chr(4) x 32768" | dd of=$RAID_DEV bs=32k count=1 seek=2 oflag=direct status=none
sync
sleep 1
dmsetup suspend journal-flakey
dmsetup reload journal-flakey --table "0 $SIZE flakey /dev/loop15 0 0 4294967295 2 error_reads error_writes"
dmsetup resume journal-flakey
dmsetup suspend disk1-flakey
dmsetup reload disk1-flakey --table "0 $SIZE flakey /dev/loop11 0 0 4294967295 2 error_reads error_writes"
dmsetup resume disk1-flakey
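# Two data members plus the journal now error all I/O, so the next write sees
# s->failed (2) > max_degraded (1); the fixed kernel should abort the stripe
# write instead of updating parity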
echo "5. Attempting write (should fail the 2nd disk and the journal)..."
dd if=/dev/zero of=$RAID_DEV bs=32k count=1 seek=0 oflag=direct 2>&1 || echo "Write failed (expected)"
sync
sleep 1
echo
echo "6. Checking if parity was incorrectly updated:"
CHANGED=0
for i in {10..14}; do
# Read current state from device
BYTE_AFTER=$(read_first_byte /dev/loop$i $DATA_OFFSET)
BYTE_BEFORE=${BEFORE_BYTES[$i]}
if [ "$BYTE_BEFORE" != "$BYTE_AFTER" ]; then
echo "*** loop$i CHANGED: 0x$BYTE_BEFORE -> 0x$BYTE_AFTER ***"
CHANGED=$((CHANGED + 1))
if [ "$i" = "$PARITY_DISK" ]; then
echo " ^^ PARITY WAS UPDATED - BUG DETECTED!"
fi
else
echo "loop$i unchanged: 0x$BYTE_BEFORE"
fi
done
echo
echo "RESULT:"
if [ $CHANGED -gt 0 ]; then
echo "*** BUG DETECTED: $CHANGED disk(s) changed despite journal failure ***"
else
echo "✓ GOOD: No disks changed"
fi
cleanup
drivers/md/raid5.c | 19 +++++++++++++++++++
1 file changed, 19 insertions(+)
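In short, the first hunk makes ops_run_io() abort stripe writes when the
journal has failed and more devices have failed than the array can tolerate,
and the second hunk clears stale R5_Want* flags in handle_failed_stripe() so
leftover operations are not executed on retry.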
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 023649fe2476..da7756cc6a2a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1146,9 +1146,21 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
might_sleep();
+ /* Successfully logged to journal */
if (log_stripe(sh, s) == 0)
return;
+ /*
+ * Journal device failed. Only abort writes if we have
+ * too many failed devices to maintain consistency.
+ */
+ if (conf->log && r5l_log_disk_error(conf) &&
+ s->failed > conf->max_degraded &&
+ (s->to_write || s->written)) {
+ set_bit(STRIPE_HANDLE, &sh->state);
+ return;
+ }
+
should_defer = conf->batch_bio_dispatch && conf->group_cnt;
for (i = disks; i--; ) {
@@ -3672,6 +3684,13 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
* still be locked - so just clear all R5_LOCKED flags
*/
clear_bit(R5_LOCKED, &sh->dev[i].flags);
+ /* Clear R5_Want* flags to prevent stale operations
+ * from executing on retry.
+ */
+ clear_bit(R5_Wantwrite, &sh->dev[i].flags);
+ clear_bit(R5_Wantcompute, &sh->dev[i].flags);
+ clear_bit(R5_WantFUA, &sh->dev[i].flags);
+ clear_bit(R5_Wantdrain, &sh->dev[i].flags);
}
s->to_write = 0;
s->written = 0;
--
2.34.1
Re: [PATCH v3] md/raid5: fix parity corruption on journal failure
From: Meir Elisha @ 2025-08-25 16:08 UTC
To: Song Liu, Yu Kuai; +Cc: linux-raid
Hi Kuai,
I'd appreciate your thoughts on v3. Thanks in advance.
On 8/19/25 10:38, Meir Elisha wrote:
> When operating in write-through journal mode, a journal device failure
> can lead to parity corruption and silent data loss.
> This occurs because the current implementation continues to update
> parity even when journal writes fail, violating the write-through
> consistency guarantee.
>
> Signed-off-by: Meir Elisha <meir.elisha@volumez.com>
> [...]
Re: [PATCH v3] md/raid5: fix parity corruption on journal failure
From: Yu Kuai @ 2025-08-26 2:33 UTC
To: Meir Elisha, Song Liu; +Cc: linux-raid, yukuai (C)
Hi,
On 2025/08/26 0:08, Meir Elisha wrote:
> Hi Kuai
>
> Appreciate your thoughts on V3. Thanks in advance.
> On 8/19/25 10:38, Meir Elisha wrote:
>> When operating in write-through journal mode, a journal device failure
>> can lead to parity corruption and silent data loss.
>> This occurs because the current implementation continues to update
>> parity even when journal writes fail, violating the write-through
>> consistency guarantee.
>>
>> Signed-off-by: Meir Elisha <meir.elisha@volumez.com>
>> [...]
>> @@ -3672,6 +3684,13 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
>> * still be locked - so just clear all R5_LOCKED flags
>> */
>> clear_bit(R5_LOCKED, &sh->dev[i].flags);
>> + /* Clear R5_Want* flags to prevent stale operations
>> + * from executing on retry.
>> + */
>> + clear_bit(R5_Wantwrite, &sh->dev[i].flags);
>> + clear_bit(R5_Wantcompute, &sh->dev[i].flags);
>> + clear_bit(R5_WantFUA, &sh->dev[i].flags);
>> + clear_bit(R5_Wantdrain, &sh->dev[i].flags);
Still not sure about this; I need more time to go through each flag.
BTW, I'm not that familiar with raid5 internal details.
Thanks,
Kuai
>> }
>> s->to_write = 0;
>> s->written = 0;