linux-raid.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2] md/raid5: fix parity corruption on journal failure
@ 2025-08-16 12:28 Meir Elisha
  2025-08-18  7:40 ` Yu Kuai
  0 siblings, 1 reply; 2+ messages in thread
From: Meir Elisha @ 2025-08-16 12:28 UTC (permalink / raw)
  To: Song Liu, Yu Kuai; +Cc: linux-raid, Meir Elisha

When operating in write-through journal mode, a journal device failure
can lead to parity corruption and silent data loss.
This occurs because the current implementation continues to update
parity even when journal writes fail, violating the write-through
consistency guarantee.

Signed-off-by: Meir Elisha <meir.elisha@volumez.com>
---
Changes in v2:
	- Drop writes only when s->failed > conf->max_degraded
	- Remove changes in handle_stripe()

When log_stripe() fails in ops_run_io() we keep write to the parity
disk causing parity to get updated as if the write succeeded.
shouldn't we fail if the journal is down? am I missing something here?
Thanks in advance for reviewing.

Wrote a script for showcasing the issue.

#!/bin/bash

set -e

RAID_DEV="/dev/md127"
DATA_OFFSET=32

# Arrays to store disk states
declare -a BEFORE_BYTES
declare -a DISK_NAMES

cleanup() {
      mdadm --stop $RAID_DEV 2>/dev/null || true
      dmsetup remove disk0-flakey disk1-flakey journal-flakey 2>/dev/null || true
      for i in {10..15}; do
          losetup -d /dev/loop$i 2>/dev/null || true
      done
      rm -f /tmp/disk*.img 2>/dev/null || true
}

# Function to read first byte from device at offset
read_first_byte() {
      local device=$1
      local offset=$2
      dd if=$device bs=32k skip=$offset count=4 iflag=direct status=none | head -c 1 | xxd -p
}

# Function to calculate which disk holds parity for a given stripe
# RAID5 left-symmetric algorithm (default)
get_parity_disk() {
      local stripe_num=$1
      local n_disks=$2
      local pd_idx=$((($n_disks - 1) - ($stripe_num % $n_disks)))
      echo $pd_idx
}

cleanup
echo "=== RAID5 Parity Bug Test ==="
echo

# Create backing files
for i in {0..5}; do
      dd if=/dev/zero of=/tmp/disk$i.img bs=1M count=100 status=none
      losetup /dev/loop$((10+i)) /tmp/disk$i.img
done

SIZE=$(blockdev --getsz /dev/loop10)

# Create normal linear targets first
dmsetup create disk0-flakey --table "0 $SIZE linear /dev/loop10 0"
dmsetup create disk1-flakey --table "0 $SIZE linear /dev/loop11 0"
dmsetup create journal-flakey --table "0 $SIZE linear /dev/loop15 0"

# Create RAID5 using the dm devices
echo "1. Creating RAID5 array..."
mdadm --create $RAID_DEV --chunk=32K --level=5 --raid-devices=5 \
      /dev/mapper/disk0-flakey \
      /dev/mapper/disk1-flakey \
      /dev/loop12 /dev/loop13 /dev/loop14 \
      --write-journal /dev/mapper/journal-flakey \
      --assume-clean --force

echo "write-through" > /sys/block/md127/md/journal_mode
echo 0 > /sys/block/md127/md/safe_mode_delay

# Write test pattern
echo "2. Writing test pattern..."
for i in 0 1 2 3; do
      VAL=$((1 << i))
      echo "VAL:$VAL"
      perl -e "print chr($VAL) x 32768" | dd of=$RAID_DEV bs=32k count=1 seek=$i oflag=direct status=none
done
sync
sleep 1  # Give time for writes to settle

echo "3. Reading disk states before failure..."

# Calculate parity disk for stripe 0 (first 32k chunk)
STRIPE_NUM=0
N_DISKS=5
PARITY_INDEX=$(get_parity_disk $STRIPE_NUM $N_DISKS)
echo "Calculated parity disk index for stripe $STRIPE_NUM: $PARITY_INDEX"

# Map RAID device index to loop device
PARITY_DISK=$((10 + $PARITY_INDEX))
echo "Parity is on loop$PARITY_DISK"
echo

for i in {10..14}; do
      # Read first byte from device
      BYTE=$(read_first_byte /dev/loop$i $DATA_OFFSET)
      BEFORE_BYTES[$i]=$BYTE
      DISK_NAMES[$i]="loop$i"
           echo -n "loop$i: 0x$BYTE"
      if [ "$i" = "$PARITY_DISK" ]; then
          echo " <-- PARITY disk"
      else
          echo
      fi
done

echo
echo "4. Fail the first disk..."

dmsetup suspend disk0-flakey
dmsetup reload disk0-flakey --table "0 $SIZE flakey /dev/loop10 0 0 4294967295 2 error_reads error_writes"
dmsetup resume disk0-flakey

perl -e "print chr(4) x 32768" | dd of=$RAID_DEV bs=32k count=1 seek=2 oflag=direct status=none
sync
sleep 1

dmsetup suspend journal-flakey
dmsetup reload journal-flakey --table "0 $SIZE flakey /dev/loop15 0 0 4294967295 2 error_reads error_writes"
dmsetup resume journal-flakey

dmsetup suspend disk1-flakey
dmsetup reload disk1-flakey --table "0 $SIZE flakey /dev/loop11 0 0 4294967295 2 error_reads error_writes"
dmsetup resume disk1-flakey

echo "5. Attempting write (should fail the 2nd disk and the journal)..."
dd if=/dev/zero of=$RAID_DEV bs=32k count=1 seek=0 oflag=direct 2>&1 || echo "Write failed (expected)"
sync
sleep 1

echo
echo "6. Checking if parity was incorrectly updated:"
CHANGED=0
for i in {10..14}; do
      # Read current state from device
      BYTE_AFTER=$(read_first_byte /dev/loop$i $DATA_OFFSET)
      BYTE_BEFORE=${BEFORE_BYTES[$i]}

      if [ "$BYTE_BEFORE" != "$BYTE_AFTER" ]; then
          echo "*** loop$i CHANGED: 0x$BYTE_BEFORE -> 0x$BYTE_AFTER ***"
          CHANGED=$((CHANGED + 1))

          if [ "$i" = "$PARITY_DISK" ]; then
              echo "  ^^ PARITY WAS UPDATED - BUG DETECTED!"
          fi
      else
          echo "loop$i unchanged: 0x$BYTE_BEFORE"
      fi
done

echo
echo "RESULT:"
if [ $CHANGED -gt 0 ]; then
      echo "*** BUG DETECTED: $CHANGED disk(s) changed despite journal failure ***"
else
      echo "✓ GOOD: No disks changed
fi

cleanup 

 drivers/md/raid5.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 023649fe2476..509ab5673cf8 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1146,8 +1146,21 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 
 	might_sleep();
 
-	if (log_stripe(sh, s) == 0)
+	if (log_stripe(sh, s) == 0) {
+		/* Successfully logged to journal */
 		return;
+	}
+
+	if (conf->log && r5l_log_disk_error(conf)) {
+		/*
+		 * Journal device failed. Only abort writes if we have
+		 * too many failed devices to maintain consistency.
+		 */
+		if (s->failed > conf->max_degraded && (s->to_write || s->written)) {
+			set_bit(STRIPE_HANDLE, &sh->state);
+			return;
+		}
+	}
 
 	should_defer = conf->batch_bio_dispatch && conf->group_cnt;
 
@@ -3672,6 +3685,10 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 		 * still be locked - so just clear all R5_LOCKED flags
 		 */
 		clear_bit(R5_LOCKED, &sh->dev[i].flags);
+		clear_bit(R5_Wantwrite, &sh->dev[i].flags);
+		clear_bit(R5_Wantcompute, &sh->dev[i].flags);
+		clear_bit(R5_WantFUA, &sh->dev[i].flags);
+		clear_bit(R5_Wantdrain, &sh->dev[i].flags);
 	}
 	s->to_write = 0;
 	s->written = 0;
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 2+ messages in thread

* Re: [PATCH v2] md/raid5: fix parity corruption on journal failure
  2025-08-16 12:28 [PATCH v2] md/raid5: fix parity corruption on journal failure Meir Elisha
@ 2025-08-18  7:40 ` Yu Kuai
  0 siblings, 0 replies; 2+ messages in thread
From: Yu Kuai @ 2025-08-18  7:40 UTC (permalink / raw)
  To: Meir Elisha, Song Liu; +Cc: linux-raid, yukuai (C)

Hi,

在 2025/08/16 20:28, Meir Elisha 写道:
> When operating in write-through journal mode, a journal device failure
> can lead to parity corruption and silent data loss.
> This occurs because the current implementation continues to update
> parity even when journal writes fail, violating the write-through
> consistency guarantee.
> 
> Signed-off-by: Meir Elisha <meir.elisha@volumez.com>
> ---
> Changes in v2:
> 	- Drop writes only when s->failed > conf->max_degraded
> 	- Remove changes in handle_stripe()
> 
> When log_stripe() fails in ops_run_io() we keep write to the parity
> disk causing parity to get updated as if the write succeeded.
> shouldn't we fail if the journal is down? am I missing something here?
> Thanks in advance for reviewing.
> 
> Wrote a script for showcasing the issue.
> 
> #!/bin/bash
> 
> set -e
> 
> RAID_DEV="/dev/md127"
> DATA_OFFSET=32
> 
> # Arrays to store disk states
> declare -a BEFORE_BYTES
> declare -a DISK_NAMES
> 
> cleanup() {
>        mdadm --stop $RAID_DEV 2>/dev/null || true
>        dmsetup remove disk0-flakey disk1-flakey journal-flakey 2>/dev/null || true
>        for i in {10..15}; do
>            losetup -d /dev/loop$i 2>/dev/null || true
>        done
>        rm -f /tmp/disk*.img 2>/dev/null || true
> }
> 
> # Function to read first byte from device at offset
> read_first_byte() {
>        local device=$1
>        local offset=$2
>        dd if=$device bs=32k skip=$offset count=4 iflag=direct status=none | head -c 1 | xxd -p
> }
> 
> # Function to calculate which disk holds parity for a given stripe
> # RAID5 left-symmetric algorithm (default)
> get_parity_disk() {
>        local stripe_num=$1
>        local n_disks=$2
>        local pd_idx=$((($n_disks - 1) - ($stripe_num % $n_disks)))
>        echo $pd_idx
> }
> 
> cleanup
> echo "=== RAID5 Parity Bug Test ==="
> echo
> 
> # Create backing files
> for i in {0..5}; do
>        dd if=/dev/zero of=/tmp/disk$i.img bs=1M count=100 status=none
>        losetup /dev/loop$((10+i)) /tmp/disk$i.img
> done
> 
> SIZE=$(blockdev --getsz /dev/loop10)
> 
> # Create normal linear targets first
> dmsetup create disk0-flakey --table "0 $SIZE linear /dev/loop10 0"
> dmsetup create disk1-flakey --table "0 $SIZE linear /dev/loop11 0"
> dmsetup create journal-flakey --table "0 $SIZE linear /dev/loop15 0"
> 
> # Create RAID5 using the dm devices
> echo "1. Creating RAID5 array..."
> mdadm --create $RAID_DEV --chunk=32K --level=5 --raid-devices=5 \
>        /dev/mapper/disk0-flakey \
>        /dev/mapper/disk1-flakey \
>        /dev/loop12 /dev/loop13 /dev/loop14 \
>        --write-journal /dev/mapper/journal-flakey \
>        --assume-clean --force
> 
> echo "write-through" > /sys/block/md127/md/journal_mode
> echo 0 > /sys/block/md127/md/safe_mode_delay
> 
> # Write test pattern
> echo "2. Writing test pattern..."
> for i in 0 1 2 3; do
>        VAL=$((1 << i))
>        echo "VAL:$VAL"
>        perl -e "print chr($VAL) x 32768" | dd of=$RAID_DEV bs=32k count=1 seek=$i oflag=direct status=none
> done
> sync
> sleep 1  # Give time for writes to settle
> 
> echo "3. Reading disk states before failure..."
> 
> # Calculate parity disk for stripe 0 (first 32k chunk)
> STRIPE_NUM=0
> N_DISKS=5
> PARITY_INDEX=$(get_parity_disk $STRIPE_NUM $N_DISKS)
> echo "Calculated parity disk index for stripe $STRIPE_NUM: $PARITY_INDEX"
> 
> # Map RAID device index to loop device
> PARITY_DISK=$((10 + $PARITY_INDEX))
> echo "Parity is on loop$PARITY_DISK"
> echo
> 
> for i in {10..14}; do
>        # Read first byte from device
>        BYTE=$(read_first_byte /dev/loop$i $DATA_OFFSET)
>        BEFORE_BYTES[$i]=$BYTE
>        DISK_NAMES[$i]="loop$i"
>             echo -n "loop$i: 0x$BYTE"
>        if [ "$i" = "$PARITY_DISK" ]; then
>            echo " <-- PARITY disk"
>        else
>            echo
>        fi
> done
> 
> echo
> echo "4. Fail the first disk..."
> 
> dmsetup suspend disk0-flakey
> dmsetup reload disk0-flakey --table "0 $SIZE flakey /dev/loop10 0 0 4294967295 2 error_reads error_writes"
> dmsetup resume disk0-flakey
> 
> perl -e "print chr(4) x 32768" | dd of=$RAID_DEV bs=32k count=1 seek=2 oflag=direct status=none
> sync
> sleep 1
> 
> dmsetup suspend journal-flakey
> dmsetup reload journal-flakey --table "0 $SIZE flakey /dev/loop15 0 0 4294967295 2 error_reads error_writes"
> dmsetup resume journal-flakey
> 
> dmsetup suspend disk1-flakey
> dmsetup reload disk1-flakey --table "0 $SIZE flakey /dev/loop11 0 0 4294967295 2 error_reads error_writes"
> dmsetup resume disk1-flakey
> 
> echo "5. Attempting write (should fail the 2nd disk and the journal)..."
> dd if=/dev/zero of=$RAID_DEV bs=32k count=1 seek=0 oflag=direct 2>&1 || echo "Write failed (expected)"
> sync
> sleep 1
> 
> echo
> echo "6. Checking if parity was incorrectly updated:"
> CHANGED=0
> for i in {10..14}; do
>        # Read current state from device
>        BYTE_AFTER=$(read_first_byte /dev/loop$i $DATA_OFFSET)
>        BYTE_BEFORE=${BEFORE_BYTES[$i]}
> 
>        if [ "$BYTE_BEFORE" != "$BYTE_AFTER" ]; then
>            echo "*** loop$i CHANGED: 0x$BYTE_BEFORE -> 0x$BYTE_AFTER ***"
>            CHANGED=$((CHANGED + 1))
> 
>            if [ "$i" = "$PARITY_DISK" ]; then
>                echo "  ^^ PARITY WAS UPDATED - BUG DETECTED!"
>            fi
>        else
>            echo "loop$i unchanged: 0x$BYTE_BEFORE"
>        fi
> done
> 
> echo
> echo "RESULT:"
> if [ $CHANGED -gt 0 ]; then
>        echo "*** BUG DETECTED: $CHANGED disk(s) changed despite journal failure ***"
> else
>        echo "✓ GOOD: No disks changed
> fi
> 
> cleanup
> 
>   drivers/md/raid5.c | 19 ++++++++++++++++++-
>   1 file changed, 18 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index 023649fe2476..509ab5673cf8 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -1146,8 +1146,21 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
>   
>   	might_sleep();
>   
> -	if (log_stripe(sh, s) == 0)
> +	if (log_stripe(sh, s) == 0) {
> +		/* Successfully logged to journal */
>   		return;
> +	}

No need to add braces here, just place the comment above if.
> +
> +	if (conf->log && r5l_log_disk_error(conf)) {
> +		/*
> +		 * Journal device failed. Only abort writes if we have
> +		 * too many failed devices to maintain consistency.
> +		 */

Same here, and please use just one if.

> +		if (s->failed > conf->max_degraded && (s->to_write || s->written)) {
> +			set_bit(STRIPE_HANDLE, &sh->state);
> +			return;
> +		}
> +	}
>   
>   	should_defer = conf->batch_bio_dispatch && conf->group_cnt;
>   
> @@ -3672,6 +3685,10 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
>   		 * still be locked - so just clear all R5_LOCKED flags
>   		 */
>   		clear_bit(R5_LOCKED, &sh->dev[i].flags);
> +		clear_bit(R5_Wantwrite, &sh->dev[i].flags);
> +		clear_bit(R5_Wantcompute, &sh->dev[i].flags);
> +		clear_bit(R5_WantFUA, &sh->dev[i].flags);
> +		clear_bit(R5_Wantdrain, &sh->dev[i].flags);

And please explain why you're clearing these flags.

Thanks,
Kuai

>   	}
>   	s->to_write = 0;
>   	s->written = 0;
> 


^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2025-08-18  7:40 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-08-16 12:28 [PATCH v2] md/raid5: fix parity corruption on journal failure Meir Elisha
2025-08-18  7:40 ` Yu Kuai

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).