linux-raid.vger.kernel.org archive mirror
* [PATCH v2] md/raid1,raid10: don't handle IO error for REQ_RAHEAD and REQ_NOWAIT
@ 2025-05-27  8:14 Yu Kuai
  2025-05-27  8:24 ` Yu Kuai
  2025-05-30  6:43 ` Yu Kuai
  0 siblings, 2 replies; 5+ messages in thread
From: Yu Kuai @ 2025-05-27  8:14 UTC (permalink / raw)
  To: mpatocka, zdenek.kabelac, song, yukuai3
  Cc: linux-raid, linux-kernel, yukuai1, yi.zhang, yangerkun,
	johnny.chenyi

From: Yu Kuai <yukuai3@huawei.com>

IO with REQ_RAHEAD or REQ_NOWAIT can fail early, even if the storage medium
is fine, hence recording badblocks or removing the disk from the array does
not make sense.

This problem is found by the lvm2 test lvcreate-large-raid, where dm-zero
fails read-ahead IO directly.

Reported-and-tested-by: Mikulas Patocka <mpatocka@redhat.com>
Closes: https://lore.kernel.org/all/34fa755d-62c8-4588-8ee1-33cb1249bdf2@redhat.com/
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
Changes in v2:
 - handle REQ_NOWAIT as well.

 drivers/md/raid1-10.c | 10 ++++++++++
 drivers/md/raid1.c    | 19 ++++++++++---------
 drivers/md/raid10.c   | 11 ++++++-----
 3 files changed, 26 insertions(+), 14 deletions(-)
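
For reference, the dm-zero behaviour the commit message refers to is roughly
the following. This is a paraphrased sketch from memory, not an exact quote of
drivers/md/dm-zero.c, and it is not part of the patch:

/*
 * Sketch of dm-zero's map callback: read-ahead reads are failed on purpose,
 * even though the "medium" (all zeroes) is perfectly healthy.
 */
static int zero_map(struct dm_target *ti, struct bio *bio)
{
	switch (bio_op(bio)) {
	case REQ_OP_READ:
		if (bio->bi_opf & REQ_RAHEAD)
			return DM_MAPIO_KILL;	/* fail read-ahead early */
		zero_fill_bio(bio);
		break;
	case REQ_OP_WRITE:
		break;			/* writes are silently dropped */
	default:
		return DM_MAPIO_KILL;
	}

	bio_endio(bio);
	return DM_MAPIO_SUBMITTED;
}

REQ_NOWAIT is similar in spirit: a nowait bio that would have to block is
typically completed with bio_wouldblock_error(), i.e. BLK_STS_AGAIN, without
ever touching the disk. In both cases the bio ends with an error while the
underlying device is healthy, which is why raid1/raid10 should neither record
badblocks nor kick the rdev for it.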

diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c
index c7efd8aab675..b8b3a9069701 100644
--- a/drivers/md/raid1-10.c
+++ b/drivers/md/raid1-10.c
@@ -293,3 +293,13 @@ static inline bool raid1_should_read_first(struct mddev *mddev,
 
 	return false;
 }
+
+/*
+ * bio with REQ_RAHEAD or REQ_NOWAIT can fail at any time, before such IO is
+ * submitted to the underlying disks, hence don't record badblocks or retry
+ * in this case.
+ */
+static inline bool raid1_should_handle_error(struct bio *bio)
+{
+	return !(bio->bi_opf & (REQ_RAHEAD | REQ_NOWAIT));
+}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 657d481525be..19c5a0ce5a40 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -373,14 +373,16 @@ static void raid1_end_read_request(struct bio *bio)
 	 */
 	update_head_pos(r1_bio->read_disk, r1_bio);
 
-	if (uptodate)
+	if (uptodate) {
 		set_bit(R1BIO_Uptodate, &r1_bio->state);
-	else if (test_bit(FailFast, &rdev->flags) &&
-		 test_bit(R1BIO_FailFast, &r1_bio->state))
+	} else if (test_bit(FailFast, &rdev->flags) &&
+		 test_bit(R1BIO_FailFast, &r1_bio->state)) {
 		/* This was a fail-fast read so we definitely
 		 * want to retry */
 		;
-	else {
+	} else if (!raid1_should_handle_error(bio)) {
+		uptodate = 1;
+	} else {
 		/* If all other devices have failed, we want to return
 		 * the error upwards rather than fail the last device.
 		 * Here we redefine "uptodate" to mean "Don't want to retry"
@@ -451,16 +453,15 @@ static void raid1_end_write_request(struct bio *bio)
 	struct bio *to_put = NULL;
 	int mirror = find_bio_disk(r1_bio, bio);
 	struct md_rdev *rdev = conf->mirrors[mirror].rdev;
-	bool discard_error;
 	sector_t lo = r1_bio->sector;
 	sector_t hi = r1_bio->sector + r1_bio->sectors;
-
-	discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;
+	bool ignore_error = !raid1_should_handle_error(bio) ||
+		(bio->bi_status && bio_op(bio) == REQ_OP_DISCARD);
 
 	/*
 	 * 'one mirror IO has finished' event handler:
 	 */
-	if (bio->bi_status && !discard_error) {
+	if (bio->bi_status && !ignore_error) {
 		set_bit(WriteErrorSeen,	&rdev->flags);
 		if (!test_and_set_bit(WantReplacement, &rdev->flags))
 			set_bit(MD_RECOVERY_NEEDED, &
@@ -511,7 +512,7 @@ static void raid1_end_write_request(struct bio *bio)
 
 		/* Maybe we can clear some bad blocks. */
 		if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) &&
-		    !discard_error) {
+		    !ignore_error) {
 			r1_bio->bios[mirror] = IO_MADE_GOOD;
 			set_bit(R1BIO_MadeGood, &r1_bio->state);
 		}
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index dce06bf65016..b74780af4c22 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -399,6 +399,8 @@ static void raid10_end_read_request(struct bio *bio)
 		 * wait for the 'master' bio.
 		 */
 		set_bit(R10BIO_Uptodate, &r10_bio->state);
+	} else if (!raid1_should_handle_error(bio)) {
+		uptodate = 1;
 	} else {
 		/* If all other devices that store this block have
 		 * failed, we want to return the error upwards rather
@@ -456,9 +458,8 @@ static void raid10_end_write_request(struct bio *bio)
 	int slot, repl;
 	struct md_rdev *rdev = NULL;
 	struct bio *to_put = NULL;
-	bool discard_error;
-
-	discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;
+	bool ignore_error = !raid1_should_handle_error(bio) ||
+		(bio->bi_status && bio_op(bio) == REQ_OP_DISCARD);
 
 	dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
 
@@ -472,7 +473,7 @@ static void raid10_end_write_request(struct bio *bio)
 	/*
 	 * this branch is our 'one mirror IO has finished' event handler:
 	 */
-	if (bio->bi_status && !discard_error) {
+	if (bio->bi_status && !ignore_error) {
 		if (repl)
 			/* Never record new bad blocks to replacement,
 			 * just fail it.
@@ -527,7 +528,7 @@ static void raid10_end_write_request(struct bio *bio)
 		/* Maybe we can clear some bad blocks. */
 		if (rdev_has_badblock(rdev, r10_bio->devs[slot].addr,
 				      r10_bio->sectors) &&
-		    !discard_error) {
+		    !ignore_error) {
 			bio_put(bio);
 			if (repl)
 				r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
-- 
2.39.2



* Re: [PATCH v2] md/raid1,raid10: don't handle IO error for REQ_RAHEAD and REQ_NOWAIT
  2025-05-27  8:14 [PATCH v2] md/raid1,raid10: don't handle IO error for REQ_RAHEAD and REQ_NOWAIT Yu Kuai
@ 2025-05-27  8:24 ` Yu Kuai
  2025-05-28  1:28   ` Xiao Ni
  2025-05-30  6:43 ` Yu Kuai
  1 sibling, 1 reply; 5+ messages in thread
From: Yu Kuai @ 2025-05-27  8:24 UTC (permalink / raw)
  To: Yu Kuai, mpatocka, zdenek.kabelac, song
  Cc: linux-raid, linux-kernel, yi.zhang, yangerkun, johnny.chenyi,
	yukuai (C)

Hi, Zdenek Kabelac

On 2025/05/27 16:14, Yu Kuai wrote:
> From: Yu Kuai <yukuai3@huawei.com>
> 
> IO with REQ_RAHEAD or REQ_NOWAIT can fail early, even if the storage medium
> is fine, hence recording badblocks or removing the disk from the array does
> not make sense.
> 
> This problem is found by the lvm2 test lvcreate-large-raid, where dm-zero
> fails read-ahead IO directly.
> 
> Reported-and-tested-by: Mikulas Patocka <mpatocka@redhat.com>
> Closes: https://lore.kernel.org/all/34fa755d-62c8-4588-8ee1-33cb1249bdf2@redhat.com/
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> ---
> Changes in v2:
>   - handle REQ_NOWAIT as well.
> 
>   drivers/md/raid1-10.c | 10 ++++++++++
>   drivers/md/raid1.c    | 19 ++++++++++---------
>   drivers/md/raid10.c   | 11 ++++++-----
>   3 files changed, 26 insertions(+), 14 deletions(-)
> 

Just to let you know that while testing lvcreate-large-raid, the test
can sometimes fail:

[ 0:12.021] ## DEBUG0: 08:11:43.596775 lvcreate[8576] 
device_mapper/ioctl/libdm-iface.c:2118  device-mapper: create ioctl on 
LVMTEST8371vg1-LV1_rmeta_0 
LVM-iqJjW9HItbME2d4DC2S7D58zd2omecjf0yQN83foinyxHaPoZqGEnX4rRUN7i0kH 
failed: Device or resource busy

And with debug info added in the kernel:
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 4165fef4c170..04b66b804365 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -263,6 +263,7 @@ static int dm_hash_insert(const char *name, const 
char *uuid, struct mapped_devi
  {
         struct hash_cell *cell, *hc;

+       printk("%s: %s %s\n", __func__, name, uuid);
         /*
          * Allocate the new cells.
          */
@@ -310,6 +311,7 @@ static struct dm_table *__hash_remove(struct 
hash_cell *hc)
         struct dm_table *table;
         int srcu_idx;

+       printk("%s: %s %s\n", __func__, hc->name, hc->uuid);
         lockdep_assert_held(&_hash_lock);

         /* remove from the dev trees */
@@ -882,18 +884,23 @@ static int dev_create(struct file *filp, struct 
dm_ioctl *param, size_t param_si
         struct mapped_device *md;

         r = check_name(param->name);
-       if (r)
+       if (r) {
+               printk("%s: check_name failed %d\n", __func__, r);
                 return r;
+       }

         if (param->flags & DM_PERSISTENT_DEV_FLAG)
                 m = MINOR(huge_decode_dev(param->dev));

         r = dm_create(m, &md);
-       if (r)
+       if (r) {
+               printk("%s: dm_create failed %d\n", __func__, r);
                 return r;
+       }

         r = dm_hash_insert(param->name, *param->uuid ? param->uuid : 
NULL, md);
         if (r) {
+               printk("%s: dm_hash_insert failed %d\n", __func__, r);
                 dm_put(md);
                 dm_destroy(md);
                 return r;

I got:
[ 2265.277214] dm_hash_insert: LVMTEST8371vg1-LV1_rmeta_0 
LVM-iqJjW9HItbME2d4DC2S7D58zd2omecjf0yQN83foinyxHaPoZqGEnX4rRUN7i0kH
[ 2265.279666] dm_hash_insert: LVMTEST8371vg1-LV1_rmeta_0 
LVM-iqJjW9HItbME2d4DC2S7D58zd2omecjf0yQN83foinyxHaPoZqGEnX4rRUN7i0kH
[ 2265.281043] dev_create: dm_hash_insert failed -16
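
For context, -16 is -EBUSY, which dev_create() gets from dm_hash_insert() when
the requested name or uuid is already registered. A rough paraphrase of that
path (from memory, details may differ from the current dm-ioctl.c):

static int dm_hash_insert(const char *name, const char *uuid,
			  struct mapped_device *md)
{
	struct hash_cell *cell, *hc;

	cell = alloc_cell(name, uuid, md);
	if (!cell)
		return -ENOMEM;

	down_write(&_hash_lock);
	hc = __get_name_cell(name);		/* name already registered? */
	if (hc) {
		dm_put(hc->md);
		goto bad;
	}
	if (uuid) {
		hc = __get_uuid_cell(uuid);	/* uuid already registered? */
		if (hc) {
			dm_put(hc->md);
			goto bad;
		}
	}
	/* ... link the new cell into both hash tables ... */
	up_write(&_hash_lock);
	return 0;

bad:
	up_write(&_hash_lock);
	free_cell(cell);
	return -EBUSY;
}

So the two dm_hash_insert lines above are two separate create attempts for the
same name/uuid, and the second one is refused because the first instance is
still registered.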

Looks like the test somehow inserts the same target twice? Is this a
known issue?

Thanks,
Kuai

> diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c
> index c7efd8aab675..b8b3a9069701 100644
> --- a/drivers/md/raid1-10.c
> +++ b/drivers/md/raid1-10.c
> @@ -293,3 +293,13 @@ static inline bool raid1_should_read_first(struct mddev *mddev,
>   
>   	return false;
>   }
> +
> +/*
> + * bio with REQ_RAHEAD or REQ_NOWAIT can fail at any time, before such IO is
> + * submitted to the underlying disks, hence don't record badblocks or retry
> + * in this case.
> + */
> +static inline bool raid1_should_handle_error(struct bio *bio)
> +{
> +	return !(bio->bi_opf & (REQ_RAHEAD | REQ_NOWAIT));
> +}
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index 657d481525be..19c5a0ce5a40 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -373,14 +373,16 @@ static void raid1_end_read_request(struct bio *bio)
>   	 */
>   	update_head_pos(r1_bio->read_disk, r1_bio);
>   
> -	if (uptodate)
> +	if (uptodate) {
>   		set_bit(R1BIO_Uptodate, &r1_bio->state);
> -	else if (test_bit(FailFast, &rdev->flags) &&
> -		 test_bit(R1BIO_FailFast, &r1_bio->state))
> +	} else if (test_bit(FailFast, &rdev->flags) &&
> +		 test_bit(R1BIO_FailFast, &r1_bio->state)) {
>   		/* This was a fail-fast read so we definitely
>   		 * want to retry */
>   		;
> -	else {
> +	} else if (!raid1_should_handle_error(bio)) {
> +		uptodate = 1;
> +	} else {
>   		/* If all other devices have failed, we want to return
>   		 * the error upwards rather than fail the last device.
>   		 * Here we redefine "uptodate" to mean "Don't want to retry"
> @@ -451,16 +453,15 @@ static void raid1_end_write_request(struct bio *bio)
>   	struct bio *to_put = NULL;
>   	int mirror = find_bio_disk(r1_bio, bio);
>   	struct md_rdev *rdev = conf->mirrors[mirror].rdev;
> -	bool discard_error;
>   	sector_t lo = r1_bio->sector;
>   	sector_t hi = r1_bio->sector + r1_bio->sectors;
> -
> -	discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;
> +	bool ignore_error = !raid1_should_handle_error(bio) ||
> +		(bio->bi_status && bio_op(bio) == REQ_OP_DISCARD);
>   
>   	/*
>   	 * 'one mirror IO has finished' event handler:
>   	 */
> -	if (bio->bi_status && !discard_error) {
> +	if (bio->bi_status && !ignore_error) {
>   		set_bit(WriteErrorSeen,	&rdev->flags);
>   		if (!test_and_set_bit(WantReplacement, &rdev->flags))
>   			set_bit(MD_RECOVERY_NEEDED, &
> @@ -511,7 +512,7 @@ static void raid1_end_write_request(struct bio *bio)
>   
>   		/* Maybe we can clear some bad blocks. */
>   		if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) &&
> -		    !discard_error) {
> +		    !ignore_error) {
>   			r1_bio->bios[mirror] = IO_MADE_GOOD;
>   			set_bit(R1BIO_MadeGood, &r1_bio->state);
>   		}
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index dce06bf65016..b74780af4c22 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -399,6 +399,8 @@ static void raid10_end_read_request(struct bio *bio)
>   		 * wait for the 'master' bio.
>   		 */
>   		set_bit(R10BIO_Uptodate, &r10_bio->state);
> +	} else if (!raid1_should_handle_error(bio)) {
> +		uptodate = 1;
>   	} else {
>   		/* If all other devices that store this block have
>   		 * failed, we want to return the error upwards rather
> @@ -456,9 +458,8 @@ static void raid10_end_write_request(struct bio *bio)
>   	int slot, repl;
>   	struct md_rdev *rdev = NULL;
>   	struct bio *to_put = NULL;
> -	bool discard_error;
> -
> -	discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;
> +	bool ignore_error = !raid1_should_handle_error(bio) ||
> +		(bio->bi_status && bio_op(bio) == REQ_OP_DISCARD);
>   
>   	dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
>   
> @@ -472,7 +473,7 @@ static void raid10_end_write_request(struct bio *bio)
>   	/*
>   	 * this branch is our 'one mirror IO has finished' event handler:
>   	 */
> -	if (bio->bi_status && !discard_error) {
> +	if (bio->bi_status && !ignore_error) {
>   		if (repl)
>   			/* Never record new bad blocks to replacement,
>   			 * just fail it.
> @@ -527,7 +528,7 @@ static void raid10_end_write_request(struct bio *bio)
>   		/* Maybe we can clear some bad blocks. */
>   		if (rdev_has_badblock(rdev, r10_bio->devs[slot].addr,
>   				      r10_bio->sectors) &&
> -		    !discard_error) {
> +		    !ignore_error) {
>   			bio_put(bio);
>   			if (repl)
>   				r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
> 



* Re: [PATCH v2] md/raid1,raid10: don't handle IO error for REQ_RAHEAD and REQ_NOWAIT
  2025-05-27  8:24 ` Yu Kuai
@ 2025-05-28  1:28   ` Xiao Ni
  2025-05-28  9:27     ` Zdenek Kabelac
  0 siblings, 1 reply; 5+ messages in thread
From: Xiao Ni @ 2025-05-28  1:28 UTC (permalink / raw)
  To: Yu Kuai
  Cc: mpatocka, zdenek.kabelac, song, linux-raid, linux-kernel,
	yi.zhang, yangerkun, johnny.chenyi, yukuai (C), Heinz Mauelshagen

CC Heinz who is a dm-raid expert. But he is on holiday this week.

On Tue, May 27, 2025 at 4:24 PM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>
> Hi, Zdenek Kabelac
>
> On 2025/05/27 16:14, Yu Kuai wrote:
> > From: Yu Kuai <yukuai3@huawei.com>
> >
> > IO with REQ_RAHEAD or REQ_NOWAIT can fail early, even if the storage medium
> > is fine, hence recording badblocks or removing the disk from the array does
> > not make sense.
> >
> > This problem is found by the lvm2 test lvcreate-large-raid, where dm-zero
> > fails read-ahead IO directly.
> >
> > Reported-and-tested-by: Mikulas Patocka <mpatocka@redhat.com>
> > Closes: https://lore.kernel.org/all/34fa755d-62c8-4588-8ee1-33cb1249bdf2@redhat.com/
> > Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> > ---
> > Changes in v2:
> >   - handle REQ_NOWAIT as well.
> >
> >   drivers/md/raid1-10.c | 10 ++++++++++
> >   drivers/md/raid1.c    | 19 ++++++++++---------
> >   drivers/md/raid10.c   | 11 ++++++-----
> >   3 files changed, 26 insertions(+), 14 deletions(-)
> >
>
> Just to let you know that while testing lvcreate-large-raid, the test
> can sometimes fail:
>
> [ 0:12.021] ## DEBUG0: 08:11:43.596775 lvcreate[8576]
> device_mapper/ioctl/libdm-iface.c:2118  device-mapper: create ioctl on
> LVMTEST8371vg1-LV1_rmeta_0
> LVM-iqJjW9HItbME2d4DC2S7D58zd2omecjf0yQN83foinyxHaPoZqGEnX4rRUN7i0kH
> failed: Device or resource busy
>
> And with debug info added in the kernel:
> diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
> index 4165fef4c170..04b66b804365 100644
> --- a/drivers/md/dm-ioctl.c
> +++ b/drivers/md/dm-ioctl.c
> @@ -263,6 +263,7 @@ static int dm_hash_insert(const char *name, const
> char *uuid, struct mapped_devi
>   {
>          struct hash_cell *cell, *hc;
>
> +       printk("%s: %s %s\n", __func__, name, uuid);
>          /*
>           * Allocate the new cells.
>           */
> @@ -310,6 +311,7 @@ static struct dm_table *__hash_remove(struct
> hash_cell *hc)
>          struct dm_table *table;
>          int srcu_idx;
>
> +       printk("%s: %s %s\n", __func__, hc->name, hc->uuid);
>          lockdep_assert_held(&_hash_lock);
>
>          /* remove from the dev trees */
> @@ -882,18 +884,23 @@ static int dev_create(struct file *filp, struct
> dm_ioctl *param, size_t param_si
>          struct mapped_device *md;
>
>          r = check_name(param->name);
> -       if (r)
> +       if (r) {
> +               printk("%s: check_name failed %d\n", __func__, r);
>                  return r;
> +       }
>
>          if (param->flags & DM_PERSISTENT_DEV_FLAG)
>                  m = MINOR(huge_decode_dev(param->dev));
>
>          r = dm_create(m, &md);
> -       if (r)
> +       if (r) {
> +               printk("%s: dm_create failed %d\n", __func__, r);
>                  return r;
> +       }
>
>          r = dm_hash_insert(param->name, *param->uuid ? param->uuid :
> NULL, md);
>          if (r) {
> +               printk("%s: dm_hash_insert failed %d\n", __func__, r);
>                  dm_put(md);
>                  dm_destroy(md);
>                  return r;
>
> I got:
> [ 2265.277214] dm_hash_insert: LVMTEST8371vg1-LV1_rmeta_0
> LVM-iqJjW9HItbME2d4DC2S7D58zd2omecjf0yQN83foinyxHaPoZqGEnX4rRUN7i0kH
> [ 2265.279666] dm_hash_insert: LVMTEST8371vg1-LV1_rmeta_0
> LVM-iqJjW9HItbME2d4DC2S7D58zd2omecjf0yQN83foinyxHaPoZqGEnX4rRUN7i0kH
> [ 2265.281043] dev_create: dm_hash_insert failed -16
>
> Looks like the test somehow inserts the same target twice? Is this a
> known issue?
>
> Thanks,
> Kuai
>
> > diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c
> > index c7efd8aab675..b8b3a9069701 100644
> > --- a/drivers/md/raid1-10.c
> > +++ b/drivers/md/raid1-10.c
> > @@ -293,3 +293,13 @@ static inline bool raid1_should_read_first(struct mddev *mddev,
> >
> >       return false;
> >   }
> > +
> > +/*
> > + * bio with REQ_RAHEAD or REQ_NOWAIT can fail at any time, before such IO is
> > + * submitted to the underlying disks, hence don't record badblocks or retry
> > + * in this case.
> > + */
> > +static inline bool raid1_should_handle_error(struct bio *bio)
> > +{
> > +     return !(bio->bi_opf & (REQ_RAHEAD | REQ_NOWAIT));
> > +}
> > diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> > index 657d481525be..19c5a0ce5a40 100644
> > --- a/drivers/md/raid1.c
> > +++ b/drivers/md/raid1.c
> > @@ -373,14 +373,16 @@ static void raid1_end_read_request(struct bio *bio)
> >        */
> >       update_head_pos(r1_bio->read_disk, r1_bio);
> >
> > -     if (uptodate)
> > +     if (uptodate) {
> >               set_bit(R1BIO_Uptodate, &r1_bio->state);
> > -     else if (test_bit(FailFast, &rdev->flags) &&
> > -              test_bit(R1BIO_FailFast, &r1_bio->state))
> > +     } else if (test_bit(FailFast, &rdev->flags) &&
> > +              test_bit(R1BIO_FailFast, &r1_bio->state)) {
> >               /* This was a fail-fast read so we definitely
> >                * want to retry */
> >               ;
> > -     else {
> > +     } else if (!raid1_should_handle_error(bio)) {
> > +             uptodate = 1;
> > +     } else {
> >               /* If all other devices have failed, we want to return
> >                * the error upwards rather than fail the last device.
> >                * Here we redefine "uptodate" to mean "Don't want to retry"
> > @@ -451,16 +453,15 @@ static void raid1_end_write_request(struct bio *bio)
> >       struct bio *to_put = NULL;
> >       int mirror = find_bio_disk(r1_bio, bio);
> >       struct md_rdev *rdev = conf->mirrors[mirror].rdev;
> > -     bool discard_error;
> >       sector_t lo = r1_bio->sector;
> >       sector_t hi = r1_bio->sector + r1_bio->sectors;
> > -
> > -     discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;
> > +     bool ignore_error = !raid1_should_handle_error(bio) ||
> > +             (bio->bi_status && bio_op(bio) == REQ_OP_DISCARD);
> >
> >       /*
> >        * 'one mirror IO has finished' event handler:
> >        */
> > -     if (bio->bi_status && !discard_error) {
> > +     if (bio->bi_status && !ignore_error) {
> >               set_bit(WriteErrorSeen, &rdev->flags);
> >               if (!test_and_set_bit(WantReplacement, &rdev->flags))
> >                       set_bit(MD_RECOVERY_NEEDED, &
> > @@ -511,7 +512,7 @@ static void raid1_end_write_request(struct bio *bio)
> >
> >               /* Maybe we can clear some bad blocks. */
> >               if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) &&
> > -                 !discard_error) {
> > +                 !ignore_error) {
> >                       r1_bio->bios[mirror] = IO_MADE_GOOD;
> >                       set_bit(R1BIO_MadeGood, &r1_bio->state);
> >               }
> > diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> > index dce06bf65016..b74780af4c22 100644
> > --- a/drivers/md/raid10.c
> > +++ b/drivers/md/raid10.c
> > @@ -399,6 +399,8 @@ static void raid10_end_read_request(struct bio *bio)
> >                * wait for the 'master' bio.
> >                */
> >               set_bit(R10BIO_Uptodate, &r10_bio->state);
> > +     } else if (!raid1_should_handle_error(bio)) {
> > +             uptodate = 1;
> >       } else {
> >               /* If all other devices that store this block have
> >                * failed, we want to return the error upwards rather
> > @@ -456,9 +458,8 @@ static void raid10_end_write_request(struct bio *bio)
> >       int slot, repl;
> >       struct md_rdev *rdev = NULL;
> >       struct bio *to_put = NULL;
> > -     bool discard_error;
> > -
> > -     discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;
> > +     bool ignore_error = !raid1_should_handle_error(bio) ||
> > +             (bio->bi_status && bio_op(bio) == REQ_OP_DISCARD);
> >
> >       dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
> >
> > @@ -472,7 +473,7 @@ static void raid10_end_write_request(struct bio *bio)
> >       /*
> >        * this branch is our 'one mirror IO has finished' event handler:
> >        */
> > -     if (bio->bi_status && !discard_error) {
> > +     if (bio->bi_status && !ignore_error) {
> >               if (repl)
> >                       /* Never record new bad blocks to replacement,
> >                        * just fail it.
> > @@ -527,7 +528,7 @@ static void raid10_end_write_request(struct bio *bio)
> >               /* Maybe we can clear some bad blocks. */
> >               if (rdev_has_badblock(rdev, r10_bio->devs[slot].addr,
> >                                     r10_bio->sectors) &&
> > -                 !discard_error) {
> > +                 !ignore_error) {
> >                       bio_put(bio);
> >                       if (repl)
> >                               r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
> >
>
>



* Re: [PATCH v2] md/raid1,raid10: don't handle IO error for REQ_RAHEAD and REQ_NOWAIT
  2025-05-28  1:28   ` Xiao Ni
@ 2025-05-28  9:27     ` Zdenek Kabelac
  0 siblings, 0 replies; 5+ messages in thread
From: Zdenek Kabelac @ 2025-05-28  9:27 UTC (permalink / raw)
  To: Xiao Ni, Yu Kuai
  Cc: mpatocka, song, linux-raid, linux-kernel, yi.zhang, yangerkun,
	johnny.chenyi, yukuai (C), Heinz Mauelshagen

On 28. 05. 25 at 3:28, Xiao Ni wrote:
> CC Heinz who is a dm-raid expert. But he is on holiday this week.
> 
> On Tue, May 27, 2025 at 4:24 PM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>>
>> Hi, Zdenek Kabelac
>>
>> On 2025/05/27 16:14, Yu Kuai wrote:
>>> From: Yu Kuai <yukuai3@huawei.com>
>>>
>>> IO with REQ_RAHEAD or REQ_NOWAIT can fail early, even if the storage medium
>>> is fine, hence recording badblocks or removing the disk from the array does
>>> not make sense.
>>>
>>> This problem is found by the lvm2 test lvcreate-large-raid, where dm-zero
>>> fails read-ahead IO directly.
>>>
>>> Reported-and-tested-by: Mikulas Patocka <mpatocka@redhat.com>
>>> Closes: https://lore.kernel.org/all/34fa755d-62c8-4588-8ee1-33cb1249bdf2@redhat.com/
>>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
>>> ---
>>> Changes in v2:
>>>    - handle REQ_NOWAIT as well.
>>>
>>>    drivers/md/raid1-10.c | 10 ++++++++++
>>>    drivers/md/raid1.c    | 19 ++++++++++---------
>>>    drivers/md/raid10.c   | 11 ++++++-----
>>>    3 files changed, 26 insertions(+), 14 deletions(-)
>>>
>>
>> Just to let you know that while testing lvcreate-large-raid, the test
>> can sometimes fail:
>>
>> [ 0:12.021] ## DEBUG0: 08:11:43.596775 lvcreate[8576]
>> device_mapper/ioctl/libdm-iface.c:2118  device-mapper: create ioctl on
>> LVMTEST8371vg1-LV1_rmeta_0
>> LVM-iqJjW9HItbME2d4DC2S7D58zd2omecjf0yQN83foinyxHaPoZqGEnX4rRUN7i0kH
>> failed: Device or resource busy

Hi

This issue is caused by an occasional race with udev, which holds our
internal device open while we are deactivating the raid LV.

We are currently trying to resolve this issue.

Regards

Zdenek



* Re: [PATCH v2] md/raid1,raid10: don't handle IO error for REQ_RAHEAD and REQ_NOWAIT
  2025-05-27  8:14 [PATCH v2] md/raid1,raid10: don't handle IO error for REQ_RAHEAD and REQ_NOWAIT Yu Kuai
  2025-05-27  8:24 ` Yu Kuai
@ 2025-05-30  6:43 ` Yu Kuai
  1 sibling, 0 replies; 5+ messages in thread
From: Yu Kuai @ 2025-05-30  6:43 UTC (permalink / raw)
  To: Yu Kuai, mpatocka, zdenek.kabelac, song
  Cc: linux-raid, linux-kernel, yi.zhang, yangerkun, johnny.chenyi,
	yukuai (C)

On 2025/05/27 16:14, Yu Kuai wrote:
> From: Yu Kuai <yukuai3@huawei.com>
> 
> IO with REQ_RAHEAD or REQ_NOWAIT can fail early, even if the storage medium
> is fine, hence recording badblocks or removing the disk from the array does
> not make sense.
> 
> This problem is found by the lvm2 test lvcreate-large-raid, where dm-zero
> fails read-ahead IO directly.
> 
> Reported-and-tested-by: Mikulas Patocka <mpatocka@redhat.com>
> Closes: https://lore.kernel.org/all/34fa755d-62c8-4588-8ee1-33cb1249bdf2@redhat.com/
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> ---
> Changes in v2:
>   - handle REQ_NOWAIT as well.
> 
>   drivers/md/raid1-10.c | 10 ++++++++++
>   drivers/md/raid1.c    | 19 ++++++++++---------
>   drivers/md/raid10.c   | 11 ++++++-----
>   3 files changed, 26 insertions(+), 14 deletions(-)
> 

Applied to md-6.16
Thanks

> diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c
> index c7efd8aab675..b8b3a9069701 100644
> --- a/drivers/md/raid1-10.c
> +++ b/drivers/md/raid1-10.c
> @@ -293,3 +293,13 @@ static inline bool raid1_should_read_first(struct mddev *mddev,
>   
>   	return false;
>   }
> +
> +/*
> + * bio with REQ_RAHEAD or REQ_NOWAIT can fail at any time, before such IO is
> + * submitted to the underlying disks, hence don't record badblocks or retry
> + * in this case.
> + */
> +static inline bool raid1_should_handle_error(struct bio *bio)
> +{
> +	return !(bio->bi_opf & (REQ_RAHEAD | REQ_NOWAIT));
> +}
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index 657d481525be..19c5a0ce5a40 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -373,14 +373,16 @@ static void raid1_end_read_request(struct bio *bio)
>   	 */
>   	update_head_pos(r1_bio->read_disk, r1_bio);
>   
> -	if (uptodate)
> +	if (uptodate) {
>   		set_bit(R1BIO_Uptodate, &r1_bio->state);
> -	else if (test_bit(FailFast, &rdev->flags) &&
> -		 test_bit(R1BIO_FailFast, &r1_bio->state))
> +	} else if (test_bit(FailFast, &rdev->flags) &&
> +		 test_bit(R1BIO_FailFast, &r1_bio->state)) {
>   		/* This was a fail-fast read so we definitely
>   		 * want to retry */
>   		;
> -	else {
> +	} else if (!raid1_should_handle_error(bio)) {
> +		uptodate = 1;
> +	} else {
>   		/* If all other devices have failed, we want to return
>   		 * the error upwards rather than fail the last device.
>   		 * Here we redefine "uptodate" to mean "Don't want to retry"
> @@ -451,16 +453,15 @@ static void raid1_end_write_request(struct bio *bio)
>   	struct bio *to_put = NULL;
>   	int mirror = find_bio_disk(r1_bio, bio);
>   	struct md_rdev *rdev = conf->mirrors[mirror].rdev;
> -	bool discard_error;
>   	sector_t lo = r1_bio->sector;
>   	sector_t hi = r1_bio->sector + r1_bio->sectors;
> -
> -	discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;
> +	bool ignore_error = !raid1_should_handle_error(bio) ||
> +		(bio->bi_status && bio_op(bio) == REQ_OP_DISCARD);
>   
>   	/*
>   	 * 'one mirror IO has finished' event handler:
>   	 */
> -	if (bio->bi_status && !discard_error) {
> +	if (bio->bi_status && !ignore_error) {
>   		set_bit(WriteErrorSeen,	&rdev->flags);
>   		if (!test_and_set_bit(WantReplacement, &rdev->flags))
>   			set_bit(MD_RECOVERY_NEEDED, &
> @@ -511,7 +512,7 @@ static void raid1_end_write_request(struct bio *bio)
>   
>   		/* Maybe we can clear some bad blocks. */
>   		if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) &&
> -		    !discard_error) {
> +		    !ignore_error) {
>   			r1_bio->bios[mirror] = IO_MADE_GOOD;
>   			set_bit(R1BIO_MadeGood, &r1_bio->state);
>   		}
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index dce06bf65016..b74780af4c22 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -399,6 +399,8 @@ static void raid10_end_read_request(struct bio *bio)
>   		 * wait for the 'master' bio.
>   		 */
>   		set_bit(R10BIO_Uptodate, &r10_bio->state);
> +	} else if (!raid1_should_handle_error(bio)) {
> +		uptodate = 1;
>   	} else {
>   		/* If all other devices that store this block have
>   		 * failed, we want to return the error upwards rather
> @@ -456,9 +458,8 @@ static void raid10_end_write_request(struct bio *bio)
>   	int slot, repl;
>   	struct md_rdev *rdev = NULL;
>   	struct bio *to_put = NULL;
> -	bool discard_error;
> -
> -	discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;
> +	bool ignore_error = !raid1_should_handle_error(bio) ||
> +		(bio->bi_status && bio_op(bio) == REQ_OP_DISCARD);
>   
>   	dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
>   
> @@ -472,7 +473,7 @@ static void raid10_end_write_request(struct bio *bio)
>   	/*
>   	 * this branch is our 'one mirror IO has finished' event handler:
>   	 */
> -	if (bio->bi_status && !discard_error) {
> +	if (bio->bi_status && !ignore_error) {
>   		if (repl)
>   			/* Never record new bad blocks to replacement,
>   			 * just fail it.
> @@ -527,7 +528,7 @@ static void raid10_end_write_request(struct bio *bio)
>   		/* Maybe we can clear some bad blocks. */
>   		if (rdev_has_badblock(rdev, r10_bio->devs[slot].addr,
>   				      r10_bio->sectors) &&
> -		    !discard_error) {
> +		    !ignore_error) {
>   			bio_put(bio);
>   			if (repl)
>   				r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
> 



