* [PATCH md] Better handling of readerrors with raid5.
  From: NeilBrown @ 2005-09-16 3:01 UTC
  To: Andrew Morton; Cc: linux-raid

TESTERS WANTED!! SEE BELOW...

This patch changes the behaviour of raid5 when it gets a read error.
Instead of just failing the device, it tries to find out what should
have been there, and writes it over the bad block. For some media
errors, this has a reasonable chance of fixing the error. If the
write succeeds, and a subsequent read succeeds as well, raid5 decides
the address is OK and continues.

I have tested this using the 'faulty' md personality, but it would be
really good to test it with real disks that have real errors. If
anyone has such drives in a cupboard (or even in a computer) and would
be willing to give this a try, I would really appreciate it.

Meanwhile, I think it is OK to go into -mm, but certainly not to go
to Linus yet.

Thanks,
NeilBrown

### Comments for Changeset

Instead of failing a drive on read-error, we attempt to re-write the
block, and then re-read. If that all works, we allow the device to
remain in the array.

Signed-off-by: Neil Brown <neilb@suse.de>

### Diffstat output
 ./drivers/md/raid5.c         |   61 +++++++++++++++++++++++++++++++++++++++----
 ./include/linux/raid/raid5.h |    2 +
 2 files changed, 58 insertions(+), 5 deletions(-)

diff ./drivers/md/raid5.c~current~ ./drivers/md/raid5.c
--- ./drivers/md/raid5.c~current~	2005-09-16 12:21:24.000000000 +1000
+++ ./drivers/md/raid5.c	2005-09-16 12:57:12.000000000 +1000
@@ -349,7 +349,7 @@ static void shrink_stripes(raid5_conf_t
 	conf->slab_cache = NULL;
 }

-static int raid5_end_read_request (struct bio * bi, unsigned int bytes_done,
+static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
 				  int error)
 {
 	struct stripe_head *sh = bi->bi_private;
@@ -401,10 +401,27 @@ static int raid5_end_read_request(struc
 	}
 #else
 	set_bit(R5_UPTODATE, &sh->dev[i].flags);
-#endif
+#endif
+		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
+			printk("R5: read error corrected!!\n");
+			clear_bit(R5_ReadError, &sh->dev[i].flags);
+			clear_bit(R5_ReWrite, &sh->dev[i].flags);
+		}
 	} else {
-		md_error(conf->mddev, conf->disks[i].rdev);
 		clear_bit(R5_UPTODATE, &sh->dev[i].flags);
+		if (conf->mddev->degraded) {
+			printk("R5: read error not correctable.\n");
+			clear_bit(R5_ReadError, &sh->dev[i].flags);
+			clear_bit(R5_ReWrite, &sh->dev[i].flags);
+			md_error(conf->mddev, conf->disks[i].rdev);
+		} else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
+			/* Oh, no!!! */
+			printk("R5: read error NOT corrected!!\n");
+			clear_bit(R5_ReadError, &sh->dev[i].flags);
+			clear_bit(R5_ReWrite, &sh->dev[i].flags);
+			md_error(conf->mddev, conf->disks[i].rdev);
+		} else
+			set_bit(R5_ReadError, &sh->dev[i].flags);
 	}
 	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
 #if 0
@@ -966,6 +983,12 @@ static void handle_stripe(struct stripe_
 		if (dev->written) written++;
 		rdev = conf->disks[i].rdev; /* FIXME, should I be looking rdev */
 		if (!rdev || !rdev->in_sync) {
+			/* The ReadError flag will just be confusing now */
+			clear_bit(R5_ReadError, &dev->flags);
+			clear_bit(R5_ReWrite, &dev->flags);
+		}
+		if (!rdev || !rdev->in_sync
+		    || test_bit(R5_ReadError, &dev->flags)) {
 			failed++;
 			failed_num = i;
 		} else
@@ -980,6 +1003,14 @@ static void handle_stripe(struct stripe_
 	if (failed > 1 && to_read+to_write+written) {
 		for (i=disks; i--; ) {
 			int bitmap_end = 0;
+
+			if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
+				mdk_rdev_t *rdev = conf->disks[i].rdev;
+				if (rdev && rdev->in_sync)
+					/* multiple read failures in one stripe */
+					md_error(conf->mddev, rdev);
+			}
+
 			spin_lock_irq(&conf->device_lock);
 			/* fail all writes first */
 			bi = sh->dev[i].towrite;
@@ -1015,7 +1046,8 @@ static void handle_stripe(struct stripe_
 			}

 			/* fail any reads if this device is non-operational */
-			if (!test_bit(R5_Insync, &sh->dev[i].flags)) {
+			if (!test_bit(R5_Insync, &sh->dev[i].flags) ||
+			    test_bit(R5_ReadError, &sh->dev[i].flags)) {
 				bi = sh->dev[i].toread;
 				sh->dev[i].toread = NULL;
 				if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
@@ -1274,7 +1306,26 @@ static void handle_stripe(struct stripe_
 			md_done_sync(conf->mddev, STRIPE_SECTORS,1);
 			clear_bit(STRIPE_SYNCING, &sh->state);
 		}
-
+
+	/* If the failed drive is just a ReadError, then we might need to progress
+	 * the repair/check process
+	 */
+	if (failed == 1 && test_bit(R5_ReadError, &sh->dev[failed_num].flags)
+	    && !test_bit(R5_LOCKED, &sh->dev[failed_num].flags)
+	    && test_bit(R5_UPTODATE, &sh->dev[failed_num].flags)
+		) {
+		dev = &sh->dev[failed_num];
+		if (!test_bit(R5_ReWrite, &dev->flags)) {
+			set_bit(R5_Wantwrite, &dev->flags);
+			set_bit(R5_ReWrite, &dev->flags);
+			set_bit(R5_LOCKED, &dev->flags);
+		} else {
+			/* let's read it back */
+			set_bit(R5_Wantread, &dev->flags);
+			set_bit(R5_LOCKED, &dev->flags);
+		}
+	}
+
 	spin_unlock(&sh->lock);

 	while ((bi=return_bi)) {

diff ./include/linux/raid/raid5.h~current~ ./include/linux/raid/raid5.h
--- ./include/linux/raid/raid5.h~current~	2005-09-16 12:21:24.000000000 +1000
+++ ./include/linux/raid/raid5.h	2005-09-16 12:55:51.000000000 +1000
@@ -154,6 +154,8 @@ struct stripe_head {
 #define	R5_Wantwrite	5
 #define	R5_Syncio	6	/* this io need to be accounted as resync io */
 #define	R5_Overlap	7	/* There is a pending overlapping request on this block */
+#define	R5_ReadError	8	/* seen a read error here recently */
+#define	R5_ReWrite	9	/* have tried to over-write the readerror */

 /*
  * Write method
* Re: [PATCH md] Better handling of readerrors with raid5.
  From: Mike Hardy @ 2005-09-16 16:53 UTC
  To: NeilBrown; Cc: Andrew Morton, linux-raid

NeilBrown wrote:
> TESTERS WANTED!! SEE BELOW...
>
> This patch changes the behaviour of raid5 when it gets a read error.
> Instead of just failing the device, it tries to find out what should
> have been there, and writes it over the bad block. For some media
> errors, this has a reasonable chance of fixing the error. If the
> write succeeds, and a subsequent read succeeds as well, raid5 decides
> the address is OK and continues.
>
> I have tested this using the 'faulty' md personality, but it would be
> really good to test it with real disks that have real errors. If
> anyone has such drives in a cupboard (or even in a computer) and would
> be willing to give this a try, I would really appreciate it.

Oh, this makes me so happy. Thank you thank you thank you.

I regret that I just yesterday sent back a drive with spontaneously
growing read errors that were fixable yet would pop up in another
spot.

Does this apply cleanly to 2.6.13, though? Does it require any special
tool or superblock support (a particular version of mdadm, etc.)? I
could put it on one of my non-mission-critical rigs and let it run for
a while.

-Mike
* Re: [PATCH md] Better handling of readerrors with raid5.
  From: Neil Brown @ 2005-09-16 21:39 UTC
  To: Mike Hardy; Cc: linux-raid

On Friday September 16, mhardy@h3c.com wrote:
>
> Oh, this makes me so happy. Thank you thank you thank you.

Thanks....

> I regret that I just yesterday sent back a drive with spontaneously
> growing read errors that were fixable yet would pop up in another
> spot.

Seems my timing is a little off :-)

> Does this apply cleanly to 2.6.13, though? Does it require any special
> tool or superblock support (a particular version of mdadm, etc.)? I
> could put it on one of my non-mission-critical rigs and let it run for
> a while.

It should apply to 2.6.13 - it is purely a raid5 change, and most of
the md changes since 2.6.13 are to md.c only. So if 'patch' says it
applies, you can be sure it applies correctly.

It doesn't require any other changes - it's purely internal to raid5.

Thanks for trying to help.

NeilBrown
* Re: [PATCH md] Better handling of readerrors with raid5.
  From: JaniD++ @ 2005-09-18 22:06 UTC
  To: linux-raid

Hello list!

I found one bad drive today! (Maxtor 200G IDE)
I have also lost a lot of data thanks to that drive.

The problem: the drive goes crazy and "can't remember" what was
written, but doesn't report any error! The RAID5 array kept using the
drive, and silently wasted the data the system wrote to it...

I don't know if there is any good trick to detect this failure, but I
have one idea that could help some people in a similar situation.

The job is simple: reconstruct each block from the RAID5 parity
information, but don't write the result to any disk - just verify it
against the original data from the drives, and report the differences!
(The same applies to raid1.) This would help to detect these
silently-lying drives. And it would be even more useful if the kernel
could do it on a working array at low priority...

I am lucky, because I found this bad drive easily - it lost its own
MBR and 2 raid superblocks! :-) (raid0 on raid5)

Thanks,
Janos
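For what it's worth, the verify-only pass Janos asks for boils down to
checking, per stripe, that the members still agree with parity -
without writing anything back. A rough sketch under simplifying
assumptions (single-parity RAID5, one fixed chunk size, on-disk layout
details ignored):

#include <string.h>

#define NBLOCKS 4	/* data blocks plus the parity block of one stripe */
#define CHUNK   4096

/* blocks[i] holds the chunk read from member i (parity included).
 * Parity is the XOR of the data blocks, so XOR-ing every member of a
 * healthy stripe must give all zeroes - conveniently, regardless of
 * which member holds parity in this particular stripe. */
static int stripe_is_consistent(unsigned char blocks[NBLOCKS][CHUNK])
{
	unsigned char acc[CHUNK];
	int d, i;

	memset(acc, 0, sizeof(acc));
	for (d = 0; d < NBLOCKS; d++)
		for (i = 0; i < CHUNK; i++)
			acc[i] ^= blocks[d][i];
	for (i = 0; i < CHUNK; i++)
		if (acc[i])
			return 0;	/* some member disagrees with parity */
	return 1;
}

One caveat: with a single parity block such a check can detect that a
stripe is inconsistent, but it cannot by itself say which member
returned the bad data - locating the culprit needs either a second
redundancy block or a drive that admits its errors, which is exactly
what makes a silently-lying drive like this one so dangerous.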
* Re: [PATCH md] Better handling of readerrors with raid5.
  From: Molle Bestefich @ 2005-09-21 0:15 UTC
  To: NeilBrown; Cc: linux-raid

NeilBrown wrote:
> TESTERS WANTED!! SEE BELOW...
>
> This patch changes the behaviour of raid5 when it gets a read error.
> Instead of just failing the device, it tries to find out what should
> have been there, and writes it over the bad block.

Jip-hee! Beautiful!!
Neil, a big, warm and fuzzy Thank You for all the hard work you put into MD!

I do not have a test system, so I've tried to apply the patch to a
smallish (6-disk ATA) live system instead. (Am I sane?)

Some of the disks have probably developed a couple of bad blocks here
and there by now. I imagine doing a 'dd' from the MD device will read
at least 83% of all sectors (everything except one disk's worth of
parity), so there's a fair chance I'll hit something if it's there.

Applying the patch doesn't quite work for me:
==============================================
linux-2.6.13.2 # patch --dry-run -p0 < md-rewrite-bad-blocks.patch
patching file ./drivers/md/raid5.c
Hunk #1 succeeded at 339 (offset -10 lines).
Hunk #3 succeeded at 963 (offset -20 lines).
Hunk #4 FAILED at 983.
Hunk #5 succeeded at 1044 (offset -2 lines).
Hunk #6 succeeded at 1274 (offset -32 lines).
1 out of 6 hunks FAILED -- saving rejects to file ./drivers/md/raid5.c.rej
patching file ./include/linux/raid/raid5.h
Hunk #1 succeeded at 153 (offset -1 lines).
==============================================

Hunk #4 succeeds if given a fuzz of 3...
Is it safe to use the patch with 2.6.13.2?
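As an aside, GNU patch accepts the fuzz factor on the command line, so
(assuming the same file name as above) the remaining hunk can be
applied in one go with:

  patch -p0 -F3 < md-rewrite-bad-blocks.patch

Fuzz only relaxes how much of the surrounding context has to match, so
it is still worth eyeballing where hunk #4 landed, but it avoids
hand-editing the reject file.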
* Re: [PATCH md] Better handling of readerrors with raid5.
  From: Neil Brown @ 2005-09-21 9:14 UTC
  To: molle.bestefich; Cc: linux-raid

[-- Attachment #1: message body text --]
[-- Type: text/plain, Size: 2387 bytes --]

On Wednesday September 21, molle.bestefich@gmail.com wrote:
> Jip-hee! Beautiful!!
> Neil, a big, warm and fuzzy Thank You for all the hard work you put into MD!

:-)

> I do not have a test system, so I've tried to apply the patch to a
> smallish (6-disk ATA) live system instead. (Am I sane?)

Think "courageous". However I think there is very little risk. My
desire for testing is not that I think it might corrupt data or
anything like that, but simply that real failures have fairly
complicated characteristics - in terms of delays, many errors
happening at once, and such - and I would really like to see a trace
of what happens in several real-world situations. Maybe some
refinements can make it handle failure conditions better.

> Some of the disks have probably developed a couple of bad blocks here
> and there by now. I imagine doing a 'dd' from the MD device will read
> at least 83% of all sectors, so there's a fair chance I'll hit
> something if it's there.

I'm nearly ready to give away a patch which does a background parity
check - it reads all blocks and checks parity, and reports the error
count etc. Running such a thing every night (or every week) is
probably a good idea in some situations, but it really needs the
fix-read-errors patch.

> Hunk #4 succeeds if given a fuzz of 3...
> Is it safe to use the patch with 2.6.13.2?

Yes, the changes are quite independent. I've attached a version of
the patch that applies cleanly against 2.6.13.2.

NeilBrown

[-- Attachment #2: patch --]
[-- Type: text/plain, Size: 4789 bytes --]

Status: ok

Better handling of readerrors with raid5.

Instead of failing a drive on read-error, we attempt to re-write the
block, and then re-read. If that all works, we allow the device to
remain in the array.

Signed-off-by: Neil Brown <neilb@suse.de>

### Diffstat output
 ./drivers/md/raid5.c         |   61 +++++++++++++++++++++++++++++++++++++++----
 ./include/linux/raid/raid5.h |    2 +
 2 files changed, 58 insertions(+), 5 deletions(-)

diff ./drivers/md/raid5.c~current~ ./drivers/md/raid5.c
--- ./drivers/md/raid5.c~current~	2005-09-21 11:06:49.000000000 +0200
+++ ./drivers/md/raid5.c	2005-09-21 11:07:19.000000000 +0200
@@ -339,7 +339,7 @@ static void shrink_stripes(raid5_conf_t
 	conf->slab_cache = NULL;
 }

-static int raid5_end_read_request (struct bio * bi, unsigned int bytes_done,
+static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
 				  int error)
 {
 	struct stripe_head *sh = bi->bi_private;
@@ -391,10 +391,27 @@ static int raid5_end_read_request(struc
 	}
 #else
 	set_bit(R5_UPTODATE, &sh->dev[i].flags);
-#endif
+#endif
+		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
+			printk("R5: read error corrected!!\n");
+			clear_bit(R5_ReadError, &sh->dev[i].flags);
+			clear_bit(R5_ReWrite, &sh->dev[i].flags);
+		}
 	} else {
-		md_error(conf->mddev, conf->disks[i].rdev);
 		clear_bit(R5_UPTODATE, &sh->dev[i].flags);
+		if (conf->mddev->degraded) {
+			printk("R5: read error not correctable.\n");
+			clear_bit(R5_ReadError, &sh->dev[i].flags);
+			clear_bit(R5_ReWrite, &sh->dev[i].flags);
+			md_error(conf->mddev, conf->disks[i].rdev);
+		} else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
+			/* Oh, no!!! */
+			printk("R5: read error NOT corrected!!\n");
+			clear_bit(R5_ReadError, &sh->dev[i].flags);
+			clear_bit(R5_ReWrite, &sh->dev[i].flags);
+			md_error(conf->mddev, conf->disks[i].rdev);
+		} else
+			set_bit(R5_ReadError, &sh->dev[i].flags);
 	}
 	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
 #if 0
@@ -946,6 +963,12 @@ static void handle_stripe(struct stripe_
 		if (dev->written) written++;
 		rdev = conf->disks[i].rdev; /* FIXME, should I be looking rdev */
 		if (!rdev || !rdev->in_sync) {
+			/* The ReadError flag will just be confusing now */
+			clear_bit(R5_ReadError, &dev->flags);
+			clear_bit(R5_ReWrite, &dev->flags);
+		}
+		if (!rdev || !rdev->in_sync
+		    || test_bit(R5_ReadError, &dev->flags)) {
 			failed++;
 			failed_num = i;
 		} else
@@ -960,6 +983,14 @@ static void handle_stripe(struct stripe_
 	if (failed > 1 && to_read+to_write+written) {
 		spin_lock_irq(&conf->device_lock);
 		for (i=disks; i--; ) {
+
+			if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
+				mdk_rdev_t *rdev = conf->disks[i].rdev;
+				if (rdev && rdev->in_sync)
+					/* multiple read failures in one stripe */
+					md_error(conf->mddev, rdev);
+			}
+
 			/* fail all writes first */
 			bi = sh->dev[i].towrite;
 			sh->dev[i].towrite = NULL;
@@ -993,7 +1024,8 @@ static void handle_stripe(struct stripe_
 			}

 			/* fail any reads if this device is non-operational */
-			if (!test_bit(R5_Insync, &sh->dev[i].flags)) {
+			if (!test_bit(R5_Insync, &sh->dev[i].flags) ||
+			    test_bit(R5_ReadError, &sh->dev[i].flags)) {
 				bi = sh->dev[i].toread;
 				sh->dev[i].toread = NULL;
 				if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
@@ -1240,7 +1272,26 @@ static void handle_stripe(struct stripe_
 			md_done_sync(conf->mddev, STRIPE_SECTORS,1);
 			clear_bit(STRIPE_SYNCING, &sh->state);
 		}
-
+
+	/* If the failed drive is just a ReadError, then we might need to progress
+	 * the repair/check process
+	 */
+	if (failed == 1 && test_bit(R5_ReadError, &sh->dev[failed_num].flags)
+	    && !test_bit(R5_LOCKED, &sh->dev[failed_num].flags)
+	    && test_bit(R5_UPTODATE, &sh->dev[failed_num].flags)
+		) {
+		dev = &sh->dev[failed_num];
+		if (!test_bit(R5_ReWrite, &dev->flags)) {
+			set_bit(R5_Wantwrite, &dev->flags);
+			set_bit(R5_ReWrite, &dev->flags);
+			set_bit(R5_LOCKED, &dev->flags);
+		} else {
+			/* let's read it back */
+			set_bit(R5_Wantread, &dev->flags);
+			set_bit(R5_LOCKED, &dev->flags);
+		}
+	}
+
 	spin_unlock(&sh->lock);

 	while ((bi=return_bi)) {

diff ./include/linux/raid/raid5.h~current~ ./include/linux/raid/raid5.h
--- ./include/linux/raid/raid5.h~current~	2005-09-21 11:06:49.000000000 +0200
+++ ./include/linux/raid/raid5.h	2005-09-21 11:06:52.000000000 +0200
@@ -153,6 +153,8 @@ struct stripe_head {
 #define	R5_Wantwrite	5
 #define	R5_Syncio	6	/* this io need to be accounted as resync io */
 #define	R5_Overlap	7	/* There is a pending overlapping request on this block */
+#define	R5_ReadError	8	/* seen a read error here recently */
+#define	R5_ReWrite	9	/* have tried to over-write the readerror */

 /*
  * Write method
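The background check Neil describes can be pictured as a whole-array
version of the stripe test sketched earlier. Below is a
userspace-flavoured sketch under heavy assumptions (fixed member list,
no md superblock or data-offset handling, no awareness of the
rotating-parity layout - it leans only on the fact that the XOR of all
members of a healthy stripe, parity included, is zero):

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

#define NDISKS 4
#define CHUNK  65536

int main(void)
{
	/* illustrative device names - substitute the real members */
	const char *paths[NDISKS] = {
		"/dev/hda1", "/dev/hdb1", "/dev/hdc1", "/dev/hdd1"
	};
	static unsigned char buf[NDISKS][CHUNK];
	unsigned char acc[CHUNK];
	int fd[NDISKS], d, i;
	long stripe, errors = 0;

	for (d = 0; d < NDISKS; d++)
		if ((fd[d] = open(paths[d], O_RDONLY)) < 0)
			return 1;

	for (stripe = 0; ; stripe++) {
		memset(acc, 0, sizeof(acc));
		for (d = 0; d < NDISKS; d++) {
			if (read(fd[d], buf[d], CHUNK) != CHUNK)
				goto done;	/* end of device, or a read error */
			for (i = 0; i < CHUNK; i++)
				acc[i] ^= buf[d][i];
		}
		for (i = 0; i < CHUNK; i++)
			if (acc[i]) {	/* stripe disagrees with its parity */
				errors++;
				break;
			}
	}
done:
	printf("stripes checked: %ld, parity mismatches: %ld\n",
	       stripe, errors);
	return 0;
}

Run nightly, a count that stays at zero is cheap reassurance; a
nonzero count is the early warning that, combined with the
fix-read-errors patch, lets md repair blocks while there is still
redundancy left to repair them from.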
* Re: [PATCH md] Better handling of readerrors with raid5.
  From: Al Boldi @ 2005-09-21 15:07 UTC
  To: Neil Brown; Cc: linux-raid

Neil Brown wrote:
> I'm nearly ready to give away a patch which does a background parity
> check - it reads all blocks and checks parity, and reports the error
> count etc. Running such a thing every night (or every week) is
> probably a good idea in some situations, but it really needs the
> fix-read-errors patch.

I love it!

Thanks!

--
Al
* Re: [PATCH md] Better handling of readerrors with raid5.
  From: Patrik Jonsson @ 2005-10-23 3:57 UTC
  To: Neil Brown; Cc: linux-raid

Hi,

After having the "dreaded double disk failure", I've decided to
upgrade my kernel and include this patch. I also want to include the
"parity check" option. However, I'm having a hard time figuring out
which patches need to be applied. Is there a collection of all the md
patches somewhere? It's also hard to know which patches have already
been included in the kernel when you upgrade to a newer version.

Any tips for handling this would be much appreciated!

/Patrik
* Re: [PATCH md] Better handling of readerrors with raid5.
  From: Neil Brown @ 2005-10-23 22:52 UTC
  To: Patrik Jonsson; Cc: linux-raid

On Saturday October 22, patrik@ucolick.org wrote:
> After having the "dreaded double disk failure", I've decided to
> upgrade my kernel and include this patch. I also want to include the
> "parity check" option. However, I'm having a hard time figuring out
> which patches need to be applied. Is there a collection of all the md
> patches somewhere?

The easiest thing to do would be to grab the latest -mm kernel. All
the patches are in there.

NeilBrown
* Re: [PATCH md] Better handling of readerrors with raid5.
  From: Mattias Wadenstein @ 2005-10-11 14:31 UTC
  To: NeilBrown; Cc: Andrew Morton, linux-raid

On Fri, 16 Sep 2005, NeilBrown wrote:
> I have tested this using the 'faulty' md personality, but it would be
> really good to test it with real disks that have real errors. If
> anyone has such drives in a cupboard (or even in a computer) and would
> be willing to give this a try, I would really appreciate it.

I have been trying for the last couple of weeks on a batch of
drives[1] that are known to pop up such errors now and then, but so
far I've only managed to find two drives with real, permanent failure
modes. I don't know if that's just because I haven't been looking
hard enough, or because the disks have started to behave.

On the other hand, it does seem stable - I still have all my [test]
data. It managed to properly fail the broken disk and restripe onto a
hot spare, but I have no good observations on fixable media errors.

/Mattias Wadenstein

[1]: 3 4-drive raid5s with a hot spare each
* Re: [PATCH md] Better handling of readerrors with raid5.
  From: Stephan van Hienen @ 2005-12-22 22:23 UTC
  To: NeilBrown; Cc: Andrew Morton, linux-raid

On Fri, 16 Sep 2005, NeilBrown wrote:
> This patch changes the behaviour of raid5 when it gets a read error.
> Instead of just failing the device, it tries to find out what should
> have been there, and writes it over the bad block. For some media
> errors, this has a reasonable chance of fixing the error. If the
> write succeeds, and a subsequent read succeeds as well, raid5 decides
> the address is OK and continues.

Neil, what is the current status of this patch?

Last night one of my disks decided to fail (3ware IDE timeout), and
during the rebuild another of my disks decided it had a read error,
so the raid was 'down'.

I recovered my raid by using dd_rescue to copy the last failed disk
to a spare disk (with 4 read errors), and then doing an
'mdadm -A --force' (so I should have a corrupted file somewhere?).

It looks like this patch would help in situations like this.
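For anyone in the same spot: the recovery Stephan describes
corresponds roughly to the commands below (device names are
placeholders; dd_rescue keeps copying past read errors, leaving holes
where the four unreadable sectors were, which is where the suspected
corrupted file comes from):

  dd_rescue /dev/failed-disk /dev/spare-disk
  mdadm --assemble --force /dev/md0 /dev/spare-disk /dev/other-member ...

--force tells mdadm to assemble even though the event counts in the
superblocks disagree, which is exactly the state a two-drive failure
leaves behind.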