[RFC PATCH] raid5: Correct some failed-stripe which because the badsector.

linux-raid.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [RFC PATCH] raid5: Correct some failed-stripe which because the badsector.
@ 2012-09-22  3:14 Jianpeng Ma
  2012-09-26  1:20 ` NeilBrown
  0 siblings, 1 reply; 4+ messages in thread
From: Jianpeng Ma @ 2012-09-22  3:14 UTC (permalink / raw)
  To: Neil Brown; +Cc: linux-raid, 周, 麒, 边, 雨

Because raid5 had implemented badsector feature,some failed-stripe which because the
badsector can try to correct instead of only failing.

For example:
1:Create a raid5 by four disks missing/sdb/sdc/sdd.
2:If readed  data from sdb and met error,so the stripe is failed
3:If later write-stripe contains sdb which overwrite.The stripe has a
chance to correct.
4:If step3 successed, add a spare disk to raid,the raid can be active
from degraded.

There are two methods to compute raid5 parity,rcw and rmw.
In this situation,it only used rcw.
The purpose is only change state of stripe from failed to degraded or
active.

NOTE:
The patch looks ugly. It is post out mainly to make sure it is 
going in the correct direction and hope to get some helpful comments from other guys.

Tested-by: Qi Zhou <qi.g.zhou@gmail.com>
Signed-off-by: Yu Bian <ycbzzjlbyby@gmail.com>
Signed-off-by: Jianpeng Ma <majianpeng@gmail.com>
---
 drivers/md/raid5.c |  152 +++++++++++++++++++++++++++++++++++++++++++---------
 drivers/md/raid5.h |    8 +++
 2 files changed, 135 insertions(+), 25 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index adda94d..d085fa4 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -220,6 +220,7 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
 			    < IO_THRESHOLD)
 				md_wakeup_thread(conf->mddev->thread);
 		atomic_dec(&conf->active_stripes);
+		clear_bit(STRIPE_FAILED_REREAD, &sh->state);
 		if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
 			list_add_tail(&sh->lru, &conf->inactive_list);
 			wake_up(&conf->wait_for_stripe);
@@ -548,9 +549,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 			else
 				rw = WRITE;
 		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
-			rw = READ;
+				rw = READ;
 		else if (test_and_clear_bit(R5_WantReplace,
-					    &sh->dev[i].flags)) {
+				&sh->dev[i].flags)) {
 			rw = WRITE;
 			replace_only = 1;
 		} else
@@ -1737,6 +1738,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
 		s = sh->sector + rdev->new_data_offset;
 	else
 		s = sh->sector + rdev->data_offset;
+
 	if (uptodate) {
 		set_bit(R5_UPTODATE, &sh->dev[i].flags);
 		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
@@ -1754,6 +1756,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
 			atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
 			clear_bit(R5_ReadError, &sh->dev[i].flags);
 			clear_bit(R5_ReWrite, &sh->dev[i].flags);
+			clear_bit(R5_FailedReread, &sh->dev[i].flags);
 		} else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
 			clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
 
@@ -1809,6 +1812,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
 		else {
 			clear_bit(R5_ReadError, &sh->dev[i].flags);
 			clear_bit(R5_ReWrite, &sh->dev[i].flags);
+			clear_bit(R5_FailedReread, &sh->dev[i].flags);
 			if (!(set_bad
 			      && test_bit(In_sync, &rdev->flags)
 			      && rdev_set_badblocks(
@@ -1852,8 +1856,8 @@ static void raid5_end_write_request(struct bio *bi, int error)
 		}
 	}
 	pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
-		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
-		uptodate);
+		(unsigned long long)sh->sector, i,
+		atomic_read(&sh->count), uptodate);
 	if (i == disks) {
 		BUG();
 		return;
@@ -1870,6 +1874,7 @@ static void raid5_end_write_request(struct bio *bi, int error)
 		if (!uptodate) {
 			set_bit(WriteErrorSeen, &rdev->flags);
 			set_bit(R5_WriteError, &sh->dev[i].flags);
+			clear_bit(R5_FailedOverwrite, &sh->dev[i].flags);
 			if (!test_and_set_bit(WantReplacement, &rdev->flags))
 				set_bit(MD_RECOVERY_NEEDED,
 					&rdev->mddev->recovery);
@@ -2479,6 +2484,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 			bitmap_end = 1;
 		}
 
+		if (test_and_clear_bit(R5_FailedOverwrite, &sh->dev[i].flags))
+			s->failed_overwrite--;;
+		clear_bit(R5_BadParity,	&sh->dev[i].flags);
 		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
 			wake_up(&conf->wait_for_overlap);
 
@@ -2776,6 +2784,10 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 		 * look like rcw is cheaper
 		 */
 		rcw = 1; rmw = 2;
+	} else if (s->failed > conf->max_degraded) {
+		/*For write failed-stripe, only use rcw */
+		rcw = 1;
+		rmw = 2;
 	} else for (i = disks; i--; ) {
 		/* would I have to read this buffer for read_modify_write */
 		struct r5dev *dev = &sh->dev[i];
@@ -2826,6 +2838,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 	if (rcw <= rmw && rcw > 0) {
 		/* want reconstruct write, but need to get some data */
 		rcw = 0;
+
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
 			if (!test_bit(R5_OVERWRITE, &dev->flags) &&
@@ -2834,8 +2847,13 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 			    !(test_bit(R5_UPTODATE, &dev->flags) ||
 			      test_bit(R5_Wantcompute, &dev->flags))) {
 				rcw++;
-				if (!test_bit(R5_Insync, &dev->flags))
+				if (!test_bit(R5_Insync, &dev->flags)) {
+					/* Only for all data-disks !R5_Insync */
+					if (s->failed > conf->max_degraded &&
+						!test_bit(STRIPE_FAILED_WRITE, &sh->state))
+						set_bit(STRIPE_DELAYED, &sh->state);
 					continue; /* it's a failed drive */
+				}
 				if (
 				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
 					pr_debug("Read_old block "
@@ -3268,6 +3286,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 		if (rdev) {
 			is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
 					     &first_bad, &bad_sectors);
+
 			if (s->blocked_rdev == NULL
 			    && (test_bit(Blocked, &rdev->flags)
 				|| is_bad < 0)) {
@@ -3279,9 +3298,14 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 			}
 		}
 		clear_bit(R5_Insync, &dev->flags);
-		if (!rdev)
-			/* Not in-sync */;
-		else if (is_bad) {
+		if (!rdev) {
+			s->noworked++;
+			if ((dev->towrite && test_bit(R5_OVERWRITE, &dev->flags)) ||
+				test_bit(R5_FailedOverwrite, &dev->flags)) {
+				s->failed_overwrite++;
+				set_bit(R5_FailedOverwrite, &dev->flags);
+			}
+		} else if (is_bad) {
 			/* also not in-sync */
 			if (!test_bit(WriteErrorSeen, &rdev->flags) &&
 			    test_bit(R5_UPTODATE, &dev->flags)) {
@@ -3291,6 +3315,14 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 				set_bit(R5_Insync, &dev->flags);
 				set_bit(R5_ReadError, &dev->flags);
 			}
+
+			if (!test_bit(WriteErrorSeen, &rdev->flags) &&
+				((dev->towrite && test_bit(R5_OVERWRITE, &dev->flags)) ||
+				test_bit(R5_FailedOverwrite, &dev->flags))) {
+				s->failed_overwrite++;
+				set_bit(R5_FailedOverwrite, &dev->flags);
+			} else if (i == sh->pd_idx || i == sh->qd_idx)
+				set_bit(R5_BadParity, &dev->flags);
 		} else if (test_bit(In_sync, &rdev->flags))
 			set_bit(R5_Insync, &dev->flags);
 		else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
@@ -3342,14 +3374,35 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 			clear_bit(R5_ReadError, &dev->flags);
 			clear_bit(R5_ReWrite, &dev->flags);
 		}
-		if (test_bit(R5_ReadError, &dev->flags))
-			clear_bit(R5_Insync, &dev->flags);
+		if (test_bit(R5_ReadError, &dev->flags)) {
+			/*
+			 * For failed-stripe which contained badsectors,after writing
+			 * it should reread to check.
+			 */
+			struct md_rdev *rdev2 = rcu_dereference(
+				conf->disks[i].rdev);
+			if (!is_bad && rdev2 && !test_bit(Faulty, &rdev2->flags) &&
+				test_bit(R5_UPTODATE, &dev->flags) &&
+				(test_bit(R5_FailedOverwrite, &dev->flags) ||
+				 test_bit(R5_BadParity, &dev->flags) ||
+				 test_bit(R5_FailedReread, &dev->flags))) {
+				set_bit(STRIPE_FAILED_REREAD, &sh->state);
+				set_bit(R5_FailedReread, &dev->flags);
+				clear_bit(R5_FailedOverwrite, &dev->flags);
+				clear_bit(R5_BadParity, &dev->flags);
+			} else
+				clear_bit(R5_Insync, &dev->flags);
+		}
 		if (!test_bit(R5_Insync, &dev->flags)) {
 			if (s->failed < 2)
 				s->failed_num[s->failed] = i;
 			s->failed++;
 			if (rdev && !test_bit(Faulty, &rdev->flags))
 				do_recovery = 1;
+			if (i == sh->pd_idx)
+				s->p_failed = 1;
+			else if (i == sh->qd_idx)
+				s->q_failed = 1;
 		}
 	}
 	if (test_bit(STRIPE_SYNCING, &sh->state)) {
@@ -3424,19 +3477,43 @@ static void handle_stripe(struct stripe_head *sh)
 	}
 
 	pr_debug("locked=%d uptodate=%d to_read=%d"
-	       " to_write=%d failed=%d failed_num=%d,%d\n",
+	       " to_write=%d failed=%d failed_num=%d,%d"
+	       " failed_overwrite=%d p_failed=%d q_failed=%d\n",
 	       s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
-	       s.failed_num[0], s.failed_num[1]);
+	       s.failed_num[0], s.failed_num[1], s.failed_overwrite,
+	       s.p_failed, s.q_failed);
 	/* check if the array has lost more than max_degraded devices and,
 	 * if so, some requests might need to be failed.
 	 */
 	if (s.failed > conf->max_degraded) {
-		sh->check_state = 0;
-		sh->reconstruct_state = 0;
-		if (s.to_read+s.to_write+s.written)
-			handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
-		if (s.syncing + s.replacing)
-			handle_failed_sync(conf, sh, &s);
+		/*
+		 *Disks which removed or faulty must less or equal max_degraded
+		 */
+		if (s.noworked <= conf->max_degraded &&
+			((conf->level < 6 &&
+			(s.failed - s.failed_overwrite <= s.p_failed)) ||
+			(conf->level == 6 &&
+			(s.failed - s.failed_overwrite <= s.p_failed + s.q_failed)))) {
+			set_bit(STRIPE_FAILED_WRITE, &sh->state);
+			pr_debug("stripe(%llu) failed, try to recorrect\n",
+				(unsigned long long)sh->sector);
+		} else if ((s.failed_overwrite > 0 &&
+			!test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)))
+			pr_debug("stripe(%lld) delay for failed-write\n",
+				(unsigned long long)sh->sector);
+		else {
+			sh->check_state = 0;
+			sh->reconstruct_state = 0;
+			clear_bit(STRIPE_FAILED_WRITE, &sh->state);
+			clear_bit(STRIPE_FAILED_REREAD, &sh->state);
+			if (s.to_read+s.to_write+s.written) {
+				handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
+				pr_debug("handle_failed_stripe:%lld\n",
+					(unsigned long long)sh->sector);
+			}
+			if (s.syncing + s.replacing)
+				handle_failed_sync(conf, sh, &s);
+		}
 	}
 
 	/*
@@ -3487,6 +3564,8 @@ static void handle_stripe(struct stripe_head *sh)
 		BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
 		BUG_ON(sh->qd_idx >= 0 &&
 		       !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags));
+
+		clear_bit(STRIPE_FAILED_WRITE, &sh->state);
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
 			if (test_bit(R5_LOCKED, &dev->flags) &&
@@ -3504,6 +3583,7 @@ static void handle_stripe(struct stripe_head *sh)
 		}
 		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 			s.dec_preread_active = 1;
+
 	}
 
 	/* Now to consider new write requests and what else, if anything
@@ -3548,10 +3628,30 @@ static void handle_stripe(struct stripe_head *sh)
 		clear_bit(STRIPE_SYNCING, &sh->state);
 	}
 
-	/* If the failed drives are just a ReadError, then we might need
-	 * to progress the repair/check process
+	/*
+	 * If stripe failed because badsector and
+	 * can write to correct the badsector
+	 * we used readerr routine which rewrite and reread
 	 */
-	if (s.failed <= conf->max_degraded && !conf->mddev->ro)
+	if (test_bit(STRIPE_FAILED_REREAD, &sh->state)) {
+		for (i = disks; i--; ) {
+			struct r5dev *dev = &sh->dev[i];
+			if (test_bit(R5_FailedReread, &dev->flags)
+			    && !test_bit(R5_LOCKED, &dev->flags)
+			    && test_bit(R5_UPTODATE, &dev->flags)) {
+				/* it just wrote and sucessed,so not
+				 * necessary to rewrite,only set flag
+				 */
+				set_bit(R5_ReWrite, &dev->flags);
+				set_bit(R5_Wantread, &dev->flags);
+				set_bit(R5_LOCKED, &dev->flags);
+				s.locked++;
+			}
+		}
+	} else if (s.failed <= conf->max_degraded && !conf->mddev->ro)
+		/* If the failed drives are just a ReadError, then we might need
+		 * to progress the repair/check process
+		 */
 		for (i = 0; i < s.failed; i++) {
 			struct r5dev *dev = &sh->dev[s.failed_num[i]];
 			if (test_bit(R5_ReadError, &dev->flags)
@@ -3571,7 +3671,10 @@ static void handle_stripe(struct stripe_head *sh)
 				}
 			}
 		}
+
 
+
+
 
 	/* Finish reconstruct operations initiated by the expansion process */
 	if (sh->reconstruct_state == reconstruct_state_result) {
@@ -3648,7 +3751,7 @@ finish:
 			if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
 				rdev = conf->disks[i].rdev;
 				rdev_clear_badblocks(rdev, sh->sector,
-						     STRIPE_SECTORS, 0);
+				     STRIPE_SECTORS, 0);
 				rdev_dec_pending(rdev, conf->mddev);
 			}
 			if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
@@ -3853,7 +3956,6 @@ static void raid5_align_endio(struct bio *bi, int error)
 
 
 	pr_debug("raid5_align_endio : io error...handing IO for a retry\n");
-
 	add_bio_to_retry(raid_bi, conf);
 }
 
@@ -4088,7 +4190,6 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 	     mddev->reshape_position == MaxSector &&
 	     chunk_aligned_read(mddev,bi))
 		return;
-
 	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
 	last_sector = bi->bi_sector + (bi->bi_size>>9);
 	bi->bi_next = NULL;
@@ -4586,8 +4687,9 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 		handled++;
 	}
 	remaining = raid5_dec_bi_active_stripes(raid_bio);
-	if (remaining == 0)
+	if (remaining == 0)
 		bio_endio(raid_bio, 0);
+
 	if (atomic_dec_and_test(&conf->active_aligned_reads))
 		wake_up(&conf->wait_for_stripe);
 	return handled;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index a9fc249..04709d7 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -251,6 +251,9 @@ struct stripe_head_state {
 	 */
 	int syncing, expanding, expanded, replacing;
 	int locked, uptodate, to_read, to_write, failed, written;
+	/* Overwrite disks which faulty or removed or contained badsector */
+	int failed_overwrite;
+	int noworked;	/*disk faulty or removed*/
 	int to_fill, compute, req_compute, non_overwrite;
 	int failed_num[2];
 	int p_failed, q_failed;
@@ -298,6 +301,9 @@ enum r5dev_flags {
 	R5_WantReplace, /* We need to update the replacement, we have read
 			 * data in, and now is a good time to write it out.
 			 */
+	R5_FailedOverwrite, /*overwrite on disk which removed or faulty or had badsectors*/
+	R5_FailedReread,
+	R5_BadParity,
 };
 
 /*
@@ -322,6 +328,8 @@ enum {
 	STRIPE_COMPUTE_RUN,
 	STRIPE_OPS_REQ_PENDING,
 	STRIPE_ON_UNPLUG_LIST,
+	STRIPE_FAILED_WRITE,
+	STRIPE_FAILED_REREAD,
 };
 
 /*
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [RFC PATCH] raid5: Correct some failed-stripe which because the badsector.
  2012-09-22  3:14 [RFC PATCH] raid5: Correct some failed-stripe which because the badsector Jianpeng Ma
@ 2012-09-26  1:20 ` NeilBrown
  2012-09-26  5:25   ` Jianpeng Ma
  0 siblings, 1 reply; 4+ messages in thread
From: NeilBrown @ 2012-09-26  1:20 UTC (permalink / raw)
  To: Jianpeng Ma; +Cc: linux-raid, 周, 麒, 边, 雨

[-- Attachment #1: Type: text/plain, Size: 16643 bytes --]

On Sat, 22 Sep 2012 11:14:47 +0800 "Jianpeng Ma" <majianpeng@gmail.com> wrote:

> Because raid5 had implemented badsector feature,some failed-stripe which because the
> badsector can try to correct instead of only failing.
> 
> For example:
> 1:Create a raid5 by four disks missing/sdb/sdc/sdd.
> 2:If readed  data from sdb and met error,so the stripe is failed
> 3:If later write-stripe contains sdb which overwrite.The stripe has a
> chance to correct.

Your patch looks much more complicated that should be necessary, and you
don't provide any explanation for how it works, so it is very hard for me to
review.

I would think you just need to modify analyse_stripe slightly so that it
detects the "overwrite-a-bad-block" case and marks that device as
'in_sync' (if there hasn't been a write error on the device).

Then make sure that handle_stripe_dirtying detects that case and doesn't try
to read, so forces 'rcw'

You patch does seem to do some of those things, but it does a lot more as
well that doesn't seem to be justified.
It also makes pointless changes to formatting which makes the patch harder to
read.

Please
 - keep the patch as small as possible.
 - explain what is happening in as much detail as possible.

NeilBrown



> 4:If step3 successed, add a spare disk to raid,the raid can be active
> from degraded.
> 
> There are two methods to compute raid5 parity,rcw and rmw.
> In this situation,it only used rcw.
> The purpose is only change state of stripe from failed to degraded or
> active.
> 
> NOTE:
> The patch looks ugly. It is post out mainly to make sure it is 
> going in the correct direction and hope to get some helpful comments from other guys.
> 
> Tested-by: Qi Zhou <qi.g.zhou@gmail.com>
> Signed-off-by: Yu Bian <ycbzzjlbyby@gmail.com>
> Signed-off-by: Jianpeng Ma <majianpeng@gmail.com>
> ---
>  drivers/md/raid5.c |  152 +++++++++++++++++++++++++++++++++++++++++++---------
>  drivers/md/raid5.h |    8 +++
>  2 files changed, 135 insertions(+), 25 deletions(-)
> 
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index adda94d..d085fa4 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -220,6 +220,7 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
>  			    < IO_THRESHOLD)
>  				md_wakeup_thread(conf->mddev->thread);
>  		atomic_dec(&conf->active_stripes);
> +		clear_bit(STRIPE_FAILED_REREAD, &sh->state);
>  		if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
>  			list_add_tail(&sh->lru, &conf->inactive_list);
>  			wake_up(&conf->wait_for_stripe);
> @@ -548,9 +549,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
>  			else
>  				rw = WRITE;
>  		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
> -			rw = READ;
> +				rw = READ;
>  		else if (test_and_clear_bit(R5_WantReplace,
> -					    &sh->dev[i].flags)) {
> +				&sh->dev[i].flags)) {
>  			rw = WRITE;
>  			replace_only = 1;
>  		} else
> @@ -1737,6 +1738,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
>  		s = sh->sector + rdev->new_data_offset;
>  	else
>  		s = sh->sector + rdev->data_offset;
> +
>  	if (uptodate) {
>  		set_bit(R5_UPTODATE, &sh->dev[i].flags);
>  		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
> @@ -1754,6 +1756,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
>  			atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
>  			clear_bit(R5_ReadError, &sh->dev[i].flags);
>  			clear_bit(R5_ReWrite, &sh->dev[i].flags);
> +			clear_bit(R5_FailedReread, &sh->dev[i].flags);
>  		} else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
>  			clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
>  
> @@ -1809,6 +1812,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
>  		else {
>  			clear_bit(R5_ReadError, &sh->dev[i].flags);
>  			clear_bit(R5_ReWrite, &sh->dev[i].flags);
> +			clear_bit(R5_FailedReread, &sh->dev[i].flags);
>  			if (!(set_bad
>  			      && test_bit(In_sync, &rdev->flags)
>  			      && rdev_set_badblocks(
> @@ -1852,8 +1856,8 @@ static void raid5_end_write_request(struct bio *bi, int error)
>  		}
>  	}
>  	pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
> -		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
> -		uptodate);
> +		(unsigned long long)sh->sector, i,
> +		atomic_read(&sh->count), uptodate);
>  	if (i == disks) {
>  		BUG();
>  		return;
> @@ -1870,6 +1874,7 @@ static void raid5_end_write_request(struct bio *bi, int error)
>  		if (!uptodate) {
>  			set_bit(WriteErrorSeen, &rdev->flags);
>  			set_bit(R5_WriteError, &sh->dev[i].flags);
> +			clear_bit(R5_FailedOverwrite, &sh->dev[i].flags);
>  			if (!test_and_set_bit(WantReplacement, &rdev->flags))
>  				set_bit(MD_RECOVERY_NEEDED,
>  					&rdev->mddev->recovery);
> @@ -2479,6 +2484,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
>  			bitmap_end = 1;
>  		}
>  
> +		if (test_and_clear_bit(R5_FailedOverwrite, &sh->dev[i].flags))
> +			s->failed_overwrite--;;
> +		clear_bit(R5_BadParity,	&sh->dev[i].flags);
>  		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
>  			wake_up(&conf->wait_for_overlap);
>  
> @@ -2776,6 +2784,10 @@ static void handle_stripe_dirtying(struct r5conf *conf,
>  		 * look like rcw is cheaper
>  		 */
>  		rcw = 1; rmw = 2;
> +	} else if (s->failed > conf->max_degraded) {
> +		/*For write failed-stripe, only use rcw */
> +		rcw = 1;
> +		rmw = 2;
>  	} else for (i = disks; i--; ) {
>  		/* would I have to read this buffer for read_modify_write */
>  		struct r5dev *dev = &sh->dev[i];
> @@ -2826,6 +2838,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
>  	if (rcw <= rmw && rcw > 0) {
>  		/* want reconstruct write, but need to get some data */
>  		rcw = 0;
> +
>  		for (i = disks; i--; ) {
>  			struct r5dev *dev = &sh->dev[i];
>  			if (!test_bit(R5_OVERWRITE, &dev->flags) &&
> @@ -2834,8 +2847,13 @@ static void handle_stripe_dirtying(struct r5conf *conf,
>  			    !(test_bit(R5_UPTODATE, &dev->flags) ||
>  			      test_bit(R5_Wantcompute, &dev->flags))) {
>  				rcw++;
> -				if (!test_bit(R5_Insync, &dev->flags))
> +				if (!test_bit(R5_Insync, &dev->flags)) {
> +					/* Only for all data-disks !R5_Insync */
> +					if (s->failed > conf->max_degraded &&
> +						!test_bit(STRIPE_FAILED_WRITE, &sh->state))
> +						set_bit(STRIPE_DELAYED, &sh->state);
>  					continue; /* it's a failed drive */
> +				}
>  				if (
>  				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
>  					pr_debug("Read_old block "
> @@ -3268,6 +3286,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
>  		if (rdev) {
>  			is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
>  					     &first_bad, &bad_sectors);
> +
>  			if (s->blocked_rdev == NULL
>  			    && (test_bit(Blocked, &rdev->flags)
>  				|| is_bad < 0)) {
> @@ -3279,9 +3298,14 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
>  			}
>  		}
>  		clear_bit(R5_Insync, &dev->flags);
> -		if (!rdev)
> -			/* Not in-sync */;
> -		else if (is_bad) {
> +		if (!rdev) {
> +			s->noworked++;
> +			if ((dev->towrite && test_bit(R5_OVERWRITE, &dev->flags)) ||
> +				test_bit(R5_FailedOverwrite, &dev->flags)) {
> +				s->failed_overwrite++;
> +				set_bit(R5_FailedOverwrite, &dev->flags);
> +			}
> +		} else if (is_bad) {
>  			/* also not in-sync */
>  			if (!test_bit(WriteErrorSeen, &rdev->flags) &&
>  			    test_bit(R5_UPTODATE, &dev->flags)) {
> @@ -3291,6 +3315,14 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
>  				set_bit(R5_Insync, &dev->flags);
>  				set_bit(R5_ReadError, &dev->flags);
>  			}
> +
> +			if (!test_bit(WriteErrorSeen, &rdev->flags) &&
> +				((dev->towrite && test_bit(R5_OVERWRITE, &dev->flags)) ||
> +				test_bit(R5_FailedOverwrite, &dev->flags))) {
> +				s->failed_overwrite++;
> +				set_bit(R5_FailedOverwrite, &dev->flags);
> +			} else if (i == sh->pd_idx || i == sh->qd_idx)
> +				set_bit(R5_BadParity, &dev->flags);
>  		} else if (test_bit(In_sync, &rdev->flags))
>  			set_bit(R5_Insync, &dev->flags);
>  		else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
> @@ -3342,14 +3374,35 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
>  			clear_bit(R5_ReadError, &dev->flags);
>  			clear_bit(R5_ReWrite, &dev->flags);
>  		}
> -		if (test_bit(R5_ReadError, &dev->flags))
> -			clear_bit(R5_Insync, &dev->flags);
> +		if (test_bit(R5_ReadError, &dev->flags)) {
> +			/*
> +			 * For failed-stripe which contained badsectors,after writing
> +			 * it should reread to check.
> +			 */
> +			struct md_rdev *rdev2 = rcu_dereference(
> +				conf->disks[i].rdev);
> +			if (!is_bad && rdev2 && !test_bit(Faulty, &rdev2->flags) &&
> +				test_bit(R5_UPTODATE, &dev->flags) &&
> +				(test_bit(R5_FailedOverwrite, &dev->flags) ||
> +				 test_bit(R5_BadParity, &dev->flags) ||
> +				 test_bit(R5_FailedReread, &dev->flags))) {
> +				set_bit(STRIPE_FAILED_REREAD, &sh->state);
> +				set_bit(R5_FailedReread, &dev->flags);
> +				clear_bit(R5_FailedOverwrite, &dev->flags);
> +				clear_bit(R5_BadParity, &dev->flags);
> +			} else
> +				clear_bit(R5_Insync, &dev->flags);
> +		}
>  		if (!test_bit(R5_Insync, &dev->flags)) {
>  			if (s->failed < 2)
>  				s->failed_num[s->failed] = i;
>  			s->failed++;
>  			if (rdev && !test_bit(Faulty, &rdev->flags))
>  				do_recovery = 1;
> +			if (i == sh->pd_idx)
> +				s->p_failed = 1;
> +			else if (i == sh->qd_idx)
> +				s->q_failed = 1;
>  		}
>  	}
>  	if (test_bit(STRIPE_SYNCING, &sh->state)) {
> @@ -3424,19 +3477,43 @@ static void handle_stripe(struct stripe_head *sh)
>  	}
>  
>  	pr_debug("locked=%d uptodate=%d to_read=%d"
> -	       " to_write=%d failed=%d failed_num=%d,%d\n",
> +	       " to_write=%d failed=%d failed_num=%d,%d"
> +	       " failed_overwrite=%d p_failed=%d q_failed=%d\n",
>  	       s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
> -	       s.failed_num[0], s.failed_num[1]);
> +	       s.failed_num[0], s.failed_num[1], s.failed_overwrite,
> +	       s.p_failed, s.q_failed);
>  	/* check if the array has lost more than max_degraded devices and,
>  	 * if so, some requests might need to be failed.
>  	 */
>  	if (s.failed > conf->max_degraded) {
> -		sh->check_state = 0;
> -		sh->reconstruct_state = 0;
> -		if (s.to_read+s.to_write+s.written)
> -			handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
> -		if (s.syncing + s.replacing)
> -			handle_failed_sync(conf, sh, &s);
> +		/*
> +		 *Disks which removed or faulty must less or equal max_degraded
> +		 */
> +		if (s.noworked <= conf->max_degraded &&
> +			((conf->level < 6 &&
> +			(s.failed - s.failed_overwrite <= s.p_failed)) ||
> +			(conf->level == 6 &&
> +			(s.failed - s.failed_overwrite <= s.p_failed + s.q_failed)))) {
> +			set_bit(STRIPE_FAILED_WRITE, &sh->state);
> +			pr_debug("stripe(%llu) failed, try to recorrect\n",
> +				(unsigned long long)sh->sector);
> +		} else if ((s.failed_overwrite > 0 &&
> +			!test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)))
> +			pr_debug("stripe(%lld) delay for failed-write\n",
> +				(unsigned long long)sh->sector);
> +		else {
> +			sh->check_state = 0;
> +			sh->reconstruct_state = 0;
> +			clear_bit(STRIPE_FAILED_WRITE, &sh->state);
> +			clear_bit(STRIPE_FAILED_REREAD, &sh->state);
> +			if (s.to_read+s.to_write+s.written) {
> +				handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
> +				pr_debug("handle_failed_stripe:%lld\n",
> +					(unsigned long long)sh->sector);
> +			}
> +			if (s.syncing + s.replacing)
> +				handle_failed_sync(conf, sh, &s);
> +		}
>  	}
>  
>  	/*
> @@ -3487,6 +3564,8 @@ static void handle_stripe(struct stripe_head *sh)
>  		BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
>  		BUG_ON(sh->qd_idx >= 0 &&
>  		       !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags));
> +
> +		clear_bit(STRIPE_FAILED_WRITE, &sh->state);
>  		for (i = disks; i--; ) {
>  			struct r5dev *dev = &sh->dev[i];
>  			if (test_bit(R5_LOCKED, &dev->flags) &&
> @@ -3504,6 +3583,7 @@ static void handle_stripe(struct stripe_head *sh)
>  		}
>  		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
>  			s.dec_preread_active = 1;
> +
>  	}
>  
>  	/* Now to consider new write requests and what else, if anything
> @@ -3548,10 +3628,30 @@ static void handle_stripe(struct stripe_head *sh)
>  		clear_bit(STRIPE_SYNCING, &sh->state);
>  	}
>  
> -	/* If the failed drives are just a ReadError, then we might need
> -	 * to progress the repair/check process
> +	/*
> +	 * If stripe failed because badsector and
> +	 * can write to correct the badsector
> +	 * we used readerr routine which rewrite and reread
>  	 */
> -	if (s.failed <= conf->max_degraded && !conf->mddev->ro)
> +	if (test_bit(STRIPE_FAILED_REREAD, &sh->state)) {
> +		for (i = disks; i--; ) {
> +			struct r5dev *dev = &sh->dev[i];
> +			if (test_bit(R5_FailedReread, &dev->flags)
> +			    && !test_bit(R5_LOCKED, &dev->flags)
> +			    && test_bit(R5_UPTODATE, &dev->flags)) {
> +				/* it just wrote and sucessed,so not
> +				 * necessary to rewrite,only set flag
> +				 */
> +				set_bit(R5_ReWrite, &dev->flags);
> +				set_bit(R5_Wantread, &dev->flags);
> +				set_bit(R5_LOCKED, &dev->flags);
> +				s.locked++;
> +			}
> +		}
> +	} else if (s.failed <= conf->max_degraded && !conf->mddev->ro)
> +		/* If the failed drives are just a ReadError, then we might need
> +		 * to progress the repair/check process
> +		 */
>  		for (i = 0; i < s.failed; i++) {
>  			struct r5dev *dev = &sh->dev[s.failed_num[i]];
>  			if (test_bit(R5_ReadError, &dev->flags)
> @@ -3571,7 +3671,10 @@ static void handle_stripe(struct stripe_head *sh)
>  				}
>  			}
>  		}
> +
>  
> +
> +
>  
>  	/* Finish reconstruct operations initiated by the expansion process */
>  	if (sh->reconstruct_state == reconstruct_state_result) {
> @@ -3648,7 +3751,7 @@ finish:
>  			if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
>  				rdev = conf->disks[i].rdev;
>  				rdev_clear_badblocks(rdev, sh->sector,
> -						     STRIPE_SECTORS, 0);
> +				     STRIPE_SECTORS, 0);
>  				rdev_dec_pending(rdev, conf->mddev);
>  			}
>  			if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
> @@ -3853,7 +3956,6 @@ static void raid5_align_endio(struct bio *bi, int error)
>  
>  
>  	pr_debug("raid5_align_endio : io error...handing IO for a retry\n");
> -
>  	add_bio_to_retry(raid_bi, conf);
>  }
>  
> @@ -4088,7 +4190,6 @@ static void make_request(struct mddev *mddev, struct bio * bi)
>  	     mddev->reshape_position == MaxSector &&
>  	     chunk_aligned_read(mddev,bi))
>  		return;
> -
>  	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
>  	last_sector = bi->bi_sector + (bi->bi_size>>9);
>  	bi->bi_next = NULL;
> @@ -4586,8 +4687,9 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
>  		handled++;
>  	}
>  	remaining = raid5_dec_bi_active_stripes(raid_bio);
> -	if (remaining == 0)
> +	if (remaining == 0)
>  		bio_endio(raid_bio, 0);
> +
>  	if (atomic_dec_and_test(&conf->active_aligned_reads))
>  		wake_up(&conf->wait_for_stripe);
>  	return handled;
> diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
> index a9fc249..04709d7 100644
> --- a/drivers/md/raid5.h
> +++ b/drivers/md/raid5.h
> @@ -251,6 +251,9 @@ struct stripe_head_state {
>  	 */
>  	int syncing, expanding, expanded, replacing;
>  	int locked, uptodate, to_read, to_write, failed, written;
> +	/* Overwrite disks which faulty or removed or contained badsector */
> +	int failed_overwrite;
> +	int noworked;	/*disk faulty or removed*/
>  	int to_fill, compute, req_compute, non_overwrite;
>  	int failed_num[2];
>  	int p_failed, q_failed;
> @@ -298,6 +301,9 @@ enum r5dev_flags {
>  	R5_WantReplace, /* We need to update the replacement, we have read
>  			 * data in, and now is a good time to write it out.
>  			 */
> +	R5_FailedOverwrite, /*overwrite on disk which removed or faulty or had badsectors*/
> +	R5_FailedReread,
> +	R5_BadParity,
>  };
>  
>  /*
> @@ -322,6 +328,8 @@ enum {
>  	STRIPE_COMPUTE_RUN,
>  	STRIPE_OPS_REQ_PENDING,
>  	STRIPE_ON_UNPLUG_LIST,
> +	STRIPE_FAILED_WRITE,
> +	STRIPE_FAILED_REREAD,
>  };
>  
>  /*


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 828 bytes --]

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: Re: [RFC PATCH] raid5: Correct some failed-stripe which because the badsector.
  2012-09-26  1:20 ` NeilBrown
@ 2012-09-26  5:25   ` Jianpeng Ma
  2013-01-07 22:17     ` NeilBrown
  0 siblings, 1 reply; 4+ messages in thread
From: Jianpeng Ma @ 2012-09-26  5:25 UTC (permalink / raw)
  To: Neil Brown; +Cc: linux-raid, 周, 麒, 边, 雨

On 2012-09-26 09:20 NeilBrown <neilb@suse.de> Wrote:
>On Sat, 22 Sep 2012 11:14:47 +0800 "Jianpeng Ma" <majianpeng@gmail.com> wrote:
>
>> Because raid5 had implemented badsector feature,some failed-stripe which because the
>> badsector can try to correct instead of only failing.
>> 
>> For example:
>> 1:Create a raid5 by four disks missing/sdb/sdc/sdd.
>> 2:If readed  data from sdb and met error,so the stripe is failed
>> 3:If later write-stripe contains sdb which overwrite.The stripe has a
>> chance to correct.
>
>Your patch looks much more complicated that should be necessary, and you
>don't provide any explanation for how it works, so it is very hard for me to
>review.
>
Before implementing badsector log, failed-stripes were becasue the faulty or removed disks.
For those, it's unnecessary to do something for those failed-stripe.
But after implementing badsector log, failed-stripe can by some badsector.
One example, RAID5 has four disk: one removed and one contained badsector, so the stripe is failed;
Another example is recovery, when recovery a stripe and fail,so all the disks set badsector.
But for above example, if full-write on this stripe,the badsector will be correct and failed-stripe become normal.

This patch is do this to try recorrect some failed-stripe because badsectors.
The workflow is:
1:in func analyse_stripe, we count the overwrite-on-failed-disks(contained badsector or faulty or removed), noworkeddisk(faulty or remove)
2:in func handle_stripe, it determines whether to be processed.There will be three cases:
A: noworkdisk > max_degraded. It indicates there are many removed or faulty disks.It only failed
B: All faild-disks are overwrited,which dosen't need read old data.So it can read other goog disks or computer parity data using RCW.
C: Some failed-disks are overwrited.If stripe didn't set STRIPE_PREREAD_ACTIVE, stripe can delay for waitting other failed-disks write-data.
Otherwise, it only failed the stripe.
3;in func handle_stripe_dirtying do two things:
	A: delay for failed-disks write-data
	B: set RCW to control this situation
4:After successly writed, one thing must to do.It will be rewrite and reread for check.
But for this case, it's unnecessary to rewrite,it only reread the failed-disks(only contains badsector).
  
>I would think you just need to modify analyse_stripe slightly so that it
>detects the "overwrite-a-bad-block" case and marks that device as
>'in_sync' (if there hasn't been a write error on the device).
I think it doesn't work. Because when some overwrite-a-bad-block arrived,but the stripe is still failed.So we must delay for other data unless
stripe already setted STRIPE_PREREAD_ACTIVE.

>
>Then make sure that handle_stripe_dirtying detects that case and doesn't try
>to read, so forces 'rcw'
>
>You patch does seem to do some of those things, but it does a lot more as
>well that doesn't seem to be justified.
>It also makes pointless changes to formatting which makes the patch harder to
>read.
>
>Please
> - keep the patch as small as possible.
> - explain what is happening in as much detail as possible.
>
>NeilBrown
>
>
>
>> 4:If step3 successed, add a spare disk to raid,the raid can be active
>> from degraded.
>> 
>> There are two methods to compute raid5 parity,rcw and rmw.
>> In this situation,it only used rcw.
>> The purpose is only change state of stripe from failed to degraded or
>> active.
>> 
>> NOTE:
>> The patch looks ugly. It is post out mainly to make sure it is 
>> going in the correct direction and hope to get some helpful comments from other guys.
>> 
>> Tested-by: Qi Zhou <qi.g.zhou@gmail.com>
>> Signed-off-by: Yu Bian <ycbzzjlbyby@gmail.com>
>> Signed-off-by: Jianpeng Ma <majianpeng@gmail.com>
>> ---
>>  drivers/md/raid5.c |  152 +++++++++++++++++++++++++++++++++++++++++++---------
>>  drivers/md/raid5.h |    8 +++
>>  2 files changed, 135 insertions(+), 25 deletions(-)
>> 
>> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>> index adda94d..d085fa4 100644
>> --- a/drivers/md/raid5.c
>> +++ b/drivers/md/raid5.c
>> @@ -220,6 +220,7 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
>>  			    < IO_THRESHOLD)
>>  				md_wakeup_thread(conf->mddev->thread);
>>  		atomic_dec(&conf->active_stripes);
>> +		clear_bit(STRIPE_FAILED_REREAD, &sh->state);
>>  		if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
>>  			list_add_tail(&sh->lru, &conf->inactive_list);
>>  			wake_up(&conf->wait_for_stripe);
>> @@ -548,9 +549,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
>>  			else
>>  				rw = WRITE;
>>  		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
>> -			rw = READ;
>> +				rw = READ;
>>  		else if (test_and_clear_bit(R5_WantReplace,
>> -					    &sh->dev[i].flags)) {
>> +				&sh->dev[i].flags)) {
>>  			rw = WRITE;
>>  			replace_only = 1;
>>  		} else
>> @@ -1737,6 +1738,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
>>  		s = sh->sector + rdev->new_data_offset;
>>  	else
>>  		s = sh->sector + rdev->data_offset;
>> +
>>  	if (uptodate) {
>>  		set_bit(R5_UPTODATE, &sh->dev[i].flags);
>>  		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
>> @@ -1754,6 +1756,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
>>  			atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
>>  			clear_bit(R5_ReadError, &sh->dev[i].flags);
>>  			clear_bit(R5_ReWrite, &sh->dev[i].flags);
>> +			clear_bit(R5_FailedReread, &sh->dev[i].flags);
>>  		} else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
>>  			clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
>>  
>> @@ -1809,6 +1812,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
>>  		else {
>>  			clear_bit(R5_ReadError, &sh->dev[i].flags);
>>  			clear_bit(R5_ReWrite, &sh->dev[i].flags);
>> +			clear_bit(R5_FailedReread, &sh->dev[i].flags);
>>  			if (!(set_bad
>>  			      && test_bit(In_sync, &rdev->flags)
>>  			      && rdev_set_badblocks(
>> @@ -1852,8 +1856,8 @@ static void raid5_end_write_request(struct bio *bi, int error)
>>  		}
>>  	}
>>  	pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
>> -		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
>> -		uptodate);
>> +		(unsigned long long)sh->sector, i,
>> +		atomic_read(&sh->count), uptodate);
>>  	if (i == disks) {
>>  		BUG();
>>  		return;
>> @@ -1870,6 +1874,7 @@ static void raid5_end_write_request(struct bio *bi, int error)
>>  		if (!uptodate) {
>>  			set_bit(WriteErrorSeen, &rdev->flags);
>>  			set_bit(R5_WriteError, &sh->dev[i].flags);
>> +			clear_bit(R5_FailedOverwrite, &sh->dev[i].flags);
>>  			if (!test_and_set_bit(WantReplacement, &rdev->flags))
>>  				set_bit(MD_RECOVERY_NEEDED,
>>  					&rdev->mddev->recovery);
>> @@ -2479,6 +2484,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
>>  			bitmap_end = 1;
>>  		}
>>  
>> +		if (test_and_clear_bit(R5_FailedOverwrite, &sh->dev[i].flags))
>> +			s->failed_overwrite--;;
>> +		clear_bit(R5_BadParity,	&sh->dev[i].flags);
>>  		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
>>  			wake_up(&conf->wait_for_overlap);
>>  
>> @@ -2776,6 +2784,10 @@ static void handle_stripe_dirtying(struct r5conf *conf,
>>  		 * look like rcw is cheaper
>>  		 */
>>  		rcw = 1; rmw = 2;
>> +	} else if (s->failed > conf->max_degraded) {
>> +		/*For write failed-stripe, only use rcw */
>> +		rcw = 1;
>> +		rmw = 2;
>>  	} else for (i = disks; i--; ) {
>>  		/* would I have to read this buffer for read_modify_write */
>>  		struct r5dev *dev = &sh->dev[i];
>> @@ -2826,6 +2838,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
>>  	if (rcw <= rmw && rcw > 0) {
>>  		/* want reconstruct write, but need to get some data */
>>  		rcw = 0;
>> +
>>  		for (i = disks; i--; ) {
>>  			struct r5dev *dev = &sh->dev[i];
>>  			if (!test_bit(R5_OVERWRITE, &dev->flags) &&
>> @@ -2834,8 +2847,13 @@ static void handle_stripe_dirtying(struct r5conf *conf,
>>  			    !(test_bit(R5_UPTODATE, &dev->flags) ||
>>  			      test_bit(R5_Wantcompute, &dev->flags))) {
>>  				rcw++;
>> -				if (!test_bit(R5_Insync, &dev->flags))
>> +				if (!test_bit(R5_Insync, &dev->flags)) {
>> +					/* Only for all data-disks !R5_Insync */
>> +					if (s->failed > conf->max_degraded &&
>> +						!test_bit(STRIPE_FAILED_WRITE, &sh->state))
>> +						set_bit(STRIPE_DELAYED, &sh->state);
>>  					continue; /* it's a failed drive */
>> +				}
>>  				if (
>>  				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
>>  					pr_debug("Read_old block "
>> @@ -3268,6 +3286,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
>>  		if (rdev) {
>>  			is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
>>  					     &first_bad, &bad_sectors);
>> +
>>  			if (s->blocked_rdev == NULL
>>  			    && (test_bit(Blocked, &rdev->flags)
>>  				|| is_bad < 0)) {
>> @@ -3279,9 +3298,14 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
>>  			}
>>  		}
>>  		clear_bit(R5_Insync, &dev->flags);
>> -		if (!rdev)
>> -			/* Not in-sync */;
>> -		else if (is_bad) {
>> +		if (!rdev) {
>> +			s->noworked++;
>> +			if ((dev->towrite && test_bit(R5_OVERWRITE, &dev->flags)) ||
>> +				test_bit(R5_FailedOverwrite, &dev->flags)) {
>> +				s->failed_overwrite++;
>> +				set_bit(R5_FailedOverwrite, &dev->flags);
>> +			}
>> +		} else if (is_bad) {
>>  			/* also not in-sync */
>>  			if (!test_bit(WriteErrorSeen, &rdev->flags) &&
>>  			    test_bit(R5_UPTODATE, &dev->flags)) {
>> @@ -3291,6 +3315,14 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
>>  				set_bit(R5_Insync, &dev->flags);
>>  				set_bit(R5_ReadError, &dev->flags);
>>  			}
>> +
>> +			if (!test_bit(WriteErrorSeen, &rdev->flags) &&
>> +				((dev->towrite && test_bit(R5_OVERWRITE, &dev->flags)) ||
>> +				test_bit(R5_FailedOverwrite, &dev->flags))) {
>> +				s->failed_overwrite++;
>> +				set_bit(R5_FailedOverwrite, &dev->flags);
>> +			} else if (i == sh->pd_idx || i == sh->qd_idx)
>> +				set_bit(R5_BadParity, &dev->flags);
>>  		} else if (test_bit(In_sync, &rdev->flags))
>>  			set_bit(R5_Insync, &dev->flags);
>>  		else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
>> @@ -3342,14 +3374,35 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
>>  			clear_bit(R5_ReadError, &dev->flags);
>>  			clear_bit(R5_ReWrite, &dev->flags);
>>  		}
>> -		if (test_bit(R5_ReadError, &dev->flags))
>> -			clear_bit(R5_Insync, &dev->flags);
>> +		if (test_bit(R5_ReadError, &dev->flags)) {
>> +			/*
>> +			 * For failed-stripe which contained badsectors,after writing
>> +			 * it should reread to check.
>> +			 */
>> +			struct md_rdev *rdev2 = rcu_dereference(
>> +				conf->disks[i].rdev);
>> +			if (!is_bad && rdev2 && !test_bit(Faulty, &rdev2->flags) &&
>> +				test_bit(R5_UPTODATE, &dev->flags) &&
>> +				(test_bit(R5_FailedOverwrite, &dev->flags) ||
>> +				 test_bit(R5_BadParity, &dev->flags) ||
>> +				 test_bit(R5_FailedReread, &dev->flags))) {
>> +				set_bit(STRIPE_FAILED_REREAD, &sh->state);
>> +				set_bit(R5_FailedReread, &dev->flags);
>> +				clear_bit(R5_FailedOverwrite, &dev->flags);
>> +				clear_bit(R5_BadParity, &dev->flags);
>> +			} else
>> +				clear_bit(R5_Insync, &dev->flags);
>> +		}
>>  		if (!test_bit(R5_Insync, &dev->flags)) {
>>  			if (s->failed < 2)
>>  				s->failed_num[s->failed] = i;
>>  			s->failed++;
>>  			if (rdev && !test_bit(Faulty, &rdev->flags))
>>  				do_recovery = 1;
>> +			if (i == sh->pd_idx)
>> +				s->p_failed = 1;
>> +			else if (i == sh->qd_idx)
>> +				s->q_failed = 1;
>>  		}
>>  	}
>>  	if (test_bit(STRIPE_SYNCING, &sh->state)) {
>> @@ -3424,19 +3477,43 @@ static void handle_stripe(struct stripe_head *sh)
>>  	}
>>  
>>  	pr_debug("locked=%d uptodate=%d to_read=%d"
>> -	       " to_write=%d failed=%d failed_num=%d,%d\n",
>> +	       " to_write=%d failed=%d failed_num=%d,%d"
>> +	       " failed_overwrite=%d p_failed=%d q_failed=%d\n",
>>  	       s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
>> -	       s.failed_num[0], s.failed_num[1]);
>> +	       s.failed_num[0], s.failed_num[1], s.failed_overwrite,
>> +	       s.p_failed, s.q_failed);
>>  	/* check if the array has lost more than max_degraded devices and,
>>  	 * if so, some requests might need to be failed.
>>  	 */
>>  	if (s.failed > conf->max_degraded) {
>> -		sh->check_state = 0;
>> -		sh->reconstruct_state = 0;
>> -		if (s.to_read+s.to_write+s.written)
>> -			handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
>> -		if (s.syncing + s.replacing)
>> -			handle_failed_sync(conf, sh, &s);
>> +		/*
>> +		 *Disks which removed or faulty must less or equal max_degraded
>> +		 */
>> +		if (s.noworked <= conf->max_degraded &&
>> +			((conf->level < 6 &&
>> +			(s.failed - s.failed_overwrite <= s.p_failed)) ||
>> +			(conf->level == 6 &&
>> +			(s.failed - s.failed_overwrite <= s.p_failed + s.q_failed)))) {
>> +			set_bit(STRIPE_FAILED_WRITE, &sh->state);
>> +			pr_debug("stripe(%llu) failed, try to recorrect\n",
>> +				(unsigned long long)sh->sector);
>> +		} else if ((s.failed_overwrite > 0 &&
>> +			!test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)))
>> +			pr_debug("stripe(%lld) delay for failed-write\n",
>> +				(unsigned long long)sh->sector);
>> +		else {
>> +			sh->check_state = 0;
>> +			sh->reconstruct_state = 0;
>> +			clear_bit(STRIPE_FAILED_WRITE, &sh->state);
>> +			clear_bit(STRIPE_FAILED_REREAD, &sh->state);
>> +			if (s.to_read+s.to_write+s.written) {
>> +				handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
>> +				pr_debug("handle_failed_stripe:%lld\n",
>> +					(unsigned long long)sh->sector);
>> +			}
>> +			if (s.syncing + s.replacing)
>> +				handle_failed_sync(conf, sh, &s);
>> +		}
>>  	}
>>  
>>  	/*
>> @@ -3487,6 +3564,8 @@ static void handle_stripe(struct stripe_head *sh)
>>  		BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
>>  		BUG_ON(sh->qd_idx >= 0 &&
>>  		       !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags));
>> +
>> +		clear_bit(STRIPE_FAILED_WRITE, &sh->state);
>>  		for (i = disks; i--; ) {
>>  			struct r5dev *dev = &sh->dev[i];
>>  			if (test_bit(R5_LOCKED, &dev->flags) &&
>> @@ -3504,6 +3583,7 @@ static void handle_stripe(struct stripe_head *sh)
>>  		}
>>  		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
>>  			s.dec_preread_active = 1;
>> +
>>  	}
>>  
>>  	/* Now to consider new write requests and what else, if anything
>> @@ -3548,10 +3628,30 @@ static void handle_stripe(struct stripe_head *sh)
>>  		clear_bit(STRIPE_SYNCING, &sh->state);
>>  	}
>>  
>> -	/* If the failed drives are just a ReadError, then we might need
>> -	 * to progress the repair/check process
>> +	/*
>> +	 * If stripe failed because badsector and
>> +	 * can write to correct the badsector
>> +	 * we used readerr routine which rewrite and reread
>>  	 */
>> -	if (s.failed <= conf->max_degraded && !conf->mddev->ro)
>> +	if (test_bit(STRIPE_FAILED_REREAD, &sh->state)) {
>> +		for (i = disks; i--; ) {
>> +			struct r5dev *dev = &sh->dev[i];
>> +			if (test_bit(R5_FailedReread, &dev->flags)
>> +			    && !test_bit(R5_LOCKED, &dev->flags)
>> +			    && test_bit(R5_UPTODATE, &dev->flags)) {
>> +				/* it just wrote and sucessed,so not
>> +				 * necessary to rewrite,only set flag
>> +				 */
>> +				set_bit(R5_ReWrite, &dev->flags);
>> +				set_bit(R5_Wantread, &dev->flags);
>> +				set_bit(R5_LOCKED, &dev->flags);
>> +				s.locked++;
>> +			}
>> +		}
>> +	} else if (s.failed <= conf->max_degraded && !conf->mddev->ro)
>> +		/* If the failed drives are just a ReadError, then we might need
>> +		 * to progress the repair/check process
>> +		 */
>>  		for (i = 0; i < s.failed; i++) {
>>  			struct r5dev *dev = &sh->dev[s.failed_num[i]];
>>  			if (test_bit(R5_ReadError, &dev->flags)
>> @@ -3571,7 +3671,10 @@ static void handle_stripe(struct stripe_head *sh)
>>  				}
>>  			}
>>  		}
>> +
>>  
>> +
>> +
>>  
>>  	/* Finish reconstruct operations initiated by the expansion process */
>>  	if (sh->reconstruct_state == reconstruct_state_result) {
>> @@ -3648,7 +3751,7 @@ finish:
>>  			if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
>>  				rdev = conf->disks[i].rdev;
>>  				rdev_clear_badblocks(rdev, sh->sector,
>> -						     STRIPE_SECTORS, 0);
>> +				     STRIPE_SECTORS, 0);
>>  				rdev_dec_pending(rdev, conf->mddev);
>>  			}
>>  			if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
>> @@ -3853,7 +3956,6 @@ static void raid5_align_endio(struct bio *bi, int error)
>>  
>>  
>>  	pr_debug("raid5_align_endio : io error...handing IO for a retry\n");
>> -
>>  	add_bio_to_retry(raid_bi, conf);
>>  }
>>  
>> @@ -4088,7 +4190,6 @@ static void make_request(struct mddev *mddev, struct bio * bi)
>>  	     mddev->reshape_position == MaxSector &&
>>  	     chunk_aligned_read(mddev,bi))
>>  		return;
>> -
>>  	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
>>  	last_sector = bi->bi_sector + (bi->bi_size>>9);
>>  	bi->bi_next = NULL;
>> @@ -4586,8 +4687,9 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
>>  		handled++;
>>  	}
>>  	remaining = raid5_dec_bi_active_stripes(raid_bio);
>> -	if (remaining == 0)
>> +	if (remaining == 0)
>>  		bio_endio(raid_bio, 0);
>> +
>>  	if (atomic_dec_and_test(&conf->active_aligned_reads))
>>  		wake_up(&conf->wait_for_stripe);
>>  	return handled;
>> diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
>> index a9fc249..04709d7 100644
>> --- a/drivers/md/raid5.h
>> +++ b/drivers/md/raid5.h
>> @@ -251,6 +251,9 @@ struct stripe_head_state {
>>  	 */
>>  	int syncing, expanding, expanded, replacing;
>>  	int locked, uptodate, to_read, to_write, failed, written;
>> +	/* Overwrite disks which faulty or removed or contained badsector */
>> +	int failed_overwrite;
>> +	int noworked;	/*disk faulty or removed*/
>>  	int to_fill, compute, req_compute, non_overwrite;
>>  	int failed_num[2];
>>  	int p_failed, q_failed;
>> @@ -298,6 +301,9 @@ enum r5dev_flags {
>>  	R5_WantReplace, /* We need to update the replacement, we have read
>>  			 * data in, and now is a good time to write it out.
>>  			 */
>> +	R5_FailedOverwrite, /*overwrite on disk which removed or faulty or had badsectors*/
>> +	R5_FailedReread,
>> +	R5_BadParity,
>>  };
>>  
>>  /*
>> @@ -322,6 +328,8 @@ enum {
>>  	STRIPE_COMPUTE_RUN,
>>  	STRIPE_OPS_REQ_PENDING,
>>  	STRIPE_ON_UNPLUG_LIST,
>> +	STRIPE_FAILED_WRITE,
>> +	STRIPE_FAILED_REREAD,
>>  };
>>  
>>  /*
>
>

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [RFC PATCH] raid5: Correct some failed-stripe which because the badsector.
  2012-09-26  5:25   ` Jianpeng Ma
@ 2013-01-07 22:17     ` NeilBrown
  0 siblings, 0 replies; 4+ messages in thread
From: NeilBrown @ 2013-01-07 22:17 UTC (permalink / raw)
  To: Jianpeng Ma; +Cc: linux-raid, 周, 麒, 边, 雨

[-- Attachment #1: Type: text/plain, Size: 3966 bytes --]

On Wed, 26 Sep 2012 13:25:21 +0800 "Jianpeng Ma" <majianpeng@gmail.com> wrote:

> On 2012-09-26 09:20 NeilBrown <neilb@suse.de> Wrote:
> >On Sat, 22 Sep 2012 11:14:47 +0800 "Jianpeng Ma" <majianpeng@gmail.com> wrote:
> >
> >> Because raid5 had implemented badsector feature,some failed-stripe which because the
> >> badsector can try to correct instead of only failing.
> >> 
> >> For example:
> >> 1:Create a raid5 by four disks missing/sdb/sdc/sdd.
> >> 2:If readed  data from sdb and met error,so the stripe is failed
> >> 3:If later write-stripe contains sdb which overwrite.The stripe has a
> >> chance to correct.
> >
> >Your patch looks much more complicated that should be necessary, and you
> >don't provide any explanation for how it works, so it is very hard for me to
> >review.
> >
> Before implementing badsector log, failed-stripes were becasue the faulty or removed disks.
> For those, it's unnecessary to do something for those failed-stripe.
> But after implementing badsector log, failed-stripe can by some badsector.
> One example, RAID5 has four disk: one removed and one contained badsector, so the stripe is failed;
> Another example is recovery, when recovery a stripe and fail,so all the disks set badsector.
> But for above example, if full-write on this stripe,the badsector will be correct and failed-stripe become normal.
> 
> This patch is do this to try recorrect some failed-stripe because badsectors.
> The workflow is:
> 1:in func analyse_stripe, we count the overwrite-on-failed-disks(contained badsector or faulty or removed), noworkeddisk(faulty or remove)
> 2:in func handle_stripe, it determines whether to be processed.There will be three cases:
> A: noworkdisk > max_degraded. It indicates there are many removed or faulty disks.It only failed
> B: All faild-disks are overwrited,which dosen't need read old data.So it can read other goog disks or computer parity data using RCW.
> C: Some failed-disks are overwrited.If stripe didn't set STRIPE_PREREAD_ACTIVE, stripe can delay for waitting other failed-disks write-data.
> Otherwise, it only failed the stripe.
> 3;in func handle_stripe_dirtying do two things:
> 	A: delay for failed-disks write-data
> 	B: set RCW to control this situation
> 4:After successly writed, one thing must to do.It will be rewrite and reread for check.
> But for this case, it's unnecessary to rewrite,it only reread the failed-disks(only contains badsector).


Thanks - I think I know what you are trying to do now.

Please resend the patch with *only* the changes that you actually need and
I'll try to review it.


i.e.

> >> @@ -548,9 +549,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
> >>  			else
> >>  				rw = WRITE;
> >>  		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
> >> -			rw = READ;
> >> +				rw = READ;
> >>  		else if (test_and_clear_bit(R5_WantReplace,
> >> -					    &sh->dev[i].flags)) {
> >> +				&sh->dev[i].flags)) {
> >>  			rw = WRITE;
> >>  			replace_only = 1;
> >>  		} else

This is purely cosmetic so shouldn't be in the patch.


> >> @@ -1737,6 +1738,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
> >>  		s = sh->sector + rdev->new_data_offset;
> >>  	else
> >>  		s = sh->sector + rdev->data_offset;
> >> +
> >>  	if (uptodate) {
> >>  		set_bit(R5_UPTODATE, &sh->dev[i].flags);
> >>  		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {

So is this.


> >> @@ -1852,8 +1856,8 @@ static void raid5_end_write_request(struct bio *bi, int error)
> >>  		}
> >>  	}
> >>  	pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
> >> -		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
> >> -		uptodate);
> >> +		(unsigned long long)sh->sector, i,
> >> +		atomic_read(&sh->count), uptodate);
> >>  	if (i == disks) {
> >>  		BUG();
> >>  		return;

So is this.
etc.

NeilBrown



[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 828 bytes --]

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2013-01-07 22:17 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2012-09-22  3:14 [RFC PATCH] raid5: Correct some failed-stripe which because the badsector Jianpeng Ma
2012-09-26  1:20 ` NeilBrown
2012-09-26  5:25   ` Jianpeng Ma
2013-01-07 22:17     ` NeilBrown

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).