All of lore.kernel.org
 help / color / mirror / Atom feed
From: "majianpeng" <majianpeng@gmail.com>
To: NeilBrown <neilb@suse.de>
Cc: linux-raid <linux-raid@vger.kernel.org>
Subject: Re: Re: md/raid5:Fix recover/replace stop if handle stipe failed
Date: Tue, 27 Mar 2012 15:33:25 +0800	[thread overview]
Message-ID: <201203271533149067900@gmail.com> (raw)
In-Reply-To: 20120327142642.3e40547d@notabene.brown

I tested and corrected some bug.

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 23ac880..f8480f8 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2471,39 +2471,42 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
        int abort = 0;
        int i;
 
-       md_done_sync(conf->mddev, STRIPE_SECTORS, 0);
        clear_bit(STRIPE_SYNCING, &sh->state);
        s->syncing = 0;
        s->replacing = 0;
        /* There is nothing more to do for sync/check/repair.
+        * Don't even need to abort as that is handled elsewhere
+        * if needed, and not always wanted e.g. if there is a known
+        * bad block here.
         * For recover/replace we need to record a bad block on all
         * non-sync devices, or abort the recovery
         */
-       if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery))
-               return;
-       /* During recovery devices cannot be removed, so locking and
-        * refcounting of rdevs is not needed
-        */
-       for (i = 0; i < conf->raid_disks; i++) {
-               struct md_rdev *rdev = conf->disks[i].rdev;
-               if (rdev
-                   && !test_bit(Faulty, &rdev->flags)
-                   && !test_bit(In_sync, &rdev->flags)
-                   && !rdev_set_badblocks(rdev, sh->sector,
-                                          STRIPE_SECTORS, 0))
-                       abort = 1;
-               rdev = conf->disks[i].replacement;
-               if (rdev
-                   && !test_bit(Faulty, &rdev->flags)
-                   && !test_bit(In_sync, &rdev->flags)
-                   && !rdev_set_badblocks(rdev, sh->sector,
-                                          STRIPE_SECTORS, 0))
-                       abort = 1;
-       }
-       if (abort) {
-               conf->recovery_disabled = conf->mddev->recovery_disabled;
-               set_bit(MD_RECOVERY_INTR, &conf->mddev->recovery);
-       }
+       if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) {
+               /* During recovery devices cannot be removed, so
+                * locking and refcounting of rdevs is not needed
+                */
+               for (i = 0; i < conf->raid_disks; i++) {
+                       struct md_rdev *rdev = conf->disks[i].rdev;
+                       if (rdev
+                           && !test_bit(Faulty, &rdev->flags)
+                           && !test_bit(In_sync, &rdev->flags)
+                           && !rdev_set_badblocks(rdev, sh->sector,
+                                                  STRIPE_SECTORS, 0))
+                               abort = 1;
+                       rdev = conf->disks[i].replacement;
+                       if (rdev
+                           && !test_bit(Faulty, &rdev->flags)
+                           && !test_bit(In_sync, &rdev->flags)
+                           && !rdev_set_badblocks(rdev, sh->sector,
+                                                  STRIPE_SECTORS, 0))
+                               abort = 1;
+               }
+               if (abort)
+                       conf->recovery_disabled =
+                               conf->mddev->recovery_disabled;
+       } else
+               abort = 1;
+       md_done_sync(conf->mddev, STRIPE_SECTORS, !abort);
 }
 
 static int want_replace(struct stripe_head *sh, int disk_idx)
@@ -3203,7 +3206,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
                        /* Not in-sync */;
                else if (is_bad) {
                        /* also not in-sync */
-                       if (!test_bit(WriteErrorSeen, &rdev->flags)) {
+                       if (!test_bit(WriteErrorSeen, &rdev->flags) &&
+                           test_bit(R5_UPTODATE, &dev->flags)) {
                                /* treat as in-sync, but with a read error
                                 * which we can now try to correct
                                 */


------------------				 
majianpeng
2012-03-27

-------------------------------------------------------------
发件人:NeilBrown
发送日期:2012-03-27 11:26:50
收件人:majianpeng
抄送:linux-raid
主题:Re: md/raid5:Fix recover/replace stop if handle stipe failed

On Wed, 14 Mar 2012 17:27:44 +0800 "majianpeng" <majianpeng@gmail.com> wrote:

> I created a raid5 using three disks and disk0 add bad blocks.I set faulty disk2 and remov disk2 and readd disk2.
> It seems to recover well and set disk2 badblocks  as disk0.
> But the md0_resync repeatly stop and start.
> The recovery_start of disk2 all the same .
> 

Thanks for the extra details (and sorry for the delay in replying).

There certainly is something wrong with handling bad blocks during recovery.

I think this patch should fix it.  Are you able to test it and confirm?

Thanks,
NeilBrown


diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 23ac880..2186e0e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2471,39 +2471,41 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
 	int abort = 0;
 	int i;
 
-	md_done_sync(conf->mddev, STRIPE_SECTORS, 0);
 	clear_bit(STRIPE_SYNCING, &sh->state);
 	s->syncing = 0;
 	s->replacing = 0;
 	/* There is nothing more to do for sync/check/repair.
+	 * Don't even need to abort as that is handled elsewhere
+	 * if needed, and not always wanted e.g. if there is a known
+	 * bad block here.
 	 * For recover/replace we need to record a bad block on all
 	 * non-sync devices, or abort the recovery
 	 */
-	if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery))
-		return;
-	/* During recovery devices cannot be removed, so locking and
-	 * refcounting of rdevs is not needed
-	 */
-	for (i = 0; i < conf->raid_disks; i++) {
-		struct md_rdev *rdev = conf->disks[i].rdev;
-		if (rdev
-		    && !test_bit(Faulty, &rdev->flags)
-		    && !test_bit(In_sync, &rdev->flags)
-		    && !rdev_set_badblocks(rdev, sh->sector,
-					   STRIPE_SECTORS, 0))
-			abort = 1;
-		rdev = conf->disks[i].replacement;
-		if (rdev
-		    && !test_bit(Faulty, &rdev->flags)
-		    && !test_bit(In_sync, &rdev->flags)
-		    && !rdev_set_badblocks(rdev, sh->sector,
-					   STRIPE_SECTORS, 0))
-			abort = 1;
-	}
-	if (abort) {
-		conf->recovery_disabled = conf->mddev->recovery_disabled;
-		set_bit(MD_RECOVERY_INTR, &conf->mddev->recovery);
+	if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) {
+		/* During recovery devices cannot be removed, so
+		 * locking and refcounting of rdevs is not needed
+		 */
+		for (i = 0; i < conf->raid_disks; i++) {
+			struct md_rdev *rdev = conf->disks[i].rdev;
+			if (rdev
+			    && !test_bit(Faulty, &rdev->flags)
+			    && !test_bit(In_sync, &rdev->flags)
+			    && !rdev_set_badblocks(rdev, sh->sector,
+						   STRIPE_SECTORS, 0))
+				abort = 1;
+			rdev = conf->disks[i].replacement;
+			if (rdev
+			    && !test_bit(Faulty, &rdev->flags)
+			    && !test_bit(In_sync, &rdev->flags)
+			    && !rdev_set_badblocks(rdev, sh->sector,
+						   STRIPE_SECTORS, 0))
+				abort = 1;
+		}
+		if (abort)
+			conf->recovery_disabled =
+				conf->mddev->recovery_disabled;
 	}
+	md_done_sync(conf->mddev, STRIPE_SECTORS, !abort);
 }
 
 static int want_replace(struct stripe_head *sh, int disk_idx)
@@ -3203,7 +3205,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 			/* Not in-sync */;
 		else if (is_bad) {
 			/* also not in-sync */
-			if (!test_bit(WriteErrorSeen, &rdev->flags)) {
+			if (!test_bit(WriteErrorSeen, &rdev->flags) &&
+			    test_bit(R5_UPTODATE, &sh->devs[i].flags)) {
 				/* treat as in-sync, but with a read error
 				 * which we can now try to correct
 				 */

  reply	other threads:[~2012-03-27  7:33 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-03-14  7:07 md/raid5:Fix recover/replace stop if handle stipe failed majianpeng
2012-03-14  7:33 ` NeilBrown
2012-03-14  9:27 ` majianpeng
2012-03-27  3:26   ` NeilBrown
2012-03-27  7:33     ` majianpeng [this message]
2012-03-28  1:40       ` NeilBrown
2012-03-28  1:45       ` majianpeng
2012-03-28  2:14         ` NeilBrown
2012-03-28  2:25           ` majianpeng

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=201203271533149067900@gmail.com \
    --to=majianpeng@gmail.com \
    --cc=linux-raid@vger.kernel.org \
    --cc=neilb@suse.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.