[md PATCH 11/16] md/raid5: writes should get directed to replacement as well as original.

linux-raid.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

From: NeilBrown <neilb@suse.de>
To: linux-raid@vger.kernel.org
Subject: [md PATCH 11/16] md/raid5: writes should get directed to replacement as well as original.
Date: Wed, 26 Oct 2011 12:43:01 +1100	[thread overview]
Message-ID: <20111026014301.21110.32951.stgit@notabene.brown> (raw)
In-Reply-To: <20111026014240.21110.28487.stgit@notabene.brown>

When writing, we need to submit two writes, one to the original, and
one to the replacement - if there is a replacement.

If the write to the replacement results in a write error, we just fail
the device.  We only try to record write errors to the original.

When writing for recovery, we shouldn't write to the original.  This
will be fixed in a subsequent patch that deals with recovery more
generally.

Signed-off-by: NeilBrown <neilb@suse.de>
---

 drivers/md/raid5.c |  119 ++++++++++++++++++++++++++++++++++++++++++----------
 drivers/md/raid5.h |    1 
 2 files changed, 97 insertions(+), 23 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 20f42f6..4ba4b8c 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -491,8 +491,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 
 	for (i = disks; i--; ) {
 		int rw;
-		struct bio *bi;
-		struct md_rdev *rdev;
+		struct bio *bi, *rbi;
+		struct md_rdev *rdev, *rrdev = NULL;
 		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
 			if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
 				rw = WRITE_FUA;
@@ -504,27 +504,36 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 			continue;
 
 		bi = &sh->dev[i].req;
+		rbi = &sh->dev[i].rreq; /* For writing to replacement */
 
 		bi->bi_rw = rw;
-		if (rw & WRITE)
+		rbi->bi_rw = rw;
+		if (rw & WRITE) {
 			bi->bi_end_io = raid5_end_write_request;
-		else
+			rbi->bi_end_io = raid5_end_write_request;
+		} else
 			bi->bi_end_io = raid5_end_read_request;
 
 		rcu_read_lock();
-		if (rw == READ &&
-		    test_bit(R5_ReadRepl, &sh->dev[i].flags))
+		rdev = rcu_dereference(conf->disks[i].rdev);
+		if (rw & WRITE)
+			rrdev = rcu_dereference(conf->disks[i].replacement);
+		else if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
 			rdev = rcu_dereference(conf->disks[i].replacement);
-		else
-			rdev = rcu_dereference(conf->disks[i].rdev);
+
 		if (rdev && test_bit(Faulty, &rdev->flags))
 			rdev = NULL;
 		if (rdev)
 			atomic_inc(&rdev->nr_pending);
+		if (rrdev && test_bit(Faulty, &rrdev->flags))
+			rrdev = NULL;
+		if (rrdev)
+			atomic_inc(&rrdev->nr_pending);
 		rcu_read_unlock();
 
 		/* We have already checked bad blocks for reads.  Now
-		 * need to check for writes.
+		 * need to check for writes.  We never accept write errors
+		 * on the replacement, so we don't need to check rrdev.
 		 */
 		while ((rw & WRITE) && rdev &&
 		       test_bit(WriteErrorSeen, &rdev->flags)) {
@@ -571,8 +580,32 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 			bi->bi_io_vec[0].bv_offset = 0;
 			bi->bi_size = STRIPE_SIZE;
 			bi->bi_next = NULL;
+			if (rrdev)
+				set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
 			generic_make_request(bi);
-		} else {
+		}
+		if (rrdev) {
+			if (s->syncing || s->expanding || s->expanded)
+				md_sync_acct(rrdev->bdev, STRIPE_SECTORS);
+
+			set_bit(STRIPE_IO_STARTED, &sh->state);
+
+			rbi->bi_bdev = rrdev->bdev;
+			pr_debug("%s: for %llu schedule op %ld on "
+				 "replacement disc %d\n",
+				__func__, (unsigned long long)sh->sector,
+				rbi->bi_rw, i);
+			atomic_inc(&sh->count);
+			rbi->bi_sector = sh->sector + rrdev->data_offset;
+			rbi->bi_flags = 1 << BIO_UPTODATE;
+			rbi->bi_idx = 0;
+			rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
+			rbi->bi_io_vec[0].bv_offset = 0;
+			rbi->bi_size = STRIPE_SIZE;
+			rbi->bi_next = NULL;
+			generic_make_request(rbi);
+		}
+		if (!rdev && !rrdev) {
 			if (rw & WRITE)
 				set_bit(STRIPE_DEGRADED, &sh->state);
 			pr_debug("skip op %ld on disc %d for sector %llu\n",
@@ -1683,14 +1716,23 @@ static void raid5_end_write_request(struct bio *bi, int error)
 	struct stripe_head *sh = bi->bi_private;
 	struct r5conf *conf = sh->raid_conf;
 	int disks = sh->disks, i;
+	struct md_rdev *uninitialized_var(rdev);
 	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
 	sector_t first_bad;
 	int bad_sectors;
+	int replacement = 0;
 
-	for (i=0 ; i<disks; i++)
-		if (bi == &sh->dev[i].req)
+	for (i = 0 ; i < disks; i++) {
+		if (bi == &sh->dev[i].req) {
+			rdev = conf->disks[i].rdev;
 			break;
-
+		}
+		if (bi == &sh->dev[i].rreq) {
+			rdev = conf->disks[i].replacement;
+			replacement = 1;
+			break;
+		}
+	}
 	pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
 		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
 		uptodate);
@@ -1699,21 +1741,30 @@ static void raid5_end_write_request(struct bio *bi, int error)
 		return;
 	}
 
-	if (!uptodate) {
-		set_bit(WriteErrorSeen, &conf->disks[i].rdev->flags);
-		set_bit(R5_WriteError, &sh->dev[i].flags);
-	} else if (is_badblock(conf->disks[i].rdev, sh->sector, STRIPE_SECTORS,
-			       &first_bad, &bad_sectors))
-		set_bit(R5_MadeGood, &sh->dev[i].flags);
+	if (replacement) {
+		if (!uptodate)
+			md_error(conf->mddev, rdev);
+		else if (is_badblock(rdev, sh->sector,
+				     STRIPE_SECTORS,
+				     &first_bad, &bad_sectors))
+			set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
+	} else {
+		if (!uptodate) {
+			set_bit(WriteErrorSeen, &rdev->flags);
+			set_bit(R5_WriteError, &sh->dev[i].flags);
+		} else if (is_badblock(rdev, sh->sector,
+				       STRIPE_SECTORS,
+				       &first_bad, &bad_sectors))
+			set_bit(R5_MadeGood, &sh->dev[i].flags);
+	}
+	rdev_dec_pending(rdev, conf->mddev);
 
-	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
-	
-	clear_bit(R5_LOCKED, &sh->dev[i].flags);
+	if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
+		clear_bit(R5_LOCKED, &sh->dev[i].flags);
 	set_bit(STRIPE_HANDLE, &sh->state);
 	release_stripe(sh);
 }
 
-
 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
 	
 static void raid5_build_block(struct stripe_head *sh, int i, int previous)
@@ -1727,6 +1778,13 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous)
 	dev->req.bi_private = sh;
 	dev->vec.bv_page = dev->page;
 
+	bio_init(&dev->rreq);
+	dev->rreq.bi_io_vec = &dev->rvec;
+	dev->rreq.bi_vcnt++;
+	dev->rreq.bi_max_vecs++;
+	dev->rreq.bi_private = sh;
+	dev->rvec.bv_page = dev->page;
+
 	dev->flags = 0;
 	dev->sector = compute_blocknr(sh, i, previous);
 }
@@ -3115,6 +3173,15 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 			} else
 				clear_bit(R5_MadeGood, &dev->flags);
 		}
+		if (test_bit(R5_MadeGoodRepl, &dev->flags)) {
+			struct md_rdev *rdev2 = rcu_dereference(
+				conf->disks[i].replacement);
+			if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
+				s->handle_bad_blocks = 1;
+				atomic_inc(&rdev2->nr_pending);
+			} else
+				clear_bit(R5_MadeGoodRepl, &dev->flags);
+		}
 		if (!test_bit(R5_Insync, &dev->flags)) {
 			/* The ReadError flag will just be confusing now */
 			clear_bit(R5_ReadError, &dev->flags);
@@ -3383,6 +3450,12 @@ finish:
 						     STRIPE_SECTORS);
 				rdev_dec_pending(rdev, conf->mddev);
 			}
+			if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
+				rdev = conf->disks[i].replacement;
+				rdev_clear_badblocks(rdev, sh->sector,
+						     STRIPE_SECTORS);
+				rdev_dec_pending(rdev, conf->mddev);
+			}
 		}
 
 	if (s.ops_request)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 4cfd801..f6faaa1 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -259,6 +259,7 @@ struct stripe_head_state {
 enum r5dev_flags {
 	R5_UPTODATE,	/* page contains current data */
 	R5_LOCKED,	/* IO has been submitted on "req" */
+	R5_DOUBLE_LOCKED,/* Cannot clear R5_LOCKED until 2 writes complete */
 	R5_OVERWRITE,	/* towrite covers whole page */
 /* and some that are internal to handle_stripe */
 	R5_Insync,	/* rdev && rdev->in_sync at start */

next prev parent reply	other threads:[~2011-10-26  1:43 UTC|newest]

Thread overview: 31+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-10-26  1:43 [md PATCH 00/16] hot-replace support for RAID4/5/6 NeilBrown
2011-10-26  1:43 ` [md PATCH 04/16] md: change hot_remove_disk to take an rdev rather than a number NeilBrown
2011-10-26  1:43 ` [md PATCH 02/16] md: take a reference to mddev during sysfs access NeilBrown
2011-10-26  1:43 ` [md PATCH 01/16] md: refine interpretation of "hold_active == UNTIL_IOCTL" NeilBrown
2011-10-26  1:43 ` [md PATCH 03/16] md: remove test for duplicate device when setting slot number NeilBrown
2011-10-26  1:43 ` [md PATCH 07/16] md/raid5: raid5.h cleanup NeilBrown
2011-10-26  1:43 ` [md PATCH 14/16] md/raid5: recognise replacements when assembling array NeilBrown
2011-10-26  1:43 ` [md PATCH 06/16] md/raid5: allow each slot to have an extra replacement device NeilBrown
2011-10-26  1:43 ` [md PATCH 05/16] md: create externally visible flags for supporting hot-replace NeilBrown
2011-10-26  1:43 ` [md PATCH 12/16] md/raid5: detect and handle replacements during recovery NeilBrown
2011-10-26  1:43 ` [md PATCH 10/16] md/raid5: allow removal for failed replacement devices NeilBrown
2011-10-26  1:43 ` [md PATCH 09/16] md/raid5: preferentially read from replacement device if possible NeilBrown
2011-10-26  1:43 ` [md PATCH 13/16] md/raid5: handle activation of replacement device when recovery completes NeilBrown
2011-10-26  1:43 ` NeilBrown [this message]
2011-10-26  1:43 ` [md PATCH 08/16] md/raid5: remove redundant bio initialisations NeilBrown
2011-10-26  1:43 ` [md PATCH 16/16] md/raid5: Mark device replaceable when we see a write error NeilBrown
2011-10-26  1:43 ` [md PATCH 15/16] md/raid5: If there is a spare and a replaceable device, start replacement NeilBrown
2011-10-26  6:38 ` [md PATCH 00/16] hot-replace support for RAID4/5/6 David Brown
2011-10-26  7:42   ` NeilBrown
2011-10-26  9:01   ` John Robinson
2011-10-26 13:57     ` Peter W. Morreale
2011-10-26 17:27       ` Piergiorgio Sartor
2011-10-27 17:10 ` Peter W. Morreale
2011-10-27 20:44   ` NeilBrown
2011-10-27 20:53     ` Peter W. Morreale
2011-12-14 22:18 ` Dan Williams
2011-12-15  6:18   ` NeilBrown
2011-12-15  7:14     ` Williams, Dan J
2011-12-20  5:18       ` NeilBrown
2011-12-22 20:54         ` Alexander Kühn
2011-12-22 21:14           ` NeilBrown

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:20f42f6 dfblob:4ba4b8c dfblob:4cfd801 dfblob:f6faaa1 )
 OR (
bs:"[md PATCH 11/16] md/raid5: writes should get directed to replacement as well as original." )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20111026014301.21110.32951.stgit@notabene.brown \
    --to=neilb@suse.de \
    --cc=linux-raid@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).