[md PATCH 14/16] md/raid1: avoid writing to known-bad blocks on known-bad drives.

linux-raid.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

From: NeilBrown <neilb@suse.de>
To: linux-raid@vger.kernel.org
Subject: [md PATCH 14/16] md/raid1: avoid writing to known-bad blocks on known-bad drives.
Date: Mon, 07 Jun 2010 10:07:56 +1000	[thread overview]
Message-ID: <20100607000756.13302.74584.stgit@notabene.brown> (raw)
In-Reply-To: <20100606235833.13302.60932.stgit@notabene.brown>

If we have seen any write error on a drive, then don't write to
any known-bad blocks on that drive.
If necessary, we divide the write request up into pieces just
like we do for reads, so each piece is either all written or
all not written to any given drive.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid1.c |  147 ++++++++++++++++++++++++++++++++++++++++------------
 1 files changed, 112 insertions(+), 35 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index bb81681..d240d58 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -838,11 +838,14 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	struct bitmap *bitmap;
 	unsigned long flags;
 	struct bio_list bl;
-	struct page **behind_pages = NULL;
+	struct page **behind_pages;
 	const int rw = bio_data_dir(bio);
 	const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
 	bool do_barriers;
 	mdk_rdev_t *blocked_rdev;
+	int first_clone;
+	int sectors_handled;
+	int max_sectors;
 
 	/*
 	 * Register the new request and wait if the reconstruction
@@ -914,7 +917,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		/*
 		 * read balancing logic:
 		 */
-		int max_sectors;
 		int rdisk;
 
 	read_again:
@@ -954,7 +956,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 			/* could not read all from this device, so we will
 			 * need another r1_bio.
 			 */
-			int sectors_handled;
 
 			sectors_handled = (r1_bio->sector + max_sectors
 					   - bio->bi_sector);
@@ -983,9 +984,15 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	/*
 	 * WRITE:
 	 */
-	/* first select target devices under spinlock and
+	/* first select target devices under rcu_lock and
 	 * inc refcount on their rdev.  Record them by setting
 	 * bios[x] to bio
+	 * If there are known/acknowledged bad blocks on any device on
+	 * which we have seen a write error, we want to avoid writing those
+	 * blocks.
+	 * This potentially requires several writes to write around
+	 * the bad blocks.  We do those serially using the same
+	 * bio which we repeatedly trim to size.
 	 */
 	disks = conf->raid_disks;
 #if 0
@@ -998,6 +1005,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
  retry_write:
 	blocked_rdev = NULL;
 	rcu_read_lock();
+	max_sectors = r1_bio->sectors;
 	for (i = 0;  i < disks; i++) {
 		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
 		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
@@ -1005,17 +1013,55 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 			blocked_rdev = rdev;
 			break;
 		}
-		if (rdev && !test_bit(Faulty, &rdev->flags)) {
-			atomic_inc(&rdev->nr_pending);
-			if (test_bit(Faulty, &rdev->flags)) {
+		r1_bio->bios[i] = NULL;
+		if (!rdev || test_bit(Faulty, &rdev->flags)) {
+			set_bit(R1BIO_Degraded, &r1_bio->state);
+			continue;
+		}
+
+		atomic_inc(&rdev->nr_pending);
+		if (test_bit(Faulty, &rdev->flags)) {
+			rdev_dec_pending(rdev, mddev);
+			set_bit(R1BIO_Degraded, &r1_bio->state);
+			continue;
+		}
+		if (test_bit(WriteErrorSeen, &rdev->flags)) {
+			sector_t first_bad;
+			int bad_sectors, good_sectors;
+
+			if (is_badblock(rdev, r1_bio->sector,
+					max_sectors,
+					&first_bad, &bad_sectors) < 0) {
+
+				/* mustn't write here until the bad block is
+				 * acknowledged*/
+				blocked_rdev = rdev;
+				break;
+			}
+			if (first_bad <= r1_bio->sector) {
+				/* Cannot write here at all */
+				bad_sectors -= (r1_bio->sector - first_bad);
+				if (bad_sectors < max_sectors)
+					/* mustn't write more than bad_sectors
+					 * to other devices yet
+					 */
+					max_sectors = bad_sectors;
 				rdev_dec_pending(rdev, mddev);
-				r1_bio->bios[i] = NULL;
-			} else {
-				r1_bio->bios[i] = bio;
-				targets++;
+				/* We don't set R1BIO_Degraded as that only applies
+				 * if the disk is missing, so it might be re-added,
+				 * and we want to know to recovery this chunk.
+				 * In this case the device is here, and the fact that
+				 * this chunk is no in-sync is recorded in the
+				 * bad block log
+				 */
+				continue;
 			}
-		} else
-			r1_bio->bios[i] = NULL;
+			good_sectors = first_bad - r1_bio->sector;
+			if (good_sectors < max_sectors)
+				max_sectors = good_sectors;
+		}
+		r1_bio->bios[i] = bio;
+		targets++;
 	}
 	rcu_read_unlock();
 
@@ -1035,22 +1081,26 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 
 	BUG_ON(targets == 0); /* we never fail the last device */
 
+	if (max_sectors < r1_bio->sectors) {
+		/* We are splitting this write into multiple parts, so
+		 * we need to prepare for allocating another r1_bio.
+		 */
+		r1_bio->sectors = max_sectors;
+		spin_lock_irq(&conf->device_lock);
+		if (bio->bi_phys_segments == 0)
+			bio->bi_phys_segments = 2;
+		else
+			bio->bi_phys_segments++;
+		spin_unlock_irq(&conf->device_lock);
+	}
+	sectors_handled = r1_bio->sector + max_sectors - bio->bi_sector;
+
 	if (targets < conf->raid_disks) {
 		/* array is degraded, we will not clear the bitmap
 		 * on I/O completion (see raid1_end_write_request) */
 		set_bit(R1BIO_Degraded, &r1_bio->state);
 	}
 
-	/* do behind I/O ?
-	 * Not if there are too many, or cannot allocate memory,
-	 * or a reader on WriteMostly is waiting for behind writes 
-	 * to flush */
-	if (bitmap &&
-	    (atomic_read(&bitmap->behind_writes)
-	     < mddev->bitmap_info.max_write_behind) &&
-	    !waitqueue_active(&bitmap->behind_wait) &&
-	    (behind_pages = alloc_behind_pages(bio)) != NULL)
-		set_bit(R1BIO_BehindIO, &r1_bio->state);
 
 	atomic_set(&r1_bio->remaining, 0);
 	atomic_set(&r1_bio->behind_remaining, 0);
@@ -1060,21 +1110,28 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		set_bit(R1BIO_Barrier, &r1_bio->state);
 
 	bio_list_init(&bl);
+	first_clone = 1;
+	behind_pages = NULL;
 	for (i = 0; i < disks; i++) {
 		struct bio *mbio;
 		if (!r1_bio->bios[i])
 			continue;
 
 		mbio = bio_clone(bio, GFP_NOIO);
-		r1_bio->bios[i] = mbio;
-
-		mbio->bi_sector	= r1_bio->sector + conf->mirrors[i].rdev->data_offset;
-		mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
-		mbio->bi_end_io	= raid1_end_write_request;
-		mbio->bi_rw = WRITE | (do_barriers << BIO_RW_BARRIER) |
-			(do_sync << BIO_RW_SYNCIO);
-		mbio->bi_private = r1_bio;
 
+		if (first_clone) {
+			/* do behind I/O ?
+			 * Not if there are too many, or cannot
+			 * allocate memory, or a reader on WriteMostly
+			 * is waiting for behind writes to flush */
+			if (bitmap &&
+			    (atomic_read(&bitmap->behind_writes)
+			     < mddev->bitmap_info.max_write_behind) &&
+			    !waitqueue_active(&bitmap->behind_wait) &&
+			    (behind_pages = alloc_behind_pages(mbio)) != NULL)
+				set_bit(R1BIO_BehindIO, &r1_bio->state);
+			first_clone = 0;
+		}
 		if (behind_pages) {
 			struct bio_vec *bvec;
 			int j;
@@ -1092,6 +1149,17 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 				atomic_inc(&r1_bio->behind_remaining);
 		}
 
+		trim_bio(mbio, r1_bio->sector - bio->bi_sector, max_sectors);
+		r1_bio->bios[i] = mbio;
+
+		mbio->bi_sector	= (r1_bio->sector +
+				   conf->mirrors[i].rdev->data_offset);
+		mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
+		mbio->bi_end_io	= raid1_end_write_request;
+		mbio->bi_rw = WRITE | (do_barriers << BIO_RW_BARRIER) |
+			(do_sync << BIO_RW_SYNCIO);
+		mbio->bi_private = r1_bio;
+
 		atomic_inc(&r1_bio->remaining);
 
 		bio_list_add(&bl, mbio);
@@ -1110,12 +1178,21 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	/* In case raid1d snuck into freeze_array */
 	wake_up(&conf->wait_barrier);
 
+	if (sectors_handled < (bio->bi_size >> 9)) {
+		/* We need another r1_bio.  It has already been counted
+		 * in bio->bi_phys_segments
+		 */
+		r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+		r1_bio->master_bio = bio;
+		r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
+		r1_bio->state = 0;
+		r1_bio->mddev = mddev;
+		r1_bio->sector = bio->bi_sector + sectors_handled;
+		goto retry_write;
+	}
+
 	if (do_sync)
 		md_wakeup_thread(mddev->thread);
-#if 0
-	while ((bio = bio_list_pop(&bl)) != NULL)
-		generic_make_request(bio);
-#endif
 
 	return 0;
 }

next prev parent reply	other threads:[~2010-06-07  0:07 UTC|newest]

Thread overview: 26+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-06-07  0:07 [md PATCH 00/16] bad block list management for md and RAID1 NeilBrown
2010-06-07  0:07 ` [md PATCH 02/16] md/bad-block-log: add sysfs interface for accessing bad-block-log NeilBrown
2010-06-07  0:07 ` [md PATCH 01/16] md: beginnings of bad block management NeilBrown
2010-06-07  0:07 ` [md PATCH 04/16] md: load/store badblock list from v1.x metadata NeilBrown
2010-06-07  0:07 ` [md PATCH 03/16] md: don't allow arrays to contain devices with bad blocks NeilBrown
2010-06-07  0:07 ` [md PATCH 06/16] md/raid1: clean up read_balance NeilBrown
2010-06-07  0:07 ` [md PATCH 07/16] md: simplify raid10 read_balance NeilBrown
2010-06-07  0:07 ` [md PATCH 05/16] md: reject devices with bad blocks and v0.90 metadata NeilBrown
2010-06-07  0:07 ` [md PATCH 08/16] md/raid1: avoid reading from known bad blocks NeilBrown
2010-06-07  0:07 ` [md PATCH 09/16] md/raid1: avoid reading known bad blocks during resync NeilBrown
2010-06-07  0:07 ` [md PATCH 11/16] md/multipath: discard ->working_disks in favour of ->degraded NeilBrown
2010-06-07  0:07 ` [md PATCH 12/16] md: make error_handler functions more uniform and correct NeilBrown
2010-06-07  0:07 ` [md PATCH 10/16] md: add 'write_error' flag to component devices NeilBrown
2010-06-07  0:07 ` [md PATCH 15/16] md/raid1: clear bad-block record when write succeeds NeilBrown
2010-06-07  0:07 ` [md PATCH 13/16] md: make it easier to wait for bad blocks to be acknowledged NeilBrown
2010-06-07  0:07 ` NeilBrown [this message]
2010-06-07  0:07 ` [md PATCH 16/16] md/raid1: Handle write errors by updating badblock log NeilBrown
2010-06-07  0:28 ` [md PATCH 00/16] bad block list management for md and RAID1 Berkey B Walker
2010-06-07 22:18   ` Stefan /*St0fF*/ Hübner
2010-06-17 12:48 ` Brett Russ
2010-06-17 15:53   ` Graham Mitchell
2010-06-18  3:58     ` Neil Brown
2010-06-18  4:30       ` Graham Mitchell
2010-06-18  3:23   ` Neil Brown
     [not found]     ` <4C1BABC4.3020008@tmr.com>
2010-06-29  5:06       ` Neil Brown
2010-06-29 16:54         ` Bill Davidsen

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:bb81681 dfblob:d240d58 )
 OR (
bs:"[md PATCH 14/16] md/raid1: avoid writing to known-bad blocks on known-bad drives." )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20100607000756.13302.74584.stgit@notabene.brown \
    --to=neilb@suse.de \
    --cc=linux-raid@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).