From: NeilBrown <neilb@suse.de>
To: linux-raid@vger.kernel.org
Subject: [md PATCH 13/16] md/raid5: handle activation of replacement device when recovery completes.
Date: Wed, 26 Oct 2011 12:43:01 +1100 [thread overview]
Message-ID: <20111026014301.21110.35196.stgit@notabene.brown> (raw)
In-Reply-To: <20111026014240.21110.28487.stgit@notabene.brown>
When recovery completes - as reported by a call to ->spare_active,
we clear In_sync on the original and set it on the replacement.
Then when the original gets removed we move the replacement from
'replacement' to 'rdev'.
This could race with other code that is looking at these pointers,
so we use memory barriers and careful ordering to ensure that
a reader might see the same device through both pointers, but will
never see both pointers as NULL.
Readers then guard against using the same device twice, which could
only happen when writing.
Signed-off-by: NeilBrown <neilb@suse.de>
---
drivers/md/raid5.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++-----
1 files changed, 61 insertions(+), 7 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 08f388c..6c22f09 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -519,13 +519,21 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
bi->bi_end_io = raid5_end_read_request;
rcu_read_lock();
- rdev = rcu_dereference(conf->disks[i].rdev);
rrdev = rcu_dereference(conf->disks[i].replacement);
+ smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
+ rdev = rcu_dereference(conf->disks[i].rdev);
+ if (!rdev) {
+ rdev = rrdev;
+ rrdev = NULL;
+ }
if (rw & WRITE) {
if (replace_only)
rdev = NULL;
+ if (rdev == rrdev)
+ /* We raced and saw duplicates */
+ rrdev = NULL;
} else {
- if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
+ if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev)
rdev = rrdev;
rrdev = NULL;
}
@@ -1627,7 +1635,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
int disks = sh->disks, i;
int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
char b[BDEVNAME_SIZE];
- struct md_rdev *rdev;
+ struct md_rdev *rdev = NULL;
for (i=0 ; i<disks; i++)
@@ -1642,8 +1650,13 @@ static void raid5_end_read_request(struct bio * bi, int error)
return;
}
if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
+ /* If replacement finished while this request was outstanding,
+ * 'replacement' might be NULL already.
+ * In that case it moved down to 'rdev'.
+ * rdev is not removed until all requests are finished.
+ */
rdev = conf->disks[i].replacement;
- else
+ if (!rdev)
rdev = conf->disks[i].rdev;
if (uptodate) {
@@ -1740,7 +1753,13 @@ static void raid5_end_write_request(struct bio *bi, int error)
}
if (bi == &sh->dev[i].rreq) {
rdev = conf->disks[i].replacement;
- replacement = 1;
+ if (rdev)
+ replacement = 1;
+ else
+ /* rdev was removed and 'replacement'
+ * replaced it.
+ */
+ rdev = conf->disks[i].rdev;
break;
}
}
@@ -3515,6 +3534,9 @@ finish:
}
if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
rdev = conf->disks[i].replacement;
+ if (!rdev)
+ /* replacement has been moved down to rdev */
+ rdev = conf->disks[i].rdev;
rdev_clear_badblocks(rdev, sh->sector,
STRIPE_SECTORS);
rdev_dec_pending(rdev, conf->mddev);
@@ -5183,7 +5205,25 @@ static int raid5_spare_active(struct mddev *mddev)
for (i = 0; i < conf->raid_disks; i++) {
tmp = conf->disks + i;
- if (tmp->rdev
+ if (tmp->replacement
+ && tmp->replacement->recovery_offset == MaxSector
+ && !test_bit(Faulty, &tmp->replacement->flags)
+ && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
+ /* Replacement has just become active. */
+ if (!tmp->rdev
+ || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
+ count++;
+ if (tmp->rdev) {
+ /* Replaced device not technically faulty,
+ * but we need to be sure it gets removed
+ * and never re-added.
+ */
+ set_bit(Faulty, &tmp->rdev->flags);
+ sysfs_notify_dirent_safe(
+ tmp->rdev->sysfs_state);
+ }
+ sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
+ } else if (tmp->rdev
&& tmp->rdev->recovery_offset == MaxSector
&& !test_bit(Faulty, &tmp->rdev->flags)
&& !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
@@ -5229,6 +5269,7 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
if (!test_bit(Faulty, &rdev->flags) &&
mddev->recovery_disabled != conf->recovery_disabled &&
!has_failed(conf) &&
+ (!p->replacement || p->replacement == rdev) &&
number < conf->raid_disks) {
err = -EBUSY;
goto abort;
@@ -5239,7 +5280,20 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
/* lost the race, try later */
err = -EBUSY;
*rdevp = rdev;
- }
+ } else if (p->replacement) {
+ /* We must have just cleared 'rdev' */
+ p->rdev = p->replacement;
+ clear_bit(Replacement, &p->replacement->flags);
+ smp_mb(); /* Make sure other CPUs may see both as identical
+ * but will never see neither - if they are careful
+ */
+ p->replacement = NULL;
+ clear_bit(Replaceable, &rdev->flags);
+ } else
+ /* We might have just removed the Replacement as faulty-
+ * clear the bit just in case
+ */
+ clear_bit(Replaceable, &rdev->flags);
abort:
print_raid5_conf(conf);
next prev parent reply other threads:[~2011-10-26 1:43 UTC|newest]
Thread overview: 31+ messages / expand[flat|nested] mbox.gz Atom feed top
2011-10-26 1:43 [md PATCH 00/16] hot-replace support for RAID4/5/6 NeilBrown
2011-10-26 1:43 ` [md PATCH 02/16] md: take a reference to mddev during sysfs access NeilBrown
2011-10-26 1:43 ` [md PATCH 04/16] md: change hot_remove_disk to take an rdev rather than a number NeilBrown
2011-10-26 1:43 ` [md PATCH 01/16] md: refine interpretation of "hold_active == UNTIL_IOCTL" NeilBrown
2011-10-26 1:43 ` [md PATCH 03/16] md: remove test for duplicate device when setting slot number NeilBrown
2011-10-26 1:43 ` [md PATCH 05/16] md: create externally visible flags for supporting hot-replace NeilBrown
2011-10-26 1:43 ` [md PATCH 06/16] md/raid5: allow each slot to have an extra replacement device NeilBrown
2011-10-26 1:43 ` [md PATCH 09/16] md/raid5: preferentially read from replacement device if possible NeilBrown
2011-10-26 1:43 ` [md PATCH 12/16] md/raid5: detect and handle replacements during recovery NeilBrown
2011-10-26 1:43 ` [md PATCH 10/16] md/raid5: allow removal for failed replacement devices NeilBrown
2011-10-26 1:43 ` [md PATCH 07/16] md/raid5: raid5.h cleanup NeilBrown
2011-10-26 1:43 ` [md PATCH 14/16] md/raid5: recognise replacements when assembling array NeilBrown
2011-10-26 1:43 ` [md PATCH 08/16] md/raid5: remove redundant bio initialisations NeilBrown
2011-10-26 1:43 ` [md PATCH 11/16] md/raid5: writes should get directed to replacement as well as original NeilBrown
2011-10-26 1:43 ` NeilBrown [this message]
2011-10-26 1:43 ` [md PATCH 16/16] md/raid5: Mark device replaceable when we see a write error NeilBrown
2011-10-26 1:43 ` [md PATCH 15/16] md/raid5: If there is a spare and a replaceable device, start replacement NeilBrown
2011-10-26 6:38 ` [md PATCH 00/16] hot-replace support for RAID4/5/6 David Brown
2011-10-26 7:42 ` NeilBrown
2011-10-26 9:01 ` John Robinson
2011-10-26 13:57 ` Peter W. Morreale
2011-10-26 17:27 ` Piergiorgio Sartor
2011-10-27 17:10 ` Peter W. Morreale
2011-10-27 20:44 ` NeilBrown
2011-10-27 20:53 ` Peter W. Morreale
2011-12-14 22:18 ` Dan Williams
2011-12-15 6:18 ` NeilBrown
2011-12-15 7:14 ` Williams, Dan J
2011-12-20 5:18 ` NeilBrown
2011-12-22 20:54 ` Alexander Kühn
2011-12-22 21:14 ` NeilBrown
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20111026014301.21110.35196.stgit@notabene.brown \
--to=neilb@suse.de \
--cc=linux-raid@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).