* [md PATCH 02/17] md/raid10: change read_balance to return an rdev
2011-11-02 5:25 [md PATCH 00/17] hot-replace support for RAID1 and RAID10 NeilBrown
@ 2011-11-02 5:25 ` NeilBrown
2011-11-02 5:25 ` [md PATCH 03/17] md/raid10: preferentially read from replacement device if possible NeilBrown
` (15 subsequent siblings)
16 siblings, 0 replies; 18+ messages in thread
From: NeilBrown @ 2011-11-02 5:25 UTC (permalink / raw)
To: linux-raid
It makes more sense to return an rdev than just an index as
read_balance() gets a reference to the rdev and so returning
the pointer make this more idiomatic.
This will be needed in a future patch when we might return
a 'replacement' rdev instead of the main rdev.
Signed-off-by: NeilBrown <neilb@suse.de>
---
drivers/md/raid10.c | 27 +++++++++++++--------------
1 files changed, 13 insertions(+), 14 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 93e47f6..254a6ce 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -589,7 +589,9 @@ static int raid10_mergeable_bvec(struct request_queue *q,
* FIXME: possibly should rethink readbalancing and do it differently
* depending on near_copies / far_copies geometry.
*/
-static int read_balance(struct r10conf *conf, struct r10bio *r10_bio, int *max_sectors)
+static struct md_rdev *read_balance(struct r10conf *conf,
+ struct r10bio *r10_bio,
+ int *max_sectors)
{
const sector_t this_sector = r10_bio->sector;
int disk, slot;
@@ -702,11 +704,11 @@ retry:
}
r10_bio->read_slot = slot;
} else
- disk = -1;
+ rdev = NULL;
rcu_read_unlock();
*max_sectors = best_good_sectors;
- return disk;
+ return rdev;
}
static int raid10_congested(void *data, int bits)
@@ -873,7 +875,6 @@ static void unfreeze_array(struct r10conf *conf)
static int make_request(struct mddev *mddev, struct bio * bio)
{
struct r10conf *conf = mddev->private;
- struct mirror_info *mirror;
struct r10bio *r10_bio;
struct bio *read_bio;
int i;
@@ -974,17 +975,16 @@ static int make_request(struct mddev *mddev, struct bio * bio)
/*
* read balancing logic:
*/
- int disk;
+ struct md_rdev *rdev;
int slot;
read_again:
- disk = read_balance(conf, r10_bio, &max_sectors);
- slot = r10_bio->read_slot;
- if (disk < 0) {
+ rdev = read_balance(conf, r10_bio, &max_sectors);
+ if (!rdev) {
raid_end_bio_io(r10_bio);
return 0;
}
- mirror = conf->mirrors + disk;
+ slot = r10_bio->read_slot;
read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,
@@ -993,8 +993,8 @@ read_again:
r10_bio->devs[slot].bio = read_bio;
read_bio->bi_sector = r10_bio->devs[slot].addr +
- mirror->rdev->data_offset;
- read_bio->bi_bdev = mirror->rdev->bdev;
+ rdev->data_offset;
+ read_bio->bi_bdev = rdev->bdev;
read_bio->bi_end_io = raid10_end_read_request;
read_bio->bi_rw = READ | do_sync;
read_bio->bi_private = r10_bio;
@@ -2118,8 +2118,8 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
r10_bio->devs[slot].bio =
mddev->ro ? IO_BLOCKED : NULL;
read_more:
- mirror = read_balance(conf, r10_bio, &max_sectors);
- if (mirror == -1) {
+ rdev = read_balance(conf, r10_bio, &max_sectors);
+ if (rdev == NULL) {
printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
" read error for block %llu\n",
mdname(mddev), b,
@@ -2133,7 +2133,6 @@ read_more:
if (bio)
bio_put(bio);
slot = r10_bio->read_slot;
- rdev = conf->mirrors[mirror].rdev;
printk_ratelimited(
KERN_ERR
"md/raid10:%s: %s: redirecting"
^ permalink raw reply related [flat|nested] 18+ messages in thread* [md PATCH 03/17] md/raid10: preferentially read from replacement device if possible.
2011-11-02 5:25 [md PATCH 00/17] hot-replace support for RAID1 and RAID10 NeilBrown
2011-11-02 5:25 ` [md PATCH 02/17] md/raid10: change read_balance to return an rdev NeilBrown
@ 2011-11-02 5:25 ` NeilBrown
2011-11-02 5:25 ` [md PATCH 04/17] md/raid10: allow removal of failed replacement devices NeilBrown
` (14 subsequent siblings)
16 siblings, 0 replies; 18+ messages in thread
From: NeilBrown @ 2011-11-02 5:25 UTC (permalink / raw)
To: linux-raid
When reading (for array reads, not for recovery etc) we read from the
replacement device if it has recovered far enough.
This requires storing the chosen rdev in the 'r10_bio' so we can make
sure to drop the ref on the right device when the read finishes.
Signed-off-by: NeilBrown <neilb@suse.de>
---
drivers/md/raid10.c | 36 +++++++++++++++++++++++-------------
1 files changed, 23 insertions(+), 13 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 254a6ce..fffdc43 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -323,11 +323,13 @@ static void raid10_end_read_request(struct bio *bio, int error)
int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct r10bio *r10_bio = bio->bi_private;
int slot, dev;
+ struct md_rdev *rdev;
struct r10conf *conf = r10_bio->mddev->private;
slot = r10_bio->read_slot;
dev = r10_bio->devs[slot].devnum;
+ rdev = r10_bio->devs[slot].rdev;
/*
* this branch is our 'one mirror IO has finished' event handler:
*/
@@ -345,7 +347,7 @@ static void raid10_end_read_request(struct bio *bio, int error)
*/
set_bit(R10BIO_Uptodate, &r10_bio->state);
raid_end_bio_io(r10_bio);
- rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
+ rdev_dec_pending(rdev, conf->mddev);
} else {
/*
* oops, read error - keep the refcount on the rdev
@@ -354,7 +356,7 @@ static void raid10_end_read_request(struct bio *bio, int error)
printk_ratelimited(KERN_ERR
"md/raid10:%s: %s: rescheduling sector %llu\n",
mdname(conf->mddev),
- bdevname(conf->mirrors[dev].rdev->bdev, b),
+ bdevname(rdev->bdev, b),
(unsigned long long)r10_bio->sector);
set_bit(R10BIO_ReadError, &r10_bio->state);
reschedule_retry(r10_bio);
@@ -598,7 +600,7 @@ static struct md_rdev *read_balance(struct r10conf *conf,
int sectors = r10_bio->sectors;
int best_good_sectors;
sector_t new_distance, best_dist;
- struct md_rdev *rdev;
+ struct md_rdev *rdev, *best_rdev;
int do_balance;
int best_slot;
@@ -607,6 +609,7 @@ static struct md_rdev *read_balance(struct r10conf *conf,
retry:
sectors = r10_bio->sectors;
best_slot = -1;
+ best_rdev = NULL;
best_dist = MaxSector;
best_good_sectors = 0;
do_balance = 1;
@@ -628,10 +631,16 @@ retry:
if (r10_bio->devs[slot].bio == IO_BLOCKED)
continue;
disk = r10_bio->devs[slot].devnum;
- rdev = rcu_dereference(conf->mirrors[disk].rdev);
+ rdev = rcu_dereference(conf->mirrors[disk].replacement);
+ if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
+ r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
+ rdev = rcu_dereference(conf->mirrors[disk].rdev);
if (rdev == NULL)
continue;
- if (!test_bit(In_sync, &rdev->flags))
+ if (test_bit(Faulty, &rdev->flags))
+ continue;
+ if (!test_bit(In_sync, &rdev->flags) &&
+ r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
continue;
dev_sector = r10_bio->devs[slot].addr;
@@ -656,6 +665,7 @@ retry:
if (good_sectors > best_good_sectors) {
best_good_sectors = good_sectors;
best_slot = slot;
+ best_rdev = rdev;
}
if (!do_balance)
/* Must read from here */
@@ -684,16 +694,15 @@ retry:
if (new_distance < best_dist) {
best_dist = new_distance;
best_slot = slot;
+ best_rdev = rdev;
}
}
- if (slot == conf->copies)
+ if (slot >= conf->copies) {
slot = best_slot;
+ rdev = best_rdev;
+ }
if (slot >= 0) {
- disk = r10_bio->devs[slot].devnum;
- rdev = rcu_dereference(conf->mirrors[disk].rdev);
- if (!rdev)
- goto retry;
atomic_inc(&rdev->nr_pending);
if (test_bit(Faulty, &rdev->flags)) {
/* Cannot risk returning a device that failed
@@ -991,6 +1000,7 @@ read_again:
max_sectors);
r10_bio->devs[slot].bio = read_bio;
+ r10_bio->devs[slot].rdev = rdev;
read_bio->bi_sector = r10_bio->devs[slot].addr +
rdev->data_offset;
@@ -2090,10 +2100,9 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
{
int slot = r10_bio->read_slot;
- int mirror = r10_bio->devs[slot].devnum;
struct bio *bio;
struct r10conf *conf = mddev->private;
- struct md_rdev *rdev;
+ struct md_rdev *rdev = r10_bio->devs[slot].rdev;
char b[BDEVNAME_SIZE];
unsigned long do_sync;
int max_sectors;
@@ -2111,7 +2120,7 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
fix_read_error(conf, mddev, r10_bio);
unfreeze_array(conf);
}
- rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
+ rdev_dec_pending(rdev, mddev);
bio = r10_bio->devs[slot].bio;
bdevname(bio->bi_bdev, b);
@@ -2146,6 +2155,7 @@ read_more:
r10_bio->sector - bio->bi_sector,
max_sectors);
r10_bio->devs[slot].bio = bio;
+ r10_bio->devs[slot].rdev = rdev;
bio->bi_sector = r10_bio->devs[slot].addr
+ rdev->data_offset;
bio->bi_bdev = rdev->bdev;
^ permalink raw reply related [flat|nested] 18+ messages in thread* [md PATCH 04/17] md/raid10: allow removal of failed replacement devices.
2011-11-02 5:25 [md PATCH 00/17] hot-replace support for RAID1 and RAID10 NeilBrown
2011-11-02 5:25 ` [md PATCH 02/17] md/raid10: change read_balance to return an rdev NeilBrown
2011-11-02 5:25 ` [md PATCH 03/17] md/raid10: preferentially read from replacement device if possible NeilBrown
@ 2011-11-02 5:25 ` NeilBrown
2011-11-02 5:25 ` [md PATCH 01/17] md/raid10: prepare data structures for handling replacement NeilBrown
` (13 subsequent siblings)
16 siblings, 0 replies; 18+ messages in thread
From: NeilBrown @ 2011-11-02 5:25 UTC (permalink / raw)
To: linux-raid
Enhance raid10_remove_disk to be able to remove ->replacement
as well as ->rdev
Signed-off-by: NeilBrown <neilb@suse.de>
---
drivers/md/raid10.c | 57 +++++++++++++++++++++++++++++----------------------
1 files changed, 32 insertions(+), 25 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index fffdc43..698696b 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1431,34 +1431,41 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
struct r10conf *conf = mddev->private;
int err = 0;
int number = rdev->raid_disk;
- struct mirror_info *p = conf->mirrors+ number;
+ struct md_rdev **rdevp;
+ struct mirror_info *p = conf->mirrors + number;
print_conf(conf);
- if (rdev == p->rdev) {
- if (test_bit(In_sync, &rdev->flags) ||
- atomic_read(&rdev->nr_pending)) {
- err = -EBUSY;
- goto abort;
- }
- /* Only remove faulty devices in recovery
- * is not possible.
- */
- if (!test_bit(Faulty, &rdev->flags) &&
- mddev->recovery_disabled != p->recovery_disabled &&
- enough(conf, -1)) {
- err = -EBUSY;
- goto abort;
- }
- p->rdev = NULL;
- synchronize_rcu();
- if (atomic_read(&rdev->nr_pending)) {
- /* lost the race, try later */
- err = -EBUSY;
- p->rdev = rdev;
- goto abort;
- }
- err = md_integrity_register(mddev);
+ if (rdev == p->rdev)
+ rdevp = &p->rdev;
+ else if (rdev == p->replacement)
+ rdevp = &p->replacement;
+ else
+ return 0;
+
+ if (test_bit(In_sync, &rdev->flags) ||
+ atomic_read(&rdev->nr_pending)) {
+ err = -EBUSY;
+ goto abort;
+ }
+ /* Only remove faulty devices if recovery
+ * is not possible.
+ */
+ if (!test_bit(Faulty, &rdev->flags) &&
+ mddev->recovery_disabled != p->recovery_disabled &&
+ enough(conf, -1)) {
+ err = -EBUSY;
+ goto abort;
}
+ *rdevp = NULL;
+ synchronize_rcu();
+ if (atomic_read(&rdev->nr_pending)) {
+ /* lost the race, try later */
+ err = -EBUSY;
+ *rdevp = rdev;
+ goto abort;
+ }
+ err = md_integrity_register(mddev);
+
abort:
print_conf(conf);
^ permalink raw reply related [flat|nested] 18+ messages in thread* [md PATCH 01/17] md/raid10: prepare data structures for handling replacement.
2011-11-02 5:25 [md PATCH 00/17] hot-replace support for RAID1 and RAID10 NeilBrown
` (2 preceding siblings ...)
2011-11-02 5:25 ` [md PATCH 04/17] md/raid10: allow removal of failed replacement devices NeilBrown
@ 2011-11-02 5:25 ` NeilBrown
2011-11-02 5:25 ` [md PATCH 09/17] md/raid10: recognise replacements when assembling array NeilBrown
` (12 subsequent siblings)
16 siblings, 0 replies; 18+ messages in thread
From: NeilBrown @ 2011-11-02 5:25 UTC (permalink / raw)
To: linux-raid
Allow each slot in the RAID10 to have 2 devices, the replaceable
and the replacement.
Also an r10bio to have 2 bios, and for resync/recovery allocate the
second bio if there are any replacement devices.
Signed-off-by: NeilBrown <neilb@suse.de>
---
drivers/md/raid10.c | 48 ++++++++++++++++++++++++++++++++++------
drivers/md/raid10.h | 61 +++++++++++++++++++++++++++++++--------------------
2 files changed, 78 insertions(+), 31 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index e43c55e..93e47f6 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -72,7 +72,8 @@ static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
struct r10conf *conf = data;
int size = offsetof(struct r10bio, devs[conf->copies]);
- /* allocate a r10bio with room for raid_disks entries in the bios array */
+ /* allocate a r10bio with room for raid_disks entries in the
+ * bios array */
return kzalloc(size, gfp_flags);
}
@@ -122,12 +123,19 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
if (!bio)
goto out_free_bio;
r10_bio->devs[j].bio = bio;
+ if (!conf->have_replacement)
+ continue;
+ bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
+ if (!bio)
+ goto out_free_bio;
+ r10_bio->devs[j].repl_bio = bio;
}
/*
* Allocate RESYNC_PAGES data pages and attach them
* where needed.
*/
for (j = 0 ; j < nalloc; j++) {
+ struct bio *rbio = r10_bio->devs[j].repl_bio;
bio = r10_bio->devs[j].bio;
for (i = 0; i < RESYNC_PAGES; i++) {
if (j == 1 && !test_bit(MD_RECOVERY_SYNC,
@@ -142,6 +150,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
goto out_free_pages;
bio->bi_io_vec[i].bv_page = page;
+ if (rbio)
+ rbio->bi_io_vec[i].bv_page = page;
}
}
@@ -155,8 +165,11 @@ out_free_pages:
safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
j = -1;
out_free_bio:
- while ( ++j < nalloc )
+ while (++j < nalloc) {
bio_put(r10_bio->devs[j].bio);
+ if (r10_bio->devs[j].repl_bio)
+ bio_put(r10_bio->devs[j].repl_bio);
+ }
r10bio_pool_free(r10_bio, conf);
return NULL;
}
@@ -177,6 +190,9 @@ static void r10buf_pool_free(void *__r10_bio, void *data)
}
bio_put(bio);
}
+ bio = r10bio->devs[j].repl_bio;
+ if (bio)
+ bio_put(bio);
}
r10bio_pool_free(r10bio, conf);
}
@@ -190,6 +206,10 @@ static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
if (!BIO_SPECIAL(*bio))
bio_put(*bio);
*bio = NULL;
+ bio = &r10_bio->devs[i].repl_bio;
+ if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
+ bio_put(*bio);
+ *bio = NULL;
}
}
@@ -274,19 +294,27 @@ static inline void update_head_pos(int slot, struct r10bio *r10_bio)
* Find the disk number which triggered given bio
*/
static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
- struct bio *bio, int *slotp)
+ struct bio *bio, int *slotp, int *replp)
{
int slot;
+ int repl = 0;
- for (slot = 0; slot < conf->copies; slot++)
+ for (slot = 0; slot < conf->copies; slot++) {
if (r10_bio->devs[slot].bio == bio)
break;
+ if (r10_bio->devs[slot].repl_bio == bio) {
+ repl = 1;
+ break;
+ }
+ }
BUG_ON(slot == conf->copies);
update_head_pos(slot, r10_bio);
if (slotp)
*slotp = slot;
+ if (replp)
+ *replp = repl;
return r10_bio->devs[slot].devnum;
}
@@ -367,7 +395,7 @@ static void raid10_end_write_request(struct bio *bio, int error)
struct r10conf *conf = r10_bio->mddev->private;
int slot;
- dev = find_bio_disk(conf, r10_bio, bio, &slot);
+ dev = find_bio_disk(conf, r10_bio, bio, &slot, NULL);
/*
* this branch is our 'one mirror IO has finished' event handler:
@@ -1026,6 +1054,7 @@ read_again:
*/
plugged = mddev_check_plugged(mddev);
+ r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
raid10_find_phys(conf, r10_bio);
retry_write:
blocked_rdev = NULL;
@@ -1433,7 +1462,7 @@ static void end_sync_read(struct bio *bio, int error)
struct r10conf *conf = r10_bio->mddev->private;
int d;
- d = find_bio_disk(conf, r10_bio, bio, NULL);
+ d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
if (test_bit(BIO_UPTODATE, &bio->bi_flags))
set_bit(R10BIO_Uptodate, &r10_bio->state);
@@ -1495,7 +1524,7 @@ static void end_sync_write(struct bio *bio, int error)
int bad_sectors;
int slot;
- d = find_bio_disk(conf, r10_bio, bio, &slot);
+ d = find_bio_disk(conf, r10_bio, bio, &slot, NULL);
if (!uptodate) {
set_bit(WriteErrorSeen, &conf->mirrors[d].rdev->flags);
@@ -2273,9 +2302,14 @@ static void raid10d(struct mddev *mddev)
static int init_resync(struct r10conf *conf)
{
int buffs;
+ int i;
buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
BUG_ON(conf->r10buf_pool);
+ conf->have_replacement = 0;
+ for (i = 0; i < conf->raid_disks; i++)
+ if (conf->mirrors[i].replacement)
+ conf->have_replacement = 1;
conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
if (!conf->r10buf_pool)
return -ENOMEM;
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 7facfdf..7c615613 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -2,7 +2,7 @@
#define _RAID10_H
struct mirror_info {
- struct md_rdev *rdev;
+ struct md_rdev *rdev, *replacement;
sector_t head_position;
int recovery_disabled; /* matches
* mddev->recovery_disabled
@@ -18,12 +18,13 @@ struct r10conf {
spinlock_t device_lock;
/* geometry */
- int near_copies; /* number of copies laid out raid0 style */
+ int near_copies; /* number of copies laid out
+ * raid0 style */
int far_copies; /* number of copies laid out
* at large strides across drives
*/
- int far_offset; /* far_copies are offset by 1 stripe
- * instead of many
+ int far_offset; /* far_copies are offset by 1
+ * stripe instead of many
*/
int copies; /* near_copies * far_copies.
* must be <= raid_disks
@@ -34,10 +35,11 @@ struct r10conf {
* 1 stripe.
*/
- sector_t dev_sectors; /* temp copy of mddev->dev_sectors */
+ sector_t dev_sectors; /* temp copy of
+ * mddev->dev_sectors */
- int chunk_shift; /* shift from chunks to sectors */
- sector_t chunk_mask;
+ int chunk_shift; /* shift from chunks to sectors */
+ sector_t chunk_mask;
struct list_head retry_list;
/* queue pending writes and submit them on unplug */
@@ -45,20 +47,22 @@ struct r10conf {
int pending_count;
spinlock_t resync_lock;
- int nr_pending;
- int nr_waiting;
- int nr_queued;
- int barrier;
+ int nr_pending;
+ int nr_waiting;
+ int nr_queued;
+ int barrier;
sector_t next_resync;
int fullsync; /* set to 1 if a full sync is needed,
* (fresh device added).
* Cleared when a sync completes.
*/
-
+ int have_replacement; /* There is at least one
+ * replacement device.
+ */
wait_queue_head_t wait_barrier;
- mempool_t *r10bio_pool;
- mempool_t *r10buf_pool;
+ mempool_t *r10bio_pool;
+ mempool_t *r10buf_pool;
struct page *tmppage;
/* When taking over an array from a different personality, we store
@@ -98,11 +102,18 @@ struct r10bio {
* When resyncing we also use one for each copy.
* When reconstructing, we use 2 bios, one for read, one for write.
* We choose the number when they are allocated.
+ * We sometimes need an extra bio to write to the replacement.
*/
struct {
- struct bio *bio;
- sector_t addr;
- int devnum;
+ struct bio *bio;
+ union {
+ struct bio *repl_bio; /* used for resync and
+ * writes */
+ struct md_rdev *rdev; /* used for reads
+ * (read_slot >= 0) */
+ };
+ sector_t addr;
+ int devnum;
} devs[0];
};
@@ -121,17 +132,19 @@ struct r10bio {
#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
/* bits for r10bio.state */
-#define R10BIO_Uptodate 0
-#define R10BIO_IsSync 1
-#define R10BIO_IsRecover 2
-#define R10BIO_Degraded 3
+enum r10bio_state {
+ R10BIO_Uptodate,
+ R10BIO_IsSync,
+ R10BIO_IsRecover,
+ R10BIO_Degraded,
/* Set ReadError on bios that experience a read error
* so that raid10d knows what to do with them.
*/
-#define R10BIO_ReadError 4
+ R10BIO_ReadError,
/* If a write for this request means we can clear some
* known-bad-block records, we set this flag.
*/
-#define R10BIO_MadeGood 5
-#define R10BIO_WriteError 6
+ R10BIO_MadeGood,
+ R10BIO_WriteError,
+};
#endif
^ permalink raw reply related [flat|nested] 18+ messages in thread* [md PATCH 09/17] md/raid10: recognise replacements when assembling array.
2011-11-02 5:25 [md PATCH 00/17] hot-replace support for RAID1 and RAID10 NeilBrown
` (3 preceding siblings ...)
2011-11-02 5:25 ` [md PATCH 01/17] md/raid10: prepare data structures for handling replacement NeilBrown
@ 2011-11-02 5:25 ` NeilBrown
2011-11-02 5:25 ` [md PATCH 14/17] md/raid1: handle activation of replacement device when recovery completes NeilBrown
` (11 subsequent siblings)
16 siblings, 0 replies; 18+ messages in thread
From: NeilBrown @ 2011-11-02 5:25 UTC (permalink / raw)
To: linux-raid
If a Replacement is seen, file it as such.
If we see two replacements (or two normal devices) for the one slot,
abort.
Signed-off-by: NeilBrown <neilb@suse.de>
---
drivers/md/raid10.c | 17 +++++++++++++++++
1 files changed, 17 insertions(+), 0 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index fd28f03..15e3f73 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3195,6 +3195,16 @@ static int run(struct mddev *mddev)
continue;
disk = conf->mirrors + disk_idx;
+ if (test_bit(Replacement, &rdev->flags)) {
+ if (disk->replacement)
+ goto out_free_conf;
+ disk->replacement = rdev;
+ } else {
+ if (disk->rdev)
+ goto out_free_conf;
+ disk->rdev = rdev;
+ }
+
disk->rdev = rdev;
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
@@ -3222,6 +3232,13 @@ static int run(struct mddev *mddev)
disk = conf->mirrors + i;
+ if (!disk->rdev && disk->replacement) {
+ /* The replacement is all we have - use it */
+ disk->rdev = disk->replacement;
+ disk->replacement = NULL;
+ clear_bit(Replacement, &disk->rdev->flags);
+ }
+
if (!disk->rdev ||
!test_bit(In_sync, &disk->rdev->flags)) {
disk->head_position = 0;
^ permalink raw reply related [flat|nested] 18+ messages in thread* [md PATCH 14/17] md/raid1: handle activation of replacement device when recovery completes.
2011-11-02 5:25 [md PATCH 00/17] hot-replace support for RAID1 and RAID10 NeilBrown
` (4 preceding siblings ...)
2011-11-02 5:25 ` [md PATCH 09/17] md/raid10: recognise replacements when assembling array NeilBrown
@ 2011-11-02 5:25 ` NeilBrown
2011-11-02 5:25 ` [md PATCH 11/17] md/raid1: Replace use of mddev->raid_disks with conf->raid_disks NeilBrown
` (10 subsequent siblings)
16 siblings, 0 replies; 18+ messages in thread
From: NeilBrown @ 2011-11-02 5:25 UTC (permalink / raw)
To: linux-raid
When recovery completes ->spare_active is called.
This checks if the replacement is ready and if so it fails
the original.
Signed-off-by: NeilBrown <neilb@suse.de>
---
drivers/md/raid1.c | 36 +++++++++++++++++++++++++++++++++---
1 files changed, 33 insertions(+), 3 deletions(-)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index eac9a1d..5307e73 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1265,6 +1265,25 @@ static int raid1_spare_active(struct mddev *mddev)
*/
for (i = 0; i < conf->raid_disks; i++) {
struct md_rdev *rdev = conf->mirrors[i].rdev;
+ struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev;
+ if (repl
+ && repl->recovery_offset == MaxSector
+ && !test_bit(Faulty, &repl->flags)
+ && !test_and_set_bit(In_sync, &repl->flags)) {
+ /* replacement has just become active */
+ if (!rdev ||
+ !test_and_clear_bit(In_sync, &rdev->flags))
+ count++;
+ if (rdev) {
+ /* Replaced device not technically
+ * faulty, but we need to be sure
+ * it gets removed and never re-added
+ */
+ set_bit(Faulty, &rdev->flags);
+ sysfs_notify_dirent_safe(
+ rdev->sysfs_state);
+ }
+ }
if (rdev
&& !test_bit(Faulty, &rdev->flags)
&& !test_and_set_bit(In_sync, &rdev->flags)) {
@@ -1362,10 +1381,21 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
err = -EBUSY;
p->rdev = rdev;
goto abort;
- } else {
- clear_bit(Replacement, &rdev->flags);
+ } else if (conf->mirrors[conf->raid_disks + number].rdev) {
+ /* We just removed a device that is being replaced.
+ * Move down the replacement. We drain all IO before
+ * doing this to avoid confusion.
+ */
+ struct md_rdev *repl =
+ conf->mirrors[conf->raid_disks + number].rdev;
+ raise_barrier(conf);
+ clear_bit(Replacement, &repl->flags);
+ p->rdev = repl;
+ conf->mirrors[conf->raid_disks + number].rdev = NULL;
+ lower_barrier(conf);
+ clear_bit(Replaceable, &rdev->flags);
+ } else
clear_bit(Replaceable, &rdev->flags);
- }
err = md_integrity_register(mddev);
}
abort:
^ permalink raw reply related [flat|nested] 18+ messages in thread* [md PATCH 11/17] md/raid1: Replace use of mddev->raid_disks with conf->raid_disks.
2011-11-02 5:25 [md PATCH 00/17] hot-replace support for RAID1 and RAID10 NeilBrown
` (5 preceding siblings ...)
2011-11-02 5:25 ` [md PATCH 14/17] md/raid1: handle activation of replacement device when recovery completes NeilBrown
@ 2011-11-02 5:25 ` NeilBrown
2011-11-02 5:25 ` [md PATCH 08/17] md/raid10: Allow replacement device to be replace old drive NeilBrown
` (9 subsequent siblings)
16 siblings, 0 replies; 18+ messages in thread
From: NeilBrown @ 2011-11-02 5:25 UTC (permalink / raw)
To: linux-raid
In general mddev->raid_disks can change unexpectedly while
conf->raid_disks will only change in a very controlled way. So change
some uses of one to the other.
The use of mddev->raid_disks will not cause actually problems but
this way is more consistent and safer in the long term.
Signed-off-by: NeilBrown <neilb@suse.de>
---
drivers/md/raid1.c | 7 ++++---
1 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a52979b..74d4ce5 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -276,7 +276,8 @@ static inline void update_head_pos(int disk, struct r1bio *r1_bio)
static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio)
{
int mirror;
- int raid_disks = r1_bio->mddev->raid_disks;
+ struct r1conf *conf = r1_bio->mddev->private;
+ int raid_disks = conf->raid_disks;
for (mirror = 0; mirror < raid_disks; mirror++)
if (r1_bio->bios[mirror] == bio)
@@ -608,7 +609,7 @@ int md_raid1_congested(struct mddev *mddev, int bits)
return 1;
rcu_read_lock();
- for (i = 0; i < mddev->raid_disks; i++) {
+ for (i = 0; i < conf->raid_disks; i++) {
struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
if (rdev && !test_bit(Faulty, &rdev->flags)) {
struct request_queue *q = bdev_get_queue(rdev->bdev);
@@ -1287,7 +1288,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
int mirror = 0;
struct mirror_info *p;
int first = 0;
- int last = mddev->raid_disks - 1;
+ int last = conf->raid_disks - 1;
if (mddev->recovery_disabled == conf->recovery_disabled)
return -EBUSY;
^ permalink raw reply related [flat|nested] 18+ messages in thread* [md PATCH 08/17] md/raid10: Allow replacement device to be replace old drive.
2011-11-02 5:25 [md PATCH 00/17] hot-replace support for RAID1 and RAID10 NeilBrown
` (6 preceding siblings ...)
2011-11-02 5:25 ` [md PATCH 11/17] md/raid1: Replace use of mddev->raid_disks with conf->raid_disks NeilBrown
@ 2011-11-02 5:25 ` NeilBrown
2011-11-02 5:25 ` [md PATCH 05/17] md/raid10: writes should get directed to replacement as well as original NeilBrown
` (8 subsequent siblings)
16 siblings, 0 replies; 18+ messages in thread
From: NeilBrown @ 2011-11-02 5:25 UTC (permalink / raw)
To: linux-raid
When recovery finish and spare_active is called, check for a
replace that might have just become fully synced and mark it
as such, marking the original as failed.
Then when the original is removed, move the replacement into
its position.
This means that 'replacement' and spontaneously become NULL in some
situations. Make sure we check for those.
It also means that 'rdev' and 'replacement' could appear to be
identical - check for that too.
Signed-off-by: NeilBrown <neilb@suse.de>
---
drivers/md/raid10.c | 72 +++++++++++++++++++++++++++++++++++++++++++--------
1 files changed, 61 insertions(+), 11 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 89de485..fd28f03 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -396,14 +396,17 @@ static void raid10_end_write_request(struct bio *bio, int error)
int dec_rdev = 1;
struct r10conf *conf = r10_bio->mddev->private;
int slot, repl;
- struct md_rdev *rdev;
+ struct md_rdev *rdev = NULL;
dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
if (repl)
rdev = conf->mirrors[dev].replacement;
- else
+ if (!rdev) {
+ smp_rmb();
+ repl = 0;
rdev = conf->mirrors[dev].rdev;
+ }
/*
* this branch is our 'one mirror IO has finished' event handler:
*/
@@ -1090,6 +1093,8 @@ retry_write:
struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
struct md_rdev *rrdev = rcu_dereference(
conf->mirrors[d].replacement);
+ if (rdev == rrdev)
+ rrdev = NULL;
if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
atomic_inc(&rdev->nr_pending);
blocked_rdev = rdev;
@@ -1171,9 +1176,15 @@ retry_write:
rdev_dec_pending(conf->mirrors[d].rdev, mddev);
}
if (r10_bio->devs[j].repl_bio) {
+ struct md_rdev *rdev;
d = r10_bio->devs[j].devnum;
- rdev_dec_pending(
- conf->mirrors[d].replacement, mddev);
+ rdev = conf->mirrors[d].replacement;
+ if (!rdev) {
+ /* Race with remove_disk */
+ smp_mb();
+ rdev = conf->mirrors[d].rdev;
+ }
+ rdev_dec_pending(rdev, mddev);
}
}
allow_barrier(conf);
@@ -1231,6 +1242,10 @@ retry_write:
max_sectors);
r10_bio->devs[i].repl_bio = mbio;
+ /* We are actively writing to the original device
+ * so it cannot disappear, so the replacement cannot
+ * become NULL here
+ */
mbio->bi_sector = (r10_bio->devs[i].addr+
conf->mirrors[d].replacement->data_offset);
mbio->bi_bdev = conf->mirrors[d].replacement->bdev;
@@ -1406,9 +1421,27 @@ static int raid10_spare_active(struct mddev *mddev)
*/
for (i = 0; i < conf->raid_disks; i++) {
tmp = conf->mirrors + i;
- if (tmp->rdev
- && !test_bit(Faulty, &tmp->rdev->flags)
- && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
+ if (tmp->replacement
+ && tmp->replacement->recovery_offset == MaxSector
+ && !test_bit(Faulty, &tmp->replacement->flags)
+ && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
+ /* Replacement has just become active */
+ if (!tmp->rdev
+ || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
+ count++;
+ if (tmp->rdev) {
+ /* Replaced device not technically faulty,
+ * but we need to be sure it gets removed
+ * and never re-added.
+ */
+ set_bit(Faulty, &tmp->rdev->flags);
+ sysfs_notify_dirent_safe(
+ tmp->rdev->sysfs_state);
+ }
+ sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
+ } else if (tmp->rdev
+ && !test_bit(Faulty, &tmp->rdev->flags)
+ && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
count++;
sysfs_notify_dirent(tmp->rdev->sysfs_state);
}
@@ -1508,6 +1541,7 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
*/
if (!test_bit(Faulty, &rdev->flags) &&
mddev->recovery_disabled != p->recovery_disabled &&
+ (!p->replacement || p->replacement == rdev) &&
enough(conf, -1)) {
err = -EBUSY;
goto abort;
@@ -1519,7 +1553,21 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
err = -EBUSY;
*rdevp = rdev;
goto abort;
- }
+ } else if (p->replacement) {
+ /* We must have just cleared 'rdev' */
+ p->rdev = p->replacement;
+ clear_bit(Replacement, &p->replacement->flags);
+ smp_mb(); /* Make sure other CPUs may see both as identical
+ * but will never see neither -- if they are careful.
+ */
+ p->replacement = NULL;
+ clear_bit(Replaceable, &rdev->flags);
+ } else
+ /* We might have just remove the Replacement as faulty
+ * Clear the flag just in case
+ */
+ clear_bit(Replaceable, &rdev->flags);
+
err = md_integrity_register(mddev);
abort:
@@ -1597,13 +1645,15 @@ static void end_sync_write(struct bio *bio, int error)
int bad_sectors;
int slot;
int repl;
- struct md_rdev *rdev;
+ struct md_rdev *rdev = NULL;
d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
if (repl)
rdev = conf->mirrors[d].replacement;
- else
+ if (!rdev) {
+ smp_mb();
rdev = conf->mirrors[d].rdev;
+ }
if (!uptodate) {
if (repl)
@@ -2370,7 +2420,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
}
bio = r10_bio->devs[m].repl_bio;
rdev = conf->mirrors[dev].replacement;
- if (bio == IO_MADE_GOOD) {
+ if (rdev && bio == IO_MADE_GOOD) {
rdev_clear_badblocks(
rdev,
r10_bio->devs[m].addr,
^ permalink raw reply related [flat|nested] 18+ messages in thread* [md PATCH 05/17] md/raid10: writes should get directed to replacement as well as original.
2011-11-02 5:25 [md PATCH 00/17] hot-replace support for RAID1 and RAID10 NeilBrown
` (7 preceding siblings ...)
2011-11-02 5:25 ` [md PATCH 08/17] md/raid10: Allow replacement device to be replace old drive NeilBrown
@ 2011-11-02 5:25 ` NeilBrown
2011-11-02 5:25 ` [md PATCH 06/17] md/raid10: Handle replacement devices during resync NeilBrown
` (7 subsequent siblings)
16 siblings, 0 replies; 18+ messages in thread
From: NeilBrown @ 2011-11-02 5:25 UTC (permalink / raw)
To: linux-raid
When writing, we need to submit two writes, one to the original,
and one to the replacements - if there is a replacement.
If the write to the replacement results in a write error we just
fail the device. We only try to record write errors to the
original.
This only handles writing new data. Writing for resync/recovery
will come later.
Signed-off-by: NeilBrown <neilb@suse.de>
---
drivers/md/raid10.c | 83 +++++++++++++++++++++++++++++++++++++++++++++------
1 files changed, 74 insertions(+), 9 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 698696b..0db0402 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -395,17 +395,29 @@ static void raid10_end_write_request(struct bio *bio, int error)
int dev;
int dec_rdev = 1;
struct r10conf *conf = r10_bio->mddev->private;
- int slot;
+ int slot, repl;
+ struct md_rdev *rdev;
- dev = find_bio_disk(conf, r10_bio, bio, &slot, NULL);
+ dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
+ if (repl)
+ rdev = conf->mirrors[dev].replacement;
+ else
+ rdev = conf->mirrors[dev].rdev;
/*
* this branch is our 'one mirror IO has finished' event handler:
*/
if (!uptodate) {
- set_bit(WriteErrorSeen, &conf->mirrors[dev].rdev->flags);
- set_bit(R10BIO_WriteError, &r10_bio->state);
- dec_rdev = 0;
+ if (repl)
+ /* Never record new bad blocks to replacement,
+ * just fail it.
+ */
+ md_error(rdev->mddev, rdev);
+ else {
+ set_bit(WriteErrorSeen, &rdev->flags);
+ set_bit(R10BIO_WriteError, &r10_bio->state);
+ dec_rdev = 0;
+ }
} else {
/*
* Set R10BIO_Uptodate in our master bio, so that
@@ -422,12 +434,15 @@ static void raid10_end_write_request(struct bio *bio, int error)
set_bit(R10BIO_Uptodate, &r10_bio->state);
/* Maybe we can clear some bad blocks. */
- if (is_badblock(conf->mirrors[dev].rdev,
+ if (is_badblock(rdev,
r10_bio->devs[slot].addr,
r10_bio->sectors,
&first_bad, &bad_sectors)) {
bio_put(bio);
- r10_bio->devs[slot].bio = IO_MADE_GOOD;
+ if (repl)
+ r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
+ else
+ r10_bio->devs[slot].bio = IO_MADE_GOOD;
dec_rdev = 0;
set_bit(R10BIO_MadeGood, &r10_bio->state);
}
@@ -443,7 +458,6 @@ static void raid10_end_write_request(struct bio *bio, int error)
rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
}
-
/*
* RAID10 layout manager
* As well as the chunksize and raid_disks count, there are two
@@ -1074,12 +1088,23 @@ retry_write:
for (i = 0; i < conf->copies; i++) {
int d = r10_bio->devs[i].devnum;
struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
+ struct md_rdev *rrdev = rcu_dereference(
+ conf->mirrors[d].replacement);
if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
atomic_inc(&rdev->nr_pending);
blocked_rdev = rdev;
break;
}
+ if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
+ atomic_inc(&rrdev->nr_pending);
+ blocked_rdev = rrdev;
+ break;
+ }
+ if (rrdev && test_bit(Faulty, &rrdev->flags))
+ rrdev = NULL;
+
r10_bio->devs[i].bio = NULL;
+ r10_bio->devs[i].repl_bio = NULL;
if (!rdev || test_bit(Faulty, &rdev->flags)) {
set_bit(R10BIO_Degraded, &r10_bio->state);
continue;
@@ -1128,6 +1153,10 @@ retry_write:
}
r10_bio->devs[i].bio = bio;
atomic_inc(&rdev->nr_pending);
+ if (rrdev) {
+ r10_bio->devs[i].repl_bio = bio;
+ atomic_inc(&rrdev->nr_pending);
+ }
}
rcu_read_unlock();
@@ -1136,11 +1165,17 @@ retry_write:
int j;
int d;
- for (j = 0; j < i; j++)
+ for (j = 0; j < i; j++) {
if (r10_bio->devs[j].bio) {
d = r10_bio->devs[j].devnum;
rdev_dec_pending(conf->mirrors[d].rdev, mddev);
}
+ if (r10_bio->devs[j].repl_bio) {
+ d = r10_bio->devs[j].devnum;
+ rdev_dec_pending(
+ conf->mirrors[d].replacement, mddev);
+ }
+ }
allow_barrier(conf);
md_wait_for_blocked_rdev(blocked_rdev, mddev);
wait_barrier(conf);
@@ -1187,6 +1222,27 @@ retry_write:
bio_list_add(&conf->pending_bio_list, mbio);
conf->pending_count++;
spin_unlock_irqrestore(&conf->device_lock, flags);
+
+ if (!r10_bio->devs[i].repl_bio)
+ continue;
+
+ mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+ md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
+ max_sectors);
+ r10_bio->devs[i].repl_bio = mbio;
+
+ mbio->bi_sector = (r10_bio->devs[i].addr+
+ conf->mirrors[d].replacement->data_offset);
+ mbio->bi_bdev = conf->mirrors[d].replacement->bdev;
+ mbio->bi_end_io = raid10_end_write_request;
+ mbio->bi_rw = WRITE | do_sync | do_fua;
+ mbio->bi_private = r10_bio;
+
+ atomic_inc(&r10_bio->remaining);
+ spin_lock_irqsave(&conf->device_lock, flags);
+ bio_list_add(&conf->pending_bio_list, mbio);
+ conf->pending_count++;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
}
/* Don't remove the bias on 'remaining' (one_write_done) until
@@ -2255,6 +2311,15 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
}
rdev_dec_pending(rdev, conf->mddev);
}
+ bio = r10_bio->devs[m].repl_bio;
+ rdev = conf->mirrors[dev].replacement;
+ if (bio == IO_MADE_GOOD) {
+ rdev_clear_badblocks(
+ rdev,
+ r10_bio->devs[m].addr,
+ r10_bio->sectors);
+ rdev_dec_pending(rdev, conf->mddev);
+ }
}
if (test_bit(R10BIO_WriteError,
&r10_bio->state))
^ permalink raw reply related [flat|nested] 18+ messages in thread* [md PATCH 06/17] md/raid10: Handle replacement devices during resync.
2011-11-02 5:25 [md PATCH 00/17] hot-replace support for RAID1 and RAID10 NeilBrown
` (8 preceding siblings ...)
2011-11-02 5:25 ` [md PATCH 05/17] md/raid10: writes should get directed to replacement as well as original NeilBrown
@ 2011-11-02 5:25 ` NeilBrown
2011-11-02 5:25 ` [md PATCH 13/17] md/raid1: Allow a failed replacement device to be removed NeilBrown
` (6 subsequent siblings)
16 siblings, 0 replies; 18+ messages in thread
From: NeilBrown @ 2011-11-02 5:25 UTC (permalink / raw)
To: linux-raid
If we need to resync an array which has replacement devices,
we always write any block checked to every replacement.
If the resync was bitmap-based resync we will then complete the
replacement normally.
If it was a full resync, we mark the replacements as fully recovered
when the resync finishes so no further recovery is needed.
Signed-off-by: NeilBrown <neilb@suse.de>
---
drivers/md/raid10.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++---
1 files changed, 98 insertions(+), 7 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 0db0402..a2341ca 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1596,19 +1596,29 @@ static void end_sync_write(struct bio *bio, int error)
sector_t first_bad;
int bad_sectors;
int slot;
+ int repl;
+ struct md_rdev *rdev;
- d = find_bio_disk(conf, r10_bio, bio, &slot, NULL);
+ d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
+ if (repl)
+ rdev = conf->mirrors[d].replacement;
+ else
+ rdev = conf->mirrors[d].rdev;
if (!uptodate) {
- set_bit(WriteErrorSeen, &conf->mirrors[d].rdev->flags);
- set_bit(R10BIO_WriteError, &r10_bio->state);
- } else if (is_badblock(conf->mirrors[d].rdev,
+ if (repl)
+ md_error(mddev, rdev);
+ else {
+ set_bit(WriteErrorSeen, &rdev->flags);
+ set_bit(R10BIO_WriteError, &r10_bio->state);
+ }
+ } else if (is_badblock(rdev,
r10_bio->devs[slot].addr,
r10_bio->sectors,
&first_bad, &bad_sectors))
set_bit(R10BIO_MadeGood, &r10_bio->state);
- rdev_dec_pending(conf->mirrors[d].rdev, mddev);
+ rdev_dec_pending(rdev, mddev);
end_sync_request(r10_bio);
}
@@ -1712,6 +1722,29 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
generic_make_request(tbio);
}
+ /* Now write out to any replacement devices
+ * that are active
+ */
+ for (i = 0; i < conf->copies; i++) {
+ int j, d;
+ int vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);
+
+ tbio = r10_bio->devs[i].repl_bio;
+ if (!tbio || !tbio->bi_end_io)
+ continue;
+ if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
+ && r10_bio->devs[i].bio != fbio)
+ for (j = 0; j < vcnt; j++)
+ memcpy(page_address(tbio->bi_io_vec[j].bv_page),
+ page_address(fbio->bi_io_vec[j].bv_page),
+ PAGE_SIZE);
+ d = r10_bio->devs[i].devnum;
+ atomic_inc(&r10_bio->remaining);
+ md_sync_acct(conf->mirrors[d].replacement->bdev,
+ tbio->bi_size >> 9);
+ generic_make_request(tbio);
+ }
+
done:
if (atomic_dec_and_test(&r10_bio->remaining)) {
md_done_sync(mddev, r10_bio->sectors, 1);
@@ -2289,6 +2322,22 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
r10_bio->sectors, 0))
md_error(conf->mddev, rdev);
}
+ rdev = conf->mirrors[dev].replacement;
+ if (r10_bio->devs[m].repl_bio == NULL)
+ continue;
+ if (test_bit(BIO_UPTODATE,
+ &r10_bio->devs[m].repl_bio->bi_flags)) {
+ rdev_clear_badblocks(
+ rdev,
+ r10_bio->devs[m].addr,
+ r10_bio->sectors);
+ } else {
+ if (!rdev_set_badblocks(
+ rdev,
+ r10_bio->devs[m].addr,
+ r10_bio->sectors, 0))
+ md_error(conf->mddev, rdev);
+ }
}
put_buf(r10_bio);
} else {
@@ -2471,9 +2520,22 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
bitmap_end_sync(mddev->bitmap, sect,
&sync_blocks, 1);
}
- } else /* completed sync */
+ } else {
+ /* completed sync */
+ if ((!mddev->bitmap || conf->fullsync)
+ && conf->have_replacement
+ && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+ /* Completed a full sync so the replacements
+ * are now fully recovered.
+ */
+ for (i = 0; i < conf->raid_disks; i++)
+ if (conf->mirrors[i].replacement)
+ conf->mirrors[i].replacement
+ ->recovery_offset
+ = MaxSector;
+ }
conf->fullsync = 0;
-
+ }
bitmap_close_sync(mddev->bitmap);
close_sync(conf);
*skipped = 1;
@@ -2721,6 +2783,9 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
sector_t first_bad, sector;
int bad_sectors;
+ if (r10_bio->devs[i].repl_bio)
+ r10_bio->devs[i].repl_bio->bi_end_io = NULL;
+
bio = r10_bio->devs[i].bio;
bio->bi_end_io = NULL;
clear_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -2751,6 +2816,27 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
conf->mirrors[d].rdev->data_offset;
bio->bi_bdev = conf->mirrors[d].rdev->bdev;
count++;
+
+ if (conf->mirrors[d].replacement == NULL ||
+ test_bit(Faulty,
+ &conf->mirrors[d].replacement->flags))
+ continue;
+
+ /* Need to set up for writing to the replacement */
+ bio = r10_bio->devs[i].repl_bio;
+ clear_bit(BIO_UPTODATE, &bio->bi_flags);
+
+ sector = r10_bio->devs[i].addr;
+ atomic_inc(&conf->mirrors[d].rdev->nr_pending);
+ bio->bi_next = biolist;
+ biolist = bio;
+ bio->bi_private = r10_bio;
+ bio->bi_end_io = end_sync_write;
+ bio->bi_rw = WRITE;
+ bio->bi_sector = sector +
+ conf->mirrors[d].replacement->data_offset;
+ bio->bi_bdev = conf->mirrors[d].replacement->bdev;
+ count++;
}
if (count < 2) {
@@ -2759,6 +2845,11 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
if (r10_bio->devs[i].bio->bi_end_io)
rdev_dec_pending(conf->mirrors[d].rdev,
mddev);
+ if (r10_bio->devs[i].repl_bio &&
+ r10_bio->devs[i].repl_bio->bi_end_io)
+ rdev_dec_pending(
+ conf->mirrors[d].replacement,
+ mddev);
}
put_buf(r10_bio);
biolist = NULL;
^ permalink raw reply related [flat|nested] 18+ messages in thread* [md PATCH 13/17] md/raid1: Allow a failed replacement device to be removed.
2011-11-02 5:25 [md PATCH 00/17] hot-replace support for RAID1 and RAID10 NeilBrown
` (9 preceding siblings ...)
2011-11-02 5:25 ` [md PATCH 06/17] md/raid10: Handle replacement devices during resync NeilBrown
@ 2011-11-02 5:25 ` NeilBrown
2011-11-02 5:25 ` [md PATCH 10/17] md/raid10: If there is a spare and a replaceable device, start replacement NeilBrown
` (5 subsequent siblings)
16 siblings, 0 replies; 18+ messages in thread
From: NeilBrown @ 2011-11-02 5:25 UTC (permalink / raw)
To: linux-raid
Replacement devices are stored at a different offset, so look
there too.
Signed-off-by: NeilBrown <neilb@suse.de>
---
drivers/md/raid1.c | 6 ++++++
1 files changed, 6 insertions(+), 0 deletions(-)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 99cd12e..eac9a1d 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1336,6 +1336,9 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
int number = rdev->raid_disk;
struct mirror_info *p = conf->mirrors+ number;
+ if (rdev != p->rdev)
+ p = conf->mirrors + conf->raid_disks + number;
+
print_conf(conf);
if (rdev == p->rdev) {
if (test_bit(In_sync, &rdev->flags) ||
@@ -1359,6 +1362,9 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
err = -EBUSY;
p->rdev = rdev;
goto abort;
+ } else {
+ clear_bit(Replacement, &rdev->flags);
+ clear_bit(Replaceable, &rdev->flags);
}
err = md_integrity_register(mddev);
}
^ permalink raw reply related [flat|nested] 18+ messages in thread* [md PATCH 10/17] md/raid10: If there is a spare and a replaceable device, start replacement.
2011-11-02 5:25 [md PATCH 00/17] hot-replace support for RAID1 and RAID10 NeilBrown
` (10 preceding siblings ...)
2011-11-02 5:25 ` [md PATCH 13/17] md/raid1: Allow a failed replacement device to be removed NeilBrown
@ 2011-11-02 5:25 ` NeilBrown
2011-11-02 5:25 ` [md PATCH 15/17] md/raid1: recognise replacements when assembling arrays NeilBrown
` (4 subsequent siblings)
16 siblings, 0 replies; 18+ messages in thread
From: NeilBrown @ 2011-11-02 5:25 UTC (permalink / raw)
To: linux-raid
When attempting to add a spare to a RAID10 array, also consider
adding it as a replacement for a replaceable device.
Signed-off-by: NeilBrown <neilb@suse.de>
---
drivers/md/raid10.c | 40 ++++++++++++++++++++++++++++++++++++----
1 files changed, 36 insertions(+), 4 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 15e3f73..5993615 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -418,6 +418,9 @@ static void raid10_end_write_request(struct bio *bio, int error)
md_error(rdev->mddev, rdev);
else {
set_bit(WriteErrorSeen, &rdev->flags);
+ if (!test_and_set_bit(Replaceable, &rdev->flags))
+ set_bit(MD_RECOVERY_NEEDED,
+ &rdev->mddev->recovery);
set_bit(R10BIO_WriteError, &r10_bio->state);
dec_rdev = 0;
}
@@ -1483,8 +1486,25 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
struct mirror_info *p = &conf->mirrors[mirror];
if (p->recovery_disabled == mddev->recovery_disabled)
continue;
- if (p->rdev)
- continue;
+ if (p->rdev) {
+ if (!test_bit(Replaceable, &p->rdev->flags) ||
+ p->replacement != NULL)
+ continue;
+ clear_bit(In_sync, &rdev->flags);
+ set_bit(Replacement, &rdev->flags);
+ rdev->raid_disk = mirror;
+ err = 0;
+ disk_stack_limits(mddev->gendisk, rdev->bdev,
+ rdev->data_offset << 9);
+ if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
+ blk_queue_max_segments(mddev->queue, 1);
+ blk_queue_segment_boundary(mddev->queue,
+ PAGE_CACHE_SIZE - 1);
+ }
+ conf->fullsync = 1;
+ rcu_assign_pointer(p->replacement, rdev);
+ break;
+ }
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
@@ -1660,6 +1680,9 @@ static void end_sync_write(struct bio *bio, int error)
md_error(mddev, rdev);
else {
set_bit(WriteErrorSeen, &rdev->flags);
+ if (!test_and_set_bit(Replaceable, &rdev->flags))
+ set_bit(MD_RECOVERY_NEEDED,
+ &rdev->mddev->recovery);
set_bit(R10BIO_WriteError, &r10_bio->state);
}
} else if (is_badblock(rdev,
@@ -1854,8 +1877,13 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
s << 9,
bio->bi_io_vec[idx].bv_page,
WRITE, false);
- if (!ok)
+ if (!ok) {
set_bit(WriteErrorSeen, &rdev->flags);
+ if (!test_and_set_bit(Replaceable,
+ &rdev->flags))
+ set_bit(MD_RECOVERY_NEEDED,
+ &rdev->mddev->recovery);
+ }
}
if (!ok) {
/* We don't worry if we cannot set a bad block -
@@ -1973,8 +2001,12 @@ static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
/* success */
return 1;
- if (rw == WRITE)
+ if (rw == WRITE) {
set_bit(WriteErrorSeen, &rdev->flags);
+ if (!test_and_set_bit(Replaceable, &rdev->flags))
+ set_bit(MD_RECOVERY_NEEDED,
+ &rdev->mddev->recovery);
+ }
/* need to record an error - either for the block or the device */
if (!rdev_set_badblocks(rdev, sector, sectors, 0))
md_error(rdev->mddev, rdev);
^ permalink raw reply related [flat|nested] 18+ messages in thread* [md PATCH 15/17] md/raid1: recognise replacements when assembling arrays.
2011-11-02 5:25 [md PATCH 00/17] hot-replace support for RAID1 and RAID10 NeilBrown
` (11 preceding siblings ...)
2011-11-02 5:25 ` [md PATCH 10/17] md/raid10: If there is a spare and a replaceable device, start replacement NeilBrown
@ 2011-11-02 5:25 ` NeilBrown
2011-11-02 5:25 ` [md PATCH 07/17] md/raid10: handle recovery of replacement devices NeilBrown
` (3 subsequent siblings)
16 siblings, 0 replies; 18+ messages in thread
From: NeilBrown @ 2011-11-02 5:25 UTC (permalink / raw)
To: linux-raid
If a Replacement is seen, file it as such.
If we see two replacements (or two normal devices) for the one slot,
abort.
Signed-off-by: NeilBrown <neilb@suse.de>
---
drivers/md/raid1.c | 25 +++++++++++++++++++++++--
1 files changed, 23 insertions(+), 2 deletions(-)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 5307e73..01cf420 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2453,14 +2453,20 @@ static struct r1conf *setup_conf(struct mddev *mddev)
conf->poolinfo->mddev = mddev;
+ err = -EINVAL;
spin_lock_init(&conf->device_lock);
list_for_each_entry(rdev, &mddev->disks, same_set) {
int disk_idx = rdev->raid_disk;
if (disk_idx >= mddev->raid_disks
|| disk_idx < 0)
continue;
- disk = conf->mirrors + disk_idx;
+ if (test_bit(Replacement, &rdev->flags))
+ disk = conf->mirrors + conf->raid_disks + disk_idx;
+ else
+ disk = conf->mirrors + disk_idx;
+ if (disk->rdev)
+ goto abort;
disk->rdev = rdev;
disk->head_position = 0;
@@ -2476,11 +2482,27 @@ static struct r1conf *setup_conf(struct mddev *mddev)
conf->pending_count = 0;
conf->recovery_disabled = mddev->recovery_disabled - 1;
+ err = -EIO;
conf->last_used = -1;
for (i = 0; i < conf->raid_disks * 2; i++) {
disk = conf->mirrors + i;
+ if (i < conf->raid_disks &&
+ disk[conf->raid_disks].rdev) {
+ /* This slot has a replacement. */
+ if (!disk->rdev) {
+ /* No original, just make the replacement
+ * a recovering spare
+ */
+ disk->rdev =
+ disk[conf->raid_disks].rdev;
+ disk[conf->raid_disks].rdev = NULL;
+ } else if (!test_bit(In_sync, &disk->rdev->flags))
+ /* Original is not in_sync - bad */
+ goto abort;
+ }
+
if (!disk->rdev ||
!test_bit(In_sync, &disk->rdev->flags)) {
disk->head_position = 0;
@@ -2494,7 +2516,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
conf->last_used = i;
}
- err = -EIO;
if (conf->last_used < 0) {
printk(KERN_ERR "md/raid1:%s: no operational mirrors\n",
mdname(mddev));
^ permalink raw reply related [flat|nested] 18+ messages in thread* [md PATCH 07/17] md/raid10: handle recovery of replacement devices.
2011-11-02 5:25 [md PATCH 00/17] hot-replace support for RAID1 and RAID10 NeilBrown
` (12 preceding siblings ...)
2011-11-02 5:25 ` [md PATCH 15/17] md/raid1: recognise replacements when assembling arrays NeilBrown
@ 2011-11-02 5:25 ` NeilBrown
2011-11-02 5:25 ` [md PATCH 12/17] md/raid1: Allocate spare to store replacement devices and their bios NeilBrown
` (2 subsequent siblings)
16 siblings, 0 replies; 18+ messages in thread
From: NeilBrown @ 2011-11-02 5:25 UTC (permalink / raw)
To: linux-raid
If there is a replacement device, then recover to it,
reading from any drives - maybe the one being replaced, maybe not.
Signed-off-by: NeilBrown <neilb@suse.de>
---
drivers/md/raid10.c | 98 ++++++++++++++++++++++++++++++++++++---------------
1 files changed, 70 insertions(+), 28 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index a2341ca..89de485 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1857,12 +1857,20 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
* share the pages with the first bio
* and submit the write request
*/
- wbio = r10_bio->devs[1].bio;
d = r10_bio->devs[1].devnum;
-
- atomic_inc(&conf->mirrors[d].rdev->nr_pending);
- md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
- generic_make_request(wbio);
+ wbio = r10_bio->devs[1].bio;
+ if (wbio->bi_end_io) {
+ atomic_inc(&conf->mirrors[d].rdev->nr_pending);
+ md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
+ generic_make_request(wbio);
+ }
+ wbio = r10_bio->devs[1].repl_bio;
+ if (wbio && wbio->bi_end_io) {
+ atomic_inc(&conf->mirrors[d].replacement->nr_pending);
+ md_sync_acct(conf->mirrors[d].replacement->bdev,
+ wbio->bi_size >> 9);
+ generic_make_request(wbio);
+ }
}
@@ -2592,23 +2600,30 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
sector_t sect;
int must_sync;
int any_working;
+ struct mirror_info *mirror = &conf->mirrors[i];
- if (conf->mirrors[i].rdev == NULL ||
- test_bit(In_sync, &conf->mirrors[i].rdev->flags))
+ if ((mirror->rdev == NULL ||
+ test_bit(In_sync, &mirror->rdev->flags))
+ &&
+ (mirror->replacement == NULL ||
+ test_bit(Faulty,
+ &mirror->replacement->flags)))
continue;
still_degraded = 0;
/* want to reconstruct this device */
rb2 = r10_bio;
sect = raid10_find_virt(conf, sector_nr, i);
- /* Unless we are doing a full sync, we only need
- * to recover the block if it is set in the bitmap
+ /* Unless we are doing a full sync, or a replacement
+ * we only need to recover the block if it is set in
+ * the bitmap
*/
must_sync = bitmap_start_sync(mddev->bitmap, sect,
&sync_blocks, 1);
if (sync_blocks < max_sync)
max_sync = sync_blocks;
if (!must_sync &&
+ mirror->replacement == NULL &&
!conf->fullsync) {
/* yep, skip the sync_blocks here, but don't assume
* that there will never be anything to do here
@@ -2678,33 +2693,52 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
bio->bi_end_io = end_sync_read;
bio->bi_rw = READ;
from_addr = r10_bio->devs[j].addr;
- bio->bi_sector = from_addr +
- conf->mirrors[d].rdev->data_offset;
- bio->bi_bdev = conf->mirrors[d].rdev->bdev;
- atomic_inc(&conf->mirrors[d].rdev->nr_pending);
- atomic_inc(&r10_bio->remaining);
- /* and we write to 'i' */
+ bio->bi_sector = from_addr + rdev->data_offset;
+ bio->bi_bdev = rdev->bdev;
+ atomic_inc(&rdev->nr_pending);
+ /* and we write to 'i' (if not in_sync) */
for (k=0; k<conf->copies; k++)
if (r10_bio->devs[k].devnum == i)
break;
BUG_ON(k == conf->copies);
- bio = r10_bio->devs[1].bio;
- bio->bi_next = biolist;
- biolist = bio;
- bio->bi_private = r10_bio;
- bio->bi_end_io = end_sync_write;
- bio->bi_rw = WRITE;
to_addr = r10_bio->devs[k].addr;
- bio->bi_sector = to_addr +
- conf->mirrors[i].rdev->data_offset;
- bio->bi_bdev = conf->mirrors[i].rdev->bdev;
-
r10_bio->devs[0].devnum = d;
r10_bio->devs[0].addr = from_addr;
r10_bio->devs[1].devnum = i;
r10_bio->devs[1].addr = to_addr;
+ rdev = mirror->rdev;
+ if (!test_bit(In_sync, &rdev->flags)) {
+ bio = r10_bio->devs[1].bio;
+ bio->bi_next = biolist;
+ biolist = bio;
+ bio->bi_private = r10_bio;
+ bio->bi_end_io = end_sync_write;
+ bio->bi_rw = WRITE;
+ bio->bi_sector = to_addr
+ + rdev->data_offset;
+ bio->bi_bdev = rdev->bdev;
+ atomic_inc(&r10_bio->remaining);
+ } else
+ r10_bio->devs[1].bio->bi_end_io = NULL;
+
+ /* and maybe write to replacement */
+ bio = r10_bio->devs[1].repl_bio;
+ if (bio)
+ bio->bi_end_io = NULL;
+ rdev = mirror->replacement;
+ if (rdev == NULL ||
+ test_bit(Faulty, &rdev->flags))
+ break;
+ bio->bi_next = biolist;
+ biolist = bio;
+ bio->bi_private = r10_bio;
+ bio->bi_end_io = end_sync_write;
+ bio->bi_rw = WRITE;
+ bio->bi_sector = to_addr + rdev->data_offset;
+ bio->bi_bdev = rdev->bdev;
+ atomic_inc(&r10_bio->remaining);
break;
}
if (j == conf->copies) {
@@ -2722,8 +2756,16 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
for (k = 0; k < conf->copies; k++)
if (r10_bio->devs[k].devnum == i)
break;
- if (!rdev_set_badblocks(
- conf->mirrors[i].rdev,
+ if (!test_bit(In_sync,
+ &mirror->rdev->flags)
+ && !rdev_set_badblocks(
+ mirror->rdev,
+ r10_bio->devs[k].addr,
+ max_sync, 0))
+ any_working = 0;
+ if (mirror->replacement &&
+ !rdev_set_badblocks(
+ mirror->replacement,
r10_bio->devs[k].addr,
max_sync, 0))
any_working = 0;
@@ -2734,7 +2776,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
printk(KERN_INFO "md/raid10:%s: insufficient "
"working devices for recovery.\n",
mdname(mddev));
- conf->mirrors[i].recovery_disabled
+ mirror->recovery_disabled
= mddev->recovery_disabled;
}
break;
^ permalink raw reply related [flat|nested] 18+ messages in thread* [md PATCH 12/17] md/raid1: Allocate spare to store replacement devices and their bios.
2011-11-02 5:25 [md PATCH 00/17] hot-replace support for RAID1 and RAID10 NeilBrown
` (13 preceding siblings ...)
2011-11-02 5:25 ` [md PATCH 07/17] md/raid10: handle recovery of replacement devices NeilBrown
@ 2011-11-02 5:25 ` NeilBrown
2011-11-02 5:25 ` [md PATCH 17/17] md/raid1: Mark device replaceable when we see a write error NeilBrown
2011-11-02 5:25 ` [md PATCH 16/17] md/raid1: If there is a spare and a replaceable device, start replacement NeilBrown
16 siblings, 0 replies; 18+ messages in thread
From: NeilBrown @ 2011-11-02 5:25 UTC (permalink / raw)
To: linux-raid
In RAID1, a replacement is much like a normal device, so we just
double the size of the relevant arrays and look at all possible
devices for reads and writes.
Signed-off-by: NeilBrown <neilb@suse.de>
---
drivers/md/raid1.c | 58 +++++++++++++++++++++++++++-------------------------
drivers/md/raid1.h | 7 +++++-
2 files changed, 36 insertions(+), 29 deletions(-)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 74d4ce5..99cd12e 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -134,7 +134,7 @@ out_free_pages:
put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
j = -1;
out_free_bio:
- while ( ++j < pi->raid_disks )
+ while (++j < pi->raid_disks)
bio_put(r1_bio->bios[j]);
r1bio_pool_free(r1_bio, data);
return NULL;
@@ -163,7 +163,7 @@ static void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio)
{
int i;
- for (i = 0; i < conf->raid_disks; i++) {
+ for (i = 0; i < conf->raid_disks * 2; i++) {
struct bio **bio = r1_bio->bios + i;
if (!BIO_SPECIAL(*bio))
bio_put(*bio);
@@ -184,7 +184,7 @@ static void put_buf(struct r1bio *r1_bio)
struct r1conf *conf = r1_bio->mddev->private;
int i;
- for (i=0; i<conf->raid_disks; i++) {
+ for (i = 0; i < conf->raid_disks * 2; i++) {
struct bio *bio = r1_bio->bios[i];
if (bio->bi_end_io)
rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev);
@@ -279,11 +279,11 @@ static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio)
struct r1conf *conf = r1_bio->mddev->private;
int raid_disks = conf->raid_disks;
- for (mirror = 0; mirror < raid_disks; mirror++)
+ for (mirror = 0; mirror < raid_disks * 2; mirror++)
if (r1_bio->bios[mirror] == bio)
break;
- BUG_ON(mirror == raid_disks);
+ BUG_ON(mirror == raid_disks * 2);
update_head_pos(mirror, r1_bio);
return mirror;
@@ -505,7 +505,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
start_disk = conf->last_used;
}
- for (i = 0 ; i < conf->raid_disks ; i++) {
+ for (i = 0 ; i < conf->raid_disks * 2 ; i++) {
sector_t dist;
sector_t first_bad;
int bad_sectors;
@@ -974,7 +974,7 @@ read_again:
*/
plugged = mddev_check_plugged(mddev);
- disks = conf->raid_disks;
+ disks = conf->raid_disks * 2;
retry_write:
blocked_rdev = NULL;
rcu_read_lock();
@@ -1494,7 +1494,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
}
}
d++;
- if (d == conf->raid_disks)
+ if (d == conf->raid_disks * 2)
d = 0;
} while (!success && d != r1_bio->read_disk);
@@ -1511,7 +1511,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
mdname(mddev),
bdevname(bio->bi_bdev, b),
(unsigned long long)r1_bio->sector);
- for (d = 0; d < conf->raid_disks; d++) {
+ for (d = 0; d < conf->raid_disks * 2; d++) {
rdev = conf->mirrors[d].rdev;
if (!rdev || test_bit(Faulty, &rdev->flags))
continue;
@@ -1537,7 +1537,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
/* write it back and re-read */
while (d != r1_bio->read_disk) {
if (d == 0)
- d = conf->raid_disks;
+ d = conf->raid_disks * 2;
d--;
if (r1_bio->bios[d]->bi_end_io != end_sync_read)
continue;
@@ -1552,7 +1552,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
d = start;
while (d != r1_bio->read_disk) {
if (d == 0)
- d = conf->raid_disks;
+ d = conf->raid_disks * 2;
d--;
if (r1_bio->bios[d]->bi_end_io != end_sync_read)
continue;
@@ -1585,7 +1585,7 @@ static int process_checks(struct r1bio *r1_bio)
int primary;
int i;
- for (primary = 0; primary < conf->raid_disks; primary++)
+ for (primary = 0; primary < conf->raid_disks * 2; primary++)
if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
r1_bio->bios[primary]->bi_end_io = NULL;
@@ -1593,7 +1593,7 @@ static int process_checks(struct r1bio *r1_bio)
break;
}
r1_bio->read_disk = primary;
- for (i = 0; i < conf->raid_disks; i++) {
+ for (i = 0; i < conf->raid_disks * 2; i++) {
int j;
int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
struct bio *pbio = r1_bio->bios[primary];
@@ -1657,7 +1657,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
{
struct r1conf *conf = mddev->private;
int i;
- int disks = conf->raid_disks;
+ int disks = conf->raid_disks * 2;
struct bio *bio, *wbio;
bio = r1_bio->bios[r1_bio->read_disk];
@@ -1738,7 +1738,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
success = 1;
else {
d++;
- if (d == conf->raid_disks)
+ if (d == conf->raid_disks * 2)
d = 0;
}
} while (!success && d != read_disk);
@@ -1754,7 +1754,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
start = d;
while (d != read_disk) {
if (d==0)
- d = conf->raid_disks;
+ d = conf->raid_disks * 2;
d--;
rdev = conf->mirrors[d].rdev;
if (rdev &&
@@ -1766,7 +1766,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
while (d != read_disk) {
char b[BDEVNAME_SIZE];
if (d==0)
- d = conf->raid_disks;
+ d = conf->raid_disks * 2;
d--;
rdev = conf->mirrors[d].rdev;
if (rdev &&
@@ -1888,7 +1888,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
{
int m;
int s = r1_bio->sectors;
- for (m = 0; m < conf->raid_disks ; m++) {
+ for (m = 0; m < conf->raid_disks * 2 ; m++) {
struct md_rdev *rdev = conf->mirrors[m].rdev;
struct bio *bio = r1_bio->bios[m];
if (bio->bi_end_io == NULL)
@@ -1910,7 +1910,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
{
int m;
- for (m = 0; m < conf->raid_disks ; m++)
+ for (m = 0; m < conf->raid_disks * 2 ; m++)
if (r1_bio->bios[m] == IO_MADE_GOOD) {
struct md_rdev *rdev = conf->mirrors[m].rdev;
rdev_clear_badblocks(rdev,
@@ -2185,7 +2185,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
r1_bio->state = 0;
set_bit(R1BIO_IsSync, &r1_bio->state);
- for (i=0; i < conf->raid_disks; i++) {
+ for (i = 0; i < conf->raid_disks * 2; i++) {
struct md_rdev *rdev;
bio = r1_bio->bios[i];
@@ -2256,7 +2256,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
* need to mark them bad on all write targets
*/
int ok = 1;
- for (i = 0 ; i < conf->raid_disks ; i++)
+ for (i = 0 ; i < conf->raid_disks * 2 ; i++)
if (r1_bio->bios[i]->bi_end_io == end_sync_write) {
struct md_rdev *rdev =
rcu_dereference(conf->mirrors[i].rdev);
@@ -2325,7 +2325,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
len = sync_blocks<<9;
}
- for (i=0 ; i < conf->raid_disks; i++) {
+ for (i = 0 ; i < conf->raid_disks * 2; i++) {
bio = r1_bio->bios[i];
if (bio->bi_end_io) {
page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
@@ -2358,7 +2358,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
*/
if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
atomic_set(&r1_bio->remaining, read_targets);
- for (i=0; i<conf->raid_disks; i++) {
+ for (i = 0; i < conf->raid_disks * 2; i++) {
bio = r1_bio->bios[i];
if (bio->bi_end_io == end_sync_read) {
md_sync_acct(bio->bi_bdev, nr_sectors);
@@ -2395,7 +2395,8 @@ static struct r1conf *setup_conf(struct mddev *mddev)
if (!conf)
goto abort;
- conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
+ conf->mirrors = kzalloc(sizeof(struct mirror_info)
+ * mddev->raid_disks * 2,
GFP_KERNEL);
if (!conf->mirrors)
goto abort;
@@ -2407,7 +2408,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
if (!conf->poolinfo)
goto abort;
- conf->poolinfo->raid_disks = mddev->raid_disks;
+ conf->poolinfo->raid_disks = mddev->raid_disks * 2;
conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
r1bio_pool_free,
conf->poolinfo);
@@ -2440,7 +2441,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
conf->recovery_disabled = mddev->recovery_disabled - 1;
conf->last_used = -1;
- for (i = 0; i < conf->raid_disks; i++) {
+ for (i = 0; i < conf->raid_disks * 2; i++) {
disk = conf->mirrors + i;
@@ -2667,7 +2668,7 @@ static int raid1_reshape(struct mddev *mddev)
if (!newpoolinfo)
return -ENOMEM;
newpoolinfo->mddev = mddev;
- newpoolinfo->raid_disks = raid_disks;
+ newpoolinfo->raid_disks = raid_disks * 2;
newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
r1bio_pool_free, newpoolinfo);
@@ -2675,7 +2676,8 @@ static int raid1_reshape(struct mddev *mddev)
kfree(newpoolinfo);
return -ENOMEM;
}
- newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL);
+ newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks * 2,
+ GFP_KERNEL);
if (!newmirrors) {
kfree(newpoolinfo);
mempool_destroy(newpool);
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index c732b6c..80ded13 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -12,6 +12,9 @@ struct mirror_info {
* pool was allocated for, so they know how much to allocate and free.
* mddev->raid_disks cannot be used, as it can change while a pool is active
* These two datums are stored in a kmalloced struct.
+ * The 'raid_disks' here is twice the raid_disks in r1conf.
+ * This allows space for each 'real' device can have a replacement in the
+ * second half of the array.
*/
struct pool_info {
@@ -21,7 +24,9 @@ struct pool_info {
struct r1conf {
struct mddev *mddev;
- struct mirror_info *mirrors;
+ struct mirror_info *mirrors; /* twice 'raid_disks' to
+ * allow for replacements.
+ */
int raid_disks;
/* When choose the best device for a read (read_balance())
^ permalink raw reply related [flat|nested] 18+ messages in thread* [md PATCH 17/17] md/raid1: Mark device replaceable when we see a write error.
2011-11-02 5:25 [md PATCH 00/17] hot-replace support for RAID1 and RAID10 NeilBrown
` (14 preceding siblings ...)
2011-11-02 5:25 ` [md PATCH 12/17] md/raid1: Allocate spare to store replacement devices and their bios NeilBrown
@ 2011-11-02 5:25 ` NeilBrown
2011-11-02 5:25 ` [md PATCH 16/17] md/raid1: If there is a spare and a replaceable device, start replacement NeilBrown
16 siblings, 0 replies; 18+ messages in thread
From: NeilBrown @ 2011-11-02 5:25 UTC (permalink / raw)
To: linux-raid
Now that Replaceable drives are replaced cleanly, mark a drive
as replaceable when we see a write error. It might get failed soon so
the Replaceable flag is irrelevant, but if the write error is recorded
in the bad block log, we still want to activate any spare that might
be available.
Signed-off-by: NeilBrown <neilb@suse.de>
---
drivers/md/raid1.c | 16 +++++++++++++++-
1 files changed, 15 insertions(+), 1 deletions(-)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 968428e..c717480 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -390,6 +390,11 @@ static void raid1_end_write_request(struct bio *bio, int error)
if (!uptodate) {
set_bit(WriteErrorSeen,
&conf->mirrors[mirror].rdev->flags);
+ if (!test_and_set_bit(Replaceable,
+ &conf->mirrors[mirror].rdev->flags))
+ set_bit(MD_RECOVERY_NEEDED, &
+ conf->mddev->recovery);
+
set_bit(R1BIO_WriteError, &r1_bio->state);
} else {
/*
@@ -1461,6 +1466,10 @@ static void end_sync_write(struct bio *bio, int error)
} while (sectors_to_go > 0);
set_bit(WriteErrorSeen,
&conf->mirrors[mirror].rdev->flags);
+ if (!test_and_set_bit(Replaceable,
+ &conf->mirrors[mirror].rdev->flags))
+ set_bit(MD_RECOVERY_NEEDED, &
+ mddev->recovery);
set_bit(R1BIO_WriteError, &r1_bio->state);
} else if (is_badblock(conf->mirrors[mirror].rdev,
r1_bio->sector,
@@ -1491,8 +1500,13 @@ static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector,
if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
/* success */
return 1;
- if (rw == WRITE)
+ if (rw == WRITE) {
set_bit(WriteErrorSeen, &rdev->flags);
+ if (!test_and_set_bit(Replaceable,
+ &rdev->flags))
+ set_bit(MD_RECOVERY_NEEDED, &
+ rdev->mddev->recovery);
+ }
/* need to record an error - either for the block or the device */
if (!rdev_set_badblocks(rdev, sector, sectors, 0))
md_error(rdev->mddev, rdev);
^ permalink raw reply related [flat|nested] 18+ messages in thread* [md PATCH 16/17] md/raid1: If there is a spare and a replaceable device, start replacement.
2011-11-02 5:25 [md PATCH 00/17] hot-replace support for RAID1 and RAID10 NeilBrown
` (15 preceding siblings ...)
2011-11-02 5:25 ` [md PATCH 17/17] md/raid1: Mark device replaceable when we see a write error NeilBrown
@ 2011-11-02 5:25 ` NeilBrown
16 siblings, 0 replies; 18+ messages in thread
From: NeilBrown @ 2011-11-02 5:25 UTC (permalink / raw)
To: linux-raid
When attempting to add a spare to a RAID1 array, also consider
adding it as a replacement for a replaceable device.
Signed-off-by: NeilBrown <neilb@suse.de>
---
drivers/md/raid1.c | 17 +++++++++++++++--
1 files changed, 15 insertions(+), 2 deletions(-)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 01cf420..968428e 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1315,8 +1315,9 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
if (rdev->raid_disk >= 0)
first = last = rdev->raid_disk;
- for (mirror = first; mirror <= last; mirror++)
- if ( !(p=conf->mirrors+mirror)->rdev) {
+ for (mirror = first; mirror <= last; mirror++) {
+ p = conf->mirrors+mirror;
+ if (!p->rdev) {
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
@@ -1343,6 +1344,18 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
rcu_assign_pointer(p->rdev, rdev);
break;
}
+ if (test_bit(Replaceable, &p->rdev->flags) &&
+ p[conf->raid_disks].rdev == NULL) {
+ /* Add this device as a replacement */
+ clear_bit(In_sync, &rdev->flags);
+ set_bit(Replacement, &rdev->flags);
+ rdev->raid_disk = mirror;
+ err = 0;
+ conf->fullsync = 1;
+ rcu_assign_pointer(p[conf->raid_disks].rdev, rdev);
+ break;
+ }
+ }
md_integrity_add_rdev(rdev, mddev);
print_conf(conf);
return err;
^ permalink raw reply related [flat|nested] 18+ messages in thread