From: raz ben yehuda <raziebe@013.net>
To: neilb@suse.de
Cc: linux-raid@vger.kernel.org
Subject: [001/002] raid0 reshape
Date: Sun, 03 May 2009 00:46:04 +0300
Message-ID: <1241300764.5607.36.camel@raz>
Hello Neil,
Below is the raid0 grow code. I decided to fix raid0 itself rather than
perform a raid0 -> raid4 -> raid0 transformation, for two reasons:
1. raid0 zones: this patch supports any zone transformation.
2. It avoids an undesired dependency of raid0 on the raid4 re-striping code.
The following tests were conducted (a sketch of the test procedure follows this list):
1. Various chunk sizes, 4K to 512K (mainly on 2.6.27 and 2.6.18).
2. Re-growing an already grown array (tested on 2.6.27 and 2.6.18).
3. Various superblock versions: 0.90, 1.0, 1.1 and 1.2 (mainly on 2.6.27 and 2.6.18).
4. Assembling and mounting a grown array with older raid0 code (older kernels and pre-patch code).
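A minimal sketch of such a grow test (the device names, chunk size and disk
count here are only placeholders, and the --add/--grow invocations assume an
mdadm/kernel pair that accepts them on a raid0 array, per the Kconfig help below):

  # build a two-disk raid0 and put some data on it
  mdadm --create /dev/md0 --level=0 --raid-devices=2 --chunk=64 /dev/sdb /dev/sdc
  mkfs.ext3 /dev/md0
  mount /dev/md0 /mnt
  cp -a /usr/share /mnt
  ( cd /mnt && find . -type f -exec md5sum {} \; ) > /tmp/sums.before
  umount /mnt

  # add a third disk and grow the array
  mdadm /dev/md0 --add /dev/sdd
  mdadm --grow /dev/md0 --raid-disks=3
  cat /proc/mdstat            # watch the reshape progress

  # once the reshape has finished, check that the data survived
  fsck -fn /dev/md0
  mount /dev/md0 /mnt
  ( cd /mnt && md5sum -c /tmp/sums.before )
  umount /mnt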
The patch passes checkpatch.pl. Apart from the reshape code itself, I also tidied up the existing code.
I am about to hand this code to our testing team for further tests.
Other things to do:
1. Speed up the reshape process; it is currently too slow.
2. Support chunk sizes that are not a power of two (in units of page size).
I would be grateful for your criticism.
Raz
drivers/md/Kconfig | 13
drivers/md/md.c | 6
drivers/md/raid0.c | 711 ++++++++++++++++++++++++++++++++++---------
drivers/md/raid0.h | 5
4 files changed, 590 insertions(+), 145 deletions(-)
Signed-off-by: Neil Brown <neilb@suse.de>
---
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 36e0675..a9f0ff6 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -77,6 +77,19 @@ config MD_RAID0
If unsure, say Y.
+config MD_RAID0_RESHAPE
+ bool "Support adding drives to a raid-0 array.(EXPERIMENTAL)"
+ depends on MD_RAID0 && EXPERIMENTAL
+ default n
+ ---help---
+ A RAID-0 set can be expanded by adding extra drives. This
+ requires "restriping" .
+ You will need mdadm version 2.4.x or later to use this.
+ The mdadm usage is e.g.
+ mdadm --grow /dev/md0 --raid-disks=6
+ Note: The array can only be expanded.
+ If unsure, say N.
+
config MD_RAID1
tristate "RAID-1 (mirroring) mode"
depends on BLK_DEV_MD
diff --git a/drivers/md/md.c b/drivers/md/md.c
index ed5727c..82f57ea 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5707,6 +5707,8 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
max_blocks = mddev->resync_max_sectors >> 1;
else
max_blocks = mddev->dev_sectors / 2;
+ if (mddev->level == 0)
+ max_blocks = mddev->array_sectors>>1;
/*
* Should not happen.
@@ -5915,7 +5917,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
if (mddev->pers) {
mddev->pers->status(seq, mddev);
seq_printf(seq, "\n ");
- if (mddev->pers->sync_request) {
+ if (mddev->pers->sync_request || !mddev->level) {
if (mddev->curr_resync > 2) {
status_resync(seq, mddev);
seq_printf(seq, "\n ");
@@ -6146,7 +6148,7 @@ int md_allow_write(mddev_t *mddev)
return 0;
if (mddev->ro)
return 0;
- if (!mddev->pers->sync_request)
+ if (!mddev->pers->sync_request && mddev->level != 0)
return 0;
spin_lock_irq(&mddev->write_lock);
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index c08d755..9e2b6de 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -18,11 +18,14 @@
Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
+#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/seq_file.h>
#include "md.h"
#include "raid0.h"
+static int raid0_create_reshape_thread(mddev_t *mddev);
+
static void raid0_unplug(struct request_queue *q)
{
mddev_t *mddev = q->queuedata;
@@ -53,27 +56,46 @@ static int raid0_congested(void *data, int bits)
}
-static int create_strip_zones (mddev_t *mddev)
+static void raid0_dump_zones(mddev_t *mddev)
{
- int i, c, j;
- sector_t current_start, curr_zone_start;
- sector_t min_spacing;
+ int j, k, h;
+ char b[BDEVNAME_SIZE];
raid0_conf_t *conf = mddev_to_conf(mddev);
- mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev;
- struct strip_zone *zone;
- int cnt;
+ printk(KERN_INFO "***** %s configuration ******\n\n",
+ mdname(mddev));
+ h = 0;
+ for (j = 0; j < conf->nr_strip_zones; j++) {
+ printk(KERN_INFO "zone%d", j);
+ if (conf->hash_table[h] == conf->strip_zone+j)
+ printk("(h%d)", h++);
+ printk(KERN_INFO "=[");
+ for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
+ printk(KERN_INFO "%s/", bdevname(
+ conf->strip_zone[j].dev[k]->bdev, b));
+ printk(KERN_INFO "]\n\t zone offset=%llu device offset=%llu size=%llukb\n",
+ (unsigned long long)conf->strip_zone[j].zone_start,
+ (unsigned long long)conf->strip_zone[j].dev_start,
+ (unsigned long long)conf->strip_zone[j].sectors>>1);
+ }
+ printk(KERN_INFO "**********************************\n\n");
+}
+
+
+static void raid0_count_zones(mddev_t *mddev, struct list_head *disks)
+{
+ int c = 0;
char b[BDEVNAME_SIZE];
-
+ mdk_rdev_t *rdev1, *rdev2;
+ raid0_conf_t *conf = mddev_to_conf(mddev);
/*
* The number of 'same size groups'
*/
conf->nr_strip_zones = 0;
-
- list_for_each_entry(rdev1, &mddev->disks, same_set) {
+ list_for_each_entry(rdev1, disks, same_set) {
printk(KERN_INFO "raid0: looking at %s\n",
bdevname(rdev1->bdev,b));
c = 0;
- list_for_each_entry(rdev2, &mddev->disks, same_set) {
+ list_for_each_entry(rdev2, disks, same_set) {
printk(KERN_INFO "raid0: comparing %s(%llu)",
bdevname(rdev1->bdev,b),
(unsigned long long)rdev1->sectors);
@@ -103,78 +125,72 @@ static int create_strip_zones (mddev_t *mddev)
}
}
printk(KERN_INFO "raid0: FINAL %d zones\n", conf->nr_strip_zones);
+}
- conf->strip_zone = kzalloc(sizeof(struct strip_zone)*
- conf->nr_strip_zones, GFP_KERNEL);
- if (!conf->strip_zone)
- return 1;
- conf->devlist = kzalloc(sizeof(mdk_rdev_t*)*
- conf->nr_strip_zones*mddev->raid_disks,
- GFP_KERNEL);
- if (!conf->devlist)
- return 1;
- /* The first zone must contain all devices, so here we check that
- * there is a proper alignment of slots to devices and find them all
- */
- zone = &conf->strip_zone[0];
- cnt = 0;
- smallest = NULL;
- zone->dev = conf->devlist;
- list_for_each_entry(rdev1, &mddev->disks, same_set) {
- int j = rdev1->raid_disk;
+/*
+ * The first zone must contain all devices, so here we check that
+ * there is a proper alignment of slots to devices and find them all
+ */
+static int raid0_create_first_zone(mddev_t *mddev, struct list_head *disks)
+{
+ mdk_rdev_t *smallest = NULL;
+ mdk_rdev_t *rdev;
+ int cnt = 0;
+ raid0_conf_t *conf = mddev_to_conf(mddev);
+ struct strip_zone *zone0 = &conf->strip_zone[0];
+ zone0->dev = conf->devlist;
+ list_for_each_entry(rdev, disks, same_set) {
+ int j = rdev->raid_disk;
if (j < 0 || j >= mddev->raid_disks) {
printk(KERN_ERR "raid0: bad disk number %d - "
"aborting!\n", j);
- goto abort;
+ return -1;
}
- if (zone->dev[j]) {
+ if (zone0->dev[j]) {
printk(KERN_ERR "raid0: multiple devices for %d - "
"aborting!\n", j);
- goto abort;
+ return -1;
}
- zone->dev[j] = rdev1;
-
- blk_queue_stack_limits(mddev->queue,
- rdev1->bdev->bd_disk->queue);
- /* as we don't honour merge_bvec_fn, we must never risk
- * violating it, so limit ->max_sector to one PAGE, as
- * a one page request is never in violation.
- */
-
- if (rdev1->bdev->bd_disk->queue->merge_bvec_fn &&
- mddev->queue->max_sectors > (PAGE_SIZE>>9))
- blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
-
- if (!smallest || (rdev1->sectors < smallest->sectors))
- smallest = rdev1;
+ zone0->dev[j] = rdev;
+ if (!smallest || (rdev->sectors < smallest->sectors))
+ smallest = rdev;
cnt++;
}
if (cnt != mddev->raid_disks) {
printk(KERN_ERR "raid0: too few disks (%d of %d) - "
"aborting!\n", cnt, mddev->raid_disks);
- goto abort;
+ return -1;
}
- zone->nb_dev = cnt;
- zone->sectors = smallest->sectors * cnt;
- zone->zone_start = 0;
+ zone0->nb_dev = cnt;
+ zone0->sectors = smallest->sectors * cnt;
+ zone0->zone_start = 0;
+ return 0;
+}
+
+
- current_start = smallest->sectors;
- curr_zone_start = zone->sectors;
+static void raid0_set_higher_zones(mddev_t *mddev)
+{
+ int i, j, c;
+ mdk_rdev_t *rdev;
+ struct strip_zone *zone;
+ raid0_conf_t *conf = mddev_to_conf(mddev);
+ mdk_rdev_t *smallest;
+ sector_t current_start =
+ conf->strip_zone[0].sectors/conf->strip_zone[0].nb_dev;
+ sector_t curr_zone_start = conf->strip_zone[0].sectors;
/* now do the other zones */
- for (i = 1; i < conf->nr_strip_zones; i++)
- {
+ for (i = 1; i < conf->nr_strip_zones; i++) {
zone = conf->strip_zone + i;
zone->dev = conf->strip_zone[i-1].dev + mddev->raid_disks;
-
printk(KERN_INFO "raid0: zone %d\n", i);
zone->dev_start = current_start;
smallest = NULL;
c = 0;
-
- for (j=0; j<cnt; j++) {
+ for (j = 0; j < conf->strip_zone[0].nb_dev; j++) {
char b[BDEVNAME_SIZE];
rdev = conf->strip_zone[0].dev[j];
printk(KERN_INFO "raid0: checking %s ...",
@@ -197,25 +213,33 @@ static int create_strip_zones (mddev_t *mddev)
zone->sectors = (smallest->sectors - current_start) * c;
printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n",
zone->nb_dev, (unsigned long long)zone->sectors);
-
zone->zone_start = curr_zone_start;
curr_zone_start += zone->sectors;
-
current_start = smallest->sectors;
printk(KERN_INFO "raid0: current zone start: %llu\n",
(unsigned long long)current_start);
}
+}
- /* Now find appropriate hash spacing.
- * We want a number which causes most hash entries to cover
- * at most two strips, but the hash table must be at most
- * 1 PAGE. We choose the smallest strip, or contiguous collection
- * of strips, that has big enough size. We never consider the last
- * strip though as it's size has no bearing on the efficacy of the hash
- * table.
- */
- conf->spacing = curr_zone_start;
- min_spacing = curr_zone_start;
+
+/* Now find appropriate hash spacing.
+ * We want a number which causes most hash entries to cover
+ * at most two strips, but the hash table must be at most
+ * 1 PAGE. We choose the smallest strip, or contiguous collection
+ * of strips, that has big enough size. We never consider the last
+ * strip though as it's size has no bearing on the efficacy of the hash
+ * table.
+ */
+static void raid0_find_hash_spacing(mddev_t *mddev)
+{
+ int i, j;
+ sector_t min_spacing;
+ raid0_conf_t *conf = mddev_to_conf(mddev);
+
+ conf->spacing = 0;
+ for (i = 0; i < conf->nr_strip_zones; i++)
+ conf->spacing += conf->strip_zone[i].sectors;
+ min_spacing = conf->spacing;
sector_div(min_spacing, PAGE_SIZE/sizeof(struct strip_zone*));
for (i=0; i < conf->nr_strip_zones-1; i++) {
sector_t s = 0;
@@ -225,16 +249,31 @@ static int create_strip_zones (mddev_t *mddev)
if (s >= min_spacing && s < conf->spacing)
conf->spacing = s;
}
+}
- mddev->queue->unplug_fn = raid0_unplug;
+static int raid0_create_strip_zones(mddev_t *mddev, struct list_head *disks)
+{
+ raid0_conf_t *conf = mddev_to_conf(mddev);
+ raid0_count_zones(mddev, disks);
+ conf->strip_zone = kzalloc(sizeof(struct strip_zone)*
+ conf->nr_strip_zones, GFP_KERNEL);
+ if (!conf->strip_zone)
+ return 1;
+ conf->devlist = kzalloc(sizeof(mdk_rdev_t *)*
+ conf->nr_strip_zones*mddev->raid_disks,
+ GFP_KERNEL);
+ if (!conf->devlist)
+ return 1;
+ if (raid0_create_first_zone(mddev, disks))
+ return 1;
+ raid0_set_higher_zones(mddev);
+ raid0_find_hash_spacing(mddev);
+ mddev->queue->unplug_fn = raid0_unplug;
mddev->queue->backing_dev_info.congested_fn = raid0_congested;
mddev->queue->backing_dev_info.congested_data = mddev;
-
printk(KERN_INFO "raid0: done.\n");
return 0;
- abort:
- return 1;
}
/**
@@ -265,79 +304,73 @@ static int raid0_mergeable_bvec(struct request_queue *q,
static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks)
{
- sector_t array_sectors = 0;
+ int i;
mdk_rdev_t *rdev;
-
- WARN_ONCE(sectors || raid_disks,
- "%s does not support generic reshape\n", __func__);
-
- list_for_each_entry(rdev, &mddev->disks, same_set)
- array_sectors += rdev->sectors;
-
+ sector_t array_sectors = 0;
+ raid0_conf_t *conf = mddev_to_conf(mddev);
+ mdk_rdev_t **devlist = conf->strip_zone[0].dev;
+ for (i = 0; i < mddev->raid_disks; i++) {
+ rdev = devlist[i];
+ if (test_bit(In_sync, &rdev->flags))
+ array_sectors += rdev->sectors;
+ }
return array_sectors;
}
-static int raid0_run (mddev_t *mddev)
+static void raid0_set_queue_limits(mddev_t *mddev)
{
- unsigned cur=0, i=0, nb_zone;
- s64 sectors;
- raid0_conf_t *conf;
+ mdk_rdev_t *rdev;
- if (mddev->chunk_size == 0) {
- printk(KERN_ERR "md/raid0: non-zero chunk size required.\n");
- return -EINVAL;
- }
- printk(KERN_INFO "%s: setting max_sectors to %d, segment boundary to %d\n",
- mdname(mddev),
- mddev->chunk_size >> 9,
- (mddev->chunk_size>>1)-1);
- blk_queue_max_sectors(mddev->queue, mddev->chunk_size >> 9);
- blk_queue_segment_boundary(mddev->queue, (mddev->chunk_size>>1) - 1);
- mddev->queue->queue_lock = &mddev->queue->__queue_lock;
+ list_for_each_entry(rdev, &mddev->disks, same_set) {
+ blk_queue_stack_limits(mddev->queue,
+ rdev->bdev->bd_disk->queue);
+ /* as we don't honour merge_bvec_fn, we must never risk
+ * violating it, so limit ->max_sector to one PAGE, as
+ * a one page request is never in violation.
+ */
+ if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
+ mddev->queue->max_sectors > (PAGE_SIZE>>9))
+ blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
- conf = kmalloc(sizeof (raid0_conf_t), GFP_KERNEL);
- if (!conf)
- goto out;
- mddev->private = (void *)conf;
-
- conf->strip_zone = NULL;
- conf->devlist = NULL;
- if (create_strip_zones (mddev))
- goto out_free_conf;
+ }
+}
- /* calculate array device size */
- md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
+static int raid0_set_array_hash(mddev_t *mddev)
+{
+ int nb_zone = 0;
+ sector_t space;
+ int round;
+ sector_t s , sectors;
+ int cur = 0, i = 0;
+ raid0_conf_t *conf = mddev_to_conf(mddev);
printk(KERN_INFO "raid0 : md_size is %llu sectors.\n",
(unsigned long long)mddev->array_sectors);
printk(KERN_INFO "raid0 : conf->spacing is %llu sectors.\n",
(unsigned long long)conf->spacing);
- {
- sector_t s = raid0_size(mddev, 0, 0);
- sector_t space = conf->spacing;
- int round;
- conf->sector_shift = 0;
- if (sizeof(sector_t) > sizeof(u32)) {
- /*shift down space and s so that sector_div will work */
- while (space > (sector_t) (~(u32)0)) {
- s >>= 1;
- space >>= 1;
- s += 1; /* force round-up */
- conf->sector_shift++;
- }
+
+ s = raid0_size(mddev, 0, mddev->raid_disks);
+ space = conf->spacing;
+ conf->sector_shift = 0;
+ if (sizeof(sector_t) > sizeof(u32)) {
+ /*shift down space and s so that sector_div will work */
+ while (space > (sector_t) (~(u32)0)) {
+ s >>= 1;
+ space >>= 1;
+ s += 1; /* force round-up */
+ conf->sector_shift++;
}
- round = sector_div(s, (u32)space) ? 1 : 0;
- nb_zone = s + round;
}
+ round = sector_div(s, (u32)space) ? 1 : 0;
+ nb_zone = s + round;
printk(KERN_INFO "raid0 : nb_zone is %d.\n", nb_zone);
printk(KERN_INFO "raid0 : Allocating %zu bytes for hash.\n",
nb_zone*sizeof(struct strip_zone*));
conf->hash_table = kmalloc (sizeof (struct strip_zone *)*nb_zone, GFP_KERNEL);
if (!conf->hash_table)
- goto out_free_conf;
+ return -1;
sectors = conf->strip_zone[cur].sectors;
-
conf->hash_table[0] = conf->strip_zone + cur;
for (i=1; i< nb_zone; i++) {
while (sectors <= conf->spacing) {
@@ -354,24 +387,59 @@ static int raid0_run (mddev_t *mddev)
*/
conf->spacing++;
}
+ return 0;
+}
- /* calculate the max read-ahead size.
- * For read-ahead of large files to be effective, we need to
- * readahead at least twice a whole stripe. i.e. number of devices
- * multiplied by chunk size times 2.
- * If an individual device has an ra_pages greater than the
- * chunk size, then we will not drive that device as hard as it
- * wants. We consider this a configuration error: a larger
- * chunksize should be used in that case.
- */
- {
- int stripe = mddev->raid_disks * mddev->chunk_size / PAGE_SIZE;
- if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
- mddev->queue->backing_dev_info.ra_pages = 2* stripe;
- }
+/* calculate the max read-ahead size.
+ * For read-ahead of large files to be effective, we need to
+ * readahead at least twice a whole stripe. i.e. number of devices
+ * multiplied by chunk size times 2.
+ * If an individual device has an ra_pages greater than the
+ * chunk size, then we will not drive that device as hard as it
+ * wants. We consider this a configuration error: a larger
+ * chunksize should be used in that case.
+ */
+static void raid0_set_max_ra(mddev_t *mddev)
+{
+ int stripe = mddev->raid_disks * mddev->chunk_size / PAGE_SIZE;
+ if (mddev->queue->backing_dev_info.ra_pages < 2*stripe)
+ mddev->queue->backing_dev_info.ra_pages = 2*stripe;
+}
+
+static int raid0_run(mddev_t *mddev)
+{
+ raid0_conf_t *conf;
+ if (mddev->chunk_size == 0) {
+ printk(KERN_ERR "md/raid0: non-zero chunk size required.\n");
+ return -EINVAL;
+ }
+ printk(KERN_INFO "%s: setting max_sectors"
+ " to %d, segment boundary to %d\n",
+ mdname(mddev),
+ mddev->chunk_size >> 9,
+ (mddev->chunk_size>>1)-1);
+ blk_queue_max_sectors(mddev->queue, mddev->chunk_size >> 9);
+ blk_queue_segment_boundary(mddev->queue, (mddev->chunk_size>>1) - 1);
+ mddev->queue->queue_lock = &mddev->queue->__queue_lock;
+ conf = kmalloc(sizeof(raid0_conf_t), GFP_KERNEL);
+ if (!conf)
+ goto out;
+ mddev->private = (void *)conf;
+ conf->strip_zone = NULL;
+ conf->devlist = NULL;
+ if (raid0_create_strip_zones(mddev, &mddev->disks))
+ goto out_free_conf;
+ /* calculate array device size */
+ md_set_array_sectors(mddev, raid0_size(mddev, 0, mddev->raid_disks));
+ raid0_set_array_hash(mddev);
+ raid0_set_queue_limits(mddev);
+ raid0_set_max_ra(mddev);
blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec);
+ raid0_dump_zones(mddev);
+ raid0_create_reshape_thread(mddev);
+ init_completion(&conf->wait_reshape);
return 0;
out_free_conf:
@@ -386,7 +454,10 @@ out:
static int raid0_stop (mddev_t *mddev)
{
raid0_conf_t *conf = mddev_to_conf(mddev);
-
+ if (mddev->thread) {
+ md_unregister_thread(mddev->thread);
+ mddev->thread = 0;
+ }
blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
kfree(conf->hash_table);
conf->hash_table = NULL;
@@ -414,7 +485,10 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio)
bio_endio(bio, -EOPNOTSUPP);
return 0;
}
-
+ if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
+ bio_endio(bio, -EBUSY);
+ return 0;
+ }
cpu = part_stat_lock();
part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
@@ -513,6 +587,357 @@ static void raid0_status (struct seq_file *seq, mddev_t *mddev)
return;
}
+#ifdef CONFIG_MD_RAID0_RESHAPE
+
+#define DEBUG 0
+#define r0_dprintk(x...) ((void)(DEBUG && printk(x)))
+
+static void raid0_reshape_endio(struct bio *bi, int error)
+{
+ struct completion* w = (struct completion *)bi->bi_private;
+ int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
+ r0_dprintk("raid0: endio: sec=%lld:size=%d "
+ "bvlen=%d bvoff=%d \n",
+ (unsigned long long)bi->bi_sector,
+ bi->bi_size,
+ bi->bi_io_vec[0].bv_len,
+ bi->bi_io_vec[0].bv_offset);
+ if (!error || uptodate)
+ return (void)complete(w);
+ printk("raid0: end reshape: io error sector=%llu\n",
+ (unsigned long long)bi->bi_sector);
+}
+
+static int raid0_reshape_rw(struct bio *bi, int dir, int size)
+{
+ char b[BDEVNAME_SIZE];
+ bi->bi_rw = dir;
+ bi->bi_size = size;
+ bi->bi_idx = 0;
+ r0_dprintk("%s %c %llu sec size=%d\n",
+ bdevname(bi->bi_bdev, b),
+ dir == 0 ? 'R' : 'W',
+ (unsigned long long)bi->bi_sector, bi->bi_size);
+ generic_make_request(bi);
+ wait_for_completion((struct completion *)(bi->bi_private));
+ return 0;
+}
+
+static struct strip_zone *raid0_point_to_zone(mddev_t *mddev,
+ sector_t sector)
+{
+ sector_t x;
+ struct strip_zone *zone;
+ raid0_conf_t *conf = mddev_to_conf(mddev);
+
+ x = sector >> conf->sector_shift;
+ sector_div(x, (u32)conf->spacing);
+ zone = conf->hash_table[x];
+ while (sector >= zone->zone_start + zone->sectors)
+ zone++;
+ return zone;
+}
+
+
+static int raid0_point_bio_to_disk(struct bio *bio, sector_t raid_sector,
+ mddev_t *mddev)
+{
+ int chunksect_bits;
+ mdk_rdev_t *tmp_dev;
+ sector_t x, chunk_sects, chunk, rsect;
+ sector_t sect_in_chunk;
+ struct strip_zone *zone;
+
+ chunk_sects = mddev->chunk_size >> 9;
+ chunksect_bits = ffz(~chunk_sects);
+
+ zone = raid0_point_to_zone(mddev, raid_sector);
+ sect_in_chunk = raid_sector & (chunk_sects - 1);
+ x = (raid_sector - zone->zone_start) >> chunksect_bits;
+ sector_div(x, zone->nb_dev);
+ chunk = x;
+ x = raid_sector >> chunksect_bits;
+ tmp_dev = zone->dev[sector_div(x, zone->nb_dev)];
+ rsect = (chunk << chunksect_bits) + zone->dev_start + sect_in_chunk;
+
+ bio->bi_bdev = tmp_dev->bdev;
+ bio->bi_sector = rsect + tmp_dev->data_offset;
+ return 0;
+}
+
+
+static void raid0_take_speed(mddev_t *mddev, sector_t raid_sector)
+{
+ if ((jiffies-mddev->resync_mark) < 1000)
+ return;
+ mddev->resync_mark = jiffies;
+ mddev->resync_mark_cnt = raid_sector;
+}
+
+
+static sector_t raid0_reshape_move_blocks(mddev_t *mddev,
+ mddev_t *mddev_target,
+ struct strip_zone *zone)
+{
+ raid0_conf_t *conf = mddev_to_conf(mddev);
+ struct bio *bi = conf->reshape_bi;
+ int io_size = bi->bi_size;
+ sector_t raid_sector = zone->zone_start;
+ sector_t last_sector = (zone->zone_start + zone->sectors);
+ mddev->curr_mark_cnt = io_size>>10;
+
+ while (raid_sector < last_sector && !kthread_should_stop()) {
+ raid0_take_speed(mddev, raid_sector);
+ if (raid0_point_bio_to_disk(bi, raid_sector, mddev)) {
+ printk(KERN_ERR "raid0:reshape point"
+ " read to bio failed\n");
+ break;
+ }
+ raid0_reshape_rw(bi, READ, io_size);
+ if (raid0_point_bio_to_disk(bi, raid_sector, mddev_target)) {
+ printk(KERN_ERR "raid0: point write to bio failed\n");
+ break;
+ }
+ raid0_reshape_rw(bi, WRITE, io_size);
+ raid_sector += io_size>>9;
+ mddev->curr_mark_cnt = raid_sector;
+ mddev->curr_resync = raid_sector;
+ }
+ bi->bi_size = io_size;
+ return raid_sector - zone->zone_start;
+}
+
+
+static void raid0_reshape_move_zones(mddev_t *mddev, mddev_t *mddev_target)
+{
+ raid0_conf_t *conf = mddev_to_conf(mddev);
+ sector_t raid_sector = 0;
+ int i = 0;
+ for (; i < conf->nr_strip_zones && !kthread_should_stop() ; i++)
+ raid_sector += raid0_reshape_move_blocks(mddev,
+ mddev_target,
+ &conf->strip_zone[i]);
+ if (raid_sector == mddev->array_sectors) {
+ printk(KERN_INFO "raid0: reshape ended %llu sectors moved OK\n",
+ (unsigned long long)raid_sector);
+ } else {
+ printk(KERN_INFO "raid0: reshape ended %llu sectors moved BAD\n",
+ (unsigned long long)raid_sector);
+ }
+}
+
+
+static int raid0_reshape_prepare(mddev_t *mddev, mddev_t *mddev_target)
+{
+ raid0_conf_t *conf;
+ mddev_target->private = NULL;
+ conf = kzalloc(sizeof(raid0_conf_t), GFP_KERNEL);
+ if (!conf)
+ return -1;
+ mddev_target->private = (void *)conf;
+ conf->strip_zone = NULL;
+ conf->devlist = NULL;
+ if (raid0_create_strip_zones(mddev_target, &mddev->disks))
+ return -1;
+ return raid0_set_array_hash(mddev_target);
+}
+
+
+static mddev_t *raid0_clone_mddev(mddev_t *mddev)
+{
+ void *m = kmalloc(sizeof(*mddev), GFP_NOIO);
+ if (!m)
+ return NULL;
+ memcpy(m, mddev, sizeof(*mddev));
+ return (mddev_t *)m;
+}
+
+static int raid0_reshape_iosize(mddev_t *mddev)
+{
+ int chunk_size_sectors = (mddev->chunk_size / PAGE_SIZE)*8;
+
+ if (mddev->queue->max_hw_sectors >= chunk_size_sectors)
+ return chunk_size_sectors;
+ if ((chunk_size_sectors % mddev->queue->max_hw_sectors) == 0)
+ return mddev->queue->max_hw_sectors;
+ return chunk_size_sectors /
+ ((chunk_size_sectors / mddev->queue->max_hw_sectors)*2);
+}
+
+
+static mddev_t *raid0_reshape_init(mddev_t *mddev)
+{
+ int i;
+ mddev_t *mddev_target = NULL;
+ mdk_rdev_t *rdev = NULL;
+ int nraid_disks = 0;
+ struct bio *bi = NULL;
+ raid0_conf_t *conf = mddev_to_conf(mddev);
+ int pages = raid0_reshape_iosize(mddev)/8;
+ if (pages == 0) {
+ printk(KERN_INFO "raid0: failed to "
+ "determine transfer size\n");
+ return NULL;
+ }
+ printk("raid0: using transfer size %usectors\n", pages*8);
+ bi = bio_alloc(GFP_NOIO, pages);
+ if (!bi) {
+ printk(KERN_INFO "raid0:failed too alloc bio for"
+ " reshaping. rejecting\n");
+ goto RAID0_RESHAPE_INIT_EXIT_BAD;
+ }
+ mddev_target = raid0_clone_mddev(mddev);
+ bi->bi_vcnt = 0;
+ if (!mddev_target) {
+ printk(KERN_INFO "raid0: failed to clone mddev\n");
+ goto RAID0_RESHAPE_INIT_EXIT_BAD;
+ }
+ mddev->reshape_position = 0;
+ mddev->delta_disks = 0;
+ atomic_set(&mddev->recovery_active, 0);
+ nraid_disks = mddev->raid_disks;
+
+ list_for_each_entry(rdev, &mddev->disks, same_set) {
+ if (!test_bit(In_sync, &rdev->flags)) {
+ rdev->raid_disk = nraid_disks++;
+ rdev->desc_nr = rdev->raid_disk;
+ set_bit(In_sync, &rdev->flags);
+ }
+ }
+ mddev_target->raid_disks = nraid_disks;
+ if (raid0_reshape_prepare(mddev, mddev_target)) {
+ printk(KERN_INFO "raid0: failed to"
+ " setup temporary mappings\n");
+ goto RAID0_RESHAPE_INIT_EXIT_BAD;
+ }
+ bi->bi_vcnt = pages;
+ for (i = 0; i < bi->bi_vcnt; i++) {
+ bi->bi_io_vec[i].bv_len = PAGE_SIZE;
+ bi->bi_io_vec[i].bv_offset = 0;
+ bi->bi_io_vec[i].bv_page = alloc_page(GFP_NOIO);
+ get_page(bi->bi_io_vec[i].bv_page);
+ }
+ bi->bi_next = NULL;
+ bi->bi_end_io = raid0_reshape_endio;
+ bi->bi_size = PAGE_SIZE * bi->bi_vcnt;
+ bi->bi_private = &conf->wait_reshape;
+ bi->bi_idx = 0;
+ conf->reshape_bi = bi;
+ return mddev_target;
+
+RAID0_RESHAPE_INIT_EXIT_BAD:
+ kfree(mddev_target);
+ for (i = 0; i < bi->bi_vcnt; i++)
+ safe_put_page(bi->bi_io_vec[i].bv_page);
+ if (bi)
+ bio_put(bi);
+ return NULL;
+}
+
+
+static void raid0_reshape_thread(mddev_t *mddev)
+{
+ int i = 0;
+ mddev_t *mddev_target = 0;
+ raid0_conf_t *conf = mddev_to_conf(mddev);
+
+ if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+ return;
+ clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+ mddev_target = raid0_reshape_init(mddev);
+ if (!mddev_target)
+ return;
+ raid0_reshape_move_zones(mddev, mddev_target);
+ if (kthread_should_stop())
+ goto RAID0_RELEASE_PSEUDO_RAID;
+ for (i = 0; i < conf->reshape_bi->bi_vcnt; i++)
+ safe_put_page(conf->reshape_bi->bi_io_vec[i].bv_page);
+ bio_put(conf->reshape_bi);
+ mddev->resync_mark = 0L;
+ mddev->resync_mark_cnt = 0L;
+ mddev->curr_resync = 0;
+ mddev->recovery_cp = MaxSector;
+ mddev->reshape_position = MaxSector;
+ mddev->raid_disks = mddev_target->raid_disks;
+ kfree(conf->hash_table);
+ kfree(conf);
+ mutex_lock(&mddev->reconfig_mutex);
+ raid0_run(mddev);
+RAID0_RELEASE_PSEUDO_RAID:
+ if (!mutex_is_locked(&mddev->reconfig_mutex))
+ mutex_lock(&mddev->reconfig_mutex);
+ mddev->in_sync = 1;
+ if (md_allow_write(mddev)) {
+ printk("raid0: did not write sb"
+ " critical error\n");
+ }
+ mutex_unlock(&mddev->reconfig_mutex);
+ clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+ conf = mddev_target->private;
+ kfree(conf->hash_table);
+ kfree(conf->strip_zone);
+ kfree(conf->devlist);
+ kfree(mddev_target);
+}
+
+
+static int raid0_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ mdk_rdev_t *rdev1;
+ if (rdev->sectors < (mddev->chunk_size>>11)) {
+ printk(KERN_INFO "raid0: device smaller than "
+ "chunk size %llusectors < %llusectors\n",
+ (unsigned long long)rdev->sectors,
+ ((unsigned long long)mddev->chunk_size)>>10);
+ return -1;
+ }
+ if (rdev->bdev->bd_disk->queue->max_hw_sectors <
+ mddev->queue->max_hw_sectors) {
+ printk(KERN_INFO "raid0: device trasnfer"
+ " size %usectors is smaller than other"
+ "raid's components %usectors, rejecting ",
+ rdev->bdev->bd_disk->queue->max_hw_sectors,
+ mddev->queue->max_hw_sectors);
+ return -1;
+ }
+ list_for_each_entry(rdev1, &mddev->disks, same_set) {
+ if (rdev1 == rdev) {
+ clear_bit(In_sync, &rdev->flags);
+ return 0;
+ }
+ }
+ return -1;
+}
+
+
+static int raid0_create_reshape_thread(mddev_t *mddev)
+{
+ if (mddev->thread)
+ return 0;
+ mddev->thread = md_register_thread(
+ raid0_reshape_thread,
+ mddev, "%s_raid0");
+ if (!mddev->thread) {
+ printk(KERN_ERR
+ "raid0: couldn't allocate thread for %s\n",
+ mdname(mddev));
+ return -1;
+ }
+ mddev->recovery_cp = MaxSector;
+ return 0;
+}
+
+
+static int raid0_reshape(mddev_t *mddev)
+{
+ set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ return 0;
+}
+
+#endif
+
static struct mdk_personality raid0_personality=
{
.name = "raid0",
@@ -523,6 +948,10 @@ static struct mdk_personality raid0_personality=
.stop = raid0_stop,
.status = raid0_status,
.size = raid0_size,
+#ifdef CONFIG_MD_RAID0_RESHAPE
+ .check_reshape = raid0_reshape,
+ .hot_add_disk = raid0_add_disk,
+#endif
};
static int __init raid0_init (void)
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
index 824b12e..ff2dca9 100644
--- a/drivers/md/raid0.h
+++ b/drivers/md/raid0.h
@@ -14,9 +14,10 @@ struct raid0_private_data
{
struct strip_zone **hash_table; /* Table of indexes into strip_zone */
struct strip_zone *strip_zone;
- mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */
+ mdk_rdev_t **devlist;/* lists of rdevs, pointed to by strip_zone->dev */
int nr_strip_zones;
-
+ struct bio *reshape_bi;
+ struct completion wait_reshape;
sector_t spacing;
int sector_shift; /* shift this before divide by spacing */
};