* Subject: [001/002 ] raid0 reshape
@ 2009-05-02 21:46 raz ben yehuda
  2009-05-10 22:31 ` Neil Brown
  2009-05-19 18:09 ` Dan Williams
  0 siblings, 2 replies; 17+ messages in thread
From: raz ben yehuda @ 2009-05-02 21:46 UTC (permalink / raw)
  To: neilb; +Cc: linux-raid

Hello Neil,
Below is the raid0 grow code. I have decided to fix raid0 itself rather
than perform the raid0 -> raid4 -> raid0 transformation, for two reasons:
1. raid0 zones: this patch supports any zone transformation (a small
   illustration of zones follows below).
2. It avoids an undesired dependency of raid0 on the raid4 re-striping code.
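
To illustrate what I mean by zones, here is a small stand-alone sketch
(illustration only, not part of the patch; the member sizes are invented)
of how raid0 groups devices of equal remaining capacity into zones:

#include <stdio.h>

int main(void)
{
	/* hypothetical member sizes in sectors: two small, one large */
	unsigned long long dev[] = { 100, 250, 100 };
	const int ndev = 3;
	unsigned long long used = 0, zone_start = 0;
	int more = 1, z = 0;

	while (more) {
		unsigned long long smallest = ~0ULL;
		int i, width = 0;

		/* members with capacity left take part in this zone;
		 * the smallest remaining capacity sets the zone depth */
		for (i = 0; i < ndev; i++) {
			if (dev[i] <= used)
				continue;
			width++;
			if (dev[i] - used < smallest)
				smallest = dev[i] - used;
		}
		printf("zone%d: start=%llu ndev=%d sectors=%llu\n",
		       z++, zone_start, width, smallest * width);
		zone_start += smallest * width;
		used += smallest;
		more = 0;
		for (i = 0; i < ndev; i++)
			if (dev[i] > used)
				more = 1;
	}
	return 0;
}

For these sizes it prints two zones: 300 sectors striped over all three
members, then 150 sectors on the large member alone. With unequal members
there is always more than one zone, and the raid0->raid4->raid0 path
assumes equal members, which is why it cannot cover the general case.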

The following tests were conducted:
1. Various chunk sizes, 4K to 512K (mainly on 2.6.27 and 2.6.18).
2. Regrow (tested on 2.6.27 and 2.6.18).
3. Various superblocks: 0.9, 1.0, 1.1 and 1.2 (mainly on 2.6.27 and 2.6.18).
4. Assembling and mounting with older raid code (older kernels, and code predating this patch) after the array was grown.

The patch passes checkpatch.pl. Apart from the reshaping code, I also tidied up the existing code.
I am about to hand this code to our testing team for further tests.
Other things to do:
1. Speed up the reshape process; it is too slow.
2. Support chunks whose size is not a power of two (in page-size units).

I will be thankful for your criticism.

Raz


 drivers/md/Kconfig |   13 
 drivers/md/md.c    |    6 
 drivers/md/raid0.c |  711 ++++++++++++++++++++++++++++++++++---------
 drivers/md/raid0.h |    5 
 4 files changed, 590 insertions(+), 145 deletions(-)

Signed-off-by:  Neil Brown <neilb@suse.de>
---
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 36e0675..a9f0ff6 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -77,6 +77,19 @@ config MD_RAID0
 
 	  If unsure, say Y.
 
+config MD_RAID0_RESHAPE
+	bool "Support adding drives to a raid-0 array.(EXPERIMENTAL)"
+	depends on MD_RAID0 && EXPERIMENTAL
+	default n
+	---help---
+	  A RAID-0 set can be expanded by adding extra drives. This
+	  requires "restriping" .
+	  You will need mdadm version 2.4.x or later to use this.
+	  The mdadm usage is e.g.
+	       mdadm --grow /dev/md0 --raid-disks=6
+	  Note: The array can only be expanded.
+	  If unsure, say N.
+
 config MD_RAID1
 	tristate "RAID-1 (mirroring) mode"
 	depends on BLK_DEV_MD
diff --git a/drivers/md/md.c b/drivers/md/md.c
index ed5727c..82f57ea 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5707,6 +5707,8 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
 		max_blocks = mddev->resync_max_sectors >> 1;
 	else
 		max_blocks = mddev->dev_sectors / 2;
+	if (mddev->level == 0)
+		max_blocks = mddev->array_sectors>>1;
 
 	/*
 	 * Should not happen.
@@ -5915,7 +5917,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
 		if (mddev->pers) {
 			mddev->pers->status(seq, mddev);
 	 		seq_printf(seq, "\n      ");
-			if (mddev->pers->sync_request) {
+			if (mddev->pers->sync_request || !mddev->level) {
 				if (mddev->curr_resync > 2) {
 					status_resync(seq, mddev);
 					seq_printf(seq, "\n      ");
@@ -6146,7 +6148,7 @@ int md_allow_write(mddev_t *mddev)
 		return 0;
 	if (mddev->ro)
 		return 0;
-	if (!mddev->pers->sync_request)
+	if (!mddev->pers->sync_request && mddev->level != 0)
 		return 0;
 
 	spin_lock_irq(&mddev->write_lock);
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index c08d755..9e2b6de 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -18,11 +18,14 @@
    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  
 */
 
+#include <linux/kthread.h>
 #include <linux/blkdev.h>
 #include <linux/seq_file.h>
 #include "md.h"
 #include "raid0.h"
 
+static int raid0_create_reshape_thread(mddev_t *mddev);
+
 static void raid0_unplug(struct request_queue *q)
 {
 	mddev_t *mddev = q->queuedata;
@@ -53,27 +56,46 @@ static int raid0_congested(void *data, int bits)
 }
 

-static int create_strip_zones (mddev_t *mddev)
+static void raid0_dump_zones(mddev_t *mddev)
 {
-	int i, c, j;
-	sector_t current_start, curr_zone_start;
-	sector_t min_spacing;
+	int j, k, h;
+	char b[BDEVNAME_SIZE];
 	raid0_conf_t *conf = mddev_to_conf(mddev);
-	mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev;
-	struct strip_zone *zone;
-	int cnt;
+	printk(KERN_INFO "***** %s configuration ******\n\n",
+		mdname(mddev));
+	h = 0;
+	for (j = 0; j < conf->nr_strip_zones; j++) {
+		printk(KERN_INFO "zone%d", j);
+		if (conf->hash_table[h] == conf->strip_zone+j)
+			printk("(h%d)", h++);
+		printk(KERN_INFO "=[");
+		for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
+			printk(KERN_INFO "%s/", bdevname(
+				conf->strip_zone[j].dev[k]->bdev, b));
+		printk(KERN_INFO "]\n\t zone offset=%llu device offset=%llu size=%llukb\n",
+			(unsigned long long)conf->strip_zone[j].zone_start,
+			(unsigned long long)conf->strip_zone[j].dev_start,
+			(unsigned long long)conf->strip_zone[j].sectors>>1);
+	}
+	printk(KERN_INFO "**********************************\n\n");
+}
+
+
+static void raid0_count_zones(mddev_t *mddev, struct list_head *disks)
+{
+	int c = 0;
 	char b[BDEVNAME_SIZE];
- 
+	mdk_rdev_t  *rdev1, *rdev2;
+	raid0_conf_t *conf = mddev_to_conf(mddev);
 	/*
 	 * The number of 'same size groups'
 	 */
 	conf->nr_strip_zones = 0;
- 
-	list_for_each_entry(rdev1, &mddev->disks, same_set) {
+	list_for_each_entry(rdev1, disks, same_set) {
 		printk(KERN_INFO "raid0: looking at %s\n",
 			bdevname(rdev1->bdev,b));
 		c = 0;
-		list_for_each_entry(rdev2, &mddev->disks, same_set) {
+		list_for_each_entry(rdev2, disks, same_set) {
 			printk(KERN_INFO "raid0:   comparing %s(%llu)",
 			       bdevname(rdev1->bdev,b),
 			       (unsigned long long)rdev1->sectors);
@@ -103,78 +125,72 @@ static int create_strip_zones (mddev_t *mddev)
 		}
 	}
 	printk(KERN_INFO "raid0: FINAL %d zones\n", conf->nr_strip_zones);
+}
 
-	conf->strip_zone = kzalloc(sizeof(struct strip_zone)*
-				conf->nr_strip_zones, GFP_KERNEL);
-	if (!conf->strip_zone)
-		return 1;
-	conf->devlist = kzalloc(sizeof(mdk_rdev_t*)*
-				conf->nr_strip_zones*mddev->raid_disks,
-				GFP_KERNEL);
-	if (!conf->devlist)
-		return 1;
 
-	/* The first zone must contain all devices, so here we check that
-	 * there is a proper alignment of slots to devices and find them all
-	 */
-	zone = &conf->strip_zone[0];
-	cnt = 0;
-	smallest = NULL;
-	zone->dev = conf->devlist;
-	list_for_each_entry(rdev1, &mddev->disks, same_set) {
-		int j = rdev1->raid_disk;
+/*
+ * The first zone must contain all devices, so here we check that
+ * there is a proper alignment of slots to devices and find them all
+ */
+static int raid0_create_first_zone(mddev_t *mddev, struct list_head *disks)
+{
+	mdk_rdev_t *smallest = NULL;
+	mdk_rdev_t  *rdev;
+	int cnt = 0;
+	raid0_conf_t *conf = mddev_to_conf(mddev);
+	struct strip_zone *zone0 = &conf->strip_zone[0];
 
+	zone0->dev = conf->devlist;
+	list_for_each_entry(rdev, disks, same_set) {
+		int j = rdev->raid_disk;
 		if (j < 0 || j >= mddev->raid_disks) {
 			printk(KERN_ERR "raid0: bad disk number %d - "
 				"aborting!\n", j);
-			goto abort;
+			return -1;
 		}
-		if (zone->dev[j]) {
+		if (zone0->dev[j]) {
 			printk(KERN_ERR "raid0: multiple devices for %d - "
 				"aborting!\n", j);
-			goto abort;
+			return -1;
 		}
-		zone->dev[j] = rdev1;
-
-		blk_queue_stack_limits(mddev->queue,
-				       rdev1->bdev->bd_disk->queue);
-		/* as we don't honour merge_bvec_fn, we must never risk
-		 * violating it, so limit ->max_sector to one PAGE, as
-		 * a one page request is never in violation.
-		 */
-
-		if (rdev1->bdev->bd_disk->queue->merge_bvec_fn &&
-		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
-			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
-
-		if (!smallest || (rdev1->sectors < smallest->sectors))
-			smallest = rdev1;
+		zone0->dev[j] = rdev;
+		if (!smallest || (rdev->sectors < smallest->sectors))
+			smallest = rdev;
 		cnt++;
 	}
 	if (cnt != mddev->raid_disks) {
 		printk(KERN_ERR "raid0: too few disks (%d of %d) - "
 			"aborting!\n", cnt, mddev->raid_disks);
-		goto abort;
+		return -1;
 	}
-	zone->nb_dev = cnt;
-	zone->sectors = smallest->sectors * cnt;
-	zone->zone_start = 0;
+	zone0->nb_dev = cnt;
+	zone0->sectors = smallest->sectors * cnt;
+	zone0->zone_start = 0;
+	return 0;
+}
+
+
 
-	current_start = smallest->sectors;
-	curr_zone_start = zone->sectors;
+static void raid0_set_higher_zones(mddev_t *mddev)
+{
+	int i, j, c;
+	mdk_rdev_t *rdev;
+	struct strip_zone *zone;
+	raid0_conf_t *conf = mddev_to_conf(mddev);
+	mdk_rdev_t *smallest;
+	sector_t current_start =
+		conf->strip_zone[0].sectors/conf->strip_zone[0].nb_dev;
+	sector_t curr_zone_start = conf->strip_zone[0].sectors;
 
 	/* now do the other zones */
-	for (i = 1; i < conf->nr_strip_zones; i++)
-	{
+	for (i = 1; i < conf->nr_strip_zones; i++) {
 		zone = conf->strip_zone + i;
 		zone->dev = conf->strip_zone[i-1].dev + mddev->raid_disks;
-
 		printk(KERN_INFO "raid0: zone %d\n", i);
 		zone->dev_start = current_start;
 		smallest = NULL;
 		c = 0;
-
-		for (j=0; j<cnt; j++) {
+		for (j = 0; j < conf->strip_zone[0].nb_dev; j++) {
 			char b[BDEVNAME_SIZE];
 			rdev = conf->strip_zone[0].dev[j];
 			printk(KERN_INFO "raid0: checking %s ...",
@@ -197,25 +213,33 @@ static int create_strip_zones (mddev_t *mddev)
 		zone->sectors = (smallest->sectors - current_start) * c;
 		printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n",
 			zone->nb_dev, (unsigned long long)zone->sectors);
-
 		zone->zone_start = curr_zone_start;
 		curr_zone_start += zone->sectors;
-
 		current_start = smallest->sectors;
 		printk(KERN_INFO "raid0: current zone start: %llu\n",
 			(unsigned long long)current_start);
 	}
+}
 
-	/* Now find appropriate hash spacing.
-	 * We want a number which causes most hash entries to cover
-	 * at most two strips, but the hash table must be at most
-	 * 1 PAGE.  We choose the smallest strip, or contiguous collection
-	 * of strips, that has big enough size.  We never consider the last
-	 * strip though as it's size has no bearing on the efficacy of the hash
-	 * table.
-	 */
-	conf->spacing = curr_zone_start;
-	min_spacing = curr_zone_start;
+
+/* Now find appropriate hash spacing.
+ * We want a number which causes most hash entries to cover
+ * at most two strips, but the hash table must be at most
+ * 1 PAGE.  We choose the smallest strip, or contiguous collection
+ * of strips, that has big enough size.  We never consider the last
+ * strip though as it's size has no bearing on the efficacy of the hash
+ * table.
+ */
+static void raid0_find_hash_spacing(mddev_t *mddev)
+{
+	int i, j;
+	sector_t min_spacing;
+	raid0_conf_t *conf = mddev_to_conf(mddev);
+
+	conf->spacing = 0;
+	for (i = 0; i < conf->nr_strip_zones; i++)
+		conf->spacing += conf->strip_zone[i].sectors;
+	min_spacing = conf->spacing;
 	sector_div(min_spacing, PAGE_SIZE/sizeof(struct strip_zone*));
 	for (i=0; i < conf->nr_strip_zones-1; i++) {
 		sector_t s = 0;
@@ -225,16 +249,31 @@ static int create_strip_zones (mddev_t *mddev)
 		if (s >= min_spacing && s < conf->spacing)
 			conf->spacing = s;
 	}
+}
 
-	mddev->queue->unplug_fn = raid0_unplug;
+static int raid0_create_strip_zones(mddev_t *mddev, struct list_head *disks)
+{
+	raid0_conf_t *conf = mddev_to_conf(mddev);
 
+	raid0_count_zones(mddev, disks);
+	conf->strip_zone = kzalloc(sizeof(struct strip_zone)*
+				conf->nr_strip_zones, GFP_KERNEL);
+	if (!conf->strip_zone)
+		return 1;
+	conf->devlist = kzalloc(sizeof(mdk_rdev_t *)*
+				conf->nr_strip_zones*mddev->raid_disks,
+				GFP_KERNEL);
+	if (!conf->devlist)
+		return 1;
+	if (raid0_create_first_zone(mddev, disks))
+		return 1;
+	raid0_set_higher_zones(mddev);
+	raid0_find_hash_spacing(mddev);
+	mddev->queue->unplug_fn = raid0_unplug;
 	mddev->queue->backing_dev_info.congested_fn = raid0_congested;
 	mddev->queue->backing_dev_info.congested_data = mddev;
-
 	printk(KERN_INFO "raid0: done.\n");
 	return 0;
- abort:
-	return 1;
 }
 
 /**
@@ -265,79 +304,73 @@ static int raid0_mergeable_bvec(struct request_queue *q,
 
 static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks)
 {
-	sector_t array_sectors = 0;
+	int i;
 	mdk_rdev_t *rdev;
-
-	WARN_ONCE(sectors || raid_disks,
-		  "%s does not support generic reshape\n", __func__);
-
-	list_for_each_entry(rdev, &mddev->disks, same_set)
-		array_sectors += rdev->sectors;
-
+	sector_t array_sectors = 0;
+	raid0_conf_t *conf = mddev_to_conf(mddev);
+	mdk_rdev_t **devlist = conf->strip_zone[0].dev;
+	for (i = 0; i < mddev->raid_disks; i++) {
+		rdev = devlist[i];
+		if (test_bit(In_sync, &rdev->flags))
+			array_sectors += rdev->sectors;
+	}
 	return array_sectors;
 }
 
-static int raid0_run (mddev_t *mddev)
+static void raid0_set_queue_limits(mddev_t *mddev)
 {
-	unsigned  cur=0, i=0, nb_zone;
-	s64 sectors;
-	raid0_conf_t *conf;
+	mdk_rdev_t  *rdev;
 
-	if (mddev->chunk_size == 0) {
-		printk(KERN_ERR "md/raid0: non-zero chunk size required.\n");
-		return -EINVAL;
-	}
-	printk(KERN_INFO "%s: setting max_sectors to %d, segment boundary to %d\n",
-	       mdname(mddev),
-	       mddev->chunk_size >> 9,
-	       (mddev->chunk_size>>1)-1);
-	blk_queue_max_sectors(mddev->queue, mddev->chunk_size >> 9);
-	blk_queue_segment_boundary(mddev->queue, (mddev->chunk_size>>1) - 1);
-	mddev->queue->queue_lock = &mddev->queue->__queue_lock;
+	list_for_each_entry(rdev, &mddev->disks, same_set) {
+		blk_queue_stack_limits(mddev->queue,
+			       rdev->bdev->bd_disk->queue);
+		/* as we don't honour merge_bvec_fn, we must never risk
+		 * violating it, so limit ->max_sector to one PAGE, as
+		 * a one page request is never in violation.
+		 */
+		if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
+		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
+			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
-	conf = kmalloc(sizeof (raid0_conf_t), GFP_KERNEL);
-	if (!conf)
-		goto out;
-	mddev->private = (void *)conf;
- 
-	conf->strip_zone = NULL;
-	conf->devlist = NULL;
-	if (create_strip_zones (mddev)) 
-		goto out_free_conf;
+	}
+}
 
-	/* calculate array device size */
-	md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
+static int raid0_set_array_hash(mddev_t *mddev)
+{
+	int nb_zone = 0;
+	sector_t space;
+	int round;
+	sector_t s , sectors;
+	int  cur = 0, i = 0;
+	raid0_conf_t *conf = mddev_to_conf(mddev);
 
 	printk(KERN_INFO "raid0 : md_size is %llu sectors.\n",
 		(unsigned long long)mddev->array_sectors);
 	printk(KERN_INFO "raid0 : conf->spacing is %llu sectors.\n",
 		(unsigned long long)conf->spacing);
-	{
-		sector_t s = raid0_size(mddev, 0, 0);
-		sector_t space = conf->spacing;
-		int round;
-		conf->sector_shift = 0;
-		if (sizeof(sector_t) > sizeof(u32)) {
-			/*shift down space and s so that sector_div will work */
-			while (space > (sector_t) (~(u32)0)) {
-				s >>= 1;
-				space >>= 1;
-				s += 1; /* force round-up */
-				conf->sector_shift++;
-			}
+
+	s = raid0_size(mddev, 0, mddev->raid_disks);
+	space = conf->spacing;
+	conf->sector_shift = 0;
+	if (sizeof(sector_t) > sizeof(u32)) {
+		/*shift down space and s so that sector_div will work */
+		while (space > (sector_t) (~(u32)0)) {
+			s >>= 1;
+			space >>= 1;
+			s += 1; /* force round-up */
+			conf->sector_shift++;
 		}
-		round = sector_div(s, (u32)space) ? 1 : 0;
-		nb_zone = s + round;
 	}
+	round = sector_div(s, (u32)space) ? 1 : 0;
+	nb_zone = s + round;
 	printk(KERN_INFO "raid0 : nb_zone is %d.\n", nb_zone);
 
 	printk(KERN_INFO "raid0 : Allocating %zu bytes for hash.\n",
 				nb_zone*sizeof(struct strip_zone*));
 	conf->hash_table = kmalloc (sizeof (struct strip_zone *)*nb_zone, GFP_KERNEL);
 	if (!conf->hash_table)
-		goto out_free_conf;
+		return -1;
 	sectors = conf->strip_zone[cur].sectors;
-
 	conf->hash_table[0] = conf->strip_zone + cur;
 	for (i=1; i< nb_zone; i++) {
 		while (sectors <= conf->spacing) {
@@ -354,24 +387,59 @@ static int raid0_run (mddev_t *mddev)
 		 */
 		conf->spacing++;
 	}
+	return 0;
+}
 
-	/* calculate the max read-ahead size.
-	 * For read-ahead of large files to be effective, we need to
-	 * readahead at least twice a whole stripe. i.e. number of devices
-	 * multiplied by chunk size times 2.
-	 * If an individual device has an ra_pages greater than the
-	 * chunk size, then we will not drive that device as hard as it
-	 * wants.  We consider this a configuration error: a larger
-	 * chunksize should be used in that case.
-	 */
-	{
-		int stripe = mddev->raid_disks * mddev->chunk_size / PAGE_SIZE;
-		if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
-			mddev->queue->backing_dev_info.ra_pages = 2* stripe;
-	}
+/* calculate the max read-ahead size.
+ * For read-ahead of large files to be effective, we need to
+ * readahead at least twice a whole stripe. i.e. number of devices
+ * multiplied by chunk size times 2.
+ * If an individual device has an ra_pages greater than the
+ * chunk size, then we will not drive that device as hard as it
+ * wants.  We consider this a configuration error: a larger
+ * chunksize should be used in that case.
+ */
+static void raid0_set_max_ra(mddev_t *mddev)
+{
+	int stripe = mddev->raid_disks * mddev->chunk_size / PAGE_SIZE;
+	if (mddev->queue->backing_dev_info.ra_pages < 2*stripe)
+		mddev->queue->backing_dev_info.ra_pages = 2*stripe;
 
+}
+
+static int raid0_run(mddev_t *mddev)
+{
+	raid0_conf_t *conf;
+	if (mddev->chunk_size == 0) {
+		printk(KERN_ERR "md/raid0: non-zero chunk size required.\n");
+		return -EINVAL;
+	}
+	printk(KERN_INFO "%s: setting max_sectors"
+			" to %d, segment boundary to %d\n",
+	       mdname(mddev),
+	       mddev->chunk_size >> 9,
+	       (mddev->chunk_size>>1)-1);
+	blk_queue_max_sectors(mddev->queue, mddev->chunk_size >> 9);
+	blk_queue_segment_boundary(mddev->queue, (mddev->chunk_size>>1) - 1);
+	mddev->queue->queue_lock = &mddev->queue->__queue_lock;
 
+	conf = kmalloc(sizeof(raid0_conf_t), GFP_KERNEL);
+	if (!conf)
+		goto out;
+	mddev->private = (void *)conf;
+	conf->strip_zone = NULL;
+	conf->devlist = NULL;
+	if (raid0_create_strip_zones(mddev, &mddev->disks))
+		goto out_free_conf;
+	/* calculate array device size */
+	md_set_array_sectors(mddev, raid0_size(mddev, 0, mddev->raid_disks));
+	raid0_set_array_hash(mddev);
+	raid0_set_queue_limits(mddev);
+	raid0_set_max_ra(mddev);
 	blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec);
+	raid0_dump_zones(mddev);
+	raid0_create_reshape_thread(mddev);
+	init_completion(&conf->wait_reshape);
 	return 0;
 
 out_free_conf:
@@ -386,7 +454,10 @@ out:
 static int raid0_stop (mddev_t *mddev)
 {
 	raid0_conf_t *conf = mddev_to_conf(mddev);
-
+	if (mddev->thread) {
+		md_unregister_thread(mddev->thread);
+		mddev->thread = 0;
+	}
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
 	kfree(conf->hash_table);
 	conf->hash_table = NULL;
@@ -414,7 +485,10 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio)
 		bio_endio(bio, -EOPNOTSUPP);
 		return 0;
 	}
-
+	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
+		bio_endio(bio, -EBUSY);
+		return 0;
+	}
 	cpu = part_stat_lock();
 	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
 	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
@@ -513,6 +587,357 @@ static void raid0_status (struct seq_file *seq, mddev_t *mddev)
 	return;
 }
 
+#ifdef CONFIG_MD_RAID0_RESHAPE
+
+#define DEBUG 0
+#define r0_dprintk(x...) ((void)(DEBUG && printk(x)))
+
+static void raid0_reshape_endio(struct bio *bi, int error)
+{
+	struct completion* w = (struct completion *)bi->bi_private;
+	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
+	r0_dprintk("raid0: endio: sec=%lld:size=%d "
+		"bvlen=%d bvoff=%d \n",
+			(unsigned long long)bi->bi_sector,
+			bi->bi_size,
+			bi->bi_io_vec[0].bv_len,
+			bi->bi_io_vec[0].bv_offset);
+	if (!error || uptodate)
+		return (void)complete(w);
+	printk("raid0: end reshape: io error sector=%llu\n",
+			(unsigned long long)bi->bi_sector);
+}
+
+static int raid0_reshape_rw(struct bio *bi, int dir, int size)
+{
+	char b[BDEVNAME_SIZE];
+	bi->bi_rw  	= dir;
+	bi->bi_size     = size;
+	bi->bi_idx      = 0;
+	r0_dprintk("%s %c %llu sec size=%d\n",
+			bdevname(bi->bi_bdev, b),
+			dir == 0 ? 'R' : 'W',
+			(unsigned long long)bi->bi_sector, bi->bi_size);
+	generic_make_request(bi);
+	wait_for_completion((struct completion *)(bi->bi_private));
+	return 0;
+}
+
+static struct strip_zone *raid0_point_to_zone(mddev_t *mddev,
+					sector_t sector)
+{
+	sector_t x;
+	struct strip_zone *zone;
+	raid0_conf_t *conf = mddev_to_conf(mddev);
+
+	x = sector >> conf->sector_shift;
+	sector_div(x, (u32)conf->spacing);
+	zone = conf->hash_table[x];
+	while (sector >= zone->zone_start + zone->sectors)
+		zone++;
+	return zone;
+}
+
+
+static int raid0_point_bio_to_disk(struct bio *bio, sector_t raid_sector,
+				mddev_t *mddev)
+{
+	int chunksect_bits;
+	mdk_rdev_t *tmp_dev;
+	sector_t x, chunk_sects, chunk, rsect;
+	sector_t sect_in_chunk;
+	struct strip_zone *zone;
+
+	chunk_sects = mddev->chunk_size >> 9;
+	chunksect_bits = ffz(~chunk_sects);
+
+	zone = raid0_point_to_zone(mddev, raid_sector);
+	sect_in_chunk = raid_sector & (chunk_sects - 1);
+	x = (raid_sector - zone->zone_start) >> chunksect_bits;
+	sector_div(x, zone->nb_dev);
+	chunk = x;
+	x = raid_sector >> chunksect_bits;
+	tmp_dev = zone->dev[sector_div(x, zone->nb_dev)];
+	rsect = (chunk << chunksect_bits) + zone->dev_start + sect_in_chunk;
+
+	bio->bi_bdev   = tmp_dev->bdev;
+	bio->bi_sector = rsect + tmp_dev->data_offset;
+	return 0;
+}
+
+
+static void raid0_take_speed(mddev_t *mddev, sector_t raid_sector)
+{
+	if ((jiffies-mddev->resync_mark) < 1000)
+		return;
+	mddev->resync_mark = jiffies;
+	mddev->resync_mark_cnt = raid_sector;
+}
+
+
+static sector_t raid0_reshape_move_blocks(mddev_t *mddev,
+					mddev_t *mddev_target,
+					struct strip_zone *zone)
+{
+	raid0_conf_t *conf 	= mddev_to_conf(mddev);
+	struct bio  *bi 	= conf->reshape_bi;
+	int io_size 		= bi->bi_size;
+	sector_t raid_sector    = zone->zone_start;
+	sector_t last_sector 	= (zone->zone_start + zone->sectors);
+	mddev->curr_mark_cnt    = io_size>>10;
+
+	while (raid_sector < last_sector && !kthread_should_stop()) {
+		raid0_take_speed(mddev, raid_sector);
+		if (raid0_point_bio_to_disk(bi, raid_sector, mddev)) {
+			printk(KERN_ERR "raid0:reshape point"
+					" read to bio failed\n");
+			break;
+		}
+		raid0_reshape_rw(bi, READ, io_size);
+		if (raid0_point_bio_to_disk(bi, raid_sector, mddev_target)) {
+			printk(KERN_ERR "raid0: point write to bio failed\n");
+			break;
+		}
+		raid0_reshape_rw(bi, WRITE, io_size);
+		raid_sector += io_size>>9;
+		mddev->curr_mark_cnt = raid_sector;
+		mddev->curr_resync = raid_sector;
+	}
+	bi->bi_size = io_size;
+	return raid_sector - zone->zone_start;
+}
+
+
+static void raid0_reshape_move_zones(mddev_t *mddev, mddev_t *mddev_target)
+{
+	raid0_conf_t *conf = mddev_to_conf(mddev);
+	sector_t raid_sector = 0;
+	int i = 0;
+	for (; i < conf->nr_strip_zones && !kthread_should_stop() ; i++)
+		raid_sector += raid0_reshape_move_blocks(mddev,
+						mddev_target,
+						&conf->strip_zone[i]);
+	if (raid_sector == mddev->array_sectors) {
+		printk(KERN_INFO "raid0: reshape ended %llu sectors moved OK\n",
+			(unsigned long long)raid_sector);
+	} else{
+		printk(KERN_INFO "raid0: reshape ended %llu sector moved BAD\n",
+			(unsigned long long)raid_sector);
+	}
+}
+
+
+static int raid0_reshape_prepare(mddev_t *mddev, mddev_t *mddev_target)
+{
+	raid0_conf_t *conf;
+	mddev_target->private = NULL;
+	conf = kzalloc(sizeof(raid0_conf_t), GFP_KERNEL);
+	if (!conf)
+		return -1;
+	mddev_target->private = (void *)conf;
+	conf->strip_zone = NULL;
+	conf->devlist = NULL;
+	if (raid0_create_strip_zones(mddev_target, &mddev->disks))
+		return -1;
+	return raid0_set_array_hash(mddev_target);
+}
+
+
+static	mddev_t *raid0_clone_mddev(mddev_t *mddev)
+{
+	void *m = kmalloc(sizeof(*mddev), GFP_NOIO);
+	if (!m)
+		return NULL;
+	memcpy(m, mddev, sizeof(*mddev));
+	return (mddev_t *)m;
+}
+
+static int raid0_reshape_iosize(mddev_t *mddev)
+{
+	int chunk_size_sectors = (mddev->chunk_size / PAGE_SIZE)*8;
+
+	if (mddev->queue->max_hw_sectors >= chunk_size_sectors)
+		return chunk_size_sectors;
+	if ((chunk_size_sectors % mddev->queue->max_hw_sectors) == 0)
+		return mddev->queue->max_hw_sectors;
+	return chunk_size_sectors /
+		((chunk_size_sectors / mddev->queue->max_hw_sectors)*2);
+}
+
+
+static mddev_t *raid0_reshape_init(mddev_t *mddev)
+{
+	int i;
+	mddev_t *mddev_target = NULL;
+	mdk_rdev_t *rdev = NULL;
+	int nraid_disks = 0;
+	struct bio *bi = NULL;
+	raid0_conf_t *conf = mddev_to_conf(mddev);
+	int pages = raid0_reshape_iosize(mddev)/8;
+	if (pages == 0) {
+		printk(KERN_INFO "raid0: failed to "
+				"determine transfer size\n");
+		return NULL;
+	}
+	printk("raid0: using transfer size %usectors\n", pages*8);
+	bi = bio_alloc(GFP_NOIO, pages);
+	if (!bi) {
+		printk(KERN_INFO "raid0:failed too alloc bio for"
+			" reshaping. rejecting\n");
+		goto RAID0_RESHAPE_INIT_EXIT_BAD;
+	}
+	mddev_target = raid0_clone_mddev(mddev);
+	bi->bi_vcnt = 0;
+	if (!mddev_target) {
+		printk(KERN_INFO "raid0: failed to clone mddev\n");
+		goto RAID0_RESHAPE_INIT_EXIT_BAD;
+	}
+	mddev->reshape_position = 0;
+	mddev->delta_disks = 0;
+	atomic_set(&mddev->recovery_active, 0);
+	nraid_disks = mddev->raid_disks;
+
+	list_for_each_entry(rdev, &mddev->disks, same_set) {
+		if (!test_bit(In_sync, &rdev->flags)) {
+			rdev->raid_disk = nraid_disks++;
+			rdev->desc_nr = rdev->raid_disk;
+			set_bit(In_sync, &rdev->flags);
+		}
+	}
+	mddev_target->raid_disks = nraid_disks;
+	if (raid0_reshape_prepare(mddev, mddev_target)) {
+		printk(KERN_INFO "raid0: failed to"
+			" setup temporary mappings\n");
+		goto RAID0_RESHAPE_INIT_EXIT_BAD;
+	}
+	bi->bi_vcnt = pages;
+	for (i = 0; i < bi->bi_vcnt; i++) {
+		bi->bi_io_vec[i].bv_len    = PAGE_SIZE;
+		bi->bi_io_vec[i].bv_offset = 0;
+		bi->bi_io_vec[i].bv_page   = alloc_page(GFP_NOIO);
+		get_page(bi->bi_io_vec[i].bv_page);
+	}
+	bi->bi_next    		   = NULL;
+	bi->bi_end_io 		   = raid0_reshape_endio;
+	bi->bi_size     	   = PAGE_SIZE * bi->bi_vcnt;
+	bi->bi_private  	   = &conf->wait_reshape;
+	bi->bi_idx  		   = 0;
+	conf->reshape_bi 	   = bi;
+	return mddev_target;
+
+RAID0_RESHAPE_INIT_EXIT_BAD:
+	kfree(mddev_target);
+	for (i = 0; i < bi->bi_vcnt; i++)
+		safe_put_page(bi->bi_io_vec[i].bv_page);
+	if (bi)
+		bio_put(bi);
+	return NULL;
+}
+
+
+static void raid0_reshape_thread(mddev_t *mddev)
+{
+	int i = 0;
+	mddev_t *mddev_target = 0;
+	raid0_conf_t *conf = mddev_to_conf(mddev);
+
+	if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+		return;
+	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+	clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+	mddev_target = raid0_reshape_init(mddev);
+	if (!mddev_target)
+		return;
+	raid0_reshape_move_zones(mddev, mddev_target);
+	if (kthread_should_stop())
+		goto RAID0_RELEASE_PSEUDO_RAID;
+	for (i = 0; i < conf->reshape_bi->bi_vcnt; i++)
+		safe_put_page(conf->reshape_bi->bi_io_vec[i].bv_page);
+	bio_put(conf->reshape_bi);
+	mddev->resync_mark = 0L;
+	mddev->resync_mark_cnt = 0L;
+	mddev->curr_resync = 0;
+	mddev->recovery_cp = MaxSector;
+	mddev->reshape_position = MaxSector;
+	mddev->raid_disks = mddev_target->raid_disks;
+	kfree(conf->hash_table);
+	kfree(conf);
+	mutex_lock(&mddev->reconfig_mutex);
+	raid0_run(mddev);
+RAID0_RELEASE_PSEUDO_RAID:
+	if (!mutex_is_locked(&mddev->reconfig_mutex))
+		mutex_lock(&mddev->reconfig_mutex);
+	mddev->in_sync = 1;
+	if (md_allow_write(mddev)) {
+		printk("raid0: did not write sb"
+				" critical error\n");
+	}
+	mutex_unlock(&mddev->reconfig_mutex);
+	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+	conf = mddev_target->private;
+	kfree(conf->hash_table);
+	kfree(conf->strip_zone);
+	kfree(conf->devlist);
+	kfree(mddev_target);
+}
+
+
+static int raid0_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	mdk_rdev_t *rdev1;
+	if (rdev->sectors < (mddev->chunk_size>>11)) {
+		printk(KERN_INFO "raid0: device smaller than "
+			"chunk size %llusectors < %llusectors\n",
+				(unsigned long long)rdev->sectors,
+				((unsigned long long)mddev->chunk_size)>>10);
+		return -1;
+	}
+	if (rdev->bdev->bd_disk->queue->max_hw_sectors <
+				mddev->queue->max_hw_sectors) {
+		printk(KERN_INFO "raid0: device trasnfer"
+			" size %usectors is smaller than other"
+			"raid's components %usectors, rejecting ",
+			 rdev->bdev->bd_disk->queue->max_hw_sectors,
+			 mddev->queue->max_hw_sectors);
+		return -1;
+	}
+	list_for_each_entry(rdev1, &mddev->disks, same_set) {
+		if (rdev1 == rdev) {
+			clear_bit(In_sync, &rdev->flags);
+			return 0;
+		}
+	}
+	return -1;
+}
+
+
+static int raid0_create_reshape_thread(mddev_t *mddev)
+{
+	if (mddev->thread)
+		return 0;
+	mddev->thread = md_register_thread(
+			raid0_reshape_thread,
+				mddev, "%s_raid0");
+	if (!mddev->thread) {
+		printk(KERN_ERR
+			"raid0: couldn't allocate thread for %s\n",
+			mdname(mddev));
+		return -1;
+	}
+	mddev->recovery_cp = MaxSector;
+	return 0;
+}
+
+
+static int raid0_reshape(mddev_t *mddev)
+{
+	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+	md_wakeup_thread(mddev->thread);
+	return 0;
+}
+
+#endif
+
 static struct mdk_personality raid0_personality=
 {
 	.name		= "raid0",
@@ -523,6 +948,10 @@ static struct mdk_personality raid0_personality=
 	.stop		= raid0_stop,
 	.status		= raid0_status,
 	.size		= raid0_size,
+#ifdef CONFIG_MD_RAID0_RESHAPE
+	.check_reshape	= raid0_reshape,
+	.hot_add_disk	= raid0_add_disk,
+#endif
 };
 
 static int __init raid0_init (void)
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
index 824b12e..ff2dca9 100644
--- a/drivers/md/raid0.h
+++ b/drivers/md/raid0.h
@@ -14,9 +14,10 @@ struct raid0_private_data
 {
 	struct strip_zone **hash_table; /* Table of indexes into strip_zone */
 	struct strip_zone *strip_zone;
-	mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */
+	mdk_rdev_t **devlist;/* lists of rdevs, pointed to by strip_zone->dev */
 	int nr_strip_zones;
-
+	struct bio *reshape_bi;
+	struct completion wait_reshape;
 	sector_t spacing;
 	int sector_shift; /* shift this before divide by spacing */
 };





* Re: Subject: [001/002 ] raid0 reshape
  2009-05-02 21:46 Subject: [001/002 ] raid0 reshape raz ben yehuda
@ 2009-05-10 22:31 ` Neil Brown
  2009-05-12 16:59   ` Raz
  2009-05-19 18:09 ` Dan Williams
  1 sibling, 1 reply; 17+ messages in thread
From: Neil Brown @ 2009-05-10 22:31 UTC (permalink / raw)
  To: raz ben yehuda; +Cc: linux-raid

On Sunday May 3, raziebe@013.net wrote:
> Neil Hello
> The bellow is the raid0 grow code.I have decided to fix raid0 and not 
> perform the transformation raid0-raid4-raid0 due to two reasons:
> 1. raid0 zones. this patch support any zone transformations.
> 2. Undesired dependency of raid0 over raid4 re-striping code. 
> 
> The following tests were conducted:
> 1. various chunk sizes, 4K to 512K. ( mainly in 2.6.27 and 2.6.18 )
> 2. regrow ( tested on 2.6.27 and 2.6.18 )
> 3. various super blocks. 0.9 , 1, 1.1 and 1.2 ( mainly in 2.6.27 and 2.6.18 ).
> 4. support assembling and mounting older raid version ( older kernels and code before patch) after it was grown.
> 
> patch passed checkpatch.pl . other than reshaping code i beautified the code. 
> Currently i about to pass this code to our testing team for further tests. 
> Other things to do:
> 1. Speedup the reshape process.It is too slow.
> 2. Support for non power 2^n ( page size) chunks.
> 
> I will be thankful for your criticism.

Probably my main criticism at this point is that there is no
commentary explaining how it works.
You appear to have chosen to run your own reshape thread rather than
providing a "resync" method and make use of the md_do_sync thread
which provides speed limiting etc.  
Maybe that is a good decision, but as you haven't explained it, it is
hard to be sure.

Also, it seems that if an IO request arrives while the reshape is
happening, then it fails with -EBUSY.
I don't think that is a good thing.
1/ no filesystem is going to be expecting EBUSY so it could cause
   serious problems
2/ if you aren't going to support online reshape so that a device can 
   be reshaped while it is in use, then there seems to be little point
   in putting this code in the kernel.  Just write a program that runs
   in userspace which reshapes the array while it is not assembled.

Also, I cannot see any evidence that you checkpoint the reshape at
all.  So if your machine crashes during the reshape, everything is
lost.  I do not find this acceptable.

NeilBrown


* Re: Subject: [001/002 ] raid0 reshape
  2009-05-10 22:31 ` Neil Brown
@ 2009-05-12 16:59   ` Raz
  0 siblings, 0 replies; 17+ messages in thread
From: Raz @ 2009-05-12 16:59 UTC (permalink / raw)
  To: Neil Brown; +Cc: raz ben yehuda, linux-raid

On Mon, May 11, 2009 at 1:31 AM, Neil Brown <neilb@suse.de> wrote:
> On Sunday May 3, raziebe@013.net wrote:
>> Neil Hello
>> The bellow is the raid0 grow code.I have decided to fix raid0 and not
>> perform the transformation raid0-raid4-raid0 due to two reasons:
>> 1. raid0 zones. this patch support any zone transformations.
>> 2. Undesired dependency of raid0 over raid4 re-striping code.
>>
>> The following tests were conducted:
>> 1. various chunk sizes, 4K to 512K. ( mainly in 2.6.27 and 2.6.18 )
>> 2. regrow ( tested on 2.6.27 and 2.6.18 )
>> 3. various super blocks. 0.9 , 1, 1.1 and 1.2 ( mainly in 2.6.27 and 2.6.18 ).
>> 4. support assembling and mounting older raid version ( older kernels and code before patch) after it was grown.
>>
>> patch passed checkpatch.pl . other than reshaping code i beautified the code.
>> Currently i about to pass this code to our testing team for further tests.
>> Other things to do:
>> 1. Speedup the reshape process.It is too slow.
>> 2. Support for non power 2^n ( page size) chunks.
>>
>> I will be thankful for your criticism.
>
> Probably my main criticism at this point is that there is no
> commentary explaining how it works.
> You appear to have chosen to run your own reshape thread rather than
> providing a "resync" method and make use of the md_do_sync thread
> which provides speed limiting etc.
> Maybe that it a good decision, but as you haven't explained it, it is
> hard to be sure.
You are correct; I will move to the md resync mechanism, since online
reshape is essential. I just need to understand the entire md resync
process first. I am going to model the raid0 reshape code on raid1's,
meaning two threads: raid0_resync for the copy and raid0d for the
writes. A rough sketch of the entry point I have in mind is below.
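
This is only a skeleton, nothing more: the sync_request() signature is
what I assume the current md core uses, raid0_copy_chunk() is just a
placeholder for the read-through-the-old-map / write-through-the-new-map
step, and the raid0d write side, checkpointing and error handling are
all left out.

static sector_t raid0_sync_request(mddev_t *mddev, sector_t sector_nr,
				   int *skipped, int go_faster)
{
	raid0_conf_t *conf = mddev_to_conf(mddev);
	sector_t max_sector = mddev->array_sectors;
	sector_t step = mddev->chunk_size >> 9;	/* one chunk per call */

	if (sector_nr >= max_sector) {
		/* reshape finished; let md_do_sync() wind things up */
		*skipped = 1;
		return 0;
	}
	/* copy 'step' sectors: read them through the old zone map and
	 * write them back through the new one.  md_do_sync() accounts
	 * the progress and applies the usual speed limits. */
	raid0_copy_chunk(conf, sector_nr, step);	/* placeholder */
	return step;
}
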
Thank you Neil

> Also, it seems to if an IO request arrives while the reshape is
> happening, then it fails with -EBUSY.
> I don't think that is a good thing.
> 1/ no filesystem is going to be expecting EBUSY so it could cause
>   serious problems
> 2/ if you aren't going to support online reshape so that a device can
>   be reshaped while it is in use, then there seems to be little point
>   in putting this code in the kernel.  Just write a program that runs
>   in userspace which reshapes the array while it is not assembled.
>
> Also, I cannot see any evidence that you checkpoint the reshape at
> all.  So if your machine crashes during the reshape, everything is
> lost.  I do not find this acceptable.
>
> NeilBrown


* Re: Subject: [001/002 ] raid0 reshape
  2009-05-02 21:46 Subject: [001/002 ] raid0 reshape raz ben yehuda
  2009-05-10 22:31 ` Neil Brown
@ 2009-05-19 18:09 ` Dan Williams
  2009-05-19 22:27   ` Raz
  2009-05-21 11:48   ` Neil Brown
  1 sibling, 2 replies; 17+ messages in thread
From: Dan Williams @ 2009-05-19 18:09 UTC (permalink / raw)
  To: raz ben yehuda; +Cc: neilb, linux-raid, Jacek Danecki, Labun, Marcin

On Sat, May 2, 2009 at 2:46 PM, raz ben yehuda <raziebe@013.net> wrote:
> Neil Hello
> The bellow is the raid0 grow code.I have decided to fix raid0 and not
> perform the transformation raid0-raid4-raid0 due to two reasons:
> 1. raid0 zones. this patch support any zone transformations.
> 2. Undesired dependency of raid0 over raid4 re-striping code.

Hi Raz,

Can you explain a bit more about why the raid4 approach is
undesirable?  I think making reshape only available to raid0 arrays
where all the members are the same size is a reasonable constraint.
We then get the nice benefit of reusing the raid5 reshape
infrastructure.  In other words I am not convinced that the benefits
of reimplementing reshape in raid0 outweigh the costs.

Thanks,
Dan


* Re: Subject: [001/002 ] raid0 reshape
  2009-05-19 18:09 ` Dan Williams
@ 2009-05-19 22:27   ` Raz
  2009-05-21 11:48   ` Neil Brown
  1 sibling, 0 replies; 17+ messages in thread
From: Raz @ 2009-05-19 22:27 UTC (permalink / raw)
  To: Dan Williams
  Cc: raz ben yehuda, neilb, linux-raid, Jacek Danecki, Labun, Marcin,
	yaronp, Ofer Kruzel

On Tue, May 19, 2009 at 9:09 PM, Dan Williams <dan.j.williams@intel.com> wrote:
> On Sat, May 2, 2009 at 2:46 PM, raz ben yehuda <raziebe@013.net> wrote:
>> Neil Hello
>> The bellow is the raid0 grow code.I have decided to fix raid0 and not
>> perform the transformation raid0-raid4-raid0 due to two reasons:
>> 1. raid0 zones. this patch support any zone transformations.
>> 2. Undesired dependency of raid0 over raid4 re-striping code.
>
> Hi Raz,
>
> Can you explain a bit more about why the raid4 approach is
> undesirable?  I think making reshape only available to raid0 arrays
> where all the members are the same size is a reasonable constraint.

1. Because this is the general case. What will you tell a user who
   already has two zones?
2. In my next-generation systems, raid0 sits on top of 12 raid5s.
   A user may choose to expand one of those raid5s separately.
3. The $/GB ratio will not decline the way it should.
   Storage lasts years; disks change sizes and formats. Seagate just
   announced a 2TB disk. If you start with four 1TB disks and over 4
   years expand the array 4 times, each time with a 2TB disk, the
   equal-member constraint gives you only 8 x 1TB = 8TB usable out of
   4 x 1TB + 4 x 2TB = 12TB raw; you lose roughly 30% of the storage.
   Tell that to your marketing, and tell me when you do; I want to see :)
4. Number of raid components.
   To reach size X with equal-size members you need ever more components.
   Having multiple zones lets you increase the raid size without being
   subordinate to an ever-increasing number of disks: you just mirror one
   of the disks to a bigger disk, put the new disk back into the array,
   and reshape using the extra space of the new disk.

Note!
I do not know what your code base is, but if you still have the hash
spacing, you may get two zones even when the underlying components are
the same size. So in that case the decision whether or not to support
raid0 reshape is not just nb_zone > 1.

Other than that, if you do this work it will be nice to have my back
covered sooner than expected (as long as you do NOT reach for your
revolver.... :) )

> We then get the nice benefit of reusing the raid5 reshape
> infrastructure.  In other words I am not convinced that the benefits
> of reimplementing reshape in raid0 outweigh the costs.
> Thanks,
> Dan


* Re: Subject: [001/002 ] raid0 reshape
  2009-05-19 18:09 ` Dan Williams
  2009-05-19 22:27   ` Raz
@ 2009-05-21 11:48   ` Neil Brown
  2009-05-21 12:33     ` OT: busting a gut (was Re: Subject: [001/002 ] raid0 reshape) John Robinson
                       ` (3 more replies)
  1 sibling, 4 replies; 17+ messages in thread
From: Neil Brown @ 2009-05-21 11:48 UTC (permalink / raw)
  To: Dan Williams; +Cc: raz ben yehuda, linux-raid, Jacek Danecki, Labun, Marcin

On Tuesday May 19, dan.j.williams@intel.com wrote:
> On Sat, May 2, 2009 at 2:46 PM, raz ben yehuda <raziebe@013.net> wrote:
> > Neil Hello
> > The bellow is the raid0 grow code.I have decided to fix raid0 and not
> > perform the transformation raid0-raid4-raid0 due to two reasons:
> > 1. raid0 zones. this patch support any zone transformations.
> > 2. Undesired dependency of raid0 over raid4 re-striping code.
> 
> Hi Raz,
> 
> Can you explain a bit more about why the raid4 approach is
> undesirable?  I think making reshape only available to raid0 arrays
> where all the members are the same size is a reasonable constraint.
> We then get the nice benefit of reusing the raid5 reshape
> infrastructure.  In other words I am not convinced that the benefits
> of reimplementing reshape in raid0 outweigh the costs.

I've been thinking about this too... Is it something we really want to
do?

My thoughts include:

 - I don't like special cases - it would be nice to support reshape on
   all arrays, even RAID0 with different sizes devices.
 - Anyone who does this with a raid0 made of simple drives is asking
   for trouble.  But a RAID0 over a bunch of RAID5 or RAID1 might make
   sense. 
 - Maybe we should support different sized drives in RAID4.  As long
   as the parity drive is as big as the largest data drive it could be
   made to work.  Similarly hot spares would need to be big, but you
   could have 2 hot spares and take the smallest one that is big
   enough.
   If a drive in the RAID4+ (or is it the thing called NORAID?)
   failed and was replaced with a bigger drive, it would be cool to be
   able to incorporate that extra space into the array.

   If we did all that, then the 0->4->0 conversion could make use of
   the same code.
 - Surely RAID0 is (like LVM) just a legacy idea until we get sensible
   file systems that actually understand multiple devices and do all
   this stuff for you at a more sensible level - so why are we
   busting a gut(*) to make RAID0 work well??  Answer is of course
   that no-one has made a sensible file system yet. (well... maybe zfs
   or btrfs, not sure)
 - If you read the DDF spec carefully, you find there is a secondary
   raid level which stripes over heterogeneous arrays a different way.
   You divide every primary array up into N chunks, so the chunk sizes are
   different on different arrays.  Then you make a secondary array by
   striping over those chunks.
   So e.g. you might have a 4Gig RAID5 and a 1GIG RAID1.  So the
   striped array on top of these could take 4Meg from the RAID5, then
   1Meg from the RAID1, then another 4 from the RAID5 etc.
   Do we want to support that?  And would we want to reshape such a
   thing?? (A toy sketch of that mapping is below.)
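
Roughly, the mapping in that example looks like this toy user-space
sketch (the numbers are made up from the example above, not taken from
the DDF spec):

/* toy sketch of the DDF-style secondary striping described above;
 * sizes are invented, units are KiB */
#include <stdio.h>

struct primary { const char *name; unsigned long long chunk; };

int main(void)
{
	/* a 4Gig RAID5 contributes 4MiB per stripe, a 1Gig RAID1 1MiB */
	struct primary p[] = { { "raid5", 4096 }, { "raid1", 1024 } };
	unsigned long long stripe_len = 4096 + 1024;
	unsigned long long logical = 9000;	/* example logical offset */
	unsigned long long stripe = logical / stripe_len;
	unsigned long long off = logical % stripe_len;
	int i;

	for (i = 0; i < 2; i++) {
		if (off < p[i].chunk) {
			printf("logical %lluKiB -> %s offset %lluKiB\n",
			       logical, p[i].name,
			       stripe * p[i].chunk + off);
			break;
		}
		off -= p[i].chunk;
	}
	return 0;
}

For a logical offset of 9000KiB this lands on the RAID5 at offset 7976KiB.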

So: lots of thoughts, some pointing in different directions.
But I'm not against reshape code appearing in RAID0 providing it is
well designed, maintainable, reliable, and doesn't slow down normal
RAID0 processing.  I suspect we can get there.

NeilBrown




* is that an Australian term??? not sure.  http://www.wordwebonline.com/en/BUSTAGUT


* OT: busting a gut (was Re: Subject: [001/002 ] raid0 reshape)
  2009-05-21 11:48   ` Neil Brown
@ 2009-05-21 12:33     ` John Robinson
  2009-05-21 19:20     ` Subject: [001/002 ] raid0 reshape Greg Freemyer
                       ` (2 subsequent siblings)
  3 siblings, 0 replies; 17+ messages in thread
From: John Robinson @ 2009-05-21 12:33 UTC (permalink / raw)
  To: Linux RAID

On 21/05/2009 12:48, Neil Brown wrote:
[...]
> why are we busting a gut(*) to make RAID0 work well??
[...]
> * is that an Australian term??? not sure.  http://www.wordwebonline.com/en/BUSTAGUT

No, or at least not exclusively; I'm familiar with it being used in the UK.

Cheers,

John.



* Re: Subject: [001/002 ] raid0 reshape
  2009-05-21 11:48   ` Neil Brown
  2009-05-21 12:33     ` OT: busting a gut (was Re: Subject: [001/002 ] raid0 reshape) John Robinson
@ 2009-05-21 19:20     ` Greg Freemyer
  2009-05-25 12:19       ` Goswin von Brederlow
  2009-05-22  7:53     ` Dan Williams
  2009-05-23 22:33     ` Raz
  3 siblings, 1 reply; 17+ messages in thread
From: Greg Freemyer @ 2009-05-21 19:20 UTC (permalink / raw)
  To: Neil Brown
  Cc: Dan Williams, raz ben yehuda, linux-raid, Jacek Danecki,
	Labun, Marcin

On Thu, May 21, 2009 at 7:48 AM, Neil Brown <neilb@suse.de> wrote:
> On Tuesday May 19, dan.j.williams@intel.com wrote:
>> On Sat, May 2, 2009 at 2:46 PM, raz ben yehuda <raziebe@013.net> wrote:
>> > Neil Hello
>> > The bellow is the raid0 grow code.I have decided to fix raid0 and not
>> > perform the transformation raid0-raid4-raid0 due to two reasons:
>> > 1. raid0 zones. this patch support any zone transformations.
>> > 2. Undesired dependency of raid0 over raid4 re-striping code.
>>
>> Hi Raz,
>>
>> Can you explain a bit more about why the raid4 approach is
>> undesirable?  I think making reshape only available to raid0 arrays
>> where all the members are the same size is a reasonable constraint.
>> We then get the nice benefit of reusing the raid5 reshape
>> infrastructure.  In other words I am not convinced that the benefits
>> of reimplementing reshape in raid0 outweigh the costs.
>
> I've been thinking about this too... Is it something we really want to
> do?
>
> My thoughts include:
>
>  - I don't like special cases - it would be nice to support reshape on
>   all arrays, even RAID0 with different sizes devices.
>  - Anyone who does this with a raid0 made of simple drives is asking
>   for trouble.  But a RAID0 over a bunch of RAID5 or RAID1 might make
>   sense.
>  - Maybe we should support different sized drives in RAID4.  As long
>   as the parity drive is as big as the largest data drive it could be
>   made to work.  Similarly hot spares would need to be big, but you
>   could have 2 hot spares and take the smallest one that is big
>   enough.
>   If a drive in the RAID4+ (or is it is the thing called NORAID?)
>   failed and was replaced with a bigger drive, it would be cool to be
>   able to incorporate that extra space into the array.
>
>   If we did all that, then the 0->4->0 conversion could make use of
>   the same code.
>  - Surely RAID0 is (like LVM) just a legacy idea until we get sensible
>   file systems that actually understand multiple devices and do all
>   this stuff for you are a more sensible level - so why are we
>   busting a gut(*) to make RAID0 work well??  Answer is of course
>   that no-one has made a sensible file system yet. (well... maybe zfs
>   or btrfs, not sure)
>  - If you read the DDF spec carefully, you find there is a secondary
>   raid level which stripes over heterogeneous arrays a different way.
>   You divide every primary array up into N chunks, so the chunk sizes are
>   different on different arrays.  Then you make a secondary array by
>   striping over those chunks.
>   So e.g. you might have a 4Gig RAID5 and a 1GIG RAID1.  So the
>   stripes array on top of these could take 4Meg from the RAID5, then
>   1Meg from the RAID1, then another 4 from the RAID5 etc.
>   So we want to support that?  And would we want to reshape such a
>   thing??
>
> So: lots of thoughts, some pointing in different directions.
> But I'm not against reshape code appearing in RAID0 providing it is
> well designed, maintainable, reliable, and doesn't slow down normal
> RAID0 processing.  I suspect we can get there.
>
> NeilBrown

This may be in a FAQ / wiki somewhere, but is the long-range plan of
mdraid to stick to the formal "raid levels" or to implement "raid
equivalent levels"?

I am a big fan of the raid equivalent concept and would love to see
mdraid moving in that direction.

==> What I mean by raid equivalent levels

More and more arrays allow the user to simply say "give me a 100 GB
logical volume with Raid 5 equivalent protection".  The array then
looks at the drives it has available and puts together the necessary
pieces.  As drives are added or removed, it moves the data around under
its own control, but maintains the raid equivalent protection.

Especially when working with dozens of drives and lots of logical
volumes it makes life much easier.  Admittedly it may come at a cost
of not being able to specify raid levels with the specificity that
mdraid currently allows.

==>

The reason I ask whether this is the goal is that it may factor into
decisions about how reshaping is implemented.

Greg
-- 
Greg Freemyer
Head of EDD Tape Extraction and Processing team
Litigation Triage Solutions Specialist
http://www.linkedin.com/in/gregfreemyer
First 99 Days Litigation White Paper -
http://www.norcrossgroup.com/forms/whitepapers/99%20Days%20whitepaper.pdf

The Norcross Group
The Intersection of Evidence & Technology
http://www.norcrossgroup.com


* Re: Subject: [001/002 ] raid0 reshape
  2009-05-21 11:48   ` Neil Brown
  2009-05-21 12:33     ` OT: busting a gut (was Re: Subject: [001/002 ] raid0 reshape) John Robinson
  2009-05-21 19:20     ` Subject: [001/002 ] raid0 reshape Greg Freemyer
@ 2009-05-22  7:53     ` Dan Williams
  2009-05-23 22:33     ` Raz
  3 siblings, 0 replies; 17+ messages in thread
From: Dan Williams @ 2009-05-22  7:53 UTC (permalink / raw)
  To: Neil Brown; +Cc: raz ben yehuda, linux-raid, Jacek Danecki, Labun, Marcin

On Thu, May 21, 2009 at 4:48 AM, Neil Brown <neilb@suse.de> wrote:
> On Tuesday May 19, dan.j.williams@intel.com wrote:
>> On Sat, May 2, 2009 at 2:46 PM, raz ben yehuda <raziebe@013.net> wrote:
>  - Surely RAID0 is (like LVM) just a legacy idea until we get sensible
>   file systems that actually understand multiple devices and do all
>   this stuff for you are a more sensible level - so why are we
>   busting a gut(*) to make RAID0 work well??  Answer is of course
>   that no-one has made a sensible file system yet. (well... maybe zfs
>   or btrfs, not sure)

"Compatibility" is another cause of the abdominal pressure.  See the
single drive to raid0 migrations supported by the Windows driver:
http://www.intel.com/support/chipsets/imsm/sb/cs-020674.htm

> So: lots of thoughts, some pointing in different directions.
> But I'm not against reshape code appearing in RAID0 providing it is
> well designed, maintainable, reliable, and doesn't slow down normal
> RAID0 processing.  I suspect we can get there.

Nod.

Thanks,
Dan


* Re: Subject: [001/002 ] raid0 reshape
  2009-05-21 11:48   ` Neil Brown
                       ` (2 preceding siblings ...)
  2009-05-22  7:53     ` Dan Williams
@ 2009-05-23 22:33     ` Raz
  3 siblings, 0 replies; 17+ messages in thread
From: Raz @ 2009-05-23 22:33 UTC (permalink / raw)
  To: Neil Brown; +Cc: linux-raid

On Thu, May 21, 2009 at 2:48 PM, Neil Brown <neilb@suse.de> wrote:
> On Tuesday May 19, dan.j.williams@intel.com wrote:
>> On Sat, May 2, 2009 at 2:46 PM, raz ben yehuda <raziebe@013.net> wrote:
>> > Neil Hello
>> > The bellow is the raid0 grow code.I have decided to fix raid0 and not
>> > perform the transformation raid0-raid4-raid0 due to two reasons:
>> > 1. raid0 zones. this patch support any zone transformations.
>> > 2. Undesired dependency of raid0 over raid4 re-striping code.
>>
>> Hi Raz,
>>
>> Can you explain a bit more about why the raid4 approach is
>> undesirable?  I think making reshape only available to raid0 arrays
>> where all the members are the same size is a reasonable constraint.
>> We then get the nice benefit of reusing the raid5 reshape
>> infrastructure.  In other words I am not convinced that the benefits
>> of reimplementing reshape in raid0 outweigh the costs.
>
> I've been thinking about this too... Is it something we really want to
> do?
>
> My thoughts include:
>
>  - I don't like special cases - it would be nice to support reshape on
>   all arrays, even RAID0 with different sizes devices.
>  - Anyone who does this with a raid0 made of simple drives is asking
>   for trouble.  But a RAID0 over a bunch of RAID5 or RAID1 might make
>   sense.
>  - Maybe we should support different sized drives in RAID4.  As long
>   as the parity drive is as big as the largest data drive it could be
>   made to work.  Similarly hot spares would need to be big, but you
>   could have 2 hot spares and take the smallest one that is big
>   enough.
>   If a drive in the RAID4+ (or is it is the thing called NORAID?)
>   failed and was replaced with a bigger drive, it would be cool to be
>   able to incorporate that extra space into the array.
>
>   If we did all that, then the 0->4->0 conversion could make use of
>   the same code.
>  - Surely RAID0 is (like LVM) just a legacy idea until we get sensible
>   file systems that actually understand multiple devices and do all
>   this stuff for you are a more sensible level - so why are we
>   busting a gut(*) to make RAID0 work well??  Answer is of course
>   that no-one has made a sensible file system yet. (well... maybe zfs
>   or btrfs, not sure)
There is PVFS2, which stripes at the file level, though without
redundancy; I do consider PVFS2 a professional file system.
>  - If you read the DDF spec carefully, you find there is a secondary
>   raid level which stripes over heterogeneous arrays a different way.
>   You divide every primary array up into N chunks, so the chunk sizes are
>   different on different arrays.  Then you make a secondary array by
>   striping over those chunks.
>   So e.g. you might have a 4Gig RAID5 and a 1GIG RAID1.  So the
>   stripes array on top of these could take 4Meg from the RAID5, then
>   1Meg from the RAID1, then another 4 from the RAID5 etc.
>   So we want to support that?  And would we want to reshape such a
>   thing??
I want that; but what I do wonder is how RAID-aware file systems are
to be tuned. Chunk = stripe?
> So: lots of thoughts, some pointing in different directions.
> But I'm not against reshape code appearing in RAID0 providing it is
> well designed, maintainable, reliable, and doesn't slow down normal
> RAID0 processing.  I suspect we can get there.
>
> NeilBrown
>
>
>
>
> * is that an Australian term??? not sure.  http://www.wordwebonline.com/en/BUSTAGUT


* Re: Subject: [001/002 ] raid0 reshape
  2009-05-21 19:20     ` Subject: [001/002 ] raid0 reshape Greg Freemyer
@ 2009-05-25 12:19       ` Goswin von Brederlow
  2009-05-25 20:06         ` Raz
  2009-05-25 22:14         ` Neil Brown
  0 siblings, 2 replies; 17+ messages in thread
From: Goswin von Brederlow @ 2009-05-25 12:19 UTC (permalink / raw)
  To: Greg Freemyer
  Cc: Neil Brown, Dan Williams, raz ben yehuda, linux-raid,
	Jacek Danecki, Labun, Marcin

Greg Freemyer <greg.freemyer@gmail.com> writes:

> ==> What I mean by raid equivalent levels
>
> More and more arrays allow the user to simply say "give me a 100 GB
> logical volume with Raid 5 equivalent protection.  The array then
> looks at the drives it has available and puts together the necessary
> pieces.  As drives are added, removed it moves the data around under
> its own control, but maintains the raid equivalent protection.
>
> Especially when working with dozens of drives and lots of logical
> volumes it makes life much easier.  Admittedly it may come at a cost
> of not being able to specify raid levels with the specificity that
> mdraid currently allows.
>
> ==>
>
> The reason I ask if this is the goal is that doing so may factor into
> decisions about how reshaping is implemented.
>
> Greg

That really seems to scream for LVM to support more raid levels. It
already has linear, raid0 and raid1 support (although I have no idea
how device mapper raid1 compares to md raid1).

Those should be fleshed out more and also support raid 4/5/6 for what
you ask.

MfG
        Goswin

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: Subject: [001/002 ] raid0 reshape
  2009-05-25 12:19       ` Goswin von Brederlow
@ 2009-05-25 20:06         ` Raz
  2009-05-27 21:55           ` Bill Davidsen
  2009-05-25 22:14         ` Neil Brown
  1 sibling, 1 reply; 17+ messages in thread
From: Raz @ 2009-05-25 20:06 UTC (permalink / raw)
  To: Goswin von Brederlow; +Cc: linux-raid

It is not clear to me why Linux has both LVM and md; it is a waste of
development effort in my opinion. Add to that btrfs/zfs reaching
mainline, and Linux will have 3-4 volume managers to maintain.
Why not join hands and come up with a single unified system?
On Mon, May 25, 2009 at 3:19 PM, Goswin von Brederlow <goswin-v-b@web.de> wrote:
> Greg Freemyer <greg.freemyer@gmail.com> writes:
>
>> ==> What I mean by raid equivalent levels
>>
>> More and more arrays allow the user to simply say "give me a 100 GB
>> logical volume with Raid 5 equivalent protection.  The array then
>> looks at the drives it has available and puts together the necessary
>> pieces.  As drives are added, removed it moves the data around under
>> its own control, but maintains the raid equivalent protection.
>>
>> Especially when working with dozens of drives and lots of logical
>> volumes it makes life much easier.  Admittedly it may come at a cost
>> of not being able to specify raid levels with the specificity that
>> mdraid currently allows.
>>
>> ==>
>>
>> The reason I ask if this is the goal is that doing so may factor into
>> decisions about how reshaping is implemented.
>>
>> Greg
>
> That really seems to scream for LVM to support more raid levels. It
> already has linear, raid0 and raid1 support (although I have no idea
> how device mapper raid1 compares to md raid1).
>
> Those should be fleshed out more and also support raid 4/5/6 for what
> you ask.
>
> MfG
>        Goswin

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: Subject: [001/002 ] raid0 reshape
  2009-05-25 12:19       ` Goswin von Brederlow
  2009-05-25 20:06         ` Raz
@ 2009-05-25 22:14         ` Neil Brown
  2009-05-26 11:17           ` Goswin von Brederlow
  1 sibling, 1 reply; 17+ messages in thread
From: Neil Brown @ 2009-05-25 22:14 UTC (permalink / raw)
  To: Goswin von Brederlow
  Cc: Greg Freemyer, Dan Williams, raz ben yehuda, linux-raid,
	Jacek Danecki, Labun, Marcin

On Monday May 25, goswin-v-b@web.de wrote:
> Greg Freemyer <greg.freemyer@gmail.com> writes:
> 
> > ==> What I mean by raid equivalent levels
> >
> > More and more arrays allow the user to simply say "give me a 100 GB
> > logical volume with Raid 5 equivalent protection.  The array then
> > looks at the drives it has available and puts together the necessary
> > pieces.  As drives are added, removed it moves the data around under
> > its own control, but maintains the raid equivalent protection.
> >
> > Especially when working with dozens of drives and lots of logical
> > volumes it makes life much easier.  Admittedly it may come at a cost
> > of not being able to specify raid levels with the specificity that
> > mdraid currently allows.
> >
> > ==>
> >
> > The reason I ask if this is the goal is that doing so may factor into
> > decisions about how reshaping is implemented.
> >
> > Greg
> 
> That really seems to scream for LVM to support more raid levels. It
> already has linear, raid0 and raid1 support (although I have no idea
> how device mapper raid1 compares to md raid1).

Note that LVM (a suite of user-space tools) could conceivably use
md/raid1, md/raid5 etc. The functionality doesn't have to go in dm.

Neil

> 
> Those should be fleshed out more and also support raid 4/5/6 for what
> you ask.
> 
> MfG
>         Goswin

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: Subject: [001/002 ] raid0 reshape
  2009-05-25 22:14         ` Neil Brown
@ 2009-05-26 11:17           ` Goswin von Brederlow
  2009-05-26 11:51             ` Neil Brown
  0 siblings, 1 reply; 17+ messages in thread
From: Goswin von Brederlow @ 2009-05-26 11:17 UTC (permalink / raw)
  To: Neil Brown
  Cc: Goswin von Brederlow, Greg Freemyer, Dan Williams, raz ben yehuda,
	linux-raid, Jacek Danecki, Labun, Marcin

Neil Brown <neilb@suse.de> writes:

> On Monday May 25, goswin-v-b@web.de wrote:
>> That really seems to scream for LVM to support more raid levels. It
>> already has linear, raid0 and raid1 support (although I have no idea
>> how device mapper raid1 compares to md raid1).
>
> Note that LVM (a suite of user-space tools) could conceivably use
> md/raid1, md/raid5 etc. The functionality doesn't have to go in dm.
>
> Neil

How would you do this? Worst case, you can have an LV made up of totally
non-linear PEs, meaning lots of 4MB (default PE size) chunks in
random order on random disks.

Do you create a raid1/5 for each stripe? You surely run out of md
devices.

Create dm mappings for all stripe 0s, stripe 1s, stripe 2s, ... and
then a raid1/5 over those stripe devices?

What if the LV has segments with different raid configurations (number
of disks in a stripe or even different levels)? Create a raid for each
segment and then a dm mapping for a linear raid?


You can get a flood of intermediate devices there. A /proc/mdstat with
200 entries would be horrible. iostat output would be totally
useless. ...

MfG
        Goswin

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: Subject: [001/002 ] raid0 reshape
  2009-05-26 11:17           ` Goswin von Brederlow
@ 2009-05-26 11:51             ` Neil Brown
  2009-05-28 19:07               ` Goswin von Brederlow
  0 siblings, 1 reply; 17+ messages in thread
From: Neil Brown @ 2009-05-26 11:51 UTC (permalink / raw)
  To: Goswin von Brederlow
  Cc: Greg Freemyer, Dan Williams, raz ben yehuda, linux-raid,
	Jacek Danecki, Labun, Marcin

On Tuesday May 26, goswin-v-b@web.de wrote:
> Neil Brown <neilb@suse.de> writes:
> 
> > On Monday May 25, goswin-v-b@web.de wrote:
> >> That really seems to scream for LVM to support more raid levels. It
> >> already has linear, raid0 and raid1 support (although I have no idea
> >> how device mapper raid1 compares to md raid1).
> >
> > Note that LVM (a suite of user-space tools) could conceivably use
> > md/raid1, md/raid5 etc. The functionality doesn't have to go in dm.
> >
> > Neil
> 
> How would you do this? Worst case, you can have an LV made up of totally
> non-linear PEs, meaning lots of 4MB (default PE size) chunks in
> random order on random disks.
> 
> Do you create a raid1/5 for each stripe? You surely run out of md
> devices.

We have 2^21 md devices easily (I think that is the number) and it
wouldn't be hard to have more if that were an issue.

> 
> Create dm mappings for all stripe 0s, stripe 1s, stripe 2s, ... and
> then a raid1/5 over those stripe devices?

That might be an option.

> 
> What if the LV has segments with different raid configurations (number
> of disks in a stripe or even different levels)? Create a raid for each
> segment and then a dm mapping for a linear raid?
>

Yes.
 
> 
> You can get a flood of intermediate devices there. A /proc/mdstat with
> 200 entries would be horrible. iostat output would be totally
> useless. ...
>

Yep, these would be interesting problems to solve.  /proc/mdstat is a
bit of a wart on the design - making the entry in /proc/mdstat
optional might be a good idea.

As for iostat - where does it get info from ? /proc/partitions? /proc/diskinfo?
Maybe /sys/block?
Either way, we could probably find a way to say "this block device is
'hidden'" .

If you want to be able to slice and dice a lot of mini-raid arrays into
an LVM system, then whatever way you implement it you will need to be
keeping track of all those bits.  I think it makes most sense to use
the "block device" as the common abstraction, then if we start finding
issues: solve them.  That way the solutions become available for
others to use in ways we hadn't expected.

 
> MfG
>         Goswin

NeilBrown

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: Subject: [001/002 ] raid0 reshape
  2009-05-25 20:06         ` Raz
@ 2009-05-27 21:55           ` Bill Davidsen
  0 siblings, 0 replies; 17+ messages in thread
From: Bill Davidsen @ 2009-05-27 21:55 UTC (permalink / raw)
  To: Raz; +Cc: Goswin von Brederlow, linux-raid

Raz wrote:
> It is not clear to me why Linux has both LVM and md; it is a waste of
> development effort in my opinion. Add to that btrfs/zfs reaching
> mainline, and Linux will have 3-4 volume managers to maintain.
> Why not join hands and come up with a single unified system?
>   

Linux is about choice? One size doesn't fit all?

I see no reason for raid anything in LVM; it's a duplication of effort. 
By the same token, I think building everything into the file system, 
while it seems nice, means that you lose the flexibility of being able 
to control the devices, the raid behavior, and the allocation, each 
independently. I'll be the first to admit that I occasionally abuse that 
flexibility (story coming one of these days), but it's there.

As long as there's a tool to help the novice put the pieces together 
without in-depth technical expertise, I don't think limiting options to 
just one is desirable at all.

> On Mon, May 25, 2009 at 3:19 PM, Goswin von Brederlow <goswin-v-b@web.de> wrote:
>   
>> Greg Freemyer <greg.freemyer@gmail.com> writes:
>>
>>     
>>> ==> What I mean by raid equivalent levels
>>>
>>> More and more arrays allow the user to simply say "give me a 100 GB
>>> logical volume with Raid 5 equivalent protection.  The array then
>>> looks at the drives it has available and puts together the necessary
>>> pieces.  As drives are added, removed it moves the data around under
>>> its own control, but maintains the raid equivalent protection.
>>>
>>> Especially when working with dozens of drives and lots of logical
>>> volumes it makes life much easier.  Admittedly it may come at a cost
>>> of not being able to specify raid levels with the specificity that
>>> mdraid currently allows.
>>>
>>> ==>
>>>
>>> The reason I ask if this is the goal is that doing so may factor into
>>> decisions about how reshaping is implemented.
>>>
>>> Greg
>>>       
>> That really seems to scream for LVM to support more raid levels. It
>> already has linear, raid0 and raid1 support (although I have no idea
>> how device mapper raid1 compares to md raid1).
>>
>> Those should be fleshed out more and also support raid 4/5/6 for what
>> you ask.
>>     


-- 
bill davidsen <davidsen@tmr.com>
  CTO TMR Associates, Inc

"You are disgraced professional losers. And by the way, give us our money back."
    - Representative Earl Pomeroy,  Democrat of North Dakota
on the A.I.G. executives who were paid bonuses  after a federal bailout.



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: Subject: [001/002 ] raid0 reshape
  2009-05-26 11:51             ` Neil Brown
@ 2009-05-28 19:07               ` Goswin von Brederlow
  0 siblings, 0 replies; 17+ messages in thread
From: Goswin von Brederlow @ 2009-05-28 19:07 UTC (permalink / raw)
  To: Neil Brown
  Cc: Goswin von Brederlow, Greg Freemyer, Dan Williams, raz ben yehuda,
	linux-raid, Jacek Danecki, Labun, Marcin

Neil Brown <neilb@suse.de> writes:

> On Tuesday May 26, goswin-v-b@web.de wrote:
>> Neil Brown <neilb@suse.de> writes:
>> 
>> > On Monday May 25, goswin-v-b@web.de wrote:
>> >> That really seems to scream for LVM to support more raid levels. It
>> >> already has linear, raid0 and raid1 support (although I have no idea
>> >> how device mapper raid1 compares to md raid1).
>> >
>> > Note that LVM (a suite of user-space tools) could conceivably use
>> > md/raid1, md/raid5 etc. The functionality doesn't have to go in dm.
>> >
>> > Neil
>> 
>> How would you do this? Worst case, you can have an LV made up of totally
>> non-linear PEs, meaning lots of 4MB (default PE size) chunks in
>> random order on random disks.
>> 
>> Do you create a raid1/5 for each stripe? You surely run out of md
>> devices.
>
> We have 2^21 md devices easily (I think that is the number) and it
> wouldn't be hard to have more if that were an issue.
>
>> 
>> Create dm mappings for all stripe 0s, stripe 1s, stripe 2s, ... and
>> then a raid1/5 over those stripe devices?
>
> That might be an option.
>
>> 
>> What if the LV has segments with different raid configurations (number
>> of disks in a stripe or even different levels)? Create a raid for each
>> segment and then a dm mapping for a linear raid?
>>
>
> Yes.
>  
>> 
>> You can get a flood of intermediate devices there. A /proc/mdstat with
>> 200 entries would be horrible. iostat output would be totally
>> useless. ...
>>
>
> Yep, these would be interesting problems to solve.  /proc/mdstat is a
> bit of a wart on the design - making the entry in /proc/mdstat
> optional might be a good idea.

Resyncing in a way that uses parallelism without using a physical
device twice would also be difficult without merging all those layers
into one or peeking through them. The raid code doesn't see what
physical devices are in a device-mapper device, and so on.

Plus I do want ONE entry in /proc/mdstat (or equivalent) to see how a
resync is going. Just not 200. So it is not just about hiding but also
about showing something sensible.

> As for iostat - where does it get info from ? /proc/partitions? /proc/diskinfo?
> Maybe /sys/block?
> Either way, we could probably find a way to say "this block device is
> 'hidden'" .

One of those places.

> If you want to be able to slice and dice a lot of mini-raid arrays into
> an LVM system, then whatever way you implement it you will need to be
> keeping track of all those bits.  I think it makes most sense to use
> the "block device" as the common abstraction, then if we start finding
> issues: solve them.  That way the solutions become available for
> others to use in ways we hadn't expected.

I think the device mapper tables should suffice. They are perfect for
slice-and-dice operations. This should really sidestep the block device
overhead (alloc major/minor, send event, not runtime overhead) and
combine the status of many slices into one combined status.

I see one problem, though, with converting md code to dm code: the
metadata. In LVM every PE is basically independent and can be moved
around at will, so the raid code must be able to split and merge raid
devices at PE granularity at least. Specifically, the dirty/clean
information and serial counts are tricky.

There could be 2 options:

1) Put a little bit of metadata at the start of every PE. The first
block of each PE could also hold an internal bitmap for that PE and
not just a few bits of meta info and the clean/dirty byte. For internal
bitmaps this might be optimal, as it would guarantee short seeks to reach
the bits. (A rough sketch of such a per-PE header follows below.)

2) Have detached metadata. Md already has detached bitmaps. Think of
it as a raid without metadata but with an external bitmap.
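
To make option 1 concrete, here is a minimal sketch of what such a per-PE header might
look like (purely hypothetical: the structure name, field names and magic value are
invented for illustration and are not an existing md, dm or DDF format):

#include <stdint.h>

/* Hypothetical on-disk header for option 1: lives in the first block of
 * every PE, followed by a write-intent bitmap covering just that PE. */
#define PE_META_MAGIC 0x50456d64u       /* "PEmd", made up for this sketch */

struct pe_metadata {
	uint32_t magic;                 /* identifies the block as PE metadata */
	uint32_t version;               /* layout version of this header */
	uint64_t events;                /* per-PE event/serial counter */
	uint8_t  clean;                 /* 1 = cleanly shut down, 0 = dirty */
	uint8_t  pad[7];                /* keep the following fields aligned */
	uint32_t bitmap_bits;           /* number of valid bits in the bitmap */
	uint32_t sectors_per_bit;       /* sectors of this PE covered per bit */
	uint8_t  bitmap[];              /* write-intent bitmap, to end of block */
};

Keeping the bitmap in the same block as the header is what would give the short-seek
property mentioned above: each bit sits at most one PE away from the data it covers.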

>> MfG
>>         Goswin

MfG
        Goswin

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2009-05-28 19:07 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-05-02 21:46 Subject: [001/002 ] raid0 reshape raz ben yehuda
2009-05-10 22:31 ` Neil Brown
2009-05-12 16:59   ` Raz
2009-05-19 18:09 ` Dan Williams
2009-05-19 22:27   ` Raz
2009-05-21 11:48   ` Neil Brown
2009-05-21 12:33     ` OT: busting a gut (was Re: Subject: [001/002 ] raid0 reshape) John Robinson
2009-05-21 19:20     ` Subject: [001/002 ] raid0 reshape Greg Freemyer
2009-05-25 12:19       ` Goswin von Brederlow
2009-05-25 20:06         ` Raz
2009-05-27 21:55           ` Bill Davidsen
2009-05-25 22:14         ` Neil Brown
2009-05-26 11:17           ` Goswin von Brederlow
2009-05-26 11:51             ` Neil Brown
2009-05-28 19:07               ` Goswin von Brederlow
2009-05-22  7:53     ` Dan Williams
2009-05-23 22:33     ` Raz

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).