linux-raid.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 02/12] md/raid5: factor out code for changing size of stripe cache.
  2010-04-15  6:43 [PATCH 00/12] A dm-raid45 target implemented using md raid5 NeilBrown
                   ` (10 preceding siblings ...)
  2010-04-15  6:43 ` [PATCH 04/12] dm-raid456: add support for raising events to userspace NeilBrown
@ 2010-04-15  6:43 ` NeilBrown
  2010-04-15  8:52 ` [PATCH 00/12] A dm-raid45 target implemented using md raid5 Jeff Garzik
  2010-04-15 17:27 ` [dm-devel] " Heinz Mauelshagen
  13 siblings, 0 replies; 17+ messages in thread
From: NeilBrown @ 2010-04-15  6:43 UTC (permalink / raw)
  To: dm-devel; +Cc: linux-raid

Separate the actual 'change' code from the sysfs interface
so that it can eventually be called internally.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c |   38 +++++++++++++++++++++++++-------------
 1 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 1a2fe84..c644190 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4487,23 +4487,15 @@ raid5_show_stripe_cache_size(mddev_t *mddev, char *page)
 		return 0;
 }
 
-static ssize_t
-raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
+static int
+raid5_set_cache_size(mddev_t *mddev, int size)
 {
 	raid5_conf_t *conf = mddev->private;
-	unsigned long new;
 	int err;
 
-	if (len >= PAGE_SIZE)
+	if (size <= 16 || size > 32768)
 		return -EINVAL;
-	if (!conf)
-		return -ENODEV;
-
-	if (strict_strtoul(page, 10, &new))
-		return -EINVAL;
-	if (new <= 16 || new > 32768)
-		return -EINVAL;
-	while (new < conf->max_nr_stripes) {
+	while (size < conf->max_nr_stripes) {
 		if (drop_one_stripe(conf))
 			conf->max_nr_stripes--;
 		else
@@ -4512,11 +4504,31 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
 	err = md_allow_write(mddev);
 	if (err)
 		return err;
-	while (new > conf->max_nr_stripes) {
+	while (size > conf->max_nr_stripes) {
 		if (grow_one_stripe(conf))
 			conf->max_nr_stripes++;
 		else break;
 	}
+	return 0;
+}
+
+static ssize_t
+raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
+{
+	raid5_conf_t *conf = mddev->private;
+	unsigned long new;
+	int err;
+
+	if (len >= PAGE_SIZE)
+		return -EINVAL;
+	if (!conf)
+		return -ENODEV;
+
+	if (strict_strtoul(page, 10, &new))
+		return -EINVAL;
+	err = raid5_set_cache_size(mddev, new);
+	if (err)
+		return err;
 	return len;
 }
 



^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 01/12] md: reduce dependence on sysfs.
  2010-04-15  6:43 [PATCH 00/12] A dm-raid45 target implemented using md raid5 NeilBrown
                   ` (4 preceding siblings ...)
  2010-04-15  6:43 ` [PATCH 09/12] dm-raid456: support unplug NeilBrown
@ 2010-04-15  6:43 ` NeilBrown
  2010-04-15  6:43 ` [PATCH 05/12] raid5: Don't set read-ahead when there is no queue NeilBrown
                   ` (7 subsequent siblings)
  13 siblings, 0 replies; 17+ messages in thread
From: NeilBrown @ 2010-04-15  6:43 UTC (permalink / raw)
  To: dm-devel; +Cc: linux-raid

We will want md devices to live as dm targets where sysfs is not
visible.  So allow md to not connect to sysfs.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c    |  101 ++++++++++++++++++++++++----------------------------
 drivers/md/md.h    |   12 ++++++
 drivers/md/raid5.c |    8 ++--
 3 files changed, 62 insertions(+), 59 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index ad7e2b8..d4a9788 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -540,14 +540,16 @@ static void mddev_unlock(mddev_t * mddev)
 		mutex_lock(&mddev->open_mutex);
 		mutex_unlock(&mddev->reconfig_mutex);
 
-		if (to_remove != &md_redundancy_group)
-			sysfs_remove_group(&mddev->kobj, to_remove);
-		if (mddev->pers == NULL ||
-		    mddev->pers->sync_request == NULL) {
-			sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
-			if (mddev->sysfs_action)
-				sysfs_put(mddev->sysfs_action);
-			mddev->sysfs_action = NULL;
+		if (mddev->kobj.sd) {
+			if (to_remove != &md_redundancy_group)
+				sysfs_remove_group(&mddev->kobj, to_remove);
+			if (mddev->pers == NULL ||
+			    mddev->pers->sync_request == NULL) {
+				sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
+				if (mddev->sysfs_action)
+					sysfs_put(mddev->sysfs_action);
+				mddev->sysfs_action = NULL;
+			}
 		}
 		mutex_unlock(&mddev->open_mutex);
 	} else
@@ -1803,11 +1805,9 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 		goto fail;
 
 	ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
-	if ((err = sysfs_create_link(&rdev->kobj, ko, "block"))) {
-		kobject_del(&rdev->kobj);
-		goto fail;
-	}
-	rdev->sysfs_state = sysfs_get_dirent(rdev->kobj.sd, "state");
+	if (sysfs_create_link(&rdev->kobj, ko, "block"))
+		/* failure here is OK */
+	rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
 
 	list_add_rcu(&rdev->same_set, &mddev->disks);
 	bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
@@ -2334,8 +2334,8 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		set_bit(In_sync, &rdev->flags);
 		err = 0;
 	}
-	if (!err && rdev->sysfs_state)
-		sysfs_notify_dirent(rdev->sysfs_state);
+	if (!err)
+		sysfs_notify_dirent_safe(rdev->sysfs_state);
 	return err ? err : len;
 }
 static struct rdev_sysfs_entry rdev_state =
@@ -2430,14 +2430,10 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 			rdev->raid_disk = -1;
 			return err;
 		} else
-			sysfs_notify_dirent(rdev->sysfs_state);
+			sysfs_notify_dirent_safe(rdev->sysfs_state);
 		sprintf(nm, "rd%d", rdev->raid_disk);
 		if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
-			printk(KERN_WARNING
-			       "md: cannot register "
-			       "%s for %s\n",
-			       nm, mdname(rdev->mddev));
-
+			/* failure here is OK */;
 		/* don't wakeup anyone, leave that to userspace. */
 	} else {
 		if (slot >= rdev->mddev->raid_disks)
@@ -2447,7 +2443,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		clear_bit(Faulty, &rdev->flags);
 		clear_bit(WriteMostly, &rdev->flags);
 		set_bit(In_sync, &rdev->flags);
-		sysfs_notify_dirent(rdev->sysfs_state);
+		sysfs_notify_dirent_safe(rdev->sysfs_state);
 	}
 	return len;
 }
@@ -3406,7 +3402,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
 	if (err)
 		return err;
 	else {
-		sysfs_notify_dirent(mddev->sysfs_state);
+		sysfs_notify_dirent_safe(mddev->sysfs_state);
 		return len;
 	}
 }
@@ -3704,7 +3700,7 @@ action_store(mddev_t *mddev, const char *page, size_t len)
 	}
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
-	sysfs_notify_dirent(mddev->sysfs_action);
+	sysfs_notify_dirent_safe(mddev->sysfs_action);
 	return len;
 }
 
@@ -4250,13 +4246,14 @@ static int md_alloc(dev_t dev, char *name)
 		       disk->disk_name);
 		error = 0;
 	}
-	if (sysfs_create_group(&mddev->kobj, &md_bitmap_group))
+	if (mddev->kobj.sd &&
+	    sysfs_create_group(&mddev->kobj, &md_bitmap_group))
 		printk(KERN_DEBUG "pointless warning\n");
  abort:
 	mutex_unlock(&disks_mutex);
-	if (!error) {
+	if (!error && mddev->kobj.sd) {
 		kobject_uevent(&mddev->kobj, KOBJ_ADD);
-		mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state");
+		mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
 	}
 	mddev_put(mddev);
 	return error;
@@ -4294,7 +4291,7 @@ static void md_safemode_timeout(unsigned long data)
 	if (!atomic_read(&mddev->writes_pending)) {
 		mddev->safemode = 1;
 		if (mddev->external)
-			sysfs_notify_dirent(mddev->sysfs_state);
+			sysfs_notify_dirent_safe(mddev->sysfs_state);
 	}
 	md_wakeup_thread(mddev->thread);
 }
@@ -4366,7 +4363,7 @@ static int md_run(mddev_t *mddev)
 				return -EINVAL;
 			}
 		}
-		sysfs_notify_dirent(rdev->sysfs_state);
+		sysfs_notify_dirent_safe(rdev->sysfs_state);
 	}
 
 	spin_lock(&pers_lock);
@@ -4465,11 +4462,12 @@ static int md_run(mddev_t *mddev)
 		return err;
 	}
 	if (mddev->pers->sync_request) {
-		if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
+		if (mddev->kobj.sd &&
+		    sysfs_create_group(&mddev->kobj, &md_redundancy_group))
 			printk(KERN_WARNING
 			       "md: cannot register extra attributes for %s\n",
 			       mdname(mddev));
-		mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
+		mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
 	} else if (mddev->ro == 2) /* auto-readonly not meaningful */
 		mddev->ro = 0;
 
@@ -4487,8 +4485,7 @@ static int md_run(mddev_t *mddev)
 			char nm[20];
 			sprintf(nm, "rd%d", rdev->raid_disk);
 			if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm))
-				printk("md: cannot register %s for %s\n",
-				       nm, mdname(mddev));
+				/* failure here is OK */;
 		}
 	
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -4500,9 +4497,8 @@ static int md_run(mddev_t *mddev)
 	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
 
 	md_new_event(mddev);
-	sysfs_notify_dirent(mddev->sysfs_state);
-	if (mddev->sysfs_action)
-		sysfs_notify_dirent(mddev->sysfs_action);
+	sysfs_notify_dirent_safe(mddev->sysfs_state);
+	sysfs_notify_dirent_safe(mddev->sysfs_action);
 	sysfs_notify(&mddev->kobj, NULL, "degraded");
 	return 0;
 }
@@ -4542,7 +4538,7 @@ static int restart_array(mddev_t *mddev)
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
 	md_wakeup_thread(mddev->sync_thread);
-	sysfs_notify_dirent(mddev->sysfs_state);
+	sysfs_notify_dirent_safe(mddev->sysfs_state);
 	return 0;
 }
 
@@ -4665,7 +4661,7 @@ static int md_set_readonly(mddev_t *mddev, int is_open)
 		mddev->ro = 1;
 		set_disk_ro(mddev->gendisk, 1);
 		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
-		sysfs_notify_dirent(mddev->sysfs_state);
+		sysfs_notify_dirent_safe(mddev->sysfs_state);
 		err = 0;	
 	}
 out:
@@ -4698,7 +4694,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 		mddev->queue->backing_dev_info.congested_fn = NULL;
 
 		/* tell userspace to handle 'inactive' */
-		sysfs_notify_dirent(mddev->sysfs_state);
+		sysfs_notify_dirent_safe(mddev->sysfs_state);
 
 		list_for_each_entry(rdev, &mddev->disks, same_set)
 			if (rdev->raid_disk >= 0) {
@@ -4744,7 +4740,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 	err = 0;
 	blk_integrity_unregister(disk);
 	md_new_event(mddev);
-	sysfs_notify_dirent(mddev->sysfs_state);
+	sysfs_notify_dirent_safe(mddev->sysfs_state);
 	return err;
 }
 
@@ -5106,7 +5102,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 		if (err)
 			export_rdev(rdev);
 		else
-			sysfs_notify_dirent(rdev->sysfs_state);
+			sysfs_notify_dirent_safe(rdev->sysfs_state);
 
 		md_update_sb(mddev, 1);
 		if (mddev->degraded)
@@ -5752,7 +5748,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 	if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) {
 		if (mddev->ro == 2) {
 			mddev->ro = 0;
-			sysfs_notify_dirent(mddev->sysfs_state);
+			sysfs_notify_dirent_safe(mddev->sysfs_state);
 			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 			md_wakeup_thread(mddev->thread);
 		} else {
@@ -5997,7 +5993,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
 	mddev->pers->error_handler(mddev,rdev);
 	if (mddev->degraded)
 		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
-	sysfs_notify_dirent(rdev->sysfs_state);
+	sysfs_notify_dirent_safe(rdev->sysfs_state);
 	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
@@ -6458,7 +6454,7 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
 		spin_unlock_irq(&mddev->write_lock);
 	}
 	if (did_change)
-		sysfs_notify_dirent(mddev->sysfs_state);
+		sysfs_notify_dirent_safe(mddev->sysfs_state);
 	wait_event(mddev->sb_wait,
 		   !test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
 		   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
@@ -6501,7 +6497,7 @@ int md_allow_write(mddev_t *mddev)
 			mddev->safemode = 1;
 		spin_unlock_irq(&mddev->write_lock);
 		md_update_sb(mddev, 0);
-		sysfs_notify_dirent(mddev->sysfs_state);
+		sysfs_notify_dirent_safe(mddev->sysfs_state);
 	} else
 		spin_unlock_irq(&mddev->write_lock);
 
@@ -6887,10 +6883,7 @@ static int remove_and_add_spares(mddev_t *mddev)
 					sprintf(nm, "rd%d", rdev->raid_disk);
 					if (sysfs_create_link(&mddev->kobj,
 							      &rdev->kobj, nm))
-						printk(KERN_WARNING
-						       "md: cannot register "
-						       "%s for %s\n",
-						       nm, mdname(mddev));
+						/* failure here is OK */
 					spares++;
 					md_new_event(mddev);
 					set_bit(MD_CHANGE_DEVS, &mddev->flags);
@@ -6983,7 +6976,7 @@ void md_check_recovery(mddev_t *mddev)
 				mddev->safemode = 0;
 			spin_unlock_irq(&mddev->write_lock);
 			if (did_change)
-				sysfs_notify_dirent(mddev->sysfs_state);
+				sysfs_notify_dirent_safe(mddev->sysfs_state);
 		}
 
 		if (mddev->flags)
@@ -7022,7 +7015,7 @@ void md_check_recovery(mddev_t *mddev)
 			mddev->recovery = 0;
 			/* flag recovery needed just to double check */
 			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-			sysfs_notify_dirent(mddev->sysfs_action);
+			sysfs_notify_dirent_safe(mddev->sysfs_action);
 			md_new_event(mddev);
 			goto unlock;
 		}
@@ -7084,7 +7077,7 @@ void md_check_recovery(mddev_t *mddev)
 				mddev->recovery = 0;
 			} else
 				md_wakeup_thread(mddev->sync_thread);
-			sysfs_notify_dirent(mddev->sysfs_action);
+			sysfs_notify_dirent_safe(mddev->sysfs_action);
 			md_new_event(mddev);
 		}
 	unlock:
@@ -7093,7 +7086,7 @@ void md_check_recovery(mddev_t *mddev)
 			if (test_and_clear_bit(MD_RECOVERY_RECOVER,
 					       &mddev->recovery))
 				if (mddev->sysfs_action)
-					sysfs_notify_dirent(mddev->sysfs_action);
+					sysfs_notify_dirent_safe(mddev->sysfs_action);
 		}
 		mddev_unlock(mddev);
 	}
@@ -7101,7 +7094,7 @@ void md_check_recovery(mddev_t *mddev)
 
 void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
 {
-	sysfs_notify_dirent(rdev->sysfs_state);
+	sysfs_notify_dirent_safe(rdev->sysfs_state);
 	wait_event_timeout(rdev->blocked_wait,
 			   !test_bit(Blocked, &rdev->flags),
 			   msecs_to_jiffies(5000));
diff --git a/drivers/md/md.h b/drivers/md/md.h
index a536f54..3687331 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -373,6 +373,18 @@ struct md_sysfs_entry {
 };
 extern struct attribute_group md_bitmap_group;
 
+static inline struct sysfs_dirent *sysfs_get_dirent_safe(struct sysfs_dirent *sd, char *name)
+{
+	if (sd)
+		return sysfs_get_dirent(sd, name);
+	return sd;
+}
+static inline void sysfs_notify_dirent_safe(struct sysfs_dirent *sd)
+{
+	if (sd)
+		sysfs_notify_dirent(sd);
+}
+
 static inline char * mdname (mddev_t * mddev)
 {
 	return mddev->gendisk ? mddev->gendisk->disk_name : "mdX";
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2882a26..1a2fe84 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5085,7 +5085,8 @@ static int run(mddev_t *mddev)
 	/* Ok, everything is just fine now */
 	if (mddev->to_remove == &raid5_attrs_group)
 		mddev->to_remove = NULL;
-	else if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
+	else if (mddev->kobj.sd &&
+	    sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
 		printk(KERN_WARNING
 		       "raid5: failed to create sysfs attributes for %s\n",
 		       mdname(mddev));
@@ -5466,10 +5467,7 @@ static int raid5_start_reshape(mddev_t *mddev)
 				sprintf(nm, "rd%d", rdev->raid_disk);
 				if (sysfs_create_link(&mddev->kobj,
 						      &rdev->kobj, nm))
-					printk(KERN_WARNING
-					       "raid5: failed to create "
-					       " link %s for %s\n",
-					       nm, mdname(mddev));
+					/* Failure here is OK */;
 			} else
 				break;
 		}



^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 04/12] dm-raid456: add support for raising events to userspace.
  2010-04-15  6:43 [PATCH 00/12] A dm-raid45 target implemented using md raid5 NeilBrown
                   ` (9 preceding siblings ...)
  2010-04-15  6:43 ` [PATCH 10/12] dm-raid456: add support for setting IO hints NeilBrown
@ 2010-04-15  6:43 ` NeilBrown
  2010-04-15  6:43 ` [PATCH 02/12] md/raid5: factor out code for changing size of stripe cache NeilBrown
                   ` (2 subsequent siblings)
  13 siblings, 0 replies; 17+ messages in thread
From: NeilBrown @ 2010-04-15  6:43 UTC (permalink / raw)
  To: dm-devel; +Cc: linux-raid

Userspace needs to know about failure events.  DM handles
those through the DM_DEV_WAIT_CMD ioctl.

So allow md_error to be given some work to do on an error,
and arrange that work to signal dm.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/dm-raid456.c |    8 ++++++++
 drivers/md/md.c         |    2 ++
 drivers/md/md.h         |    1 +
 3 files changed, 11 insertions(+), 0 deletions(-)

diff --git a/drivers/md/dm-raid456.c b/drivers/md/dm-raid456.c
index 0b89f9a..373784d 100644
--- a/drivers/md/dm-raid456.c
+++ b/drivers/md/dm-raid456.c
@@ -138,6 +138,13 @@ static int dev_parms(struct raid_set *rs, char **argv)
 	return 0;
 }
 
+static void do_table_event(struct work_struct *ws)
+{
+	struct raid_set *rs = container_of(ws, struct raid_set,
+					   md.event_work);
+	dm_table_event(rs->ti->table);
+}
+
 /*
  * Construct a RAID4/5/6 mapping:
  * Args:
@@ -289,6 +296,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	if (rs->md.raid_disks - in_sync > rt->parity_devs)
 		goto err;
 	
+	INIT_WORK(&rs->md.event_work, do_table_event);
 	ti->split_io = rs->md.chunk_sectors;
 	ti->private = rs;
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 6f082bf..2042b1c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6005,6 +6005,8 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
 	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
+	if (mddev->event_work.func)
+		schedule_work(&mddev->event_work);
 	md_new_event_inintr(mddev);
 }
 
diff --git a/drivers/md/md.h b/drivers/md/md.h
index aaadb53..09a2881 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -309,6 +309,7 @@ struct mddev_s
 	struct bio *barrier;
 	atomic_t flush_pending;
 	struct work_struct barrier_work;
+	struct work_struct event_work;
 };
 
 



^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 03/12] md/dm: create dm-raid456 module using md/raid5
  2010-04-15  6:43 [PATCH 00/12] A dm-raid45 target implemented using md raid5 NeilBrown
                   ` (6 preceding siblings ...)
  2010-04-15  6:43 ` [PATCH 05/12] raid5: Don't set read-ahead when there is no queue NeilBrown
@ 2010-04-15  6:43 ` NeilBrown
  2010-04-15  6:43 ` [PATCH 11/12] dm-raid456: add suspend/resume method NeilBrown
                   ` (5 subsequent siblings)
  13 siblings, 0 replies; 17+ messages in thread
From: NeilBrown @ 2010-04-15  6:43 UTC (permalink / raw)
  To: dm-devel; +Cc: linux-raid

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/Kconfig      |    8 +
 drivers/md/Makefile     |    1 
 drivers/md/dm-raid456.c |  436 +++++++++++++++++++++++++++++++++++++++++++++++
 drivers/md/md.c         |   42 +++--
 drivers/md/md.h         |    4 
 5 files changed, 474 insertions(+), 17 deletions(-)
 create mode 100644 drivers/md/dm-raid456.c

diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index acb3a4e..a591c54 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -256,6 +256,14 @@ config DM_MIRROR
          Allow volume managers to mirror logical volumes, also
          needed for live data migration tools such as 'pvmove'.
 
+config DM_RAID456
+       tristate "RAID 4/5/6 target (EXPERIMENTAL)"
+       depends on BLK_DEV_DM && MD_RAID456 && EXPERIMENTAL
+       ---help---
+       A dm target that supports RAID4 RAID5 and RAID6 mapping
+
+       If unsure, say N.
+
 config DM_LOG_USERSPACE
 	tristate "Mirror userspace logging (EXPERIMENTAL)"
 	depends on DM_MIRROR && EXPERIMENTAL && NET
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index e355e7f..0734fba 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -44,6 +44,7 @@ obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot.o
 obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o dm-log.o dm-region-hash.o
 obj-$(CONFIG_DM_LOG_USERSPACE)	+= dm-log-userspace.o
 obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
+obj-$(CONFIG_DM_RAID456)	+= dm-raid456.o
 
 quiet_cmd_unroll = UNROLL  $@
       cmd_unroll = $(AWK) -f$(srctree)/$(src)/unroll.awk -vN=$(UNROLL) \
diff --git a/drivers/md/dm-raid456.c b/drivers/md/dm-raid456.c
new file mode 100644
index 0000000..0b89f9a
--- /dev/null
+++ b/drivers/md/dm-raid456.c
@@ -0,0 +1,436 @@
+
+/*
+ * dm-raid456 - implemented as wrapper for md/raid456
+ *
+ */
+#include "md.h"
+#include "raid5.h"
+#include "dm.h"
+
+struct raid_dev {
+	struct dm_dev *dev;
+	struct mdk_rdev_s rdev;
+};
+
+struct raid_set {
+	struct dm_target *ti;
+	struct mddev_s md;
+	struct raid_type *raid_type;
+	struct raid_dev dev[0];
+};
+
+/* Supported raid types and properties. */
+static struct raid_type {
+	const char *name;		/* RAID algorithm. */
+	const char *descr;		/* Descriptor text for logging. */
+	const unsigned parity_devs;	/* # of parity devices. */
+	const unsigned minimal_devs;	/* minimal # of devices in set. */
+	const unsigned level;		/* RAID level. */
+	const unsigned algorithm;	/* RAID algorithm. */
+} raid_types[] = {
+	{"raid4",    "RAID4 (dedicated parity disk)",	1, 2, 5, ALGORITHM_PARITY_0},
+	{"raid5_la", "RAID5 (left asymmetric)",		1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
+	{"raid5_ra", "RAID5 (right asymmetric)",	1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
+	{"raid5_ls", "RAID5 (left symmetric)",		1, 2, 5, ALGORITHM_LEFT_SYMMETRIC},
+	{"raid5_rs", "RAID5 (right symmetric)",		1, 2, 5, ALGORITHM_RIGHT_SYMMETRIC},
+	{"raid6_zr", "RAID6 (zero restart)",		2, 4, 6, ALGORITHM_ROTATING_ZERO_RESTART },
+	{"raid6_nr", "RAID6 (N restart)",		2, 4, 6, ALGORITHM_ROTATING_N_RESTART},
+	{"raid6_nc", "RAID6 (N continue)",		2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
+};
+
+static struct raid_type *get_raid_type(char *name)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(raid_types); i++)
+		if (strcmp(raid_types[i].name, name) == 0)
+			return &raid_types[i];
+	return NULL;
+}
+
+static struct raid_set *
+context_alloc(struct raid_type *raid_type,
+	      unsigned long chunk_size,
+	      int recovery,
+	      long raid_devs, sector_t sectors_per_dev,
+	      struct dm_target *ti)
+{
+	struct raid_set *rs;
+
+	rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]),
+		     GFP_KERNEL);
+	if (!rs) {
+		ti->error = "Cannot allocate raid context";
+		return ERR_PTR(-ENOMEM);
+	}
+
+	mddev_init(&rs->md);
+
+	rs->ti = ti;
+	rs->raid_type = raid_type;
+	rs->md.raid_disks = raid_devs;
+	rs->md.level = raid_type->level;
+	rs->md.dev_sectors = sectors_per_dev;
+	rs->md.persistent = 1;
+	rs->md.external = 1;
+	rs->md.layout = raid_type->algorithm;
+	rs->md.chunk_sectors = chunk_size;
+	rs->md.recovery_cp = recovery ? 0 : MaxSector;
+
+	rs->md.new_level = rs->md.level;
+	rs->md.new_chunk_sectors = rs->md.chunk_sectors;
+	rs->md.new_layout = rs->md.layout;
+	rs->md.delta_disks = 0;
+
+	return rs;
+}
+
+static void context_free(struct raid_set *rs)
+{
+	int i;
+	for (i=0; i<rs->md.raid_disks; i++)
+		if (rs->dev[i].dev)
+			dm_put_device(rs->ti, rs->dev[i].dev);
+	kfree(rs);
+}
+
+/* For every device we have three words
+ *  device name, or "-" if missing
+ *  offset from start of devices, in sectors
+ *
+ * This code parses those words.
+ */
+static int dev_parms(struct raid_set *rs, char **argv)
+{
+	int i;
+
+	for (i = 0; i < rs->md.raid_disks; i++, argv += 2) {
+		int err = 0;
+		unsigned long long offset;
+
+		md_rdev_init(&rs->dev[i].rdev);
+		rs->dev[i].rdev.raid_disk = i;
+
+		if (strcmp(argv[0], "-") == 0)
+			rs->dev[i].dev = NULL;
+		else
+			err = dm_get_device(rs->ti, argv[0],
+					    dm_table_get_mode(rs->ti->table) ,
+					    &rs->dev[i].dev);
+		if (err) {
+			rs->ti->error = "RAID device lookup failure";
+			return err;
+		}
+		if (strict_strtoull(argv[1], 10, &offset) < 0) {
+			rs->ti->error = "RAID device offset is bad";
+			return -EINVAL;
+		}
+		rs->dev[i].rdev.data_offset = offset;
+
+		set_bit(In_sync, &rs->dev[i].rdev.flags);
+
+		rs->dev[i].rdev.mddev = &rs->md;
+		if (rs->dev[i].dev) {
+			rs->dev[i].rdev.bdev = rs->dev[i].dev->bdev;
+			list_add(&rs->dev[i].rdev.same_set, &rs->md.disks);
+		}
+	}
+	return 0;
+}
+
+/*
+ * Construct a RAID4/5/6 mapping:
+ * Args:
+ *   log_type #log_params <log_params> \
+ *   raid_type #raid_params <raid_params> \
+ *   rebuild-drive-A [rebuild-drive-B] \
+ *   #raid_devs { <dev_path> <offset>  }
+ *        (a missing device is identified by dev_path == "-")
+ *
+ *  log_type must be 'core'. We ignore region_size and use sync/nosync to
+ *                decide if a resync is needed.
+ *  raid_type is from "raid_types" above
+ *  There are as many 'rebuild-drives' as 'parity_devs' in the raid_type.
+ *  -1 means no drive needs rebuilding.
+ *  raid_params are:
+ *    chunk_size  - in sectors, must be power of 2
+ */
+static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+	char *err = NULL;
+	int errnum = -EINVAL;
+	unsigned long cnt;
+	struct raid_type *rt;
+	unsigned long chunk_size;
+	int recovery = 1;
+	long raid_devs;
+	long rebuildA, rebuildB;
+	sector_t sectors_per_dev, chunks;
+	struct raid_set *rs = NULL;
+	int in_sync, i;
+
+	/* log type - core XXX [no]sync */
+	err = "Cannot parse log type";
+	if (argc < 2 ||
+	    strcmp(argv[0], "core") != 0 ||
+	    strict_strtoul(argv[1], 10, &cnt) < 0 ||
+	    cnt + 2 > argc)
+		goto err;
+	if (cnt >= 2 && strcmp(argv[3], "sync") == 0)
+		recovery = 0;
+	argc -= cnt+2;
+	argv += cnt+2;
+
+	/* raid type */
+	err = "Cannot find raid_type";
+	if (argc < 1 ||
+	    (rt = get_raid_type(argv[0])) == NULL)
+		goto err;
+	argc--; argv++;
+
+	/* number of parameters */
+	err = "Cannot understand number of RAID parameters";
+	if (argc < 1 ||
+	    strict_strtoul(argv[0], 10, &cnt) < 0 ||
+	    cnt + 1 > argc)
+		goto err;
+	argc--; argv++;
+
+	/* chunk size */
+	if (cnt) {
+		err = "Bad chunk size";
+		if (strict_strtoul(argv[0], 10, &chunk_size) < 0
+		    || !is_power_of_2(chunk_size)
+		    || chunk_size < 8
+			)
+			goto err;
+		cnt--; argc--; argv++;
+	}
+	/* Skip any extra args */
+	argc -= cnt;
+	argv += cnt;
+
+	/* drives needing rebuild */
+	err = "Cannot parse rebuild-drives";
+	if (argc < 1 ||
+	    strict_strtol(argv[0], 10, &rebuildA) < 0)
+		goto err;
+	argc--; argv++;
+
+	rebuildB = -1;
+	if (rt->parity_devs == 2) {
+		if (argc < 1 ||
+		    strict_strtol(argv[0], 10, &rebuildB) < 0)
+			goto err;
+		argc--; argv++;
+	}
+
+	/* number of raid devs */
+	err = "Bad number of raid devices";
+	if (argc < 1 ||
+	    strict_strtol(argv[0], 10, &raid_devs) < 0 ||
+	    raid_devs < rt->minimal_devs)
+		goto err;
+
+	err = "Bad number for rebuild device";
+	if (rebuildA < -1 || rebuildB < -1 ||
+	    rebuildA >= raid_devs || rebuildB >= raid_devs)
+		goto err;
+
+	argc--; argv++;
+	err = "Wrong number of arguments for number of raid devices";
+	if (argc != raid_devs * 2)
+		goto err;
+
+	/* check the sizes all match */
+	sectors_per_dev = ti->len;
+	err = "Target length not divisible by number of data devices";
+	if (sector_div(sectors_per_dev, (raid_devs - rt->minimal_devs)))
+		goto err;
+	chunks = sectors_per_dev;
+	err = "Device length not divisible by chunk_size";
+	if (sector_div(chunks, chunk_size))
+		goto err;
+
+
+	/* Now the devices: three words each */
+	rs = context_alloc(rt, chunk_size, recovery,
+			   raid_devs, sectors_per_dev,
+			   ti);
+	if (IS_ERR(rs))
+		return PTR_ERR(rs);
+
+	errnum = dev_parms(rs, argv);
+	if (errnum) {
+		err = ti->error;
+		goto err;
+	}
+	errnum = EINVAL;
+
+	err = "Rebuild device not present";
+	if (rebuildA >= 0) {
+		if (rs->dev[rebuildA].dev == NULL)
+			goto err;
+		clear_bit(In_sync, &rs->dev[rebuildA].rdev.flags);
+		rs->dev[rebuildA].rdev.recovery_offset = 0;
+	}
+	if (rebuildB >= 0) {
+		if (rs->dev[rebuildB].dev == NULL)
+			goto err;
+		clear_bit(In_sync, &rs->dev[rebuildB].rdev.flags);
+		rs->dev[rebuildB].rdev.recovery_offset = 0;
+	}
+	in_sync = 0;
+	for (i = 0; i < rs->md.raid_disks; i++)
+		if (rs->dev[i].dev &&
+		    test_bit(In_sync, &rs->dev[i].rdev.flags))
+			in_sync ++;
+	err = "Insufficient active RAID devices";
+	if (rs->md.raid_disks - in_sync > rt->parity_devs)
+		goto err;
+	
+	ti->split_io = rs->md.chunk_sectors;
+	ti->private = rs;
+
+	mutex_lock(&rs->md.reconfig_mutex);
+	err = "Fail to run raid array";
+	errnum = md_run(&rs->md);
+	rs->md.in_sync = 0; /* Assume already marked dirty */
+	mutex_unlock(&rs->md.reconfig_mutex);
+	
+	if (errnum)
+		goto err;
+	return 0;
+err:
+	if (rs)
+		context_free(rs);
+	ti->error = err;
+	return errnum;
+}
+
+static void raid_dtr(struct dm_target *ti)
+{
+	struct raid_set *rs = ti->private;
+
+	md_stop(&rs->md);
+	context_free(rs);
+}
+
+static int raid_map(struct dm_target *ti, struct bio *bio,
+		    union map_info *map_context)
+{
+	struct raid_set *rs = ti->private;
+	mddev_t *mddev = &rs->md;
+
+	mddev->pers->make_request(mddev, bio);
+	return DM_MAPIO_SUBMITTED;
+}
+	
+static int raid_status(struct dm_target *ti, status_type_t type,
+		       char *result, unsigned maxlen)
+{
+	struct raid_set *rs = ti->private;
+	struct raid5_private_data *conf = rs->md.private;
+	int sz = 0;
+	int rbcnt;
+	int i;
+	sector_t sync;
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		DMEMIT("%u ", rs->md.raid_disks);
+		for (i = 0; i < rs->md.raid_disks; i++) {
+			if (rs->dev[i].dev)
+				DMEMIT("%s ", rs->dev[i].dev->name);
+			else
+				DMEMIT("- ");
+		}
+		for (i = 0; i < rs->md.raid_disks; i++) {
+			if (test_bit(Faulty, &rs->dev[i].rdev.flags))
+				DMEMIT("D");
+			else if (test_bit(In_sync, &rs->dev[i].rdev.flags))
+				DMEMIT("A");
+			else
+				DMEMIT("Ai");
+		}
+		DMEMIT(" %u ", conf->max_nr_stripes);
+		if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
+			sync = rs->md.curr_resync_completed;
+		else
+			sync = rs->md.recovery_cp;
+		if (sync > rs->md.resync_max_sectors)
+			sync = rs->md.resync_max_sectors;
+		DMEMIT("%llu/%llu ",
+		       (unsigned long long) sync,
+		       (unsigned long long) rs->md.resync_max_sectors);
+		DMEMIT("1 core");
+		       
+		break;
+	case STATUSTYPE_TABLE:
+		/* The string you would use to construct this array */
+		/* Pretend to use a core log with a region size of 1 sector */
+		DMEMIT("core 2 %u %ssync ", 1, 
+		       rs->md.recovery_cp == MaxSector ? "": "no");
+		DMEMIT("%s ", rs->raid_type->name);
+		DMEMIT("1 %u ", rs->md.chunk_sectors);
+
+		/* Print 1 or 2 rebuild_dev numbers */
+		rbcnt = 0;
+		for (i=0; i < rs->md.raid_disks; i++)
+			if (rs->dev[i].dev &&
+			    !test_bit(In_sync, &rs->dev[i].rdev.flags) &&
+			    rbcnt < rs->raid_type->parity_devs) {
+				DMEMIT("%u ", i);
+				rbcnt ++;
+			}
+		while (rbcnt < rs->raid_type->parity_devs) {
+			DMEMIT("-1 ");
+			rbcnt++;
+		}
+
+		DMEMIT("%u ", rs->md.raid_disks);
+		for (i=0; i < rs->md.raid_disks; i++) {
+			mdk_rdev_t *rdev = &rs->dev[i].rdev;
+
+			if (rs->dev[i].dev)
+				DMEMIT("%s ", rs->dev[i].dev->name);
+			else
+				DMEMIT("- ");
+
+			DMEMIT("%llu ", (unsigned long long)rdev->data_offset);
+		}			       
+		break;
+	}
+	return 0;
+}
+
+static struct target_type raid_target = {
+	.name = "raid45",
+	.version = {1, 0, 0},
+	.module = THIS_MODULE,
+	.ctr = raid_ctr,
+	.dtr = raid_dtr,
+	.map = raid_map,
+	.status = raid_status,
+};
+
+static int __init dm_raid_init(void)
+{
+	int r = dm_register_target(&raid_target);
+
+	return r;
+}
+
+static void __exit dm_raid_exit(void)
+{
+	dm_unregister_target(&raid_target);
+}
+
+module_init(dm_raid_init);
+module_exit(dm_raid_exit);
+
+MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("dm-raid4");
+MODULE_ALIAS("dm-raid5");
+MODULE_ALIAS("dm-raid6");
diff --git a/drivers/md/md.c b/drivers/md/md.c
index d4a9788..6f082bf 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -415,7 +415,7 @@ static void mddev_put(mddev_t *mddev)
 	spin_unlock(&all_mddevs_lock);
 }
 
-static void mddev_init(mddev_t *mddev)
+void mddev_init(mddev_t *mddev)
 {
 	mutex_init(&mddev->open_mutex);
 	mutex_init(&mddev->reconfig_mutex);
@@ -435,6 +435,7 @@ static void mddev_init(mddev_t *mddev)
 	mddev->resync_max = MaxSector;
 	mddev->level = LEVEL_NONE;
 }
+EXPORT_SYMBOL_GPL(mddev_init);
 
 static mddev_t * mddev_find(dev_t unit)
 {
@@ -2691,6 +2692,24 @@ static struct kobj_type rdev_ktype = {
 	.default_attrs	= rdev_default_attrs,
 };
 
+void md_rdev_init(mdk_rdev_t *rdev)
+{
+	rdev->desc_nr = -1;
+	rdev->saved_raid_disk = -1;
+	rdev->raid_disk = -1;
+	rdev->flags = 0;
+	rdev->data_offset = 0;
+	rdev->sb_events = 0;
+	rdev->last_read_error.tv_sec  = 0;
+	rdev->last_read_error.tv_nsec = 0;
+	atomic_set(&rdev->nr_pending, 0);
+	atomic_set(&rdev->read_errors, 0);
+	atomic_set(&rdev->corrected_errors, 0);
+
+	INIT_LIST_HEAD(&rdev->same_set);
+	init_waitqueue_head(&rdev->blocked_wait);
+}
+EXPORT_SYMBOL_GPL(md_rdev_init);
 /*
  * Import a device. If 'super_format' >= 0, then sanity check the superblock
  *
@@ -2714,6 +2733,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
 		return ERR_PTR(-ENOMEM);
 	}
 
+	md_rdev_init(rdev);
 	if ((err = alloc_disk_sb(rdev)))
 		goto abort_free;
 
@@ -2723,18 +2743,6 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
 
 	kobject_init(&rdev->kobj, &rdev_ktype);
 
-	rdev->desc_nr = -1;
-	rdev->saved_raid_disk = -1;
-	rdev->raid_disk = -1;
-	rdev->flags = 0;
-	rdev->data_offset = 0;
-	rdev->sb_events = 0;
-	rdev->last_read_error.tv_sec  = 0;
-	rdev->last_read_error.tv_nsec = 0;
-	atomic_set(&rdev->nr_pending, 0);
-	atomic_set(&rdev->read_errors, 0);
-	atomic_set(&rdev->corrected_errors, 0);
-
 	size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
 	if (!size) {
 		printk(KERN_WARNING 
@@ -2763,8 +2771,6 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
 		}
 	}
 
-	INIT_LIST_HEAD(&rdev->same_set);
-	init_waitqueue_head(&rdev->blocked_wait);
 
 	return rdev;
 
@@ -4298,7 +4304,7 @@ static void md_safemode_timeout(unsigned long data)
 
 static int start_dirty_degraded;
 
-static int md_run(mddev_t *mddev)
+int md_run(mddev_t *mddev)
 {
 	int err;
 	mdk_rdev_t *rdev;
@@ -4502,6 +4508,7 @@ static int md_run(mddev_t *mddev)
 	sysfs_notify(&mddev->kobj, NULL, "degraded");
 	return 0;
 }
+EXPORT_SYMBOL_GPL(md_run);
 
 static int do_md_run(mddev_t *mddev)
 {
@@ -4631,7 +4638,7 @@ static void md_stop_writes(mddev_t *mddev)
 	}
 }
 
-static void md_stop(mddev_t *mddev)
+void md_stop(mddev_t *mddev)
 {
 	md_stop_writes(mddev);
 
@@ -4642,6 +4649,7 @@ static void md_stop(mddev_t *mddev)
 	mddev->pers = NULL;
 	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 }
+EXPORT_SYMBOL_GPL(md_stop);
 
 static int md_set_readonly(mddev_t *mddev, int is_open)
 {
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 3687331..aaadb53 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -478,4 +478,8 @@ extern void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
 extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
 extern void restore_bitmap_write_access(struct file *file);
 
+extern void mddev_init(mddev_t *mddev);
+extern int md_run(mddev_t *mddev);
+extern void md_stop(mddev_t *mddev);
+extern void md_rdev_init(mdk_rdev_t *rdev);
 #endif /* _MD_MD_H */



^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 05/12] raid5: Don't set read-ahead when there is no queue
  2010-04-15  6:43 [PATCH 00/12] A dm-raid45 target implemented using md raid5 NeilBrown
                   ` (5 preceding siblings ...)
  2010-04-15  6:43 ` [PATCH 01/12] md: reduce dependence on sysfs NeilBrown
@ 2010-04-15  6:43 ` NeilBrown
  2010-04-15  6:43 ` [PATCH 03/12] md/dm: create dm-raid456 module using md/raid5 NeilBrown
                   ` (6 subsequent siblings)
  13 siblings, 0 replies; 17+ messages in thread
From: NeilBrown @ 2010-04-15  6:43 UTC (permalink / raw)
  To: dm-devel; +Cc: linux-raid

dm-raid456 does not provide a 'queue' for raid5 to use,
so we must make raid5 stop depending on the queue.

First: read_ahead
dm handles read-ahead adjustment fully in userspace, so
simply don't do any readahead adjustments if there is
no queue.

Also re-arrange code slightly so all the accesses to ->queue are
together.

Finally, move the blk_queue_merge_bvec function into the 'if' as
the ->split_io setting in dm-raid456 has the same effect.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c |   30 ++++++++++++++++--------------
 1 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index c644190..3e97950 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5083,16 +5083,6 @@ static int run(mddev_t *mddev)
 							"reshape");
 	}
 
-	/* read-ahead size must cover two whole stripes, which is
-	 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
-	 */
-	{
-		int data_disks = conf->previous_raid_disks - conf->max_degraded;
-		int stripe = data_disks *
-			((mddev->chunk_sectors << 9) / PAGE_SIZE);
-		if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
-			mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
-	}
 
 	/* Ok, everything is just fine now */
 	if (mddev->to_remove == &raid5_attrs_group)
@@ -5102,6 +5092,21 @@ static int run(mddev_t *mddev)
 		printk(KERN_WARNING
 		       "raid5: failed to create sysfs attributes for %s\n",
 		       mdname(mddev));
+	md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
+
+	if (mddev->queue) {
+		/* read-ahead size must cover two whole stripes, which
+		 * is 2 * (datadisks) * chunksize where 'n' is the
+		 * number of raid devices
+		 */
+		int data_disks = conf->previous_raid_disks - conf->max_degraded;
+		int stripe = data_disks *
+			((mddev->chunk_sectors << 9) / PAGE_SIZE);
+		if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
+			mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
+
+		blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
+	}
 
 	mddev->queue->queue_lock = &conf->device_lock;
 
@@ -5109,9 +5114,6 @@ static int run(mddev_t *mddev)
 	mddev->queue->backing_dev_info.congested_data = mddev;
 	mddev->queue->backing_dev_info.congested_fn = raid5_congested;
 
-	md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
-
-	blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
 	chunk_size = mddev->chunk_sectors << 9;
 	blk_queue_io_min(mddev->queue, chunk_size);
 	blk_queue_io_opt(mddev->queue, chunk_size *
@@ -5534,7 +5536,7 @@ static void end_reshape(raid5_conf_t *conf)
 		/* read-ahead size must cover two whole stripes, which is
 		 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
 		 */
-		{
+		if (conf->mddev->queue) {
 			int data_disks = conf->raid_disks - conf->max_degraded;
 			int stripe = data_disks * ((conf->chunk_sectors << 9)
 						   / PAGE_SIZE);



^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 06/12] dm-raid456: add congestion checking.
  2010-04-15  6:43 [PATCH 00/12] A dm-raid45 target implemented using md raid5 NeilBrown
@ 2010-04-15  6:43 ` NeilBrown
  2010-04-15  6:43 ` [PATCH 12/12] dm-raid456: add message handler NeilBrown
                   ` (12 subsequent siblings)
  13 siblings, 0 replies; 17+ messages in thread
From: NeilBrown @ 2010-04-15  6:43 UTC (permalink / raw)
  To: dm-devel; +Cc: linux-raid

dm currently implements congestion checking by checking on congestion
in each component device.

For raid456 we need to also check if the stripe cache is congested.
So add support to dm for a target to register a congestion checker,
then registered such a checker for dm-raid456.

We add support for multiple callbacks as we will need one for unplug
too.

Finally, we move the setting of congested_fn for mddev->queue
into the "if (mddev->queue)" protected branch, as it is not needed
for dm-raid456 now.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/dm-raid456.c       |   13 +++++++++++++
 drivers/md/dm-table.c         |   15 +++++++++++++++
 drivers/md/raid5.c            |   22 +++++++++++++++-------
 drivers/md/raid5.h            |    1 +
 include/linux/device-mapper.h |   12 ++++++++++++
 5 files changed, 56 insertions(+), 7 deletions(-)

diff --git a/drivers/md/dm-raid456.c b/drivers/md/dm-raid456.c
index 373784d..5632999 100644
--- a/drivers/md/dm-raid456.c
+++ b/drivers/md/dm-raid456.c
@@ -16,6 +16,7 @@ struct raid_set {
 	struct dm_target *ti;
 	struct mddev_s md;
 	struct raid_type *raid_type;
+	struct target_callbacks callbacks;
 	struct raid_dev dev[0];
 };
 
@@ -145,6 +146,13 @@ static void do_table_event(struct work_struct *ws)
 	dm_table_event(rs->ti->table);
 }
 
+static int raid_is_congested(void *v, int bits)
+{
+	struct target_callbacks *cb = v;
+	struct raid_set *rs = container_of(cb, struct raid_set,
+					   callbacks);
+	return md_raid5_congested(&rs->md, bits);
+}
 /*
  * Construct a RAID4/5/6 mapping:
  * Args:
@@ -308,6 +316,10 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	
 	if (errnum)
 		goto err;
+
+	rs->callbacks.congested_fn = raid_is_congested;
+	dm_table_add_callbacks(ti->table, &rs->callbacks);
+
 	return 0;
 err:
 	if (rs)
@@ -320,6 +332,7 @@ static void raid_dtr(struct dm_target *ti)
 {
 	struct raid_set *rs = ti->private;
 
+	list_del_init(&rs->callbacks.list);
 	md_stop(&rs->md);
 	context_free(rs);
 }
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 9924ea2..b856340 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -68,6 +68,8 @@ struct dm_table {
 	void (*event_fn)(void *);
 	void *event_context;
 
+	struct list_head target_callbacks;
+
 	struct dm_md_mempools *mempools;
 };
 
@@ -202,6 +204,7 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
 		return -ENOMEM;
 
 	INIT_LIST_HEAD(&t->devices);
+	INIT_LIST_HEAD(&t->target_callbacks);
 	atomic_set(&t->holders, 0);
 
 	if (!num_targets)
@@ -1174,10 +1177,18 @@ int dm_table_resume_targets(struct dm_table *t)
 	return 0;
 }
 
+void dm_table_add_callbacks(struct dm_table *t,
+			    struct target_callbacks *cb)
+{
+	list_add(&cb->list, &t->target_callbacks);
+}
+EXPORT_SYMBOL_GPL(dm_table_add_callbacks);
+
 int dm_table_any_congested(struct dm_table *t, int bdi_bits)
 {
 	struct dm_dev_internal *dd;
 	struct list_head *devices = dm_table_get_devices(t);
+	struct target_callbacks *cb;
 	int r = 0;
 
 	list_for_each_entry(dd, devices, list) {
@@ -1192,6 +1203,10 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits)
 				     bdevname(dd->dm_dev.bdev, b));
 	}
 
+	list_for_each_entry(cb, &t->target_callbacks, list)
+		if (cb->congested_fn)
+			r |= cb->congested_fn(cb, bdi_bits);
+
 	return r;
 }
 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 3e97950..aaaed29 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3595,17 +3595,14 @@ static void raid5_unplug_device(struct request_queue *q)
 	unplug_slaves(mddev);
 }
 
-static int raid5_congested(void *data, int bits)
+int md_raid5_congested(mddev_t *mddev, int bits)
 {
-	mddev_t *mddev = data;
 	raid5_conf_t *conf = mddev->private;
 
 	/* No difference between reads and writes.  Just check
 	 * how busy the stripe_cache is
 	 */
 
-	if (mddev_congested(mddev, bits))
-		return 1;
 	if (conf->inactive_blocked)
 		return 1;
 	if (conf->quiesce)
@@ -3615,6 +3612,15 @@ static int raid5_congested(void *data, int bits)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(md_raid5_congested);
+
+static int raid5_congested(void *data, int bits)
+{
+	mddev_t *mddev = data;
+
+	return mddev_congested(mddev, bits) ||
+		md_raid5_congested(mddev, bits);
+}
 
 /* We want read requests to align with chunks where possible,
  * but write requests don't need to.
@@ -5106,13 +5112,14 @@ static int run(mddev_t *mddev)
 			mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
 
 		blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
+
+		mddev->queue->backing_dev_info.congested_data = mddev;
+		mddev->queue->backing_dev_info.congested_fn = raid5_congested;
 	}
 
 	mddev->queue->queue_lock = &conf->device_lock;
 
 	mddev->queue->unplug_fn = raid5_unplug_device;
-	mddev->queue->backing_dev_info.congested_data = mddev;
-	mddev->queue->backing_dev_info.congested_fn = raid5_congested;
 
 	chunk_size = mddev->chunk_sectors << 9;
 	blk_queue_io_min(mddev->queue, chunk_size);
@@ -5144,7 +5151,8 @@ static int stop(mddev_t *mddev)
 
 	md_unregister_thread(mddev->thread);
 	mddev->thread = NULL;
-	mddev->queue->backing_dev_info.congested_fn = NULL;
+	if (mddev->queue)
+		mddev->queue->backing_dev_info.congested_fn = NULL;
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
 	free_conf(conf);
 	mddev->private = NULL;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 0f86f5e..6641789 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -497,4 +497,5 @@ static inline int algorithm_is_DDF(int layout)
 {
 	return layout >= 8 && layout <= 10;
 }
+extern int md_raid5_congested(mddev_t *mddev, int bits);
 #endif
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 1381cd9..2b0f538 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -187,6 +187,12 @@ struct dm_target {
 	char *error;
 };
 
+/* Each target can link one of these into the table */
+struct target_callbacks {
+	struct list_head list;
+	congested_fn *congested_fn;
+};
+
 int dm_register_target(struct target_type *t);
 void dm_unregister_target(struct target_type *t);
 
@@ -263,6 +269,12 @@ int dm_table_add_target(struct dm_table *t, const char *type,
 			sector_t start, sector_t len, char *params);
 
 /*
+ * Target_ctr should call this if they need to add any
+ * callback
+ */
+void dm_table_add_callbacks(struct dm_table *t,
+			    struct target_callbacks *cb);
+/*
  * Finally call this to make the table ready for use.
  */
 int dm_table_complete(struct dm_table *t);



^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 07/12] md/raid5: add simple plugging infrastructure.
  2010-04-15  6:43 [PATCH 00/12] A dm-raid45 target implemented using md raid5 NeilBrown
                   ` (2 preceding siblings ...)
  2010-04-15  6:43 ` [PATCH 08/12] md/plug: optionally use plugger to unplug an array during resync/recovery NeilBrown
@ 2010-04-15  6:43 ` NeilBrown
  2010-04-15  6:43 ` [PATCH 09/12] dm-raid456: support unplug NeilBrown
                   ` (9 subsequent siblings)
  13 siblings, 0 replies; 17+ messages in thread
From: NeilBrown @ 2010-04-15  6:43 UTC (permalink / raw)
  To: dm-devel; +Cc: linux-raid

md/raid5 uses the plugging infrastructure provided by the block layer
and 'struct request_queue'.  However when we plug raid5 under dm there
is no request queue so we cannot use that.

So create a similar infrastructure that is much lighter weight and use
it for raid5.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c    |   45 +++++++++++++++++++++++++++++++++++++++++++++
 drivers/md/md.h    |   21 +++++++++++++++++++++
 drivers/md/raid5.c |   39 +++++++++++++++++++++++++--------------
 drivers/md/raid5.h |    3 +++
 4 files changed, 94 insertions(+), 14 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 2042b1c..49336e7 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -383,6 +383,51 @@ void md_barrier_request(mddev_t *mddev, struct bio *bio)
 }
 EXPORT_SYMBOL(md_barrier_request);
 
+/* Support for plugging.
+ * This mirrors the plugging support in request_queue, but does not
+ * require having a whole queue
+ */
+static void plugger_work(struct work_struct *work)
+{
+	struct plug_handle *plug =
+		container_of(work, struct plug_handle, unplug_work);
+	plug->unplug_fn(plug);
+}
+static void plugger_timeout(unsigned long data)
+{
+	struct plug_handle *plug = (void*)data;
+	kblockd_schedule_work(NULL, &plug->unplug_work);
+}
+void plugger_init(struct plug_handle *plug,
+		  void (*unplug_fn)(struct plug_handle *))
+{
+	plug->unplug_flag = 0;
+	plug->unplug_fn = unplug_fn;
+	init_timer(&plug->unplug_timer);
+	plug->unplug_timer.function = plugger_timeout;
+	plug->unplug_timer.data = (unsigned long)plug;
+	INIT_WORK(&plug->unplug_work, plugger_work);
+}
+EXPORT_SYMBOL_GPL(plugger_init);
+
+void plugger_set_plug(struct plug_handle *plug)
+{
+	if (!test_and_set_bit(PLUGGED_FLAG, &plug->unplug_flag))
+		mod_timer(&plug->unplug_timer, jiffies + msecs_to_jiffies(3)+1);
+}
+EXPORT_SYMBOL_GPL(plugger_set_plug);
+
+int plugger_remove_plug(struct plug_handle *plug)
+{
+	if (test_and_clear_bit(PLUGGED_FLAG, &plug->unplug_flag)) {
+		del_timer(&plug->unplug_timer);
+		return 1;
+	} else
+		return 0;
+}
+EXPORT_SYMBOL_GPL(plugger_remove_plug);
+
+
 static inline mddev_t *mddev_get(mddev_t *mddev)
 {
 	atomic_inc(&mddev->active);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 09a2881..1c14c33 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -29,6 +29,26 @@
 typedef struct mddev_s mddev_t;
 typedef struct mdk_rdev_s mdk_rdev_t;
 
+/* generic plugging support - like that provided with request_queue,
+ * but does not require a request_queue
+ */
+struct plug_handle {
+	void 	(*unplug_fn)(struct plug_handle *);
+	struct timer_list	unplug_timer;
+	struct work_struct	unplug_work;
+	unsigned long		unplug_flag;
+};
+#define	PLUGGED_FLAG 1
+void plugger_init(struct plug_handle *plug,
+		  void (*unplug_fn)(struct plug_handle *));
+void plugger_set_plug(struct plug_handle *plug);
+int plugger_remove_plug(struct plug_handle *plug);
+static inline void plugger_flush(struct plug_handle *plug)
+{
+	del_timer_sync(&plug->unplug_timer);
+	cancel_work_sync(&plug->unplug_work);
+}
+
 /*
  * MD's 'extended' device
  */
@@ -449,6 +469,7 @@ static inline void safe_put_page(struct page *p)
 	if (p) put_page(p);
 }
 
+
 extern int register_md_personality(struct mdk_personality *p);
 extern int unregister_md_personality(struct mdk_personality *p);
 extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev),
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index aaaed29..ac63012 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -200,11 +200,11 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
 		if (test_bit(STRIPE_HANDLE, &sh->state)) {
 			if (test_bit(STRIPE_DELAYED, &sh->state)) {
 				list_add_tail(&sh->lru, &conf->delayed_list);
-				blk_plug_device(conf->mddev->queue);
+				plugger_set_plug(&conf->plug);
 			} else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
 				   sh->bm_seq - conf->seq_write > 0) {
 				list_add_tail(&sh->lru, &conf->bitmap_list);
-				blk_plug_device(conf->mddev->queue);
+				plugger_set_plug(&conf->plug);
 			} else {
 				clear_bit(STRIPE_BIT_DELAY, &sh->state);
 				list_add_tail(&sh->lru, &conf->handle_list);
@@ -364,7 +364,7 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector,
 }
 
 static void unplug_slaves(mddev_t *mddev);
-static void raid5_unplug_device(struct request_queue *q);
+static void raid5_unplug_device(raid5_conf_t *conf);
 
 static struct stripe_head *
 get_active_stripe(raid5_conf_t *conf, sector_t sector,
@@ -394,7 +394,7 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector,
 						     < (conf->max_nr_stripes *3/4)
 						     || !conf->inactive_blocked),
 						    conf->device_lock,
-						    raid5_unplug_device(conf->mddev->queue)
+						    raid5_unplug_device(conf)
 					);
 				conf->inactive_blocked = 0;
 			} else
@@ -3535,7 +3535,7 @@ static void raid5_activate_delayed(raid5_conf_t *conf)
 			list_add_tail(&sh->lru, &conf->hold_list);
 		}
 	} else
-		blk_plug_device(conf->mddev->queue);
+		plugger_set_plug(&conf->plug);
 }
 
 static void activate_bit_delay(raid5_conf_t *conf)
@@ -3576,23 +3576,33 @@ static void unplug_slaves(mddev_t *mddev)
 	rcu_read_unlock();
 }
 
-static void raid5_unplug_device(struct request_queue *q)
+static void raid5_unplug_device(raid5_conf_t *conf)
 {
-	mddev_t *mddev = q->queuedata;
-	raid5_conf_t *conf = mddev->private;
 	unsigned long flags;
 
 	spin_lock_irqsave(&conf->device_lock, flags);
 
-	if (blk_remove_plug(q)) {
+	if (plugger_remove_plug(&conf->plug)) {
 		conf->seq_flush++;
 		raid5_activate_delayed(conf);
 	}
-	md_wakeup_thread(mddev->thread);
+	md_wakeup_thread(conf->mddev->thread);
 
 	spin_unlock_irqrestore(&conf->device_lock, flags);
 
-	unplug_slaves(mddev);
+	unplug_slaves(conf->mddev);
+}
+
+static void raid5_unplug(struct plug_handle *plug)
+{
+	raid5_conf_t *conf = container_of(plug, raid5_conf_t, plug);
+	raid5_unplug_device(conf);
+}
+	
+static void raid5_unplug_queue(struct request_queue *q)
+{
+	mddev_t *mddev = q->queuedata;
+	raid5_unplug_device(mddev->private);
 }
 
 int md_raid5_congested(mddev_t *mddev, int bits)
@@ -4002,7 +4012,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
 				 * add failed due to overlap.  Flush everything
 				 * and wait a while
 				 */
-				raid5_unplug_device(mddev->queue);
+				raid5_unplug_device(conf);
 				release_stripe(sh);
 				schedule();
 				goto retry;
@@ -5100,6 +5110,7 @@ static int run(mddev_t *mddev)
 		       mdname(mddev));
 	md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
 
+	plugger_init(&conf->plug, raid5_unplug);
 	if (mddev->queue) {
 		/* read-ahead size must cover two whole stripes, which
 		 * is 2 * (datadisks) * chunksize where 'n' is the
@@ -5119,7 +5130,7 @@ static int run(mddev_t *mddev)
 
 	mddev->queue->queue_lock = &conf->device_lock;
 
-	mddev->queue->unplug_fn = raid5_unplug_device;
+	mddev->queue->unplug_fn = raid5_unplug_queue;
 
 	chunk_size = mddev->chunk_sectors << 9;
 	blk_queue_io_min(mddev->queue, chunk_size);
@@ -5153,7 +5164,7 @@ static int stop(mddev_t *mddev)
 	mddev->thread = NULL;
 	if (mddev->queue)
 		mddev->queue->backing_dev_info.congested_fn = NULL;
-	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
+	plugger_flush(&conf->plug); /* the unplug fn references 'conf'*/
 	free_conf(conf);
 	mddev->private = NULL;
 	mddev->to_remove = &raid5_attrs_group;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 6641789..4dc58bf 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -398,6 +398,9 @@ struct raid5_private_data {
 					    * (fresh device added).
 					    * Cleared when a sync completes.
 					    */
+
+	struct plug_handle	plug;
+
 	/* per cpu variables */
 	struct raid5_percpu {
 		struct page	*spare_page; /* Used when checking P/Q in raid6 */



^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 08/12] md/plug: optionally use plugger to unplug an array during resync/recovery.
  2010-04-15  6:43 [PATCH 00/12] A dm-raid45 target implemented using md raid5 NeilBrown
  2010-04-15  6:43 ` [PATCH 06/12] dm-raid456: add congestion checking NeilBrown
  2010-04-15  6:43 ` [PATCH 12/12] dm-raid456: add message handler NeilBrown
@ 2010-04-15  6:43 ` NeilBrown
  2010-04-15  6:43 ` [PATCH 07/12] md/raid5: add simple plugging infrastructure NeilBrown
                   ` (10 subsequent siblings)
  13 siblings, 0 replies; 17+ messages in thread
From: NeilBrown @ 2010-04-15  6:43 UTC (permalink / raw)
  To: dm-devel; +Cc: linux-raid

If an array doesn't have a 'queue' then md_do_sync cannot
unplug it.
In that case it will have a 'plugger', so make that available
to the mddev, and use it to unplug the array if needed.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c    |   15 ++++++++++++---
 drivers/md/md.h    |    2 ++
 drivers/md/raid5.c |    1 +
 3 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 49336e7..b292dc5 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4660,6 +4660,7 @@ static void md_clean(mddev_t *mddev)
 	mddev->bitmap_info.chunksize = 0;
 	mddev->bitmap_info.daemon_sleep = 0;
 	mddev->bitmap_info.max_write_behind = 0;
+	mddev->plug = NULL;
 }
 
 static void md_stop_writes(mddev_t *mddev)
@@ -6563,6 +6564,14 @@ int md_allow_write(mddev_t *mddev)
 }
 EXPORT_SYMBOL_GPL(md_allow_write);
 
+static void md_unplug(mddev_t *mddev)
+{
+	if (mddev->queue)
+		blk_unplug(mddev->queue);
+	if (mddev->plug)
+		mddev->plug->unplug_fn(mddev->plug);
+}
+
 #define SYNC_MARKS	10
 #define	SYNC_MARK_STEP	(3*HZ)
 void md_do_sync(mddev_t *mddev)
@@ -6741,7 +6750,7 @@ void md_do_sync(mddev_t *mddev)
 		     >= mddev->resync_max - mddev->curr_resync_completed
 			    )) {
 			/* time to update curr_resync_completed */
-			blk_unplug(mddev->queue);
+			md_unplug(mddev);
 			wait_event(mddev->recovery_wait,
 				   atomic_read(&mddev->recovery_active) == 0);
 			mddev->curr_resync_completed =
@@ -6818,7 +6827,7 @@ void md_do_sync(mddev_t *mddev)
 		 * about not overloading the IO subsystem. (things like an
 		 * e2fsck being done on the RAID array should execute fast)
 		 */
-		blk_unplug(mddev->queue);
+		md_unplug(mddev);
 		cond_resched();
 
 		currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
@@ -6837,7 +6846,7 @@ void md_do_sync(mddev_t *mddev)
 	 * this also signals 'finished resyncing' to md_stop
 	 */
  out:
-	blk_unplug(mddev->queue);
+	md_unplug(mddev);
 
 	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
 
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 1c14c33..e4f60ae 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -319,6 +319,8 @@ struct mddev_s
 	struct list_head		all_mddevs;
 
 	struct attribute_group		*to_remove;
+	struct plug_handle		*plug; /* if used by personality */
+
 	/* Generic barrier handling.
 	 * If there is a pending barrier request, all other
 	 * writes are blocked while the devices are flushed.
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index ac63012..fae805e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5111,6 +5111,7 @@ static int run(mddev_t *mddev)
 	md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
 
 	plugger_init(&conf->plug, raid5_unplug);
+	mddev->plug = &conf->plug;
 	if (mddev->queue) {
 		/* read-ahead size must cover two whole stripes, which
 		 * is 2 * (datadisks) * chunksize where 'n' is the



^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 11/12] dm-raid456: add suspend/resume method
  2010-04-15  6:43 [PATCH 00/12] A dm-raid45 target implemented using md raid5 NeilBrown
                   ` (7 preceding siblings ...)
  2010-04-15  6:43 ` [PATCH 03/12] md/dm: create dm-raid456 module using md/raid5 NeilBrown
@ 2010-04-15  6:43 ` NeilBrown
  2010-04-15  6:43 ` [PATCH 10/12] dm-raid456: add support for setting IO hints NeilBrown
                   ` (4 subsequent siblings)
  13 siblings, 0 replies; 17+ messages in thread
From: NeilBrown @ 2010-04-15  6:43 UTC (permalink / raw)
  To: dm-devel; +Cc: linux-raid

These just call in to the md methods.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/dm-raid456.c |   15 +++++++++++++++
 drivers/md/md.c         |    6 ++++--
 drivers/md/md.h         |    3 +++
 3 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/drivers/md/dm-raid456.c b/drivers/md/dm-raid456.c
index 071a8ae..126042d 100644
--- a/drivers/md/dm-raid456.c
+++ b/drivers/md/dm-raid456.c
@@ -465,6 +465,19 @@ static void raid_io_hints(struct dm_target *ti,
 			  (conf->raid_disks - conf->max_degraded));
 }			  
 
+static void raid_presuspend(struct dm_target *ti)
+{
+	struct raid_set *rs = ti->private;
+	mddev_suspend(&rs->md);
+}
+
+static void raid_resume(struct dm_target *ti)
+{
+	struct raid_set *rs = ti->private;
+
+	mddev_resume(&rs->md);
+}
+
 static struct target_type raid_target = {
 	.name = "raid45",
 	.version = {1, 0, 0},
@@ -475,6 +488,8 @@ static struct target_type raid_target = {
 	.status = raid_status,
 	.iterate_devices = raid_iterate_devices,
 	.io_hints = raid_io_hints,
+	.presuspend = raid_presuspend,
+	.resume = raid_resume,
 };
 
 static int __init dm_raid_init(void)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index b292dc5..098f72c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -260,7 +260,7 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
  * Once ->stop is called and completes, the module will be completely
  * unused.
  */
-static void mddev_suspend(mddev_t *mddev)
+void mddev_suspend(mddev_t *mddev)
 {
 	BUG_ON(mddev->suspended);
 	mddev->suspended = 1;
@@ -268,13 +268,15 @@ static void mddev_suspend(mddev_t *mddev)
 	wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
 	mddev->pers->quiesce(mddev, 1);
 }
+EXPORT_SYMBOL_GPL(mddev_suspend);
 
-static void mddev_resume(mddev_t *mddev)
+void mddev_resume(mddev_t *mddev)
 {
 	mddev->suspended = 0;
 	wake_up(&mddev->sb_wait);
 	mddev->pers->quiesce(mddev, 0);
 }
+EXPORT_SYMBOL_GPL(mddev_resume);
 
 int mddev_congested(mddev_t *mddev, int bits)
 {
diff --git a/drivers/md/md.h b/drivers/md/md.h
index e4f60ae..fac75bd 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -506,4 +506,7 @@ extern void mddev_init(mddev_t *mddev);
 extern int md_run(mddev_t *mddev);
 extern void md_stop(mddev_t *mddev);
 extern void md_rdev_init(mdk_rdev_t *rdev);
+
+extern void mddev_suspend(mddev_t *mddev);
+extern void mddev_resume(mddev_t *mddev);
 #endif /* _MD_MD_H */



^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 10/12] dm-raid456: add support for setting IO hints.
  2010-04-15  6:43 [PATCH 00/12] A dm-raid45 target implemented using md raid5 NeilBrown
                   ` (8 preceding siblings ...)
  2010-04-15  6:43 ` [PATCH 11/12] dm-raid456: add suspend/resume method NeilBrown
@ 2010-04-15  6:43 ` NeilBrown
  2010-04-15  6:43 ` [PATCH 04/12] dm-raid456: add support for raising events to userspace NeilBrown
                   ` (3 subsequent siblings)
  13 siblings, 0 replies; 17+ messages in thread
From: NeilBrown @ 2010-04-15  6:43 UTC (permalink / raw)
  To: dm-devel; +Cc: linux-raid

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/dm-raid456.c |   33 +++++++++++++++++++++++++++++++++
 drivers/md/raid5.c      |   19 ++++++++++---------
 2 files changed, 43 insertions(+), 9 deletions(-)

diff --git a/drivers/md/dm-raid456.c b/drivers/md/dm-raid456.c
index d0e44f2..071a8ae 100644
--- a/drivers/md/dm-raid456.c
+++ b/drivers/md/dm-raid456.c
@@ -434,6 +434,37 @@ static int raid_status(struct dm_target *ti, status_type_t type,
 	return 0;
 }
 
+static int raid_iterate_devices(struct dm_target *ti,
+				iterate_devices_callout_fn fn,
+				void *data)
+{
+	struct raid_set *rs = ti->private;
+	int ret = 0;
+	unsigned i = 0;
+
+	for (i=0; !ret && i < rs->md.raid_disks; i++)
+		if (rs->dev[i].dev)
+			ret = fn(ti,
+				 rs->dev[i].dev,
+				 rs->dev[i].rdev.data_offset,
+				 rs->md.dev_sectors,
+				 data);
+
+	return ret;
+}
+
+static void raid_io_hints(struct dm_target *ti,
+			  struct queue_limits *limits)
+{
+	struct raid_set *rs = ti->private;
+	unsigned chunk_size = rs->md.chunk_sectors << 9;
+	raid5_conf_t *conf = rs->md.private;
+
+	blk_limits_io_min(limits, chunk_size);
+	blk_limits_io_opt(limits, chunk_size *
+			  (conf->raid_disks - conf->max_degraded));
+}			  
+
 static struct target_type raid_target = {
 	.name = "raid45",
 	.version = {1, 0, 0},
@@ -442,6 +473,8 @@ static struct target_type raid_target = {
 	.dtr = raid_dtr,
 	.map = raid_map,
 	.status = raid_status,
+	.iterate_devices = raid_iterate_devices,
+	.io_hints = raid_io_hints,
 };
 
 static int __init dm_raid_init(void)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 41a905b..138f65f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4906,7 +4906,7 @@ static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded
 static int run(mddev_t *mddev)
 {
 	raid5_conf_t *conf;
-	int working_disks = 0, chunk_size;
+	int working_disks = 0;
 	int dirty_parity_disks = 0;
 	mdk_rdev_t *rdev;
 	sector_t reshape_offset = 0;
@@ -5113,6 +5113,7 @@ static int run(mddev_t *mddev)
 	plugger_init(&conf->plug, raid5_unplug);
 	mddev->plug = &conf->plug;
 	if (mddev->queue) {
+		int chunk_size;
 		/* read-ahead size must cover two whole stripes, which
 		 * is 2 * (datadisks) * chunksize where 'n' is the
 		 * number of raid devices
@@ -5130,16 +5131,16 @@ static int run(mddev_t *mddev)
 
 		mddev->queue->queue_lock = &conf->device_lock;
 		mddev->queue->unplug_fn = raid5_unplug_queue;
-	}
 
-	chunk_size = mddev->chunk_sectors << 9;
-	blk_queue_io_min(mddev->queue, chunk_size);
-	blk_queue_io_opt(mddev->queue, chunk_size *
-			 (conf->raid_disks - conf->max_degraded));
+		chunk_size = mddev->chunk_sectors << 9;
+		blk_queue_io_min(mddev->queue, chunk_size);
+		blk_queue_io_opt(mddev->queue, chunk_size *
+				 (conf->raid_disks - conf->max_degraded));
 
-	list_for_each_entry(rdev, &mddev->disks, same_set)
-		disk_stack_limits(mddev->gendisk, rdev->bdev,
-				  rdev->data_offset << 9);
+		list_for_each_entry(rdev, &mddev->disks, same_set)
+			disk_stack_limits(mddev->gendisk, rdev->bdev,
+					  rdev->data_offset << 9);
+	}
 
 	return 0;
 abort:



^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 09/12] dm-raid456: support unplug
  2010-04-15  6:43 [PATCH 00/12] A dm-raid45 target implemented using md raid5 NeilBrown
                   ` (3 preceding siblings ...)
  2010-04-15  6:43 ` [PATCH 07/12] md/raid5: add simple plugging infrastructure NeilBrown
@ 2010-04-15  6:43 ` NeilBrown
  2010-04-15  6:43 ` [PATCH 01/12] md: reduce dependence on sysfs NeilBrown
                   ` (8 subsequent siblings)
  13 siblings, 0 replies; 17+ messages in thread
From: NeilBrown @ 2010-04-15  6:43 UTC (permalink / raw)
  To: dm-devel; +Cc: linux-raid

In a similar manner to congestion checking, per-target
unplug support for raid456 under dm.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/dm-raid456.c       |    9 +++++++++
 drivers/md/dm-table.c         |    4 ++++
 drivers/md/raid5.c            |   11 +++++------
 drivers/md/raid5.h            |    1 +
 include/linux/device-mapper.h |    1 +
 5 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/drivers/md/dm-raid456.c b/drivers/md/dm-raid456.c
index 5632999..d0e44f2 100644
--- a/drivers/md/dm-raid456.c
+++ b/drivers/md/dm-raid456.c
@@ -153,6 +153,14 @@ static int raid_is_congested(void *v, int bits)
 					   callbacks);
 	return md_raid5_congested(&rs->md, bits);
 }
+static void raid_unplug(void *v)
+{
+	struct target_callbacks *cb = v;
+	struct raid_set *rs = container_of(cb, struct raid_set,
+					   callbacks);
+	raid5_unplug_device(rs->md.private);
+}
+
 /*
  * Construct a RAID4/5/6 mapping:
  * Args:
@@ -288,6 +296,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 			goto err;
 		clear_bit(In_sync, &rs->dev[rebuildA].rdev.flags);
 		rs->dev[rebuildA].rdev.recovery_offset = 0;
+	rs->callbacks.unplug_fn = raid_unplug;
 	}
 	if (rebuildB >= 0) {
 		if (rs->dev[rebuildB].dev == NULL)
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index b856340..cad4992 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1228,6 +1228,7 @@ void dm_table_unplug_all(struct dm_table *t)
 {
 	struct dm_dev_internal *dd;
 	struct list_head *devices = dm_table_get_devices(t);
+	struct target_callbacks *cb;
 
 	list_for_each_entry(dd, devices, list) {
 		struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev);
@@ -1240,6 +1241,9 @@ void dm_table_unplug_all(struct dm_table *t)
 				     dm_device_name(t->md),
 				     bdevname(dd->dm_dev.bdev, b));
 	}
+	list_for_each_entry(cb, &t->target_callbacks, list)
+		if (cb->unplug_fn)
+			cb->unplug_fn(cb);
 }
 
 struct mapped_device *dm_table_get_md(struct dm_table *t)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index fae805e..41a905b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -364,7 +364,6 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector,
 }
 
 static void unplug_slaves(mddev_t *mddev);
-static void raid5_unplug_device(raid5_conf_t *conf);
 
 static struct stripe_head *
 get_active_stripe(raid5_conf_t *conf, sector_t sector,
@@ -3576,7 +3575,7 @@ static void unplug_slaves(mddev_t *mddev)
 	rcu_read_unlock();
 }
 
-static void raid5_unplug_device(raid5_conf_t *conf)
+void raid5_unplug_device(raid5_conf_t *conf)
 {
 	unsigned long flags;
 
@@ -3592,6 +3591,7 @@ static void raid5_unplug_device(raid5_conf_t *conf)
 
 	unplug_slaves(conf->mddev);
 }
+EXPORT_SYMBOL_GPL(raid5_unplug_device);
 
 static void raid5_unplug(struct plug_handle *plug)
 {
@@ -5127,11 +5127,10 @@ static int run(mddev_t *mddev)
 
 		mddev->queue->backing_dev_info.congested_data = mddev;
 		mddev->queue->backing_dev_info.congested_fn = raid5_congested;
-	}
-
-	mddev->queue->queue_lock = &conf->device_lock;
 
-	mddev->queue->unplug_fn = raid5_unplug_queue;
+		mddev->queue->queue_lock = &conf->device_lock;
+		mddev->queue->unplug_fn = raid5_unplug_queue;
+	}
 
 	chunk_size = mddev->chunk_sectors << 9;
 	blk_queue_io_min(mddev->queue, chunk_size);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 4dc58bf..69dfe39 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -501,4 +501,5 @@ static inline int algorithm_is_DDF(int layout)
 	return layout >= 8 && layout <= 10;
 }
 extern int md_raid5_congested(mddev_t *mddev, int bits);
+extern void raid5_unplug_device(raid5_conf_t *conf);
 #endif
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 2b0f538..c6de593 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -191,6 +191,7 @@ struct dm_target {
 struct target_callbacks {
 	struct list_head list;
 	congested_fn *congested_fn;
+	void (*unplug_fn)(void*);
 };
 
 int dm_register_target(struct target_type *t);



^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 12/12] dm-raid456: add message handler.
  2010-04-15  6:43 [PATCH 00/12] A dm-raid45 target implemented using md raid5 NeilBrown
  2010-04-15  6:43 ` [PATCH 06/12] dm-raid456: add congestion checking NeilBrown
@ 2010-04-15  6:43 ` NeilBrown
  2010-04-15  6:43 ` [PATCH 08/12] md/plug: optionally use plugger to unplug an array during resync/recovery NeilBrown
                   ` (11 subsequent siblings)
  13 siblings, 0 replies; 17+ messages in thread
From: NeilBrown @ 2010-04-15  6:43 UTC (permalink / raw)
  To: dm-devel; +Cc: linux-raid

Support messages to:
 - change the size of the stripe cache
 - change the speed limiter on resync.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/dm-raid456.c |   26 ++++++++++++++++++++++++++
 drivers/md/raid5.c      |    3 ++-
 drivers/md/raid5.h      |    1 +
 3 files changed, 29 insertions(+), 1 deletions(-)

diff --git a/drivers/md/dm-raid456.c b/drivers/md/dm-raid456.c
index 126042d..62c2807 100644
--- a/drivers/md/dm-raid456.c
+++ b/drivers/md/dm-raid456.c
@@ -478,6 +478,31 @@ static void raid_resume(struct dm_target *ti)
 	mddev_resume(&rs->md);
 }
 
+/* Parse and handle a message from userspace
+ * Messages are:
+ *    stripecache  N  (pages per devices)
+ *    minspeed  N          (kibibytes per seconds)
+ */
+static int raid_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+	struct raid_set *rs = ti->private;
+
+	if (argc == 2 && strcmp(argv[0], "stripecache") == 0) {
+		unsigned long size;
+		if (strict_strtoul(argv[1], 10, &size))
+			return -EINVAL;
+		return raid5_set_cache_size(&rs->md, size);
+	}
+	if (argc == 2 && strcmp(argv[0], "minspeed") == 0) {
+		unsigned long speed;
+		if (strict_strtoul(argv[1], 10, &speed))
+			return -EINVAL;
+		rs->md.sync_speed_min = speed;
+		return 0;
+	}
+	return -EINVAL;
+}
+
 static struct target_type raid_target = {
 	.name = "raid45",
 	.version = {1, 0, 0},
@@ -490,6 +515,7 @@ static struct target_type raid_target = {
 	.io_hints = raid_io_hints,
 	.presuspend = raid_presuspend,
 	.resume = raid_resume,
+	.message = raid_message,
 };
 
 static int __init dm_raid_init(void)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 138f65f..805f229 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4503,7 +4503,7 @@ raid5_show_stripe_cache_size(mddev_t *mddev, char *page)
 		return 0;
 }
 
-static int
+int
 raid5_set_cache_size(mddev_t *mddev, int size)
 {
 	raid5_conf_t *conf = mddev->private;
@@ -4527,6 +4527,7 @@ raid5_set_cache_size(mddev_t *mddev, int size)
 	}
 	return 0;
 }
+EXPORT_SYMBOL(raid5_set_cache_size);
 
 static ssize_t
 raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 69dfe39..a08c71b 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -502,4 +502,5 @@ static inline int algorithm_is_DDF(int layout)
 }
 extern int md_raid5_congested(mddev_t *mddev, int bits);
 extern void raid5_unplug_device(raid5_conf_t *conf);
+extern int raid5_set_cache_size(mddev_t *mddev, int size);
 #endif



^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 00/12] A dm-raid45 target implemented using md raid5.
@ 2010-04-15  6:43 NeilBrown
  2010-04-15  6:43 ` [PATCH 06/12] dm-raid456: add congestion checking NeilBrown
                   ` (13 more replies)
  0 siblings, 14 replies; 17+ messages in thread
From: NeilBrown @ 2010-04-15  6:43 UTC (permalink / raw)
  To: dm-devel; +Cc: linux-raid, HeinzMauelshagen, Alasdair G Kergon

Greetings Heinz, Alasdair, and all,
(Alasdair and Heinz cc:ed on this intro, but the patches are
 only going to the lists).

 Some months ago I posted a proof-of-concept patch which attempted to
 provide RAID4/5/6 functionality to 'dm' using md/raid5.c.
 While it did at least partly work, it contained lots of hacks and was
 very ugly.

 I finally made time to do the job "properly".

 The following series, when applied on top of a bunch of patches I
 just submitted for linux-next, provides a 'dm-raid45' target which is
 largely compatible with the one that Heinz has written (and several
 distros are shipping), but which uses md/raid5.c for the core IO
 processing.

 I have tried to split the patch up into easy-to-handle pieces.  You
 will note that some changes to core-dm are required, in particular to
 pass back 'congestion' information and to handle plugging (which
 raid5 uses to improve throughput).  I hope the approach I have taken
 is suitable, but it can obviously be changed if necessary.

 The create/status/message interface differs from the one in Heinz's
 patch, but should be close enough to work with current 'dmraid'.

 If you want to try the patches (rather than just read them) you
 should probably "git pull" (please don't clone) from
      git://neil.brown.name/md md-dm-raid45

 so as to get all the prior refactoring patches in md.

 Some advantages of this over Heinz's patch (at least as it was
 when I last looked at it) are:
  - raid6 support
  - support for XOR-offload hardware where present
  - less code duplication
  - a single dm device can include multiple dm-raid45 targets.
    (Heinz' code accesses dm_disk(md)->queue directly which
     is a layering violation and assumes that there is no
     other target in the mapped_device).

 There is a lot more that could be done to this such as getting to
 work with a disk based dirty-log and making the reshape options
 available.  But this patch set should provide all basic RAID5
 functionality.

 Would the dm community be interested in including this work upstream
 (after suitable review and testing)?

Thanks,
NeilBrown


---

NeilBrown (12):
      md: reduce dependence on sysfs.
      md/raid5: factor out code for changing size of stripe cache.
      md/dm: create dm-raid456 module using md/raid5
      dm-raid456: add support for raising events to userspace.
      raid5: Don't set read-ahead when there is no queue
      dm-raid456: add congestion checking.
      md/raid5: add simple plugging infrastructure.
      md/plug: optionally use plugger to unplug an array during resync/recovery.
      dm-raid456: support unplug
      dm-raid456: add support for setting IO hints.
      dm-raid456: add suspend/resume method
      dm-raid456: add message handler.


 drivers/md/Kconfig            |    8 +
 drivers/md/Makefile           |    1 
 drivers/md/dm-raid456.c       |  540 +++++++++++++++++++++++++++++++++++++++++
 drivers/md/dm-table.c         |   19 +
 drivers/md/md.c               |  211 ++++++++++------
 drivers/md/md.h               |   43 +++
 drivers/md/raid5.c            |  155 +++++++-----
 drivers/md/raid5.h            |    6 
 include/linux/device-mapper.h |   13 +
 9 files changed, 859 insertions(+), 137 deletions(-)
 create mode 100644 drivers/md/dm-raid456.c

-- 

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 00/12] A dm-raid45 target implemented using md raid5.
  2010-04-15  6:43 [PATCH 00/12] A dm-raid45 target implemented using md raid5 NeilBrown
                   ` (11 preceding siblings ...)
  2010-04-15  6:43 ` [PATCH 02/12] md/raid5: factor out code for changing size of stripe cache NeilBrown
@ 2010-04-15  8:52 ` Jeff Garzik
  2010-04-15 17:27 ` [dm-devel] " Heinz Mauelshagen
  13 siblings, 0 replies; 17+ messages in thread
From: Jeff Garzik @ 2010-04-15  8:52 UTC (permalink / raw)
  To: NeilBrown; +Cc: dm-devel, Alasdair G Kergon, Heinz Mauelshagen, linux-raid

On 04/15/2010 02:43 AM, NeilBrown wrote:
>   Would the dm community be interested in including this work upstream
>   (after suitable review and testing)?


I certainly hope so.  I was able to get this to the "it works" stage, 
and the overall effort is something upstream has needed for a long while.

	Jeff




^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [dm-devel] [PATCH 00/12] A dm-raid45 target implemented using md raid5.
  2010-04-15  6:43 [PATCH 00/12] A dm-raid45 target implemented using md raid5 NeilBrown
                   ` (12 preceding siblings ...)
  2010-04-15  8:52 ` [PATCH 00/12] A dm-raid45 target implemented using md raid5 Jeff Garzik
@ 2010-04-15 17:27 ` Heinz Mauelshagen
  2010-04-15 22:14   ` Neil Brown
  13 siblings, 1 reply; 17+ messages in thread
From: Heinz Mauelshagen @ 2010-04-15 17:27 UTC (permalink / raw)
  To: device-mapper development, NeilBrown; +Cc: linux-raid, Alasdair G Kergon


Hi Neil,

had a first go reading through your patch series w/o finding any major
issues. The only important feature for an initial release which needs
adding (as you mentioned) is (persistent) dirty log support.

Because you're using a persistent bitmap in the MD RAID personalities,
this looks like a bit more surgery to factor it out to potentially
enhance dm-log.c. For an initial solution we can as well just go with
MDs existing bitmap while keeping the dm-raid456 ctr support for
explicit dirty logging in order to avoid compatibility issues (there's
obviously no parameter to support bitmap chunk sizes so far).

Reshaping could be triggered either preferably via the constructor
involving MD metadata reads to be able to recognize the size change
requested or the message interface. Both ctr/message support could be
implemented sharing the same functions. Enhancements in the status
interface and dm_table_event() throwing on error/finish are mandatory if
we support reshaping.

A shortcoming of this MD wrapping solution vs. dm-raid45 is, that there
is no obvious way to leverage it to be a clustered RAID456 mapping
target. dm-raid45 has been designed with that future enhancement
possibility in mind.

Will try testing your code tomorrow.

Regards,
Heinz

On Thu, 2010-04-15 at 16:43 +1000, NeilBrown wrote:
> Greetings Heinz, Alasdair, and all,
> (Alasdair and Heinz cc:ed on this intro, but the patches are
>  only going to the lists).
> 
>  Some months ago I posted a proof-of-concept patch which attempted to
>  provide RAID4/5/6 functionality to 'dm' using md/raid5.c.
>  While it did a least partly work it contained lots of hacks and was
>  very ugly.
> 
>  I finally made time to do the job "properly".
> 
>  The following series, when applied on top of a bunch of patches I
>  just submitted for linux-next, provides a 'dm-raid45' target which is
>  largely compatible with the one that Heinz has written (and several
>  distros are shipping), but which uses md/raid5.c for the core IO
>  processing.
> 
>  I have tried to split the patch up into easy-to-handle pieces.  You
>  will note that some changes to core-dm are required, in particular to
>  pass back 'congestion' information and to handle plugging (which
>  raid5 uses to improve throughput).  I hope the approach I have taken
>  is suitable, but it can obviously be changed if necessary.
> 
>  The create/status/message interface differs from the one in Heinz's
>  patch, but should be close enough to work with current 'dmraid'.
> 
>  If you want to try the patches (rather than just read them) you
>  should probably "git pull" (please don't clone) from
>       git://neil.brown.name/md md-dm-raid45
> 
>  so as to get all the prior refactoring patches in md.
> 
>  Some advantages of this over Heinz's patch (at least as it was
>  when I last looked at it) are:
>   - raid6 support
>   - support for XOR-offload hardware where present
>   - less code duplication
>   - a single dm device can include multiple dm-raid45 targets.
>     (Heinz' code accesses dm_disk(md)->queue directly which
>      is a layering violations and assumes that there is no
>      other target in the mapped_device).
> 
>  There is a lot more that could be done to this such as getting to
>  work with a disk based dirty-log and making the reshape options
>  available.  But this patch set should provide all basic RAID5
>  functionality.
> 
>  Would the dm community be interested in including this work upstream
>  (after suitable review and testing)?
> 
> Thanks,
> NeilBrown
> 
> 
> ---
> 
> NeilBrown (12):
>       md: reduce dependence on sysfs.
>       md/raid5: factor out code for changing size of stripe cache.
>       md/dm: create dm-raid456 module using md/raid5
>       dm-raid456: add support for raising events to userspace.
>       raid5: Don't set read-ahead when there is no queue
>       dm-raid456: add congestion checking.
>       md/raid5: add simple plugging infrastructure.
>       md/plug: optionally use plugger to unplug an array during resync/recovery.
>       dm-raid456: support unplug
>       dm-raid456: add support for setting IO hints.
>       dm-raid456: add suspend/resume method
>       dm-raid456: add message handler.
> 
> 
>  drivers/md/Kconfig            |    8 +
>  drivers/md/Makefile           |    1 
>  drivers/md/dm-raid456.c       |  540 +++++++++++++++++++++++++++++++++++++++++
>  drivers/md/dm-table.c         |   19 +
>  drivers/md/md.c               |  211 ++++++++++------
>  drivers/md/md.h               |   43 +++
>  drivers/md/raid5.c            |  155 +++++++-----
>  drivers/md/raid5.h            |    6 
>  include/linux/device-mapper.h |   13 +
>  9 files changed, 859 insertions(+), 137 deletions(-)
>  create mode 100644 drivers/md/dm-raid456.c
> 



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [dm-devel] [PATCH 00/12] A dm-raid45 target implemented using md raid5.
  2010-04-15 17:27 ` [dm-devel] " Heinz Mauelshagen
@ 2010-04-15 22:14   ` Neil Brown
  2010-04-16  9:27     ` Heinz Mauelshagen
  0 siblings, 1 reply; 17+ messages in thread
From: Neil Brown @ 2010-04-15 22:14 UTC (permalink / raw)
  To: heinzm; +Cc: device-mapper development, linux-raid, Alasdair G Kergon

On Thu, 15 Apr 2010 19:27:15 +0200
Heinz Mauelshagen <heinzm@redhat.com> wrote:

> 
> Hi Neil,
> 
> had a first go reading through your patch series w/o finding any major
> issues. The only important feature for an initial release which needs
> adding (as you mentioned) is (persistent) dirty log support.
> 
> Because you're using a persistent bitmap in the MD RAID personalities,
> this looks like a bit more surgery to factor it out to potentially
> enhance dm-log.c. For an initial solution we can as well just go with
> MDs existing bitmap while keeping the dm-raid456 ctr support for
> explicit dirty logging in order to avoid compatibility issues (there's
> obviously no parameter to support bitmap chunk sizes so far).

I don't think we can use md's existing bitmap support as there is no easy way
to store it on an arbitrary target:  it either lives near the metadata or on
a file (not a device).
There are just a few calls in the interface to md/bitmap.c - it shouldn't be
too hard to make those selectively call into a dm_dirty_log instead.
I want to do something like that anyway as I want to optionally be able to use
a dirty log which is a list of dirty sector addresses rather than a bitmap.
I'll have a look next week.

And the "bitmap chunk size" is exactly the same as the dm "region size".
(which would probably have been a better name to choose for md too).

> 
> Reshaping could be triggered either preferably via the constructor
> involving MD metadata reads to be able to recognize the size change
> requested or the message interface. Both ctr/message support could be
> implemented sharing the same functions. Enhancements in the status
> interface and dm_table_event() throwing on error/finish are mandatory if
> we support reshaping.

I imagine enhancing the constructor to take before/after values for
type, disks, chunksize, and a sector which marks where "after" starts.
You also need to know which direction the reshape is going (low addresses to
high addresses, or the reverse) though that might be implicit in the other
values.

> 
> A shortcoming of this MD wrapping solution vs. dm-raid45 is, that there
> is no obvious way to leverage it to be a clustered RAID456 mapping
> target. dm-raid45 has been designed with that future enhancement
> possibility in mind.
> 

I haven't given cluster locking a lot of thought...
I would probably do the locking on a per-"stripe_head" basis as everything
revolves around that structure.
Get a shared lock when servicing a read (Which would only happen on a
degraded array - normally reads bypass the stripe cache), or a write lock
when servicing a write or a resync.
It should all interface with DLM quite well - when DLM tries to reclaim a lock
we first mark all the stripes as not up-to-date...

Does DM simply use DLM for locking or something else?


> Will try testing your code tomorrow.

Thanks,

NeilBrown

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [dm-devel] [PATCH 00/12] A dm-raid45 target implemented using md raid5.
  2010-04-15 22:14   ` Neil Brown
@ 2010-04-16  9:27     ` Heinz Mauelshagen
  0 siblings, 0 replies; 17+ messages in thread
From: Heinz Mauelshagen @ 2010-04-16  9:27 UTC (permalink / raw)
  To: device-mapper development; +Cc: linux-raid, Alasdair G Kergon

On Fri, 2010-04-16 at 08:14 +1000, Neil Brown wrote:
> On Thu, 15 Apr 2010 19:27:15 +0200
> Heinz Mauelshagen <heinzm@redhat.com> wrote:
> 
> > 
> > Hi Neil,
> > 
> > had a first go reading through your patch series w/o finding any major
> > issues. The only important feature for an initial release which needs
> > adding (as you mentioned) is (persistent) dirty log support.
> > 
> > Because you're using a persistent bitmap in the MD RAID personalities,
> > this looks like a bit more surgery to factor it out to potentially
> > enhance dm-log.c. For an initial solution we can as well just go with
> > MDs existing bitmap while keeping the dm-raid456 ctr support for
> > explicit dirty logging in order to avoid compatibility issues (there's
> > obviously no parameter to support bitmap chunk sizes so far).
> 
> I don't think we can use md's existing bitmap support as there is no easy way
> to store it on an arbitrary target:  it either lives near the metadata or on
> a file (not a device).
> There a just a few calls in the interface to md/bitmap.c - it shouldn't be
> too hard to make those selectively call into a dm_dirty_log instead.

Good, my thinking was that if using the dm-dirty-log interface, there is
some valuable MD bitmap code we could factor out (bitmap flushing
enhancements?).

> 
> I want to do something like that anyway as I want to optionally be able to use
> a dirty log which is a list of dirty sector addresses rather than a bitmap.
> I'll have a look next week.

Ok.

> 
> And the "bitmap chunk size" is exactly the same as the dm "region size".
> (which would probably have been a better name to choose for md too).

Fair enough.

> 
> > 
> > Reshaping could be triggered either preferably via the constructor
> > involving MD metadata reads to be able to recognize the size change
> > requested or the message interface. Both ctr/message support could be
> > implemented sharing the same functions. Enhancements in the status
> > interface and dm_table_event() throwing on error/finish are mandatory if
> > we support reshaping.
> 
> I imagine enhancing the constructor to take before/after values for
> type, disks, chunksize, and a sector which marks where "after" starts.
> You also need to know which direction the reshape is going (low addresses to
> high addresses, or the reverse) though that might be implicit in the other
> values.

Yes, that can be additional ctr variable parameters allowing for a
compatible enhancement.

One possibility could be using variable parameters from free #8 on:

o to_raid_type		# may be existing one; eg. raid6_zr
o to_chunk_size		# new requested chunk size in sectors
o old_size		# actual size of the array
o low_to_high/high_to_low # low->high or high->low addresses

ti->len defines the new intended size while old_size provides the actual
size of the array.

> 
> > 
> > A shortcoming of this MD wrapping solution vs. dm-raid45 is, that there
> > is no obvious way to leverage it to be a clustered RAID456 mapping
> > target. dm-raid45 has been designed with that future enhancement
> > possibility in mind.
> > 
> 
> I haven't given cluster locking a lot of thought...
> I would probably do the locking on a per-"stripe_head" basis as everything
> revolves around that structure.

Makes sense. I was also thinking about tying stripe invalidation to lock
state changes.

> Get a shared lock when servicing a read (Which would only happen on a
> degraded array - normally reads bypass the stripe cache), or a write lock
> when servicing a write or a resync.

Yes, an exclusive DLM lock.

> It should all interface with DLM quite well - when DLM tries to reclaim a lock
> we first mark all the stripe as not up-to-date...

When a dm-raid45(6) instance tries to reclaim either lock *after* it had
to drop it before, it has to invalidate the respective stripe data.

> 
> Does DM simply use DLM for locking or something else?

We don't use the DLM from DM yet, but essentially: yes, you'd call
dlm_new_lockspace(), dlm_lock(..., DLM_LOCK_{CR|EX}, ...), ...

Of course such locking has to be abstracted in dm-raid456 in order to
plug in NULL, clustered, locking modules.

Cheers,
Heinz

> 
> 
> > Will try testing your code tomorrow.
> 
> Thanks,
> 
> NeilBrown
> 
> --
> dm-devel mailing list
> dm-devel@redhat.com
> https://www.redhat.com/mailman/listinfo/dm-devel



^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2010-04-16  9:27 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-04-15  6:43 [PATCH 00/12] A dm-raid45 target implemented using md raid5 NeilBrown
2010-04-15  6:43 ` [PATCH 06/12] dm-raid456: add congestion checking NeilBrown
2010-04-15  6:43 ` [PATCH 12/12] dm-raid456: add message handler NeilBrown
2010-04-15  6:43 ` [PATCH 08/12] md/plug: optionally use plugger to unplug an array during resync/recovery NeilBrown
2010-04-15  6:43 ` [PATCH 07/12] md/raid5: add simple plugging infrastructure NeilBrown
2010-04-15  6:43 ` [PATCH 09/12] dm-raid456: support unplug NeilBrown
2010-04-15  6:43 ` [PATCH 01/12] md: reduce dependence on sysfs NeilBrown
2010-04-15  6:43 ` [PATCH 05/12] raid5: Don't set read-ahead when there is no queue NeilBrown
2010-04-15  6:43 ` [PATCH 03/12] md/dm: create dm-raid456 module using md/raid5 NeilBrown
2010-04-15  6:43 ` [PATCH 11/12] dm-raid456: add suspend/resume method NeilBrown
2010-04-15  6:43 ` [PATCH 10/12] dm-raid456: add support for setting IO hints NeilBrown
2010-04-15  6:43 ` [PATCH 04/12] dm-raid456: add support for raising events to userspace NeilBrown
2010-04-15  6:43 ` [PATCH 02/12] md/raid5: factor out code for changing size of stripe cache NeilBrown
2010-04-15  8:52 ` [PATCH 00/12] A dm-raid45 target implemented using md raid5 Jeff Garzik
2010-04-15 17:27 ` [dm-devel] " Heinz Mauelshagen
2010-04-15 22:14   ` Neil Brown
2010-04-16  9:27     ` Heinz Mauelshagen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).