* [PATCH/RFC] md/multipath: implement I/O balancing
From: Namhyung Kim @ 2011-06-10 11:32 UTC
  To: Neil Brown; +Cc: linux-raid

Implement basic I/O balancing (for both reads and writes) for the
multipath personality. The code is based on the RAID1 implementation.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
---
 drivers/md/multipath.c |   70 ++++++++++++++++++++++++++++++++++++++---------
 drivers/md/multipath.h |    1 +
 2 files changed, 57 insertions(+), 14 deletions(-)
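
For reviewers who want to see the balancing logic in isolation, below is
a minimal userspace sketch of the closest-head selection this patch
implements. The path_info type, pick_path(), and the sample values are
hypothetical stand-ins for the kernel's multipath_conf_t/rdev state; it
is an illustration of the technique, not the kernel code itself.

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;

/* Hypothetical stand-in for one path's balancing state. */
struct path_info {
	int		in_sync;	/* is the path usable? */
	sector_t	data_offset;	/* like rdev->data_offset */
	sector_t	head_position;	/* set at I/O completion time */
};

/*
 * Pick the in-sync path whose last known head position is closest
 * to the requested sector, mirroring what multipath_map() does below.
 */
static int pick_path(const struct path_info *paths, int npaths,
		     sector_t sector)
{
	sector_t best_dist = ~(sector_t)0;	/* MaxSector equivalent */
	int best_disk = -1;
	int i;

	for (i = 0; i < npaths; i++) {
		sector_t this_sector, dist;

		if (!paths[i].in_sync)
			continue;

		/* 'sector' is array-relative; head_position is a device
		 * sector, so apply the per-path data offset first. */
		this_sector = sector + paths[i].data_offset;
		dist = this_sector > paths[i].head_position ?
		       this_sector - paths[i].head_position :
		       paths[i].head_position - this_sector;
		if (dist < best_dist) {
			best_dist = dist;
			best_disk = i;
		}
	}
	return best_disk;
}

int main(void)
{
	const struct path_info paths[] = {
		{ 1, 0, 1000 },		/* head last seen near sector 1000 */
		{ 1, 0, 5000 },		/* head last seen near sector 5000 */
	};

	/* A request at sector 4800 is closest to path 1. */
	printf("chosen path: %d\n", pick_path(paths, 2, 4800));
	return 0;
}

Note the sketch computes the distance with an explicit conditional
subtraction rather than abs(), which avoids truncating a 64-bit sector
difference.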

diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 3535c23af288..83c4f5105705 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -30,29 +30,58 @@
 
 #define	NR_RESERVED_BUFS	32
 
-
-static int multipath_map (multipath_conf_t *conf)
+/*
+ * This routine returns the disk from which the requested I/O should
+ * be done. There is a per-disk 'last known head position' sector
+ * that is maintained from IRQ context: the I/O completion handlers
+ * update this position as requests finish. We pick the disk whose
+ * head is closest to the requested sector.
+ *
+ * Note that the 'sector' argument is relative to the original bio,
+ * whereas 'head_position' is maintained per rdev, so the per-device
+ * data offset must be taken into account when calculating the
+ * distance.
+ */
+static int multipath_map(multipath_conf_t *conf, sector_t sector)
 {
 	int i, disks = conf->raid_disks;
-
-	/*
-	 * Later we do read balancing on the read side 
-	 * now we use the first available disk.
-	 */
+	int best_disk;
+	sector_t best_dist;
 
 	rcu_read_lock();
+retry:
+	best_disk = -1;
+	best_dist = MaxSector;
+
 	for (i = 0; i < disks; i++) {
+		sector_t dist;
 		mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
+		sector_t this_sector = sector;
+
 		if (rdev && test_bit(In_sync, &rdev->flags)) {
-			atomic_inc(&rdev->nr_pending);
-			rcu_read_unlock();
-			return i;
+			this_sector += rdev->data_offset;
+			dist = abs(this_sector - conf->multipaths[i].head_position);
+			if (dist < best_dist) {
+				best_dist = dist;
+				best_disk = i;
+			}
 		}
 	}
+
+	if (best_disk == -1) {
+		printk(KERN_ERR "multipath_map(): no more operational IO paths?\n");
+	} else {
+		mdk_rdev_t *rdev;
+
+		rdev = rcu_dereference(conf->multipaths[best_disk].rdev);
+		if (!rdev || !test_bit(In_sync, &rdev->flags))
+			goto retry;
+
+		atomic_inc(&rdev->nr_pending);
+	}
 	rcu_read_unlock();
 
-	printk(KERN_ERR "multipath_map(): no more operational IO paths?\n");
-	return (-1);
+	return best_disk;
 }
 
 static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
@@ -82,6 +111,17 @@ static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err)
 	mempool_free(mp_bh, conf->pool);
 }
 
+/*
+ * Update disk head position estimator based on IRQ completion info.
+ */
+static inline void update_head_pos(int disk, struct multipath_bh *mp_bh)
+{
+	multipath_conf_t *conf = mp_bh->mddev->private;
+
+	conf->multipaths[disk].head_position =
+		mp_bh->bio.bi_sector + (mp_bh->bio.bi_size >> 9);
+}
+
 static void multipath_end_request(struct bio *bio, int error)
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -89,6 +129,8 @@ static void multipath_end_request(struct bio *bio, int error)
 	multipath_conf_t *conf = mp_bh->mddev->private;
 	mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev;
 
+	update_head_pos(mp_bh->path, mp_bh);
+
 	if (uptodate)
 		multipath_end_bh_io(mp_bh, 0);
 	else if (!(bio->bi_rw & REQ_RAHEAD)) {
@@ -122,7 +164,7 @@ static int multipath_make_request(mddev_t *mddev, struct bio * bio)
 	mp_bh->master_bio = bio;
 	mp_bh->mddev = mddev;
 
-	mp_bh->path = multipath_map(conf);
+	mp_bh->path = multipath_map(conf, bio->bi_sector);
 	if (mp_bh->path < 0) {
 		bio_endio(bio, -EIO);
 		mempool_free(mp_bh, conf->pool);
@@ -356,7 +398,7 @@ static void multipathd (mddev_t *mddev)
 		bio = &mp_bh->bio;
 		bio->bi_sector = mp_bh->master_bio->bi_sector;
 		
-		if ((mp_bh->path = multipath_map (conf))<0) {
+		if ((mp_bh->path = multipath_map(conf, bio->bi_sector)) < 0) {
 			printk(KERN_ALERT "multipath: %s: unrecoverable IO read"
 				" error for block %llu\n",
 				bdevname(bio->bi_bdev,b),
diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h
index 3c5a45eb5f8a..060fe2aabd97 100644
--- a/drivers/md/multipath.h
+++ b/drivers/md/multipath.h
@@ -3,6 +3,7 @@
 
 struct multipath_info {
 	mdk_rdev_t	*rdev;
+	sector_t	head_position;
 };
 
 struct multipath_private_data {
-- 
1.7.5.2
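
For reference, update_head_pos() above records where the device head
will be once a bio completes: the starting sector plus the bio's size
converted from bytes to 512-byte sectors. A small standalone example of
the arithmetic, with hypothetical values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Hypothetical bio: starts at sector 2048, carries one 4 KiB page. */
	uint64_t bi_sector = 2048;	/* start, in 512-byte sectors */
	uint32_t bi_size = 4096;	/* length in bytes */

	/* bi_size >> 9 converts bytes to 512-byte sectors: 4096 >> 9 == 8,
	 * so the head is expected at sector 2056 after completion. */
	printf("head_position = %llu\n",
	       (unsigned long long)(bi_sector + (bi_size >> 9)));
	return 0;
}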



* Re: [PATCH/RFC] md/multipath: implement I/O balancing
From: NeilBrown @ 2011-06-14  3:59 UTC
  To: Namhyung Kim; +Cc: linux-raid

On Fri, 10 Jun 2011 20:32:11 +0900 Namhyung Kim <namhyung@gmail.com> wrote:

> Implement basic I/O balancing code (for read/write) for multipath
> personality. The code is based on RAID1 implementation.

Thanks, but no thanks.

As far as I am concerned, the md/multipath implementation is deprecated.  The
dm-multipath implementation is much more mature and is more widely used and
actually has a sensible design - unlike md/multipath which has always had a
bad design.

I would rip it out and throw it away if I could, but I believe there are
people who use it so doing that is too difficult.

But I will not be adding features to it at all.

Thanks,
NeilBrown

