All of lore.kernel.org
 help / color / mirror / Atom feed
From: Konstantin Sharlaimov <konstantin.sharlaimov@gmail.com>
To: linux-raid <linux-raid@vger.kernel.org>
Cc: Rik van Riel <riel@redhat.com>,
	Ingo Molnar <mingo@redhat.com>,
	Neil Brown <neilb@suse.de>
Subject: [RFC PATCH 2.6.23.1] md: add dm-raid1 read balancing
Date: Sat, 03 Nov 2007 20:08:42 +1000	[thread overview]
Message-ID: <1194084522.1353.10.camel@localhost> (raw)

This patch adds RAID1 read balancing to device mapper. A read operation
that is close (in terms of sectors) to a previous read or write goes to 
the same mirror.

Signed-off-by: Konstantin Sharlaimov <konstantin.sharlaimov@gmail.com>
---
Please give it a try. It works for me, but my results might be system-specific.
Any feedback (bug reports, suggestions) will be greatly appreciated.

--- linux-2.6.23.1/drivers/md/dm-raid1.c.old	2007-11-03 18:47:10.000000000 +1000
+++ linux-2.6.23.1/drivers/md/dm-raid1.c	2007-11-03 19:54:35.000000000 +1000
@@ -19,6 +19,7 @@
 #include <linux/time.h>
 #include <linux/vmalloc.h>
 #include <linux/workqueue.h>
+#include <linux/random.h>
 
 #define DM_MSG_PREFIX "raid1"
 #define DM_IO_PAGES 64
@@ -26,6 +27,9 @@
 #define DM_RAID1_HANDLE_ERRORS 0x01
 #define errors_handled(p)	((p)->features & DM_RAID1_HANDLE_ERRORS)
 
+/* Read balancing: max HDD head distance (in sectors) for staying on the nearest mirror */
+#define DM_RAID1_BALANCE_MAX_IO_DISTANCE	(256)
+
 static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped);
 
 /*-----------------------------------------------------------------
@@ -116,6 +120,7 @@ struct mirror {
 	atomic_t error_count;
 	struct dm_dev *dev;
 	sector_t offset;
+	sector_t last_io_sector;
 };
 
 struct mirror_set {
@@ -741,13 +746,51 @@ static void do_recovery(struct mirror_se
 	}
 }
 
+static void set_mirror_last_io_sector(struct mirror *m, sector_t sector)
+{
+	/* Remember the sector of the latest IO on mirror m. FIXME: no locking here - confirm a racy update is harmless */
+	m->last_io_sector = sector;
+}
+
 /*-----------------------------------------------------------------
  * Reads
  *---------------------------------------------------------------*/
+/*
+ * Each mirror records the sector of the last read or write issued to it.
+ * When balancing reads we pick the mirror whose last IO position (an
+ * approximation of its HDD head position) is closest to the new request.
+ */
 static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
 {
-	/* FIXME: add read balancing */
-	return ms->default_mirror;
+	/* If we got here, then the array is in sync and we can pick any mirror */
+	unsigned int i;
+	struct mirror *use_mirror = &ms->mirror[0];
+	sector_t last, use_distance, new_distance;
+	/*
+	 * sector_t is unsigned and may be 64-bit, which abs() is not suitable
+	 * for; subtract the smaller position from the larger one instead.
+	 */
+	last = use_mirror->last_io_sector;
+	use_distance = (sector > last) ? sector - last : last - sector;
+
+	for (i = 1; i < ms->nr_mirrors; i++) {
+		last = ms->mirror[i].last_io_sector;
+		new_distance = (sector > last) ? sector - last : last - sector;
+		if (new_distance < use_distance) {
+			use_distance = new_distance;
+			use_mirror = &ms->mirror[i];
+		}
+	}
+
+	/*
+	 * If even the closest head is too far away, balance stochastically:
+	 * a random mirror appears to have a better chance of being idle when
+	 * two or more regions reside on the same physical disk.
+	 * TODO: Gather more statistical data and verify that the above is correct
+	 */
+	if (use_distance > DM_RAID1_BALANCE_MAX_IO_DISTANCE)
+		return &ms->mirror[random32() % ms->nr_mirrors];
+	return use_mirror;
 }
 
 /*
@@ -776,6 +819,9 @@ static void do_reads(struct mirror_set *
 		else
 			m = ms->default_mirror;
 
+		/* Set last IO position for chosen mirror */
+		set_mirror_last_io_sector(m, bio->bi_sector);
+			
 		map_bio(ms, m, bio);
 		generic_make_request(bio);
 	}
@@ -800,6 +846,21 @@ static void write_callback(unsigned long
 
 	ms = bio_get_ms(bio);
 	bio_set_ms(bio, NULL);
+	
+	/*
+	 * Things might be different for various region states:
+	 * SYNC:	writing is done to all mirrors, reading is balanced
+	 * RECOVERING:	writing is delayed, reading is done from the default
+	 * NOSYNC:	writing to default only, reading from the default
+	 *
+	 * In any case, if we update last IO sector at all mirrors, we will use 
+	 * the up-to-date data when doing read balancing
+	 *
+	 * FIXME: update write position only on the region being written
+	 */
+
+	for (i = 0; i < ms->nr_mirrors; i++)
+		set_mirror_last_io_sector(&ms->mirror[i], bio->bi_sector);
 
 	/*
 	 * NOTE: We don't decrement the pending count here,


             reply	other threads:[~2007-11-03 10:08 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-11-03 10:08 Konstantin Sharlaimov [this message]
2007-11-03 14:29 ` [RFC PATCH 2.6.23.1] md: add dm-raid1 read balancing Rik van Riel
2007-11-07  9:15 ` Was: " Goswin von Brederlow
2007-11-08 11:06   ` Konstantin Sharlaimov
2007-11-08 16:28     ` Goswin von Brederlow
2007-11-08 17:06       ` Rik van Riel
2007-11-08 17:24         ` Bill Davidsen
2007-11-08 19:43         ` Goswin von Brederlow
2007-11-08 17:35       ` Bill Davidsen
2007-11-11 23:36 ` Samuel Tardieu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1194084522.1353.10.camel@localhost \
    --to=konstantin.sharlaimov@gmail.com \
    --cc=linux-raid@vger.kernel.org \
    --cc=mingo@redhat.com \
    --cc=neilb@suse.de \
    --cc=riel@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.