From: Samuel Tardieu <sam@rfc1149.net>
To: linux-raid@vger.kernel.org
Subject: [PATCH reposted] raid1 load balancing
Date: Thu, 24 Jan 2008 22:57:20 +0100
Message-ID: <87sl0mrinz.fsf@willow.rfc1149.net>
I have been running Konstantin's patch to add raid1 load balancing
since last November. I have been tracking Linus' git version of the
kernel plus this patch and haven't noticed any drawbacks.

It might be a good idea to apply it, perhaps with a FIXME reminding
people that a more elaborate solution could be used. Here is the
patch, updated to apply against Linus' HEAD.
Author: Konstantin Sharlaimov <konstantin.sharlaimov@gmail.com>
Date: Sat Nov 3 20:08:42 2007 +1000
md: add dm-raid1 read balancing
This patch adds RAID1 read balancing to device mapper. A read operation
that is close (in terms of sectors) to a previous read or write goes to
the same mirror.
Signed-off-by: Konstantin Sharlaimov <konstantin.sharlaimov@gmail.com>
Tested-by: Samuel Tardieu <sam@rfc1149.net>
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 31123d4..a103340 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -19,6 +19,7 @@
#include <linux/time.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
+#include <linux/random.h>
#include <linux/log2.h>
#define DM_MSG_PREFIX "raid1"
@@ -27,6 +28,9 @@
#define DM_RAID1_HANDLE_ERRORS 0x01
#define errors_handled(p) ((p)->features & DM_RAID1_HANDLE_ERRORS)
+/* Read balancing: max HDD head distance, in sectors */
+#define DM_RAID1_BALANCE_MAX_IO_DISTANCE (256)
+
static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped);
/*-----------------------------------------------------------------
@@ -118,6 +122,7 @@ struct mirror {
atomic_t error_count;
struct dm_dev *dev;
sector_t offset;
+ sector_t last_io_sector;
};
struct mirror_set {
@@ -743,13 +748,51 @@ static void do_recovery(struct mirror_set *ms)
}
}
+static void set_mirror_last_io_sector(struct mirror *m, sector_t sector)
+{
+ /* FIXME: More work may be needed here, but that seems unlikely */
+ m->last_io_sector = sector;
+}
+
/*-----------------------------------------------------------------
* Reads
*---------------------------------------------------------------*/
+/*
+ * A per-mirror 'last I/O' sector number is maintained by the read and
+ * write handlers for the region. When balancing reads we pick the
+ * mirror whose last I/O (presumed HDD head position) is closest.
+ */
static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
{
- /* FIXME: add read balancing */
- return ms->default_mirror;
+ /* If we got here, then the array is in sync and we can pick any mirror */
+
+ unsigned int i;
+ struct mirror *use_mirror;
+ sector_t use_distance, new_distance;
+
+ use_mirror = &ms->mirror[0];
+ use_distance = max(sector, ms->mirror[0].last_io_sector) - min(sector, ms->mirror[0].last_io_sector);
+
+ for (i = 1; i < ms->nr_mirrors; i++) {
+ new_distance = max(sector, ms->mirror[i].last_io_sector) - min(sector, ms->mirror[i].last_io_sector);
+ if (new_distance < use_distance) {
+ use_distance = new_distance;
+ use_mirror = &ms->mirror[i];
+ }
+ }
+
+ /*
+ * If the HDD head is too far from the needed sector, we do stochastic
+ * balancing instead: choose a mirror at random. This appears to have a
+ * better chance of choosing an idle disk when two or more regions
+ * reside on the same physical disk.
+ *
+ * TODO: Gather more statistical data and verify that the above is correct
+ */
+ if (use_distance > DM_RAID1_BALANCE_MAX_IO_DISTANCE)
+ return &ms->mirror[random32() % ms->nr_mirrors];
+ else
+ return use_mirror;
}
/*
@@ -778,6 +821,9 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads)
else
m = ms->default_mirror;
+ /* Set last IO position for chosen mirror */
+ set_mirror_last_io_sector(m, bio->bi_sector);
+
map_bio(ms, m, bio);
generic_make_request(bio);
}
@@ -804,6 +850,21 @@ static void write_callback(unsigned long error, void *context)
bio_set_ms(bio, NULL);
/*
+ * Things might be different for various region states:
+ * SYNC: writing is done to all mirrors, reading is balanced
+ * RECOVERING: writing is delayed, reading is done from the default
+ * NOSYNC: writing to default only, reading from the default
+ *
+ * In any case, if we update the last I/O sector on all mirrors, read
+ * balancing will always work with up-to-date data.
+ *
+ * FIXME: update write position only on the region being written
+ */
+
+ for (i = 0; i < ms->nr_mirrors; i++)
+ set_mirror_last_io_sector(&ms->mirror[i], bio->bi_sector);
+
+ /*
* NOTE: We don't decrement the pending count here,
* instead it is done by the targets endio function.
* This way we handle both writes to SYNC and NOSYNC
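
For anyone who wants to experiment with the heuristic outside the
kernel, below is a small standalone userspace sketch of the same idea
(pick the mirror whose last I/O position is nearest, fall back to a
random mirror beyond a distance threshold). It is an illustration only:
the names sim_mirror, pick_mirror and MAX_IO_DISTANCE are made up for
the example and are not part of the patch.

/*
 * Userspace simulation of the read-balancing heuristic above.
 * Everything here is invented for the example and is not in the patch.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define MAX_IO_DISTANCE 256     /* same role as DM_RAID1_BALANCE_MAX_IO_DISTANCE */

struct sim_mirror {
        uint64_t last_io_sector;        /* last read/write position on this leg */
};

static uint64_t distance(uint64_t a, uint64_t b)
{
        return a > b ? a - b : b - a;
}

/* Pick the mirror whose presumed head position is closest to 'sector'. */
static unsigned int pick_mirror(const struct sim_mirror *m, unsigned int nr,
                                uint64_t sector)
{
        unsigned int i, best = 0;
        uint64_t best_dist = distance(sector, m[0].last_io_sector);

        for (i = 1; i < nr; i++) {
                uint64_t d = distance(sector, m[i].last_io_sector);

                if (d < best_dist) {
                        best_dist = d;
                        best = i;
                }
        }

        /* Every head is far from the target: fall back to a random mirror. */
        if (best_dist > MAX_IO_DISTANCE)
                return (unsigned int)(rand() % nr);

        return best;
}

int main(void)
{
        struct sim_mirror m[2] = {
                { .last_io_sector = 1000 },
                { .last_io_sector = 500000 },
        };

        /* Close to mirror 0's last position: sequential reads stay there. */
        printf("sector 1100   -> mirror %u\n", pick_mirror(m, 2, 1100));
        /* Far from both recorded positions: the choice becomes random. */
        printf("sector 900000 -> mirror %u\n", pick_mirror(m, 2, 900000));
        return 0;
}

Compiled with any C compiler, the first lookup stays on the mirror whose
recorded position is nearby, while the second, far from both recorded
positions, is picked at random.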