All of lore.kernel.org
 help / color / mirror / Atom feed
From: Jonathan Brassow <jbrassow@redhat.com>
To: dm-devel@redhat.com, linux-raid@vger.kernel.org
Cc: jbrassow@redhat.com, agk@redhat.com, neilb@suse.de
Subject: [PATCH] DM RAID:  Add support for MD RAID10 personality
Date: Tue, 26 Jun 2012 07:03:51 -0500	[thread overview]
Message-ID: <1340712231.19015.42.camel@f16> (raw)

dm raid: add md raid10 support

Support the MD RAID10 personality through dm-raid.c

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>

Index: linux-upstream/drivers/md/dm-raid.c
===================================================================
--- linux-upstream.orig/drivers/md/dm-raid.c
+++ linux-upstream/drivers/md/dm-raid.c
@@ -11,6 +11,7 @@
 #include "md.h"
 #include "raid1.h"
 #include "raid5.h"
+#include "raid10.h"
 #include "bitmap.h"
 
 #include <linux/device-mapper.h>
@@ -52,7 +53,11 @@ struct raid_dev {
 #define DMPF_MAX_RECOVERY_RATE 0x20
 #define DMPF_MAX_WRITE_BEHIND  0x40
 #define DMPF_STRIPE_CACHE      0x80
-#define DMPF_REGION_SIZE       0X100
+#define DMPF_REGION_SIZE       0x100
+#define DMPF_RAID10_NEAR_COPIES 0x200
+#define DMPF_RAID10_FAR_COPIES  0x400
+#define DMPF_RAID10_FAR_OFFSET  0x800
+
 struct raid_set {
 	struct dm_target *ti;
 
@@ -66,6 +71,15 @@ struct raid_set {
 	struct raid_dev dev[0];
 };
 
+/* near_copies in first byte */
+/* far_copies in second byte */
+/* far_offset in 17th bit */
+#define ALGORITHM_RAID10(near_copies, far_copies, far_offset) \
+	((near_copies & 0xFF) | ((far_copies & 0xFF) << 8) | ((!!far_offset) << 16))
+#define RAID10_NC(layout) (layout & 0xFF)
+#define RAID10_FC(layout) ((layout >> 8) & 0xFF)
+#define RAID10_FO(layout) (layout & 0x10000)
+
 /* Supported raid types and properties. */
 static struct raid_type {
 	const char *name;		/* RAID algorithm. */
@@ -76,6 +90,8 @@ static struct raid_type {
 	const unsigned algorithm;	/* RAID algorithm. */
 } raid_types[] = {
 	{"raid1",    "RAID1 (mirroring)",               0, 2, 1, 0 /* NONE */},
+	{"raid10",   "RAID10 (striped mirrors)",        0, 2, 10, -1 /* Varies */},
+	{"raid1e",   "RAID1E (Enhanced RAID1)",         0, 2, 10, -1 /* Varies */},
 	{"raid4",    "RAID4 (dedicated parity disk)",	1, 2, 5, ALGORITHM_PARITY_0},
 	{"raid5_la", "RAID5 (left asymmetric)",		1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
 	{"raid5_ra", "RAID5 (right asymmetric)",	1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
@@ -339,10 +355,17 @@ static int validate_region_size(struct r
  *    [max_write_behind <sectors>]	See '-write-behind=' (man mdadm)
  *    [stripe_cache <sectors>]		Stripe cache size for higher RAIDs
  *    [region_size <sectors>]           Defines granularity of bitmap
+ *
+ * RAID10-only options:
+ *    [raid10_near_copies   <# copies>] Near copies. (Default: 2)
+ *    [raid10_far_copies    <# copies>] Far copies.  (Default: 1)
+ *    [raid10_far_offset    <0/1>]      Offset is device size(0) or stripe(1).
  */
 static int parse_raid_params(struct raid_set *rs, char **argv,
 			     unsigned num_raid_params)
 {
+	unsigned raid10_default = ALGORITHM_RAID10(2, 1, 0);
+	unsigned raid10_nc = 1, raid10_fc = 1, raid10_fo = 0;
 	unsigned i, rebuild_cnt = 0;
 	unsigned long value, region_size = 0;
 	sector_t sectors_per_dev = rs->ti->len;
@@ -435,6 +458,7 @@ static int parse_raid_params(struct raid
 				if (rebuild_cnt > rs->raid_type->parity_devs)
 					rs->ti->error = "Too many rebuild devices specified for given RAID type";
 				break;
+			case 10:
 			default:
 				DMERR("The rebuild parameter is not supported for %s", rs->raid_type->name);
 				rs->ti->error = "Rebuild not supported for this RAID type";
@@ -492,7 +516,7 @@ static int parse_raid_params(struct raid
 			 */
 			value /= 2;
 
-			if (rs->raid_type->level < 5) {
+			if (rs->raid_type->level != 5) {
 				rs->ti->error = "Inappropriate argument: stripe_cache";
 				return -EINVAL;
 			}
@@ -517,6 +541,33 @@ static int parse_raid_params(struct raid
 		} else if (!strcasecmp(key, "region_size")) {
 			rs->print_flags |= DMPF_REGION_SIZE;
 			region_size = value;
+		} else if (!strcasecmp(key, "raid10_near_copies") &&
+			   (rs->raid_type->level == 10)) {
+			if ((value < 1) || (value > 0xFF)) {
+				rs->ti->error = "Bad value for 'raid10_near_copies'";
+				return -EINVAL;
+			}
+			rs->print_flags |= DMPF_RAID10_NEAR_COPIES;
+			raid10_nc = value;
+			raid10_default = 0;
+		} else if (!strcasecmp(key, "raid10_far_copies") &&
+			   (rs->raid_type->level == 10)) {
+			if ((value < 1) || (value > 0xFF)) {
+				rs->ti->error = "Bad value for 'raid10_far_copies'";
+				return -EINVAL;
+			}
+			rs->print_flags |= DMPF_RAID10_FAR_COPIES;
+			raid10_fc = value;
+			raid10_default = 0;
+		} else if (!strcasecmp(key, "raid10_far_offset") &&
+			   (rs->raid_type->level == 10)) {
+			if (value > 1) {
+				rs->ti->error = "Bad value for 'raid10_far_offset'";
+				return -EINVAL;
+			}
+			rs->print_flags |= DMPF_RAID10_FAR_OFFSET;
+			raid10_fo = value;
+			raid10_default = 0;
 		} else {
 			DMERR("Unable to parse RAID parameter: %s", key);
 			rs->ti->error = "Unable to parse RAID parameters";
@@ -532,9 +583,33 @@ static int parse_raid_params(struct raid
 	else
 		rs->ti->split_io = region_size;
 
-	if ((rs->raid_type->level > 1) &&
-	    sector_div(sectors_per_dev, (rs->md.raid_disks - rs->raid_type->parity_devs))) {
+	if (rs->raid_type->level == 10) {
+		/* (Len * Stripes) / Mirrors */
+		sectors_per_dev *= rs->md.raid_disks;
+		if (sector_div(sectors_per_dev, (raid10_nc * raid10_fc))) {
+			rs->ti->error = "Target length not divisible by number of data devices";
+			return -EINVAL;
+		}
+		if ((raid10_nc * raid10_fc) > rs->md.raid_disks) {
+			rs->ti->error = "Not enough devices to satisfy specification";
+			return -EINVAL;
+		}
+		if (raid10_fo && (raid10_fc < 2)) {
+			DMWARN("RAID10 parameter 'far_offset' ignored");
+			raid10_fo = 0;
+		}
+
+		if (raid10_default)
+			rs->md.layout = raid10_default;
+		else
+			rs->md.layout = ALGORITHM_RAID10(raid10_nc,
+							 raid10_fc, raid10_fo);
+		rs->md.new_layout = rs->md.layout;
+	} else if ((rs->raid_type->level > 1) &&
+		   sector_div(sectors_per_dev,
+			      (rs->md.raid_disks - rs->raid_type->parity_devs))) {
 		rs->ti->error = "Target length not divisible by number of data devices";
+
 		return -EINVAL;
 	}
 	rs->md.dev_sectors = sectors_per_dev;
@@ -560,6 +635,9 @@ static int raid_is_congested(struct dm_t
 	if (rs->raid_type->level == 1)
 		return md_raid1_congested(&rs->md, bits);
 
+	if (rs->raid_type->level == 10)
+		return md_raid10_congested(&rs->md, bits);
+
 	return md_raid5_congested(&rs->md, bits);
 }
 
@@ -878,6 +956,9 @@ static int analyse_superblocks(struct dm
 	case 6:
 		redundancy = rs->raid_type->parity_devs;
 		break;
+	case 10:
+		redundancy = RAID10_NC(mddev->layout) *	RAID10_FC(mddev->layout);
+		break;
 	default:
 		ti->error = "Unknown RAID type";
 		return -EINVAL;
@@ -1197,6 +1278,18 @@ static int raid_status(struct dm_target
 			DMEMIT(" region_size %lu",
 			       rs->md.bitmap_info.chunksize >> 9);
 
+		if (rs->print_flags & DMPF_RAID10_NEAR_COPIES)
+			DMEMIT(" raid10_near_copies %u",
+			       RAID10_NC(rs->md.layout));
+
+		if (rs->print_flags & DMPF_RAID10_FAR_COPIES)
+			DMEMIT(" raid10_far_copies %u",
+			       RAID10_FC(rs->md.layout));
+
+		if (rs->print_flags & DMPF_RAID10_FAR_OFFSET)
+			DMEMIT(" raid10_far_offset %u",
+			       RAID10_FO(rs->md.layout));
+
 		DMEMIT(" %d", rs->md.raid_disks);
 		for (i = 0; i < rs->md.raid_disks; i++) {
 			if (rs->dev[i].meta_dev)
@@ -1271,7 +1364,7 @@ static void raid_resume(struct dm_target
 
 static struct target_type raid_target = {
 	.name = "raid",
-	.version = {1, 2, 0},
+	.version = {1, 3, 0},
 	.module = THIS_MODULE,
 	.ctr = raid_ctr,
 	.dtr = raid_dtr,
@@ -1298,6 +1391,8 @@ module_init(dm_raid_init);
 module_exit(dm_raid_exit);
 
 MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target");
+MODULE_ALIAS("dm-raid1");
+MODULE_ALIAS("dm-raid10");
 MODULE_ALIAS("dm-raid4");
 MODULE_ALIAS("dm-raid5");
 MODULE_ALIAS("dm-raid6");
Index: linux-upstream/Documentation/device-mapper/dm-raid.txt
===================================================================
--- linux-upstream.orig/Documentation/device-mapper/dm-raid.txt
+++ linux-upstream/Documentation/device-mapper/dm-raid.txt
@@ -27,6 +27,11 @@ The target is named "raid" and it accept
 		- rotating parity N (right-to-left) with data restart
   raid6_nc	RAID6 N continue
 		- rotating parity N (right-to-left) with data continuation
+  raid10/raid1e Various RAID10 inspired algorithms chosen by additional params
+  		- RAID10: Striped Mirrors (aka 'Striping on top of mirrors')
+		- RAID1E: Integrated Adjacent Stripe Mirroring
+		- RAID1E: Integrated Offset Stripe Mirroring
+		- and other similar RAID10 variants
 
   Reference: Chapter 4 of
   http://www.snia.org/sites/default/files/SNIA_DDF_Technical_Position_v2.0.pdf
@@ -59,6 +64,80 @@ The target is named "raid" and it accept
 		logical size of the array.  The bitmap records the device
 		synchronisation state for each region.
 
+        [raid10_near_copies   <# copies>]
+        [raid10_far_copies    <# copies>]
+        [raid10_far_offset    <0/1>]
+		These three options are used to alter the default layout of
+		a RAID10/RAID1E configuration.  The total number of copies is
+		given by the number of "near" (aka "adjacent") copies times
+		the number of "far" (aka "offset") copies.  Near copies
+		are what most people think of with respect to mirroring.
+		If 'raid10_near_copies 2', 'raid10_far_copies 1' and
+		'raid10_far_offset 0', then the layouts for 2, 3 and 4 devices
+		are:
+		2 drives         3 drives          4 drives
+		--------         ----------        --------------
+		A1  A1           A1  A1  A2        A1  A1  A2  A2
+		A2  A2           A2  A3  A3        A3  A3  A4  A4
+		A3  A3           A4  A4  A5        A5  A5  A6  A6
+		A4  A4           A5  A6  A6        A7  A7  A8  A8
+		..  ..           ..  ..  ..        ..  ..  ..  ..
+		The 2-device layout is equivalent to 2-way RAID1.  The 4-device
+		layout is what a traditional RAID10 would look like.  The
+		3-device layout is what might be called a 'RAID1E - Integrated
+		Adjacent Stripe Mirroring'.
+
+		The 'raid10_far_[copies|offset]' arguments work together to
+		determine where any "far"/"offset" copies will be placed.
+		If 'raid10_near_copies 1', 'raid10_far_copies 2' and
+		'raid10_far_offset 0', then the layouts for 2, 3 and 4 devices
+		are:
+		2 drives             3 drives             4 drives
+		--------             --------------       --------------------
+		A1  A2               A1   A2   A3         A1   A2   A3   A4
+		A3  A4               A4   A5   A6         A5   A6   A7   A8
+		A5  A6               A7   A8   A9         A9   A10  A11  A12
+		..  ..               ..   ..   ..         ..   ..   ..   ..
+		A2  A1               A3   A1   A2         A4   A1   A2   A3
+		A4  A3               A6   A4   A5         A8   A5   A6   A7
+		A6  A5               A9   A7   A8         A12  A9   A10  A11
+		..  ..               ..   ..   ..         ..   ..   ..   ..
+
+		If 'raid10_near_copies 1', 'raid10_far_copies 2' and
+		'raid10_far_offset 1', then the layouts for 2, 3 and 4 devices
+		are:
+		2 drives       3 drives           4 drives
+		--------       ------------       -----------------
+		A1  A2         A1  A2  A3         A1  A2  A3  A4
+		A2  A1         A3  A1  A2         A4  A1  A2  A3
+		A3  A4         A4  A5  A6         A5  A6  A7  A8
+		A4  A3         A6  A4  A5         A8  A5  A6  A7
+		A5  A6         A7  A8  A9         A9  A10 A11 A12
+		A6  A5         A9  A7  A8         A12 A9  A10 A11
+		..  ..         ..  ..  ..         ..  ..  ..  ..
+		Here we see layouts closely akin to 'RAID1E - Integrated
+		Offset Stripe Mirroring'.
+
+		Near and far copies can both be specified giving more
+		complex arrangements.  If 'raid10_near_copies 2',
+		'raid10_far_copies 2' and 'raid10_far_offset 0', then the
+		layouts for 4 and 5 devices are:
+		4 drives              5 drives
+		--------              --------
+		A1  A1  A2  A2        A1  A1  A2  A2  A3
+		A3  A3  A4  A4        A3  A4  A4  A5  A5
+		A5  A5  A6  A6        A6  A6  A7  A7  A8
+		A7  A7  A8  A8        A8  A9  A9  A10 A10
+		..  ..  ..  ..        ..  ..  ..  ..  ..
+		A2  A2  A1  A1        A2  A3  A1  A1  A2
+		A4  A4  A3  A3        A5  A5  A3  A4  A4
+		A6  A6  A5  A5        A7  A8  A6  A6  A7
+		A8  A8  A7  A7        A10 A10 A8  A9  A9
+		..  ..  ..  ..        ..  ..  ..  ..  ..
+		Thanks to the Wikipedia article 'Non-standard RAID levels' for
+		the layout figures:
+		http://en.wikipedia.org/wiki/Non-standard_RAID_levels
+
 <#raid_devs>: The number of devices composing the array.
 	Each device consists of two entries.  The first is the device
 	containing the metadata (if any); the second is the one containing the



             reply	other threads:[~2012-06-26 12:03 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-06-26 12:03 Jonathan Brassow [this message]
2012-07-04  1:21 ` [PATCH] DM RAID: Add support for MD RAID10 personality NeilBrown
2012-07-04  3:20   ` Brassow Jonathan
2012-07-04  5:15     ` NeilBrown
2012-07-04 15:27       ` Jan Ceuleers
2012-07-10 19:27       ` Brassow Jonathan
2012-07-11  0:08         ` NeilBrown

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1340712231.19015.42.camel@f16 \
    --to=jbrassow@redhat.com \
    --cc=agk@redhat.com \
    --cc=dm-devel@redhat.com \
    --cc=linux-raid@vger.kernel.org \
    --cc=neilb@suse.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.