From mboxrd@z Thu Jan 1 00:00:00 1970
From: David Woodhouse
Subject: Re: A start at RAID[56] support.
Date: Sat, 11 Jul 2009 15:40:41 +0100
Message-ID: <1247323241.17045.16.camel@macbook.infradead.org>
References: <1247323186.17045.15.camel@macbook.infradead.org>
Mime-Version: 1.0
Content-Type: text/plain
To: linux-btrfs@vger.kernel.org
Return-path:
In-Reply-To: <1247323186.17045.15.camel@macbook.infradead.org>
List-ID:

On Sat, 2009-07-11 at 15:39 +0100, David Woodhouse wrote:
> This is a preliminary attempt to add RAID5 and RAID6 support.

Matching btrfs-progs patch...
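Since the arithmetic is easy to misread in diff form: a RAID5 chunk
gives up one stripe per row to parity and RAID6 gives up two, so the
usable size is calc_size * (num_stripes - 1) or calc_size *
(num_stripes - 2) respectively. Here is a minimal stand-alone sketch
of that geometry. The two helpers mirror the nr_parity_stripes() /
nr_data_stripes() inlines added to volumes.c below; the demo struct
and main() are invented purely for illustration and are not part of
btrfs-progs:

#include <stdio.h>

#define BTRFS_BLOCK_GROUP_RAID5  (1 << 7)
#define BTRFS_BLOCK_GROUP_RAID6  (1 << 8)

/* Cut-down stand-in for struct map_lookup, for the example only */
struct map_lookup_demo {
	unsigned long long type;
	int num_stripes;
};

static inline int nr_parity_stripes(struct map_lookup_demo *map)
{
	if (map->type & BTRFS_BLOCK_GROUP_RAID5)
		return 1;	/* one parity (P) stripe per row */
	else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
		return 2;	/* P and Q stripes per row */
	else
		return 0;
}

static inline int nr_data_stripes(struct map_lookup_demo *map)
{
	return map->num_stripes - nr_parity_stripes(map);
}

int main(void)
{
	struct map_lookup_demo r6 = { BTRFS_BLOCK_GROUP_RAID6, 5 };
	unsigned long long calc_size = 256 * 1024 * 1024; /* per stripe */

	/* Same sum as chunk_bytes_by_type(): parity is not usable space */
	printf("RAID6 on 5 devices: %d data stripes, %llu usable bytes\n",
	       nr_data_stripes(&r6), calc_size * nr_data_stripes(&r6));
	return 0;
}

For a 5-device RAID6 chunk that prints 3 data stripes and
3 * calc_size usable bytes, which is what chunk_bytes_by_type()
returns below.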
diff --git a/ctree.h b/ctree.h
index a9062ea..5b3c690 100644
--- a/ctree.h
+++ b/ctree.h
@@ -640,6 +640,8 @@ struct btrfs_csum_item {
 #define BTRFS_BLOCK_GROUP_RAID1    (1 << 4)
 #define BTRFS_BLOCK_GROUP_DUP      (1 << 5)
 #define BTRFS_BLOCK_GROUP_RAID10   (1 << 6)
+#define BTRFS_BLOCK_GROUP_RAID5    (1 << 7)
+#define BTRFS_BLOCK_GROUP_RAID6    (1 << 8)
 
 struct btrfs_block_group_item {
 	__le64 used;
diff --git a/extent-tree.c b/extent-tree.c
index b2f9bb2..77cfcb5 100644
--- a/extent-tree.c
+++ b/extent-tree.c
@@ -1775,6 +1775,8 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 {
 	u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
 				   BTRFS_BLOCK_GROUP_RAID1 |
+				   BTRFS_BLOCK_GROUP_RAID5 |
+				   BTRFS_BLOCK_GROUP_RAID6 |
 				   BTRFS_BLOCK_GROUP_DUP);
 	if (extra_flags) {
 		if (flags & BTRFS_BLOCK_GROUP_DATA)
diff --git a/mkfs.c b/mkfs.c
index 2e99b95..aefe1af 100644
--- a/mkfs.c
+++ b/mkfs.c
@@ -203,16 +203,22 @@ static int create_raid_groups(struct btrfs_trans_handle *trans,
 			      u64 metadata_profile)
 {
 	u64 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy);
-	u64 allowed;
+	u64 allowed = 0;
 	int ret;
 
-	if (num_devices == 1)
-		allowed = BTRFS_BLOCK_GROUP_DUP;
-	else if (num_devices >= 4) {
-		allowed = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
-			BTRFS_BLOCK_GROUP_RAID10;
-	} else
-		allowed = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1;
+	switch (num_devices) {
+	default:
+	case 4:
+		allowed |= BTRFS_BLOCK_GROUP_RAID10;
+	case 3:
+		allowed |= BTRFS_BLOCK_GROUP_RAID6;
+	case 2:
+		allowed |= BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
+			   BTRFS_BLOCK_GROUP_RAID5;
+		break;
+	case 1:
+		allowed |= BTRFS_BLOCK_GROUP_DUP;
+	}
 
 	if (allowed & metadata_profile) {
 		ret = create_one_raid_group(trans, root,
@@ -292,6 +298,10 @@ static u64 parse_profile(char *s)
 		return BTRFS_BLOCK_GROUP_RAID0;
 	} else if (strcmp(s, "raid1") == 0) {
 		return BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP;
+	} else if (strcmp(s, "raid5") == 0) {
+		return BTRFS_BLOCK_GROUP_RAID5;
+	} else if (strcmp(s, "raid6") == 0) {
+		return BTRFS_BLOCK_GROUP_RAID6;
 	} else if (strcmp(s, "raid10") == 0) {
 		return BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_DUP;
 	} else if (strcmp(s, "single") == 0) {
diff --git a/volumes.c b/volumes.c
index 7671855..90090b0 100644
--- a/volumes.c
+++ b/volumes.c
@@ -47,6 +47,21 @@ struct map_lookup {
 	struct btrfs_bio_stripe stripes[];
 };
 
+static inline int nr_parity_stripes(struct map_lookup *map)
+{
+	if (map->type & BTRFS_BLOCK_GROUP_RAID5)
+		return 1;
+	else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+		return 2;
+	else
+		return 0;
+}
+
+static inline int nr_data_stripes(struct map_lookup *map)
+{
+	return map->num_stripes - nr_parity_stripes(map);
+}
+
 #define map_lookup_size(n) (sizeof(struct map_lookup) + \
 			    (sizeof(struct btrfs_bio_stripe) * (n)))
 
@@ -623,6 +638,10 @@ static u64 chunk_bytes_by_type(u64 type, u64 calc_size, int num_stripes,
 		return calc_size;
 	else if (type & BTRFS_BLOCK_GROUP_RAID10)
 		return calc_size * (num_stripes / sub_stripes);
+	else if (type & BTRFS_BLOCK_GROUP_RAID5)
+		return calc_size * (num_stripes - 1);
+	else if (type & BTRFS_BLOCK_GROUP_RAID6)
+		return calc_size * (num_stripes - 2);
 	else
 		return calc_size * num_stripes;
 }
@@ -664,6 +683,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	}
 
 	if (type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
+		    BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
 		    BTRFS_BLOCK_GROUP_RAID10 |
 		    BTRFS_BLOCK_GROUP_DUP)) {
 		if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
@@ -703,6 +723,18 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		sub_stripes = 2;
 		min_stripes = 4;
 	}
+	if (type & (BTRFS_BLOCK_GROUP_RAID5)) {
+		num_stripes = btrfs_super_num_devices(&info->super_copy);
+		if (num_stripes < 2)
+			return -ENOSPC;
+		min_stripes = 2;
+	}
+	if (type & (BTRFS_BLOCK_GROUP_RAID6)) {
+		num_stripes = btrfs_super_num_devices(&info->super_copy);
+		if (num_stripes < 3)
+			return -ENOSPC;
+		min_stripes = 3;
+	}
 
 	/* we don't want a chunk larger than 10% of the FS */
 	percent_max = div_factor(btrfs_super_total_bytes(&info->super_copy), 1);
@@ -879,6 +911,10 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
 		ret = map->num_stripes;
 	else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
 		ret = map->sub_stripes;
+	else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
+		ret = 2;
+	else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+		ret = 3;
 	else
 		ret = 1;
 	return ret;
@@ -894,6 +930,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 	u64 bytenr;
 	u64 length;
 	u64 stripe_nr;
+	u64 rmap_len;
 	int i, j, nr = 0;
 
 	ce = find_first_cache_extent(&map_tree->cache_tree, chunk_start);
@@ -901,10 +938,16 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 	map = container_of(ce, struct map_lookup, ce);
 
 	length = ce->size;
+	rmap_len = map->stripe_len;
 	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
 		length = ce->size / (map->num_stripes / map->sub_stripes);
 	else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
 		length = ce->size / map->num_stripes;
+	else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+			      BTRFS_BLOCK_GROUP_RAID6)) {
+		length = ce->size / nr_data_stripes(map);
+		rmap_len = map->stripe_len * nr_data_stripes(map);
+	}
 
 	buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
 
@@ -923,8 +966,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 				    map->sub_stripes;
 		} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
 			stripe_nr = stripe_nr * map->num_stripes + i;
-		}
-		bytenr = ce->start + stripe_nr * map->stripe_len;
+		} /* else if RAID[56], multiply by nr_data_stripes().
+		   * Alternatively, just use rmap_len below instead of
+		   * map->stripe_len */
+
+		bytenr = ce->start + stripe_nr * rmap_len;
 		for (j = 0; j < nr; j++) {
 			if (buf[j] == bytenr)
 				break;
@@ -935,7 +981,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 
 	*logical = buf;
 	*naddrs = nr;
-	*stripe_len = map->stripe_len;
+	*stripe_len = rmap_len;
 
 	return 0;
 }
@@ -1001,6 +1047,7 @@ again:
 	stripe_offset = offset - stripe_offset;
 
 	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
+			 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
 			 BTRFS_BLOCK_GROUP_RAID10 |
 			 BTRFS_BLOCK_GROUP_DUP)) {
 		/* we limit the length of each bio to what fits in a stripe */
@@ -1041,6 +1088,23 @@ again:
 			multi->num_stripes = map->num_stripes;
 		else if (mirror_num)
 			stripe_index = mirror_num - 1;
+	} else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+				BTRFS_BLOCK_GROUP_RAID6)) {
+
+		stripe_index = stripe_nr % nr_data_stripes(map);
+		stripe_nr = stripe_nr / nr_data_stripes(map);
+
+		/*
+		 * Mirror #0 or #1 means the original data block.
+		 * Mirror #2 is RAID5 parity block.
+		 * Mirror #3 is RAID6 Q block.
+		 */
+		if (mirror_num > 1)
+			stripe_index = nr_data_stripes(map) + mirror_num - 2;
+
+		/* We distribute the parity blocks across stripes */
+		stripe_index = (stripe_nr + stripe_index) % map->num_stripes;
+
 	} else {
 		/*
 		 * after this do_div call, stripe_nr is the number of stripes
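To spell out what the last btrfs_map_block() hunk is doing, since the
index juggling there is a little dense: within one full stripe we pick
the data stripe with stripe_nr % nr_data_stripes(), mirror numbers 2
and 3 select the P and Q stripes instead, and everything is then
rotated by one device per full stripe so the parity doesn't all pile
up on one disk (the classic RAID4 bottleneck). A stand-alone sketch
follows; the function and variable names are made up for illustration,
and only the arithmetic matches the patch:

#include <stdio.h>

/* Same arithmetic as the RAID[56] branch of btrfs_map_block() above */
static int map_raid56_stripe(unsigned long long stripe_nr, int num_stripes,
			     int ndata, int mirror_num)
{
	int stripe_index;

	/* which data stripe within the full stripe, then which full stripe */
	stripe_index = stripe_nr % ndata;
	stripe_nr = stripe_nr / ndata;

	/* mirror 0/1: the data block; 2: P (RAID5 parity); 3: Q (RAID6) */
	if (mirror_num > 1)
		stripe_index = ndata + mirror_num - 2;

	/* rotate parity across devices, one position per full stripe */
	return (stripe_nr + stripe_index) % num_stripes;
}

int main(void)
{
	unsigned long long s;

	/* RAID6 on 5 devices (3 data + P + Q): where does Q (mirror 3) land? */
	for (s = 0; s < 5; s++)
		printf("full stripe %llu: Q on device %d\n", s,
		       map_raid56_stripe(s * 3, 5, 3, 3));
	return 0;
}

For RAID6 on 5 devices that walks Q across devices 4, 0, 1, 2, 3 on
successive full stripes, which is the distribution the final
"stripe_index = (stripe_nr + stripe_index) % map->num_stripes" line
produces.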
-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation