From: Qu Wenruo <wqu@suse.com>
To: linux-btrfs@vger.kernel.org
Subject: [PATCH 7/8] btrfs-progs: allow read_data_from_disk() to rebuild RAID56 using P/Q
Date: Tue, 5 Apr 2022 20:48:29 +0800 [thread overview]
Message-ID: <f8fda291f6104ecddf3eb36e263a82b506508112.1649162174.git.wqu@suse.com> (raw)
In-Reply-To: <cover.1649162174.git.wqu@suse.com>
This new ability is added by:
- Allow btrfs_map_block() to return the chunk type
This makes later work much easier
- Only reset stripe offset inside btrfs_map_block() when needed
Currently if @raid_map is not NULL, btrfs_map_block() will assume
the call is for WRITE and will reset the stripe offset.
This is no longer the case, as for RAID56 read with mirror_num 1/0,
we will still call btrfs_map_block() with non-NULL raid_map.
Add a small check to make sure we won't reset stripe offset for
mirror 1/0 read.
- Add new helper read_raid56() to handle rebuild
We will read the full stripe (including all data and P/Q stripes),
do the rebuild, then only copy the referred part to the caller.
There is a catch for RAID6: we have no way to exhaust all combinations,
so the current repair will assume the mirror = 0 data is corrupted,
then try to find a missing device.
But if no missing device can be found, it will assume P is corrupted.
This is just a guess, and can be totally wrong, but we have no better
idea.
Now btrfs-progs has full read ability for RAID56.
Signed-off-by: Qu Wenruo <wqu@suse.com>
---
kernel-shared/extent_io.c | 114 +++++++++++++++++++++++++++++++++++++-
kernel-shared/volumes.c | 27 +++++----
kernel-shared/volumes.h | 1 +
3 files changed, 128 insertions(+), 14 deletions(-)
diff --git a/kernel-shared/extent_io.c b/kernel-shared/extent_io.c
index b8ded5cf7373..ee92e0f847d6 100644
--- a/kernel-shared/extent_io.c
+++ b/kernel-shared/extent_io.c
@@ -26,6 +26,7 @@
#include "kerncompat.h"
#include "kernel-shared/extent_io.h"
#include "kernel-lib/list.h"
+#include "kernel-lib/raid56.h"
#include "kernel-shared/ctree.h"
#include "kernel-shared/volumes.h"
#include "kernel-shared/disk-io.h"
@@ -788,23 +789,131 @@ struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
return ret;
}
+/*
+ * Rebuild the data at @logical from a RAID56 full stripe and copy it to @buf.
+ *
+ * All stripes described by @multi (data plus P/Q) are read, the stripe that
+ * contains @logical is treated as corrupted and recovered through
+ * raid56_recov(), and @len bytes of the result are copied to @buf.
+ *
+ * @mirror must be > 1 and @len must not exceed BTRFS_STRIPE_LEN (both are
+ * ASSERT()ed). @raid_map[0] is used as the logical start of the full stripe.
+ *
+ * Return 0 on success, -ENOMEM if a buffer can't be allocated, or -EIO if a
+ * stripe can't be read in full.
+ */
+static int read_raid56(struct btrfs_fs_info *fs_info, void *buf, u64 logical,
+ u64 len, int mirror, struct btrfs_multi_bio *multi,
+ u64 *raid_map)
+{
+ const int num_stripes = multi->num_stripes;
+ u64 full_stripe_start;
+ void **pointers = NULL;
+ int failed_a = -1;
+ int failed_b = -1;
+ int i;
+ int ret;
+
+ /* Only read repair should go this path */
+ ASSERT(mirror > 1);
+ ASSERT(raid_map);
+
+ /* The read length should be inside one stripe */
+ ASSERT(len <= BTRFS_STRIPE_LEN);
+
+ /* Only touch @raid_map after it has been checked above */
+ full_stripe_start = raid_map[0];
+
+ pointers = calloc(num_stripes, sizeof(void *));
+ if (!pointers) {
+ /* Nothing to clean up yet, out: would index the NULL array */
+ return -ENOMEM;
+ }
+ /* Allocate memory for the full stripe */
+ for (i = 0; i < num_stripes; i++) {
+ pointers[i] = malloc(BTRFS_STRIPE_LEN);
+ if (!pointers[i]) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ /*
+ * Read the full stripe.
+ *
+ * The stripes in @multi are not rotated, thus can be used to read from
+ * disk directly.
+ *
+ * NOTE(review): every stripe is read unconditionally here, including
+ * ones whose device fd is -1 (treated as missing below); confirm how
+ * btrfs_pread() and the check against BTRFS_STRIPE_LEN behave then.
+ */
+ for (i = 0; i < num_stripes; i++) {
+ ret = btrfs_pread(multi->stripes[i].dev->fd, pointers[i],
+ BTRFS_STRIPE_LEN, multi->stripes[i].physical,
+ fs_info->zoned);
+ if (ret < BTRFS_STRIPE_LEN) {
+ ret = -EIO;
+ goto out;
+ }
+ }
+
+ /*
+ * Get the failed index.
+ *
+ * Since we're reading using mirror_num > 1 already, it means the data
+ * stripe where @logical lies in is definitely corrupted.
+ */
+ failed_a = (logical - full_stripe_start) / BTRFS_STRIPE_LEN;
+
+ /*
+ * For RAID6, we don't have a good way to exhaust all the combinations,
+ * so here we can only go through the map to see if we have missing devices.
+ */
+ if (multi->type & BTRFS_BLOCK_GROUP_RAID6) {
+ for (i = 0; i < num_stripes; i++) {
+ /* Skip failed_a, as it's already marked failed */
+ if (i == failed_a)
+ continue;
+ /* Missing dev */
+ if (multi->stripes[i].dev->fd == -1) {
+ failed_b = i;
+ break;
+ }
+ }
+ /*
+ * No missing device, we have no better idea, default to P
+ * corruption
+ */
+ if (failed_b < 0)
+ failed_b = num_stripes - 2;
+ }
+
+ /* Rebuild the full stripe */
+ ret = raid56_recov(num_stripes, BTRFS_STRIPE_LEN, multi->type,
+ failed_a, failed_b, pointers);
+ ASSERT(ret == 0);
+
+ /* Now copy the requested range of the rebuilt stripe back to @buf */
+ memcpy(buf, pointers[failed_a] + (logical - full_stripe_start) %
+ BTRFS_STRIPE_LEN, len);
+ ret = 0;
+out:
+ /* Unallocated slots are NULL thanks to calloc(), free(NULL) is a no-op */
+ for (i = 0; i < num_stripes; i++)
+ free(pointers[i]);
+ free(pointers);
+ return ret;
+}
+
int read_data_from_disk(struct btrfs_fs_info *info, void *buf, u64 logical,
u64 *len, int mirror)
{
struct btrfs_multi_bio *multi = NULL;
struct btrfs_device *device;
u64 read_len = *len;
+ u64 *raid_map = NULL;
int ret;
ret = btrfs_map_block(info, READ, logical, &read_len, &multi, mirror,
- NULL);
+ &raid_map);
if (ret) {
fprintf(stderr, "Couldn't map the block %llu\n", logical);
return -EIO;
}
+ read_len = min(*len, read_len);
+
+ /* We need to rebuild from P/Q */
+ if (mirror > 1 && multi->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ ret = read_raid56(info, buf, logical, read_len, mirror, multi,
+ raid_map);
+ free(multi);
+ free(raid_map);
+ *len = read_len;
+ return ret;
+ }
+ free(raid_map);
device = multi->stripes[0].dev;
- read_len = min(*len, read_len);
if (device->fd <= 0) {
kfree(multi);
return -EIO;
@@ -824,6 +933,7 @@ int read_data_from_disk(struct btrfs_fs_info *info, void *buf, u64 logical,
logical, ret, read_len);
return -EIO;
}
+ *len = read_len;
return 0;
}
diff --git a/kernel-shared/volumes.c b/kernel-shared/volumes.c
index cb49609cc60c..f082fa9f898e 100644
--- a/kernel-shared/volumes.c
+++ b/kernel-shared/volumes.c
@@ -1805,6 +1805,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
int stripes_required = 1;
int stripe_index;
int i;
+ bool need_raid_map = false;
struct btrfs_multi_bio *multi = NULL;
if (multi_ret && rw == READ) {
@@ -1842,17 +1843,18 @@ again:
}
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK
&& multi_ret && ((rw & WRITE) || mirror_num > 1) && raid_map_ret) {
- /* RAID[56] write or recovery. Return all stripes */
- stripes_required = map->num_stripes;
-
- /* Only allocate the map if we've already got a large enough multi_ret */
- if (stripes_allocated >= stripes_required) {
- raid_map = kmalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
- if (!raid_map) {
- kfree(multi);
- return -ENOMEM;
- }
- }
+ need_raid_map = true;
+ /* RAID[56] write or recovery. Return all stripes */
+ stripes_required = map->num_stripes;
+
+ /* Only allocate the map if we've already got a large enough multi_ret */
+ if (stripes_allocated >= stripes_required) {
+ raid_map = kmalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
+ if (!raid_map) {
+ kfree(multi);
+ return -ENOMEM;
+ }
+ }
}
/* if our multi bio struct is too small, back off and try again */
@@ -1890,6 +1892,7 @@ again:
goto out;
multi->num_stripes = 1;
+ multi->type = map->type;
stripe_index = 0;
if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
if (rw == WRITE)
@@ -1916,7 +1919,7 @@ again:
else if (mirror_num)
stripe_index = mirror_num - 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- if (raid_map) {
+ if (need_raid_map && raid_map) {
int rot;
u64 tmp;
u64 raid56_full_stripe_start;
diff --git a/kernel-shared/volumes.h b/kernel-shared/volumes.h
index 5cfe7e39f6b8..d90065b98a3e 100644
--- a/kernel-shared/volumes.h
+++ b/kernel-shared/volumes.h
@@ -106,6 +106,7 @@ struct btrfs_bio_stripe {
};
struct btrfs_multi_bio {
+ u64 type;
int error;
int num_stripes;
struct btrfs_bio_stripe stripes[];
--
2.35.1
next prev parent reply other threads:[~2022-04-05 15:02 UTC|newest]
Thread overview: 13+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-04-05 12:48 [PATCH 0/8] btrfs-progs: add RAID56 rebuild ability at read time Qu Wenruo
2022-04-05 12:48 ` [PATCH 1/8] btrfs-progs: remove the unnecessary BTRFS_SUPER_INFO_OFFSET path for tree block read Qu Wenruo
2022-04-05 12:48 ` [PATCH 2/8] btrfs-progs: extract metadata restore read code into its own helper Qu Wenruo
2022-04-05 12:48 ` [PATCH 3/8] btrfs-progs: don't use write_extent_to_disk() directly Qu Wenruo
2022-04-05 12:48 ` [PATCH 4/8] btrfs-progs: use write_data_to_disk() to replace write_extent_to_disk() Qu Wenruo
2022-04-05 12:48 ` [PATCH 5/8] btrfs-progs: use read_data_from_disk() to replace read_extent_from_disk() and replace read_extent_data() Qu Wenruo
2022-04-05 12:48 ` [PATCH 6/8] btrfs-progs: remove extent_buffer::fd and extent_buffer::dev_bytes Qu Wenruo
2022-04-05 12:48 ` Qu Wenruo [this message]
2022-04-05 12:48 ` [PATCH 8/8] btrfs-progs: tests/fsck: add test case for data csum check on raid5 Qu Wenruo
2022-04-08 21:16 ` [PATCH 0/8] btrfs-progs: add RAID56 rebuild ability at read time David Sterba
2022-04-11 15:01 ` David Sterba
2022-04-25 16:29 ` David Sterba
2022-04-25 22:38 ` Qu Wenruo
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=f8fda291f6104ecddf3eb36e263a82b506508112.1649162174.git.wqu@suse.com \
--to=wqu@suse.com \
--cc=linux-btrfs@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox