Linux Btrfs filesystem development
 help / color / mirror / Atom feed
From: Qu Wenruo <wqu@suse.com>
To: linux-btrfs@vger.kernel.org
Subject: [PATCH 7/8] btrfs-progs: allow read_data_from_disk() to rebuild RAID56 using P/Q
Date: Tue,  5 Apr 2022 20:48:29 +0800	[thread overview]
Message-ID: <f8fda291f6104ecddf3eb36e263a82b506508112.1649162174.git.wqu@suse.com> (raw)
In-Reply-To: <cover.1649162174.git.wqu@suse.com>

This new ability is added by:

- Allow btrfs_map_block() to return the chunk type
  This makes later work much easier

- Only reset stripe offset inside btrfs_map_block() when needed
  Currently if @raid_map is not NULL, btrfs_map_block() will assume
  the call is for WRITE and will reset the stripe offset.

  This is no longer the case, as for RAID56 read with mirror_num 1/0,
  we will still call btrfs_map_block() with non-NULL raid_map.

  Add a small check to make sure we won't reset stripe offset for
  mirror 1/0 read.

- Add new helper read_raid56() to handle rebuild
  We will read the full stripe (including all data and P/Q stripes),
  do the rebuild, then only copy the referred part to the caller.

  There is a catch for RAID6: we have no way to exhaust all combinations,
  so the current repair will assume the mirror = 0 data is corrupted,
  then try to find a missing device.

  But if no missing device can be found, it will assume P is corrupted.
  This is just a guess, and can be totally wrong, but we have no better
  idea.

Now btrfs-progs has full read ability for RAID56.

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 kernel-shared/extent_io.c | 114 +++++++++++++++++++++++++++++++++++++-
 kernel-shared/volumes.c   |  27 +++++----
 kernel-shared/volumes.h   |   1 +
 3 files changed, 128 insertions(+), 14 deletions(-)

diff --git a/kernel-shared/extent_io.c b/kernel-shared/extent_io.c
index b8ded5cf7373..ee92e0f847d6 100644
--- a/kernel-shared/extent_io.c
+++ b/kernel-shared/extent_io.c
@@ -26,6 +26,7 @@
 #include "kerncompat.h"
 #include "kernel-shared/extent_io.h"
 #include "kernel-lib/list.h"
+#include "kernel-lib/raid56.h"
 #include "kernel-shared/ctree.h"
 #include "kernel-shared/volumes.h"
 #include "kernel-shared/disk-io.h"
@@ -788,23 +789,131 @@ struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
 	return ret;
 }
 
+static int read_raid56(struct btrfs_fs_info *fs_info, void *buf, u64 logical,
+		       u64 len, int mirror, struct btrfs_multi_bio *multi,
+		       u64 *raid_map)
+{
+	const int num_stripes = multi->num_stripes;
+	const u64 full_stripe_start = raid_map[0];
+	void **pointers = NULL;
+	int failed_a = -1;
+	int failed_b = -1;
+	int i;
+	int ret;
+
+	/* Only read repair should go this path */
+	ASSERT(mirror > 1);
+	ASSERT(raid_map);
+
+	/* The read length should be inside one stripe */
+	ASSERT(len <= BTRFS_STRIPE_LEN);
+
+	pointers = calloc(num_stripes, sizeof(void *));
+	if (!pointers) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	/* Allocate memory for the full stripe */
+	for (i = 0; i < num_stripes; i++) {
+		pointers[i] = malloc(BTRFS_STRIPE_LEN);
+		if (!pointers[i]) {
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+
+	/*
+	 * Read the full stripe.
+	 *
+	 * The stripes in @multi is not rotated, thus can be used to read from
+	 * disk directly.
+	 */
+	for (i = 0; i < num_stripes; i++) {
+		ret = btrfs_pread(multi->stripes[i].dev->fd, pointers[i],
+				  BTRFS_STRIPE_LEN, multi->stripes[i].physical,
+				  fs_info->zoned);
+		if (ret < BTRFS_STRIPE_LEN) {
+			ret = -EIO;
+			goto out;
+		}
+	}
+
+	/*
+	 * Get the failed index.
+	 *
+	 * Since we're reading using mirror_num > 1 already, it means the data
+	 * stripe where @logical lies in is definitely corrupted.
+	 */
+	failed_a = (logical - full_stripe_start) / BTRFS_STRIPE_LEN;
+
+	/*
+	 * For RAID6, we don't have good way to exhaust all the combinations,
+	 * so here we can only go through the map to see if we have missing devices.
+	 */
+	if (multi->type & BTRFS_BLOCK_GROUP_RAID6) {
+		for (i = 0; i < num_stripes; i++) {
+			/* Skip failed_a, as it's already marked failed */
+			if (i == failed_a)
+				continue;
+			/* Missing dev */
+			if (multi->stripes[i].dev->fd == -1) {
+				failed_b = i;
+				break;
+			}
+		}
+		/*
+		 * No missing device, we have no better idea, default to P
+		 * corruption
+		 */
+		if (failed_b < 0)
+			failed_b = num_stripes - 2;
+	}
+
+	/* Rebuild the full stripe */
+	ret = raid56_recov(num_stripes, BTRFS_STRIPE_LEN, multi->type,
+			   failed_a, failed_b, pointers);
+	ASSERT(ret == 0);
+
+	/* Now copy the data back to original buf */
+	memcpy(buf, pointers[failed_a] + (logical - full_stripe_start) %
+			BTRFS_STRIPE_LEN, len);
+	ret = 0;
+out:
+	for (i = 0; i < num_stripes; i++)
+		free(pointers[i]);
+	free(pointers);
+	return ret;
+}
+
 int read_data_from_disk(struct btrfs_fs_info *info, void *buf, u64 logical,
 			u64 *len, int mirror)
 {
 	struct btrfs_multi_bio *multi = NULL;
 	struct btrfs_device *device;
 	u64 read_len = *len;
+	u64 *raid_map = NULL;
 	int ret;
 
 	ret = btrfs_map_block(info, READ, logical, &read_len, &multi, mirror,
-			      NULL);
+			      &raid_map);
 	if (ret) {
 		fprintf(stderr, "Couldn't map the block %llu\n", logical);
 		return -EIO;
 	}
+	read_len = min(*len, read_len);
+
+	/* We need to rebuild from P/Q */
+	if (mirror > 1 && multi->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+		ret = read_raid56(info, buf, logical, read_len, mirror, multi,
+				  raid_map);
+		free(multi);
+		free(raid_map);
+		*len = read_len;
+		return ret;
+	}
+	free(raid_map);
 	device = multi->stripes[0].dev;
 
-	read_len = min(*len, read_len);
 	if (device->fd <= 0) {
 		kfree(multi);
 		return -EIO;
@@ -824,6 +933,7 @@ int read_data_from_disk(struct btrfs_fs_info *info, void *buf, u64 logical,
 			logical, ret, read_len);
 		return -EIO;
 	}
+	*len = read_len;
 
 	return 0;
 }
diff --git a/kernel-shared/volumes.c b/kernel-shared/volumes.c
index cb49609cc60c..f082fa9f898e 100644
--- a/kernel-shared/volumes.c
+++ b/kernel-shared/volumes.c
@@ -1805,6 +1805,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	int stripes_required = 1;
 	int stripe_index;
 	int i;
+	bool need_raid_map = false;
 	struct btrfs_multi_bio *multi = NULL;
 
 	if (multi_ret && rw == READ) {
@@ -1842,17 +1843,18 @@ again:
 	}
 	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK
 	    && multi_ret && ((rw & WRITE) || mirror_num > 1) && raid_map_ret) {
-		    /* RAID[56] write or recovery. Return all stripes */
-		    stripes_required = map->num_stripes;
-
-		    /* Only allocate the map if we've already got a large enough multi_ret */
-		    if (stripes_allocated >= stripes_required) {
-			    raid_map = kmalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
-			    if (!raid_map) {
-				    kfree(multi);
-				    return -ENOMEM;
-			    }
-		    }
+		need_raid_map = true;
+		/* RAID[56] write or recovery. Return all stripes */
+		stripes_required = map->num_stripes;
+
+		/* Only allocate the map if we've already got a large enough multi_ret */
+		if (stripes_allocated >= stripes_required) {
+			raid_map = kmalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
+			if (!raid_map) {
+				kfree(multi);
+				return -ENOMEM;
+			}
+		}
 	}
 
 	/* if our multi bio struct is too small, back off and try again */
@@ -1890,6 +1892,7 @@ again:
 		goto out;
 
 	multi->num_stripes = 1;
+	multi->type = map->type;
 	stripe_index = 0;
 	if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
 		if (rw == WRITE)
@@ -1916,7 +1919,7 @@ again:
 		else if (mirror_num)
 			stripe_index = mirror_num - 1;
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
-		if (raid_map) {
+		if (need_raid_map && raid_map) {
 			int rot;
 			u64 tmp;
 			u64 raid56_full_stripe_start;
diff --git a/kernel-shared/volumes.h b/kernel-shared/volumes.h
index 5cfe7e39f6b8..d90065b98a3e 100644
--- a/kernel-shared/volumes.h
+++ b/kernel-shared/volumes.h
@@ -106,6 +106,7 @@ struct btrfs_bio_stripe {
 };
 
 struct btrfs_multi_bio {
+	u64 type;
 	int error;
 	int num_stripes;
 	struct btrfs_bio_stripe stripes[];
-- 
2.35.1


  parent reply	other threads:[~2022-04-05 15:02 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-04-05 12:48 [PATCH 0/8] btrfs-progs: add RAID56 rebuild ability at read time Qu Wenruo
2022-04-05 12:48 ` [PATCH 1/8] btrfs-progs: remove the unnecessary BTRFS_SUPER_INFO_OFFSET path for tree block read Qu Wenruo
2022-04-05 12:48 ` [PATCH 2/8] btrfs-progs: extract metadata restore read code into its own helper Qu Wenruo
2022-04-05 12:48 ` [PATCH 3/8] btrfs-progs: don't use write_extent_to_disk() directly Qu Wenruo
2022-04-05 12:48 ` [PATCH 4/8] btrfs-progs: use write_data_to_disk() to replace write_extent_to_disk() Qu Wenruo
2022-04-05 12:48 ` [PATCH 5/8] btrfs-progs: use read_data_from_disk() to replace read_extent_from_disk() and replace read_extent_data() Qu Wenruo
2022-04-05 12:48 ` [PATCH 6/8] btrfs-progs: remove extent_buffer::fd and extent_buffer::dev_bytes Qu Wenruo
2022-04-05 12:48 ` Qu Wenruo [this message]
2022-04-05 12:48 ` [PATCH 8/8] btrfs-progs: tests/fsck: add test case for data csum check on raid5 Qu Wenruo
2022-04-08 21:16 ` [PATCH 0/8] btrfs-progs: add RAID56 rebuild ability at read time David Sterba
2022-04-11 15:01 ` David Sterba
2022-04-25 16:29   ` David Sterba
2022-04-25 22:38     ` Qu Wenruo

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=f8fda291f6104ecddf3eb36e263a82b506508112.1649162174.git.wqu@suse.com \
    --to=wqu@suse.com \
    --cc=linux-btrfs@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox