From: Liu Bo <bo.li.liu@oracle.com>
To: linux-btrfs@vger.kernel.org
Subject: [PATCH 08/14] Btrfs: raid56: log recovery
Date: Tue, 1 Aug 2017 10:14:31 -0600 [thread overview]
Message-ID: <20170801161439.13426-9-bo.li.liu@oracle.com> (raw)
In-Reply-To: <20170801161439.13426-1-bo.li.liu@oracle.com>
This adds recovery for the raid5/6 log.
We've set a %journal_tail in super_block, which indicates the position
from where we need to replay data. So we scan the log and replay
valid meta/data/parity pairs until we find an invalid one. Replaying
simply reads data/parity from the raid5/6 log and issues writes to the
locations on the raid disks where they belong. Please note that the
whole meta/data/parity pair can be discarded if it fails the sanity
check in the meta block.
After recovery, we also append an empty meta block and update the
%journal_tail in super_block. This avoids the following situation.
Suppose the layout on the raid5/6 log is
[valid A][invalid B][valid C],
so block A is the only one we should replay.
Without the empty meta block, the recovery ends up pointing to block A
because block B is invalid. New writes then come in and are appended
after block A, so that block B is overwritten with a valid
meta/data/parity pair. If a power loss happens now, the new recovery
starts again from block A and, since block B is now valid, it may also
replay block C, which has become stale.
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
---
fs/btrfs/raid56.c | 151 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 151 insertions(+)
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 5d7ea235..dea33c4 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1530,10 +1530,161 @@ static int btrfs_r5l_write_empty_meta_block(struct btrfs_r5l_log *log, u64 pos,
return ret;
}
+/*
+ * State carried through one recovery pass over the raid5/6 log.
+ */
+struct btrfs_r5l_recover_ctx {
+ /* byte offset in the log of the meta block currently being examined */
+ u64 pos;
+ /* sequence number the meta block at @pos must carry to be accepted */
+ u64 seq;
+ /* bytes occupied by the current entry: meta page plus replayed pages */
+ u64 total_size;
+ /* page the meta block at @pos is read into */
+ struct page *meta_page;
+ /* bounce page: filled from the log, then written to the raid device */
+ struct page *io_page;
+};
+
+/*
+ * Read the meta block at ctx->pos from the log device into ctx->meta_page
+ * and validate it.
+ *
+ * The block is accepted only if its magic equals BTRFS_R5LOG_MAGIC, its
+ * recorded position equals ctx->pos, its sequence number equals ctx->seq
+ * and its meta_size does not exceed PAGE_SIZE.
+ *
+ * Returns 0 and sets ctx->total_size to PAGE_SIZE (the meta block itself)
+ * on success, or -EINVAL if the block fails validation.
+ */
+static int btrfs_r5l_recover_load_meta(struct btrfs_r5l_log *log, struct btrfs_r5l_recover_ctx *ctx)
+{
+	struct btrfs_r5l_meta_block *mb;
+	int ret = 0;
+
+	/*
+	 * NOTE(review): the result of the read is not checked here; a failed
+	 * read is only caught if the stale page contents fail validation.
+	 */
+	btrfs_r5l_sync_page_io(log, log->dev, (ctx->pos >> 9), PAGE_SIZE, ctx->meta_page, REQ_OP_READ);
+
+	mb = kmap(ctx->meta_page);
+#ifdef BTRFS_DEBUG_R5LOG
+	trace_printk("ctx->pos %llu ctx->seq %llu pos %llu seq %llu\n", ctx->pos, ctx->seq, le64_to_cpu(mb->position), le64_to_cpu(mb->seq));
+#endif
+
+	if (le32_to_cpu(mb->magic) != BTRFS_R5LOG_MAGIC ||
+	    le64_to_cpu(mb->position) != ctx->pos ||
+	    le64_to_cpu(mb->seq) != ctx->seq) {
+#ifdef BTRFS_DEBUG_R5LOG
+		/* cast so the arguments really match the %llu conversions */
+		trace_printk("%s: mismatch magic %llu default %llu\n", __func__,
+			     (unsigned long long)le32_to_cpu(mb->magic),
+			     (unsigned long long)BTRFS_R5LOG_MAGIC);
+#endif
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/*
+	 * meta_size comes from disk, so reject a bogus value instead of
+	 * relying on an ASSERT that may be compiled out.
+	 */
+	if (le32_to_cpu(mb->meta_size) > PAGE_SIZE) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* meta_block */
+	ctx->total_size = PAGE_SIZE;
+
+out:
+	/* drop the mapping on every path, including validation failures */
+	kunmap(ctx->meta_page);
+	return ret;
+}
+
+/*
+ * Replay every payload described by the meta block held in ctx->meta_page.
+ *
+ * Payload descriptors start right after struct btrfs_r5l_meta_block and run
+ * up to mb->meta_size. For each descriptor, pages are read from the log
+ * device (starting one page after ctx->pos) into ctx->io_page and written to
+ * payload->location on the device identified by payload->devid.
+ *
+ * Returns 0 after adding the size of the replayed pages to ctx->total_size.
+ * Returns -EINVAL for an oversized meta block or an unknown payload type and
+ * -ENODEV if the target device is absent or missing; ctx->total_size is left
+ * untouched in those cases.
+ */
+static int btrfs_r5l_recover_load_data(struct btrfs_r5l_log *log, struct btrfs_r5l_recover_ctx *ctx)
+{
+	u64 offset;
+	struct btrfs_r5l_meta_block *mb;
+	u64 meta_size;
+	u64 io_offset;
+	struct btrfs_device *dev;
+	int ret = 0;
+
+	mb = kmap(ctx->meta_page);
+
+	io_offset = PAGE_SIZE;
+	offset = sizeof(struct btrfs_r5l_meta_block);
+	meta_size = le32_to_cpu(mb->meta_size);
+
+	/* the descriptors live in one mapped page; never walk past it */
+	if (meta_size > PAGE_SIZE) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* only consume descriptors that lie completely inside meta_size */
+	while (offset + sizeof(struct btrfs_r5l_payload) <= meta_size) {
+		struct btrfs_r5l_payload *payload = (void *)mb + offset;
+
+		/* read data from log disk and write to payload->location */
+#ifdef BTRFS_DEBUG_R5LOG
+		trace_printk("payload type %d flags %d size %d location 0x%llx devid %llu\n", le16_to_cpu(payload->type), le16_to_cpu(payload->flags), le32_to_cpu(payload->size), le64_to_cpu(payload->location), le64_to_cpu(payload->devid));
+#endif
+
+		dev = btrfs_find_device(log->fs_info, le64_to_cpu(payload->devid), NULL, NULL);
+		if (!dev || dev->missing) {
+			/*
+			 * Still trap in assert-enabled builds, but never hand
+			 * a NULL or missing device to the I/O helper when
+			 * assertions are compiled out.
+			 */
+			ASSERT(0);
+			ret = -ENODEV;
+			goto out;
+		}
+
+		/*
+		 * NOTE(review): the results of the page I/O below are not
+		 * checked, and the log offset is formed by plain addition
+		 * while the caller advances ctx->pos with
+		 * btrfs_r5l_ring_add(); confirm that an entry can never
+		 * straddle the end of the log.
+		 */
+		if (le16_to_cpu(payload->type) == R5LOG_PAYLOAD_DATA) {
+			ASSERT(le32_to_cpu(payload->size) == 1);
+			btrfs_r5l_sync_page_io(log, log->dev, (ctx->pos + io_offset) >> 9, PAGE_SIZE, ctx->io_page, REQ_OP_READ);
+			btrfs_r5l_sync_page_io(log, dev, le64_to_cpu(payload->location) >> 9, PAGE_SIZE, ctx->io_page, REQ_OP_WRITE);
+			io_offset += PAGE_SIZE;
+		} else if (le16_to_cpu(payload->type) == R5LOG_PAYLOAD_PARITY) {
+			u32 i;
+
+			ASSERT(le32_to_cpu(payload->size) == 16);
+			for (i = 0; i < le32_to_cpu(payload->size); i++) {
+				/*
+				 * liubo: parity pages are guaranteed to be
+				 * contiguous, so a single bio could hold all
+				 * of them and flush them at once.
+				 */
+				u64 parity_off = le64_to_cpu(payload->location) + (u64)i * PAGE_SIZE;
+
+				btrfs_r5l_sync_page_io(log, log->dev, (ctx->pos + io_offset) >> 9, PAGE_SIZE, ctx->io_page, REQ_OP_READ);
+				btrfs_r5l_sync_page_io(log, dev, parity_off >> 9, PAGE_SIZE, ctx->io_page, REQ_OP_WRITE);
+				io_offset += PAGE_SIZE;
+			}
+		} else {
+			/* unknown payload type: stop instead of skipping it */
+			ASSERT(0);
+			ret = -EINVAL;
+			goto out;
+		}
+
+		offset += sizeof(struct btrfs_r5l_payload);
+	}
+
+	/* io_offset started at PAGE_SIZE to skip the meta block itself */
+	ctx->total_size += (io_offset - PAGE_SIZE);
+
+out:
+	kunmap(ctx->meta_page);
+	return ret;
+}
+
+/*
+ * Replay log entries one after another, starting at ctx->pos / ctx->seq.
+ *
+ * Each round validates the meta block and then replays its payloads. After
+ * a successful round the expected sequence number is bumped and ctx->pos is
+ * moved past the entry with btrfs_r5l_ring_add(). The loop only ends when
+ * one of the two steps returns non-zero, and that value is returned.
+ */
+static int btrfs_r5l_recover_flush_log(struct btrfs_r5l_log *log, struct btrfs_r5l_recover_ctx *ctx)
+{
+	int ret;
+
+	for (;;) {
+		/* stop at the first meta block that does not validate */
+		ret = btrfs_r5l_recover_load_meta(log, ctx);
+		if (ret != 0)
+			return ret;
+
+		ret = btrfs_r5l_recover_load_data(log, ctx);
+		ASSERT(!ret || ret > 0);
+		if (ret != 0)
+			return ret;
+
+		/* step over the entry that was just replayed */
+		ctx->pos = btrfs_r5l_ring_add(log, ctx->pos, ctx->total_size);
+		ctx->seq++;
+	}
+}
+
static void btrfs_r5l_write_super(struct btrfs_fs_info *fs_info, u64 cp);
static int btrfs_r5l_recover_log(struct btrfs_r5l_log *log)
{
+ struct btrfs_r5l_recover_ctx *ctx;
+ u64 pos;
+ int ret;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_NOFS);
+ ASSERT(ctx);
+
+ ctx->pos = log->last_checkpoint;
+ ctx->seq = log->last_cp_seq;
+ ctx->meta_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ ASSERT(ctx->meta_page);
+ ctx->io_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ ASSERT(ctx->io_page);
+
+ ret = btrfs_r5l_recover_flush_log(log, ctx);
+ if (ret) {
+ ;
+ }
+
+ pos = ctx->pos;
+ log->next_checkpoint = ctx->pos;
+ ctx->seq += 10000;
+ btrfs_r5l_write_empty_meta_block(log, ctx->pos, ctx->seq++);
+ ctx->pos = btrfs_r5l_ring_add(log, ctx->pos, PAGE_SIZE);
+
+ log->log_start = ctx->pos;
+ log->seq = ctx->seq;
+ /* last_checkpoint point to the empty block. */
+ log->last_checkpoint = pos;
+ btrfs_r5l_write_super(log->fs_info, pos);
+
+#ifdef BTRFS_DEBUG_R5LOG
+ trace_printk("%s: log_start %llu seq %llu\n", __func__, log->log_start, log->seq);
+#endif
+ __free_page(ctx->meta_page);
+ __free_page(ctx->io_page);
+ kfree(ctx);
return 0;
}
--
2.9.4
next prev parent reply other threads:[~2017-08-01 17:15 UTC|newest]
Thread overview: 40+ messages / expand[flat|nested] mbox.gz Atom feed top
2017-08-01 16:14 [PATCH 00/14 RFC] Btrfs: Add journal for raid5/6 writes Liu Bo
2017-08-01 16:14 ` [PATCH 01/14] Btrfs: raid56: add raid56 log via add_dev v2 ioctl Liu Bo
2017-08-02 19:25 ` Nikolay Borisov
2017-08-01 16:14 ` [PATCH 02/14] Btrfs: raid56: do not allocate chunk on raid56 log Liu Bo
2017-08-01 16:14 ` [PATCH 03/14] Btrfs: raid56: detect raid56 log on mount Liu Bo
2017-08-01 16:14 ` [PATCH 04/14] Btrfs: raid56: add verbose debug Liu Bo
2017-08-01 16:14 ` [PATCH 05/14] Btrfs: raid56: add stripe log for raid5/6 Liu Bo
2017-08-01 16:14 ` [PATCH 06/14] Btrfs: raid56: add reclaim support Liu Bo
2017-08-01 16:14 ` [PATCH 07/14] Btrfs: raid56: load r5log Liu Bo
2017-08-01 16:14 ` Liu Bo [this message]
2017-08-01 16:14 ` [PATCH 09/14] Btrfs: raid56: add readahead for recovery Liu Bo
2017-08-01 16:14 ` [PATCH 10/14] Btrfs: raid56: use the readahead helper to get page Liu Bo
2017-08-01 16:14 ` [PATCH 11/14] Btrfs: raid56: add csum support Liu Bo
2017-08-01 16:14 ` [PATCH 12/14] Btrfs: raid56: fix error handling while adding a log device Liu Bo
2017-08-01 16:14 ` [PATCH 13/14] Btrfs: raid56: initialize raid5/6 log after adding it Liu Bo
2017-08-01 16:14 ` [PATCH 14/14] Btrfs: raid56: maintain IO order on raid5/6 log Liu Bo
2017-08-01 16:14 ` [PATCH 1/2] Btrfs-progs: add option to add raid5/6 log device Liu Bo
2017-08-01 16:14 ` [PATCH 2/2] Btrfs-progs: introduce super_journal_tail to inspect-dump-super Liu Bo
2017-08-01 17:25 ` [PATCH 00/14 RFC] Btrfs: Add journal for raid5/6 writes Roman Mamedov
2017-08-01 17:03 ` Liu Bo
2017-08-01 17:39 ` Austin S. Hemmelgarn
2017-08-01 17:07 ` Liu Bo
2017-08-02 18:47 ` Chris Mason
2018-05-03 19:16 ` Goffredo Baroncelli
2017-08-01 17:28 ` Hugo Mills
2017-08-01 16:56 ` Liu Bo
2017-08-01 18:15 ` Hugo Mills
2017-08-01 17:42 ` Goffredo Baroncelli
2017-08-01 17:24 ` Liu Bo
2017-08-01 22:14 ` Goffredo Baroncelli
2017-08-02 17:57 ` Liu Bo
2017-08-02 20:41 ` Goffredo Baroncelli
2017-08-02 20:27 ` Liu Bo
2017-08-03 4:02 ` Duncan
2017-08-03 4:40 ` Goffredo Baroncelli
2017-08-23 15:28 ` Chris Murphy
2017-08-23 15:47 ` Austin S. Hemmelgarn
2017-08-25 13:53 ` Goffredo Baroncelli
2017-08-01 21:00 ` Christoph Anton Mitterer
2017-08-01 22:24 ` Goffredo Baroncelli
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20170801161439.13426-9-bo.li.liu@oracle.com \
--to=bo.li.liu@oracle.com \
--cc=linux-btrfs@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).