From: Mark Harmstone <maharmstone@fb.com>
To: <linux-btrfs@vger.kernel.org>
Cc: Mark Harmstone <maharmstone@fb.com>
Subject: [PATCH v2] btrfs-progs: add rudimentary log checking
Date: Thu, 11 Jul 2024 15:23:38 +0100 [thread overview]
Message-ID: <20240711142412.4139062-1-maharmstone@fb.com> (raw)
Currently the transaction log is more or less ignored by btrfs check,
meaning that it's possible for a FS with a corrupt log to pass btrfs
check, but be immediately corrupted by the kernel when it's mounted.
This patch adds a check that if there's an inode in the log, any pending
non-compressed writes also have corresponding csum entries.
This is a resubmission, taking into account Josef and Qu's suggestions
from the first patch. It also fixes a logic error in check_log_csum that
I only noticed after submission.
Signed-off-by: Mark Harmstone <maharmstone@fb.com>
---
check/main.c | 302 +++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 290 insertions(+), 12 deletions(-)
diff --git a/check/main.c b/check/main.c
index 83c721d3..bd524a32 100644
--- a/check/main.c
+++ b/check/main.c
@@ -9787,6 +9787,272 @@ static int zero_log_tree(struct btrfs_root *root)
return ret;
}
+static int check_log_csum(struct btrfs_root *root, u64 addr, u64 length)
+{
+ struct btrfs_path path = { 0 };
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ u16 csum_size = gfs_info->csum_size;
+ u16 num_entries;
+ u64 data_len;
+ int ret;
+
+ key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+ key.type = BTRFS_EXTENT_CSUM_KEY;
+ key.offset = addr;
+
+ ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ if (ret > 0 && path.slots[0])
+ path.slots[0]--;
+
+ ret = 0;
+
+ while (1) {
+ leaf = path.nodes[0];
+ if (path.slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, &path);
+ if (ret) {
+ if (ret > 0)
+ ret = 0;
+
+ break;
+ }
+ leaf = path.nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path.slots[0]);
+
+ if (key.objectid > BTRFS_EXTENT_CSUM_OBJECTID)
+ break;
+
+ if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+ key.type != BTRFS_EXTENT_CSUM_KEY)
+ goto next;
+
+ if (key.offset >= addr + length)
+ break;
+
+ num_entries = btrfs_item_size(leaf, path.slots[0]) / csum_size;
+ data_len = num_entries * gfs_info->sectorsize;
+
+ if (addr >= key.offset && addr <= key.offset + data_len) {
+ u64 end = min(addr + length, key.offset + data_len);
+
+ length = addr + length - end;
+ addr = end;
+
+ if (length == 0)
+ break;
+ }
+
+next:
+ path.slots[0]++;
+ }
+
+ btrfs_release_path(&path);
+
+ if (ret >= 0)
+ ret = length == 0 ? 1 : 0;
+
+ return ret;
+}
+
+static int check_log_root(struct btrfs_root *root, struct cache_tree *root_cache,
+ struct walk_control *wc)
+{
+ struct btrfs_path path = { 0 };
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ int ret, err = 0;
+ u64 last_csum_inode = 0;
+
+ key.objectid = BTRFS_FIRST_FREE_OBJECTID;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
+ if (ret < 0)
+ return 1;
+
+ while (1) {
+ leaf = path.nodes[0];
+ if (path.slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, &path);
+ if (ret) {
+ if (ret < 0)
+ err = 1;
+
+ break;
+ }
+ leaf = path.nodes[0];
+ }
+ btrfs_item_key_to_cpu(leaf, &key, path.slots[0]);
+
+ if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID)
+ break;
+
+ if (key.type == BTRFS_INODE_ITEM_KEY) {
+ struct btrfs_inode_item *item;
+
+ item = btrfs_item_ptr(leaf, path.slots[0],
+ struct btrfs_inode_item);
+
+ if (!(btrfs_inode_flags(leaf, item) & BTRFS_INODE_NODATASUM))
+ last_csum_inode = key.objectid;
+ } else if (key.type == BTRFS_EXTENT_DATA_KEY &&
+ key.objectid == last_csum_inode) {
+ struct btrfs_file_extent_item *fi;
+ u64 addr, length;
+
+ fi = btrfs_item_ptr(leaf, path.slots[0],
+ struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
+ goto next;
+
+ addr = btrfs_file_extent_disk_bytenr(leaf, fi) +
+ btrfs_file_extent_offset(leaf, fi);
+ length = btrfs_file_extent_num_bytes(leaf, fi);
+
+ ret = check_log_csum(root, addr, length);
+ if (ret < 0) {
+ err = 1;
+ break;
+ }
+
+ if (!ret) {
+ error("csum missing in log (root %llu, inode %llu, "
+ "offset %llu, address 0x%llx, length %llu)",
+ root->objectid, last_csum_inode, key.offset,
+ addr, length);
+ err = 1;
+ }
+ }
+
+next:
+ path.slots[0]++;
+ }
+
+ btrfs_release_path(&path);
+
+ return err;
+}
+
+static int load_log_root(u64 root_id, struct btrfs_path *path,
+ struct btrfs_root *tmp_root)
+{
+ struct extent_buffer *l;
+ struct btrfs_tree_parent_check check = { 0 };
+
+ btrfs_setup_root(tmp_root, gfs_info, root_id);
+
+ l = path->nodes[0];
+ read_extent_buffer(l, &tmp_root->root_item,
+ btrfs_item_ptr_offset(l, path->slots[0]),
+ sizeof(tmp_root->root_item));
+
+ tmp_root->root_key.objectid = root_id;
+ tmp_root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+ tmp_root->root_key.offset = 0;
+
+ check.owner_root = btrfs_root_id(tmp_root);
+ check.transid = btrfs_root_generation(&tmp_root->root_item);
+ check.level = btrfs_root_level(&tmp_root->root_item);
+
+ tmp_root->node = read_tree_block(gfs_info,
+ btrfs_root_bytenr(&tmp_root->root_item),
+ &check);
+ if (IS_ERR(tmp_root->node)) {
+ tmp_root->node = NULL;
+ return 1;
+ }
+
+ if (btrfs_header_level(tmp_root->node) != btrfs_root_level(&tmp_root->root_item)) {
+ error("root [%llu %llu] level %d does not match %d",
+ tmp_root->root_key.objectid,
+ tmp_root->root_key.offset,
+ btrfs_header_level(tmp_root->node),
+ btrfs_root_level(&tmp_root->root_item));
+ return 1;
+ }
+
+ return 0;
+}
+
+static int check_log(struct cache_tree *root_cache)
+{
+ struct btrfs_path path = { 0 };
+ struct walk_control wc = { 0 };
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ struct btrfs_root *log_root = gfs_info->log_root_tree;
+ int ret;
+ int err = 0;
+
+ cache_tree_init(&wc.shared);
+
+ key.objectid = BTRFS_TREE_LOG_OBJECTID;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, log_root, &key, &path, 0, 0);
+ if (ret < 0) {
+ err = 1;
+ goto out;
+ }
+
+ while (1) {
+ leaf = path.nodes[0];
+ if (path.slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(log_root, &path);
+ if (ret) {
+ if (ret < 0)
+ err = 1;
+ break;
+ }
+ leaf = path.nodes[0];
+ }
+ btrfs_item_key_to_cpu(leaf, &key, path.slots[0]);
+
+ if (key.objectid > BTRFS_TREE_LOG_OBJECTID ||
+ key.type > BTRFS_ROOT_ITEM_KEY)
+ break;
+
+ if (key.objectid == BTRFS_TREE_LOG_OBJECTID &&
+ key.type == BTRFS_ROOT_ITEM_KEY &&
+ fs_root_objectid(key.offset)) {
+ struct btrfs_root tmp_root;
+
+ memset(&tmp_root, 0, sizeof(tmp_root));
+
+ ret = load_log_root(key.offset, &path, &tmp_root);
+ if (ret) {
+ err = 1;
+ goto next;
+ }
+
+ ret = check_log_root(&tmp_root, root_cache, &wc);
+ if (ret)
+ err = 1;
+
+next:
+ if (tmp_root.node)
+ free_extent_buffer(tmp_root.node);
+ }
+
+ path.slots[0]++;
+ }
+out:
+ btrfs_release_path(&path);
+ if (err)
+ free_extent_cache_tree(&wc.shared);
+ if (!cache_tree_empty(&wc.shared))
+ fprintf(stderr, "warning line %d\n", __LINE__);
+
+ return err;
+}
+
static void free_roots_info_cache(void)
{
if (!roots_info_cache)
@@ -10585,9 +10851,21 @@ static int cmd_check(const struct cmd_struct *cmd, int argc, char **argv)
goto close_out;
}
+ if (gfs_info->log_root_tree) {
+ fprintf(stderr, "[1/8] checking log\n");
+ ret = check_log(&root_cache);
+
+ if (ret)
+ error("errors found in log");
+ err |= !!ret;
+ } else {
+ fprintf(stderr,
+ "[1/8] checking log skipped (none written)\n");
+ }
+
if (!init_extent_tree) {
if (!g_task_ctx.progress_enabled) {
- fprintf(stderr, "[1/7] checking root items\n");
+ fprintf(stderr, "[2/8] checking root items\n");
} else {
g_task_ctx.tp = TASK_ROOT_ITEMS;
task_start(g_task_ctx.info, &g_task_ctx.start_time,
@@ -10622,11 +10900,11 @@ static int cmd_check(const struct cmd_struct *cmd, int argc, char **argv)
}
}
} else {
- fprintf(stderr, "[1/7] checking root items... skipped\n");
+ fprintf(stderr, "[2/8] checking root items... skipped\n");
}
if (!g_task_ctx.progress_enabled) {
- fprintf(stderr, "[2/7] checking extents\n");
+ fprintf(stderr, "[3/8] checking extents\n");
} else {
g_task_ctx.tp = TASK_EXTENTS;
task_start(g_task_ctx.info, &g_task_ctx.start_time, &g_task_ctx.item_count);
@@ -10644,9 +10922,9 @@ static int cmd_check(const struct cmd_struct *cmd, int argc, char **argv)
if (!g_task_ctx.progress_enabled) {
if (is_free_space_tree)
- fprintf(stderr, "[3/7] checking free space tree\n");
+ fprintf(stderr, "[4/8] checking free space tree\n");
else
- fprintf(stderr, "[3/7] checking free space cache\n");
+ fprintf(stderr, "[4/8] checking free space cache\n");
} else {
g_task_ctx.tp = TASK_FREE_SPACE;
task_start(g_task_ctx.info, &g_task_ctx.start_time, &g_task_ctx.item_count);
@@ -10664,7 +10942,7 @@ static int cmd_check(const struct cmd_struct *cmd, int argc, char **argv)
*/
no_holes = btrfs_fs_incompat(gfs_info, NO_HOLES);
if (!g_task_ctx.progress_enabled) {
- fprintf(stderr, "[4/7] checking fs roots\n");
+ fprintf(stderr, "[5/8] checking fs roots\n");
} else {
g_task_ctx.tp = TASK_FS_ROOTS;
task_start(g_task_ctx.info, &g_task_ctx.start_time, &g_task_ctx.item_count);
@@ -10680,10 +10958,10 @@ static int cmd_check(const struct cmd_struct *cmd, int argc, char **argv)
if (!g_task_ctx.progress_enabled) {
if (check_data_csum)
- fprintf(stderr, "[5/7] checking csums against data\n");
+ fprintf(stderr, "[6/8] checking csums against data\n");
else
fprintf(stderr,
- "[5/7] checking only csums items (without verifying data)\n");
+ "[6/8] checking only csums items (without verifying data)\n");
} else {
g_task_ctx.tp = TASK_CSUMS;
task_start(g_task_ctx.info, &g_task_ctx.start_time, &g_task_ctx.item_count);
@@ -10702,7 +10980,7 @@ static int cmd_check(const struct cmd_struct *cmd, int argc, char **argv)
/* For low memory mode, check_fs_roots_v2 handles root refs */
if (check_mode != CHECK_MODE_LOWMEM) {
if (!g_task_ctx.progress_enabled) {
- fprintf(stderr, "[6/7] checking root refs\n");
+ fprintf(stderr, "[7/8] checking root refs\n");
} else {
g_task_ctx.tp = TASK_ROOT_REFS;
task_start(g_task_ctx.info, &g_task_ctx.start_time, &g_task_ctx.item_count);
@@ -10717,7 +10995,7 @@ static int cmd_check(const struct cmd_struct *cmd, int argc, char **argv)
}
} else {
fprintf(stderr,
- "[6/7] checking root refs done with fs roots in lowmem mode, skipping\n");
+ "[7/8] checking root refs done with fs roots in lowmem mode, skipping\n");
}
while (opt_check_repair && !list_empty(&gfs_info->recow_ebs)) {
@@ -10749,7 +11027,7 @@ static int cmd_check(const struct cmd_struct *cmd, int argc, char **argv)
if (gfs_info->quota_enabled) {
if (!g_task_ctx.progress_enabled) {
- fprintf(stderr, "[7/7] checking quota groups\n");
+ fprintf(stderr, "[8/8] checking quota groups\n");
} else {
g_task_ctx.tp = TASK_QGROUPS;
task_start(g_task_ctx.info, &g_task_ctx.start_time, &g_task_ctx.item_count);
@@ -10772,7 +11050,7 @@ static int cmd_check(const struct cmd_struct *cmd, int argc, char **argv)
ret = 0;
} else {
fprintf(stderr,
- "[7/7] checking quota groups skipped (not enabled on this FS)\n");
+ "[8/8] checking quota groups skipped (not enabled on this FS)\n");
}
if (!list_empty(&gfs_info->recow_ebs)) {
--
2.45.2
next reply other threads:[~2024-07-11 14:24 UTC|newest]
Thread overview: 2+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-07-11 14:23 Mark Harmstone [this message]
2024-07-12 20:39 ` [PATCH v2] btrfs-progs: add rudimentary log checking Boris Burkov
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240711142412.4139062-1-maharmstone@fb.com \
--to=maharmstone@fb.com \
--cc=linux-btrfs@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox