[PATCH] btrfs-progs: add rudimentary log checking

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH] btrfs-progs: add rudimentary log checking
@ 2024-07-03 16:28 Mark Harmstone
  2024-07-03 17:13 ` Josef Bacik
  2024-07-03 23:14 ` Qu Wenruo
  0 siblings, 2 replies; 4+ messages in thread
From: Mark Harmstone @ 2024-07-03 16:28 UTC (permalink / raw)
  To: linux-btrfs; +Cc: Mark Harmstone

From: Mark Harmstone <maharmstone@meta.com>

Currently the transaction log is more or less ignored by btrfs check,
meaning that it's possible for a FS with a corrupt log to pass btrfs
check, but be immediately corrupted by the kernel when it's mounted.

This patch adds a check that if there's an inode in the log, any pending
non-compressed writes also have corresponding csum entries.

Signed-off-by: Mark Harmstone <maharmstone@meta.com>
---
 check/main.c | 293 ++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 281 insertions(+), 12 deletions(-)

diff --git a/check/main.c b/check/main.c
index 83c721d3..6f3fab35 100644
--- a/check/main.c
+++ b/check/main.c
@@ -9787,6 +9787,263 @@ static int zero_log_tree(struct btrfs_root *root)
 	return ret;
 }
 
+static int check_log_csum(struct btrfs_root *root, u64 addr, u64 length)
+{
+	struct btrfs_path path = { 0 };
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	u16 csum_size = gfs_info->csum_size;
+	u16 num_entries;
+	u64 data_len;
+	int ret;
+
+	key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+	key.type = BTRFS_EXTENT_CSUM_KEY;
+	key.offset = addr;
+
+	ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
+	if (ret < 0)
+		return ret;
+
+	if (ret > 0 && path.slots[0])
+		path.slots[0]--;
+
+	ret = 0;
+
+	while (1) {
+		leaf = path.nodes[0];
+		if (path.slots[0] >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, &path);
+			if (ret) {
+				if (ret > 0)
+					ret = 0;
+
+				break;
+			}
+			leaf = path.nodes[0];
+		}
+
+		btrfs_item_key_to_cpu(leaf, &key, path.slots[0]);
+
+		if (key.objectid > BTRFS_EXTENT_CSUM_OBJECTID)
+			break;
+
+		if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID &&
+		    key.type == BTRFS_EXTENT_CSUM_KEY) {
+			if (key.offset >= addr + length)
+				break;
+
+			num_entries = btrfs_item_size(leaf, path.slots[0]) / csum_size;
+			data_len = num_entries * gfs_info->sectorsize;
+
+			if (addr >= key.offset && addr + length <= key.offset + data_len) {
+				u64 end = min(addr + length, key.offset + data_len);
+
+				length = addr + length - end;
+				addr = end;
+
+				if (length == 0)
+					break;
+			}
+		}
+
+		path.slots[0]++;
+	}
+
+	btrfs_release_path(&path);
+
+	if (ret >= 0)
+		ret = length == 0 ? 1 : 0;
+
+	return ret;
+}
+
+static int check_log_root(struct btrfs_root *root, struct cache_tree *root_cache,
+			  struct walk_control *wc)
+{
+	struct btrfs_path path = { 0 };
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	int ret, err = 0;
+	u64 last_csum_inode = 0;
+
+	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.offset = 0;
+	ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
+	if (ret < 0)
+		return 1;
+
+	while (1) {
+		leaf = path.nodes[0];
+		if (path.slots[0] >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, &path);
+			if (ret) {
+				if (ret < 0)
+					err = 1;
+
+				break;
+			}
+			leaf = path.nodes[0];
+		}
+		btrfs_item_key_to_cpu(leaf, &key, path.slots[0]);
+
+		if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID)
+			break;
+
+		if (key.type == BTRFS_INODE_ITEM_KEY) {
+			struct btrfs_inode_item *item;
+
+			item = btrfs_item_ptr(leaf, path.slots[0],
+					      struct btrfs_inode_item);
+
+			if (!(btrfs_inode_flags(leaf, item) & BTRFS_INODE_NODATASUM))
+				last_csum_inode = key.objectid;
+		} else if (key.type == BTRFS_EXTENT_DATA_KEY &&
+			   key.objectid == last_csum_inode) {
+			struct btrfs_file_extent_item *fi;
+			u64 addr, length;
+
+			fi = btrfs_item_ptr(leaf, path.slots[0],
+					    struct btrfs_file_extent_item);
+
+			if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
+				goto next;
+
+			if (btrfs_file_extent_compression(leaf, fi) != 0)
+				goto next;
+
+			addr = btrfs_file_extent_disk_bytenr(leaf, fi) +
+				btrfs_file_extent_offset(leaf, fi);
+			length = btrfs_file_extent_disk_num_bytes(leaf, fi);
+
+			ret = check_log_csum(root, addr, length);
+			if (ret < 0) {
+				err = 1;
+				break;
+			}
+
+			if (!ret) {
+				error("csum missing in log (root %llu, inode %llu, "
+				      "offset %llu, address 0x%llx, length %llu)",
+				      root->objectid, last_csum_inode, key.offset,
+				      addr, length);
+				err = 1;
+			}
+		}
+
+next:
+		path.slots[0]++;
+	}
+
+	btrfs_release_path(&path);
+
+	return err;
+}
+
+static int check_log(struct cache_tree *root_cache)
+{
+	struct btrfs_path path = { 0 };
+	struct walk_control wc;
+
+	memset(&wc, 0, sizeof(wc));
+	cache_tree_init(&wc.shared);
+
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	struct btrfs_root *log_root = gfs_info->log_root_tree;
+	int ret;
+	int err = 0;
+
+	key.objectid = BTRFS_TREE_LOG_OBJECTID;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = 0;
+	ret = btrfs_search_slot(NULL, log_root, &key, &path, 0, 0);
+	if (ret < 0) {
+		err = 1;
+		goto out;
+	}
+
+	while (1) {
+		leaf = path.nodes[0];
+		if (path.slots[0] >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(log_root, &path);
+			if (ret) {
+				if (ret < 0)
+					err = 1;
+				break;
+			}
+			leaf = path.nodes[0];
+		}
+		btrfs_item_key_to_cpu(leaf, &key, path.slots[0]);
+
+		if (key.objectid > BTRFS_TREE_LOG_OBJECTID ||
+		    key.type > BTRFS_ROOT_ITEM_KEY)
+			break;
+
+		if (key.objectid == BTRFS_TREE_LOG_OBJECTID &&
+		    key.type == BTRFS_ROOT_ITEM_KEY &&
+		    fs_root_objectid(key.offset)) {
+			struct btrfs_root tmp_root;
+			struct extent_buffer *l;
+			struct btrfs_tree_parent_check check = { 0 };
+
+			memset(&tmp_root, 0, sizeof(tmp_root));
+
+			btrfs_setup_root(&tmp_root, gfs_info, key.offset);
+
+			l = path.nodes[0];
+			read_extent_buffer(l, &tmp_root.root_item,
+					btrfs_item_ptr_offset(l, path.slots[0]),
+					sizeof(tmp_root.root_item));
+
+			tmp_root.root_key.objectid = key.offset;
+			tmp_root.root_key.type = BTRFS_ROOT_ITEM_KEY;
+			tmp_root.root_key.offset = 0;
+
+			check.owner_root = btrfs_root_id(&tmp_root);
+			check.transid = btrfs_root_generation(&tmp_root.root_item);
+			check.level = btrfs_root_level(&tmp_root.root_item);
+
+			tmp_root.node = read_tree_block(gfs_info,
+							btrfs_root_bytenr(&tmp_root.root_item),
+							&check);
+			if (IS_ERR(tmp_root.node)) {
+				tmp_root.node = NULL;
+				err = 1;
+				goto next;
+			}
+
+			if (btrfs_header_level(tmp_root.node) != btrfs_root_level(&tmp_root.root_item)) {
+				error("root [%llu %llu] level %d does not match %d",
+					tmp_root.root_key.objectid,
+					tmp_root.root_key.offset,
+					btrfs_header_level(tmp_root.node),
+					btrfs_root_level(&tmp_root.root_item));
+				err = 1;
+				goto next;
+			}
+
+			ret = check_log_root(&tmp_root, root_cache, &wc);
+			if (ret)
+				err = 1;
+
+next:
+			if (tmp_root.node)
+				free_extent_buffer(tmp_root.node);
+		}
+		path.slots[0]++;
+	}
+out:
+	btrfs_release_path(&path);
+	if (err)
+		free_extent_cache_tree(&wc.shared);
+	if (!cache_tree_empty(&wc.shared))
+		fprintf(stderr, "warning line %d\n", __LINE__);
+
+	return err;
+}
+
 static void free_roots_info_cache(void)
 {
 	if (!roots_info_cache)
@@ -10587,7 +10844,7 @@ static int cmd_check(const struct cmd_struct *cmd, int argc, char **argv)
 
 	if (!init_extent_tree) {
 		if (!g_task_ctx.progress_enabled) {
-			fprintf(stderr, "[1/7] checking root items\n");
+			fprintf(stderr, "[1/8] checking root items\n");
 		} else {
 			g_task_ctx.tp = TASK_ROOT_ITEMS;
 			task_start(g_task_ctx.info, &g_task_ctx.start_time,
@@ -10622,11 +10879,11 @@ static int cmd_check(const struct cmd_struct *cmd, int argc, char **argv)
 			}
 		}
 	} else {
-		fprintf(stderr, "[1/7] checking root items... skipped\n");
+		fprintf(stderr, "[1/8] checking root items... skipped\n");
 	}
 
 	if (!g_task_ctx.progress_enabled) {
-		fprintf(stderr, "[2/7] checking extents\n");
+		fprintf(stderr, "[2/8] checking extents\n");
 	} else {
 		g_task_ctx.tp = TASK_EXTENTS;
 		task_start(g_task_ctx.info, &g_task_ctx.start_time, &g_task_ctx.item_count);
@@ -10644,9 +10901,9 @@ static int cmd_check(const struct cmd_struct *cmd, int argc, char **argv)
 
 	if (!g_task_ctx.progress_enabled) {
 		if (is_free_space_tree)
-			fprintf(stderr, "[3/7] checking free space tree\n");
+			fprintf(stderr, "[3/8] checking free space tree\n");
 		else
-			fprintf(stderr, "[3/7] checking free space cache\n");
+			fprintf(stderr, "[3/8] checking free space cache\n");
 	} else {
 		g_task_ctx.tp = TASK_FREE_SPACE;
 		task_start(g_task_ctx.info, &g_task_ctx.start_time, &g_task_ctx.item_count);
@@ -10664,7 +10921,7 @@ static int cmd_check(const struct cmd_struct *cmd, int argc, char **argv)
 	 */
 	no_holes = btrfs_fs_incompat(gfs_info, NO_HOLES);
 	if (!g_task_ctx.progress_enabled) {
-		fprintf(stderr, "[4/7] checking fs roots\n");
+		fprintf(stderr, "[4/8] checking fs roots\n");
 	} else {
 		g_task_ctx.tp = TASK_FS_ROOTS;
 		task_start(g_task_ctx.info, &g_task_ctx.start_time, &g_task_ctx.item_count);
@@ -10680,10 +10937,10 @@ static int cmd_check(const struct cmd_struct *cmd, int argc, char **argv)
 
 	if (!g_task_ctx.progress_enabled) {
 		if (check_data_csum)
-			fprintf(stderr, "[5/7] checking csums against data\n");
+			fprintf(stderr, "[5/8] checking csums against data\n");
 		else
 			fprintf(stderr,
-		"[5/7] checking only csums items (without verifying data)\n");
+		"[5/8] checking only csums items (without verifying data)\n");
 	} else {
 		g_task_ctx.tp = TASK_CSUMS;
 		task_start(g_task_ctx.info, &g_task_ctx.start_time, &g_task_ctx.item_count);
@@ -10702,7 +10959,7 @@ static int cmd_check(const struct cmd_struct *cmd, int argc, char **argv)
 	/* For low memory mode, check_fs_roots_v2 handles root refs */
         if (check_mode != CHECK_MODE_LOWMEM) {
 		if (!g_task_ctx.progress_enabled) {
-			fprintf(stderr, "[6/7] checking root refs\n");
+			fprintf(stderr, "[6/8] checking root refs\n");
 		} else {
 			g_task_ctx.tp = TASK_ROOT_REFS;
 			task_start(g_task_ctx.info, &g_task_ctx.start_time, &g_task_ctx.item_count);
@@ -10717,7 +10974,7 @@ static int cmd_check(const struct cmd_struct *cmd, int argc, char **argv)
 		}
 	} else {
 		fprintf(stderr,
-	"[6/7] checking root refs done with fs roots in lowmem mode, skipping\n");
+	"[6/8] checking root refs done with fs roots in lowmem mode, skipping\n");
 	}
 
 	while (opt_check_repair && !list_empty(&gfs_info->recow_ebs)) {
@@ -10749,7 +11006,7 @@ static int cmd_check(const struct cmd_struct *cmd, int argc, char **argv)
 
 	if (gfs_info->quota_enabled) {
 		if (!g_task_ctx.progress_enabled) {
-			fprintf(stderr, "[7/7] checking quota groups\n");
+			fprintf(stderr, "[7/8] checking quota groups\n");
 		} else {
 			g_task_ctx.tp = TASK_QGROUPS;
 			task_start(g_task_ctx.info, &g_task_ctx.start_time, &g_task_ctx.item_count);
@@ -10772,7 +11029,19 @@ static int cmd_check(const struct cmd_struct *cmd, int argc, char **argv)
 		ret = 0;
 	} else {
 		fprintf(stderr,
-		"[7/7] checking quota groups skipped (not enabled on this FS)\n");
+		"[7/8] checking quota groups skipped (not enabled on this FS)\n");
+	}
+
+	if (gfs_info->log_root_tree) {
+		fprintf(stderr, "[8/8] checking log\n");
+		ret = check_log(&root_cache);
+
+		if (ret)
+			error("errors found in log");
+		err |= !!ret;
+	} else {
+		fprintf(stderr,
+		"[8/8] checking log skipped (none written)\n");
 	}
 
 	if (!list_empty(&gfs_info->recow_ebs)) {
-- 
2.44.2


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH] btrfs-progs: add rudimentary log checking
  2024-07-03 16:28 [PATCH] btrfs-progs: add rudimentary log checking Mark Harmstone
@ 2024-07-03 17:13 ` Josef Bacik
  2024-07-03 23:14 ` Qu Wenruo
  1 sibling, 0 replies; 4+ messages in thread
From: Josef Bacik @ 2024-07-03 17:13 UTC (permalink / raw)
  To: Mark Harmstone; +Cc: linux-btrfs, Mark Harmstone

On Wed, Jul 03, 2024 at 05:28:32PM +0100, Mark Harmstone wrote:
> From: Mark Harmstone <maharmstone@meta.com>
> 
> Currently the transaction log is more or less ignored by btrfs check,
> meaning that it's possible for a FS with a corrupt log to pass btrfs
> check, but be immediately corrupted by the kernel when it's mounted.
> 
> This patch adds a check that if there's an inode in the log, any pending
> non-compressed writes also have corresponding csum entries.
> 
> Signed-off-by: Mark Harmstone <maharmstone@meta.com>
> ---
>  check/main.c | 293 ++++++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 281 insertions(+), 12 deletions(-)
> 
> diff --git a/check/main.c b/check/main.c
> index 83c721d3..6f3fab35 100644
> --- a/check/main.c
> +++ b/check/main.c
> @@ -9787,6 +9787,263 @@ static int zero_log_tree(struct btrfs_root *root)
>  	return ret;
>  }
>  
> +static int check_log_csum(struct btrfs_root *root, u64 addr, u64 length)
> +{
> +	struct btrfs_path path = { 0 };
> +	struct btrfs_key key;
> +	struct extent_buffer *leaf;
> +	u16 csum_size = gfs_info->csum_size;
> +	u16 num_entries;
> +	u64 data_len;
> +	int ret;
> +
> +	key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
> +	key.type = BTRFS_EXTENT_CSUM_KEY;
> +	key.offset = addr;
> +
> +	ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
> +	if (ret < 0)
> +		return ret;
> +
> +	if (ret > 0 && path.slots[0])
> +		path.slots[0]--;
> +
> +	ret = 0;
> +
> +	while (1) {
> +		leaf = path.nodes[0];
> +		if (path.slots[0] >= btrfs_header_nritems(leaf)) {
> +			ret = btrfs_next_leaf(root, &path);
> +			if (ret) {
> +				if (ret > 0)
> +					ret = 0;
> +
> +				break;
> +			}
> +			leaf = path.nodes[0];
> +		}
> +
> +		btrfs_item_key_to_cpu(leaf, &key, path.slots[0]);
> +
> +		if (key.objectid > BTRFS_EXTENT_CSUM_OBJECTID)
> +			break;
> +
> +		if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID &&
> +		    key.type == BTRFS_EXTENT_CSUM_KEY) {
> +			if (key.offset >= addr + length)
> +				break;
> +

You can turn this into

if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
    key.type != BTRFS_EXTENT_CSUM_KEY)
	goto next;

if (key.offset >= addr + length)
	break;

and put a next at path.slots[0]++;

and then you don't have to indent the bits below.

> +			num_entries = btrfs_item_size(leaf, path.slots[0]) / csum_size;
> +			data_len = num_entries * gfs_info->sectorsize;
> +
> +			if (addr >= key.offset && addr + length <= key.offset + data_len) {
> +				u64 end = min(addr + length, key.offset + data_len);
> +
> +				length = addr + length - end;
> +				addr = end;
> +
> +				if (length == 0)
> +					break;
> +			}
> +		}
> +
> +		path.slots[0]++;
> +	}
> +
> +	btrfs_release_path(&path);
> +
> +	if (ret >= 0)
> +		ret = length == 0 ? 1 : 0;
> +
> +	return ret;
> +}
> +
> +static int check_log_root(struct btrfs_root *root, struct cache_tree *root_cache,
> +			  struct walk_control *wc)
> +{
> +	struct btrfs_path path = { 0 };
> +	struct btrfs_key key;
> +	struct extent_buffer *leaf;
> +	int ret, err = 0;
> +	u64 last_csum_inode = 0;
> +
> +	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
> +	key.type = BTRFS_INODE_ITEM_KEY;
> +	key.offset = 0;
> +	ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
> +	if (ret < 0)
> +		return 1;
> +
> +	while (1) {
> +		leaf = path.nodes[0];
> +		if (path.slots[0] >= btrfs_header_nritems(leaf)) {
> +			ret = btrfs_next_leaf(root, &path);
> +			if (ret) {
> +				if (ret < 0)
> +					err = 1;
> +
> +				break;
> +			}
> +			leaf = path.nodes[0];
> +		}
> +		btrfs_item_key_to_cpu(leaf, &key, path.slots[0]);
> +
> +		if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID)
> +			break;
> +
> +		if (key.type == BTRFS_INODE_ITEM_KEY) {
> +			struct btrfs_inode_item *item;
> +
> +			item = btrfs_item_ptr(leaf, path.slots[0],
> +					      struct btrfs_inode_item);
> +
> +			if (!(btrfs_inode_flags(leaf, item) & BTRFS_INODE_NODATASUM))
> +				last_csum_inode = key.objectid;
> +		} else if (key.type == BTRFS_EXTENT_DATA_KEY &&
> +			   key.objectid == last_csum_inode) {
> +			struct btrfs_file_extent_item *fi;
> +			u64 addr, length;
> +
> +			fi = btrfs_item_ptr(leaf, path.slots[0],
> +					    struct btrfs_file_extent_item);
> +
> +			if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
> +				goto next;
> +
> +			if (btrfs_file_extent_compression(leaf, fi) != 0)
> +				goto next;

Compressed file extents should definitely have a csum associated with them.

> +
> +			addr = btrfs_file_extent_disk_bytenr(leaf, fi) +
> +				btrfs_file_extent_offset(leaf, fi);
> +			length = btrfs_file_extent_disk_num_bytes(leaf, fi);
> +
> +			ret = check_log_csum(root, addr, length);
> +			if (ret < 0) {
> +				err = 1;
> +				break;
> +			}
> +
> +			if (!ret) {
> +				error("csum missing in log (root %llu, inode %llu, "
> +				      "offset %llu, address 0x%llx, length %llu)",
> +				      root->objectid, last_csum_inode, key.offset,
> +				      addr, length);
> +				err = 1;
> +			}
> +		}
> +
> +next:
> +		path.slots[0]++;
> +	}
> +
> +	btrfs_release_path(&path);
> +
> +	return err;
> +}
> +
> +static int check_log(struct cache_tree *root_cache)
> +{
> +	struct btrfs_path path = { 0 };
> +	struct walk_control wc;
> +
> +	memset(&wc, 0, sizeof(wc));

You can just do

struct walk_contro wc = { 0 };

as well.

> +	cache_tree_init(&wc.shared);
> +
> +	struct btrfs_key key;
> +	struct extent_buffer *leaf;
> +	struct btrfs_root *log_root = gfs_info->log_root_tree;
> +	int ret;
> +	int err = 0;

We tend to prefer the declarations first, then code, so move this above please.

> +
> +	key.objectid = BTRFS_TREE_LOG_OBJECTID;
> +	key.type = BTRFS_ROOT_ITEM_KEY;
> +	key.offset = 0;
> +	ret = btrfs_search_slot(NULL, log_root, &key, &path, 0, 0);
> +	if (ret < 0) {
> +		err = 1;
> +		goto out;
> +	}
> +
> +	while (1) {
> +		leaf = path.nodes[0];
> +		if (path.slots[0] >= btrfs_header_nritems(leaf)) {
> +			ret = btrfs_next_leaf(log_root, &path);
> +			if (ret) {
> +				if (ret < 0)
> +					err = 1;
> +				break;
> +			}
> +			leaf = path.nodes[0];
> +		}
> +		btrfs_item_key_to_cpu(leaf, &key, path.slots[0]);
> +
> +		if (key.objectid > BTRFS_TREE_LOG_OBJECTID ||
> +		    key.type > BTRFS_ROOT_ITEM_KEY)
> +			break;
> +
> +		if (key.objectid == BTRFS_TREE_LOG_OBJECTID &&
> +		    key.type == BTRFS_ROOT_ITEM_KEY &&
> +		    fs_root_objectid(key.offset)) {
> +			struct btrfs_root tmp_root;
> +			struct extent_buffer *l;
> +			struct btrfs_tree_parent_check check = { 0 };
> +
> +			memset(&tmp_root, 0, sizeof(tmp_root));
> +
> +			btrfs_setup_root(&tmp_root, gfs_info, key.offset);
> +
> +			l = path.nodes[0];
> +			read_extent_buffer(l, &tmp_root.root_item,
> +					btrfs_item_ptr_offset(l, path.slots[0]),
> +					sizeof(tmp_root.root_item));
> +
> +			tmp_root.root_key.objectid = key.offset;
> +			tmp_root.root_key.type = BTRFS_ROOT_ITEM_KEY;
> +			tmp_root.root_key.offset = 0;
> +
> +			check.owner_root = btrfs_root_id(&tmp_root);
> +			check.transid = btrfs_root_generation(&tmp_root.root_item);
> +			check.level = btrfs_root_level(&tmp_root.root_item);
> +
> +			tmp_root.node = read_tree_block(gfs_info,
> +							btrfs_root_bytenr(&tmp_root.root_item),
> +							&check);
> +			if (IS_ERR(tmp_root.node)) {
> +				tmp_root.node = NULL;
> +				err = 1;
> +				goto next;
> +			}
> +
> +			if (btrfs_header_level(tmp_root.node) != btrfs_root_level(&tmp_root.root_item)) {
> +				error("root [%llu %llu] level %d does not match %d",
> +					tmp_root.root_key.objectid,
> +					tmp_root.root_key.offset,
> +					btrfs_header_level(tmp_root.node),
> +					btrfs_root_level(&tmp_root.root_item));
> +				err = 1;
> +				goto next;
> +			}

Turn the above into a helper.  Thanks,

Josef

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] btrfs-progs: add rudimentary log checking
  2024-07-03 16:28 [PATCH] btrfs-progs: add rudimentary log checking Mark Harmstone
  2024-07-03 17:13 ` Josef Bacik
@ 2024-07-03 23:14 ` Qu Wenruo
  2024-07-11 14:12   ` Mark Harmstone
  1 sibling, 1 reply; 4+ messages in thread
From: Qu Wenruo @ 2024-07-03 23:14 UTC (permalink / raw)
  To: Mark Harmstone, linux-btrfs; +Cc: Mark Harmstone

在 2024/7/4 01:58, Mark Harmstone 写道:
> From: Mark Harmstone <maharmstone@meta.com>
>
> Currently the transaction log is more or less ignored by btrfs check,

By "transaction log" did you mean log tree?

> meaning that it's possible for a FS with a corrupt log to pass btrfs
> check, but be immediately corrupted by the kernel when it's mounted.

Firstly, if kernel can really corrupt the fs with such corrupted log (I
mean not just csum mismatch, but metadata level corruption), then it's a
kernel bug and we need to first fix it.

The expected behavior is, kernel itself should not generate such
corrupted log tree.
Also the kernel should reject such corrupted log tree.

Did you hit such problem (missing csum or bad inodes etc) in real world?

[...]> +		if (key.objectid == BTRFS_TREE_LOG_OBJECTID &&
> +		    key.type == BTRFS_ROOT_ITEM_KEY &&
> +		    fs_root_objectid(key.offset)) {
> +			struct btrfs_root tmp_root;
> +			struct extent_buffer *l;
> +			struct btrfs_tree_parent_check check = { 0 };
> +
> +			memset(&tmp_root, 0, sizeof(tmp_root));
> +
> +			btrfs_setup_root(&tmp_root, gfs_info, key.offset);
> +
> +			l = path.nodes[0];
> +			read_extent_buffer(l, &tmp_root.root_item,
> +					btrfs_item_ptr_offset(l, path.slots[0]),
> +					sizeof(tmp_root.root_item));
> +
> +			tmp_root.root_key.objectid = key.offset;
> +			tmp_root.root_key.type = BTRFS_ROOT_ITEM_KEY;
> +			tmp_root.root_key.offset = 0;
> +
> +			check.owner_root = btrfs_root_id(&tmp_root);
> +			check.transid = btrfs_root_generation(&tmp_root.root_item);
> +			check.level = btrfs_root_level(&tmp_root.root_item);
> +
> +			tmp_root.node = read_tree_block(gfs_info,
> +							btrfs_root_bytenr(&tmp_root.root_item),
> +							&check);

This is mostly a hard-coded btrfs_read_fs_root().

I know you did this because we do not have a good infrastructure to read
out a log tree.
But I really prefer to have a proper helper to do that.

[...]
> +	if (gfs_info->log_root_tree) {
> +		fprintf(stderr, "[8/8] checking log\n");
> +		ret = check_log(&root_cache);
> +
> +		if (ret)
> +			error("errors found in log");
> +		err |= !!ret;
> +	} else {
> +		fprintf(stderr,
> +		"[8/8] checking log skipped (none written)\n");

The timing may be problematic.

Firstly btrfs-check can do write operations, and although we have checks
to ensure we zero the log first, it only zeros the log in the super
block, not clearing the fs_info->log_root_tree.

So this means, for btrfs-check --repair runs, we can modify the base fs,
then you do the log tree check based on the modified fs, which can cause
various problems.

At least we need to make zero_log_tree() to cleanup fs_info->log_tree_root.

Thus I'd really prefer to do the log tree check first, as in kernel log
replay is the first thing to be done.

Thanks,
Qu

>   	}
>
>   	if (!list_empty(&gfs_info->recow_ebs)) {

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] btrfs-progs: add rudimentary log checking
  2024-07-03 23:14 ` Qu Wenruo
@ 2024-07-11 14:12   ` Mark Harmstone
  0 siblings, 0 replies; 4+ messages in thread
From: Mark Harmstone @ 2024-07-11 14:12 UTC (permalink / raw)
  To: Qu Wenruo, linux-btrfs@vger.kernel.org

Thanks Qu.

> Firstly, if kernel can really corrupt the fs with such corrupted log (I
> mean not just csum mismatch, but metadata level corruption), then it's a
> kernel bug and we need to first fix it.
> 
> The expected behavior is, kernel itself should not generate such
> corrupted log tree.
> Also the kernel should reject such corrupted log tree.
> 
> Did you hit such problem (missing csum or bad inodes etc) in real world?

Yes, this is part of an investigation I'm doing into a log tree bug. 
There's a bug somewhere in the kernel, but we can't fix it without btrfs 
check being able to diagnose it.

> This is mostly a hard-coded btrfs_read_fs_root().
> 
> I know you did this because we do not have a good infrastructure to read
> out a log tree.
> But I really prefer to have a proper helper to do that.

No problem, Josef said the same thing.

> Thus I'd really prefer to do the log tree check first, as in kernel log
> replay is the first thing to be done.

Fine by me.

Mark

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2024-07-11 14:12 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-07-03 16:28 [PATCH] btrfs-progs: add rudimentary log checking Mark Harmstone
2024-07-03 17:13 ` Josef Bacik
2024-07-03 23:14 ` Qu Wenruo
2024-07-11 14:12   ` Mark Harmstone

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.