From: Qu Wenruo <wqu@suse.com>
To: linux-btrfs@vger.kernel.org
Subject: [PATCH RFC 3/5] btrfs: scrub: extract the common preparation before scrubbing a block group
Date: Sat, 17 Jun 2023 13:07:24 +0800 [thread overview]
Message-ID: <8fc39e8af0ce2c335f0e41bdd8b616b30ac2d7b9.1686977659.git.wqu@suse.com> (raw)
In-Reply-To: <cover.1686977659.git.wqu@suse.com>
Introduce a new helper, prepare_scrub_block_group(), to handle the
checks before scrubbing a block group.
The helper will be called by both scrub_enumerate_chunks() and
scrub_logical_chunks().
Signed-off-by: Qu Wenruo <wqu@suse.com>
---
fs/btrfs/scrub.c | 328 ++++++++++++++++++++++++++++++-----------------
1 file changed, 209 insertions(+), 119 deletions(-)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 598754ad7ce6..aa68cda5226f 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2349,6 +2349,141 @@ static int finish_extent_writes_for_zoned(struct btrfs_root *root,
return btrfs_commit_transaction(trans);
}
+/*
+ * Do the preparation before we scrub the block group.
+ *
+ * Return >0 if we don't need to scrub the block group.
+ * Return 0 if we have done the preparation and set @ro_set_ret.
+ * Return <0 for error and can not scrub the bg.
+ */
+static int prepare_scrub_block_group(struct scrub_ctx *sctx,
+ struct btrfs_block_group *bg,
+ bool *ro_set_ret)
+{
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ struct btrfs_root *root = fs_info->dev_root;
+ bool ro_set = false;
+ int ret;
+
+ if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
+ if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &bg->runtime_flags))
+ return 1;
+ }
+
+ /*
+ * Make sure that while we are scrubbing the corresponding block
+ * group doesn't get its logical address and its device extents
+ * reused for another block group, which can possibly be of a
+ * different type and different profile. We do this to prevent
+ * false error detections and crashes due to bogus attempts to
+ * repair extents.
+ */
+ spin_lock(&bg->lock);
+ if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) {
+ spin_unlock(&bg->lock);
+ return 1;
+ }
+ btrfs_freeze_block_group(bg);
+ spin_unlock(&bg->lock);
+
+ /*
+ * we need call btrfs_inc_block_group_ro() with scrubs_paused,
+ * to avoid deadlock caused by:
+ * btrfs_inc_block_group_ro()
+ * -> btrfs_wait_for_commit()
+ * -> btrfs_commit_transaction()
+ * -> btrfs_scrub_pause()
+ */
+ scrub_pause_on(fs_info);
+
+ /*
+ * Don't do chunk preallocation for scrub.
+ *
+ * This is especially important for SYSTEM bgs, or we can hit
+ * -EFBIG from btrfs_finish_chunk_alloc() like:
+ * 1. The only SYSTEM bg is marked RO.
+ * Since SYSTEM bg is small, that's pretty common.
+ * 2. New SYSTEM bg will be allocated
+ * Due to regular version will allocate new chunk.
+ * 3. New SYSTEM bg is empty and will get cleaned up
+ * Before cleanup really happens, it's marked RO again.
+ * 4. Empty SYSTEM bg get scrubbed
+ * We go back to 2.
+ *
+ * This can easily boost the amount of SYSTEM chunks if cleaner
+ * thread can't be triggered fast enough, and use up all space
+ * of btrfs_super_block::sys_chunk_array
+ *
+ * While for dev replace, we need to try our best to mark block
+ * group RO, to prevent race between:
+ * - Write duplication
+ * Contains latest data
+ * - Scrub copy
+ * Contains data from commit tree
+ *
+ * If target block group is not marked RO, nocow writes can
+ * be overwritten by scrub copy, causing data corruption.
+ * So for dev-replace, it's not allowed to continue if a block
+ * group is not RO.
+ */
+ ret = btrfs_inc_block_group_ro(bg, sctx->is_dev_replace);
+ if (!ret && sctx->is_dev_replace) {
+ ret = finish_extent_writes_for_zoned(root, bg);
+ if (ret) {
+ btrfs_dec_block_group_ro(bg);
+ scrub_pause_off(fs_info);
+ return ret;
+ }
+ }
+
+ if (ret == 0) {
+ ro_set = 1;
+ } else if (ret == -ENOSPC && !sctx->is_dev_replace &&
+ !(bg->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) {
+ /*
+ * btrfs_inc_block_group_ro return -ENOSPC when it
+ * failed in creating new chunk for metadata.
+ * It is not a problem for scrub, because
+ * metadata are always cowed, and our scrub paused
+ * commit_transactions.
+ *
+ * For RAID56 chunks, we have to mark them read-only
+ * for scrub, as later we would use our own cache
+ * out of RAID56 realm.
+ * Thus we want the RAID56 bg to be marked RO to
+ * prevent RMW from screwing up out cache.
+ */
+ ro_set = 0;
+ } else if (ret == -ETXTBSY) {
+ btrfs_warn(fs_info,
+ "skipping scrub of block group %llu due to active swapfile",
+ bg->start);
+ btrfs_unfreeze_block_group(bg);
+ scrub_pause_off(fs_info);
+ return ret;
+ } else {
+ btrfs_warn(fs_info,
+ "failed setting block group ro: %d", ret);
+ btrfs_unfreeze_block_group(bg);
+ scrub_pause_off(fs_info);
+ return ret;
+ }
+
+ /*
+ * Now the target block is marked RO, wait for nocow writes to
+ * finish before dev-replace.
+ * COW is fine, as COW never overwrites extents in commit tree.
+ */
+ if (sctx->is_dev_replace) {
+ btrfs_wait_nocow_writers(bg);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, bg->start,
+ bg->length);
+ }
+ scrub_pause_off(fs_info);
+ *ro_set_ret = ro_set;
+ return 0;
+}
+
static noinline_for_stack
int scrub_enumerate_chunks(struct scrub_ctx *sctx,
struct btrfs_device *scrub_dev, u64 start, u64 end)
@@ -2359,7 +2494,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
struct btrfs_root *root = fs_info->dev_root;
u64 chunk_offset;
int ret = 0;
- int ro_set;
int slot;
struct extent_buffer *l;
struct btrfs_key key;
@@ -2380,6 +2514,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
key.type = BTRFS_DEV_EXTENT_KEY;
while (1) {
+ bool ro_set = false;
u64 dev_extent_len;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -2461,127 +2596,20 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
goto skip;
}
- if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
- if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags)) {
- btrfs_put_block_group(cache);
- goto skip;
- }
+ ret = prepare_scrub_block_group(sctx, cache, &ro_set);
+ if (ret == -ETXTBSY) {
+ ret = 0;
+ goto skip_unfreeze;
}
-
- /*
- * Make sure that while we are scrubbing the corresponding block
- * group doesn't get its logical address and its device extents
- * reused for another block group, which can possibly be of a
- * different type and different profile. We do this to prevent
- * false error detections and crashes due to bogus attempts to
- * repair extents.
- */
- spin_lock(&cache->lock);
- if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) {
- spin_unlock(&cache->lock);
+ if (ret < 0) {
+ btrfs_put_block_group(cache);
+ break;
+ }
+ if (ret > 0) {
btrfs_put_block_group(cache);
goto skip;
}
- btrfs_freeze_block_group(cache);
- spin_unlock(&cache->lock);
- /*
- * we need call btrfs_inc_block_group_ro() with scrubs_paused,
- * to avoid deadlock caused by:
- * btrfs_inc_block_group_ro()
- * -> btrfs_wait_for_commit()
- * -> btrfs_commit_transaction()
- * -> btrfs_scrub_pause()
- */
- scrub_pause_on(fs_info);
-
- /*
- * Don't do chunk preallocation for scrub.
- *
- * This is especially important for SYSTEM bgs, or we can hit
- * -EFBIG from btrfs_finish_chunk_alloc() like:
- * 1. The only SYSTEM bg is marked RO.
- * Since SYSTEM bg is small, that's pretty common.
- * 2. New SYSTEM bg will be allocated
- * Due to regular version will allocate new chunk.
- * 3. New SYSTEM bg is empty and will get cleaned up
- * Before cleanup really happens, it's marked RO again.
- * 4. Empty SYSTEM bg get scrubbed
- * We go back to 2.
- *
- * This can easily boost the amount of SYSTEM chunks if cleaner
- * thread can't be triggered fast enough, and use up all space
- * of btrfs_super_block::sys_chunk_array
- *
- * While for dev replace, we need to try our best to mark block
- * group RO, to prevent race between:
- * - Write duplication
- * Contains latest data
- * - Scrub copy
- * Contains data from commit tree
- *
- * If target block group is not marked RO, nocow writes can
- * be overwritten by scrub copy, causing data corruption.
- * So for dev-replace, it's not allowed to continue if a block
- * group is not RO.
- */
- ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
- if (!ret && sctx->is_dev_replace) {
- ret = finish_extent_writes_for_zoned(root, cache);
- if (ret) {
- btrfs_dec_block_group_ro(cache);
- scrub_pause_off(fs_info);
- btrfs_put_block_group(cache);
- break;
- }
- }
-
- if (ret == 0) {
- ro_set = 1;
- } else if (ret == -ENOSPC && !sctx->is_dev_replace &&
- !(cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) {
- /*
- * btrfs_inc_block_group_ro return -ENOSPC when it
- * failed in creating new chunk for metadata.
- * It is not a problem for scrub, because
- * metadata are always cowed, and our scrub paused
- * commit_transactions.
- *
- * For RAID56 chunks, we have to mark them read-only
- * for scrub, as later we would use our own cache
- * out of RAID56 realm.
- * Thus we want the RAID56 bg to be marked RO to
- * prevent RMW from screwing up out cache.
- */
- ro_set = 0;
- } else if (ret == -ETXTBSY) {
- btrfs_warn(fs_info,
- "skipping scrub of block group %llu due to active swapfile",
- cache->start);
- scrub_pause_off(fs_info);
- ret = 0;
- goto skip_unfreeze;
- } else {
- btrfs_warn(fs_info,
- "failed setting block group ro: %d", ret);
- btrfs_unfreeze_block_group(cache);
- btrfs_put_block_group(cache);
- scrub_pause_off(fs_info);
- break;
- }
-
- /*
- * Now the target block is marked RO, wait for nocow writes to
- * finish before dev-replace.
- * COW is fine, as COW never overwrites extents in commit tree.
- */
- if (sctx->is_dev_replace) {
- btrfs_wait_nocow_writers(cache);
- btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
- cache->length);
- }
-
- scrub_pause_off(fs_info);
down_write(&dev_replace->rwsem);
dev_replace->cursor_right = found_key.offset + dev_extent_len;
dev_replace->cursor_left = found_key.offset;
@@ -2960,6 +2988,69 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
return ret;
}
+static int scrub_logical_chunks(struct scrub_ctx *sctx, u64 start, u64 end)
+{
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ u64 cur = start;
+ int ret = 0;
+
+ while (cur < end) {
+ struct btrfs_block_group *bg;
+ bool ro_set = false;
+
+ bg = btrfs_lookup_first_block_group(fs_info, cur);
+
+ /* No more block groups. */
+ if (!bg)
+ break;
+ if (bg->start > end)
+ break;
+
+ ret = prepare_scrub_block_group(sctx, bg, &ro_set);
+ if (ret == -ETXTBSY) {
+ ret = 0;
+ goto next;
+ }
+ if (ret > 0)
+ goto next;
+ if (ret < 0) {
+ btrfs_put_block_group(bg);
+ break;
+ }
+
+ /* The real work starts here. */
+ ret = -EOPNOTSUPP;
+
+ if (ro_set)
+ btrfs_dec_block_group_ro(bg);
+ /*
+ * We might have prevented the cleaner kthread from deleting
+ * this block group if it was already unused because we raced
+ * and set it to RO mode first. So add it back to the unused
+ * list, otherwise it might not ever be deleted unless a manual
+ * balance is triggered or it becomes used and unused again.
+ */
+ spin_lock(&bg->lock);
+ if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags) &&
+ !bg->ro && bg->reserved == 0 && bg->used == 0) {
+ spin_unlock(&bg->lock);
+ if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
+ btrfs_discard_queue_work(&fs_info->discard_ctl, bg);
+ else
+ btrfs_mark_bg_unused(bg);
+ } else {
+ spin_unlock(&bg->lock);
+ }
+ btrfs_unfreeze_block_group(bg);
+next:
+ cur = bg->start + bg->length;
+ btrfs_put_block_group(bg);
+ if (ret < 0)
+ break;
+ }
+ return ret;
+}
+
int btrfs_scrub_logical(struct btrfs_fs_info *fs_info, u64 start, u64 end,
struct btrfs_scrub_progress *progress, bool readonly)
{
@@ -3006,8 +3097,7 @@ int btrfs_scrub_logical(struct btrfs_fs_info *fs_info, u64 start, u64 end,
atomic_inc(&fs_info->scrubs_logical_running);
mutex_unlock(&fs_info->scrub_lock);
- /* The main work would be implemented. */
- ret = -EOPNOTSUPP;
+ ret = scrub_logical_chunks(sctx, start, end);
atomic_dec(&fs_info->scrubs_running);
atomic_dec(&fs_info->scrubs_logical_running);
--
2.41.0
next prev parent reply other threads:[~2023-06-17 5:07 UTC|newest]
Thread overview: 11+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-06-17 5:07 [PATCH RFC 0/5] btrfs: scrub: introduce SCRUB_LOGICAL flag Qu Wenruo
2023-06-17 5:07 ` [PATCH RFC 1/5] btrfs: scrub: make scrub_ctx::stripes dynamically allocated Qu Wenruo
2023-06-20 8:43 ` Johannes Thumshirn
2023-06-17 5:07 ` [PATCH RFC 2/5] btrfs: scrub: introduce the skeleton for logical-scrub Qu Wenruo
2023-06-20 8:50 ` Johannes Thumshirn
2023-06-17 5:07 ` Qu Wenruo [this message]
2023-06-20 8:55 ` [PATCH RFC 3/5] btrfs: scrub: extract the common preparation before scrubbing a block group Johannes Thumshirn
2023-06-17 5:07 ` [PATCH RFC 4/5] btrfs: scrub: implement the basic extent iteration code Qu Wenruo
2023-06-17 5:07 ` [PATCH RFC 5/5] btrfs: scrub: implement the repair part of scrub logical Qu Wenruo
2023-06-20 9:24 ` Johannes Thumshirn
2023-06-17 5:12 ` [PATCH RFC 0/5] btrfs: scrub: introduce SCRUB_LOGICAL flag Qu Wenruo
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=8fc39e8af0ce2c335f0e41bdd8b616b30ac2d7b9.1686977659.git.wqu@suse.com \
--to=wqu@suse.com \
--cc=linux-btrfs@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox