* [PATCH RFC 1/5] btrfs: scrub: make scrub_ctx::stripes dynamically allocated
2023-06-17 5:07 [PATCH RFC 0/5] btrfs: scrub: introduce SCRUB_LOGICAL flag Qu Wenruo
@ 2023-06-17 5:07 ` Qu Wenruo
2023-06-20 8:43 ` Johannes Thumshirn
2023-06-17 5:07 ` [PATCH RFC 2/5] btrfs: scrub: introduce the skeleton for logical-scrub Qu Wenruo
` (4 subsequent siblings)
5 siblings, 1 reply; 11+ messages in thread
From: Qu Wenruo @ 2023-06-17 5:07 UTC (permalink / raw)
To: linux-btrfs
Currently scrub_ctx::stripes is a fixed-size array, which is fine for
most use cases, but later we may want to allocate one larger sized array
for logical bytenr based scrub.
So here we change the member to a dynamically allocated array.
This also affects the lifespan of the member.
Now it only needs to be allocated and initialized at the beginning of
scrub_stripe() function.
Signed-off-by: Qu Wenruo <wqu@suse.com>
---
fs/btrfs/scrub.c | 65 +++++++++++++++++++++++++++++++++++++-----------
1 file changed, 51 insertions(+), 14 deletions(-)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 093823aa8d2c..d83bcc396bb0 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -172,7 +172,7 @@ struct scrub_stripe {
};
struct scrub_ctx {
- struct scrub_stripe stripes[SCRUB_STRIPES_PER_SCTX];
+ struct scrub_stripe *stripes;
struct scrub_stripe *raid56_data_stripes;
struct btrfs_fs_info *fs_info;
int first_free;
@@ -181,6 +181,9 @@ struct scrub_ctx {
int readonly;
int sectors_per_bio;
+ /* Number of stripes we have in @stripes. */
+ unsigned int nr_stripes;
+
/* State of IO submission throttling affecting the associated device */
ktime_t throttle_deadline;
u64 throttle_sent;
@@ -315,9 +318,10 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
if (!sctx)
return;
- for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++)
- release_scrub_stripe(&sctx->stripes[i]);
-
+ if (sctx->stripes)
+ for (i = 0; i < sctx->nr_stripes; i++)
+ release_scrub_stripe(&sctx->stripes[i]);
+ kfree(sctx->stripes);
kfree(sctx);
}
@@ -331,7 +335,6 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
struct btrfs_fs_info *fs_info, int is_dev_replace)
{
struct scrub_ctx *sctx;
- int i;
sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
if (!sctx)
@@ -339,14 +342,6 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
refcount_set(&sctx->refs, 1);
sctx->is_dev_replace = is_dev_replace;
sctx->fs_info = fs_info;
- for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) {
- int ret;
-
- ret = init_scrub_stripe(fs_info, &sctx->stripes[i]);
- if (ret < 0)
- goto nomem;
- sctx->stripes[i].sctx = sctx;
- }
sctx->first_free = 0;
atomic_set(&sctx->cancel_req, 0);
@@ -1660,6 +1655,7 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx)
const int nr_stripes = sctx->cur_stripe;
int ret = 0;
+ ASSERT(nr_stripes <= sctx->nr_stripes);
if (!nr_stripes)
return 0;
@@ -1754,8 +1750,11 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *
struct scrub_stripe *stripe;
int ret;
+ ASSERT(sctx->stripes);
+ ASSERT(sctx->nr_stripes);
+
/* No available slot, submit all stripes and wait for them. */
- if (sctx->cur_stripe >= SCRUB_STRIPES_PER_SCTX) {
+ if (sctx->cur_stripe >= sctx->nr_stripes) {
ret = flush_scrub_stripes(sctx);
if (ret < 0)
return ret;
@@ -2080,6 +2079,39 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx,
return ret;
}
+static void free_scrub_stripes(struct scrub_ctx *sctx)
+{
+ for (int i = 0; i < sctx->nr_stripes; i++)
+ release_scrub_stripe(&sctx->stripes[i]);
+ kfree(sctx->stripes);
+ sctx->nr_stripes = 0;
+ sctx->stripes = NULL;
+}
+
+static int alloc_scrub_stripes(struct scrub_ctx *sctx, int nr_stripes)
+{
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ int ret;
+
+ ASSERT(!sctx->stripes);
+ ASSERT(!sctx->nr_stripes);
+ sctx->stripes = kcalloc(nr_stripes, sizeof(struct scrub_stripe),
+ GFP_KERNEL);
+ if (!sctx->stripes)
+ return -ENOMEM;
+ sctx->nr_stripes = nr_stripes;
+ for (int i = 0; i < sctx->nr_stripes; i++) {
+ ret = init_scrub_stripe(fs_info, &sctx->stripes[i]);
+ if (ret < 0)
+ goto cleanup;
+ sctx->stripes[i].sctx = sctx;
+ }
+ return 0;
+cleanup:
+ free_scrub_stripes(sctx);
+ return -ENOMEM;
+}
+
static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
struct btrfs_block_group *bg,
struct extent_map *em,
@@ -2106,6 +2138,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
scrub_blocked_if_needed(fs_info);
+ ret = alloc_scrub_stripes(sctx, SCRUB_STRIPES_PER_SCTX);
+ if (ret < 0)
+ return ret;
+
if (sctx->is_dev_replace &&
btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
mutex_lock(&sctx->wr_lock);
@@ -2228,6 +2264,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
kfree(sctx->raid56_data_stripes);
sctx->raid56_data_stripes = NULL;
}
+ free_scrub_stripes(sctx);
if (sctx->is_dev_replace && ret >= 0) {
int ret2;
--
2.41.0
^ permalink raw reply related [flat|nested] 11+ messages in thread* Re: [PATCH RFC 1/5] btrfs: scrub: make scrub_ctx::stripes dynamically allocated
2023-06-17 5:07 ` [PATCH RFC 1/5] btrfs: scrub: make scrub_ctx::stripes dynamically allocated Qu Wenruo
@ 2023-06-20 8:43 ` Johannes Thumshirn
0 siblings, 0 replies; 11+ messages in thread
From: Johannes Thumshirn @ 2023-06-20 8:43 UTC (permalink / raw)
To: Qu Wenruo, linux-btrfs@vger.kernel.org
On 17.06.23 07:08, Qu Wenruo wrote:
> @@ -315,9 +318,10 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
> if (!sctx)
> return;
>
> - for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++)
> - release_scrub_stripe(&sctx->stripes[i]);
> -
> + if (sctx->stripes)
> + for (i = 0; i < sctx->nr_stripes; i++)
> + release_scrub_stripe(&sctx->stripes[i]);
I think we're usually guarding the inner part of the if by {}.
> + kfree(sctx->stripes);
> kfree(sctx);
But it could even be simplified to:
static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
{
int i;
if (!sctx)
return;
free_scrub_stripes(sctx);
kfree(sctx);
}
> +static void free_scrub_stripes(struct scrub_ctx *sctx)
> +{
+ if (!sctx->stripes)
return;
> + for (int i = 0; i < sctx->nr_stripes; i++)
> + release_scrub_stripe(&sctx->stripes[i]);
> + kfree(sctx->stripes);
> + sctx->nr_stripes = 0;
> + sctx->stripes = NULL;
> +}
> +
Otherwise looks good to me
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH RFC 2/5] btrfs: scrub: introduce the skeleton for logical-scrub
2023-06-17 5:07 [PATCH RFC 0/5] btrfs: scrub: introduce SCRUB_LOGICAL flag Qu Wenruo
2023-06-17 5:07 ` [PATCH RFC 1/5] btrfs: scrub: make scrub_ctx::stripes dynamically allocated Qu Wenruo
@ 2023-06-17 5:07 ` Qu Wenruo
2023-06-20 8:50 ` Johannes Thumshirn
2023-06-17 5:07 ` [PATCH RFC 3/5] btrfs: scrub: extract the common preparation before scrubbing a block group Qu Wenruo
` (3 subsequent siblings)
5 siblings, 1 reply; 11+ messages in thread
From: Qu Wenruo @ 2023-06-17 5:07 UTC (permalink / raw)
To: linux-btrfs
Currently btrfs scrub is a per-device operation, this is fine for most
non-RAID56 profiles, but not a good thing for RAID56 profiles.
The main challenge for RAID56 is that we will read out data stripes more
than once (once for the data stripe scrub itself, and again for
parities).
To address this, and maybe even improve the non-RAID56 scrub, here we
introduce a new scrub flag, SCRUB_LOGICAL.
This would be a per-fs operation, and conflicts with any
dev-scrub/dev-replace.
This patch only implements the basic exclusion checks.
Signed-off-by: Qu Wenruo <wqu@suse.com>
---
fs/btrfs/disk-io.c | 1 +
fs/btrfs/fs.h | 12 +++++++
fs/btrfs/ioctl.c | 6 +++-
fs/btrfs/scrub.c | 72 +++++++++++++++++++++++++++++++++++++-
fs/btrfs/scrub.h | 2 ++
include/uapi/linux/btrfs.h | 11 ++++--
6 files changed, 100 insertions(+), 4 deletions(-)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1168e5df8bae..5a970040e5db 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1845,6 +1845,7 @@ static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
{
mutex_init(&fs_info->scrub_lock);
atomic_set(&fs_info->scrubs_running, 0);
+ atomic_set(&fs_info->scrubs_logical_running, 0);
atomic_set(&fs_info->scrub_pause_req, 0);
atomic_set(&fs_info->scrubs_paused, 0);
atomic_set(&fs_info->scrub_cancel_req, 0);
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 396e2a463480..2067722ea6c8 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -631,7 +631,19 @@ struct btrfs_fs_info {
/* Private scrub information */
struct mutex scrub_lock;
+
+ /*
+ * Number of running scrubs, including both dev-scrub (at most
+ * one dev-scrub on each device) and logical-scrub (at most
+ * one logical-scrub for each fs).
+ */
atomic_t scrubs_running;
+
+ /*
+ * Number of running logical scrubs; there is at most one running
+ * logical scrub for each fs.
+ */
+ atomic_t scrubs_logical_running;
atomic_t scrub_pause_req;
atomic_t scrubs_paused;
atomic_t scrub_cancel_req;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index edbbd5cf23fc..1a1afcce73e0 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3179,7 +3179,11 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
goto out;
}
- ret = btrfs_scrub_dev(fs_info, sa->devid, sa->start, sa->end,
+ if (sa->flags & BTRFS_SCRUB_LOGICAL)
+ ret = btrfs_scrub_logical(fs_info, sa->start, sa->end,
+ &sa->progress, sa->flags & BTRFS_SCRUB_READONLY);
+ else
+ ret = btrfs_scrub_dev(fs_info, sa->devid, sa->start, sa->end,
&sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
0);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index d83bcc396bb0..598754ad7ce6 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -178,7 +178,8 @@ struct scrub_ctx {
int first_free;
int cur_stripe;
atomic_t cancel_req;
- int readonly;
+ bool readonly;
+ bool scrub_logical;
int sectors_per_bio;
/* Number of stripes we have in @stripes. */
@@ -2841,6 +2842,14 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
goto out;
}
+ /* Dev-scrub conflicts with logical-scrub. */
+ if (atomic_read(&fs_info->scrubs_logical_running)) {
+ mutex_unlock(&fs_info->scrub_lock);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ ret = -EINPROGRESS;
+ goto out;
+ }
+
down_read(&fs_info->dev_replace.rwsem);
if (dev->scrub_ctx ||
(!is_dev_replace &&
@@ -2951,6 +2960,67 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
return ret;
}
+int btrfs_scrub_logical(struct btrfs_fs_info *fs_info, u64 start, u64 end,
+ struct btrfs_scrub_progress *progress, bool readonly)
+{
+ struct scrub_ctx *sctx;
+ int ret;
+
+ if (btrfs_fs_closing(fs_info))
+ return -EAGAIN;
+
+ /* At mount time we have ensured nodesize is in the range of [4K, 64K]. */
+ ASSERT(fs_info->nodesize <= BTRFS_STRIPE_LEN);
+
+ sctx = scrub_setup_ctx(fs_info, false);
+ if (IS_ERR(sctx))
+ return PTR_ERR(sctx);
+ sctx->scrub_logical = true;
+ sctx->readonly = readonly;
+
+ ret = scrub_workers_get(fs_info, false);
+ if (ret)
+ goto out_free_ctx;
+
+ /* Make sure we're the only running scrub. */
+ mutex_lock(&fs_info->scrub_lock);
+ if (atomic_read(&fs_info->scrubs_running)) {
+ mutex_unlock(&fs_info->scrub_lock);
+ ret = -EINPROGRESS;
+ goto out;
+ }
+ down_read(&fs_info->dev_replace.rwsem);
+ if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
+ up_read(&fs_info->dev_replace.rwsem);
+ mutex_unlock(&fs_info->scrub_lock);
+ ret = -EINPROGRESS;
+ goto out;
+ }
+ up_read(&fs_info->dev_replace.rwsem);
+ /*
+ * Checking @scrub_pause_req here, we can avoid
+ * race between committing transaction and scrubbing.
+ */
+ __scrub_blocked_if_needed(fs_info);
+ atomic_inc(&fs_info->scrubs_running);
+ atomic_inc(&fs_info->scrubs_logical_running);
+ mutex_unlock(&fs_info->scrub_lock);
+
+ /* The main work would be implemented. */
+ ret = -EOPNOTSUPP;
+
+ atomic_dec(&fs_info->scrubs_running);
+ atomic_dec(&fs_info->scrubs_logical_running);
+ wake_up(&fs_info->scrub_pause_wait);
+ if (progress)
+ memcpy(progress, &sctx->stat, sizeof(*progress));
+out:
+ scrub_workers_put(fs_info);
+out_free_ctx:
+ scrub_free_ctx(sctx);
+ return ret;
+}
+
void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
{
mutex_lock(&fs_info->scrub_lock);
diff --git a/fs/btrfs/scrub.h b/fs/btrfs/scrub.h
index 7639103ebf9d..22db0f71083e 100644
--- a/fs/btrfs/scrub.h
+++ b/fs/btrfs/scrub.h
@@ -6,6 +6,8 @@
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
u64 end, struct btrfs_scrub_progress *progress,
int readonly, int is_dev_replace);
+int btrfs_scrub_logical(struct btrfs_fs_info *fs_info, u64 start, u64 end,
+ struct btrfs_scrub_progress *progress, bool readonly);
void btrfs_scrub_pause(struct btrfs_fs_info *fs_info);
void btrfs_scrub_continue(struct btrfs_fs_info *fs_info);
int btrfs_scrub_cancel(struct btrfs_fs_info *info);
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index dbb8b96da50d..fa5cb3b3611b 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -186,8 +186,15 @@ struct btrfs_scrub_progress {
* Intermittent error. */
};
-#define BTRFS_SCRUB_READONLY 1
-#define BTRFS_SCRUB_SUPPORTED_FLAGS (BTRFS_SCRUB_READONLY)
+#define BTRFS_SCRUB_READONLY (1ULL << 0)
+
+/*
+ * Regular scrub is based on device, while with this flag, scrub is
+ * based on logical, and @devid would be ignored.
+ */
+#define BTRFS_SCRUB_LOGICAL (1ULL << 1)
+#define BTRFS_SCRUB_SUPPORTED_FLAGS (BTRFS_SCRUB_READONLY |\
+ BTRFS_SCRUB_LOGICAL)
struct btrfs_ioctl_scrub_args {
__u64 devid; /* in */
__u64 start; /* in */
--
2.41.0
^ permalink raw reply related [flat|nested] 11+ messages in thread* [PATCH RFC 3/5] btrfs: scrub: extract the common preparation before scrubbing a block group
2023-06-17 5:07 [PATCH RFC 0/5] btrfs: scrub: introduce SCRUB_LOGICAL flag Qu Wenruo
2023-06-17 5:07 ` [PATCH RFC 1/5] btrfs: scrub: make scrub_ctx::stripes dynamically allocated Qu Wenruo
2023-06-17 5:07 ` [PATCH RFC 2/5] btrfs: scrub: introduce the skeleton for logical-scrub Qu Wenruo
@ 2023-06-17 5:07 ` Qu Wenruo
2023-06-20 8:55 ` Johannes Thumshirn
2023-06-17 5:07 ` [PATCH RFC 4/5] btrfs: scrub: implement the basic extent iteration code Qu Wenruo
` (2 subsequent siblings)
5 siblings, 1 reply; 11+ messages in thread
From: Qu Wenruo @ 2023-06-17 5:07 UTC (permalink / raw)
To: linux-btrfs
Introduce a new helper, prepare_scrub_block_group(), to handle the
checks before scrubbing a block group.
The helper would be called by both scrub_enumerate_chunks() and
scrub_logical_chunks().
Signed-off-by: Qu Wenruo <wqu@suse.com>
---
fs/btrfs/scrub.c | 328 ++++++++++++++++++++++++++++++-----------------
1 file changed, 209 insertions(+), 119 deletions(-)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 598754ad7ce6..aa68cda5226f 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2349,6 +2349,141 @@ static int finish_extent_writes_for_zoned(struct btrfs_root *root,
return btrfs_commit_transaction(trans);
}
+/*
+ * Do the preparation before we scrub the block group.
+ *
+ * Return >0 if we don't need to scrub the block group.
+ * Return 0 if we have done the preparation and set @ro_set_ret.
+ * Return <0 for error, in which case the bg can not be scrubbed.
+ */
+static int prepare_scrub_block_group(struct scrub_ctx *sctx,
+ struct btrfs_block_group *bg,
+ bool *ro_set_ret)
+{
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ struct btrfs_root *root = fs_info->dev_root;
+ bool ro_set = false;
+ int ret;
+
+ if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
+ if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &bg->runtime_flags))
+ return 1;
+ }
+
+ /*
+ * Make sure that while we are scrubbing the corresponding block
+ * group doesn't get its logical address and its device extents
+ * reused for another block group, which can possibly be of a
+ * different type and different profile. We do this to prevent
+ * false error detections and crashes due to bogus attempts to
+ * repair extents.
+ */
+ spin_lock(&bg->lock);
+ if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) {
+ spin_unlock(&bg->lock);
+ return 1;
+ }
+ btrfs_freeze_block_group(bg);
+ spin_unlock(&bg->lock);
+
+ /*
+ * we need call btrfs_inc_block_group_ro() with scrubs_paused,
+ * to avoid deadlock caused by:
+ * btrfs_inc_block_group_ro()
+ * -> btrfs_wait_for_commit()
+ * -> btrfs_commit_transaction()
+ * -> btrfs_scrub_pause()
+ */
+ scrub_pause_on(fs_info);
+
+ /*
+ * Don't do chunk preallocation for scrub.
+ *
+ * This is especially important for SYSTEM bgs, or we can hit
+ * -EFBIG from btrfs_finish_chunk_alloc() like:
+ * 1. The only SYSTEM bg is marked RO.
+ * Since SYSTEM bg is small, that's pretty common.
+ * 2. New SYSTEM bg will be allocated
+ * Due to regular version will allocate new chunk.
+ * 3. New SYSTEM bg is empty and will get cleaned up
+ * Before cleanup really happens, it's marked RO again.
+ * 4. Empty SYSTEM bg get scrubbed
+ * We go back to 2.
+ *
+ * This can easily boost the amount of SYSTEM chunks if cleaner
+ * thread can't be triggered fast enough, and use up all space
+ * of btrfs_super_block::sys_chunk_array
+ *
+ * While for dev replace, we need to try our best to mark block
+ * group RO, to prevent race between:
+ * - Write duplication
+ * Contains latest data
+ * - Scrub copy
+ * Contains data from commit tree
+ *
+ * If target block group is not marked RO, nocow writes can
+ * be overwritten by scrub copy, causing data corruption.
+ * So for dev-replace, it's not allowed to continue if a block
+ * group is not RO.
+ */
+ ret = btrfs_inc_block_group_ro(bg, sctx->is_dev_replace);
+ if (!ret && sctx->is_dev_replace) {
+ ret = finish_extent_writes_for_zoned(root, bg);
+ if (ret) {
+ btrfs_dec_block_group_ro(bg);
+ scrub_pause_off(fs_info);
+ return ret;
+ }
+ }
+
+ if (ret == 0) {
+ ro_set = 1;
+ } else if (ret == -ENOSPC && !sctx->is_dev_replace &&
+ !(bg->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) {
+ /*
+ * btrfs_inc_block_group_ro return -ENOSPC when it
+ * failed in creating new chunk for metadata.
+ * It is not a problem for scrub, because
+ * metadata are always cowed, and our scrub paused
+ * commit_transactions.
+ *
+ * For RAID56 chunks, we have to mark them read-only
+ * for scrub, as later we would use our own cache
+ * out of RAID56 realm.
+ * Thus we want the RAID56 bg to be marked RO to
+ * prevent RMW from screwing up our cache.
+ */
+ ro_set = 0;
+ } else if (ret == -ETXTBSY) {
+ btrfs_warn(fs_info,
+ "skipping scrub of block group %llu due to active swapfile",
+ bg->start);
+ btrfs_unfreeze_block_group(bg);
+ scrub_pause_off(fs_info);
+ return ret;
+ } else {
+ btrfs_warn(fs_info,
+ "failed setting block group ro: %d", ret);
+ btrfs_unfreeze_block_group(bg);
+ scrub_pause_off(fs_info);
+ return ret;
+ }
+
+ /*
+ * Now the target block is marked RO, wait for nocow writes to
+ * finish before dev-replace.
+ * COW is fine, as COW never overwrites extents in commit tree.
+ */
+ if (sctx->is_dev_replace) {
+ btrfs_wait_nocow_writers(bg);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, bg->start,
+ bg->length);
+ }
+ scrub_pause_off(fs_info);
+ *ro_set_ret = ro_set;
+ return 0;
+}
+
static noinline_for_stack
int scrub_enumerate_chunks(struct scrub_ctx *sctx,
struct btrfs_device *scrub_dev, u64 start, u64 end)
@@ -2359,7 +2494,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
struct btrfs_root *root = fs_info->dev_root;
u64 chunk_offset;
int ret = 0;
- int ro_set;
int slot;
struct extent_buffer *l;
struct btrfs_key key;
@@ -2380,6 +2514,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
key.type = BTRFS_DEV_EXTENT_KEY;
while (1) {
+ bool ro_set = false;
u64 dev_extent_len;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -2461,127 +2596,20 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
goto skip;
}
- if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
- if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags)) {
- btrfs_put_block_group(cache);
- goto skip;
- }
+ ret = prepare_scrub_block_group(sctx, cache, &ro_set);
+ if (ret == -ETXTBSY) {
+ ret = 0;
+ goto skip_unfreeze;
}
-
- /*
- * Make sure that while we are scrubbing the corresponding block
- * group doesn't get its logical address and its device extents
- * reused for another block group, which can possibly be of a
- * different type and different profile. We do this to prevent
- * false error detections and crashes due to bogus attempts to
- * repair extents.
- */
- spin_lock(&cache->lock);
- if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) {
- spin_unlock(&cache->lock);
+ if (ret < 0) {
+ btrfs_put_block_group(cache);
+ break;
+ }
+ if (ret > 0) {
btrfs_put_block_group(cache);
goto skip;
}
- btrfs_freeze_block_group(cache);
- spin_unlock(&cache->lock);
- /*
- * we need call btrfs_inc_block_group_ro() with scrubs_paused,
- * to avoid deadlock caused by:
- * btrfs_inc_block_group_ro()
- * -> btrfs_wait_for_commit()
- * -> btrfs_commit_transaction()
- * -> btrfs_scrub_pause()
- */
- scrub_pause_on(fs_info);
-
- /*
- * Don't do chunk preallocation for scrub.
- *
- * This is especially important for SYSTEM bgs, or we can hit
- * -EFBIG from btrfs_finish_chunk_alloc() like:
- * 1. The only SYSTEM bg is marked RO.
- * Since SYSTEM bg is small, that's pretty common.
- * 2. New SYSTEM bg will be allocated
- * Due to regular version will allocate new chunk.
- * 3. New SYSTEM bg is empty and will get cleaned up
- * Before cleanup really happens, it's marked RO again.
- * 4. Empty SYSTEM bg get scrubbed
- * We go back to 2.
- *
- * This can easily boost the amount of SYSTEM chunks if cleaner
- * thread can't be triggered fast enough, and use up all space
- * of btrfs_super_block::sys_chunk_array
- *
- * While for dev replace, we need to try our best to mark block
- * group RO, to prevent race between:
- * - Write duplication
- * Contains latest data
- * - Scrub copy
- * Contains data from commit tree
- *
- * If target block group is not marked RO, nocow writes can
- * be overwritten by scrub copy, causing data corruption.
- * So for dev-replace, it's not allowed to continue if a block
- * group is not RO.
- */
- ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
- if (!ret && sctx->is_dev_replace) {
- ret = finish_extent_writes_for_zoned(root, cache);
- if (ret) {
- btrfs_dec_block_group_ro(cache);
- scrub_pause_off(fs_info);
- btrfs_put_block_group(cache);
- break;
- }
- }
-
- if (ret == 0) {
- ro_set = 1;
- } else if (ret == -ENOSPC && !sctx->is_dev_replace &&
- !(cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) {
- /*
- * btrfs_inc_block_group_ro return -ENOSPC when it
- * failed in creating new chunk for metadata.
- * It is not a problem for scrub, because
- * metadata are always cowed, and our scrub paused
- * commit_transactions.
- *
- * For RAID56 chunks, we have to mark them read-only
- * for scrub, as later we would use our own cache
- * out of RAID56 realm.
- * Thus we want the RAID56 bg to be marked RO to
- * prevent RMW from screwing up out cache.
- */
- ro_set = 0;
- } else if (ret == -ETXTBSY) {
- btrfs_warn(fs_info,
- "skipping scrub of block group %llu due to active swapfile",
- cache->start);
- scrub_pause_off(fs_info);
- ret = 0;
- goto skip_unfreeze;
- } else {
- btrfs_warn(fs_info,
- "failed setting block group ro: %d", ret);
- btrfs_unfreeze_block_group(cache);
- btrfs_put_block_group(cache);
- scrub_pause_off(fs_info);
- break;
- }
-
- /*
- * Now the target block is marked RO, wait for nocow writes to
- * finish before dev-replace.
- * COW is fine, as COW never overwrites extents in commit tree.
- */
- if (sctx->is_dev_replace) {
- btrfs_wait_nocow_writers(cache);
- btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
- cache->length);
- }
-
- scrub_pause_off(fs_info);
down_write(&dev_replace->rwsem);
dev_replace->cursor_right = found_key.offset + dev_extent_len;
dev_replace->cursor_left = found_key.offset;
@@ -2960,6 +2988,69 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
return ret;
}
+static int scrub_logical_chunks(struct scrub_ctx *sctx, u64 start, u64 end)
+{
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ u64 cur = start;
+ int ret = 0;
+
+ while (cur < end) {
+ struct btrfs_block_group *bg;
+ bool ro_set = false;
+
+ bg = btrfs_lookup_first_block_group(fs_info, cur);
+
+ /* No more block groups. */
+ if (!bg)
+ break;
+ if (bg->start > end)
+ break;
+
+ ret = prepare_scrub_block_group(sctx, bg, &ro_set);
+ if (ret == -ETXTBSY) {
+ ret = 0;
+ goto next;
+ }
+ if (ret > 0)
+ goto next;
+ if (ret < 0) {
+ btrfs_put_block_group(bg);
+ break;
+ }
+
+ /* The real work starts here. */
+ ret = -EOPNOTSUPP;
+
+ if (ro_set)
+ btrfs_dec_block_group_ro(bg);
+ /*
+ * We might have prevented the cleaner kthread from deleting
+ * this block group if it was already unused because we raced
+ * and set it to RO mode first. So add it back to the unused
+ * list, otherwise it might not ever be deleted unless a manual
+ * balance is triggered or it becomes used and unused again.
+ */
+ spin_lock(&bg->lock);
+ if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags) &&
+ !bg->ro && bg->reserved == 0 && bg->used == 0) {
+ spin_unlock(&bg->lock);
+ if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
+ btrfs_discard_queue_work(&fs_info->discard_ctl, bg);
+ else
+ btrfs_mark_bg_unused(bg);
+ } else {
+ spin_unlock(&bg->lock);
+ }
+ btrfs_unfreeze_block_group(bg);
+next:
+ cur = bg->start + bg->length;
+ btrfs_put_block_group(bg);
+ if (ret < 0)
+ break;
+ }
+ return ret;
+}
+
int btrfs_scrub_logical(struct btrfs_fs_info *fs_info, u64 start, u64 end,
struct btrfs_scrub_progress *progress, bool readonly)
{
@@ -3006,8 +3097,7 @@ int btrfs_scrub_logical(struct btrfs_fs_info *fs_info, u64 start, u64 end,
atomic_inc(&fs_info->scrubs_logical_running);
mutex_unlock(&fs_info->scrub_lock);
- /* The main work would be implemented. */
- ret = -EOPNOTSUPP;
+ ret = scrub_logical_chunks(sctx, start, end);
atomic_dec(&fs_info->scrubs_running);
atomic_dec(&fs_info->scrubs_logical_running);
--
2.41.0
^ permalink raw reply related [flat|nested] 11+ messages in thread* Re: [PATCH RFC 3/5] btrfs: scrub: extract the common preparation before scrubbing a block group
2023-06-17 5:07 ` [PATCH RFC 3/5] btrfs: scrub: extract the common preparation before scrubbing a block group Qu Wenruo
@ 2023-06-20 8:55 ` Johannes Thumshirn
0 siblings, 0 replies; 11+ messages in thread
From: Johannes Thumshirn @ 2023-06-20 8:55 UTC (permalink / raw)
To: Qu Wenruo, linux-btrfs@vger.kernel.org
On 17.06.23 07:08, Qu Wenruo wrote:
> Introduce a new helper, prepare_scrub_block_group(), to handle the
> checks before scrubbing a block group.
>
> The helper would be called by both scrub_enumerate_chunks() and
> scrub_logical_chunks().
I'd personally split out the scrub_logical_chunks() part into a new patch
to make it easier to read.
>
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
> fs/btrfs/scrub.c | 328 ++++++++++++++++++++++++++++++-----------------
> 1 file changed, 209 insertions(+), 119 deletions(-)
>
> diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
> index 598754ad7ce6..aa68cda5226f 100644
> --- a/fs/btrfs/scrub.c
> +++ b/fs/btrfs/scrub.c
> @@ -2349,6 +2349,141 @@ static int finish_extent_writes_for_zoned(struct btrfs_root *root,
> return btrfs_commit_transaction(trans);
> }
>
> +/*
> + * Do the preparation before we scrub the block group.
> + *
> + * Return >0 if we don't need to scrub the block group.
> + * Return 0 if we have done the preparation and set @ro_set_ret.
> + * Return <0 for error and can not scrub the bg.
> + */
> +static int prepare_scrub_block_group(struct scrub_ctx *sctx,
> + struct btrfs_block_group *bg,
> + bool *ro_set_ret)
> +{
> + struct btrfs_fs_info *fs_info = sctx->fs_info;
> + struct btrfs_root *root = fs_info->dev_root;
> + bool ro_set = false;
> + int ret;
> +
> + if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
> + if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &bg->runtime_flags))
> + return 1;
> + }
> +
> + /*
> + * Make sure that while we are scrubbing the corresponding block
> + * group doesn't get its logical address and its device extents
> + * reused for another block group, which can possibly be of a
> + * different type and different profile. We do this to prevent
> + * false error detections and crashes due to bogus attempts to
> + * repair extents.
> + */
> + spin_lock(&bg->lock);
> + if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) {
> + spin_unlock(&bg->lock);
> + return 1;
> + }
> + btrfs_freeze_block_group(bg);
> + spin_unlock(&bg->lock);
> +
> + /*
> + * we need call btrfs_inc_block_group_ro() with scrubs_paused,
> + * to avoid deadlock caused by:
> + * btrfs_inc_block_group_ro()
> + * -> btrfs_wait_for_commit()
> + * -> btrfs_commit_transaction()
> + * -> btrfs_scrub_pause()
> + */
> + scrub_pause_on(fs_info);
> +
> + /*
> + * Don't do chunk preallocation for scrub.
> + *
> + * This is especially important for SYSTEM bgs, or we can hit
> + * -EFBIG from btrfs_finish_chunk_alloc() like:
> + * 1. The only SYSTEM bg is marked RO.
> + * Since SYSTEM bg is small, that's pretty common.
> + * 2. New SYSTEM bg will be allocated
> + * Due to regular version will allocate new chunk.
> + * 3. New SYSTEM bg is empty and will get cleaned up
> + * Before cleanup really happens, it's marked RO again.
> + * 4. Empty SYSTEM bg get scrubbed
> + * We go back to 2.
> + *
> + * This can easily boost the amount of SYSTEM chunks if cleaner
> + * thread can't be triggered fast enough, and use up all space
> + * of btrfs_super_block::sys_chunk_array
> + *
> + * While for dev replace, we need to try our best to mark block
> + * group RO, to prevent race between:
> + * - Write duplication
> + * Contains latest data
> + * - Scrub copy
> + * Contains data from commit tree
> + *
> + * If target block group is not marked RO, nocow writes can
> + * be overwritten by scrub copy, causing data corruption.
> + * So for dev-replace, it's not allowed to continue if a block
> + * group is not RO.
> + */
> + ret = btrfs_inc_block_group_ro(bg, sctx->is_dev_replace);
> + if (!ret && sctx->is_dev_replace) {
> + ret = finish_extent_writes_for_zoned(root, bg);
> + if (ret) {
> + btrfs_dec_block_group_ro(bg);
> + scrub_pause_off(fs_info);
> + return ret;
> + }
> + }
> +
> + if (ret == 0) {
> + ro_set = 1;
> + } else if (ret == -ENOSPC && !sctx->is_dev_replace &&
> + !(bg->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) {
> + /*
> + * btrfs_inc_block_group_ro return -ENOSPC when it
> + * failed in creating new chunk for metadata.
> + * It is not a problem for scrub, because
> + * metadata are always cowed, and our scrub paused
> + * commit_transactions.
> + *
> + * For RAID56 chunks, we have to mark them read-only
> + * for scrub, as later we would use our own cache
> + * out of RAID56 realm.
> + * Thus we want the RAID56 bg to be marked RO to
> + * prevent RMW from screwing up out cache.
> + */
> + ro_set = 0;
> + } else if (ret == -ETXTBSY) {
> + btrfs_warn(fs_info,
> + "skipping scrub of block group %llu due to active swapfile",
> + bg->start);
> + btrfs_unfreeze_block_group(bg);
> + scrub_pause_off(fs_info);
> + return ret;
> + } else {
> + btrfs_warn(fs_info,
> + "failed setting block group ro: %d", ret);
> + btrfs_unfreeze_block_group(bg);
> + scrub_pause_off(fs_info);
> + return ret;
> + }
> +
> + /*
> + * Now the target block is marked RO, wait for nocow writes to
> + * finish before dev-replace.
> + * COW is fine, as COW never overwrites extents in commit tree.
> + */
> + if (sctx->is_dev_replace) {
> + btrfs_wait_nocow_writers(bg);
> + btrfs_wait_ordered_roots(fs_info, U64_MAX, bg->start,
> + bg->length);
> + }
> + scrub_pause_off(fs_info);
> + *ro_set_ret = ro_set;
> + return 0;
> +}
> +
> static noinline_for_stack
> int scrub_enumerate_chunks(struct scrub_ctx *sctx,
> struct btrfs_device *scrub_dev, u64 start, u64 end)
> @@ -2359,7 +2494,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
> struct btrfs_root *root = fs_info->dev_root;
> u64 chunk_offset;
> int ret = 0;
> - int ro_set;
> int slot;
> struct extent_buffer *l;
> struct btrfs_key key;
> @@ -2380,6 +2514,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
> key.type = BTRFS_DEV_EXTENT_KEY;
>
> while (1) {
> + bool ro_set = false;
> u64 dev_extent_len;
>
> ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
> @@ -2461,127 +2596,20 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
> goto skip;
> }
>
> - if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
> - if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags)) {
> - btrfs_put_block_group(cache);
> - goto skip;
> - }
> + ret = prepare_scrub_block_group(sctx, cache, &ro_set);
> + if (ret == -ETXTBSY) {
> + ret = 0;
> + goto skip_unfreeze;
> }
> -
> - /*
> - * Make sure that while we are scrubbing the corresponding block
> - * group doesn't get its logical address and its device extents
> - * reused for another block group, which can possibly be of a
> - * different type and different profile. We do this to prevent
> - * false error detections and crashes due to bogus attempts to
> - * repair extents.
> - */
> - spin_lock(&cache->lock);
> - if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) {
> - spin_unlock(&cache->lock);
> + if (ret < 0) {
> + btrfs_put_block_group(cache);
> + break;
> + }
> + if (ret > 0) {
> btrfs_put_block_group(cache);
> goto skip;
> }
> - btrfs_freeze_block_group(cache);
> - spin_unlock(&cache->lock);
>
> - /*
> - * we need call btrfs_inc_block_group_ro() with scrubs_paused,
> - * to avoid deadlock caused by:
> - * btrfs_inc_block_group_ro()
> - * -> btrfs_wait_for_commit()
> - * -> btrfs_commit_transaction()
> - * -> btrfs_scrub_pause()
> - */
> - scrub_pause_on(fs_info);
> -
> - /*
> - * Don't do chunk preallocation for scrub.
> - *
> - * This is especially important for SYSTEM bgs, or we can hit
> - * -EFBIG from btrfs_finish_chunk_alloc() like:
> - * 1. The only SYSTEM bg is marked RO.
> - * Since SYSTEM bg is small, that's pretty common.
> - * 2. New SYSTEM bg will be allocated
> - * Due to regular version will allocate new chunk.
> - * 3. New SYSTEM bg is empty and will get cleaned up
> - * Before cleanup really happens, it's marked RO again.
> - * 4. Empty SYSTEM bg get scrubbed
> - * We go back to 2.
> - *
> - * This can easily boost the amount of SYSTEM chunks if cleaner
> - * thread can't be triggered fast enough, and use up all space
> - * of btrfs_super_block::sys_chunk_array
> - *
> - * While for dev replace, we need to try our best to mark block
> - * group RO, to prevent race between:
> - * - Write duplication
> - * Contains latest data
> - * - Scrub copy
> - * Contains data from commit tree
> - *
> - * If target block group is not marked RO, nocow writes can
> - * be overwritten by scrub copy, causing data corruption.
> - * So for dev-replace, it's not allowed to continue if a block
> - * group is not RO.
> - */
> - ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
> - if (!ret && sctx->is_dev_replace) {
> - ret = finish_extent_writes_for_zoned(root, cache);
> - if (ret) {
> - btrfs_dec_block_group_ro(cache);
> - scrub_pause_off(fs_info);
> - btrfs_put_block_group(cache);
> - break;
> - }
> - }
> -
> - if (ret == 0) {
> - ro_set = 1;
> - } else if (ret == -ENOSPC && !sctx->is_dev_replace &&
> - !(cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) {
> - /*
> - * btrfs_inc_block_group_ro return -ENOSPC when it
> - * failed in creating new chunk for metadata.
> - * It is not a problem for scrub, because
> - * metadata are always cowed, and our scrub paused
> - * commit_transactions.
> - *
> - * For RAID56 chunks, we have to mark them read-only
> - * for scrub, as later we would use our own cache
> - * out of RAID56 realm.
> - * Thus we want the RAID56 bg to be marked RO to
> - * prevent RMW from screwing up out cache.
> - */
> - ro_set = 0;
> - } else if (ret == -ETXTBSY) {
> - btrfs_warn(fs_info,
> - "skipping scrub of block group %llu due to active swapfile",
> - cache->start);
> - scrub_pause_off(fs_info);
> - ret = 0;
> - goto skip_unfreeze;
> - } else {
> - btrfs_warn(fs_info,
> - "failed setting block group ro: %d", ret);
> - btrfs_unfreeze_block_group(cache);
> - btrfs_put_block_group(cache);
> - scrub_pause_off(fs_info);
> - break;
> - }
> -
> - /*
> - * Now the target block is marked RO, wait for nocow writes to
> - * finish before dev-replace.
> - * COW is fine, as COW never overwrites extents in commit tree.
> - */
> - if (sctx->is_dev_replace) {
> - btrfs_wait_nocow_writers(cache);
> - btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
> - cache->length);
> - }
> -
> - scrub_pause_off(fs_info);
> down_write(&dev_replace->rwsem);
> dev_replace->cursor_right = found_key.offset + dev_extent_len;
> dev_replace->cursor_left = found_key.offset;
> @@ -2960,6 +2988,69 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
> return ret;
> }
>
> +static int scrub_logical_chunks(struct scrub_ctx *sctx, u64 start, u64 end)
> +{
> + struct btrfs_fs_info *fs_info = sctx->fs_info;
> + u64 cur = start;
> + int ret = 0;
> +
> + while (cur < end) {
> + struct btrfs_block_group *bg;
> + bool ro_set = false;
> +
> + bg = btrfs_lookup_first_block_group(fs_info, cur);
> +
> + /* No more block groups. */
> + if (!bg)
> + break;
> + if (bg->start > end)
> + break;
> +
> + ret = prepare_scrub_block_group(sctx, bg, &ro_set);
> + if (ret == -ETXTBSY) {
> + ret = 0;
> + goto next;
> + }
> + if (ret > 0)
> + goto next;
> + if (ret < 0) {
> + btrfs_put_block_group(bg);
> + break;
> + }
> +
> + /* The real work starts here. */
> + ret = -EOPNOTSUPP;
> +
> + if (ro_set)
> + btrfs_dec_block_group_ro(bg);
> + /*
> + * We might have prevented the cleaner kthread from deleting
> + * this block group if it was already unused because we raced
> + * and set it to RO mode first. So add it back to the unused
> + * list, otherwise it might not ever be deleted unless a manual
> + * balance is triggered or it becomes used and unused again.
> + */
> + spin_lock(&bg->lock);
> + if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags) &&
> + !bg->ro && bg->reserved == 0 && bg->used == 0) {
> + spin_unlock(&bg->lock);
> + if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
> + btrfs_discard_queue_work(&fs_info->discard_ctl, bg);
> + else
> + btrfs_mark_bg_unused(bg);
> + } else {
> + spin_unlock(&bg->lock);
> + }
> + btrfs_unfreeze_block_group(bg);
> +next:
> + cur = bg->start + bg->length;
> + btrfs_put_block_group(bg);
> + if (ret < 0)
> + break;
> + }
> + return ret;
> +}
> +
> int btrfs_scrub_logical(struct btrfs_fs_info *fs_info, u64 start, u64 end,
> struct btrfs_scrub_progress *progress, bool readonly)
> {
> @@ -3006,8 +3097,7 @@ int btrfs_scrub_logical(struct btrfs_fs_info *fs_info, u64 start, u64 end,
> atomic_inc(&fs_info->scrubs_logical_running);
> mutex_unlock(&fs_info->scrub_lock);
>
> - /* The main work would be implemented. */
> - ret = -EOPNOTSUPP;
> + ret = scrub_logical_chunks(sctx, start, end);
>
> atomic_dec(&fs_info->scrubs_running);
> atomic_dec(&fs_info->scrubs_logical_running);
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH RFC 4/5] btrfs: scrub: implement the basic extent iteration code
2023-06-17 5:07 [PATCH RFC 0/5] btrfs: scrub: introduce SCRUB_LOGICAL flag Qu Wenruo
` (2 preceding siblings ...)
2023-06-17 5:07 ` [PATCH RFC 3/5] btrfs: scrub: extract the common preparation before scrubbing a block group Qu Wenruo
@ 2023-06-17 5:07 ` Qu Wenruo
2023-06-17 5:07 ` [PATCH RFC 5/5] btrfs: scrub: implement the repair part of scrub logical Qu Wenruo
2023-06-17 5:12 ` [PATCH RFC 0/5] btrfs: scrub: introduce SCRUB_LOGICAL flag Qu Wenruo
5 siblings, 0 replies; 11+ messages in thread
From: Qu Wenruo @ 2023-06-17 5:07 UTC (permalink / raw)
To: linux-btrfs
Unlike the per-device scrub code, the logical bytenr based scrub code
makes it much easier to iterate the extents.
The difference is mainly in the stripe queueing part, as we need to
queue @nr_copies stripes at once.
So here we introduce a helper, scrub_copy_stripe(), and fill the first
stripe, then use that helper to fill the remaining copies.
For now, we still use the same flush_scrub_stripes(), but the repair
part is less efficient.
Signed-off-by: Qu Wenruo <wqu@suse.com>
---
fs/btrfs/scrub.c | 163 ++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 160 insertions(+), 3 deletions(-)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index aa68cda5226f..2500737adff1 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1465,7 +1465,7 @@ static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe)
static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
struct btrfs_device *dev, u64 physical,
int mirror_num, u64 logical_start,
- u32 logical_len,
+ u64 logical_len,
struct scrub_stripe *stripe)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
@@ -2988,6 +2988,164 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
return ret;
}
+static void scrub_copy_stripe(const struct scrub_stripe *src,
+ struct scrub_stripe *dst)
+{
+ struct scrub_ctx *sctx = src->sctx;
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+
+ /* This function should only be called for logical scrub. */
+ ASSERT(sctx->scrub_logical);
+ scrub_reset_stripe(dst);
+
+ dst->bg = src->bg;
+ dst->dev = NULL;
+ dst->physical = 0;
+ dst->logical = src->logical;
+ dst->mirror_num = src->mirror_num;
+
+ bitmap_copy(&dst->extent_sector_bitmap, &src->extent_sector_bitmap,
+ src->nr_sectors);
+ memcpy(dst->csums, src->csums,
+ (BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits) *
+ fs_info->csum_size);
+
+ for (int i = 0; i < src->nr_sectors; i++) {
+ dst->sectors[i].is_metadata = src->sectors[i].is_metadata;
+ if (src->sectors[i].is_metadata)
+ dst->sectors[i].generation = src->sectors[i].generation;
+ else if (src->sectors[i].csum)
+ dst->sectors[i].csum = dst->csums + i * fs_info->csum_size;
+ }
+ dst->nr_data_extents = src->nr_data_extents;
+ dst->nr_meta_extents = src->nr_meta_extents;
+ set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &dst->state);
+}
+
+/*
+ * Unlike queue_scrub_stripe() which only queue one stripe, this queue all
+ * mirrors for non-RAID56 profiles, or the full stripe for RAID56.
+ */
+static int queue_scrub_logical_stripes(struct scrub_ctx *sctx,
+ struct btrfs_block_group *bg, u64 logical)
+{
+ const int raid_index = btrfs_bg_flags_to_raid_index(bg->flags);
+ const int nr_copies = btrfs_raid_array[raid_index].ncopies;
+ const int old_cur = sctx->cur_stripe;
+ struct scrub_stripe *stripe;
+ int ret;
+
+ ASSERT(sctx->stripes);
+ ASSERT(sctx->nr_stripes);
+
+ if (sctx->cur_stripe + nr_copies > sctx->nr_stripes) {
+ ret = flush_scrub_stripes(sctx);
+ if (ret < 0)
+ return ret;
+ }
+
+ stripe = &sctx->stripes[old_cur];
+
+ scrub_reset_stripe(stripe);
+ ret = scrub_find_fill_first_stripe(bg, NULL, 0, 1,
+ logical, bg->start + bg->length - logical, stripe);
+ /* Either >0 as no more extents or <0 for error. */
+ if (ret)
+ return ret;
+
+ /* For the remaining slots, just copy the above mirror. */
+ for (int i = 1; i < nr_copies; i++) {
+ struct scrub_stripe *dst = &sctx->stripes[old_cur + i];
+
+ scrub_copy_stripe(stripe, dst);
+ dst->mirror_num = i + 1;
+ }
+ sctx->cur_stripe += nr_copies;
+ return 0;
+}
+
+static int scrub_logical_one_chunk(struct scrub_ctx *sctx,
+ struct btrfs_block_group *bg)
+{
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ struct extent_map_tree *map_tree = &fs_info->mapping_tree;
+ struct extent_map *em;
+ u64 cur = bg->start;
+ const int raid_index = btrfs_bg_flags_to_raid_index(bg->flags);
+ const int nr_copies = btrfs_raid_array[raid_index].ncopies;
+ int nr_stripes;
+ int ret = 0;
+ int flush_ret;
+
+ read_lock(&map_tree->lock);
+ em = lookup_extent_mapping(map_tree, bg->start, bg->length);
+ read_unlock(&map_tree->lock);
+ if (!em) {
+ /*
+ * Might have been an unused block group deleted by the cleaner
+ * kthread or relocation.
+ */
+ spin_lock(&bg->lock);
+ if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags))
+ ret = -EINVAL;
+ spin_unlock(&bg->lock);
+ return ret;
+ }
+
+ scrub_blocked_if_needed(fs_info);
+
+ /* RAID56 not yet supported. */
+ if (bg->flags & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ nr_stripes = SCRUB_STRIPES_PER_SCTX * nr_copies;
+ ret = alloc_scrub_stripes(sctx, nr_stripes);
+ if (ret < 0)
+ goto out;
+
+ while (cur < bg->start + bg->length) {
+ /* Canceled? */
+ if (atomic_read(&fs_info->scrub_cancel_req) ||
+ atomic_read(&sctx->cancel_req)) {
+ ret = -ECANCELED;
+ break;
+ }
+ /* Paused? */
+ if (atomic_read(&fs_info->scrub_pause_req)) {
+ /* Push queued extents */
+ scrub_blocked_if_needed(fs_info);
+ }
+ /* Block group removed? */
+ spin_lock(&bg->lock);
+ if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) {
+ spin_unlock(&bg->lock);
+ ret = 0;
+ break;
+ }
+ spin_unlock(&bg->lock);
+
+ ret = queue_scrub_logical_stripes(sctx, bg, cur);
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ if (ret < 0)
+ break;
+ ASSERT(sctx->cur_stripe >= 1);
+ cur = sctx->stripes[sctx->cur_stripe - 1].logical + BTRFS_STRIPE_LEN;
+ }
+out:
+ flush_ret = flush_scrub_stripes(sctx);
+ if (!ret)
+ ret = flush_ret;
+ free_scrub_stripes(sctx);
+ free_extent_map(em);
+ return ret;
+
+}
+
static int scrub_logical_chunks(struct scrub_ctx *sctx, u64 start, u64 end)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
@@ -3018,8 +3176,7 @@ static int scrub_logical_chunks(struct scrub_ctx *sctx, u64 start, u64 end)
break;
}
- /* The real work starts here. */
- ret = -EOPNOTSUPP;
+ ret = scrub_logical_one_chunk(sctx, bg);
if (ro_set)
btrfs_dec_block_group_ro(bg);
--
2.41.0
^ permalink raw reply related [flat|nested] 11+ messages in thread* [PATCH RFC 5/5] btrfs: scrub: implement the repair part of scrub logical
2023-06-17 5:07 [PATCH RFC 0/5] btrfs: scrub: introduce SCRUB_LOGICAL flag Qu Wenruo
` (3 preceding siblings ...)
2023-06-17 5:07 ` [PATCH RFC 4/5] btrfs: scrub: implement the basic extent iteration code Qu Wenruo
@ 2023-06-17 5:07 ` Qu Wenruo
2023-06-20 9:24 ` Johannes Thumshirn
2023-06-17 5:12 ` [PATCH RFC 0/5] btrfs: scrub: introduce SCRUB_LOGICAL flag Qu Wenruo
5 siblings, 1 reply; 11+ messages in thread
From: Qu Wenruo @ 2023-06-17 5:07 UTC (permalink / raw)
To: linux-btrfs
The repair part of scrub logical is done differently compared to the
per-device counterpart:
- We read out all mirrors in one go
Since it's no longer per-device, we just read out all mirrors.
- Find a good mirror for the same sectornr of all mirrors
- Copy the good content to any corrupted sector
This has several advantages:
- Less IO wait
Since all IOs are submitted at the very beginning, we avoid the
read-then-wait cycle of the per-device scrub.
This applies to both read and write part.
This needs some changes to the per-device scrub code though:
- Call the scrub_verify_one_stripe() inside scrub_read_endio()
This is to improve the performance, as we can have csum verification
per-mirror.
- Do not queue scrub_stripe_read_repair_worker() workload for
scrub_logical
Since we don't need to go per-device repair path.
Signed-off-by: Qu Wenruo <wqu@suse.com>
---
fs/btrfs/scrub.c | 141 ++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 133 insertions(+), 8 deletions(-)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 2500737adff1..cdde01d55e1a 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -988,7 +988,6 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work)
ASSERT(stripe->mirror_num > 0);
wait_scrub_stripe_io(stripe);
- scrub_verify_one_stripe(stripe, stripe->extent_sector_bitmap);
/* Save the initial failed bitmap for later repair and report usage. */
stripe->init_error_bitmap = stripe->error_bitmap;
stripe->init_nr_io_errors = bitmap_weight(&stripe->io_error_bitmap,
@@ -1061,9 +1060,12 @@ static void scrub_read_endio(struct btrfs_bio *bbio)
}
bio_put(&bbio->bio);
if (atomic_dec_and_test(&stripe->pending_io)) {
+ scrub_verify_one_stripe(stripe, stripe->extent_sector_bitmap);
wake_up(&stripe->io_wait);
- INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker);
- queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work);
+ if (!stripe->sctx->scrub_logical) {
+ INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker);
+ queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work);
+ }
}
}
@@ -1649,7 +1651,127 @@ static bool stripe_has_metadata_error(struct scrub_stripe *stripe)
return false;
}
-static int flush_scrub_stripes(struct scrub_ctx *sctx)
+/*
+ * Unlike the per-device repair, we have all mirrors read out already.
+ *
+ * Thus we only need to find out the good mirror, and copy the content to
+ * any bad sectors.
+ */
+static void repair_one_mirror_group(struct scrub_ctx *sctx, int start_stripe,
+ int ncopies)
+{
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ struct scrub_stripe *first_stripe = &sctx->stripes[start_stripe];
+ struct scrub_stripe *cur_stripe;
+ const u32 sectorsize = fs_info->sectorsize;
+ int sectornr;
+
+ ASSERT(start_stripe + ncopies <= sctx->cur_stripe);
+
+ for_each_set_bit(sectornr, &first_stripe->extent_sector_bitmap,
+ first_stripe->nr_sectors) {
+ struct scrub_stripe *good_stripe;
+ int good_mirror = -1;
+
+ for (int cur_mirror = start_stripe;
+ cur_mirror < start_stripe + ncopies; cur_mirror++) {
+ cur_stripe = &sctx->stripes[cur_mirror];
+
+ if (!test_bit(sectornr, &cur_stripe->error_bitmap)) {
+ good_mirror = cur_mirror;
+ break;
+ }
+ }
+ /* No good mirror found, this vertical stripe can not be repaired. */
+ if (good_mirror < 0)
+ continue;
+
+ good_stripe = &sctx->stripes[good_mirror];
+
+ for (int cur_mirror = start_stripe;
+ cur_mirror < start_stripe + ncopies; cur_mirror++) {
+ cur_stripe = &sctx->stripes[cur_mirror];
+
+ if (!test_bit(sectornr, &cur_stripe->error_bitmap))
+ continue;
+ /* Repair from the good mirror. */
+ memcpy_page(scrub_stripe_get_page(cur_stripe, sectornr),
+ scrub_stripe_get_page_offset(cur_stripe, sectornr),
+ scrub_stripe_get_page(good_stripe, sectornr),
+ scrub_stripe_get_page_offset(good_stripe, sectornr),
+ sectorsize);
+ clear_bit(sectornr, &cur_stripe->error_bitmap);
+ clear_bit(sectornr, &cur_stripe->io_error_bitmap);
+ if (cur_stripe->sectors[sectornr].is_metadata)
+ clear_bit(sectornr, &cur_stripe->meta_error_bitmap);
+ else
+ clear_bit(sectornr, &cur_stripe->csum_error_bitmap);
+ }
+ }
+ for (int cur_mirror = start_stripe; cur_mirror < start_stripe + ncopies;
+ cur_mirror++) {
+ cur_stripe = &sctx->stripes[cur_mirror];
+ set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &cur_stripe->state);
+ scrub_stripe_report_errors(sctx, cur_stripe);
+ wake_up(&cur_stripe->repair_wait);
+
+ if (btrfs_is_zoned(fs_info)) {
+ if (!bitmap_empty(&cur_stripe->init_error_bitmap,
+ cur_stripe->nr_sectors)) {
+ btrfs_repair_one_zone(fs_info, cur_stripe->logical);
+ break;
+ }
+ }
+ if (!sctx->readonly) {
+ unsigned long repaired;
+
+ bitmap_andnot(&repaired, &cur_stripe->init_error_bitmap,
+ &cur_stripe->error_bitmap,
+ cur_stripe->nr_sectors);
+ scrub_write_sectors(sctx, cur_stripe, repaired, false);
+ }
+ }
+ /* Wait for above writeback to finish. */
+ for (int cur_mirror = start_stripe; cur_mirror < start_stripe + ncopies;
+ cur_mirror++) {
+ cur_stripe = &sctx->stripes[cur_mirror];
+
+ wait_scrub_stripe_io(cur_stripe);
+ }
+}
+
+static int handle_logical_stripes(struct scrub_ctx *sctx,
+ struct btrfs_block_group *bg)
+{
+ const int nr_stripes = sctx->cur_stripe;
+ const int raid_index = btrfs_bg_flags_to_raid_index(bg->flags);
+ const int ncopies = btrfs_raid_array[raid_index].ncopies;
+ struct scrub_stripe *stripe;
+
+ for (int i = 0 ; i < nr_stripes; i++) {
+ stripe = &sctx->stripes[i];
+
+ wait_scrub_stripe_io(stripe);
+
+ /* Save the initial failed bitmap for later repair and report usage. */
+ stripe->init_error_bitmap = stripe->error_bitmap;
+ stripe->init_nr_io_errors =
+ bitmap_weight(&stripe->io_error_bitmap, stripe->nr_sectors);
+ stripe->init_nr_csum_errors =
+ bitmap_weight(&stripe->csum_error_bitmap, stripe->nr_sectors);
+ stripe->init_nr_meta_errors =
+ bitmap_weight(&stripe->meta_error_bitmap, stripe->nr_sectors);
+ }
+
+ for (int i = 0 ; i < nr_stripes; i += ncopies)
+ repair_one_mirror_group(sctx, i, ncopies);
+ sctx->cur_stripe = 0;
+
+ return 0;
+}
+
+static int flush_scrub_stripes(struct scrub_ctx *sctx,
+ struct btrfs_block_group *bg)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct scrub_stripe *stripe;
@@ -1660,6 +1782,9 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx)
if (!nr_stripes)
return 0;
+ if (sctx->scrub_logical)
+ return handle_logical_stripes(sctx, bg);
+
ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &sctx->stripes[0].state));
scrub_throttle_dev_io(sctx, sctx->stripes[0].dev,
@@ -1756,7 +1881,7 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *
/* No available slot, submit all stripes and wait for them. */
if (sctx->cur_stripe >= sctx->nr_stripes) {
- ret = flush_scrub_stripes(sctx);
+ ret = flush_scrub_stripes(sctx, bg);
if (ret < 0)
return ret;
}
@@ -2256,7 +2381,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
break;
}
out:
- ret2 = flush_scrub_stripes(sctx);
+ ret2 = flush_scrub_stripes(sctx, bg);
if (!ret)
ret = ret2;
if (sctx->raid56_data_stripes) {
@@ -3039,7 +3164,7 @@ static int queue_scrub_logical_stripes(struct scrub_ctx *sctx,
ASSERT(sctx->nr_stripes);
if (sctx->cur_stripe + nr_copies > sctx->nr_stripes) {
- ret = flush_scrub_stripes(sctx);
+ ret = flush_scrub_stripes(sctx, bg);
if (ret < 0)
return ret;
}
@@ -3137,7 +3262,7 @@ static int scrub_logical_one_chunk(struct scrub_ctx *sctx,
cur = sctx->stripes[sctx->cur_stripe - 1].logical + BTRFS_STRIPE_LEN;
}
out:
- flush_ret = flush_scrub_stripes(sctx);
+ flush_ret = flush_scrub_stripes(sctx, bg);
if (!ret)
ret = flush_ret;
free_scrub_stripes(sctx);
--
2.41.0
^ permalink raw reply related [flat|nested] 11+ messages in thread* Re: [PATCH RFC 0/5] btrfs: scrub: introduce SCRUB_LOGICAL flag
2023-06-17 5:07 [PATCH RFC 0/5] btrfs: scrub: introduce SCRUB_LOGICAL flag Qu Wenruo
` (4 preceding siblings ...)
2023-06-17 5:07 ` [PATCH RFC 5/5] btrfs: scrub: implement the repair part of scrub logical Qu Wenruo
@ 2023-06-17 5:12 ` Qu Wenruo
5 siblings, 0 replies; 11+ messages in thread
From: Qu Wenruo @ 2023-06-17 5:12 UTC (permalink / raw)
To: linux-btrfs
On 2023/6/17 13:07, Qu Wenruo wrote:
> [BACKGROUND]
> Scrub originally works in a per-device basis, that means if we want to
> scrub the full fs, we would queue a scrub for each writeable device.
>
> This is normally fine, but some behavior is not ideal like the
> following:
> X X+16K X+32K
> Mirror 1 |XXXXXXX| |
> Mirror 2 | |XXXXXXX|
>
> When scrubbing mirror 1, we found [X, X+16K) has corruption, then we
> will try to read from mirror 2 and repair using the correct data.
>
> This applies to range [X+16K, X+32K) of mirror 2, causing the good copy
> to be read twice, which may or may not be a big deal.
>
> But the problem can easily go crazy for RAID56, as its repair requires
> the full stripe to be read out, as does its P/Q verification.
>
>
> There are also some other minor problems, like due to the difference in
> the disk read speed, we may be scrubbing different block groups on
> different devices.
> This can lead to reduced available space and higher chance of false
> -ENOSPC.
>
> [ENHANCEMENT]
> Instead of the existing per-device scrub, this patchset introduce a new
> flag, SCRUB_LOGICAL, to do logical address space based scrub.
>
> Unlike per-device scrub, this new flag would make scrub a per-fs
> operation.
>
> This allows us to scrub the different mirrors in one go, and avoid
> unnecessary read to repair the corruption.
>
> This also makes user space handling much simpler, just one ioctl to
> scrub the full fs, without the current multi-thread scrub code.
>
> [TODO]
> The long todo list is the main reason for RFC:
>
> - Missing RAID56 handling
> Unlike pure mirror based profiles, RAID56 repair now needs to be
> handled inside scrub (or some new raid interfaces to handle fully
> cached stripes).
>
> This can be some extra investment, and add an exception for the
> elegant and simpler mirror based read-repair path.
>
> - Better progs integration
> In theory we can silently try SCRUB_LOGICAL first, if the kernel
> doesn't support it, then fallback to the old multi-device scrub.
>
> But for current testing, a dedicated option is still assigned for
> scrub subcommand.
>
> And currently it only supports full report, no summary nor scrub file
> support.
>
> - More testing
> Currently only very basic tests done, no stress tests yet.
Forgot one more feature missing:
- IO speed limit through sysfs interface
  The old interface is a per-device one, which cannot be integrated
  easily into the per-fs scrub.
Thanks,
Qu
>
> Qu Wenruo (5):
> btrfs: scrub: make scrub_ctx::stripes dynamically allocated
> btrfs: scrub: introduce the skeleton for logical-scrub
> btrfs: scrub: extract the common preparation before scrubbing a block
> group
> btrfs: scrub: implement the basic extent iteration code
> btrfs: scrub: implement the repair part of scrub logical
>
> fs/btrfs/disk-io.c | 1 +
> fs/btrfs/fs.h | 12 +
> fs/btrfs/ioctl.c | 6 +-
> fs/btrfs/scrub.c | 757 ++++++++++++++++++++++++++++++-------
> fs/btrfs/scrub.h | 2 +
> include/uapi/linux/btrfs.h | 11 +-
> 6 files changed, 647 insertions(+), 142 deletions(-)
>
^ permalink raw reply [flat|nested] 11+ messages in thread