* [PATCH PoC v2 01/10] btrfs: introduce BTRFS_IOC_SCRUB_FS family of ioctls
2022-09-28 8:35 [PATCH PoC v2 00/10] btrfs: scrub: introduce a new family of ioctl, scrub_fs Qu Wenruo
@ 2022-09-28 8:35 ` Qu Wenruo
2022-12-03 15:10 ` li zhang
2022-09-28 8:35 ` [PATCH PoC v2 02/10] btrfs: scrub: introduce place holder for btrfs_scrub_fs() Qu Wenruo
` (8 subsequent siblings)
9 siblings, 1 reply; 15+ messages in thread
From: Qu Wenruo @ 2022-09-28 8:35 UTC
To: linux-btrfs
The new ioctls are to address the disadvantages of the existing
btrfs_scrub_dev():
a One thread per-device
This can cause multiple block groups to be marked read-only for scrub,
reducing available space temporarily.
This also causes higher CPU/IO usage.
For scrub, we should use the minimal amount of CPU and cause as little
IO as possible.
b Extra IO for RAID56
For data stripes, we will cause at least 2x IO if we run "btrfs scrub
start <mnt>".
1x from scrubbing the device of the data stripe.
The other 1x from scrubbing the parity stripe (which has to read the
data stripes again to verify P/Q).
This duplicated IO should definitely be avoided.
c Bad progress report for RAID56
We can not report any repaired P/Q bytes at all.
Points a and b will be addressed by the new one-thread-per-fs
btrfs_scrub_fs ioctl, while point c will be addressed by the new
btrfs_scrub_fs_progress structure, which has better comments and
classification for all errors.
This patch is only a skeleton for the new family of ioctls; they will
return -EOPNOTSUPP for now.
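For illustration, a minimal user-space sketch of driving the new ioctl
(not part of this series; it assumes a kernel and uapi header carrying the
additions below, and with this skeleton the call is expected to fail with
EOPNOTSUPP):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

int main(int argc, char *argv[])
{
	struct btrfs_ioctl_scrub_fs_args args;
	int fd;

	if (argc != 2)
		return 1;

	fd = open(argv[1], O_RDONLY);	/* Any file/dir on the btrfs mount. */
	if (fd < 0)
		return 1;

	memset(&args, 0, sizeof(args));
	args.start = 0;
	args.end = (__u64)-1;		/* Inclusive end: whole logical space. */
	args.flags = BTRFS_SCRUB_FS_FLAG_READONLY;

	if (ioctl(fd, BTRFS_IOC_SCRUB_FS, &args) < 0)
		perror("BTRFS_IOC_SCRUB_FS");	/* EOPNOTSUPP for now. */
	else
		printf("data scrubbed: %llu bytes\n",
		       (unsigned long long)args.progress.data_scrubbed);

	close(fd);
	return 0;
}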
Signed-off-by: Qu Wenruo <wqu@suse.com>
---
fs/btrfs/ioctl.c | 6 ++
include/uapi/linux/btrfs.h | 174 +++++++++++++++++++++++++++++++++++++
2 files changed, 180 insertions(+)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d5dd8bed1488..6944216e1425 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -5508,6 +5508,12 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_scrub_cancel(fs_info);
case BTRFS_IOC_SCRUB_PROGRESS:
return btrfs_ioctl_scrub_progress(fs_info, argp);
+ case BTRFS_IOC_SCRUB_FS:
+ return -EOPNOTSUPP;
+ case BTRFS_IOC_SCRUB_FS_CANCEL:
+ return -EOPNOTSUPP;
+ case BTRFS_IOC_SCRUB_FS_PROGRESS:
+ return -EOPNOTSUPP;
case BTRFS_IOC_BALANCE_V2:
return btrfs_ioctl_balance(file, argp);
case BTRFS_IOC_BALANCE_CTL:
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index 5655e89b962b..86169e2ffefe 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -191,6 +191,175 @@ struct btrfs_ioctl_scrub_args {
__u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8];
};
+struct btrfs_scrub_fs_progress {
+ /*
+ * Fatal errors, including -ENOMEM, or csum/extent tree search errors.
+ *
+ * Normally after hitting such fatal errors, we error out, thus later
+ * accounting will no longer be reliable.
+ */
+ __u16 nr_fatal_errors;
+
+ /*
+ * All super block errors, from invalid members to IO errors, go into
+ * nr_super_errors.
+ */
+ __u16 nr_super_errors;
+
+ /* Super block accounting. */
+ __u16 nr_super_scrubbed;
+ __u16 nr_super_repaired;
+
+ /*
+ * Data accounting in bytes.
+ *
+ * We only care about how many bytes we scrubbed, thus no
+ * accounting for number of extents.
+ *
+ * This accounting includes the extra mirrors.
+ * E.g. for RAID1, one 16KiB extent will cause 32KiB in @data_scrubbed.
+ */
+ __u64 data_scrubbed;
+
+ /* How many bytes can be recovered. */
+ __u64 data_recoverable;
+
+ /*
+ * How many bytes don't have csum.
+ *
+ * For the NODATASUM case we won't even try to compare the different
+ * copies, as we can not distinguish NODATASUM data from pre-allocated
+ * extents without doing an expensive backref walk.
+ */
+ __u64 data_nocsum;
+
+ /*
+ * For data error bytes, these mean determined errors, including:
+ *
+ * - IO failure, including missing dev.
+ * - Data csum mismatch
+ * (Csum tree search failures go into the fatal errors case above.)
+ */
+ __u64 data_io_fail;
+ __u64 data_csum_mismatch;
+
+ /*
+ * All the unmentioned cases, including data matching its csum (which
+ * of course implies the IO succeeded) and data without csum matching
+ * all other copies/parities, are the expected cases; no need to record them.
+ */
+
+ /*
+ * Metadata accounting in bytes, pretty much the same as data.
+ *
+ * And since metadata has mandatory csum, there is no uncertain case.
+ */
+ __u64 meta_scrubbed;
+ __u64 meta_recoverable;
+
+ /*
+ * For meta, the checks are mostly progressive:
+ *
+ * - Unable to read
+ * @meta_io_fail
+ *
+ * - Unable to pass basic sanity checks (e.g. bytenr check)
+ * @meta_invalid
+ *
+ * - Pass basic sanity checks, but bad csum
+ * @meta_bad_csum
+ *
+ * - Pass basic checks and csum, but bad transid
+ * @meta_bad_transid
+ *
+ * - Pass all checks
+ * The expected case, no special accounting needed.
+ */
+ __u64 meta_io_fail;
+ __u64 meta_invalid;
+ __u64 meta_bad_csum;
+ __u64 meta_bad_transid;
+
+ /*
+ * Parity accounting.
+ *
+ * NOTE: unused data sectors (which still contribute to P/Q
+ * calculation, like the following case) don't contribute to any
+ * accounting.
+ *
+ * Data 1: |<--- Unused ---->| <<<
+ * Data 2: |<- Data extent ->|
+ * Parity: |<--- Parity ---->|
+ */
+ __u64 parity_scrubbed;
+ __u64 parity_recoverable;
+
+ /*
+ * This happens when there is not enough info to determine whether the
+ * parity is correct, mostly when the vertical stripe consists of
+ * *all* NODATASUM sectors.
+ *
+ * If there is any sector with checksum in the vertical stripe,
+ * the parity itself will no longer be uncertain.
+ */
+ __u64 parity_uncertain;
+
+ /*
+ * For parity, the checks are progressive too:
+ *
+ * - Unable to read
+ * @parity_io_fail
+ *
+ * - Mismatch and any vertical data stripe has csum and
+ * the data stripe csum matches
+ * @parity_mismatch
+ * We want to repair the parity then.
+ *
+ * - Mismatch and a vertical data stripe has csum, the data
+ * csum mismatches, and the rebuilt data passes csum.
+ * This will go @data_recoverable or @data_csum_mismatch instead.
+ *
+ * - Mismatch but no vertical data stripe has csum
+ * @parity_uncertain
+ *
+ */
+ __u64 parity_io_fail;
+ __u64 parity_mismatch;
+
+ /* Padding to 256 bytes, and for later expansion. */
+ __u64 __unused[15];
+};
+static_assert(sizeof(struct btrfs_scrub_fs_progress) == 256);
+
+/*
+ * Readonly scrub fs will not try any repair (thus *_repaired member
+ * in scrub_fs_progress should always be 0).
+ */
+#define BTRFS_SCRUB_FS_FLAG_READONLY (1ULL << 0)
+
+/*
+ * All supported flags.
+ *
+ * From the very beginning, the scrub_fs ioctl rejects any unsupported
+ * flags, making later expansion much simpler.
+ */
+#define BTRFS_SCRUB_FS_FLAG_SUPP (BTRFS_SCRUB_FS_FLAG_READONLY)
+
+struct btrfs_ioctl_scrub_fs_args {
+ /* Input, logical bytenr to start the scrub */
+ __u64 start;
+
+ /* Input, the logical bytenr end (inclusive) */
+ __u64 end;
+
+ __u64 flags;
+ __u64 reserved[8];
+ struct btrfs_scrub_fs_progress progress; /* out */
+
+ /* pad to 1K */
+ __u8 unused[1024 - 24 - 64 - sizeof(struct btrfs_scrub_fs_progress)];
+};
+
#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0
#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID 1
struct btrfs_ioctl_dev_replace_start_params {
@@ -1143,5 +1312,10 @@ enum btrfs_err_code {
struct btrfs_ioctl_encoded_io_args)
#define BTRFS_IOC_ENCODED_WRITE _IOW(BTRFS_IOCTL_MAGIC, 64, \
struct btrfs_ioctl_encoded_io_args)
+#define BTRFS_IOC_SCRUB_FS _IOWR(BTRFS_IOCTL_MAGIC, 65, \
+ struct btrfs_ioctl_scrub_fs_args)
+#define BTRFS_IOC_SCRUB_FS_CANCEL _IO(BTRFS_IOCTL_MAGIC, 66)
+#define BTRFS_IOC_SCRUB_FS_PROGRESS _IOWR(BTRFS_IOCTL_MAGIC, 67, \
+ struct btrfs_ioctl_scrub_fs_args)
#endif /* _UAPI_LINUX_BTRFS_H */
--
2.37.3
* Re: [PATCH PoC v2 01/10] btrfs: introduce BTRFS_IOC_SCRUB_FS family of ioctls
2022-09-28 8:35 ` [PATCH PoC v2 01/10] btrfs: introduce BTRFS_IOC_SCRUB_FS family of ioctls Qu Wenruo
@ 2022-12-03 15:10 ` li zhang
2022-12-03 23:09 ` Qu Wenruo
0 siblings, 1 reply; 15+ messages in thread
From: li zhang @ 2022-12-03 15:10 UTC
To: Qu Wenruo, linux-btrfs
Qu Wenruo <wqu@suse.com> wrote on Wednesday, September 28, 2022 at 16:40:
[...]
> + __u64 parity_io_fail;
> + __u64 parity_mismatch;
> +
> + /* Padding to 256 bytes, and for later expansion. */
> + __u64 __unused[15];
> +};
It looks like btrfs_scrub_fs_progress is missing a member for unverified
errors, which the old btrfs_scrub_progress used to indicate that an error
came up but went away during a recheck. This is a PoC patch, but just
wondering: will this recheck feature be added to the official patch?
Also, just curious: what kind of situation would cause the first read of a
block to appear corrupted, while a second read is fine? Could bad sectors
on the hard drive cause this?
Best Regards
Li
* Re: [PATCH PoC v2 01/10] btrfs: introduce BTRFS_IOC_SCRUB_FS family of ioctls
2022-12-03 15:10 ` li zhang
@ 2022-12-03 23:09 ` Qu Wenruo
0 siblings, 0 replies; 15+ messages in thread
From: Qu Wenruo @ 2022-12-03 23:09 UTC
To: li zhang, Qu Wenruo, linux-btrfs
On 2022/12/3 23:10, li zhang wrote:
> Qu Wenruo <wqu@suse.com> wrote on Wednesday, September 28, 2022 at 16:40:
[...]
>> + __u64 parity_io_fail;
>> + __u64 parity_mismatch;
>> +
>> + /* Padding to 256 bytes, and for later expansion. */
>> + __u64 __unused[15];
>> +};
> It looks like btrfs_scrub_fs_progress is missing a member for unverified
> errors, which the old btrfs_scrub_progress used to indicate that an error
> came up but went away during a recheck.
That member is no longer required, as we don't do sector-by-sector reads
at all.
Thus there will be no case where the initial read failed but a
sector-by-sector re-read succeeded.
> This is a PoC patch, but just wondering: will this recheck feature be
> added to the official patch?
>
> Also, just curious: what kind of situation would cause the first read of a
> block to appear corrupted, while a second read is fine? Could bad sectors
> on the hard drive cause this?
I don't know, it can be hardware dependent.
Some hard disks have internal checksums, thus if one sector is corrupted,
the disk may fail the whole read which covers that sector.
Thanks,
Qu
* [PATCH PoC v2 02/10] btrfs: scrub: introduce place holder for btrfs_scrub_fs()
2022-09-28 8:35 [PATCH PoC v2 00/10] btrfs: scrub: introduce a new family of ioctl, scrub_fs Qu Wenruo
2022-09-28 8:35 ` [PATCH PoC v2 01/10] btrfs: introduce BTRFS_IOC_SCRUB_FS family of ioctls Qu Wenruo
@ 2022-09-28 8:35 ` Qu Wenruo
2022-09-28 8:35 ` [PATCH PoC v2 03/10] btrfs: scrub: introduce a place holder helper scrub_fs_iterate_bgs() Qu Wenruo
` (7 subsequent siblings)
9 siblings, 0 replies; 15+ messages in thread
From: Qu Wenruo @ 2022-09-28 8:35 UTC
To: linux-btrfs
The new function btrfs_scrub_fs() will do the exclusion checks against
regular scrub and dev-replace, then return -EOPNOTSUPP as a placeholder.
To let regular scrub/dev-replace be exclusive against btrfs_scrub_fs() as
well, also introduce the btrfs_fs_info::scrub_fs_running member.
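The exclusion, distilled from the hunks below (simplified; both sides check
the other side's counter while holding fs_info->scrub_lock, and the real
hunks also drop the held locks before returning):

	/* In btrfs_scrub_dev(), under fs_info->scrub_lock: */
	if (atomic_read(&fs_info->scrub_fs_running))
		return -EINPROGRESS;

	/* In btrfs_scrub_fs(), under the same lock: */
	if (atomic_read(&fs_info->scrubs_running))
		return -EINPROGRESS;
	atomic_inc(&fs_info->scrub_fs_running);
	/* Also bump scrubs_running so the existing pause points keep working. */
	atomic_inc(&fs_info->scrubs_running);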
Signed-off-by: Qu Wenruo <wqu@suse.com>
---
fs/btrfs/ctree.h | 4 ++
fs/btrfs/ioctl.c | 41 +++++++++++++++++-
fs/btrfs/scrub.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 149 insertions(+), 1 deletion(-)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 727595eee973..07542a55d77d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -905,6 +905,7 @@ struct btrfs_fs_info {
/* private scrub information */
struct mutex scrub_lock;
atomic_t scrubs_running;
+ atomic_t scrub_fs_running;
atomic_t scrub_pause_req;
atomic_t scrubs_paused;
atomic_t scrub_cancel_req;
@@ -4026,6 +4027,9 @@ int btrfs_should_ignore_reloc_root(struct btrfs_root *root);
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
u64 end, struct btrfs_scrub_progress *progress,
int readonly, int is_dev_replace);
+int btrfs_scrub_fs(struct btrfs_fs_info *fs_info, u64 start, u64 end,
+ struct btrfs_scrub_fs_progress *progress,
+ bool readonly);
void btrfs_scrub_pause(struct btrfs_fs_info *fs_info);
void btrfs_scrub_continue(struct btrfs_fs_info *fs_info);
int btrfs_scrub_cancel(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 6944216e1425..00870b20bd5a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4112,6 +4112,45 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_fs_info *fs_info,
return ret;
}
+static long btrfs_ioctl_scrub_fs(struct file *file, void __user *arg)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb);
+ struct btrfs_ioctl_scrub_fs_args *sfsa;
+ bool readonly = false;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ sfsa = memdup_user(arg, sizeof(*sfsa));
+ if (IS_ERR(sfsa))
+ return PTR_ERR(sfsa);
+
+ if (sfsa->flags & ~BTRFS_SCRUB_FS_FLAG_SUPP) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+ if (sfsa->flags & BTRFS_SCRUB_FS_FLAG_READONLY)
+ readonly = true;
+
+ if (!readonly) {
+ ret = mnt_want_write_file(file);
+ if (ret)
+ goto out;
+ }
+
+ ret = btrfs_scrub_fs(fs_info, sfsa->start, sfsa->end, &sfsa->progress,
+ readonly);
+ if (copy_to_user(arg, sfsa, sizeof(*sfsa)))
+ ret = -EFAULT;
+
+ if (!readonly)
+ mnt_drop_write_file(file);
+out:
+ kfree(sfsa);
+ return ret;
+}
+
static long btrfs_ioctl_get_dev_stats(struct btrfs_fs_info *fs_info,
void __user *arg)
{
@@ -5509,7 +5548,7 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_SCRUB_PROGRESS:
return btrfs_ioctl_scrub_progress(fs_info, argp);
case BTRFS_IOC_SCRUB_FS:
- return -EOPNOTSUPP;
+ return btrfs_ioctl_scrub_fs(file, argp);
case BTRFS_IOC_SCRUB_FS_CANCEL:
return -EOPNOTSUPP;
case BTRFS_IOC_SCRUB_FS_PROGRESS:
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index f260c53829e5..d3d64f048c7b 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -4297,6 +4297,15 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
}
mutex_lock(&fs_info->scrub_lock);
+
+ /* Conflict with scrub_fs ioctls. */
+ if (atomic_read(&fs_info->scrub_fs_running)) {
+ mutex_unlock(&fs_info->scrub_lock);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ ret = -EINPROGRESS;
+ goto out;
+ }
+
if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
mutex_unlock(&fs_info->scrub_lock);
@@ -4418,6 +4427,102 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
return ret;
}
+/*
+ * Unlike btrfs_scrub_dev(), this function works completely at the logical
+ * bytenr level, and has the following advantages:
+ *
+ * - Better error reporting
+ * The new btrfs_scrub_fs_progress has better-classified errors, and more
+ * members to cover parity errors.
+ *
+ * - Always scrub one block group at a time
+ * btrfs_scrub_dev() works by starting one scrub for each device.
+ * This can cause unsynchronized progress, and mark multiple block groups
+ * RO, reducing the available space unnecessarily.
+ *
+ * - Less IO for RAID56
+ * Instead of treating RAID56 data and P/Q stripes differently, here we only
+ * scrub a full stripe at most once, avoiding the 2x read for data stripes
+ * (one from scrubbing the data stripe itself, the other from scrubbing the
+ * P/Q stripe).
+ *
+ * - No bio form shaping, and streamlined code
+ * Always submit bio for all involved mirrors (or data/p/q stripes for
+ * RAID56), wait for the IO, then run the check.
+ *
+ * Thus there are at most nr_mirrors (nr_stripes for RAID56) bios on-the-fly,
+ * and for each device, there is always at most one bio for scrub.
+ *
+ * This would greatly simplify all involved code.
+ *
+ * - No need to support dev-replace
+ * Thus we can have simpler code.
+ *
+ * Unfortunately this ioctl has the following disadvantage so far:
+ *
+ * - No resume after unmount
+ * We may need extra on-disk format to save the progress.
+ * Thus we may need a new RO compat flag for the resume ability.
+ *
+ * - Conflicts with dev-replace/scrub
+ *
+ * - Needs kernel support.
+ *
+ * - Not fully finished
+ */
+int btrfs_scrub_fs(struct btrfs_fs_info *fs_info, u64 start, u64 end,
+ struct btrfs_scrub_fs_progress *progress,
+ bool readonly)
+{
+ int ret;
+
+ if (btrfs_fs_closing(fs_info))
+ return -EAGAIN;
+
+ if (btrfs_is_zoned(fs_info))
+ return -EOPNOTSUPP;
+
+ /*
+ * Metadata and data unit should be able to be contained inside one
+ * stripe.
+ */
+ ASSERT(fs_info->nodesize <= BTRFS_STRIPE_LEN);
+ ASSERT(fs_info->sectorsize <= BTRFS_STRIPE_LEN);
+
+ mutex_lock(&fs_info->scrub_lock);
+ /* This function conflicts with scrub/dev-replace. */
+ if (atomic_read(&fs_info->scrubs_running)) {
+ mutex_unlock(&fs_info->scrub_lock);
+ return -EINPROGRESS;
+ }
+
+ /* And there can only be one running btrfs_scrub_fs(). */
+ if (atomic_read(&fs_info->scrub_fs_running)) {
+ mutex_unlock(&fs_info->scrub_lock);
+ return -EINPROGRESS;
+ }
+
+ __scrub_blocked_if_needed(fs_info);
+ atomic_inc(&fs_info->scrub_fs_running);
+
+ /* This is to allow the existing scrub pause mechanism to be reused. */
+ atomic_inc(&fs_info->scrubs_running);
+ btrfs_info(fs_info, "scrub_fs: started");
+ mutex_unlock(&fs_info->scrub_lock);
+
+ /* Place holder for real workload. */
+ ret = -EOPNOTSUPP;
+
+ mutex_lock(&fs_info->scrub_lock);
+ atomic_dec(&fs_info->scrubs_running);
+ atomic_dec(&fs_info->scrub_fs_running);
+ btrfs_info(fs_info, "scrub_fs: finished with status: %d", ret);
+ mutex_unlock(&fs_info->scrub_lock);
+ wake_up(&fs_info->scrub_pause_wait);
+
+ return ret;
+}
+
void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
{
mutex_lock(&fs_info->scrub_lock);
--
2.37.3
* [PATCH PoC v2 03/10] btrfs: scrub: introduce a place holder helper scrub_fs_iterate_bgs()
2022-09-28 8:35 [PATCH PoC v2 00/10] btrfs: scrub: introduce a new family of ioctl, scrub_fs Qu Wenruo
2022-09-28 8:35 ` [PATCH PoC v2 01/10] btrfs: introduce BTRFS_IOC_SCRUB_FS family of ioctls Qu Wenruo
2022-09-28 8:35 ` [PATCH PoC v2 02/10] btrfs: scrub: introduce place holder for btrfs_scrub_fs() Qu Wenruo
@ 2022-09-28 8:35 ` Qu Wenruo
2022-09-28 8:35 ` [PATCH PoC v2 04/10] btrfs: scrub: introduce place holder helper scrub_fs_block_group() Qu Wenruo
` (6 subsequent siblings)
9 siblings, 0 replies; 15+ messages in thread
From: Qu Wenruo @ 2022-09-28 8:35 UTC
To: linux-btrfs
This new helper is mostly the same as scrub_enumerate_chunks(), but with
some small changes:
- No need for dev-replace branches
- No need to search dev-extent tree
We can directly iterate the block groups.
The new helper currently only iterates all the bgs, doing nothing for
each iterated bg.
Also one smaller helper is introduced:
- scrub_fs_alloc_ctx()
To allocate a scrub_fs_ctx, which has way less members (for now and
for the future) compared to scrub_ctx.
The scrub_fs_ctx will have a very defined lifespan (only inside
btrfs_scrub_fs(), and can only have one scrub_fs_ctx, thus not need to
be ref counted)
Signed-off-by: Qu Wenruo <wqu@suse.com>
---
fs/btrfs/scrub.c | 164 ++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 162 insertions(+), 2 deletions(-)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index d3d64f048c7b..97da8545c9ab 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -198,6 +198,24 @@ struct scrub_ctx {
refcount_t refs;
};
+/* This structure only has a lifespan inside btrfs_scrub_fs(). */
+struct scrub_fs_ctx {
+ struct btrfs_fs_info *fs_info;
+
+ /* Current block group we're scrubbing. */
+ struct btrfs_block_group *cur_bg;
+
+ /* Current logical bytenr being scrubbed. */
+ u64 cur_logical;
+
+ atomic_t bios_under_io;
+
+ bool readonly;
+
+ /* There will be one and only one thread touching @stat. */
+ struct btrfs_scrub_fs_progress stat;
+};
+
struct scrub_warning {
struct btrfs_path *path;
u64 extent_item_size;
@@ -4427,6 +4445,126 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
return ret;
}
+static struct scrub_fs_ctx *scrub_fs_alloc_ctx(struct btrfs_fs_info *fs_info,
+ bool readonly)
+{
+ struct scrub_fs_ctx *sfctx;
+ int ret;
+
+ sfctx = kzalloc(sizeof(*sfctx), GFP_KERNEL);
+ if (!sfctx) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ sfctx->fs_info = fs_info;
+ sfctx->readonly = readonly;
+ atomic_set(&sfctx->bios_under_io, 0);
+ return sfctx;
+error:
+ kfree(sfctx);
+ return ERR_PTR(ret);
+}
+
+static int scrub_fs_iterate_bgs(struct scrub_fs_ctx *sfctx, u64 start, u64 end)
+{
+ struct btrfs_fs_info *fs_info = sfctx->fs_info;
+ u64 cur = start;
+ int ret;
+
+ while (cur < end) {
+ struct btrfs_block_group *bg;
+ bool ro_set = false;
+
+ bg = btrfs_lookup_first_block_group(fs_info, cur);
+ if (!bg)
+ break;
+ if (bg->start + bg->length >= end) {
+ btrfs_put_block_group(bg);
+ break;
+ }
+ spin_lock(&bg->lock);
+
+ /* Already deleted bg, skip to the next one. */
+ if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) {
+ spin_unlock(&bg->lock);
+ cur = bg->start + bg->length;
+ btrfs_put_block_group(bg);
+ continue;
+ }
+ btrfs_freeze_block_group(bg);
+ spin_unlock(&bg->lock);
+
+ /*
+ * We need to call btrfs_inc_block_group_ro() with scrub paused, to
+ * avoid a deadlock caused by:
+ * btrfs_inc_block_group_ro()
+ * -> btrfs_wait_for_commit()
+ * -> btrfs_commit_transaction()
+ * -> btrfs_scrub_pause()
+ */
+ scrub_pause_on(fs_info);
+
+ /*
+ * Check the comments before btrfs_inc_block_group_ro() inside
+ * scrub_enumerate_chunks() for reasons.
+ */
+ ret = btrfs_inc_block_group_ro(bg, false);
+ if (ret == 0)
+ ro_set = true;
+ if (ret == -ETXTBSY) {
+ btrfs_warn(fs_info,
+ "skipping scrub of block group %llu due to active swapfile",
+ bg->start);
+ scrub_pause_off(fs_info);
+ ret = 0;
+ goto next;
+ }
+ if (ret < 0 && ret != -ENOSPC) {
+ btrfs_warn(fs_info,
+ "failed setting block group ro: %d", ret);
+ scrub_pause_off(fs_info);
+ goto next;
+ }
+
+ scrub_pause_off(fs_info);
+
+ /* Place holder for the real chunk scrubbing code. */
+ ret = 0;
+
+ if (ro_set)
+ btrfs_dec_block_group_ro(bg);
+
+ /*
+ * We might have prevented the cleaner kthread from deleting
+ * this block group if it was already unused because we raced
+ * and set it to RO mode first. So add it back to the unused
+ * list, otherwise it might not ever be deleted unless a manual
+ * balance is triggered or it becomes used and unused again.
+ */
+ spin_lock(&bg->lock);
+ if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags) &&
+ !bg->ro && bg->reserved == 0 && bg->used == 0) {
+ spin_unlock(&bg->lock);
+ if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
+ btrfs_discard_queue_work(&fs_info->discard_ctl,
+ bg);
+ else
+ btrfs_mark_bg_unused(bg);
+ } else {
+ spin_unlock(&bg->lock);
+ }
+next:
+ cur = bg->start + bg->length;
+
+ btrfs_unfreeze_block_group(bg);
+ btrfs_put_block_group(bg);
+ if (ret)
+ break;
+ }
+ return ret;
+}
+
/*
* Unlike btrfs_scrub_dev(), this function works completely at the logical
* bytenr level, and has the following advantages:
@@ -4474,6 +4612,8 @@ int btrfs_scrub_fs(struct btrfs_fs_info *fs_info, u64 start, u64 end,
struct btrfs_scrub_fs_progress *progress,
bool readonly)
{
+ struct scrub_fs_ctx *sfctx;
+ unsigned int nofs_flag;
int ret;
if (btrfs_fs_closing(fs_info))
@@ -4510,8 +4650,25 @@ int btrfs_scrub_fs(struct btrfs_fs_info *fs_info, u64 start, u64 end,
btrfs_info(fs_info, "scrub_fs: started");
mutex_unlock(&fs_info->scrub_lock);
- /* Place holder for real workload. */
- ret = -EOPNOTSUPP;
+ sfctx = scrub_fs_alloc_ctx(fs_info, readonly);
+ if (IS_ERR(sfctx)) {
+ ret = PTR_ERR(sfctx);
+ sfctx = NULL;
+ goto out;
+ }
+
+ if (progress)
+ memcpy(&sfctx->stat, progress, sizeof(*progress));
+
+ /*
+ * Check the comments before memalloc_nofs_save() in btrfs_scrub_dev()
+ * for reasons.
+ */
+ nofs_flag = memalloc_nofs_save();
+ ret = scrub_fs_iterate_bgs(sfctx, start, end);
+ memalloc_nofs_restore(nofs_flag);
+out:
+ kfree(sfctx);
mutex_lock(&fs_info->scrub_lock);
atomic_dec(&fs_info->scrubs_running);
@@ -4520,6 +4677,9 @@ int btrfs_scrub_fs(struct btrfs_fs_info *fs_info, u64 start, u64 end,
mutex_unlock(&fs_info->scrub_lock);
wake_up(&fs_info->scrub_pause_wait);
+ if (progress)
+ memcpy(progress, &sfctx->stat, sizeof(*progress));
+
return ret;
}
--
2.37.3
* [PATCH PoC v2 04/10] btrfs: scrub: introduce place holder helper scrub_fs_block_group()
2022-09-28 8:35 [PATCH PoC v2 00/10] btrfs: scrub: introduce a new family of ioctl, scrub_fs Qu Wenruo
` (2 preceding siblings ...)
2022-09-28 8:35 ` [PATCH PoC v2 03/10] btrfs: scrub: introduce a place holder helper scrub_fs_iterate_bgs() Qu Wenruo
@ 2022-09-28 8:35 ` Qu Wenruo
2022-09-28 14:30 ` Wang Yugui
2022-09-28 8:35 ` [PATCH PoC v2 05/10] btrfs: scrub: add helpers to fulfill csum/extent_generation Qu Wenruo
` (5 subsequent siblings)
9 siblings, 1 reply; 15+ messages in thread
From: Qu Wenruo @ 2022-09-28 8:35 UTC
To: linux-btrfs
The main place holder helper scrub_fs_block_group() will:
- Initialize various needed members inside scrub_fs_ctx
This includes:
* Calculate the nr_copies for non-RAID56 profiles, or grab nr_stripes
for RAID56 profiles.
* Allocate memory for sectors/pages array, and csum_buf if it's data
bg.
* Initialize all sectors to type UNUSED.
All the above memory will stay across each stripe we run, thus we only
need to allocate it once per bg.
- Iterate stripes containing any used sector
This is the code to be implemented.
- Clean up the above memory before we finish the block group.
The real work of scrubbing a stripe is not yet implemented.
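For reference, the index math this layout implies, sketched as two
hypothetical helpers (not part of the patch; sectors of mirror N occupy
the N-th BTRFS_STRIPE_LEN chunk of the pages array):

static struct page *scrub_fs_get_page(struct scrub_fs_ctx *sfctx,
				      int sector_nr, int mirror_nr)
{
	const u32 offset = (mirror_nr * sfctx->sectors_per_stripe + sector_nr) <<
			   sfctx->fs_info->sectorsize_bits;

	return sfctx->pages[offset >> PAGE_SHIFT];
}

static unsigned int scrub_fs_get_page_offset(struct scrub_fs_ctx *sfctx,
					     int sector_nr, int mirror_nr)
{
	const u32 offset = (mirror_nr * sfctx->sectors_per_stripe + sector_nr) <<
			   sfctx->fs_info->sectorsize_bits;

	return offset_in_page(offset);
}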
Signed-off-by: Qu Wenruo <wqu@suse.com>
---
fs/btrfs/scrub.c | 234 ++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 232 insertions(+), 2 deletions(-)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 97da8545c9ab..6e6c50962ace 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -198,6 +198,45 @@ struct scrub_ctx {
refcount_t refs;
};
+#define SCRUB_FS_SECTOR_FLAG_UNUSED (1 << 0)
+#define SCRUB_FS_SECTOR_FLAG_DATA (1 << 1)
+#define SCRUB_FS_SECTOR_FLAG_META (1 << 2)
+#define SCRUB_FS_SECTOR_FLAG_PARITY (1 << 3)
+
+/*
+ * Represent a sector.
+ *
+ * To access the content of a sector, the caller should have the index inside
+ * the scrub_fs_ctx->sectors[] array, and use that index to calculate the page
+ * and page offset innside scrub_fs_ctx->pages[] array.
+ *
+ * To get the logical/physical bytenr of the a sector, the caller should use
+ * scrub_fs_ctx->bioc and the sector index to calclulate the logical/physical
+ * bytenr.
+ */
+struct scrub_fs_sector {
+ unsigned int flags;
+ union {
+ /*
+ * For SCRUB_FS_SECTOR_FLAG_DATA, either it points to some byte
+ * inside scrub_fs_ctx->csum_buf, or it's NULL for the NODATASUM
+ * case.
+ */
+ u8 *csum;
+
+ /*
+ * For SCRUB_FS_SECTOR_FLAG_META, this records the generation
+ * and the logical bytenr of the tree block.
+ * (So we can grab the first sector to calculate its inline
+ * csum).
+ */
+ struct {
+ u64 eb_logical;
+ u64 eb_generation;
+ };
+ };
+};
+
/* This structure should only has a lifespan inside btrfs_scrub_fs(). */
struct scrub_fs_ctx {
struct btrfs_fs_info *fs_info;
@@ -214,6 +253,57 @@ struct scrub_fs_ctx {
/* There will and only be one thread touching @stat. */
struct btrfs_scrub_fs_progress stat;
+
+ /*
+ * How many sectors we read per stripe.
+ *
+ * For now, it's fixed to BTRFS_STRIPE_LEN / sectorsize.
+ *
+ * This can be enlarged to full stripe size / sectorsize
+ * for later RAID0/10/5/6 code.
+ */
+ int sectors_per_stripe;
+ /*
+ * For non-RAID56 profiles, we only care how many copies the block
+ * group has.
+ * For RAID56 profiles, we care how many stripes the block group
+ * has (including data and parities).
+ */
+ union {
+ int nr_stripes;
+ int nr_copies;
+ };
+
+ /*
+ * The total number of sectors we scrub in one run (including
+ * the extra mirrors/parities).
+ *
+ * For non-RAID56 profiles, it would be:
+ * nr_copies * (BTRFS_STRIPE_LEN / sectorsize).
+ *
+ * For RAID56 profiles, it would be:
+ * nr_stripes * (BTRFS_STRIPE_LEN / sectorsize).
+ */
+ int total_sectors;
+
+ /* Page array for above total_sectors. */
+ struct page **pages;
+
+ /*
+ * Sector array for above total_sectors. The page content will be
+ * inside above pages array.
+ *
+ * Both arrays should be initialized when starting to scrub a block group.
+ */
+ struct scrub_fs_sector *sectors;
+
+ /*
+ * Csum buffer allocated for the stripe.
+ *
+ * All sectors in different mirrors for the same logical bytenr
+ * would point to the same location inside the buffer.
+ */
+ u8 *csum_buf;
};
struct scrub_warning {
@@ -4466,6 +4556,147 @@ static struct scrub_fs_ctx *scrub_fs_alloc_ctx(struct btrfs_fs_info *fs_info,
return ERR_PTR(ret);
}
+/*
+ * Clean up the memory allocations, mostly after finishing a bg, or for
+ * the error path.
+ */
+static void scrub_fs_cleanup_for_bg(struct scrub_fs_ctx *sfctx)
+{
+ int i;
+ const int nr_pages = sfctx->nr_copies * (BTRFS_STRIPE_LEN >> PAGE_SHIFT);
+
+ if (sfctx->pages) {
+ for (i = 0; i < nr_pages; i++) {
+ if (sfctx->pages[i]) {
+ __free_page(sfctx->pages[i]);
+ sfctx->pages[i] = NULL;
+ }
+ }
+ }
+ kfree(sfctx->pages);
+ sfctx->pages = NULL;
+
+ kfree(sfctx->sectors);
+ sfctx->sectors = NULL;
+
+ kfree(sfctx->csum_buf);
+ sfctx->csum_buf = NULL;
+
+ /* NOTE: block group will only be put inside scrub_fs_iterate_bgs(). */
+ sfctx->cur_bg = NULL;
+}
+
+/* Do the block group specific initialization. */
+static int scrub_fs_init_for_bg(struct scrub_fs_ctx *sfctx,
+ struct btrfs_block_group *bg)
+{
+ struct btrfs_fs_info *fs_info = sfctx->fs_info;
+ struct extent_map_tree *map_tree = &fs_info->mapping_tree;
+ struct extent_map *em;
+ bool is_raid56 = !!(bg->flags & BTRFS_BLOCK_GROUP_RAID56_MASK);
+ int ret = 0;
+ int nr_pages;
+ int i;
+
+ /*
+ * One stripe should be page aligned, i.e. PAGE_SIZE should not be
+ * larger than 64K.
+ */
+ ASSERT(IS_ALIGNED(BTRFS_STRIPE_LEN, PAGE_SIZE));
+
+ /* The last run should have cleaned up all the memory. */
+ ASSERT(!sfctx->cur_bg);
+ ASSERT(!sfctx->pages);
+ ASSERT(!sfctx->sectors);
+ ASSERT(!sfctx->csum_buf);
+
+ read_lock(&map_tree->lock);
+ em = lookup_extent_mapping(map_tree, bg->start, bg->length);
+ read_unlock(&map_tree->lock);
+
+ /*
+ * Might have been an unused block group deleted by the cleaner
+ * kthread or relocation.
+ */
+ if (!em) {
+ spin_lock(&bg->lock);
+ if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags))
+ ret = -EINVAL;
+ spin_unlock(&bg->lock);
+ return ret;
+ }
+ /*
+ * Since we're guaranteed to be running without any other
+ * dev-replace/scrub running, the num_stripes should be the total
+ * number of stripes, without the replace target device.
+ */
+ if (is_raid56)
+ sfctx->nr_stripes = em->map_lookup->num_stripes;
+ free_extent_map(em);
+
+ if (!is_raid56)
+ sfctx->nr_copies = btrfs_num_copies(fs_info, bg->start,
+ fs_info->sectorsize);
+ sfctx->sectors_per_stripe = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
+ sfctx->total_sectors = sfctx->sectors_per_stripe * sfctx->nr_copies;
+
+ nr_pages = (BTRFS_STRIPE_LEN >> PAGE_SHIFT) * sfctx->nr_copies;
+
+ sfctx->pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
+ if (!sfctx->pages)
+ goto enomem;
+
+ for (i = 0; i < nr_pages; i++) {
+ sfctx->pages[i] = alloc_page(GFP_KERNEL);
+ if (!sfctx->pages[i])
+ goto enomem;
+ }
+
+ sfctx->sectors = kcalloc(sfctx->total_sectors,
+ sizeof(struct scrub_fs_sector), GFP_KERNEL);
+ if (!sfctx->sectors)
+ goto enomem;
+
+ for (i = 0; i < sfctx->total_sectors; i++)
+ sfctx->sectors[i].flags = SCRUB_FS_SECTOR_FLAG_UNUSED;
+
+ if (bg->flags & BTRFS_BLOCK_GROUP_DATA) {
+ sfctx->csum_buf = kzalloc(fs_info->csum_size *
+ sfctx->sectors_per_stripe, GFP_KERNEL);
+ if (!sfctx->csum_buf)
+ goto enomem;
+ }
+ sfctx->cur_bg = bg;
+ sfctx->cur_logical = bg->start;
+ return 0;
+
+enomem:
+ sfctx->stat.nr_fatal_errors++;
+ scrub_fs_cleanup_for_bg(sfctx);
+ return -ENOMEM;
+}
+
+
+static int scrub_fs_block_group(struct scrub_fs_ctx *sfctx,
+ struct btrfs_block_group *bg)
+{
+ int ret;
+
+ /* Not yet supported, just skip RAID56 bgs for now. */
+ if (bg->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)
+ return 0;
+
+ ret = scrub_fs_init_for_bg(sfctx, bg);
+ if (ret < 0)
+ return ret;
+
+ /* Place holder for the loop iterating the sectors. */
+ ret = 0;
+
+ scrub_fs_cleanup_for_bg(sfctx);
+ return ret;
+}
+
static int scrub_fs_iterate_bgs(struct scrub_fs_ctx *sfctx, u64 start, u64 end)
{
struct btrfs_fs_info *fs_info = sfctx->fs_info;
@@ -4529,8 +4760,7 @@ static int scrub_fs_iterate_bgs(struct scrub_fs_ctx *sfctx, u64 start, u64 end)
scrub_pause_off(fs_info);
- /* Place holder for the real chunk scrubbing code. */
- ret = 0;
+ ret = scrub_fs_block_group(sfctx, bg);
if (ro_set)
btrfs_dec_block_group_ro(bg);
--
2.37.3
* Re: [PATCH PoC v2 04/10] btrfs: scrub: introduce place holder helper scrub_fs_block_group()
2022-09-28 8:35 ` [PATCH PoC v2 04/10] btrfs: scrub: introduce place holder helper scrub_fs_block_group() Qu Wenruo
@ 2022-09-28 14:30 ` Wang Yugui
2022-09-28 22:54 ` Qu Wenruo
0 siblings, 1 reply; 15+ messages in thread
From: Wang Yugui @ 2022-09-28 14:30 UTC
To: Qu Wenruo; +Cc: linux-btrfs
Hi,
> The main place holder helper scrub_fs_block_group() will:
[...]
> +#define SCRUB_FS_SECTOR_FLAG_UNUSED (1 << 0)
> +#define SCRUB_FS_SECTOR_FLAG_DATA (1 << 1)
> +#define SCRUB_FS_SECTOR_FLAG_META (1 << 2)
> +#define SCRUB_FS_SECTOR_FLAG_PARITY (1 << 3)
> +
Is there any use case for SCRUB_FS_SECTOR_FLAG_PARITY?
And we may need a SCRUB_FS_SECTOR_FLAG_SYSTEM, so that 'btrfs scrub
start' matches 'btrfs balance start -d -m -s' well.
Best Regards
Wang Yugui (wangyugui@e16-tech.com)
2022/09/28
* Re: [PATCH PoC v2 04/10] btrfs: scrub: introduce place holder helper scrub_fs_block_group()
2022-09-28 14:30 ` Wang Yugui
@ 2022-09-28 22:54 ` Qu Wenruo
0 siblings, 0 replies; 15+ messages in thread
From: Qu Wenruo @ 2022-09-28 22:54 UTC
To: Wang Yugui, Qu Wenruo; +Cc: linux-btrfs
On 2022/9/28 22:30, Wang Yugui wrote:
> Hi,
>
>> The main place holder helper scrub_fs_block_group() will:
[...]
>> +#define SCRUB_FS_SECTOR_FLAG_UNUSED (1 << 0)
>> +#define SCRUB_FS_SECTOR_FLAG_DATA (1 << 1)
>> +#define SCRUB_FS_SECTOR_FLAG_META (1 << 2)
>> +#define SCRUB_FS_SECTOR_FLAG_PARITY (1 << 3)
>> +
>
> Is there any use case for SCRUB_FS_SECTOR_FLAG_PARITY?
For future RAID56 support.
>
> And we may need a SCRUB_FS_SECTOR_FLAG_SYSTEM, so that 'btrfs scrub
> start' matches 'btrfs balance start -d -m -s' well.
"System" is still metadata.
Please understand why we need system chunks first.
* [PATCH PoC v2 05/10] btrfs: scrub: add helpers to fulfill csum/extent_generation
2022-09-28 8:35 [PATCH PoC v2 00/10] btrfs: scrub: introduce a new family of ioctl, scrub_fs Qu Wenruo
` (3 preceding siblings ...)
2022-09-28 8:35 ` [PATCH PoC v2 04/10] btrfs: scrub: introduce place holder helper scrub_fs_block_group() Qu Wenruo
@ 2022-09-28 8:35 ` Qu Wenruo
2022-09-28 8:35 ` [PATCH PoC v2 06/10] btrfs: scrub: submit and wait for the read of each copy Qu Wenruo
` (4 subsequent siblings)
9 siblings, 0 replies; 15+ messages in thread
From: Qu Wenruo @ 2022-09-28 8:35 UTC
To: linux-btrfs
This patch will introduce two new major helpers:
- scrub_fs_locate_and_fill_stripe()
This will find a stripe which contains any extent.
And then fill corresponding sectors inside sectors[] array with its
extent_type.
If it's a metadata extent, it will also fill eb_generation member.
- scrub_fs_fill_stripe_csum()
This is for data block groups only.
This helper will find all csums for the stripe, and copy the csum into
the corresponding position inside scrub_fs_ctx->csum_buf.
Signed-off-by: Qu Wenruo <wqu@suse.com>
---
fs/btrfs/scrub.c | 308 ++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 306 insertions(+), 2 deletions(-)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 6e6c50962ace..f04d2e552666 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -4535,6 +4535,21 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
return ret;
}
+static void scrub_fs_check_sector_mirror_nr(struct scrub_fs_ctx *sfctx,
+ int sector_nr, int mirror_nr)
+{
+ /* Basic boundary checks. */
+ ASSERT(sector_nr >= 0 && sector_nr < sfctx->sectors_per_stripe);
+ ASSERT(mirror_nr >= 0 && mirror_nr < sfctx->nr_copies);
+}
+
+static struct scrub_fs_sector *scrub_fs_get_sector(struct scrub_fs_ctx *sfctx,
+ int sector_nr, int mirror_nr)
+{
+ scrub_fs_check_sector_mirror_nr(sfctx, sector_nr, mirror_nr);
+ return &sfctx->sectors[mirror_nr * sfctx->sectors_per_stripe + sector_nr];
+}
+
static struct scrub_fs_ctx *scrub_fs_alloc_ctx(struct btrfs_fs_info *fs_info,
bool readonly)
{
@@ -4676,10 +4691,264 @@ static int scrub_fs_init_for_bg(struct scrub_fs_ctx *sfctx,
return -ENOMEM;
}
+static int scrub_fs_fill_sector_types(struct scrub_fs_ctx *sfctx,
+ u64 stripe_start, u64 extent_start,
+ u64 extent_len, u64 extent_flags,
+ u64 extent_gen)
+{
+ struct btrfs_fs_info *fs_info = sfctx->fs_info;
+ const u64 stripe_end = stripe_start + (sfctx->sectors_per_stripe <<
+ fs_info->sectorsize_bits);
+ const u64 real_start = max(stripe_start, extent_start);
+ const u64 real_len = min(stripe_end, extent_start + extent_len) - real_start;
+ bool is_meta = false;
+ u64 cur_logical;
+ int sector_flags;
+
+ if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ sector_flags = SCRUB_FS_SECTOR_FLAG_META;
+ is_meta = true;
+ /* Metadata should never cross the stripe boundary. */
+ if (extent_start != real_start) {
+ btrfs_err(fs_info,
+ "tree block at bytenr %llu crossed stripe boundary",
+ extent_start);
+ return -EUCLEAN;
+ }
+ } else {
+ sector_flags = SCRUB_FS_SECTOR_FLAG_DATA;
+ }
+
+ for (cur_logical = real_start; cur_logical < real_start + real_len;
+ cur_logical += fs_info->sectorsize) {
+ const int sector_nr = (cur_logical - stripe_start) >>
+ fs_info->sectorsize_bits;
+ int mirror_nr;
+
+ for (mirror_nr = 0; mirror_nr < sfctx->nr_copies; mirror_nr++) {
+ struct scrub_fs_sector *sector =
+ scrub_fs_get_sector(sfctx, sector_nr, mirror_nr);
+
+ /*
+ * All sectors in the range should not have been
+ * initialized.
+ */
+ ASSERT(sector->flags == SCRUB_FS_SECTOR_FLAG_UNUSED);
+ ASSERT(sector->csum == NULL);
+ ASSERT(sector->eb_generation == 0);
+
+ sector->flags = sector_flags;
+ /*
+ * Here we only populate eb_*; the csum will be filled later
+ * in a dedicated csum tree search.
+ */
+ if (is_meta) {
+ sector->eb_logical = extent_start;
+ sector->eb_generation = extent_gen;
+ }
+ }
+ }
+ return 0;
+}
+
+/*
+ * To locate a stripe where there is any extent inside it.
+ *
+ * @start: logical bytenr to start the search. Result stripe should
+ * be >= @start.
+ * @found_ret: logical bytenr of the found stripe. Should also be a stripe start
+ * bytenr.
+ *
+ * Return 0 if we found such a stripe, and update @found_ret; furthermore, we
+ * fill the sfctx->sectors[] array with the needed extent info (generation for tree
+ * block, csum for data extents).
+ *
+ * Return <0 if we hit fatal errors.
+ *
+ * Return >0 if there is no more stripe containing any extent after @start.
+ */
+static int scrub_fs_locate_and_fill_stripe(struct scrub_fs_ctx *sfctx, u64 start,
+ u64 *found_ret)
+{
+ struct btrfs_fs_info *fs_info = sfctx->fs_info;
+ struct btrfs_path path = {0};
+ struct btrfs_root *extent_root = btrfs_extent_root(fs_info,
+ sfctx->cur_bg->start);
+ const u64 bg_start = sfctx->cur_bg->start;
+ const u64 bg_end = bg_start + sfctx->cur_bg->length;
+ const u32 stripe_len = sfctx->sectors_per_stripe << fs_info->sectorsize_bits;
+ u64 cur_logical = start;
+ /*
+ * The full stripe start we found. If 0, it means we haven't yet found
+ * any extent.
+ */
+ u64 stripe_start = 0;
+ u64 extent_start;
+ u64 extent_size;
+ u64 extent_flags;
+ u64 extent_gen;
+ int ret;
+
+ path.search_commit_root = true;
+ path.skip_locking = true;
+
+ /* Initial search to find any extent inside the block group. */
+ ret = find_first_extent_item(extent_root, &path, cur_logical,
+ bg_end - cur_logical);
+ /* Either error out or no more extent items. */
+ if (ret)
+ goto out;
+
+ get_extent_info(&path, &extent_start, &extent_size, &extent_flags,
+ &extent_gen);
+ /*
+ * Note here a full stripe for RAID56 may not be power of 2, thus
+ * we have to use rounddown(), not round_down().
+ */
+ stripe_start = rounddown(max(extent_start, cur_logical) - bg_start,
+ stripe_len) + bg_start;
+ *found_ret = stripe_start;
+
+ scrub_fs_fill_sector_types(sfctx, stripe_start, extent_start,
+ extent_size, extent_flags, extent_gen);
+
+ cur_logical = min(stripe_start + stripe_len, extent_start + extent_size);
+
+ /* Now iterate all the remaining extents inside the stripe. */
+ while (cur_logical < stripe_start + stripe_len) {
+ ret = find_first_extent_item(extent_root, &path, cur_logical,
+ stripe_start + stripe_len - cur_logical);
+ if (ret)
+ goto out;
+
+ get_extent_info(&path, &extent_start, &extent_size,
+ &extent_flags, &extent_gen);
+ scrub_fs_fill_sector_types(sfctx, stripe_start, extent_start,
+ extent_size, extent_flags, extent_gen);
+ cur_logical = extent_start + extent_size;
+ }
+out:
+ btrfs_release_path(&path);
+ /*
+ * Found nothing: the first find_first_extent_item() returned an error
+ * or found no extent at all; just return @ret directly.
+ */
+ if (!stripe_start)
+ return ret;
+
+ /*
+ * Now we have hit at least one extent. If ret > 0, it means we
+ * still need to handle the extents we found; in that case we
+ * return 0, so we will scrub what we found.
+ */
+ if (ret > 0)
+ ret = 0;
+ return ret;
+}
+
+static void scrub_fs_fill_one_ordered_sum(struct scrub_fs_ctx *sfctx,
+ struct btrfs_ordered_sum *sum)
+{
+ struct btrfs_fs_info *fs_info = sfctx->fs_info;
+ const u64 stripe_start = sfctx->cur_logical;
+ const u32 stripe_len = sfctx->sectors_per_stripe <<
+ fs_info->sectorsize_bits;
+ u64 cur;
+
+ ASSERT(stripe_start <= sum->bytenr &&
+ sum->bytenr + sum->len <= stripe_start + stripe_len);
+
+ for (cur = sum->bytenr; cur < sum->bytenr + sum->len;
+ cur += fs_info->sectorsize) {
+ int sector_nr = (cur - stripe_start) >> fs_info->sectorsize_bits;
+ int mirror_nr;
+ u8 *csum = sum->sums + (((cur - sum->bytenr) >>
+ fs_info->sectorsize_bits) * fs_info->csum_size);
+
+ /* Fill csum_buf first. */
+ memcpy(sfctx->csum_buf + sector_nr * fs_info->csum_size,
+ csum, fs_info->csum_size);
+
+ /* Make sectors in all mirrors point to the correct csum. */
+ for (mirror_nr = 0; mirror_nr < sfctx->nr_copies; mirror_nr++) {
+ struct scrub_fs_sector *sector =
+ scrub_fs_get_sector(sfctx, sector_nr, mirror_nr);
+
+ ASSERT(sector->flags & SCRUB_FS_SECTOR_FLAG_DATA);
+ sector->csum = sfctx->csum_buf + sector_nr * fs_info->csum_size;
+ }
+ }
+}
+
+static int scrub_fs_fill_stripe_csum(struct scrub_fs_ctx *sfctx)
+{
+ struct btrfs_fs_info *fs_info = sfctx->fs_info;
+ struct btrfs_root *csum_root = btrfs_csum_root(fs_info,
+ sfctx->cur_bg->start);
+ const u64 stripe_start = sfctx->cur_logical;
+ const u32 stripe_len = sfctx->sectors_per_stripe << fs_info->sectorsize_bits;
+ LIST_HEAD(csum_list);
+ int ret;
+
+ ret = btrfs_lookup_csums_range(csum_root, stripe_start,
+ stripe_start + stripe_len - 1,
+ &csum_list, true, false);
+ if (ret < 0)
+ return ret;
+
+ /* Extract csum_list and fill them into csum_buf. */
+ while (!list_empty(&csum_list)) {
+ struct btrfs_ordered_sum *sum;
+
+ sum = list_first_entry(&csum_list, struct btrfs_ordered_sum,
+ list);
+ scrub_fs_fill_one_ordered_sum(sfctx, sum);
+ list_del(&sum->list);
+ kfree(sum);
+ }
+ return 0;
+}
+
+/*
+ * Reset the content of pages/csum_buf and reset sector types/csum, so
+ * no leftover data for the next run.
+ */
+static void scrub_fs_reset_stripe(struct scrub_fs_ctx *sfctx)
+{
+ struct btrfs_fs_info *fs_info = sfctx->fs_info;
+ const int nr_pages = (sfctx->total_sectors <<
+ fs_info->sectorsize_bits) >> PAGE_SHIFT;
+ int i;
+
+ ASSERT(nr_pages);
+
+ /* Zero page content. */
+ for (i = 0; i < nr_pages; i++)
+ memzero_page(sfctx->pages[i], 0, PAGE_SIZE);
+
+ /* Zero csum_buf. */
+ if (sfctx->csum_buf)
+ memset(sfctx->csum_buf, 0, sfctx->sectors_per_stripe *
+ fs_info->csum_size);
+
+ /* Clear sector types and its csum pointer. */
+ for (i = 0; i < sfctx->total_sectors; i++) {
+ struct scrub_fs_sector *sector = &sfctx->sectors[i];
+
+ sector->flags = SCRUB_FS_SECTOR_FLAG_UNUSED;
+ sector->csum = NULL;
+ sector->eb_generation = 0;
+ sector->eb_logical = 0;
+ }
+}
static int scrub_fs_block_group(struct scrub_fs_ctx *sfctx,
struct btrfs_block_group *bg)
{
+ const struct btrfs_fs_info *fs_info = sfctx->fs_info;
+ bool is_data = bg->flags & BTRFS_BLOCK_GROUP_DATA;
+ u32 stripe_len;
+ u64 cur_logical = bg->start;
int ret;
/* Not yet supported, just skip RAID56 bgs for now. */
@@ -4690,8 +4959,43 @@ static int scrub_fs_block_group(struct scrub_fs_ctx *sfctx,
if (ret < 0)
return ret;
- /* Place holder for the loop itearting the sectors. */
- ret = 0;
+ /*
+ * We can only trust anything inside sfctx after
+ * scrub_fs_init_for_bg().
+ */
+ stripe_len = sfctx->sectors_per_stripe << fs_info->sectorsize_bits;
+ ASSERT(stripe_len);
+
+ while (cur_logical < bg->start + bg->length) {
+ u64 stripe_start;
+
+ ret = scrub_fs_locate_and_fill_stripe(sfctx, cur_logical,
+ &stripe_start);
+ if (ret < 0)
+ break;
+
+ /* No more extent left in the bg, we have finished the bg. */
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+
+ sfctx->cur_logical = stripe_start;
+
+ if (is_data) {
+ ret = scrub_fs_fill_stripe_csum(sfctx);
+ if (ret < 0)
+ break;
+ }
+
+ /* Placeholder for real stripe scrubbing. */
+ ret = 0;
+
+ /* Reset the stripe for next run. */
+ scrub_fs_reset_stripe(sfctx);
+
+ cur_logical = stripe_start + stripe_len;
+ }
scrub_fs_cleanup_for_bg(sfctx);
return ret;
--
2.37.3
* [PATCH PoC v2 06/10] btrfs: scrub: submit and wait for the read of each copy
2022-09-28 8:35 [PATCH PoC v2 00/10] btrfs: scrub: introduce a new family of ioctl, scrub_fs Qu Wenruo
` (4 preceding siblings ...)
2022-09-28 8:35 ` [PATCH PoC v2 05/10] btrfs: scrub: add helpers to fulfill csum/extent_generation Qu Wenruo
@ 2022-09-28 8:35 ` Qu Wenruo
2022-09-28 8:35 ` [PATCH PoC v2 07/10] btrfs: scrub: implement metadata verification code for scrub_fs Qu Wenruo
` (3 subsequent siblings)
9 siblings, 0 replies; 15+ messages in thread
From: Qu Wenruo @ 2022-09-28 8:35 UTC (permalink / raw)
To: linux-btrfs
This patch introduces a helper, scrub_fs_one_stripe().
Currently it only does the following:
- Submit bios for each copy of 64K stripe
We don't skip ranges which have no data/metadata; splitting the reads
to skip them would only eat up the IOPS of the disk.
At per-stripe initialization time all sectors are marked unused, and
the extent tree search then marks the needed sectors DATA/METADATA.
So at verification time we can simply skip the unused sectors.
(The page layout backing the stripes is sketched after this list.)
- Wait for the bios to finish
No csum verification yet.
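All mirrors of one 64K stripe share a single flat page array, mirror
after mirror; scrub_fs_get_page() and scrub_fs_get_page_offset() only
do index math on it. Below is a minimal user-space sketch of that
math (the 4K sector size and 4K page size are assumptions for the
example, not requirements):

#include <stdio.h>

/*
 * Toy model of the flat page array: 16 sectors (4K each) per 64K
 * stripe, one stripe per mirror, one sector per 4K page.
 */
#define SECTORSIZE_BITS		12
#define EX_PAGE_SHIFT		12
#define SECTORS_PER_STRIPE	16

static int page_index(int sector_nr, int mirror_nr)
{
	return (sector_nr + mirror_nr * SECTORS_PER_STRIPE) /
	       (1 << (EX_PAGE_SHIFT - SECTORSIZE_BITS));
}

static unsigned int page_offset(int sector_nr, int mirror_nr)
{
	unsigned long byte = (unsigned long)(sector_nr + mirror_nr *
			     SECTORS_PER_STRIPE) << SECTORSIZE_BITS;

	return byte & ((1UL << EX_PAGE_SHIFT) - 1);
}

int main(void)
{
	/* Sector 3 of mirror 1 lands in pages[19] at offset 0. */
	printf("index=%d offset=%u\n", page_index(3, 1),
	       page_offset(3, 1));
	return 0;
}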
Signed-off-by: Qu Wenruo <wqu@suse.com>
---
fs/btrfs/scrub.c | 220 ++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 218 insertions(+), 2 deletions(-)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index f04d2e552666..cb0973e7ffd2 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -203,6 +203,11 @@ struct scrub_ctx {
#define SCRUB_FS_SECTOR_FLAG_META (1 << 2)
#define SCRUB_FS_SECTOR_FLAG_PARITY (1 << 3)
+/* This marks if the sector belongs to a missing device. */
+#define SCRUB_FS_SECTOR_FLAG_DEV_MISSING (1 << 4)
+#define SCRUB_FS_SECTOR_FLAG_IO_ERROR (1 << 5)
+#define SCRUB_FS_SECTOR_FLAG_IO_DONE (1 << 6)
+
/*
* Represent a sector.
*
@@ -237,6 +242,14 @@ struct scrub_fs_sector {
};
};
+/* Endio ctrl for each device. */
+struct scrub_fs_endio_ctrl {
+ struct scrub_fs_ctx *sfctx;
+
+ /* To locate the real sectors of the stripe. */
+ int mirror_nr;
+};
+
/* This structure should only has a lifespan inside btrfs_scrub_fs(). */
struct scrub_fs_ctx {
struct btrfs_fs_info *fs_info;
@@ -286,6 +299,9 @@ struct scrub_fs_ctx {
*/
int total_sectors;
+ /* Endio control for all read operations. */
+ struct scrub_fs_endio_ctrl *endio_ctrls;
+
/* Page array for above total_sectors. */
struct page **pages;
@@ -304,6 +320,8 @@ struct scrub_fs_ctx {
* would point to the same location inside the buffer.
*/
u8 *csum_buf;
+
+ wait_queue_head_t wait;
};
struct scrub_warning {
@@ -4565,6 +4583,7 @@ static struct scrub_fs_ctx *scrub_fs_alloc_ctx(struct btrfs_fs_info *fs_info,
sfctx->fs_info = fs_info;
sfctx->readonly = readonly;
atomic_set(&sfctx->bios_under_io, 0);
+ init_waitqueue_head(&sfctx->wait);
return sfctx;
error:
kfree(sfctx);
@@ -4580,6 +4599,9 @@ static void scrub_fs_cleanup_for_bg(struct scrub_fs_ctx *sfctx)
int i;
const int nr_pages = sfctx->nr_copies * (BTRFS_STRIPE_LEN >> PAGE_SHIFT);
+ kfree(sfctx->endio_ctrls);
+ sfctx->endio_ctrls = NULL;
+
if (sfctx->pages) {
for (i = 0; i < nr_pages; i++) {
if (sfctx->pages[i]) {
@@ -4624,6 +4646,7 @@ static int scrub_fs_init_for_bg(struct scrub_fs_ctx *sfctx,
ASSERT(!sfctx->pages);
ASSERT(!sfctx->sectors);
ASSERT(!sfctx->csum_buf);
+ ASSERT(!sfctx->endio_ctrls);
read_lock(&map_tree->lock);
em = lookup_extent_mapping(map_tree, bg->start, bg->length);
@@ -4657,6 +4680,16 @@ static int scrub_fs_init_for_bg(struct scrub_fs_ctx *sfctx,
nr_pages = (BTRFS_STRIPE_LEN >> PAGE_SHIFT) * sfctx->nr_copies;
+ sfctx->endio_ctrls = kcalloc(sfctx->nr_copies,
+ sizeof(struct scrub_fs_endio_ctrl), GFP_KERNEL);
+ if (!sfctx->endio_ctrls)
+ goto enomem;
+
+ for (i = 0; i < sfctx->nr_copies; i++) {
+ sfctx->endio_ctrls[i].sfctx = sfctx;
+ sfctx->endio_ctrls[i].mirror_nr = i;
+ }
+
sfctx->pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
if (!sfctx->pages)
goto enomem;
@@ -4942,6 +4975,188 @@ static void scrub_fs_reset_stripe(struct scrub_fs_ctx *sfctx)
}
}
+static void mark_missing_dev_sectors(struct scrub_fs_ctx *sfctx,
+ int stripe_nr)
+{
+ struct btrfs_fs_info *fs_info = sfctx->fs_info;
+ const int sectors_per_stripe = BTRFS_STRIPE_LEN >>
+ fs_info->sectorsize_bits;
+ int i;
+
+ for (i = 0; i < sectors_per_stripe; i++) {
+ struct scrub_fs_sector *sector =
+ scrub_fs_get_sector(sfctx, i, stripe_nr);
+
+ sector->flags |= SCRUB_FS_SECTOR_FLAG_DEV_MISSING;
+ }
+}
+
+static struct page *scrub_fs_get_page(struct scrub_fs_ctx *sfctx,
+ int sector_nr, int mirror_nr)
+{
+ int page_index;
+
+ scrub_fs_check_sector_mirror_nr(sfctx, sector_nr, mirror_nr);
+
+ page_index = (sector_nr + mirror_nr * sfctx->sectors_per_stripe) /
+ (PAGE_SIZE >> sfctx->fs_info->sectorsize_bits);
+
+ ASSERT(sfctx->pages[page_index]);
+ return sfctx->pages[page_index];
+}
+
+static unsigned int scrub_fs_get_page_offset(struct scrub_fs_ctx *sfctx,
+ int sector_nr, int mirror_nr)
+{
+ int index;
+
+ scrub_fs_check_sector_mirror_nr(sfctx, sector_nr, mirror_nr);
+ index = sector_nr + mirror_nr * sfctx->sectors_per_stripe;
+
+ return offset_in_page(index << sfctx->fs_info->sectorsize_bits);
+}
+
+static void scrub_fs_read_endio(struct bio *bio)
+{
+ struct scrub_fs_endio_ctrl *endio_ctrl = bio->bi_private;
+ struct scrub_fs_ctx *sfctx = endio_ctrl->sfctx;
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
+ int bio_size = 0;
+ bool error = (bio->bi_status != BLK_STS_OK);
+ const int mirror_nr = endio_ctrl->mirror_nr;
+ int i;
+
+ /* Grab the bio size for later sanity checks. */
+ bio_for_each_segment_all(bvec, bio, iter_all)
+ bio_size += bvec->bv_len;
+
+ /* We always submit a bio for a stripe length. */
+ ASSERT(bio_size == BTRFS_STRIPE_LEN);
+
+ for (i = 0; i < sfctx->sectors_per_stripe; i++) {
+ struct scrub_fs_sector *sector =
+ scrub_fs_get_sector(sfctx, i, mirror_nr);
+
+ /*
+ * Here we only set the sector flags without any stat update;
+ * that will be done by the main thread during verification.
+ */
+ if (error) {
+ sector->flags |= SCRUB_FS_SECTOR_FLAG_IO_ERROR;
+ continue;
+ }
+ sector->flags |= SCRUB_FS_SECTOR_FLAG_IO_DONE;
+ }
+ atomic_dec(&sfctx->bios_under_io);
+ wake_up(&sfctx->wait);
+ bio_put(bio);
+}
+
+static void submit_stripe_read_bio(struct scrub_fs_ctx *sfctx,
+ struct btrfs_io_context *bioc,
+ int mirror_nr)
+{
+ struct btrfs_fs_info *fs_info = sfctx->fs_info;
+ struct btrfs_io_stripe *stripe = &bioc->stripes[mirror_nr];
+ struct btrfs_device *dev = stripe->dev;
+ struct bio *bio;
+ int ret;
+ int i;
+
+ /*
+ * The device is missing; just mark its sectors accordingly and
+ * continue to the next copy.
+ */
+ if (!dev || !dev->bdev) {
+ mark_missing_dev_sectors(sfctx, mirror_nr);
+ return;
+ }
+
+ /* Submit a bio to read one full stripe length. */
+ bio = bio_alloc(dev->bdev, BIO_MAX_VECS,
+ REQ_OP_READ | REQ_BACKGROUND, GFP_KERNEL);
+
+ /* Bios are backed by a mempool, the allocation should not fail. */
+ ASSERT(bio);
+
+ bio->bi_iter.bi_sector = stripe->physical >> SECTOR_SHIFT;
+ for (i = 0; i < sfctx->sectors_per_stripe; i++) {
+ struct page *page = scrub_fs_get_page(sfctx, i, mirror_nr);
+ unsigned int page_off = scrub_fs_get_page_offset(sfctx, i,
+ mirror_nr);
+
+ ret = bio_add_page(bio, page, fs_info->sectorsize, page_off);
+
+ /*
+ * Should not fail as we will at most add STRIPE_LEN / 4K
+ * (aka, 16) sectors, way smaller than BIO_MAX_VECS.
+ */
+ ASSERT(ret == fs_info->sectorsize);
+ }
+
+ bio->bi_private = &sfctx->endio_ctrls[mirror_nr];
+ bio->bi_end_io = scrub_fs_read_endio;
+ atomic_inc(&sfctx->bios_under_io);
+ submit_bio(bio);
+}
+
+static int scrub_fs_one_stripe(struct scrub_fs_ctx *sfctx)
+{
+ struct btrfs_fs_info *fs_info = sfctx->fs_info;
+ struct btrfs_io_context *bioc = NULL;
+ u64 mapped_len = BTRFS_STRIPE_LEN;
+ int i;
+ int ret;
+
+ /* We should be at a stripe start inside the current block group. */
+ ASSERT(sfctx->cur_bg->start <= sfctx->cur_logical &&
+ sfctx->cur_logical < sfctx->cur_bg->start +
+ sfctx->cur_bg->length);
+ ASSERT(IS_ALIGNED(sfctx->cur_logical - sfctx->cur_bg->start,
+ BTRFS_STRIPE_LEN));
+
+ btrfs_bio_counter_inc_blocked(fs_info);
+ ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
+ sfctx->cur_logical, &mapped_len, &bioc);
+ if (ret < 0)
+ goto out;
+
+ if (mapped_len < BTRFS_STRIPE_LEN) {
+ btrfs_err_rl(fs_info,
+ "get short map for bytenr %llu, got mapped length %llu expect %u",
+ sfctx->cur_logical, mapped_len, BTRFS_STRIPE_LEN);
+ ret = -EUCLEAN;
+ sfctx->stat.nr_fatal_errors++;
+ goto out;
+ }
+
+ if (bioc->num_stripes != sfctx->nr_copies) {
+ btrfs_err_rl(fs_info,
+ "got unexpected number of stripes, got %d stripes expect %d",
+ bioc->num_stripes, sfctx->nr_copies);
+ ret = -EUCLEAN;
+ sfctx->stat.nr_fatal_errors++;
+ goto out;
+ }
+
+ for (i = 0; i < sfctx->nr_copies; i++)
+ submit_stripe_read_bio(sfctx, bioc, i);
+ wait_event(sfctx->wait, atomic_read(&sfctx->bios_under_io) == 0);
+
+ /*
+ * Placeholder to update the accounting.
+ *
+ * Endio functions should have done the verification and updated
+ * sector->flags, but they don't update the accounting as they
+ * don't have the full view of other mirrors.
+ */
+out:
+ btrfs_put_bioc(bioc);
+ btrfs_bio_counter_dec(fs_info);
+ return ret;
+}
+
static int scrub_fs_block_group(struct scrub_fs_ctx *sfctx,
struct btrfs_block_group *bg)
{
@@ -4988,8 +5203,9 @@ static int scrub_fs_block_group(struct scrub_fs_ctx *sfctx,
break;
}
- /* Placeholder for real stripe scrubbing. */
- ret = 0;
+ ret = scrub_fs_one_stripe(sfctx);
+ if (ret < 0)
+ break;
/* Reset the stripe for next run. */
scrub_fs_reset_stripe(sfctx);
--
2.37.3
* [PATCH PoC v2 07/10] btrfs: scrub: implement metadata verification code for scrub_fs
2022-09-28 8:35 [PATCH PoC v2 00/10] btrfs: scrub: introduce a new family of ioctl, scrub_fs Qu Wenruo
` (5 preceding siblings ...)
2022-09-28 8:35 ` [PATCH PoC v2 06/10] btrfs: scrub: submit and wait for the read of each copy Qu Wenruo
@ 2022-09-28 8:35 ` Qu Wenruo
2022-09-28 8:35 ` [PATCH PoC v2 08/10] btrfs: scrub: implement data " Qu Wenruo
` (2 subsequent siblings)
9 siblings, 0 replies; 15+ messages in thread
From: Qu Wenruo @ 2022-09-28 8:35 UTC (permalink / raw)
To: linux-btrfs
This patch introduces the following functions:
- scrub_fs_verify_one_stripe()
The entrance for all verification code, which iterates every sector
in the same vertical stripe.
- scrub_fs_verify_meta()
The helper to verify metadata in one vertical stripe.
(Since there is no RAID56 support yet, one vertical stripe just
contains the same data from different mirrors.)
- scrub_fs_verify_one_meta()
This does the real work; the checks include:
* Basic metadata header checks (bytenr, fsid, level)
For this part, we refactor those checks from
validate_extent_buffer() into btrfs_validate_eb_basic(),
allowing us to suppress the error messages.
* Checksum verification
For this part, we refactor this one check from
validate_extent_buffer() into btrfs_validate_eb_csum(),
allowing us to suppress the error message.
* Tree check verification (NEW)
This is the new one; the old scrub code never fully utilized the
extent buffer facilities, thus it did only very basic checks.
Now scrub_fs has (almost) the same checks as the tree block read
routine (the combined check order is sketched below).
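A condensed sketch of that order, using the helpers named above (not
a literal excerpt from the patch; the dummy eb setup and the
per-sector flag propagation are omitted):

/*
 * Sketch of the per-tree-block check order in scrub_fs_verify_meta();
 * it only returns which sector flag would be set.
 */
static unsigned int sketch_verify_one_meta(struct extent_buffer *eb,
					   u64 expected_gen)
{
	int ret;

	/* Basic header checks: bytenr, fsid, level. */
	if (btrfs_validate_eb_basic(eb, false) < 0)
		return SCRUB_FS_SECTOR_FLAG_INVALID;

	/* Checksum of the whole tree block. */
	if (btrfs_validate_eb_csum(eb, false) < 0)
		return SCRUB_FS_SECTOR_FLAG_BAD_CSUM;

	/* Full tree-checker verification. */
	if (btrfs_header_level(eb) > 0)
		ret = btrfs_check_node(eb);
	else
		ret = btrfs_check_leaf_full(eb);
	if (ret < 0)
		return SCRUB_FS_SECTOR_FLAG_INVALID;

	/* Transid check against the expected generation. */
	if (btrfs_header_generation(eb) != expected_gen)
		return SCRUB_FS_SECTOR_FLAG_TRANSID_MISMATCH;

	return SCRUB_FS_SECTOR_FLAG_GOOD;
}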
Signed-off-by: Qu Wenruo <wqu@suse.com>
---
fs/btrfs/disk-io.c | 83 +++++++++++++++++++----------
fs/btrfs/disk-io.h | 2 +
fs/btrfs/scrub.c | 127 +++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 186 insertions(+), 26 deletions(-)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c10d368aed7b..1ee05c72b210 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -456,55 +456,87 @@ static int check_tree_block_fsid(struct extent_buffer *eb)
return 1;
}
-/* Do basic extent buffer checks at read time */
-static int validate_extent_buffer(struct extent_buffer *eb)
+/*
+ * The very basic extent buffer checks, including:
+ *
+ * - Bytenr check
+ * - FSID check
+ * - Level check
+ *
+ * If @error_message is true, it will output error messages (rate limited).
+ */
+int btrfs_validate_eb_basic(struct extent_buffer *eb, bool error_message)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
u64 found_start;
- const u32 csum_size = fs_info->csum_size;
u8 found_level;
- u8 result[BTRFS_CSUM_SIZE];
- const u8 *header_csum;
int ret = 0;
found_start = btrfs_header_bytenr(eb);
if (found_start != eb->start) {
- btrfs_err_rl(fs_info,
+ if (error_message)
+ btrfs_err_rl(fs_info,
"bad tree block start, mirror %u want %llu have %llu",
- eb->read_mirror, eb->start, found_start);
- ret = -EIO;
- goto out;
+ eb->read_mirror, eb->start, found_start);
+ return -EIO;
}
if (check_tree_block_fsid(eb)) {
- btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u",
- eb->start, eb->read_mirror);
- ret = -EIO;
- goto out;
+ if (error_message)
+ btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u",
+ eb->start, eb->read_mirror);
+ return -EIO;
}
found_level = btrfs_header_level(eb);
if (found_level >= BTRFS_MAX_LEVEL) {
- btrfs_err(fs_info,
- "bad tree block level, mirror %u level %d on logical %llu",
- eb->read_mirror, btrfs_header_level(eb), eb->start);
- ret = -EIO;
- goto out;
+ if (error_message)
+ btrfs_err(fs_info,
+ "bad tree block level, mirror %u level %d on logical %llu",
+ eb->read_mirror, btrfs_header_level(eb), eb->start);
+ return -EIO;
}
+ return ret;
+}
+
+int btrfs_validate_eb_csum(struct extent_buffer *eb, bool error_message)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ u8 result[BTRFS_CSUM_SIZE];
+ const u8 *header_csum;
+ const u32 csum_size = fs_info->csum_size;
csum_tree_block(eb, result);
header_csum = page_address(eb->pages[0]) +
get_eb_offset_in_page(eb, offsetof(struct btrfs_header, csum));
if (memcmp(result, header_csum, csum_size) != 0) {
- btrfs_warn_rl(fs_info,
+ if (error_message)
+ btrfs_warn_rl(fs_info,
"checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d",
- eb->start, eb->read_mirror,
- CSUM_FMT_VALUE(csum_size, header_csum),
- CSUM_FMT_VALUE(csum_size, result),
- btrfs_header_level(eb));
- ret = -EUCLEAN;
- goto out;
+ eb->start, eb->read_mirror,
+ CSUM_FMT_VALUE(csum_size, header_csum),
+ CSUM_FMT_VALUE(csum_size, result),
+ btrfs_header_level(eb));
+ return -EUCLEAN;
}
+ return 0;
+}
+
+/* Do basic extent buffer checks at read time */
+static inline int validate_extent_buffer(struct extent_buffer *eb)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ u8 found_level;
+ int ret = 0;
+
+ ret = btrfs_validate_eb_basic(eb, true);
+ if (ret < 0)
+ return ret;
+ ret = btrfs_validate_eb_csum(eb, true);
+ if (ret < 0)
+ return ret;
+
+ found_level = btrfs_header_level(eb);
/*
* If this is a leaf block and it is corrupt, set the corrupt bit so
* that we don't try and read the other copies of this block, just
@@ -524,7 +556,6 @@ static int validate_extent_buffer(struct extent_buffer *eb)
btrfs_err(fs_info,
"read time tree block corruption detected on logical %llu mirror %u",
eb->start, eb->read_mirror);
-out:
return ret;
}
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c67c15d4d20b..65110e8e0c8e 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -83,6 +83,8 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
struct page *page, u64 start, u64 end,
int mirror);
+int btrfs_validate_eb_basic(struct extent_buffer *eb, bool error_message);
+int btrfs_validate_eb_csum(struct extent_buffer *eb, bool error_message);
void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index cb0973e7ffd2..a693e35d172d 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -20,6 +20,7 @@
#include "rcu-string.h"
#include "raid56.h"
#include "block-group.h"
+#include "tree-checker.h"
#include "zoned.h"
/*
@@ -208,6 +209,18 @@ struct scrub_ctx {
#define SCRUB_FS_SECTOR_FLAG_IO_ERROR (1 << 5)
#define SCRUB_FS_SECTOR_FLAG_IO_DONE (1 << 6)
+/* This marks if the sector is a good one (aka, passed all checks). */
+#define SCRUB_FS_SECTOR_FLAG_GOOD (1 << 7)
+
+/* For both metadata and data. */
+#define SCRUB_FS_SECTOR_FLAG_BAD_CSUM (1 << 8)
+
+/* Only for metadata, indicating some invalid values. */
+#define SCRUB_FS_SECTOR_FLAG_INVALID (1 << 9)
+
+/* Only for metadata, transid mismatch. */
+#define SCRUB_FS_SECTOR_FLAG_TRANSID_MISMATCH (1 << 10)
+
/*
* Represent a sector.
*
@@ -248,6 +261,12 @@ struct scrub_fs_endio_ctrl {
/* To locate the real sectors of the stripe. */
int mirror_nr;
+
+ /*
+ * Dummy extent buffer for metadata verification, so that we can
+ * utilize all eb related accessors.
+ */
+ struct extent_buffer *dummy_eb;
};
/* This structure should only has a lifespan inside btrfs_scrub_fs(). */
@@ -4599,6 +4618,11 @@ static void scrub_fs_cleanup_for_bg(struct scrub_fs_ctx *sfctx)
int i;
const int nr_pages = sfctx->nr_copies * (BTRFS_STRIPE_LEN >> PAGE_SHIFT);
+ if (sfctx->endio_ctrls) {
+ ASSERT(sfctx->nr_copies);
+ for (i = 0; i < sfctx->nr_copies; i++)
+ free_extent_buffer(sfctx->endio_ctrls[i].dummy_eb);
+ }
kfree(sfctx->endio_ctrls);
sfctx->endio_ctrls = NULL;
@@ -4688,6 +4712,13 @@ static int scrub_fs_init_for_bg(struct scrub_fs_ctx *sfctx,
for (i = 0; i < sfctx->nr_copies; i++) {
sfctx->endio_ctrls[i].sfctx = sfctx;
sfctx->endio_ctrls[i].mirror_nr = i;
+ if (bg->flags & (BTRFS_BLOCK_GROUP_METADATA |
+ BTRFS_BLOCK_GROUP_SYSTEM)) {
+ sfctx->endio_ctrls[i].dummy_eb =
+ alloc_dummy_extent_buffer(fs_info, 0);
+ if (!sfctx->endio_ctrls[i].dummy_eb)
+ goto enomem;
+ }
}
sfctx->pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
@@ -5016,10 +5047,81 @@ static unsigned int scrub_fs_get_page_offset(struct scrub_fs_ctx *sfctx,
return offset_in_page(index << sfctx->fs_info->sectorsize_bits);
}
+static void scrub_fs_verify_meta(struct scrub_fs_endio_ctrl *endio_ctrl,
+ int sector_nr, int mirror_nr)
+{
+ struct scrub_fs_ctx *sfctx = endio_ctrl->sfctx;
+ struct btrfs_fs_info *fs_info = sfctx->fs_info;
+ struct extent_buffer *eb = endio_ctrl->dummy_eb;
+ struct scrub_fs_sector *sector =
+ scrub_fs_get_sector(sfctx, sector_nr, mirror_nr);
+ const u64 logical = sector->eb_logical;
+ const u64 expected_gen = sector->eb_generation;
+ unsigned int set_flag;
+ int ret;
+ int i;
+
+ scrub_fs_check_sector_mirror_nr(sfctx, sector_nr, mirror_nr);
+ ASSERT(eb);
+
+ eb->start = logical;
+
+ /* Copy all the metadata sectors into the dummy eb. */
+ for (i = 0; i < fs_info->nodesize >> fs_info->sectorsize_bits; i++) {
+ struct page *page = scrub_fs_get_page(sfctx, sector_nr + i,
+ mirror_nr);
+ int page_off = scrub_fs_get_page_offset(sfctx, sector_nr + i,
+ mirror_nr);
+ int off_in_eb = i << fs_info->sectorsize_bits;
+
+ write_extent_buffer(eb, page_address(page) + page_off,
+ off_in_eb, fs_info->sectorsize);
+ }
+ /* Basic extent buffer checks. */
+ ret = btrfs_validate_eb_basic(eb, false);
+ if (ret < 0) {
+ set_flag = SCRUB_FS_SECTOR_FLAG_INVALID;
+ goto out;
+ }
+ /* Csum checks. */
+ ret = btrfs_validate_eb_csum(eb, false);
+ if (ret < 0) {
+ set_flag = SCRUB_FS_SECTOR_FLAG_BAD_CSUM;
+ goto out;
+ }
+
+ /* Full tree-checker verification. */
+ if (btrfs_header_level(eb) > 0)
+ ret = btrfs_check_node(eb);
+ else
+ ret = btrfs_check_leaf_full(eb);
+ if (ret < 0) {
+ set_flag = SCRUB_FS_SECTOR_FLAG_INVALID;
+ goto out;
+ }
+
+ /* Transid check */
+ if (btrfs_header_generation(eb) != expected_gen) {
+ set_flag = SCRUB_FS_SECTOR_FLAG_TRANSID_MISMATCH;
+ goto out;
+ }
+
+ /* All checks passed. */
+ set_flag = SCRUB_FS_SECTOR_FLAG_GOOD;
+out:
+ for (i = 0; i < fs_info->nodesize >> fs_info->sectorsize_bits; i++) {
+ struct scrub_fs_sector *sector = scrub_fs_get_sector(sfctx,
+ sector_nr + i, mirror_nr);
+
+ sector->flags |= set_flag;
+ }
+}
+
static void scrub_fs_read_endio(struct bio *bio)
{
struct scrub_fs_endio_ctrl *endio_ctrl = bio->bi_private;
struct scrub_fs_ctx *sfctx = endio_ctrl->sfctx;
+ struct btrfs_fs_info *fs_info = sfctx->fs_info;
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
int bio_size = 0;
@@ -5034,6 +5136,7 @@ static void scrub_fs_read_endio(struct bio *bio)
/* We always submit a bio for a stripe length. */
ASSERT(bio_size == BTRFS_STRIPE_LEN);
+ /* First loop to update IO_DONE flags. */
for (i = 0; i < sfctx->sectors_per_stripe; i++) {
struct scrub_fs_sector *sector =
scrub_fs_get_sector(sfctx, i, mirror_nr);
@@ -5048,6 +5151,30 @@ static void scrub_fs_read_endio(struct bio *bio)
}
sector->flags |= SCRUB_FS_SECTOR_FLAG_IO_DONE;
}
+ if (error)
+ goto out;
+
+ /* Second loop to do the verification. */
+ for (i = 0; i < sfctx->sectors_per_stripe; i++) {
+ struct scrub_fs_sector *sector =
+ scrub_fs_get_sector(sfctx, i, mirror_nr);
+
+ if (sector->flags & SCRUB_FS_SECTOR_FLAG_UNUSED ||
+ !(sector->flags & SCRUB_FS_SECTOR_FLAG_IO_DONE))
+ continue;
+
+ /* Placeholder for data verification. */
+ if (sector->flags & SCRUB_FS_SECTOR_FLAG_DATA)
+ continue;
+
+ /* We must be at a metadata sector. */
+ ASSERT(sector->flags & SCRUB_FS_SECTOR_FLAG_META);
+ scrub_fs_verify_meta(endio_ctrl, i, mirror_nr);
+ /* Skip to the end of the tree block. */
+ i += (fs_info->nodesize >> fs_info->sectorsize_bits) - 1;
+ }
+
+out:
atomic_dec(&sfctx->bios_under_io);
wake_up(&sfctx->wait);
bio_put(bio);
--
2.37.3
* [PATCH PoC v2 08/10] btrfs: scrub: implement data verification code for scrub_fs
2022-09-28 8:35 [PATCH PoC v2 00/10] btrfs: scrub: introduce a new family of ioctl, scrub_fs Qu Wenruo
` (6 preceding siblings ...)
2022-09-28 8:35 ` [PATCH PoC v2 07/10] btrfs: scrub: implement metadata verification code for scrub_fs Qu Wenruo
@ 2022-09-28 8:35 ` Qu Wenruo
2022-09-28 8:35 ` [PATCH PoC v2 09/10] btrfs: scrub: implement the later stage of verification Qu Wenruo
2022-09-28 8:35 ` [PATCH PoC v2 10/10] btrfs: scrub: implement the repair (writeback) functionality Qu Wenruo
9 siblings, 0 replies; 15+ messages in thread
From: Qu Wenruo @ 2022-09-28 8:35 UTC (permalink / raw)
To: linux-btrfs
Data verification is much simpler: we only need to verify the csum,
and we already have a very handy helper for it. The
recompute-and-compare pattern is sketched below.
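The default btrfs data csum is CRC32C; here is a dependency-free
user-space sketch of the pattern. The in-kernel helper goes through
the crypto API instead, so the seeding/finalization details below are
illustrative only:

#include <stdint.h>
#include <stdio.h>

/* Slow bitwise CRC32C (Castagnoli polynomial, reflected). */
static uint32_t crc32c(const void *buf, size_t len)
{
	const uint8_t *p = buf;
	uint32_t crc = ~0u;

	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0x82f63b78 & -(crc & 1));
	}
	return ~crc;
}

int main(void)
{
	uint8_t sector[4096] = { 0 };
	/* The "csum tree" value, computed when the data was written. */
	uint32_t stored = crc32c(sector, sizeof(sector));

	sector[42] = 0xff;	/* simulate on-disk corruption */
	printf("%s\n", crc32c(sector, sizeof(sector)) == stored ?
	       "csum ok" : "csum mismatch");
	return 0;
}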
Signed-off-by: Qu Wenruo <wqu@suse.com>
---
fs/btrfs/scrub.c | 40 ++++++++++++++++++++++++++++++++++++++--
1 file changed, 38 insertions(+), 2 deletions(-)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a693e35d172d..efe49a04dceb 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -5117,6 +5117,41 @@ static void scrub_fs_verify_meta(struct scrub_fs_endio_ctrl *endio_ctrl,
}
}
+static void scrub_fs_verify_data(struct scrub_fs_endio_ctrl *endio_ctrl,
+ int sector_nr, int mirror_nr)
+{
+ struct scrub_fs_ctx *sfctx = endio_ctrl->sfctx;
+ struct btrfs_fs_info *fs_info = sfctx->fs_info;
+ struct scrub_fs_sector *sector =
+ scrub_fs_get_sector(sfctx, sector_nr, mirror_nr);
+ u8 csum_result[BTRFS_CSUM_SIZE] = {0};
+ u8 *csum_expected = sector->csum;
+ unsigned int set_flag;
+ int ret;
+
+ scrub_fs_check_sector_mirror_nr(sfctx, sector_nr, mirror_nr);
+
+ /*
+ * No csum case; we cannot determine if it's a preallocated extent
+ * or real NODATASUM data. Just mark it good unconditionally.
+ */
+ if (!csum_expected) {
+ set_flag = SCRUB_FS_SECTOR_FLAG_GOOD;
+ goto out;
+ }
+
+ ret = btrfs_check_sector_csum(fs_info,
+ scrub_fs_get_page(sfctx, sector_nr, mirror_nr),
+ scrub_fs_get_page_offset(sfctx, sector_nr, mirror_nr),
+ csum_result, csum_expected);
+ if (ret < 0)
+ set_flag = SCRUB_FS_SECTOR_FLAG_BAD_CSUM;
+ else
+ set_flag = SCRUB_FS_SECTOR_FLAG_GOOD;
+out:
+ sector->flags |= set_flag;
+}
+
static void scrub_fs_read_endio(struct bio *bio)
{
struct scrub_fs_endio_ctrl *endio_ctrl = bio->bi_private;
@@ -5163,9 +5198,10 @@ static void scrub_fs_read_endio(struct bio *bio)
!(sector->flags & SCRUB_FS_SECTOR_FLAG_IO_DONE))
continue;
- /* Placeholder for data verification. */
- if (sector->flags & SCRUB_FS_SECTOR_FLAG_DATA)
+ if (sector->flags & SCRUB_FS_SECTOR_FLAG_DATA) {
+ scrub_fs_verify_data(endio_ctrl, i, mirror_nr);
continue;
+ }
/* We must be at a metadata sector. */
ASSERT(sector->flags & SCRUB_FS_SECTOR_FLAG_META);
--
2.37.3
* [PATCH PoC v2 09/10] btrfs: scrub: implement the later stage of verification
2022-09-28 8:35 [PATCH PoC v2 00/10] btrfs: scrub: introduce a new family of ioctl, scrub_fs Qu Wenruo
` (7 preceding siblings ...)
2022-09-28 8:35 ` [PATCH PoC v2 08/10] btrfs: scrub: implement data " Qu Wenruo
@ 2022-09-28 8:35 ` Qu Wenruo
2022-09-28 8:35 ` [PATCH PoC v2 10/10] btrfs: scrub: implement the repair (writeback) functionality Qu Wenruo
9 siblings, 0 replies; 15+ messages in thread
From: Qu Wenruo @ 2022-09-28 8:35 UTC (permalink / raw)
To: linux-btrfs
At endio time we have verified all the data/metadata in each mirror
and updated sector->flags.
But we have not yet updated sfctx->stat, as one design goal is to
avoid concurrent access to sfctx->stat.
Furthermore, at endio time we are not aware whether any other mirror
has a good copy.
Thus we have to do the final verification in the main thread, after
all endio functions have finished.
At this later stage, we can update sfctx->stat in a single thread
without lock protection, and determine whether certain sectors can
be repaired.
Furthermore, if we find a good copy, we copy its content over any
bad copies so they can be repaired later. A toy model of the
two-stage design follows.
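In the user-space model below (illustrative only, all names are made
up), each "endio" thread writes only its own mirror's flag slots; the
aggregation then runs single-threaded after everything has finished,
so the shared counter needs no lock:

#include <pthread.h>
#include <stdio.h>

#define NR_MIRRORS	2
#define NR_SECTORS	16
#define FLAG_IO_DONE	(1 << 0)

/* One flag slot per (mirror, sector), like sector->flags. */
static unsigned int flags[NR_MIRRORS][NR_SECTORS];

/* Stage 1: concurrent, but each thread touches only its own row. */
static void *endio(void *arg)
{
	long mirror = (long)arg;

	for (int i = 0; i < NR_SECTORS; i++)
		flags[mirror][i] |= FLAG_IO_DONE;
	return NULL;
}

int main(void)
{
	pthread_t t[NR_MIRRORS];
	long nr_done = 0;

	for (long m = 0; m < NR_MIRRORS; m++)
		pthread_create(&t[m], NULL, endio, (void *)m);
	for (int m = 0; m < NR_MIRRORS; m++)
		pthread_join(t[m], NULL);

	/* Stage 2: single-threaded accounting, no lock needed. */
	for (int m = 0; m < NR_MIRRORS; m++)
		for (int i = 0; i < NR_SECTORS; i++)
			nr_done += !!(flags[m][i] & FLAG_IO_DONE);
	printf("%ld sectors done\n", nr_done);
	return 0;
}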
Signed-off-by: Qu Wenruo <wqu@suse.com>
---
fs/btrfs/scrub.c | 108 +++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 104 insertions(+), 4 deletions(-)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index efe49a04dceb..89735ff6143a 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -221,6 +221,9 @@ struct scrub_ctx {
/* Only for metadata, transid mismatch. */
#define SCRUB_FS_SECTOR_FLAG_TRANSID_MISMATCH (1 << 10)
+/* For both metadata and data, shows the sector can and needs to be repaired. */
+#define SCRUB_FS_SECTOR_FLAG_RECOVERABLE (1 << 11)
+
/*
* Represent a sector.
*
@@ -5264,6 +5267,101 @@ static void submit_stripe_read_bio(struct scrub_fs_ctx *sfctx,
submit_bio(bio);
}
+static void scrub_fs_copy_sector(struct scrub_fs_ctx *sfctx, int sector_nr,
+ int good_mirror, int bad_mirror)
+{
+ struct page *good_page;
+ struct page *bad_page;
+ size_t offset;
+
+ scrub_fs_check_sector_mirror_nr(sfctx, sector_nr, good_mirror);
+ scrub_fs_check_sector_mirror_nr(sfctx, sector_nr, bad_mirror);
+
+ good_page = scrub_fs_get_page(sfctx, sector_nr, good_mirror);
+ bad_page = scrub_fs_get_page(sfctx, sector_nr, bad_mirror);
+
+ ASSERT(good_page);
+ ASSERT(bad_page);
+
+ /*
+ * Since both good and bad sectors are in the same vertical stripe,
+ * their page offset should be the same.
+ */
+ offset = scrub_fs_get_page_offset(sfctx, sector_nr, good_mirror);
+ memcpy_page(bad_page, offset, good_page, offset, sfctx->fs_info->sectorsize);
+}
+
+static void scrub_fs_update_veritical(struct scrub_fs_ctx *sfctx,
+ int sector_nr)
+{
+ int mirror_nr;
+ u32 sectorsize = sfctx->fs_info->sectorsize;
+ int good_mirror = -1;
+ bool is_data = (sfctx->sectors[sector_nr].flags &
+ SCRUB_FS_SECTOR_FLAG_DATA);
+
+ scrub_fs_check_sector_mirror_nr(sfctx, sector_nr, 0);
+
+ for (mirror_nr = 0; mirror_nr < sfctx->nr_copies; mirror_nr++) {
+ struct scrub_fs_sector *sector =
+ scrub_fs_get_sector(sfctx, sector_nr, mirror_nr);
+
+ if (sector->flags & SCRUB_FS_SECTOR_FLAG_UNUSED)
+ continue;
+
+ if (is_data)
+ sfctx->stat.data_scrubbed += sectorsize;
+ else
+ sfctx->stat.meta_scrubbed += sectorsize;
+
+ if (is_data && !sector->csum)
+ sfctx->stat.data_nocsum += sectorsize;
+
+ if (!(sector->flags & SCRUB_FS_SECTOR_FLAG_IO_DONE)) {
+ if (is_data)
+ sfctx->stat.data_io_fail += sectorsize;
+ else
+ sfctx->stat.meta_io_fail += sectorsize;
+ }
+ if (sector->flags & SCRUB_FS_SECTOR_FLAG_INVALID) {
+ ASSERT(!is_data);
+ sfctx->stat.meta_invalid += sectorsize;
+ }
+ if (sector->flags & SCRUB_FS_SECTOR_FLAG_TRANSID_MISMATCH) {
+ ASSERT(!is_data);
+ sfctx->stat.meta_bad_transid += sectorsize;
+ }
+ if (sector->flags & SCRUB_FS_SECTOR_FLAG_BAD_CSUM) {
+ if (is_data)
+ sfctx->stat.data_csum_mismatch += sectorsize;
+ else
+ sfctx->stat.meta_bad_csum += sectorsize;
+ }
+ if (sector->flags & SCRUB_FS_SECTOR_FLAG_GOOD)
+ good_mirror = mirror_nr;
+ }
+ if (good_mirror < 0)
+ return;
+
+ /* Mark the bad sectors as repairable. */
+ for (mirror_nr = 0; mirror_nr < sfctx->nr_copies; mirror_nr++) {
+ struct scrub_fs_sector *sector =
+ scrub_fs_get_sector(sfctx, sector_nr, mirror_nr);
+
+ if (!(sector->flags & SCRUB_FS_SECTOR_FLAG_GOOD)) {
+ if (is_data)
+ sfctx->stat.data_recoverable += sectorsize;
+ else
+ sfctx->stat.meta_recoverable += sectorsize;
+
+ /* Copy the content from the good sector for later repair. */
+ scrub_fs_copy_sector(sfctx, sector_nr, good_mirror, mirror_nr);
+
+ sector->flags |= SCRUB_FS_SECTOR_FLAG_RECOVERABLE;
+ }
+ }
+}
+
static int scrub_fs_one_stripe(struct scrub_fs_ctx *sfctx)
{
struct btrfs_fs_info *fs_info = sfctx->fs_info;
@@ -5308,12 +5406,14 @@ static int scrub_fs_one_stripe(struct scrub_fs_ctx *sfctx)
wait_event(sfctx->wait, atomic_read(&sfctx->bios_under_io) == 0);
/*
- * Placeholder to update the accounting.
- *
* Endio functions should have done the verification and updated
- * sector->flags, but they don't update the accounting as they
- * don't have the full view of other mirrors.
+ * sector->flags of each mirror, but they don't update the accounting
+ * as they don't have the full view of other mirrors.
*/
+ for (i = 0; i < sfctx->sectors_per_stripe; i++)
+ scrub_fs_update_veritical(sfctx, i);
+
+ /* Placeholder for the repair write-back code. */
out:
btrfs_put_bioc(bioc);
btrfs_bio_counter_dec(fs_info);
--
2.37.3
* [PATCH PoC v2 10/10] btrfs: scrub: implement the repair (writeback) functionality
2022-09-28 8:35 [PATCH PoC v2 00/10] btrfs: scrub: introduce a new family of ioctl, scrub_fs Qu Wenruo
` (8 preceding siblings ...)
2022-09-28 8:35 ` [PATCH PoC v2 09/10] btrfs: scrub: implement the later stage of verification Qu Wenruo
@ 2022-09-28 8:35 ` Qu Wenruo
9 siblings, 0 replies; 15+ messages in thread
From: Qu Wenruo @ 2022-09-28 8:35 UTC (permalink / raw)
To: linux-btrfs
This adds the repair functionality for scrub_fs.
Since the previous patch implemented the final verification stage,
all sectors that can be repaired already carry
SCRUB_FS_SECTOR_FLAG_RECOVERABLE; we just need to submit write bios
for them, merging contiguous sectors where possible (see the sketch
below).
And just like the old scrub interface, we don't report writeback
errors.
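A toy model of just that grouping logic (not from the patch): open a
"bio" at the first repairable sector and submit whenever a gap or the
stripe end is hit:

#include <stdio.h>

int main(void)
{
	/* 1 = needs repair, like SCRUB_FS_SECTOR_FLAG_RECOVERABLE. */
	const int repair[16] = { 0, 1, 1, 1, 0, 0, 1, 0,
				 1, 1, 0, 0, 0, 1, 1, 1 };
	int start = -1;

	for (int i = 0; i <= 16; i++) {
		if (i < 16 && repair[i]) {
			if (start < 0)
				start = i;	/* open a new "bio" */
			continue;
		}
		if (start >= 0)	/* gap or stripe end: submit */
			printf("submit write bio for sectors [%d, %d]\n",
			       start, i - 1);
		start = -1;
	}
	return 0;
}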
Signed-off-by: Qu Wenruo <wqu@suse.com>
---
fs/btrfs/scrub.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 95 insertions(+), 1 deletion(-)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 89735ff6143a..27d96778206c 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -5362,6 +5362,93 @@ static void scrub_fs_update_veritical(struct scrub_fs_ctx *sfctx,
}
}
+static void scrub_fs_write_endio(struct bio *bio)
+{
+ struct scrub_fs_ctx *sfctx = bio->bi_private;
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
+ unsigned int bio_size = 0;
+
+ bio_for_each_segment_all(bvec, bio, iter_all)
+ bio_size += bvec->bv_len;
+
+ /* Repair should be inside one stripe. */
+ ASSERT(bio_size <= BTRFS_STRIPE_LEN);
+
+ atomic_dec(&sfctx->bios_under_io);
+ wake_up(&sfctx->wait);
+ bio_put(bio);
+}
+
+static void scrub_fs_repair_mirror(struct scrub_fs_ctx *sfctx,
+ struct btrfs_io_context *bioc, int mirror_nr)
+{
+ struct bio *bio = NULL;
+ int last_sector = -1;
+ int i;
+
+ ASSERT(mirror_nr < bioc->num_stripes);
+
+ for (i = 0; i < sfctx->sectors_per_stripe; i++) {
+ struct scrub_fs_sector *sector =
+ scrub_fs_get_sector(sfctx, i, mirror_nr);
+
+ if (sector->flags & SCRUB_FS_SECTOR_FLAG_DEV_MISSING ||
+ sector->flags & SCRUB_FS_SECTOR_FLAG_GOOD ||
+ !(sector->flags & SCRUB_FS_SECTOR_FLAG_RECOVERABLE))
+ continue;
+
+ /* No bio allocated yet, allocate a new one. */
+ if (!bio) {
+ blk_opf_t opf = REQ_OP_WRITE | REQ_BACKGROUND;
+
+ if (sector->flags & SCRUB_FS_SECTOR_FLAG_META)
+ opf |= REQ_META;
+
+ bio = bio_alloc(bioc->stripes[mirror_nr].dev->bdev,
+ sfctx->sectors_per_stripe, opf,
+ GFP_KERNEL);
+ /* Bios are backed by a mempool. */
+ ASSERT(bio);
+
+ bio->bi_iter.bi_sector =
+ (bioc->stripes[mirror_nr].physical +
+ (i << sfctx->fs_info->sectorsize_bits)) >>
+ SECTOR_SHIFT;
+ bio->bi_private = sfctx;
+ bio->bi_end_io = scrub_fs_write_endio;
+
+ last_sector = i - 1;
+ }
+
+ /* Can merge into the previous bio. */
+ if (last_sector == i - 1) {
+ struct page *page =
+ scrub_fs_get_page(sfctx, i, mirror_nr);
+ unsigned int page_off =
+ scrub_fs_get_page_offset(sfctx, i, mirror_nr);
+ int ret;
+
+ ret = bio_add_page(bio, page, sfctx->fs_info->sectorsize,
+ page_off);
+ ASSERT(ret == sfctx->fs_info->sectorsize);
+ last_sector = i;
+ continue;
+ }
+
+ /* Cannot merge; submit the current one and retry this sector. */
+ ASSERT(bio);
+ atomic_inc(&sfctx->bios_under_io);
+ submit_bio(bio);
+ bio = NULL;
+ i--;
+ }
+ if (bio) {
+ atomic_inc(&sfctx->bios_under_io);
+ submit_bio(bio);
+ }
+}
+
static int scrub_fs_one_stripe(struct scrub_fs_ctx *sfctx)
{
struct btrfs_fs_info *fs_info = sfctx->fs_info;
@@ -5413,7 +5500,14 @@ static int scrub_fs_one_stripe(struct scrub_fs_ctx *sfctx)
for (i = 0; i < sfctx->sectors_per_stripe; i++)
scrub_fs_update_veritical(sfctx, i);
- /* Placeholder for the repair write-back code. */
+ /* Submit the repair writes. */
+ if (!sfctx->readonly) {
+ for (i = 0; i < sfctx->nr_copies; i++)
+ scrub_fs_repair_mirror(sfctx, bioc, i);
+ wait_event(sfctx->wait, atomic_read(&sfctx->bios_under_io) == 0);
+ }
+ ASSERT(atomic_read(&sfctx->bios_under_io) == 0);
+
out:
btrfs_put_bioc(bioc);
btrfs_bio_counter_dec(fs_info);
--
2.37.3