* [PATCH v7 01/13] btrfs: re-add trans parameter to insert_delayed_ref
2023-03-02 9:45 [PATCH v7 00/13] btrfs: introduce RAID stripe tree Johannes Thumshirn
@ 2023-03-02 9:45 ` Johannes Thumshirn
2023-03-02 13:36 ` Anand Jain
2023-03-02 9:45 ` [PATCH v7 02/13] btrfs: add raid stripe tree definitions Johannes Thumshirn
` (13 subsequent siblings)
14 siblings, 1 reply; 49+ messages in thread
From: Johannes Thumshirn @ 2023-03-02 9:45 UTC (permalink / raw)
To: David Sterba
Cc: Johannes Thumshirn, linux-btrfs, Josef Bacik, Christoph Hellwig
Re-add the trans parameter to insert_delayed_ref as it is needed again
later in this series.
This reverts commit bccf28752a99 ("btrfs: drop trans parameter of insert_delayed_ref")
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
fs/btrfs/delayed-ref.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 886ffb232eac..7660ac642c81 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -598,7 +598,8 @@ void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
* Return 0 for insert.
* Return >0 for merge.
*/
-static int insert_delayed_ref(struct btrfs_delayed_ref_root *root,
+static int insert_delayed_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_root *root,
struct btrfs_delayed_ref_head *href,
struct btrfs_delayed_ref_node *ref)
{
@@ -974,7 +975,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
head_ref = add_delayed_ref_head(trans, head_ref, record,
action, &qrecord_inserted);
- ret = insert_delayed_ref(delayed_refs, head_ref, &ref->node);
+ ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
spin_unlock(&delayed_refs->lock);
/*
@@ -1066,7 +1067,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
head_ref = add_delayed_ref_head(trans, head_ref, record,
action, &qrecord_inserted);
- ret = insert_delayed_ref(delayed_refs, head_ref, &ref->node);
+ ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
spin_unlock(&delayed_refs->lock);
/*
--
2.39.1
^ permalink raw reply related [flat|nested] 49+ messages in thread
* [PATCH v7 02/13] btrfs: add raid stripe tree definitions
2023-03-02 9:45 [PATCH v7 00/13] btrfs: introduce RAID stripe tree Johannes Thumshirn
2023-03-02 9:45 ` [PATCH v7 01/13] btrfs: re-add trans parameter to insert_delayed_ref Johannes Thumshirn
@ 2023-03-02 9:45 ` Johannes Thumshirn
2023-03-02 9:45 ` [PATCH v7 03/13] btrfs: read raid-stripe-tree from disk Johannes Thumshirn
` (12 subsequent siblings)
14 siblings, 0 replies; 49+ messages in thread
From: Johannes Thumshirn @ 2023-03-02 9:45 UTC (permalink / raw)
To: David Sterba
Cc: Johannes Thumshirn, linux-btrfs, Josef Bacik, Christoph Hellwig,
Anand Jain
Add definitions for the raid stripe tree. This tree will hold information
about the on-disk layout of the stripes in a RAID set.
Each stripe extent has a 1:1 relationship with an on-disk extent item and
performs the logical to per-drive physical address translation for the
extent item in question.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
fs/btrfs/accessors.h | 29 +++++++++++++++++++++++++++++
include/uapi/linux/btrfs_tree.h | 20 ++++++++++++++++++--
2 files changed, 47 insertions(+), 2 deletions(-)
diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
index ceadfc5d6c66..6e753b63faae 100644
--- a/fs/btrfs/accessors.h
+++ b/fs/btrfs/accessors.h
@@ -306,6 +306,35 @@ BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);
+BTRFS_SETGET_FUNCS(raid_stride_devid, struct btrfs_raid_stride, devid, 64);
+BTRFS_SETGET_FUNCS(raid_stride_physical, struct btrfs_raid_stride, physical, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_devid, struct btrfs_raid_stride, devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_physical, struct btrfs_raid_stride, physical, 64);
+
+static inline struct btrfs_raid_stride *btrfs_raid_stride_nr(
+ struct btrfs_stripe_extent *dps, int nr)
+{
+ unsigned long offset = (unsigned long)dps;
+
+ offset += offsetof(struct btrfs_stripe_extent, strides);
+ offset += nr * sizeof(struct btrfs_raid_stride);
+ return (struct btrfs_raid_stride *)offset;
+}
+
+static inline u64 btrfs_raid_stride_devid_nr(const struct extent_buffer *eb,
+ struct btrfs_stripe_extent *dps,
+ int nr)
+{
+ return btrfs_raid_stride_devid(eb, btrfs_raid_stride_nr(dps, nr));
+}
+
+static inline u64 btrfs_raid_stride_physical_nr(const struct extent_buffer *eb,
+ struct btrfs_stripe_extent *dps,
+ int nr)
+{
+ return btrfs_raid_stride_physical(eb, btrfs_raid_stride_nr(dps, nr));
+}
+
/* struct btrfs_dev_extent */
BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, chunk_tree, 64);
BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent,
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index ab38d0f411fa..64e6bf2a10d8 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -4,9 +4,8 @@
#include <linux/btrfs.h>
#include <linux/types.h>
-#ifdef __KERNEL__
#include <linux/stddef.h>
-#else
+#ifndef __KERNEL__
#include <stddef.h>
#endif
@@ -73,6 +72,9 @@
/* Holds the block group items for extent tree v2. */
#define BTRFS_BLOCK_GROUP_TREE_OBJECTID 11ULL
+/* tracks RAID stripes in block groups. */
+#define BTRFS_RAID_STRIPE_TREE_OBJECTID 12ULL
+
/* device stats in the device tree */
#define BTRFS_DEV_STATS_OBJECTID 0ULL
@@ -281,6 +283,8 @@
*/
#define BTRFS_QGROUP_RELATION_KEY 246
+#define BTRFS_RAID_STRIPE_KEY 247
+
/*
* Obsolete name, see BTRFS_TEMPORARY_ITEM_KEY.
*/
@@ -715,6 +719,18 @@ struct btrfs_free_space_header {
__le64 num_bitmaps;
} __attribute__ ((__packed__));
+struct btrfs_raid_stride {
+ /* btrfs device-id this raid extent lives on */
+ __le64 devid;
+ /* physical location on disk */
+ __le64 physical;
+};
+
+struct btrfs_stripe_extent {
+ /* array of raid strides this stripe is composed of */
+ __DECLARE_FLEX_ARRAY(struct btrfs_raid_stride, strides);
+};
+
#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0)
#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1)
--
2.39.1
^ permalink raw reply related [flat|nested] 49+ messages in thread
* [PATCH v7 03/13] btrfs: read raid-stripe-tree from disk
2023-03-02 9:45 [PATCH v7 00/13] btrfs: introduce RAID stripe tree Johannes Thumshirn
2023-03-02 9:45 ` [PATCH v7 01/13] btrfs: re-add trans parameter to insert_delayed_ref Johannes Thumshirn
2023-03-02 9:45 ` [PATCH v7 02/13] btrfs: add raid stripe tree definitions Johannes Thumshirn
@ 2023-03-02 9:45 ` Johannes Thumshirn
2023-03-02 9:45 ` [PATCH v7 04/13] btrfs: add support for inserting raid stripe extents Johannes Thumshirn
` (11 subsequent siblings)
14 siblings, 0 replies; 49+ messages in thread
From: Johannes Thumshirn @ 2023-03-02 9:45 UTC (permalink / raw)
To: David Sterba
Cc: Johannes Thumshirn, linux-btrfs, Josef Bacik, Christoph Hellwig,
Anand Jain
If we find a raid-stripe-tree on mount, read it from disk.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
fs/btrfs/block-rsv.c | 1 +
fs/btrfs/disk-io.c | 22 ++++++++++++++++++++++
fs/btrfs/disk-io.h | 5 +++++
fs/btrfs/fs.h | 4 ++++
include/uapi/linux/btrfs.h | 1 +
5 files changed, 33 insertions(+)
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index 5367a14d44d2..384987343a64 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -402,6 +402,7 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root)
case BTRFS_EXTENT_TREE_OBJECTID:
case BTRFS_FREE_SPACE_TREE_OBJECTID:
case BTRFS_BLOCK_GROUP_TREE_OBJECTID:
+ case BTRFS_RAID_STRIPE_TREE_OBJECTID:
root->block_rsv = &fs_info->delayed_refs_rsv;
break;
case BTRFS_ROOT_TREE_OBJECTID:
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0e0c30fe6df6..ac200b367ec8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1438,6 +1438,9 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
return btrfs_grab_root(root) ? root : ERR_PTR(-ENOENT);
}
+ if (objectid == BTRFS_RAID_STRIPE_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->stripe_root) ?
+ fs_info->stripe_root : ERR_PTR(-ENOENT);
return NULL;
}
@@ -1516,6 +1519,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
btrfs_put_root(fs_info->fs_root);
btrfs_put_root(fs_info->data_reloc_root);
btrfs_put_root(fs_info->block_group_root);
+ btrfs_put_root(fs_info->stripe_root);
btrfs_check_leaked_roots(fs_info);
btrfs_extent_buffer_leak_debug_check(fs_info);
kfree(fs_info->super_copy);
@@ -2051,6 +2055,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
free_root_extent_buffers(info->fs_root);
free_root_extent_buffers(info->data_reloc_root);
free_root_extent_buffers(info->block_group_root);
+ free_root_extent_buffers(info->stripe_root);
if (free_chunk_root)
free_root_extent_buffers(info->chunk_root);
}
@@ -2512,6 +2517,20 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
fs_info->uuid_root = root;
}
+ if (btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE)) {
+ location.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID;
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(root)) {
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->stripe_root = root;
+ }
+ }
+
return 0;
out:
btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
@@ -3020,6 +3039,9 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH;
INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work);
+
+ rwlock_init(&fs_info->stripe_update_lock);
+ fs_info->stripe_update_tree = RB_ROOT;
}
static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb)
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 4d5772330110..c4de38374b62 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -107,6 +107,11 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root)
return NULL;
}
+static inline struct btrfs_root *btrfs_stripe_tree_root(struct btrfs_fs_info *fs_info)
+{
+ return fs_info->stripe_root;
+}
+
void btrfs_put_root(struct btrfs_root *root);
void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 4c477eae6891..d0d80540b32b 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -367,6 +367,7 @@ struct btrfs_fs_info {
struct btrfs_root *uuid_root;
struct btrfs_root *data_reloc_root;
struct btrfs_root *block_group_root;
+ struct btrfs_root *stripe_root;
/* The log root tree is a directory of all the other log roots */
struct btrfs_root *log_root_tree;
@@ -790,6 +791,9 @@ struct btrfs_fs_info {
struct lockdep_map btrfs_trans_pending_ordered_map;
struct lockdep_map btrfs_ordered_extent_map;
+ rwlock_t stripe_update_lock;
+ struct rb_root stripe_update_tree;
+
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
spinlock_t ref_verify_lock;
struct rb_root block_tree;
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index ada0a489bf2b..df7b60483642 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -332,6 +332,7 @@ struct btrfs_ioctl_fs_info_args {
#define BTRFS_FEATURE_INCOMPAT_RAID1C34 (1ULL << 11)
#define BTRFS_FEATURE_INCOMPAT_ZONED (1ULL << 12)
#define BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2 (1ULL << 13)
+#define BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE (1ULL << 14)
struct btrfs_ioctl_feature_flags {
__u64 compat_flags;
--
2.39.1
^ permalink raw reply related [flat|nested] 49+ messages in thread
* [PATCH v7 04/13] btrfs: add support for inserting raid stripe extents
2023-03-02 9:45 [PATCH v7 00/13] btrfs: introduce RAID stripe tree Johannes Thumshirn
` (2 preceding siblings ...)
2023-03-02 9:45 ` [PATCH v7 03/13] btrfs: read raid-stripe-tree from disk Johannes Thumshirn
@ 2023-03-02 9:45 ` Johannes Thumshirn
2023-03-02 10:58 ` Qu Wenruo
2023-03-02 9:45 ` [PATCH v7 05/13] btrfs: delete stripe extent on extent deletion Johannes Thumshirn
` (10 subsequent siblings)
14 siblings, 1 reply; 49+ messages in thread
From: Johannes Thumshirn @ 2023-03-02 9:45 UTC (permalink / raw)
To: David Sterba
Cc: Johannes Thumshirn, linux-btrfs, Josef Bacik, Christoph Hellwig
Add support for inserting stripe extents into the raid stripe tree on
completion of every write that needs an extra logical-to-physical
translation when using RAID.
Inserting the stripe extents happens after the data I/O has completed,
this is done to a) support zone-append and b) rule out the possibility of
a RAID-write-hole.
This is done by creating in-memory ordered stripe extents, just like the
in memory ordered extents, on I/O completion and the on-disk raid stripe
extents get created once we're running the delayed_refs for the extent
item this stripe extent is tied to.
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
fs/btrfs/Makefile | 2 +-
fs/btrfs/bio.c | 29 +++++
fs/btrfs/delayed-ref.c | 6 +-
fs/btrfs/delayed-ref.h | 2 +
fs/btrfs/extent-tree.c | 60 +++++++++++
fs/btrfs/inode.c | 15 ++-
fs/btrfs/raid-stripe-tree.c | 204 ++++++++++++++++++++++++++++++++++++
fs/btrfs/raid-stripe-tree.h | 71 +++++++++++++
fs/btrfs/volumes.c | 4 +-
fs/btrfs/volumes.h | 13 +--
fs/btrfs/zoned.c | 3 +
11 files changed, 397 insertions(+), 12 deletions(-)
create mode 100644 fs/btrfs/raid-stripe-tree.c
create mode 100644 fs/btrfs/raid-stripe-tree.h
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 90d53209755b..3bb869a84e54 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -33,7 +33,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \
- lru_cache.o
+ lru_cache.o raid-stripe-tree.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 726592868e9c..2b174865d347 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -15,6 +15,7 @@
#include "rcu-string.h"
#include "zoned.h"
#include "file-item.h"
+#include "raid-stripe-tree.h"
static struct bio_set btrfs_bioset;
static struct bio_set btrfs_clone_bioset;
@@ -348,6 +349,21 @@ static void btrfs_raid56_end_io(struct bio *bio)
btrfs_put_bioc(bioc);
}
+static void btrfs_raid_stripe_update(struct work_struct *work)
+{
+ struct btrfs_bio *bbio =
+ container_of(work, struct btrfs_bio, end_io_work);
+ struct btrfs_io_stripe *stripe = bbio->bio.bi_private;
+ struct btrfs_io_context *bioc = stripe->bioc;
+ int ret;
+
+ ret = btrfs_add_ordered_stripe(bioc);
+ if (ret)
+ bbio->bio.bi_status = errno_to_blk_status(ret);
+ btrfs_orig_bbio_end_io(bbio);
+ btrfs_put_bioc(bioc);
+}
+
static void btrfs_orig_write_end_io(struct bio *bio)
{
struct btrfs_io_stripe *stripe = bio->bi_private;
@@ -370,6 +386,16 @@ static void btrfs_orig_write_end_io(struct bio *bio)
else
bio->bi_status = BLK_STS_OK;
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND)
+ stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+
+ if (btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) {
+ INIT_WORK(&bbio->end_io_work, btrfs_raid_stripe_update);
+ queue_work(btrfs_end_io_wq(bioc->fs_info, bio),
+ &bbio->end_io_work);
+ return;
+ }
+
btrfs_orig_bbio_end_io(bbio);
btrfs_put_bioc(bioc);
}
@@ -381,6 +407,8 @@ static void btrfs_clone_write_end_io(struct bio *bio)
if (bio->bi_status) {
atomic_inc(&stripe->bioc->error);
btrfs_log_dev_io_error(bio, stripe->dev);
+ } else if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
}
/* Pass on control to the original bio this one was cloned from */
@@ -440,6 +468,7 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
bio->bi_private = &bioc->stripes[dev_nr];
bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
bioc->stripes[dev_nr].bioc = bioc;
+ bioc->size = bio->bi_iter.bi_size;
btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
}
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 7660ac642c81..261f52ad8e12 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -14,6 +14,7 @@
#include "space-info.h"
#include "tree-mod-log.h"
#include "fs.h"
+#include "raid-stripe-tree.h"
struct kmem_cache *btrfs_delayed_ref_head_cachep;
struct kmem_cache *btrfs_delayed_tree_ref_cachep;
@@ -637,8 +638,11 @@ static int insert_delayed_ref(struct btrfs_trans_handle *trans,
exist->ref_mod += mod;
/* remove existing tail if its ref_mod is zero */
- if (exist->ref_mod == 0)
+ if (exist->ref_mod == 0) {
+ btrfs_drop_ordered_stripe(trans->fs_info, exist->bytenr);
drop_delayed_ref(root, href, exist);
+ }
+
spin_unlock(&href->lock);
return ret;
inserted:
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 2eb34abf700f..5096c1a1ed3e 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -51,6 +51,8 @@ struct btrfs_delayed_ref_node {
/* is this node still in the rbtree? */
unsigned int is_head:1;
unsigned int in_tree:1;
+ /* Do we need RAID stripe tree modifications? */
+ unsigned int must_insert_stripe:1;
};
struct btrfs_delayed_extent_op {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6b6c59e6805c..7441d784fe03 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -42,6 +42,7 @@
#include "file-item.h"
#include "orphan.h"
#include "tree-checker.h"
+#include "raid-stripe-tree.h"
#undef SCRAMBLE_DELAYED_REFS
@@ -1497,6 +1498,56 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
return ret;
}
+static bool delayed_ref_needs_rst_update(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_head *head)
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ bool ret = false;
+
+ if (!btrfs_stripe_tree_root(fs_info))
+ return ret;
+
+ em = btrfs_get_chunk_map(fs_info, head->bytenr, head->num_bytes);
+ if (!em)
+ return ret;
+
+ map = em->map_lookup;
+
+ if (btrfs_need_stripe_tree_update(fs_info, map->type))
+ ret = true;
+
+ free_extent_map(em);
+
+ return ret;
+}
+
+static int add_stripe_entry_for_delayed_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_node *node)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_ordered_stripe *stripe;
+ int ret = 0;
+
+ stripe = btrfs_lookup_ordered_stripe(fs_info, node->bytenr);
+ if (!stripe) {
+ btrfs_err(fs_info,
+ "cannot get stripe extent for address %llu (%llu)",
+ node->bytenr, node->num_bytes);
+ return -EINVAL;
+ }
+
+ ASSERT(stripe->logical == node->bytenr);
+
+ ret = btrfs_insert_raid_extent(trans, stripe);
+ /* once for us */
+ btrfs_put_ordered_stripe(fs_info, stripe);
+ /* once for the tree */
+ btrfs_put_ordered_stripe(fs_info, stripe);
+
+ return ret;
+}
+
static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op,
@@ -1527,11 +1578,17 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
flags, ref->objectid,
ref->offset, &ins,
node->ref_mod);
+ if (ret)
+ return ret;
+ if (node->must_insert_stripe)
+ ret = add_stripe_entry_for_delayed_ref(trans, node);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
ref->objectid, ref->offset,
node->ref_mod, extent_op);
} else if (node->action == BTRFS_DROP_DELAYED_REF) {
+ if (node->must_insert_stripe)
+ btrfs_drop_ordered_stripe(trans->fs_info, node->bytenr);
ret = __btrfs_free_extent(trans, node, parent,
ref_root, ref->objectid,
ref->offset, node->ref_mod,
@@ -1901,6 +1958,8 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_extent_op *extent_op;
struct btrfs_delayed_ref_node *ref;
+ const bool need_rst_update =
+ delayed_ref_needs_rst_update(fs_info, locked_ref);
int must_insert_reserved = 0;
int ret;
@@ -1951,6 +2010,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
locked_ref->extent_op = NULL;
spin_unlock(&locked_ref->lock);
+ ref->must_insert_stripe = need_rst_update;
ret = run_one_delayed_ref(trans, ref, extent_op,
must_insert_reserved);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8f07d59e8193..aaa1db90e58b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -70,6 +70,7 @@
#include "verity.h"
#include "super.h"
#include "orphan.h"
+#include "raid-stripe-tree.h"
struct btrfs_iget_args {
u64 ino;
@@ -9495,12 +9496,17 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
if (qgroup_released < 0)
return ERR_PTR(qgroup_released);
+ ret = btrfs_insert_preallocated_raid_stripe(inode->root->fs_info,
+ start, len);
+ if (ret)
+ goto free_qgroup;
+
if (trans) {
ret = insert_reserved_file_extent(trans, inode,
file_offset, &stack_fi,
true, qgroup_released);
if (ret)
- goto free_qgroup;
+ goto free_stripe_extent;
return trans;
}
@@ -9518,7 +9524,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
- goto free_qgroup;
+ goto free_stripe_extent;
}
ret = btrfs_replace_file_extents(inode, path, file_offset,
@@ -9526,9 +9532,12 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
&trans);
btrfs_free_path(path);
if (ret)
- goto free_qgroup;
+ goto free_stripe_extent;
return trans;
+free_stripe_extent:
+ btrfs_drop_ordered_stripe(inode->root->fs_info, start);
+
free_qgroup:
/*
* We have released qgroup data range at the beginning of the function,
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
new file mode 100644
index 000000000000..9d3e7bffe6f8
--- /dev/null
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -0,0 +1,204 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2022 Western Digital Corporation or its affiliates.
+ */
+
+#include <linux/btrfs_tree.h>
+
+#include "ctree.h"
+#include "fs.h"
+#include "accessors.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "raid-stripe-tree.h"
+#include "volumes.h"
+#include "misc.h"
+#include "disk-io.h"
+#include "print-tree.h"
+
+static int ordered_stripe_cmp(const void *key, const struct rb_node *node)
+{
+ struct btrfs_ordered_stripe *stripe =
+ rb_entry(node, struct btrfs_ordered_stripe, rb_node);
+ const u64 *logical = key;
+
+ if (*logical < stripe->logical)
+ return -1;
+ if (*logical >= stripe->logical + stripe->num_bytes)
+ return 1;
+ return 0;
+}
+
+static int ordered_stripe_less(struct rb_node *rba, const struct rb_node *rbb)
+{
+ struct btrfs_ordered_stripe *stripe =
+ rb_entry(rba, struct btrfs_ordered_stripe, rb_node);
+ return ordered_stripe_cmp(&stripe->logical, rbb);
+}
+
+int btrfs_add_ordered_stripe(struct btrfs_io_context *bioc)
+{
+ struct btrfs_fs_info *fs_info = bioc->fs_info;
+ struct btrfs_ordered_stripe *stripe;
+ struct btrfs_io_stripe *tmp;
+ u64 logical = bioc->logical;
+ u64 length = bioc->size;
+ struct rb_node *node;
+ size_t size;
+
+ size = bioc->num_stripes * sizeof(struct btrfs_io_stripe);
+ stripe = kzalloc(sizeof(struct btrfs_ordered_stripe), GFP_NOFS);
+ if (!stripe)
+ return -ENOMEM;
+
+ spin_lock_init(&stripe->lock);
+ tmp = kmemdup(bioc->stripes, size, GFP_NOFS);
+ if (!tmp) {
+ kfree(stripe);
+ return -ENOMEM;
+ }
+
+ stripe->logical = logical;
+ stripe->num_bytes = length;
+ stripe->num_stripes = bioc->num_stripes;
+ spin_lock(&stripe->lock);
+ stripe->stripes = tmp;
+ spin_unlock(&stripe->lock);
+ refcount_set(&stripe->ref, 1);
+
+ write_lock(&fs_info->stripe_update_lock);
+ node = rb_find_add(&stripe->rb_node, &fs_info->stripe_update_tree,
+ ordered_stripe_less);
+ if (node) {
+ struct btrfs_ordered_stripe *old =
+ rb_entry(node, struct btrfs_ordered_stripe, rb_node);
+
+ btrfs_debug(fs_info, "logical: %llu, length: %llu already exists",
+ logical, length);
+ ASSERT(logical == old->logical);
+
+ rb_replace_node(node, &stripe->rb_node,
+ &fs_info->stripe_update_tree);
+ }
+ write_unlock(&fs_info->stripe_update_lock);
+
+ return 0;
+}
+
+struct btrfs_ordered_stripe *btrfs_lookup_ordered_stripe(struct btrfs_fs_info *fs_info,
+ u64 logical)
+{
+ struct rb_root *root = &fs_info->stripe_update_tree;
+ struct btrfs_ordered_stripe *stripe = NULL;
+ struct rb_node *node;
+
+ read_lock(&fs_info->stripe_update_lock);
+ node = rb_find(&logical, root, ordered_stripe_cmp);
+ if (node) {
+ stripe = rb_entry(node, struct btrfs_ordered_stripe, rb_node);
+ refcount_inc(&stripe->ref);
+ }
+ read_unlock(&fs_info->stripe_update_lock);
+
+ return stripe;
+}
+
+void btrfs_put_ordered_stripe(struct btrfs_fs_info *fs_info,
+ struct btrfs_ordered_stripe *stripe)
+{
+
+ if (refcount_dec_and_test(&stripe->ref)) {
+ struct rb_node *node;
+
+ write_lock(&fs_info->stripe_update_lock);
+
+ node = &stripe->rb_node;
+ rb_erase(node, &fs_info->stripe_update_tree);
+ RB_CLEAR_NODE(node);
+
+ spin_lock(&stripe->lock);
+ kfree(stripe->stripes);
+ spin_unlock(&stripe->lock);
+ kfree(stripe);
+ write_unlock(&fs_info->stripe_update_lock);
+ }
+}
+
+int btrfs_insert_preallocated_raid_stripe(struct btrfs_fs_info *fs_info,
+ u64 start, u64 len)
+{
+ struct btrfs_io_context *bioc = NULL;
+ struct btrfs_ordered_stripe *stripe;
+ u64 map_length = len;
+ int ret;
+
+ if (!btrfs_stripe_tree_root(fs_info))
+ return 0;
+
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, start, &map_length,
+ &bioc, 0);
+ if (ret)
+ return ret;
+
+ bioc->size = len;
+
+ stripe = btrfs_lookup_ordered_stripe(fs_info, start);
+ if (!stripe) {
+ ret = btrfs_add_ordered_stripe(bioc);
+ if (ret)
+ return ret;
+ } else {
+ spin_lock(&stripe->lock);
+ memcpy(stripe->stripes, bioc->stripes,
+ bioc->num_stripes * sizeof(struct btrfs_io_stripe));
+ spin_unlock(&stripe->lock);
+ btrfs_put_ordered_stripe(fs_info, stripe);
+ }
+
+ return 0;
+}
+
+int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_ordered_stripe *stripe)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_key stripe_key;
+ struct btrfs_root *stripe_root = btrfs_stripe_tree_root(fs_info);
+ struct btrfs_stripe_extent *stripe_extent;
+ size_t item_size;
+ int ret;
+
+ item_size = stripe->num_stripes * sizeof(struct btrfs_raid_stride);
+
+ stripe_extent = kzalloc(item_size, GFP_NOFS);
+ if (!stripe_extent) {
+ btrfs_abort_transaction(trans, -ENOMEM);
+ btrfs_end_transaction(trans);
+ return -ENOMEM;
+ }
+
+ spin_lock(&stripe->lock);
+ for (int i = 0; i < stripe->num_stripes; i++) {
+ u64 devid = stripe->stripes[i].dev->devid;
+ u64 physical = stripe->stripes[i].physical;
+ struct btrfs_raid_stride *raid_stride =
+ &stripe_extent->strides[i];
+
+ btrfs_set_stack_raid_stride_devid(raid_stride, devid);
+ btrfs_set_stack_raid_stride_physical(raid_stride, physical);
+ }
+ spin_unlock(&stripe->lock);
+
+ stripe_key.objectid = stripe->logical;
+ stripe_key.type = BTRFS_RAID_STRIPE_KEY;
+ stripe_key.offset = stripe->num_bytes;
+
+ ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent,
+ item_size);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+
+ kfree(stripe_extent);
+
+ return ret;
+}
diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h
new file mode 100644
index 000000000000..60d3f8489cc9
--- /dev/null
+++ b/fs/btrfs/raid-stripe-tree.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2022 Western Digital Corporation or its affiliates.
+ */
+
+#ifndef BTRFS_RAID_STRIPE_TREE_H
+#define BTRFS_RAID_STRIPE_TREE_H
+
+#include "disk-io.h"
+#include "messages.h"
+
+struct btrfs_io_context;
+
+struct btrfs_ordered_stripe {
+ struct rb_node rb_node;
+
+ u64 logical;
+ u64 num_bytes;
+ int num_stripes;
+ struct btrfs_io_stripe *stripes;
+ spinlock_t lock;
+ refcount_t ref;
+};
+
+int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_ordered_stripe *stripe);
+int btrfs_insert_preallocated_raid_stripe(struct btrfs_fs_info *fs_info,
+ u64 start, u64 len);
+struct btrfs_ordered_stripe *btrfs_lookup_ordered_stripe(
+ struct btrfs_fs_info *fs_info,
+ u64 logical);
+int btrfs_add_ordered_stripe(struct btrfs_io_context *bioc);
+void btrfs_put_ordered_stripe(struct btrfs_fs_info *fs_info,
+ struct btrfs_ordered_stripe *stripe);
+
+static inline bool btrfs_need_stripe_tree_update(struct btrfs_fs_info *fs_info,
+ u64 map_type)
+{
+ u64 type = map_type & BTRFS_BLOCK_GROUP_TYPE_MASK;
+ u64 profile = map_type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+ if (!btrfs_stripe_tree_root(fs_info))
+ return false;
+
+ if (type != BTRFS_BLOCK_GROUP_DATA)
+ return false;
+
+ if (profile & BTRFS_BLOCK_GROUP_RAID1_MASK)
+ return true;
+
+ return false;
+}
+
+static inline void btrfs_drop_ordered_stripe(struct btrfs_fs_info *fs_info,
+ u64 logical)
+{
+ struct btrfs_ordered_stripe *stripe;
+
+ if (!btrfs_stripe_tree_root(fs_info))
+ return;
+
+ stripe = btrfs_lookup_ordered_stripe(fs_info, logical);
+ if (!stripe)
+ return;
+ ASSERT(refcount_read(&stripe->ref) == 2);
+ /* once for us */
+ btrfs_put_ordered_stripe(fs_info, stripe);
+ /* once for the tree */
+ btrfs_put_ordered_stripe(fs_info, stripe);
+}
+#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9d6775c7196f..fee611d1b01d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5879,6 +5879,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
}
static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
+ u64 logical,
u16 total_stripes)
{
struct btrfs_io_context *bioc;
@@ -5898,6 +5899,7 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_
bioc->fs_info = fs_info;
bioc->replace_stripe_src = -1;
bioc->full_stripe_logical = (u64)-1;
+ bioc->logical = logical;
return bioc;
}
@@ -6493,7 +6495,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
goto out;
}
- bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes);
+ bioc = alloc_btrfs_io_context(fs_info, logical, num_alloc_stripes);
if (!bioc) {
ret = -ENOMEM;
goto out;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 650e131d079e..114c76c81eda 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -372,12 +372,10 @@ struct btrfs_fs_devices {
struct btrfs_io_stripe {
struct btrfs_device *dev;
- union {
- /* Block mapping */
- u64 physical;
- /* For the endio handler */
- struct btrfs_io_context *bioc;
- };
+ /* Block mapping */
+ u64 physical;
+ /* For the endio handler */
+ struct btrfs_io_context *bioc;
};
struct btrfs_discard_stripe {
@@ -410,6 +408,9 @@ struct btrfs_io_context {
atomic_t error;
u16 max_errors;
+ u64 logical;
+ u64 size;
+
/*
* The total number of stripes, including the extra duplicated
* stripe for replace.
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index f95b2c94d619..7e6cfc7a2918 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1692,6 +1692,9 @@ void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered)
u64 chunk_start_phys;
u64 logical;
+ /* Filesystems with a stripe tree have their own l2p mapping */
+ ASSERT(!btrfs_stripe_tree_root(fs_info));
+
em = btrfs_get_chunk_map(fs_info, orig_logical, 1);
if (IS_ERR(em))
return;
--
2.39.1
^ permalink raw reply related [flat|nested] 49+ messages in thread
* Re: [PATCH v7 04/13] btrfs: add support for inserting raid stripe extents
2023-03-02 9:45 ` [PATCH v7 04/13] btrfs: add support for inserting raid stripe extents Johannes Thumshirn
@ 2023-03-02 10:58 ` Qu Wenruo
2023-03-02 11:25 ` Johannes Thumshirn
0 siblings, 1 reply; 49+ messages in thread
From: Qu Wenruo @ 2023-03-02 10:58 UTC (permalink / raw)
To: Johannes Thumshirn, David Sterba
Cc: linux-btrfs, Josef Bacik, Christoph Hellwig
On 2023/3/2 17:45, Johannes Thumshirn wrote:
> Add support for inserting stripe extents into the raid stripe tree on
> completion of every write that needs an extra logical-to-physical
> translation when using RAID.
>
> Inserting the stripe extents happens after the data I/O has completed,
> this is done to a) support zone-append and b) rule out the possibility of
> a RAID-write-hole.
>
> This is done by creating in-memory ordered stripe extents, just like the
> in memory ordered extents, on I/O completion and the on-disk raid stripe
> extents get created once we're running the delayed_refs for the extent
> item this stripe extent is tied to.
>
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> ---
> fs/btrfs/Makefile | 2 +-
> fs/btrfs/bio.c | 29 +++++
> fs/btrfs/delayed-ref.c | 6 +-
> fs/btrfs/delayed-ref.h | 2 +
> fs/btrfs/extent-tree.c | 60 +++++++++++
> fs/btrfs/inode.c | 15 ++-
> fs/btrfs/raid-stripe-tree.c | 204 ++++++++++++++++++++++++++++++++++++
> fs/btrfs/raid-stripe-tree.h | 71 +++++++++++++
> fs/btrfs/volumes.c | 4 +-
> fs/btrfs/volumes.h | 13 +--
> fs/btrfs/zoned.c | 3 +
> 11 files changed, 397 insertions(+), 12 deletions(-)
> create mode 100644 fs/btrfs/raid-stripe-tree.c
> create mode 100644 fs/btrfs/raid-stripe-tree.h
>
> diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
> index 90d53209755b..3bb869a84e54 100644
> --- a/fs/btrfs/Makefile
> +++ b/fs/btrfs/Makefile
> @@ -33,7 +33,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
> uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
> block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
> subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \
> - lru_cache.o
> + lru_cache.o raid-stripe-tree.o
>
> btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
> btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
> diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
> index 726592868e9c..2b174865d347 100644
> --- a/fs/btrfs/bio.c
> +++ b/fs/btrfs/bio.c
> @@ -15,6 +15,7 @@
> #include "rcu-string.h"
> #include "zoned.h"
> #include "file-item.h"
> +#include "raid-stripe-tree.h"
>
> static struct bio_set btrfs_bioset;
> static struct bio_set btrfs_clone_bioset;
> @@ -348,6 +349,21 @@ static void btrfs_raid56_end_io(struct bio *bio)
> btrfs_put_bioc(bioc);
> }
>
> +static void btrfs_raid_stripe_update(struct work_struct *work)
> +{
> + struct btrfs_bio *bbio =
> + container_of(work, struct btrfs_bio, end_io_work);
> + struct btrfs_io_stripe *stripe = bbio->bio.bi_private;
> + struct btrfs_io_context *bioc = stripe->bioc;
> + int ret;
> +
> + ret = btrfs_add_ordered_stripe(bioc);
> + if (ret)
> + bbio->bio.bi_status = errno_to_blk_status(ret);
> + btrfs_orig_bbio_end_io(bbio);
> + btrfs_put_bioc(bioc);
> +}
> +
> static void btrfs_orig_write_end_io(struct bio *bio)
> {
> struct btrfs_io_stripe *stripe = bio->bi_private;
> @@ -370,6 +386,16 @@ static void btrfs_orig_write_end_io(struct bio *bio)
> else
> bio->bi_status = BLK_STS_OK;
>
> + if (bio_op(bio) == REQ_OP_ZONE_APPEND)
> + stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
> +
> + if (btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) {
> + INIT_WORK(&bbio->end_io_work, btrfs_raid_stripe_update);
> + queue_work(btrfs_end_io_wq(bioc->fs_info, bio),
> + &bbio->end_io_work);
I still have the old question: what would happen if the delayed
work runs after the ordered extent has finished?
Since we cannot ensure the ordering between this RST update work and
finish_ordered_io(), there can be a window where we finish the ordered io,
then the pages get released (by memory pressure), then a new read
happens to the range, and only then does our RST work run.
In that case, we would have a read failure.
Thus I strongly recommend doing the RST tree update inside
finish_ordered_io().
This has several advantages:
- We don't need an in-memory structure as a stopgap
Since reads are blocked while there is a running ordered extent,
we don't need an in-memory RST mapping.
- finish_ordered_io() itself has all the proper context for tree
updates.
That's the main location where we update the subvolume tree.
The main concern may be the bioc <-> ordered extent mapping, but IIRC
for zoned mode one bioc is one ordered extent, so this shouldn't be a
super big deal?
Otherwise we may need something to track all the biocs belonging to the
ordered extent.
Thanks,
Qu
> + return;
> + }
> +
> btrfs_orig_bbio_end_io(bbio);
> btrfs_put_bioc(bioc);
> }
> @@ -381,6 +407,8 @@ static void btrfs_clone_write_end_io(struct bio *bio)
> if (bio->bi_status) {
> atomic_inc(&stripe->bioc->error);
> btrfs_log_dev_io_error(bio, stripe->dev);
> + } else if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
> + stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
> }
>
> /* Pass on control to the original bio this one was cloned from */
> @@ -440,6 +468,7 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
> bio->bi_private = &bioc->stripes[dev_nr];
> bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
> bioc->stripes[dev_nr].bioc = bioc;
> + bioc->size = bio->bi_iter.bi_size;
> btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
> }
>
> diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
> index 7660ac642c81..261f52ad8e12 100644
> --- a/fs/btrfs/delayed-ref.c
> +++ b/fs/btrfs/delayed-ref.c
> @@ -14,6 +14,7 @@
> #include "space-info.h"
> #include "tree-mod-log.h"
> #include "fs.h"
> +#include "raid-stripe-tree.h"
>
> struct kmem_cache *btrfs_delayed_ref_head_cachep;
> struct kmem_cache *btrfs_delayed_tree_ref_cachep;
> @@ -637,8 +638,11 @@ static int insert_delayed_ref(struct btrfs_trans_handle *trans,
> exist->ref_mod += mod;
>
> /* remove existing tail if its ref_mod is zero */
> - if (exist->ref_mod == 0)
> + if (exist->ref_mod == 0) {
> + btrfs_drop_ordered_stripe(trans->fs_info, exist->bytenr);
> drop_delayed_ref(root, href, exist);
> + }
> +
> spin_unlock(&href->lock);
> return ret;
> inserted:
> diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
> index 2eb34abf700f..5096c1a1ed3e 100644
> --- a/fs/btrfs/delayed-ref.h
> +++ b/fs/btrfs/delayed-ref.h
> @@ -51,6 +51,8 @@ struct btrfs_delayed_ref_node {
> /* is this node still in the rbtree? */
> unsigned int is_head:1;
> unsigned int in_tree:1;
> + /* Do we need RAID stripe tree modifications? */
> + unsigned int must_insert_stripe:1;
> };
>
> struct btrfs_delayed_extent_op {
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 6b6c59e6805c..7441d784fe03 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -42,6 +42,7 @@
> #include "file-item.h"
> #include "orphan.h"
> #include "tree-checker.h"
> +#include "raid-stripe-tree.h"
>
> #undef SCRAMBLE_DELAYED_REFS
>
> @@ -1497,6 +1498,56 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
> return ret;
> }
>
> +static bool delayed_ref_needs_rst_update(struct btrfs_fs_info *fs_info,
> + struct btrfs_delayed_ref_head *head)
> +{
> + struct extent_map *em;
> + struct map_lookup *map;
> + bool ret = false;
> +
> + if (!btrfs_stripe_tree_root(fs_info))
> + return ret;
> +
> + em = btrfs_get_chunk_map(fs_info, head->bytenr, head->num_bytes);
> + if (!em)
> + return ret;
> +
> + map = em->map_lookup;
> +
> + if (btrfs_need_stripe_tree_update(fs_info, map->type))
> + ret = true;
> +
> + free_extent_map(em);
> +
> + return ret;
> +}
> +
> +static int add_stripe_entry_for_delayed_ref(struct btrfs_trans_handle *trans,
> + struct btrfs_delayed_ref_node *node)
> +{
> + struct btrfs_fs_info *fs_info = trans->fs_info;
> + struct btrfs_ordered_stripe *stripe;
> + int ret = 0;
> +
> + stripe = btrfs_lookup_ordered_stripe(fs_info, node->bytenr);
> + if (!stripe) {
> + btrfs_err(fs_info,
> + "cannot get stripe extent for address %llu (%llu)",
> + node->bytenr, node->num_bytes);
> + return -EINVAL;
> + }
> +
> + ASSERT(stripe->logical == node->bytenr);
> +
> + ret = btrfs_insert_raid_extent(trans, stripe);
> + /* once for us */
> + btrfs_put_ordered_stripe(fs_info, stripe);
> + /* once for the tree */
> + btrfs_put_ordered_stripe(fs_info, stripe);
> +
> + return ret;
> +}
> +
> static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
> struct btrfs_delayed_ref_node *node,
> struct btrfs_delayed_extent_op *extent_op,
> @@ -1527,11 +1578,17 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
> flags, ref->objectid,
> ref->offset, &ins,
> node->ref_mod);
> + if (ret)
> + return ret;
> + if (node->must_insert_stripe)
> + ret = add_stripe_entry_for_delayed_ref(trans, node);
> } else if (node->action == BTRFS_ADD_DELAYED_REF) {
> ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
> ref->objectid, ref->offset,
> node->ref_mod, extent_op);
> } else if (node->action == BTRFS_DROP_DELAYED_REF) {
> + if (node->must_insert_stripe)
> + btrfs_drop_ordered_stripe(trans->fs_info, node->bytenr);
> ret = __btrfs_free_extent(trans, node, parent,
> ref_root, ref->objectid,
> ref->offset, node->ref_mod,
> @@ -1901,6 +1958,8 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
> struct btrfs_delayed_ref_root *delayed_refs;
> struct btrfs_delayed_extent_op *extent_op;
> struct btrfs_delayed_ref_node *ref;
> + const bool need_rst_update =
> + delayed_ref_needs_rst_update(fs_info, locked_ref);
> int must_insert_reserved = 0;
> int ret;
>
> @@ -1951,6 +2010,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
> locked_ref->extent_op = NULL;
> spin_unlock(&locked_ref->lock);
>
> + ref->must_insert_stripe = need_rst_update;
> ret = run_one_delayed_ref(trans, ref, extent_op,
> must_insert_reserved);
>
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 8f07d59e8193..aaa1db90e58b 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -70,6 +70,7 @@
> #include "verity.h"
> #include "super.h"
> #include "orphan.h"
> +#include "raid-stripe-tree.h"
>
> struct btrfs_iget_args {
> u64 ino;
> @@ -9495,12 +9496,17 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
> if (qgroup_released < 0)
> return ERR_PTR(qgroup_released);
>
> + ret = btrfs_insert_preallocated_raid_stripe(inode->root->fs_info,
> + start, len);
> + if (ret)
> + goto free_qgroup;
> +
> if (trans) {
> ret = insert_reserved_file_extent(trans, inode,
> file_offset, &stack_fi,
> true, qgroup_released);
> if (ret)
> - goto free_qgroup;
> + goto free_stripe_extent;
> return trans;
> }
>
> @@ -9518,7 +9524,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
> path = btrfs_alloc_path();
> if (!path) {
> ret = -ENOMEM;
> - goto free_qgroup;
> + goto free_stripe_extent;
> }
>
> ret = btrfs_replace_file_extents(inode, path, file_offset,
> @@ -9526,9 +9532,12 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
> &trans);
> btrfs_free_path(path);
> if (ret)
> - goto free_qgroup;
> + goto free_stripe_extent;
> return trans;
>
> +free_stripe_extent:
> + btrfs_drop_ordered_stripe(inode->root->fs_info, start);
> +
> free_qgroup:
> /*
> * We have released qgroup data range at the beginning of the function,
> diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
> new file mode 100644
> index 000000000000..9d3e7bffe6f8
> --- /dev/null
> +++ b/fs/btrfs/raid-stripe-tree.c
> @@ -0,0 +1,204 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2022 Western Digital Corporation or its affiliates.
> + */
> +
> +#include <linux/btrfs_tree.h>
> +
> +#include "ctree.h"
> +#include "fs.h"
> +#include "accessors.h"
> +#include "transaction.h"
> +#include "disk-io.h"
> +#include "raid-stripe-tree.h"
> +#include "volumes.h"
> +#include "misc.h"
> +#include "disk-io.h"
> +#include "print-tree.h"
> +
> +static int ordered_stripe_cmp(const void *key, const struct rb_node *node)
> +{
> + struct btrfs_ordered_stripe *stripe =
> + rb_entry(node, struct btrfs_ordered_stripe, rb_node);
> + const u64 *logical = key;
> +
> + if (*logical < stripe->logical)
> + return -1;
> + if (*logical >= stripe->logical + stripe->num_bytes)
> + return 1;
> + return 0;
> +}
> +
> +static int ordered_stripe_less(struct rb_node *rba, const struct rb_node *rbb)
> +{
> + struct btrfs_ordered_stripe *stripe =
> + rb_entry(rba, struct btrfs_ordered_stripe, rb_node);
> + return ordered_stripe_cmp(&stripe->logical, rbb);
> +}
> +
> +int btrfs_add_ordered_stripe(struct btrfs_io_context *bioc)
> +{
> + struct btrfs_fs_info *fs_info = bioc->fs_info;
> + struct btrfs_ordered_stripe *stripe;
> + struct btrfs_io_stripe *tmp;
> + u64 logical = bioc->logical;
> + u64 length = bioc->size;
> + struct rb_node *node;
> + size_t size;
> +
> + size = bioc->num_stripes * sizeof(struct btrfs_io_stripe);
> + stripe = kzalloc(sizeof(struct btrfs_ordered_stripe), GFP_NOFS);
> + if (!stripe)
> + return -ENOMEM;
> +
> + spin_lock_init(&stripe->lock);
> + tmp = kmemdup(bioc->stripes, size, GFP_NOFS);
> + if (!tmp) {
> + kfree(stripe);
> + return -ENOMEM;
> + }
> +
> + stripe->logical = logical;
> + stripe->num_bytes = length;
> + stripe->num_stripes = bioc->num_stripes;
> + spin_lock(&stripe->lock);
> + stripe->stripes = tmp;
> + spin_unlock(&stripe->lock);
> + refcount_set(&stripe->ref, 1);
> +
> + write_lock(&fs_info->stripe_update_lock);
> + node = rb_find_add(&stripe->rb_node, &fs_info->stripe_update_tree,
> + ordered_stripe_less);
> + if (node) {
> + struct btrfs_ordered_stripe *old =
> + rb_entry(node, struct btrfs_ordered_stripe, rb_node);
> +
> + btrfs_debug(fs_info, "logical: %llu, length: %llu already exists",
> + logical, length);
> + ASSERT(logical == old->logical);
> +
> + rb_replace_node(node, &stripe->rb_node,
> + &fs_info->stripe_update_tree);
> + }
> + write_unlock(&fs_info->stripe_update_lock);
> +
> + return 0;
> +}
> +
> +struct btrfs_ordered_stripe *btrfs_lookup_ordered_stripe(struct btrfs_fs_info *fs_info,
> + u64 logical)
> +{
> + struct rb_root *root = &fs_info->stripe_update_tree;
> + struct btrfs_ordered_stripe *stripe = NULL;
> + struct rb_node *node;
> +
> + read_lock(&fs_info->stripe_update_lock);
> + node = rb_find(&logical, root, ordered_stripe_cmp);
> + if (node) {
> + stripe = rb_entry(node, struct btrfs_ordered_stripe, rb_node);
> + refcount_inc(&stripe->ref);
> + }
> + read_unlock(&fs_info->stripe_update_lock);
> +
> + return stripe;
> +}
> +
> +void btrfs_put_ordered_stripe(struct btrfs_fs_info *fs_info,
> + struct btrfs_ordered_stripe *stripe)
> +{
> +
> + if (refcount_dec_and_test(&stripe->ref)) {
> + struct rb_node *node;
> +
> + write_lock(&fs_info->stripe_update_lock);
> +
> + node = &stripe->rb_node;
> + rb_erase(node, &fs_info->stripe_update_tree);
> + RB_CLEAR_NODE(node);
> +
> + spin_lock(&stripe->lock);
> + kfree(stripe->stripes);
> + spin_unlock(&stripe->lock);
> + kfree(stripe);
> + write_unlock(&fs_info->stripe_update_lock);
> + }
> +}
> +
> +int btrfs_insert_preallocated_raid_stripe(struct btrfs_fs_info *fs_info,
> + u64 start, u64 len)
> +{
> + struct btrfs_io_context *bioc = NULL;
> + struct btrfs_ordered_stripe *stripe;
> + u64 map_length = len;
> + int ret;
> +
> + if (!btrfs_stripe_tree_root(fs_info))
> + return 0;
> +
> + ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, start, &map_length,
> + &bioc, 0);
> + if (ret)
> + return ret;
> +
> + bioc->size = len;
> +
> + stripe = btrfs_lookup_ordered_stripe(fs_info, start);
> + if (!stripe) {
> + ret = btrfs_add_ordered_stripe(bioc);
> + if (ret)
> + return ret;
> + } else {
> + spin_lock(&stripe->lock);
> + memcpy(stripe->stripes, bioc->stripes,
> + bioc->num_stripes * sizeof(struct btrfs_io_stripe));
> + spin_unlock(&stripe->lock);
> + btrfs_put_ordered_stripe(fs_info, stripe);
> + }
> +
> + return 0;
> +}
> +
> +int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
> + struct btrfs_ordered_stripe *stripe)
> +{
> + struct btrfs_fs_info *fs_info = trans->fs_info;
> + struct btrfs_key stripe_key;
> + struct btrfs_root *stripe_root = btrfs_stripe_tree_root(fs_info);
> + struct btrfs_stripe_extent *stripe_extent;
> + size_t item_size;
> + int ret;
> +
> + item_size = stripe->num_stripes * sizeof(struct btrfs_raid_stride);
> +
> + stripe_extent = kzalloc(item_size, GFP_NOFS);
> + if (!stripe_extent) {
> + btrfs_abort_transaction(trans, -ENOMEM);
> + btrfs_end_transaction(trans);
> + return -ENOMEM;
> + }
> +
> + spin_lock(&stripe->lock);
> + for (int i = 0; i < stripe->num_stripes; i++) {
> + u64 devid = stripe->stripes[i].dev->devid;
> + u64 physical = stripe->stripes[i].physical;
> + struct btrfs_raid_stride *raid_stride =
> + &stripe_extent->strides[i];
> +
> + btrfs_set_stack_raid_stride_devid(raid_stride, devid);
> + btrfs_set_stack_raid_stride_physical(raid_stride, physical);
> + }
> + spin_unlock(&stripe->lock);
> +
> + stripe_key.objectid = stripe->logical;
> + stripe_key.type = BTRFS_RAID_STRIPE_KEY;
> + stripe_key.offset = stripe->num_bytes;
> +
> + ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent,
> + item_size);
> + if (ret)
> + btrfs_abort_transaction(trans, ret);
> +
> + kfree(stripe_extent);
> +
> + return ret;
> +}
> diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h
> new file mode 100644
> index 000000000000..60d3f8489cc9
> --- /dev/null
> +++ b/fs/btrfs/raid-stripe-tree.h
> @@ -0,0 +1,71 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (C) 2022 Western Digital Corporation or its affiliates.
> + */
> +
> +#ifndef BTRFS_RAID_STRIPE_TREE_H
> +#define BTRFS_RAID_STRIPE_TREE_H
> +
> +#include "disk-io.h"
> +#include "messages.h"
> +
> +struct btrfs_io_context;
> +
> +struct btrfs_ordered_stripe {
> + struct rb_node rb_node;
> +
> + u64 logical;
> + u64 num_bytes;
> + int num_stripes;
> + struct btrfs_io_stripe *stripes;
> + spinlock_t lock;
> + refcount_t ref;
> +};
> +
> +int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
> + struct btrfs_ordered_stripe *stripe);
> +int btrfs_insert_preallocated_raid_stripe(struct btrfs_fs_info *fs_info,
> + u64 start, u64 len);
> +struct btrfs_ordered_stripe *btrfs_lookup_ordered_stripe(
> + struct btrfs_fs_info *fs_info,
> + u64 logical);
> +int btrfs_add_ordered_stripe(struct btrfs_io_context *bioc);
> +void btrfs_put_ordered_stripe(struct btrfs_fs_info *fs_info,
> + struct btrfs_ordered_stripe *stripe);
> +
> +static inline bool btrfs_need_stripe_tree_update(struct btrfs_fs_info *fs_info,
> + u64 map_type)
> +{
> + u64 type = map_type & BTRFS_BLOCK_GROUP_TYPE_MASK;
> + u64 profile = map_type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
> +
> + if (!btrfs_stripe_tree_root(fs_info))
> + return false;
> +
> + if (type != BTRFS_BLOCK_GROUP_DATA)
> + return false;
> +
> + if (profile & BTRFS_BLOCK_GROUP_RAID1_MASK)
> + return true;
> +
> + return false;
> +}
> +
> +static inline void btrfs_drop_ordered_stripe(struct btrfs_fs_info *fs_info,
> + u64 logical)
> +{
> + struct btrfs_ordered_stripe *stripe;
> +
> + if (!btrfs_stripe_tree_root(fs_info))
> + return;
> +
> + stripe = btrfs_lookup_ordered_stripe(fs_info, logical);
> + if (!stripe)
> + return;
> + ASSERT(refcount_read(&stripe->ref) == 2);
> + /* once for us */
> + btrfs_put_ordered_stripe(fs_info, stripe);
> + /* once for the tree */
> + btrfs_put_ordered_stripe(fs_info, stripe);
> +}
> +#endif
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 9d6775c7196f..fee611d1b01d 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -5879,6 +5879,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
> }
>
> static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
> + u64 logical,
> u16 total_stripes)
> {
> struct btrfs_io_context *bioc;
> @@ -5898,6 +5899,7 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_
> bioc->fs_info = fs_info;
> bioc->replace_stripe_src = -1;
> bioc->full_stripe_logical = (u64)-1;
> + bioc->logical = logical;
>
> return bioc;
> }
> @@ -6493,7 +6495,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
> goto out;
> }
>
> - bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes);
> + bioc = alloc_btrfs_io_context(fs_info, logical, num_alloc_stripes);
> if (!bioc) {
> ret = -ENOMEM;
> goto out;
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index 650e131d079e..114c76c81eda 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -372,12 +372,10 @@ struct btrfs_fs_devices {
>
> struct btrfs_io_stripe {
> struct btrfs_device *dev;
> - union {
> - /* Block mapping */
> - u64 physical;
> - /* For the endio handler */
> - struct btrfs_io_context *bioc;
> - };
> + /* Block mapping */
> + u64 physical;
> + /* For the endio handler */
> + struct btrfs_io_context *bioc;
> };
>
> struct btrfs_discard_stripe {
> @@ -410,6 +408,9 @@ struct btrfs_io_context {
> atomic_t error;
> u16 max_errors;
>
> + u64 logical;
> + u64 size;
> +
> /*
> * The total number of stripes, including the extra duplicated
> * stripe for replace.
> diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
> index f95b2c94d619..7e6cfc7a2918 100644
> --- a/fs/btrfs/zoned.c
> +++ b/fs/btrfs/zoned.c
> @@ -1692,6 +1692,9 @@ void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered)
> u64 chunk_start_phys;
> u64 logical;
>
> + /* Filesystems with a stripe tree have their own l2p mapping */
> + ASSERT(!btrfs_stripe_tree_root(fs_info));
> +
> em = btrfs_get_chunk_map(fs_info, orig_logical, 1);
> if (IS_ERR(em))
> return;
^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [PATCH v7 04/13] btrfs: add support for inserting raid stripe extents
2023-03-02 10:58 ` Qu Wenruo
@ 2023-03-02 11:25 ` Johannes Thumshirn
2023-03-02 11:45 ` Qu Wenruo
` (2 more replies)
0 siblings, 3 replies; 49+ messages in thread
From: Johannes Thumshirn @ 2023-03-02 11:25 UTC (permalink / raw)
To: Qu Wenruo, David Sterba
Cc: linux-btrfs@vger.kernel.org, Josef Bacik, Christoph Hellwig
On 02.03.23 11:59, Qu Wenruo wrote:
>
>
> On 2023/3/2 17:45, Johannes Thumshirn wrote:
>> Add support for inserting stripe extents into the raid stripe tree on
>> completion of every write that needs an extra logical-to-physical
>> translation when using RAID.
>>
>> Inserting the stripe extents happens after the data I/O has completed,
>> this is done to a) support zone-append and b) rule out the possibility of
>> a RAID-write-hole.
>>
>> This is done by creating in-memory ordered stripe extents, just like the
>> in memory ordered extents, on I/O completion and the on-disk raid stripe
>> extents get created once we're running the delayed_refs for the extent
>> item this stripe extent is tied to.
>>
>> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>> ---
>> fs/btrfs/Makefile | 2 +-
>> fs/btrfs/bio.c | 29 +++++
>> fs/btrfs/delayed-ref.c | 6 +-
>> fs/btrfs/delayed-ref.h | 2 +
>> fs/btrfs/extent-tree.c | 60 +++++++++++
>> fs/btrfs/inode.c | 15 ++-
>> fs/btrfs/raid-stripe-tree.c | 204 ++++++++++++++++++++++++++++++++++++
>> fs/btrfs/raid-stripe-tree.h | 71 +++++++++++++
>> fs/btrfs/volumes.c | 4 +-
>> fs/btrfs/volumes.h | 13 +--
>> fs/btrfs/zoned.c | 3 +
>> 11 files changed, 397 insertions(+), 12 deletions(-)
>> create mode 100644 fs/btrfs/raid-stripe-tree.c
>> create mode 100644 fs/btrfs/raid-stripe-tree.h
>>
>> diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
>> index 90d53209755b..3bb869a84e54 100644
>> --- a/fs/btrfs/Makefile
>> +++ b/fs/btrfs/Makefile
>> @@ -33,7 +33,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
>> uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
>> block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
>> subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \
>> - lru_cache.o
>> + lru_cache.o raid-stripe-tree.o
>>
>> btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
>> btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
>> diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
>> index 726592868e9c..2b174865d347 100644
>> --- a/fs/btrfs/bio.c
>> +++ b/fs/btrfs/bio.c
>> @@ -15,6 +15,7 @@
>> #include "rcu-string.h"
>> #include "zoned.h"
>> #include "file-item.h"
>> +#include "raid-stripe-tree.h"
>>
>> static struct bio_set btrfs_bioset;
>> static struct bio_set btrfs_clone_bioset;
>> @@ -348,6 +349,21 @@ static void btrfs_raid56_end_io(struct bio *bio)
>> btrfs_put_bioc(bioc);
>> }
>>
>> +static void btrfs_raid_stripe_update(struct work_struct *work)
>> +{
>> + struct btrfs_bio *bbio =
>> + container_of(work, struct btrfs_bio, end_io_work);
>> + struct btrfs_io_stripe *stripe = bbio->bio.bi_private;
>> + struct btrfs_io_context *bioc = stripe->bioc;
>> + int ret;
>> +
>> + ret = btrfs_add_ordered_stripe(bioc);
>> + if (ret)
>> + bbio->bio.bi_status = errno_to_blk_status(ret);
>> + btrfs_orig_bbio_end_io(bbio);
>> + btrfs_put_bioc(bioc);
>> +}
>> +
>> static void btrfs_orig_write_end_io(struct bio *bio)
>> {
>> struct btrfs_io_stripe *stripe = bio->bi_private;
>> @@ -370,6 +386,16 @@ static void btrfs_orig_write_end_io(struct bio *bio)
>> else
>> bio->bi_status = BLK_STS_OK;
>>
>> + if (bio_op(bio) == REQ_OP_ZONE_APPEND)
>> + stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
>> +
>> + if (btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) {
>> + INIT_WORK(&bbio->end_io_work, btrfs_raid_stripe_update);
>> + queue_work(btrfs_end_io_wq(bioc->fs_info, bio),
>> + &bbio->end_io_work);
>
> I'm still having the old question, what would happen if the delayed
> workload happen after the ordered extent finished?
>
> Since we can not ensure the order between this RST update workload and
> finish_ordered_io(), there can be an window where we finish ordered io,
> and then the pages get released (by memory pressure), then a new read
> happen to the range, then our RST workload happened.
>
> In that case, we would have read failure.
>
>
> Thus I strongly recommened to do the RST tree update inside
> finish_ordered_io().
>
> This has several advantages:
>
> - We don't need in-memory structure as a gap stopper
> Since read would be blocked if there is a running ordered extent,
> we don't need an in-memory RST mapping.
>
> - finish_ordered_io() itself has all the proper context for tree
> updates.
> As that's the main location we update the subvolume tree.
The first versions of this patchset did that, and then you asked me
to create an in-memory structure and do the update at delayed ref time.
How about adding a completion, or something like an atomic_t
ordered_stripes_pending for the RST updates, and having
finish_ordered_io() wait for it?
> The main concern may be the bioc <-> ordered extent mapping, but IIRC
> for zoned mode one bioc is one ordered extent, thus this shouldn't be a
> super big deal?
Yep, but I want to be able to use RST for non-zoned devices as well
to attack the RAID56 problems and add erasure coding RAID.
> Otherwise we may need something to trace all the bioc belong to the
> ordered extent.
^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [PATCH v7 04/13] btrfs: add support for inserting raid stripe extents
2023-03-02 11:25 ` Johannes Thumshirn
@ 2023-03-02 11:45 ` Qu Wenruo
2023-03-02 11:58 ` Johannes Thumshirn
2023-03-02 11:45 ` Johannes Thumshirn
2023-03-02 13:59 ` Christoph Hellwig
2 siblings, 1 reply; 49+ messages in thread
From: Qu Wenruo @ 2023-03-02 11:45 UTC (permalink / raw)
To: Johannes Thumshirn, David Sterba
Cc: linux-btrfs@vger.kernel.org, Josef Bacik, Christoph Hellwig
On 2023/3/2 19:25, Johannes Thumshirn wrote:
> On 02.03.23 11:59, Qu Wenruo wrote:
>>
>>
>> On 2023/3/2 17:45, Johannes Thumshirn wrote:
>>> Add support for inserting stripe extents into the raid stripe tree on
>>> completion of every write that needs an extra logical-to-physical
>>> translation when using RAID.
>>>
>>> Inserting the stripe extents happens after the data I/O has completed,
>>> this is done to a) support zone-append and b) rule out the possibility of
>>> a RAID-write-hole.
>>>
>>> This is done by creating in-memory ordered stripe extents, just like the
>>> in memory ordered extents, on I/O completion and the on-disk raid stripe
>>> extents get created once we're running the delayed_refs for the extent
>>> item this stripe extent is tied to.
>>>
>>> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>>> ---
>>> fs/btrfs/Makefile | 2 +-
>>> fs/btrfs/bio.c | 29 +++++
>>> fs/btrfs/delayed-ref.c | 6 +-
>>> fs/btrfs/delayed-ref.h | 2 +
>>> fs/btrfs/extent-tree.c | 60 +++++++++++
>>> fs/btrfs/inode.c | 15 ++-
>>> fs/btrfs/raid-stripe-tree.c | 204 ++++++++++++++++++++++++++++++++++++
>>> fs/btrfs/raid-stripe-tree.h | 71 +++++++++++++
>>> fs/btrfs/volumes.c | 4 +-
>>> fs/btrfs/volumes.h | 13 +--
>>> fs/btrfs/zoned.c | 3 +
>>> 11 files changed, 397 insertions(+), 12 deletions(-)
>>> create mode 100644 fs/btrfs/raid-stripe-tree.c
>>> create mode 100644 fs/btrfs/raid-stripe-tree.h
>>>
>>> diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
>>> index 90d53209755b..3bb869a84e54 100644
>>> --- a/fs/btrfs/Makefile
>>> +++ b/fs/btrfs/Makefile
>>> @@ -33,7 +33,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
>>> uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
>>> block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
>>> subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \
>>> - lru_cache.o
>>> + lru_cache.o raid-stripe-tree.o
>>>
>>> btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
>>> btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
>>> diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
>>> index 726592868e9c..2b174865d347 100644
>>> --- a/fs/btrfs/bio.c
>>> +++ b/fs/btrfs/bio.c
>>> @@ -15,6 +15,7 @@
>>> #include "rcu-string.h"
>>> #include "zoned.h"
>>> #include "file-item.h"
>>> +#include "raid-stripe-tree.h"
>>>
>>> static struct bio_set btrfs_bioset;
>>> static struct bio_set btrfs_clone_bioset;
>>> @@ -348,6 +349,21 @@ static void btrfs_raid56_end_io(struct bio *bio)
>>> btrfs_put_bioc(bioc);
>>> }
>>>
>>> +static void btrfs_raid_stripe_update(struct work_struct *work)
>>> +{
>>> + struct btrfs_bio *bbio =
>>> + container_of(work, struct btrfs_bio, end_io_work);
>>> + struct btrfs_io_stripe *stripe = bbio->bio.bi_private;
>>> + struct btrfs_io_context *bioc = stripe->bioc;
>>> + int ret;
>>> +
>>> + ret = btrfs_add_ordered_stripe(bioc);
>>> + if (ret)
>>> + bbio->bio.bi_status = errno_to_blk_status(ret);
>>> + btrfs_orig_bbio_end_io(bbio);
>>> + btrfs_put_bioc(bioc);
>>> +}
>>> +
>>> static void btrfs_orig_write_end_io(struct bio *bio)
>>> {
>>> struct btrfs_io_stripe *stripe = bio->bi_private;
>>> @@ -370,6 +386,16 @@ static void btrfs_orig_write_end_io(struct bio *bio)
>>> else
>>> bio->bi_status = BLK_STS_OK;
>>>
>>> + if (bio_op(bio) == REQ_OP_ZONE_APPEND)
>>> + stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
>>> +
>>> + if (btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) {
>>> + INIT_WORK(&bbio->end_io_work, btrfs_raid_stripe_update);
>>> + queue_work(btrfs_end_io_wq(bioc->fs_info, bio),
>>> + &bbio->end_io_work);
>>
>> I'm still having the old question, what would happen if the delayed
>> workload happen after the ordered extent finished?
>>
>> Since we can not ensure the order between this RST update workload and
>> finish_ordered_io(), there can be an window where we finish ordered io,
>> and then the pages get released (by memory pressure), then a new read
>> happen to the range, then our RST workload happened.
>>
>> In that case, we would have read failure.
>>
>>
>> Thus I strongly recommened to do the RST tree update inside
>> finish_ordered_io().
>>
>> This has several advantages:
>>
>> - We don't need in-memory structure as a gap stopper
>> Since read would be blocked if there is a running ordered extent,
>> we don't need an in-memory RST mapping.
>>
>> - finish_ordered_io() itself has all the proper context for tree
>> updates.
>> As that's the main location we update the subvolume tree.
>
> The first versions of this patchset did do that and then you asked me
> to create an in-memory structure and do the update at delayed ref time.
I have to admit that I was a total idiot.
At that time I didn't notice at all that a read would block while there is a
running ordered extent...
So, all my fault.
>
> How about adding a completion, or something like a atomic_t
> ordered_stripes_pending for the RST updates and have
> finish_ordered_io() waiting for it?
That's also a feasible solution.
However, I'm a little concerned that the RST delayed work
is also queued on fs_info->endio_workers, which is also used by
finish_ordered_fn().
This can cause a deadlock if the workqueue has max_active set to one and the
running work is finish_ordered_fn(), which would then be waiting for the
RST work.
But the RST work can only be executed once endio_workers has finished
its current work, thus leading to a deadlock.
Thanks,
Qu
>
>> The main concern may be the bioc <-> ordered extent mapping, but IIRC
>> for zoned mode one bioc is one ordered extent, thus this shouldn't be a
>> super big deal?
>
> Yep, but I want to be able to use RST for non-zoned devices as well
> to attack the RAID56 problems and add erasure coding RAID.
>
>> Otherwise we may need something to trace all the bioc belong to the
>> ordered extent.
>
^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [PATCH v7 04/13] btrfs: add support for inserting raid stripe extents
2023-03-02 11:45 ` Qu Wenruo
@ 2023-03-02 11:58 ` Johannes Thumshirn
2023-03-02 12:01 ` Qu Wenruo
2023-03-02 14:01 ` Christoph Hellwig
0 siblings, 2 replies; 49+ messages in thread
From: Johannes Thumshirn @ 2023-03-02 11:58 UTC (permalink / raw)
To: Qu Wenruo, David Sterba
Cc: linux-btrfs@vger.kernel.org, Josef Bacik, Christoph Hellwig
On 02.03.23 12:45, Qu Wenruo wrote:
>>
>> How about adding a completion, or something like a atomic_t
>> ordered_stripes_pending for the RST updates and have
>> finish_ordered_io() waiting for it?
>
> That's also a feasible solution.
>
> Although I'm a little concerned about the fact that the RST delayed work
> is also going into fs_info->endio_workers, which is also used by
> finish_ordered_fn().
>
> Thus it can cause deadlock if the workqueue has one max_active, and the
> running one is finish_ordered_fn(), which then can be waiting for the
> RST work.
>
> But the RST work can only be executed if the endio_workers has finished
> its current work, thus leading to a deadlock.
How about adding a new workqueue for RST updates? That should mitigate
the deadlock.
^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [PATCH v7 04/13] btrfs: add support for inserting raid stripe extents
2023-03-02 11:58 ` Johannes Thumshirn
@ 2023-03-02 12:01 ` Qu Wenruo
2023-03-02 14:01 ` Christoph Hellwig
1 sibling, 0 replies; 49+ messages in thread
From: Qu Wenruo @ 2023-03-02 12:01 UTC (permalink / raw)
To: Johannes Thumshirn, David Sterba
Cc: linux-btrfs@vger.kernel.org, Josef Bacik, Christoph Hellwig
On 2023/3/2 19:58, Johannes Thumshirn wrote:
> On 02.03.23 12:45, Qu Wenruo wrote:
>>>
>>> How about adding a completion, or something like a atomic_t
>>> ordered_stripes_pending for the RST updates and have
>>> finish_ordered_io() waiting for it?
>>
>> That's also a feasible solution.
>>
>> Although I'm a little concerned about the fact that the RST delayed work
>> is also going into fs_info->endio_workers, which is also used by
>> finish_ordered_fn().
>>
>> Thus it can cause deadlock if the workqueue has one max_active, and the
>> running one is finish_ordered_fn(), which then can be waiting for the
>> RST work.
>>
>> But the RST work can only be executed if the endio_workers has finished
>> its current work, thus leading to a deadlock.
>
> How about adding a new workqueue for RST updates? That should mitigate
> the deadlock.
>
My bad, finish_ordered_io() goes to endio_write_workers, not
endio_workers, thus it should be fine.
Thanks,
Qu
^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [PATCH v7 04/13] btrfs: add support for inserting raid stripe extents
2023-03-02 11:58 ` Johannes Thumshirn
2023-03-02 12:01 ` Qu Wenruo
@ 2023-03-02 14:01 ` Christoph Hellwig
2023-03-02 15:31 ` Johannes Thumshirn
1 sibling, 1 reply; 49+ messages in thread
From: Christoph Hellwig @ 2023-03-02 14:01 UTC (permalink / raw)
To: Johannes Thumshirn
Cc: Qu Wenruo, David Sterba, linux-btrfs@vger.kernel.org, Josef Bacik,
Christoph Hellwig
On Thu, Mar 02, 2023 at 11:58:13AM +0000, Johannes Thumshirn wrote:
> > Thus it can cause deadlock if the workqueue has one max_active, and the
> > running one is finish_ordered_fn(), which then can be waiting for the
> > RST work.
> >
> > But the RST work can only be executed if the endio_workers has finished
> > its current work, thus leading to a deadlock.
>
> How about adding a new workqueue for RST updates? That should mitigate
> the deadlock.
The amount of weird workqueues in the btrfs end I/O path is worrisome.
What I plan to do, and might be ready to submit about next week is a
series to actually offload the I/O completion to a workqueue on a
per (original) btrfs_bio basis. This means that RST updates,
ordered_extent processing, compressed write handling etc can all
run from the same end I/O worker. As a bonus we remove all irqsave
locking from btrfs.
^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [PATCH v7 04/13] btrfs: add support for inserting raid stripe extents
2023-03-02 14:01 ` Christoph Hellwig
@ 2023-03-02 15:31 ` Johannes Thumshirn
2023-03-02 22:35 ` Qu Wenruo
2023-03-03 14:15 ` hch
0 siblings, 2 replies; 49+ messages in thread
From: Johannes Thumshirn @ 2023-03-02 15:31 UTC (permalink / raw)
To: hch@infradead.org
Cc: Qu Wenruo, David Sterba, linux-btrfs@vger.kernel.org, Josef Bacik,
Christoph Hellwig
On 02.03.23 15:02, Christoph Hellwig wrote:
> On Thu, Mar 02, 2023 at 11:58:13AM +0000, Johannes Thumshirn wrote:
>>> Thus it can cause deadlock if the workqueue has one max_active, and the
>>> running one is finish_ordered_fn(), which then can be waiting for the
>>> RST work.
>>>
>>> But the RST work can only be executed if the endio_workers has finished
>>> its current work, thus leading to a deadlock.
>>
>> How about adding a new workqueue for RST updates? That should mitigate
>> the deadlock.
>
> The amount of weird workqueues in the btrfs end I/O path is worrysome.
> What I plan to do, and might be ready to submit about next week is a
> series to actually offload the I/O completion to a workqueue on a
> per (original) btrfs_bio basis. This means that RST updates,
> ordered_extent processing, compressed write handling etc can all
> run from the same end I/O worker. As a bonus we remove all irqsave
> locking from btrfs.
>
If it's all running from the same end I/O worker then we can make sure
the race Qu suspects can be eliminated, can't we?
^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [PATCH v7 04/13] btrfs: add support for inserting raid stripe extents
2023-03-02 15:31 ` Johannes Thumshirn
@ 2023-03-02 22:35 ` Qu Wenruo
2023-03-03 11:15 ` Johannes Thumshirn
2023-03-03 14:16 ` hch
2023-03-03 14:15 ` hch
1 sibling, 2 replies; 49+ messages in thread
From: Qu Wenruo @ 2023-03-02 22:35 UTC (permalink / raw)
To: Johannes Thumshirn, hch@infradead.org
Cc: David Sterba, linux-btrfs@vger.kernel.org, Josef Bacik,
Christoph Hellwig
On 2023/3/2 23:31, Johannes Thumshirn wrote:
> On 02.03.23 15:02, Christoph Hellwig wrote:
>> On Thu, Mar 02, 2023 at 11:58:13AM +0000, Johannes Thumshirn wrote:
>>>> Thus it can cause deadlock if the workqueue has one max_active, and the
>>>> running one is finish_ordered_fn(), which then can be waiting for the
>>>> RST work.
>>>>
>>>> But the RST work can only be executed if the endio_workers has finished
>>>> its current work, thus leading to a deadlock.
>>>
>>> How about adding a new workqueue for RST updates? That should mitigate
>>> the deadlock.
>>
>> The amount of weird workqueues in the btrfs end I/O path is worrysome.
>> What I plan to do, and might be ready to submit about next week is a
>> series to actually offload the I/O completion to a workqueue on a
>> per (original) btrfs_bio basis. This means that RST updates,
>> ordered_extent processing, compressed write handling etc can all
>> run from the same end I/O worker. As a bonus we remove all irqsave
>> locking from btrfs.
>>
>
> If it's all running from the same end I/O worker then we can make sure
> the race Qu suspects can be eliminated, can't we?
In fact, waiting on one workqueue from inside another workqueue is already
wq hell...
Just making the in-memory RST update happen in finish_ordered_io() should
be good enough.
Then we can keep the RST tree update in delayed ref.
Thanks,
Qu
^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [PATCH v7 04/13] btrfs: add support for inserting raid stripe extents
2023-03-02 22:35 ` Qu Wenruo
@ 2023-03-03 11:15 ` Johannes Thumshirn
2023-03-03 11:42 ` Qu Wenruo
2023-03-03 14:17 ` hch
2023-03-03 14:16 ` hch
1 sibling, 2 replies; 49+ messages in thread
From: Johannes Thumshirn @ 2023-03-03 11:15 UTC (permalink / raw)
To: Qu Wenruo, hch@infradead.org
Cc: David Sterba, linux-btrfs@vger.kernel.org, Josef Bacik,
Christoph Hellwig, Damien Le Moal
On 02.03.23 23:35, Qu Wenruo wrote:
>>
>> If it's all running from the same end I/O worker then we can make sure
>> the race Qu suspects can be eliminated, can't we?
>
> In fact, waiting other workqueue inside other workqueue is already a wq
> hell...
>
> Just make the in-memory RST update happen in finish_ordered_io() should
> be good enough.
> Then we can keep the RST tree update in delayed ref.
There are two possibilities for how to handle it:
1) Have a common workfn that handles all the calls in the correct order
2) Do the RST update in btrfs_finish_ordered_io()
To me both are valuable options, so I don't care. Both need a bit of
preparation work before, but that's the nature of the beast.
For 2) we need a pointer to the bioc in ordered_extent, so we need to
make sure the lifetimes are in sync. Or the other way around, have
ordered_stripe hold enough information for the RST updates and the
end_io handler insert it in the ordered_stripe (that needs to be
passed into the bioc or bbio).
*Iff* I interpret Christoph's proposal in [1] correctly, option 1) is
easier to implement.
Qu, Christoph and others, what do you think?
[1] https://lore.kernel.org/linux-btrfs/ZACsVI3mfprrj4j6@infradead.org
^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [PATCH v7 04/13] btrfs: add support for inserting raid stripe extents
2023-03-03 11:15 ` Johannes Thumshirn
@ 2023-03-03 11:42 ` Qu Wenruo
2023-03-03 14:21 ` hch
2023-03-03 14:17 ` hch
1 sibling, 1 reply; 49+ messages in thread
From: Qu Wenruo @ 2023-03-03 11:42 UTC (permalink / raw)
To: Johannes Thumshirn, Qu Wenruo, hch@infradead.org
Cc: David Sterba, linux-btrfs@vger.kernel.org, Josef Bacik,
Christoph Hellwig, Damien Le Moal
On 2023/3/3 19:15, Johannes Thumshirn wrote:
> On 02.03.23 23:35, Qu Wenruo wrote:
>>>
>>> If it's all running from the same end I/O worker then we can make sure
>>> the race Qu suspects can be eliminated, can't we?
>>
>> In fact, waiting other workqueue inside other workqueue is already a wq
>> hell...
>>
>> Just make the in-memory RST update happen in finish_ordered_io() should
>> be good enough.
>> Then we can keep the RST tree update in delayed ref.
>
> There's two possibilities how to handle it:
> 1) Have a common workfn that handles all the calls in the correct order
> 2) Do the RST update in btrfs_finish_ordered_io()
>
> To me both are valuable options, so I don't care. Both need a bit of
> preparation work before, but that's the nature of the beast.
>
> For 2) we need a pointer to the bioc in ordered_extent, so we need to
> make sure the lifetimes are in sync. Or the other way around, have
> ordered_stripe hold enough information for the RST updates and the
> end_io handler insert it in the ordered_stripe (that needs to be
> passed into the bioc or bbio).
>
> *Iff* I interpret Christoph's proposal in [1] correctly, options 1) is
> easier to implement.
From my understanding, HCH's proposal is a superset of 2): not only does it do
the ordered IO thing in the same workqueue, but also all the other endio
work, so if done properly it can easily handle all the complex dependencies.
But that still depends on the final patchset.
Thanks,
Qu
>
> Qu, Christoph and others, what do you think?
>
> [1] https://lore.kernel.org/linux-btrfs/ZACsVI3mfprrj4j6@infradead.org
>
^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [PATCH v7 04/13] btrfs: add support for inserting raid stripe extents
2023-03-03 11:42 ` Qu Wenruo
@ 2023-03-03 14:21 ` hch
2023-03-06 10:58 ` Johannes Thumshirn
0 siblings, 1 reply; 49+ messages in thread
From: hch @ 2023-03-03 14:21 UTC (permalink / raw)
To: Qu Wenruo
Cc: Johannes Thumshirn, hch@infradead.org, David Sterba,
linux-btrfs@vger.kernel.org, Josef Bacik, Christoph Hellwig,
Damien Le Moal
On Fri, Mar 03, 2023 at 07:42:00PM +0800, Qu Wenruo wrote:
> From my understanding, HCH's proposal is a super set of 2), not only do the
> ordered IO thing in the same workqueue, but also all the other endio works,
> thus if done properly can easily handle all the complex dependency.
I've pushed my current WIP out, some of the commit logs aren't there
yet, and it still does some superfluous offloads for RAID5 reads:
http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/btrfs-io_end-work
^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [PATCH v7 04/13] btrfs: add support for inserting raid stripe extents
2023-03-03 14:21 ` hch
@ 2023-03-06 10:58 ` Johannes Thumshirn
0 siblings, 0 replies; 49+ messages in thread
From: Johannes Thumshirn @ 2023-03-06 10:58 UTC (permalink / raw)
To: hch@infradead.org, Qu Wenruo
Cc: David Sterba, linux-btrfs@vger.kernel.org, Josef Bacik,
Christoph Hellwig, Damien Le Moal
On 03.03.23 15:21, hch@infradead.org wrote:
> On Fri, Mar 03, 2023 at 07:42:00PM +0800, Qu Wenruo wrote:
>> From my understanding, HCH's proposal is a super set of 2), not only do the
>> ordered IO thing in the same workqueue, but also all the other endio works,
>> thus if done properly can easily handle all the complex dependency.
>
> I've pushed my current WIP out, some of the commit logs aren't there
> yet, and it still does some superflous offloads for RAID5 reads:
>
> http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/btrfs-io_end-work
>
Thanks, I just had a look at it and it seems like I can easily rebase on top
and implement option 1.
For option 2 there is still plumbing needed and I'll look into that as well.
Shouldn't be too hard.
^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [PATCH v7 04/13] btrfs: add support for inserting raid stripe extents
2023-03-03 11:15 ` Johannes Thumshirn
2023-03-03 11:42 ` Qu Wenruo
@ 2023-03-03 14:17 ` hch
1 sibling, 0 replies; 49+ messages in thread
From: hch @ 2023-03-03 14:17 UTC (permalink / raw)
To: Johannes Thumshirn
Cc: Qu Wenruo, hch@infradead.org, David Sterba,
linux-btrfs@vger.kernel.org, Josef Bacik, Christoph Hellwig,
Damien Le Moal
On Fri, Mar 03, 2023 at 11:15:00AM +0000, Johannes Thumshirn wrote:
> There's two possibilities how to handle it:
> 1) Have a common workfn that handles all the calls in the correct order
> 2) Do the RST update in btrfs_finish_ordered_io()
>
> To me both are valuable options, so I don't care. Both need a bit of
> preparation work before, but that's the nature of the beast.
>
> For 2) we need a pointer to the bioc in ordered_extent, so we need to
> make sure the lifetimes are in sync. Or the other way around, have
> ordered_stripe hold enough information for the RST updates and the
> end_io handler insert it in the ordered_stripe (that needs to be
> passed into the bioc or bbio).
>
> *Iff* I interpret Christoph's proposal in [1] correctly, options 1) is
> easier to implement.
1 is probably easier, and should be done for other reasons. But 2
really feels like the right thing to do in addition to 1.
^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [PATCH v7 04/13] btrfs: add support for inserting raid stripe extents
2023-03-02 22:35 ` Qu Wenruo
2023-03-03 11:15 ` Johannes Thumshirn
@ 2023-03-03 14:16 ` hch
2023-03-08 9:11 ` Johannes Thumshirn
1 sibling, 1 reply; 49+ messages in thread
From: hch @ 2023-03-03 14:16 UTC (permalink / raw)
To: Qu Wenruo
Cc: Johannes Thumshirn, hch@infradead.org, David Sterba,
linux-btrfs@vger.kernel.org, Josef Bacik, Christoph Hellwig
On Fri, Mar 03, 2023 at 06:35:30AM +0800, Qu Wenruo wrote:
> Just make the in-memory RST update happen in finish_ordered_io() should be
> good enough.
> Then we can keep the RST tree update in delayed ref.
Independent of the workqueue changes, doing the RST update in
finish_ordered_io feels like the right thing to me, although my
gut feeling might not be properly adjusted to btrfs yet :)
^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [PATCH v7 04/13] btrfs: add support for inserting raid stripe extents
2023-03-03 14:16 ` hch
@ 2023-03-08 9:11 ` Johannes Thumshirn
2023-03-08 12:01 ` Qu Wenruo
2023-03-08 14:33 ` Christoph Hellwig
0 siblings, 2 replies; 49+ messages in thread
From: Johannes Thumshirn @ 2023-03-08 9:11 UTC (permalink / raw)
To: hch@infradead.org, Qu Wenruo
Cc: David Sterba, linux-btrfs@vger.kernel.org, Josef Bacik,
Christoph Hellwig
On 03.03.23 15:16, hch@infradead.org wrote:
> On Fri, Mar 03, 2023 at 06:35:30AM +0800, Qu Wenruo wrote:
>> Just make the in-memory RST update happen in finish_ordered_io() should be
>> good enough.
>> Then we can keep the RST tree update in delayed ref.
>
> Independent of the workqueue changes, doing the RST update in
> finish_ordered_io feels like the right thing to me, although my
> gut feeling migh not be properly adjusted to btrfs yet :)
>
Btw, this would look something like the following (untested):
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index ab8f1c21a773..f22e34b4328f 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -352,21 +352,6 @@ static void btrfs_raid56_end_io(struct bio *bio)
btrfs_put_bioc(bioc);
}
-static void btrfs_raid_stripe_update(struct work_struct *work)
-{
- struct btrfs_bio *bbio =
- container_of(work, struct btrfs_bio, end_io_work);
- struct btrfs_io_stripe *stripe = bbio->bio.bi_private;
- struct btrfs_io_context *bioc = stripe->bioc;
- int ret;
-
- ret = btrfs_add_ordered_stripe(bioc);
- if (ret)
- bbio->bio.bi_status = errno_to_blk_status(ret);
- btrfs_orig_bbio_end_io(bbio);
- btrfs_put_bioc(bioc);
-}
-
static void btrfs_orig_write_end_io(struct bio *bio)
{
struct btrfs_io_stripe *stripe = bio->bi_private;
@@ -393,10 +378,12 @@ static void btrfs_orig_write_end_io(struct bio *bio)
stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
if (btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) {
- INIT_WORK(&bbio->end_io_work, btrfs_raid_stripe_update);
- queue_work(btrfs_end_io_wq(bioc->fs_info, bio),
- &bbio->end_io_work);
- return;
+ struct btrfs_ordered_extent *oe;
+
+ oe = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset);
+ btrfs_get_bioc(bioc);
+ oe->bioc = bioc;
+ btrfs_put_ordered_extent(oe);
}
btrfs_orig_bbio_end_io(bbio);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6b0cff5c50fb..704e8705bbb9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3159,6 +3159,11 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
btrfs_rewrite_logical_zoned(ordered_extent);
btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
ordered_extent->disk_num_bytes);
+ } else if (ordered_extent->bioc) {
+ ret = btrfs_add_ordered_stripe(ordered_extent->bioc);
+ btrfs_put_bioc(ordered_extent->bioc);
+ if (ret)
+ goto out;
}
if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 18007f9c00ad..e3939bb8a525 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -157,6 +157,8 @@ struct btrfs_ordered_extent {
* command in a workqueue context
*/
u64 physical;
+
+ struct btrfs_io_context *bioc;
};
static inline void
^ permalink raw reply related [flat|nested] 49+ messages in thread
* Re: [PATCH v7 04/13] btrfs: add support for inserting raid stripe extents
2023-03-08 9:11 ` Johannes Thumshirn
@ 2023-03-08 12:01 ` Qu Wenruo
2023-03-08 14:33 ` Christoph Hellwig
1 sibling, 0 replies; 49+ messages in thread
From: Qu Wenruo @ 2023-03-08 12:01 UTC (permalink / raw)
To: Johannes Thumshirn, hch@infradead.org
Cc: David Sterba, linux-btrfs@vger.kernel.org, Josef Bacik,
Christoph Hellwig
On 2023/3/8 17:11, Johannes Thumshirn wrote:
> On 03.03.23 15:16, hch@infradead.org wrote:
>> On Fri, Mar 03, 2023 at 06:35:30AM +0800, Qu Wenruo wrote:
>>> Just make the in-memory RST update happen in finish_ordered_io() should be
>>> good enough.
>>> Then we can keep the RST tree update in delayed ref.
>>
>> Independent of the workqueue changes, doing the RST update in
>> finish_ordered_io feels like the right thing to me, although my
>> gut feeling migh not be properly adjusted to btrfs yet :)
>>
>
> Btw, this would look sth like the following (untested):
>
> diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
> index ab8f1c21a773..f22e34b4328f 100644
> --- a/fs/btrfs/bio.c
> +++ b/fs/btrfs/bio.c
> @@ -352,21 +352,6 @@ static void btrfs_raid56_end_io(struct bio *bio)
> btrfs_put_bioc(bioc);
> }
>
> -static void btrfs_raid_stripe_update(struct work_struct *work)
> -{
> - struct btrfs_bio *bbio =
> - container_of(work, struct btrfs_bio, end_io_work);
> - struct btrfs_io_stripe *stripe = bbio->bio.bi_private;
> - struct btrfs_io_context *bioc = stripe->bioc;
> - int ret;
> -
> - ret = btrfs_add_ordered_stripe(bioc);
> - if (ret)
> - bbio->bio.bi_status = errno_to_blk_status(ret);
> - btrfs_orig_bbio_end_io(bbio);
> - btrfs_put_bioc(bioc);
> -}
> -
> static void btrfs_orig_write_end_io(struct bio *bio)
> {
> struct btrfs_io_stripe *stripe = bio->bi_private;
> @@ -393,10 +378,12 @@ static void btrfs_orig_write_end_io(struct bio *bio)
> stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
>
> if (btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) {
> - INIT_WORK(&bbio->end_io_work, btrfs_raid_stripe_update);
> - queue_work(btrfs_end_io_wq(bioc->fs_info, bio),
> - &bbio->end_io_work);
> - return;
> + struct btrfs_ordered_extent *oe;
> +
> + oe = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset);
> + btrfs_get_bioc(bioc);
> + oe->bioc = bioc;
Looks valid to me.
Bioc would get a slightly longer lifespan, but it should be fine I guess?
Thanks,
Qu
> + btrfs_put_ordered_extent(oe);
> }
>
> btrfs_orig_bbio_end_io(bbio);
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 6b0cff5c50fb..704e8705bbb9 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -3159,6 +3159,11 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
> btrfs_rewrite_logical_zoned(ordered_extent);
> btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
> ordered_extent->disk_num_bytes);
> + } else if (ordered_extent->bioc) {
> + ret = btrfs_add_ordered_stripe(ordered_extent->bioc);
> + btrfs_put_bioc(ordered_extent->bioc);
> + if (ret)
> + goto out;
> }
>
> if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
> diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
> index 18007f9c00ad..e3939bb8a525 100644
> --- a/fs/btrfs/ordered-data.h
> +++ b/fs/btrfs/ordered-data.h
> @@ -157,6 +157,8 @@ struct btrfs_ordered_extent {
> * command in a workqueue context
> */
> u64 physical;
> +
> + struct btrfs_io_context *bioc;
> };
>
> static inline void
>
^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [PATCH v7 04/13] btrfs: add support for inserting raid stripe extents
2023-03-08 9:11 ` Johannes Thumshirn
2023-03-08 12:01 ` Qu Wenruo
@ 2023-03-08 14:33 ` Christoph Hellwig
2023-03-09 10:53 ` Johannes Thumshirn
1 sibling, 1 reply; 49+ messages in thread
From: Christoph Hellwig @ 2023-03-08 14:33 UTC (permalink / raw)
To: Johannes Thumshirn
Cc: hch@infradead.org, Qu Wenruo, David Sterba,
linux-btrfs@vger.kernel.org, Josef Bacik, Christoph Hellwig
On Wed, Mar 08, 2023 at 09:11:54AM +0000, Johannes Thumshirn wrote:
> @@ -393,10 +378,12 @@ static void btrfs_orig_write_end_io(struct bio *bio)
> stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
>
> if (btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) {
> + struct btrfs_ordered_extent *oe;
> +
> + oe = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset);
> + btrfs_get_bioc(bioc);
> + oe->bioc = bioc;
> + btrfs_put_ordered_extent(oe);
> + } else if (ordered_extent->bioc) {
> + ret = btrfs_add_ordered_stripe(ordered_extent->bioc);
> + btrfs_put_bioc(ordered_extent->bioc);
> + if (ret)
> + goto out;
Given that btrfs_add_ordered_stripe only really builds the
btrfs_ordered_stripe structure and inserts it into the tree,
can't we just allocate the btrfs_ordered_stripe structure
in the end_io handler and have the ordered_extent point to it?
Also if you don't want to split the ordered_extent for each bio,
you could instead have a list of btrfs_ordered_stripes in the
ordered_extent and then process all of them in the ordered_extent
completion handling.
^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [PATCH v7 04/13] btrfs: add support for inserting raid stripe extents
2023-03-08 14:33 ` Christoph Hellwig
@ 2023-03-09 10:53 ` Johannes Thumshirn
2023-03-09 15:20 ` Christoph Hellwig
0 siblings, 1 reply; 49+ messages in thread
From: Johannes Thumshirn @ 2023-03-09 10:53 UTC (permalink / raw)
To: Christoph Hellwig
Cc: hch@infradead.org, Qu Wenruo, David Sterba,
linux-btrfs@vger.kernel.org, Josef Bacik
On 08.03.23 15:34, Christoph Hellwig wrote:
> On Wed, Mar 08, 2023 at 09:11:54AM +0000, Johannes Thumshirn wrote:
>> @@ -393,10 +378,12 @@ static void btrfs_orig_write_end_io(struct bio *bio)
>> stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
>>
>> if (btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) {
>> + struct btrfs_ordered_extent *oe;
>> +
>> + oe = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset);
>> + btrfs_get_bioc(bioc);
>> + oe->bioc = bioc;
>> + btrfs_put_ordered_extent(oe);
>
>
>> + } else if (ordered_extent->bioc) {
>> + ret = btrfs_add_ordered_stripe(ordered_extent->bioc);
>> + btrfs_put_bioc(ordered_extent->bioc);
>> + if (ret)
>> + goto out;
>
> Given that btrfs_add_ordered_stripe only really builds the
> btrfs_ordered_stripe structure and inserts it into the tree,
> can't we just allocate the btrfs_ordered_stripe structure
> in the end_io handler and have the ordered_extent point to it?
I wanted to avoid memory allocations in the end_io handler though.
If all is offloaded to a common workqueue, like with your proposal,
that'll be ok for me, but atomic allocations don't look right for
this to me.
> Also if you don't to split the ordered_extent for each bio,
> you could instead have a list of btrfs_ordered_stripes in the
> ordered_extent and then process all of them in the ordered_extent
> completion handling.
>
Hmm yeah I need to think about it but theoretically this should be
doable.
^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [PATCH v7 04/13] btrfs: add support for inserting raid stripe extents
2023-03-09 10:53 ` Johannes Thumshirn
@ 2023-03-09 15:20 ` Christoph Hellwig
0 siblings, 0 replies; 49+ messages in thread
From: Christoph Hellwig @ 2023-03-09 15:20 UTC (permalink / raw)
To: Johannes Thumshirn
Cc: Christoph Hellwig, hch@infradead.org, Qu Wenruo, David Sterba,
linux-btrfs@vger.kernel.org, Josef Bacik
On Thu, Mar 09, 2023 at 10:53:08AM +0000, Johannes Thumshirn wrote:
> I wanted to avoid memory allocations in the end_io handler though.
> If all is offloaded to a common workqueue, like with your proposal,
> that'll be ok for me, but atomic allocations don't look right for
> this for me.
Indeed.
^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [PATCH v7 04/13] btrfs: add support for inserting raid stripe extents
2023-03-02 15:31 ` Johannes Thumshirn
2023-03-02 22:35 ` Qu Wenruo
@ 2023-03-03 14:15 ` hch
1 sibling, 0 replies; 49+ messages in thread
From: hch @ 2023-03-03 14:15 UTC (permalink / raw)
To: Johannes Thumshirn
Cc: hch@infradead.org, Qu Wenruo, David Sterba,
linux-btrfs@vger.kernel.org, Josef Bacik, Christoph Hellwig
On Thu, Mar 02, 2023 at 03:31:51PM +0000, Johannes Thumshirn wrote:
> If it's all running from the same end I/O worker then we can make sure
> the race Qu suspects can be eliminated, can't we?
Yes.
^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [PATCH v7 04/13] btrfs: add support for inserting raid stripe extents
2023-03-02 11:25 ` Johannes Thumshirn
2023-03-02 11:45 ` Qu Wenruo
@ 2023-03-02 11:45 ` Johannes Thumshirn
2023-03-02 14:03 ` Christoph Hellwig
2023-03-02 13:59 ` Christoph Hellwig
2 siblings, 1 reply; 49+ messages in thread
From: Johannes Thumshirn @ 2023-03-02 11:45 UTC (permalink / raw)
To: Qu Wenruo, David Sterba
Cc: linux-btrfs@vger.kernel.org, Josef Bacik, Christoph Hellwig
On 02.03.23 12:25, Johannes Thumshirn wrote:
>> I'm still having the old question, what would happen if the delayed
>> workload happen after the ordered extent finished?
>>
>> Since we can not ensure the order between this RST update workload and
>> finish_ordered_io(), there can be an window where we finish ordered io,
>> and then the pages get released (by memory pressure), then a new read
>> happen to the range, then our RST workload happened.
>>
>> In that case, we would have read failure.
>>
>>
>> Thus I strongly recommend doing the RST tree update inside
>> finish_ordered_io().
>>
>> This has several advantages:
>>
>> - We don't need in-memory structure as a gap stopper
>> Since read would be blocked if there is a running ordered extent,
>> we don't need an in-memory RST mapping.
>>
>> - finish_ordered_io() itself has all the proper context for tree
>> updates.
>> As that's the main location we update the subvolume tree.
>
> The first versions of this patchset did do that and then you asked me
> to create an in-memory structure and do the update at delayed ref time.
>
> How about adding a completion, or something like a atomic_t
> ordered_stripes_pending for the RST updates and have
> finish_ordered_io() waiting for it?
Something like the following (completely untested):
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 2b174865d347..f96177a501e4 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -391,6 +391,7 @@ static void btrfs_orig_write_end_io(struct bio *bio)
if (btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) {
INIT_WORK(&bbio->end_io_work, btrfs_raid_stripe_update);
+ btrfs_add_ordered_stripe_pending(bioc->fs_info);
queue_work(btrfs_end_io_wq(bioc->fs_info, bio),
&bbio->end_io_work);
return;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index abbfd71f2cb6..f88a4c92c248 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3044,6 +3044,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
rwlock_init(&fs_info->stripe_update_lock);
fs_info->stripe_update_tree = RB_ROOT;
+ atomic_set(&fs_info->ordered_stripes_pending, 0);
+ init_waitqueue_head(&fs_info->ordered_stripe_wait);
}
static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb)
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index dd151538d2b1..ab25873267ea 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -794,6 +794,8 @@ struct btrfs_fs_info {
rwlock_t stripe_update_lock;
struct rb_root stripe_update_tree;
+ atomic_t ordered_stripes_pending;
+ wait_queue_head_t ordered_stripe_wait;
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
spinlock_t ref_verify_lock;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index aaa1db90e58b..a84e79ad840e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3230,6 +3230,9 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
+ wait_event(fs_info->ordered_stripe_wait,
+ !btrfs_ordered_stripes_pending(fs_info));
+
ret = add_pending_csums(trans, &ordered_extent->list);
if (ret) {
btrfs_abort_transaction(trans, ret);
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index 8799a7abaf38..e06457771976 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -110,6 +110,7 @@ int btrfs_add_ordered_stripe(struct btrfs_io_context *bioc)
rb_replace_node(node, &stripe->rb_node,
&fs_info->stripe_update_tree);
}
+ atomic_dec(&fs_info->ordered_stripes_pending);
write_unlock(&fs_info->stripe_update_lock);
trace_btrfs_ordered_stripe_add(fs_info, stripe);
diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h
index 371409351d60..ecc67126ed62 100644
--- a/fs/btrfs/raid-stripe-tree.h
+++ b/fs/btrfs/raid-stripe-tree.h
@@ -84,4 +84,15 @@ static inline void btrfs_drop_ordered_stripe(struct btrfs_fs_info *fs_info,
/* once for the tree */
btrfs_put_ordered_stripe(fs_info, stripe);
}
+
+static inline void btrfs_add_ordered_stripe_pending(struct btrfs_fs_info *fs_info)
+{
+ atomic_inc(&fs_info->ordered_stripes_pending);
+}
+
+static inline bool btrfs_ordered_stripes_pending(struct btrfs_fs_info *fs_info)
+{
+ return atomic_read(&fs_info->ordered_stripes_pending) != 0;
+}
+
#endif
^ permalink raw reply related [flat|nested] 49+ messages in thread
* Re: [PATCH v7 04/13] btrfs: add support for inserting raid stripe extents
2023-03-02 11:45 ` Johannes Thumshirn
@ 2023-03-02 14:03 ` Christoph Hellwig
0 siblings, 0 replies; 49+ messages in thread
From: Christoph Hellwig @ 2023-03-02 14:03 UTC (permalink / raw)
To: Johannes Thumshirn
Cc: Qu Wenruo, David Sterba, linux-btrfs@vger.kernel.org, Josef Bacik,
Christoph Hellwig
On Thu, Mar 02, 2023 at 11:45:44AM +0000, Johannes Thumshirn wrote:
> + wait_event(fs_info->ordered_stripe_wait,
> + !btrfs_ordered_stripes_pending(fs_info));
Ugg. Waiting for processing from one workqueue from another one
is really a big fat warning sign you are in workqueue hell. Let's
take a step back and solve this properly.
^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [PATCH v7 04/13] btrfs: add support for inserting raid stripe extents
2023-03-02 11:25 ` Johannes Thumshirn
2023-03-02 11:45 ` Qu Wenruo
2023-03-02 11:45 ` Johannes Thumshirn
@ 2023-03-02 13:59 ` Christoph Hellwig
2023-03-02 15:29 ` Johannes Thumshirn
2023-03-03 0:13 ` Qu Wenruo
2 siblings, 2 replies; 49+ messages in thread
From: Christoph Hellwig @ 2023-03-02 13:59 UTC (permalink / raw)
To: Johannes Thumshirn
Cc: Qu Wenruo, David Sterba, linux-btrfs@vger.kernel.org, Josef Bacik,
Christoph Hellwig
On Thu, Mar 02, 2023 at 11:25:22AM +0000, Johannes Thumshirn wrote:
> > The main concern may be the bioc <-> ordered extent mapping, but IIRC
> > for zoned mode one bioc is one ordered extent, thus this shouldn't be a
> > super big deal?
>
> Yep, but I want to be able to use RST for non-zoned devices as well
> to attack the RAID56 problems and add erasure coding RAID.
I have a series in my queue that limits every btrfs_bio (and thus bioc)
to a single ordered_extent. The bio spanning ordered_extents is a very
strange corner case that rarely happens but causes a lot of problems.
With that series we'll also gain a pointer to the ordered_extent from
the btrfs_bio, which will remove all the ordered_extent lookups from
the fast path.
So I think you can rework your series to also limit the bio to a single
ordered extent, and if needed split the ordered extent for anything that
uses the raid stripe tree and we'll nicely converge there.
^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [PATCH v7 04/13] btrfs: add support for inserting raid stripe extents
2023-03-02 13:59 ` Christoph Hellwig
@ 2023-03-02 15:29 ` Johannes Thumshirn
2023-03-03 0:13 ` Qu Wenruo
1 sibling, 0 replies; 49+ messages in thread
From: Johannes Thumshirn @ 2023-03-02 15:29 UTC (permalink / raw)
To: hch@infradead.org
Cc: Qu Wenruo, David Sterba, linux-btrfs@vger.kernel.org, Josef Bacik,
Christoph Hellwig
On 02.03.23 14:59, Christoph Hellwig wrote:
> On Thu, Mar 02, 2023 at 11:25:22AM +0000, Johannes Thumshirn wrote:
>>> The main concern may be the bioc <-> ordered extent mapping, but IIRC
>>> for zoned mode one bioc is one ordered extent, thus this shouldn't be a
>>> super big deal?
>>
>> Yep, but I want to be able to use RST for non-zoned devices as well
>> to attack the RAID56 problems and add erasure coding RAID.
>
> I have a series in my queue the limits every btrfs_bio (and thus bioc)
> to a single ordered_extent. The bio spanning ordered_extents is a very
> strange corner case that rarely happens but causes a lot of problems.
> With that series we'll also gain a pointer to the ordered_extent from
> the btrfs_bio, which will remove all the ordered_extent lookups from
> the fast path.
>
> So I think you can rework your series to also limit the bio to a single
> ordered extent, and if needed split the ordered extent for anything that
> uses the raid stripe tree and we'll nicely converge there.
>
That does indeed sound like a good idea to me.
^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [PATCH v7 04/13] btrfs: add support for inserting raid stripe extents
2023-03-02 13:59 ` Christoph Hellwig
2023-03-02 15:29 ` Johannes Thumshirn
@ 2023-03-03 0:13 ` Qu Wenruo
2023-03-03 14:15 ` Christoph Hellwig
1 sibling, 1 reply; 49+ messages in thread
From: Qu Wenruo @ 2023-03-03 0:13 UTC (permalink / raw)
To: Christoph Hellwig, Johannes Thumshirn
Cc: David Sterba, linux-btrfs@vger.kernel.org, Josef Bacik,
Christoph Hellwig
On 2023/3/2 21:59, Christoph Hellwig wrote:
> On Thu, Mar 02, 2023 at 11:25:22AM +0000, Johannes Thumshirn wrote:
>>> The main concern may be the bioc <-> ordered extent mapping, but IIRC
>>> for zoned mode one bioc is one ordered extent, thus this shouldn't be a
>>> super big deal?
>>
>> Yep, but I want to be able to use RST for non-zoned devices as well
>> to attack the RAID56 problems and add erasure coding RAID.
>
> I have a series in my queue the limits every btrfs_bio (and thus bioc)
> to a single ordered_extent. The bio spanning ordered_extents is a very
> strange corner case that rarely happens but causes a lot of problems.
Really?
A not-so-large write (e.g. 4MiB) for RAID0 (64K stripe len) can easily
lead to that situation.
If we really split ordered extents to that stripe len, it can cause a
lot of small file extents thus bloat the size of subvolume trees.
Thanks,
Qu
> With that series we'll also gain a pointer to the ordered_extent from
> the btrfs_bio, which will remove all the ordered_extent lookups from
> the fast path.
>
> So I think you can rework your series to also limit the bio to a single
> ordered extent, and if needed split the ordered extent for anything that
> uses the raid stripe tree and we'll nicely converge there.
^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [PATCH v7 04/13] btrfs: add support for inserting raid stripe extents
2023-03-03 0:13 ` Qu Wenruo
@ 2023-03-03 14:15 ` Christoph Hellwig
2023-03-03 23:03 ` Qu Wenruo
0 siblings, 1 reply; 49+ messages in thread
From: Christoph Hellwig @ 2023-03-03 14:15 UTC (permalink / raw)
To: Qu Wenruo
Cc: Christoph Hellwig, Johannes Thumshirn, David Sterba,
linux-btrfs@vger.kernel.org, Josef Bacik
On Fri, Mar 03, 2023 at 08:13:23AM +0800, Qu Wenruo wrote:
> > I have a series in my queue the limits every btrfs_bio (and thus bioc)
> > to a single ordered_extent. The bio spanning ordered_extents is a very
> > strange corner case that rarely happens but causes a lot of problems.
>
> Really?
>
> A not-so-large write (e.g. 4MiB) for RAID0 (64K stripe len) can easily lead
> to that situation.
>
> If we really split ordered extents to that stripe len, it can cause a lot of
> small file extents thus bloat the size of subvolume trees.
I could have been a little more clear in my wording.
This is talking about the btrfs_bio submitted by the upper layers using
btrfs_submit_bio, not the ones split out by it at the extent boundaries.
^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [PATCH v7 04/13] btrfs: add support for inserting raid stripe extents
2023-03-03 14:15 ` Christoph Hellwig
@ 2023-03-03 23:03 ` Qu Wenruo
2023-03-06 17:11 ` Christoph Hellwig
0 siblings, 1 reply; 49+ messages in thread
From: Qu Wenruo @ 2023-03-03 23:03 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Johannes Thumshirn, David Sterba, linux-btrfs@vger.kernel.org,
Josef Bacik
On 2023/3/3 22:15, Christoph Hellwig wrote:
> On Fri, Mar 03, 2023 at 08:13:23AM +0800, Qu Wenruo wrote:
>>> I have a series in my queue the limits every btrfs_bio (and thus bioc)
>>> to a single ordered_extent. The bio spanning ordered_extents is a very
>>> strange corner case that rarely happens but causes a lot of problems.
>>
>> Really?
>>
>> A not-so-large write (e.g. 4MiB) for RAID0 (64K stripe len) can easily lead
>> to that situation.
>>
>> If we really split ordered extents to that stripe len, it can cause a lot of
>> small file extents thus bloat the size of subvolume trees.
>
> I might have been a little more clear in my wording.
>
> This is talking about the btrfs_bio submitted by the upper layers using
> btrfs_submit_bio, not the ones split out by it at the extent boundaries.
Oh, that indeed solves most of the common cases.
But I'm not familiar enough on the direct IO front, IIRC we have some
recent bugs related to page faulting during a larger direct IO write.
Not sure if that would affect the use case.
Thanks,
Qu
^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [PATCH v7 04/13] btrfs: add support for inserting raid stripe extents
2023-03-03 23:03 ` Qu Wenruo
@ 2023-03-06 17:11 ` Christoph Hellwig
0 siblings, 0 replies; 49+ messages in thread
From: Christoph Hellwig @ 2023-03-06 17:11 UTC (permalink / raw)
To: Qu Wenruo
Cc: Christoph Hellwig, Johannes Thumshirn, David Sterba,
linux-btrfs@vger.kernel.org, Josef Bacik
On Sat, Mar 04, 2023 at 07:03:01AM +0800, Qu Wenruo wrote:
> But I'm not familiar enough on the direct IO front, IIRC we have some recent
> bugs related to page faulting during a larger direct IO write.
Yes. But the fix for that is actually doing some of the same changes
I had planned for my series, so I think we're aligned on the direction.
^ permalink raw reply [flat|nested] 49+ messages in thread
* [PATCH v7 05/13] btrfs: delete stripe extent on extent deletion
2023-03-02 9:45 [PATCH v7 00/13] btrfs: introduce RAID stripe tree Johannes Thumshirn
` (3 preceding siblings ...)
2023-03-02 9:45 ` [PATCH v7 04/13] btrfs: add support for inserting raid stripe extents Johannes Thumshirn
@ 2023-03-02 9:45 ` Johannes Thumshirn
2023-03-02 9:45 ` [PATCH v7 06/13] btrfs: lookup physical address from stripe extent Johannes Thumshirn
` (9 subsequent siblings)
14 siblings, 0 replies; 49+ messages in thread
From: Johannes Thumshirn @ 2023-03-02 9:45 UTC (permalink / raw)
To: David Sterba
Cc: Johannes Thumshirn, linux-btrfs, Josef Bacik, Christoph Hellwig
As each stripe extent is tied to an extent item, delete the stripe extent
once the corresponding extent item is deleted.
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
fs/btrfs/extent-tree.c | 8 ++
fs/btrfs/raid-stripe-tree.c | 176 ++++++++++++++++++++++++++++++++++++
fs/btrfs/raid-stripe-tree.h | 5 +
fs/btrfs/volumes.c | 27 ++++--
4 files changed, 209 insertions(+), 7 deletions(-)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7441d784fe03..b08e7b4688e0 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3238,6 +3238,14 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
}
+ if (is_data) {
+ ret = btrfs_delete_raid_extent(trans, bytenr, num_bytes);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+ }
+
ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
num_to_del);
if (ret) {
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index 9d3e7bffe6f8..f58b28157a9c 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -124,6 +124,37 @@ void btrfs_put_ordered_stripe(struct btrfs_fs_info *fs_info,
}
}
+int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start,
+ u64 length)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *stripe_root = btrfs_stripe_tree_root(fs_info);
+ struct btrfs_path *path;
+ struct btrfs_key stripe_key;
+ int ret;
+
+ if (!stripe_root)
+ return 0;
+
+ stripe_key.objectid = start;
+ stripe_key.type = BTRFS_RAID_STRIPE_KEY;
+ stripe_key.offset = length;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(trans, stripe_root, &stripe_key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+
+ ret = btrfs_del_item(trans, stripe_root, path);
+out:
+ btrfs_free_path(path);
+ return ret;
+
+}
+
int btrfs_insert_preallocated_raid_stripe(struct btrfs_fs_info *fs_info,
u64 start, u64 len)
{
@@ -202,3 +233,148 @@ int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
return ret;
}
+
+static bool btrfs_physical_from_ordered_stripe(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 *length,
+ int num_stripes,
+ struct btrfs_io_stripe *stripe)
+{
+ struct btrfs_ordered_stripe *os;
+ u64 offset;
+ u64 found_end;
+ u64 end;
+ int i;
+
+ os = btrfs_lookup_ordered_stripe(fs_info, logical);
+ if (!os)
+ return false;
+
+ end = logical + *length;
+ found_end = os->logical + os->num_bytes;
+ if (end > found_end)
+ *length -= end - found_end;
+
+ for (i = 0; i < num_stripes; i++) {
+ if (os->stripes[i].dev != stripe->dev)
+ continue;
+
+ ASSERT(logical >= os->logical);
+ offset = logical - os->logical;
+ stripe->physical = os->stripes[i].physical + offset;
+ btrfs_put_ordered_stripe(fs_info, os);
+ break;
+ }
+
+ return true;
+}
+
+int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 *length, u64 map_type,
+ struct btrfs_io_stripe *stripe)
+{
+ struct btrfs_root *stripe_root = btrfs_stripe_tree_root(fs_info);
+ int num_stripes = btrfs_bg_type_to_factor(map_type);
+ struct btrfs_stripe_extent *stripe_extent;
+ struct btrfs_key stripe_key;
+ struct btrfs_key found_key;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ u64 offset;
+ u64 found_logical;
+ u64 found_length;
+ u64 end;
+ u64 found_end;
+ int slot;
+ int ret;
+ int i;
+
+ /*
+ * If we still have the stripe in the ordered stripe tree get it from
+ * there
+ */
+ if (btrfs_physical_from_ordered_stripe(fs_info, logical, length,
+ num_stripes, stripe))
+ return 0;
+
+ stripe_key.objectid = logical;
+ stripe_key.type = BTRFS_RAID_STRIPE_KEY;
+ stripe_key.offset = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(NULL, stripe_root, &stripe_key, path, 0, 0);
+ if (ret < 0)
+ goto free_path;
+ if (ret) {
+ if (path->slots[0] != 0)
+ path->slots[0]--;
+ }
+
+ end = logical + *length;
+
+ while (1) {
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+ found_logical = found_key.objectid;
+ found_length = found_key.offset;
+
+ if (found_logical > end)
+ break;
+
+ if (!in_range(logical, found_logical, found_length))
+ goto next;
+
+ offset = logical - found_logical;
+ found_end = found_logical + found_length;
+
+ /*
+ * If we have a logically contiguous, but physically
+ * noncontinuous range, we need to split the bio. Record the
+ * length after which we must split the bio.
+ */
+ if (end > found_end)
+ *length -= end - found_end;
+
+ stripe_extent =
+ btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);
+ for (i = 0; i < num_stripes; i++) {
+ if (btrfs_raid_stride_devid_nr(leaf,
+ stripe_extent, i) != stripe->dev->devid)
+ continue;
+ stripe->physical = btrfs_raid_stride_physical_nr(leaf,
+ stripe_extent, i) + offset;
+ ret = 0;
+ goto out;
+ }
+
+ /*
+ * If we're here, we haven't found the requested devid in the
+ * stripe.
+ */
+ ret = -ENOENT;
+ goto out;
+next:
+ ret = btrfs_next_item(stripe_root, path);
+ if (ret)
+ break;
+ }
+
+out:
+ if (ret > 0)
+ ret = -ENOENT;
+ if (ret && ret != -EIO) {
+ btrfs_err(fs_info,
+ "cannot find raid-stripe for logical [%llu, %llu]",
+ logical, logical + *length);
+ btrfs_print_tree(leaf, 1);
+ }
+
+free_path:
+ btrfs_free_path(path);
+
+ return ret;
+}
diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h
index 60d3f8489cc9..9359df0ca3f1 100644
--- a/fs/btrfs/raid-stripe-tree.h
+++ b/fs/btrfs/raid-stripe-tree.h
@@ -22,6 +22,11 @@ struct btrfs_ordered_stripe {
refcount_t ref;
};
+int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 *length, u64 map_type,
+ struct btrfs_io_stripe *stripe);
+int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start,
+ u64 length);
int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
struct btrfs_ordered_stripe *stripe);
int btrfs_insert_preallocated_raid_stripe(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index fee611d1b01d..b4b615421643 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -35,6 +35,7 @@
#include "relocation.h"
#include "scrub.h"
#include "super.h"
+#include "raid-stripe-tree.h"
#define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
BTRFS_BLOCK_GROUP_RAID10 | \
@@ -6286,12 +6287,21 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
return U64_MAX;
}
-static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map,
- u32 stripe_index, u64 stripe_offset, u32 stripe_nr)
+static int set_io_stripe(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
+ u64 logical, u64 *length, struct btrfs_io_stripe *dst,
+ struct map_lookup *map, u32 stripe_index,
+ u64 stripe_offset, u64 stripe_nr)
{
dst->dev = map->stripes[stripe_index].dev;
+
+ if (op == BTRFS_MAP_READ &&
+ btrfs_need_stripe_tree_update(fs_info, map->type))
+ return btrfs_get_raid_extent_offset(fs_info, logical, length,
+ map->type, dst);
+
dst->physical = map->stripes[stripe_index].physical +
stripe_offset + (stripe_nr << BTRFS_STRIPE_LEN_SHIFT);
+ return 0;
}
int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
@@ -6485,13 +6495,14 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
smap->dev = dev_replace->tgtdev;
smap->physical = physical_to_patch_in_first_stripe;
*mirror_num_ret = map->num_stripes + 1;
+ ret = 0;
} else {
- set_io_stripe(smap, map, stripe_index, stripe_offset,
- stripe_nr);
+ ret = set_io_stripe(fs_info, op, logical, length, smap,
+ map, stripe_index, stripe_offset,
+ stripe_nr);
*mirror_num_ret = mirror_num;
}
*bioc_ret = NULL;
- ret = 0;
goto out;
}
@@ -6522,7 +6533,8 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
bioc->full_stripe_logical = em->start +
((stripe_nr * data_stripes) << BTRFS_STRIPE_LEN_SHIFT);
for (i = 0; i < num_stripes; i++)
- set_io_stripe(&bioc->stripes[i], map,
+ set_io_stripe(fs_info, op, logical, length,
+ &bioc->stripes[i], map,
(i + stripe_nr) % num_stripes,
stripe_offset, stripe_nr);
} else {
@@ -6531,7 +6543,8 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
* stripe into the bioc.
*/
for (i = 0; i < num_stripes; i++) {
- set_io_stripe(&bioc->stripes[i], map, stripe_index,
+ set_io_stripe(fs_info, op, logical, length,
+ &bioc->stripes[i], map, stripe_index,
stripe_offset, stripe_nr);
stripe_index++;
}
--
2.39.1
^ permalink raw reply related [flat|nested] 49+ messages in thread
* [PATCH v7 06/13] btrfs: lookup physical address from stripe extent
2023-03-02 9:45 [PATCH v7 00/13] btrfs: introduce RAID stripe tree Johannes Thumshirn
` (4 preceding siblings ...)
2023-03-02 9:45 ` [PATCH v7 05/13] btrfs: delete stripe extent on extent deletion Johannes Thumshirn
@ 2023-03-02 9:45 ` Johannes Thumshirn
2023-03-02 9:45 ` [PATCH v7 07/13] btrfs: add raid stripe tree pretty printer Johannes Thumshirn
` (8 subsequent siblings)
14 siblings, 0 replies; 49+ messages in thread
From: Johannes Thumshirn @ 2023-03-02 9:45 UTC (permalink / raw)
To: David Sterba
Cc: Johannes Thumshirn, linux-btrfs, Josef Bacik, Christoph Hellwig
Look up the physical address from the raid stripe tree when a read on a
RAID volume formatted with the raid stripe tree is attempted.
If the requested logical address is not found in the stripe tree, it may
still be in the in-memory ordered stripe tree, so fall back to searching
the ordered stripe tree in this case.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
fs/btrfs/volumes.c | 20 +++++++++++++-------
1 file changed, 13 insertions(+), 7 deletions(-)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b4b615421643..80baabdef153 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6533,23 +6533,29 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
bioc->full_stripe_logical = em->start +
((stripe_nr * data_stripes) << BTRFS_STRIPE_LEN_SHIFT);
for (i = 0; i < num_stripes; i++)
- set_io_stripe(fs_info, op, logical, length,
- &bioc->stripes[i], map,
- (i + stripe_nr) % num_stripes,
- stripe_offset, stripe_nr);
+ ret = set_io_stripe(fs_info, op, logical, length,
+ &bioc->stripes[i], map,
+ (i + stripe_nr) % num_stripes,
+ stripe_offset, stripe_nr);
} else {
/*
* For all other non-RAID56 profiles, just copy the target
* stripe into the bioc.
*/
for (i = 0; i < num_stripes; i++) {
- set_io_stripe(fs_info, op, logical, length,
- &bioc->stripes[i], map, stripe_index,
- stripe_offset, stripe_nr);
+ ret = set_io_stripe(fs_info, op, logical, length,
+ &bioc->stripes[i], map, stripe_index,
+ stripe_offset, stripe_nr);
stripe_index++;
}
}
+ if (ret) {
+ *bioc_ret = NULL;
+ btrfs_put_bioc(bioc);
+ goto out;
+ }
+
if (need_full_stripe(op))
max_errors = btrfs_chunk_max_errors(map);
--
2.39.1
^ permalink raw reply related [flat|nested] 49+ messages in thread
* [PATCH v7 07/13] btrfs: add raid stripe tree pretty printer
2023-03-02 9:45 [PATCH v7 00/13] btrfs: introduce RAID stripe tree Johannes Thumshirn
` (5 preceding siblings ...)
2023-03-02 9:45 ` [PATCH v7 06/13] btrfs: lookup physical address from stripe extent Johannes Thumshirn
@ 2023-03-02 9:45 ` Johannes Thumshirn
2023-03-02 9:45 ` [PATCH v7 08/13] btrfs: zoned: allow zoned RAID Johannes Thumshirn
` (7 subsequent siblings)
14 siblings, 0 replies; 49+ messages in thread
From: Johannes Thumshirn @ 2023-03-02 9:45 UTC (permalink / raw)
To: David Sterba
Cc: Johannes Thumshirn, linux-btrfs, Josef Bacik, Christoph Hellwig
Decode raid-stripe-tree entries on btrfs_print_tree().
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
fs/btrfs/print-tree.c | 21 +++++++++++++++++++++
1 file changed, 21 insertions(+)
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index b93c96213304..d9506d54298b 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -9,6 +9,7 @@
#include "print-tree.h"
#include "accessors.h"
#include "tree-checker.h"
+#include "raid-stripe-tree.h"
struct root_name_map {
u64 id;
@@ -28,6 +29,7 @@ static const struct root_name_map root_map[] = {
{ BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" },
{ BTRFS_BLOCK_GROUP_TREE_OBJECTID, "BLOCK_GROUP_TREE" },
{ BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" },
+ { BTRFS_RAID_STRIPE_TREE_OBJECTID, "RAID_STRIPE_TREE" },
};
const char *btrfs_root_name(const struct btrfs_key *key, char *buf)
@@ -187,6 +189,20 @@ static void print_uuid_item(struct extent_buffer *l, unsigned long offset,
}
}
+static void print_raid_stripe_key(struct extent_buffer *eb, u32 item_size,
+ struct btrfs_stripe_extent *stripe)
+{
+ int num_stripes;
+ int i;
+
+ num_stripes = item_size / sizeof(struct btrfs_raid_stride);
+
+ for (i = 0; i < num_stripes; i++)
+ pr_info("\t\t\tstride %d devid %llu physical %llu\n", i,
+ btrfs_raid_stride_devid_nr(eb, stripe, i),
+ btrfs_raid_stride_physical_nr(eb, stripe, i));
+}
+
/*
* Helper to output refs and locking status of extent buffer. Useful to debug
* race condition related problems.
@@ -351,6 +367,11 @@ void btrfs_print_leaf(struct extent_buffer *l)
print_uuid_item(l, btrfs_item_ptr_offset(l, i),
btrfs_item_size(l, i));
break;
+ case BTRFS_RAID_STRIPE_KEY:
+ print_raid_stripe_key(l, btrfs_item_size(l, i),
+ btrfs_item_ptr(l, i,
+ struct btrfs_stripe_extent));
+ break;
}
}
}
--
2.39.1
^ permalink raw reply related [flat|nested] 49+ messages in thread
* [PATCH v7 08/13] btrfs: zoned: allow zoned RAID
2023-03-02 9:45 [PATCH v7 00/13] btrfs: introduce RAID stripe tree Johannes Thumshirn
` (6 preceding siblings ...)
2023-03-02 9:45 ` [PATCH v7 07/13] btrfs: add raid stripe tree pretty printer Johannes Thumshirn
@ 2023-03-02 9:45 ` Johannes Thumshirn
2023-03-02 9:45 ` [PATCH v7 09/13] btrfs: check for leaks of ordered stripes on umount Johannes Thumshirn
` (6 subsequent siblings)
14 siblings, 0 replies; 49+ messages in thread
From: Johannes Thumshirn @ 2023-03-02 9:45 UTC (permalink / raw)
To: David Sterba
Cc: Johannes Thumshirn, linux-btrfs, Josef Bacik, Christoph Hellwig
When we have a raid-stripe-tree, we can do RAID0/1/10 on zoned devices for
data block-groups. For meta-data block-groups, we don't actually need
anything special, as all meta-data I/O is protected by the
btrfs_zoned_meta_io_lock() already.
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
fs/btrfs/raid-stripe-tree.c | 4 ++
fs/btrfs/raid-stripe-tree.h | 10 ++++
fs/btrfs/volumes.c | 5 +-
fs/btrfs/zoned.c | 116 +++++++++++++++++++++++++++++++++++-
4 files changed, 132 insertions(+), 3 deletions(-)
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index f58b28157a9c..836299fe0ebe 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -270,10 +270,12 @@ static bool btrfs_physical_from_ordered_stripe(struct btrfs_fs_info *fs_info,
int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
u64 logical, u64 *length, u64 map_type,
+ u32 stripe_index,
struct btrfs_io_stripe *stripe)
{
struct btrfs_root *stripe_root = btrfs_stripe_tree_root(fs_info);
int num_stripes = btrfs_bg_type_to_factor(map_type);
+ const bool is_dup = map_type & BTRFS_BLOCK_GROUP_DUP;
struct btrfs_stripe_extent *stripe_extent;
struct btrfs_key stripe_key;
struct btrfs_key found_key;
@@ -345,6 +347,8 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
if (btrfs_raid_stride_devid_nr(leaf,
stripe_extent, i) != stripe->dev->devid)
continue;
+ if (is_dup && (stripe_index - 1) != i)
+ continue;
stripe->physical = btrfs_raid_stride_physical_nr(leaf,
stripe_extent, i) + offset;
ret = 0;
diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h
index 9359df0ca3f1..c7f6c5377aaa 100644
--- a/fs/btrfs/raid-stripe-tree.h
+++ b/fs/btrfs/raid-stripe-tree.h
@@ -24,6 +24,7 @@ struct btrfs_ordered_stripe {
int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
u64 logical, u64 *length, u64 map_type,
+ u32 stripe_index,
struct btrfs_io_stripe *stripe);
int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start,
u64 length);
@@ -50,9 +51,18 @@ static inline bool btrfs_need_stripe_tree_update(struct btrfs_fs_info *fs_info,
if (type != BTRFS_BLOCK_GROUP_DATA)
return false;
+ if (profile & BTRFS_BLOCK_GROUP_DUP)
+ return true;
+
if (profile & BTRFS_BLOCK_GROUP_RAID1_MASK)
return true;
+ if (profile & BTRFS_BLOCK_GROUP_RAID0)
+ return true;
+
+ if (profile & BTRFS_BLOCK_GROUP_RAID10)
+ return true;
+
return false;
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 80baabdef153..ae92567e1275 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6297,7 +6297,8 @@ static int set_io_stripe(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
if (op == BTRFS_MAP_READ &&
btrfs_need_stripe_tree_update(fs_info, map->type))
return btrfs_get_raid_extent_offset(fs_info, logical, length,
- map->type, dst);
+ map->type, stripe_index,
+ dst);
dst->physical = map->stripes[stripe_index].physical +
stripe_offset + (stripe_nr << BTRFS_STRIPE_LEN_SHIFT);
@@ -6488,6 +6489,8 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
* I/O context structure.
*/
if (smap && num_alloc_stripes == 1 &&
+ !(btrfs_need_stripe_tree_update(fs_info, map->type) &&
+ op != BTRFS_MAP_READ) &&
!((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) &&
(!need_full_stripe(op) || !dev_replace_is_ongoing ||
!dev_replace->tgtdev)) {
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 7e6cfc7a2918..5328a600f526 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1476,8 +1476,9 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
break;
case BTRFS_BLOCK_GROUP_DUP:
- if (map->type & BTRFS_BLOCK_GROUP_DATA) {
- btrfs_err(fs_info, "zoned: profile DUP not yet supported on data bg");
+ if (map->type & BTRFS_BLOCK_GROUP_DATA &&
+ !btrfs_stripe_tree_root(fs_info)) {
+ btrfs_err(fs_info, "zoned: data DUP profile needs stripe_root");
ret = -EINVAL;
goto out;
}
@@ -1515,8 +1516,116 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
cache->zone_capacity = min(caps[0], caps[1]);
break;
case BTRFS_BLOCK_GROUP_RAID1:
+ case BTRFS_BLOCK_GROUP_RAID1C3:
+ case BTRFS_BLOCK_GROUP_RAID1C4:
+ if (map->type & BTRFS_BLOCK_GROUP_DATA &&
+ !btrfs_stripe_tree_root(fs_info)) {
+ btrfs_err(fs_info,
+ "zoned: data %s needs stripe_root",
+ btrfs_bg_type_to_raid_name(map->type));
+ ret = -EIO;
+ goto out;
+
+ }
+
+ for (i = 0; i < map->num_stripes; i++) {
+ if (alloc_offsets[i] == WP_MISSING_DEV ||
+ alloc_offsets[i] == WP_CONVENTIONAL)
+ continue;
+
+ if ((alloc_offsets[0] != alloc_offsets[i]) &&
+ !btrfs_test_opt(fs_info, DEGRADED)) {
+ btrfs_err(fs_info,
+ "zoned: write pointer offset mismatch of zones in %s profile",
+ btrfs_bg_type_to_raid_name(map->type));
+ ret = -EIO;
+ goto out;
+ }
+ if (test_bit(0, active) != test_bit(i, active)) {
+ if (!btrfs_test_opt(fs_info, DEGRADED) &&
+ !btrfs_zone_activate(cache)) {
+ ret = -EIO;
+ goto out;
+ }
+ } else {
+ if (test_bit(0, active))
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+ &cache->runtime_flags);
+ }
+ /*
+ * In case a device is missing we have a cap of 0, so don't
+ * use it.
+ */
+ cache->zone_capacity = min_not_zero(caps[0], caps[i]);
+ }
+
+ if (alloc_offsets[0] != WP_MISSING_DEV)
+ cache->alloc_offset = alloc_offsets[0];
+ else
+ cache->alloc_offset = alloc_offsets[i - 1];
+ break;
case BTRFS_BLOCK_GROUP_RAID0:
+ if (map->type & BTRFS_BLOCK_GROUP_DATA &&
+ !btrfs_stripe_tree_root(fs_info)) {
+ btrfs_err(fs_info,
+ "zoned: data %s needs stripe_root",
+ btrfs_bg_type_to_raid_name(map->type));
+ ret = -EIO;
+ goto out;
+
+ }
+ for (i = 0; i < map->num_stripes; i++) {
+ if (alloc_offsets[i] == WP_MISSING_DEV ||
+ alloc_offsets[i] == WP_CONVENTIONAL)
+ continue;
+
+ if (test_bit(0, active) != test_bit(i, active)) {
+ if (!btrfs_zone_activate(cache)) {
+ ret = -EIO;
+ goto out;
+ }
+ } else {
+ if (test_bit(0, active))
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+ &cache->runtime_flags);
+ }
+ cache->zone_capacity += caps[i];
+ cache->alloc_offset += alloc_offsets[i];
+
+ }
+ break;
case BTRFS_BLOCK_GROUP_RAID10:
+ if (map->type & BTRFS_BLOCK_GROUP_DATA &&
+ !btrfs_stripe_tree_root(fs_info)) {
+ btrfs_err(fs_info,
+ "zoned: data %s needs stripe_root",
+ btrfs_bg_type_to_raid_name(map->type));
+ ret = -EIO;
+ goto out;
+
+ }
+ for (i = 0; i < map->num_stripes; i++) {
+ if (alloc_offsets[i] == WP_MISSING_DEV ||
+ alloc_offsets[i] == WP_CONVENTIONAL)
+ continue;
+
+ if (test_bit(0, active) != test_bit(i, active)) {
+ if (!btrfs_zone_activate(cache)) {
+ ret = -EIO;
+ goto out;
+ }
+ } else {
+ if (test_bit(0, active))
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+ &cache->runtime_flags);
+ }
+ if ((i % map->sub_stripes) == 0) {
+ cache->zone_capacity += caps[i];
+ cache->alloc_offset += alloc_offsets[i];
+ }
+
+ }
+ break;
case BTRFS_BLOCK_GROUP_RAID5:
case BTRFS_BLOCK_GROUP_RAID6:
/* non-single profiles are not supported yet */
@@ -1893,6 +2002,9 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
device = map->stripes[i].dev;
physical = map->stripes[i].physical;
+ if (!device->zone_info)
+ continue;
+
if (device->zone_info->max_active_zones == 0)
continue;
--
2.39.1
^ permalink raw reply related [flat|nested] 49+ messages in thread
* [PATCH v7 09/13] btrfs: check for leaks of ordered stripes on umount
2023-03-02 9:45 [PATCH v7 00/13] btrfs: introduce RAID stripe tree Johannes Thumshirn
` (7 preceding siblings ...)
2023-03-02 9:45 ` [PATCH v7 08/13] btrfs: zoned: allow zoned RAID Johannes Thumshirn
@ 2023-03-02 9:45 ` Johannes Thumshirn
2023-03-02 9:45 ` [PATCH v7 10/13] btrfs: add tracepoints for ordered stripes Johannes Thumshirn
` (5 subsequent siblings)
14 siblings, 0 replies; 49+ messages in thread
From: Johannes Thumshirn @ 2023-03-02 9:45 UTC (permalink / raw)
To: David Sterba
Cc: Johannes Thumshirn, linux-btrfs, Josef Bacik, Christoph Hellwig
Check if we're leaking any ordered stripes when unmounting a filesystem
with a stripe tree.
This check is gated behind CONFIG_BTRFS_DEBUG to not affect any production
type systems.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
fs/btrfs/disk-io.c | 2 ++
fs/btrfs/raid-stripe-tree.c | 30 ++++++++++++++++++++++++++++++
fs/btrfs/raid-stripe-tree.h | 1 +
3 files changed, 33 insertions(+)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ac200b367ec8..abbfd71f2cb6 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -52,6 +52,7 @@
#include "relocation.h"
#include "scrub.h"
#include "super.h"
+#include "raid-stripe-tree.h"
#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\
BTRFS_HEADER_FLAG_RELOC |\
@@ -1522,6 +1523,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
btrfs_put_root(fs_info->stripe_root);
btrfs_check_leaked_roots(fs_info);
btrfs_extent_buffer_leak_debug_check(fs_info);
+ btrfs_check_ordered_stripe_leak(fs_info);
kfree(fs_info->super_copy);
kfree(fs_info->super_for_commit);
kfree(fs_info->subpage_info);
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index 836299fe0ebe..391f69effd90 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -36,6 +36,36 @@ static int ordered_stripe_less(struct rb_node *rba, const struct rb_node *rbb)
return ordered_stripe_cmp(&stripe->logical, rbb);
}
+void btrfs_check_ordered_stripe_leak(struct btrfs_fs_info *fs_info)
+{
+#ifdef CONFIG_BTRFS_DEBUG
+ struct rb_node *node;
+
+ if (!btrfs_stripe_tree_root(fs_info) ||
+ RB_EMPTY_ROOT(&fs_info->stripe_update_tree))
+ return;
+
+ WARN_ON_ONCE(1);
+ write_lock(&fs_info->stripe_update_lock);
+ while ((node = rb_first_postorder(&fs_info->stripe_update_tree))
+ != NULL) {
+ struct btrfs_ordered_stripe *stripe =
+ rb_entry(node, struct btrfs_ordered_stripe, rb_node);
+
+ write_unlock(&fs_info->stripe_update_lock);
+ btrfs_err(fs_info,
+ "ordered_stripe [%llu, %llu] leaked, refcount=%d",
+ stripe->logical, stripe->logical + stripe->num_bytes,
+ refcount_read(&stripe->ref));
+ while (refcount_read(&stripe->ref) > 1)
+ btrfs_put_ordered_stripe(fs_info, stripe);
+ btrfs_put_ordered_stripe(fs_info, stripe);
+ write_lock(&fs_info->stripe_update_lock);
+ }
+ write_unlock(&fs_info->stripe_update_lock);
+#endif
+}
+
int btrfs_add_ordered_stripe(struct btrfs_io_context *bioc)
{
struct btrfs_fs_info *fs_info = bioc->fs_info;
diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h
index c7f6c5377aaa..371409351d60 100644
--- a/fs/btrfs/raid-stripe-tree.h
+++ b/fs/btrfs/raid-stripe-tree.h
@@ -38,6 +38,7 @@ struct btrfs_ordered_stripe *btrfs_lookup_ordered_stripe(
int btrfs_add_ordered_stripe(struct btrfs_io_context *bioc);
void btrfs_put_ordered_stripe(struct btrfs_fs_info *fs_info,
struct btrfs_ordered_stripe *stripe);
+void btrfs_check_ordered_stripe_leak(struct btrfs_fs_info *fs_info);
static inline bool btrfs_need_stripe_tree_update(struct btrfs_fs_info *fs_info,
u64 map_type)
--
2.39.1
^ permalink raw reply related [flat|nested] 49+ messages in thread* [PATCH v7 10/13] btrfs: add tracepoints for ordered stripes
2023-03-02 9:45 [PATCH v7 00/13] btrfs: introduce RAID stripe tree Johannes Thumshirn
` (8 preceding siblings ...)
2023-03-02 9:45 ` [PATCH v7 09/13] btrfs: check for leaks of ordered stripes on umount Johannes Thumshirn
@ 2023-03-02 9:45 ` Johannes Thumshirn
2023-03-02 9:45 ` [PATCH v7 11/13] btrfs: announce presence of raid-stripe-tree in sysfs Johannes Thumshirn
` (4 subsequent siblings)
14 siblings, 0 replies; 49+ messages in thread
From: Johannes Thumshirn @ 2023-03-02 9:45 UTC (permalink / raw)
To: David Sterba
Cc: Johannes Thumshirn, linux-btrfs, Josef Bacik, Christoph Hellwig
Add tracepoints to check the lifetime of btrfs_ordered_stripe entries.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
fs/btrfs/raid-stripe-tree.c | 4 ++-
fs/btrfs/super.c | 1 +
include/trace/events/btrfs.h | 50 ++++++++++++++++++++++++++++++++++++
3 files changed, 54 insertions(+), 1 deletion(-)
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index 391f69effd90..8799a7abaf38 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -112,6 +112,7 @@ int btrfs_add_ordered_stripe(struct btrfs_io_context *bioc)
}
write_unlock(&fs_info->stripe_update_lock);
+ trace_btrfs_ordered_stripe_add(fs_info, stripe);
return 0;
}
@@ -127,6 +128,7 @@ struct btrfs_ordered_stripe *btrfs_lookup_ordered_stripe(struct btrfs_fs_info *f
if (node) {
stripe = rb_entry(node, struct btrfs_ordered_stripe, rb_node);
refcount_inc(&stripe->ref);
+ trace_btrfs_ordered_stripe_lookup(fs_info, stripe);
}
read_unlock(&fs_info->stripe_update_lock);
@@ -136,7 +138,7 @@ struct btrfs_ordered_stripe *btrfs_lookup_ordered_stripe(struct btrfs_fs_info *f
void btrfs_put_ordered_stripe(struct btrfs_fs_info *fs_info,
struct btrfs_ordered_stripe *stripe)
{
-
+ trace_btrfs_ordered_stripe_put(fs_info, stripe);
if (refcount_dec_and_test(&stripe->ref)) {
struct rb_node *node;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d8885966e801..fd49da569a8a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -59,6 +59,7 @@
#include "verity.h"
#include "super.h"
#include "extent-tree.h"
+#include "raid-stripe-tree.h"
#define CREATE_TRACE_POINTS
#include <trace/events/btrfs.h>
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 8ea9cea9bfeb..7bdc8cc595cc 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -33,6 +33,7 @@ struct btrfs_space_info;
struct btrfs_raid_bio;
struct raid56_bio_trace_info;
struct find_free_extent_ctl;
+struct btrfs_ordered_stripe;
#define show_ref_type(type) \
__print_symbolic(type, \
@@ -2492,6 +2493,55 @@ DEFINE_EVENT(btrfs_raid56_bio, raid56_scrub_read_recover,
TP_ARGS(rbio, bio, trace_info)
);
+DECLARE_EVENT_CLASS(btrfs__ordered_stripe,
+
+ TP_PROTO(const struct btrfs_fs_info *fs_info,
+ const struct btrfs_ordered_stripe *stripe),
+
+ TP_ARGS(fs_info, stripe),
+
+ TP_STRUCT__entry_btrfs(
+ __field( u64, logical )
+ __field( u64, num_bytes )
+ __field( int, num_stripes )
+ __field( int, ref )
+ ),
+
+ TP_fast_assign_btrfs(fs_info,
+ __entry->logical = stripe->logical;
+ __entry->num_bytes = stripe->num_bytes;
+ __entry->num_stripes = stripe->num_stripes;
+ __entry->ref = refcount_read(&stripe->ref);
+ ),
+
+ TP_printk_btrfs("logical=%llu, num_bytes=%llu, num_stripes=%d, ref=%d",
+ __entry->logical, __entry->num_bytes,
+ __entry->num_stripes, __entry->ref)
+);
+
+DEFINE_EVENT(btrfs__ordered_stripe, btrfs_ordered_stripe_add,
+
+ TP_PROTO(const struct btrfs_fs_info *fs_info,
+ const struct btrfs_ordered_stripe *stripe),
+
+ TP_ARGS(fs_info, stripe)
+);
+
+DEFINE_EVENT(btrfs__ordered_stripe, btrfs_ordered_stripe_lookup,
+
+ TP_PROTO(const struct btrfs_fs_info *fs_info,
+ const struct btrfs_ordered_stripe *stripe),
+
+ TP_ARGS(fs_info, stripe)
+);
+
+DEFINE_EVENT(btrfs__ordered_stripe, btrfs_ordered_stripe_put,
+
+ TP_PROTO(const struct btrfs_fs_info *fs_info,
+ const struct btrfs_ordered_stripe *stripe),
+
+ TP_ARGS(fs_info, stripe)
+);
#endif /* _TRACE_BTRFS_H */
/* This part must be outside protection */
--
2.39.1
^ permalink raw reply related [flat|nested] 49+ messages in thread* [PATCH v7 11/13] btrfs: announce presence of raid-stripe-tree in sysfs
2023-03-02 9:45 [PATCH v7 00/13] btrfs: introduce RAID stripe tree Johannes Thumshirn
` (9 preceding siblings ...)
2023-03-02 9:45 ` [PATCH v7 10/13] btrfs: add tracepoints for ordered stripes Johannes Thumshirn
@ 2023-03-02 9:45 ` Johannes Thumshirn
2023-03-02 9:45 ` [PATCH v7 12/13] btrfs: consult raid-stripe-tree when scrubbing Johannes Thumshirn
` (3 subsequent siblings)
14 siblings, 0 replies; 49+ messages in thread
From: Johannes Thumshirn @ 2023-03-02 9:45 UTC (permalink / raw)
To: David Sterba
Cc: Johannes Thumshirn, linux-btrfs, Josef Bacik, Christoph Hellwig
If a filesystem with a raid-stripe-tree is mounted, show the RST feature
in sysfs.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
fs/btrfs/sysfs.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 37fc58a7f27e..bf7190e0b17a 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -297,6 +297,8 @@ BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED);
#ifdef CONFIG_BTRFS_DEBUG
/* Remove once support for extent tree v2 is feature complete */
BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2);
+/* Remove once support for raid stripe tree is feature complete */
+BTRFS_FEAT_ATTR_INCOMPAT(raid_stripe_tree, RAID_STRIPE_TREE);
#endif
#ifdef CONFIG_FS_VERITY
BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY);
@@ -327,6 +329,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
#endif
#ifdef CONFIG_BTRFS_DEBUG
BTRFS_FEAT_ATTR_PTR(extent_tree_v2),
+ BTRFS_FEAT_ATTR_PTR(raid_stripe_tree),
#endif
#ifdef CONFIG_FS_VERITY
BTRFS_FEAT_ATTR_PTR(verity),
--
2.39.1
^ permalink raw reply related [flat|nested] 49+ messages in thread* [PATCH v7 12/13] btrfs: consult raid-stripe-tree when scrubbing
2023-03-02 9:45 [PATCH v7 00/13] btrfs: introduce RAID stripe tree Johannes Thumshirn
` (10 preceding siblings ...)
2023-03-02 9:45 ` [PATCH v7 11/13] btrfs: announce presence of raid-stripe-tree in sysfs Johannes Thumshirn
@ 2023-03-02 9:45 ` Johannes Thumshirn
2023-03-02 9:45 ` [PATCH v7 13/13] btrfs: add raid-stripe-tree to features enabled with debug Johannes Thumshirn
` (2 subsequent siblings)
14 siblings, 0 replies; 49+ messages in thread
From: Johannes Thumshirn @ 2023-03-02 9:45 UTC (permalink / raw)
To: David Sterba
Cc: Johannes Thumshirn, linux-btrfs, Josef Bacik, Christoph Hellwig
When scrubbing a filesystem which uses the raid-stripe-tree for logical to
physical address translation, consult the RST to perform the address
translation instead of relying on fixed block group offsets.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
fs/btrfs/scrub.c | 33 +++++++++++++++++++++++++++++++--
1 file changed, 31 insertions(+), 2 deletions(-)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index c83ac6b80c2f..86e3374d49b0 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -24,6 +24,7 @@
#include "accessors.h"
#include "file-item.h"
#include "scrub.h"
+#include "raid-stripe-tree.h"
/*
* This is only the first step towards a full-features scrub. It reads all
@@ -2821,6 +2822,21 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
int ret;
u8 csum[BTRFS_CSUM_SIZE];
u32 blocksize;
+ struct btrfs_io_stripe stripe;
+ const bool stripe_update =
+ btrfs_need_stripe_tree_update(sctx->fs_info, map->type);
+
+ if (stripe_update) {
+ stripe.dev = src_dev;
+ ret = btrfs_get_raid_extent_offset(sctx->fs_info, logical,
+ (u64 *)&len,
+ map->type, mirror_num,
+ &stripe);
+ if (ret)
+ return ret;
+
+ src_physical = stripe.physical;
+ }
if (flags & BTRFS_EXTENT_FLAG_DATA) {
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
@@ -2872,8 +2888,21 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
return ret;
len -= l;
logical += l;
- physical += l;
- src_physical += l;
+ if (stripe_update && len) {
+
+ ret = btrfs_get_raid_extent_offset(sctx->fs_info,
+ logical, (u64 *)&len,
+ map->type, mirror_num,
+ &stripe);
+ if (ret)
+ return ret;
+
+ src_physical = stripe.physical;
+ physical = stripe.physical;
+ } else {
+ physical += l;
+ src_physical += l;
+ }
}
return 0;
}
--
2.39.1
^ permalink raw reply related [flat|nested] 49+ messages in thread* [PATCH v7 13/13] btrfs: add raid-stripe-tree to features enabled with debug
2023-03-02 9:45 [PATCH v7 00/13] btrfs: introduce RAID stripe tree Johannes Thumshirn
` (11 preceding siblings ...)
2023-03-02 9:45 ` [PATCH v7 12/13] btrfs: consult raid-stripe-tree when scrubbing Johannes Thumshirn
@ 2023-03-02 9:45 ` Johannes Thumshirn
2023-03-03 23:27 ` Anand Jain
2023-03-09 7:08 ` Naohiro Aota
2023-03-02 19:38 ` [PATCH v7 00/13] btrfs: introduce RAID stripe tree Neal Gompa
2023-03-03 9:29 ` Anand Jain
14 siblings, 2 replies; 49+ messages in thread
From: Johannes Thumshirn @ 2023-03-02 9:45 UTC (permalink / raw)
To: David Sterba
Cc: Johannes Thumshirn, linux-btrfs, Josef Bacik, Christoph Hellwig
Until the RAID stripe tree code is well enough tested and feature
complete, "hide" it behind CONFIG_BTRFS_DEBUG so only people who
want to use it are actually using it.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
fs/btrfs/fs.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index d0d80540b32b..dd151538d2b1 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -214,7 +214,8 @@ enum {
BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
BTRFS_FEATURE_INCOMPAT_RAID1C34 | \
BTRFS_FEATURE_INCOMPAT_ZONED | \
- BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2)
+ BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2 | \
+ BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE)
#else
#define BTRFS_FEATURE_INCOMPAT_SUPP \
(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
--
2.39.1
^ permalink raw reply related [flat|nested] 49+ messages in thread* Re: [PATCH v7 13/13] btrfs: add raid-stripe-tree to features enabled with debug
2023-03-02 9:45 ` [PATCH v7 13/13] btrfs: add raid-stripe-tree to features enabled with debug Johannes Thumshirn
@ 2023-03-03 23:27 ` Anand Jain
2023-03-09 7:08 ` Naohiro Aota
1 sibling, 0 replies; 49+ messages in thread
From: Anand Jain @ 2023-03-03 23:27 UTC (permalink / raw)
To: Johannes Thumshirn
Cc: linux-btrfs, Josef Bacik, Christoph Hellwig, David Sterba
On 02/03/2023 17:45, Johannes Thumshirn wrote:
> Until the RAID stripe tree code is well enough tested and feature
> complete, "hide" it behind CONFIG_BTRFS_DEBUG so only people who
> want to use it are actually using it.
>
> Reviewed-by: Josef Bacik <josef@toxicpanda.com>
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Anand Jain <anand.jain@oracle.com>
^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [PATCH v7 13/13] btrfs: add raid-stripe-tree to features enabled with debug
2023-03-02 9:45 ` [PATCH v7 13/13] btrfs: add raid-stripe-tree to features enabled with debug Johannes Thumshirn
2023-03-03 23:27 ` Anand Jain
@ 2023-03-09 7:08 ` Naohiro Aota
1 sibling, 0 replies; 49+ messages in thread
From: Naohiro Aota @ 2023-03-09 7:08 UTC (permalink / raw)
To: Johannes Thumshirn
Cc: David Sterba, linux-btrfs@vger.kernel.org, Josef Bacik,
Christoph Hellwig
On Thu, Mar 02, 2023 at 01:45:35AM -0800, Johannes Thumshirn wrote:
> Until the RAID stripe tree code is well enough tested and feature
> complete, "hide" it behind CONFIG_BTRFS_DEBUG so only people who
> want to use it are actually using it.
>
> Reviewed-by: Josef Bacik <josef@toxicpanda.com>
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Naohiro Aota <naohiro.aota@wdc.com>
^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [PATCH v7 00/13] btrfs: introduce RAID stripe tree
2023-03-02 9:45 [PATCH v7 00/13] btrfs: introduce RAID stripe tree Johannes Thumshirn
` (12 preceding siblings ...)
2023-03-02 9:45 ` [PATCH v7 13/13] btrfs: add raid-stripe-tree to features enabled with debug Johannes Thumshirn
@ 2023-03-02 19:38 ` Neal Gompa
2023-03-03 8:45 ` Johannes Thumshirn
2023-03-03 9:29 ` Anand Jain
14 siblings, 1 reply; 49+ messages in thread
From: Neal Gompa @ 2023-03-02 19:38 UTC (permalink / raw)
To: Johannes Thumshirn
Cc: David Sterba, linux-btrfs, Josef Bacik, Christoph Hellwig
On Thu, Mar 2, 2023 at 4:56 AM Johannes Thumshirn
<johannes.thumshirn@wdc.com> wrote:
>
> Updates of the raid-stripe-tree are done at delayed-ref time to save on
> bandwidth while for reading we do the stripe-tree lookup on bio mapping time,
> i.e. when the logical to physical translation happens for regular btrfs RAID
> as well.
>
> The stripe tree is keyed by an extent's disk_bytenr and disk_num_bytes and
> its contents are the respective physical device id and position.
>
> For an example 1M write (split into 126K segments due to zone-append)
> rapido2:/home/johannes/src/fstests# xfs_io -fdc "pwrite -b 1M 0 1M" -c fsync /mnt/test/test
> wrote 1048576/1048576 bytes at offset 0
> 1 MiB, 1 ops; 0.0065 sec (151.538 MiB/sec and 151.5381 ops/sec)
>
> The tree will look as follows:
>
> rapido2:/home/johannes/src/fstests# btrfs inspect-internal dump-tree -t raid_stripe /dev/nullb0
> btrfs-progs v5.16.1
> raid stripe tree key (RAID_STRIPE_TREE ROOT_ITEM 0)
> leaf 805847040 items 9 free space 15770 generation 9 owner RAID_STRIPE_TREE
> leaf 805847040 flags 0x1(WRITTEN) backref revision 1
> checksum stored 1b22e13800000000000000000000000000000000000000000000000000000000
> checksum calced 1b22e13800000000000000000000000000000000000000000000000000000000
> fs uuid e4f523d1-89a1-41f9-ab75-6ba3c42a28fb
> chunk uuid 6f2d8aaa-d348-4bf2-9b5e-141a37ba4c77
> item 0 key (939524096 RAID_STRIPE_KEY 126976) itemoff 16251 itemsize 32
> stripe 0 devid 1 offset 939524096
> stripe 1 devid 2 offset 536870912
> item 1 key (939651072 RAID_STRIPE_KEY 126976) itemoff 16219 itemsize 32
> stripe 0 devid 1 offset 939651072
> stripe 1 devid 2 offset 536997888
> item 2 key (939778048 RAID_STRIPE_KEY 126976) itemoff 16187 itemsize 32
> stripe 0 devid 1 offset 939778048
> stripe 1 devid 2 offset 537124864
> item 3 key (939905024 RAID_STRIPE_KEY 126976) itemoff 16155 itemsize 32
> stripe 0 devid 1 offset 939905024
> stripe 1 devid 2 offset 537251840
> item 4 key (940032000 RAID_STRIPE_KEY 126976) itemoff 16123 itemsize 32
> stripe 0 devid 1 offset 940032000
> stripe 1 devid 2 offset 537378816
> item 5 key (940158976 RAID_STRIPE_KEY 126976) itemoff 16091 itemsize 32
> stripe 0 devid 1 offset 940158976
> stripe 1 devid 2 offset 537505792
> item 6 key (940285952 RAID_STRIPE_KEY 126976) itemoff 16059 itemsize 32
> stripe 0 devid 1 offset 940285952
> stripe 1 devid 2 offset 537632768
> item 7 key (940412928 RAID_STRIPE_KEY 126976) itemoff 16027 itemsize 32
> stripe 0 devid 1 offset 940412928
> stripe 1 devid 2 offset 537759744
> item 8 key (940539904 RAID_STRIPE_KEY 32768) itemoff 15995 itemsize 32
> stripe 0 devid 1 offset 940539904
> stripe 1 devid 2 offset 537886720
> total bytes 26843545600
> bytes used 1245184
> uuid e4f523d1-89a1-41f9-ab75-6ba3c42a28fb
>
> A design document can be found here:
> https://docs.google.com/document/d/1Iui_jMidCd4MVBNSSLXRfO7p5KmvnoQL/edit?usp=sharing&ouid=103609947580185458266&rtpof=true&sd=true
>
> The user-space part of this series can be found here:
> https://lore.kernel.org/linux-btrfs/20230215143109.2721722-1-johannes.thumshirn@wdc.com
>
Apologies if this is a stupid question, but after reading through the
patch series and the design document, it sounds like the crux of this
change is switching how RAID works to be COW like everything else.
Does that also mean RAID 56 modes benefit from this in that manner?
--
真実はいつも一つ!/ Always, there's only one truth!
^ permalink raw reply [flat|nested] 49+ messages in thread* Re: [PATCH v7 00/13] btrfs: introduce RAID stripe tree
2023-03-02 9:45 [PATCH v7 00/13] btrfs: introduce RAID stripe tree Johannes Thumshirn
` (13 preceding siblings ...)
2023-03-02 19:38 ` [PATCH v7 00/13] btrfs: introduce RAID stripe tree Neal Gompa
@ 2023-03-03 9:29 ` Anand Jain
2023-03-03 10:32 ` Johannes Thumshirn
14 siblings, 1 reply; 49+ messages in thread
From: Anand Jain @ 2023-03-03 9:29 UTC (permalink / raw)
To: Johannes Thumshirn, David Sterba
Cc: linux-btrfs, Josef Bacik, Christoph Hellwig
Is there a plan to rebase this series onto the latest misc-next branch?
Unfortunately, applying this series fails at multiple patches.
Thanks, Anand
On 02/03/2023 17:45, Johannes Thumshirn wrote:
> Updates of the raid-stripe-tree are done at delayed-ref time to save on
> bandwidth while for reading we do the stripe-tree lookup on bio mapping time,
> i.e. when the logical to physical translation happens for regular btrfs RAID
> as well.
>
> The stripe tree is keyed by an extent's disk_bytenr and disk_num_bytes and
> its contents are the respective physical device id and position.
>
> For an example 1M write (split into 126K segments due to zone-append)
> rapido2:/home/johannes/src/fstests# xfs_io -fdc "pwrite -b 1M 0 1M" -c fsync /mnt/test/test
> wrote 1048576/1048576 bytes at offset 0
> 1 MiB, 1 ops; 0.0065 sec (151.538 MiB/sec and 151.5381 ops/sec)
>
> The tree will look as follows:
>
> rapido2:/home/johannes/src/fstests# btrfs inspect-internal dump-tree -t raid_stripe /dev/nullb0
> btrfs-progs v5.16.1
> raid stripe tree key (RAID_STRIPE_TREE ROOT_ITEM 0)
> leaf 805847040 items 9 free space 15770 generation 9 owner RAID_STRIPE_TREE
> leaf 805847040 flags 0x1(WRITTEN) backref revision 1
> checksum stored 1b22e13800000000000000000000000000000000000000000000000000000000
> checksum calced 1b22e13800000000000000000000000000000000000000000000000000000000
> fs uuid e4f523d1-89a1-41f9-ab75-6ba3c42a28fb
> chunk uuid 6f2d8aaa-d348-4bf2-9b5e-141a37ba4c77
> item 0 key (939524096 RAID_STRIPE_KEY 126976) itemoff 16251 itemsize 32
> stripe 0 devid 1 offset 939524096
> stripe 1 devid 2 offset 536870912
> item 1 key (939651072 RAID_STRIPE_KEY 126976) itemoff 16219 itemsize 32
> stripe 0 devid 1 offset 939651072
> stripe 1 devid 2 offset 536997888
> item 2 key (939778048 RAID_STRIPE_KEY 126976) itemoff 16187 itemsize 32
> stripe 0 devid 1 offset 939778048
> stripe 1 devid 2 offset 537124864
> item 3 key (939905024 RAID_STRIPE_KEY 126976) itemoff 16155 itemsize 32
> stripe 0 devid 1 offset 939905024
> stripe 1 devid 2 offset 537251840
> item 4 key (940032000 RAID_STRIPE_KEY 126976) itemoff 16123 itemsize 32
> stripe 0 devid 1 offset 940032000
> stripe 1 devid 2 offset 537378816
> item 5 key (940158976 RAID_STRIPE_KEY 126976) itemoff 16091 itemsize 32
> stripe 0 devid 1 offset 940158976
> stripe 1 devid 2 offset 537505792
> item 6 key (940285952 RAID_STRIPE_KEY 126976) itemoff 16059 itemsize 32
> stripe 0 devid 1 offset 940285952
> stripe 1 devid 2 offset 537632768
> item 7 key (940412928 RAID_STRIPE_KEY 126976) itemoff 16027 itemsize 32
> stripe 0 devid 1 offset 940412928
> stripe 1 devid 2 offset 537759744
> item 8 key (940539904 RAID_STRIPE_KEY 32768) itemoff 15995 itemsize 32
> stripe 0 devid 1 offset 940539904
> stripe 1 devid 2 offset 537886720
> total bytes 26843545600
> bytes used 1245184
> uuid e4f523d1-89a1-41f9-ab75-6ba3c42a28fb
>
> A design document can be found here:
> https://docs.google.com/document/d/1Iui_jMidCd4MVBNSSLXRfO7p5KmvnoQL/edit?usp=sharing&ouid=103609947580185458266&rtpof=true&sd=true
>
> The user-space part of this series can be found here:
> https://lore.kernel.org/linux-btrfs/20230215143109.2721722-1-johannes.thumshirn@wdc.com
>
> Changes to v6:
> - Fix degraded RAID1 mounts
> - Fix RAID0/10 mounts
>
> v6 of the patchset can be found here:
> https://lore.kernel.org/linux-btrfs/cover.1676470614.git.johannes.thumshirn@wdc.com
>
> Changes to v5:
> - Incorporated review comments from Josef and Christoph
> - Rebased onto misc-next
>
> v5 of the patchset can be found here:
> https://lore.kernel.org/linux-btrfs/cover.1675853489.git.johannes.thumshirn@wdc.com
>
> Changes to v4:
> - Added patch to check for RST feature in sysfs
> - Added RST lookups for scrubbing
> - Fixed the error handling bug Josef pointed out
> - Only check if we need to write out a RST once per delayed_ref head
> - Added support for zoned data DUP with RST
>
> Changes to v3:
> - Rebased onto 20221120124734.18634-1-hch@lst.de
> - Incorporated Josef's review
> - Merged related patches
>
> v3 of the patchset can be found here:
> https://lore.kernel.org/linux-btrfs/cover.1666007330.git.johannes.thumshirn@wdc.com
>
> Changes to v2:
> - Bug fixes
> - Rebased onto 20220901074216.1849941-1-hch@lst.de
> - Added tracepoints
> - Added leak checker
> - Added RAID0 and RAID10
>
> v2 of the patchset can be found here:
> https://lore.kernel.org/linux-btrfs/cover.1656513330.git.johannes.thumshirn@wdc.com
>
> Changes to v1:
> - Write the stripe-tree at delayed-ref time (Qu)
> - Add a different write path for preallocation
>
> v1 of the patchset can be found here:
> https://lore.kernel.org/linux-btrfs/cover.1652711187.git.johannes.thumshirn@wdc.com/
>
> Johannes Thumshirn (13):
> btrfs: re-add trans parameter to insert_delayed_ref
> btrfs: add raid stripe tree definitions
> btrfs: read raid-stripe-tree from disk
> btrfs: add support for inserting raid stripe extents
> btrfs: delete stripe extent on extent deletion
> btrfs: lookup physical address from stripe extent
> btrfs: add raid stripe tree pretty printer
> btrfs: zoned: allow zoned RAID
> btrfs: check for leaks of ordered stripes on umount
> btrfs: add tracepoints for ordered stripes
> btrfs: announce presence of raid-stripe-tree in sysfs
> btrfs: consult raid-stripe-tree when scrubbing
> btrfs: add raid-stripe-tree to features enabled with debug
>
> fs/btrfs/Makefile | 2 +-
> fs/btrfs/accessors.h | 29 +++
> fs/btrfs/bio.c | 29 +++
> fs/btrfs/block-rsv.c | 1 +
> fs/btrfs/delayed-ref.c | 13 +-
> fs/btrfs/delayed-ref.h | 2 +
> fs/btrfs/disk-io.c | 24 ++
> fs/btrfs/disk-io.h | 5 +
> fs/btrfs/extent-tree.c | 68 ++++++
> fs/btrfs/fs.h | 7 +-
> fs/btrfs/inode.c | 15 +-
> fs/btrfs/print-tree.c | 21 ++
> fs/btrfs/raid-stripe-tree.c | 416 ++++++++++++++++++++++++++++++++
> fs/btrfs/raid-stripe-tree.h | 87 +++++++
> fs/btrfs/scrub.c | 33 ++-
> fs/btrfs/super.c | 1 +
> fs/btrfs/sysfs.c | 3 +
> fs/btrfs/volumes.c | 46 +++-
> fs/btrfs/volumes.h | 13 +-
> fs/btrfs/zoned.c | 119 ++++++++-
> include/trace/events/btrfs.h | 50 ++++
> include/uapi/linux/btrfs.h | 1 +
> include/uapi/linux/btrfs_tree.h | 20 +-
> 23 files changed, 973 insertions(+), 32 deletions(-)
> create mode 100644 fs/btrfs/raid-stripe-tree.c
> create mode 100644 fs/btrfs/raid-stripe-tree.h
>
^ permalink raw reply [flat|nested] 49+ messages in thread