From: Wang Shilong <wangshilong1991@gmail.com>
To: Josef Bacik <jbacik@fb.com>
Cc: linux-btrfs@vger.kernel.org
Subject: Re: [PATCH] Btrfs: remove empty block groups automatically
Date: Sun, 14 Sep 2014 12:46:39 +0800 [thread overview]
Message-ID: <F9E95213-710B-42A2-A904-3EE1CE95B72E@gmail.com> (raw)
In-Reply-To: <1410549510-22336-1-git-send-email-jbacik@fb.com>
Hi Josef,
> One problem that has plagued us is that a user will use up all of his space with
> data, remove a bunch of that data, and then try to create a bunch of small files
> and run out of space. This happens because all the chunks were allocated for
> data since the metadata requirements were so low. But now there's a bunch of
> empty data block groups and not enough metadata space to do anything. This
> patch solves this problem by automatically deleting empty block groups. If we
> notice the used count go down to 0 when deleting or on mount notice that a block
> group has a used count of 0 then we will queue it to be deleted.
>
> When the cleaner thread runs we will double check to make sure the block group
> is still empty and then we will delete it. This patch has the side effect of no
> longer having a bunch of BUG_ON()'s in the chunk delete code, which will be
> helpful for both this and relocate. Thanks,
I am wondering whether we could check if it a Mixed Mode, we don’t need remove
Mixed mode block groups automatically…
>
> Signed-off-by: Josef Bacik <jbacik@fb.com>
> ---
> fs/btrfs/ctree.h | 8 ++-
> fs/btrfs/disk-io.c | 3 +
> fs/btrfs/extent-tree.c | 128 +++++++++++++++++++++++++++++++++++---
> fs/btrfs/tests/free-space-tests.c | 2 +-
> fs/btrfs/volumes.c | 115 ++++++++++++++++++++++------------
> fs/btrfs/volumes.h | 2 +
> 6 files changed, 209 insertions(+), 49 deletions(-)
>
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 6db3d4b..d373c89 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -1298,8 +1298,8 @@ struct btrfs_block_group_cache {
> */
> struct list_head cluster_list;
>
> - /* For delayed block group creation */
> - struct list_head new_bg_list;
> + /* For delayed block group creation or deletion of empty block groups */
> + struct list_head bg_list;
> };
>
> /* delayed seq elem */
> @@ -1716,6 +1716,9 @@ struct btrfs_fs_info {
>
> /* Used to reclaim the metadata space in the background. */
> struct work_struct async_reclaim_work;
> +
> + spinlock_t unused_bgs_lock;
> + struct list_head unused_bgs;
> };
>
> struct btrfs_subvolume_writers {
> @@ -3343,6 +3346,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
> u64 size);
> int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
> struct btrfs_root *root, u64 group_start);
> +void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
> void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
> struct btrfs_root *root);
> u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index a224fb9..3409734 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -1764,6 +1764,7 @@ static int cleaner_kthread(void *arg)
> }
>
> btrfs_run_delayed_iputs(root);
> + btrfs_delete_unused_bgs(root->fs_info);
> again = btrfs_clean_one_deleted_snapshot(root);
> mutex_unlock(&root->fs_info->cleaner_mutex);
>
> @@ -2224,6 +2225,7 @@ int open_ctree(struct super_block *sb,
> spin_lock_init(&fs_info->super_lock);
> spin_lock_init(&fs_info->qgroup_op_lock);
> spin_lock_init(&fs_info->buffer_lock);
> + spin_lock_init(&fs_info->unused_bgs_lock);
> rwlock_init(&fs_info->tree_mod_log_lock);
> mutex_init(&fs_info->reloc_mutex);
> mutex_init(&fs_info->delalloc_root_mutex);
> @@ -2233,6 +2235,7 @@ int open_ctree(struct super_block *sb,
> INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
> INIT_LIST_HEAD(&fs_info->space_info);
> INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
> + INIT_LIST_HEAD(&fs_info->unused_bgs);
> btrfs_mapping_init(&fs_info->mapping_tree);
> btrfs_init_block_rsv(&fs_info->global_block_rsv,
> BTRFS_BLOCK_RSV_GLOBAL);
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index b30ddb4..c68cdb1 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -5433,6 +5433,20 @@ static int update_block_group(struct btrfs_root *root,
> spin_unlock(&cache->space_info->lock);
> } else {
> old_val -= num_bytes;
> +
> + /*
> + * No longer have used bytes in this block group, queue
> + * it for deletion.
> + */
> + if (old_val == 0) {
> + spin_lock(&info->unused_bgs_lock);
> + if (list_empty(&cache->bg_list)) {
> + btrfs_get_block_group(cache);
> + list_add_tail(&cache->bg_list,
> + &info->unused_bgs);
> + }
> + spin_unlock(&info->unused_bgs_lock);
> + }
> btrfs_set_block_group_used(&cache->item, old_val);
> cache->pinned += num_bytes;
> cache->space_info->bytes_pinned += num_bytes;
> @@ -8989,7 +9003,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
> init_rwsem(&cache->data_rwsem);
> INIT_LIST_HEAD(&cache->list);
> INIT_LIST_HEAD(&cache->cluster_list);
> - INIT_LIST_HEAD(&cache->new_bg_list);
> + INIT_LIST_HEAD(&cache->bg_list);
> btrfs_init_free_space_ctl(cache);
>
> return cache;
> @@ -9130,8 +9144,18 @@ int btrfs_read_block_groups(struct btrfs_root *root)
> __link_block_group(space_info, cache);
>
> set_avail_alloc_bits(root->fs_info, cache->flags);
> - if (btrfs_chunk_readonly(root, cache->key.objectid))
> + if (btrfs_chunk_readonly(root, cache->key.objectid)) {
> set_block_group_ro(cache, 1);
> + } else if (btrfs_block_group_used(&cache->item) == 0) {
> + spin_lock(&info->unused_bgs_lock);
> + /* Should always be true but just in case. */
> + if (list_empty(&cache->bg_list)) {
> + btrfs_get_block_group(cache);
> + list_add_tail(&cache->bg_list,
> + &info->unused_bgs);
> + }
> + spin_unlock(&info->unused_bgs_lock);
> + }
> }
>
> list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
> @@ -9172,10 +9196,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
> struct btrfs_key key;
> int ret = 0;
>
> - list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
> - new_bg_list) {
> - list_del_init(&block_group->new_bg_list);
> -
> + list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
> + list_del_init(&block_group->bg_list);
> if (ret)
> continue;
>
> @@ -9261,7 +9283,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
>
> __link_block_group(cache->space_info, cache);
>
> - list_add_tail(&cache->new_bg_list, &trans->new_bgs);
> + list_add_tail(&cache->bg_list, &trans->new_bgs);
>
> set_avail_alloc_bits(extent_root->fs_info, type);
>
> @@ -9430,6 +9452,98 @@ out:
> return ret;
> }
>
> +/*
> + * Process the unused_bgs list and remove any that don't have any allocated
> + * space inside of them.
> + */
> +void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
> +{
> + struct btrfs_block_group_cache *block_group;
> + struct btrfs_space_info *space_info;
> + struct btrfs_root *root = fs_info->extent_root;
> + struct btrfs_trans_handle *trans;
> + int ret = 0;
> +
> + spin_lock(&fs_info->unused_bgs_lock);
> + while (!list_empty(&fs_info->unused_bgs)) {
> + u64 start, end;
> +
> + block_group = list_first_entry(&fs_info->unused_bgs,
> + struct btrfs_block_group_cache,
> + bg_list);
> + space_info = block_group->space_info;
> + list_del_init(&block_group->bg_list);
> + if (ret) {
> + btrfs_put_block_group(block_group);
> + continue;
> + }
> + spin_unlock(&fs_info->unused_bgs_lock);
> +
> + /* Don't want to race with allocators so take the groups_sem */
> + down_write(&space_info->groups_sem);
> + spin_lock(&block_group->lock);
> + if (block_group->reserved ||
> + btrfs_block_group_used(&block_group->item) ||
> + block_group->ro) {
> + /*
> + * We want to bail if we made new allocations or have
> + * outstanding allocations in this block group. We do
> + * the ro check in case balance is currently acting on
> + * this block group.
> + */
> + spin_unlock(&block_group->lock);
> + up_write(&space_info->groups_sem);
> + goto next;
> + }
> + spin_unlock(&block_group->lock);
> +
> + /* We don't want to force the issue, only flip if it's ok. */
> + ret = set_block_group_ro(block_group, 0);
> + up_write(&space_info->groups_sem);
> + if (ret < 0) {
> + ret = 0;
> + goto next;
> + }
> +
> + /*
> + * Want to do this before we do anything else so we can recover
> + * properly if we fail to join the transaction.
> + */
> + trans = btrfs_join_transaction(root);
> + if (IS_ERR(trans)) {
> + btrfs_set_block_group_rw(root, block_group);
> + ret = PTR_ERR(trans);
> + goto next;
> + }
> +
> + /*
> + * We could have pending pinned extents for this block group,
> + * just delete them, we don't care about them anymore.
> + */
> + start = block_group->key.objectid;
> + end = start + block_group->key.offset - 1;
> + clear_extent_bits(&fs_info->freed_extents[0], start, end,
> + EXTENT_DIRTY, GFP_NOFS);
> + clear_extent_bits(&fs_info->freed_extents[1], start, end,
> + EXTENT_DIRTY, GFP_NOFS);
> +
> + /* Reset pinned so btrfs_put_block_group doesn't complain */
> + block_group->pinned = 0;
> +
> + /*
> + * Btrfs_remove_chunk will abort the transaction if things go
> + * horribly wrong.
> + */
> + ret = btrfs_remove_chunk(trans, root,
> + block_group->key.objectid);
> + btrfs_end_transaction(trans, root);
> +next:
> + btrfs_put_block_group(block_group);
> + spin_lock(&fs_info->unused_bgs_lock);
> + }
> + spin_unlock(&fs_info->unused_bgs_lock);
> +}
> +
> int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
> {
> struct btrfs_space_info *space_info;
> diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
> index d78ae10..2299bfd 100644
> --- a/fs/btrfs/tests/free-space-tests.c
> +++ b/fs/btrfs/tests/free-space-tests.c
> @@ -45,7 +45,7 @@ static struct btrfs_block_group_cache *init_test_block_group(void)
> spin_lock_init(&cache->lock);
> INIT_LIST_HEAD(&cache->list);
> INIT_LIST_HEAD(&cache->cluster_list);
> - INIT_LIST_HEAD(&cache->new_bg_list);
> + INIT_LIST_HEAD(&cache->bg_list);
>
> btrfs_init_free_space_ctl(cache);
>
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 77d7fbc..e8fc390 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -2568,58 +2568,49 @@ static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
> return ret;
> }
>
> -static int btrfs_relocate_chunk(struct btrfs_root *root,
> - u64 chunk_tree, u64 chunk_objectid,
> - u64 chunk_offset)
> +int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
> + struct btrfs_root *root, u64 chunk_offset)
> {
> struct extent_map_tree *em_tree;
> - struct btrfs_root *extent_root;
> - struct btrfs_trans_handle *trans;
> - struct btrfs_device *device;
> struct extent_map *em;
> + struct btrfs_root *extent_root = root->fs_info->extent_root;
> struct map_lookup *map;
> u64 dev_extent_len = 0;
> - int ret;
> - int i;
> + u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
> + u64 chunk_tree = root->fs_info->chunk_root->objectid;
> + int i, ret = 0;
>
> + /* Just in case */
> root = root->fs_info->chunk_root;
> - extent_root = root->fs_info->extent_root;
> em_tree = &root->fs_info->mapping_tree.map_tree;
>
> - ret = btrfs_can_relocate(extent_root, chunk_offset);
> - if (ret)
> - return -ENOSPC;
> -
> - /* step one, relocate all the extents inside this chunk */
> - ret = btrfs_relocate_block_group(extent_root, chunk_offset);
> - if (ret)
> - return ret;
> -
> - trans = btrfs_start_transaction(root, 0);
> - if (IS_ERR(trans)) {
> - ret = PTR_ERR(trans);
> - btrfs_std_error(root->fs_info, ret);
> - return ret;
> - }
> -
> - /*
> - * step two, delete the device extents and the
> - * chunk tree entries
> - */
> read_lock(&em_tree->lock);
> em = lookup_extent_mapping(em_tree, chunk_offset, 1);
> read_unlock(&em_tree->lock);
>
> - BUG_ON(!em || em->start > chunk_offset ||
> - em->start + em->len < chunk_offset);
> + if (!em || em->start > chunk_offset ||
> + em->start + em->len < chunk_offset) {
> + /*
> + * This is a logic error, but we don't want to just rely on the
> + * user having built with ASSERT enabled, so if ASSERT doens't
> + * do anything we still error out.
> + */
> + ASSERT(0);
> + if (em)
> + free_extent_map(em);
> + return -EINVAL;
> + }
> map = (struct map_lookup *)em->bdev;
>
> for (i = 0; i < map->num_stripes; i++) {
> - device = map->stripes[i].dev;
> + struct btrfs_device *device = map->stripes[i].dev;
> ret = btrfs_free_dev_extent(trans, device,
> map->stripes[i].physical,
> &dev_extent_len);
> - BUG_ON(ret);
> + if (ret) {
> + btrfs_abort_transaction(trans, root, ret);
> + goto out;
> + }
>
> if (device->bytes_used > 0) {
> lock_chunks(root);
> @@ -2634,23 +2625,34 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
>
> if (map->stripes[i].dev) {
> ret = btrfs_update_device(trans, map->stripes[i].dev);
> - BUG_ON(ret);
> + if (ret) {
> + btrfs_abort_transaction(trans, root, ret);
> + goto out;
> + }
> }
> }
> ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
> chunk_offset);
> -
> - BUG_ON(ret);
> + if (ret) {
> + btrfs_abort_transaction(trans, root, ret);
> + goto out;
> + }
>
> trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
>
> if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
> ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
> - BUG_ON(ret);
> + if (ret) {
> + btrfs_abort_transaction(trans, root, ret);
> + goto out;
> + }
> }
>
> ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
> - BUG_ON(ret);
> + if (ret) {
> + btrfs_abort_transaction(trans, extent_root, ret);
> + goto out;
> + }
>
> write_lock(&em_tree->lock);
> remove_extent_mapping(em_tree, em);
> @@ -2658,11 +2660,46 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
>
> /* once for the tree */
> free_extent_map(em);
> +out:
> /* once for us */
> free_extent_map(em);
> + return ret;
> +}
> +
> +static int btrfs_relocate_chunk(struct btrfs_root *root,
> + u64 chunk_tree, u64 chunk_objectid,
> + u64 chunk_offset)
> +{
> + struct btrfs_root *extent_root;
> + struct btrfs_trans_handle *trans;
> + int ret;
> +
> + root = root->fs_info->chunk_root;
> + extent_root = root->fs_info->extent_root;
> +
> + ret = btrfs_can_relocate(extent_root, chunk_offset);
> + if (ret)
> + return -ENOSPC;
> +
> + /* step one, relocate all the extents inside this chunk */
> + ret = btrfs_relocate_block_group(extent_root, chunk_offset);
> + if (ret)
> + return ret;
>
> + trans = btrfs_start_transaction(root, 0);
> + if (IS_ERR(trans)) {
> + ret = PTR_ERR(trans);
> + btrfs_std_error(root->fs_info, ret);
> + return ret;
> + }
> +
> + /*
> + * step two, delete the device extents and the
> + * chunk tree entries
> + */
> + ret = btrfs_remove_chunk(trans, root, chunk_offset);
> btrfs_end_transaction(trans, root);
> - return 0;
> + return ret;
> }
>
> static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index 2b37da3..f648d19 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -462,6 +462,8 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
> int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
> struct btrfs_root *extent_root,
> u64 chunk_offset, u64 chunk_size);
> +int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
> + struct btrfs_root *root, u64 chunk_offset);
>
> static inline int btrfs_dev_stats_dirty(struct btrfs_device *dev)
> {
> --
> 1.8.3.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
Best Regards,
Wang Shilong
next prev parent reply other threads:[~2014-09-14 4:46 UTC|newest]
Thread overview: 4+ messages / expand[flat|nested] mbox.gz Atom feed top
2014-09-12 19:18 [PATCH] Btrfs: remove empty block groups automatically Josef Bacik
2014-09-12 20:02 ` Chris Mason
2014-09-14 4:46 ` Wang Shilong [this message]
2014-09-15 13:57 ` Josef Bacik
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=F9E95213-710B-42A2-A904-3EE1CE95B72E@gmail.com \
--to=wangshilong1991@gmail.com \
--cc=jbacik@fb.com \
--cc=linux-btrfs@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).