Subject: Re: [PATCH 2/3] Btrfs: rework qgroup accounting
From: Wang Shilong
Date: Sat, 21 Dec 2013 16:01:58 +0800
To: Josef Bacik
In-Reply-To: <1387400849-7274-3-git-send-email-jbacik@fb.com>
References: <1387400849-7274-1-git-send-email-jbacik@fb.com> <1387400849-7274-3-git-send-email-jbacik@fb.com>

Hi Josef,

I compiled btrfs-next on my 32-bit machine and got the following warnings:

fs/btrfs/qgroup.c: In function ‘qgroup_excl_accounting’:
fs/btrfs/qgroup.c:1503:12: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
  qgroup = (struct btrfs_qgroup *)unode->aux;
           ^
fs/btrfs/qgroup.c: In function ‘qgroup_calc_old_refcnt’:
fs/btrfs/qgroup.c:1571:9: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
  qg = (struct btrfs_qgroup *)tmp_unode->aux;
       ^
fs/btrfs/qgroup.c: In function ‘qgroup_account_deleted_refs’:
fs/btrfs/qgroup.c:1665:8: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
  qg = (struct btrfs_qgroup *)unode->aux;
       ^
fs/btrfs/qgroup.c: In function ‘qgroup_calc_new_refcnt’:
fs/btrfs/qgroup.c:1705:8: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
  qg = (struct btrfs_qgroup *)unode->aux;
       ^
fs/btrfs/qgroup.c: In function ‘qgroup_adjust_counters’:
fs/btrfs/qgroup.c:1767:8: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
  qg = (struct btrfs_qgroup *)unode->aux;

This patch is newly added to btrfs-next, so I think it would be better to fix these warnings locally (a sketch of one possible fix follows the quoted commit message below). ^_^

Thanks,
Wang

> Currently qgroups account for space by intercepting delayed ref updates to fs
> trees.  It does this by adding sequence numbers to delayed ref updates so that
> it can figure out how the tree looked before the update and adjust the
> counters properly.  The problem with this is that it does not allow delayed refs
> to be merged, so if you are, say, defragging an extent with 5k snapshots pointing
> to it, we will thrash the delayed ref lock because we need to go back and
> manually merge these things together.  Instead we want to process quota changes
> when we know they are going to happen, like when we first allocate an extent,
> when we free a reference for an extent, when we add new references, etc.  This
> patch does this and removes the reliance on the delayed ref sequence number.
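About the 32-bit cast warnings above: a minimal sketch of one possible local fix, mirroring the (uintptr_t) intermediate casts the pre-patch code used when packing qgroup pointers into the u64 ulist aux field. The helper names qgroup_to_aux()/aux_to_qgroup() are made up for illustration; simply restoring the (uintptr_t) cast at each warning site would silence the warnings just as well.

	/*
	 * Funnel every pointer <-> u64 conversion for ulist aux values
	 * through uintptr_t so 32-bit builds do not warn about casting
	 * between integers and pointers of different sizes.
	 */
	static inline u64 qgroup_to_aux(struct btrfs_qgroup *qg)
	{
		return (u64)(uintptr_t)qg;
	}

	static inline struct btrfs_qgroup *aux_to_qgroup(u64 aux)
	{
		return (struct btrfs_qgroup *)(uintptr_t)aux;
	}

	/* e.g. in qgroup_excl_accounting(): */
	ret = ulist_add(tmp, glist->group->qgroupid,
			qgroup_to_aux(glist->group), GFP_ATOMIC);
	...
	qgroup = aux_to_qgroup(unode->aux);
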
Thanks, > > Signed-off-by: Josef Bacik > --- > fs/btrfs/backref.c | 19 +- > fs/btrfs/backref.h | 5 +- > fs/btrfs/ctree.h | 51 ++- > fs/btrfs/delayed-ref.c | 10 - > fs/btrfs/delayed-ref.h | 19 - > fs/btrfs/disk-io.c | 3 + > fs/btrfs/extent-tree.c | 457 ++++++++++++++++---- > fs/btrfs/qgroup.c | 1075 +++++++++++++++++++++++++++++++++++++----------- > fs/btrfs/transaction.c | 51 ++- > 9 files changed, 1312 insertions(+), 378 deletions(-) > > diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c > index 6a3f7f5..9ee3030 100644 > --- a/fs/btrfs/backref.c > +++ b/fs/btrfs/backref.c > @@ -817,7 +817,8 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, > static int find_parent_nodes(struct btrfs_trans_handle *trans, > struct btrfs_fs_info *fs_info, u64 bytenr, > u64 time_seq, struct ulist *refs, > - struct ulist *roots, const u64 *extent_item_pos) > + struct ulist *roots, const u64 *extent_item_pos, > + int search_commit_root) > { > struct btrfs_key key; > struct btrfs_path *path; > @@ -842,7 +843,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, > path = btrfs_alloc_path(); > if (!path) > return -ENOMEM; > - if (!trans) > + if (search_commit_root) > path->search_commit_root = 1; > > /* > @@ -1046,7 +1047,8 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, > } > > ret = find_parent_nodes(trans, fs_info, bytenr, > - time_seq, *leafs, tmp, extent_item_pos); > + time_seq, *leafs, tmp, extent_item_pos, > + (trans == NULL)); > ulist_free(tmp); > > if (ret < 0 && ret != -ENOENT) { > @@ -1071,8 +1073,9 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, > * returns 0 on success, < 0 on error. > */ > int btrfs_find_all_roots(struct btrfs_trans_handle *trans, > - struct btrfs_fs_info *fs_info, u64 bytenr, > - u64 time_seq, struct ulist **roots) > + struct btrfs_fs_info *fs_info, u64 bytenr, > + u64 time_seq, struct ulist **roots, > + int search_commit_root) > { > struct ulist *tmp; > struct ulist_node *node = NULL; > @@ -1091,7 +1094,8 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, > ULIST_ITER_INIT(&uiter); > while (1) { > ret = find_parent_nodes(trans, fs_info, bytenr, > - time_seq, tmp, *roots, NULL); > + time_seq, tmp, *roots, NULL, > + search_commit_root); > if (ret < 0 && ret != -ENOENT) { > ulist_free(tmp); > ulist_free(*roots); > @@ -1497,7 +1501,8 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, > ULIST_ITER_INIT(&ref_uiter); > while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { > ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, > - tree_mod_seq_elem.seq, &roots); > + tree_mod_seq_elem.seq, &roots, > + search_commit_root); > if (ret) > break; > ULIST_ITER_INIT(&root_uiter); > diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h > index a910b27..0ad87c8 100644 > --- a/fs/btrfs/backref.h > +++ b/fs/btrfs/backref.h > @@ -55,8 +55,9 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, > int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); > > int btrfs_find_all_roots(struct btrfs_trans_handle *trans, > - struct btrfs_fs_info *fs_info, u64 bytenr, > - u64 time_seq, struct ulist **roots); > + struct btrfs_fs_info *fs_info, u64 bytenr, > + u64 time_seq, struct ulist **roots, > + int search_commit_root); > char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, > u32 name_len, unsigned long name_off, > struct extent_buffer *eb_in, u64 parent, > diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h > index 8b3fd61..944c916 100644 > --- 
a/fs/btrfs/ctree.h > +++ b/fs/btrfs/ctree.h > @@ -1289,6 +1289,32 @@ enum btrfs_orphan_cleanup_state { > ORPHAN_CLEANUP_DONE = 2, > }; > > +/* > + * A description of the operations, all of these operations only happen when we > + * are adding the 1st reference for that subvolume in the case of adding space > + * or on the last reference delete in the case of subtraction. > + * > + * BTRFS_QGROUP_OPER_ADD_EXCL: adding bytes where this subvolume is the only > + * one pointing at the bytes we are adding. This is called on the first > + * allocation. > + * > + * BTRFS_QGROUP_OPER_ADD_SHARED: adding bytes where this bytenr is going to be > + * shared between subvols. This is called on the creation of a ref that already > + * has refs from a different subvolume, so basically reflink. > + * > + * BTRFS_QGROUP_OPER_SUB_EXCL: removing bytes where this subvolume is the only > + * one referencing the range. > + * > + * BTRFS_QGROUP_OPER_SUB_SHARED: removing bytes where this subvolume shares with > + * refs with other subvolumes. > + */ > +enum btrfs_qgroup_operation_type { > + BTRFS_QGROUP_OPER_ADD_EXCL, > + BTRFS_QGROUP_OPER_ADD_SHARED, > + BTRFS_QGROUP_OPER_SUB_EXCL, > + BTRFS_QGROUP_OPER_SUB_SHARED, > +}; > + > /* used by the raid56 code to lock stripes for read/modify/write */ > struct btrfs_stripe_hash { > struct list_head hash_list; > @@ -1629,7 +1655,10 @@ struct btrfs_fs_info { > > /* holds configuration and tracking. Protected by qgroup_lock */ > struct rb_root qgroup_tree; > + struct rb_root qgroup_op_tree; > spinlock_t qgroup_lock; > + spinlock_t qgroup_op_lock; > + atomic_t qgroup_op_seq; > > /* > * used to avoid frequently calling ulist_alloc()/ulist_free() > @@ -3323,12 +3352,10 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, > struct btrfs_fs_info *fs_info); > int __get_raid_index(u64 flags); > int lock_ref(struct btrfs_fs_info *fs_info, u64 root_objectid, u64 bytenr, > - u64 num_bytes, int for_cow, > - struct btrfs_block_group_cache **block_group, > + u64 num_bytes, struct btrfs_block_group_cache **block_group, > struct extent_state **cached_state); > int unlock_ref(struct btrfs_fs_info *fs_info, u64 root_objectid, u64 bytenr, > - u64 num_bytes, int for_cow, > - struct btrfs_block_group_cache *block_group, > + u64 num_bytes, struct btrfs_block_group_cache *block_group, > struct extent_state **cached_state); > /* ctree.c */ > int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, > @@ -4031,12 +4058,16 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); > void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); > struct btrfs_delayed_extent_op; > int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, > - struct btrfs_delayed_ref_node *node, > - struct btrfs_delayed_extent_op *extent_op); > -int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, > - struct btrfs_fs_info *fs_info, > - struct btrfs_delayed_ref_node *node, > - struct btrfs_delayed_extent_op *extent_op); > + struct btrfs_fs_info *fs_info, u64 ref_root, > + u64 bytenr, u64 num_bytes, u64 parent, > + enum btrfs_qgroup_operation_type type); > +int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans, > + struct btrfs_fs_info *fs_info); > +void btrfs_remove_last_qgroup_operation(struct btrfs_trans_handle *trans, > + struct btrfs_fs_info *fs_info, > + u64 ref_root); > +int btrfs_qgroup_add_shared_ref(struct btrfs_trans_handle *trans, u64 ref_root, > + u64 parent); > int btrfs_run_qgroups(struct btrfs_trans_handle *trans, > struct 
btrfs_fs_info *fs_info); > int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, > diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c > index ee1c29d..d5a601e 100644 > --- a/fs/btrfs/delayed-ref.c > +++ b/fs/btrfs/delayed-ref.c > @@ -681,9 +681,6 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info, > ref->is_head = 0; > ref->in_tree = 1; > ref->for_cow = for_cow; > - > - if (need_ref_seq(for_cow, ref_root)) > - seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem); > ref->seq = seq; > > full_ref = btrfs_delayed_node_to_tree_ref(ref); > @@ -741,9 +738,6 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info, > ref->is_head = 0; > ref->in_tree = 1; > ref->for_cow = for_cow; > - > - if (need_ref_seq(for_cow, ref_root)) > - seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem); > ref->seq = seq; > > full_ref = btrfs_delayed_node_to_data_ref(ref); > @@ -817,8 +811,6 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, > num_bytes, parent, ref_root, level, action, > for_cow); > spin_unlock(&delayed_refs->lock); > - if (need_ref_seq(for_cow, ref_root)) > - btrfs_qgroup_record_ref(trans, &ref->node, extent_op); > > return 0; > } > @@ -865,8 +857,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, > num_bytes, parent, ref_root, owner, offset, > action, for_cow); > spin_unlock(&delayed_refs->lock); > - if (need_ref_seq(for_cow, ref_root)) > - btrfs_qgroup_record_ref(trans, &ref->node, extent_op); > > return 0; > } > diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h > index db71a37..b747180 100644 > --- a/fs/btrfs/delayed-ref.h > +++ b/fs/btrfs/delayed-ref.h > @@ -241,25 +241,6 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, > u64 seq); > > /* > - * delayed refs with a ref_seq > 0 must be held back during backref walking. > - * this only applies to items in one of the fs-trees. for_cow items never need > - * to be held back, so they won't get a ref_seq number. > - */ > -static inline int need_ref_seq(int for_cow, u64 rootid) > -{ > - if (for_cow) > - return 0; > - > - if (rootid == BTRFS_FS_TREE_OBJECTID) > - return 1; > - > - if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID) > - return 1; > - > - return 0; > -} > - > -/* > * a node might live in a head or a regular ref, this lets you > * test for the proper type to use. 
> */ > diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c > index 4a1871c..3eb27b9 100644 > --- a/fs/btrfs/disk-io.c > +++ b/fs/btrfs/disk-io.c > @@ -2151,6 +2151,7 @@ int open_ctree(struct super_block *sb, > spin_lock_init(&fs_info->free_chunk_lock); > spin_lock_init(&fs_info->tree_mod_seq_lock); > spin_lock_init(&fs_info->super_lock); > + spin_lock_init(&fs_info->qgroup_op_lock); > spin_lock_init(&fs_info->buffer_lock); > rwlock_init(&fs_info->tree_mod_log_lock); > mutex_init(&fs_info->reloc_mutex); > @@ -2175,6 +2176,7 @@ int open_ctree(struct super_block *sb, > atomic_set(&fs_info->async_submit_draining, 0); > atomic_set(&fs_info->nr_async_bios, 0); > atomic_set(&fs_info->defrag_running, 0); > + atomic_set(&fs_info->qgroup_op_seq, 0); > atomic64_set(&fs_info->tree_mod_seq, 0); > fs_info->sb = sb; > fs_info->max_inline = 8192 * 1024; > @@ -2282,6 +2284,7 @@ int open_ctree(struct super_block *sb, > spin_lock_init(&fs_info->qgroup_lock); > mutex_init(&fs_info->qgroup_ioctl_lock); > fs_info->qgroup_tree = RB_ROOT; > + fs_info->qgroup_op_tree = RB_ROOT; > INIT_LIST_HEAD(&fs_info->dirty_qgroups); > fs_info->qgroup_seq = 1; > fs_info->quota_enabled = 0; > diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c > index 03b536c..71673d6 100644 > --- a/fs/btrfs/extent-tree.c > +++ b/fs/btrfs/extent-tree.c > @@ -680,7 +680,6 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group( > * @root_objectid: the root objectid that we are modifying for this extent. > * @bytenr: the byte we are modifying the reference for > * @num_bytes: the number of bytes we are locking. > - * @for_cow: if this operation is for cow then we don't need to lock > * @block_group: we will store the block group we looked up so that the unlock > * doesn't have to do another search. > * @cached_state: this is for caching our location so when we unlock we don't > @@ -689,14 +688,13 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group( > * This can return -ENOMEM if we cannot allocate our extent state. > */ > int lock_ref(struct btrfs_fs_info *fs_info, u64 root_objectid, u64 bytenr, > - u64 num_bytes, int for_cow, > - struct btrfs_block_group_cache **block_group, > + u64 num_bytes, struct btrfs_block_group_cache **block_group, > struct extent_state **cached_state) > { > struct btrfs_block_group_cache *cache; > int ret; > > - if (!fs_info->quota_enabled || !need_ref_seq(for_cow, root_objectid)) > + if (!fs_info->quota_enabled || !is_fstree(root_objectid)) > return 0; > > cache = btrfs_lookup_block_group(fs_info, bytenr); > @@ -721,7 +719,6 @@ int lock_ref(struct btrfs_fs_info *fs_info, u64 root_objectid, u64 bytenr, > * @root_objectid: the root objectid that we are modifying for this extent. > * @bytenr: the byte we are modifying the reference for > * @num_bytes: the number of bytes we are locking. > - * @for_cow: if this ref update is for cow we didn't take the lock. > * @block_group: the block_group we got from lock_ref. > * @cached_state: this is for caching our location so when we unlock we don't > * have to do a tree search. > @@ -729,13 +726,12 @@ int lock_ref(struct btrfs_fs_info *fs_info, u64 root_objectid, u64 bytenr, > * This can return -ENOMEM if we fail to allocate an extent state. 
> */ > int unlock_ref(struct btrfs_fs_info *fs_info, u64 root_objectid, u64 bytenr, > - u64 num_bytes, int for_cow, > - struct btrfs_block_group_cache *block_group, > + u64 num_bytes, struct btrfs_block_group_cache *block_group, > struct extent_state **cached_state) > { > int ret; > > - if (!fs_info->quota_enabled || !need_ref_seq(for_cow, root_objectid)) > + if (!fs_info->quota_enabled || !is_fstree(root_objectid)) > return 0; > > ret = unlock_extent_cached(&block_group->ref_lock, bytenr, > @@ -1342,7 +1338,7 @@ fail: > static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, > struct btrfs_root *root, > struct btrfs_path *path, > - int refs_to_drop) > + int refs_to_drop, int *last_ref) > { > struct btrfs_key key; > struct btrfs_extent_data_ref *ref1 = NULL; > @@ -1378,6 +1374,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, > > if (num_refs == 0) { > ret = btrfs_del_item(trans, root, path); > + *last_ref = 1; > } else { > if (key.type == BTRFS_EXTENT_DATA_REF_KEY) > btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); > @@ -1533,6 +1530,193 @@ static int find_next_key(struct btrfs_path *path, int level, > } > > /* > + * Look at an inline extent backref to see if one of the references matches the > + * root we are looking for. > + */ > +static int find_root_in_inline_backref(struct btrfs_trans_handle *trans, > + struct btrfs_path *path, u64 ref_root, > + int slot, int add_shared, > + int *refs_to_add, int *shared_refs) > +{ > + struct extent_buffer *leaf = path->nodes[0]; > + struct btrfs_extent_item *ei; > + struct btrfs_extent_data_ref *dref; > + struct btrfs_extent_inline_ref *iref; > + struct btrfs_key key; > + u64 item_size; > + u64 flags; > + u64 root; > + u64 refs; > + u64 parent; > + unsigned long ptr; > + unsigned long end; > + int type; > + int ret = 0; > + bool found = false; > + > + item_size = btrfs_item_size_nr(leaf, slot); > +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 > + /* We really shouldn't have V0 references with an fs that has quotas. 
*/ > + if (item_size < sizeof(*ei)) > + return -EINVAL; > +#endif > + btrfs_item_key_to_cpu(leaf, &key, slot); > + ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); > + flags = btrfs_extent_flags(leaf, ei); > + > + ptr = (unsigned long)(ei + 1); > + end = (unsigned long)ei + item_size; > + > + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && > + key.type == BTRFS_EXTENT_ITEM_KEY) { > + ptr += sizeof(struct btrfs_tree_block_info); > + ASSERT(ptr <= end); > + } > + > + while (!found && ptr < end) { > + iref = (struct btrfs_extent_inline_ref *)ptr; > + type = btrfs_extent_inline_ref_type(leaf, iref); > + > + switch (type) { > + case BTRFS_EXTENT_DATA_REF_KEY: > + dref = (struct btrfs_extent_data_ref *)(&iref->offset); > + root = btrfs_extent_data_ref_root(leaf, dref); > + if (root != ref_root) > + break; > + refs = btrfs_extent_data_ref_count(leaf, dref); > + if (refs > (u64)(*refs_to_add)) > + found = true; > + else > + *refs_to_add -= (int)refs; > + break; > + case BTRFS_TREE_BLOCK_REF_KEY: > + if (btrfs_extent_inline_ref_offset(leaf, iref) != > + ref_root) > + break; > + if (*refs_to_add == 0) > + found = true; > + (*refs_to_add)--; > + break; > + case BTRFS_SHARED_DATA_REF_KEY: > + case BTRFS_SHARED_BLOCK_REF_KEY: > + (*shared_refs)++; > + if (!add_shared) > + break; > + parent = btrfs_extent_inline_ref_offset(leaf, iref); > + ret = btrfs_qgroup_add_shared_ref(trans, ref_root, > + parent); > + break; > + default: > + ret = -EINVAL; > + break; > + } > + if (ret || found) > + break; > + ptr += btrfs_extent_inline_ref_size(type); > + } > + if (found) > + ret = 1; > + return ret; > +} > + > +/* > + * This will look for the given root ref from path. This assumes that path is > + * pointing at the extent item for the extent you want to check. > + */ > +static int root_has_ref(struct btrfs_trans_handle *trans, > + struct btrfs_root *root, struct btrfs_path *path, > + u64 bytenr, u64 ref_root, int can_search_forward, > + int add_shared, int refs_to_add, int *shared_refs) > +{ > + struct extent_buffer *leaf = path->nodes[0]; > + struct btrfs_extent_data_ref *ref; > + struct btrfs_key key; > + u64 refs; > + u64 found_root; > + int slot = path->slots[0]; > + int ret = 0; > + bool found = false; > + > + root = root->fs_info->extent_root; > + > + /* Return 1 so we don't do any of the accounting */ > + if (!root->fs_info->quota_enabled || !is_fstree(ref_root)) > + return 1; > + > + while (!found) { > + btrfs_item_key_to_cpu(leaf, &key, slot); > + if (key.objectid != bytenr) > + break; > + if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) > + goto next; > + switch (key.type) { > + case BTRFS_EXTENT_ITEM_KEY: > + case BTRFS_METADATA_ITEM_KEY: > + ret = find_root_in_inline_backref(trans, path, > + ref_root, slot, > + add_shared, > + &refs_to_add, > + shared_refs); > + if (ret > 0) > + found = true; > + break; > + case BTRFS_EXTENT_DATA_REF_KEY: > + ref = btrfs_item_ptr(leaf, slot, > + struct btrfs_extent_data_ref); > + found_root = btrfs_extent_data_ref_root(leaf, ref); > + if (found_root != ref_root) > + break; > + refs = btrfs_extent_data_ref_count(leaf, ref); > + if (refs > (u64)refs_to_add) > + found = true; > + else > + refs_to_add -= (int)refs; > + break; > + case BTRFS_TREE_BLOCK_REF_KEY: > + if (key.offset != ref_root) > + break; > + if (!refs_to_add) > + found = true; > + refs_to_add--; > + break; > + case BTRFS_SHARED_DATA_REF_KEY: > + case BTRFS_SHARED_BLOCK_REF_KEY: > + (*shared_refs)++; > + if (!add_shared) > + break; > + ret = btrfs_qgroup_add_shared_ref(trans, ref_root, > + key.offset); > + 
break; > + default: > + ret = -EINVAL; > + break; > + } > + if (ret || found) > + break; > +next: > + slot++; > + if (slot >= btrfs_header_nritems(leaf)) { > + if (!can_search_forward) { > + ret = -EAGAIN; > + break; > + } > + ret = btrfs_next_leaf(root, path); > + if (ret < 0) > + break; > + if (ret > 0) { > + ret = 0; > + break; > + } > + leaf = path->nodes[0]; > + slot = 0; > + } > + } > + if (found) > + ret = 1; > + return ret; > +} > + > +/* > * look for inline back ref. if back ref is found, *ref_ret is set > * to the address of inline back ref, and 0 is returned. > * > @@ -1834,7 +2018,8 @@ void update_inline_extent_backref(struct btrfs_root *root, > struct btrfs_path *path, > struct btrfs_extent_inline_ref *iref, > int refs_to_mod, > - struct btrfs_delayed_extent_op *extent_op) > + struct btrfs_delayed_extent_op *extent_op, > + int *last_ref) > { > struct extent_buffer *leaf; > struct btrfs_extent_item *ei; > @@ -1878,6 +2063,7 @@ void update_inline_extent_backref(struct btrfs_root *root, > else > btrfs_set_shared_data_ref_count(leaf, sref, refs); > } else { > + *last_ref = 1; > size = btrfs_extent_inline_ref_size(type); > item_size = btrfs_item_size_nr(leaf, path->slots[0]); > ptr = (unsigned long)iref; > @@ -1898,9 +2084,11 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, > u64 bytenr, u64 num_bytes, u64 parent, > u64 root_objectid, u64 owner, > u64 offset, int refs_to_add, > - struct btrfs_delayed_extent_op *extent_op) > + struct btrfs_delayed_extent_op *extent_op, > + int *new_ref) > { > struct btrfs_extent_inline_ref *iref; > + int shared_refs = 0; > int ret; > > ret = lookup_inline_extent_backref(trans, root, path, &iref, > @@ -1909,8 +2097,33 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, > if (ret == 0) { > BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); > update_inline_extent_backref(root, path, iref, > - refs_to_add, extent_op); > + refs_to_add, extent_op, NULL); > } else if (ret == -ENOENT) { > + /* > + * We want to quickly see if we are adding a ref for a root that > + * already holds a ref on this backref, so we'll search forward > + * as much as we can to see if that's the case. This is to keep > + * us from searching again if we don't have to, otherwise we'll > + * have to do this whole search over again to make sure we > + * really don't have another guy. Keep in mind this does > + * nothing if quotas aren't enabled. > + */ > + ret = root_has_ref(trans, root, path, bytenr, root_objectid, 0, > + 0, 0, &shared_refs); > + if (ret <= 0) { > + /* > + * If we got an error back from root_has_ref or we > + * tripped over a shared ref then we need to make sure > + * the caller re-calls root_has_ref to either verify > + * this root is not already pointing at this bytenr or > + * in the case of shared_refs we add the shared refs we > + * find. 
> + */ > + if (shared_refs || ret < 0) > + *new_ref = 2; > + else > + *new_ref = 1; > + } > setup_inline_extent_backref(root, path, iref, parent, > root_objectid, owner, offset, > refs_to_add, extent_op); > @@ -1942,17 +2155,19 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, > struct btrfs_root *root, > struct btrfs_path *path, > struct btrfs_extent_inline_ref *iref, > - int refs_to_drop, int is_data) > + int refs_to_drop, int is_data, int *last_ref) > { > int ret = 0; > > BUG_ON(!is_data && refs_to_drop != 1); > if (iref) { > update_inline_extent_backref(root, path, iref, > - -refs_to_drop, NULL); > + -refs_to_drop, NULL, last_ref); > } else if (is_data) { > - ret = remove_extent_data_ref(trans, root, path, refs_to_drop); > + ret = remove_extent_data_ref(trans, root, path, refs_to_drop, > + last_ref); > } else { > + *last_ref = 1; > ret = btrfs_del_item(trans, root, path); > } > return ret; > @@ -2045,11 +2260,16 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, > u64 owner, u64 offset, int refs_to_add, > struct btrfs_delayed_extent_op *extent_op) > { > + struct btrfs_fs_info *fs_info = root->fs_info; > struct btrfs_path *path; > struct extent_buffer *leaf; > struct btrfs_extent_item *item; > + struct btrfs_key key; > u64 refs; > + int new_ref = 0; > + int shared_refs = 0; > int ret; > + enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL; > > path = btrfs_alloc_path(); > if (!path) > @@ -2058,16 +2278,75 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, > path->reada = 1; > path->leave_spinning = 1; > /* this will setup the path even if it fails to insert the back ref */ > - ret = insert_inline_extent_backref(trans, root->fs_info->extent_root, > - path, bytenr, num_bytes, parent, > + ret = insert_inline_extent_backref(trans, fs_info->extent_root, path, > + bytenr, num_bytes, parent, > root_objectid, owner, offset, > - refs_to_add, extent_op); > - if (ret != -EAGAIN) > + refs_to_add, extent_op, &new_ref); > + if ((ret < 0 && ret != -EAGAIN) || (!ret && !new_ref)) > + goto out; > + /* > + * Ok we were able to insert an inline extent and it appears to be a new > + * reference, deal with the qgroup accounting. > + */ > + if (!ret && new_ref) { > + int shared_refs = 0; > + > + ASSERT(root->fs_info->quota_enabled); > + leaf = path->nodes[0]; > + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); > + item = btrfs_item_ptr(leaf, path->slots[0], > + struct btrfs_extent_item); > + if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add) > + type = BTRFS_QGROUP_OPER_ADD_SHARED; > + > + /* Parent doesn't matter for add operations */ > + ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, > + bytenr, num_bytes, 0, type); > + if (ret < 0) > + goto out; > + > + /* There are no shared refs, so we can just exit now. */ > + if (new_ref == 1) > + goto out; > + btrfs_release_path(path); > + path->leave_spinning = 0; > + ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, > + path, 0, 0); > + if (ret < 0) > + goto out; > + ASSERT(ret == 0); > + > + /* > + * Have to pass the refs we added since we will have added this > + * backref already so we needto make sure there wasn't any other > + * refs for this root. > + */ > + ret = root_has_ref(trans, root, path, bytenr, root_objectid, > + 1, 1, refs_to_add, &shared_refs); > + if (ret < 0) > + goto out; > + /* > + * Upon a second search we found our root referencing this > + * bytenr already. 
> + */ > + if (ret > 0) > + btrfs_remove_last_qgroup_operation(trans, fs_info, > + root_objectid); > + ret = 0; > goto out; > + } > > + /* > + * Ok we had -EAGAIN which means we didn't have space to insert and > + * inline extent ref, so just update the reference count and add a > + * normal backref. > + */ > leaf = path->nodes[0]; > + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); > item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); > refs = btrfs_extent_refs(leaf, item); > + if (refs) > + type = BTRFS_QGROUP_OPER_ADD_SHARED; > btrfs_set_extent_refs(leaf, item, refs + refs_to_add); > if (extent_op) > __run_delayed_extent_op(extent_op, leaf, item); > @@ -2075,9 +2354,37 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, > btrfs_mark_buffer_dirty(leaf); > btrfs_release_path(path); > > + /* > + * Do qgroup accounting. We go ahead and add the ref first in case > + * there are no matches, since in the other case we'd have to run > + * root_has_ref twice, once without add_shared set and then again with > + * add_shared set if there were any shared refs. > + */ > + if (is_fstree(root_objectid) && fs_info->quota_enabled) { > + ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, > + bytenr, num_bytes, 0, type); > + if (ret) > + goto out; > + path->leave_spinning = 0; > + ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, > + path, 0, 0); > + if (ret < 0) > + goto out; > + ASSERT(ret == 0); > + > + ret = root_has_ref(trans, root, path, bytenr, root_objectid, > + 1, 1, 0, &shared_refs); > + if (ret < 0) > + goto out; > + if (ret > 0) > + btrfs_remove_last_qgroup_operation(trans, fs_info, > + root_objectid); > + btrfs_release_path(path); > + ret = 0; > + } > + > path->reada = 1; > path->leave_spinning = 1; > - > /* now insert the actual backref */ > ret = insert_extent_backref(trans, root->fs_info->extent_root, > path, bytenr, parent, root_objectid, > @@ -2114,11 +2421,10 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, > > if (node->type == BTRFS_SHARED_DATA_REF_KEY) > parent = ref->parent; > - else > - ref_root = ref->root; > + ref_root = ref->root; > > ret = lock_ref(root->fs_info, ref->root, node->bytenr, node->num_bytes, > - node->for_cow, &block_group, &cached_state); > + &block_group, &cached_state); > if (ret) > return ret; > if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { > @@ -2144,8 +2450,7 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, > BUG(); > } > err = unlock_ref(root->fs_info, ref->root, node->bytenr, > - node->num_bytes, node->for_cow, block_group, > - &cached_state); > + node->num_bytes, block_group, &cached_state); > return ret ? 
ret : err; > } > > @@ -2282,8 +2587,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, > > if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) > parent = ref->parent; > - else > - ref_root = ref->root; > + ref_root = ref->root; > > ins.objectid = node->bytenr; > if (skinny_metadata) { > @@ -2295,7 +2599,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, > } > > ret = lock_ref(root->fs_info, ref->root, node->bytenr, node->num_bytes, > - node->for_cow, &block_group, &cached_state); > + &block_group, &cached_state); > if (ret) > return ret; > BUG_ON(node->ref_mod != 1); > @@ -2318,8 +2622,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, > BUG(); > } > err = unlock_ref(root->fs_info, ref->root, node->bytenr, > - node->num_bytes, node->for_cow, block_group, > - &cached_state); > + node->num_bytes, block_group, &cached_state); > return ret ? ret : err; > } > > @@ -2633,42 +2936,6 @@ static u64 find_middle(struct rb_root *root) > } > #endif > > -int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, > - struct btrfs_fs_info *fs_info) > -{ > - struct qgroup_update *qgroup_update; > - int ret = 0; > - > - if (list_empty(&trans->qgroup_ref_list) != > - !trans->delayed_ref_elem.seq) { > - /* list without seq or seq without list */ > - btrfs_err(fs_info, > - "qgroup accounting update error, list is%s empty, seq is %#x.%x", > - list_empty(&trans->qgroup_ref_list) ? "" : " not", > - (u32)(trans->delayed_ref_elem.seq >> 32), > - (u32)trans->delayed_ref_elem.seq); > - BUG(); > - } > - > - if (!trans->delayed_ref_elem.seq) > - return 0; > - > - while (!list_empty(&trans->qgroup_ref_list)) { > - qgroup_update = list_first_entry(&trans->qgroup_ref_list, > - struct qgroup_update, list); > - list_del(&qgroup_update->list); > - if (!ret) > - ret = btrfs_qgroup_account_ref( > - trans, fs_info, qgroup_update->node, > - qgroup_update->extent_op); > - kfree(qgroup_update); > - } > - > - btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem); > - > - return ret; > -} > - > static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq, > int count) > { > @@ -2754,8 +3021,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, > if (root == root->fs_info->extent_root) > root = root->fs_info->tree_root; > > - btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); > - > delayed_refs = &trans->transaction->delayed_refs; > INIT_LIST_HEAD(&cluster); > if (count == 0) { > @@ -2829,6 +3094,7 @@ again: > btrfs_abort_transaction(trans, root, ret); > atomic_dec(&delayed_refs->procs_running_refs); > wake_up(&delayed_refs->wait); > + btrfs_delayed_qgroup_accounting(trans, root->fs_info); > return ret; > } > > @@ -2910,6 +3176,9 @@ out: > wake_up(&delayed_refs->wait); > > spin_unlock(&delayed_refs->lock); > + ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info); > + if (ret) > + return ret; > assert_qgroups_uptodate(trans); > return 0; > } > @@ -5796,6 +6065,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, > int num_to_del = 1; > u32 item_size; > u64 refs; > + int last_ref = 0; > + enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL; > bool skinny_metadata = btrfs_fs_incompat(root->fs_info, > SKINNY_METADATA); > > @@ -5846,7 +6117,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, > BUG_ON(iref); > ret = remove_extent_backref(trans, extent_root, path, > NULL, refs_to_drop, > - is_data); > + is_data, &last_ref); > if (ret) { > btrfs_abort_transaction(trans, 
extent_root, ret); > goto out; > @@ -5970,6 +6241,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, > refs -= refs_to_drop; > > if (refs > 0) { > + type = BTRFS_QGROUP_OPER_SUB_SHARED; > if (extent_op) > __run_delayed_extent_op(extent_op, leaf, ei); > /* > @@ -5985,7 +6257,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, > if (found_extent) { > ret = remove_extent_backref(trans, extent_root, path, > iref, refs_to_drop, > - is_data); > + is_data, &last_ref); > if (ret) { > btrfs_abort_transaction(trans, extent_root, ret); > goto out; > @@ -6006,6 +6278,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, > } > } > > + last_ref = 1; > ret = btrfs_del_items(trans, extent_root, path, path->slots[0], > num_to_del); > if (ret) { > @@ -6028,6 +6301,35 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, > goto out; > } > } > + btrfs_release_path(path); > + > + /* Deal with the quota accounting */ > + if (!ret && last_ref && info->quota_enabled && > + is_fstree(root_objectid)) { > + int shared_refs = 0; > + > + ret = btrfs_qgroup_record_ref(trans, info, root_objectid, > + bytenr, num_bytes, parent, type); > + /* > + * If we deleted the extent item then we can just bail without > + * having to look for more extents. > + */ > + if (ret < 0 || type == BTRFS_QGROUP_OPER_SUB_EXCL) > + goto out; > + path->leave_spinning = 0; > + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); > + if (ret < 0) > + goto out; > + ASSERT(ret == 0); > + ret = root_has_ref(trans, root, path, bytenr, root_objectid, 1, > + 0, 0, &shared_refs); > + if (ret < 0) > + goto out; > + if (ret > 0) > + btrfs_remove_last_qgroup_operation(trans, info, > + root_objectid); > + ret = 0; > + } > out: > btrfs_free_path(path); > return ret; > @@ -6899,6 +7201,13 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, > btrfs_mark_buffer_dirty(path->nodes[0]); > btrfs_free_path(path); > > + /* Always set parent to 0 here since its exclusive anyway. 
*/ > + ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, > + ins->objectid, ins->offset, 0, > + BTRFS_QGROUP_OPER_ADD_EXCL); > + if (ret) > + return ret; > + > ret = update_block_group(root, ins->objectid, ins->offset, 1); > if (ret) { /* -ENOENT, logic error */ > btrfs_err(fs_info, "update block group failed for %llu %llu", > @@ -6923,6 +7232,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, > struct btrfs_path *path; > struct extent_buffer *leaf; > u32 size = sizeof(*extent_item) + sizeof(*iref); > + u64 num_bytes = ins->offset; > bool skinny_metadata = btrfs_fs_incompat(root->fs_info, > SKINNY_METADATA); > > @@ -6956,6 +7266,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, > > if (skinny_metadata) { > iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); > + num_bytes = root->leafsize; > } else { > block_info = (struct btrfs_tree_block_info *)(extent_item + 1); > btrfs_set_tree_block_key(leaf, block_info, key); > @@ -6977,6 +7288,12 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, > btrfs_mark_buffer_dirty(leaf); > btrfs_free_path(path); > > + ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, > + ins->objectid, num_bytes, 0, > + BTRFS_QGROUP_OPER_ADD_EXCL); > + if (ret) > + return ret; > + > ret = update_block_group(root, ins->objectid, root->leafsize, 1); > if (ret) { /* -ENOENT, logic error */ > btrfs_err(fs_info, "update block group failed for %llu %llu", > diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c > index bd0b058..1ecc528 100644 > --- a/fs/btrfs/qgroup.c > +++ b/fs/btrfs/qgroup.c > @@ -84,8 +84,8 @@ struct btrfs_qgroup { > /* > * temp variables for accounting operations > */ > - u64 tag; > - u64 refcnt; > + u64 old_refcnt; > + u64 new_refcnt; > }; > > /* > @@ -98,6 +98,23 @@ struct btrfs_qgroup_list { > struct btrfs_qgroup *member; > }; > > +struct qgroup_shared_ref { > + u64 parent; > + u64 ref_root; > + struct list_head list; > +}; > + > +struct btrfs_qgroup_operation { > + u64 ref_root; > + u64 bytenr; > + u64 num_bytes; > + u64 seq; > + enum btrfs_qgroup_operation_type type; > + struct rb_node n; > + struct list_head list; > + struct list_head shared_refs; > +}; > + > static int > qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, > int init_flags); > @@ -1174,33 +1191,322 @@ out: > mutex_unlock(&fs_info->qgroup_ioctl_lock); > return ret; > } > +static int comp_oper(struct btrfs_qgroup_operation *oper1, > + struct btrfs_qgroup_operation *oper2) > +{ > + if (oper1->bytenr < oper2->bytenr) > + return -1; > + if (oper1->bytenr > oper2->bytenr) > + return 1; > + if (oper1->seq < oper2->seq) > + return -1; > + if (oper1->seq > oper2->seq) > + return -1; > + if (oper1->ref_root < oper2->ref_root) > + return -1; > + if (oper1->ref_root > oper2->ref_root) > + return 1; > + if (oper1->type < oper2->type) > + return -1; > + if (oper1->type > oper2->type) > + return 1; > + return 0; > +} > + > +/* Looks up the first operation for the given bytenr. 
*/ > +static struct btrfs_qgroup_operation * > +lookup_first_oper(struct btrfs_fs_info *fs_info, u64 bytenr) > +{ > + struct rb_node *n; > + struct btrfs_qgroup_operation *oper, *prev = NULL; > + > + spin_lock(&fs_info->qgroup_op_lock); > + n = fs_info->qgroup_op_tree.rb_node; > + while (n) { > + oper = rb_entry(n, struct btrfs_qgroup_operation, n); > + if (oper->bytenr < bytenr) { > + n = n->rb_right; > + } else if (oper->bytenr > bytenr) { > + n = n->rb_left; > + } else { > + prev = oper; > + n = n->rb_left; > + } > + } > + spin_unlock(&fs_info->qgroup_op_lock); > + return prev; > +} > + > +static int insert_qgroup_oper(struct btrfs_fs_info *fs_info, > + struct btrfs_qgroup_operation *oper) > +{ > + struct rb_node **p; > + struct rb_node *parent = NULL; > + struct btrfs_qgroup_operation *cur; > + int cmp; > + > + spin_lock(&fs_info->qgroup_op_lock); > + p = &fs_info->qgroup_op_tree.rb_node; > + while (*p) { > + parent = *p; > + cur = rb_entry(parent, struct btrfs_qgroup_operation, n); > + cmp = comp_oper(cur, oper); > + if (cmp < 0) { > + p = &(*p)->rb_right; > + } else if (cmp) { > + p = &(*p)->rb_left; > + } else { > + spin_unlock(&fs_info->qgroup_op_lock); > + return -EEXIST; > + } > + } > + rb_link_node(&oper->n, parent, p); > + rb_insert_color(&oper->n, &fs_info->qgroup_op_tree); > + spin_unlock(&fs_info->qgroup_op_lock); > + return 0; > +} > > /* > - * btrfs_qgroup_record_ref is called when the ref is added or deleted. it puts > - * the modification into a list that's later used by btrfs_end_transaction to > - * pass the recorded modifications on to btrfs_qgroup_account_ref. > + * Record a quota operation for processing later on. > + * @trans: the transaction we are adding the delayed op to. > + * @fs_info: the fs_info for this fs. > + * @ref_root: the root of the reference we are acting on, > + * @num_bytes: the number of bytes in the reference. > + * @parent: if we are removing a shared ref then this will be set. > + * @type: the type of operation this is. > + * > + * We just add it to our trans qgroup_ref_list and carry on and process these > + * operations in order at some later point. If the reference root isn't a fs > + * root then we don't bother with doing anything. > + * > + * MUST BE HOLDING THE REF LOCK. 
> */ > int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, > - struct btrfs_delayed_ref_node *node, > - struct btrfs_delayed_extent_op *extent_op) > + struct btrfs_fs_info *fs_info, u64 ref_root, > + u64 bytenr, u64 num_bytes, u64 parent, > + enum btrfs_qgroup_operation_type type) > { > - struct qgroup_update *u; > + struct btrfs_qgroup_operation *oper; > + int ret; > > - BUG_ON(!trans->delayed_ref_elem.seq); > - u = kmalloc(sizeof(*u), GFP_NOFS); > - if (!u) > + if (!is_fstree(ref_root) || !fs_info->quota_enabled) > + return 0; > + > + oper = kmalloc(sizeof(*oper), GFP_NOFS); > + if (!oper) > return -ENOMEM; > > - u->node = node; > - u->extent_op = extent_op; > - list_add_tail(&u->list, &trans->qgroup_ref_list); > + oper->ref_root = ref_root; > + oper->bytenr = bytenr; > + oper->num_bytes = num_bytes; > + oper->type = type; > + oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq); > + INIT_LIST_HEAD(&oper->shared_refs); > + ret = insert_qgroup_oper(fs_info, oper); > + if (ret) { > + /* Shouldn't happen so have an assert for developers */ > + ASSERT(0); > + kfree(oper); > + return ret; > + } > + list_add_tail(&oper->list, &trans->qgroup_ref_list); > > + /* > + * If we are removing a shared extent ref we could possibly have another > + * operation outstanding that needs to lookup this shared ref, so we > + * need to go and look if there exists such a person and update their > + * shared ref dependancy with the root ref so it knows if it needs to do > + * anything. We are relying on the ref lock to protect the actual > + * operation here. > + */ > + if (unlikely(parent) && (type == BTRFS_QGROUP_OPER_SUB_EXCL || > + type == BTRFS_QGROUP_OPER_SUB_SHARED)) { > + u64 seq = oper->seq; > + struct qgroup_shared_ref *ref; > + struct rb_node *n; > + > + oper = lookup_first_oper(fs_info, bytenr); > + if (!oper) > + return 0; > + do { > + list_for_each_entry(ref, &oper->shared_refs, list) { > + if (ref->parent == parent) > + ref->ref_root = ref_root; > + } > + spin_lock(&fs_info->qgroup_op_lock); > + n = rb_next(&oper->n); > + if (n) { > + oper = rb_entry(n, > + struct btrfs_qgroup_operation, > + n); > + if (oper->bytenr != bytenr || oper->seq >= seq) > + n = NULL; > + } > + spin_unlock(&fs_info->qgroup_op_lock); > + } while (n); > + } > return 0; > } > > -static int qgroup_account_ref_step1(struct btrfs_fs_info *fs_info, > - struct ulist *roots, struct ulist *tmp, > - u64 seq) > +/* > + * Remove the last qgroup operation that was added. > + * @trans: the trans handle. > + * @fs_info: the fs_info for this file system. > + * @ref_root: the root for the reference. > + * > + * Sometimes we may pre-emptively add an operation only to find after further > + * investigation that we no longer need it, so this will pop the last guy off > + * the list and free him up. 
> + */ > +void btrfs_remove_last_qgroup_operation(struct btrfs_trans_handle *trans, > + struct btrfs_fs_info *fs_info, > + u64 ref_root) > +{ > + struct btrfs_qgroup_operation *oper; > + > + if (!is_fstree(ref_root)) > + return; > + > + oper = list_entry(trans->qgroup_ref_list.prev, > + struct btrfs_qgroup_operation, list); > + ASSERT(oper->ref_root == ref_root); > + list_del_init(&oper->list); > + while (!list_empty(&oper->shared_refs)) { > + struct qgroup_shared_ref *ref; > + ref = list_first_entry(&oper->shared_refs, > + struct qgroup_shared_ref, list); > + list_del_init(&ref->list); > + kfree(ref); > + } > + spin_lock(&fs_info->qgroup_op_lock); > + rb_erase(&oper->n, &fs_info->qgroup_op_tree); > + spin_unlock(&fs_info->qgroup_op_lock); > + kfree(oper); > +} > + > +/* > + * Record a shared ref in the most recent qgroup operation added. > + * @trans: the trans handle for this operation. > + * @ref_root: the ref_root this operation was for, this is to make sure we > + * actually need to do anything at all. Use the same ref_root we called > + * btrfs_qgroup_record_ref with. > + * @parent: the parent of the shared ref. > + * > + * This is meant to be called directly after calling btrfs_qgroup_record_ref > + * for the ref we care about, it does no searching to make sure we're on the > + * right qgroup operation. > + * > + * MUST BE HOLDING THE REF LOCK. > + */ > +int btrfs_qgroup_add_shared_ref(struct btrfs_trans_handle *trans, u64 ref_root, > + u64 parent) > +{ > + struct qgroup_shared_ref *ref; > + struct btrfs_qgroup_operation *oper; > + int ret = 0; > + > + if (!is_fstree(ref_root)) > + return 0; > + ref = kmalloc(sizeof(*ref), GFP_NOFS); > + if (!ref) > + return -ENOMEM; > + ref->parent = parent; > + ref->ref_root = 0; > + ASSERT(!list_empty(&trans->qgroup_ref_list)); > + oper = list_entry(trans->qgroup_ref_list.prev, > + struct btrfs_qgroup_operation, list); > + list_add_tail(&ref->list, &oper->shared_refs); > + return ret; > +} > + > +/* > + * The easy accounting, if we are adding/removing the only ref for an extent > + * then this qgroup and all of the parent qgroups get their refrence and > + * exclusive counts adjusted. 
> + */ > +static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info, > + struct btrfs_qgroup_operation *oper) > +{ > + struct btrfs_qgroup *qgroup; > + struct ulist *tmp; > + struct btrfs_qgroup_list *glist; > + struct ulist_node *unode; > + struct ulist_iterator uiter; > + int sign = 0; > + int ret = 0; > + > + tmp = ulist_alloc(GFP_NOFS); > + if (!tmp) > + return -ENOMEM; > + > + spin_lock(&fs_info->qgroup_lock); > + if (!fs_info->quota_root) > + goto out; > + qgroup = find_qgroup_rb(fs_info, oper->ref_root); > + if (!qgroup) > + goto out; > + switch (oper->type) { > + case BTRFS_QGROUP_OPER_ADD_EXCL: > + sign = 1; > + break; > + case BTRFS_QGROUP_OPER_SUB_EXCL: > + sign = -1; > + break; > + default: > + ASSERT(0); > + } > + qgroup->rfer += sign * oper->num_bytes; > + qgroup->rfer_cmpr += sign * oper->num_bytes; > + > + WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes); > + qgroup->excl += sign * oper->num_bytes; > + qgroup->excl_cmpr += sign * oper->num_bytes; > + > + qgroup_dirty(fs_info, qgroup); > + > + /* Get all of the parent groups that contain this qgroup */ > + list_for_each_entry(glist, &qgroup->groups, next_group) { > + ret = ulist_add(tmp, glist->group->qgroupid, (u64)glist->group, > + GFP_ATOMIC); > + if (ret < 0) > + goto out; > + } > + > + /* Iterate all of the parents and adjust their reference counts */ > + ULIST_ITER_INIT(&uiter); > + while ((unode = ulist_next(tmp, &uiter))) { > + qgroup = (struct btrfs_qgroup *)unode->aux; > + qgroup->rfer += sign * oper->num_bytes; > + qgroup->rfer_cmpr += sign * oper->num_bytes; > + qgroup->excl += sign * oper->num_bytes; > + if (sign < 0) > + WARN_ON(qgroup->excl < oper->num_bytes); > + qgroup->excl_cmpr += sign * oper->num_bytes; > + qgroup_dirty(fs_info, qgroup); > + > + /* Add any parents of the parents */ > + list_for_each_entry(glist, &qgroup->groups, next_group) { > + ret = ulist_add(tmp, glist->group->qgroupid, > + (u64)glist->group, GFP_ATOMIC); > + if (ret < 0) > + goto out; > + } > + } > + ret = 0; > +out: > + spin_unlock(&fs_info->qgroup_lock); > + ulist_free(tmp); > + return ret; > +} > + > +/* > + * Walk all of the roots that pointed to our bytenr and adjust their refcnts as > + * properly. > + */ > +static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info, > + u64 root_to_skip, struct ulist *roots, > + struct ulist *qgroups, u64 seq, > + int *old_roots, int rescan) > { > struct ulist_node *unode; > struct ulist_iterator uiter; > @@ -1211,256 +1517,557 @@ static int qgroup_account_ref_step1(struct btrfs_fs_info *fs_info, > > ULIST_ITER_INIT(&uiter); > while ((unode = ulist_next(roots, &uiter))) { > + /* We don't count our current root here */ > + if (unode->val == root_to_skip) > + continue; > qg = find_qgroup_rb(fs_info, unode->val); > if (!qg) > continue; > + /* > + * We could have a pending removal of this same ref so we may > + * not have actually found our ref root when doing > + * btrfs_find_all_roots, so we need to keep track of how many > + * old roots we find in case we removed ours and added a > + * different one at the same time. I don't think this could > + * happen in practice but that sort of thinking leads to pain > + * and suffering and to the dark side. 
> + */ > + (*old_roots)++; > > - ulist_reinit(tmp); > - /* XXX id not needed */ > - ret = ulist_add(tmp, qg->qgroupid, > - (u64)(uintptr_t)qg, GFP_ATOMIC); > + ulist_reinit(qgroups); > + ret = ulist_add(qgroups, qg->qgroupid, (u64)qg, GFP_ATOMIC); > if (ret < 0) > return ret; > ULIST_ITER_INIT(&tmp_uiter); > - while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { > + while ((tmp_unode = ulist_next(qgroups, &tmp_uiter))) { > struct btrfs_qgroup_list *glist; > > - qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux; > - if (qg->refcnt < seq) > - qg->refcnt = seq + 1; > + qg = (struct btrfs_qgroup *)tmp_unode->aux; > + /* > + * We use this sequence number to keep from having to > + * run the whole list and 0 out the refcnt every time. > + * We basically use sequnce as the known 0 count and > + * then add 1 everytime we see a qgroup. This is how we > + * get how many of the roots actually point up to the > + * upper level qgroups in order to determine exclusive > + * counts. > + * > + * For rescan we want to set old_refcnt to seq so our > + * exclusive calculations end up correct. > + */ > + if (rescan) > + qg->old_refcnt = seq; > + else if (qg->old_refcnt < seq) > + qg->old_refcnt = seq + 1; > else > - ++qg->refcnt; > + qg->old_refcnt++; > > + if (qg->new_refcnt < seq) > + qg->new_refcnt = seq + 1; > + else > + qg->new_refcnt++; > list_for_each_entry(glist, &qg->groups, next_group) { > - ret = ulist_add(tmp, glist->group->qgroupid, > - (u64)(uintptr_t)glist->group, > - GFP_ATOMIC); > + ret = ulist_add(qgroups, glist->group->qgroupid, > + (u64)glist->group, GFP_ATOMIC); > if (ret < 0) > return ret; > } > } > } > + return 0; > +} > + > +/* > + * We need to walk forward in our operation tree and account for any roots that > + * were deleted after we made this operation. > + */ > +static int qgroup_account_deleted_refs(struct btrfs_fs_info *fs_info, > + struct btrfs_qgroup_operation *oper, > + struct ulist *qgroups, u64 seq, > + int *old_roots) > +{ > + struct ulist_node *unode; > + struct ulist_iterator uiter; > + struct btrfs_qgroup *qg; > + struct btrfs_qgroup_operation *tmp; > + struct rb_node *n; > + int ret; > + > + ulist_reinit(qgroups); > + > + /* > + * We only walk forward in the tree since we're only interested in > + * removals that happened _after_ our operation. > + */ > + spin_lock(&fs_info->qgroup_op_lock); > + n = rb_next(&oper->n); > + spin_unlock(&fs_info->qgroup_op_lock); > + if (!n) > + return 0; > + tmp = rb_entry(n, struct btrfs_qgroup_operation, n); > + while (tmp->bytenr == oper->bytenr) { > + /* > + * If it's not a removal we don't care, additions work out > + * properly with our refcnt tracking. 
> + */ > + if (tmp->type != BTRFS_QGROUP_OPER_SUB_SHARED && > + tmp->type != BTRFS_QGROUP_OPER_SUB_EXCL) > + goto next; > + qg = find_qgroup_rb(fs_info, oper->ref_root); > + if (!qg) > + goto next; > + ret = ulist_add(qgroups, qg->qgroupid, (u64)qg, GFP_ATOMIC); > + if (ret) > + return ret; > + (*old_roots)++; > +next: > + spin_lock(&fs_info->qgroup_op_lock); > + n = rb_next(&tmp->n); > + spin_unlock(&fs_info->qgroup_op_lock); > + if (!n) > + break; > + tmp = rb_entry(n, struct btrfs_qgroup_operation, n); > + } > + > + /* Ok now process the qgroups we found */ > + ULIST_ITER_INIT(&uiter); > + while ((unode = ulist_next(qgroups, &uiter))) { > + struct btrfs_qgroup_list *glist; > > + qg = (struct btrfs_qgroup *)unode->aux; > + if (qg->old_refcnt < seq) > + qg->old_refcnt = seq + 1; > + else > + qg->old_refcnt++; > + if (qg->new_refcnt < seq) > + qg->new_refcnt = seq + 1; > + else > + qg->new_refcnt++; > + list_for_each_entry(glist, &qg->groups, next_group) { > + ret = ulist_add(qgroups, glist->group->qgroupid, > + (u64)glist->group, GFP_ATOMIC); > + if (ret < 0) > + return ret; > + } > + } > return 0; > } > > -static int qgroup_account_ref_step2(struct btrfs_fs_info *fs_info, > - struct ulist *roots, struct ulist *tmp, > - u64 seq, int sgn, u64 num_bytes, > - struct btrfs_qgroup *qgroup) > +/* Add refcnt for the newly added reference. */ > +static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info, > + struct btrfs_qgroup_operation *oper, > + struct btrfs_qgroup *qgroup, > + struct ulist *roots, struct ulist *qgroups, > + u64 seq) > { > struct ulist_node *unode; > struct ulist_iterator uiter; > struct btrfs_qgroup *qg; > - struct btrfs_qgroup_list *glist; > int ret; > > - ulist_reinit(tmp); > - ret = ulist_add(tmp, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC); > + ulist_reinit(qgroups); > + ret = ulist_add(qgroups, qgroup->qgroupid, (u64)qgroup, GFP_ATOMIC); > if (ret < 0) > return ret; > - > ULIST_ITER_INIT(&uiter); > - while ((unode = ulist_next(tmp, &uiter))) { > - qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; > - if (qg->refcnt < seq) { > - /* not visited by step 1 */ > - qg->rfer += sgn * num_bytes; > - qg->rfer_cmpr += sgn * num_bytes; > - if (roots->nnodes == 0) { > - qg->excl += sgn * num_bytes; > - qg->excl_cmpr += sgn * num_bytes; > - } > - qgroup_dirty(fs_info, qg); > - } > - WARN_ON(qg->tag >= seq); > - qg->tag = seq; > + while ((unode = ulist_next(qgroups, &uiter))) { > + struct btrfs_qgroup_list *glist; > > + qg = (struct btrfs_qgroup *)unode->aux; > + if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) { > + if (qg->new_refcnt < seq) > + qg->new_refcnt = seq + 1; > + else > + qg->new_refcnt++; > + } else { > + if (qg->old_refcnt < seq) > + qg->old_refcnt = seq + 1; > + else > + qg->old_refcnt++; > + } > list_for_each_entry(glist, &qg->groups, next_group) { > - ret = ulist_add(tmp, glist->group->qgroupid, > - (uintptr_t)glist->group, GFP_ATOMIC); > + ret = ulist_add(qgroups, glist->group->qgroupid, > + (u64)glist->group, GFP_ATOMIC); > if (ret < 0) > return ret; > } > } > - > return 0; > } > > -static int qgroup_account_ref_step3(struct btrfs_fs_info *fs_info, > - struct ulist *roots, struct ulist *tmp, > - u64 seq, int sgn, u64 num_bytes) > +/* > + * This adjusts the counters for all referenced qgroups if need be. 
> + */ > +static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info, > + u64 root_to_skip, u64 num_bytes, > + struct ulist *roots, struct ulist *qgroups, > + u64 seq, int old_roots, int new_roots, int rescan) > { > struct ulist_node *unode; > struct ulist_iterator uiter; > struct btrfs_qgroup *qg; > - struct ulist_node *tmp_unode; > - struct ulist_iterator tmp_uiter; > + u64 cur_new_count, cur_old_count; > int ret; > > ULIST_ITER_INIT(&uiter); > while ((unode = ulist_next(roots, &uiter))) { > + struct btrfs_qgroup_list *glist; > + > + if (unode->val == root_to_skip) > + continue; > qg = find_qgroup_rb(fs_info, unode->val); > if (!qg) > continue; > > - ulist_reinit(tmp); > - ret = ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, GFP_ATOMIC); > + ret = ulist_add(qgroups, qg->qgroupid, (u64)qg, GFP_ATOMIC); > if (ret < 0) > return ret; > + list_for_each_entry(glist, &qg->groups, next_group) { > + ret = ulist_add(qgroups, glist->group->qgroupid, > + (u64)glist->group, GFP_ATOMIC); > + if (ret < 0) > + return ret; > + } > + } > > - ULIST_ITER_INIT(&tmp_uiter); > - while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { > - struct btrfs_qgroup_list *glist; > + ULIST_ITER_INIT(&uiter); > + while ((unode = ulist_next(qgroups, &uiter))) { > + bool dirty = false; > > - qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux; > - if (qg->tag == seq) > - continue; > + qg = (struct btrfs_qgroup *)unode->aux; > + /* > + * Wasn't referenced before but is now, add to the reference > + * counters. > + */ > + if (qg->old_refcnt <= seq && qg->new_refcnt > seq) { > + qg->rfer += num_bytes; > + qg->rfer_cmpr += num_bytes; > + dirty = true; > + } > > - if (qg->refcnt - seq == roots->nnodes) { > - qg->excl -= sgn * num_bytes; > - qg->excl_cmpr -= sgn * num_bytes; > - qgroup_dirty(fs_info, qg); > - } > + /* > + * Was referenced before but isn't now, subtract from the > + * reference counters. > + */ > + if (qg->old_refcnt > seq && qg->new_refcnt <= seq) { > + qg->rfer -= num_bytes; > + qg->rfer_cmpr -= num_bytes; > + dirty = true; > + } > > - list_for_each_entry(glist, &qg->groups, next_group) { > - ret = ulist_add(tmp, glist->group->qgroupid, > - (uintptr_t)glist->group, > - GFP_ATOMIC); > - if (ret < 0) > - return ret; > - } > + cur_old_count = qg->old_refcnt - seq; > + cur_new_count = qg->new_refcnt - seq; > + > + /* > + * If our refcount was the same as the roots previously but our > + * new count isn't the same as the number of roots now then we > + * went from having a exclusive reference on this range to not. > + */ > + if (old_roots && cur_old_count == old_roots && > + cur_new_count != new_roots) { > + qg->excl -= num_bytes; > + qg->excl_cmpr -= num_bytes; > + dirty = true; > } > - } > > + /* > + * If we didn't reference all the roots before but now we do we > + * have an exclusive reference to this range. > + */ > + if ((!old_roots || (old_roots && cur_old_count != old_roots)) > + && cur_new_count == new_roots) { > + qg->excl += num_bytes; > + qg->excl_cmpr += num_bytes; > + dirty = true; > + } > + > + if (dirty) > + qgroup_dirty(fs_info, qg); > + } > return 0; > } > > /* > - * btrfs_qgroup_account_ref is called for every ref that is added to or deleted > - * from the fs. First, all roots referencing the extent are searched, and > - * then the space is accounted accordingly to the different roots. The > - * accounting algorithm works in 3 steps documented inline. 
> + * If we added or removed a reference for a root and there were shared refs > + * remaining then we need to go through and see if any of those refs lead back > + * to our same root and if so we don't need to do anything with this operation. > */ > -int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, > - struct btrfs_fs_info *fs_info, > - struct btrfs_delayed_ref_node *node, > - struct btrfs_delayed_extent_op *extent_op) > +static int check_shared_refs(struct btrfs_fs_info *fs_info, > + struct btrfs_qgroup_operation *oper) > { > - struct btrfs_root *quota_root; > - u64 ref_root; > - struct btrfs_qgroup *qgroup; > struct ulist *roots = NULL; > - u64 seq; > + struct ulist_node *unode; > + struct ulist_iterator uiter; > + struct qgroup_shared_ref *ref; > int ret = 0; > - int sgn; > > - if (!fs_info->quota_enabled) > - return 0; > + /* Now process all of our shared ref dependancies */ > + while (!list_empty(&oper->shared_refs)) { > + ref = list_first_entry(&oper->shared_refs, > + struct qgroup_shared_ref, list); > + if (ret) { > + list_del_init(&ref->list); > + kfree(ref); > + continue; > + } > + if (ref->ref_root) { > + if (ref->ref_root == oper->ref_root) > + ret = 1; > + goto next; > + } > > - BUG_ON(!fs_info->quota_root); > + ret = btrfs_find_all_roots(NULL, fs_info, ref->parent, 0, > + &roots, 0); > + if (ret < 0) > + goto next; > > - if (node->type == BTRFS_TREE_BLOCK_REF_KEY || > - node->type == BTRFS_SHARED_BLOCK_REF_KEY) { > - struct btrfs_delayed_tree_ref *ref; > - ref = btrfs_delayed_node_to_tree_ref(node); > - ref_root = ref->root; > - } else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || > - node->type == BTRFS_SHARED_DATA_REF_KEY) { > - struct btrfs_delayed_data_ref *ref; > - ref = btrfs_delayed_node_to_data_ref(node); > - ref_root = ref->root; > - } else { > - BUG(); > + ULIST_ITER_INIT(&uiter); > + while ((unode = ulist_next(roots, &uiter))) { > + if (unode->val == oper->ref_root) { > + ret = 1; > + break; > + } > + } > + ulist_free(roots); > + roots = NULL; > +next: > + list_del_init(&ref->list); > + kfree(ref); > } > > - if (!is_fstree(ref_root)) { > - /* > - * non-fs-trees are not being accounted > - */ > - return 0; > - } > + return ret; > +} > > - switch (node->action) { > - case BTRFS_ADD_DELAYED_REF: > - case BTRFS_ADD_DELAYED_EXTENT: > - sgn = 1; > - seq = btrfs_tree_mod_seq_prev(node->seq); > - break; > - case BTRFS_DROP_DELAYED_REF: > - sgn = -1; > - seq = node->seq; > - break; > - case BTRFS_UPDATE_DELAYED_HEAD: > - return 0; > - default: > - BUG(); > - } > +/* > + * If we share a reference across multiple roots then we may need to adjust > + * various qgroups referenced and exclusive counters. The basic premise is this > + * > + * 1) We have seq to represent a 0 count. Instead of looping through all of the > + * qgroups and resetting their refcount to 0 we just constantly bump this > + * sequence number to act as the base reference count. This means that if > + * anybody is equal to or below this sequence they were never referenced. We > + * jack this sequence up by the number of roots we found each time in order to > + * make sure we don't have any overlap. > + * > + * 2) We first search all the roots that reference the area _except_ the root > + * we're acting on currently. This makes up the old_refcnt of all the qgroups > + * before. > + * > + * 3) We walk all of the qgroups referenced by the root we are currently acting > + * on, and will either adjust old_refcnt in the case of a removal or the > + * new_refcnt in the case of an addition. 
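Point 1) is the part that took me a moment, so here is how I read it, as a tiny self-contained demo (numbers and names are mine): anything at or below the current seq counts as zero, and the first bump from zero jumps straight to seq + 1, so stale values left over from earlier rounds never need to be reset:

    #include <assert.h>
    #include <stdint.h>

    /* Same shape as the old_refcnt/new_refcnt updates in the new helpers. */
    static void bump(uint64_t *refcnt, uint64_t seq)
    {
            if (*refcnt < seq)
                    *refcnt = seq + 1;      /* first reference since seq moved */
            else
                    (*refcnt)++;
    }

    static uint64_t effective(uint64_t refcnt, uint64_t seq)
    {
            return refcnt <= seq ? 0 : refcnt - seq;
    }

    int main(void)
    {
            uint64_t seq = 100;
            uint64_t rc = 7;                /* stale count from a previous round */

            assert(effective(rc, seq) == 0);
            bump(&rc, seq);                 /* 7 -> 101 */
            bump(&rc, seq);                 /* 101 -> 102 */
            assert(effective(rc, seq) == 2);
            return 0;
    }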
> + * > + * 4) Finally we walk all the qgroups that are referenced by this range > + * including the root we are acting on currently. We will adjust the counters > + * based on the number of roots we had and will have after this operation. > + * > + * Take this example as an illustration > + * > + * [qgroup 1/0] > + * / | \ > + * [qg 0/0] [qg 0/1] [qg 0/2] > + * \ | / > + * [ extent ] > + * > + * Say we are adding a reference that is covered by qg 0/0. The first step > + * would give a refcnt of 1 to qg 0/1 and 0/2 and a refcnt of 2 to qg 1/0 with > + * old_roots being 2. Because it is adding new_roots will be 1. We then go > + * through qg 0/0 which will get the new_refcnt set to 1 and add 1 to qg 1/0's > + * new_refcnt, bringing it to 3. We then walk through all of the qgroups, we > + * notice that the old refcnt for qg 0/0 < the new refcnt, so we added a > + * reference and thus must add the size to the referenced bytes. Everything > + * else is the same so nothing else changes. > + */ > +static int qgroup_shared_accounting(struct btrfs_fs_info *fs_info, > + struct btrfs_qgroup_operation *oper) > +{ > + struct ulist *roots = NULL; > + struct ulist *qgroups; > + struct btrfs_qgroup *qgroup; > + u64 seq; > + int old_roots = 0; > + int new_roots = 0; > + int ret = 0; > > - mutex_lock(&fs_info->qgroup_rescan_lock); > - if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { > - if (fs_info->qgroup_rescan_progress.objectid <= node->bytenr) { > - mutex_unlock(&fs_info->qgroup_rescan_lock); > + if (unlikely(!list_empty(&oper->shared_refs))) { > + ret = check_shared_refs(fs_info, oper); > + if (ret < 0) > + return ret; > + if (ret) > return 0; > - } > } > - mutex_unlock(&fs_info->qgroup_rescan_lock); > > - /* > - * the delayed ref sequence number we pass depends on the direction of > - * the operation. for add operations, we pass > - * tree_mod_log_prev_seq(node->seq) to skip > - * the delayed ref's current sequence number, because we need the state > - * of the tree before the add operation. for delete operations, we pass > - * (node->seq) to include the delayed ref's current sequence number, > - * because we need the state of the tree after the delete operation. > - */ > - ret = btrfs_find_all_roots(trans, fs_info, node->bytenr, seq, &roots); > - if (ret < 0) > - return ret; > + qgroups = ulist_alloc(GFP_NOFS); > + if (!qgroups) > + return -ENOMEM; > > + ret = btrfs_find_all_roots(NULL, fs_info, oper->bytenr, 0, &roots, 0); > + if (ret < 0) { > + ulist_free(qgroups); > + return ret; > + } > spin_lock(&fs_info->qgroup_lock); > - > - quota_root = fs_info->quota_root; > - if (!quota_root) > - goto unlock; > - > - qgroup = find_qgroup_rb(fs_info, ref_root); > + qgroup = find_qgroup_rb(fs_info, oper->ref_root); > if (!qgroup) > - goto unlock; > + goto out; > + seq = fs_info->qgroup_seq; > > /* > - * step 1: for each old ref, visit all nodes once and inc refcnt > + * So roots is the list of all the roots currently pointing at the > + * bytenr, including the ref we are adding if we are adding, or not if > + * we are removing a ref. So we pass in the ref_root to skip that root > + * in our calculations. We set old_refnct and new_refcnt cause who the > + * hell knows what everything looked like before, and it doesn't matter > + * except... 
> */ > - ulist_reinit(fs_info->qgroup_ulist); > - seq = fs_info->qgroup_seq; > - fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ > + ret = qgroup_calc_old_refcnt(fs_info, oper->ref_root, roots, qgroups, > + seq, &old_roots, 0); > + if (ret < 0) > + goto out; > > - ret = qgroup_account_ref_step1(fs_info, roots, fs_info->qgroup_ulist, > - seq); > - if (ret) > - goto unlock; > + /* > + * ...in the case of removals. If we had a removal before we got around > + * to processing this operation then we need to find that guy and count > + * his references as if they really existed so we don't end up screwing > + * up the exclusive counts. Then whenever we go to process the delete > + * everything will be grand and we can account for whatever exclusive > + * changes need to be made there. We also have to pass in old_roots so > + * we have an accurate count of the roots as it pertains to this > + * operations view of the world. > + */ > + ret = qgroup_account_deleted_refs(fs_info, oper, qgroups, seq, > + &old_roots); > + if (ret < 0) > + goto out; > > /* > - * step 2: walk from the new root > + * We are adding our root, need to adjust up the number of roots, > + * otherwise old_roots is the number of roots we want. > */ > - ret = qgroup_account_ref_step2(fs_info, roots, fs_info->qgroup_ulist, > - seq, sgn, node->num_bytes, qgroup); > - if (ret) > - goto unlock; > + if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) { > + new_roots = old_roots + 1; > + } else { > + new_roots = old_roots; > + old_roots++; > + } > + fs_info->qgroup_seq += old_roots + 1; > > /* > - * step 3: walk again from old refs > + * Now adjust the refcounts of the qgroups that care about this > + * reference, either the old_count in the case of removal or new_count > + * in the case of an addition. > */ > - ret = qgroup_account_ref_step3(fs_info, roots, fs_info->qgroup_ulist, > - seq, sgn, node->num_bytes); > - if (ret) > - goto unlock; > + ret = qgroup_calc_new_refcnt(fs_info, oper, qgroup, roots, qgroups, > + seq); > + if (ret < 0) > + goto out; > > -unlock: > + /* We are doing this in case we have a pending delete */ > + ulist_reinit(qgroups); > + ret = ulist_add(qgroups, qgroup->qgroupid, (u64)qgroup, GFP_ATOMIC); > + if (ret < 0) > + goto out; > + ret = 0; > + > + /* > + * And now the magic happens, bless Arne for having a pretty elegant > + * solution for this. > + */ > + qgroup_adjust_counters(fs_info, oper->ref_root, oper->num_bytes, > + roots, qgroups, seq, old_roots, new_roots, 0); > +out: > spin_unlock(&fs_info->qgroup_lock); > + ulist_free(qgroups); > ulist_free(roots); > + return ret; > +} > + > +/* > + * btrfs_qgroup_account_ref is called for every ref that is added to or deleted > + * from the fs. First, all roots referencing the extent are searched, and > + * then the space is accounted accordingly to the different roots. The > + * accounting algorithm works in 3 steps documented inline. 
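I also ran the example from the comment above through a throwaway program to convince myself the end result is what the comment says. With seq normalized to 0: the old-refcnt walk covers 0/1 and 0/2 (our root 0/0 is skipped, so old_roots ends up 2) and leaves 0/1 = 0/2 = (old 1, new 1) and 1/0 = (2, 2); since this is an addition, the new-refcnt walk bumps new_refcnt only, giving 0/0 = (0, 1) and 1/0 = (2, 3), and new_roots = old_roots + 1 = 3. Feeding that into the counter adjustment gives referenced bytes to 0/0 and leaves everything else alone (values below are made up):

    #include <stdint.h>
    #include <stdio.h>

    struct qg { const char *name; uint64_t old_rc, new_rc; };

    int main(void)
    {
            const uint64_t seq = 0, size = 4096;
            const int old_roots = 2, new_roots = 3;
            /* refcounts after the old/new walks described in the comment */
            const struct qg qgs[] = {
                    { "0/0", 0, 1 },        /* the root we add the ref through */
                    { "0/1", 1, 1 },
                    { "0/2", 1, 1 },
                    { "1/0", 2, 3 },        /* parent of all three */
            };
            unsigned i;

            for (i = 0; i < sizeof(qgs) / sizeof(qgs[0]); i++) {
                    int64_t rfer = 0, excl = 0;
                    uint64_t cur_old = qgs[i].old_rc - seq;
                    uint64_t cur_new = qgs[i].new_rc - seq;

                    if (qgs[i].old_rc <= seq && qgs[i].new_rc > seq)
                            rfer += size;
                    if (qgs[i].old_rc > seq && qgs[i].new_rc <= seq)
                            rfer -= size;
                    if (old_roots && cur_old == (uint64_t)old_roots &&
                        cur_new != (uint64_t)new_roots)
                            excl -= size;
                    if ((!old_roots || cur_old != (uint64_t)old_roots) &&
                        cur_new == (uint64_t)new_roots)
                            excl += size;

                    printf("%s: rfer %+lld excl %+lld\n", qgs[i].name,
                           (long long)rfer, (long long)excl);
            }
            return 0;
    }

Only 0/0 prints a non-zero delta (+4096 referenced), which matches the walk-through in the comment, so the old_roots++ / new_roots = old_roots + 1 bookkeeping above looks right to me, and bumping fs_info->qgroup_seq by old_roots + 1 keeps the next operation's zero point above anything handed out here.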
> + */ > +static int btrfs_qgroup_account(struct btrfs_trans_handle *trans, > + struct btrfs_fs_info *fs_info, > + struct btrfs_qgroup_operation *oper) > +{ > + struct btrfs_block_group_cache *cache; > + struct extent_state *cached_state = NULL; > + int ret = 0; > + int err; > + > + if (!fs_info->quota_enabled) > + return 0; > + > + BUG_ON(!fs_info->quota_root); > + > + mutex_lock(&fs_info->qgroup_rescan_lock); > + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { > + if (fs_info->qgroup_rescan_progress.objectid <= oper->bytenr) { > + mutex_unlock(&fs_info->qgroup_rescan_lock); > + return 0; > + } > + } > + mutex_unlock(&fs_info->qgroup_rescan_lock); > + > + ASSERT(is_fstree(oper->ref_root)); > + > + ret = lock_ref(fs_info, oper->ref_root, oper->bytenr, oper->num_bytes, > + &cache, &cached_state); > + if (ret) > + return ret; > + > + switch (oper->type) { > + case BTRFS_QGROUP_OPER_ADD_EXCL: > + case BTRFS_QGROUP_OPER_SUB_EXCL: > + ret = qgroup_excl_accounting(fs_info, oper); > + break; > + case BTRFS_QGROUP_OPER_ADD_SHARED: > + case BTRFS_QGROUP_OPER_SUB_SHARED: > + ret = qgroup_shared_accounting(fs_info, oper); > + break; > + default: > + ASSERT(0); > + } > + err = unlock_ref(fs_info, oper->ref_root, oper->bytenr, > + oper->num_bytes, cache, &cached_state); > + return ret ? ret : err; > +} > + > +/* > + * Needs to be called everytime we run delayed refs, even if there is an error > + * in order to cleanup outstanding operations. > + */ > +int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans, > + struct btrfs_fs_info *fs_info) > +{ > + struct btrfs_qgroup_operation *oper; > + struct qgroup_shared_ref *ref; > + int ret = 0; > > + while (!list_empty(&trans->qgroup_ref_list)) { > + oper = list_first_entry(&trans->qgroup_ref_list, > + struct btrfs_qgroup_operation, list); > + list_del_init(&oper->list); > + if (!ret || !trans->aborted) > + ret = btrfs_qgroup_account(trans, fs_info, oper); > + /* > + * If there was an error we could have things still on our > + * shared refs list. > + */ > + while (!list_empty(&oper->shared_refs)) { > + ASSERT(ret || trans->aborted); > + ref = list_first_entry(&oper->shared_refs, > + struct qgroup_shared_ref, list); > + list_del_init(&ref->list); > + kfree(ref); > + } > + spin_lock(&fs_info->qgroup_op_lock); > + rb_erase(&oper->n, &fs_info->qgroup_op_tree); > + spin_unlock(&fs_info->qgroup_op_lock); > + kfree(oper); > + } > return ret; > } > > @@ -1629,8 +2236,16 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, > srcgroup = find_qgroup_rb(fs_info, srcid); > if (!srcgroup) > goto unlock; > - dstgroup->rfer = srcgroup->rfer - level_size; > - dstgroup->rfer_cmpr = srcgroup->rfer_cmpr - level_size; > + > + /* > + * We call inherit after we clone the root in order to make sure > + * our counts don't go crazy, so at this point the only > + * difference between the two roots should be the root node. 
> + */ > + dstgroup->rfer = srcgroup->rfer; > + dstgroup->rfer_cmpr = srcgroup->rfer_cmpr; > + dstgroup->excl = level_size; > + dstgroup->excl_cmpr = level_size; > srcgroup->excl = level_size; > srcgroup->excl_cmpr = level_size; > qgroup_dirty(fs_info, dstgroup); > @@ -1850,11 +2465,13 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, > struct extent_buffer *scratch_leaf) > { > struct btrfs_key found; > + struct btrfs_block_group_cache *cache; > + struct extent_state *cached_state = NULL; > struct ulist *roots = NULL; > - struct ulist_node *unode; > - struct ulist_iterator uiter; > struct seq_list tree_mod_seq_elem = {}; > + u64 num_bytes; > u64 seq; > + int new_roots; > int slot; > int ret; > > @@ -1896,78 +2513,68 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, > > for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) { > btrfs_item_key_to_cpu(scratch_leaf, &found, slot); > - if (found.type != BTRFS_EXTENT_ITEM_KEY) > + if (found.type != BTRFS_EXTENT_ITEM_KEY && > + found.type != BTRFS_METADATA_ITEM_KEY) > continue; > - ret = btrfs_find_all_roots(trans, fs_info, found.objectid, > - tree_mod_seq_elem.seq, &roots); > - if (ret < 0) > + > + if (found.type == BTRFS_METADATA_ITEM_KEY) > + num_bytes = fs_info->extent_root->leafsize; > + else > + num_bytes = found.offset; > + > + /* > + * lock_ref checks to make sure we really need to lock by > + * checking the ref root root we are modifying the ref for, so > + * just use the fs tree objectid. > + */ > + ret = lock_ref(fs_info, BTRFS_FS_TREE_OBJECTID, found.objectid, > + num_bytes, &cache, &cached_state); > + if (ret) > goto out; > + ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0, > + &roots, 0); > + if (ret < 0) { > + unlock_ref(fs_info, BTRFS_FS_TREE_OBJECTID, > + found.objectid, num_bytes, cache, > + &cached_state); > + goto out; > + } > spin_lock(&fs_info->qgroup_lock); > seq = fs_info->qgroup_seq; > - fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ > + fs_info->qgroup_seq += roots->nnodes + 1; > > - ret = qgroup_account_ref_step1(fs_info, roots, tmp, seq); > - if (ret) { > + ulist_reinit(tmp); > + new_roots = 0; > + ret = qgroup_calc_old_refcnt(fs_info, 0, roots, tmp, seq, > + &new_roots, 1); > + if (ret < 0) { > spin_unlock(&fs_info->qgroup_lock); > ulist_free(roots); > + unlock_ref(fs_info, BTRFS_FS_TREE_OBJECTID, > + found.objectid, num_bytes, cache, > + &cached_state); > goto out; > } > > - /* > - * step2 of btrfs_qgroup_account_ref works from a single root, > - * we're doing all at once here. 
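On the inherit change: if I read it right, immediately after the snapshot both qgroups are meant to reference the same bytes while each exclusively owns only its own root node, since the two trees differ only in the root that was just COWed. As made-up numbers (illustration only, not patch code):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t level_size = 16384;            /* nodesize */
            uint64_t src_rfer = 1ULL << 30;         /* source referenced 1 GiB before */
            uint64_t dst_rfer, src_excl, dst_excl;

            dst_rfer = src_rfer;                    /* snapshot shares all the data */
            src_excl = level_size;                  /* source keeps only its root node */
            dst_excl = level_size;                  /* snapshot likewise */

            assert(dst_rfer == src_rfer);
            assert(src_excl == dst_excl);
            return 0;
    }

That seems more consistent to me than the old rfer - level_size math.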
> - */ > ulist_reinit(tmp); > - ULIST_ITER_INIT(&uiter); > - while ((unode = ulist_next(roots, &uiter))) { > - struct btrfs_qgroup *qg; > - > - qg = find_qgroup_rb(fs_info, unode->val); > - if (!qg) > - continue; > - > - ret = ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, > - GFP_ATOMIC); > - if (ret < 0) { > - spin_unlock(&fs_info->qgroup_lock); > - ulist_free(roots); > - goto out; > - } > - } > - > - /* this loop is similar to step 2 of btrfs_qgroup_account_ref */ > - ULIST_ITER_INIT(&uiter); > - while ((unode = ulist_next(tmp, &uiter))) { > - struct btrfs_qgroup *qg; > - struct btrfs_qgroup_list *glist; > - > - qg = (struct btrfs_qgroup *)(uintptr_t) unode->aux; > - qg->rfer += found.offset; > - qg->rfer_cmpr += found.offset; > - WARN_ON(qg->tag >= seq); > - if (qg->refcnt - seq == roots->nnodes) { > - qg->excl += found.offset; > - qg->excl_cmpr += found.offset; > - } > - qgroup_dirty(fs_info, qg); > - > - list_for_each_entry(glist, &qg->groups, next_group) { > - ret = ulist_add(tmp, glist->group->qgroupid, > - (uintptr_t)glist->group, > - GFP_ATOMIC); > - if (ret < 0) { > - spin_unlock(&fs_info->qgroup_lock); > - ulist_free(roots); > - goto out; > - } > - } > + ret = qgroup_adjust_counters(fs_info, 0, num_bytes, roots, tmp, > + seq, 0, new_roots, 1); > + if (ret < 0) { > + spin_unlock(&fs_info->qgroup_lock); > + ulist_free(roots); > + unlock_ref(fs_info, BTRFS_FS_TREE_OBJECTID, > + found.objectid, num_bytes, cache, > + &cached_state); > + goto out; > } > - > spin_unlock(&fs_info->qgroup_lock); > ulist_free(roots); > - ret = 0; > + ret = unlock_ref(fs_info, BTRFS_FS_TREE_OBJECTID, > + found.objectid, num_bytes, cache, > + &cached_state); > + if (ret) > + break; > } > - > out: > btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); > > diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c > index 026f1fe..503cb46 100644 > --- a/fs/btrfs/transaction.c > +++ b/fs/btrfs/transaction.c > @@ -692,23 +692,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, > return 0; > } > > - /* > - * do the qgroup accounting as early as possible > - */ > - err = btrfs_delayed_refs_qgroup_accounting(trans, info); > - > btrfs_trans_release_metadata(trans, root); > trans->block_rsv = NULL; > > - if (trans->qgroup_reserved) { > - /* > - * the same root has to be passed here between start_transaction > - * and end_transaction. Subvolume quota depends on this. > - */ > - btrfs_qgroup_free(trans->root, trans->qgroup_reserved); > - trans->qgroup_reserved = 0; > - } > - > if (!list_empty(&trans->new_bgs)) > btrfs_create_pending_block_groups(trans, root); > > @@ -719,6 +705,15 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, > btrfs_run_delayed_refs(trans, root, cur); > } > > + if (trans->qgroup_reserved) { > + /* > + * the same root has to be passed here between start_transaction > + * and end_transaction. Subvolume quota depends on this. 
> + */ > + btrfs_qgroup_free(trans->root, trans->qgroup_reserved); > + trans->qgroup_reserved = 0; > + } > + > btrfs_trans_release_metadata(trans, root); > trans->block_rsv = NULL; > > @@ -1176,12 +1171,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, > goto no_free_objectid; > } > > - pending->error = btrfs_qgroup_inherit(trans, fs_info, > - root->root_key.objectid, > - objectid, pending->inherit); > - if (pending->error) > - goto no_free_objectid; > - > key.objectid = objectid; > key.offset = (u64)-1; > key.type = BTRFS_ROOT_ITEM_KEY; > @@ -1278,6 +1267,22 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, > goto fail; > } > > + /* > + * We need to flush delayed refs in order to make sure all of our quota > + * operations have been done before we call btrfs_qgroup_inherit. > + */ > + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); > + if (ret) { > + btrfs_abort_transaction(trans, root, ret); > + goto fail; > + } > + > + pending->error = btrfs_qgroup_inherit(trans, fs_info, > + root->root_key.objectid, > + objectid, pending->inherit); > + if (pending->error) > + goto no_free_objectid; > + > /* see comments in should_cow_block() */ > root->force_cow = 1; > smp_wmb(); > @@ -1607,12 +1612,6 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, > * them now so that they hinder processing of more delayed refs > * as little as possible. > */ > - if (ret) { > - btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); > - return ret; > - } > - > - ret = btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); > if (ret) > return ret; > > -- > 1.8.3.1 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html